charset: Rewrote utf8_validate, and added four new functions:
authorJoey Adams <joeyadams3.14159@gmail.com>
Sat, 11 Jun 2011 07:58:10 +0000 (03:58 -0400)
committerJoey Adams <joeyadams3.14159@gmail.com>
Sat, 11 Jun 2011 08:10:52 +0000 (04:10 -0400)
 * utf8_read_char
 * utf8_write_char
 * from_surrogate_pair
 * to_surrogate_pair

ccan/charset/_info
ccan/charset/charset.c
ccan/charset/charset.h
ccan/charset/test/common.h [new file with mode: 0644]
ccan/charset/test/run-surrogate-pair.c [new file with mode: 0644]
ccan/charset/test/run-utf8-read-write.c [new file with mode: 0644]
ccan/charset/test/run-utf8_validate.c [new file with mode: 0644]
ccan/charset/test/run.c [deleted file]

index b549acb6eb67cde5c2e3602dff1740f9e5710e5f..246ca0738ad5c851554d743e5773a5da8446bc59 100644 (file)
 /**
  * charset - character set conversion and validation routines
  *
- * This module provides a collection (well, only one, at the moment) of
- * well-tested routines for dealing with character set nonsense.
- *
- * Validation functions:
- *  - bool utf8_validate(const char *str, size_t length);
+ * This module provides a collection of well-tested routines
+ * for dealing with character set nonsense.
  *
  * Example:
  *     #include <err.h>
  *     #include <stdio.h>
+ *     #include <stdlib.h>
  *     #include <string.h>
  *     #include <ccan/charset/charset.h>
  *     #include <ccan/grab_file/grab_file.h>
- *     #include <ccan/talloc/talloc.h> // For talloc_free()
- *
- *     int main(int argc, char *argv[])
+ *     #include <ccan/talloc/talloc.h>
+ *     
+ *     static void print_json_string(const char *s);
+ *     static bool parse_hex16(const char **sp, unsigned int *out);
+ *     
+ *     // Take a JSON-encoded string on input and print its literal value.
+ *     int main(void)
  *     {
- *             size_t len;
- *             char *file;
- *             bool valid;
- *
- *             if (argc != 2)
- *                     err(1, "Expected exactly one argument");
- *
- *             file = grab_file(NULL, argv[1], &len);
- *             if (!file)
- *                     err(1, "Could not read file %s", argv[1]);
- *
- *             valid = utf8_validate(file, len);
- *             printf("File contents are %s UTF-8\n", valid ? "valid" : "invalid");
- *
- *             talloc_free(file);
- *
+ *             char *input;
+ *             size_t length;
+ *     
+ *             input = grab_file(NULL, NULL, &length);
+ *             if (!input)
+ *                     err(1, "Error reading input");
+ *             if (!utf8_validate(input, length)) {
+ *                     fprintf(stderr, "Input contains invalid UTF-8\n");
+ *                     return 1;
+ *             }
+ *             if (strlen(input) != length) {
+ *                     fprintf(stderr, "Input contains null characters\n");
+ *                     return 1;
+ *             }
+ *             
+ *             print_json_string(input);
+ *             
+ *             talloc_free(input);
  *             return 0;
  *     }
+ *     
+ *     static void print_json_string(const char *s)
+ *     {
+ *             char output_buffer[4];
+ *             
+ *             // Skip leading whitespace
+ *             while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r')
+ *                     s++;
+ *             
+ *             if (*s++ != '"') {
+ *                     fprintf(stderr, "Expected JSON string literal surrounded by double quotes.\n");
+ *                     exit(EXIT_FAILURE);
+ *             }
+ *             
+ *             while (*s != '"') {
+ *                     unsigned char c = *s++;
+ *                     char *b = output_buffer;
+ *                     
+ *                     if (c == '\\') {
+ *                             c = *s++;
+ *                             switch (c) {
+ *                                     case '"':
+ *                                     case '\\':
+ *                                     case '/':
+ *                                             *b++ = c;
+ *                                             break;
+ *                                     case 'b': *b++ = '\b'; break;
+ *                                     case 'f': *b++ = '\f'; break;
+ *                                     case 'n': *b++ = '\n'; break;
+ *                                     case 'r': *b++ = '\r'; break;
+ *                                     case 't': *b++ = '\t'; break;
+ *                                     case 'u': {
+ *                                             unsigned int uc, lc;
+ *                                             
+ *                                             if (!parse_hex16(&s, &uc))
+ *                                                     goto syntax_error;
+ *                                             
+ *                                             if (uc >= 0xD800 && uc <= 0xDFFF) {
+ *                                                     // Handle UTF-16 surrogate pair (e.g. "\uD834\uDD1E").
+ *                                                     uchar_t unicode;
+ *                                                     
+ *                                                     if (*s++ != '\\' || *s++ != 'u' || !parse_hex16(&s, &lc))
+ *                                                             goto syntax_error;
+ *                                                     
+ *                                                     unicode = from_surrogate_pair(uc, lc);
+ *                                                     if (unicode == REPLACEMENT_CHARACTER) {
+ *                                                             fprintf(stderr, "Invalid surrogate pair.\n");
+ *                                                             exit(EXIT_FAILURE);
+ *                                                     }
+ *                                                     
+ *                                                     b += utf8_write_char(unicode, b);
+ *                                             } else {
+ *                                                     // Handle ordinary Unicode escape (e.g. "\u266B").
+ *                                                     b += utf8_write_char(uc, b);
+ *                                             }
+ *                                             
+ *                                             break;
+ *                                     }
+ *                                     default:
+ *                                             goto syntax_error;
+ *                             }
+ *                     } else if (c <= 0x1F) {
+ *                             // Control characters are not allowed in string literals.
+ *                             goto syntax_error;
+ *                     } else {
+ *                             *b++ = c;
+ *                     }
+ *                     
+ *                     fwrite(output_buffer, 1, b - output_buffer, stdout);
+ *             }
+ *             
+ *             putchar('\n');
+ *             return;
+ *             
+ *     syntax_error:
+ *             fprintf(stderr, "Syntax error in JSON string literal.\n");
+ *             exit(EXIT_FAILURE);
+ *     }
+ *     
+ *     static bool parse_hex16(const char **sp, unsigned int *out)
+ *     {
+ *             const char *s = *sp;
+ *             unsigned int ret = 0;
+ *             unsigned int i;
+ *             unsigned int tmp;
+ *             char            c;
+ *     
+ *             for (i = 0; i < 4; i++)
+ *             {
+ *                     c = *s++;
+ *                     if (c >= '0' && c <= '9')
+ *                             tmp = c - '0';
+ *                     else if (c >= 'A' && c <= 'F')
+ *                             tmp = c - 'A' + 10;
+ *                     else if (c >= 'a' && c <= 'f')
+ *                             tmp = c - 'a' + 10;
+ *                     else
+ *                             return false;
+ *     
+ *                     ret <<= 4;
+ *                     ret += tmp;
+ *             }
+ *             
+ *             *out = ret;
+ *             *sp = s;
+ *             return true;
+ *     }
  *
  * Author: Joey Adams
  * License: MIT
index 756080138d23110e26aa97c6fc3b0a0964b488ce..cd2035969222866c49617f1bd90d022ce82c3453 100644 (file)
 
 #include "charset.h"
 
-bool utf8_allow_surrogates = false;
-
+/*
+ * This function implements the syntax given in RFC3629, which is
+ * the same as that given in The Unicode Standard, Version 6.0.
+ *
+ * It has the following properties:
+ *
+ *  * All codepoints U+0000..U+10FFFF may be encoded,
+ *    except for U+D800..U+DFFF, which are reserved
+ *    for UTF-16 surrogate pair encoding.
+ *  * UTF-8 byte sequences longer than 4 bytes are not permitted,
+ *    as they exceed the range of Unicode.
+ *  * The sixty-six Unicode "non-characters" are permitted
+ *    (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
+ */
 bool utf8_validate(const char *str, size_t length)
 {
        const unsigned char *s = (const unsigned char*)str;
@@ -32,69 +44,145 @@ bool utf8_validate(const char *str, size_t length)
        
        while (s < e) {
                unsigned char c = *s++;
-               unsigned int len; /* number of bytes in sequence - 2 */
+               unsigned char c2;
+               int len_minus_two;
                
-               /* If character is ASCII, move on. */
-               if (c < 0x80)
+               /* Validate the first byte and determine the sequence length. */
+               if (c <= 0x7F)          /* 00..7F */
                        continue;
+               else if (c <= 0xC1)     /* 80..C1 */
+                       return false;
+               else if (c <= 0xDF)     /* C2..DF */
+                       len_minus_two = 0;
+               else if (c <= 0xEF)     /* E0..EF */
+                       len_minus_two = 1;
+               else if (c <= 0xF4)     /* F0..F4 */
+                       len_minus_two = 2;
+               else
+                       return false;
                
-               if (s >= e)
-                       return false; /* Missing bytes in sequence. */
-               
-               if (c < 0xE0) {
-                       /* 2-byte sequence, U+0080 to U+07FF
-                          c must be 11000010 or higher
-                          s[0] must be 10xxxxxx */
-                       len = 0;
-                       if (c < 0xC2)
-                               return false;
-               } else if (c < 0xF0) {
-                       /* 3-byte sequence, U+0800 to U+FFFF
-                          Note that the surrogate range is U+D800 to U+DFFF,
-                                 and that U+FFFE and U+FFFF are illegal characters.
-                          c must be >= 11100000 (which it is)
-                          If c is 11100000, then s[0] must be >= 10100000
-                          If the global parameter utf8_allow_surrogates is false:
-                             If c is 11101101 and s[0] is >= 10100000,
-                                then this is a surrogate and we should fail.
-                          If c is 11101111, s[0] is 10111111, and s[1] >= 10111110,
-                                 then this is an illegal character and we should fail.
-                          s[0] and s[1] must be 10xxxxxx */
-                       len = 1;
-                       if (c == 0xE0 && *s < 0xA0)
-                               return false;
-                       if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
-                               return false;
-                       if (c == 0xEF && s[0] == 0xBF && (s+1 >= e || s[1] >= 0xBE))
-                               return false;
-               } else {
-                       /* 4-byte sequence, U+010000 to U+10FFFF
-                          c must be >= 11110000 (which it is) and <= 11110100
-                          If c is 11110000, then s[0] must be >= 10010000
-                          If c is 11110100, then s[0] must be < 10010000
-                          s[0], s[1], and s[2] must be 10xxxxxx */
-                       len = 2;
-                       if (c > 0xF4)
-                               return false;
-                       if (c == 0xF0 && *s < 0x90)
-                               return false;
-                       if (c == 0xF4 && *s >= 0x90)
-                               return false;
-               }
+               /* Make sure the character isn't clipped. */
+               if (s + len_minus_two >= e)
+                       return false;
                
-               if (s + len >= e)
-                       return false; /* Missing bytes in sequence. */
+               c2 = *s;
                
+               /* Make sure subsequent bytes are in the range 0x80..0xBF. */
                do {
                        if ((*s++ & 0xC0) != 0x80)
                                return false;
-               } while (len--);
+               } while (len_minus_two--);
+               
+               /* Handle special cases. */
+               switch (c) {
+                       case 0xE0:
+                               /* Disallow overlong 3-byte sequence. */
+                               if (c2 < 0xA0)
+                                       return false;
+                               break;
+                       case 0xED:
+                               /* Disallow U+D800..U+DFFF. */
+                               if (c2 > 0x9F)
+                                       return false;
+                               break;
+                       case 0xF0:
+                               /* Disallow overlong 4-byte sequence. */
+                               if (c2 < 0x90)
+                                       return false;
+                               break;
+                       case 0xF4:
+                               /* Disallow codepoints beyond U+10FFFF. */
+                               if (c2 > 0x8F)
+                                       return false;
+                               break;
+               }
        }
        
        return true;
 }
 
-/*
-  Note to future contributors: These routines are currently all under the
-    MIT license.  It would be nice to keep it that way :)
-*/
+/*
+ * Decode one UTF-8 sequence starting at @s, store its code point
+ * in *@out, and return the number of bytes consumed (1 to 4).
+ *
+ * The length is picked from the lead byte alone and continuation
+ * bytes are masked, not checked, so the input must already be valid.
+ */
+int utf8_read_char(const char *s, uchar_t *out)
+{
+       const unsigned char *bytes = (const unsigned char*) s;
+       const uchar_t lead = bytes[0];
+
+       if (lead <= 0x7F) {
+               /* 00..7F: single byte, the value is the code point. */
+               *out = lead;
+               return 1;
+       }
+
+       if (lead <= 0xDF) {
+               /* Two bytes (lead is C2..DF when the input is valid). */
+               *out = (lead & 0x1F) << 6 |
+                      ((uchar_t)bytes[1] & 0x3F);
+               return 2;
+       }
+
+       if (lead <= 0xEF) {
+               /* Three bytes, lead E0..EF. */
+               *out = (lead & 0xF) << 12 |
+                      ((uchar_t)bytes[1] & 0x3F) << 6 |
+                      ((uchar_t)bytes[2] & 0x3F);
+               return 3;
+       }
+
+       /* Four bytes (lead is F0..F4 when the input is valid). */
+       *out = (lead & 0x7) << 18 |
+              ((uchar_t)bytes[1] & 0x3F) << 12 |
+              ((uchar_t)bytes[2] & 0x3F) << 6 |
+              ((uchar_t)bytes[3] & 0x3F);
+       return 4;
+}
+
+/*
+ * Encode @unicode as UTF-8 into @out and return the number of bytes
+ * written (1 to 4).  No terminator is appended.
+ *
+ * Surrogates (U+D800..U+DFFF) and values above U+10FFFF are encoded
+ * as REPLACEMENT_CHARACTER instead.
+ */
+int utf8_write_char(uchar_t unicode, char *out)
+{
+       unsigned char *dst = (unsigned char*) out;
+
+       /* Substitute anything that is not an encodable code point. */
+       if ((unicode >= 0xD800 && unicode <= 0xDFFF) || unicode > 0x10FFFF)
+               unicode = REPLACEMENT_CHARACTER;
+
+       if (unicode <= 0x7F) {
+               /* U+0000..U+007F */
+               dst[0] = unicode;
+               return 1;
+       }
+
+       if (unicode <= 0x7FF) {
+               /* U+0080..U+07FF */
+               dst[0] = 0xC0 | unicode >> 6;
+               dst[1] = 0x80 | (unicode & 0x3F);
+               return 2;
+       }
+
+       if (unicode <= 0xFFFF) {
+               /* U+0800..U+FFFF (surrogates were replaced above) */
+               dst[0] = 0xE0 | unicode >> 12;
+               dst[1] = 0x80 | (unicode >> 6 & 0x3F);
+               dst[2] = 0x80 | (unicode & 0x3F);
+               return 3;
+       }
+
+       /* U+10000..U+10FFFF */
+       dst[0] = 0xF0 | unicode >> 18;
+       dst[1] = 0x80 | (unicode >> 12 & 0x3F);
+       dst[2] = 0x80 | (unicode >> 6 & 0x3F);
+       dst[3] = 0x80 | (unicode & 0x3F);
+       return 4;
+}
+
+/*
+ * Combine a UTF-16 surrogate pair into the code point it encodes.
+ * Returns REPLACEMENT_CHARACTER unless @uc is a high surrogate
+ * (0xD800..0xDBFF) and @lc is a low surrogate (0xDC00..0xDFFF).
+ */
+uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc)
+{
+       const bool high_ok = uc >= 0xD800 && uc <= 0xDBFF;
+       const bool low_ok = lc >= 0xDC00 && lc <= 0xDFFF;
+
+       if (!high_ok || !low_ok)
+               return REPLACEMENT_CHARACTER;
+
+       /* Each half carries 10 payload bits; the pair is offset by 0x10000. */
+       return 0x10000 + ((((uchar_t)uc & 0x3FF) << 10) | (lc & 0x3FF));
+}
+
+/*
+ * Split a code point in U+10000..U+10FFFF into a UTF-16 surrogate pair,
+ * storing the high half in *@uc and the low half in *@lc.
+ *
+ * Returns true on success.  For any other @unicode, both outputs are
+ * set to REPLACEMENT_CHARACTER and false is returned.
+ */
+bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc)
+{
+       uchar_t offset;
+
+       if (unicode < 0x10000 || unicode > 0x10FFFF) {
+               *lc = REPLACEMENT_CHARACTER;
+               *uc = REPLACEMENT_CHARACTER;
+               return false;
+       }
+
+       /* 20-bit offset: top 10 bits go to the high half, bottom 10 to the low. */
+       offset = unicode - 0x10000;
+       *uc = 0xD800 | ((offset >> 10) & 0x3FF);
+       *lc = 0xDC00 | (offset & 0x3FF);
+       return true;
+}
index 74317fce19b76b1467fefe9abddb1b605c117cd3..257d2860d70122efb407ae2d4d49b84015f72dcc 100644 (file)
 
 #include <stdbool.h>
 #include <stddef.h>
+#include <stdint.h>
+
+#define REPLACEMENT_CHARACTER 0xFFFD
 
 /*
- * Validate the given UTF-8 string.  If it contains '\0' characters,
- * it is still valid.
- *
- * By default, Unicode characters U+D800 thru U+DFFF will be considered
- * invalid UTF-8.  However, if you set utf8_allow_surrogates to true,
- * they will be allowed.  Allowing the surrogate range makes it possible
- * to losslessly encode malformed UTF-16.
+ * Type for Unicode codepoints.
+ * We need our own because wchar_t might be 16 bits.
+ */
+typedef uint32_t uchar_t;
+
+/*
+ * Validate the given UTF-8 string.
+ * If it contains '\0' characters, it is still valid.
  */
 bool utf8_validate(const char *str, size_t length);
 
-/* Default: false */
-extern bool utf8_allow_surrogates;
+/*
+ * Read a single UTF-8 character starting at @s,
+ * returning the length, in bytes, of the character read.
+ *
+ * This function assumes input is valid UTF-8,
+ * and that there are enough characters in front of @s.
+ */
+int utf8_read_char(const char *s, uchar_t *out);
+
+/*
+ * Write a single UTF-8 character to @out,
+ * returning the length, in bytes, of the character written.
+ *
+ * @unicode should be U+0000..U+10FFFF, but not U+D800..U+DFFF.
+ * If @unicode is invalid, REPLACEMENT_CHARACTER will be emitted instead.
+ *
+ * This function will write up to 4 bytes to @out.
+ */
+int utf8_write_char(uchar_t unicode, char *out);
+
+/*
+ * Compute the Unicode codepoint of a UTF-16 surrogate pair.
+ *
+ * @uc should be 0xD800..0xDBFF, and @lc should be 0xDC00..0xDFFF.
+ * If they aren't, this function returns REPLACEMENT_CHARACTER.
+ */
+uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc);
+
+/*
+ * Construct a UTF-16 surrogate pair given a Unicode codepoint.
+ *
+ * @unicode should be U+10000..U+10FFFF.
+ * If it's not, this function returns false,
+ * and sets *uc and *lc to REPLACEMENT_CHARACTER.
+ */
+bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc);
 
 #endif
diff --git a/ccan/charset/test/common.h b/ccan/charset/test/common.h
new file mode 100644 (file)
index 0000000..83b39c4
--- /dev/null
@@ -0,0 +1,27 @@
+#include <stdint.h>
+#include <stdlib.h>
+
+/*
+ * Finds a pseudorandom 32-bit number from 0 to 2^32-1.
+ * Uses the BCPL linear congruential generator method.
+ *
+ * Used instead of system RNG to ensure tests are consistent.
+ */
+static uint32_t rand32(void)
+{
+#if 0
+       /*
+        * Tests should be run with a different random function
+        * from time to time.  I've found that the method below
+        * sometimes behaves poorly for testing purposes.
+        * For example, rand32() % N might only return even numbers.
+        */
+       assert(RAND_MAX == 2147483647);
+       return ((random() & 0xFFFF) << 16) | (random() & 0xFFFF);
+#else
+       /* Zero-seeded state: the sequence is identical on every run. */
+       static uint32_t state = 0;
+
+       /* One linear congruential step; uint32_t arithmetic wraps at 2^32. */
+       state = state * (uint32_t)0x7FF8A3ED + (uint32_t)0x2AA01D31;
+       return state;
+#endif
+}
diff --git a/ccan/charset/test/run-surrogate-pair.c b/ccan/charset/test/run-surrogate-pair.c
new file mode 100644 (file)
index 0000000..f200128
--- /dev/null
@@ -0,0 +1,135 @@
+#include <ccan/charset/charset.c>
+#include <ccan/tap/tap.h>
+
+#include <string.h>
+
+#include "common.h"
+
+/*
+ * Testing procedure for from_surrogate_pair and to_surrogate_pair:
+ *
+ *  * For each Unicode code point from 0x10000 to 0x10FFFF:
+ *    - Call to_surrogate_pair, and make sure that:
+ *      - It returns true.
+ *      - uc is 0xD800..0xDBFF
+ *      - lc is 0xDC00..0xDFFF
+ *    - Call from_surrogate_pair on the pair, and make sure that
+ *      it returns the original character.
+ *  * For various invalid arguments to to_surrogate_pair
+ *    (U+0000..U+FFFF and U+110000...):
+ *    - Call to_surrogate_pair, and make sure it:
+ *      - Returns false.
+ *      - Sets *uc and *lc to REPLACEMENT_CHARACTER.
+ *  * For various invalid arguments to from_surrogate_pair
+ *    (uc: not 0xD800..0xDBFF, lc: not 0xDC00..0xDFFF):
+ *    - Call from_surrogate_pair, and make sure
+ *      it returns REPLACEMENT_CHARACTER.
+ */
+
+#define INVALID_TRIAL_COUNT     10000
+
+#define range(r, lo, hi)  ((r) % ((hi)-(lo)+1) + (lo))
+
+/*
+ * Round-trip every code point in U+10000..U+10FFFF through
+ * to_surrogate_pair and from_surrogate_pair.
+ * Reports one TAP result: the first failure found, or a single pass.
+ */
+static void test_valid(void)
+{
+       unsigned int high, low;
+       uchar_t cp = 0x10000;
+
+       while (cp <= 0x10FFFF) {
+               if (!to_surrogate_pair(cp, &high, &low)) {
+                       fail("to_surrogate_pair did not return true on valid input.");
+                       return;
+               }
+               if (high < 0xD800 || high > 0xDBFF) {
+                       fail("to_surrogate_pair: uc is out of range");
+                       return;
+               }
+               if (low < 0xDC00 || low > 0xDFFF) {
+                       fail("to_surrogate_pair: lc is out of range");
+                       return;
+               }
+               if (from_surrogate_pair(high, low) != cp) {
+                       fail("Surrogate pair conversion did not preserve original value (U+%04lX).", (unsigned long)cp);
+                       return;
+               }
+               cp++;
+       }
+
+       pass("to_surrogate_pair and from_surrogate_pair work for all valid arguments.");
+}
+
+/*
+ * Feed to_surrogate_pair INVALID_TRIAL_COUNT pseudorandom code points
+ * outside U+10000..U+10FFFF and check that it returns false and sets
+ * both outputs to REPLACEMENT_CHARACTER.
+ * Reports one TAP result: the first failure found, or a single pass.
+ */
+static void test_invalid_to_surrogate_pair(void)
+{
+       long trial;
+
+       for (trial = 0; trial < INVALID_TRIAL_COUNT; trial++) {
+               unsigned int high, low;
+               uchar_t cp;
+
+               if (rand32() % 2 == 0) {
+                       /* Beyond Unicode: redraw until the value is >= U+110000. */
+                       do
+                               cp = rand32();
+                       while (cp < 0x110000);
+               } else {
+                       /* Too small for a surrogate pair: U+0000..U+FFFF. */
+                       cp = range(rand32(), 0x0, 0xFFFF);
+               }
+
+               if (to_surrogate_pair(cp, &high, &low)) {
+                       fail("to_surrogate_pair did not return false on invalid input.");
+                       return;
+               }
+               if (!(high == REPLACEMENT_CHARACTER && low == REPLACEMENT_CHARACTER)) {
+                       fail("to_surrogate_pair did not set uc and lc to the replacement character on invalid input.");
+                       return;
+               }
+       }
+
+       pass("to_surrogate_pair seems to handle invalid argument values properly.");
+}
+
+/*
+ * Feed from_surrogate_pair INVALID_TRIAL_COUNT pseudorandom pairs in which
+ * neither half lies in its valid surrogate range, and check that it
+ * returns REPLACEMENT_CHARACTER.
+ * Reports one TAP result: the first failure found, or a single pass.
+ */
+static void test_invalid_from_surrogate_pair(void)
+{
+       /* Inclusive {lo, hi} ranges outside the high surrogates 0xD800..0xDBFF. */
+       static const unsigned int bad_high[3][2] = {
+               { 0x0,    0xD7FF },
+               { 0xDC00, 0xDFFF },
+               { 0xE000, 0xFFFF },
+       };
+       /* Inclusive {lo, hi} ranges outside the low surrogates 0xDC00..0xDFFF. */
+       static const unsigned int bad_low[3][2] = {
+               { 0x0,    0xD7FF },
+               { 0xD800, 0xDBFF },
+               { 0xE000, 0xFFFF },
+       };
+       long trial;
+
+       for (trial = 0; trial < INVALID_TRIAL_COUNT; trial++) {
+               const unsigned int *bounds;
+               unsigned int uc, lc;
+
+               /* Pick the range first, then the value, one rand32() draw each. */
+               bounds = bad_high[rand32() % 3];
+               uc = range(rand32(), bounds[0], bounds[1]);
+
+               bounds = bad_low[rand32() % 3];
+               lc = range(rand32(), bounds[0], bounds[1]);
+
+               if (from_surrogate_pair(uc, lc) != REPLACEMENT_CHARACTER) {
+                       fail("from_surrogate_pair(0x%04X, 0x%04X) did not return the replacement character", uc, lc);
+                       return;
+               }
+       }
+
+       pass("from_surrogate_pair seems to handle invalid arguments properly.");
+}
+
+/* Run the three surrogate-pair checks; plan_tests(3) expects one result from each. */
+int main(void)
+{
+       plan_tests(3);
+       
+       test_valid();
+       test_invalid_to_surrogate_pair();
+       test_invalid_from_surrogate_pair();
+       
+       return exit_status();
+}
diff --git a/ccan/charset/test/run-utf8-read-write.c b/ccan/charset/test/run-utf8-read-write.c
new file mode 100644 (file)
index 0000000..7758b64
--- /dev/null
@@ -0,0 +1,150 @@
+#include <ccan/charset/charset.c>
+#include <ccan/tap/tap.h>
+
+#include <string.h>
+
+#include "common.h"
+
+/*
+ * Testing procedure for utf8_read_char and utf8_write_char:
+ *
+ *  * Generate N valid and invalid Unicode code points.
+ *  * Encode them with utf8_write_char.
+ *  * Copy the resulting string into a buffer sized exactly as big as
+ *    the string produced.  This way, Valgrind can catch buffer overflows
+ *    by utf8_validate and utf8_read_char.
+ *  * Validate the string with utf8_validate.
+ *  * Decode the string, ensuring that:
+ *    - Valid codepoints are read back.
+ *    - Invalid characters are read back, but replaced
+ *      with REPLACEMENT_CHARACTER.
+ *    - No extra characters are read back.
+ */
+
+#define TRIAL_COUNT             1000
+#define MAX_CHARS_PER_TRIAL     100
+
+#define range(r, lo, hi)  ((r) % ((hi)-(lo)+1) + (lo))
+
+/*
+ * Run TRIAL_COUNT randomized write/validate/read round trips, following
+ * the procedure in the comment above.  Every trial reports exactly one
+ * TAP result, matching plan_tests(TRIAL_COUNT).
+ */
+int main(void)
+{
+       int trial;
+
+       plan_tests(TRIAL_COUNT);
+
+       for (trial = 1; trial <= TRIAL_COUNT; trial++) {
+               int i, count;
+               uchar_t codepoints[MAX_CHARS_PER_TRIAL];
+               uchar_t c;
+               bool c_valid;
+
+               /* utf8_write_char emits at most 4 bytes per code point. */
+               char write_buffer[MAX_CHARS_PER_TRIAL * 4];
+               char *o = write_buffer;
+               char *oe = write_buffer + sizeof(write_buffer);
+
+               size_t nbytes;
+               char *string;
+               const char *s;
+               const char *e;
+
+               int len;
+
+               count = rand32() % MAX_CHARS_PER_TRIAL + 1;
+
+               /* Encode @count random code points, both valid and invalid. */
+               for (i = 0; i < count; i++) {
+                       if (o >= oe) {
+                               fail("utf8_write_char: Buffer overflow (1)");
+                               goto next_trial;
+                       }
+
+                       switch (rand32() % 7) {
+                               case 0:
+                                       c = range(rand32(), 0x0, 0x7F);
+                                       c_valid = true;
+                                       break;
+                               case 1:
+                                       c = range(rand32(), 0x80, 0x7FF);
+                                       c_valid = true;
+                                       break;
+                               case 2:
+                                       c = range(rand32(), 0x800, 0xD7FF);
+                                       c_valid = true;
+                                       break;
+                               case 3:
+                                       /* Surrogate range: not encodable. */
+                                       c = range(rand32(), 0xD800, 0xDFFF);
+                                       c_valid = false;
+                                       break;
+                               case 4:
+                                       c = range(rand32(), 0xE000, 0xFFFF);
+                                       c_valid = true;
+                                       break;
+                               case 5:
+                                       c = range(rand32(), 0x10000, 0x10FFFF);
+                                       c_valid = true;
+                                       break;
+                               default:
+                                       /* Beyond Unicode: redraw until >= U+110000. */
+                                       do {
+                                               c = rand32();
+                                       } while (c < 0x110000);
+                                       c_valid = false;
+                                       break;
+                       }
+
+                       /* Invalid input is expected to read back as U+FFFD. */
+                       codepoints[i] = c_valid ? c : REPLACEMENT_CHARACTER;
+
+                       len = utf8_write_char(c, o);
+                       if (len < 1 || len > 4) {
+                               fail("utf8_write_char: Return value is not 1 thru 4.");
+                               goto next_trial;
+                       }
+                       o += len;
+               }
+               if (o > oe) {
+                       fail("utf8_write_char: Buffer overflow (2)");
+                       goto next_trial;
+               }
+
+               /*
+                * Copy into an exactly-sized heap block so that Valgrind can
+                * catch reads past the end by utf8_validate and utf8_read_char.
+                */
+               nbytes = o - write_buffer;
+               string = malloc(nbytes);
+               if (string == NULL) {
+                       /* Report a failed trial instead of handing NULL to memcpy. */
+                       fail("malloc failed");
+                       goto next_trial;
+               }
+               memcpy(string, write_buffer, nbytes);
+               s = string;
+               e = string + nbytes;
+
+               if (!utf8_validate(s, e - s)) {
+                       fail("Invalid string produced by utf8_write_char.");
+                       goto next_trial_free_string;
+               }
+
+               /* Decode and compare against what was written. */
+               for (i = 0; i < count; i++) {
+                       if (s >= e) {
+                               fail("utf8_read_char: Buffer overflow (1)");
+                               goto next_trial_free_string;
+                       }
+
+                       len = utf8_read_char(s, &c);
+                       if (len < 1 || len > 4) {
+                               fail("utf8_read_char: Return value is not 1 thru 4.");
+                               goto next_trial_free_string;
+                       }
+                       if (c != codepoints[i]) {
+                               fail("utf8_read_char: Character read differs from that written.");
+                               goto next_trial_free_string;
+                       }
+                       s += len;
+               }
+               if (s > e) {
+                       fail("utf8_read_char: Buffer overflow (2)");
+                       goto next_trial_free_string;
+               }
+               if (s < e) {
+                       fail("utf8_read_char: Did not reach end of string.");
+                       goto next_trial_free_string;
+               }
+
+               pass("Trial %d: %d characters", trial, count);
+
+       next_trial_free_string:
+               free(string);
+       next_trial:;
+       }
+
+       return exit_status();
+}
diff --git a/ccan/charset/test/run-utf8_validate.c b/ccan/charset/test/run-utf8_validate.c
new file mode 100644 (file)
index 0000000..3718b32
--- /dev/null
@@ -0,0 +1,256 @@
+#include <ccan/charset/charset.c>
+#include <ccan/tap/tap.h>
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "common.h"
+
+/*
+ * Make a valid or invalid Unicode character fitting in exactly @len UTF-8 bytes.
+ *
+ * @len:           number of bytes the value will be encoded into, 1 thru 4.
+ * @valid:         if true, return a code point whose proper UTF-8 encoding is
+ *                 exactly @len bytes long.  If false, return a value that
+ *                 yields an invalid sequence when forced into @len bytes
+ *                 (illegal single byte, overlong form, surrogate, or a value
+ *                 above U+10FFFF).
+ * @after_clipped: only consulted for the invalid 1-byte case; when true,
+ *                 bytes of the form 10xxxxxx are never produced.
+ *
+ * Randomness comes from rand32(): one draw up front, plus one extra draw each
+ * time the valid 3-byte case lands on a surrogate and has to re-roll.
+ */
+static uchar_t utf8_randcode(int len, bool valid, bool after_clipped)
+{
+       uint32_t r = rand32();
+       uchar_t ret;
+       
+       /* Pick a value in [lo, hi] from the low 31 bits of r. */
+       #define range(lo, hi)  ((r & 0x7FFFFFFF) % ((hi)-(lo)+1) + (lo))
+       /* The top bit of r, left untouched by range(), serves as a coin flip. */
+       #define high_bit_set() (!!(r & 0x80000000))
+       
+       switch (len) {
+               case 1:
+                       if (valid) {
+                               /* Generate a character U+0000..U+007F */
+                               return r & 0x7F;
+                       } else {
+                               /*
+                                * Generate a character U+0080..U+00BF or U+00F8..U+00FF.
+                                *
+                                * However, don't generate U+0080..U+00BF (10xxxxxx) after a
+                                * clipped character, as that can inadvertently form a valid,
+                                * complete character.
+                                */
+                               if (!after_clipped && high_bit_set())
+                                       return range(0x80, 0xBF);
+                               else
+                                       return range(0xF8, 0xFF);
+                       }
+               case 2:
+                       if (valid) {
+                               /* Generate a character U+0080..U+07FF */
+                               return range(0x80, 0x7FF);
+                       } else {
+                               /* Generate a character U+0000..U+007F (overlong in 2 bytes) */
+                               return r & 0x7F;
+                       }
+               case 3:
+                       if (valid) {
+                               /* Generate a character U+0800..U+FFFF, but not U+D800..U+DFFF */
+                               for (;;) {
+                                       ret = range(0x800, 0xFFFF);
+                                       if (ret >= 0xD800 && ret <= 0xDFFF) {
+                                               /* Landed on a surrogate: draw again. */
+                                               r = rand32();
+                                               continue;
+                                       } else {
+                                               break;
+                                       }
+                               }
+                               return ret;
+                       } else {
+                               /* Generate a character U+0000..U+07FF or U+D800..U+DFFF */
+                               if (high_bit_set())
+                                       return r & 0x7FF;
+                               else
+                                       return 0xD800 + (r & 0x7FF);
+                       }
+               case 4:
+                       if (valid) {
+                               /* Generate a character U+10000..U+10FFFF */
+                               return range(0x10000, 0x10FFFF);
+                       } else {
+                               /* Generate a character U+0000..U+FFFF or U+110000..U+1FFFFF */
+                               if (high_bit_set())
+                                       return r & 0xFFFF;
+                               else
+                                       return range(0x110000, 0x1FFFFF);
+                       }
+               default:
+                       /* @len must be 1 thru 4. */
+                       assert(false);
+                       /*
+                        * Still return something when NDEBUG removes the assert,
+                        * so control never runs off the end of a non-void function.
+                        */
+                       return 0;
+       }
+       
+       #undef range
+       #undef high_bit_set
+}
+
+/*
+ * Encode @uc as UTF-8 using exactly @len bytes.
+ *
+ * @out: destination; receives @len bytes and no terminator.
+ * @uc:  value to encode.  It is deliberately not required to be a value
+ *       whose proper encoding is @len bytes long, so overlong forms,
+ *       surrogates and out-of-range values can be produced.  The asserts
+ *       only check that @uc fits in the payload bits of a @len-byte form
+ *       (for @len == 1, that the byte is below 0xC2 or in 0xF8..0xFF).
+ * @len: number of bytes to emit; must be 1 thru 4.
+ */
+static void utf8_encode_raw(char *out, unsigned int uc, int len)
+{
+       switch (len) {
+               case 1:
+                       assert(uc <= 0xC1 || (uc >= 0xF8 && uc <= 0xFF));
+                       *out++ = uc;
+                       break;
+               case 2:
+                       assert(uc <= 0x7FF);
+                       *out++ = 0xC0 | ((uc >> 6) & 0x1F);
+                       *out++ = 0x80 | (uc & 0x3F);
+                       break;
+               case 3:
+                       assert(uc <= 0xFFFF);
+                       *out++ = 0xE0 | ((uc >> 12) & 0x0F);
+                       *out++ = 0x80 | ((uc >> 6) & 0x3F);
+                       *out++ = 0x80 | (uc & 0x3F);
+                       break;
+               case 4:
+                       assert(uc <= 0x1FFFFF);
+                       *out++ = 0xF0 | ((uc >> 18) & 0x07);
+                       *out++ = 0x80 | ((uc >> 12) & 0x3F);
+                       *out++ = 0x80 | ((uc >> 6) & 0x3F);
+                       *out++ = 0x80 | (uc & 0x3F);
+                       break;
+               default:
+                       /*
+                        * Unsupported length: nothing is written to @out, so
+                        * flag the caller's mistake instead of silently
+                        * leaving the buffer untouched.
+                        */
+                       assert(false);
+                       break;
+       }
+}
+
+/*
+ * Optional instrumentation, compiled only when COMPUTE_AVERAGE_LENGTH is
+ * defined to a non-zero value: running sum, over every generated string,
+ * of that string's average number of bytes per character.
+ */
+#if COMPUTE_AVERAGE_LENGTH
+double total_averages;
+#endif
+
+/*
+ * Generate a UTF-8 string of the given byte length,
+ * randomly deciding if it should be valid or not.
+ *
+ * @out: buffer that receives exactly @len bytes; no terminator is written.
+ * @len: number of bytes to generate.  Zero (or less) writes nothing.
+ *
+ * Return true if it's valid, false if it's not.
+ * An empty string counts as valid.
+ */
+static bool utf8_mktest(char *out, int len)
+{
+       double pf;
+       uint32_t pu;
+       int n;
+       bool valid = true;
+       bool v;
+       bool after_clipped = false;
+       
+       #if COMPUTE_AVERAGE_LENGTH
+       int n_total = 0;
+       int count = 0;
+       #endif
+       
+       /*
+        * Nothing to generate.  Bail out before the probability computation,
+        * which would otherwise divide by zero (len == 0) or produce a
+        * probability above 1 that does not fit in uint32_t (len < 0).
+        */
+       if (len <= 0)
+               return true;
+       
+       /*
+        * Probability that, per character, it should be valid.
+        * The goal is to make utf8_mktest as a whole
+        * have a 50% chance of generating a valid string.
+        *
+        * The equation being solved is:
+        *
+        *     p^n = 0.5
+        *
+        * where p is the probability that each character is valid,
+        * and n is the number of characters in the string.
+        *
+        * 2.384 is the approximate average length of each character,
+        * so len/2.384 is about how many characters this string
+        * is expected to contain.
+        */
+       pf = pow(0.5, 2.384/len);
+       
+       /* Convert to uint32_t to test against rand32. */
+       pu = pf * 4294967295.0;
+       
+       /* Each pass emits n bytes (1 <= n <= len) and advances past them. */
+       for (;len > 0; len -= n, out += n) {
+               v = rand32() <= pu;
+               
+               if (v) {
+                       /* Generate a valid character. */
+                       n = rand32() % (len < 4 ? len : 4) + 1;
+                       utf8_encode_raw(out, utf8_randcode(n, true, after_clipped), n);
+                       after_clipped = false;
+               } else if (rand32() % 5) {
+                       /* Generate an invalid character. */
+                       n = rand32() % (len < 4 ? len : 4) + 1;
+                       utf8_encode_raw(out, utf8_randcode(n, false, after_clipped), n);
+                       after_clipped = false;
+               } else {
+                       /* Generate a clipped but otherwise valid character. */
+                       char tmp[4];
+                       /* Encode a full 2..4 byte character into scratch space... */
+                       n = rand32() % 3 + 2;
+                       utf8_encode_raw(tmp, utf8_randcode(n, true, after_clipped), n);
+                       /* ...then drop at least its last byte, keeping 1..3 bytes. */
+                       n -= rand32() % (n-1) + 1;
+                       if (n > len)
+                               n = len;
+                       assert(n >= 1 && n <= 3);
+                       memcpy(out, tmp, n);
+                       after_clipped = true;
+               }
+               
+               /* A single invalid or clipped character invalidates the string. */
+               if (!v)
+                       valid = false;
+               
+               #if COMPUTE_AVERAGE_LENGTH
+               n_total += n;
+               count++;
+               #endif
+       }
+       
+       #if COMPUTE_AVERAGE_LENGTH
+       if (count > 0)
+               total_averages += (double)n_total / count;
+       #endif
+       
+       return valid;
+}
+
+/*
+ * Check utf8_validate against randomly generated strings.
+ *
+ * Runs 100000 trials.  Each trial asks utf8_mktest for a string of random
+ * length (0 thru sizeof(buffer) bytes) and compares utf8_validate's verdict
+ * with the validity utf8_mktest reports.
+ *
+ * Emits two TAP results:
+ *   1. pass if every trial agreed, fail otherwise;
+ *   2. a balance check that more than a tenth of the trials passed as valid
+ *      and more than a tenth passed as invalid.
+ */
+static void test_utf8_validate(void)
+{
+       char buffer[128];
+       int i;
+       int len;
+       bool valid;
+       bool uvalid;
+       int passed=0, p_valid=0, p_invalid=0, total=0;
+       int count;
+       
+       count = 100000;
+       
+       #if COMPUTE_AVERAGE_LENGTH
+       total_averages = 0.0;
+       #endif
+       
+       for (i=0; i<count; i++) {
+               len = rand32() % (sizeof(buffer) + 1);
+               valid = utf8_mktest(buffer, len);
+               /* Validate once; the verdict is both compared and reported. */
+               uvalid = utf8_validate(buffer, len);
+               if (uvalid == valid) {
+                       passed++;
+                       if (valid)
+                               p_valid++;
+                       else
+                               p_invalid++;
+               } else {
+                       /* Use diag() so the message is a TAP diagnostic line. */
+                       diag("Failed: generated %s string, but utf8_validate returned %s",
+                            valid ? "valid" : "invalid",
+                            uvalid ? "true" : "false");
+               }
+               total++;
+       }
+       
+       if (passed == total)
+               pass("%d valid tests, %d invalid tests", p_valid, p_invalid);
+       else
+               fail("Passed only %d out of %d tests", passed, total);
+       
+       /* Guard against a generator that is skewed towards one outcome. */
+       ok(p_valid > count/10 && p_invalid > count/10,
+          "Valid and invalid should be balanced");
+       
+       #if COMPUTE_AVERAGE_LENGTH
+       diag("Average character length: %f", total_averages / count);
+       #endif
+}
+
+/*
+ * Test driver: announce the planned TAP results, run the utf8_validate
+ * test, and turn the outcome into the process exit status.
+ */
+int main(void)
+{
+       /* Number of TAP results this program is going to report. */
+       plan_tests(2);
+
+       test_utf8_validate();
+
+       /* exit_status() reflects whether every planned test passed. */
+       return exit_status();
+}
diff --git a/ccan/charset/test/run.c b/ccan/charset/test/run.c
deleted file mode 100644 (file)
index 5504e88..0000000
+++ /dev/null
@@ -1,199 +0,0 @@
-#include <ccan/charset/charset.h>
-#include <ccan/charset/charset.c>
-#include <ccan/tap/tap.h>
-
-#include <assert.h>
-#include <math.h>
-#include <stdint.h>
-#include <stdio.h>
-
-/*
- * Finds a pseudorandom 32-bit number from 0 to 2^32-1 .
- * Uses the BCPL linear congruential generator method.
- *
- * Used instead of system RNG to ensure tests are consistent.
- */
-static uint32_t rand32(void)
-{
-       static uint32_t rand32_state = 0;
-       rand32_state *= (uint32_t)0x7FF8A3ED;
-       rand32_state += (uint32_t)0x2AA01D31;
-       return rand32_state;
-}
-
-/*
- * Make a Unicode character requiring exactly @len UTF-8 bytes.
- *
- * Unless utf8_allow_surrogates is set,
- * do not return a value in the range U+D800 thru U+DFFF .
- *
- * If @len is not 1 thru 4, generate an out-of-range character.
- */
-static unsigned int utf8_randcode(int len)
-{
-       uint32_t r = rand32();
-       unsigned int ret;
-       
-       switch (len) {
-               case 1: return r % 0x80;
-               case 2: return r % (0x800-0x80) + 0x80;
-               case 3:
-                       for (;;) {
-                               ret = r % (0x10000-0x800) + 0x800;
-                               if ((!utf8_allow_surrogates && ret >= 0xD800 && ret <= 0xDFFF)
-                               || ret >= 0xFFFE)
-                               {
-                                       r = rand32();
-                                       continue;
-                               } else {
-                                       break;
-                               }
-                       }
-                       return ret;
-               case 4: return r % (0x110000-0x10000) + 0x10000;
-               default:
-                       while (r < 0x110000)
-                               r = rand32();
-                       return r;
-       }
-}
-
-static unsigned int rand_surrogate(void)
-{
-       return rand32() % (0xE000 - 0xD800) + 0xD800;
-}
-
-/* Encode @uc as UTF-8 using exactly @len characters.
-   @len should be 1 thru 4.
-   @uc will be truncated to the bits it will go into.
-   If, after bit truncation, @uc is in the wrong range for its length,
-   an invalid character will be generated. */
-static void utf8_encode_raw(char *out, unsigned int uc, int len)
-{
-       switch (len) {
-               case 1:
-                       *out++ = uc & 0x7F;
-                       break;
-               case 2:
-                       *out++ = 0xC0 | ((uc >> 6) & 0x1F);
-                       *out++ = 0x80 | (uc & 0x3F);
-                       break;
-               case 3:
-                       *out++ = 0xE0 | ((uc >> 12) & 0x0F);
-                       *out++ = 0x80 | ((uc >> 6) & 0x3F);
-                       *out++ = 0x80 | (uc & 0x3F);
-                       break;
-               case 4:
-                       *out++ = 0xF0 | ((uc >> 18) & 0x07);
-                       *out++ = 0x80 | ((uc >> 12) & 0x3F);
-                       *out++ = 0x80 | ((uc >> 6) & 0x3F);
-                       *out++ = 0x80 | (uc & 0x3F);
-                       break;
-       }
-}
-
-/* Generate a UTF-8 string of the given byte length,
-   randomly deciding if it should be valid or not.
-   
-   Return true if it's valid, false if it's not. */
-static bool utf8_mktest(char *out, int len)
-{
-       int m, n;
-       bool valid = true;
-       bool v;
-       double pf;
-       uint32_t pu;
-       
-       /* Probability that, per character, it should be valid.
-          The goal is to make utf8_mktest as a whole
-          have a 50% chance of generating a valid string. */
-       pf = pow(0.5, 2.5/len);
-       
-       /* Convert to uint32_t to test against rand32. */
-       pu = pf * 4294967295.0;
-       
-       for (;len; len -= n) {
-               v = len == 1 || rand32() <= pu;
-               m = len < 4 ? len : 4;
-               
-               if (v) {
-                       /* Generate a valid character. */
-                       n = rand32() % m + 1;
-                       utf8_encode_raw(out, utf8_randcode(n), n);
-               } else {
-                       /* Generate an invalid character. */
-                       assert(m >= 2);
-                       n = rand32() % (m-1) + 2;
-                       switch (n) {
-                               case 2:
-                                       utf8_encode_raw(out, utf8_randcode(1), n);
-                                       break;
-                               case 3:
-                                       if (!utf8_allow_surrogates && (rand32() & 1))
-                                               utf8_encode_raw(out, rand_surrogate(), n);
-                                       else
-                                               utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n);
-                                       break;
-                               case 4:
-                                       utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n);
-                                       break;
-                       }
-                       valid = false;
-               }
-               out += n;
-       }
-       
-       return valid;
-}
-
-static void test_utf8_validate(bool allow_surrogates)
-{
-       char buffer[1024];
-       int i;
-       int len;
-       bool valid;
-       int passed=0, p_valid=0, p_invalid=0, total=0;
-       int count;
-       
-       count = 10000;
-       
-       utf8_allow_surrogates = allow_surrogates;
-       
-       for (i=0; i<count; i++) {
-               len = rand32() % (1024 + 1);
-               valid = utf8_mktest(buffer, len);
-               if (utf8_validate(buffer, len) == valid) {
-                       passed++;
-                       if (valid)
-                               p_valid++;
-                       else
-                               p_invalid++;
-               }
-               total++;
-       }
-       
-       if (passed == total) {
-               printf("PASS:  %d valid tests, %d invalid tests\n",
-                       p_valid, p_invalid);
-       } else {
-               printf("FAIL:  Passed %d out of %d tests\n", passed, total);
-       }
-       
-       ok(passed, "utf8_validate test passed%s",
-               !allow_surrogates ? " (surrogates disallowed)" : "");
-       
-       ok(p_valid > count/10 && p_invalid > count/10,
-               "   valid/invalid are balanced");
-}
-
-int main(void)
-{
-       /* This is how many tests you plan to run */
-       plan_tests(4);
-       
-       test_utf8_validate(false);
-       test_utf8_validate(true);
-
-       /* This exits depending on whether all tests passed */
-       return exit_status();
-}