From: Joey Adams Date: Sat, 11 Jun 2011 07:58:10 +0000 (-0400) Subject: charset: Rewrote utf8_validate, and added four new functions: X-Git-Url: http://git.ozlabs.org/?p=ccan;a=commitdiff_plain;h=06c4af3163e2bd99999a93a478d1308ea39c5a79 charset: Rewrote utf8_validate, and added four new functions: * utf8_read_char * utf8_write_char * from_surrogate_pair * to_surrogate_pair --- diff --git a/ccan/charset/_info b/ccan/charset/_info index b549acb6..246ca073 100644 --- a/ccan/charset/_info +++ b/ccan/charset/_info @@ -5,40 +5,151 @@ /** * charset - character set conversion and validation routines * - * This module provides a collection (well, only one, at the moment) of - * well-tested routines for dealing with character set nonsense. - * - * Validation functions: - * - bool utf8_validate(const char *str, size_t length); + * This module provides a collection of well-tested routines + * for dealing with character set nonsense. * * Example: * #include * #include + * #include * #include * #include * #include - * #include // For talloc_free() - * - * int main(int argc, char *argv[]) + * #include + * + * static void print_json_string(const char *s); + * static bool parse_hex16(const char **sp, unsigned int *out); + * + * // Take a JSON-encoded string on input and print its literal value. + * int main(void) * { - * size_t len; - * char *file; - * bool valid; - * - * if (argc != 2) - * err(1, "Expected exactly one argument"); - * - * file = grab_file(NULL, argv[1], &len); - * if (!file) - * err(1, "Could not read file %s", argv[1]); - * - * valid = utf8_validate(file, len); - * printf("File contents are %s UTF-8\n", valid ? 
"valid" : "invalid"); - * - * talloc_free(file); - * + * char *input; + * size_t length; + * + * input = grab_file(NULL, NULL, &length); + * if (!input) + * err(1, "Error reading input"); + * if (!utf8_validate(input, length)) { + * fprintf(stderr, "Input contains invalid UTF-8\n"); + * return 1; + * } + * if (strlen(input) != length) { + * fprintf(stderr, "Input contains null characters\n"); + * return 1; + * } + * + * print_json_string(input); + * + * talloc_free(input); * return 0; * } + * + * static void print_json_string(const char *s) + * { + * char output_buffer[4]; + * + * // Skip leading whitespace + * while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r') + * s++; + * + * if (*s++ != '"') { + * fprintf(stderr, "Expected JSON string literal surrounded by double quotes.\n"); + * exit(EXIT_FAILURE); + * } + * + * while (*s != '"') { + * unsigned char c = *s++; + * char *b = output_buffer; + * + * if (c == '\\') { + * c = *s++; + * switch (c) { + * case '"': + * case '\\': + * case '/': + * *b++ = c; + * break; + * case 'b': *b++ = '\b'; break; + * case 'f': *b++ = '\f'; break; + * case 'n': *b++ = '\n'; break; + * case 'r': *b++ = '\r'; break; + * case 't': *b++ = '\t'; break; + * case 'u': { + * unsigned int uc, lc; + * + * if (!parse_hex16(&s, &uc)) + * goto syntax_error; + * + * if (uc >= 0xD800 && uc <= 0xDFFF) { + * // Handle UTF-16 surrogate pair (e.g. "\uD834\uDD1E"). + * uchar_t unicode; + * + * if (*s++ != '\\' || *s++ != 'u' || !parse_hex16(&s, &lc)) + * goto syntax_error; + * + * unicode = from_surrogate_pair(uc, lc); + * if (unicode == REPLACEMENT_CHARACTER) { + * fprintf(stderr, "Invalid surrogate pair.\n"); + * exit(EXIT_FAILURE); + * } + * + * b += utf8_write_char(unicode, b); + * } else { + * // Handle ordinary Unicode escape (e.g. "\u266B"). + * b += utf8_write_char(uc, b); + * } + * + * break; + * } + * default: + * goto syntax_error; + * } + * } else if (c <= 0x1F) { + * // Control characters are not allowed in string literals. 
+ * goto syntax_error; + * } else { + * *b++ = c; + * } + * + * fwrite(output_buffer, 1, b - output_buffer, stdout); + * } + * + * putchar('\n'); + * return; + * + * syntax_error: + * fprintf(stderr, "Syntax error in JSON string literal.\n"); + * exit(EXIT_FAILURE); + * } + * + * static bool parse_hex16(const char **sp, unsigned int *out) + * { + * const char *s = *sp; + * unsigned int ret = 0; + * unsigned int i; + * unsigned int tmp; + * char c; + * + * for (i = 0; i < 4; i++) + * { + * c = *s++; + * if (c >= '0' && c <= '9') + * tmp = c - '0'; + * else if (c >= 'A' && c <= 'F') + * tmp = c - 'A' + 10; + * else if (c >= 'a' && c <= 'f') + * tmp = c - 'a' + 10; + * else + * return false; + * + * ret <<= 4; + * ret += tmp; + * } + * + * *out = ret; + * *sp = s; + * return true; + * } * * Author: Joey Adams * License: MIT diff --git a/ccan/charset/charset.c b/ccan/charset/charset.c index 75608013..cd203596 100644 --- a/ccan/charset/charset.c +++ b/ccan/charset/charset.c @@ -23,8 +23,20 @@ #include "charset.h" -bool utf8_allow_surrogates = false; - +/* + * This function implements the syntax given in RFC3629, which is + * the same as that given in The Unicode Standard, Version 6.0. + * + * It has the following properties: + * + * * All codepoints U+0000..U+10FFFF may be encoded, + * except for U+D800..U+DFFF, which are reserved + * for UTF-16 surrogate pair encoding. + * * UTF-8 byte sequences longer than 4 bytes are not permitted, + * as they exceed the range of Unicode. + * * The sixty-six Unicode "non-characters" are permitted + * (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF). + */ bool utf8_validate(const char *str, size_t length) { const unsigned char *s = (const unsigned char*)str; @@ -32,69 +44,145 @@ bool utf8_validate(const char *str, size_t length) while (s < e) { unsigned char c = *s++; - unsigned int len; /* number of bytes in sequence - 2 */ + unsigned char c2; + int len_minus_two; - /* If character is ASCII, move on. 
*/ - if (c < 0x80) + /* Validate the first byte and determine the sequence length. */ + if (c <= 0x7F) /* 00..7F */ continue; + else if (c <= 0xC1) /* 80..C1 */ + return false; + else if (c <= 0xDF) /* C2..DF */ + len_minus_two = 0; + else if (c <= 0xEF) /* E0..EF */ + len_minus_two = 1; + else if (c <= 0xF4) /* F0..F4 */ + len_minus_two = 2; + else + return false; - if (s >= e) - return false; /* Missing bytes in sequence. */ - - if (c < 0xE0) { - /* 2-byte sequence, U+0080 to U+07FF - c must be 11000010 or higher - s[0] must be 10xxxxxx */ - len = 0; - if (c < 0xC2) - return false; - } else if (c < 0xF0) { - /* 3-byte sequence, U+0800 to U+FFFF - Note that the surrogate range is U+D800 to U+DFFF, - and that U+FFFE and U+FFFF are illegal characters. - c must be >= 11100000 (which it is) - If c is 11100000, then s[0] must be >= 10100000 - If the global parameter utf8_allow_surrogates is false: - If c is 11101101 and s[0] is >= 10100000, - then this is a surrogate and we should fail. - If c is 11101111, s[0] is 10111111, and s[1] >= 10111110, - then this is an illegal character and we should fail. - s[0] and s[1] must be 10xxxxxx */ - len = 1; - if (c == 0xE0 && *s < 0xA0) - return false; - if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0) - return false; - if (c == 0xEF && s[0] == 0xBF && (s+1 >= e || s[1] >= 0xBE)) - return false; - } else { - /* 4-byte sequence, U+010000 to U+10FFFF - c must be >= 11110000 (which it is) and <= 11110100 - If c is 11110000, then s[0] must be >= 10010000 - If c is 11110100, then s[0] must be < 10010000 - s[0], s[1], and s[2] must be 10xxxxxx */ - len = 2; - if (c > 0xF4) - return false; - if (c == 0xF0 && *s < 0x90) - return false; - if (c == 0xF4 && *s >= 0x90) - return false; - } + /* Make sure the character isn't clipped. */ + if (s + len_minus_two >= e) + return false; - if (s + len >= e) - return false; /* Missing bytes in sequence. */ + c2 = *s; + /* Make sure subsequent bytes are in the range 0x80..0xBF. 
*/ do { if ((*s++ & 0xC0) != 0x80) return false; - } while (len--); + } while (len_minus_two--); + + /* Handle special cases. */ + switch (c) { + case 0xE0: + /* Disallow overlong 3-byte sequence. */ + if (c2 < 0xA0) + return false; + break; + case 0xED: + /* Disallow U+D800..U+DFFF. */ + if (c2 > 0x9F) + return false; + break; + case 0xF0: + /* Disallow overlong 4-byte sequence. */ + if (c2 < 0x90) + return false; + break; + case 0xF4: + /* Disallow codepoints beyond U+10FFFF. */ + if (c2 > 0x8F) + return false; + break; + } } return true; } -/* - Note to future contributors: These routines are currently all under the - MIT license. It would be nice to keep it that way :) -*/ +int utf8_read_char(const char *s, uchar_t *out) +{ + const unsigned char *c = (const unsigned char*) s; + + if (c[0] <= 0x7F) { + /* 00..7F */ + *out = c[0]; + return 1; + } else if (c[0] <= 0xDF) { + /* C2..DF (unless input is invalid) */ + *out = ((uchar_t)c[0] & 0x1F) << 6 | + ((uchar_t)c[1] & 0x3F); + return 2; + } else if (c[0] <= 0xEF) { + /* E0..EF */ + *out = ((uchar_t)c[0] & 0xF) << 12 | + ((uchar_t)c[1] & 0x3F) << 6 | + ((uchar_t)c[2] & 0x3F); + return 3; + } else { + /* F0..F4 (unless input is invalid) */ + *out = ((uchar_t)c[0] & 0x7) << 18 | + ((uchar_t)c[1] & 0x3F) << 12 | + ((uchar_t)c[2] & 0x3F) << 6 | + ((uchar_t)c[3] & 0x3F); + return 4; + } +} + +int utf8_write_char(uchar_t unicode, char *out) +{ + unsigned char *o = (unsigned char*) out; + + if (unicode <= 0x7F) { + /* U+0000..U+007F */ + *o++ = unicode; + return 1; + } else if (unicode <= 0x7FF) { + /* U+0080..U+07FF */ + *o++ = 0xC0 | unicode >> 6; + *o++ = 0x80 | (unicode & 0x3F); + return 2; + } else if (unicode <= 0xFFFF) { + /* U+0800..U+FFFF */ + if (unicode >= 0xD800 && unicode <= 0xDFFF) + unicode = REPLACEMENT_CHARACTER; + three_byte_character: + *o++ = 0xE0 | unicode >> 12; + *o++ = 0x80 | (unicode >> 6 & 0x3F); + *o++ = 0x80 | (unicode & 0x3F); + return 3; + } else if (unicode <= 0x10FFFF) { + /* 
U+10000..U+10FFFF */ + *o++ = 0xF0 | unicode >> 18; + *o++ = 0x80 | (unicode >> 12 & 0x3F); + *o++ = 0x80 | (unicode >> 6 & 0x3F); + *o++ = 0x80 | (unicode & 0x3F); + return 4; + } else { + /* U+110000... */ + unicode = REPLACEMENT_CHARACTER; + goto three_byte_character; + } +} + +uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc) +{ + if (uc >= 0xD800 && uc <= 0xDBFF && lc >= 0xDC00 && lc <= 0xDFFF) + return 0x10000 + ((((uchar_t)uc & 0x3FF) << 10) | (lc & 0x3FF)); + else + return REPLACEMENT_CHARACTER; +} + +bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc) +{ + if (unicode >= 0x10000 && unicode <= 0x10FFFF) { + uchar_t n = unicode - 0x10000; + *uc = ((n >> 10) & 0x3FF) | 0xD800; + *lc = (n & 0x3FF) | 0xDC00; + return true; + } else { + *uc = *lc = REPLACEMENT_CHARACTER; + return false; + } +} diff --git a/ccan/charset/charset.h b/ccan/charset/charset.h index 74317fce..257d2860 100644 --- a/ccan/charset/charset.h +++ b/ccan/charset/charset.h @@ -26,19 +26,57 @@ #include #include +#include + +#define REPLACEMENT_CHARACTER 0xFFFD /* - * Validate the given UTF-8 string. If it contains '\0' characters, - * it is still valid. - * - * By default, Unicode characters U+D800 thru U+DFFF will be considered - * invalid UTF-8. However, if you set utf8_allow_surrogates to true, - * they will be allowed. Allowing the surrogate range makes it possible - * to losslessly encode malformed UTF-16. + * Type for Unicode codepoints. + * We need our own because wchar_t might be 16 bits. + */ +typedef uint32_t uchar_t; + +/* + * Validate the given UTF-8 string. + * If it contains '\0' characters, it is still valid. */ bool utf8_validate(const char *str, size_t length); -/* Default: false */ -extern bool utf8_allow_surrogates; +/* + * Read a single UTF-8 character starting at @s, + * returning the length, in bytes, of the character read. + * + * This function assumes input is valid UTF-8, + * and that there are enough characters in front of @s. 
+ */ +int utf8_read_char(const char *s, uchar_t *out); + +/* + * Write a single UTF-8 character to @out, + * returning the length, in bytes, of the character written. + * + * @unicode should be U+0000..U+10FFFF, but not U+D800..U+DFFF. + * If @unicode is invalid, REPLACEMENT_CHARACTER will be emitted instead. + * + * This function will write up to 4 bytes to @out. + */ +int utf8_write_char(uchar_t unicode, char *out); + +/* + * Compute the Unicode codepoint of a UTF-16 surrogate pair. + * + * @uc should be 0xD800..0xDBFF, and @lc should be 0xDC00..0xDFFF. + * If they aren't, this function returns REPLACEMENT_CHARACTER. + */ +uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc); + +/* + * Construct a UTF-16 surrogate pair given a Unicode codepoint. + * + * @unicode should be U+10000..U+10FFFF. + * If it's not, this function returns false, + * and sets *uc and *lc to REPLACEMENT_CHARACTER. + */ +bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc); #endif diff --git a/ccan/charset/test/common.h b/ccan/charset/test/common.h new file mode 100644 index 00000000..83b39c49 --- /dev/null +++ b/ccan/charset/test/common.h @@ -0,0 +1,27 @@ +#include +#include + +/* + * Finds a pseudorandom 32-bit number from 0 to 2^32-1 . + * Uses the BCPL linear congruential generator method. + * + * Used instead of system RNG to ensure tests are consistent. + */ +static uint32_t rand32(void) +{ +#if 0 + /* + * Tests should be run with a different random function + * from time to time. I've found that the method below + * sometimes behaves poorly for testing purposes. + * For example, rand32() % N might only return even numbers. 
+ */ + assert(RAND_MAX == 2147483647); + return ((random() & 0xFFFF) << 16) | (random() & 0xFFFF); +#else + static uint32_t rand32_state = 0; + rand32_state *= (uint32_t)0x7FF8A3ED; + rand32_state += (uint32_t)0x2AA01D31; + return rand32_state; +#endif +} diff --git a/ccan/charset/test/run-surrogate-pair.c b/ccan/charset/test/run-surrogate-pair.c new file mode 100644 index 00000000..f2001289 --- /dev/null +++ b/ccan/charset/test/run-surrogate-pair.c @@ -0,0 +1,135 @@ +#include +#include + +#include + +#include "common.h" + +/* + * Testing procedure for from_surrogate_pair and to_surrogate_pair: + * + * * For each Unicode code point from 0x10000 to 0x10FFFF: + * - Call to_surrogate_pair, and make sure that: + * - It returns true. + * - uc is 0xD800..0xDBFF + * - lc is 0xDC00..0xDFFF + * - Call from_surrogate_pair on the pair, and make sure that + * it returns the original character. + * * For various invalid arguments to to_surrogate_pair + * (U+0000..U+FFFF and U+110000...): + * - Call to_surrogate_pair, and make sure it: + * - Returns false. + * - Sets *uc and *lc to REPLACEMENT_CHARACTER. + * * For various invalid arguments to from_surrogate_pair + * (uc: not 0xD800..0xDBFF, lc: not 0xDC00..0xDFFF): + * - Call from_surrogate_pair, and make sure + * it returns REPLACEMENT_CHARACTER. 
+ */ + +#define INVALID_TRIAL_COUNT 10000 + +#define range(r, lo, hi) ((r) % ((hi)-(lo)+1) + (lo)) + +static void test_valid(void) +{ + uchar_t unicode; + unsigned int uc, lc; + + for (unicode = 0x10000; unicode <= 0x10FFFF; unicode++) { + if (to_surrogate_pair(unicode, &uc, &lc) != true) { + fail("to_surrogate_pair did not return true on valid input."); + return; + } + if (!(uc >= 0xD800 && uc <= 0xDBFF)) { + fail("to_surrogate_pair: uc is out of range"); + return; + } + if (!(lc >= 0xDC00 && lc <= 0xDFFF)) { + fail("to_surrogate_pair: lc is out of range"); + return; + } + if (from_surrogate_pair(uc, lc) != unicode) { + fail("Surrogate pair conversion did not preserve original value (U+%04lX).", (unsigned long)unicode); + return; + } + } + + pass("to_surrogate_pair and from_surrogate_pair work for all valid arguments."); +} + +static void test_invalid_to_surrogate_pair(void) +{ + long i; + uchar_t unicode; + unsigned int uc, lc; + + for (i = 1; i <= INVALID_TRIAL_COUNT; i++) { + if (rand32() % 2) { + unicode = range(rand32(), 0x0, 0xFFFF); + } else { + do { + unicode = rand32(); + } while (unicode < 0x110000); + } + + if (to_surrogate_pair(unicode, &uc, &lc) != false) { + fail("to_surrogate_pair did not return false on invalid input."); + return; + } + if (uc != REPLACEMENT_CHARACTER || lc != REPLACEMENT_CHARACTER) { + fail("to_surrogate_pair did not set uc and lc to the replacement character on invalid input."); + return; + } + } + + pass("to_surrogate_pair seems to handle invalid argument values properly."); +} + +static void test_invalid_from_surrogate_pair(void) +{ + long i; + unsigned int uc, lc; + + for (i = 1; i <= INVALID_TRIAL_COUNT; i++) { + switch (rand32() % 3) { + case 0: + uc = range(rand32(), 0x0, 0xD7FF); + break; + case 1: + uc = range(rand32(), 0xDC00, 0xDFFF); + break; + default: + uc = range(rand32(), 0xE000, 0xFFFF); + break; + } + switch (rand32() % 3) { + case 0: + lc = range(rand32(), 0x0, 0xD7FF); + break; + case 1: + lc = range(rand32(), 
0xD800, 0xDBFF); + break; + default: + lc = range(rand32(), 0xE000, 0xFFFF); + break; + } + + if (from_surrogate_pair(uc, lc) != REPLACEMENT_CHARACTER) { + fail("from_surrogate_pair(0x%04X, 0x%04X) did not return the replacement character", uc, lc); + return; + } + } + + pass("from_surrogate_pair seems to handle invalid arguments properly."); +} + +int main(void) +{ + plan_tests(3); + + test_valid(); + test_invalid_to_surrogate_pair(); + test_invalid_from_surrogate_pair(); + + return exit_status(); +} diff --git a/ccan/charset/test/run-utf8-read-write.c b/ccan/charset/test/run-utf8-read-write.c new file mode 100644 index 00000000..7758b648 --- /dev/null +++ b/ccan/charset/test/run-utf8-read-write.c @@ -0,0 +1,150 @@ +#include +#include + +#include + +#include "common.h" + +/* + * Testing procedure for utf8_read_char and utf8_write_char: + * + * * Generate N valid and invalid Unicode code points. + * * Encode them with utf8_write_char. + * * Copy the resulting string into a buffer sized exactly as big as + * the string produced. This way, Valgrind can catch buffer overflows + * by utf8_validate and utf8_read_char. + * * Validate the string with utf8_validate. + * * Decode the string, ensuring that: + * - Valid codepoints are read back. + * - Invalid characters are read back, but replaced + * with REPLACEMENT_CHARACTER. + * - No extra characters are read back. 
+ */ + +#define TRIAL_COUNT 1000 +#define MAX_CHARS_PER_TRIAL 100 + +#define range(r, lo, hi) ((r) % ((hi)-(lo)+1) + (lo)) + +int main(void) +{ + int trial; + + plan_tests(TRIAL_COUNT); + + for (trial = 1; trial <= TRIAL_COUNT; trial++) { + int i, count; + uchar_t codepoints[MAX_CHARS_PER_TRIAL]; + uchar_t c; + bool c_valid; + + char write_buffer[MAX_CHARS_PER_TRIAL * 4]; + char *o = write_buffer; + char *oe = write_buffer + sizeof(write_buffer); + + char *string; + const char *s; + const char *e; + + int len; + + count = rand32() % MAX_CHARS_PER_TRIAL + 1; + + for (i = 0; i < count; i++) { + if (o >= oe) { + fail("utf8_write_char: Buffer overflow (1)"); + goto next_trial; + } + + switch (rand32() % 7) { + case 0: + c = range(rand32(), 0x0, 0x7F); + c_valid = true; + break; + case 1: + c = range(rand32(), 0x80, 0x7FF); + c_valid = true; + break; + case 2: + c = range(rand32(), 0x800, 0xD7FF); + c_valid = true; + break; + case 3: + c = range(rand32(), 0xD800, 0xDFFF); + c_valid = false; + break; + case 4: + c = range(rand32(), 0xE000, 0xFFFF); + c_valid = true; + break; + case 5: + c = range(rand32(), 0x10000, 0x10FFFF); + c_valid = true; + break; + default: + do { + c = rand32(); + } while (c < 0x110000); + c_valid = false; + break; + } + + codepoints[i] = c_valid ? 
c : REPLACEMENT_CHARACTER; + + len = utf8_write_char(c, o); + if (len < 1 || len > 4) { + fail("utf8_write_char: Return value is not 1 thru 4."); + goto next_trial; + } + o += len; + } + if (o > oe) { + fail("utf8_write_char: Buffer overflow (2)"); + goto next_trial; + } + + string = malloc(o - write_buffer); + memcpy(string, write_buffer, o - write_buffer); + s = string; + e = string + (o - write_buffer); + + if (!utf8_validate(s, e - s)) { + fail("Invalid string produced by utf8_write_char."); + goto next_trial_free_string; + } + + for (i = 0; i < count; i++) { + if (s >= e) { + fail("utf8_read_char: Buffer overflow (1)"); + goto next_trial_free_string; + } + + len = utf8_read_char(s, &c); + if (len < 1 || len > 4) { + fail("utf8_read_char: Return value is not 1 thru 4."); + goto next_trial_free_string; + } + if (c != codepoints[i]) { + fail("utf8_read_char: Character read differs from that written."); + goto next_trial_free_string; + } + s += len; + } + if (s > e) { + fail("utf8_read_char: Buffer overflow (2)"); + goto next_trial_free_string; + } + if (s < e) { + fail("utf8_read_char: Did not reach end of string."); + goto next_trial_free_string; + } + + pass("Trial %d: %d characters", trial, count); + + next_trial_free_string: + free(string); + next_trial:; + } + + return exit_status(); +} diff --git a/ccan/charset/test/run-utf8_validate.c b/ccan/charset/test/run-utf8_validate.c new file mode 100644 index 00000000..3718b32f --- /dev/null +++ b/ccan/charset/test/run-utf8_validate.c @@ -0,0 +1,256 @@ +#include +#include + +#include +#include +#include +#include + +#include "common.h" + +/* Make a valid or invalid Unicode character fitting in exactly @len UTF-8 bytes. 
*/ +static uchar_t utf8_randcode(int len, bool valid, bool after_clipped) +{ + uint32_t r = rand32(); + uchar_t ret; + + #define range(lo, hi) ((r & 0x7FFFFFFF) % ((hi)-(lo)+1) + (lo)) + #define high_bit_set() (!!(r & 0x80000000)) + + switch (len) { + case 1: + if (valid) { + /* Generate a character U+0000..U+007F */ + return r & 0x7F; + } else { + /* + * Generate a character U+0080..U+00BF or U+00F8..U+00FF. + * + * However, don't generate U+0080..U+00BF (10xxxxxx) after a + * clipped character, as that can inadvertently form a valid, + * complete character. + */ + if (!after_clipped && high_bit_set()) + return range(0x80, 0xBF); + else + return range(0xF8, 0xFF); + } + case 2: + if (valid) { + /* Generate a character U+0080..U+07FF */ + return range(0x80, 0x7FF); + } else { + /* Generate a character U+0000..U+007F */ + return r & 0x7F; + } + case 3: + if (valid) { + /* Generate a character U+0800..U+FFFF, but not U+D800..U+DFFF */ + for (;;) { + ret = range(0x800, 0xFFFF); + if (ret >= 0xD800 && ret <= 0xDFFF) { + r = rand32(); + continue; + } else { + break; + } + } + return ret; + } else { + /* Generate a character U+0000..U+07FF or U+D800..U+DFFF */ + if (high_bit_set()) + return r & 0x7FF; + else + return 0xD800 + (r & 0x7FF); + } + case 4: + if (valid) { + /* Generate a character U+10000..U+10FFFF */ + return range(0x10000, 0x10FFFF); + } else { + /* Generate a character U+0000..0xFFFF or U+110000..U+1FFFFF */ + if (high_bit_set()) + return r & 0xFFFF; + else + return range(0x110000, 0x1FFFFF); + } + default: + assert(false); + } + + #undef range + #undef high_bit_set +} + +/* Encode @uc as UTF-8 using exactly @len characters. + @len should be 1 thru 4. 
*/ +static void utf8_encode_raw(char *out, unsigned int uc, int len) +{ + switch (len) { + case 1: + assert(uc <= 0xC1 || (uc >= 0xF8 && uc <= 0xFF)); + *out++ = uc; + break; + case 2: + assert(uc <= 0x7FF); + *out++ = 0xC0 | ((uc >> 6) & 0x1F); + *out++ = 0x80 | (uc & 0x3F); + break; + case 3: + assert(uc <= 0xFFFF); + *out++ = 0xE0 | ((uc >> 12) & 0x0F); + *out++ = 0x80 | ((uc >> 6) & 0x3F); + *out++ = 0x80 | (uc & 0x3F); + break; + case 4: + assert(uc <= 0x1FFFFF); + *out++ = 0xF0 | ((uc >> 18) & 0x07); + *out++ = 0x80 | ((uc >> 12) & 0x3F); + *out++ = 0x80 | ((uc >> 6) & 0x3F); + *out++ = 0x80 | (uc & 0x3F); + break; + } +} + +#if COMPUTE_AVERAGE_LENGTH +double total_averages; +#endif + +/* Generate a UTF-8 string of the given byte length, + randomly deciding if it should be valid or not. + + Return true if it's valid, false if it's not. */ +static bool utf8_mktest(char *out, int len) +{ + double pf; + uint32_t pu; + int n; + bool valid = true; + bool v; + bool after_clipped = false; + + #if COMPUTE_AVERAGE_LENGTH + int n_total = 0; + int count = 0; + #endif + + /* + * Probability that, per character, it should be valid. + * The goal is to make utf8_mktest as a whole + * have a 50% chance of generating a valid string. + * + * The equation being solved is: + * + * p^n = 0.5 + * + * where p is the probability that each character is valid, + * and n is the number of characters in the string. + * + * 2.384 is the approximate average length of each character, + * so len/2.384 is about how many characters this string + * is expected to contain. + */ + pf = pow(0.5, 2.384/len); + + /* Convert to uint32_t to test against rand32. */ + pu = pf * 4294967295.0; + + for (;len > 0; len -= n, out += n) { + v = rand32() <= pu; + + if (v) { + /* Generate a valid character. */ + n = rand32() % (len < 4 ? len : 4) + 1; + utf8_encode_raw(out, utf8_randcode(n, true, after_clipped), n); + after_clipped = false; + } else if (rand32() % 5) { + /* Generate an invalid character. 
*/ + n = rand32() % (len < 4 ? len : 4) + 1; + utf8_encode_raw(out, utf8_randcode(n, false, after_clipped), n); + after_clipped = false; + } else { + /* Generate a clipped but otherwise valid character. */ + char tmp[4]; + n = rand32() % 3 + 2; + utf8_encode_raw(tmp, utf8_randcode(n, true, after_clipped), n); + n -= rand32() % (n-1) + 1; + if (n > len) + n = len; + assert(n >= 1 && n <= 3); + memcpy(out, tmp, n); + after_clipped = true; + } + + if (!v) + valid = false; + + #if COMPUTE_AVERAGE_LENGTH + n_total += n; + count++; + #endif + } + + #if COMPUTE_AVERAGE_LENGTH + if (count > 0) + total_averages += (double)n_total / count; + #endif + + return valid; +} + +static void test_utf8_validate(void) +{ + char buffer[128]; + int i; + int len; + bool valid; + int passed=0, p_valid=0, p_invalid=0, total=0; + int count; + + count = 100000; + + #if COMPUTE_AVERAGE_LENGTH + total_averages = 0.0; + #endif + + for (i=0; i count/10 && p_invalid > count/10, + "Valid and invalid should be balanced"); + + #if COMPUTE_AVERAGE_LENGTH + printf("Average character length: %f\n", total_averages / count); + #endif +} + +int main(void) +{ + /* This is how many tests you plan to run */ + plan_tests(2); + + test_utf8_validate(); + + /* This exits depending on whether all tests passed */ + return exit_status(); +} diff --git a/ccan/charset/test/run.c b/ccan/charset/test/run.c deleted file mode 100644 index 5504e886..00000000 --- a/ccan/charset/test/run.c +++ /dev/null @@ -1,199 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include - -/* - * Finds a pseudorandom 32-bit number from 0 to 2^32-1 . - * Uses the BCPL linear congruential generator method. - * - * Used instead of system RNG to ensure tests are consistent. - */ -static uint32_t rand32(void) -{ - static uint32_t rand32_state = 0; - rand32_state *= (uint32_t)0x7FF8A3ED; - rand32_state += (uint32_t)0x2AA01D31; - return rand32_state; -} - -/* - * Make a Unicode character requiring exactly @len UTF-8 bytes. 
- * - * Unless utf8_allow_surrogates is set, - * do not return a value in the range U+D800 thru U+DFFF . - * - * If @len is not 1 thru 4, generate an out-of-range character. - */ -static unsigned int utf8_randcode(int len) -{ - uint32_t r = rand32(); - unsigned int ret; - - switch (len) { - case 1: return r % 0x80; - case 2: return r % (0x800-0x80) + 0x80; - case 3: - for (;;) { - ret = r % (0x10000-0x800) + 0x800; - if ((!utf8_allow_surrogates && ret >= 0xD800 && ret <= 0xDFFF) - || ret >= 0xFFFE) - { - r = rand32(); - continue; - } else { - break; - } - } - return ret; - case 4: return r % (0x110000-0x10000) + 0x10000; - default: - while (r < 0x110000) - r = rand32(); - return r; - } -} - -static unsigned int rand_surrogate(void) -{ - return rand32() % (0xE000 - 0xD800) + 0xD800; -} - -/* Encode @uc as UTF-8 using exactly @len characters. - @len should be 1 thru 4. - @uc will be truncated to the bits it will go into. - If, after bit truncation, @uc is in the wrong range for its length, - an invalid character will be generated. */ -static void utf8_encode_raw(char *out, unsigned int uc, int len) -{ - switch (len) { - case 1: - *out++ = uc & 0x7F; - break; - case 2: - *out++ = 0xC0 | ((uc >> 6) & 0x1F); - *out++ = 0x80 | (uc & 0x3F); - break; - case 3: - *out++ = 0xE0 | ((uc >> 12) & 0x0F); - *out++ = 0x80 | ((uc >> 6) & 0x3F); - *out++ = 0x80 | (uc & 0x3F); - break; - case 4: - *out++ = 0xF0 | ((uc >> 18) & 0x07); - *out++ = 0x80 | ((uc >> 12) & 0x3F); - *out++ = 0x80 | ((uc >> 6) & 0x3F); - *out++ = 0x80 | (uc & 0x3F); - break; - } -} - -/* Generate a UTF-8 string of the given byte length, - randomly deciding if it should be valid or not. - - Return true if it's valid, false if it's not. */ -static bool utf8_mktest(char *out, int len) -{ - int m, n; - bool valid = true; - bool v; - double pf; - uint32_t pu; - - /* Probability that, per character, it should be valid. - The goal is to make utf8_mktest as a whole - have a 50% chance of generating a valid string. 
*/ - pf = pow(0.5, 2.5/len); - - /* Convert to uint32_t to test against rand32. */ - pu = pf * 4294967295.0; - - for (;len; len -= n) { - v = len == 1 || rand32() <= pu; - m = len < 4 ? len : 4; - - if (v) { - /* Generate a valid character. */ - n = rand32() % m + 1; - utf8_encode_raw(out, utf8_randcode(n), n); - } else { - /* Generate an invalid character. */ - assert(m >= 2); - n = rand32() % (m-1) + 2; - switch (n) { - case 2: - utf8_encode_raw(out, utf8_randcode(1), n); - break; - case 3: - if (!utf8_allow_surrogates && (rand32() & 1)) - utf8_encode_raw(out, rand_surrogate(), n); - else - utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n); - break; - case 4: - utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n); - break; - } - valid = false; - } - out += n; - } - - return valid; -} - -static void test_utf8_validate(bool allow_surrogates) -{ - char buffer[1024]; - int i; - int len; - bool valid; - int passed=0, p_valid=0, p_invalid=0, total=0; - int count; - - count = 10000; - - utf8_allow_surrogates = allow_surrogates; - - for (i=0; i count/10 && p_invalid > count/10, - " valid/invalid are balanced"); -} - -int main(void) -{ - /* This is how many tests you plan to run */ - plan_tests(4); - - test_utf8_validate(false); - test_utf8_validate(true); - - /* This exits depending on whether all tests passed */ - return exit_status(); -}