/**
* charset - character set conversion and validation routines
*
- * This module provides a collection (well, only one, at the moment) of
- * well-tested routines for dealing with character set nonsense.
- *
- * Validation functions:
- * - bool utf8_validate(const char *str, size_t length);
+ * This module provides a collection of well-tested routines
+ * for dealing with character set nonsense.
*
* Example:
* #include <err.h>
* #include <stdio.h>
+ * #include <stdlib.h>
* #include <string.h>
* #include <ccan/charset/charset.h>
* #include <ccan/grab_file/grab_file.h>
- * #include <ccan/talloc/talloc.h> // For talloc_free()
- *
- * int main(int argc, char *argv[])
+ * #include <ccan/talloc/talloc.h>
+ *
+ * static void print_json_string(const char *s);
+ * static bool parse_hex16(const char **sp, unsigned int *out);
+ *
+ * // Take a JSON-encoded string on input and print its literal value.
+ * int main(void)
* {
- * size_t len;
- * char *file;
- * bool valid;
- *
- * if (argc != 2)
- * err(1, "Expected exactly one argument");
- *
- * file = grab_file(NULL, argv[1], &len);
- * if (!file)
- * err(1, "Could not read file %s", argv[1]);
- *
- * valid = utf8_validate(file, len);
- * printf("File contents are %s UTF-8\n", valid ? "valid" : "invalid");
- *
- * talloc_free(file);
- *
+ * char *input;
+ * size_t length;
+ *
+ * input = grab_file(NULL, NULL, &length);
+ * if (!input)
+ * err(1, "Error reading input");
+ * if (!utf8_validate(input, length)) {
+ * fprintf(stderr, "Input contains invalid UTF-8\n");
+ * return 1;
+ * }
+ * if (strlen(input) != length) {
+ * fprintf(stderr, "Input contains null characters\n");
+ * return 1;
+ * }
+ *
+ * print_json_string(input);
+ *
+ * talloc_free(input);
* return 0;
* }
+ *
+ * static void print_json_string(const char *s)
+ * {
+ * char output_buffer[4];
+ *
+ * // Skip leading whitespace
+ * while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r')
+ * s++;
+ *
+ * if (*s++ != '"') {
+ * fprintf(stderr, "Expected JSON string literal surrounded by double quotes.\n");
+ * exit(EXIT_FAILURE);
+ * }
+ *
+ * while (*s != '"') {
+ * unsigned char c = *s++;
+ * char *b = output_buffer;
+ *
+ * if (c == '\\') {
+ * c = *s++;
+ * switch (c) {
+ * case '"':
+ * case '\\':
+ * case '/':
+ * *b++ = c;
+ * break;
+ * case 'b': *b++ = '\b'; break;
+ * case 'f': *b++ = '\f'; break;
+ * case 'n': *b++ = '\n'; break;
+ * case 'r': *b++ = '\r'; break;
+ * case 't': *b++ = '\t'; break;
+ * case 'u': {
+ * unsigned int uc, lc;
+ *
+ * if (!parse_hex16(&s, &uc))
+ * goto syntax_error;
+ *
+ * if (uc >= 0xD800 && uc <= 0xDFFF) {
+ * // Handle UTF-16 surrogate pair (e.g. "\uD834\uDD1E").
+ * uchar_t unicode;
+ *
+ * if (*s++ != '\\' || *s++ != 'u' || !parse_hex16(&s, &lc))
+ * goto syntax_error;
+ *
+ * unicode = from_surrogate_pair(uc, lc);
+ * if (unicode == REPLACEMENT_CHARACTER) {
+ * fprintf(stderr, "Invalid surrogate pair.\n");
+ * exit(EXIT_FAILURE);
+ * }
+ *
+ * b += utf8_write_char(unicode, b);
+ * } else {
+ * // Handle ordinary Unicode escape (e.g. "\u266B").
+ * b += utf8_write_char(uc, b);
+ * }
+ *
+ * break;
+ * }
+ * default:
+ * goto syntax_error;
+ * }
+ * } else if (c <= 0x1F) {
+ * // Control characters are not allowed in string literals.
+ * goto syntax_error;
+ * } else {
+ * *b++ = c;
+ * }
+ *
+ * fwrite(output_buffer, 1, b - output_buffer, stdout);
+ * }
+ *
+ * putchar('\n');
+ * return;
+ *
+ * syntax_error:
+ * fprintf(stderr, "Syntax error in JSON string literal.\n");
+ * exit(EXIT_FAILURE);
+ * }
+ *
+ * static bool parse_hex16(const char **sp, unsigned int *out)
+ * {
+ * const char *s = *sp;
+ * unsigned int ret = 0;
+ * unsigned int i;
+ * unsigned int tmp;
+ * char c;
+ *
+ * for (i = 0; i < 4; i++)
+ * {
+ * c = *s++;
+ * if (c >= '0' && c <= '9')
+ * tmp = c - '0';
+ * else if (c >= 'A' && c <= 'F')
+ * tmp = c - 'A' + 10;
+ * else if (c >= 'a' && c <= 'f')
+ * tmp = c - 'a' + 10;
+ * else
+ * return false;
+ *
+ * ret <<= 4;
+ * ret += tmp;
+ * }
+ *
+ * *out = ret;
+ * *sp = s;
+ * return true;
+ * }
*
* Author: Joey Adams
* License: MIT
#include "charset.h"
-bool utf8_allow_surrogates = false;
-
+/*
+ * This function implements the syntax given in RFC3629, which is
+ * the same as that given in The Unicode Standard, Version 6.0.
+ *
+ * It has the following properties:
+ *
+ * * All codepoints U+0000..U+10FFFF may be encoded,
+ * except for U+D800..U+DFFF, which are reserved
+ * for UTF-16 surrogate pair encoding.
+ * * UTF-8 byte sequences longer than 4 bytes are not permitted,
+ * as they exceed the range of Unicode.
+ * * The sixty-six Unicode "non-characters" are permitted
+ * (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
+ */
bool utf8_validate(const char *str, size_t length)
{
const unsigned char *s = (const unsigned char*)str;
while (s < e) {
unsigned char c = *s++;
- unsigned int len; /* number of bytes in sequence - 2 */
+ unsigned char c2;
+ int len_minus_two;
- /* If character is ASCII, move on. */
- if (c < 0x80)
+ /* Validate the first byte and determine the sequence length. */
+ if (c <= 0x7F) /* 00..7F */
continue;
+ else if (c <= 0xC1) /* 80..C1 */
+ return false;
+ else if (c <= 0xDF) /* C2..DF */
+ len_minus_two = 0;
+ else if (c <= 0xEF) /* E0..EF */
+ len_minus_two = 1;
+ else if (c <= 0xF4) /* F0..F4 */
+ len_minus_two = 2;
+ else
+ return false;
- if (s >= e)
- return false; /* Missing bytes in sequence. */
-
- if (c < 0xE0) {
- /* 2-byte sequence, U+0080 to U+07FF
- c must be 11000010 or higher
- s[0] must be 10xxxxxx */
- len = 0;
- if (c < 0xC2)
- return false;
- } else if (c < 0xF0) {
- /* 3-byte sequence, U+0800 to U+FFFF
- Note that the surrogate range is U+D800 to U+DFFF,
- and that U+FFFE and U+FFFF are illegal characters.
- c must be >= 11100000 (which it is)
- If c is 11100000, then s[0] must be >= 10100000
- If the global parameter utf8_allow_surrogates is false:
- If c is 11101101 and s[0] is >= 10100000,
- then this is a surrogate and we should fail.
- If c is 11101111, s[0] is 10111111, and s[1] >= 10111110,
- then this is an illegal character and we should fail.
- s[0] and s[1] must be 10xxxxxx */
- len = 1;
- if (c == 0xE0 && *s < 0xA0)
- return false;
- if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
- return false;
- if (c == 0xEF && s[0] == 0xBF && (s+1 >= e || s[1] >= 0xBE))
- return false;
- } else {
- /* 4-byte sequence, U+010000 to U+10FFFF
- c must be >= 11110000 (which it is) and <= 11110100
- If c is 11110000, then s[0] must be >= 10010000
- If c is 11110100, then s[0] must be < 10010000
- s[0], s[1], and s[2] must be 10xxxxxx */
- len = 2;
- if (c > 0xF4)
- return false;
- if (c == 0xF0 && *s < 0x90)
- return false;
- if (c == 0xF4 && *s >= 0x90)
- return false;
- }
+ /* Make sure the character isn't clipped. */
+ if (s + len_minus_two >= e)
+ return false;
- if (s + len >= e)
- return false; /* Missing bytes in sequence. */
+ c2 = *s;
+ /* Make sure subsequent bytes are in the range 0x80..0xBF. */
do {
if ((*s++ & 0xC0) != 0x80)
return false;
- } while (len--);
+ } while (len_minus_two--);
+
+ /* Handle special cases. */
+ switch (c) {
+ case 0xE0:
+ /* Disallow overlong 3-byte sequence. */
+ if (c2 < 0xA0)
+ return false;
+ break;
+ case 0xED:
+ /* Disallow U+D800..U+DFFF. */
+ if (c2 > 0x9F)
+ return false;
+ break;
+ case 0xF0:
+ /* Disallow overlong 4-byte sequence. */
+ if (c2 < 0x90)
+ return false;
+ break;
+ case 0xF4:
+ /* Disallow codepoints beyond U+10FFFF. */
+ if (c2 > 0x8F)
+ return false;
+ break;
+ }
}
return true;
}
-/*
- Note to future contributors: These routines are currently all under the
- MIT license. It would be nice to keep it that way :)
-*/
+/*
+ * Decode the UTF-8 sequence starting at @s into *@out and return the
+ * number of bytes it occupies (1 to 4).
+ *
+ * No validation is performed: the sequence length is chosen from the
+ * first byte alone, and the following bytes are masked with 0x3F
+ * without being checked. Depending on the first byte, up to 4 bytes
+ * starting at @s are read.
+ */
+int utf8_read_char(const char *s, uchar_t *out)
+{
+ /* Work on unsigned bytes so the range comparisons below are well-defined. */
+ const unsigned char *c = (const unsigned char*) s;
+
+ if (c[0] <= 0x7F) {
+ /* 00..7F */
+ *out = c[0];
+ return 1;
+ } else if (c[0] <= 0xDF) {
+ /* C2..DF (unless input is invalid) */
+ /* 5 payload bits from the lead byte, 6 from the continuation byte. */
+ *out = ((uchar_t)c[0] & 0x1F) << 6 |
+ ((uchar_t)c[1] & 0x3F);
+ return 2;
+ } else if (c[0] <= 0xEF) {
+ /* E0..EF */
+ /* 4 payload bits from the lead byte, 6 from each continuation byte. */
+ *out = ((uchar_t)c[0] & 0xF) << 12 |
+ ((uchar_t)c[1] & 0x3F) << 6 |
+ ((uchar_t)c[2] & 0x3F);
+ return 3;
+ } else {
+ /* F0..F4 (unless input is invalid) */
+ /* 3 payload bits from the lead byte, 6 from each continuation byte. */
+ *out = ((uchar_t)c[0] & 0x7) << 18 |
+ ((uchar_t)c[1] & 0x3F) << 12 |
+ ((uchar_t)c[2] & 0x3F) << 6 |
+ ((uchar_t)c[3] & 0x3F);
+ return 4;
+ }
+}
+
+/*
+ * Encode @unicode as UTF-8 at @out and return the number of bytes
+ * written (1 to 4). No terminating '\0' is stored.
+ *
+ * Codepoints in U+D800..U+DFFF and codepoints above U+10FFFF are
+ * replaced with REPLACEMENT_CHARACTER, which is written as a
+ * 3-byte sequence.
+ */
+int utf8_write_char(uchar_t unicode, char *out)
+{
+ unsigned char *o = (unsigned char*) out;
+
+ if (unicode <= 0x7F) {
+ /* U+0000..U+007F */
+ *o++ = unicode;
+ return 1;
+ } else if (unicode <= 0x7FF) {
+ /* U+0080..U+07FF */
+ *o++ = 0xC0 | unicode >> 6;
+ *o++ = 0x80 | (unicode & 0x3F);
+ return 2;
+ } else if (unicode <= 0xFFFF) {
+ /* U+0800..U+FFFF */
+ /* Surrogates are not encodable; substitute the replacement character. */
+ if (unicode >= 0xD800 && unicode <= 0xDFFF)
+ unicode = REPLACEMENT_CHARACTER;
+ /* Also entered via goto from the out-of-range branch at the bottom. */
+ three_byte_character:
+ *o++ = 0xE0 | unicode >> 12;
+ *o++ = 0x80 | (unicode >> 6 & 0x3F);
+ *o++ = 0x80 | (unicode & 0x3F);
+ return 3;
+ } else if (unicode <= 0x10FFFF) {
+ /* U+10000..U+10FFFF */
+ *o++ = 0xF0 | unicode >> 18;
+ *o++ = 0x80 | (unicode >> 12 & 0x3F);
+ *o++ = 0x80 | (unicode >> 6 & 0x3F);
+ *o++ = 0x80 | (unicode & 0x3F);
+ return 4;
+ } else {
+ /* U+110000... */
+ /* Beyond the Unicode range: emit the replacement character instead. */
+ unicode = REPLACEMENT_CHARACTER;
+ goto three_byte_character;
+ }
+}
+
+/*
+ * Combine a UTF-16 surrogate pair into a single codepoint.
+ *
+ * Returns 0x10000 plus the 20-bit value formed from the low 10 bits of
+ * @uc (high half) and the low 10 bits of @lc (low half). If @uc is not
+ * in 0xD800..0xDBFF or @lc is not in 0xDC00..0xDFFF, returns
+ * REPLACEMENT_CHARACTER.
+ */
+uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc)
+{
+ if (uc >= 0xD800 && uc <= 0xDBFF && lc >= 0xDC00 && lc <= 0xDFFF)
+ /* Cast before shifting so the arithmetic is done in uchar_t. */
+ return 0x10000 + ((((uchar_t)uc & 0x3FF) << 10) | (lc & 0x3FF));
+ else
+ return REPLACEMENT_CHARACTER;
+}
+
+/*
+ * Split a codepoint into a UTF-16 surrogate pair.
+ *
+ * For @unicode in U+10000..U+10FFFF, stores the high surrogate in *@uc
+ * and the low surrogate in *@lc and returns true. For any other value,
+ * stores REPLACEMENT_CHARACTER in both *@uc and *@lc and returns false.
+ */
+bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc)
+{
+ if (unicode >= 0x10000 && unicode <= 0x10FFFF) {
+ /* n is the 20-bit offset above U+10000; 10 bits go in each half. */
+ uchar_t n = unicode - 0x10000;
+ *uc = ((n >> 10) & 0x3FF) | 0xD800;
+ *lc = (n & 0x3FF) | 0xDC00;
+ return true;
+ } else {
+ *uc = *lc = REPLACEMENT_CHARACTER;
+ return false;
+ }
+}
#include <stdbool.h>
#include <stddef.h>
+#include <stdint.h>
+
+#define REPLACEMENT_CHARACTER 0xFFFD
/*
- * Validate the given UTF-8 string. If it contains '\0' characters,
- * it is still valid.
- *
- * By default, Unicode characters U+D800 thru U+DFFF will be considered
- * invalid UTF-8. However, if you set utf8_allow_surrogates to true,
- * they will be allowed. Allowing the surrogate range makes it possible
- * to losslessly encode malformed UTF-16.
+ * Type for Unicode codepoints.
+ * We need our own because wchar_t might be 16 bits.
+ */
+typedef uint32_t uchar_t;
+
+/*
+ * Validate the given UTF-8 string.
+ * If it contains '\0' characters, it is still valid.
*/
bool utf8_validate(const char *str, size_t length);
-/* Default: false */
-extern bool utf8_allow_surrogates;
+/*
+ * Read a single UTF-8 character starting at @s,
+ * returning the length, in bytes, of the character read.
+ *
+ * This function assumes input is valid UTF-8,
+ * and that there are enough characters in front of @s.
+ */
+int utf8_read_char(const char *s, uchar_t *out);
+
+/*
+ * Write a single UTF-8 character to @out,
+ * returning the length, in bytes, of the character written.
+ *
+ * @unicode should be U+0000..U+10FFFF, but not U+D800..U+DFFF.
+ * If @unicode is invalid, REPLACEMENT_CHARACTER will be emitted instead.
+ *
+ * This function will write up to 4 bytes to @out.
+ */
+int utf8_write_char(uchar_t unicode, char *out);
+
+/*
+ * Compute the Unicode codepoint of a UTF-16 surrogate pair.
+ *
+ * @uc should be 0xD800..0xDBFF, and @lc should be 0xDC00..0xDFFF.
+ * If they aren't, this function returns REPLACEMENT_CHARACTER.
+ */
+uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc);
+
+/*
+ * Construct a UTF-16 surrogate pair given a Unicode codepoint.
+ *
+ * @unicode should be U+10000..U+10FFFF.
+ * If it's not, this function returns false,
+ * and sets *uc and *lc to REPLACEMENT_CHARACTER.
+ */
+bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc);
#endif
--- /dev/null
+#include <stdint.h>
+#include <stdlib.h>
+
+/*
+ * Returns a pseudorandom 32-bit number from 0 to 2^32-1.
+ * Uses the BCPL linear congruential generator method.
+ *
+ * Used instead of system RNG to ensure tests are consistent:
+ * the state is a function-local static that always starts at 0,
+ * so every run produces the same sequence.
+ */
+static uint32_t rand32(void)
+{
+#if 0
+ /*
+ * Tests should be run with a different random function
+ * from time to time. I've found that the method below
+ * sometimes behaves poorly for testing purposes.
+ * For example, rand32() % N might only return even numbers.
+ */
+ /*
+ * NOTE(review): this disabled branch uses assert(), but this header
+ * only includes <stdint.h> and <stdlib.h>; the including file would
+ * need <assert.h> before enabling it.
+ */
+ assert(RAND_MAX == 2147483647);
+ return ((random() & 0xFFFF) << 16) | (random() & 0xFFFF);
+#else
+ /* One LCG step: state = state * 0x7FF8A3ED + 0x2AA01D31, kept in 32 bits. */
+ static uint32_t rand32_state = 0;
+ rand32_state *= (uint32_t)0x7FF8A3ED;
+ rand32_state += (uint32_t)0x2AA01D31;
+ return rand32_state;
+#endif
+}
--- /dev/null
+#include <ccan/charset/charset.c>
+#include <ccan/tap/tap.h>
+
+#include <string.h>
+
+#include "common.h"
+
+/*
+ * Testing procedure for from_surrogate_pair and to_surrogate_pair:
+ *
+ * * For each Unicode code point from 0x10000 to 0x10FFFF:
+ * - Call to_surrogate_pair, and make sure that:
+ * - It returns true.
+ * - uc is 0xD800..0xDBFF
+ * - lc is 0xDC00..0xDFFF
+ * - Call from_surrogate_pair on the pair, and make sure that
+ * it returns the original character.
+ * * For various invalid arguments to to_surrogate_pair
+ * (U+0000..U+FFFF and U+110000...):
+ * - Call to_surrogate_pair, and make sure it:
+ * - Returns false.
+ * - Sets *uc and *lc to REPLACEMENT_CHARACTER.
+ * * For various invalid arguments to from_surrogate_pair
+ * (uc: not 0xD800..0xDBFF, lc: not 0xDC00..0xDFFF):
+ * - Call from_surrogate_pair, and make sure
+ * it returns REPLACEMENT_CHARACTER.
+ */
+
+#define INVALID_TRIAL_COUNT 10000
+
+#define range(r, lo, hi) ((r) % ((hi)-(lo)+1) + (lo))
+
+/*
+ * Round-trip every codepoint U+10000..U+10FFFF through
+ * to_surrogate_pair and from_surrogate_pair.
+ *
+ * Reports exactly one test result: fail() on the first mismatch
+ * (after which the function returns), otherwise a single pass().
+ */
+static void test_valid(void)
+{
+ uchar_t unicode;
+ unsigned int uc, lc;
+
+ for (unicode = 0x10000; unicode <= 0x10FFFF; unicode++) {
+ if (to_surrogate_pair(unicode, &uc, &lc) != true) {
+ fail("to_surrogate_pair did not return true on valid input.");
+ return;
+ }
+ /* High surrogate must land in 0xD800..0xDBFF. */
+ if (!(uc >= 0xD800 && uc <= 0xDBFF)) {
+ fail("to_surrogate_pair: uc is out of range");
+ return;
+ }
+ /* Low surrogate must land in 0xDC00..0xDFFF. */
+ if (!(lc >= 0xDC00 && lc <= 0xDFFF)) {
+ fail("to_surrogate_pair: lc is out of range");
+ return;
+ }
+ /* The pair must decode back to the codepoint it came from. */
+ if (from_surrogate_pair(uc, lc) != unicode) {
+ fail("Surrogate pair conversion did not preserve original value (U+%04lX).", (unsigned long)unicode);
+ return;
+ }
+ }
+
+ pass("to_surrogate_pair and from_surrogate_pair work for all valid arguments.");
+}
+
+/*
+ * Feed INVALID_TRIAL_COUNT out-of-range codepoints to to_surrogate_pair
+ * and check that it returns false and sets both outputs to
+ * REPLACEMENT_CHARACTER.
+ *
+ * Reports exactly one test result: fail() on the first violation
+ * (after which the function returns), otherwise a single pass().
+ */
+static void test_invalid_to_surrogate_pair(void)
+{
+ long i;
+ uchar_t unicode;
+ unsigned int uc, lc;
+
+ for (i = 1; i <= INVALID_TRIAL_COUNT; i++) {
+ /* Pick either side of the valid range U+10000..U+10FFFF. */
+ if (rand32() % 2) {
+ /* Below the range: U+0000..U+FFFF. */
+ unicode = range(rand32(), 0x0, 0xFFFF);
+ } else {
+ /* Above the range: redraw until the value is >= 0x110000. */
+ do {
+ unicode = rand32();
+ } while (unicode < 0x110000);
+ }
+
+ if (to_surrogate_pair(unicode, &uc, &lc) != false) {
+ fail("to_surrogate_pair did not return false on invalid input.");
+ return;
+ }
+ if (uc != REPLACEMENT_CHARACTER || lc != REPLACEMENT_CHARACTER) {
+ fail("to_surrogate_pair did not set uc and lc to the replacement character on invalid input.");
+ return;
+ }
+ }
+
+ pass("to_surrogate_pair seems to handle invalid argument values properly.");
+}
+
+/*
+ * Feed INVALID_TRIAL_COUNT invalid pairs to from_surrogate_pair and
+ * check that it returns REPLACEMENT_CHARACTER each time.
+ *
+ * In every trial both halves are invalid: @uc is drawn from outside
+ * 0xD800..0xDBFF and @lc from outside 0xDC00..0xDFFF. All generated
+ * values lie in 0x0..0xFFFF.
+ *
+ * Reports exactly one test result: fail() on the first violation
+ * (after which the function returns), otherwise a single pass().
+ */
+static void test_invalid_from_surrogate_pair(void)
+{
+ long i;
+ unsigned int uc, lc;
+
+ for (i = 1; i <= INVALID_TRIAL_COUNT; i++) {
+ /* Choose a high half that is not a high surrogate. */
+ switch (rand32() % 3) {
+ case 0:
+ uc = range(rand32(), 0x0, 0xD7FF);
+ break;
+ case 1:
+ /* A low surrogate in the high position. */
+ uc = range(rand32(), 0xDC00, 0xDFFF);
+ break;
+ default:
+ uc = range(rand32(), 0xE000, 0xFFFF);
+ break;
+ }
+ /* Choose a low half that is not a low surrogate. */
+ switch (rand32() % 3) {
+ case 0:
+ lc = range(rand32(), 0x0, 0xD7FF);
+ break;
+ case 1:
+ /* A high surrogate in the low position. */
+ lc = range(rand32(), 0xD800, 0xDBFF);
+ break;
+ default:
+ lc = range(rand32(), 0xE000, 0xFFFF);
+ break;
+ }
+
+ if (from_surrogate_pair(uc, lc) != REPLACEMENT_CHARACTER) {
+ fail("from_surrogate_pair(0x%04X, 0x%04X) did not return the replacement character", uc, lc);
+ return;
+ }
+ }
+
+ pass("from_surrogate_pair seems to handle invalid arguments properly.");
+}
+
+/*
+ * Test driver for the surrogate pair routines.
+ * Each of the three test functions reports exactly one result,
+ * matching the plan of 3.
+ */
+int main(void)
+{
+ plan_tests(3);
+
+ test_valid();
+ test_invalid_to_surrogate_pair();
+ test_invalid_from_surrogate_pair();
+
+ return exit_status();
+}
--- /dev/null
+#include <ccan/charset/charset.c>
+#include <ccan/tap/tap.h>
+
+#include <string.h>
+
+#include "common.h"
+
+/*
+ * Testing procedure for utf8_read_char and utf8_write_char:
+ *
+ * * Generate N valid and invalid Unicode code points.
+ * * Encode them with utf8_write_char.
+ * * Copy the resulting string into a buffer sized exactly as big as
+ * the string produced. This way, Valgrind can catch buffer overflows
+ * by utf8_validate and utf8_read_char.
+ * * Validate the string with utf8_validate.
+ * * Decode the string, ensuring that:
+ * - Valid codepoints are read back.
+ * - Invalid characters are read back, but replaced
+ * with REPLACEMENT_CHARACTER.
+ * - No extra characters are read back.
+ */
+
+#define TRIAL_COUNT 1000
+#define MAX_CHARS_PER_TRIAL 100
+
+#define range(r, lo, hi) ((r) % ((hi)-(lo)+1) + (lo))
+
+/*
+ * Test driver for utf8_write_char and utf8_read_char.
+ *
+ * Runs TRIAL_COUNT trials. Each trial encodes a random mix of valid
+ * and invalid codepoints, validates the result, decodes it again and
+ * compares against what is expected to be read back. Every trial
+ * reports exactly one test result (a single pass() or a single fail()),
+ * matching plan_tests(TRIAL_COUNT).
+ */
+int main(void)
+{
+	int trial;
+
+	plan_tests(TRIAL_COUNT);
+
+	for (trial = 1; trial <= TRIAL_COUNT; trial++) {
+		int i, count;
+		/* What utf8_read_char is expected to return for each character. */
+		uchar_t codepoints[MAX_CHARS_PER_TRIAL];
+		uchar_t c;
+		bool c_valid;
+
+		/* Large enough for MAX_CHARS_PER_TRIAL characters of 4 bytes each. */
+		char write_buffer[MAX_CHARS_PER_TRIAL * 4];
+		char *o = write_buffer;
+		char *oe = write_buffer + sizeof(write_buffer);
+
+		char *string;
+		const char *s;
+		const char *e;
+
+		int len;
+
+		count = rand32() % MAX_CHARS_PER_TRIAL + 1;
+
+		for (i = 0; i < count; i++) {
+			if (o >= oe) {
+				fail("utf8_write_char: Buffer overflow (1)");
+				goto next_trial;
+			}
+
+			/* Pick one of seven codepoint classes; two of them are invalid. */
+			switch (rand32() % 7) {
+			case 0:
+				c = range(rand32(), 0x0, 0x7F);
+				c_valid = true;
+				break;
+			case 1:
+				c = range(rand32(), 0x80, 0x7FF);
+				c_valid = true;
+				break;
+			case 2:
+				c = range(rand32(), 0x800, 0xD7FF);
+				c_valid = true;
+				break;
+			case 3:
+				/* Surrogate range: not encodable. */
+				c = range(rand32(), 0xD800, 0xDFFF);
+				c_valid = false;
+				break;
+			case 4:
+				c = range(rand32(), 0xE000, 0xFFFF);
+				c_valid = true;
+				break;
+			case 5:
+				c = range(rand32(), 0x10000, 0x10FFFF);
+				c_valid = true;
+				break;
+			default:
+				/* Beyond U+10FFFF: redraw until out of range. */
+				do {
+					c = rand32();
+				} while (c < 0x110000);
+				c_valid = false;
+				break;
+			}
+
+			/* Invalid codepoints are expected to come back as U+FFFD. */
+			codepoints[i] = c_valid ? c : REPLACEMENT_CHARACTER;
+
+			len = utf8_write_char(c, o);
+			if (len < 1 || len > 4) {
+				fail("utf8_write_char: Return value is not 1 thru 4.");
+				goto next_trial;
+			}
+			o += len;
+		}
+		if (o > oe) {
+			fail("utf8_write_char: Buffer overflow (2)");
+			goto next_trial;
+		}
+
+		/*
+		 * Copy into a heap buffer of exactly the encoded size so that
+		 * a memory checker can catch reads past the end.
+		 */
+		string = malloc(o - write_buffer);
+		if (string == NULL) {
+			/* Do not hand a null pointer to memcpy; count the trial as failed. */
+			fail("malloc: Could not allocate string buffer.");
+			goto next_trial;
+		}
+		memcpy(string, write_buffer, o - write_buffer);
+		s = string;
+		e = string + (o - write_buffer);
+
+		if (!utf8_validate(s, e - s)) {
+			fail("Invalid string produced by utf8_write_char.");
+			goto next_trial_free_string;
+		}
+
+		for (i = 0; i < count; i++) {
+			if (s >= e) {
+				fail("utf8_read_char: Buffer overflow (1)");
+				goto next_trial_free_string;
+			}
+
+			len = utf8_read_char(s, &c);
+			if (len < 1 || len > 4) {
+				fail("utf8_read_char: Return value is not 1 thru 4.");
+				goto next_trial_free_string;
+			}
+			if (c != codepoints[i]) {
+				fail("utf8_read_char: Character read differs from that written.");
+				goto next_trial_free_string;
+			}
+			s += len;
+		}
+		if (s > e) {
+			fail("utf8_read_char: Buffer overflow (2)");
+			goto next_trial_free_string;
+		}
+		if (s < e) {
+			fail("utf8_read_char: Did not reach end of string.");
+			goto next_trial_free_string;
+		}
+
+		pass("Trial %d: %d characters", trial, count);
+
+	next_trial_free_string:
+		free(string);
+	next_trial:;
+	}
+
+	return exit_status();
+}
--- /dev/null
+#include <ccan/charset/charset.c>
+#include <ccan/tap/tap.h>
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "common.h"
+
+/*
+ * Make a valid or invalid Unicode character fitting in exactly @len UTF-8 bytes.
+ *
+ * @len is the number of bytes the caller will encode the result into (1 to 4).
+ * If @valid is true, the result is a value that is legitimately encoded in
+ * @len bytes. If @valid is false, the result is a value that is wrong for
+ * that length (too small, a surrogate, beyond U+10FFFF, or - for @len 1 -
+ * a byte value of 0x80 or higher).
+ * @after_clipped only affects the invalid 1-byte case; see below.
+ *
+ * One rand32() draw is used per call (more only when a surrogate has to be
+ * rejected): bit 31 acts as a coin flip and the low 31 bits pick a value
+ * within the requested range.
+ */
+static uchar_t utf8_randcode(int len, bool valid, bool after_clipped)
+{
+	uint32_t r = rand32();
+	uchar_t ret;
+
+	#define range(lo, hi) ((r & 0x7FFFFFFF) % ((hi)-(lo)+1) + (lo))
+	#define high_bit_set() (!!(r & 0x80000000))
+
+	switch (len) {
+	case 1:
+		if (valid) {
+			/* Generate a character U+0000..U+007F */
+			return r & 0x7F;
+		} else {
+			/*
+			 * Generate a character U+0080..U+00BF or U+00F8..U+00FF.
+			 *
+			 * However, don't generate U+0080..U+00BF (10xxxxxx) after a
+			 * clipped character, as that can inadvertently form a valid,
+			 * complete character.
+			 */
+			if (!after_clipped && high_bit_set())
+				return range(0x80, 0xBF);
+			else
+				return range(0xF8, 0xFF);
+		}
+	case 2:
+		if (valid) {
+			/* Generate a character U+0080..U+07FF */
+			return range(0x80, 0x7FF);
+		} else {
+			/* Generate a character U+0000..U+007F */
+			return r & 0x7F;
+		}
+	case 3:
+		if (valid) {
+			/* Generate a character U+0800..U+FFFF, but not U+D800..U+DFFF */
+			for (;;) {
+				ret = range(0x800, 0xFFFF);
+				if (ret >= 0xD800 && ret <= 0xDFFF) {
+					/* Landed on a surrogate: draw again. */
+					r = rand32();
+					continue;
+				} else {
+					break;
+				}
+			}
+			return ret;
+		} else {
+			/* Generate a character U+0000..U+07FF or U+D800..U+DFFF */
+			if (high_bit_set())
+				return r & 0x7FF;
+			else
+				return 0xD800 + (r & 0x7FF);
+		}
+	case 4:
+		if (valid) {
+			/* Generate a character U+10000..U+10FFFF */
+			return range(0x10000, 0x10FFFF);
+		} else {
+			/* Generate a character U+0000..U+FFFF or U+110000..U+1FFFFF */
+			if (high_bit_set())
+				return r & 0xFFFF;
+			else
+				return range(0x110000, 0x1FFFFF);
+		}
+	default:
+		assert(false);
+		/*
+		 * Only reached when assertions are compiled out (NDEBUG).
+		 * Return a defined value rather than falling off the end
+		 * of a non-void function.
+		 */
+		return 0;
+	}
+
+	#undef range
+	#undef high_bit_set
+}
+
+/* Encode @uc as UTF-8 using exactly @len bytes, written to @out.
+ @len should be 1 thru 4; for any other value nothing is written.
+
+ Apart from the assertions on the size of @uc, no checking is done,
+ so this can produce overlong, surrogate and out-of-range sequences.
+ For @len 1, @uc is stored as a raw byte. */
+static void utf8_encode_raw(char *out, unsigned int uc, int len)
+{
+ switch (len) {
+ case 1:
+ /* A single raw byte: 00..C1 or F8..FF. */
+ assert(uc <= 0xC1 || (uc >= 0xF8 && uc <= 0xFF));
+ *out++ = uc;
+ break;
+ case 2:
+ /* 110xxxxx 10xxxxxx: 11 payload bits. */
+ assert(uc <= 0x7FF);
+ *out++ = 0xC0 | ((uc >> 6) & 0x1F);
+ *out++ = 0x80 | (uc & 0x3F);
+ break;
+ case 3:
+ /* 1110xxxx 10xxxxxx 10xxxxxx: 16 payload bits. */
+ assert(uc <= 0xFFFF);
+ *out++ = 0xE0 | ((uc >> 12) & 0x0F);
+ *out++ = 0x80 | ((uc >> 6) & 0x3F);
+ *out++ = 0x80 | (uc & 0x3F);
+ break;
+ case 4:
+ /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: 21 payload bits. */
+ assert(uc <= 0x1FFFFF);
+ *out++ = 0xF0 | ((uc >> 18) & 0x07);
+ *out++ = 0x80 | ((uc >> 12) & 0x3F);
+ *out++ = 0x80 | ((uc >> 6) & 0x3F);
+ *out++ = 0x80 | (uc & 0x3F);
+ break;
+ }
+}
+
+#if COMPUTE_AVERAGE_LENGTH
+double total_averages;
+#endif
+
+/* Generate a UTF-8 string of the given byte length,
+ randomly deciding if it should be valid or not.
+
+ Exactly @len bytes are written to @out; no terminator is added.
+ The string is built from chunks, each of which is a valid character,
+ an invalid character, or a valid character with its tail cut off.
+
+ Return true if it's valid, false if it's not. */
+static bool utf8_mktest(char *out, int len)
+{
+ double pf;
+ uint32_t pu;
+ int n;
+ bool valid = true;
+ bool v;
+ /* True when the previous chunk was a clipped character. */
+ bool after_clipped = false;
+
+ #if COMPUTE_AVERAGE_LENGTH
+ int n_total = 0;
+ int count = 0;
+ #endif
+
+ /*
+ * Probability that, per character, it should be valid.
+ * The goal is to make utf8_mktest as a whole
+ * have a 50% chance of generating a valid string.
+ *
+ * The equation being solved is:
+ *
+ * p^n = 0.5
+ *
+ * where p is the probability that each character is valid,
+ * and n is the number of characters in the string.
+ *
+ * 2.384 is the approximate average length of each character,
+ * so len/2.384 is about how many characters this string
+ * is expected to contain.
+ */
+ pf = pow(0.5, 2.384/len);
+
+ /* Convert to uint32_t to test against rand32. */
+ pu = pf * 4294967295.0;
+
+ /* Each iteration emits one chunk of n bytes (n is at most len). */
+ for (;len > 0; len -= n, out += n) {
+ v = rand32() <= pu;
+
+ if (v) {
+ /* Generate a valid character. */
+ n = rand32() % (len < 4 ? len : 4) + 1;
+ utf8_encode_raw(out, utf8_randcode(n, true, after_clipped), n);
+ after_clipped = false;
+ } else if (rand32() % 5) {
+ /* Generate an invalid character. */
+ n = rand32() % (len < 4 ? len : 4) + 1;
+ utf8_encode_raw(out, utf8_randcode(n, false, after_clipped), n);
+ after_clipped = false;
+ } else {
+ /* Generate a clipped but otherwise valid character. */
+ char tmp[4];
+ /* Encode a complete 2..4 byte character into a scratch buffer. */
+ n = rand32() % 3 + 2;
+ utf8_encode_raw(tmp, utf8_randcode(n, true, after_clipped), n);
+ /* Drop at least one trailing byte and keep at least the first. */
+ n -= rand32() % (n-1) + 1;
+ /* Never emit more than the bytes that remain. */
+ if (n > len)
+ n = len;
+ assert(n >= 1 && n <= 3);
+ memcpy(out, tmp, n);
+ after_clipped = true;
+ }
+
+ /* One invalid or clipped chunk makes the whole string invalid. */
+ if (!v)
+ valid = false;
+
+ #if COMPUTE_AVERAGE_LENGTH
+ n_total += n;
+ count++;
+ #endif
+ }
+
+ #if COMPUTE_AVERAGE_LENGTH
+ if (count > 0)
+ total_averages += (double)n_total / count;
+ #endif
+
+ return valid;
+}
+
+/*
+ * Generate 100000 random strings of 0 to sizeof(buffer) bytes with
+ * utf8_mktest and check that utf8_validate agrees with the generator
+ * about whether each one is valid.
+ *
+ * Reports two test results: one pass()/fail() summarising the
+ * comparisons, and one ok() checking that both valid and invalid
+ * strings each made up more than a tenth of the trials.
+ */
+static void test_utf8_validate(void)
+{
+ char buffer[128];
+ int i;
+ int len;
+ bool valid;
+ int passed=0, p_valid=0, p_invalid=0, total=0;
+ int count;
+
+ count = 100000;
+
+ #if COMPUTE_AVERAGE_LENGTH
+ total_averages = 0.0;
+ #endif
+
+ for (i=0; i<count; i++) {
+ /* Lengths range from 0 (empty string) to the full buffer. */
+ len = rand32() % (sizeof(buffer) + 1);
+ valid = utf8_mktest(buffer, len);
+ if (utf8_validate(buffer, len) == valid) {
+ passed++;
+ if (valid)
+ p_valid++;
+ else
+ p_invalid++;
+ } else {
+ /* Disagreement: print a diagnostic; the summary below will fail. */
+ bool uvalid = utf8_validate(buffer, len);
+ printf("Failed: generated %s string, but utf8_validate returned %s\n",
+ valid ? "valid" : "invalid",
+ uvalid ? "true" : "false");
+ }
+ total++;
+ }
+
+ if (passed == total)
+ pass("%d valid tests, %d invalid tests", p_valid, p_invalid);
+ else
+ fail("Passed only %d out of %d tests\n", passed, total);
+
+ /* Guard against a generator that is skewed towards one outcome. */
+ ok(p_valid > count/10 && p_invalid > count/10,
+ "Valid and invalid should be balanced");
+
+ #if COMPUTE_AVERAGE_LENGTH
+ printf("Average character length: %f\n", total_averages / count);
+ #endif
+}
+
+/* Test driver for utf8_validate. */
+int main(void)
+{
+ /* test_utf8_validate reports two results: the summary and the balance check. */
+ plan_tests(2);
+
+ test_utf8_validate();
+
+ /* This exits depending on whether all tests passed */
+ return exit_status();
+}
+++ /dev/null
-#include <ccan/charset/charset.h>
-#include <ccan/charset/charset.c>
-#include <ccan/tap/tap.h>
-
-#include <assert.h>
-#include <math.h>
-#include <stdint.h>
-#include <stdio.h>
-
-/*
- * Finds a pseudorandom 32-bit number from 0 to 2^32-1 .
- * Uses the BCPL linear congruential generator method.
- *
- * Used instead of system RNG to ensure tests are consistent.
- */
-static uint32_t rand32(void)
-{
- static uint32_t rand32_state = 0;
- rand32_state *= (uint32_t)0x7FF8A3ED;
- rand32_state += (uint32_t)0x2AA01D31;
- return rand32_state;
-}
-
-/*
- * Make a Unicode character requiring exactly @len UTF-8 bytes.
- *
- * Unless utf8_allow_surrogates is set,
- * do not return a value in the range U+D800 thru U+DFFF .
- *
- * If @len is not 1 thru 4, generate an out-of-range character.
- */
-static unsigned int utf8_randcode(int len)
-{
- uint32_t r = rand32();
- unsigned int ret;
-
- switch (len) {
- case 1: return r % 0x80;
- case 2: return r % (0x800-0x80) + 0x80;
- case 3:
- for (;;) {
- ret = r % (0x10000-0x800) + 0x800;
- if ((!utf8_allow_surrogates && ret >= 0xD800 && ret <= 0xDFFF)
- || ret >= 0xFFFE)
- {
- r = rand32();
- continue;
- } else {
- break;
- }
- }
- return ret;
- case 4: return r % (0x110000-0x10000) + 0x10000;
- default:
- while (r < 0x110000)
- r = rand32();
- return r;
- }
-}
-
-static unsigned int rand_surrogate(void)
-{
- return rand32() % (0xE000 - 0xD800) + 0xD800;
-}
-
-/* Encode @uc as UTF-8 using exactly @len characters.
- @len should be 1 thru 4.
- @uc will be truncated to the bits it will go into.
- If, after bit truncation, @uc is in the wrong range for its length,
- an invalid character will be generated. */
-static void utf8_encode_raw(char *out, unsigned int uc, int len)
-{
- switch (len) {
- case 1:
- *out++ = uc & 0x7F;
- break;
- case 2:
- *out++ = 0xC0 | ((uc >> 6) & 0x1F);
- *out++ = 0x80 | (uc & 0x3F);
- break;
- case 3:
- *out++ = 0xE0 | ((uc >> 12) & 0x0F);
- *out++ = 0x80 | ((uc >> 6) & 0x3F);
- *out++ = 0x80 | (uc & 0x3F);
- break;
- case 4:
- *out++ = 0xF0 | ((uc >> 18) & 0x07);
- *out++ = 0x80 | ((uc >> 12) & 0x3F);
- *out++ = 0x80 | ((uc >> 6) & 0x3F);
- *out++ = 0x80 | (uc & 0x3F);
- break;
- }
-}
-
-/* Generate a UTF-8 string of the given byte length,
- randomly deciding if it should be valid or not.
-
- Return true if it's valid, false if it's not. */
-static bool utf8_mktest(char *out, int len)
-{
- int m, n;
- bool valid = true;
- bool v;
- double pf;
- uint32_t pu;
-
- /* Probability that, per character, it should be valid.
- The goal is to make utf8_mktest as a whole
- have a 50% chance of generating a valid string. */
- pf = pow(0.5, 2.5/len);
-
- /* Convert to uint32_t to test against rand32. */
- pu = pf * 4294967295.0;
-
- for (;len; len -= n) {
- v = len == 1 || rand32() <= pu;
- m = len < 4 ? len : 4;
-
- if (v) {
- /* Generate a valid character. */
- n = rand32() % m + 1;
- utf8_encode_raw(out, utf8_randcode(n), n);
- } else {
- /* Generate an invalid character. */
- assert(m >= 2);
- n = rand32() % (m-1) + 2;
- switch (n) {
- case 2:
- utf8_encode_raw(out, utf8_randcode(1), n);
- break;
- case 3:
- if (!utf8_allow_surrogates && (rand32() & 1))
- utf8_encode_raw(out, rand_surrogate(), n);
- else
- utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n);
- break;
- case 4:
- utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n);
- break;
- }
- valid = false;
- }
- out += n;
- }
-
- return valid;
-}
-
-static void test_utf8_validate(bool allow_surrogates)
-{
- char buffer[1024];
- int i;
- int len;
- bool valid;
- int passed=0, p_valid=0, p_invalid=0, total=0;
- int count;
-
- count = 10000;
-
- utf8_allow_surrogates = allow_surrogates;
-
- for (i=0; i<count; i++) {
- len = rand32() % (1024 + 1);
- valid = utf8_mktest(buffer, len);
- if (utf8_validate(buffer, len) == valid) {
- passed++;
- if (valid)
- p_valid++;
- else
- p_invalid++;
- }
- total++;
- }
-
- if (passed == total) {
- printf("PASS: %d valid tests, %d invalid tests\n",
- p_valid, p_invalid);
- } else {
- printf("FAIL: Passed %d out of %d tests\n", passed, total);
- }
-
- ok(passed, "utf8_validate test passed%s",
- !allow_surrogates ? " (surrogates disallowed)" : "");
-
- ok(p_valid > count/10 && p_invalid > count/10,
- " valid/invalid are balanced");
-}
-
-int main(void)
-{
- /* This is how many tests you plan to run */
- plan_tests(4);
-
- test_utf8_validate(false);
- test_utf8_validate(true);
-
- /* This exits depending on whether all tests passed */
- return exit_status();
-}