From: Joseph Adams Date: Wed, 31 Mar 2010 12:43:16 +0000 (+1030) Subject: Joey's charset validation module. X-Git-Url: https://git.ozlabs.org/?p=ccan;a=commitdiff_plain;h=c8c69dc68792e85b14646e8a8219dae923b34feb;ds=sidebyside Joey's charset validation module. --- diff --git a/ccan/charset/_info b/ccan/charset/_info new file mode 100644 index 00000000..4319ecb3 --- /dev/null +++ b/ccan/charset/_info @@ -0,0 +1,63 @@ +#include +#include +#include "config.h" + +/** + * charset - character set conversion and validation routines + * + * This module provides a collection (well, only one, at the moment) of + * well-tested routines for dealing with character set nonsense. + * + * Validation functions: + * - bool utf8_validate(const char *str, size_t length); + * + * Example: + * #include + * #include + * #include + * #include + * #include + * #include // For talloc_free() + * + * int main(int argc, char *argv[]) + * { + * size_t len; + * char *file; + * bool valid; + * + * if (argc != 2) + * err(1, "Expected exactly one argument"); + * + * file = grab_file(NULL, argv[1], &len); + * if (!file) + * err(1, "Could not read file %s", argv[1]); + * + * valid = utf8_validate(file, len); + * printf("File contents are %s UTF-8\n", valid ? "valid" : "invalid"); + * + * talloc_free(file); + * + * return 0; + * } + * + * Author: Joey Adams + * Licence: MIT + */ +int main(int argc, char *argv[]) +{ + /* Expect exactly one argument */ + if (argc != 2) + return 1; + + if (strcmp(argv[1], "depends") == 0) { + /* Nothing */ + return 0; + } + + if (strcmp(argv[1], "libs") == 0) { + printf("m\n"); /* Needed for the pow() invocation in run.c */ + return 0; + } + + return 1; +} diff --git a/ccan/charset/charset.c b/ccan/charset/charset.c new file mode 100644 index 00000000..6c21df38 --- /dev/null +++ b/ccan/charset/charset.c @@ -0,0 +1,95 @@ +/* + Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com) + All rights reserved. 
/*
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
  in the Software without restriction, including without limitation the rights
  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.
*/

/* In charset.c the code below is preceded by: #include "charset.h"
   (which supplies bool, size_t and the public declarations). */

/* When true, the surrogate range U+D800..U+DFFF is accepted by
   utf8_validate(). Default: false. */
bool utf8_allow_surrogates = false;

/*
 * Check whether the @length bytes starting at @str are well-formed UTF-8.
 *
 * @str:    buffer to check; it need not be NUL-terminated and may contain
 *          '\0' bytes (they are ordinary ASCII characters here).
 * @length: number of bytes in @str.
 *
 * Returns true if every byte belongs to a complete, shortest-form sequence
 * encoding a code point in U+0000..U+10FFFF, false otherwise.  Code points
 * U+D800..U+DFFF are rejected unless utf8_allow_surrogates is true.
 * An empty buffer is valid.
 */
bool utf8_validate(const char *str, size_t length)
{
	const unsigned char *s = (const unsigned char *)str;
	const unsigned char *e = s + length;

	while (s < e) {
		unsigned char c = *s++;
		unsigned int len; /* number of bytes in sequence - 2 */

		/* If character is ASCII, move on. */
		if (c < 0x80)
			continue;

		/* The range checks below read *s, so at least one more
		   byte must be present. */
		if (s >= e)
			return false; /* Missing bytes in sequence. */

		if (c < 0xE0) {
			/* 2-byte sequence, U+0080 to U+07FF
			   c must be 11000010 or higher
			   s[0] must be 10xxxxxx */
			len = 0;
			if (c < 0xC2)
				return false;
		} else if (c < 0xF0) {
			/* 3-byte sequence, U+0800 to U+FFFF
			   Note that the surrogate range is U+D800 to U+DFFF
			   c must be >= 11100000 (which it is)
			   If c is 11100000, then s[0] must be >= 10100000
			   If the global parameter utf8_allow_surrogates is false:
			      If c is 11101101 and s[0] is >= 10100000,
			         then this is a surrogate and we should fail.
			   s[0] and s[1] must be 10xxxxxx */
			len = 1;
			if (c == 0xE0 && *s < 0xA0)
				return false;
			if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
				return false;
		} else {
			/* 4-byte sequence, U+010000 to U+10FFFF
			   c must be >= 11110000 (which it is) and <= 11110100
			   If c is 11110000, then s[0] must be >= 10010000
			   If c is 11110100, then s[0] must be < 10010000
			   s[0], s[1], and s[2] must be 10xxxxxx */
			len = 2;
			if (c > 0xF4)
				return false;
			if (c == 0xF0 && *s < 0x90)
				return false;
			if (c == 0xF4 && *s >= 0x90)
				return false;
		}

		/* len + 1 continuation bytes are required.  Compare against
		   the number of bytes left instead of computing s + len:
		   that pointer could lie beyond one-past-the-end of the
		   buffer, which is undefined behaviour. */
		if ((size_t)(e - s) <= len)
			return false; /* Missing bytes in sequence. */

		/* Every remaining byte of the sequence must be 10xxxxxx. */
		do {
			if ((*s++ & 0xC0) != 0x80)
				return false;
		} while (len--);
	}

	return true;
}

/*
  Note to future contributors: These routines are currently all under the
  MIT license. It would be nice to keep it that way :)
*/

/* diff --git a/ccan/charset/charset.h b/ccan/charset/charset.h new file mode 100644 index 00000000..74317fce --- /dev/null +++ b/ccan/charset/charset.h @@ -0,0 +1,44 @@ */

/*
  Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
  All rights reserved.
*/
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#ifndef CCAN_CHARSET_H +#define CCAN_CHARSET_H + +#include +#include + +/* + * Validate the given UTF-8 string. If it contains '\0' characters, + * it is still valid. + * + * By default, Unicode characters U+D800 thru U+DFFF will be considered + * invalid UTF-8. However, if you set utf8_allow_surrogates to true, + * they will be allowed. Allowing the surrogate range makes it possible + * to losslessly encode malformed UTF-16. + */ +bool utf8_validate(const char *str, size_t length); + +/* Default: false */ +extern bool utf8_allow_surrogates; + +#endif diff --git a/ccan/charset/test/run.c b/ccan/charset/test/run.c new file mode 100644 index 00000000..9f3f9007 --- /dev/null +++ b/ccan/charset/test/run.c @@ -0,0 +1,198 @@ +#include +#include +#include + +#include +#include +#include +#include + +/* + * Finds a pseudorandom 32-bit number from 0 to 2^32-1 . + * Uses the BCPL linear congruential generator method. 
+ * + * Used instead of system RNG to ensure tests are consistent. + */ +static uint32_t rand32(void) +{ + static uint32_t rand32_state = 0; + rand32_state *= (uint32_t)0x7FF8A3ED; + rand32_state += (uint32_t)0x2AA01D31; + return rand32_state; +} + +/* + * Make a Unicode character requiring exactly @len UTF-8 bytes. + * + * Unless utf8_allow_surrogates is set, + * do not return a value in the range U+D800 thru U+DFFF . + * + * If @len is not 1 thru 4, generate an out-of-range character. + */ +static unsigned int utf8_randcode(int len) +{ + uint32_t r = rand32(); + unsigned int ret; + + switch (len) { + case 1: return r % 0x80; + case 2: return r % (0x800-0x80) + 0x80; + case 3: + for (;;) { + ret = r % (0x10000-0x800) + 0x800; + if (!utf8_allow_surrogates && ret >= 0xD800 && ret <= 0xDFFF) + { + r = rand32(); + continue; + } else { + break; + } + } + return ret; + case 4: return r % (0x110000-0x10000) + 0x10000; + default: + while (r < 0x110000) + r = rand32(); + return r; + } +} + +static unsigned int rand_surrogate(void) +{ + return rand32() % (0xE000 - 0xD800) + 0xD800; +} + +/* Encode @uc as UTF-8 using exactly @len characters. + @len should be 1 thru 4. + @uc will be truncated to the bits it will go into. + If, after bit truncation, @uc is in the wrong range for its length, + an invalid character will be generated. */ +static void utf8_encode_raw(char *out, unsigned int uc, int len) +{ + switch (len) { + case 1: + *out++ = uc & 0x7F; + break; + case 2: + *out++ = 0xC0 | ((uc >> 6) & 0x1F); + *out++ = 0x80 | (uc & 0x3F); + break; + case 3: + *out++ = 0xE0 | ((uc >> 12) & 0x0F); + *out++ = 0x80 | ((uc >> 6) & 0x3F); + *out++ = 0x80 | (uc & 0x3F); + break; + case 4: + *out++ = 0xF0 | ((uc >> 18) & 0x07); + *out++ = 0x80 | ((uc >> 12) & 0x3F); + *out++ = 0x80 | ((uc >> 6) & 0x3F); + *out++ = 0x80 | (uc & 0x3F); + break; + } +} + +/* Generate a UTF-8 string of the given byte length, + randomly deciding if it should be valid or not. 
+ + Return true if it's valid, false if it's not. */ +static bool utf8_mktest(char *out, int len) +{ + int m, n; + bool valid = true; + bool v; + double pf; + uint32_t pu; + + /* Probability that, per character, it should be valid. + The goal is to make utf8_mktest as a whole + have a 50% chance of generating a valid string. */ + pf = pow(0.5, 2.5/len); + + /* Convert to uint32_t to test against rand32. */ + pu = pf * 4294967295.0; + + for (;len; len -= n) { + /* With a single byte left only a valid character can be emitted: every invalid sequence generated below is at least 2 bytes long. m caps the sequence length at the bytes remaining (at most 4). */ v = len == 1 || rand32() <= pu; + m = len < 4 ? len : 4; + + if (v) { + /* Generate a valid character. */ + n = rand32() % m + 1; + utf8_encode_raw(out, utf8_randcode(n), n); + } else { + /* Generate an invalid character. */ + assert(m >= 2); + n = rand32() % (m-1) + 2; + switch (n) { + case 2: + /* Overlong: a code point chosen for 1 byte, written as 2 bytes. */ utf8_encode_raw(out, utf8_randcode(1), n); + break; + case 3: + /* Either a surrogate (only while surrogates are disallowed) or an overlong form of a code point chosen for 1 or 2 bytes. */ if (!utf8_allow_surrogates && (rand32() & 1)) + utf8_encode_raw(out, rand_surrogate(), n); + else + utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n); + break; + case 4: + /* Overlong: a code point chosen for 1 to 3 bytes, written as 4 bytes. */ utf8_encode_raw(out, utf8_randcode(rand32() % (n-1) + 1), n); + break; + } + valid = false; + } + out += n; + } + + return valid; +} + +/* Sets utf8_allow_surrogates to @allow_surrogates, then runs the checks below with count = 10000. */ static void test_utf8_validate(bool allow_surrogates) +{ + char buffer[1024]; + int i; + int len; + bool valid; + int passed=0, p_valid=0, p_invalid=0, total=0; + int count; + + count = 10000; + + utf8_allow_surrogates = allow_surrogates; + + /* NOTE(review): the text between "for (i=0; i" and "count/10" appears to be truncated in this copy (loop condition, loop body and the start of the result checks are missing); restore it from the upstream commit before building. */ for (i=0; i count/10 && p_invalid > count/10, + " valid/invalid are balanced"); +} + +/* Test entry point: plans 4 results, then runs test_utf8_validate once with surrogates rejected and once with them allowed. */ int main(void) +{ + /* This is how many tests you plan to run */ + plan_tests(4); + + test_utf8_validate(false); + test_utf8_validate(true); + + /* This exits depending on whether all tests passed */ + return exit_status(); +}