Joey's charset validation module.
author Joseph Adams <joeyadams3.14159@gmail.com>
Wed, 31 Mar 2010 12:43:16 +0000 (23:13 +1030)
committer Rusty Russell <rusty@rustcorp.com.au>
Wed, 31 Mar 2010 12:43:16 +0000 (23:13 +1030)
ccan/charset/_info [new file with mode: 0644]
ccan/charset/charset.c [new file with mode: 0644]
ccan/charset/charset.h [new file with mode: 0644]
ccan/charset/test/run.c [new file with mode: 0644]

diff --git a/ccan/charset/_info b/ccan/charset/_info
new file mode 100644 (file)
index 0000000..4319ecb
--- /dev/null
@@ -0,0 +1,63 @@
+#include <stdio.h>
+#include <string.h>
+#include "config.h"
+
+/**
+ * charset - character set conversion and validation routines
+ *
+ * This module provides a collection (well, only one, at the moment) of
+ * well-tested routines for dealing with character set nonsense.
+ *
+ * Validation functions:
+ *  - bool utf8_validate(const char *str, size_t length);
+ *
+ * Example:
+ *     #include <err.h>
+ *     #include <stdio.h>
+ *     #include <string.h>
+ *     #include <ccan/charset/charset.h>
+ *     #include <ccan/grab_file/grab_file.h>
+ *     #include <ccan/talloc/talloc.h> // For talloc_free()
+ *
+ *     int main(int argc, char *argv[])
+ *     {
+ *             size_t len;
+ *             char *file;
+ *             bool valid;
+ *
+ *             if (argc != 2)
+ *                     err(1, "Expected exactly one argument");
+ *
+ *             file = grab_file(NULL, argv[1], &len);
+ *             if (!file)
+ *                     err(1, "Could not read file %s", argv[1]);
+ *
+ *             valid = utf8_validate(file, len);
+ *             printf("File contents are %s UTF-8\n", valid ? "valid" : "invalid");
+ *
+ *             talloc_free(file);
+ *
+ *             return 0;
+ *     }
+ *
+ * Author: Joey Adams
+ * Licence: MIT
+ */
+int main(int argc, char *argv[])
+{
+       const char *query;
+
+       /* Exactly one argument is expected. */
+       if (argc != 2)
+               return 1;
+       query = argv[1];
+
+       /* "depends" prints nothing and succeeds. */
+       if (strcmp(query, "depends") == 0)
+               return 0;
+       /* Anything other than "libs" is rejected. */
+       if (strcmp(query, "libs") != 0)
+               return 1;
+       printf("m\n"); /* Needed for the pow() invocation in run.c */
+       return 0;
+}
diff --git a/ccan/charset/charset.c b/ccan/charset/charset.c
new file mode 100644 (file)
index 0000000..6c21df3
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+  Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
+  All rights reserved.
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#include "charset.h"
+
+/* When true, utf8_validate() accepts encoded surrogates (U+D800..U+DFFF). */
+bool utf8_allow_surrogates = false;
+
+/* Check that the length bytes at str form well-formed UTF-8.  Embedded NUL
+   bytes are accepted.  Overlong encodings and code points above U+10FFFF
+   are rejected; surrogates (U+D800..U+DFFF) are rejected unless
+   utf8_allow_surrogates is true.  Returns true for an empty buffer. */
+bool utf8_validate(const char *str, size_t length)
+{
+       const unsigned char *s = (const unsigned char*)str;
+       const unsigned char *e = s + length;
+
+       while (s < e) {
+               unsigned char c = *s++;
+               unsigned int len; /* number of bytes in sequence - 2 */
+               /* If character is ASCII, move on. */
+               if (c < 0x80)
+                       continue;
+               if (s >= e)
+                       return false; /* Missing bytes in sequence. */
+
+               if (c < 0xE0) {
+                       /* 2-byte sequence, U+0080 to U+07FF.
+                          c must be 11000010 or higher; this also rejects
+                          stray continuation bytes and overlong forms. */
+                       len = 0;
+                       if (c < 0xC2)
+                               return false;
+               } else if (c < 0xF0) {
+                       /* 3-byte sequence, U+0800 to U+FFFF.
+                          If c is 11100000, s[0] must be >= 10100000,
+                          otherwise the encoding is overlong.
+                          If c is 11101101 and s[0] is >= 10100000, this is a
+                          surrogate: fail unless utf8_allow_surrogates is set. */
+                       len = 1;
+                       if (c == 0xE0 && *s < 0xA0)
+                               return false;
+                       if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
+                               return false;
+               } else {
+                       /* 4-byte sequence, U+010000 to U+10FFFF.
+                          c must be <= 11110100.
+                          If c is 11110000, s[0] must be >= 10010000.
+                          If c is 11110100, s[0] must be < 10010000. */
+                       len = 2;
+                       if (c > 0xF4)
+                               return false;
+                       if (c == 0xF0 && *s < 0x90)
+                               return false;
+                       if (c == 0xF4 && *s >= 0x90)
+                               return false;
+               }
+
+               /* Compare the remaining byte count instead of forming s + len,
+                  which could point more than one past the end (undefined). */
+               if ((size_t)(e - s) <= len)
+                       return false; /* Missing bytes in sequence. */
+               do {
+                       if ((*s++ & 0xC0) != 0x80)
+                               return false;
+               } while (len--);
+       }
+
+       return true;
+}
+
+/*
+  Note to future contributors: These routines are currently all under the
+    MIT license.  It would be nice to keep it that way :)
+*/
diff --git a/ccan/charset/charset.h b/ccan/charset/charset.h
new file mode 100644 (file)
index 0000000..74317fc
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+  Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
+  All rights reserved.
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#ifndef CCAN_CHARSET_H
+#define CCAN_CHARSET_H
+
+#include <stdbool.h>
+#include <stddef.h>
+
+/*
+ * Check whether the @length bytes starting at @str are valid UTF-8.
+ * Returns true if so.  Embedded '\0' bytes do not make a string invalid,
+ * so @str need not be NUL-terminated.
+ *
+ * By default, encodings of U+D800 thru U+DFFF are considered invalid.
+ * Setting utf8_allow_surrogates to true allows them, which makes it
+ * possible to losslessly encode malformed UTF-16.
+ */
+bool utf8_validate(const char *str, size_t length);
+
+/* Global switch read by utf8_validate() on each call.  Default: false */
+extern bool utf8_allow_surrogates;
+
+#endif
diff --git a/ccan/charset/test/run.c b/ccan/charset/test/run.c
new file mode 100644 (file)
index 0000000..9f3f900
--- /dev/null
@@ -0,0 +1,198 @@
+#include <ccan/charset/charset.h>
+#include <ccan/charset/charset.c>
+#include <ccan/tap/tap.h>
+
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/*
+ * Finds a pseudorandom 32-bit number from 0 to 2^32-1 .
+ * Uses the BCPL linear congruential generator method.
+ *
+ * Used instead of system RNG to ensure tests are consistent.
+ */
+static uint32_t rand32(void)
+{
+       /* Generator state; starts at 0, so every run yields the same sequence. */
+       static uint32_t state = 0;
+       state = state * (uint32_t)0x7FF8A3ED + (uint32_t)0x2AA01D31;
+       return state;
+}
+
+/*
+ * Make a Unicode character requiring exactly @len UTF-8 bytes.
+ *
+ * Unless utf8_allow_surrogates is set,
+ * do not return a value in the range U+D800 thru U+DFFF .
+ *
+ * If @len is not 1 thru 4, generate an out-of-range character.
+ */
+static unsigned int utf8_randcode(int len)
+{
+       uint32_t draw = rand32();
+       unsigned int code;
+
+       if (len == 1)
+               return draw % 0x80; /* U+0000..U+007F */
+       if (len == 2)
+               return draw % (0x800-0x80) + 0x80; /* U+0080..U+07FF */
+       if (len == 4)
+               return draw % (0x110000-0x10000) + 0x10000; /* U+10000..U+10FFFF */
+       if (len != 3) {
+               /* Unsupported length: redraw until the value exceeds U+10FFFF. */
+               while (draw < 0x110000)
+                       draw = rand32();
+               return draw;
+       }
+
+       /* U+0800..U+FFFF; redraw while the result is a surrogate
+          that the current utf8_allow_surrogates setting forbids. */
+       code = draw % (0x10000-0x800) + 0x800;
+       while (!utf8_allow_surrogates && code >= 0xD800 && code <= 0xDFFF) {
+               draw = rand32();
+               code = draw % (0x10000-0x800) + 0x800;
+       }
+       return code;
+}
+
+static unsigned int rand_surrogate(void)
+{
+       return 0xD800 + rand32() % (0xE000 - 0xD800); /* lands in U+D800..U+DFFF */
+}
+
+/* Encode @uc as UTF-8 using exactly @len characters.
+   @len should be 1 thru 4.
+   @uc will be truncated to the bits it will go into.
+   If, after bit truncation, @uc is in the wrong range for its length,
+   an invalid character will be generated. */
+static void utf8_encode_raw(char *out, unsigned int uc, int len)
+{
+       int i;
+
+       if (len < 1 || len > 4)
+               return; /* Unsupported lengths write nothing. */
+       if (len == 1) {
+               out[0] = uc & 0x7F;
+               return;
+       }
+       /* Fill continuation bytes from the end, 6 low bits of uc apiece. */
+       for (i = len - 1; i > 0; i--) {
+               out[i] = 0x80 | (uc & 0x3F);
+               uc >>= 6;
+       }
+       /* Lead byte: length marker plus the payload bits that still fit. */
+       if (len == 2)
+               out[0] = 0xC0 | (uc & 0x1F);
+       else if (len == 3)
+               out[0] = 0xE0 | (uc & 0x0F);
+       else
+               out[0] = 0xF0 | (uc & 0x07);
+}
+
+/* Fill @out with exactly @len bytes of test data that is either valid
+   or invalid UTF-8, chosen at random.
+
+   @out: destination buffer with room for at least @len bytes; it is
+         not NUL-terminated.
+   @len: number of bytes to generate.
+
+   Return true if the generated string is valid UTF-8 under the current
+   utf8_allow_surrogates setting, false if it is not. */
+static bool utf8_mktest(char *out, int len)
+{
+       bool all_valid = true;
+       /* Per-character probability of emitting a valid character, chosen so
+          that the string as a whole is valid roughly half of the time. */
+       double p_valid = pow(0.5, 2.5/len);
+       /* The same probability scaled to the range of rand32(). */
+       uint32_t threshold = p_valid * 4294967295.0;
+       int nbytes;
+
+       while (len) {
+               /* A lone remaining byte can only hold a valid ASCII character. */
+               bool make_valid = len == 1 || rand32() <= threshold;
+               int max_bytes = len < 4 ? len : 4;
+               unsigned int code;
+
+               if (make_valid) {
+                       /* A code point encoded at its natural length. */
+                       nbytes = rand32() % max_bytes + 1;
+                       code = utf8_randcode(nbytes);
+               } else {
+                       /* Invalid characters need at least two bytes. */
+                       assert(max_bytes >= 2);
+                       nbytes = rand32() % (max_bytes-1) + 2;
+                       if (nbytes == 2) {
+                               /* Overlong: an ASCII code point in two bytes. */
+                               code = utf8_randcode(1);
+                       } else if (nbytes == 3 && !utf8_allow_surrogates && (rand32() & 1)) {
+                               /* A surrogate, which is currently disallowed. */
+                               code = rand_surrogate();
+                       } else {
+                               /* Overlong: a code point that needs fewer bytes. */
+                               code = utf8_randcode(rand32() % (nbytes-1) + 1);
+                       }
+                       all_valid = false;
+               }
+
+               utf8_encode_raw(out, code, nbytes);
+               out += nbytes;
+               len -= nbytes;
+       }
+
+       return all_valid;
+}
+
+/* Run 10000 generated cases through utf8_validate() with
+   utf8_allow_surrogates set to @allow_surrogates, and report two tap
+   results: whether every verdict matched the generator's, and whether
+   both valid and invalid inputs were well represented. */
+static void test_utf8_validate(bool allow_surrogates)
+{
+       char buffer[1024];
+       const int count = 10000;
+       int passed = 0, p_valid = 0, p_invalid = 0, total = 0;
+       int i;
+
+       utf8_allow_surrogates = allow_surrogates;
+       for (i = 0; i < count; i++) {
+               int len = rand32() % (sizeof(buffer) + 1);
+               bool valid = utf8_mktest(buffer, len);
+               if (utf8_validate(buffer, len) == valid) {
+                       passed++;
+                       if (valid)
+                               p_valid++;
+                       else
+                               p_invalid++;
+               }
+               total++;
+       }
+
+       if (passed == total) {
+               printf("PASS:  %d valid tests, %d invalid tests\n",
+                       p_valid, p_invalid);
+       } else {
+               printf("FAIL:  Passed %d out of %d tests\n", passed, total);
+       }
+
+       /* Require every case to pass; a bare "passed" would report success
+          as soon as a single case happened to match. */
+       ok(passed == total, "utf8_validate test passed%s",
+               !allow_surrogates ? " (surrogates disallowed)" : "");
+       ok(p_valid > count/10 && p_invalid > count/10,
+               "   valid/invalid are balanced");
+}
+
+int main(void)
+{
+       static const bool surrogate_modes[] = { false, true };
+       size_t i;
+
+       /* Each test_utf8_validate() call reports two results. */
+       plan_tests(4);
+       for (i = 0; i < sizeof(surrogate_modes) / sizeof(surrogate_modes[0]); i++)
+               test_utf8_validate(surrogate_modes[i]);
+       return exit_status(); /* exits depending on whether all tests passed */
+}