]> git.ozlabs.org Git - ccan/blobdiff - ccan/charset/test/run-utf8_validate.c
charset: Rewrote utf8_validate, and added four new functions:
[ccan] / ccan / charset / test / run-utf8_validate.c
diff --git a/ccan/charset/test/run-utf8_validate.c b/ccan/charset/test/run-utf8_validate.c
new file mode 100644 (file)
index 0000000..3718b32
--- /dev/null
@@ -0,0 +1,256 @@
+#include <ccan/charset/charset.c>
+#include <ccan/tap/tap.h>
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "common.h"
+
+/* Make a valid or invalid Unicode character fitting in exactly @len UTF-8 bytes. */
+static uchar_t utf8_randcode(int len, bool valid, bool after_clipped)
+{
+       uint32_t r = rand32();
+       uchar_t ret;
+       
+       #define range(lo, hi)  ((r & 0x7FFFFFFF) % ((hi)-(lo)+1) + (lo))
+       #define high_bit_set() (!!(r & 0x80000000))
+       
+       switch (len) {
+               case 1:
+                       if (valid) {
+                               /* Generate a character U+0000..U+007F */
+                               return r & 0x7F;
+                       } else {
+                               /*
+                                * Generate a character U+0080..U+00BF or U+00F8..U+00FF.
+                                *
+                                * However, don't generate U+0080..U+00BF (10xxxxxx) after a
+                                * clipped character, as that can inadvertently form a valid,
+                                * complete character.
+                                */
+                               if (!after_clipped && high_bit_set())
+                                       return range(0x80, 0xBF);
+                               else
+                                       return range(0xF8, 0xFF);
+                       }
+               case 2:
+                       if (valid) {
+                               /* Generate a character U+0080..U+07FF */
+                               return range(0x80, 0x7FF);
+                       } else {
+                               /* Generate a character U+0000..U+007F */
+                               return r & 0x7F;
+                       }
+               case 3:
+                       if (valid) {
+                               /* Generate a character U+0800..U+FFFF, but not U+D800..U+DFFF */
+                               for (;;) {
+                                       ret = range(0x800, 0xFFFF);
+                                       if (ret >= 0xD800 && ret <= 0xDFFF) {
+                                               r = rand32();
+                                               continue;
+                                       } else {
+                                               break;
+                                       }
+                               }
+                               return ret;
+                       } else {
+                               /* Generate a character U+0000..U+07FF or U+D800..U+DFFF */
+                               if (high_bit_set())
+                                       return r & 0x7FF;
+                               else
+                                       return 0xD800 + (r & 0x7FF);
+                       }
+               case 4:
+                       if (valid) {
+                               /* Generate a character U+10000..U+10FFFF */
+                               return range(0x10000, 0x10FFFF);
+                       } else {
+                               /* Generate a character U+0000..0xFFFF or U+110000..U+1FFFFF */
+                               if (high_bit_set())
+                                       return r & 0xFFFF;
+                               else
+                                       return range(0x110000, 0x1FFFFF);
+                       }
+               default:
+                       assert(false);
+       }
+       
+       #undef range
+       #undef high_bit_set
+}
+
+/* Encode @uc as UTF-8 using exactly @len characters.
+   @len should be 1 thru 4. */
+static void utf8_encode_raw(char *out, unsigned int uc, int len)
+{
+       switch (len) {
+               case 1:
+                       assert(uc <= 0xC1 || (uc >= 0xF8 && uc <= 0xFF));
+                       *out++ = uc;
+                       break;
+               case 2:
+                       assert(uc <= 0x7FF);
+                       *out++ = 0xC0 | ((uc >> 6) & 0x1F);
+                       *out++ = 0x80 | (uc & 0x3F);
+                       break;
+               case 3:
+                       assert(uc <= 0xFFFF);
+                       *out++ = 0xE0 | ((uc >> 12) & 0x0F);
+                       *out++ = 0x80 | ((uc >> 6) & 0x3F);
+                       *out++ = 0x80 | (uc & 0x3F);
+                       break;
+               case 4:
+                       assert(uc <= 0x1FFFFF);
+                       *out++ = 0xF0 | ((uc >> 18) & 0x07);
+                       *out++ = 0x80 | ((uc >> 12) & 0x3F);
+                       *out++ = 0x80 | ((uc >> 6) & 0x3F);
+                       *out++ = 0x80 | (uc & 0x3F);
+                       break;
+       }
+}
+
+#if COMPUTE_AVERAGE_LENGTH
+double total_averages;
+#endif
+
+/* Generate a UTF-8 string of the given byte length,
+   randomly deciding if it should be valid or not.
+   
+   Return true if it's valid, false if it's not. */
+static bool utf8_mktest(char *out, int len)
+{
+       double pf;
+       uint32_t pu;
+       int n;
+       bool valid = true;
+       bool v;
+       bool after_clipped = false;
+       
+       #if COMPUTE_AVERAGE_LENGTH
+       int n_total = 0;
+       int count = 0;
+       #endif
+       
+       /*
+        * Probability that, per character, it should be valid.
+        * The goal is to make utf8_mktest as a whole
+        * have a 50% chance of generating a valid string.
+        *
+        * The equation being solved is:
+        *
+        *     p^n = 0.5
+        *
+        * where p is the probability that each character is valid,
+        * and n is the number of characters in the string.
+        *
+        * 2.384 is the approximate average length of each character,
+        * so len/2.384 is about how many characters this string
+        * is expected to contain.
+        */
+       pf = pow(0.5, 2.384/len);
+       
+       /* Convert to uint32_t to test against rand32. */
+       pu = pf * 4294967295.0;
+       
+       for (;len > 0; len -= n, out += n) {
+               v = rand32() <= pu;
+               
+               if (v) {
+                       /* Generate a valid character. */
+                       n = rand32() % (len < 4 ? len : 4) + 1;
+                       utf8_encode_raw(out, utf8_randcode(n, true, after_clipped), n);
+                       after_clipped = false;
+               } else if (rand32() % 5) {
+                       /* Generate an invalid character. */
+                       n = rand32() % (len < 4 ? len : 4) + 1;
+                       utf8_encode_raw(out, utf8_randcode(n, false, after_clipped), n);
+                       after_clipped = false;
+               } else {
+                       /* Generate a clipped but otherwise valid character. */
+                       char tmp[4];
+                       n = rand32() % 3 + 2;
+                       utf8_encode_raw(tmp, utf8_randcode(n, true, after_clipped), n);
+                       n -= rand32() % (n-1) + 1;
+                       if (n > len)
+                               n = len;
+                       assert(n >= 1 && n <= 3);
+                       memcpy(out, tmp, n);
+                       after_clipped = true;
+               }
+               
+               if (!v)
+                       valid = false;
+               
+               #if COMPUTE_AVERAGE_LENGTH
+               n_total += n;
+               count++;
+               #endif
+       }
+       
+       #if COMPUTE_AVERAGE_LENGTH
+       if (count > 0)
+               total_averages += (double)n_total / count;
+       #endif
+       
+       return valid;
+}
+
+static void test_utf8_validate(void)
+{
+       char buffer[128];
+       int i;
+       int len;
+       bool valid;
+       int passed=0, p_valid=0, p_invalid=0, total=0;
+       int count;
+       
+       count = 100000;
+       
+       #if COMPUTE_AVERAGE_LENGTH
+       total_averages = 0.0;
+       #endif
+       
+       for (i=0; i<count; i++) {
+               len = rand32() % (sizeof(buffer) + 1);
+               valid = utf8_mktest(buffer, len);
+               if (utf8_validate(buffer, len) == valid) {
+                       passed++;
+                       if (valid)
+                               p_valid++;
+                       else
+                               p_invalid++;
+               } else {
+                       bool uvalid = utf8_validate(buffer, len);
+                       printf("Failed: generated %s string, but utf8_validate returned %s\n",
+                              valid ? "valid" : "invalid",
+                              uvalid ? "true" : "false");
+               }
+               total++;
+       }
+       
+       if (passed == total)
+               pass("%d valid tests, %d invalid tests", p_valid, p_invalid);
+       else
+               fail("Passed only %d out of %d tests\n", passed, total);
+       
+       ok(p_valid > count/10 && p_invalid > count/10,
+          "Valid and invalid should be balanced");
+       
+       #if COMPUTE_AVERAGE_LENGTH
+       printf("Average character length: %f\n", total_averages / count);
+       #endif
+}
+
+int main(void)
+{
+       /* This is how many tests you plan to run */
+       plan_tests(2);
+       
+       test_utf8_validate();
+
+       /* This exits depending on whether all tests passed */
+       return exit_status();
+}