+#include <ccan/charset/charset.c>
+#include <ccan/tap/tap.h>
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "common.h"
+
+/* Make a valid or invalid Unicode character fitting in exactly @len UTF-8 bytes. */
+static uchar_t utf8_randcode(int len, bool valid, bool after_clipped)
+{
+ uint32_t r = rand32();
+ uchar_t ret;
+
+ #define range(lo, hi) ((r & 0x7FFFFFFF) % ((hi)-(lo)+1) + (lo))
+ #define high_bit_set() (!!(r & 0x80000000))
+
+ switch (len) {
+ case 1:
+ if (valid) {
+ /* Generate a character U+0000..U+007F */
+ return r & 0x7F;
+ } else {
+ /*
+ * Generate a character U+0080..U+00BF or U+00F8..U+00FF.
+ *
+ * However, don't generate U+0080..U+00BF (10xxxxxx) after a
+ * clipped character, as that can inadvertently form a valid,
+ * complete character.
+ */
+ if (!after_clipped && high_bit_set())
+ return range(0x80, 0xBF);
+ else
+ return range(0xF8, 0xFF);
+ }
+ case 2:
+ if (valid) {
+ /* Generate a character U+0080..U+07FF */
+ return range(0x80, 0x7FF);
+ } else {
+ /* Generate a character U+0000..U+007F */
+ return r & 0x7F;
+ }
+ case 3:
+ if (valid) {
+ /* Generate a character U+0800..U+FFFF, but not U+D800..U+DFFF */
+ for (;;) {
+ ret = range(0x800, 0xFFFF);
+ if (ret >= 0xD800 && ret <= 0xDFFF) {
+ r = rand32();
+ continue;
+ } else {
+ break;
+ }
+ }
+ return ret;
+ } else {
+ /* Generate a character U+0000..U+07FF or U+D800..U+DFFF */
+ if (high_bit_set())
+ return r & 0x7FF;
+ else
+ return 0xD800 + (r & 0x7FF);
+ }
+ case 4:
+ if (valid) {
+ /* Generate a character U+10000..U+10FFFF */
+ return range(0x10000, 0x10FFFF);
+ } else {
+ /* Generate a character U+0000..0xFFFF or U+110000..U+1FFFFF */
+ if (high_bit_set())
+ return r & 0xFFFF;
+ else
+ return range(0x110000, 0x1FFFFF);
+ }
+ default:
+ assert(false);
+ }
+
+ #undef range
+ #undef high_bit_set
+}
+
+/* Encode @uc as UTF-8 using exactly @len characters.
+ @len should be 1 thru 4. */
+static void utf8_encode_raw(char *out, unsigned int uc, int len)
+{
+ switch (len) {
+ case 1:
+ assert(uc <= 0xC1 || (uc >= 0xF8 && uc <= 0xFF));
+ *out++ = uc;
+ break;
+ case 2:
+ assert(uc <= 0x7FF);
+ *out++ = 0xC0 | ((uc >> 6) & 0x1F);
+ *out++ = 0x80 | (uc & 0x3F);
+ break;
+ case 3:
+ assert(uc <= 0xFFFF);
+ *out++ = 0xE0 | ((uc >> 12) & 0x0F);
+ *out++ = 0x80 | ((uc >> 6) & 0x3F);
+ *out++ = 0x80 | (uc & 0x3F);
+ break;
+ case 4:
+ assert(uc <= 0x1FFFFF);
+ *out++ = 0xF0 | ((uc >> 18) & 0x07);
+ *out++ = 0x80 | ((uc >> 12) & 0x3F);
+ *out++ = 0x80 | ((uc >> 6) & 0x3F);
+ *out++ = 0x80 | (uc & 0x3F);
+ break;
+ }
+}
+
+#if COMPUTE_AVERAGE_LENGTH
+double total_averages;
+#endif
+
+/* Generate a UTF-8 string of the given byte length,
+ randomly deciding if it should be valid or not.
+
+ Return true if it's valid, false if it's not. */
+static bool utf8_mktest(char *out, int len)
+{
+ double pf;
+ uint32_t pu;
+ int n;
+ bool valid = true;
+ bool v;
+ bool after_clipped = false;
+
+ #if COMPUTE_AVERAGE_LENGTH
+ int n_total = 0;
+ int count = 0;
+ #endif
+
+ /*
+ * Probability that, per character, it should be valid.
+ * The goal is to make utf8_mktest as a whole
+ * have a 50% chance of generating a valid string.
+ *
+ * The equation being solved is:
+ *
+ * p^n = 0.5
+ *
+ * where p is the probability that each character is valid,
+ * and n is the number of characters in the string.
+ *
+ * 2.384 is the approximate average length of each character,
+ * so len/2.384 is about how many characters this string
+ * is expected to contain.
+ */
+ pf = pow(0.5, 2.384/len);
+
+ /* Convert to uint32_t to test against rand32. */
+ pu = pf * 4294967295.0;
+
+ for (;len > 0; len -= n, out += n) {
+ v = rand32() <= pu;
+
+ if (v) {
+ /* Generate a valid character. */
+ n = rand32() % (len < 4 ? len : 4) + 1;
+ utf8_encode_raw(out, utf8_randcode(n, true, after_clipped), n);
+ after_clipped = false;
+ } else if (rand32() % 5) {
+ /* Generate an invalid character. */
+ n = rand32() % (len < 4 ? len : 4) + 1;
+ utf8_encode_raw(out, utf8_randcode(n, false, after_clipped), n);
+ after_clipped = false;
+ } else {
+ /* Generate a clipped but otherwise valid character. */
+ char tmp[4];
+ n = rand32() % 3 + 2;
+ utf8_encode_raw(tmp, utf8_randcode(n, true, after_clipped), n);
+ n -= rand32() % (n-1) + 1;
+ if (n > len)
+ n = len;
+ assert(n >= 1 && n <= 3);
+ memcpy(out, tmp, n);
+ after_clipped = true;
+ }
+
+ if (!v)
+ valid = false;
+
+ #if COMPUTE_AVERAGE_LENGTH
+ n_total += n;
+ count++;
+ #endif
+ }
+
+ #if COMPUTE_AVERAGE_LENGTH
+ if (count > 0)
+ total_averages += (double)n_total / count;
+ #endif
+
+ return valid;
+}
+
+static void test_utf8_validate(void)
+{
+ char buffer[128];
+ int i;
+ int len;
+ bool valid;
+ int passed=0, p_valid=0, p_invalid=0, total=0;
+ int count;
+
+ count = 100000;
+
+ #if COMPUTE_AVERAGE_LENGTH
+ total_averages = 0.0;
+ #endif
+
+ for (i=0; i<count; i++) {
+ len = rand32() % (sizeof(buffer) + 1);
+ valid = utf8_mktest(buffer, len);
+ if (utf8_validate(buffer, len) == valid) {
+ passed++;
+ if (valid)
+ p_valid++;
+ else
+ p_invalid++;
+ } else {
+ bool uvalid = utf8_validate(buffer, len);
+ printf("Failed: generated %s string, but utf8_validate returned %s\n",
+ valid ? "valid" : "invalid",
+ uvalid ? "true" : "false");
+ }
+ total++;
+ }
+
+ if (passed == total)
+ pass("%d valid tests, %d invalid tests", p_valid, p_invalid);
+ else
+ fail("Passed only %d out of %d tests\n", passed, total);
+
+ ok(p_valid > count/10 && p_invalid > count/10,
+ "Valid and invalid should be balanced");
+
+ #if COMPUTE_AVERAGE_LENGTH
+ printf("Average character length: %f\n", total_averages / count);
+ #endif
+}
+
+int main(void)
+{
+ /* This is how many tests you plan to run */
+ plan_tests(2);
+
+ test_utf8_validate();
+
+ /* This exits depending on whether all tests passed */
+ return exit_status();
+}