1 #include <ccan/charset/charset.c>
2 #include <ccan/tap/tap.h>
11 /* Pick a random code value that is valid or invalid (as requested by @valid) when encoded in exactly @len UTF-8 bytes. */
12 static uchar_t utf8_randcode(int len, bool valid, bool after_clipped)
14 uint32_t r = rand32();
17 #define range(lo, hi) ((r & 0x7FFFFFFF) % ((hi)-(lo)+1) + (lo))
18 #define high_bit_set() (!!(r & 0x80000000))
23 /* Generate a character U+0000..U+007F */
27 * Generate a character U+0080..U+00BF or U+00F8..U+00FF.
29 * However, don't generate U+0080..U+00BF (10xxxxxx) after a
30 * clipped character, as that can inadvertently form a valid,
33 if (!after_clipped && high_bit_set())
34 return range(0x80, 0xBF);
36 return range(0xF8, 0xFF);
40 /* Generate a character U+0080..U+07FF */
41 return range(0x80, 0x7FF);
43 /* Generate a character U+0000..U+007F */
48 /* Generate a character U+0800..U+FFFF, but not U+D800..U+DFFF */
50 ret = range(0x800, 0xFFFF);
51 if (ret >= 0xD800 && ret <= 0xDFFF) {
60 /* Generate a character U+0000..U+07FF or U+D800..U+DFFF */
64 return 0xD800 + (r & 0x7FF);
68 /* Generate a character U+10000..U+10FFFF */
69 return range(0x10000, 0x10FFFF);
71 /* Generate a character U+0000..U+FFFF or U+110000..U+1FFFFF */
75 return range(0x110000, 0x1FFFFF);
85 /* Encode @uc as UTF-8 using exactly @len bytes.
86 @len should be 1 through 4. */
87 static void utf8_encode_raw(char *out, unsigned int uc, int len)
91 assert(uc <= 0xC1 || (uc >= 0xF8 && uc <= 0xFF));
96 *out++ = 0xC0 | ((uc >> 6) & 0x1F);
97 *out++ = 0x80 | (uc & 0x3F);
100 assert(uc <= 0xFFFF);
101 *out++ = 0xE0 | ((uc >> 12) & 0x0F);
102 *out++ = 0x80 | ((uc >> 6) & 0x3F);
103 *out++ = 0x80 | (uc & 0x3F);
106 assert(uc <= 0x1FFFFF);
107 *out++ = 0xF0 | ((uc >> 18) & 0x07);
108 *out++ = 0x80 | ((uc >> 12) & 0x3F);
109 *out++ = 0x80 | ((uc >> 6) & 0x3F);
110 *out++ = 0x80 | (uc & 0x3F);
115 #if COMPUTE_AVERAGE_LENGTH
116 double total_averages;
119 /* Generate a UTF-8 string of the given byte length,
120 randomly deciding if it should be valid or not.
122 Return true if it's valid, false if it's not. */
123 static bool utf8_mktest(char *out, int len)
130 bool after_clipped = false;
132 #if COMPUTE_AVERAGE_LENGTH
138 * Probability that, per character, it should be valid.
139 * The goal is to make utf8_mktest as a whole
140 * have a 50% chance of generating a valid string.
142 * The equation being solved is:
146 * where p is the probability that each character is valid,
147 * and n is the number of characters in the string.
149 * 2.384 is the approximate average length, in bytes, of each character,
150 * so len/2.384 is about how many characters this string
151 * is expected to contain.
153 pf = pow(0.5, 2.384/len);
155 /* Convert to uint32_t to test against rand32. */
156 pu = pf * 4294967295.0;
158 for (;len > 0; len -= n, out += n) {
162 /* Generate a valid character. */
163 n = rand32() % (len < 4 ? len : 4) + 1;
164 utf8_encode_raw(out, utf8_randcode(n, true, after_clipped), n);
165 after_clipped = false;
166 } else if (rand32() % 5) {
167 /* Generate an invalid character. */
168 n = rand32() % (len < 4 ? len : 4) + 1;
169 utf8_encode_raw(out, utf8_randcode(n, false, after_clipped), n);
170 after_clipped = false;
172 /* Generate a clipped but otherwise valid character. */
174 n = rand32() % 3 + 2;
175 utf8_encode_raw(tmp, utf8_randcode(n, true, after_clipped), n);
176 n -= rand32() % (n-1) + 1;
179 assert(n >= 1 && n <= 3);
181 after_clipped = true;
187 #if COMPUTE_AVERAGE_LENGTH
193 #if COMPUTE_AVERAGE_LENGTH
195 total_averages += (double)n_total / count;
201 static void test_utf8_validate(void)
207 int passed=0, p_valid=0, p_invalid=0, total=0;
212 #if COMPUTE_AVERAGE_LENGTH
213 total_averages = 0.0;
216 for (i=0; i<count; i++) {
217 len = rand32() % (sizeof(buffer) + 1);
218 valid = utf8_mktest(buffer, len);
219 if (utf8_validate(buffer, len) == valid) {
226 bool uvalid = utf8_validate(buffer, len);
227 printf("Failed: generated %s string, but utf8_validate returned %s\n",
228 valid ? "valid" : "invalid",
229 uvalid ? "true" : "false");
235 pass("%d valid tests, %d invalid tests", p_valid, p_invalid);
237 fail("Passed only %d out of %d tests\n", passed, total);
239 ok(p_valid > count/10 && p_invalid > count/10,
240 "Valid and invalid should be balanced");
242 #if COMPUTE_AVERAGE_LENGTH
243 printf("Average character length: %f\n", total_averages / count);
249 /* This is how many tests you plan to run */
252 test_utf8_validate();
254 /* This exits depending on whether all tests passed */
255 return exit_status();