charset: Rewrote utf8_validate, and added four new functions:
[ccan] / ccan / charset / test / run-utf8_validate.c
1 #include <ccan/charset/charset.c>
2 #include <ccan/tap/tap.h>
3
4 #include <assert.h>
5 #include <math.h>
6 #include <stdio.h>
7 #include <string.h>
8
9 #include "common.h"
10
11 /* Make a valid or invalid Unicode character fitting in exactly @len UTF-8 bytes. */
12 static uchar_t utf8_randcode(int len, bool valid, bool after_clipped)
13 {
14         uint32_t r = rand32();
15         uchar_t ret;
16         
17         #define range(lo, hi)  ((r & 0x7FFFFFFF) % ((hi)-(lo)+1) + (lo))
18         #define high_bit_set() (!!(r & 0x80000000))
19         
20         switch (len) {
21                 case 1:
22                         if (valid) {
23                                 /* Generate a character U+0000..U+007F */
24                                 return r & 0x7F;
25                         } else {
26                                 /*
27                                  * Generate a character U+0080..U+00BF or U+00F8..U+00FF.
28                                  *
29                                  * However, don't generate U+0080..U+00BF (10xxxxxx) after a
30                                  * clipped character, as that can inadvertently form a valid,
31                                  * complete character.
32                                  */
33                                 if (!after_clipped && high_bit_set())
34                                         return range(0x80, 0xBF);
35                                 else
36                                         return range(0xF8, 0xFF);
37                         }
38                 case 2:
39                         if (valid) {
40                                 /* Generate a character U+0080..U+07FF */
41                                 return range(0x80, 0x7FF);
42                         } else {
43                                 /* Generate a character U+0000..U+007F */
44                                 return r & 0x7F;
45                         }
46                 case 3:
47                         if (valid) {
48                                 /* Generate a character U+0800..U+FFFF, but not U+D800..U+DFFF */
49                                 for (;;) {
50                                         ret = range(0x800, 0xFFFF);
51                                         if (ret >= 0xD800 && ret <= 0xDFFF) {
52                                                 r = rand32();
53                                                 continue;
54                                         } else {
55                                                 break;
56                                         }
57                                 }
58                                 return ret;
59                         } else {
60                                 /* Generate a character U+0000..U+07FF or U+D800..U+DFFF */
61                                 if (high_bit_set())
62                                         return r & 0x7FF;
63                                 else
64                                         return 0xD800 + (r & 0x7FF);
65                         }
66                 case 4:
67                         if (valid) {
68                                 /* Generate a character U+10000..U+10FFFF */
69                                 return range(0x10000, 0x10FFFF);
70                         } else {
71                                 /* Generate a character U+0000..0xFFFF or U+110000..U+1FFFFF */
72                                 if (high_bit_set())
73                                         return r & 0xFFFF;
74                                 else
75                                         return range(0x110000, 0x1FFFFF);
76                         }
77                 default:
78                         assert(false);
79         }
80         
81         #undef range
82         #undef high_bit_set
83 }
84
/*
 * Write @uc to @out as a UTF-8-style sequence of exactly @len bytes.
 *
 * @out: destination; must have room for @len bytes.  No terminator is
 *       written.
 * @uc:  value to encode.  Only its bit width is checked against @len, so
 *       overlong forms, surrogates and values above U+10FFFF are encoded
 *       as-is; for @len == 1 the value is stored as a single raw byte.
 * @len: number of bytes to produce, 1 through 4.  Any other value trips
 *       an assertion.
 */
static void utf8_encode_raw(char *out, unsigned int uc, int len)
{
        /*
         * Store through unsigned char so that byte values >= 0x80 are
         * written exactly, without relying on how an out-of-range int
         * converts to a possibly-signed char.
         */
        unsigned char *p = (unsigned char *)out;

        switch (len) {
                case 1:
                        /* Raw byte: ASCII, a stray continuation/C0/C1 byte, or 0xF8..0xFF. */
                        assert(uc <= 0xC1 || (uc >= 0xF8 && uc <= 0xFF));
                        p[0] = (unsigned char)uc;
                        break;
                case 2:
                        assert(uc <= 0x7FF);
                        p[0] = (unsigned char)(0xC0 | ((uc >> 6) & 0x1F));
                        p[1] = (unsigned char)(0x80 | (uc & 0x3F));
                        break;
                case 3:
                        assert(uc <= 0xFFFF);
                        p[0] = (unsigned char)(0xE0 | ((uc >> 12) & 0x0F));
                        p[1] = (unsigned char)(0x80 | ((uc >> 6) & 0x3F));
                        p[2] = (unsigned char)(0x80 | (uc & 0x3F));
                        break;
                case 4:
                        assert(uc <= 0x1FFFFF);
                        p[0] = (unsigned char)(0xF0 | ((uc >> 18) & 0x07));
                        p[1] = (unsigned char)(0x80 | ((uc >> 12) & 0x3F));
                        p[2] = (unsigned char)(0x80 | ((uc >> 6) & 0x3F));
                        p[3] = (unsigned char)(0x80 | (uc & 0x3F));
                        break;
                default:
                        /* Unsupported length: fail loudly rather than leave @out unwritten. */
                        assert(false);
                        break;
        }
}
114
#if COMPUTE_AVERAGE_LENGTH
/*
 * Sum, over every string produced by utf8_mktest, of that string's mean
 * bytes-per-item.  Only compiled in when measuring the average character
 * length used by the probability estimate in utf8_mktest.
 */
static double total_averages;
#endif

/*
 * Fill @out with @len bytes of randomly generated UTF-8-like data.
 *
 * Each item written is a valid character, an invalid character, or a
 * truncated ("clipped") prefix of a valid multi-byte character.  The
 * per-item validity probability is chosen so that the string as a whole
 * comes out valid roughly half of the time.
 *
 * @out: destination buffer with room for @len bytes (not terminated).
 * @len: number of bytes to generate.
 *
 * Returns true if every item generated was a valid character (so the
 * string is expected to validate), false otherwise.  A @len of zero or
 * less generates nothing and returns true.
 */
static bool utf8_mktest(char *out, int len)
{
        double pf;
        uint32_t pu;
        int n;
        bool valid = true;
        bool v;
        bool after_clipped = false;

        #if COMPUTE_AVERAGE_LENGTH
        int n_total = 0;
        int count = 0;
        #endif

        /*
         * Nothing to generate.  Returning here also keeps the probability
         * computation below from dividing by zero (or, for a negative
         * length, from producing a value that does not fit in uint32_t).
         */
        if (len <= 0)
                return true;

        /*
         * Probability that, per character, it should be valid.
         * The goal is to make utf8_mktest as a whole
         * have a 50% chance of generating a valid string.
         *
         * The equation being solved is:
         *
         *     p^n = 0.5
         *
         * where p is the probability that each character is valid,
         * and n is the number of characters in the string.
         *
         * 2.384 is the approximate average length of each character,
         * so len/2.384 is about how many characters this string
         * is expected to contain.
         */
        pf = pow(0.5, 2.384/len);

        /* Scale to the uint32_t range so it can be compared against rand32(). */
        pu = pf * 4294967295.0;

        for (; len > 0; len -= n, out += n) {
                v = rand32() <= pu;

                if (v || rand32() % 5) {
                        /*
                         * Whole character, valid or invalid as decided by v.
                         * (The second rand32() is only drawn when v is false:
                         * four times out of five an invalid item is a complete
                         * but invalid character.)  It uses 1..4 bytes, capped
                         * by the space remaining.
                         */
                        n = rand32() % (len < 4 ? len : 4) + 1;
                        utf8_encode_raw(out, utf8_randcode(n, v, after_clipped), n);
                        after_clipped = false;
                } else {
                        /*
                         * Clipped but otherwise valid character: encode a 2..4
                         * byte character into a scratch buffer, then copy only
                         * a proper prefix of it (at least one byte dropped).
                         */
                        char tmp[4];
                        n = rand32() % 3 + 2;
                        utf8_encode_raw(tmp, utf8_randcode(n, true, after_clipped), n);
                        n -= rand32() % (n-1) + 1;
                        if (n > len)
                                n = len;
                        assert(n >= 1 && n <= 3);
                        memcpy(out, tmp, n);
                        after_clipped = true;
                }

                /* A single non-valid item makes the whole string invalid. */
                if (!v)
                        valid = false;

                #if COMPUTE_AVERAGE_LENGTH
                n_total += n;
                count++;
                #endif
        }

        #if COMPUTE_AVERAGE_LENGTH
        if (count > 0)
                total_averages += (double)n_total / count;
        #endif

        return valid;
}
200
/*
 * Randomized check of utf8_validate().
 *
 * Generates many random strings with utf8_mktest(), each of a random
 * length from 0 up to the buffer size, and compares utf8_validate()'s
 * verdict with the validity utf8_mktest() reported.  Emits two TAP
 * results: one for "every string was classified correctly", one for
 * "the generator produced a reasonable mix of valid and invalid strings".
 */
static void test_utf8_validate(void)
{
        char buffer[128];
        int i;
        int len;
        bool expected;
        bool actual;
        int passed;
        int p_valid = 0, p_invalid = 0;
        const int count = 100000;

        #if COMPUTE_AVERAGE_LENGTH
        total_averages = 0.0;
        #endif

        for (i = 0; i < count; i++) {
                len = rand32() % (sizeof(buffer) + 1);
                expected = utf8_mktest(buffer, len);

                /* Validate once and reuse the verdict for the failure report. */
                actual = utf8_validate(buffer, len);

                if (actual == expected) {
                        if (expected)
                                p_valid++;
                        else
                                p_invalid++;
                } else {
                        printf("Failed: generated %s string, but utf8_validate returned %s\n",
                               expected ? "valid" : "invalid",
                               actual ? "true" : "false");
                }
        }

        /* Every iteration is either a correctly classified valid or invalid string, or a failure. */
        passed = p_valid + p_invalid;

        if (passed == count)
                pass("%d valid tests, %d invalid tests", p_valid, p_invalid);
        else
                fail("Passed only %d out of %d tests", passed, count);

        /* Guard against a generator that is so lopsided the test proves little. */
        ok(p_valid > count/10 && p_invalid > count/10,
           "Valid and invalid should be balanced");

        #if COMPUTE_AVERAGE_LENGTH
        printf("Average character length: %f\n", total_averages / count);
        #endif
}
246
/*
 * Test driver: announces the number of TAP results, runs the randomized
 * utf8_validate() check, and reports the overall outcome as the process
 * exit status.
 */
int main(void)
{
        int status;

        /* test_utf8_validate() reports exactly two results. */
        plan_tests(2);

        test_utf8_validate();

        /* Exit status reflects whether every planned test passed. */
        status = exit_status();
        return status;
}