1 #include <ccan/utf8/utf8.h>
2 /* Include the C files directly. */
3 #include <ccan/utf8/utf8.c>
4 #include <ccan/tap/tap.h>
7 /* Stolen from https://github.com/chansen/c-utf8-valid/blob/master/test.c */
12 * U+0000..U+007F 00..7F
14 * U+0080..U+07FF C2..DF 80..BF
16 * U+0800..U+D7FF E0..ED A0..9F 80..BF
17 * U+D800..U+DFFF s ED A0..BF 80..BF
18 * U+E000..U+FFFF EE..EF 80..BF 80..BF
19 * n F0 80..8F 80..BF 80..BF
20 * U+0800..U+FFFF F0 80..8F A0..BF 80..BF
21 * U+10000..U+10FFFF F0..F4 90..8F 80..BF 80..BF
23 * U-110000..U-1FFFFF x F4..F7 90..BF 80..BF 80..BF
24 * xn F8 80..87 80..BF 80..BF 80..BF
25 * U-200000..U-3FFFFFF x F8..FB 88..BF 80..BF 80..BF 80..BF
26 * xn FC 80..83 80..BF 80..BF 80..BF 80..BF
27 * U-4000000..U-7FFFFFFF x FC..FD 84..BF 80..BF 80..BF 80..BF 80..BF
30 * n = Non-shortest form
32 * x = Codepoints outside Unicode codespace
36 * Encodes the given ordinal [0, 7FFFFFFF] using the UTF-8 encoding scheme
37 * to the given sequence length [1, 6]. This routine can be used to
38 * produce well-formed and ill-formed UTF-8.
40 * To encode a Unicode scalar value to a well-formed representation:
42 * [U+0000, U+007F] should be encoded to a sequence length of 1
43 * [U+0080, U+07FF] should be encoded to a sequence length of 2
44 * [U+0800, U+D7FF] should be encoded to a sequence length of 3
45 * [U+E000, U+FFFF] should be encoded to a sequence length of 3
46 * [U+10000, U+10FFFF] should be encoded to a sequence length of 4
48 * To encode a Unicode scalar value to non-shortest form representation:
50 * [U+0000, U+007F] can be encoded to a sequence length of [2, 6]
51 * [U+0080, U+07FF] can be encoded to a sequence length of [3, 6]
52 * [U+0800, U+FFFF] can be encoded to a sequence length of [4, 6]
54 * To encode an ordinal outside of Unicode codespace:
56 * [110000, 1FFFFF] can be encoded to a sequence length of 4
57 * [200000, 3FFFFFF] can be encoded to a sequence length of 5
58 * [4000000, 7FFFFFFF] can be encoded to a sequence length of 6
/*
 * Encode `ord` into exactly `len` bytes at `dst` using the UTF-8 bit
 * layout, whether or not the result is well-formed UTF-8.
 *
 * ord: value to encode; must be below the capacity of a `len`-byte
 *      sequence (2^7, 2^11, 2^16, 2^21, 2^26, 2^31 for len 1..6).
 * len: number of bytes to produce, 1..6.
 * dst: output buffer with room for at least `len` bytes; no terminator
 *      is written.
 *
 * Preconditions are enforced with assert().
 */
static void
encode_ord(uint32_t ord, size_t len, char *dst) {
	/* Lead-byte marker bits, indexed by sequence length - 1. */
	static const uint32_t kMask[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
	/*
	 * Exclusive upper bound on `ord`, indexed by sequence length - 1.
	 * Built from unsigned constants: `1 << 31` on a 32-bit int would
	 * overflow a signed type, which is undefined behaviour.
	 */
	static const uint32_t kMax[6] = {
		UINT32_C(1) << 7,  UINT32_C(1) << 11, UINT32_C(1) << 16,
		UINT32_C(1) << 21, UINT32_C(1) << 26, UINT32_C(1) << 31
	};
	size_t i;

	assert(len >= 1 && len <= 6);
	assert(ord < kMax[len - 1]);

	/* Fill continuation bytes back to front, six payload bits each. */
	for (i = len - 1; i > 0; i--) {
		dst[i] = (char)((ord & 0x3F) | 0x80);
		ord >>= 6;
	}
	/* Whatever is left of `ord` shares the lead byte with the marker. */
	dst[0] = (char)(ord | kMask[len - 1]);
}
/*
 * Feeds the first `len` bytes of `src`, one byte per iteration, to
 * utf8_decode() and returns an int status.
 *
 * NOTE(review): the handling of `decoded` and every return statement lie
 * outside the visible lines. Callers compare the result against 0 and
 * against EMLINK / EFBIG / ERANGE / EINVAL - confirm how each value is
 * produced before relying on this summary.
 */
80 static int utf8_check(const char *src, size_t len)
/* Decoder state starts from UTF8_STATE_INIT on every call. */
83 struct utf8_state utf8_state = UTF8_STATE_INIT;
86 for (i = 0; i < len; i++) {
/* The result of the most recent utf8_decode() call is kept in `decoded`. */
87 decoded = utf8_decode(&utf8_state, src[i]);
/*
 * Runs utf8_check() on the first `len` bytes of `src` and records one
 * ok() result that passes when the returned code equals `exp_err`.
 * `line` is only used in the diagnostic message; the TEST_UTF8 macro
 * supplies __LINE__ for it.
 *
 * NOTE(review): the declaration of `got_err` and any statements between
 * the function header and the utf8_check() call are not visible here.
 */
99 test_utf8(const char *src, size_t len, int exp_err, unsigned line) {
104 got_err = utf8_check(src, len);
/* The message reports both codes and the originating line. */
106 ok(got_err == exp_err, "Got result %i, expected %i at line %u",
107 got_err, exp_err, line);
/* Check `len` bytes of `src` against `exp`, tagging the result with the caller's line number. */
#define TEST_UTF8(src, len, exp) test_utf8((src), (len), (exp), __LINE__)
/*
 * Well-formed encodings: every code point is encoded at its shortest
 * length and is expected to check as 0.  For multi-byte sequences the
 * truncated prefixes are expected to check as EMLINK.
 *
 * encode_ord() stores the low six bits of `ord` in the last byte, so a
 * prefix that drops the last byte depends only on `ord >> 6`; testing it
 * when `ord % 64 == 0` covers each distinct prefix exactly once.  The
 * same reasoning with `ord % 4096` applies to dropping two bytes.
 */
static void
test_unicode_scalar_value(void) {
	uint32_t ord;
	char src[4];

	/*
	 * Unicode scalar value [U+0001, U+007F]
	 * The loop starts at 1, so the ERANGE arm of the conditional
	 * (ord == 0) is never taken.
	 */
	for (ord = 0x0001; ord <= 0x007F; ord++) {
		encode_ord(ord, 1, src);
		TEST_UTF8(src, 1, ord ? 0 : ERANGE);
	}

	/*
	 * Unicode scalar value [U+0080, U+07FF]
	 * The maximal subpart is the length of the truncated sequence
	 */
	for (ord = 0x0080; ord <= 0x07FF; ord++) {
		encode_ord(ord, 2, src);
		TEST_UTF8(src, 2, 0);
	}

	/*
	 * Unicode scalar value [U+0800, U+D7FF]
	 * The maximal subpart is the length of the truncated sequence
	 *
	 * NOTE(review): the surrogate test is part of the loop condition,
	 * so the loop ends at U+D800 and [U+E000, U+FFFF] is never
	 * exercised.  Skipping surrogates with `continue` instead would
	 * add test cases, so the plan_tests() count in main() would have
	 * to change in the same commit.
	 */
	for (ord = 0x0800; ord <= 0xFFFF && (ord & 0xF800) != 0xD800; ord++) {
		encode_ord(ord, 3, src);

		TEST_UTF8(src, 3, 0);
		if ((ord % (1 << 6)) == 0) {
			TEST_UTF8(src, 2, EMLINK);
		}
	}

	/*
	 * Unicode scalar value [U+10000, U+10FFFF]
	 * The maximal subpart is the length of the truncated sequence
	 */
	for (ord = 0x10000; ord <= 0x10FFFF; ord++) {
		encode_ord(ord, 4, src);

		TEST_UTF8(src, 4, 0);
		if ((ord % (1 << 6)) == 0) {
			TEST_UTF8(src, 3, EMLINK);
		}
		if ((ord % (1 << 12)) == 0) {
			TEST_UTF8(src, 2, EMLINK);
		}
	}
}
/*
 * Overlong encodings: each code point is encoded with more bytes than
 * its shortest form needs, and the full sequence is expected to check
 * as EFBIG.  Truncated prefixes are expected to check as EMLINK and are
 * tested once per distinct prefix (when the dropped low bits of `ord`
 * are all zero).
 *
 * Every loop starts at 1, so U+0000 itself is not covered.
 */
static void
test_non_shortest_form(void) {
	uint32_t ord;
	char src[4];

	/*
	 * Non-shortest form 2-byte sequence [U+0001, U+007F]
	 * The maximal subpart is 1-byte
	 */
	for (ord = 0x0001; ord <= 0x007F; ord++) {
		encode_ord(ord, 2, src);
		TEST_UTF8(src, 2, EFBIG);
	}

	/*
	 * Non-shortest form 3-byte sequence [U+0001, U+07FF]
	 * The maximal subpart is 1-byte
	 */
	for (ord = 0x0001; ord <= 0x07FF; ord++) {
		encode_ord(ord, 3, src);

		TEST_UTF8(src, 3, EFBIG);
		if ((ord % (1 << 6)) == 0) {
			TEST_UTF8(src, 2, EMLINK);
		}
	}

	/*
	 * Non-shortest form 4-byte sequence [U+0001, U+FFFF]
	 * The maximal subpart is 1-byte
	 */
	for (ord = 0x0001; ord <= 0xFFFF; ord++) {
		encode_ord(ord, 4, src);

		TEST_UTF8(src, 4, EFBIG);
		if ((ord % (1 << 6)) == 0) {
			TEST_UTF8(src, 3, EMLINK);
		}
		if ((ord % (1 << 12)) == 0) {
			TEST_UTF8(src, 2, EMLINK);
		}
	}
}
/*
 * Values above U+10FFFF that still fit a 4-byte sequence: the full
 * sequence is expected to check as ERANGE, and its 3- and 2-byte
 * prefixes as EMLINK (each distinct prefix tested once).
 */
static void
test_non_unicode(void) {
	uint32_t ord;
	char src[4];

	/*
	 * Code point outside Unicode codespace
	 * The maximal subpart is 1-byte
	 */
	for (ord = 0x110000; ord <= 0x1FFFFF; ord++) {
		encode_ord(ord, 4, src);

		TEST_UTF8(src, 4, ERANGE);
		if ((ord % (1 << 6)) == 0) {
			TEST_UTF8(src, 3, EMLINK);
		}
		if ((ord % (1 << 12)) == 0) {
			TEST_UTF8(src, 2, EMLINK);
		}
	}
}
/*
 * Surrogate code points encoded as 3-byte sequences: the full sequence
 * is expected to check as ERANGE, and its 2-byte prefix as EMLINK (each
 * distinct prefix tested once).
 */
static void
test_surrogates(void) {
	uint32_t ord;
	char src[4];

	/*
	 * Surrogates [U+D800, U+DFFF]
	 * The maximal subpart is 1-byte
	 */
	for (ord = 0xD800; ord <= 0xDFFF; ord++) {
		encode_ord(ord, 3, src);

		TEST_UTF8(src, 3, ERANGE);
		if ((ord % (1 << 6)) == 0) {
			TEST_UTF8(src, 2, EMLINK);
		}
	}
}
/*
 * A lone continuation byte with no lead byte before it is expected to
 * check as EINVAL.
 */
static void
test_continuations(void) {
	uint32_t ord;
	char src[4];

	/*
	 * Misplaced continuation [\x80, \xBF]
	 * The maximal subpart is 1-byte
	 */
	for (ord = 0x80; ord <= 0xBF; ord++) {
		/* The byte under test must be stored before it is checked. */
		src[0] = (char)ord;
		TEST_UTF8(src, 1, EINVAL);
	}
}
/*
 * Runs every test group and returns the status reported by
 * exit_status().
 *
 * The planned count (2190905) is the sum of the checks issued by all
 * five groups:
 *   test_unicode_scalar_value  1121343
 *   test_surrogates               2080
 *   test_non_shortest_form       68778
 *   test_non_unicode            998640
 *   test_continuations              64
 * so each group has to be called for the plan to be met.
 */
int
main(int argc, char **argv)
{
	/* Command-line arguments are not used. */
	(void)argc;
	(void)argv;

	plan_tests(2190906 - 1);
	test_unicode_scalar_value();
	test_surrogates();
	test_non_shortest_form();
	test_non_unicode();
	test_continuations();

	return exit_status();
}