git.ozlabs.org Git - ccan/blob - ccan/utf8/test/run-decode.c

   1 #include <ccan/utf8/utf8.h>
   2 /* Include the C files directly. */
   3 #include <ccan/utf8/utf8.c>
   4 #include <ccan/tap/tap.h>
   5 #include <assert.h>
   6
   7 /* Stolen from https://github.com/chansen/c-utf8-valid/blob/master/test.c */
   8
   9 /*
  10  *  UTF-8
  11  *
  12  *     U+0000..U+007F         00..7F
  13  *                         n  C0..C1  80..BF
  14  *     U+0080..U+07FF         C2..DF  80..BF
  15  *                         n  E0      80..9F  80..BF
  16  *     U+0800..U+D7FF         E0..ED  A0..9F  80..BF
  17  *     U+D800..U+DFFF      s  ED      A0..BF  80..BF
  18  *     U+E000..U+FFFF         EE..EF  80..BF  80..BF
  19  *                         n  F0      80..8F  80..BF  80..BF
  20  *     U+0800..U+FFFF         F0      80..8F  A0..BF  80..BF
  21  *    U+10000..U+10FFFF       F0..F4  90..8F  80..BF  80..BF
  22  *
  23  *   U-110000..U-1FFFFF    x  F4..F7  90..BF  80..BF  80..BF
  24  *                         xn F8      80..87  80..BF  80..BF  80..BF
  25  *   U-200000..U-3FFFFFF   x  F8..FB  88..BF  80..BF  80..BF  80..BF
  26  *                         xn FC      80..83  80..BF  80..BF  80..BF  80..BF
  27  *  U-4000000..U-7FFFFFFF  x  FC..FD  84..BF  80..BF  80..BF  80..BF  80..BF
  28  *
  29  *  Legend:
  30  *    n = Non-shortest form
  31  *    s = Surrogates
  32  *    x = Codepoints outside Unicode codespace
  33  */
  34
  35 /*
  36  *  Encodes the given ordinal [0, 7FFFFFFF] using the UTF-8 encoding scheme
  37  *  to the given sequence length [1, 6]. This routine can be used to
  38  *  produce well-formed and ill-formed UTF-8.
  39  *
  40  *  To encode a Unicode scalar value to a well-formed representation:
  41  *
  42  *   [U+0000, U+007F] should be encoded to a sequence length of 1
  43  *   [U+0080, U+07FF] should be encoded to a sequence length of 2
  44  *   [U+0800, U+D7FF] should be encoded to a sequence length of 3
  45  *   [U+E000, U+FFFF] should be encoded to a sequence length of 3
  46  *   [U+10000, U+10FFFF] should be encoded to a sequence length of 4
  47  *
  48  *  To encode a Unicode scalar value to non-shortest form representation:
  49  *
  50  *   [U+0000, U+007F] can be encoded to a sequence length of [2, 6]
  51  *   [U+0080, U+07FF] can be encoded to a sequence length of [3, 6]
  52  *   [U+0800, U+FFFF] can be encoded to a sequence length of [4, 6]
  53  *
  54  *  To encode an ordinal outside of Unicode codespace:
  55  *
  56  *   [110000, 1FFFFF] can be encoded to a sequence length of 4
  57  *   [200000, 3FFFFFF] can be encoded to a sequence length of 5
  58  *   [4000000, 7FFFFFFF] can be encoded to a sequence length of 6
  59  */
  60
  61 static char *
  62 encode_ord(uint32_t ord, size_t len, char *dst) {
  63   static const uint32_t kMask[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  64   static const uint32_t kMax[6]  = { 1 <<  7, 1 << 11, 1 << 16,
  65                                      1 << 21, 1 << 26, 1 << 31 };
  66   size_t i;
  67
  68   assert(len >= 1);
  69   assert(len <= 6);
  70   assert(ord < kMax[len - 1]);
  71
  72   for (i = len - 1; i > 0; i--) {
  73     dst[i] = (ord & 0x3F) | 0x80;
  74     ord >>= 6;
  75   }
  76   dst[0] = ord | kMask[len - 1];
  77   return dst;
  78 }
  79
  80 static int utf8_check(const char *src, size_t len)
  81 {
  82         bool decoded = false;
  83         struct utf8_state utf8_state = UTF8_STATE_INIT;
  84         size_t i;
  85
  86         for (i = 0; i < len; i++) {
  87                 decoded = utf8_decode(&utf8_state, src[i]);
  88                 if (decoded) {
  89                         if (errno != 0)
  90                                 return errno;
  91                 }
  92         }
  93         if (!decoded)
  94                 return EMLINK;
  95         return 0;
  96 }
  97
  98 static void
  99 test_utf8(const char *src, size_t len, int exp_err, unsigned line) {
 100   int got_err;
 101
 102   assert(len <= 255);
 103
 104   got_err = utf8_check(src, len);
 105
 106   ok(got_err == exp_err, "Got result %i, expected %i at line %u",
 107      got_err, exp_err, line);
 108 }
 109
/*
 * Convenience wrapper around test_utf8() that supplies the line number of
 * the call site (__LINE__ is expanded where the macro is used), so the
 * diagnostic message identifies which check produced a result.
 */
#define TEST_UTF8(src, len, exp) \
  test_utf8(src, len, exp, __LINE__)
 112
 113
 114 static void
 115 test_unicode_scalar_value(void) {
 116   uint32_t ord;
 117   char src[4];
 118
 119   /* Unicode scalar value [U+0000, U+007F] */
 120   for (ord = 0x0001; ord <= 0x007F; ord++) {
 121     encode_ord(ord, 1, src);
 122     TEST_UTF8(src, 1, ord ? 0 : ERANGE);
 123   }
 124
 125   /*
 126    * Unicode scalar value [U+0080, U+07FF]
 127    * The maximal subpart is the length of the truncated sequence
 128    */
 129   for (ord = 0x0080; ord <= 0x07FF; ord++) {
 130     encode_ord(ord, 2, src);
 131     TEST_UTF8(src, 2, 0);
 132   }
 133
 134   /*
 135    * Unicode scalar value [U+0800, U+D7FF] and [U+E000, U+FFFF]
 136    * The maximal subpart is the length of the truncated sequence
 137    */
 138   for (ord = 0x0800; ord <= 0xFFFF && (ord & 0xF800) != 0xD800; ord++) {
 139     encode_ord(ord, 3, src);
 140
 141     TEST_UTF8(src, 3, 0);
 142     if ((ord % (1 << 6)) == 0)
 143       TEST_UTF8(src, 2, EMLINK);
 144   }
 145
 146   /*
 147    * Unicode scalar value [U+10000, U+10FFF]
 148    * The maximal subpart is the length of the truncated sequence
 149    */
 150   for (ord = 0x10000; ord <= 0x10FFFF; ord++) {
 151     encode_ord(ord, 4, src);
 152
 153     TEST_UTF8(src, 4, 0);
 154     if ((ord % (1 << 6)) == 0)
 155       TEST_UTF8(src, 3, EMLINK);
 156     if ((ord % (1 << 12)) == 0)
 157       TEST_UTF8(src, 2, EMLINK);
 158   }
 159 }
 160
 161 static void
 162 test_non_shortest_form(void) {
 163   uint32_t ord;
 164   char src[4];
 165
 166   /*
 167    * Non-shortest form 2-byte sequence [U+0000, U+007F]
 168    * The maximal subpart is 1-byte
 169    */
 170   for (ord = 0x0001; ord <= 0x007F; ord++) {
 171     encode_ord(ord, 2, src);
 172     TEST_UTF8(src, 2, EFBIG);
 173   }
 174
 175   /*
 176    * Non-shortest form 3-byte sequence [U+0000, U+07FF]
 177    * The maximal subpart is 1-byte
 178    */
 179   for (ord = 0x0001; ord <= 0x07FF; ord++) {
 180     encode_ord(ord, 3, src);
 181
 182     TEST_UTF8(src, 3, EFBIG);
 183     if ((ord % (1 << 6)) == 0)
 184       TEST_UTF8(src, 2, EMLINK);
 185   }
 186
 187   /*
 188    * Non-shortest form 4-byte sequence [U+0000, U+FFFF]
 189    * The maximal subpart is 1-byte
 190    */
 191   for (ord = 0x0001; ord <= 0xFFFF; ord++) {
 192     encode_ord(ord, 4, src);
 193
 194     TEST_UTF8(src, 4, EFBIG);
 195     if ((ord % (1 << 6)) == 0)
 196       TEST_UTF8(src, 3, EMLINK);
 197     if ((ord % (1 << 12)) == 0)
 198       TEST_UTF8(src, 2, EMLINK);
 199   }
 200 }
 201
 202 static void
 203 test_non_unicode(void) {
 204   uint32_t ord;
 205   char src[4];
 206
 207   /*
 208    * Code point outside Unicode codespace
 209    * The maximal subpart is 1-byte
 210    */
 211   for (ord = 0x110000; ord <= 0x1FFFFF; ord++) {
 212     encode_ord(ord, 4, src);
 213
 214     TEST_UTF8(src, 4, ERANGE);
 215     if ((ord % (1 << 6)) == 0)
 216       TEST_UTF8(src, 3, EMLINK);
 217     if ((ord % (1 << 12)) == 0)
 218       TEST_UTF8(src, 2, EMLINK);
 219   }
 220 }
 221
 222 static void
 223 test_surrogates(void) {
 224   uint32_t ord;
 225   char src[4];
 226
 227   /*
 228    * Surrogates [U+D800, U+DFFF]
 229    * The maximal subpart is 1-byte
 230    */
 231   for (ord = 0xD800; ord <= 0xDFFF; ord++) {
 232     encode_ord(ord, 3, src);
 233
 234     TEST_UTF8(src, 3, ERANGE);
 235     if ((ord % (1 << 6)) == 0)
 236       TEST_UTF8(src, 2, EMLINK);
 237   }
 238 }
 239
 240 static void
 241 test_continuations(void) {
 242   uint8_t ord;
 243   char src[4];
 244
 245   /*
 246    * Missplaced continuation [\x80, \xBF]
 247    * The maximal subpart is 1-byte
 248    */
 249   for (ord = 0x80; ord <= 0xBF; ord++) {
 250     src[0] = ord;
 251     TEST_UTF8(src, 1, EINVAL);
 252   }
 253 }
 254
 255 int
 256 main(int argc, char **argv)
 257 {
 258   plan_tests(2190906 - 1);
 259   test_unicode_scalar_value();
 260   test_surrogates();
 261   test_non_shortest_form();
 262   test_non_unicode();
 263   test_continuations();
 264
 265   return exit_status();
 266 }