From: Joseph Adams <joeyadams3.14159@gmail.com>
author Rusty Russell <rusty@rustcorp.com.au>
Fri, 9 Apr 2010 01:23:35 +0000 (10:53 +0930)
committer Rusty Russell <rusty@rustcorp.com.au>
Fri, 9 Apr 2010 01:23:35 +0000 (10:53 +0930)
The charset patch makes utf8_validate reject the invalid codepoints
U+FFFE and U+FFFF.  Hopefully it's fully UTF-8 compliant now.

ccan/charset/charset.c
ccan/charset/test/run.c

index 6c21df38ac703efcb5048648ef0abd00089505c5..756080138d23110e26aa97c6fc3b0a0964b488ce 100644 (file)
@@ -50,18 +50,23 @@ bool utf8_validate(const char *str, size_t length)
                                return false;
                } else if (c < 0xF0) {
                        /* 3-byte sequence, U+0800 to U+FFFF
-                          Note that the surrogate range is U+D800 to U+DFFF
+                          Note that the surrogate range is U+D800 to U+DFFF,
+                                 and that U+FFFE and U+FFFF are illegal characters.
                           c must be >= 11100000 (which it is)
                           If c is 11100000, then s[0] must be >= 10100000
                           If the global parameter utf8_allow_surrogates is false:
                              If c is 11101101 and s[0] is >= 10100000,
                                 then this is a surrogate and we should fail.
+                          If c is 11101111, s[0] is 10111111, and s[1] >= 10111110,
+                                 then this is an illegal character and we should fail.
                           s[0] and s[1] must be 10xxxxxx */
                        len = 1;
                        if (c == 0xE0 && *s < 0xA0)
                                return false;
                        if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
                                return false;
+                       if (c == 0xEF && s[0] == 0xBF && (s+1 >= e || s[1] >= 0xBE))
+                               return false;
                } else {
                        /* 4-byte sequence, U+010000 to U+10FFFF
                           c must be >= 11110000 (which it is) and <= 11110100
index 9f3f90073dc0624e7bcfc064f56c849dfff6ce94..5504e886b9f9f111a9554aa5dfb70cfce42b96c2 100644 (file)
@@ -40,7 +40,8 @@ static unsigned int utf8_randcode(int len)
                case 3:
                        for (;;) {
                                ret = r % (0x10000-0x800) + 0x800;
-                               if (!utf8_allow_surrogates && ret >= 0xD800 && ret <= 0xDFFF)
+                               if ((!utf8_allow_surrogates && ret >= 0xD800 && ret <= 0xDFFF)
+                               || ret >= 0xFFFE)
                                {
                                        r = rand32();
                                        continue;