git.ozlabs.org Git - ccan/commitdiff
charset: Added utf8_validate_char (factored out of utf8_validate).
author: Joey Adams <joeyadams3.14159@gmail.com>
Wed, 15 Jun 2011 02:13:01 +0000 (22:13 -0400)
committer: Joey Adams <joeyadams3.14159@gmail.com>
Wed, 15 Jun 2011 02:14:28 +0000 (22:14 -0400)
ccan/charset/charset.c
ccan/charset/charset.h

index cd2035969222866c49617f1bd90d022ce82c3453..e585b45785f5c8cda1cc8075698080bf0afc0f21 100644 (file)
 */
 
 #include "charset.h"
+#include <assert.h>
+
+/*
+ * Validate a whole buffer by walking it one character at a time with
+ * utf8_validate_char().  Returns false as soon as a character is
+ * reported invalid or clipped (length 0), and true once the cursor
+ * reaches the end; an empty buffer (length == 0) is therefore valid.
+ */
+bool utf8_validate(const char *str, size_t length)
+{
+       const char *s = str;            /* start of the next character to check */
+       const char *e = str + length;   /* one past the last byte of the buffer */
+       int len;                        /* length of the character at s; assigned in the loop body before the increment uses it */
+       
+       for (; s < e; s += len) {
+               len = utf8_validate_char(s, e);
+               if (len == 0)
+                       return false;
+       }
+       assert(s == e); /* utf8_validate_char() rejects clipped characters, so s cannot step past e */
+       
+       return true;
+}
 
 /*
  * This function implements the syntax given in RFC3629, which is
  *  * The sixty-six Unicode "non-characters" are permitted
  *    (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
  */
-bool utf8_validate(const char *str, size_t length)
+int utf8_validate_char(const char *s, const char *e)
 {
-       const unsigned char *s = (const unsigned char*)str;
-       const unsigned char *e = s + length;
+       unsigned char c = *s++; /* lead byte; read before any comparison with e, so the caller must pass s < e */
        
-       while (s < e) {
-               unsigned char c = *s++;
-               unsigned char c2;
-               int len_minus_two;
+       if (c <= 0x7F) {        /* 00..7F */
+               return 1;
+       } else if (c <= 0xC1) { /* 80..C1 */
+               /* 80..BF is a lone continuation byte; C0..C1 would start an overlong 2-byte sequence. */
+               return 0;
+       } else if (c <= 0xDF) { /* C2..DF */
+               /* Make sure the character isn't clipped (s is already past the lead byte, so e - s is what remains). */
+               if (e - s < 1)
+                       return 0;
                
-               /* Validate the first byte and determine the sequence length. */
-               if (c <= 0x7F)          /* 00..7F */
-                       continue;
-               else if (c <= 0xC1)     /* 80..C1 */
-                       return false;
-               else if (c <= 0xDF)     /* C2..DF */
-                       len_minus_two = 0;
-               else if (c <= 0xEF)     /* E0..EF */
-                       len_minus_two = 1;
-               else if (c <= 0xF4)     /* F0..F4 */
-                       len_minus_two = 2;
-               else
-                       return false;
+               /* Make sure subsequent byte is in the range 0x80..0xBF. */
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
                
+               return 2;
+       } else if (c <= 0xEF) { /* E0..EF */
                /* Make sure the character isn't clipped. */
-               if (s + len_minus_two >= e)
-                       return false;
+               if (e - s < 2)
+                       return 0;
+               
+               /* Disallow overlong 3-byte sequence (E0 80..9F would encode below U+0800). */
+               if (c == 0xE0 && (unsigned char)*s < 0xA0) /* peeks only; the byte is consumed by the range check below */
+                       return 0;
+               
+               /* Disallow U+D800..U+DFFF (the surrogates, which would be encoded as ED A0..BF). */
+               if (c == 0xED && (unsigned char)*s > 0x9F)
+                       return 0;
+               
+               /* Make sure subsequent bytes are in the range 0x80..0xBF. */
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
                
-               c2 = *s;
+               return 3;
+       } else if (c <= 0xF4) { /* F0..F4 */
+               /* Make sure the character isn't clipped. */
+               if (e - s < 3)
+                       return 0;
+               
+               /* Disallow overlong 4-byte sequence (F0 80..8F would encode below U+10000). */
+               if (c == 0xF0 && (unsigned char)*s < 0x90) /* peeks only; the byte is consumed by the range check below */
+                       return 0;
+               
+               /* Disallow codepoints beyond U+10FFFF (F4 90..BF). */
+               if (c == 0xF4 && (unsigned char)*s > 0x8F)
+                       return 0;
                
                /* Make sure subsequent bytes are in the range 0x80..0xBF. */
-               do {
-                       if ((*s++ & 0xC0) != 0x80)
-                               return false;
-               } while (len_minus_two--);
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
                
-               /* Handle special cases. */
-               switch (c) {
-                       case 0xE0:
-                               /* Disallow overlong 3-byte sequence. */
-                               if (c2 < 0xA0)
-                                       return false;
-                               break;
-                       case 0xED:
-                               /* Disallow U+D800..U+DFFF. */
-                               if (c2 > 0x9F)
-                                       return false;
-                               break;
-                       case 0xF0:
-                               /* Disallow overlong 4-byte sequence. */
-                               if (c2 < 0x90)
-                                       return false;
-                               break;
-                       case 0xF4:
-                               /* Disallow codepoints beyond U+10FFFF. */
-                               if (c2 > 0x8F)
-                                       return false;
-                               break;
-               }
+               return 4;
+       } else {                /* F5..FF */
+               return 0;
        }
-       
-       return true;
 }
 
 int utf8_read_char(const char *s, uchar_t *out)
index 257d2860d70122efb407ae2d4d49b84015f72dcc..907b9e09bb5268ef116150f6e8916667392b2397 100644 (file)
@@ -42,6 +42,16 @@ typedef uint32_t uchar_t;
  */
 bool utf8_validate(const char *str, size_t length);
 
+/*
+ * Validate a single UTF-8 character.
+ * @s: Beginning of UTF-8 character.  The byte at @s is read
+ *     unconditionally, so @s must be less than @e.
+ * @e: End of string (one past the last byte).
+ *
+ * If it's valid, return its length in bytes (1 through 4).
+ * If it's invalid or clipped by @e, return 0.
+ */
+int utf8_validate_char(const char *s, const char *e);
+
 /*
  * Read a single UTF-8 character starting at @s,
  * returning the length, in bytes, of the character read.