git.ozlabs.org Git - ccan/blobdiff - ccan/charset/charset.c
tdb2: copy tdb1's changed expansion logic.
[ccan] / ccan / charset / charset.c
index cd2035969222866c49617f1bd90d022ce82c3453..c8efbb3979ad0d81a864cfb13aef6b3aa1d8f6da 100644 (file)
@@ -1,5 +1,5 @@
 /*
-  Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
+  Copyright (C) 2011 Joseph A. Adams (joeyadams3.14159@gmail.com)
   All rights reserved.
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
 */
 
 #include "charset.h"
+#include <assert.h>
+
+/*
+ * Check that the `length` bytes starting at `str` are well-formed UTF-8.
+ * The buffer is walked one character at a time with utf8_validate_char();
+ * returns true if every character is accepted and false at the first one
+ * that is rejected.  An empty buffer (length 0) is reported as valid.
+ */
+bool utf8_validate(const char *str, size_t length)
+{
+       const char *s = str;
+       const char *e = str + length;   /* one past the last byte */
+       int len;                        /* bytes in the current character */
+       
+       for (; s < e; s += len) {
+               len = utf8_validate_char(s, e); /* 0: invalid or clipped */
+               if (len == 0)
+                       return false;
+       }
+       assert(s == e); /* an accepted character never extends past e */
+       
+       return true;
+}
 
 /*
  * This function implements the syntax given in RFC3629, which is
  *  * The sixty-six Unicode "non-characters" are permitted
  *    (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
  */
-bool utf8_validate(const char *str, size_t length)
+int utf8_validate_char(const char *s, const char *e)
 {
-       const unsigned char *s = (const unsigned char*)str;
-       const unsigned char *e = s + length;
+       unsigned char c = *s++;
        
-       while (s < e) {
-               unsigned char c = *s++;
-               unsigned char c2;
-               int len_minus_two;
+       if (c <= 0x7F) {        /* 00..7F */
+               return 1;
+       } else if (c <= 0xC1) { /* 80..C1 */
+               /* Disallow stray continuation bytes (80..BF) and overlong 2-byte leads (C0..C1). */
+               return 0;
+       } else if (c <= 0xDF) { /* C2..DF */
+               /* Make sure the character isn't clipped. */
+               if (e - s < 1)
+                       return 0;
                
-               /* Validate the first byte and determine the sequence length. */
-               if (c <= 0x7F)          /* 00..7F */
-                       continue;
-               else if (c <= 0xC1)     /* 80..C1 */
-                       return false;
-               else if (c <= 0xDF)     /* C2..DF */
-                       len_minus_two = 0;
-               else if (c <= 0xEF)     /* E0..EF */
-                       len_minus_two = 1;
-               else if (c <= 0xF4)     /* F0..F4 */
-                       len_minus_two = 2;
-               else
-                       return false;
+               /* Make sure subsequent byte is in the range 0x80..0xBF. */
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
                
+               return 2;
+       } else if (c <= 0xEF) { /* E0..EF */
                /* Make sure the character isn't clipped. */
-               if (s + len_minus_two >= e)
-                       return false;
+               if (e - s < 2)
+                       return 0;
+               
+               /* Disallow overlong 3-byte sequence. */
+               if (c == 0xE0 && (unsigned char)*s < 0xA0)
+                       return 0;
+               
+               /* Disallow U+D800..U+DFFF. */
+               if (c == 0xED && (unsigned char)*s > 0x9F)
+                       return 0;
+               
+               /* Make sure subsequent bytes are in the range 0x80..0xBF. */
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
                
-               c2 = *s;
+               return 3;
+       } else if (c <= 0xF4) { /* F0..F4 */
+               /* Make sure the character isn't clipped. */
+               if (e - s < 3)
+                       return 0;
+               
+               /* Disallow overlong 4-byte sequence. */
+               if (c == 0xF0 && (unsigned char)*s < 0x90)
+                       return 0;
+               
+               /* Disallow codepoints beyond U+10FFFF. */
+               if (c == 0xF4 && (unsigned char)*s > 0x8F)
+                       return 0;
                
                /* Make sure subsequent bytes are in the range 0x80..0xBF. */
-               do {
-                       if ((*s++ & 0xC0) != 0x80)
-                               return false;
-               } while (len_minus_two--);
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
                
-               /* Handle special cases. */
-               switch (c) {
-                       case 0xE0:
-                               /* Disallow overlong 3-byte sequence. */
-                               if (c2 < 0xA0)
-                                       return false;
-                               break;
-                       case 0xED:
-                               /* Disallow U+D800..U+DFFF. */
-                               if (c2 > 0x9F)
-                                       return false;
-                               break;
-                       case 0xF0:
-                               /* Disallow overlong 4-byte sequence. */
-                               if (c2 < 0x90)
-                                       return false;
-                               break;
-                       case 0xF4:
-                               /* Disallow codepoints beyond U+10FFFF. */
-                               if (c2 > 0x8F)
-                                       return false;
-                               break;
-               }
+               return 4;
+       } else {                /* F5..FF */
+               return 0;
        }
-       
-       return true;
 }
 
 int utf8_read_char(const char *s, uchar_t *out)