]> git.ozlabs.org Git - ccan/blobdiff - ccan/charset/charset.c
endian: add constant versions.
[ccan] / ccan / charset / charset.c
index 756080138d23110e26aa97c6fc3b0a0964b488ce..c8efbb3979ad0d81a864cfb13aef6b3aa1d8f6da 100644 (file)
@@ -1,5 +1,5 @@
 /*
-  Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
+  Copyright (C) 2011 Joseph A. Adams (joeyadams3.14159@gmail.com)
   All rights reserved.
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
 */
 
 #include "charset.h"
+#include <assert.h>
 
-bool utf8_allow_surrogates = false;
 
 bool utf8_validate(const char *str, size_t length)
 {
-       const unsigned char *s = (const unsigned char*)str;
-       const unsigned char *e = s + length;
+       const char *s = str;
+       const char *e = str + length;
+       int len;
        
-       while (s < e) {
-               unsigned char c = *s++;
-               unsigned int len; /* number of bytes in sequence - 2 */
+       for (; s < e; s += len) {
+               len = utf8_validate_char(s, e);
+               if (len == 0)
+                       return false;
+       }
+       assert(s == e);
+       
+       return true;
+}
+
+/*
+ * This function implements the syntax given in RFC3629, which is
+ * the same as that given in The Unicode Standard, Version 6.0.
+ *
+ * It has the following properties:
+ *
+ *  * All codepoints U+0000..U+10FFFF may be encoded,
+ *    except for U+D800..U+DFFF, which are reserved
+ *    for UTF-16 surrogate pair encoding.
+ *  * UTF-8 byte sequences longer than 4 bytes are not permitted,
+ *    as they exceed the range of Unicode.
+ *  * The sixty-six Unicode "non-characters" are permitted
+ *    (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
+ */
+int utf8_validate_char(const char *s, const char *e)
+{
+       unsigned char c = *s++;
+       
+       if (c <= 0x7F) {        /* 00..7F */
+               return 1;
+       } else if (c <= 0xC1) { /* 80..C1 */
+               /* Disallow overlong 2-byte sequence. */
+               return 0;
+       } else if (c <= 0xDF) { /* C2..DF */
+               /* Make sure the character isn't clipped. */
+               if (e - s < 1)
+                       return 0;
+               
+               /* Make sure subsequent byte is in the range 0x80..0xBF. */
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
                
-               /* If character is ASCII, move on. */
-               if (c < 0x80)
-                       continue;
+               return 2;
+       } else if (c <= 0xEF) { /* E0..EF */
+               /* Make sure the character isn't clipped. */
+               if (e - s < 2)
+                       return 0;
                
-               if (s >= e)
-                       return false; /* Missing bytes in sequence. */
+               /* Disallow overlong 3-byte sequence. */
+               if (c == 0xE0 && (unsigned char)*s < 0xA0)
+                       return 0;
                
-               if (c < 0xE0) {
-                       /* 2-byte sequence, U+0080 to U+07FF
-                          c must be 11000010 or higher
-                          s[0] must be 10xxxxxx */
-                       len = 0;
-                       if (c < 0xC2)
-                               return false;
-               } else if (c < 0xF0) {
-                       /* 3-byte sequence, U+0800 to U+FFFF
-                          Note that the surrogate range is U+D800 to U+DFFF,
-                                 and that U+FFFE and U+FFFF are illegal characters.
-                          c must be >= 11100000 (which it is)
-                          If c is 11100000, then s[0] must be >= 10100000
-                          If the global parameter utf8_allow_surrogates is false:
-                             If c is 11101101 and s[0] is >= 10100000,
-                                then this is a surrogate and we should fail.
-                          If c is 11101111, s[0] is 10111111, and s[1] >= 10111110,
-                                 then this is an illegal character and we should fail.
-                          s[0] and s[1] must be 10xxxxxx */
-                       len = 1;
-                       if (c == 0xE0 && *s < 0xA0)
-                               return false;
-                       if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
-                               return false;
-                       if (c == 0xEF && s[0] == 0xBF && (s+1 >= e || s[1] >= 0xBE))
-                               return false;
-               } else {
-                       /* 4-byte sequence, U+010000 to U+10FFFF
-                          c must be >= 11110000 (which it is) and <= 11110100
-                          If c is 11110000, then s[0] must be >= 10010000
-                          If c is 11110100, then s[0] must be < 10010000
-                          s[0], s[1], and s[2] must be 10xxxxxx */
-                       len = 2;
-                       if (c > 0xF4)
-                               return false;
-                       if (c == 0xF0 && *s < 0x90)
-                               return false;
-                       if (c == 0xF4 && *s >= 0x90)
-                               return false;
-               }
+               /* Disallow U+D800..U+DFFF. */
+               if (c == 0xED && (unsigned char)*s > 0x9F)
+                       return 0;
                
-               if (s + len >= e)
-                       return false; /* Missing bytes in sequence. */
+               /* Make sure subsequent bytes are in the range 0x80..0xBF. */
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
                
-               do {
-                       if ((*s++ & 0xC0) != 0x80)
-                               return false;
-               } while (len--);
+               return 3;
+       } else if (c <= 0xF4) { /* F0..F4 */
+               /* Make sure the character isn't clipped. */
+               if (e - s < 3)
+                       return 0;
+               
+               /* Disallow overlong 4-byte sequence. */
+               if (c == 0xF0 && (unsigned char)*s < 0x90)
+                       return 0;
+               
+               /* Disallow codepoints beyond U+10FFFF. */
+               if (c == 0xF4 && (unsigned char)*s > 0x8F)
+                       return 0;
+               
+               /* Make sure subsequent bytes are in the range 0x80..0xBF. */
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
+               if (((unsigned char)*s++ & 0xC0) != 0x80)
+                       return 0;
+               
+               return 4;
+       } else {                /* F5..FF */
+               return 0;
        }
-       
-       return true;
 }
 
-/*
-  Note to future contributors: These routines are currently all under the
-    MIT license.  It would be nice to keep it that way :)
-*/
+int utf8_read_char(const char *s, uchar_t *out)
+{
+       const unsigned char *c = (const unsigned char*) s;
+
+       if (c[0] <= 0x7F) {
+               /* 00..7F */
+               *out = c[0];
+               return 1;
+       } else if (c[0] <= 0xDF) {
+               /* C2..DF (unless input is invalid) */
+               *out = ((uchar_t)c[0] & 0x1F) << 6 |
+                      ((uchar_t)c[1] & 0x3F);
+               return 2;
+       } else if (c[0] <= 0xEF) {
+               /* E0..EF */
+               *out = ((uchar_t)c[0] &  0xF) << 12 |
+                      ((uchar_t)c[1] & 0x3F) << 6  |
+                      ((uchar_t)c[2] & 0x3F);
+               return 3;
+       } else {
+               /* F0..F4 (unless input is invalid) */
+               *out = ((uchar_t)c[0] &  0x7) << 18 |
+                      ((uchar_t)c[1] & 0x3F) << 12 |
+                      ((uchar_t)c[2] & 0x3F) << 6  |
+                      ((uchar_t)c[3] & 0x3F);
+               return 4;
+       }
+}
+
+int utf8_write_char(uchar_t unicode, char *out)
+{
+       unsigned char *o = (unsigned char*) out;
+
+       if (unicode <= 0x7F) {
+               /* U+0000..U+007F */
+               *o++ = unicode;
+               return 1;
+       } else if (unicode <= 0x7FF) {
+               /* U+0080..U+07FF */
+               *o++ = 0xC0 | unicode >> 6;
+               *o++ = 0x80 | (unicode & 0x3F);
+               return 2;
+       } else if (unicode <= 0xFFFF) {
+               /* U+0800..U+FFFF */
+               if (unicode >= 0xD800 && unicode <= 0xDFFF)
+                       unicode = REPLACEMENT_CHARACTER;
+       three_byte_character:
+               *o++ = 0xE0 | unicode >> 12;
+               *o++ = 0x80 | (unicode >> 6 & 0x3F);
+               *o++ = 0x80 | (unicode & 0x3F);
+               return 3;
+       } else if (unicode <= 0x10FFFF) {
+               /* U+10000..U+10FFFF */
+               *o++ = 0xF0 | unicode >> 18;
+               *o++ = 0x80 | (unicode >> 12 & 0x3F);
+               *o++ = 0x80 | (unicode >> 6 & 0x3F);
+               *o++ = 0x80 | (unicode & 0x3F);
+               return 4;
+       } else {
+               /* U+110000... */
+               unicode = REPLACEMENT_CHARACTER;
+               goto three_byte_character;
+       }
+}
+
+uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc)
+{
+       if (uc >= 0xD800 && uc <= 0xDBFF && lc >= 0xDC00 && lc <= 0xDFFF)
+               return 0x10000 + ((((uchar_t)uc & 0x3FF) << 10) | (lc & 0x3FF));
+       else
+               return REPLACEMENT_CHARACTER;
+}
+
+bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc)
+{
+       if (unicode >= 0x10000 && unicode <= 0x10FFFF) {
+               uchar_t n = unicode - 0x10000;
+               *uc = ((n >> 10) & 0x3FF) | 0xD800;
+               *lc = (n & 0x3FF) | 0xDC00;
+               return true;
+       } else {
+               *uc = *lc = REPLACEMENT_CHARACTER;
+               return false;
+       }
+}