/*
 * UTF-8 validation routines, taken from the post-image of the
 * ccan/charset/charset.c patch that split per-character validation
 * out of utf8_validate().
 */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/*
 * Validate the single UTF-8 encoded character that starts at s.
 *
 * s: first byte of the candidate character.
 * e: one past the last readable byte; nothing at or beyond e is read.
 *
 * Returns the length in bytes (1..4) of the character if it is well
 * formed, or 0 if it is malformed, cut short by e, or the range [s, e)
 * is empty.
 *
 * This function implements the syntax given in RFC3629:
 *  - overlong 2-, 3- and 4-byte encodings are rejected,
 *  - the surrogate range U+D800..U+DFFF is rejected,
 *  - codepoints beyond U+10FFFF are rejected.
 *
 * The sixty-six Unicode "non-characters" are permitted
 * (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
 */
int utf8_validate_char(const char *s, const char *e)
{
	unsigned char c;

	/* An empty range holds no character; never read the byte at e. */
	if (s >= e)
		return 0;

	c = (unsigned char)*s++;

	if (c <= 0x7F) { /* 00..7F */
		return 1;
	} else if (c <= 0xC1) { /* 80..C1 */
		/* Stray continuation byte, or overlong 2-byte sequence. */
		return 0;
	} else if (c <= 0xDF) { /* C2..DF */
		/* Make sure the character isn't clipped. */
		if (e - s < 1)
			return 0;

		/* Make sure subsequent byte is in the range 0x80..0xBF. */
		if (((unsigned char)*s++ & 0xC0) != 0x80)
			return 0;

		return 2;
	} else if (c <= 0xEF) { /* E0..EF */
		/* Make sure the character isn't clipped. */
		if (e - s < 2)
			return 0;

		/* Disallow overlong 3-byte sequence. */
		if (c == 0xE0 && (unsigned char)*s < 0xA0)
			return 0;

		/* Disallow U+D800..U+DFFF. */
		if (c == 0xED && (unsigned char)*s > 0x9F)
			return 0;

		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
		if (((unsigned char)*s++ & 0xC0) != 0x80)
			return 0;
		if (((unsigned char)*s++ & 0xC0) != 0x80)
			return 0;

		return 3;
	} else if (c <= 0xF4) { /* F0..F4 */
		/* Make sure the character isn't clipped. */
		if (e - s < 3)
			return 0;

		/* Disallow overlong 4-byte sequence. */
		if (c == 0xF0 && (unsigned char)*s < 0x90)
			return 0;

		/* Disallow codepoints beyond U+10FFFF. */
		if (c == 0xF4 && (unsigned char)*s > 0x8F)
			return 0;

		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
		if (((unsigned char)*s++ & 0xC0) != 0x80)
			return 0;
		if (((unsigned char)*s++ & 0xC0) != 0x80)
			return 0;
		if (((unsigned char)*s++ & 0xC0) != 0x80)
			return 0;

		return 4;
	} else { /* F5..FF */
		return 0;
	}
}

/*
 * Check that the length bytes starting at str form a sequence of
 * well-formed UTF-8 characters, as judged by utf8_validate_char().
 *
 * Returns true if every character is valid (an empty buffer is valid),
 * false as soon as an invalid or clipped character is found.
 */
bool utf8_validate(const char *str, size_t length)
{
	const char *s = str;
	const char *e = str + length;
	int len;

	for (; s < e; s += len) {
		len = utf8_validate_char(s, e);
		if (len == 0)
			return false;
	}
	/* utf8_validate_char() never accepts a character that crosses e. */
	assert(s == e);

	return true;
}

/* utf8_read_char() follows in the original file; its body is not part of this hunk. */