X-Git-Url: https://git.ozlabs.org/?a=blobdiff_plain;f=ccan%2Fcharset%2Fcharset.c;h=cd2035969222866c49617f1bd90d022ce82c3453;hb=06c4af3163e2bd99999a93a478d1308ea39c5a79;hp=6c21df38ac703efcb5048648ef0abd00089505c5;hpb=c8c69dc68792e85b14646e8a8219dae923b34feb;p=ccan diff --git a/ccan/charset/charset.c b/ccan/charset/charset.c index 6c21df38..cd203596 100644 --- a/ccan/charset/charset.c +++ b/ccan/charset/charset.c @@ -23,8 +23,20 @@ #include "charset.h" -bool utf8_allow_surrogates = false; - +/* + * This function implements the syntax given in RFC3629, which is + * the same as that given in The Unicode Standard, Version 6.0. + * + * It has the following properties: + * + * * All codepoints U+0000..U+10FFFF may be encoded, + * except for U+D800..U+DFFF, which are reserved + * for UTF-16 surrogate pair encoding. + * * UTF-8 byte sequences longer than 4 bytes are not permitted, + * as they exceed the range of Unicode. + * * The sixty-six Unicode "non-characters" are permitted + * (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF). + */ bool utf8_validate(const char *str, size_t length) { const unsigned char *s = (const unsigned char*)str; @@ -32,64 +44,145 @@ bool utf8_validate(const char *str, size_t length) while (s < e) { unsigned char c = *s++; - unsigned int len; /* number of bytes in sequence - 2 */ + unsigned char c2; + int len_minus_two; - /* If character is ASCII, move on. */ - if (c < 0x80) + /* Validate the first byte and determine the sequence length. */ + if (c <= 0x7F) /* 00..7F */ continue; + else if (c <= 0xC1) /* 80..C1 */ + return false; + else if (c <= 0xDF) /* C2..DF */ + len_minus_two = 0; + else if (c <= 0xEF) /* E0..EF */ + len_minus_two = 1; + else if (c <= 0xF4) /* F0..F4 */ + len_minus_two = 2; + else + return false; - if (s >= e) - return false; /* Missing bytes in sequence. */ + /* Make sure the character isn't clipped. */ + if (s + len_minus_two >= e) + return false; - if (c < 0xE0) { - /* 2-byte sequence, U+0080 to U+07FF - c must be 11000010 or higher - s[0] must be 10xxxxxx */ - len = 0; - if (c < 0xC2) - return false; - } else if (c < 0xF0) { - /* 3-byte sequence, U+0800 to U+FFFF - Note that the surrogate range is U+D800 to U+DFFF - c must be >= 11100000 (which it is) - If c is 11100000, then s[0] must be >= 10100000 - If the global parameter utf8_allow_surrogates is false: - If c is 11101101 and s[0] is >= 10100000, - then this is a surrogate and we should fail. - s[0] and s[1] must be 10xxxxxx */ - len = 1; - if (c == 0xE0 && *s < 0xA0) - return false; - if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0) - return false; - } else { - /* 4-byte sequence, U+010000 to U+10FFFF - c must be >= 11110000 (which it is) and <= 11110100 - If c is 11110000, then s[0] must be >= 10010000 - If c is 11110100, then s[0] must be < 10010000 - s[0], s[1], and s[2] must be 10xxxxxx */ - len = 2; - if (c > 0xF4) - return false; - if (c == 0xF0 && *s < 0x90) - return false; - if (c == 0xF4 && *s >= 0x90) - return false; - } - - if (s + len >= e) - return false; /* Missing bytes in sequence. */ + c2 = *s; + /* Make sure subsequent bytes are in the range 0x80..0xBF. */ do { if ((*s++ & 0xC0) != 0x80) return false; - } while (len--); + } while (len_minus_two--); + + /* Handle special cases. */ + switch (c) { + case 0xE0: + /* Disallow overlong 3-byte sequence. */ + if (c2 < 0xA0) + return false; + break; + case 0xED: + /* Disallow U+D800..U+DFFF. */ + if (c2 > 0x9F) + return false; + break; + case 0xF0: + /* Disallow overlong 4-byte sequence. */ + if (c2 < 0x90) + return false; + break; + case 0xF4: + /* Disallow codepoints beyond U+10FFFF. */ + if (c2 > 0x8F) + return false; + break; + } } return true; } -/* - Note to future contributors: These routines are currently all under the - MIT license. It would be nice to keep it that way :) -*/ +int utf8_read_char(const char *s, uchar_t *out) +{ + const unsigned char *c = (const unsigned char*) s; + + if (c[0] <= 0x7F) { + /* 00..7F */ + *out = c[0]; + return 1; + } else if (c[0] <= 0xDF) { + /* C2..DF (unless input is invalid) */ + *out = ((uchar_t)c[0] & 0x1F) << 6 | + ((uchar_t)c[1] & 0x3F); + return 2; + } else if (c[0] <= 0xEF) { + /* E0..EF */ + *out = ((uchar_t)c[0] & 0xF) << 12 | + ((uchar_t)c[1] & 0x3F) << 6 | + ((uchar_t)c[2] & 0x3F); + return 3; + } else { + /* F0..F4 (unless input is invalid) */ + *out = ((uchar_t)c[0] & 0x7) << 18 | + ((uchar_t)c[1] & 0x3F) << 12 | + ((uchar_t)c[2] & 0x3F) << 6 | + ((uchar_t)c[3] & 0x3F); + return 4; + } +} + +int utf8_write_char(uchar_t unicode, char *out) +{ + unsigned char *o = (unsigned char*) out; + + if (unicode <= 0x7F) { + /* U+0000..U+007F */ + *o++ = unicode; + return 1; + } else if (unicode <= 0x7FF) { + /* U+0080..U+07FF */ + *o++ = 0xC0 | unicode >> 6; + *o++ = 0x80 | (unicode & 0x3F); + return 2; + } else if (unicode <= 0xFFFF) { + /* U+0800..U+FFFF */ + if (unicode >= 0xD800 && unicode <= 0xDFFF) + unicode = REPLACEMENT_CHARACTER; + three_byte_character: + *o++ = 0xE0 | unicode >> 12; + *o++ = 0x80 | (unicode >> 6 & 0x3F); + *o++ = 0x80 | (unicode & 0x3F); + return 3; + } else if (unicode <= 0x10FFFF) { + /* U+10000..U+10FFFF */ + *o++ = 0xF0 | unicode >> 18; + *o++ = 0x80 | (unicode >> 12 & 0x3F); + *o++ = 0x80 | (unicode >> 6 & 0x3F); + *o++ = 0x80 | (unicode & 0x3F); + return 4; + } else { + /* U+110000... */ + unicode = REPLACEMENT_CHARACTER; + goto three_byte_character; + } +} + +uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc) +{ + if (uc >= 0xD800 && uc <= 0xDBFF && lc >= 0xDC00 && lc <= 0xDFFF) + return 0x10000 + ((((uchar_t)uc & 0x3FF) << 10) | (lc & 0x3FF)); + else + return REPLACEMENT_CHARACTER; +} + +bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc) +{ + if (unicode >= 0x10000 && unicode <= 0x10FFFF) { + uchar_t n = unicode - 0x10000; + *uc = ((n >> 10) & 0x3FF) | 0xD800; + *lc = (n & 0x3FF) | 0xDC00; + return true; + } else { + *uc = *lc = REPLACEMENT_CHARACTER; + return false; + } +}