X-Git-Url: http://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Fcharset%2Fcharset.c;h=c8efbb3979ad0d81a864cfb13aef6b3aa1d8f6da;hp=756080138d23110e26aa97c6fc3b0a0964b488ce;hb=56023cca5f66a40646a1e807c3d10af6e5913623;hpb=455572f3e5a66e8a02f38458524fad651eb46489

Summary of this change (derived from the hunks below):

  * utf8_validate() now walks the buffer one character at a time by calling
    the new utf8_validate_char(s, e), which returns the length in bytes of
    the valid UTF-8 character starting at s, or 0 if the character is
    invalid or clipped by e.
  * The global utf8_allow_surrogates is removed; sequences encoding
    U+D800..U+DFFF are now always rejected.
  * U+FFFE and U+FFFF, which the old validator rejected, are now accepted
    (the Unicode "non-characters" are permitted).
  * New helpers are added: utf8_read_char(), utf8_write_char(),
    from_surrogate_pair() and to_surrogate_pair().

Notes for reviewers:

  * utf8_validate_char() reads *s before looking at e, so it must be
    called with s < e (utf8_validate() guarantees this).
  * utf8_read_char() does no validation; it decodes according to the lead
    byte only and is meant for input that has already been validated.
  * utf8_write_char() emits REPLACEMENT_CHARACTER for surrogates and for
    values above U+10FFFF.

diff --git a/ccan/charset/charset.c b/ccan/charset/charset.c
index 75608013..c8efbb39 100644
--- a/ccan/charset/charset.c
+++ b/ccan/charset/charset.c
@@ -1,5 +1,5 @@
 /*
-  Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
+  Copyright (C) 2011 Joseph A. Adams (joeyadams3.14159@gmail.com)
   All rights reserved.
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -22,79 +22,187 @@
 */
 
 #include "charset.h"
+#include <assert.h>
 
-bool utf8_allow_surrogates = false;
 
 bool utf8_validate(const char *str, size_t length)
 {
-	const unsigned char *s = (const unsigned char*)str;
-	const unsigned char *e = s + length;
+	const char *s = str;
+	const char *e = str + length;
+	int len;
 
-	while (s < e) {
-		unsigned char c = *s++;
-		unsigned int len; /* number of bytes in sequence - 2 */
+	for (; s < e; s += len) {
+		len = utf8_validate_char(s, e);
+		if (len == 0)
+			return false;
+	}
+	assert(s == e);
+
+	return true;
+}
+
+/*
+ * This function implements the syntax given in RFC3629, which is
+ * the same as that given in The Unicode Standard, Version 6.0.
+ *
+ * It has the following properties:
+ *
+ *  * All codepoints U+0000..U+10FFFF may be encoded,
+ *    except for U+D800..U+DFFF, which are reserved
+ *    for UTF-16 surrogate pair encoding.
+ *  * UTF-8 byte sequences longer than 4 bytes are not permitted,
+ *    as they exceed the range of Unicode.
+ *  * The sixty-six Unicode "non-characters" are permitted
+ *    (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
+ */
+int utf8_validate_char(const char *s, const char *e)
+{
+	unsigned char c = *s++;
+
+	if (c <= 0x7F) { /* 00..7F */
+		return 1;
+	} else if (c <= 0xC1) { /* 80..C1 */
+		/* Disallow overlong 2-byte sequence. */
+		return 0;
+	} else if (c <= 0xDF) { /* C2..DF */
+		/* Make sure the character isn't clipped. */
+		if (e - s < 1)
+			return 0;
+
+		/* Make sure subsequent byte is in the range 0x80..0xBF. */
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
 
-		/* If character is ASCII, move on. */
-		if (c < 0x80)
-			continue;
+		return 2;
+	} else if (c <= 0xEF) { /* E0..EF */
+		/* Make sure the character isn't clipped. */
+		if (e - s < 2)
+			return 0;
 
-		if (s >= e)
-			return false; /* Missing bytes in sequence. */
+		/* Disallow overlong 3-byte sequence. */
+		if (c == 0xE0 && (unsigned char)*s < 0xA0)
+			return 0;
 
-		if (c < 0xE0) {
-			/* 2-byte sequence, U+0080 to U+07FF
-			   c must be 11000010 or higher
-			   s[0] must be 10xxxxxx */
-			len = 0;
-			if (c < 0xC2)
-				return false;
-		} else if (c < 0xF0) {
-			/* 3-byte sequence, U+0800 to U+FFFF
-			   Note that the surrogate range is U+D800 to U+DFFF,
-			      and that U+FFFE and U+FFFF are illegal characters.
-			   c must be >= 11100000 (which it is)
-			   If c is 11100000, then s[0] must be >= 10100000
-			   If the global parameter utf8_allow_surrogates is false:
-			      If c is 11101101 and s[0] is >= 10100000,
-			         then this is a surrogate and we should fail.
-			   If c is 11101111, s[0] is 10111111, and s[1] >= 10111110,
-			      then this is an illegal character and we should fail.
-			   s[0] and s[1] must be 10xxxxxx */
-			len = 1;
-			if (c == 0xE0 && *s < 0xA0)
-				return false;
-			if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
-				return false;
-			if (c == 0xEF && s[0] == 0xBF && (s+1 >= e || s[1] >= 0xBE))
-				return false;
-		} else {
-			/* 4-byte sequence, U+010000 to U+10FFFF
-			   c must be >= 11110000 (which it is) and <= 11110100
-			   If c is 11110000, then s[0] must be >= 10010000
-			   If c is 11110100, then s[0] must be < 10010000
-			   s[0], s[1], and s[2] must be 10xxxxxx */
-			len = 2;
-			if (c > 0xF4)
-				return false;
-			if (c == 0xF0 && *s < 0x90)
-				return false;
-			if (c == 0xF4 && *s >= 0x90)
-				return false;
-		}
+		/* Disallow U+D800..U+DFFF. */
+		if (c == 0xED && (unsigned char)*s > 0x9F)
+			return 0;
 
-		if (s + len >= e)
-			return false; /* Missing bytes in sequence. */
+		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
 
-		do {
-			if ((*s++ & 0xC0) != 0x80)
-				return false;
-		} while (len--);
+		return 3;
+	} else if (c <= 0xF4) { /* F0..F4 */
+		/* Make sure the character isn't clipped. */
+		if (e - s < 3)
+			return 0;
+
+		/* Disallow overlong 4-byte sequence. */
+		if (c == 0xF0 && (unsigned char)*s < 0x90)
+			return 0;
+
+		/* Disallow codepoints beyond U+10FFFF. */
+		if (c == 0xF4 && (unsigned char)*s > 0x8F)
+			return 0;
+
+		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
+
+		return 4;
+	} else { /* F5..FF */
+		return 0;
 	}
-
-	return true;
 }
 
-/*
-  Note to future contributors: These routines are currently all under the
-  MIT license. It would be nice to keep it that way :)
-*/
+int utf8_read_char(const char *s, uchar_t *out)
+{
+	const unsigned char *c = (const unsigned char*) s;
+
+	if (c[0] <= 0x7F) {
+		/* 00..7F */
+		*out = c[0];
+		return 1;
+	} else if (c[0] <= 0xDF) {
+		/* C2..DF (unless input is invalid) */
+		*out = ((uchar_t)c[0] & 0x1F) << 6 |
+		       ((uchar_t)c[1] & 0x3F);
+		return 2;
+	} else if (c[0] <= 0xEF) {
+		/* E0..EF */
+		*out = ((uchar_t)c[0] &  0xF) << 12 |
+		       ((uchar_t)c[1] & 0x3F) << 6  |
+		       ((uchar_t)c[2] & 0x3F);
+		return 3;
+	} else {
+		/* F0..F4 (unless input is invalid) */
+		*out = ((uchar_t)c[0] &  0x7) << 18 |
+		       ((uchar_t)c[1] & 0x3F) << 12 |
+		       ((uchar_t)c[2] & 0x3F) << 6  |
+		       ((uchar_t)c[3] & 0x3F);
+		return 4;
+	}
+}
+
+int utf8_write_char(uchar_t unicode, char *out)
+{
+	unsigned char *o = (unsigned char*) out;
+
+	if (unicode <= 0x7F) {
+		/* U+0000..U+007F */
+		*o++ = unicode;
+		return 1;
+	} else if (unicode <= 0x7FF) {
+		/* U+0080..U+07FF */
+		*o++ = 0xC0 | unicode >> 6;
+		*o++ = 0x80 | (unicode & 0x3F);
+		return 2;
+	} else if (unicode <= 0xFFFF) {
+		/* U+0800..U+FFFF */
+		if (unicode >= 0xD800 && unicode <= 0xDFFF)
+			unicode = REPLACEMENT_CHARACTER;
+	three_byte_character:
+		*o++ = 0xE0 | unicode >> 12;
+		*o++ = 0x80 | (unicode >> 6 & 0x3F);
+		*o++ = 0x80 | (unicode & 0x3F);
+		return 3;
+	} else if (unicode <= 0x10FFFF) {
+		/* U+10000..U+10FFFF */
+		*o++ = 0xF0 | unicode >> 18;
+		*o++ = 0x80 | (unicode >> 12 & 0x3F);
+		*o++ = 0x80 | (unicode >> 6 & 0x3F);
+		*o++ = 0x80 | (unicode & 0x3F);
+		return 4;
+	} else {
+		/* U+110000... */
+		unicode = REPLACEMENT_CHARACTER;
+		goto three_byte_character;
+	}
+}
+
+uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc)
+{
+	if (uc >= 0xD800 && uc <= 0xDBFF && lc >= 0xDC00 && lc <= 0xDFFF)
+		return 0x10000 + ((((uchar_t)uc & 0x3FF) << 10) | (lc & 0x3FF));
+	else
+		return REPLACEMENT_CHARACTER;
+}
+
+bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc)
+{
+	if (unicode >= 0x10000 && unicode <= 0x10FFFF) {
+		uchar_t n = unicode - 0x10000;
+		*uc = ((n >> 10) & 0x3FF) | 0xD800;
+		*lc = (n & 0x3FF) | 0xDC00;
+		return true;
+	} else {
+		*uc = *lc = REPLACEMENT_CHARACTER;
+		return false;
+	}
+}