X-Git-Url: http://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Fcharset%2Fcharset.c;h=c8efbb3979ad0d81a864cfb13aef6b3aa1d8f6da;hp=756080138d23110e26aa97c6fc3b0a0964b488ce;hb=56023cca5f66a40646a1e807c3d10af6e5913623;hpb=455572f3e5a66e8a02f38458524fad651eb46489

diff --git a/ccan/charset/charset.c b/ccan/charset/charset.c
index 75608013..c8efbb39 100644
--- a/ccan/charset/charset.c
+++ b/ccan/charset/charset.c
@@ -1,5 +1,5 @@
 /*
-  Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
+  Copyright (C) 2011 Joseph A. Adams (joeyadams3.14159@gmail.com)
   All rights reserved.
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -22,79 +22,187 @@
 */
 
 #include "charset.h"
+#include <assert.h>
 
-bool utf8_allow_surrogates = false;
 
 bool utf8_validate(const char *str, size_t length)
 {
-	const unsigned char *s = (const unsigned char*)str;
-	const unsigned char *e = s + length;
+	const char *s = str;
+	const char *e = str + length; /* one past the last byte of the input */
+	int len; /* byte length of the character just validated; assigned in the loop body */
 	
-	while (s < e) {
-		unsigned char c = *s++;
-		unsigned int len; /* number of bytes in sequence - 2 */
+	for (; s < e; s += len) {
+		len = utf8_validate_char(s, e);
+		if (len == 0) /* invalid or clipped character at s */
+			return false;
+	}
+	assert(s == e); /* utf8_validate_char only reports a length that fits before e */
+	
+	return true;
+}
+
+/*
+ * Validate the UTF-8 character at s (e is one past the end of the buffer).
+ * Returns its length in bytes (1..4), or 0 if it is invalid or clipped by e.
+ * The lead byte is read unchecked, so the caller must ensure s < e.
+ *
+ * This implements the syntax given in RFC3629, which is the same as that
+ * given in The Unicode Standard, Version 6.0:
+ *
+ *  * All codepoints U+0000..U+10FFFF may be encoded, except for
+ *    U+D800..U+DFFF, which are reserved for UTF-16 surrogate pairs.
+ *  * Sequences longer than 4 bytes are not permitted (beyond Unicode's range).
+ *  * The sixty-six Unicode "non-characters" are permitted
+ *    (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
+ */
+int utf8_validate_char(const char *s, const char *e)
+{
+	unsigned char c = *s++; /* lead byte; s now points at the first trailing byte */
+	
+	if (c <= 0x7F) {        /* 00..7F */
+		return 1;
+	} else if (c <= 0xC1) { /* 80..C1 */
+		/* 80..BF cannot start a character; C0..C1 would be an overlong 2-byte form. */
+		return 0;
+	} else if (c <= 0xDF) { /* C2..DF */
+		/* Make sure the character isn't clipped: 1 more byte is needed. */
+		if (e - s < 1)
+			return 0;
+		
+		/* Make sure subsequent byte is in the range 0x80..0xBF. */
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
 		
-		/* If character is ASCII, move on. */
-		if (c < 0x80)
-			continue;
+		return 2;
+	} else if (c <= 0xEF) { /* E0..EF */
+		/* Make sure the character isn't clipped: 2 more bytes are needed. */
+		if (e - s < 2)
+			return 0;
 		
-		if (s >= e)
-			return false; /* Missing bytes in sequence. */
+		/* Disallow overlong 3-byte sequence. */
+		if (c == 0xE0 && (unsigned char)*s < 0xA0)
+			return 0;
 		
-		if (c < 0xE0) {
-			/* 2-byte sequence, U+0080 to U+07FF
-			   c must be 11000010 or higher
-			   s[0] must be 10xxxxxx */
-			len = 0;
-			if (c < 0xC2)
-				return false;
-		} else if (c < 0xF0) {
-			/* 3-byte sequence, U+0800 to U+FFFF
-			   Note that the surrogate range is U+D800 to U+DFFF,
-				  and that U+FFFE and U+FFFF are illegal characters.
-			   c must be >= 11100000 (which it is)
-			   If c is 11100000, then s[0] must be >= 10100000
-			   If the global parameter utf8_allow_surrogates is false:
-			      If c is 11101101 and s[0] is >= 10100000,
-			         then this is a surrogate and we should fail.
-			   If c is 11101111, s[0] is 10111111, and s[1] >= 10111110,
-				  then this is an illegal character and we should fail.
-			   s[0] and s[1] must be 10xxxxxx */
-			len = 1;
-			if (c == 0xE0 && *s < 0xA0)
-				return false;
-			if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
-				return false;
-			if (c == 0xEF && s[0] == 0xBF && (s+1 >= e || s[1] >= 0xBE))
-				return false;
-		} else {
-			/* 4-byte sequence, U+010000 to U+10FFFF
-			   c must be >= 11110000 (which it is) and <= 11110100
-			   If c is 11110000, then s[0] must be >= 10010000
-			   If c is 11110100, then s[0] must be < 10010000
-			   s[0], s[1], and s[2] must be 10xxxxxx */
-			len = 2;
-			if (c > 0xF4)
-				return false;
-			if (c == 0xF0 && *s < 0x90)
-				return false;
-			if (c == 0xF4 && *s >= 0x90)
-				return false;
-		}
+		/* Disallow U+D800..U+DFFF (reserved for UTF-16 surrogates). */
+		if (c == 0xED && (unsigned char)*s > 0x9F)
+			return 0;
 		
-		if (s + len >= e)
-			return false; /* Missing bytes in sequence. */
+		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
 		
-		do {
-			if ((*s++ & 0xC0) != 0x80)
-				return false;
-		} while (len--);
+		return 3;
+	} else if (c <= 0xF4) { /* F0..F4 */
+		/* Make sure the character isn't clipped: 3 more bytes are needed. */
+		if (e - s < 3)
+			return 0;
+		
+		/* Disallow overlong 4-byte sequence. */
+		if (c == 0xF0 && (unsigned char)*s < 0x90)
+			return 0;
+		
+		/* Disallow codepoints beyond U+10FFFF. */
+		if (c == 0xF4 && (unsigned char)*s > 0x8F)
+			return 0;
+		
+		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
+		
+		return 4;
+	} else {                /* F5..FF: never accepted as a lead byte */
+		return 0;
 	}
-	
-	return true;
 }
 
-/*
-  Note to future contributors: These routines are currently all under the
-    MIT license.  It would be nice to keep it that way :)
-*/
+/*
+ * Decode the UTF-8 character at s into *out and return its length in bytes.
+ * Nothing is validated or bounds-checked: the length is chosen from the
+ * lead byte alone, so s is expected to point at a valid character.
+ */
+int utf8_read_char(const char *s, uchar_t *out)
+{
+	const unsigned char *p = (const unsigned char*) s;
+	uchar_t cp = p[0];
+	int n, i;
+
+	if (cp <= 0x7F) {		/* 00..7F */
+		n = 1;
+	} else if (cp <= 0xDF) {	/* C2..DF (unless input is invalid) */
+		n = 2;
+		cp &= 0x1F;
+	} else if (cp <= 0xEF) {	/* E0..EF */
+		n = 3;
+		cp &= 0xF;
+	} else {			/* F0..F4 (unless input is invalid) */
+		n = 4;
+		cp &= 0x7;
+	}
+	for (i = 1; i < n; i++)		/* each trailing byte adds its low 6 bits */
+		cp = cp << 6 | ((uchar_t)p[i] & 0x3F);
+	*out = cp;
+	return n;
+}
+
+/*
+ * Encode unicode as UTF-8 into out and return the number of bytes written
+ * (1..4); out needs room for up to 4 bytes and no terminator is added.
+ * Surrogates (U+D800..U+DFFF) and values above U+10FFFF are replaced by
+ * REPLACEMENT_CHARACTER, which is always emitted as a three-byte sequence.
+ */
+int utf8_write_char(uchar_t unicode, char *out)
+{
+	unsigned char *dst = (unsigned char*) out;
+
+	if (unicode <= 0x7F) {				/* U+0000..U+007F */
+		dst[0] = unicode;
+		return 1;
+	}
+	if (unicode <= 0x7FF) {				/* U+0080..U+07FF */
+		dst[0] = 0xC0 | (unicode >> 6);
+		dst[1] = 0x80 | (unicode & 0x3F);
+		return 2;
+	}
+	if (unicode > 0xFFFF && unicode <= 0x10FFFF) {	/* U+10000..U+10FFFF */
+		dst[0] = 0xF0 | (unicode >> 18);
+		dst[1] = 0x80 | ((unicode >> 12) & 0x3F);
+		dst[2] = 0x80 | ((unicode >> 6) & 0x3F);
+		dst[3] = 0x80 | (unicode & 0x3F);
+		return 4;
+	}
+
+	/* What is left: U+0800..U+FFFF, or a value beyond U+10FFFF. */
+	if (unicode > 0x10FFFF || (unicode >= 0xD800 && unicode <= 0xDFFF))
+		unicode = REPLACEMENT_CHARACTER;
+	dst[0] = 0xE0 | (unicode >> 12);
+	dst[1] = 0x80 | ((unicode >> 6) & 0x3F);
+	dst[2] = 0x80 | (unicode & 0x3F);
+	return 3;
+}
+
+/* Combine surrogates uc (D800..DBFF) and lc (DC00..DFFF); REPLACEMENT_CHARACTER if either is out of range. */
+uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc)
+{
+	if (uc < 0xD800 || uc > 0xDBFF || lc < 0xDC00 || lc > 0xDFFF)
+		return REPLACEMENT_CHARACTER;
+	return 0x10000 + (((uchar_t)(uc - 0xD800) << 10) | (lc - 0xDC00));
+}
+
+/* Split unicode (U+10000..U+10FFFF) into surrogates *uc (D800-based) and *lc (DC00-based), returning true.
+ * For any other value both outputs get REPLACEMENT_CHARACTER and false is returned. */
+bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc)
+{
+	if (unicode < 0x10000 || unicode > 0x10FFFF) {
+		*uc = *lc = REPLACEMENT_CHARACTER;
+		return false;
+	}
+	*uc = 0xD800 | (((unicode - 0x10000) >> 10) & 0x3FF);
+	*lc = 0xDC00 | ((unicode - 0x10000) & 0x3FF);
+	return true;
+}