X-Git-Url: http://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Fcharset%2Fcharset.c;h=e585b45785f5c8cda1cc8075698080bf0afc0f21;hp=cd2035969222866c49617f1bd90d022ce82c3453;hb=12af7e37d7da88f23679c7d3e6962817cd6f18c3;hpb=06c4af3163e2bd99999a93a478d1308ea39c5a79

diff --git a/ccan/charset/charset.c b/ccan/charset/charset.c
index cd203596..e585b457 100644
--- a/ccan/charset/charset.c
+++ b/ccan/charset/charset.c
@@ -22,6 +22,24 @@
 */
 
 #include "charset.h"
+#include <assert.h>
+
+/* Returns true iff the `length` bytes at `str` consist solely of sequences accepted by utf8_validate_char(). */
+bool utf8_validate(const char *str, size_t length)
+{
+	const char *s = str;
+	const char *e = str + length; /* one past the last input byte */
+	int len;
+	/* Validate one encoded character per iteration, advancing by its byte length. */
+	for (; s < e; s += len) {
+		len = utf8_validate_char(s, e);
+		if (len == 0) /* 0 means an invalid or clipped sequence */
+			return false;
+	}
+	assert(s == e); /* an accepted character never extends past e */
+	
+	return true;
+}
 
 /*
  * This function implements the syntax given in RFC3629, which is
@@ -37,68 +55,70 @@
  *  * The sixty-six Unicode "non-characters" are permitted
  *    (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
  */
-bool utf8_validate(const char *str, size_t length)
+int utf8_validate_char(const char *s, const char *e)
 {
-	const unsigned char *s = (const unsigned char*)str;
-	const unsigned char *e = s + length;
+	unsigned char c = *s++; /* read without a bounds check: caller must guarantee s < e */
 	
-	while (s < e) {
-		unsigned char c = *s++;
-		unsigned char c2;
-		int len_minus_two;
+	if (c <= 0x7F) {        /* 00..7F */
+		return 1; /* the return value is the sequence length in bytes; 0 means invalid */
+	} else if (c <= 0xC1) { /* 80..C1 */
+		/* Reject stray continuation bytes (80..BF) and overlong 2-byte leads (C0..C1). */
+		return 0;
+	} else if (c <= 0xDF) { /* C2..DF */
+		/* Make sure the character isn't clipped (s already points past the lead byte). */
+		if (e - s < 1)
+			return 0;
 		
-		/* Validate the first byte and determine the sequence length. */
-		if (c <= 0x7F)          /* 00..7F */
-			continue;
-		else if (c <= 0xC1)     /* 80..C1 */
-			return false;
-		else if (c <= 0xDF)     /* C2..DF */
-			len_minus_two = 0;
-		else if (c <= 0xEF)     /* E0..EF */
-			len_minus_two = 1;
-		else if (c <= 0xF4)     /* F0..F4 */
-			len_minus_two = 2;
-		else
-			return false;
+		/* Make sure subsequent byte is in the range 0x80..0xBF. */
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
 		
+		return 2;
+	} else if (c <= 0xEF) { /* E0..EF */
 		/* Make sure the character isn't clipped. */
-		if (s + len_minus_two >= e)
-			return false;
+		if (e - s < 2)
+			return 0;
+		
+		/* Disallow overlong 3-byte sequence (E0 followed by a byte below A0). */
+		if (c == 0xE0 && (unsigned char)*s < 0xA0)
+			return 0;
+		
+		/* Disallow U+D800..U+DFFF (ED followed by a byte above 9F). */
+		if (c == 0xED && (unsigned char)*s > 0x9F)
+			return 0;
+		
+		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
 		
-		c2 = *s;
+		return 3;
+	} else if (c <= 0xF4) { /* F0..F4 */
+		/* Make sure the character isn't clipped. */
+		if (e - s < 3)
+			return 0;
+		
+		/* Disallow overlong 4-byte sequence (F0 followed by a byte below 90). */
+		if (c == 0xF0 && (unsigned char)*s < 0x90)
+			return 0;
+		
+		/* Disallow codepoints beyond U+10FFFF (F4 followed by a byte above 8F). */
+		if (c == 0xF4 && (unsigned char)*s > 0x8F)
+			return 0;
 		
 		/* Make sure subsequent bytes are in the range 0x80..0xBF. */
-		do {
-			if ((*s++ & 0xC0) != 0x80)
-				return false;
-		} while (len_minus_two--);
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
+		if (((unsigned char)*s++ & 0xC0) != 0x80)
+			return 0;
 		
-		/* Handle special cases. */
-		switch (c) {
-			case 0xE0:
-				/* Disallow overlong 3-byte sequence. */
-				if (c2 < 0xA0)
-					return false;
-				break;
-			case 0xED:
-				/* Disallow U+D800..U+DFFF. */
-				if (c2 > 0x9F)
-					return false;
-				break;
-			case 0xF0:
-				/* Disallow overlong 4-byte sequence. */
-				if (c2 < 0x90)
-					return false;
-				break;
-			case 0xF4:
-				/* Disallow codepoints beyond U+10FFFF. */
-				if (c2 > 0x8F)
-					return false;
-				break;
-		}
+		return 4;
+	} else {                /* F5..FF */
+		return 0; /* no lead byte above F4 is accepted */
 	}
-	
-	return true;
 }
 
 int utf8_read_char(const char *s, uchar_t *out)