/*
- Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
+ Copyright (C) 2011 Joseph A. Adams (joeyadams3.14159@gmail.com)
All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
*/
#include "charset.h"
+#include <assert.h>
+
+/*
+ * Check whether the length bytes starting at str are well-formed UTF-8.
+ *
+ * The buffer is walked one character at a time with utf8_validate_char();
+ * the first character it rejects (return value 0) makes the whole buffer
+ * invalid.  An empty buffer (length == 0) is reported as valid, because
+ * the loop body never runs.
+ *
+ * Returns true if every byte belongs to an accepted character, false
+ * otherwise.
+ */
+bool utf8_validate(const char *str, size_t length)
+{
+ const char *s = str;
+ const char *e = str + length; /* one past the last byte of the buffer */
+ int len; /* byte length of the character just validated */
+
+ for (; s < e; s += len) {
+ len = utf8_validate_char(s, e);
+ if (len == 0)
+ return false;
+ }
+ assert(s == e); /* clipped characters are rejected, so s cannot step past e */
+
+ return true;
+}
/*
* This function implements the syntax given in RFC3629, which is
* * The sixty-six Unicode "non-characters" are permitted
* (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
+ *
+ * Validate the single UTF-8 encoded character that begins at s.
+ * e points one past the last byte that may be read; it is used to reject
+ * a multi-byte sequence that would be cut off by the end of the buffer.
+ * The lead byte *s is read before any comparison with e, so the caller
+ * must pass s < e (utf8_validate() only calls this while s < e).
+ *
+ * Returns the length of the character in bytes (1 to 4) if it is valid,
+ * or 0 if it is malformed, overlong, a surrogate (U+D800..U+DFFF),
+ * above U+10FFFF, or clipped.
*/
-bool utf8_validate(const char *str, size_t length)
+int utf8_validate_char(const char *s, const char *e)
{
- const unsigned char *s = (const unsigned char*)str;
- const unsigned char *e = s + length;
+ unsigned char c = *s++; /* lead byte; s now points at the byte after it */
- while (s < e) {
- unsigned char c = *s++;
- unsigned char c2;
- int len_minus_two;
+ if (c <= 0x7F) { /* 00..7F */
+ return 1;
+ } else if (c <= 0xC1) { /* 80..C1 */
+ /* Disallow overlong 2-byte sequence. */
+ return 0; /* 80..BF (a continuation byte in lead position) is rejected here too */
+ } else if (c <= 0xDF) { /* C2..DF */
+ /* Make sure the character isn't clipped. */
+ if (e - s < 1) /* s is already past the lead byte: one more byte is needed */
+ return 0;
- /* Validate the first byte and determine the sequence length. */
- if (c <= 0x7F) /* 00..7F */
- continue;
- else if (c <= 0xC1) /* 80..C1 */
- return false;
- else if (c <= 0xDF) /* C2..DF */
- len_minus_two = 0;
- else if (c <= 0xEF) /* E0..EF */
- len_minus_two = 1;
- else if (c <= 0xF4) /* F0..F4 */
- len_minus_two = 2;
- else
- return false;
+ /* Make sure subsequent byte is in the range 0x80..0xBF. */
+ if (((unsigned char)*s++ & 0xC0) != 0x80)
+ return 0;
+ return 2;
+ } else if (c <= 0xEF) { /* E0..EF */
/* Make sure the character isn't clipped. */
- if (s + len_minus_two >= e)
- return false;
+ if (e - s < 2) /* two continuation bytes must be available */
+ return 0;
+
+ /* Disallow overlong 3-byte sequence. */
+ if (c == 0xE0 && (unsigned char)*s < 0xA0)
+ return 0;
+
+ /* Disallow U+D800..U+DFFF. */
+ if (c == 0xED && (unsigned char)*s > 0x9F)
+ return 0;
+
+ /* Make sure subsequent bytes are in the range 0x80..0xBF. */
+ if (((unsigned char)*s++ & 0xC0) != 0x80)
+ return 0;
+ if (((unsigned char)*s++ & 0xC0) != 0x80)
+ return 0;
- c2 = *s;
+ return 3;
+ } else if (c <= 0xF4) { /* F0..F4 */
+ /* Make sure the character isn't clipped. */
+ if (e - s < 3) /* three continuation bytes must be available */
+ return 0;
+
+ /* Disallow overlong 4-byte sequence. */
+ if (c == 0xF0 && (unsigned char)*s < 0x90)
+ return 0;
+
+ /* Disallow codepoints beyond U+10FFFF. */
+ if (c == 0xF4 && (unsigned char)*s > 0x8F)
+ return 0;
/* Make sure subsequent bytes are in the range 0x80..0xBF. */
- do {
- if ((*s++ & 0xC0) != 0x80)
- return false;
- } while (len_minus_two--);
+ if (((unsigned char)*s++ & 0xC0) != 0x80)
+ return 0;
+ if (((unsigned char)*s++ & 0xC0) != 0x80)
+ return 0;
+ if (((unsigned char)*s++ & 0xC0) != 0x80)
+ return 0;
- /* Handle special cases. */
- switch (c) {
- case 0xE0:
- /* Disallow overlong 3-byte sequence. */
- if (c2 < 0xA0)
- return false;
- break;
- case 0xED:
- /* Disallow U+D800..U+DFFF. */
- if (c2 > 0x9F)
- return false;
- break;
- case 0xF0:
- /* Disallow overlong 4-byte sequence. */
- if (c2 < 0x90)
- return false;
- break;
- case 0xF4:
- /* Disallow codepoints beyond U+10FFFF. */
- if (c2 > 0x8F)
- return false;
- break;
- }
+ return 4;
+ } else { /* F5..FF */
+ return 0; /* no lead byte in this range is accepted */
}
-
- return true;
}
int utf8_read_char(const char *s, uchar_t *out)