From ac8694de3ef34483ce02811c1ba45096ee547a5f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 18 Jun 2018 20:12:46 +0930 Subject: [PATCH 1/1] ccan/utf8: new module. Signed-off-by: Rusty Russell --- ccan/utf8/LICENSE | 1 + ccan/utf8/_info | 48 ++++++ ccan/utf8/test/run-decode.c | 266 +++++++++++++++++++++++++++++ ccan/utf8/test/run-encode-decode.c | 42 +++++ ccan/utf8/test/run-encode.c | 30 ++++ ccan/utf8/utf8.c | 176 +++++++++++++++++++ ccan/utf8/utf8.h | 54 ++++++ 7 files changed, 617 insertions(+) create mode 120000 ccan/utf8/LICENSE create mode 100644 ccan/utf8/_info create mode 100644 ccan/utf8/test/run-decode.c create mode 100644 ccan/utf8/test/run-encode-decode.c create mode 100644 ccan/utf8/test/run-encode.c create mode 100644 ccan/utf8/utf8.c create mode 100644 ccan/utf8/utf8.h diff --git a/ccan/utf8/LICENSE b/ccan/utf8/LICENSE new file mode 120000 index 00000000..2354d129 --- /dev/null +++ b/ccan/utf8/LICENSE @@ -0,0 +1 @@ +../../licenses/BSD-MIT \ No newline at end of file diff --git a/ccan/utf8/_info b/ccan/utf8/_info new file mode 100644 index 00000000..4fe4437b --- /dev/null +++ b/ccan/utf8/_info @@ -0,0 +1,48 @@ +#include "config.h" +#include +#include + +/** + * utf8 - Simple routines to encode/decode valid UTF-8. + * + * This code contains routines to encode and decode UTF-8 characters. + * Table and test code stolen entirely from: + * Copyright (c) 2017 Christian Hansen + * + * + * Example: + * int main(int argc, char *argv[]) + * { + * size_t i; + * struct utf8_state utf8_state = UTF8_STATE_INIT; + * bool decoded = true; + * + * for (i = 0; i < strlen(argv[1]); i++) { + * decoded = utf8_decode(&utf8_state, argv[1][i]); + * if (decoded) { + * if (errno != 0) + * err(1, "Invalid UTF8 char %zu-%zu", + * i - utf8_state.used_len, i); + * printf("Character %u\n", utf8_state.c); + * } + * } + * + * if (!decoded) + * errx(1, "Incomplete UTF8"); + * return 0; + * } + * + * License: BSD-MIT + */ +int main(int argc, char *argv[]) +{ + /* Expect exactly one argument */ + if (argc != 2) + return 1; + + if (strcmp(argv[1], "depends") == 0) { + return 0; + } + + return 1; +} diff --git a/ccan/utf8/test/run-decode.c b/ccan/utf8/test/run-decode.c new file mode 100644 index 00000000..34ecb1d1 --- /dev/null +++ b/ccan/utf8/test/run-decode.c @@ -0,0 +1,266 @@ +#include +/* Include the C files directly. */ +#include +#include +#include + +/* Stolen from https://github.com/chansen/c-utf8-valid/blob/master/test.c */ + +/* + * UTF-8 + * + * U+0000..U+007F 00..7F + * n C0..C1 80..BF + * U+0080..U+07FF C2..DF 80..BF + * n E0 80..9F 80..BF + * U+0800..U+D7FF E0..ED A0..9F 80..BF + * U+D800..U+DFFF s ED A0..BF 80..BF + * U+E000..U+FFFF EE..EF 80..BF 80..BF + * n F0 80..8F 80..BF 80..BF + * U+0800..U+FFFF F0 80..8F A0..BF 80..BF + * U+10000..U+10FFFF F0..F4 90..8F 80..BF 80..BF + * + * U-110000..U-1FFFFF x F4..F7 90..BF 80..BF 80..BF + * xn F8 80..87 80..BF 80..BF 80..BF + * U-200000..U-3FFFFFF x F8..FB 88..BF 80..BF 80..BF 80..BF + * xn FC 80..83 80..BF 80..BF 80..BF 80..BF + * U-4000000..U-7FFFFFFF x FC..FD 84..BF 80..BF 80..BF 80..BF 80..BF + * + * Legend: + * n = Non-shortest form + * s = Surrogates + * x = Codepoints outside Unicode codespace + */ + +/* + * Encodes the given ordinal [0, 7FFFFFFF] using the UTF-8 encoding scheme + * to the given sequence length [1, 6]. This routine can be used to + * produce well-formed and ill-formed UTF-8. + * + * To encode a Unicode scalar value to a well-formed representation: + * + * [U+0000, U+007F] should be encoded to a sequence length of 1 + * [U+0080, U+07FF] should be encoded to a sequence length of 2 + * [U+0800, U+D7FF] should be encoded to a sequence length of 3 + * [U+E000, U+FFFF] should be encoded to a sequence length of 3 + * [U+10000, U+10FFFF] should be encoded to a sequence length of 4 + * + * To encode a Unicode scalar value to non-shortest form representation: + * + * [U+0000, U+007F] can be encoded to a sequence length of [2, 6] + * [U+0080, U+07FF] can be encoded to a sequence length of [3, 6] + * [U+0800, U+FFFF] can be encoded to a sequence length of [4, 6] + * + * To encode an ordinal outside of Unicode codespace: + * + * [110000, 1FFFFF] can be encoded to a sequence length of 4 + * [200000, 3FFFFFF] can be encoded to a sequence length of 5 + * [4000000, 7FFFFFFF] can be encoded to a sequence length of 6 + */ + +static char * +encode_ord(uint32_t ord, size_t len, char *dst) { + static const uint32_t kMask[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + static const uint32_t kMax[6] = { 1 << 7, 1 << 11, 1 << 16, + 1 << 21, 1 << 26, 1 << 31 }; + size_t i; + + assert(len >= 1); + assert(len <= 6); + assert(ord < kMax[len - 1]); + + for (i = len - 1; i > 0; i--) { + dst[i] = (ord & 0x3F) | 0x80; + ord >>= 6; + } + dst[0] = ord | kMask[len - 1]; + return dst; +} + +static int utf8_check(const char *src, size_t len) +{ + bool decoded = false; + struct utf8_state utf8_state = UTF8_STATE_INIT; + size_t i; + + for (i = 0; i < len; i++) { + decoded = utf8_decode(&utf8_state, src[i]); + if (decoded) { + if (errno != 0) + return errno; + } + } + if (!decoded) + return EMLINK; + return 0; +} + +static void +test_utf8(const char *src, size_t len, int exp_err, unsigned line) { + int got_err; + + assert(len <= 255); + + got_err = utf8_check(src, len); + + ok(got_err == exp_err, "Got result %i, expected %i at line %u", + got_err, exp_err, line); +} + +#define TEST_UTF8(src, len, exp) \ + test_utf8(src, len, exp, __LINE__) + + +static void +test_unicode_scalar_value(void) { + uint32_t ord; + char src[4]; + + /* Unicode scalar value [U+0000, U+007F] */ + for (ord = 0x0000; ord <= 0x007F; ord++) { + encode_ord(ord, 1, src); + TEST_UTF8(src, 1, ord ? 0 : ERANGE); + } + + /* + * Unicode scalar value [U+0080, U+07FF] + * The maximal subpart is the length of the truncated sequence + */ + for (ord = 0x0080; ord <= 0x07FF; ord++) { + encode_ord(ord, 2, src); + TEST_UTF8(src, 2, 0); + } + + /* + * Unicode scalar value [U+0800, U+D7FF] and [U+E000, U+FFFF] + * The maximal subpart is the length of the truncated sequence + */ + for (ord = 0x0800; ord <= 0xFFFF && (ord & 0xF800) != 0xD800; ord++) { + encode_ord(ord, 3, src); + + TEST_UTF8(src, 3, 0); + if ((ord % (1 << 6)) == 0) + TEST_UTF8(src, 2, EMLINK); + } + + /* + * Unicode scalar value [U+10000, U+10FFF] + * The maximal subpart is the length of the truncated sequence + */ + for (ord = 0x10000; ord <= 0x10FFFF; ord++) { + encode_ord(ord, 4, src); + + TEST_UTF8(src, 4, 0); + if ((ord % (1 << 6)) == 0) + TEST_UTF8(src, 3, EMLINK); + if ((ord % (1 << 12)) == 0) + TEST_UTF8(src, 2, EMLINK); + } +} + +static void +test_non_shortest_form(void) { + uint32_t ord; + char src[4]; + + /* + * Non-shortest form 2-byte sequence [U+0000, U+007F] + * The maximal subpart is 1-byte + */ + for (ord = 0x0001; ord <= 0x007F; ord++) { + encode_ord(ord, 2, src); + TEST_UTF8(src, 2, EFBIG); + } + + /* + * Non-shortest form 3-byte sequence [U+0000, U+07FF] + * The maximal subpart is 1-byte + */ + for (ord = 0x0001; ord <= 0x07FF; ord++) { + encode_ord(ord, 3, src); + + TEST_UTF8(src, 3, EFBIG); + if ((ord % (1 << 6)) == 0) + TEST_UTF8(src, 2, EMLINK); + } + + /* + * Non-shortest form 4-byte sequence [U+0000, U+FFFF] + * The maximal subpart is 1-byte + */ + for (ord = 0x0001; ord <= 0xFFFF; ord++) { + encode_ord(ord, 4, src); + + TEST_UTF8(src, 4, EFBIG); + if ((ord % (1 << 6)) == 0) + TEST_UTF8(src, 3, EMLINK); + if ((ord % (1 << 12)) == 0) + TEST_UTF8(src, 2, EMLINK); + } +} + +static void +test_non_unicode(void) { + uint32_t ord; + char src[4]; + + /* + * Code point outside Unicode codespace + * The maximal subpart is 1-byte + */ + for (ord = 0x110000; ord <= 0x1FFFFF; ord++) { + encode_ord(ord, 4, src); + + TEST_UTF8(src, 4, ERANGE); + if ((ord % (1 << 6)) == 0) + TEST_UTF8(src, 3, EMLINK); + if ((ord % (1 << 12)) == 0) + TEST_UTF8(src, 2, EMLINK); + } +} + +static void +test_surrogates(void) { + uint32_t ord; + char src[4]; + + /* + * Surrogates [U+D800, U+DFFF] + * The maximal subpart is 1-byte + */ + for (ord = 0xD800; ord <= 0xDFFF; ord++) { + encode_ord(ord, 3, src); + + TEST_UTF8(src, 3, ERANGE); + if ((ord % (1 << 6)) == 0) + TEST_UTF8(src, 2, EMLINK); + } +} + +static void +test_continuations(void) { + uint8_t ord; + char src[4]; + + /* + * Missplaced continuation [\x80, \xBF] + * The maximal subpart is 1-byte + */ + for (ord = 0x80; ord <= 0xBF; ord++) { + src[0] = ord; + TEST_UTF8(src, 1, EINVAL); + } +} + +int +main(int argc, char **argv) +{ + plan_tests(2190906); + test_unicode_scalar_value(); + test_surrogates(); + test_non_shortest_form(); + test_non_unicode(); + test_continuations(); + + return exit_status(); +} diff --git a/ccan/utf8/test/run-encode-decode.c b/ccan/utf8/test/run-encode-decode.c new file mode 100644 index 00000000..1dd71fdb --- /dev/null +++ b/ccan/utf8/test/run-encode-decode.c @@ -0,0 +1,42 @@ +#include +/* Include the C files directly. */ +#include +#include +#include + +static bool utf8_check(const char *src, size_t len) +{ + bool decoded = false; + struct utf8_state utf8_state = UTF8_STATE_INIT; + size_t i; + + for (i = 0; i < len; i++) { + decoded = utf8_decode(&utf8_state, src[i]); + if (decoded) { + if (errno != 0) + return false; + } + } + if (!decoded) + return false; + return true; +} + +int main(int argc, char **argv) +{ + int i; + char dest[UTF8_MAX_LEN]; + + plan_tests(0x10FFFF - (0xDFFF - 0xD7FF + 2)); + + for (i = 1; i < 0x10FFFF; i++) { + int len; + if (i >= 0xD7FF && i <= 0xDFFF) + continue; + len = utf8_encode(i, dest); + assert(len != 0); + ok1(utf8_check(dest, len)); + } + + return exit_status(); +} diff --git a/ccan/utf8/test/run-encode.c b/ccan/utf8/test/run-encode.c new file mode 100644 index 00000000..375e13f6 --- /dev/null +++ b/ccan/utf8/test/run-encode.c @@ -0,0 +1,30 @@ +#include +/* Include the C files directly. */ +#include +#include + +int main(int argc, char **argv) +{ + int i; + char dest[UTF8_MAX_LEN]; + + plan_tests(1 + 0x10FFFF + 1); + + for (i = 0; i < 1; i++) + ok1(utf8_encode(i, dest) == 0 && errno == ERANGE); + for (; i <= 0x7F; i++) + ok1(utf8_encode(i, dest) == 1); + for (; i <= 0x7FF; i++) + ok1(utf8_encode(i, dest) == 2); + for (; i <= 0xD7FF; i++) + ok1(utf8_encode(i, dest) == 3); + for (; i <= 0xDFFF; i++) + ok1(utf8_encode(i, dest) == 0 && errno == ERANGE); + for (; i <= 0xFFFF; i++) + ok1(utf8_encode(i, dest) == 3); + for (; i <= 0x10FFFF; i++) + ok1(utf8_encode(i, dest) == 4); + ok1(utf8_encode(i, dest) == 0 && errno == ERANGE); + + return exit_status(); +} diff --git a/ccan/utf8/utf8.c b/ccan/utf8/utf8.c new file mode 100644 index 00000000..346d2d95 --- /dev/null +++ b/ccan/utf8/utf8.c @@ -0,0 +1,176 @@ +/* MIT (BSD) license - see LICENSE file for details */ +#include +#include +#include + +/* I loved this table, so I stole it: */ +/* + * Copyright (c) 2017 Christian Hansen + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/* + * UTF-8 Encoding Form + * + * U+0000..U+007F 0xxxxxxx <= 7 bits + * U+0080..U+07FF 110xxxxx 10xxxxxx <= 11 bits + * U+0800..U+FFFF 1110xxxx 10xxxxxx 10xxxxxx <= 16 bits + * U+10000..U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx <= 21 bits + * + * + * U+0000..U+007F 00..7F + * N C0..C1 80..BF 1100000x 10xxxxxx + * U+0080..U+07FF C2..DF 80..BF + * N E0 80..9F 80..BF 11100000 100xxxxx + * U+0800..U+0FFF E0 A0..BF 80..BF + * U+1000..U+CFFF E1..EC 80..BF 80..BF + * U+D000..U+D7FF ED 80..9F 80..BF + * S ED A0..BF 80..BF 11101101 101xxxxx + * U+E000..U+FFFF EE..EF 80..BF 80..BF + * N F0 80..8F 80..BF 80..BF 11110000 1000xxxx + * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 11110100 1000xxxx + * + * Legend: + * N = Non-shortest form + * S = Surrogates + */ +bool utf8_decode(struct utf8_state *utf8_state, char c) +{ + if (utf8_state->used_len == utf8_state->total_len) { + utf8_state->used_len = 1; + /* First character in sequence. */ + if (((unsigned char)c & 0x80) == 0) { + /* ASCII, easy. */ + utf8_state->total_len = 1; + utf8_state->c = c; + goto finished_decoding; + } else if (((unsigned char)c & 0xE0) == 0xC0) { + utf8_state->total_len = 2; + utf8_state->c = ((unsigned char)c & 0x1F); + return false; + } else if (((unsigned char)c & 0xF0) == 0xE0) { + utf8_state->total_len = 3; + utf8_state->c = ((unsigned char)c & 0x0F); + return false; + } else if (((unsigned char)c & 0xF8) == 0xF0) { + utf8_state->total_len = 4; + utf8_state->c = ((unsigned char)c & 0x07); + return false; + } + goto bad_encoding; + } + + if (((unsigned char)c & 0xC0) != 0x80) + goto bad_encoding; + + utf8_state->c <<= 6; + utf8_state->c |= ((unsigned char)c & 0x3F); + + utf8_state->used_len++; + if (utf8_state->used_len == utf8_state->total_len) + goto finished_decoding; + return false; + +finished_decoding: + if (utf8_state->c == 0 || utf8_state->c > 0x10FFFF) + errno = ERANGE; + /* The UTF-16 "surrogate range": illegal in UTF-8 */ + else if (utf8_state->total_len == 3 + && (utf8_state->c & 0xFFFFF800) == 0x0000D800) + errno = ERANGE; + else { + int min_bits; + switch (utf8_state->total_len) { + case 1: + min_bits = 0; + break; + case 2: + min_bits = 7; + break; + case 3: + min_bits = 11; + break; + case 4: + min_bits = 16; + break; + default: + abort(); + } + if ((utf8_state->c >> min_bits) == 0) + errno = EFBIG; + else + errno = 0; + } + return true; + +bad_encoding: + utf8_state->total_len = utf8_state->used_len; + errno = EINVAL; + return true; +} + +size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN]) +{ + if ((point >> 7) == 0) { + if (point == 0) { + errno = ERANGE; + return 0; + } + /* 0xxxxxxx */ + dest[0] = point; + return 1; + } + + if ((point >> 11) == 0) { + /* 110xxxxx 10xxxxxx */ + dest[1] = 0x80 | (point & 0x3F); + dest[0] = 0xC0 | (point >> 6); + return 2; + } + + if ((point >> 16) == 0) { + if (point >= 0xD800 && point <= 0xDFFF) { + errno = ERANGE; + return 0; + } + /* 1110xxxx 10xxxxxx 10xxxxxx */ + dest[2] = 0x80 | (point & 0x3F); + dest[1] = 0x80 | ((point >> 6) & 0x3F); + dest[0] = 0xE0 | (point >> 12); + return 3; + } + + if (point > 0x10FFFF) { + errno = ERANGE; + return 0; + } + + /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + dest[3] = 0x80 | (point & 0x3F); + dest[2] = 0x80 | ((point >> 6) & 0x3F); + dest[1] = 0x80 | ((point >> 12) & 0x3F); + dest[0] = 0xF0 | (point >> 18); + return 4; +} diff --git a/ccan/utf8/utf8.h b/ccan/utf8/utf8.h new file mode 100644 index 00000000..a095f02e --- /dev/null +++ b/ccan/utf8/utf8.h @@ -0,0 +1,54 @@ +/* MIT (BSD) license - see LICENSE file for details */ +#ifndef CCAN_UTF8_H +#define CCAN_UTF8_H +#include +#include +#include + +/* Unicode is limited to 21 bits. */ +#define UTF8_MAX_LEN 4 + +struct utf8_state { + /* How many characters we are expecting as part of this Unicode point */ + uint16_t total_len; + /* How many characters we've already seen. */ + uint16_t used_len; + /* Compound character, aka Unicode point. */ + uint32_t c; +}; + +#define UTF8_STATE_INIT { 0, 0, 0 } + +static inline void utf8_state_init(struct utf8_state *utf8_state) +{ + memset(utf8_state, 0, sizeof(*utf8_state)); +} + +/** + * utf8_decode - continue UTF8 decoding with this character. + * @utf8_state - initialized UTF8 state. + * @c - the character. + * + * Returns false if it needs another character to give results. + * Otherwise returns true, @utf8_state can be reused without initializeation, + * and sets errno: + * 0: success + * EINVAL: bad encoding. + * EFBIG: not a minimal encoding. + * ERANGE: encoding of invalid character. + * + * You can extract the character from @utf8_state->c; @utf8_state->used_len + * indicates how many characters have been consumed. + */ +bool utf8_decode(struct utf8_state *utf8_state, char c); + +/** + * utf8_encode - encode a point into UTF8. + * @point - Unicode point to include. + * @dest - buffer to fill. + * + * Returns 0 if point was invalid, otherwise bytes of dest used. + * Sets errno to ERANGE if point was invalid. + */ +size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN]); +#endif /* CCAN_UTF8_H */ -- 2.39.2