1 /* MIT (BSD) license - see LICENSE file for details */
2 #include <ccan/utf8/utf8.h>
6 /* I loved this table, so I stole it: */
8 * Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
9 * <https://github.com/chansen/c-utf8-valid>
10 * All rights reserved.
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions are met:
15 * 1. Redistributions of source code must retain the above copyright notice, this
16 * list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright notice,
18 * this list of conditions and the following disclaimer in the documentation
19 * and/or other materials provided with the distribution.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 * U+0000..U+007F 0xxxxxxx <= 7 bits
36 * U+0080..U+07FF 110xxxxx 10xxxxxx <= 11 bits
37 * U+0800..U+FFFF 1110xxxx 10xxxxxx 10xxxxxx <= 16 bits
38 * U+10000..U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx <= 21 bits
41 * U+0000..U+007F 00..7F
42 * N C0..C1 80..BF 1100000x 10xxxxxx
43 * U+0080..U+07FF C2..DF 80..BF
44 * N E0 80..9F 80..BF 11100000 100xxxxx
45 * U+0800..U+0FFF E0 A0..BF 80..BF
46 * U+1000..U+CFFF E1..EC 80..BF 80..BF
47 * U+D000..U+D7FF ED 80..9F 80..BF
48 * S ED A0..BF 80..BF 11101101 101xxxxx
49 * U+E000..U+FFFF EE..EF 80..BF 80..BF
50 * N F0 80..8F 80..BF 80..BF 11110000 1000xxxx
51 * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
52 * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
53 * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 11110100 1000xxxx
56 * N = Non-shortest form, S = UTF-16 surrogate range (illegal in UTF-8)
/*
 * utf8_decode - feed one byte into an incremental UTF-8 decoder.
 * @utf8_state: decoder state; the visible code reads and writes its
 *              used_len, total_len and c fields.
 * @c: the next input byte.
 *
 * NOTE(review): this listing is an excerpt. Braces, goto labels, return
 * statements and several branch bodies are not visible here, so the return
 * value and the error reporting are deliberately not described. Confirm
 * against the full file.
 */
59 bool utf8_decode(struct utf8_state *utf8_state, char c)
/* used_len == total_len means no sequence is in progress: c is a lead byte. */
61 if (utf8_state->used_len == utf8_state->total_len) {
62 utf8_state->used_len = 1;
63 /* First character in sequence. */
/* Top bit clear (0xxxxxxx): a one-byte sequence, complete immediately. */
64 if (((unsigned char)c & 0x80) == 0) {
68 utf8_state->total_len = 1;
70 goto finished_decoding;
/* 110xxxxx: lead byte of a two-byte sequence; keep its 5 payload bits. */
71 } else if (((unsigned char)c & 0xE0) == 0xC0) {
72 utf8_state->total_len = 2;
73 utf8_state->c = ((unsigned char)c & 0x1F);
/* 1110xxxx: lead byte of a three-byte sequence; keep its 4 payload bits. */
75 } else if (((unsigned char)c & 0xF0) == 0xE0) {
76 utf8_state->total_len = 3;
77 utf8_state->c = ((unsigned char)c & 0x0F);
/* 11110xxx: lead byte of a four-byte sequence; keep its 3 payload bits. */
79 } else if (((unsigned char)c & 0xF8) == 0xF0) {
80 utf8_state->total_len = 4;
81 utf8_state->c = ((unsigned char)c & 0x07);
/*
 * Continuation bytes must look like 10xxxxxx. What happens on a mismatch is
 * not visible in this excerpt.
 */
87 if (((unsigned char)c & 0xC0) != 0x80)
/*
 * Merge the 6 payload bits of the continuation byte into the accumulator.
 * NOTE(review): no shift of utf8_state->c is visible in this excerpt before
 * this OR; confirm against the full file.
 */
91 utf8_state->c |= ((unsigned char)c & 0x3F);
/* One more byte consumed; the sequence is complete once used_len reaches total_len. */
93 utf8_state->used_len++;
94 if (utf8_state->used_len == utf8_state->total_len)
95 goto finished_decoding;
/*
 * Range check on the decoded value: zero and anything above U+10FFFF are
 * singled out. The action taken is not visible in this excerpt.
 */
99 if (utf8_state->c == 0 || utf8_state->c > 0x10FFFF)
101 /* The UTF-16 "surrogate range": illegal in UTF-8 */
/* The mask test matches exactly 0xD800..0xDFFF, checked for three-byte sequences only. */
102 else if (utf8_state->total_len == 3
103 && (utf8_state->c & 0xFFFFF800) == 0x0000D800)
/*
 * The case labels of this switch are not visible here. Presumably it picks
 * min_bits from the sequence length; verify against the full file.
 */
107 switch (utf8_state->total_len) {
/*
 * True when the value has no bits set at or above min_bits. This looks like
 * the non-shortest-form check (rows marked N in the table above); confirm.
 */
123 if ((utf8_state->c >> min_bits) == 0)
/*
 * Making total_len equal to used_len means the next call takes the
 * first-byte branch at the top of the function. NOTE(review): the path that
 * reaches this statement is not visible in this excerpt.
 */
131 utf8_state->total_len = utf8_state->used_len;
/*
 * utf8_encode - write the UTF-8 byte sequence for a code point into dest.
 * @point: the code point to encode.
 * @dest: output buffer of UTF8_MAX_LEN bytes.
 *
 * The visible branches choose a two-, three- or four-byte form from the
 * magnitude of @point and store the bytes from last to first.
 *
 * NOTE(review): this listing is an excerpt. The single-byte branch body, the
 * bodies of the range checks and every return statement are not visible, so
 * the return value is not described here (presumably a byte count; confirm
 * against the full file).
 */
136 size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN])
/* Fits in 7 bits; the body of this branch is not visible in this excerpt. */
138 if ((point >> 7) == 0) {
/* Fits in 11 bits: two-byte form. */
148 if ((point >> 11) == 0) {
149 /* 110xxxxx 10xxxxxx */
150 dest[1] = 0x80 | (point & 0x3F);
151 dest[0] = 0xC0 | (point >> 6);
/* Fits in 16 bits: three-byte form, with 0xD800..0xDFFF singled out first. */
155 if ((point >> 16) == 0) {
/* What is done for a value in 0xD800..0xDFFF is not visible in this excerpt. */
156 if (point >= 0xD800 && point <= 0xDFFF) {
160 /* 1110xxxx 10xxxxxx 10xxxxxx */
161 dest[2] = 0x80 | (point & 0x3F);
162 dest[1] = 0x80 | ((point >> 6) & 0x3F);
163 dest[0] = 0xE0 | (point >> 12);
/* Values above U+10FFFF are singled out; the action taken is not visible here. */
167 if (point > 0x10FFFF) {
172 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
173 dest[3] = 0x80 | (point & 0x3F);
174 dest[2] = 0x80 | ((point >> 6) & 0x3F);
175 dest[1] = 0x80 | ((point >> 12) & 0x3F);
176 dest[0] = 0xF0 | (point >> 18);