1 /* MIT (BSD) license - see LICENSE file for details */
2 #include <ccan/utf8/utf8.h>
6 /* I loved this table, so I stole it: */
8 * Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
9 * <https://github.com/chansen/c-utf8-valid>
10 * All rights reserved.
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions are met:
15 * 1. Redistributions of source code must retain the above copyright notice, this
16 * list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright notice,
18 * this list of conditions and the following disclaimer in the documentation
19 * and/or other materials provided with the distribution.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 * U+0000..U+007F 0xxxxxxx <= 7 bits
36 * U+0080..U+07FF 110xxxxx 10xxxxxx <= 11 bits
37 * U+0800..U+FFFF 1110xxxx 10xxxxxx 10xxxxxx <= 16 bits
38 * U+10000..U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx <= 21 bits
41 * U+0000..U+007F 00..7F
42 * N C0..C1 80..BF 1100000x 10xxxxxx
43 * U+0080..U+07FF C2..DF 80..BF
44 * N E0 80..9F 80..BF 11100000 100xxxxx
45 * U+0800..U+0FFF E0 A0..BF 80..BF
46 * U+1000..U+CFFF E1..EC 80..BF 80..BF
47 * U+D000..U+D7FF ED 80..9F 80..BF
48 * S ED A0..BF 80..BF 11101101 101xxxxx
49 * U+E000..U+FFFF EE..EF 80..BF 80..BF
50 * N F0 80..8F 80..BF 80..BF 11110000 1000xxxx
51 * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
52 * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
53 * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 11110100 1000xxxx
56 * N = Non-shortest form
/*
 * utf8_decode - feed one byte of a UTF-8 stream into the incremental decoder.
 * @utf8_state: decoder state; its used_len, total_len and c fields are
 *	read and updated here.
 * @c: the next input byte.
 *
 * A new sequence begins whenever used_len == total_len.  The lead byte
 * selects the sequence length (1 to 4 bytes) and seeds the code point
 * accumulator utf8_state->c with its payload bits; every following byte must
 * be a continuation byte (10xxxxxx) and contributes its low six bits.  Once
 * used_len reaches total_len the accumulated code point is checked for zero,
 * for values above 0x10FFFF, for the UTF-16 surrogate range, and against a
 * minimum bit count tested via min_bits.
 *
 * NOTE(review): the return statements, the error reporting and the bodies
 * of several branches are not visible in this excerpt -- confirm the exact
 * return/error contract against the full file and the header.
 */
59 bool utf8_decode(struct utf8_state *utf8_state, char c)
/* No bytes pending from a previous sequence: this byte starts a new one. */
61 if (utf8_state->used_len == utf8_state->total_len) {
62 utf8_state->used_len = 1;
63 /* First character in sequence. */
/* 0xxxxxxx: top bit clear, a complete one-byte sequence. */
64 if (((unsigned char)c & 0x80) == 0) {
66 utf8_state->total_len = 1;
68 goto finished_decoding;
/* 110xxxxx: lead byte of a 2-byte sequence, 5 payload bits. */
69 } else if (((unsigned char)c & 0xE0) == 0xC0) {
70 utf8_state->total_len = 2;
71 utf8_state->c = ((unsigned char)c & 0x1F);
/* 1110xxxx: lead byte of a 3-byte sequence, 4 payload bits. */
73 } else if (((unsigned char)c & 0xF0) == 0xE0) {
74 utf8_state->total_len = 3;
75 utf8_state->c = ((unsigned char)c & 0x0F);
/* 11110xxx: lead byte of a 4-byte sequence, 3 payload bits. */
77 } else if (((unsigned char)c & 0xF8) == 0xF0) {
78 utf8_state->total_len = 4;
79 utf8_state->c = ((unsigned char)c & 0x07);
/*
 * Mid-sequence: the byte must look like 10xxxxxx.  What happens when it
 * does not is not visible in this excerpt.
 */
85 if (((unsigned char)c & 0xC0) != 0x80)
/* OR the six payload bits of the continuation byte into the code point. */
89 utf8_state->c |= ((unsigned char)c & 0x3F);
/* One more byte consumed; validate once the whole sequence has arrived. */
91 utf8_state->used_len++;
92 if (utf8_state->used_len == utf8_state->total_len)
93 goto finished_decoding;
/*
 * Tests for code point zero or a value beyond 0x10FFFF; the action taken
 * is not visible in this excerpt.
 */
97 if (utf8_state->c == 0 || utf8_state->c > 0x10FFFF)
99 /* The UTF-16 "surrogate range": illegal in UTF-8 */
/* Masking with 0xFFFFF800 matches exactly the values 0xD800..0xDFFF. */
100 else if (utf8_state->total_len == 3
101 && (utf8_state->c & 0xFFFFF800) == 0x0000D800)
/* NOTE(review): presumably picks min_bits per sequence length -- confirm. */
105 switch (utf8_state->total_len) {
/*
 * True when no bit at or above position min_bits is set.  Presumably this
 * detects the "non-shortest form" encodings listed in the table above
 * (value would have fit in a shorter sequence) -- confirm.
 */
121 if ((utf8_state->c >> min_bits) == 0)
/*
 * Making total_len equal to used_len satisfies the "start of a new
 * sequence" test at the top of this function on the next call.
 */
129 utf8_state->total_len = utf8_state->used_len;
/*
 * utf8_encode - write the UTF-8 encoding of a code point into a buffer.
 * @point: the code point to encode.
 * @dest: output buffer, declared with room for UTF8_MAX_LEN chars.
 *
 * The encoded form is chosen by how many significant bits @point has:
 * at most 7, at most 11, at most 16, or more.  In each multi-byte form the
 * trailing bytes are 0x80 plus six payload bits and the first byte carries
 * the length marker plus the remaining high bits.  Values in
 * 0xD800..0xDFFF and values above 0x10FFFF are tested for explicitly.
 *
 * NOTE(review): the return statements and the handling of the rejected
 * ranges are not visible in this excerpt; presumably the size_t result is
 * the number of bytes written -- confirm against the full file and header.
 */
134 size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN])
/* No bits above the low 7 are set: one-byte form (body not visible here). */
136 if ((point >> 7) == 0) {
/* Fits in 11 bits: two-byte form. */
146 if ((point >> 11) == 0) {
147 /* 110xxxxx 10xxxxxx */
/* Low six bits go in the continuation byte, the rest in the lead byte. */
148 dest[1] = 0x80 | (point & 0x3F);
149 dest[0] = 0xC0 | (point >> 6);
/* Fits in 16 bits: three-byte form. */
153 if ((point >> 16) == 0) {
/*
 * 0xD800..0xDFFF is singled out before encoding; the action taken is not
 * visible in this excerpt.
 */
154 if (point >= 0xD800 && point <= 0xDFFF) {
158 /* 1110xxxx 10xxxxxx 10xxxxxx */
/* Filled last byte first: bits 0-5, bits 6-11, then the top bits. */
159 dest[2] = 0x80 | (point & 0x3F);
160 dest[1] = 0x80 | ((point >> 6) & 0x3F);
161 dest[0] = 0xE0 | (point >> 12);
/*
 * Values above 0x10FFFF are singled out before the four-byte form; the
 * action taken is not visible in this excerpt.
 */
165 if (point > 0x10FFFF) {
170 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
/* Bits 0-5, 6-11 and 12-17 fill the trailing bytes; the rest go first. */
171 dest[3] = 0x80 | (point & 0x3F);
172 dest[2] = 0x80 | ((point >> 6) & 0x3F);
173 dest[1] = 0x80 | ((point >> 12) & 0x3F);
174 dest[0] = 0xF0 | (point >> 18);