git.ozlabs.org Git - ccan/blob - ccan/utf8/utf8.c
utf8: don't allow NUL in decoded strings.
[ccan] / ccan / utf8 / utf8.c
1 /* MIT (BSD) license - see LICENSE file for details */
2 #include <ccan/utf8/utf8.h>
3 #include <errno.h>
4 #include <stdlib.h>
5
6 /* I loved this table, so I stole it: */
7 /*
8  * Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
9  * <https://github.com/chansen/c-utf8-valid>
10  * All rights reserved.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions are met:
14  *
15  * 1. Redistributions of source code must retain the above copyright notice, this
16  *    list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright notice,
18  *    this list of conditions and the following disclaimer in the documentation
19  *    and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
25  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
28  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 /*
33  *    UTF-8 Encoding Form
34  *
35  *    U+0000..U+007F       0xxxxxxx                             <= 7 bits
36  *    U+0080..U+07FF       110xxxxx 10xxxxxx                    <= 11 bits
37  *    U+0800..U+FFFF       1110xxxx 10xxxxxx 10xxxxxx           <= 16 bits
38  *   U+10000..U+10FFFF     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  <= 21 bits
39  *
40  *
41  *    U+0000..U+007F       00..7F
42  *                      N  C0..C1  80..BF                   1100000x 10xxxxxx
43  *    U+0080..U+07FF       C2..DF  80..BF
44  *                      N  E0      80..9F  80..BF           11100000 100xxxxx
45  *    U+0800..U+0FFF       E0      A0..BF  80..BF
46  *    U+1000..U+CFFF       E1..EC  80..BF  80..BF
47  *    U+D000..U+D7FF       ED      80..9F  80..BF
48  *                      S  ED      A0..BF  80..BF           11101101 101xxxxx
49  *    U+E000..U+FFFF       EE..EF  80..BF  80..BF
50  *                      N  F0      80..8F  80..BF  80..BF   11110000 1000xxxx
51  *   U+10000..U+3FFFF      F0      90..BF  80..BF  80..BF
52  *   U+40000..U+FFFFF      F1..F3  80..BF  80..BF  80..BF
53  *  U+100000..U+10FFFF     F4      80..8F  80..BF  80..BF   11110100 1000xxxx
54  *
55  *  Legend:
56  *    N = Non-shortest form
57  *    S = Surrogates
58  */
59 bool utf8_decode(struct utf8_state *utf8_state, char c)
60 {
61         if (utf8_state->used_len == utf8_state->total_len) {
62                 utf8_state->used_len = 1;
63                 /* First character in sequence. */
64                 if (((unsigned char)c & 0x80) == 0) {
65                         /* ASCII, easy. */
66                         if (c == 0)
67                                 goto bad_encoding;
68                         utf8_state->total_len = 1;
69                         utf8_state->c = c;
70                         goto finished_decoding;
71                 } else if (((unsigned char)c & 0xE0) == 0xC0) {
72                         utf8_state->total_len = 2;
73                         utf8_state->c = ((unsigned char)c & 0x1F);
74                         return false;
75                 } else if (((unsigned char)c & 0xF0) == 0xE0) {
76                         utf8_state->total_len = 3;
77                         utf8_state->c = ((unsigned char)c & 0x0F);
78                         return false;
79                 } else if (((unsigned char)c & 0xF8) == 0xF0) {
80                         utf8_state->total_len = 4;
81                         utf8_state->c = ((unsigned char)c & 0x07);
82                         return false;
83                 }
84                 goto bad_encoding;
85         }
86
87         if (((unsigned char)c & 0xC0) != 0x80)
88                 goto bad_encoding;
89
90         utf8_state->c <<= 6;
91         utf8_state->c |= ((unsigned char)c & 0x3F);
92         
93         utf8_state->used_len++;
94         if (utf8_state->used_len == utf8_state->total_len)
95                 goto finished_decoding;
96         return false;
97
98 finished_decoding:
99         if (utf8_state->c == 0 || utf8_state->c > 0x10FFFF)
100                 errno = ERANGE;
101         /* The UTF-16 "surrogate range": illegal in UTF-8 */
102         else if (utf8_state->total_len == 3
103                  && (utf8_state->c & 0xFFFFF800) == 0x0000D800)
104                 errno = ERANGE;
105         else {
106                 int min_bits;
107                 switch (utf8_state->total_len) {
108                 case 1:
109                         min_bits = 0;
110                         break;
111                 case 2:
112                         min_bits = 7;
113                         break;
114                 case 3:
115                         min_bits = 11;
116                         break;
117                 case 4:
118                         min_bits = 16;
119                         break;
120                 default:
121                         abort();
122                 }
123                 if ((utf8_state->c >> min_bits) == 0)
124                         errno = EFBIG;
125                 else
126                         errno = 0;
127         }
128         return true;
129
130 bad_encoding:
131         utf8_state->total_len = utf8_state->used_len;
132         errno = EINVAL;
133         return true;
134 }
135
136 size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN])
137 {
138         if ((point >> 7) == 0) {
139                 if (point == 0) {
140                         errno = ERANGE;
141                         return 0;
142                 }
143                 /* 0xxxxxxx */
144                 dest[0] = point;
145                 return 1;
146         }
147
148         if ((point >> 11) == 0) {
149                 /* 110xxxxx 10xxxxxx */
150                 dest[1] = 0x80 | (point & 0x3F);
151                 dest[0] = 0xC0 | (point >> 6);
152                 return 2;
153         }
154
155         if ((point >> 16) == 0) {
156                 if (point >= 0xD800 && point <= 0xDFFF) {
157                         errno = ERANGE;
158                         return 0;
159                 }
160                 /* 1110xxxx 10xxxxxx 10xxxxxx */
161                 dest[2] = 0x80 | (point & 0x3F);
162                 dest[1] = 0x80 | ((point >> 6) & 0x3F);
163                 dest[0] = 0xE0 | (point >> 12);
164                 return 3;
165         }
166
167         if (point > 0x10FFFF) {
168                 errno = ERANGE;
169                 return 0;
170         }
171
172         /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
173         dest[3] = 0x80 | (point & 0x3F);
174         dest[2] = 0x80 | ((point >> 6) & 0x3F);
175         dest[1] = 0x80 | ((point >> 12) & 0x3F);
176         dest[0] = 0xF0 | (point >> 18);
177         return 4;
178 }