git.ozlabs.org Git - ccan/blob - ccan/utf8/utf8.c

   1 /* MIT (BSD) license - see LICENSE file for details */
   2 #include <ccan/utf8/utf8.h>
   3 #include <errno.h>
   4 #include <stdlib.h>
   5
   6 /* I loved this table, so I stole it: */
   7 /*
   8  * Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
   9  * <https://github.com/chansen/c-utf8-valid>
  10  * All rights reserved.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions are met:
  14  *
  15  * 1. Redistributions of source code must retain the above copyright notice, this
  16  *    list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright notice,
  18  *    this list of conditions and the following disclaimer in the documentation
  19  *    and/or other materials provided with the distribution.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  23  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  24  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
  25  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  26  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  28  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  30  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31  */
  32 /*
  33  *    UTF-8 Encoding Form
  34  *
  35  *    U+0000..U+007F       0xxxxxxx                             <= 7 bits
  36  *    U+0080..U+07FF       110xxxxx 10xxxxxx                    <= 11 bits
  37  *    U+0800..U+FFFF       1110xxxx 10xxxxxx 10xxxxxx           <= 16 bits
  38  *   U+10000..U+10FFFF     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  <= 21 bits
  39  *
  40  *
  41  *    U+0000..U+007F       00..7F
  42  *                      N  C0..C1  80..BF                   1100000x 10xxxxxx
  43  *    U+0080..U+07FF       C2..DF  80..BF
  44  *                      N  E0      80..9F  80..BF           11100000 100xxxxx
  45  *    U+0800..U+0FFF       E0      A0..BF  80..BF
  46  *    U+1000..U+CFFF       E1..EC  80..BF  80..BF
  47  *    U+D000..U+D7FF       ED      80..9F  80..BF
  48  *                      S  ED      A0..BF  80..BF           11101101 101xxxxx
  49  *    U+E000..U+FFFF       EE..EF  80..BF  80..BF
  50  *                      N  F0      80..8F  80..BF  80..BF   11110000 1000xxxx
  51  *   U+10000..U+3FFFF      F0      90..BF  80..BF  80..BF
  52  *   U+40000..U+FFFFF      F1..F3  80..BF  80..BF  80..BF
  53  *  U+100000..U+10FFFF     F4      80..8F  80..BF  80..BF   11110100 1000xxxx
  54  *
  55  *  Legend:
  56  *    N = Non-shortest form
  57  *    S = Surrogates
  58  */
  59 bool utf8_decode(struct utf8_state *utf8_state, char c)
  60 {
  61         if (utf8_state->used_len == utf8_state->total_len) {
  62                 utf8_state->used_len = 1;
  63                 /* First character in sequence. */
  64                 if (((unsigned char)c & 0x80) == 0) {
  65                         /* ASCII, easy. */
  66                         utf8_state->total_len = 1;
  67                         utf8_state->c = c;
  68                         goto finished_decoding;
  69                 } else if (((unsigned char)c & 0xE0) == 0xC0) {
  70                         utf8_state->total_len = 2;
  71                         utf8_state->c = ((unsigned char)c & 0x1F);
  72                         return false;
  73                 } else if (((unsigned char)c & 0xF0) == 0xE0) {
  74                         utf8_state->total_len = 3;
  75                         utf8_state->c = ((unsigned char)c & 0x0F);
  76                         return false;
  77                 } else if (((unsigned char)c & 0xF8) == 0xF0) {
  78                         utf8_state->total_len = 4;
  79                         utf8_state->c = ((unsigned char)c & 0x07);
  80                         return false;
  81                 }
  82                 goto bad_encoding;
  83         }
  84
  85         if (((unsigned char)c & 0xC0) != 0x80)
  86                 goto bad_encoding;
  87
  88         utf8_state->c <<= 6;
  89         utf8_state->c |= ((unsigned char)c & 0x3F);
  90
  91         utf8_state->used_len++;
  92         if (utf8_state->used_len == utf8_state->total_len)
  93                 goto finished_decoding;
  94         return false;
  95
  96 finished_decoding:
  97         if (utf8_state->c == 0 || utf8_state->c > 0x10FFFF)
  98                 errno = ERANGE;
  99         /* The UTF-16 "surrogate range": illegal in UTF-8 */
 100         else if (utf8_state->total_len == 3
 101                  && (utf8_state->c & 0xFFFFF800) == 0x0000D800)
 102                 errno = ERANGE;
 103         else {
 104                 int min_bits;
 105                 switch (utf8_state->total_len) {
 106                 case 1:
 107                         min_bits = 0;
 108                         break;
 109                 case 2:
 110                         min_bits = 7;
 111                         break;
 112                 case 3:
 113                         min_bits = 11;
 114                         break;
 115                 case 4:
 116                         min_bits = 16;
 117                         break;
 118                 default:
 119                         abort();
 120                 }
 121                 if ((utf8_state->c >> min_bits) == 0)
 122                         errno = EFBIG;
 123                 else
 124                         errno = 0;
 125         }
 126         return true;
 127
 128 bad_encoding:
 129         utf8_state->total_len = utf8_state->used_len;
 130         errno = EINVAL;
 131         return true;
 132 }
 133
 134 size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN])
 135 {
 136         if ((point >> 7) == 0) {
 137                 if (point == 0) {
 138                         errno = ERANGE;
 139                         return 0;
 140                 }
 141                 /* 0xxxxxxx */
 142                 dest[0] = point;
 143                 return 1;
 144         }
 145
 146         if ((point >> 11) == 0) {
 147                 /* 110xxxxx 10xxxxxx */
 148                 dest[1] = 0x80 | (point & 0x3F);
 149                 dest[0] = 0xC0 | (point >> 6);
 150                 return 2;
 151         }
 152
 153         if ((point >> 16) == 0) {
 154                 if (point >= 0xD800 && point <= 0xDFFF) {
 155                         errno = ERANGE;
 156                         return 0;
 157                 }
 158                 /* 1110xxxx 10xxxxxx 10xxxxxx */
 159                 dest[2] = 0x80 | (point & 0x3F);
 160                 dest[1] = 0x80 | ((point >> 6) & 0x3F);
 161                 dest[0] = 0xE0 | (point >> 12);
 162                 return 3;
 163         }
 164
 165         if (point > 0x10FFFF) {
 166                 errno = ERANGE;
 167                 return 0;
 168         }
 169
 170         /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
 171         dest[3] = 0x80 | (point & 0x3F);
 172         dest[2] = 0x80 | ((point >> 6) & 0x3F);
 173         dest[1] = 0x80 | ((point >> 12) & 0x3F);
 174         dest[0] = 0xF0 | (point >> 18);
 175         return 4;
 176 }