]> git.ozlabs.org Git - ccan/commitdiff
ccan/utf8: new module.
authorRusty Russell <rusty@rustcorp.com.au>
Mon, 18 Jun 2018 10:42:46 +0000 (20:12 +0930)
committerRusty Russell <rusty@rustcorp.com.au>
Mon, 18 Jun 2018 10:42:46 +0000 (20:12 +0930)
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
ccan/utf8/LICENSE [new symlink]
ccan/utf8/_info [new file with mode: 0644]
ccan/utf8/test/run-decode.c [new file with mode: 0644]
ccan/utf8/test/run-encode-decode.c [new file with mode: 0644]
ccan/utf8/test/run-encode.c [new file with mode: 0644]
ccan/utf8/utf8.c [new file with mode: 0644]
ccan/utf8/utf8.h [new file with mode: 0644]

diff --git a/ccan/utf8/LICENSE b/ccan/utf8/LICENSE
new file mode 120000 (symlink)
index 0000000..2354d12
--- /dev/null
@@ -0,0 +1 @@
+../../licenses/BSD-MIT
\ No newline at end of file
diff --git a/ccan/utf8/_info b/ccan/utf8/_info
new file mode 100644 (file)
index 0000000..4fe4437
--- /dev/null
@@ -0,0 +1,48 @@
+#include "config.h"
+#include <stdio.h>
+#include <string.h>
+
+/**
+ * utf8 - Simple routines to encode/decode valid UTF-8.
+ *
+ * This code contains routines to encode and decode UTF-8 characters.
+ * Table and test code stolen entirely from:
+ *    Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
+ *    <https://github.com/chansen/c-utf8-valid>
+ * 
+ * Example:
+ *     int main(int argc, char *argv[])
+ *     {
+ *             size_t i;
+ *             struct utf8_state utf8_state = UTF8_STATE_INIT;
+ *             bool decoded = true;
+ *
+ *             for (i = 0; i < strlen(argv[1]); i++) {
+ *                     decoded = utf8_decode(&utf8_state, argv[1][i]);
+ *                     if (decoded) {
+ *                             if (errno != 0)
+ *                                     err(1, "Invalid UTF8 char %zu-%zu",
+ *                                         i - utf8_state.used_len, i);
+ *                             printf("Character %u\n", utf8_state.c);
+ *                     }
+ *             }
+ *
+ *             if (!decoded)
+ *                     errx(1, "Incomplete UTF8");
+ *             return 0;
+ *     }
+ *
+ * License: BSD-MIT
+ */
+int main(int argc, char *argv[])
+{
+       /* Expect exactly one argument */
+       if (argc != 2)
+               return 1;
+
+       if (strcmp(argv[1], "depends") == 0) {
+               return 0;
+       }
+
+       return 1;
+}
diff --git a/ccan/utf8/test/run-decode.c b/ccan/utf8/test/run-decode.c
new file mode 100644 (file)
index 0000000..34ecb1d
--- /dev/null
@@ -0,0 +1,266 @@
+#include <ccan/utf8/utf8.h>
+/* Include the C files directly. */
+#include <ccan/utf8/utf8.c>
+#include <ccan/tap/tap.h>
+#include <assert.h>
+
+/* Stolen from https://github.com/chansen/c-utf8-valid/blob/master/test.c */
+
+/*
+ *  UTF-8
+ *
+ *     U+0000..U+007F         00..7F
+ *                         n  C0..C1  80..BF
+ *     U+0080..U+07FF         C2..DF  80..BF
+ *                         n  E0      80..9F  80..BF
+ *     U+0800..U+D7FF         E0..ED  A0..9F  80..BF
+ *     U+D800..U+DFFF      s  ED      A0..BF  80..BF
+ *     U+E000..U+FFFF         EE..EF  80..BF  80..BF
+ *                         n  F0      80..8F  80..BF  80..BF
+ *     U+0800..U+FFFF         F0      80..8F  A0..BF  80..BF
+ *    U+10000..U+10FFFF       F0..F4  90..8F  80..BF  80..BF
+ *
+ *   U-110000..U-1FFFFF    x  F4..F7  90..BF  80..BF  80..BF
+ *                         xn F8      80..87  80..BF  80..BF  80..BF
+ *   U-200000..U-3FFFFFF   x  F8..FB  88..BF  80..BF  80..BF  80..BF
+ *                         xn FC      80..83  80..BF  80..BF  80..BF  80..BF
+ *  U-4000000..U-7FFFFFFF  x  FC..FD  84..BF  80..BF  80..BF  80..BF  80..BF
+ * 
+ *  Legend:
+ *    n = Non-shortest form
+ *    s = Surrogates
+ *    x = Codepoints outside Unicode codespace
+ */
+
+/*
+ *  Encodes the given ordinal [0, 7FFFFFFF] using the UTF-8 encoding scheme
+ *  to the given sequence length [1, 6]. This routine can be used to
+ *  produce well-formed and ill-formed UTF-8.
+ *
+ *  To encode a Unicode scalar value to a well-formed representation:
+ *
+ *   [U+0000, U+007F] should be encoded to a sequence length of 1
+ *   [U+0080, U+07FF] should be encoded to a sequence length of 2
+ *   [U+0800, U+D7FF] should be encoded to a sequence length of 3
+ *   [U+E000, U+FFFF] should be encoded to a sequence length of 3
+ *   [U+10000, U+10FFFF] should be encoded to a sequence length of 4
+ *
+ *  To encode a Unicode scalar value to non-shortest form representation:
+ *
+ *   [U+0000, U+007F] can be encoded to a sequence length of [2, 6]
+ *   [U+0080, U+07FF] can be encoded to a sequence length of [3, 6]
+ *   [U+0800, U+FFFF] can be encoded to a sequence length of [4, 6]
+ *
+ *  To encode an ordinal outside of Unicode codespace:
+ *
+ *   [110000, 1FFFFF] can be encoded to a sequence length of 4
+ *   [200000, 3FFFFFF] can be encoded to a sequence length of 5
+ *   [4000000, 7FFFFFFF] can be encoded to a sequence length of 6
+ */
+
+static char *
+encode_ord(uint32_t ord, size_t len, char *dst) {
+  static const uint32_t kMask[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+  static const uint32_t kMax[6]  = { 1 <<  7, 1 << 11, 1 << 16, 
+                                     1 << 21, 1 << 26, 1 << 31 };
+  size_t i;
+
+  assert(len >= 1);
+  assert(len <= 6);
+  assert(ord < kMax[len - 1]);
+
+  for (i = len - 1; i > 0; i--) {
+    dst[i] = (ord & 0x3F) | 0x80;
+    ord >>= 6;
+  }
+  dst[0] = ord | kMask[len - 1];
+  return dst;
+}
+
+static int utf8_check(const char *src, size_t len)
+{
+       bool decoded = false;
+       struct utf8_state utf8_state = UTF8_STATE_INIT;
+       size_t i;
+
+       for (i = 0; i < len; i++) {
+               decoded = utf8_decode(&utf8_state, src[i]);
+               if (decoded) {
+                       if (errno != 0)
+                               return errno;
+               }
+       }
+       if (!decoded)
+               return EMLINK;
+       return 0;
+}
+
+static void
+test_utf8(const char *src, size_t len, int exp_err, unsigned line) {
+  int got_err;
+
+  assert(len <= 255);
+
+  got_err = utf8_check(src, len);
+
+  ok(got_err == exp_err, "Got result %i, expected %i at line %u",
+     got_err, exp_err, line);
+}
+
+#define TEST_UTF8(src, len, exp) \
+  test_utf8(src, len, exp, __LINE__)
+
+
+static void
+test_unicode_scalar_value(void) {
+  uint32_t ord;
+  char src[4];
+
+  /* Unicode scalar value [U+0000, U+007F] */
+  for (ord = 0x0000; ord <= 0x007F; ord++) {
+    encode_ord(ord, 1, src);
+    TEST_UTF8(src, 1, ord ? 0 : ERANGE);
+  }
+
+  /*
+   * Unicode scalar value [U+0080, U+07FF]
+   * The maximal subpart is the length of the truncated sequence
+   */
+  for (ord = 0x0080; ord <= 0x07FF; ord++) {
+    encode_ord(ord, 2, src);
+    TEST_UTF8(src, 2, 0);
+  }
+
+  /*
+   * Unicode scalar value [U+0800, U+D7FF] and [U+E000, U+FFFF]
+   * The maximal subpart is the length of the truncated sequence
+   */
+  for (ord = 0x0800; ord <= 0xFFFF && (ord & 0xF800) != 0xD800; ord++) {
+    encode_ord(ord, 3, src);
+
+    TEST_UTF8(src, 3, 0);
+    if ((ord % (1 << 6)) == 0)
+      TEST_UTF8(src, 2, EMLINK);
+  }
+
+  /*
+   * Unicode scalar value [U+10000, U+10FFF]
+   * The maximal subpart is the length of the truncated sequence
+   */
+  for (ord = 0x10000; ord <= 0x10FFFF; ord++) {
+    encode_ord(ord, 4, src);
+
+    TEST_UTF8(src, 4, 0);
+    if ((ord % (1 << 6)) == 0)
+      TEST_UTF8(src, 3, EMLINK);
+    if ((ord % (1 << 12)) == 0)
+      TEST_UTF8(src, 2, EMLINK);
+  }
+}
+
+static void
+test_non_shortest_form(void) {
+  uint32_t ord;
+  char src[4];
+
+  /*
+   * Non-shortest form 2-byte sequence [U+0000, U+007F]
+   * The maximal subpart is 1-byte
+   */
+  for (ord = 0x0001; ord <= 0x007F; ord++) {
+    encode_ord(ord, 2, src);
+    TEST_UTF8(src, 2, EFBIG);
+  }
+
+  /*
+   * Non-shortest form 3-byte sequence [U+0000, U+07FF]
+   * The maximal subpart is 1-byte
+   */
+  for (ord = 0x0001; ord <= 0x07FF; ord++) {
+    encode_ord(ord, 3, src);
+
+    TEST_UTF8(src, 3, EFBIG);
+    if ((ord % (1 << 6)) == 0)
+      TEST_UTF8(src, 2, EMLINK);
+  }
+
+  /*
+   * Non-shortest form 4-byte sequence [U+0000, U+FFFF]
+   * The maximal subpart is 1-byte
+   */
+  for (ord = 0x0001; ord <= 0xFFFF; ord++) {
+    encode_ord(ord, 4, src);
+
+    TEST_UTF8(src, 4, EFBIG);
+    if ((ord % (1 << 6)) == 0)
+      TEST_UTF8(src, 3, EMLINK);
+    if ((ord % (1 << 12)) == 0)
+      TEST_UTF8(src, 2, EMLINK);
+  }
+}
+
+static void
+test_non_unicode(void) {
+  uint32_t ord;
+  char src[4];
+
+  /*
+   * Code point outside Unicode codespace
+   * The maximal subpart is 1-byte
+   */
+  for (ord = 0x110000; ord <= 0x1FFFFF; ord++) {
+    encode_ord(ord, 4, src);
+
+    TEST_UTF8(src, 4, ERANGE);
+    if ((ord % (1 << 6)) == 0)
+      TEST_UTF8(src, 3, EMLINK);
+    if ((ord % (1 << 12)) == 0)
+      TEST_UTF8(src, 2, EMLINK);
+  }
+}
+
+static void
+test_surrogates(void) {
+  uint32_t ord;
+  char src[4];
+
+  /*
+   * Surrogates [U+D800, U+DFFF]
+   * The maximal subpart is 1-byte
+   */
+  for (ord = 0xD800; ord <= 0xDFFF; ord++) {
+    encode_ord(ord, 3, src);
+
+    TEST_UTF8(src, 3, ERANGE);
+    if ((ord % (1 << 6)) == 0)
+      TEST_UTF8(src, 2, EMLINK);
+  }
+}
+
+static void
+test_continuations(void) {
+  uint8_t ord;
+  char src[4];
+
+  /*
+   * Missplaced continuation [\x80, \xBF]
+   * The maximal subpart is 1-byte
+   */
+  for (ord = 0x80; ord <= 0xBF; ord++) {
+    src[0] = ord;
+    TEST_UTF8(src, 1, EINVAL);
+  }
+}
+
+int
+main(int argc, char **argv)
+{
+  plan_tests(2190906);
+  test_unicode_scalar_value();
+  test_surrogates();
+  test_non_shortest_form();
+  test_non_unicode();
+  test_continuations();
+
+  return exit_status();
+}
diff --git a/ccan/utf8/test/run-encode-decode.c b/ccan/utf8/test/run-encode-decode.c
new file mode 100644 (file)
index 0000000..1dd71fd
--- /dev/null
@@ -0,0 +1,42 @@
+#include <ccan/utf8/utf8.h>
+/* Include the C files directly. */
+#include <ccan/utf8/utf8.c>
+#include <ccan/tap/tap.h>
+#include <assert.h>
+
+static bool utf8_check(const char *src, size_t len)
+{
+       bool decoded = false;
+       struct utf8_state utf8_state = UTF8_STATE_INIT;
+       size_t i;
+
+       for (i = 0; i < len; i++) {
+               decoded = utf8_decode(&utf8_state, src[i]);
+               if (decoded) {
+                       if (errno != 0)
+                               return false;
+               }
+       }
+       if (!decoded)
+               return false;
+       return true;
+}
+
+int main(int argc, char **argv)
+{
+       int i;
+       char dest[UTF8_MAX_LEN];
+       
+       plan_tests(0x10FFFF - (0xDFFF - 0xD7FF + 2));
+
+       for (i = 1; i < 0x10FFFF; i++) {
+               int len;
+               if (i >= 0xD7FF && i <= 0xDFFF)
+                       continue;
+               len = utf8_encode(i, dest);
+               assert(len != 0);
+               ok1(utf8_check(dest, len));
+       }
+
+       return exit_status();
+}
diff --git a/ccan/utf8/test/run-encode.c b/ccan/utf8/test/run-encode.c
new file mode 100644 (file)
index 0000000..375e13f
--- /dev/null
@@ -0,0 +1,30 @@
+#include <ccan/utf8/utf8.h>
+/* Include the C files directly. */
+#include <ccan/utf8/utf8.c>
+#include <ccan/tap/tap.h>
+
+int main(int argc, char **argv)
+{
+       int i;
+       char dest[UTF8_MAX_LEN];
+
+       plan_tests(1 + 0x10FFFF + 1);
+
+       for (i = 0; i < 1; i++)
+               ok1(utf8_encode(i, dest) == 0 && errno == ERANGE);
+       for (; i <= 0x7F; i++)
+               ok1(utf8_encode(i, dest) == 1);
+       for (; i <= 0x7FF; i++)
+               ok1(utf8_encode(i, dest) == 2);
+       for (; i <= 0xD7FF; i++)
+               ok1(utf8_encode(i, dest) == 3);
+       for (; i <= 0xDFFF; i++)
+               ok1(utf8_encode(i, dest) == 0 && errno == ERANGE);
+       for (; i <= 0xFFFF; i++)
+               ok1(utf8_encode(i, dest) == 3);
+       for (; i <= 0x10FFFF; i++)
+               ok1(utf8_encode(i, dest) == 4);
+       ok1(utf8_encode(i, dest) == 0 && errno == ERANGE);
+
+       return exit_status();
+}
diff --git a/ccan/utf8/utf8.c b/ccan/utf8/utf8.c
new file mode 100644 (file)
index 0000000..346d2d9
--- /dev/null
@@ -0,0 +1,176 @@
+/* MIT (BSD) license - see LICENSE file for details */
+#include <ccan/utf8/utf8.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* I loved this table, so I stole it: */
+/*
+ * Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
+ * <https://github.com/chansen/c-utf8-valid>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ *    UTF-8 Encoding Form
+ *
+ *    U+0000..U+007F       0xxxxxxx                            <= 7 bits
+ *    U+0080..U+07FF       110xxxxx 10xxxxxx                   <= 11 bits
+ *    U+0800..U+FFFF       1110xxxx 10xxxxxx 10xxxxxx          <= 16 bits
+ *   U+10000..U+10FFFF     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx <= 21 bits
+ *
+ *
+ *    U+0000..U+007F       00..7F
+ *                      N  C0..C1  80..BF                   1100000x 10xxxxxx
+ *    U+0080..U+07FF       C2..DF  80..BF
+ *                      N  E0      80..9F  80..BF           11100000 100xxxxx
+ *    U+0800..U+0FFF       E0      A0..BF  80..BF
+ *    U+1000..U+CFFF       E1..EC  80..BF  80..BF
+ *    U+D000..U+D7FF       ED      80..9F  80..BF
+ *                      S  ED      A0..BF  80..BF           11101101 101xxxxx
+ *    U+E000..U+FFFF       EE..EF  80..BF  80..BF
+ *                      N  F0      80..8F  80..BF  80..BF   11110000 1000xxxx
+ *   U+10000..U+3FFFF      F0      90..BF  80..BF  80..BF
+ *   U+40000..U+FFFFF      F1..F3  80..BF  80..BF  80..BF
+ *  U+100000..U+10FFFF     F4      80..8F  80..BF  80..BF   11110100 1000xxxx
+ *
+ *  Legend:
+ *    N = Non-shortest form
+ *    S = Surrogates
+ */
+bool utf8_decode(struct utf8_state *utf8_state, char c)
+{
+       if (utf8_state->used_len == utf8_state->total_len) {
+               utf8_state->used_len = 1;
+               /* First character in sequence. */
+               if (((unsigned char)c & 0x80) == 0) {
+                       /* ASCII, easy. */
+                       utf8_state->total_len = 1;
+                       utf8_state->c = c;
+                       goto finished_decoding;
+               } else if (((unsigned char)c & 0xE0) == 0xC0) {
+                       utf8_state->total_len = 2;
+                       utf8_state->c = ((unsigned char)c & 0x1F);
+                       return false;
+               } else if (((unsigned char)c & 0xF0) == 0xE0) {
+                       utf8_state->total_len = 3;
+                       utf8_state->c = ((unsigned char)c & 0x0F);
+                       return false;
+               } else if (((unsigned char)c & 0xF8) == 0xF0) {
+                       utf8_state->total_len = 4;
+                       utf8_state->c = ((unsigned char)c & 0x07);
+                       return false;
+               }
+               goto bad_encoding;
+       }
+
+       if (((unsigned char)c & 0xC0) != 0x80)
+               goto bad_encoding;
+
+       utf8_state->c <<= 6;
+       utf8_state->c |= ((unsigned char)c & 0x3F);
+       
+       utf8_state->used_len++;
+       if (utf8_state->used_len == utf8_state->total_len)
+               goto finished_decoding;
+       return false;
+
+finished_decoding:
+       if (utf8_state->c == 0 || utf8_state->c > 0x10FFFF)
+               errno = ERANGE;
+       /* The UTF-16 "surrogate range": illegal in UTF-8 */
+       else if (utf8_state->total_len == 3
+                && (utf8_state->c & 0xFFFFF800) == 0x0000D800)
+               errno = ERANGE;
+       else {
+               int min_bits;
+               switch (utf8_state->total_len) {
+               case 1:
+                       min_bits = 0;
+                       break;
+               case 2:
+                       min_bits = 7;
+                       break;
+               case 3:
+                       min_bits = 11;
+                       break;
+               case 4:
+                       min_bits = 16;
+                       break;
+               default:
+                       abort();
+               }
+               if ((utf8_state->c >> min_bits) == 0)
+                       errno = EFBIG;
+               else
+                       errno = 0;
+       }
+       return true;
+
+bad_encoding:
+       utf8_state->total_len = utf8_state->used_len;
+       errno = EINVAL;
+       return true;
+}
+
+size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN])
+{
+       if ((point >> 7) == 0) {
+               if (point == 0) {
+                       errno = ERANGE;
+                       return 0;
+               }
+               /* 0xxxxxxx */
+               dest[0] = point;
+               return 1;
+       }
+
+       if ((point >> 11) == 0) {
+               /* 110xxxxx 10xxxxxx */
+               dest[1] = 0x80 | (point & 0x3F);
+               dest[0] = 0xC0 | (point >> 6);
+               return 2;
+       }
+
+       if ((point >> 16) == 0) {
+               if (point >= 0xD800 && point <= 0xDFFF) {
+                       errno = ERANGE;
+                       return 0;
+               }
+               /* 1110xxxx 10xxxxxx 10xxxxxx */
+               dest[2] = 0x80 | (point & 0x3F);
+               dest[1] = 0x80 | ((point >> 6) & 0x3F);
+               dest[0] = 0xE0 | (point >> 12);
+               return 3;
+       }
+
+       if (point > 0x10FFFF) {
+               errno = ERANGE;
+               return 0;
+       }
+
+       /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+       dest[3] = 0x80 | (point & 0x3F);
+       dest[2] = 0x80 | ((point >> 6) & 0x3F);
+       dest[1] = 0x80 | ((point >> 12) & 0x3F);
+       dest[0] = 0xF0 | (point >> 18);
+       return 4;
+}
diff --git a/ccan/utf8/utf8.h b/ccan/utf8/utf8.h
new file mode 100644 (file)
index 0000000..a095f02
--- /dev/null
@@ -0,0 +1,54 @@
+/* MIT (BSD) license - see LICENSE file for details */
+#ifndef CCAN_UTF8_H
+#define CCAN_UTF8_H
+#include <inttypes.h>
+#include <stdbool.h>
+#include <string.h>
+
+/* Unicode is limited to 21 bits. */
+#define UTF8_MAX_LEN   4
+
+struct utf8_state {
+       /* How many characters we are expecting as part of this Unicode point */
+       uint16_t total_len;
+       /* How many characters we've already seen. */
+       uint16_t used_len;
+       /* Compound character, aka Unicode point. */
+       uint32_t c;
+};
+
+#define UTF8_STATE_INIT { 0, 0, 0 }
+
+static inline void utf8_state_init(struct utf8_state *utf8_state)
+{
+       memset(utf8_state, 0, sizeof(*utf8_state));
+}
+
+/**
+ * utf8_decode - continue UTF8 decoding with this character.
+ * @utf8_state - initialized UTF8 state.
+ * @c - the character.
+ *
+ * Returns false if it needs another character to give results.
+ * Otherwise returns true, @utf8_state can be reused without initializeation,
+ * and sets errno:
+ * 0: success
+ * EINVAL: bad encoding.
+ * EFBIG: not a minimal encoding.
+ * ERANGE: encoding of invalid character.
+ *
+ * You can extract the character from @utf8_state->c; @utf8_state->used_len
+ * indicates how many characters have been consumed.
+ */
+bool utf8_decode(struct utf8_state *utf8_state, char c);
+
+/**
+ * utf8_encode - encode a point into UTF8.
+ * @point - Unicode point to include.
+ * @dest - buffer to fill.
+ *
+ * Returns 0 if point was invalid, otherwise bytes of dest used.
+ * Sets errno to ERANGE if point was invalid.
+ */
+size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN]);
+#endif /* CCAN_UTF8_H */