git.ozlabs.org Git - ccan/blob - ccan/utf8/utf8.h

   1 /* MIT (BSD) license - see LICENSE file for details */
   2 #ifndef CCAN_UTF8_H
   3 #define CCAN_UTF8_H
   4 #include <inttypes.h>
   5 #include <stdbool.h>
   6 #include <string.h>
   7
/*
 * Unicode is limited to 21 bits, so the longest UTF-8 encoding of a single
 * code point is four bytes.  This is the size of the buffer that
 * utf8_encode() fills.
 */
#define UTF8_MAX_LEN    4
  10
/* Incremental decoder state, fed one char at a time to utf8_decode(). */
struct utf8_state {
        /* How many chars (bytes) we are expecting as part of this Unicode point */
        uint16_t total_len;
        /* How many chars (bytes) of it we've already seen. */
        uint16_t used_len;
        /* Compound character, aka Unicode point. */
        uint32_t c;
};
  19
  20 #define UTF8_STATE_INIT { 0, 0, 0 }
  21
  22 static inline void utf8_state_init(struct utf8_state *utf8_state)
  23 {
  24         memset(utf8_state, 0, sizeof(*utf8_state));
  25 }
  26
/**
 * utf8_decode - continue UTF8 decoding with this character.
 * @utf8_state - initialized UTF8 state (see utf8_state_init() and
 *               UTF8_STATE_INIT).
 * @c - the next character (byte) of input.
 *
 * Call this once for each input character, in order.
 *
 * Returns false if it needs another character to give results.
 * Otherwise returns true, @utf8_state can be reused without initialization,
 * and sets errno:
 * 0: success
 * EINVAL: bad encoding.
 * EFBIG: not a minimal encoding.
 * ERANGE: encoding of invalid character.
 *
 * You can extract the character from @utf8_state->c; @utf8_state->used_len
 * indicates how many characters have been consumed.
 *
 * errno is only documented as set when this returns true, so inspect it
 * after a true return.  This header does not include <errno.h>; callers
 * that check errno must include it themselves.
 */
bool utf8_decode(struct utf8_state *utf8_state, char c);
  44
/**
 * utf8_encode - encode a point into UTF8.
 * @point - Unicode point to encode.
 * @dest - buffer to fill; declared as UTF8_MAX_LEN chars, the longest
 *         possible encoding.
 *
 * Returns 0 if point was invalid, otherwise the number of bytes of @dest
 * used.  Sets errno to ERANGE if point was invalid.
 *
 * NOTE(review): presumably @dest is not NUL-terminated, since the longest
 * encoding would fill all UTF8_MAX_LEN bytes - confirm against the
 * implementation.
 */
size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN]);
  54 #endif /* CCAN_UTF8_H */