X-Git-Url: http://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Fcharset%2Fcharset.h;h=898bfb5ae89deeffdd0414f420dd017015288acc;hp=74317fce19b76b1467fefe9abddb1b605c117cd3;hb=c438ec17d7b2efe76e56e5fc5ab88bd4a02735e8;hpb=c8c69dc68792e85b14646e8a8219dae923b34feb

diff --git a/ccan/charset/charset.h b/ccan/charset/charset.h
index 74317fce..898bfb5a 100644
--- a/ccan/charset/charset.h
+++ b/ccan/charset/charset.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
+  Copyright (C) 2011 Joseph A. Adams (joeyadams3.14159@gmail.com)
   All rights reserved.
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -26,19 +26,67 @@
 
 #include <stdbool.h>
 #include <stddef.h>
+#include <stdint.h>
+
+#define REPLACEMENT_CHARACTER 0xFFFD
 
 /*
- * Validate the given UTF-8 string. If it contains '\0' characters,
- * it is still valid.
- *
- * By default, Unicode characters U+D800 thru U+DFFF will be considered
- * invalid UTF-8. However, if you set utf8_allow_surrogates to true,
- * they will be allowed. Allowing the surrogate range makes it possible
- * to losslessly encode malformed UTF-16.
+ * Type for Unicode codepoints.
+ * We need our own because wchar_t might be 16 bits.
+ */
+typedef uint32_t uchar_t;
+
+/*
+ * Validate the given UTF-8 string.
+ * If it contains '\0' characters, it is still valid.
  */
 bool utf8_validate(const char *str, size_t length);
 
-/* Default: false */
-extern bool utf8_allow_surrogates;
+/*
+ * Validate a single UTF-8 character.
+ * @s: Beginning of UTF-8 character.
+ * @e: End of string.
+ *
+ * If it's valid, return its length (1 thru 4).
+ * If it's invalid or clipped, return 0.
+ */
+int utf8_validate_char(const char *s, const char *e);
+
+/*
+ * Read a single UTF-8 character starting at @s,
+ * returning the length, in bytes, of the character read.
+ *
+ * This function assumes input is valid UTF-8,
+ * and that there are enough characters in front of @s.
+ */
+int utf8_read_char(const char *s, uchar_t *out);
+
+/*
+ * Write a single UTF-8 character to @out,
+ * returning the length, in bytes, of the character written.
+ *
+ * @unicode should be U+0000..U+10FFFF, but not U+D800..U+DFFF.
+ * If @unicode is invalid, REPLACEMENT_CHARACTER will be emitted instead.
+ *
+ * This function will write up to 4 bytes to @out.
+ */
+int utf8_write_char(uchar_t unicode, char *out);
+
+/*
+ * Compute the Unicode codepoint of a UTF-16 surrogate pair.
+ *
+ * @uc should be 0xD800..0xDBFF, and @lc should be 0xDC00..0xDFFF.
+ * If they aren't, this function returns REPLACEMENT_CHARACTER.
+ */
+uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc);
+
+/*
+ * Construct a UTF-16 surrogate pair given a Unicode codepoint.
+ *
+ * @unicode should be U+10000..U+10FFFF.
+ * If it's not, this function returns false,
+ * and sets *uc and *lc to REPLACEMENT_CHARACTER.
+ */
+bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc);
 
 #endif