X-Git-Url: http://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Fcharset%2Fcharset.h;h=898bfb5ae89deeffdd0414f420dd017015288acc;hp=74317fce19b76b1467fefe9abddb1b605c117cd3;hb=c438ec17d7b2efe76e56e5fc5ab88bd4a02735e8;hpb=c8c69dc68792e85b14646e8a8219dae923b34feb

diff --git a/ccan/charset/charset.h b/ccan/charset/charset.h
index 74317fce..898bfb5a 100644
--- a/ccan/charset/charset.h
+++ b/ccan/charset/charset.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
+  Copyright (C) 2011 Joseph A. Adams (joeyadams3.14159@gmail.com)
   All rights reserved.
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -26,19 +26,67 @@
 
 #include <stdbool.h>
 #include <stddef.h>
+#include <stdint.h>
+
+#define REPLACEMENT_CHARACTER 0xFFFD
 
 /*
- * Validate the given UTF-8 string.  If it contains '\0' characters,
- * it is still valid.
- *
- * By default, Unicode characters U+D800 thru U+DFFF will be considered
- * invalid UTF-8.  However, if you set utf8_allow_surrogates to true,
- * they will be allowed.  Allowing the surrogate range makes it possible
- * to losslessly encode malformed UTF-16.
+ * Type for Unicode codepoints.
+ * We need our own because wchar_t might be 16 bits.
+ */
+typedef uint32_t uchar_t;
+
+/*
+ * Validate the given UTF-8 string.
+ * If it contains '\0' characters, it is still valid.
  */
 bool utf8_validate(const char *str, size_t length);
 
-/* Default: false */
-extern bool utf8_allow_surrogates;
+/*
+ * Validate a single UTF-8 character.
+ * @s: Beginning of UTF-8 character.
+ * @e: End of string.
+ *
+ * If it's valid, return its length (1 through 4).
+ * If it's invalid or clipped, return 0.
+ */
+int utf8_validate_char(const char *s, const char *e);
+
+/*
+ * Read a single UTF-8 character starting at @s,
+ * returning the length, in bytes, of the character read.
+ *
+ * This function assumes the input is valid UTF-8,
+ * and that enough bytes are available starting at @s.
+ */
+int utf8_read_char(const char *s, uchar_t *out);
+
+/*
+ * Write a single UTF-8 character to @out,
+ * returning the length, in bytes, of the character written.
+ *
+ * @unicode should be U+0000..U+10FFFF, but not U+D800..U+DFFF.
+ * If @unicode is invalid, REPLACEMENT_CHARACTER will be emitted instead.
+ *
+ * This function will write up to 4 bytes to @out.
+ */
+int utf8_write_char(uchar_t unicode, char *out);
+
+/*
+ * Compute the Unicode codepoint of a UTF-16 surrogate pair.
+ *
+ * @uc should be 0xD800..0xDBFF, and @lc should be 0xDC00..0xDFFF.
+ * If they aren't, this function returns REPLACEMENT_CHARACTER.
+ */
+uchar_t from_surrogate_pair(unsigned int uc, unsigned int lc);
+
+/*
+ * Construct a UTF-16 surrogate pair given a Unicode codepoint.
+ *
+ * @unicode should be U+10000..U+10FFFF.
+ * If it's not, this function returns false,
+ * and sets *uc and *lc to REPLACEMENT_CHARACTER.
+ */
+bool to_surrogate_pair(uchar_t unicode, unsigned int *uc, unsigned int *lc);
 
 #endif