lib/fold: Add support for multibyte strings
author Jeremy Kerr <jk@ozlabs.org>
Tue, 23 Sep 2014 06:46:06 +0000 (14:46 +0800)
committer Samuel Mendoza-Jonas <sam.mj@au1.ibm.com>
Tue, 23 Sep 2014 06:47:58 +0000 (16:47 +1000)
Currently, the fold_text function doesn't understand multibyte strings,
so it may break a line in the middle of a multibyte sequence.

This change adds multibyte-awareness to the fold code, and uses proper
width calculations for the contents of the folded string.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
lib/fold/fold.c
test/lib/test-fold.c

index ec10c8c240388af62981d582eaac401d19ae28d7..8bf133c8b34a9e0e37762eda8122dce4ac210141 100644 (file)
@@ -1,4 +1,12 @@
 
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+#include <wchar.h>
+#include <wctype.h>
+
 #include "fold/fold.h"
 
 void fold_text(const char *text,
@@ -7,38 +15,78 @@ void fold_text(const char *text,
                void *arg)
 {
        const char *start, *end, *sep;
-       int rc = 0;
+       size_t sep_bytes, len;
+       int col, rc = 0;
+       mbstate_t ps;
 
+       /* start, end and sep are byte-positions in the string, and should always
+        * lie on the start of a multibyte sequence */
        start = end = sep = text;
+       sep_bytes = 0;
+       col = 0;
+       len = strlen(text);
+       memset(&ps, 0, sizeof(ps));
 
        while (!rc) {
+               size_t bytes;
+               wchar_t wc;
+               int width;
+
+               bytes = mbrtowc(&wc, end, len - (end - text), &ps);
+
+               assert(bytes >= 0);
+
+               /* we'll get a zero size for the nul terminator */
+               if (!bytes) {
+                       line_cb(arg, start, end - start);
+                       break;
+               }
 
-               if (*end == '\n') {
+               if (wc == L'\n') {
                        rc = line_cb(arg, start, end - start);
-                       start = sep = ++end;
+                       start = sep = end += bytes;
+                       sep_bytes = 0;
+                       col = 0;
+                       continue;
+               }
+
+               width = wcwidth(wc);
 
-               } else if (*end == '\0') {
+               /* we should have caught this in the !bytes check... */
+               if (width == 0) {
                        line_cb(arg, start, end - start);
-                       rc = 1;
+                       break;
+               }
 
-               } else if (end - start >= linelen - 1) {
+               /* unprintable character? just add it to the current line */
+               if (width < 0) {
+                       end += bytes;
+                       continue;
+               }
+
+               col += width;
+
+               if (col > linelen) {
                        if (sep != start) {
                                /* split on a previous word boundary, if
                                 * possible */
                                rc = line_cb(arg, start, sep - start);
-                               start = end = ++sep;
+                               end = sep + sep_bytes;
                        } else {
                                /* otherwise, break the word */
-                               end++;
                                rc = line_cb(arg, start, end - start);
-                               start = sep = end;
                        }
+                       sep_bytes = 0;
+                       start = sep = end;
+                       col = 0;
 
                } else {
-                       end++;
                        /* record our last separator */
-                       if (*end == ' ')
+                       if (wc == L' ') {
                                sep = end;
+                               sep_bytes = bytes;
+                       }
+                       end += bytes;
                }
        }
 }
index 1f58fdf98ba17a6c95dfd4e352f6e84d834ec1e2..474892d4b6fd08c229f74c444fe50c3071882dce 100644 (file)
@@ -1,7 +1,12 @@
 
+#define _GNU_SOURCE
+
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
+#include <locale.h>
+#include <wchar.h>
+#include <langinfo.h>
 
 #include <fold/fold.h>
 #include <list/list.h>
@@ -72,8 +77,19 @@ struct test test_break = {
        },
 };
 
+struct test test_mbs = {
+       .in = "從主功能表畫面中,選取啟動選項。",
+       .linelen = 15,
+       .out = {
+               "從主功能表畫面",
+               "中,選取啟動選",
+               "項。",
+               NULL,
+       },
+};
+
 static struct test *tests[] = {
-       &test_split, &test_fold_line, &test_break,
+       &test_split, &test_fold_line, &test_break, &test_mbs,
 };
 
 static void __attribute__((noreturn)) fail(struct ctx *ctx,
@@ -83,7 +99,7 @@ static void __attribute__((noreturn)) fail(struct ctx *ctx,
        int i;
 
        fprintf(stderr, "%s\n", msg);
-       fprintf(stderr, "input:\n%s\n", test->in);
+       fprintf(stderr, "input, split at %d:\n%s\n", test->linelen, test->in);
 
        fprintf(stderr, "expected:\n");
        for (i = 0; test->out[i]; i++)
@@ -116,19 +132,39 @@ static void run_test(struct test *test)
 {
        struct line *line;
        struct ctx *ctx;
-       int i;
+       wchar_t *wcs;
+       int i, n;
 
        ctx = talloc(NULL, struct ctx);
+       n = strlen(test->in) + 1;
        list_init(&ctx->lines);
        fold_text(test->in, test->linelen, fold_line_cb, ctx);
 
+
        i = 0;
        list_for_each_entry(&ctx->lines, line, list) {
+               size_t wcslen;
+               char *buf;
+               int width;
+
                if (!test->out[i])
                        fail(ctx, test,
                                "fold_text returned more lines than expected");
 
-               if (line->len > test->linelen)
+               buf = talloc_strndup(ctx, line->buf, line->len);
+               wcslen = mbstowcs(NULL, buf, 0);
+
+               if (wcslen == (size_t)-1)
+                       fail(ctx, test, "invalid mutlibyte sequence");
+
+               wcs = talloc_array(ctx, wchar_t, wcslen + 1);
+               wcslen = mbstowcs(wcs, buf, n);
+
+               width = wcswidth(wcs, wcslen);
+               if (width == -1)
+                       fail(ctx, test, "nonprintable characters present");
+
+               if (width > (signed int)test->linelen)
                        fail(ctx, test, "line too long");
 
                if (line->len != strlen(test->out[i]))
@@ -149,6 +185,16 @@ static void run_test(struct test *test)
 int main(void)
 {
        unsigned int i;
+       char *charset;
+
+       setlocale(LC_CTYPE, "");
+
+       charset = nl_langinfo(CODESET);
+       if (strcmp(charset, "UTF-8")) {
+               fprintf(stderr, "Current charset is %s, tests require UTF-8\n",
+                               charset);
+               return EXIT_FAILURE;
+       }
 
        for (i = 0; i < ARRAY_SIZE(tests); i++)
                run_test(tests[i]);