From 703565490945abb062b20e043863c3ca0366f394 Mon Sep 17 00:00:00 2001 From: Joey Adams Date: Sat, 18 Jul 2009 03:52:53 -0400 Subject: [PATCH 1/1] Improved stringmap to support strings with null characters --- ccan/stringmap/_info | 3 +- ccan/stringmap/stringmap.c | 305 ++++++++++++++++++++++++++++++------- ccan/stringmap/stringmap.h | 19 ++- ccan/stringmap/test/run.c | 84 +++++++--- 4 files changed, 327 insertions(+), 84 deletions(-) diff --git a/ccan/stringmap/_info b/ccan/stringmap/_info index 38f30678..5ad40cac 100644 --- a/ccan/stringmap/_info +++ b/ccan/stringmap/_info @@ -5,7 +5,8 @@ /** * stringmap - Macros for mapping strings to things * - * stringmap provides a generic string map via macros. + * stringmap provides a generic string map via macros. It also supports byte + * strings with null characters. * * Features which are sorely lacking in this version of stringmap are deletion and traversal. * diff --git a/ccan/stringmap/stringmap.c b/ccan/stringmap/stringmap.c index 059dbaaa..7380e4e1 100644 --- a/ccan/stringmap/stringmap.c +++ b/ccan/stringmap/stringmap.c @@ -31,100 +31,156 @@ #include "stringmap.h" +//#define CONSISTENCY_CHECK + #if 0 #include #else #define assert(...) do {} while(0) #endif -#define BITNO(x) ((x) & ~(LEFT_IS_LEAF|RIGHT_IS_LEAF)) -#define LEFT_IS_LEAF 0x80000000 -#define RIGHT_IS_LEAF 0x40000000 -#define IS_LEFT_LEAF(x) (((x) & LEFT_IS_LEAF) != 0) -#define IS_RIGHT_LEAF(x) (((x) & RIGHT_IS_LEAF) != 0) -#define P_BIT(key, bit) (key[bit >> 3] >> (bit & 7)) & 1 -#define CHECKBITS 8 +#define PEEK_BIT(key, bit) ((key[bit >> 3] >> (bit & 7)) & 1) + +struct stringmap_node { + uint32_t left_is_leaf:1, right_is_leaf:1, bitno:30; + struct stringmap_node *lr[2]; +}; struct T { char *str; + size_t len; }; -static void *T_new(struct block_pool *bp, const char *key, size_t T_size) { +static inline struct T *leaf(struct stringmap_node *n, int lr) { + assert(lr ? n->right_is_leaf : n->left_is_leaf); + return (struct T*)n->lr[lr]; +} + +/* Normal nodes diverge because there was a 0 or 1 difference. If left_ends(n), + then the node diverges because one string ends and the rest don't. */ +static inline int left_ends(struct stringmap_node *n) { + return (n->left_is_leaf && (leaf(n,0)->len << 3)==n->bitno); +} + +static void *T_new(struct block_pool *bp, const char *key, size_t len, size_t T_size) { struct T *leaf = block_pool_alloc(bp, T_size); memset(leaf, 0, T_size); - leaf->str = block_pool_strdup(bp, key); + + leaf->str = block_pool_alloc_align(bp, len+1, 1); + memcpy(leaf->str, key, len); + leaf->str[len] = 0; + leaf->len = len; + return leaf; } -void *stringmap_lookup_real(struct stringmap *t, const char *key, int enterf, const size_t T_size) { +//used for diagnostics +static int consistency_check(struct stringmap *t); +static void emit_dot(struct stringmap *t); +static void emit_subtree(struct stringmap_node *n, int is_leaf); + +void *stringmap_lookup_real(struct stringmap *t, const char *key, size_t len, int enterf, size_t T_size) { struct T *sp; struct stringmap_node *w, *new, *last; - int len, cix, bit, fbit, svbit, ix, bitno; - const char *k, *m, *sm; + uint32_t cix, bit, svbit, ix, bitno, end_bit; + const char *k, *m; + + (void) consistency_check; + (void) emit_dot; + #ifdef STRINGMAP_EMIT_DOT + emit_dot(t); + #endif + #ifdef CONSISTENCY_CHECK + consistency_check(t); + #endif + + /* If key length wasn't supplied, calculate it. */ + if (len == (size_t)-1) + len = strlen(key); + end_bit = len << 3; + /* If tree is empty, create the first node. */ if (!t->root) { if (!enterf) return NULL; t->bp = block_pool_new(t->bp); - t->root = T_new(t->bp, key, T_size); + t->root = T_new(t->bp, key, len, T_size); t->count = 1; return t->root; } - - /* Count full string length */ - for (k = key, len = 0; *k; k++, len++) - ; + /* Follow the tree down to what might be the target key. */ if (t->count == 1) { w = t->root; svbit = 0; } else { w = t->root; - bitno = len * CHECKBITS; for (;;) { - bit = BITNO(w->bitno); - fbit = bit > bitno ? 0 : P_BIT(key, bit); - svbit = fbit ? IS_RIGHT_LEAF(w->bitno) : - IS_LEFT_LEAF(w->bitno); - w = w->lr[fbit]; + if (!left_ends(w)) //0 or 1 + bit = w->bitno < end_bit ? PEEK_BIT(key, w->bitno) : 0; + else //ends or doesn't end + bit = (w->bitno != end_bit); + svbit = bit ? w->right_is_leaf : w->left_is_leaf; + w = w->lr[bit]; if (svbit) break; } } - + + /* See if the strings match. If not, set cix to the first bit offset + where there's a difference, and bit to the side on which to put + this leaf. */ sp = (struct T *)w; - - sm = m = sp->str; + m = sp->str; k = key; - - /* Check for correct string and return */ - for (cix = 0; *m && *k && *m == *k; m++, k++, cix += CHECKBITS) - ; - if (*m == 0 && *k == 0) { - //if (!enterf && sp->value == NULL) - // return NULL; - return sp; + for (cix = 0; ; m++, k++, cix++) { + if (cix>=sp->len || cix>=len) { //we reached the end of one or both strings + if (cix==sp->len && cix==len) { //strings match + //if (!enterf && sp->value == NULL) + // return NULL; + return sp; + } + cix <<= 3; + + //put the shorter key to the left + bit = len > sp->len; + + break; + } + if (*m != *k) { //the strings have a differing character + cix <<= 3; + + //advance cix to the first differing bit + ix = *m ^ *k; + while ((ix & 1) == 0) + ix >>= 1, cix++; + + //choose left/right based on the differing bit + bit = PEEK_BIT(key, cix); + + break; + } } - + if (!enterf) return NULL; /* no string found and do not enter */ - ix = *m ^ *k; - while ((ix & 1) == 0) - ix >>= 1, cix++; - /* Create new node */ new = block_pool_alloc(t->bp, sizeof *new); - bit = P_BIT(key, cix); - new->bitno = cix | (bit ? RIGHT_IS_LEAF : LEFT_IS_LEAF); - new->lr[bit] = T_new(t->bp, key, T_size); + + new->right_is_leaf = bit; + new->left_is_leaf = !bit; + new->bitno = cix; + + new->lr[bit] = T_new(t->bp, key, len, T_size); if (t->count++ == 1) { new->lr[!bit] = t->root; - new->bitno |= (bit ? LEFT_IS_LEAF : RIGHT_IS_LEAF); + new->right_is_leaf = 1; + new->left_is_leaf = 1; t->root = new; return (struct T *)new->lr[bit]; } @@ -132,16 +188,31 @@ void *stringmap_lookup_real(struct stringmap *t, const char *key, int enterf, co w = t->root; last = NULL; for (;;) { - fbit = w->bitno; - bitno = BITNO(w->bitno); - assert(bitno != cix); + bitno = w->bitno; if (bitno > cix) break; - svbit = P_BIT(key, bitno); + + if (!left_ends(w)) { //0 or 1 + if (bitno == cix) + break; + svbit = PEEK_BIT(key, bitno); + + } else { //ends or doesn't end + //because left is an end, we cannot split it, so we must turn right + svbit = 1; + } + last = w; w = w->lr[svbit]; - if (fbit & (svbit ? RIGHT_IS_LEAF : LEFT_IS_LEAF)) + if (svbit ? last->right_is_leaf : last->left_is_leaf) { + //w is a leaf, so mark it accordingly in its parent structure + if (!bit) + new->right_is_leaf = 1; + else + new->left_is_leaf = 1; + break; + } } new->lr[!bit] = w; @@ -149,9 +220,141 @@ void *stringmap_lookup_real(struct stringmap *t, const char *key, int enterf, co t->root = new; } else { last->lr[svbit] = new; - last->bitno &= ~(svbit ? RIGHT_IS_LEAF : LEFT_IS_LEAF); + if (svbit) + last->right_is_leaf = 0; + else + last->left_is_leaf = 0; } - if (bitno < cix) - new->bitno |= (bit ? LEFT_IS_LEAF : RIGHT_IS_LEAF); + return (struct T *)new->lr[bit]; } + +static int consistency_check_subtree(struct stringmap_node *n) { + uint32_t bitno = n->bitno; + int success = 1; + + //make sure bitnos ascend (must ascend unless left ends) + if (!n->left_is_leaf && bitno >= n->lr[0]->bitno) { + printf("Left leaf has bitno >= than parent\n"); + success = 0; + } + if (!n->right_is_leaf && bitno >= n->lr[1]->bitno) { + if (left_ends(n) && bitno == n->lr[1]->bitno) { + //fine, there's a shelf here + } else { + printf("Right leaf has bitno >= than parent\n"); + success = 0; + } + } + + //make sure eponymous bits are set properly + if (n->left_is_leaf) { + struct T *lf = leaf(n, 0); + size_t len = lf->len << 3; + if (len == n->bitno) { + //this is a shelf + } else if (len <= n->bitno) { + printf("Left leaf is too short\n"); + success = 0; + } else if (PEEK_BIT(lf->str, n->bitno) == 1) { + printf("Left leaf has incorrect bit\n"); + success = 0; + } + } + if (n->right_is_leaf) { + struct T *lf = leaf(n, 1); + size_t len = lf->len << 3; + if (len <= n->bitno) { + printf("Right leaf is too short\n"); + success = 0; + } else if (PEEK_BIT(lf->str, n->bitno) == 0 && !left_ends(n)) { + printf("Right leaf has incorrect bit\n"); + success = 0; + } + } + + if (!success) { + //emit_subtree(n, 0); + abort(); + } + + //recursively check + return (!n->left_is_leaf ? consistency_check_subtree(n->lr[0]) : 1) && + (!n->right_is_leaf ? consistency_check_subtree(n->lr[1]) : 1); +} + +static int consistency_check(struct stringmap *t) { + if (t->count < 2) + return 1; + return consistency_check_subtree(t->root); +} + +//The following can be used to create Graphviz "dot" files to visualize the tree + +static void leaf_to_dot(void *lp, FILE *f) { + struct T *leaf = lp; + size_t bit_count = leaf->len << 3; + size_t i; + + fputs("\"", f); + #if 1 + for (i=0; istr, i) ? '1' : '0', f); + if (((i+1) & 7) == 0) + fputs("\\n", f); //add newlines between bytes + } + putc(' ', f); + #endif + fprintf(f, "(%s)\"\n", leaf->str); +} + +static void node_to_dot(struct stringmap_node *n, FILE *f, size_t level) { + //don't draw ridiculously huge trees + if (level > 4) + return; + + fprintf(f, "%zu [label=\"[%zu] %u\"]\n", (size_t)n, level, n->bitno); + + if (n->left_is_leaf) { + fprintf(f, "%zu -> ", (size_t)n); + leaf_to_dot(n->lr[0], f); + } else { + fprintf(f, "%zu -> %zu \n", (size_t)n, (size_t)n->lr[0]); + node_to_dot(n->lr[0], f, level+1); + } + + if (n->right_is_leaf) { + fprintf(f, "%zu -> ", (size_t)n); + leaf_to_dot(n->lr[1], f); + } else { + fprintf(f, "%zu -> %zu \n", (size_t)n, (size_t)n->lr[1]); + node_to_dot(n->lr[1], f, level+1); + } +} + +static void stringmap_subtree_to_dot(struct stringmap_node *n, int is_leaf, const char *filename_out) { + FILE *f = fopen(filename_out, "w"); + + fputs("digraph G {\n", f); + + if (is_leaf) + leaf_to_dot(n, f); + else + node_to_dot(n, f, 0); + + fputs("}\n", f); + fclose(f); +} + +static size_t dot_file_number = 0; + +static void emit_subtree(struct stringmap_node *n, int is_leaf) { + char buf[64]; + sprintf(buf, "dot/%04zu.dot", dot_file_number++); + stringmap_subtree_to_dot(n, is_leaf, buf); +} + +static void emit_dot(struct stringmap *t) { + if (t->count) + emit_subtree(t->root, t->count==1); +} diff --git a/ccan/stringmap/stringmap.h b/ccan/stringmap/stringmap.h index df9fe636..f50e12d7 100644 --- a/ccan/stringmap/stringmap.h +++ b/ccan/stringmap/stringmap.h @@ -32,7 +32,7 @@ #include #include -#define stringmap(theType) struct {struct stringmap t; struct {char *str; theType value;} *last;} +#define stringmap(theType) struct {struct stringmap t; struct {char *str; size_t len; theType value;} *last;} //the 'last' pointer here is used as a hacky typeof() alternative #define stringmap_new(ctx) {{0,0,(struct block_pool*)(ctx)},0} @@ -50,14 +50,19 @@ #define stringmap_lookup(sm, key) stringmap_le(sm, key, 0) #define stringmap_enter(sm, key) stringmap_le(sm, key, 1) +/* Variants of lookup and enter that let you specify a length. Note that byte + strings may have null characters in them, and it won't affect the + algorithm. Many lives were lost to make this possible. */ +#define stringmap_lookup_n(sm, key, len) stringmap_le_n(sm, key, len, 0) +#define stringmap_enter_n(sm, key, len) stringmap_le_n(sm, key, len, 1) + +#define stringmap_le(sm, key, enterf) stringmap_le_n(sm, key, (size_t)-1, enterf) + //this macro sets sm.last so it can exploit its type -#define stringmap_le(sm, key, enterf) ((((sm).last) = stringmap_lookup_real(&(sm).t, key, enterf, sizeof(*(sm).last))) ? &(sm).last->value : NULL) +#define stringmap_le_n(sm, key, len, enterf) ((((sm).last) = stringmap_lookup_real(&(sm).t, key, len, enterf, sizeof(*(sm).last))) ? &(sm).last->value : NULL) -struct stringmap_node { - uint32_t bitno; - struct stringmap_node *lr[2]; -}; +struct stringmap_node; struct stringmap { struct stringmap_node *root; @@ -66,6 +71,6 @@ struct stringmap { //hack: 'bp' holds talloc ctx when 'root' is NULL }; -void *stringmap_lookup_real(struct stringmap *t, const char *key, int enterf, size_t T_size); +void *stringmap_lookup_real(struct stringmap *t, const char *key, size_t len, int enterf, size_t T_size); #endif diff --git a/ccan/stringmap/test/run.c b/ccan/stringmap/test/run.c index 68c5538a..a8099ec8 100644 --- a/ccan/stringmap/test/run.c +++ b/ccan/stringmap/test/run.c @@ -6,6 +6,12 @@ static void test_trivial(void) { stringmap(int) map = stringmap_new(NULL); + ok1(stringmap_lookup(map, "") == NULL); + *stringmap_enter(map, "") = -1; + + ok1(stringmap_lookup(map, "0") == NULL); + *stringmap_enter(map, "0") = 0; + ok1(stringmap_lookup(map, "one") == NULL); *stringmap_enter(map, "one") = 1; @@ -22,8 +28,10 @@ static void test_trivial(void) { ok1(*stringmap_lookup(map, "one") == 1); ok1(*stringmap_lookup(map, "four") == 4); ok1(*stringmap_lookup(map, "two") == 2); + ok1(*stringmap_lookup(map, "") == -1); + ok1(*stringmap_lookup(map, "0") == 0); - ok1(map.t.count == 4); + ok1(map.t.count == 6); stringmap_free(map); } @@ -45,35 +53,52 @@ static void scramble(void *base, size_t nmemb, size_t size) { //#define RANDOM_STRING_READABLE -static char *random_string(struct block_pool *bp) { - size_t len = random() % 100; - char *str = block_pool_alloc(bp, len+1); +static char *random_string(struct block_pool *bp, size_t *len_out) { + #ifndef RANDOM_STRING_READABLE + size_t len = random()%5 ? random()%10 : random()%1000; + #else + size_t len = random() % 10; + #endif + char *str = block_pool_alloc(bp, len); char *i; + *len_out = len; + for (i=str; len--; i++) { #ifndef RANDOM_STRING_READABLE char c = random(); - *i = c ? c : ' '; + *i = c; #else - //only generate characters [32,126] - char c = random()%95 + 32; + //only generate characters a-z + char c = random()%26 + 'a'; *i = c; #endif } - *i = 0; return str; } struct test_entry { + //note: struct layout needs to match *stringmap(char*).last const char *str; + size_t len; + char *value; /* value is not a string, but a pointer to char marking that this key has been entered already. */ }; -static int by_str(const void *ap, const void *bp) { - return strcmp(((struct test_entry*)ap)->str, ((struct test_entry*)bp)->str); +static int tecmp(const struct test_entry *a, const struct test_entry *b) { + if (a->len < b->len) + return -1; + else if (a->len > b->len) + return 1; + else + return memcmp(a->str, b->str, a->len); +} + +static int by_str(const void *a, const void *b) { + return tecmp(a, b); } static void cull_duplicates(struct test_entry *entries, size_t *count) { @@ -83,10 +108,10 @@ static void cull_duplicates(struct test_entry *entries, size_t *count) { for (i=entries, o=entries; ientries) { - const char *last = i[-1].str; - if (!strcmp(last, i->str)) { - do i++; while(istr)); + if (o>entries) { + struct test_entry *last = &o[-1]; + if (!tecmp(last, i)) { + do i++; while(istr = str; + i->len = len; i->value = value; } } @@ -147,24 +175,28 @@ static int test_stringmap(size_t count, FILE *out) { debug("Looking up %s", i->str); - node = stringmap_lookup(map, i->str); + node = stringmap_lookup_n(map, i->str, i->len); if (!node) { if (*i->value) err("Previously inserted entry not found"); - debug("Not found; entering"); + debug("Not found; entering %s", i->str); - node = stringmap_enter(map, i->str); - if (!node || strcmp(i->str, map.last->str)) + node = stringmap_enter_n(map, i->str, i->len); + if (!node || tecmp(i, (void*)map.last)) err("Node not properly entered"); + if (map.last->str[map.last->len]) + err("Entered string not zero-terminated"); *node = i->value; *i->value = 1; //mark that the entry is entered unique_count++; } else { - if (strcmp(i->str, map.last->str)) + if (tecmp(i, (void*)map.last)) err("lookup returned incorrect string"); + if (map.last->str[map.last->len]) + err("Looked-up string not zero-terminated"); if (i->value != *node) err("lookup returned incorrect value"); if (!*i->value) @@ -175,14 +207,16 @@ static int test_stringmap(size_t count, FILE *out) { if (map.t.count != unique_count) err("Map has incorrect count"); - printf("stringmap test passed after %zu inserts, %zu lookups (%zu total operations)\n", unique_count, (i-entries)-unique_count, i-entries); + printf("stringmap test passed after %zu inserts, %zu lookups (%zu total operations)\n", + unique_count, (i-entries)-unique_count, i-entries); block_pool_free(bp); stringmap_free(map); return 1; fail: - printf("stringmap test failed after %zu inserts, %zu lookups (%zu total operations)\n", unique_count, (i-entries)-unique_count, i-entries); + printf("stringmap test failed after %zu inserts, %zu lookups (%zu total operations)\n", + unique_count, (i-entries)-unique_count, i-entries); block_pool_free(bp); stringmap_free(map); @@ -196,10 +230,10 @@ fail: int main(void) { - plan_tests(10); + plan_tests(14); test_trivial(); - ok1(test_stringmap(10000, NULL)); + ok1(test_stringmap(10000, stdout)); return exit_status(); } -- 2.39.2