From ab83de953730f5e5e571dbf69ffb3cc685a102dc Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 26 Sep 2011 20:31:17 +0930 Subject: [PATCH] strset: new module using critbit trees. Not as fast as using htable, but simple and provides order and prefix ops. --- ccan/strset/_info | 71 ++++++++ ccan/strset/strset.c | 298 ++++++++++++++++++++++++++++++++++ ccan/strset/strset.h | 164 +++++++++++++++++++ ccan/strset/test/run-hibit.c | 82 ++++++++++ ccan/strset/test/run-order.c | 81 +++++++++ ccan/strset/test/run-prefix.c | 87 ++++++++++ ccan/strset/test/run.c | 56 +++++++ 7 files changed, 839 insertions(+) create mode 100644 ccan/strset/_info create mode 100644 ccan/strset/strset.c create mode 100644 ccan/strset/strset.h create mode 100644 ccan/strset/test/run-hibit.c create mode 100644 ccan/strset/test/run-order.c create mode 100644 ccan/strset/test/run-prefix.c create mode 100644 ccan/strset/test/run.c diff --git a/ccan/strset/_info b/ccan/strset/_info new file mode 100644 index 00000000..81e8f53c --- /dev/null +++ b/ccan/strset/_info @@ -0,0 +1,71 @@ +#include +#include "config.h" + +/** + * strset - an ordered set of strings + * + * This code implements an ordered set of string as a critbit tree. See: + * + * http://cr.yp.to/critbit.html + * http://github.com/agl/critbit (which this code is based on) + * + * Note that ccan/htable is faster and uses less memory, but doesn't provide + * ordered or prefix operations. + * + * Example: + * // Print all words in order. + * #include + * #include + * #include + * #include + * + * static bool dump(const char *member, void *unused) + * { + * printf("%s ", member); + * return false; + * } + * + * int main(void) + * { + * struct strset words; + * char *file, *word; + * + * strset_init(&words); + * file = grab_fd(NULL, 0, NULL); + * if (!file) + * err(1, "Reading stdin"); + * + * for (word = strtok(file, " \t\r\n"); + * word; + * word = strtok(NULL, " \t\r\n")) { + * strset_set(&words, word); + * } + * strset_iterate(&words, dump, NULL); + * printf("\n"); + * return 0; + * } + * // Given "foo bar" outputs "bar foo " + * // Given "foo foo bar" outputs "bar foo " + * + * License: Public domain (but some dependencies are LGPL!) + * Author: Rusty Russell + * Ccanlint: + * license_depends_compat FAIL + */ +int main(int argc, char *argv[]) +{ + /* Expect exactly one argument */ + if (argc != 2) + return 1; + + if (strcmp(argv[1], "depends") == 0) { + printf("ccan/ilog\n" + "ccan/likely\n" + "ccan/short_types\n" + "ccan/str\n" + "ccan/typesafe_cb\n"); + return 0; + } + + return 1; +} diff --git a/ccan/strset/strset.c b/ccan/strset/strset.c new file mode 100644 index 00000000..8088db47 --- /dev/null +++ b/ccan/strset/strset.c @@ -0,0 +1,298 @@ +/* This code is based on the public domain code at + * http://github.com/agl/critbit writtem by Adam Langley + * . + * + * Here are the main implementation differences: + * (1) We don't strdup the string on insert; we use the pointer we're given. + * (2) We use a straight bit number rather than a mask; it's simpler. + * (3) We don't use the bottom bit of the pointer, but instead use a leading + * zero to distinguish nodes from strings. + * (4) The empty string (which would look like a node) is handled + * using a special "empty node". + * (5) Delete returns the string, so you can free it if you want to. + * (6) Unions instead of void *, bool instead of int. + */ +#include +#include +#include +#include +#include +#include +#include + +struct node { + /* To differentiate us from strings. */ + char nul_byte; + /* The bit where these children differ. */ + u8 bit_num; + /* The byte number where first bit differs (-1 == empty string node). */ + size_t byte_num; + /* These point to strings or nodes. */ + struct strset child[2]; +}; + +/* Closest member to this in a non-empty set. */ +static const char *closest(struct strset n, const char *member) +{ + size_t len = strlen(member); + const u8 *bytes = (const u8 *)member; + + /* Anything with first byte 0 is a node. */ + while (!n.u.s[0]) { + u8 direction = 0; + + /* Special node which represents the empty string. */ + if (unlikely(n.u.n->byte_num == (size_t)-1)) { + n = n.u.n->child[0]; + break; + } + + if (n.u.n->byte_num < len) { + u8 c = bytes[n.u.n->byte_num]; + direction = (c >> n.u.n->bit_num) & 1; + } + n = n.u.n->child[direction]; + } + return n.u.s; +} + +char *strset_test(const struct strset *set, const char *member) +{ + const char *str; + + /* Empty set? */ + if (!set->u.n) + return NULL; + str = closest(*set, member); + if (streq(member, str)) + return (char *)str; + return NULL; +} + +static bool set_string(struct strset *set, + struct strset *n, const char *member) +{ + /* Substitute magic empty node if this is the empty string */ + if (unlikely(!member[0])) { + n->u.n = malloc(sizeof(*n->u.n)); + if (unlikely(!n->u.n)) + return false; + n->u.n->nul_byte = '\0'; + n->u.n->byte_num = (size_t)-1; + /* Attach the string to child[0] */ + n = &n->u.n->child[0]; + } + n->u.s = member; + return true; +} + +bool strset_set(struct strset *set, const char *member) +{ + size_t len = strlen(member); + const u8 *bytes = (const u8 *)member; + struct strset *np; + const char *str; + struct node *newn; + size_t byte_num; + u8 bit_num, new_dir; + + /* Empty set? */ + if (!set->u.n) { + return set_string(set, set, member); + } + + /* Find closest existing member. */ + str = closest(*set, member); + + /* Find where they differ. */ + for (byte_num = 0; str[byte_num] == member[byte_num]; byte_num++) { + if (member[byte_num] == '\0') { + /* All identical! */ + return false; + } + } + + /* Find which bit differs (if we had ilog8, we'd use it) */ + bit_num = ilog32_nz((u8)str[byte_num] ^ bytes[byte_num]) - 1; + assert(bit_num < CHAR_BIT); + + /* Which direction do we go at this bit? */ + new_dir = ((bytes[byte_num]) >> bit_num) & 1; + + /* Allocate new node. */ + newn = malloc(sizeof(*newn)); + if (!newn) { + /* FIXME */ + return false; + } + newn->nul_byte = '\0'; + newn->byte_num = byte_num; + newn->bit_num = bit_num; + if (unlikely(!set_string(set, &newn->child[new_dir], member))) { + free(newn); + return false; + } + + /* Find where to insert: not closest, but first which differs! */ + np = set; + while (!np->u.s[0]) { + u8 direction = 0; + + /* Special node which represents the empty string will + * break here too! */ + if (np->u.n->byte_num > byte_num) + break; + /* Subtle: bit numbers are "backwards" for comparison */ + if (np->u.n->byte_num == byte_num && np->u.n->bit_num < bit_num) + break; + + if (np->u.n->byte_num < len) { + u8 c = bytes[np->u.n->byte_num]; + direction = (c >> np->u.n->bit_num) & 1; + } + np = &np->u.n->child[direction]; + } + + newn->child[!new_dir]= *np; + np->u.n = newn; + return true; +} + +char *strset_clear(struct strset *set, const char *member) +{ + size_t len = strlen(member); + const u8 *bytes = (const u8 *)member; + struct strset *parent = NULL, *n; + const char *ret = NULL; + u8 direction = 0; /* prevent bogus gcc warning. */ + + /* Empty set? */ + if (!set->u.n) + return NULL; + + /* Find closest, but keep track of parent. */ + n = set; + /* Anything with first byte 0 is a node. */ + while (!n->u.s[0]) { + u8 c = 0; + + /* Special node which represents the empty string. */ + if (unlikely(n->u.n->byte_num == (size_t)-1)) { + const char *empty_str = n->u.n->child[0].u.s; + + if (member[0]) + return NULL; + + /* Sew empty string back so remaining logic works */ + free(n->u.n); + n->u.s = empty_str; + break; + } + + parent = n; + if (n->u.n->byte_num < len) { + c = bytes[n->u.n->byte_num]; + direction = (c >> n->u.n->bit_num) & 1; + } else + direction = 0; + n = &n->u.n->child[direction]; + } + + /* Did we find it? */ + if (!streq(member, n->u.s)) + return NULL; + + ret = n->u.s; + + if (!parent) { + /* We deleted last node. */ + set->u.n = NULL; + } else { + struct node *old = parent->u.n; + /* Raise other node to parent. */ + *parent = old->child[!direction]; + free(old); + } + + return (char *)ret; +} + +static bool iterate(struct strset n, + bool (*handle)(const char *, void *), void *data) +{ + if (n.u.s[0]) + return handle(n.u.s, data); + if (unlikely(n.u.n->byte_num == (size_t)-1)) + return handle(n.u.n->child[0].u.s, data); + + return iterate(n.u.n->child[0], handle, data) + || iterate(n.u.n->child[1], handle, data); +} + +void strset_iterate_(const struct strset *set, + bool (*handle)(const char *, void *), void *data) +{ + /* Empty set? */ + if (!set->u.n) + return; + + iterate(*set, handle, data); +} + +const struct strset *strset_prefix(const struct strset *set, const char *prefix) +{ + const struct strset *n, *top; + size_t len = strlen(prefix); + const u8 *bytes = (const u8 *)prefix; + + /* Empty set -> return empty set. */ + if (!set->u.n) + return set; + + top = n = set; + + /* We walk to find the top, but keep going to check prefix matches. */ + while (!n->u.s[0]) { + u8 c = 0, direction; + + /* Special node which represents the empty string. */ + if (unlikely(n->u.n->byte_num == (size_t)-1)) { + n = &n->u.n->child[0]; + break; + } + + if (n->u.n->byte_num < len) + c = bytes[n->u.n->byte_num]; + + direction = (c >> n->u.n->bit_num) & 1; + n = &n->u.n->child[direction]; + if (c) + top = n; + } + + if (!strstarts(n->u.s, prefix)) { + /* Convenient return for prefixes which do not appear in set. */ + static const struct strset empty_set; + return &empty_set; + } + + return top; +} + +static void destroy(struct strset n) +{ + if (!n.u.s[0]) { + if (likely(n.u.n->byte_num != (size_t)-1)) { + destroy(n.u.n->child[0]); + destroy(n.u.n->child[1]); + } + free(n.u.n); + } +} + +void strset_destroy(struct strset *set) +{ + if (set->u.n) + destroy(*set); + set->u.n = NULL; +} diff --git a/ccan/strset/strset.h b/ccan/strset/strset.h new file mode 100644 index 00000000..8d0d307e --- /dev/null +++ b/ccan/strset/strset.h @@ -0,0 +1,164 @@ +#ifndef CCAN_STRSET_H +#define CCAN_STRSET_H +#include "config.h" +#include +#include +#include + +/** + * struct strset - representation of a string set + * + * It's exposed here to allow you to embed it and so we can inline the + * trivial functions. + */ +struct strset { + union { + struct node *n; + const char *s; + } u; +}; + +/** + * strset_init - initialize a string set (empty) + * + * For completeness; if you've arranged for it to be NULL already you don't + * need this. + * + * Example: + * struct strset set; + * + * strset_init(&set); + */ +static inline void strset_init(struct strset *set) +{ + set->u.n = NULL; +} + +/** + * strset_empty - is this string set empty? + * @set: the set. + * + * Example: + * if (!strset_empty(&set)) + * abort(); + */ +static inline bool strset_empty(const struct strset *set) +{ + return set->u.n == NULL; +} + +/** + * strset_test - is this a member of this string set? + * @set: the set. + * @member: the string to search for. + * + * Returns the member, or NULL if it isn't in the set. + * + * Example: + * if (strset_test(&set, "hello")) + * printf("hello is in the set\n"); + */ +char *strset_test(const struct strset *set, const char *member); + +/** + * strset_set - place a member in the string set. + * @set: the set. + * @member: the string to place in the set. + * + * This returns false if we run out of memory, or (more normally) if that + * string already appears in the set. + * + * Note that the pointer is placed in the set, the string is not copied. If + * you want a copy in the set, use strdup(). + * + * Example: + * if (!strset_set(&set, "goodbye")) + * printf("goodbye was already in the set\n"); + */ +bool strset_set(struct strset *set, const char *member); + +/** + * strset_clear - remove a member from the string set. + * @set: the set. + * @member: the string to remove from the set. + * + * This returns the string which was passed to strset_set(), or NULL. + * This means that if you allocated a string (eg. using strdup()), you can + * free it here. + * + * Example: + * if (!strset_clear(&set, "goodbye")) + * printf("goodbye was not in the set?\n"); + */ +char *strset_clear(struct strset *set, const char *member); + +/** + * strset_destroy - remove every member from the set. + * @set: the set. + * + * The set will be empty after this. + * + * Example: + * strset_destroy(&set); + */ +void strset_destroy(struct strset *set); + +/** + * strset_iterate - ordered iteration over a set + * @set: the set. + * @handle: the function to call. + * @arg: the argument for the function (types should match). + * + * You should not alter the set within the @handle function! If it returns + * true, the iteration will stop. + * + * Example: + * static bool dump_some(const char *member, int *num) + * { + * // Only dump out num nodes. + * if (*(num--) == 0) + * return true; + * printf("%s\n", member); + * return false; + * } + * + * static void dump_set(const struct strset *set) + * { + * int max = 100; + * strset_iterate(set, dump_some, &max); + * if (max < 0) + * printf("... (truncated to 100 entries)\n"); + * } + */ +#define strset_iterate(set, handle, arg) \ + strset_iterate_((set), typesafe_cb_preargs(bool, void *, \ + (handle), (arg), \ + const char *), \ + (arg)) +void strset_iterate_(const struct strset *set, + bool (*handle)(const char *, void *), void *data); + + +/** + * strset_prefix - return a subset matching a prefix + * @set: the set. + * @prefix: the prefix. + * + * This returns a pointer into @set, so don't alter @set while using + * the return value. You can use strset_iterate(), strset_test() or + * strset_empty() on the returned pointer. + * + * Example: + * static void dump_prefix(const struct strset *set, const char *prefix) + * { + * int max = 100; + * printf("Nodes with prefix %s:\n", prefix); + * strset_iterate(strset_prefix(set, prefix), dump_some, &max); + * if (max < 0) + * printf("... (truncated to 100 entries)\n"); + * } + */ +const struct strset *strset_prefix(const struct strset *set, + const char *prefix); + +#endif /* CCAN_STRSET_H */ diff --git a/ccan/strset/test/run-hibit.c b/ccan/strset/test/run-hibit.c new file mode 100644 index 00000000..2ea158dd --- /dev/null +++ b/ccan/strset/test/run-hibit.c @@ -0,0 +1,82 @@ +/* Test high bit handling. */ +#include +#include +#include + +#define NUM 1000 + +static void encode(char template[3], unsigned int val) +{ + assert(val < 255 * 255); + template[0] = (val / 255) + 1; + template[1] = (val % 255) + 1; + template[2] = '\0'; +} + +static bool in_order(const char *value, unsigned int *count) +{ + char template[3]; + encode(template, *count); + ok1(streq(value, template)); + (*count)++; + return false; +} + +static bool in_order_by_2(const char *value, unsigned int *count) +{ + char template[3]; + encode(template, *count); + ok1(streq(value, template)); + (*count) += 2; + return false; +} + +static bool dump(const char *value, void *unused) +{ + diag("%s", value); + return false; +} + +int main(void) +{ + struct strset set; + unsigned int i; + char *str[NUM]; + + plan_tests(NUM + 3 * NUM / 2); + strset_init(&set); + + for (i = 0; i < NUM; i++) { + char template[3]; + encode(template, i); + str[i] = strdup(template); + } + + for (i = 0; i < NUM; i++) + strset_set(&set, str[i]); + + strset_iterate(&set, dump, NULL); + + /* Iterate. */ + i = 0; + strset_iterate(&set, in_order, &i); + + /* Preserve order after deletion. */ + for (i = 0; i < NUM; i += 2) + ok1(strset_clear(&set, str[i]) == str[i]); + + i = 1; + strset_iterate(&set, in_order_by_2, &i); + + for (i = 1; i < NUM; i += 2) + ok1(strset_clear(&set, str[i]) == str[i]); + + /* empty traverse. */ + strset_iterate(&set, in_order_by_2, (unsigned int *)NULL); + + for (i = 0; i < NUM; i++) + free(str[i]); + + /* This exits depending on whether all tests passed */ + return exit_status(); +} diff --git a/ccan/strset/test/run-order.c b/ccan/strset/test/run-order.c new file mode 100644 index 00000000..73784e75 --- /dev/null +++ b/ccan/strset/test/run-order.c @@ -0,0 +1,81 @@ +#include +#include +#include +#include + +#define NUM 1000 + +static bool in_order(const char *value, unsigned int *count) +{ + int i = atoi(value); + ok1(*count == i); + (*count)++; + return false; +} + +static bool in_order_by_2(const char *value, unsigned int *count) +{ + int i = atoi(value); + ok1(*count == i); + (*count) += 2; + return false; +} + +static bool dump(const char *value, void *unused) +{ + diag("%s", value); + return false; +} + +int main(void) +{ + struct strset set; + unsigned int i; + char *str[NUM]; + + plan_tests(NUM * 2 + 3 * NUM / 2); + strset_init(&set); + + for (i = 0; i < NUM; i++) { + char template[10]; + sprintf(template, "%08u", i); + str[i] = strdup(template); + } + + for (i = 0; i < NUM; i++) + strset_set(&set, str[i]); + + strset_iterate(&set, dump, NULL); + + /* Iterate. */ + i = 0; + strset_iterate(&set, in_order, &i); + + /* Preserve order after deletion. */ + for (i = 0; i < NUM; i += 2) + ok1(strset_clear(&set, str[i]) == str[i]); + + i = 1; + strset_iterate(&set, in_order_by_2, &i); + + for (i = 1; i < NUM; i += 2) + ok1(strset_clear(&set, str[i]) == str[i]); + + /* empty traverse. */ + strset_iterate(&set, in_order_by_2, (unsigned int *)NULL); + + /* insert backwards, should be fine. */ + for (i = 0; i < NUM; i++) + strset_set(&set, str[NUM-1-i]); + + i = 0; + strset_iterate(&set, in_order, &i); + + strset_destroy(&set); + + for (i = 0; i < NUM; i++) + free(str[i]); + + /* This exits depending on whether all tests passed */ + return exit_status(); +} diff --git a/ccan/strset/test/run-prefix.c b/ccan/strset/test/run-prefix.c new file mode 100644 index 00000000..87ce98fa --- /dev/null +++ b/ccan/strset/test/run-prefix.c @@ -0,0 +1,87 @@ +#include +#include +#include +#include + +/* Must be > 100, see below. */ +#define NUM 200 + +static bool in_order(const char *value, unsigned int *count) +{ + int i = atoi(value); + ok1(*count == i); + (*count)++; + return false; +} + +static bool find_empty(const char *value, char *empty) +{ + if (value == empty) + pass("Found empty entry!"); + return false; +} + +int main(void) +{ + struct strset set; + const struct strset *sub; + unsigned int i; + char *str[NUM], *empty; + + plan_tests(7 + 1 + 10 + 100); + strset_init(&set); + + for (i = 0; i < NUM; i++) { + char template[10]; + sprintf(template, "%08u", i); + str[i] = strdup(template); + } + + for (i = 0; i < NUM; i++) + strset_set(&set, str[i]); + + /* Nothing */ + sub = strset_prefix(&set, "a"); + ok1(strset_empty(sub)); + + /* Everything */ + sub = strset_prefix(&set, "0"); + ok1(sub->u.n == set.u.n); + sub = strset_prefix(&set, ""); + ok1(sub->u.n == set.u.n); + + /* Singleton. */ + sub = strset_prefix(&set, "00000000"); + i = 0; + strset_iterate(sub, in_order, &i); + ok1(i == 1); + + /* First 10. */ + sub = strset_prefix(&set, "0000000"); + i = 0; + strset_iterate(sub, in_order, &i); + ok1(i == 10); + + /* First 100. */ + sub = strset_prefix(&set, "000000"); + i = 0; + strset_iterate(sub, in_order, &i); + ok1(i == 100); + + /* Everything, *plus* empty string. */ + empty = strdup(""); + strset_set(&set, empty); + + sub = strset_prefix(&set, ""); + /* Check we get *our* empty string back! */ + strset_iterate(sub, find_empty, empty); + + strset_destroy(&set); + + for (i = 0; i < NUM; i++) + free(str[i]); + free(empty); + + /* This exits depending on whether all tests passed */ + return exit_status(); +} diff --git a/ccan/strset/test/run.c b/ccan/strset/test/run.c new file mode 100644 index 00000000..557c7080 --- /dev/null +++ b/ccan/strset/test/run.c @@ -0,0 +1,56 @@ +#include +#include +#include + +int main(void) +{ + struct strset set; + const char str[] = "hello"; + const char none[] = ""; + char *dup = strdup(str); + + /* This is how many tests you plan to run */ + plan_tests(24); + + strset_init(&set); + + ok1(!strset_test(&set, str)); + ok1(!strset_test(&set, none)); + ok1(!strset_clear(&set, str)); + ok1(!strset_clear(&set, none)); + + ok1(strset_set(&set, str)); + ok1(strset_test(&set, str)); + /* We compare the string, not the pointer. */ + ok1(strset_test(&set, dup)); + ok1(!strset_test(&set, none)); + + /* Delete should return original string. */ + ok1(strset_clear(&set, dup) == str); + ok1(!strset_test(&set, str)); + ok1(!strset_test(&set, none)); + + /* Try insert and delete of empty string. */ + ok1(strset_set(&set, none)); + ok1(strset_test(&set, none)); + ok1(!strset_test(&set, str)); + + /* Delete should return original string. */ + ok1(strset_clear(&set, "") == none); + ok1(!strset_test(&set, str)); + ok1(!strset_test(&set, none)); + + /* Both at once... */ + ok1(strset_set(&set, none)); + ok1(strset_set(&set, str)); + ok1(strset_test(&set, str)); + ok1(strset_test(&set, none)); + ok1(strset_clear(&set, "") == none); + ok1(strset_clear(&set, dup) == str); + + ok1(set.u.n == NULL); + free(dup); + + /* This exits depending on whether all tests passed */ + return exit_status(); +} -- 2.39.2