From 49bb40e9615ada5a3e0d06bc45613744f441895c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 7 Nov 2014 15:08:03 +1030 Subject: [PATCH] invbloom: new module for IBLTs. Signed-off-by: Rusty Russell --- Makefile-ccan | 1 + ccan/invbloom/LICENSE | 1 + ccan/invbloom/_info | 66 ++++++++++++ ccan/invbloom/invbloom.c | 171 ++++++++++++++++++++++++++++++ ccan/invbloom/invbloom.h | 117 ++++++++++++++++++++ ccan/invbloom/test/run-subtract.c | 42 ++++++++ ccan/invbloom/test/run.c | 115 ++++++++++++++++++++ 7 files changed, 513 insertions(+) create mode 120000 ccan/invbloom/LICENSE create mode 100644 ccan/invbloom/_info create mode 100644 ccan/invbloom/invbloom.c create mode 100644 ccan/invbloom/invbloom.h create mode 100644 ccan/invbloom/test/run-subtract.c create mode 100644 ccan/invbloom/test/run.c diff --git a/Makefile-ccan b/Makefile-ccan index 5b6d9c94..74916d2a 100644 --- a/Makefile-ccan +++ b/Makefile-ccan @@ -59,6 +59,7 @@ MODS_WITH_SRC := antithread \ htable \ idtree \ ilog \ + invbloom \ io \ isaac \ iscsi \ diff --git a/ccan/invbloom/LICENSE b/ccan/invbloom/LICENSE new file mode 120000 index 00000000..2354d129 --- /dev/null +++ b/ccan/invbloom/LICENSE @@ -0,0 +1 @@ +../../licenses/BSD-MIT \ No newline at end of file diff --git a/ccan/invbloom/_info b/ccan/invbloom/_info new file mode 100644 index 00000000..03e9fe0b --- /dev/null +++ b/ccan/invbloom/_info @@ -0,0 +1,66 @@ +#include "config.h" +#include +#include + +/** + * invbloom - implementation of invertible bloom lookup tables. + * + * This code implements a subset of invertible bloom lookup tables + * as described in[1]. + * + * [1] Goodrich, Michael T., and Michael Mitzenmacher. "Invertible bloom + * lookup tables." Communication, Control, and Computing (Allerton), 2011 + * 49th Annual Allerton Conference on. IEEE, 2011. + * http://arxiv.org/pdf/1101.2245 + * + * License: BSD-MIT + * + * Example: + * #include + * #include + * + * int main(int argc, char *argv[]) + * { + * unsigned int i, n; + * struct invbloom *ib = invbloom_new(NULL, char *, 16, 0); + * + * for (i = 1; i < argc; i++) + * invbloom_insert(ib, &argv[i]); + * + * n = 0; + * for (i = 1; i < argc; i++) + * n += invbloom_get(ib, &argv[i]); + * + * printf("%u out of %u are found\n", n, argc); + * + * n = 0; + * for (i = 1; i < argc; i++) { + * unsigned int j; + * char **p = invbloom_extract(NULL, ib); + * + * for (j = 1; j < argc; j++) { + * if (p == &argv[j]) + * n++; + * } + * tal_free(p); + * } + * printf("%u out of %u were extracted\n", n, argc); + * return 0; + * } + */ +int main(int argc, char *argv[]) +{ + /* Expect exactly one argument */ + if (argc != 2) + return 1; + + if (strcmp(argv[1], "depends") == 0) { + printf("ccan/endian\n"); + printf("ccan/hash\n"); + printf("ccan/short_types\n"); + printf("ccan/tal\n"); + return 0; + } + + return 1; +} diff --git a/ccan/invbloom/invbloom.c b/ccan/invbloom/invbloom.c new file mode 100644 index 00000000..73e1825d --- /dev/null +++ b/ccan/invbloom/invbloom.c @@ -0,0 +1,171 @@ +/* Licensed under BSD-MIT - see LICENSE file for details */ +#include "invbloom.h" +#include +#include +#include + +/* "We will show that hash_count values of 3 or 4 work well in practice" + + From: + + Eppstein, David, et al. "What's the difference?: efficient set reconciliation without prior context." ACM SIGCOMM Computer Communication Review. Vol. 41. No. 4. ACM, 2011. http://conferences.sigcomm.org/sigcomm/2011/papers/sigcomm/p218.pdf +*/ +#define NUM_HASHES 4 + +struct invbloom *invbloom_new_(const tal_t *ctx, + size_t id_size, + size_t n_elems, + u32 salt) +{ + struct invbloom *ib = tal(ctx, struct invbloom); + + if (ib) { + ib->n_elems = n_elems; + ib->id_size = id_size; + ib->salt = salt; + ib->count = tal_arrz(ib, s32, n_elems); + ib->idsum = tal_arrz(ib, u8, id_size * n_elems); + if (!ib->count || !ib->idsum) + ib = tal_free(ib); + } + return ib; +} + +static size_t hash_bucket(const struct invbloom *ib, const void *id, size_t i) +{ + return hash((const char *)id, ib->id_size, ib->salt+i*7) % ib->n_elems; +} + +static u8 *idsum_ptr(const struct invbloom *ib, size_t bucket) +{ + return (u8 *)ib->idsum + bucket * ib->id_size; +} + +static void add_to_bucket(struct invbloom *ib, size_t n, const u8 *id) +{ + size_t i; + u8 *idsum = idsum_ptr(ib, n); + + ib->count[n]++; + + for (i = 0; i < ib->id_size; i++) + idsum[i] ^= id[i]; +} + +static void remove_from_bucket(struct invbloom *ib, size_t n, const u8 *id) +{ + size_t i; + u8 *idsum = idsum_ptr(ib, n); + + ib->count[n]--; + for (i = 0; i < ib->id_size; i++) + idsum[i] ^= id[i]; +} + +void invbloom_insert(struct invbloom *ib, const void *id) +{ + unsigned int i; + + for (i = 0; i < NUM_HASHES; i++) + add_to_bucket(ib, hash_bucket(ib, id, i), id); +} + +void invbloom_delete(struct invbloom *ib, const void *id) +{ + unsigned int i; + + for (i = 0; i < NUM_HASHES; i++) + remove_from_bucket(ib, hash_bucket(ib, id, i), id); +} + +static bool all_zero(const u8 *mem, size_t size) +{ + unsigned int i; + + for (i = 0; i < size; i++) + if (mem[i]) + return false; + return true; +} + +bool invbloom_get(const struct invbloom *ib, const void *id) +{ + unsigned int i; + + for (i = 0; i < NUM_HASHES; i++) { + size_t h = hash_bucket(ib, id, i); + u8 *idsum = idsum_ptr(ib, h); + + if (ib->count[h] == 0 && all_zero(idsum, ib->id_size)) + return false; + + if (ib->count[h] == 1) + return (memcmp(idsum, id, ib->id_size) == 0); + } + return false; +} + +static void *extract(const tal_t *ctx, struct invbloom *ib, int count) +{ + size_t i; + + /* FIXME: this makes full extraction O(n^2). */ + for (i = 0; i < ib->n_elems; i++) { + void *id; + + if (ib->count[i] != count) + continue; + + id = tal_dup(ctx, u8, idsum_ptr(ib, i), ib->id_size, 0); + return id; + } + return NULL; +} + +void *invbloom_extract(const tal_t *ctx, struct invbloom *ib) +{ + void *id; + + id = extract(ctx, ib, 1); + if (id) + invbloom_delete(ib, id); + return id; +} + +void *invbloom_extract_negative(const tal_t *ctx, struct invbloom *ib) +{ + void *id; + + id = extract(ctx, ib, -1); + if (id) + invbloom_insert(ib, id); + return id; +} + +void invbloom_subtract(struct invbloom *ib1, const struct invbloom *ib2) +{ + size_t i; + + assert(ib1->n_elems == ib2->n_elems); + assert(ib1->id_size == ib2->id_size); + assert(ib1->salt == ib2->salt); + + for (i = 0; i < ib1->n_elems; i++) + ib1->count[i] -= ib2->count[i]; + + for (i = 0; i < ib1->n_elems * ib1->id_size; i++) + ib1->idsum[i] ^= ib2->idsum[i]; +} + +bool invbloom_empty(const struct invbloom *ib) +{ + size_t i; + + for (i = 0; i < ib->n_elems; i++) { + if (ib->count[i]) + return false; + if (!all_zero(idsum_ptr(ib, i), ib->id_size)) + return false; + } + return true; +} diff --git a/ccan/invbloom/invbloom.h b/ccan/invbloom/invbloom.h new file mode 100644 index 00000000..04b156f6 --- /dev/null +++ b/ccan/invbloom/invbloom.h @@ -0,0 +1,117 @@ +/* Licensed under BSD-MIT - see LICENSE file for details */ +#ifndef CCAN_INVBLOOM_H +#define CCAN_INVBLOOM_H +#include +#include + +struct invbloom { + size_t n_elems; + size_t id_size; + u32 hashsum_bytes; /* 0 - 4. */ + u32 salt; + s32 *count; /* [n_elems] */ + u8 *idsum; /* [n_elems][id_size] */ +}; + +/** + * invbloom_new - create a new invertable bloom lookup table + * @ctx: context to tal() from, or NULL. + * @type: type to place into the buckets (must not contain padding) + * @n_elems: number of entries in table + * @salt: 32 bit seed for table + * + * Returns a new table, which can be freed with tal_free(). + */ +#define invbloom_new(ctx, type, n_elems, salt) \ + invbloom_new_((ctx), sizeof(type), (n_elems), (salt)) +struct invbloom *invbloom_new_(const tal_t *ctx, + size_t id_size, + size_t n_elems, u32 salt); + + +/** + * invbloom_insert - add a new element + * @ib: the invertable bloom lookup table. + * @elem: the element + * + * This is guaranteed to be the inverse of invbloom_delete. + */ +void invbloom_insert(struct invbloom *ib, const void *elem); + +/** + * invbloom_delete - remove an element + * @ib: the invertable bloom lookup table. + * @elem: the element + * + * Note that this doesn't check the element was previously added (as + * that can not be done in general anyway). + * + * This is guaranteed to be the inverse of invbloom_delete. + */ +void invbloom_delete(struct invbloom *ib, const void *elem); + +/** + * invbloom_get - check if an element is (probably) in the table. + * @ib: the invertable bloom lookup table. + * @elem: the element + * + * This may return a false negative if the table is too full. + * + * It will only return a false positive if deletions and insertions + * don't match, and have managed to produce a result which matches the + * element. This is much less likely. + */ +bool invbloom_get(const struct invbloom *ib, const void *elem); + +/** + * invbloom_extract - try to recover an entry added to a bloom lookup table. + * @ctx: the context to tal() the return value from. + * @ib: the invertable bloom lookup table. + * + * This may be able to recover (and delete) an element which was + * invbloom_insert()ed into the table (and not deleted). This will + * not work if the table is too full. + * + * It may return a bogus element if deletions and insertions don't + * match. + */ +void *invbloom_extract(const tal_t *ctx, struct invbloom *ib); + +/** + * invbloom_extract_negative - try to recover an entry deleted from a bloom lookup table. + * @ctx: the context to tal() the return value from. + * @ib: the invertable bloom lookup table. + * + * This may be able to recover (and insert/undo) an element which was + * invbloom_delete()ed from the table (and not inserted). This will + * not work if the table is too full. + * + * It may return a bogus element if deletions and insertions don't + * match. + */ +void *invbloom_extract_negative(const tal_t *ctx, struct invbloom *ib); + +/** + * invbloom_subtract - return the differences of two IBLTs. + * @ib1: the invertable bloom lookup table to alter + * @ib2: the invertable bloom lookup table to subtract. + * + * This produces exactly the same result as if a new table had all the + * elements only in @ib1 inserted, then all the elements onlt in @ib2 + * deleted. + * + * ie. if @ib1 and @ib2 are similar, the result may be a usable by + * invbloom_extract and invbloom_extract_negative. + */ +void invbloom_subtract(struct invbloom *ib1, const struct invbloom *ib2); + +/** + * invbloom_empty - is an invertable bloom lookup table completely clean? + * @ib: the invertable bloom lookup table + * + * This is always true if @ib has had the same elements inserted and + * deleted. It is far less likely to be true if different ones were + * deleted than inserted. + */ +bool invbloom_empty(const struct invbloom *ib); +#endif /* CCAN_INVBLOOM_H */ diff --git a/ccan/invbloom/test/run-subtract.c b/ccan/invbloom/test/run-subtract.c new file mode 100644 index 00000000..ae476423 --- /dev/null +++ b/ccan/invbloom/test/run-subtract.c @@ -0,0 +1,42 @@ +#include +/* Include the C files directly. */ +#include +#include + +int main(void) +{ + struct invbloom *ib1, *ib2; + const tal_t *ctx = tal(NULL, char); + int val = 1, val2 = 2, *ip; + + /* This is how many tests you plan to run */ + plan_tests(8); + + ib1 = invbloom_new(ctx, int, 1024, 0); + ib2 = invbloom_new(ctx, int, 1024, 0); + invbloom_insert(ib1, &val); + invbloom_insert(ib2, &val2); + + invbloom_subtract(ib1, ib2); + + ip = invbloom_extract(ctx, ib1); + ok1(ip); + ok1(tal_parent(ip) == ctx); + ok1(*ip == val); + + ip = invbloom_extract(ctx, ib1); + ok1(!ip); + + ip = invbloom_extract_negative(ctx, ib1); + ok1(ip); + ok1(tal_parent(ip) == ctx); + ok1(*ip == val2); + + ip = invbloom_extract_negative(ctx, ib1); + ok1(!ip); + + tal_free(ctx); + + /* This exits depending on whether all tests passed */ + return exit_status(); +} diff --git a/ccan/invbloom/test/run.c b/ccan/invbloom/test/run.c new file mode 100644 index 00000000..b770128b --- /dev/null +++ b/ccan/invbloom/test/run.c @@ -0,0 +1,115 @@ +#include +/* Include the C files directly. */ +#include +#include + +int main(void) +{ + struct invbloom *ib; + const tal_t *ctx = tal(NULL, char); + int val, val2, *ip, *ip2, i; + + /* This is how many tests you plan to run */ + plan_tests(127); + + ib = invbloom_new(ctx, int, 1, 100); + ok1(tal_parent(ib) == ctx); + ok1(ib->id_size == sizeof(int)); + ok1(ib->salt == 100); + ok1(ib->n_elems == 1); + ok1(invbloom_empty(ib)); + + val = 0; + invbloom_insert(ib, &val); + ok1(ib->count[0] == NUM_HASHES); + ok1(!invbloom_empty(ib)); + invbloom_delete(ib, &val); + ok1(invbloom_empty(ib)); + + val2 = 2; + invbloom_insert(ib, &val); + invbloom_insert(ib, &val2); + ok1(!invbloom_empty(ib)); + invbloom_delete(ib, &val); + ok1(!invbloom_empty(ib)); + invbloom_delete(ib, &val2); + ok1(invbloom_empty(ib)); + + tal_free(ib); + + ib = invbloom_new(ctx, int, 1, 100); + ok1(tal_parent(ib) == ctx); + ok1(ib->id_size == sizeof(int)); + ok1(ib->salt == 100); + ok1(ib->n_elems == 1); + ok1(invbloom_empty(ib)); + + val = 0; + invbloom_insert(ib, &val); + ok1(ib->count[0] == NUM_HASHES); + ok1(!invbloom_empty(ib)); + invbloom_delete(ib, &val); + ok1(invbloom_empty(ib)); + + val2 = 2; + invbloom_insert(ib, &val); + invbloom_insert(ib, &val2); + ok1(!invbloom_empty(ib)); + invbloom_delete(ib, &val); + ok1(!invbloom_empty(ib)); + invbloom_delete(ib, &val2); + ok1(invbloom_empty(ib)); + + tal_free(ib); + + /* Now, a more realistic test. */ + for (i = 0; i < 5; i++) { + ib = invbloom_new(ctx, int, 1024, i); + invbloom_insert(ib, &val); + invbloom_insert(ib, &val2); + ok1(invbloom_get(ib, &val)); + ok1(invbloom_get(ib, &val2)); + + ip = invbloom_extract_negative(ctx, ib); + ok1(!ip); + + ip = invbloom_extract(ctx, ib); + ok1(ip); + ok1(tal_parent(ip) == ctx); + ok1(*ip == val || *ip == val2); + + ip2 = invbloom_extract(ctx, ib); + ok1(ip2); + ok1(tal_parent(ip2) == ctx); + ok1(*ip2 == val || *ip2 == val2); + ok1(*ip2 != *ip); + + ok1(invbloom_extract(ctx, ib) == NULL); + + invbloom_delete(ib, &val); + invbloom_delete(ib, &val2); + ip = invbloom_extract(ctx, ib); + ok1(!ip); + + ip = invbloom_extract_negative(ctx, ib); + ok1(ip); + ok1(tal_parent(ip) == ctx); + ok1(*ip == val || *ip == val2); + + ip2 = invbloom_extract_negative(ctx, ib); + ok1(ip2); + ok1(tal_parent(ip2) == ctx); + ok1(*ip2 == val || *ip2 == val2); + ok1(*ip2 != *ip); + + ok1(invbloom_extract_negative(ctx, ib) == NULL); + ok1(invbloom_empty(ib)); + + tal_free(ib); + } + + tal_free(ctx); + + /* This exits depending on whether all tests passed */ + return exit_status(); +} -- 2.39.2