From 49bb40e9615ada5a3e0d06bc45613744f441895c Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 7 Nov 2014 15:08:03 +1030
Subject: [PATCH] invbloom: new module for IBLTs.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Makefile-ccan                     |   1 +
 ccan/invbloom/LICENSE             |   1 +
 ccan/invbloom/_info               |  66 ++++++++++++
 ccan/invbloom/invbloom.c          | 171 ++++++++++++++++++++++++++++++
 ccan/invbloom/invbloom.h          | 117 ++++++++++++++++++++
 ccan/invbloom/test/run-subtract.c |  42 ++++++++
 ccan/invbloom/test/run.c          | 115 ++++++++++++++++++++
 7 files changed, 513 insertions(+)
 create mode 120000 ccan/invbloom/LICENSE
 create mode 100644 ccan/invbloom/_info
 create mode 100644 ccan/invbloom/invbloom.c
 create mode 100644 ccan/invbloom/invbloom.h
 create mode 100644 ccan/invbloom/test/run-subtract.c
 create mode 100644 ccan/invbloom/test/run.c

diff --git a/Makefile-ccan b/Makefile-ccan
index 5b6d9c94..74916d2a 100644
--- a/Makefile-ccan
+++ b/Makefile-ccan
@@ -59,6 +59,7 @@ MODS_WITH_SRC := antithread \
 	htable \
 	idtree \
 	ilog \
+	invbloom \
 	io \
 	isaac \
 	iscsi \
diff --git a/ccan/invbloom/LICENSE b/ccan/invbloom/LICENSE
new file mode 120000
index 00000000..2354d129
--- /dev/null
+++ b/ccan/invbloom/LICENSE
@@ -0,0 +1 @@
+../../licenses/BSD-MIT
\ No newline at end of file
diff --git a/ccan/invbloom/_info b/ccan/invbloom/_info
new file mode 100644
index 00000000..03e9fe0b
--- /dev/null
+++ b/ccan/invbloom/_info
@@ -0,0 +1,66 @@
+#include "config.h"
+#include <stdio.h>
+#include <string.h>
+
+/**
+ * invbloom - implementation of invertible bloom lookup tables.
+ *
+ * This code implements a subset of invertible bloom lookup tables
+ * as described in[1].
+ *
+ * [1] Goodrich, Michael T., and Michael Mitzenmacher. "Invertible bloom
+ *     lookup tables." Communication, Control, and Computing (Allerton), 2011
+ *     49th Annual Allerton Conference on. IEEE, 2011.
+ *      http://arxiv.org/pdf/1101.2245
+ *
+ * License: BSD-MIT
+ *
+ * Example:
+ *	#include <ccan/invbloom/invbloom.h>
+ *	#include <stdio.h>
+ *
+ *	int main(int argc, char *argv[])
+ *	{
+ *		unsigned int i, n;
+ *		struct invbloom *ib = invbloom_new(NULL, char *, 16, 0);
+ *
+ *		for (i = 1; i < argc; i++)
+ *			invbloom_insert(ib, &argv[i]);
+ *
+ *		n = 0;
+ *		for (i = 1; i < argc; i++)
+ *			n += invbloom_get(ib, &argv[i]);
+ *
+ *		printf("%u out of %u are found\n", n, argc);
+ *
+ *		n = 0;
+ *		for (i = 1; i < argc; i++) {
+ *			unsigned int j;
+ *			char **p = invbloom_extract(NULL, ib);
+ *
+ *			for (j = 1; j < argc; j++) {
+ *				if (p == &argv[j])
+ *					n++;
+ *			}
+ *			tal_free(p);
+ *		}
+ *		printf("%u out of %u were extracted\n", n, argc);
+ *		return 0;
+ *	}
+ */
+int main(int argc, char *argv[])
+{
+	/* Expect exactly one argument */
+	if (argc != 2)
+		return 1;
+
+	if (strcmp(argv[1], "depends") == 0) {
+		printf("ccan/endian\n");
+		printf("ccan/hash\n");
+		printf("ccan/short_types\n");
+		printf("ccan/tal\n");
+		return 0;
+	}
+
+	return 1;
+}
diff --git a/ccan/invbloom/invbloom.c b/ccan/invbloom/invbloom.c
new file mode 100644
index 00000000..73e1825d
--- /dev/null
+++ b/ccan/invbloom/invbloom.c
@@ -0,0 +1,171 @@
+/* Licensed under BSD-MIT - see LICENSE file for details */
+#include "invbloom.h"
+#include <ccan/hash/hash.h>
+#include <ccan/endian/endian.h>
+#include <assert.h>
+
+/* 	"We will show that hash_count values of 3 or 4 work well in practice"
+
+	From:
+
+	Eppstein, David, et al. "What's the difference?: efficient set reconciliation without prior context." ACM SIGCOMM Computer Communication Review. Vol. 41. No. 4. ACM, 2011. http://conferences.sigcomm.org/sigcomm/2011/papers/sigcomm/p218.pdf
+*/
+#define NUM_HASHES 4
+
+struct invbloom *invbloom_new_(const tal_t *ctx,
+			       size_t id_size,
+			       size_t n_elems,
+			       u32 salt)
+{
+	struct invbloom *ib = tal(ctx, struct invbloom);
+
+	if (ib) {
+		ib->n_elems = n_elems;
+		ib->id_size = id_size;
+		ib->salt = salt;
+		ib->count = tal_arrz(ib, s32, n_elems);
+		ib->idsum = tal_arrz(ib, u8, id_size * n_elems);
+		if (!ib->count || !ib->idsum)
+			ib = tal_free(ib);
+	}
+	return ib;
+}
+
+static size_t hash_bucket(const struct invbloom *ib, const void *id, size_t i)
+{
+	return hash((const char *)id, ib->id_size, ib->salt+i*7) % ib->n_elems;
+}
+
+static u8 *idsum_ptr(const struct invbloom *ib, size_t bucket)
+{
+	return (u8 *)ib->idsum + bucket * ib->id_size;
+}
+
+static void add_to_bucket(struct invbloom *ib, size_t n, const u8 *id)
+{
+	size_t i;
+	u8 *idsum = idsum_ptr(ib, n);
+	
+	ib->count[n]++;
+
+	for (i = 0; i < ib->id_size; i++)
+		idsum[i] ^= id[i];
+}
+
+static void remove_from_bucket(struct invbloom *ib, size_t n, const u8 *id)
+{
+	size_t i;
+	u8 *idsum = idsum_ptr(ib, n);
+
+	ib->count[n]--;
+	for (i = 0; i < ib->id_size; i++)
+		idsum[i] ^= id[i];
+}
+
+void invbloom_insert(struct invbloom *ib, const void *id)
+{
+	unsigned int i;
+
+	for (i = 0; i < NUM_HASHES; i++)
+		add_to_bucket(ib, hash_bucket(ib, id, i), id);
+}
+
+void invbloom_delete(struct invbloom *ib, const void *id)
+{
+	unsigned int i;
+
+	for (i = 0; i < NUM_HASHES; i++)
+		remove_from_bucket(ib, hash_bucket(ib, id, i), id);
+}
+
+static bool all_zero(const u8 *mem, size_t size)
+{
+	unsigned int i;
+
+	for (i = 0; i < size; i++)
+		if (mem[i])
+			return false;
+	return true;
+}
+
+bool invbloom_get(const struct invbloom *ib, const void *id)
+{
+	unsigned int i;
+
+	for (i = 0; i < NUM_HASHES; i++) {
+		size_t h = hash_bucket(ib, id, i);
+		u8 *idsum = idsum_ptr(ib, h);
+
+		if (ib->count[h] == 0 && all_zero(idsum, ib->id_size))
+			return false;
+
+		if (ib->count[h] == 1)
+			return (memcmp(idsum, id, ib->id_size) == 0);
+	}
+	return false;
+}
+
+static void *extract(const tal_t *ctx, struct invbloom *ib, int count)
+{
+	size_t i;
+
+	/* FIXME: this makes full extraction O(n^2). */
+	for (i = 0; i < ib->n_elems; i++) {
+		void *id;
+
+		if (ib->count[i] != count)
+			continue;
+
+		id = tal_dup(ctx, u8, idsum_ptr(ib, i), ib->id_size, 0);
+		return id;
+	}
+	return NULL;
+}
+
+void *invbloom_extract(const tal_t *ctx, struct invbloom *ib)
+{
+	void *id;
+
+	id = extract(ctx, ib, 1);
+	if (id)
+		invbloom_delete(ib, id);
+	return id;
+}
+
+void *invbloom_extract_negative(const tal_t *ctx, struct invbloom *ib)
+{
+	void *id;
+
+	id = extract(ctx, ib, -1);
+	if (id)
+		invbloom_insert(ib, id);
+	return id;
+}
+
+void invbloom_subtract(struct invbloom *ib1, const struct invbloom *ib2)
+{
+	size_t i;
+
+	assert(ib1->n_elems == ib2->n_elems);
+	assert(ib1->id_size == ib2->id_size);
+	assert(ib1->salt == ib2->salt);
+
+	for (i = 0; i < ib1->n_elems; i++)
+		ib1->count[i] -= ib2->count[i];
+
+	for (i = 0; i < ib1->n_elems * ib1->id_size; i++)
+		ib1->idsum[i] ^= ib2->idsum[i];
+}
+
+bool invbloom_empty(const struct invbloom *ib)
+{
+	size_t i;
+
+	for (i = 0; i < ib->n_elems; i++) {
+		if (ib->count[i])
+			return false;
+		if (!all_zero(idsum_ptr(ib, i), ib->id_size))
+			return false;
+	}
+	return true;
+}
diff --git a/ccan/invbloom/invbloom.h b/ccan/invbloom/invbloom.h
new file mode 100644
index 00000000..04b156f6
--- /dev/null
+++ b/ccan/invbloom/invbloom.h
@@ -0,0 +1,117 @@
+/* Licensed under BSD-MIT - see LICENSE file for details */
+#ifndef CCAN_INVBLOOM_H
+#define CCAN_INVBLOOM_H
+#include <ccan/short_types/short_types.h>
+#include <ccan/tal/tal.h>
+
+struct invbloom {
+	size_t n_elems;
+	size_t id_size;
+	u32 hashsum_bytes; /* 0 - 4. */
+	u32 salt;
+	s32 *count; /* [n_elems] */
+	u8 *idsum; /* [n_elems][id_size] */
+};
+
+/**
+ * invbloom_new - create a new invertable bloom lookup table
+ * @ctx: context to tal() from, or NULL.
+ * @type: type to place into the buckets (must not contain padding)
+ * @n_elems: number of entries in table
+ * @salt: 32 bit seed for table
+ *
+ * Returns a new table, which can be freed with tal_free().
+ */
+#define invbloom_new(ctx, type, n_elems, salt)	\
+	invbloom_new_((ctx), sizeof(type), (n_elems), (salt))
+struct invbloom *invbloom_new_(const tal_t *ctx,
+			       size_t id_size,
+			       size_t n_elems, u32 salt);
+
+
+/**
+ * invbloom_insert - add a new element
+ * @ib: the invertable bloom lookup table.
+ * @elem: the element
+ *
+ * This is guaranteed to be the inverse of invbloom_delete.
+ */
+void invbloom_insert(struct invbloom *ib, const void *elem);
+
+/**
+ * invbloom_delete - remove an element
+ * @ib: the invertable bloom lookup table.
+ * @elem: the element
+ *
+ * Note that this doesn't check the element was previously added (as
+ * that can not be done in general anyway).
+ *
+ * This is guaranteed to be the inverse of invbloom_delete.
+ */
+void invbloom_delete(struct invbloom *ib, const void *elem);
+
+/**
+ * invbloom_get - check if an element is (probably) in the table.
+ * @ib: the invertable bloom lookup table.
+ * @elem: the element
+ *
+ * This may return a false negative if the table is too full.
+ *
+ * It will only return a false positive if deletions and insertions
+ * don't match, and have managed to produce a result which matches the
+ * element.  This is much less likely.
+ */
+bool invbloom_get(const struct invbloom *ib, const void *elem);
+
+/**
+ * invbloom_extract - try to recover an entry added to a bloom lookup table.
+ * @ctx: the context to tal() the return value from.
+ * @ib: the invertable bloom lookup table.
+ *
+ * This may be able to recover (and delete) an element which was
+ * invbloom_insert()ed into the table (and not deleted).  This will
+ * not work if the table is too full.
+ *
+ * It may return a bogus element if deletions and insertions don't
+ * match.
+ */
+void *invbloom_extract(const tal_t *ctx, struct invbloom *ib);
+
+/**
+ * invbloom_extract_negative - try to recover an entry deleted from a bloom lookup table.
+ * @ctx: the context to tal() the return value from.
+ * @ib: the invertable bloom lookup table.
+ *
+ * This may be able to recover (and insert/undo) an element which was
+ * invbloom_delete()ed from the table (and not inserted).  This will
+ * not work if the table is too full.
+ *
+ * It may return a bogus element if deletions and insertions don't
+ * match.
+ */
+void *invbloom_extract_negative(const tal_t *ctx, struct invbloom *ib);
+
+/**
+ * invbloom_subtract - return the differences of two IBLTs.
+ * @ib1: the invertable bloom lookup table to alter
+ * @ib2: the invertable bloom lookup table to subtract.
+ *
+ * This produces exactly the same result as if a new table had all the
+ * elements only in @ib1 inserted, then all the elements onlt in @ib2
+ * deleted.
+ *
+ * ie. if @ib1 and @ib2 are similar, the result may be a usable by
+ * invbloom_extract and invbloom_extract_negative.
+ */
+void invbloom_subtract(struct invbloom *ib1, const struct invbloom *ib2);
+
+/**
+ * invbloom_empty - is an invertable bloom lookup table completely clean?
+ * @ib: the invertable bloom lookup table
+ *
+ * This is always true if @ib has had the same elements inserted and
+ * deleted.  It is far less likely to be true if different ones were
+ * deleted than inserted.
+ */
+bool invbloom_empty(const struct invbloom *ib);
+#endif /* CCAN_INVBLOOM_H */
diff --git a/ccan/invbloom/test/run-subtract.c b/ccan/invbloom/test/run-subtract.c
new file mode 100644
index 00000000..ae476423
--- /dev/null
+++ b/ccan/invbloom/test/run-subtract.c
@@ -0,0 +1,42 @@
+#include <ccan/invbloom/invbloom.h>
+/* Include the C files directly. */
+#include <ccan/invbloom/invbloom.c>
+#include <ccan/tap/tap.h>
+
+int main(void)
+{
+	struct invbloom *ib1, *ib2;
+	const tal_t *ctx = tal(NULL, char);
+	int val = 1, val2 = 2, *ip;
+
+	/* This is how many tests you plan to run */
+	plan_tests(8);
+
+	ib1 = invbloom_new(ctx, int, 1024, 0);
+	ib2 = invbloom_new(ctx, int, 1024, 0);
+	invbloom_insert(ib1, &val);
+	invbloom_insert(ib2, &val2);
+
+	invbloom_subtract(ib1, ib2);
+
+	ip = invbloom_extract(ctx, ib1);
+	ok1(ip);
+	ok1(tal_parent(ip) == ctx);
+	ok1(*ip == val);
+
+	ip = invbloom_extract(ctx, ib1);
+	ok1(!ip);
+
+	ip = invbloom_extract_negative(ctx, ib1);
+	ok1(ip);
+	ok1(tal_parent(ip) == ctx);
+	ok1(*ip == val2);
+
+	ip = invbloom_extract_negative(ctx, ib1);
+	ok1(!ip);
+
+	tal_free(ctx);
+
+	/* This exits depending on whether all tests passed */
+	return exit_status();
+}
diff --git a/ccan/invbloom/test/run.c b/ccan/invbloom/test/run.c
new file mode 100644
index 00000000..b770128b
--- /dev/null
+++ b/ccan/invbloom/test/run.c
@@ -0,0 +1,115 @@
+#include <ccan/invbloom/invbloom.h>
+/* Include the C files directly. */
+#include <ccan/invbloom/invbloom.c>
+#include <ccan/tap/tap.h>
+
+int main(void)
+{
+	struct invbloom *ib;
+	const tal_t *ctx = tal(NULL, char);
+	int val, val2, *ip, *ip2, i;
+
+	/* This is how many tests you plan to run */
+	plan_tests(127);
+
+	ib = invbloom_new(ctx, int, 1, 100);
+	ok1(tal_parent(ib) == ctx);
+	ok1(ib->id_size == sizeof(int));
+	ok1(ib->salt == 100);
+	ok1(ib->n_elems == 1);
+	ok1(invbloom_empty(ib));
+
+	val = 0;
+	invbloom_insert(ib, &val);
+	ok1(ib->count[0] == NUM_HASHES);
+	ok1(!invbloom_empty(ib));
+	invbloom_delete(ib, &val);
+	ok1(invbloom_empty(ib));
+
+	val2 = 2;
+	invbloom_insert(ib, &val);
+	invbloom_insert(ib, &val2);
+	ok1(!invbloom_empty(ib));
+	invbloom_delete(ib, &val);
+	ok1(!invbloom_empty(ib));
+	invbloom_delete(ib, &val2);
+	ok1(invbloom_empty(ib));
+
+	tal_free(ib);
+
+	ib = invbloom_new(ctx, int, 1, 100);
+	ok1(tal_parent(ib) == ctx);
+	ok1(ib->id_size == sizeof(int));
+	ok1(ib->salt == 100);
+	ok1(ib->n_elems == 1);
+	ok1(invbloom_empty(ib));
+
+	val = 0;
+	invbloom_insert(ib, &val);
+	ok1(ib->count[0] == NUM_HASHES);
+	ok1(!invbloom_empty(ib));
+	invbloom_delete(ib, &val);
+	ok1(invbloom_empty(ib));
+
+	val2 = 2;
+	invbloom_insert(ib, &val);
+	invbloom_insert(ib, &val2);
+	ok1(!invbloom_empty(ib));
+	invbloom_delete(ib, &val);
+	ok1(!invbloom_empty(ib));
+	invbloom_delete(ib, &val2);
+	ok1(invbloom_empty(ib));
+
+	tal_free(ib);
+
+	/* Now, a more realistic test. */
+	for (i = 0; i < 5; i++) {
+		ib = invbloom_new(ctx, int, 1024, i);
+		invbloom_insert(ib, &val);
+		invbloom_insert(ib, &val2);
+		ok1(invbloom_get(ib, &val));
+		ok1(invbloom_get(ib, &val2));
+
+		ip = invbloom_extract_negative(ctx, ib);
+		ok1(!ip);
+
+		ip = invbloom_extract(ctx, ib);
+		ok1(ip);
+		ok1(tal_parent(ip) == ctx);
+		ok1(*ip == val || *ip == val2);
+
+		ip2 = invbloom_extract(ctx, ib);
+		ok1(ip2);
+		ok1(tal_parent(ip2) == ctx);
+		ok1(*ip2 == val || *ip2 == val2);
+		ok1(*ip2 != *ip);
+
+		ok1(invbloom_extract(ctx, ib) == NULL);
+
+		invbloom_delete(ib, &val);
+		invbloom_delete(ib, &val2);
+		ip = invbloom_extract(ctx, ib);
+		ok1(!ip);
+
+		ip = invbloom_extract_negative(ctx, ib);
+		ok1(ip);
+		ok1(tal_parent(ip) == ctx);
+		ok1(*ip == val || *ip == val2);
+
+		ip2 = invbloom_extract_negative(ctx, ib);
+		ok1(ip2);
+		ok1(tal_parent(ip2) == ctx);
+		ok1(*ip2 == val || *ip2 == val2);
+		ok1(*ip2 != *ip);
+
+		ok1(invbloom_extract_negative(ctx, ib) == NULL);
+		ok1(invbloom_empty(ib));
+
+		tal_free(ib);
+	}
+
+	tal_free(ctx);
+
+	/* This exits depending on whether all tests passed */
+	return exit_status();
+}
-- 
2.39.2