static bool check_hash(struct tdb_context *tdb,
tdb_off_t used[],
- size_t num_used)
+ size_t num_used, size_t num_flists)
{
- size_t num_found = 0;
+ /* Free lists also show up as used. */
+ size_t num_found = num_flists;
if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
0, 0, used, num_used, &num_found))
return false;
- /* 1 is for the free list. */
- if (num_found != num_used - 1) {
+ if (num_found != num_used) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: Not all entries are in hash\n");
return false;
static bool check_free(struct tdb_context *tdb,
tdb_off_t off,
const struct tdb_free_record *frec,
- tdb_off_t prev, unsigned int bucket)
+ tdb_off_t prev, tdb_off_t flist_off, unsigned int bucket)
{
if (frec_magic(frec) != TDB_FREE_MAGIC) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
(long long)off, (long long)frec->magic_and_meta);
return false;
}
+ if (frec_flist(frec) != flist_off) {
+ tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+ "tdb_check: offset %llu bad freelist 0x%llx\n",
+ (long long)off, (long long)frec_flist(frec));
+ return false;
+ }
+
if (tdb->methods->oob(tdb, off
+ frec->data_len+sizeof(struct tdb_used_record),
false))
return false;
if (tdb_read_convert(tdb, off, &f, sizeof(f)))
return false;
- if (!check_free(tdb, off, &f, prev, i))
+ if (!check_free(tdb, off, &f, prev, flist_off, i))
return false;
/* FIXME: Check hash bits */
int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
void *private_data)
{
- tdb_off_t *free = NULL, *used = NULL;
- size_t num_free = 0, num_used = 0, num_found = 0;
+ tdb_off_t *free = NULL, *used = NULL, flist;
+ size_t num_free = 0, num_used = 0, num_found = 0, num_flists = 0;
if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
return -1;
if (!check_linear(tdb, &used, &num_used, &free, &num_free))
goto fail;
- /* FIXME: Check key uniqueness? */
- if (!check_hash(tdb, used, num_used))
- goto fail;
+ for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) {
+ if (flist == TDB_OFF_ERR)
+ goto fail;
+ if (!check_free_list(tdb, flist, free, num_free, &num_found))
+ goto fail;
+ num_flists++;
+ }
- if (!check_free_list(tdb, tdb->flist_off, free, num_free, &num_found))
+ /* FIXME: Check key uniqueness? */
+ if (!check_hash(tdb, used, num_used, num_flists))
goto fail;
if (num_found != num_free) {
return bucket;
}
+tdb_off_t first_flist(struct tdb_context *tdb)
+{
+ return tdb_read_off(tdb, offsetof(struct tdb_header, free_list));
+}
+
+tdb_off_t next_flist(struct tdb_context *tdb, tdb_off_t flist)
+{
+ return tdb_read_off(tdb, flist + offsetof(struct tdb_freelist, next));
+}
+
int tdb_flist_init(struct tdb_context *tdb)
{
- tdb->flist_off = tdb_read_off(tdb,
- offsetof(struct tdb_header, free_list));
- if (tdb->flist_off == TDB_OFF_ERR)
- return -1;
+ /* Use reservoir sampling algorithm to select a free list at random. */
+ unsigned int rnd, max = 0;
+ tdb_off_t off;
+
+ tdb->flist_off = off = first_flist(tdb);
+
+ while (off) {
+ if (off == TDB_OFF_ERR)
+ return -1;
+
+ rnd = random();
+ if (rnd >= max) {
+ tdb->flist_off = off;
+ max = rnd;
+ }
+
+ off = next_flist(tdb, off);
+ }
return 0;
}
}
/* Returns free_buckets + 1, or list number to search. */
-static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
+static tdb_off_t find_free_head(struct tdb_context *tdb,
+ tdb_off_t flist_off,
+ tdb_off_t bucket)
{
/* Speculatively search for a non-zero bucket. */
- return tdb_find_nonzero_off(tdb, bucket_off(tdb->flist_off, 0),
+ return tdb_find_nonzero_off(tdb, bucket_off(flist_off, 0),
bucket, TDB_FREE_BUCKETS);
}
assert(len_with_header >= sizeof(new));
- new.magic_and_meta = TDB_FREE_MAGIC;
+ new.magic_and_meta = TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL)
+ | tdb->flist_off;
new.data_len = len_with_header - sizeof(struct tdb_used_record);
b_off = bucket_off(tdb->flist_off, size_to_bucket(new.data_len));
if (frec_magic(r) != TDB_FREE_MAGIC)
break;
- /* FIXME: Use flist from record */
- nb_off = bucket_off(tdb->flist_off,size_to_bucket(r->data_len));
+ nb_off = bucket_off(frec_flist(r), size_to_bucket(r->data_len));
/* We may be violating lock order here, so best effort. */
if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1)
break;
}
- if (unlikely(bucket_off(tdb->flist_off,
+ if (unlikely(bucket_off(frec_flist(r),
size_to_bucket(r->data_len))
!= nb_off)) {
tdb_unlock_free_bucket(tdb, nb_off);
/* We have to drop this to avoid deadlocks, so make sure record
* doesn't get coalesced by someone else! */
- r->magic_and_meta = TDB_COALESCING_MAGIC;
+ r->magic_and_meta = TDB_COALESCING_MAGIC << (64 - TDB_OFF_UPPER_STEAL);
r->data_len = end - off - sizeof(struct tdb_used_record);
if (tdb_access_commit(tdb, r) != 0)
goto err;
size_t keylen, size_t datalen, bool want_extra,
unsigned hashlow)
{
- tdb_off_t off;
+ tdb_off_t off, flist;
unsigned start_b, b;
+ bool wrapped = false;
/* If they are growing, add 50% to get to higher bucket. */
if (want_extra)
else
start_b = size_to_bucket(adjust_size(keylen, datalen));
- /* Start at exact size bucket, and search up... */
- for (b = find_free_head(tdb, start_b);
- b < TDB_FREE_BUCKETS;
- b = find_free_head(tdb, b + 1)) {
- /* Try getting one from list. */
- off = lock_and_alloc(tdb, tdb->flist_off,
- b, keylen, datalen, want_extra,
- hashlow);
- if (off == TDB_OFF_ERR)
- return TDB_OFF_ERR;
- if (off != 0)
- return off;
- /* Didn't work. Try next bucket. */
+ flist = tdb->flist_off;
+ while (!wrapped || flist != tdb->flist_off) {
+ /* Start at exact size bucket, and search up... */
+ for (b = find_free_head(tdb, flist, start_b);
+ b < TDB_FREE_BUCKETS;
+ b = find_free_head(tdb, flist, b + 1)) {
+ /* Try getting one from list. */
+ off = lock_and_alloc(tdb, flist,
+ b, keylen, datalen, want_extra,
+ hashlow);
+ if (off == TDB_OFF_ERR)
+ return TDB_OFF_ERR;
+ if (off != 0) {
+ /* Worked? Stay using this list. */
+ tdb->flist_off = flist;
+ return off;
+ }
+ /* Didn't work. Try next bucket. */
+ }
+
+ /* Hmm, try next list. */
+ flist = next_flist(tdb, flist);
+ if (flist == 0) {
+ wrapped = true;
+ flist = first_flist(tdb);
+ }
}
+
return 0;
}
#define TDB_MAGIC_FOOD "TDB file\n"
#define TDB_VERSION ((uint64_t)(0x26011967 + 7))
#define TDB_MAGIC ((uint64_t)0x1999)
-#define TDB_FREE_MAGIC ((~(uint64_t)TDB_MAGIC) << 6)
-#define TDB_COALESCING_MAGIC (0xBAD1DEA2FEEDULL << 6)
+#define TDB_FREE_MAGIC ((uint64_t)0xFE)
+#define TDB_COALESCING_MAGIC ((uint64_t)0xFD)
#define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
#define TDB_RECOVERY_MAGIC (0xf53bc0e7U)
#define TDB_RECOVERY_INVALID_MAGIC (0x0)
}
struct tdb_free_record {
- uint64_t magic_and_meta;
+ uint64_t magic_and_meta; /* TDB_OFF_UPPER_STEAL bits of magic */
uint64_t data_len; /* Not counting these two fields. */
/* This is why the minimum record size is 16 bytes. */
uint64_t next, prev;
static inline uint64_t frec_magic(const struct tdb_free_record *f)
{
- return f->magic_and_meta;
+ return f->magic_and_meta >> (64 - TDB_OFF_UPPER_STEAL);
+}
+
+static inline uint64_t frec_flist(const struct tdb_free_record *f)
+{
+ return f->magic_and_meta & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1);
}
/* this is stored at the front of every database */
struct tdb_freelist {
struct tdb_used_record hdr;
+ tdb_off_t next;
tdb_off_t buckets[TDB_FREE_BUCKETS];
};
/* free.c: */
int tdb_flist_init(struct tdb_context *tdb);
+/* check.c needs these to iterate through free lists. */
+tdb_off_t first_flist(struct tdb_context *tdb);
+tdb_off_t next_flist(struct tdb_context *tdb, tdb_off_t flist);
+
/* If this fails, try tdb_expand. */
tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
uint64_t hash, bool growing);
newdb.hdr.free_list = offsetof(struct new_database, flist);
memset(&newdb.flist, 0, sizeof(newdb.flist));
set_header(NULL, &newdb.flist.hdr, 0,
- sizeof(newdb.flist.buckets), sizeof(newdb.flist.buckets), 1);
+ sizeof(newdb.flist) - sizeof(newdb.flist.hdr),
+ sizeof(newdb.flist) - sizeof(newdb.flist.hdr), 1);
/* Magic food */
memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
add(layout, elem);
}
-void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len)
+void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
+ unsigned flist)
{
union tdb_layout_elem elem;
elem.base.type = FREE;
elem.free.len = len;
+ elem.free.flist_num = flist;
add(layout, elem);
}
}
static void set_freelist(void *mem, struct tdb_context *tdb,
- struct tle_freelist *freelist)
+ struct tle_freelist *freelist, struct tdb_header *hdr,
+ tdb_off_t last_flist)
{
struct tdb_freelist *flist = mem;
memset(flist, 0, sizeof(*flist));
set_header(tdb, &flist->hdr, 0,
sizeof(*flist) - sizeof(flist->hdr),
sizeof(*flist) - sizeof(flist->hdr), 1);
+
+ if (last_flist) {
+ flist = (struct tdb_freelist *)((char *)hdr + last_flist);
+ flist->next = freelist->base.off;
+ } else {
+ hdr->free_list = freelist->base.off;
+ }
}
static void add_to_freetable(struct tdb_context *tdb,
tdb_off_t eoff,
- tdb_off_t elen)
+ tdb_off_t elen,
+ struct tle_freelist *freelist)
{
+ tdb->flist_off = freelist->base.off;
add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen);
}
abort();
}
+static struct tle_freelist *find_flist(struct tdb_layout *layout, unsigned num)
+{
+ unsigned i;
+
+ for (i = 0; i < layout->num_elems; i++) {
+ if (layout->elem[i].base.type != FREELIST)
+ continue;
+ if (num == 0)
+ return &layout->elem[i].flist;
+ num--;
+ }
+ abort();
+}
+
/* FIXME: Support TDB_CONVERT */
struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
{
unsigned int i;
- tdb_off_t off, len, flist_off = 0;
+ tdb_off_t off, len, last_flist;
char *mem;
struct tdb_context *tdb;
e->base.off = off;
switch (e->base.type) {
case FREELIST:
- assert(flist_off == 0);
- flist_off = off;
len = freelist_len(&e->flist);
break;
case FREE:
}
off += len;
}
- /* Must have a free list! */
- assert(flist_off);
mem = malloc(off);
/* Now populate our header, cribbing from a real TDB header. */
free(tdb->map_ptr);
tdb->map_ptr = mem;
tdb->map_size = off;
- tdb->flist_off = flist_off;
+ last_flist = 0;
for (i = 0; i < layout->num_elems; i++) {
union tdb_layout_elem *e = &layout->elem[i];
switch (e->base.type) {
case FREELIST:
- set_freelist(mem + e->base.off, tdb, &e->flist);
+ set_freelist(mem + e->base.off, tdb, &e->flist,
+ (struct tdb_header *)mem, last_flist);
+ last_flist = e->base.off;
break;
case FREE:
set_free_record(mem + e->base.off, e->free.len);
break;
}
}
+ /* Must have a free list! */
+ assert(last_flist);
/* Now fill the free and hash tables. */
for (i = 0; i < layout->num_elems; i++) {
union tdb_layout_elem *e = &layout->elem[i];
switch (e->base.type) {
case FREE:
- add_to_freetable(tdb, e->base.off, e->free.len);
+ add_to_freetable(tdb, e->base.off, e->free.len,
+ find_flist(layout, e->free.flist_num));
break;
case DATA:
add_to_hashtable(tdb, e->base.off, e->used.key);
}
}
+ tdb->flist_off = find_flist(layout, 0)->base.off;
+
/* Get physical if they asked for it. */
if (layout->filename) {
int fd = open(layout->filename, O_WRONLY|O_TRUNC|O_CREAT,
struct tdb_layout *new_tdb_layout(const char *filename);
void tdb_layout_add_freelist(struct tdb_layout *layout);
-void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len);
+void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
+ unsigned flist);
void tdb_layout_add_used(struct tdb_layout *layout,
TDB_DATA key, TDB_DATA data,
tdb_len_t extra);
struct tle_free {
struct tle_base base;
tdb_len_t len;
+ unsigned flist_num;
};
struct tle_used {
layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout);
len = 1024;
- tdb_layout_add_free(layout, len);
+ tdb_layout_add_free(layout, len, 0);
tdb = tdb_layout_get(layout);
ok1(tdb_check(tdb, NULL, NULL) == 0);
ok1(free_record_length(tdb, layout->elem[1].base.off) == len);
/* No coalescing can be done due to used record */
layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout);
- tdb_layout_add_free(layout, 1024);
+ tdb_layout_add_free(layout, 1024, 0);
tdb_layout_add_used(layout, key, data, 6);
tdb = tdb_layout_get(layout);
ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
/* Coalescing can be done due to two free records, then EOF */
layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout);
- tdb_layout_add_free(layout, 1024);
- tdb_layout_add_free(layout, 2048);
+ tdb_layout_add_free(layout, 1024, 0);
+ tdb_layout_add_free(layout, 2048, 0);
tdb = tdb_layout_get(layout);
ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
ok1(free_record_length(tdb, layout->elem[2].base.off) == 2048);
/* Coalescing can be done due to two free records, then data */
layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout);
- tdb_layout_add_free(layout, 1024);
- tdb_layout_add_free(layout, 512);
+ tdb_layout_add_free(layout, 1024, 0);
+ tdb_layout_add_free(layout, 512, 0);
tdb_layout_add_used(layout, key, data, 6);
tdb = tdb_layout_get(layout);
ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
/* Coalescing can be done due to three free records, then EOF */
layout = new_tdb_layout(NULL);
tdb_layout_add_freelist(layout);
- tdb_layout_add_free(layout, 1024);
- tdb_layout_add_free(layout, 512);
- tdb_layout_add_free(layout, 256);
+ tdb_layout_add_free(layout, 1024, 0);
+ tdb_layout_add_free(layout, 512, 0);
+ tdb_layout_add_free(layout, 256, 0);
tdb = tdb_layout_get(layout);
ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
ok1(free_record_length(tdb, layout->elem[2].base.off) == 512);
--- /dev/null
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+#include "layout.h"
+
+int main(int argc, char *argv[])
+{
+ tdb_off_t off;
+ struct tdb_context *tdb;
+ struct tdb_layout *layout;
+ TDB_DATA key, data;
+
+ plan_tests(11);
+ key.dptr = (unsigned char *)"Hello";
+ data.dptr = (unsigned char *)"world";
+ data.dsize = 5;
+ key.dsize = 5;
+
+ /* Create a TDB with three free lists. */
+ layout = new_tdb_layout(NULL);
+ tdb_layout_add_freelist(layout);
+ tdb_layout_add_freelist(layout);
+ tdb_layout_add_freelist(layout);
+ tdb_layout_add_free(layout, 80, 0);
+ /* Used record prevent coalescing. */
+ tdb_layout_add_used(layout, key, data, 6);
+ tdb_layout_add_free(layout, 160, 1);
+ key.dsize--;
+ tdb_layout_add_used(layout, key, data, 7);
+ tdb_layout_add_free(layout, 320, 2);
+ key.dsize--;
+ tdb_layout_add_used(layout, key, data, 8);
+ tdb_layout_add_free(layout, 40, 0);
+ tdb = tdb_layout_get(layout);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ off = get_free(tdb, 0, 80 - sizeof(struct tdb_used_record), 0, 0);
+ ok1(off == layout->elem[3].base.off);
+ ok1(tdb->flist_off == layout->elem[0].base.off);
+
+ off = get_free(tdb, 0, 160 - sizeof(struct tdb_used_record), 0, 0);
+ ok1(off == layout->elem[5].base.off);
+ ok1(tdb->flist_off == layout->elem[1].base.off);
+
+ off = get_free(tdb, 0, 320 - sizeof(struct tdb_used_record), 0, 0);
+ ok1(off == layout->elem[7].base.off);
+ ok1(tdb->flist_off == layout->elem[2].base.off);
+
+ off = get_free(tdb, 0, 40 - sizeof(struct tdb_used_record), 0, 0);
+ ok1(off == layout->elem[9].base.off);
+ ok1(tdb->flist_off == layout->elem[0].base.off);
+
+ /* Now we fail. */
+ off = get_free(tdb, 0, 0, 1, 0);
+ ok1(off == 0);
+
+ tdb_close(tdb);
+
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}