From: Rusty Russell Date: Wed, 1 Dec 2010 13:22:43 +0000 (+1030) Subject: tdb2: shrink free header from 32 to 24 bytes. X-Git-Url: http://git.ozlabs.org/?p=ccan;a=commitdiff_plain;h=5e30abc662990449444769c71cf98ca788db4117 tdb2: shrink free header from 32 to 24 bytes. This reduces our minimum key+data length to 8 bytes; we do this by packing the prev pointer where we used to put the flist pointer, and storing the flist as an 8 bit index (meaning we can only have 256 free tables). Note that this has a perverse result on the size of the database, as our 4-byte key and 4-byte data now fit perfectly in a minimal record, so appeding causes us to allocate new records which are 50% larger, since we detect growing. Current results of speed test: $ ./speed 1000000 Adding 1000000 records: 23210 ns (59193360 bytes) Finding 1000000 records: 2387 ns (59193360 bytes) Traversing 1000000 records: 2150 ns (59193360 bytes) Deleting 1000000 records: 13392 ns (59193360 bytes) Re-adding 1000000 records: 11546 ns (59193360 bytes) Appending 1000000 records: 29327 ns (91193360 bytes) Churning 1000000 records: 33026 ns (91193360 bytes) Previous: $ ./speed 1000000 Adding 1000000 records: 28324 ns (67232528 bytes) Finding 1000000 records: 2468 ns (67232528 bytes) Traversing 1000000 records: 2200 ns (67232528 bytes) Deleting 1000000 records: 13083 ns (67232528 bytes) Re-adding 1000000 records: 16433 ns (67232528 bytes) Appending 1000000 records: 2511 ns (67232528 bytes) Churning 1000000 records: 31068 ns (67570448 bytes) --- diff --git a/ccan/tdb2/check.c b/ccan/tdb2/check.c index c5450ccc..97347920 100644 --- a/ccan/tdb2/check.c +++ b/ccan/tdb2/check.c @@ -315,37 +315,37 @@ static bool check_hash(struct tdb_context *tdb, static bool check_free(struct tdb_context *tdb, tdb_off_t off, const struct tdb_free_record *frec, - tdb_off_t prev, tdb_off_t flist_off, unsigned int bucket) + tdb_off_t prev, unsigned int flist, unsigned int bucket) { if (frec_magic(frec) != TDB_FREE_MAGIC) { tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_check: offset %llu bad magic 0x%llx\n", - (long long)off, (long long)frec->magic_and_meta); + (long long)off, (long long)frec->magic_and_prev); return false; } - if (frec_flist(frec) != flist_off) { + if (frec_flist(frec) != flist) { tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, - "tdb_check: offset %llu bad freelist 0x%llx\n", - (long long)off, (long long)frec_flist(frec)); + "tdb_check: offset %llu bad freelist %u\n", + (long long)off, frec_flist(frec)); return false; } if (tdb->methods->oob(tdb, off - + frec->data_len+sizeof(struct tdb_used_record), + + frec_len(frec) + sizeof(struct tdb_used_record), false)) return false; - if (size_to_bucket(frec->data_len) != bucket) { + if (size_to_bucket(frec_len(frec)) != bucket) { tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_check: offset %llu in wrong bucket %u vs %u\n", (long long)off, - bucket, size_to_bucket(frec->data_len)); + bucket, size_to_bucket(frec_len(frec))); return false; } - if (prev != frec->prev) { + if (prev != frec_prev(frec)) { tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_check: offset %llu bad prev %llu vs %llu\n", (long long)off, - (long long)prev, (long long)frec->prev); + (long long)prev, (long long)frec_len(frec)); return false; } return true; @@ -353,6 +353,7 @@ static bool check_free(struct tdb_context *tdb, static bool check_free_list(struct tdb_context *tdb, tdb_off_t flist_off, + unsigned flist_num, tdb_off_t free[], size_t num_free, size_t *num_found) @@ -384,7 +385,7 @@ static bool check_free_list(struct tdb_context *tdb, return false; if (tdb_read_convert(tdb, off, &f, sizeof(f))) return false; - if (!check_free(tdb, off, &f, prev, flist_off, i)) + if (!check_free(tdb, off, &f, prev, flist_num, i)) return false; /* FIXME: Check hash bits */ @@ -436,13 +437,17 @@ static bool check_linear(struct tdb_context *tdb, struct tdb_free_record f; struct tdb_recovery_record r; } pad, *p; - p = tdb_get(tdb, off, &pad, sizeof(pad)); + /* r is larger: only get that if we need to. */ + p = tdb_get(tdb, off, &pad, sizeof(pad.f)); if (!p) return false; /* If we crash after ftruncate, we can get zeroes or fill. */ if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC || p->r.magic == 0x4343434343434343ULL) { + p = tdb_get(tdb, off, &pad, sizeof(pad.r)); + if (!p) + return false; if (recovery == off) { found_recovery = true; len = sizeof(p->r) + p->r.max_len; @@ -462,6 +467,9 @@ static bool check_linear(struct tdb_context *tdb, (size_t)tdb->map_size); } } else if (p->r.magic == TDB_RECOVERY_MAGIC) { + p = tdb_get(tdb, off, &pad, sizeof(pad.r)); + if (!p) + return false; if (recovery != off) { tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_check: unexpected recovery" @@ -469,11 +477,23 @@ static bool check_linear(struct tdb_context *tdb, (size_t)off); return false; } + if (p->r.len > p->r.max_len) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: invalid recovery length" + " %zu\n", (size_t)p->r.len); + return false; + } + if (p->r.eof > tdb->map_size) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: invalid old EOF" + " %zu\n", (size_t)p->r.eof); + return false; + } found_recovery = true; len = sizeof(p->r) + p->r.max_len; } else if (frec_magic(&p->f) == TDB_FREE_MAGIC || frec_magic(&p->f) == TDB_COALESCING_MAGIC) { - len = sizeof(p->u) + p->f.data_len; + len = sizeof(p->u) + frec_len(&p->f); if (off + len > tdb->map_size) { tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "tdb_check: free overlength %llu" @@ -560,7 +580,8 @@ int tdb_check(struct tdb_context *tdb, for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) { if (flist == TDB_OFF_ERR) goto fail; - if (!check_free_list(tdb, flist, free, num_free, &num_found)) + if (!check_free_list(tdb, flist, num_flists, free, num_free, + &num_found)) goto fail; num_flists++; } diff --git a/ccan/tdb2/free.c b/ccan/tdb2/free.c index 6fd82c5c..13db5933 100644 --- a/ccan/tdb2/free.c +++ b/ccan/tdb2/free.c @@ -62,10 +62,11 @@ tdb_off_t next_flist(struct tdb_context *tdb, tdb_off_t flist) int tdb_flist_init(struct tdb_context *tdb) { /* Use reservoir sampling algorithm to select a free list at random. */ - unsigned int rnd, max = 0; + unsigned int rnd, max = 0, count = 0; tdb_off_t off; tdb->flist_off = off = first_flist(tdb); + tdb->flist = 0; while (off) { if (off == TDB_OFF_ERR) @@ -74,10 +75,12 @@ int tdb_flist_init(struct tdb_context *tdb) rnd = random(); if (rnd >= max) { tdb->flist_off = off; + tdb->flist = count; max = rnd; } off = next_flist(tdb, off); + count++; } return 0; } @@ -107,10 +110,10 @@ static int remove_from_list(struct tdb_context *tdb, tdb_off_t off; /* Front of list? */ - if (r->prev == 0) { + if (frec_prev(r) == 0) { off = b_off; } else { - off = r->prev + offsetof(struct tdb_free_record, next); + off = frec_prev(r) + offsetof(struct tdb_free_record, next); } #ifdef DEBUG @@ -128,11 +131,11 @@ static int remove_from_list(struct tdb_context *tdb, } if (r->next != 0) { - off = r->next + offsetof(struct tdb_free_record, prev); + off = r->next + offsetof(struct tdb_free_record,magic_and_prev); /* r->next->prev = r->prev */ #ifdef DEBUG - if (tdb_read_off(tdb, off) != r_off) { + if (tdb_read_off(tdb, off) & TDB_OFF_MASK != r_off) { tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "remove_from_list: %llu bad list %llu\n", (long long)r_off, (long long)b_off); @@ -140,7 +143,7 @@ static int remove_from_list(struct tdb_context *tdb, } #endif - if (tdb_write_off(tdb, off, r->prev)) { + if (tdb_write_off(tdb, off, r->magic_and_prev)) { return -1; } } @@ -151,58 +154,65 @@ static int remove_from_list(struct tdb_context *tdb, static int enqueue_in_free(struct tdb_context *tdb, tdb_off_t b_off, tdb_off_t off, - struct tdb_free_record *new) + tdb_len_t len) { - new->prev = 0; + struct tdb_free_record new; + uint64_t magic = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL)); + + /* We only need to set flist_and_len; rest is set in enqueue_in_free */ + new.flist_and_len = ((uint64_t)tdb->flist << (64 - TDB_OFF_UPPER_STEAL)) + | len; + /* prev = 0. */ + new.magic_and_prev = magic; + /* new->next = head. */ - new->next = tdb_read_off(tdb, b_off); - if (new->next == TDB_OFF_ERR) + new.next = tdb_read_off(tdb, b_off); + if (new.next == TDB_OFF_ERR) return -1; - if (new->next) { + if (new.next) { #ifdef DEBUG if (tdb_read_off(tdb, - new->next - + offsetof(struct tdb_free_record, prev)) - != 0) { + new.next + offsetof(struct tdb_free_record, + magic_and_prev)) + != magic) { tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "enqueue_in_free: %llu bad head prev %llu\n", - (long long)new->next, (long long)b_off); + (long long)new.next, (long long)b_off); return -1; } #endif /* next->prev = new. */ - if (tdb_write_off(tdb, new->next - + offsetof(struct tdb_free_record, prev), - off) != 0) + if (tdb_write_off(tdb, new.next + + offsetof(struct tdb_free_record, + magic_and_prev), + off | magic) != 0) return -1; } /* head = new */ if (tdb_write_off(tdb, b_off, off) != 0) return -1; - return tdb_write_convert(tdb, off, new, sizeof(*new)); + return tdb_write_convert(tdb, off, &new, sizeof(new)); } /* List need not be locked. */ int add_free_record(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len_with_header) { - struct tdb_free_record new; tdb_off_t b_off; + tdb_len_t len; int ret; - assert(len_with_header >= sizeof(new)); + assert(len_with_header >= sizeof(struct tdb_free_record)); - new.magic_and_meta = TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL) - | tdb->flist_off; - new.data_len = len_with_header - sizeof(struct tdb_used_record); + len = len_with_header - sizeof(struct tdb_used_record); - b_off = bucket_off(tdb->flist_off, size_to_bucket(new.data_len)); + b_off = bucket_off(tdb->flist_off, size_to_bucket(len)); if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0) return -1; - ret = enqueue_in_free(tdb, b_off, off, &new); + ret = enqueue_in_free(tdb, b_off, off, len); tdb_unlock_free_bucket(tdb, b_off); return ret; } @@ -234,6 +244,17 @@ static size_t record_leftover(size_t keylen, size_t datalen, return leftover; } +/* FIXME: Shortcut common case where tdb->flist == flist */ +static tdb_off_t flist_offset(struct tdb_context *tdb, unsigned int flist) +{ + tdb_off_t off = first_flist(tdb); + unsigned int i; + + for (i = 0; i < flist; i++) + off = next_flist(tdb, off); + return off; +} + /* Note: we unlock the current bucket if we coalesce or fail. */ static int coalesce(struct tdb_context *tdb, tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len) @@ -245,6 +266,7 @@ static int coalesce(struct tdb_context *tdb, while (end < tdb->map_size) { tdb_off_t nb_off; + unsigned flist, bucket; /* FIXME: do tdb_get here and below really win? */ r = tdb_get(tdb, end, &pad, sizeof(pad)); @@ -254,7 +276,9 @@ static int coalesce(struct tdb_context *tdb, if (frec_magic(r) != TDB_FREE_MAGIC) break; - nb_off = bucket_off(frec_flist(r), size_to_bucket(r->data_len)); + flist = frec_flist(r); + bucket = size_to_bucket(frec_len(r)); + nb_off = bucket_off(flist_offset(tdb, flist), bucket); /* We may be violating lock order here, so best effort. */ if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1) @@ -272,9 +296,8 @@ static int coalesce(struct tdb_context *tdb, break; } - if (unlikely(bucket_off(frec_flist(r), - size_to_bucket(r->data_len)) - != nb_off)) { + if (unlikely(frec_flist(r) != flist) + || unlikely(size_to_bucket(frec_len(r)) != bucket)) { tdb_unlock_free_bucket(tdb, nb_off); break; } @@ -284,7 +307,7 @@ static int coalesce(struct tdb_context *tdb, goto err; } - end += sizeof(struct tdb_used_record) + r->data_len; + end += sizeof(struct tdb_used_record) + frec_len(r); tdb_unlock_free_bucket(tdb, nb_off); } @@ -297,11 +320,11 @@ static int coalesce(struct tdb_context *tdb, if (!r) goto err; - if (r->data_len != data_len) { + if (frec_len(r) != data_len) { tdb->ecode = TDB_ERR_CORRUPT; tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, "coalesce: expected data len %llu not %llu\n", - (long long)data_len, (long long)r->data_len); + (long long)data_len, (long long)frec_len(r)); goto err; } @@ -314,8 +337,9 @@ static int coalesce(struct tdb_context *tdb, /* We have to drop this to avoid deadlocks, so make sure record * doesn't get coalesced by someone else! */ - r->magic_and_meta = TDB_COALESCING_MAGIC << (64 - TDB_OFF_UPPER_STEAL); - r->data_len = end - off - sizeof(struct tdb_used_record); + r->magic_and_prev = TDB_COALESCING_MAGIC << (64 - TDB_OFF_UPPER_STEAL); + /* FIXME: Use 255 as invalid free list? */ + r->flist_and_len = end - off - sizeof(struct tdb_used_record); if (tdb_access_commit(tdb, r) != 0) goto err; @@ -353,7 +377,7 @@ again: return TDB_OFF_ERR; } - best.data_len = -1ULL; + best.flist_and_len = -1ULL; best_off = 0; /* Get slack if we're after extra. */ @@ -377,22 +401,22 @@ again: if (frec_magic(r) != TDB_FREE_MAGIC) { tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, "lock_and_alloc: %llu non-free 0x%llx\n", - (long long)off, (long long)r->magic_and_meta); + (long long)off, (long long)r->magic_and_prev); goto unlock_err; } - if (r->data_len >= size && r->data_len < best.data_len) { + if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) { best_off = off; best = *r; } - if (best.data_len < size * multiplier && best_off) + if (frec_len(&best) < size * multiplier && best_off) break; multiplier *= 1.01; /* Since we're going slow anyway, try coalescing here. */ - switch (coalesce(tdb, off, b_off, r->data_len)) { + switch (coalesce(tdb, off, b_off, frec_len(r))) { case -1: /* This has already unlocked on error. */ return -1; @@ -413,13 +437,13 @@ again: goto unlock_err; leftover = record_leftover(keylen, datalen, want_extra, - best.data_len); + frec_len(&best)); - assert(keylen + datalen + leftover <= best.data_len); + assert(keylen + datalen + leftover <= frec_len(&best)); /* We need to mark non-free before we drop lock, otherwise * coalesce() could try to merge it! */ if (set_used_header(tdb, &rec, keylen, datalen, - best.data_len - leftover, + frec_len(&best) - leftover, hashlow) != 0) goto unlock_err; @@ -431,7 +455,7 @@ again: if (leftover) { if (add_free_record(tdb, best_off + sizeof(rec) - + best.data_len - leftover, + + frec_len(&best) - leftover, leftover)) return TDB_OFF_ERR; } @@ -451,8 +475,8 @@ static tdb_off_t get_free(struct tdb_context *tdb, size_t keylen, size_t datalen, bool want_extra, unsigned hashlow) { - tdb_off_t off, flist; - unsigned start_b, b; + tdb_off_t off, flist_off; + unsigned start_b, b, flist; bool wrapped = false; /* If they are growing, add 50% to get to higher bucket. */ @@ -462,31 +486,35 @@ static tdb_off_t get_free(struct tdb_context *tdb, else start_b = size_to_bucket(adjust_size(keylen, datalen)); - flist = tdb->flist_off; - while (!wrapped || flist != tdb->flist_off) { + flist_off = tdb->flist_off; + flist = tdb->flist; + while (!wrapped || flist_off != tdb->flist_off) { /* Start at exact size bucket, and search up... */ - for (b = find_free_head(tdb, flist, start_b); + for (b = find_free_head(tdb, flist_off, start_b); b < TDB_FREE_BUCKETS; - b = find_free_head(tdb, flist, b + 1)) { + b = find_free_head(tdb, flist_off, b + 1)) { /* Try getting one from list. */ - off = lock_and_alloc(tdb, flist, + off = lock_and_alloc(tdb, flist_off, b, keylen, datalen, want_extra, hashlow); if (off == TDB_OFF_ERR) return TDB_OFF_ERR; if (off != 0) { /* Worked? Stay using this list. */ - tdb->flist_off = flist; + tdb->flist_off = flist_off; + tdb->flist = flist; return off; } /* Didn't work. Try next bucket. */ } /* Hmm, try next list. */ - flist = next_flist(tdb, flist); - if (flist == 0) { + flist_off = next_flist(tdb, flist_off); + flist++; + if (flist_off == 0) { wrapped = true; - flist = first_flist(tdb); + flist_off = first_flist(tdb); + flist = 0; } } diff --git a/ccan/tdb2/private.h b/ccan/tdb2/private.h index 387e49f6..a10d1070 100644 --- a/ccan/tdb2/private.h +++ b/ccan/tdb2/private.h @@ -173,20 +173,30 @@ static inline uint16_t rec_magic(const struct tdb_used_record *r) } struct tdb_free_record { - uint64_t magic_and_meta; /* TDB_OFF_UPPER_STEAL bits of magic */ - uint64_t data_len; /* Not counting these two fields. */ - /* This is why the minimum record size is 16 bytes. */ - uint64_t next, prev; + uint64_t magic_and_prev; /* TDB_OFF_UPPER_STEAL bits magic, then prev */ + uint64_t flist_and_len; /* Len not counting these two fields. */ + /* This is why the minimum record size is 8 bytes. */ + uint64_t next; }; +static inline uint64_t frec_prev(const struct tdb_free_record *f) +{ + return f->magic_and_prev & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1); +} + static inline uint64_t frec_magic(const struct tdb_free_record *f) { - return f->magic_and_meta >> (64 - TDB_OFF_UPPER_STEAL); + return f->magic_and_prev >> (64 - TDB_OFF_UPPER_STEAL); +} + +static inline uint64_t frec_len(const struct tdb_free_record *f) +{ + return f->flist_and_len & ((1ULL << (64 - TDB_OFF_UPPER_STEAL))-1); } -static inline uint64_t frec_flist(const struct tdb_free_record *f) +static inline unsigned frec_flist(const struct tdb_free_record *f) { - return f->magic_and_meta & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1); + return f->flist_and_len >> (64 - TDB_OFF_UPPER_STEAL); } struct tdb_recovery_record { @@ -311,6 +321,7 @@ struct tdb_context { /* What freelist are we using? */ uint64_t flist_off; + unsigned int flist; /* IO methods: changes for transactions. */ const struct tdb_methods *methods; diff --git a/ccan/tdb2/summary.c b/ccan/tdb2/summary.c index 97052608..c04fa76f 100644 --- a/ccan/tdb2/summary.c +++ b/ccan/tdb2/summary.c @@ -63,7 +63,7 @@ static bool summarize(struct tdb_context *tdb, || p->r.magic == TDB_RECOVERY_MAGIC) { len = sizeof(p->r) + p->r.max_len; } else if (rec_magic(&p->u) != TDB_MAGIC) { - len = p->f.data_len; + len = frec_len(&p->f); tally_add(free, len); tally_add(buckets, size_to_bucket(len)); len += sizeof(p->u); diff --git a/ccan/tdb2/test/layout.c b/ccan/tdb2/test/layout.c index 3049d321..25f41e25 100644 --- a/ccan/tdb2/test/layout.c +++ b/ccan/tdb2/test/layout.c @@ -136,9 +136,11 @@ static void set_freelist(void *mem, struct tdb_context *tdb, static void add_to_freetable(struct tdb_context *tdb, tdb_off_t eoff, tdb_off_t elen, + unsigned flist, struct tle_freelist *freelist) { tdb->flist_off = freelist->base.off; + tdb->flist = flist; add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen); } @@ -288,6 +290,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout) switch (e->base.type) { case FREE: add_to_freetable(tdb, e->base.off, e->free.len, + e->free.flist_num, find_flist(layout, e->free.flist_num)); break; case DATA: diff --git a/ccan/tdb2/test/run-03-coalesce.c b/ccan/tdb2/test/run-03-coalesce.c index 5d55577c..4def490f 100644 --- a/ccan/tdb2/test/run-03-coalesce.c +++ b/ccan/tdb2/test/run-03-coalesce.c @@ -17,7 +17,7 @@ static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off) return TDB_OFF_ERR; if (frec_magic(&f) != TDB_FREE_MAGIC) return TDB_OFF_ERR; - return f.data_len; + return frec_len(&f); } int main(int argc, char *argv[])