From 5c4a21ab03a428373e7659b9606facf85dcbe17b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 27 Apr 2011 21:43:23 +0930 Subject: [PATCH] tdb2: don't start again when we coalesce a record. We currently start walking the free list again when we coalesce any record; this is overzealous, as we only care about the next record being blatted, or the record we currently consider "best". We can also opportunistically try to add the coalesced record into the new free list: if it fails, we go back to the old "mark record, unlock, re-lock" code. Before: $ time ./growtdb-bench 250000 10 > /dev/null && ls -l /tmp/growtdb.tdb && time ./tdbtorture -s 0 && ls -l torture.tdb && ./speed --transaction 2000000 real 1m0.243s user 0m13.677s sys 0m4.336s -rw------- 1 rusty rusty 683302864 2011-04-27 21:03 /tmp/growtdb.tdb testing with 3 processes, 5000 loops, seed=0 OK real 1m24.074s user 0m0.344s sys 0m0.468s -rw------- 1 rusty rusty 836040 2011-04-27 21:04 torture.tdb Adding 2000000 records: 1015 ns (110551992 bytes) Finding 2000000 records: 641 ns (110551992 bytes) Missing 2000000 records: 445 ns (110551992 bytes) Traversing 2000000 records: 439 ns (110551992 bytes) Deleting 2000000 records: 807 ns (199517112 bytes) Re-adding 2000000 records: 851 ns (199517112 bytes) Appending 2000000 records: 1301 ns (376542552 bytes) Churning 2000000 records: 2423 ns (553641304 bytes) After: $ time ./growtdb-bench 250000 10 > /dev/null && ls -l /tmp/growtdb.tdb && time ./tdbtorture -s 0 && ls -l torture.tdb && ./speed --transaction 2000000 real 0m57.403s user 0m11.361s sys 0m4.056s -rw------- 1 rusty rusty 689536976 2011-04-27 21:10 /tmp/growtdb.tdb testing with 3 processes, 5000 loops, seed=0 OK real 1m24.901s user 0m0.380s sys 0m0.512s -rw------- 1 rusty rusty 655368 2011-04-27 21:12 torture.tdb Adding 2000000 records: 941 ns (110551992 bytes) Finding 2000000 records: 603 ns (110551992 bytes) Missing 2000000 records: 428 ns (110551992 bytes) Traversing 2000000 records: 416 ns (110551992 bytes) Deleting 2000000 records: 741 ns (199517112 bytes) Re-adding 2000000 records: 819 ns (199517112 bytes) Appending 2000000 records: 1228 ns (376542552 bytes) Churning 2000000 records: 2042 ns (553641304 bytes) --- ccan/tdb2/free.c | 81 ++++++++++++++++++++++--------- ccan/tdb2/private.h | 3 +- ccan/tdb2/tdb.c | 6 ++- ccan/tdb2/test/layout.c | 3 +- ccan/tdb2/test/run-03-coalesce.c | 29 ++++++++--- ccan/tdb2/test/run-04-basichash.c | 3 +- ccan/tdb2/transaction.c | 3 +- 7 files changed, 92 insertions(+), 36 deletions(-) diff --git a/ccan/tdb2/free.c b/ccan/tdb2/free.c index eaaeb3cf..cd9a332a 100644 --- a/ccan/tdb2/free.c +++ b/ccan/tdb2/free.c @@ -266,7 +266,8 @@ static enum TDB_ERROR enqueue_in_free(struct tdb_context *tdb, /* List need not be locked. */ enum TDB_ERROR add_free_record(struct tdb_context *tdb, - tdb_off_t off, tdb_len_t len_with_header) + tdb_off_t off, tdb_len_t len_with_header, + enum tdb_lock_flags waitflag) { tdb_off_t b_off; tdb_len_t len; @@ -277,7 +278,7 @@ enum TDB_ERROR add_free_record(struct tdb_context *tdb, len = len_with_header - sizeof(struct tdb_used_record); b_off = bucket_off(tdb->ftable_off, size_to_bucket(len)); - ecode = tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT); + ecode = tdb_lock_free_bucket(tdb, b_off, waitflag); if (ecode != TDB_SUCCESS) { return ecode; } @@ -333,10 +334,13 @@ static tdb_off_t ftable_offset(struct tdb_context *tdb, unsigned int ftable) return off; } -/* Note: we unlock the current bucket if we coalesce (> 0) or fail (-ve). */ +/* Note: we unlock the current bucket if fail (-ve), or coalesce (-ve) and + * need to blatt either of the *protect records (which is set to an error). */ static tdb_len_t coalesce(struct tdb_context *tdb, tdb_off_t off, tdb_off_t b_off, - tdb_len_t data_len) + tdb_len_t data_len, + tdb_off_t *protect1, + tdb_off_t *protect2) { tdb_off_t end; struct tdb_free_record rec; @@ -400,6 +404,10 @@ static tdb_len_t coalesce(struct tdb_context *tdb, break; } + /* Did we just mess up a record you were hoping to use? */ + if (end == *protect1 || end == *protect2) + *protect1 = TDB_ERR_NOEXIST; + ecode = remove_from_list(tdb, nb_off, end, &rec); check_list(tdb, nb_off); if (ecode != TDB_SUCCESS) { @@ -416,6 +424,10 @@ static tdb_len_t coalesce(struct tdb_context *tdb, if (end == off + sizeof(struct tdb_used_record) + data_len) return 0; + /* Before we expand, check this isn't one you wanted protected? */ + if (off == *protect1 || off == *protect2) + *protect1 = TDB_ERR_EXISTS; + /* OK, expand initial record */ ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec)); if (ecode != TDB_SUCCESS) { @@ -435,24 +447,37 @@ static tdb_len_t coalesce(struct tdb_context *tdb, goto err; } - /* We have to drop this to avoid deadlocks, so make sure record - * doesn't get coalesced by someone else! */ - rec.ftable_and_len = (TDB_FTABLE_NONE << (64 - TDB_OFF_UPPER_STEAL)) - | (end - off - sizeof(struct tdb_used_record)); - ecode = tdb_write_off(tdb, off + offsetof(struct tdb_free_record, - ftable_and_len), - rec.ftable_and_len); + /* Try locking violation first... */ + ecode = add_free_record(tdb, off, end - off, TDB_LOCK_NOWAIT); if (ecode != TDB_SUCCESS) { - goto err; - } + /* Need to drop lock. Can't rely on anything stable. */ + *protect1 = TDB_ERR_CORRUPT; + + /* We have to drop this to avoid deadlocks, so make sure record + * doesn't get coalesced by someone else! */ + rec.ftable_and_len = (TDB_FTABLE_NONE + << (64 - TDB_OFF_UPPER_STEAL)) + | (end - off - sizeof(struct tdb_used_record)); + ecode = tdb_write_off(tdb, + off + offsetof(struct tdb_free_record, + ftable_and_len), + rec.ftable_and_len); + if (ecode != TDB_SUCCESS) { + goto err; + } - tdb->stats.alloc_coalesce_succeeded++; - tdb_unlock_free_bucket(tdb, b_off); + tdb->stats.alloc_coalesce_succeeded++; + tdb_unlock_free_bucket(tdb, b_off); - ecode = add_free_record(tdb, off, end - off); - if (ecode != TDB_SUCCESS) { - return ecode; + ecode = add_free_record(tdb, off, end - off, TDB_LOCK_WAIT); + if (ecode != TDB_SUCCESS) { + return ecode; + } + } else if (TDB_OFF_IS_ERR(*protect1)) { + /* For simplicity, we always drop lock if they can't continue */ + tdb_unlock_free_bucket(tdb, b_off); } + /* Return usable length. */ return end - off - sizeof(struct tdb_used_record); @@ -474,6 +499,7 @@ static tdb_off_t lock_and_alloc(struct tdb_context *tdb, tdb_off_t off, b_off,best_off; struct tdb_free_record best = { 0 }; double multiplier; + bool coalesce_after_best = false; /* Damn GCC warning! */ size_t size = adjust_size(keylen, datalen); enum TDB_ERROR ecode; @@ -529,6 +555,7 @@ again: if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) { best_off = off; best = *r; + coalesce_after_best = false; } if (frec_len(&best) <= size * multiplier && best_off) { @@ -543,15 +570,17 @@ again: tdb_access_release(tdb, r); /* Since we're going slow anyway, try coalescing here. */ - coal = coalesce(tdb, off, b_off, len); + coal = coalesce(tdb, off, b_off, len, &best_off, &next); if (TDB_OFF_IS_ERR(coal)) { /* This has already unlocked on error. */ return coal; } - if (coal > 0) { + if (TDB_OFF_IS_ERR(best_off)) { /* This has unlocked list, restart. */ goto again; } + if (coal > 0) + coalesce_after_best = true; off = next; } @@ -560,6 +589,14 @@ again: struct tdb_used_record rec; size_t leftover; + /* If we coalesced, we might have change prev/next ptrs. */ + if (coalesce_after_best) { + ecode = tdb_read_convert(tdb, best_off, &best, + sizeof(best)); + if (ecode != TDB_SUCCESS) + goto unlock_err; + } + /* We're happy with this size: take it. */ ecode = remove_from_list(tdb, b_off, best_off, &best); check_list(tdb, b_off); @@ -600,7 +637,7 @@ again: ecode = add_free_record(tdb, best_off + sizeof(rec) + frec_len(&best) - leftover, - leftover); + leftover, TDB_LOCK_WAIT); if (ecode != TDB_SUCCESS) { best_off = ecode; } @@ -774,7 +811,7 @@ static enum TDB_ERROR tdb_expand(struct tdb_context *tdb, tdb_len_t size) tdb_unlock_expand(tdb, F_WRLCK); tdb->stats.expands++; - return add_free_record(tdb, old_size, wanted); + return add_free_record(tdb, old_size, wanted, TDB_LOCK_WAIT); } /* This won't fail: it will expand the database if it has to. */ diff --git a/ccan/tdb2/private.h b/ccan/tdb2/private.h index 14d319ce..213e8361 100644 --- a/ccan/tdb2/private.h +++ b/ccan/tdb2/private.h @@ -465,7 +465,8 @@ tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen, /* Put this record in a free list. */ enum TDB_ERROR add_free_record(struct tdb_context *tdb, - tdb_off_t off, tdb_len_t len_with_header); + tdb_off_t off, tdb_len_t len_with_header, + enum tdb_lock_flags waitflag); /* Set up header for a used/ftable/htable/chain record. */ enum TDB_ERROR set_header(struct tdb_context *tdb, diff --git a/ccan/tdb2/tdb.c b/ccan/tdb2/tdb.c index b96c8010..d7b5163b 100644 --- a/ccan/tdb2/tdb.c +++ b/ccan/tdb2/tdb.c @@ -41,7 +41,8 @@ static enum TDB_ERROR replace_data(struct tdb_context *tdb, tdb->stats.frees++; ecode = add_free_record(tdb, old_off, sizeof(struct tdb_used_record) - + key.dsize + old_room); + + key.dsize + old_room, + TDB_LOCK_WAIT); if (ecode == TDB_SUCCESS) ecode = replace_in_hash(tdb, h, new_off); } else { @@ -290,7 +291,8 @@ enum TDB_ERROR tdb_delete(struct tdb_context *tdb, struct tdb_data key) sizeof(struct tdb_used_record) + rec_key_length(&rec) + rec_data_length(&rec) - + rec_extra_padding(&rec)); + + rec_extra_padding(&rec), + TDB_LOCK_WAIT); if (tdb->flags & TDB_SEQNUM) tdb_inc_seqnum(tdb); diff --git a/ccan/tdb2/test/layout.c b/ccan/tdb2/test/layout.c index 7549798e..be54fe97 100644 --- a/ccan/tdb2/test/layout.c +++ b/ccan/tdb2/test/layout.c @@ -149,7 +149,8 @@ static void add_to_freetable(struct tdb_context *tdb, { tdb->ftable_off = freetable->base.off; tdb->ftable = ftable; - add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen); + add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen, + TDB_LOCK_WAIT); } static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned ingroup) diff --git a/ccan/tdb2/test/run-03-coalesce.c b/ccan/tdb2/test/run-03-coalesce.c index cecd7eb2..e60e341e 100644 --- a/ccan/tdb2/test/run-03-coalesce.c +++ b/ccan/tdb2/test/run-03-coalesce.c @@ -25,15 +25,16 @@ static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off) int main(int argc, char *argv[]) { - tdb_off_t b_off; + tdb_off_t b_off, test; struct tdb_context *tdb; struct tdb_layout *layout; struct tdb_data data, key; tdb_len_t len; /* FIXME: Test TDB_CONVERT */ + /* FIXME: Test lock order fail. */ - plan_tests(38); + plan_tests(42); data = tdb_mkdata("world", 5); key = tdb_mkdata("hello", 5); @@ -50,9 +51,12 @@ int main(int argc, char *argv[]) b_off = bucket_off(tdb->ftable_off, size_to_bucket(len)); /* Lock and fail to coalesce. */ ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); - ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len) == 0); + test = layout->elem[1].base.off; + ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len, &test, &test) + == 0); tdb_unlock_free_bucket(tdb, b_off); ok1(free_record_length(tdb, layout->elem[1].base.off) == len); + ok1(test == layout->elem[1].base.off); ok1(tdb_check(tdb, NULL, NULL) == 0); tdb_close(tdb); tdb_layout_free(layout); @@ -70,9 +74,12 @@ int main(int argc, char *argv[]) b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024)); /* Lock and fail to coalesce. */ ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); - ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 0); + test = layout->elem[1].base.off; + ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test, &test) + == 0); tdb_unlock_free_bucket(tdb, b_off); ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024); + ok1(test == layout->elem[1].base.off); ok1(tdb_check(tdb, NULL, NULL) == 0); tdb_close(tdb); tdb_layout_free(layout); @@ -91,8 +98,11 @@ int main(int argc, char *argv[]) b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024)); /* Lock and coalesce. */ ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); - ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) + test = layout->elem[2].base.off; + ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test, &test) == 1024 + sizeof(struct tdb_used_record) + 2048); + /* Should tell us it's erased this one... */ + ok1(test == TDB_ERR_NOEXIST); ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0); ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024 + sizeof(struct tdb_used_record) + 2048); @@ -115,11 +125,13 @@ int main(int argc, char *argv[]) b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024)); /* Lock and coalesce. */ ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); - ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) + test = layout->elem[2].base.off; + ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test, &test) == 1024 + sizeof(struct tdb_used_record) + 512); ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0); ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024 + sizeof(struct tdb_used_record) + 512); + ok1(test == TDB_ERR_NOEXIST); ok1(tdb_check(tdb, NULL, NULL) == 0); tdb_close(tdb); tdb_layout_free(layout); @@ -140,8 +152,9 @@ int main(int argc, char *argv[]) b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024)); /* Lock and coalesce. */ ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); - ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == - 1024 + sizeof(struct tdb_used_record) + 512 + test = layout->elem[2].base.off; + ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test, &test) + == 1024 + sizeof(struct tdb_used_record) + 512 + sizeof(struct tdb_used_record) + 256); ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0); diff --git a/ccan/tdb2/test/run-04-basichash.c b/ccan/tdb2/test/run-04-basichash.c index 50c7c77d..b92b6bdd 100644 --- a/ccan/tdb2/test/run-04-basichash.c +++ b/ccan/tdb2/test/run-04-basichash.c @@ -176,7 +176,8 @@ int main(int argc, char *argv[]) sizeof(struct tdb_used_record) + rec_key_length(&rec) + rec_data_length(&rec) - + rec_extra_padding(&rec)) == 0); + + rec_extra_padding(&rec), + TDB_LOCK_NOWAIT) == 0); ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK) == 0); ok1(tdb_check(tdb, NULL, NULL) == 0); diff --git a/ccan/tdb2/transaction.c b/ccan/tdb2/transaction.c index dd2f0e25..e6eb1c0b 100644 --- a/ccan/tdb2/transaction.c +++ b/ccan/tdb2/transaction.c @@ -688,7 +688,8 @@ static enum TDB_ERROR tdb_recovery_allocate(struct tdb_context *tdb, if (recovery_head != 0) { tdb->stats.frees++; ecode = add_free_record(tdb, recovery_head, - sizeof(rec) + rec.max_len); + sizeof(rec) + rec.max_len, + TDB_LOCK_WAIT); if (ecode != TDB_SUCCESS) { return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, "tdb_recovery_allocate:" -- 2.39.2