]> git.ozlabs.org Git - ccan/commitdiff
tdb2: don't start again when we coalesce a record.
authorRusty Russell <rusty@rustcorp.com.au>
Wed, 27 Apr 2011 12:13:23 +0000 (21:43 +0930)
committerRusty Russell <rusty@rustcorp.com.au>
Wed, 27 Apr 2011 12:13:23 +0000 (21:43 +0930)
We currently start walking the free list again when we coalesce any record;
this is overzealous, as we only care about the next record being blatted,
or the record we currently consider "best".

We can also opportunistically try to add the coalesced record into the
new free list: if it fails, we go back to the old "mark record,
unlock, re-lock" code.

Before:
$ time ./growtdb-bench 250000 10 > /dev/null && ls -l /tmp/growtdb.tdb && time ./tdbtorture -s 0 && ls -l torture.tdb && ./speed --transaction 2000000
real 1m0.243s
user 0m13.677s
sys 0m4.336s
-rw------- 1 rusty rusty 683302864 2011-04-27 21:03 /tmp/growtdb.tdb
testing with 3 processes, 5000 loops, seed=0
OK

real 1m24.074s
user 0m0.344s
sys 0m0.468s
-rw------- 1 rusty rusty 836040 2011-04-27 21:04 torture.tdb
Adding 2000000 records:  1015 ns (110551992 bytes)
Finding 2000000 records:  641 ns (110551992 bytes)
Missing 2000000 records:  445 ns (110551992 bytes)
Traversing 2000000 records:  439 ns (110551992 bytes)
Deleting 2000000 records:  807 ns (199517112 bytes)
Re-adding 2000000 records:  851 ns (199517112 bytes)
Appending 2000000 records:  1301 ns (376542552 bytes)
Churning 2000000 records:  2423 ns (553641304 bytes)

After:
$ time ./growtdb-bench 250000 10 > /dev/null && ls -l /tmp/growtdb.tdb && time ./tdbtorture -s 0 && ls -l torture.tdb && ./speed --transaction 2000000
real 0m57.403s
user 0m11.361s
sys 0m4.056s
-rw------- 1 rusty rusty 689536976 2011-04-27 21:10 /tmp/growtdb.tdb
testing with 3 processes, 5000 loops, seed=0
OK

real 1m24.901s
user 0m0.380s
sys 0m0.512s
-rw------- 1 rusty rusty 655368 2011-04-27 21:12 torture.tdb
Adding 2000000 records:  941 ns (110551992 bytes)
Finding 2000000 records:  603 ns (110551992 bytes)
Missing 2000000 records:  428 ns (110551992 bytes)
Traversing 2000000 records:  416 ns (110551992 bytes)
Deleting 2000000 records:  741 ns (199517112 bytes)
Re-adding 2000000 records:  819 ns (199517112 bytes)
Appending 2000000 records:  1228 ns (376542552 bytes)
Churning 2000000 records:  2042 ns (553641304 bytes)

ccan/tdb2/free.c
ccan/tdb2/private.h
ccan/tdb2/tdb.c
ccan/tdb2/test/layout.c
ccan/tdb2/test/run-03-coalesce.c
ccan/tdb2/test/run-04-basichash.c
ccan/tdb2/transaction.c

index eaaeb3cf2bca594d8294ac9a25b533d45d671eea..cd9a332abe425407cf27b5e35a49db912e3a92b3 100644 (file)
@@ -266,7 +266,8 @@ static enum TDB_ERROR enqueue_in_free(struct tdb_context *tdb,
 
 /* List need not be locked. */
 enum TDB_ERROR add_free_record(struct tdb_context *tdb,
-                              tdb_off_t off, tdb_len_t len_with_header)
+                              tdb_off_t off, tdb_len_t len_with_header,
+                              enum tdb_lock_flags waitflag)
 {
        tdb_off_t b_off;
        tdb_len_t len;
@@ -277,7 +278,7 @@ enum TDB_ERROR add_free_record(struct tdb_context *tdb,
        len = len_with_header - sizeof(struct tdb_used_record);
 
        b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
-       ecode = tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT);
+       ecode = tdb_lock_free_bucket(tdb, b_off, waitflag);
        if (ecode != TDB_SUCCESS) {
                return ecode;
        }
@@ -333,10 +334,13 @@ static tdb_off_t ftable_offset(struct tdb_context *tdb, unsigned int ftable)
        return off;
 }
 
-/* Note: we unlock the current bucket if we coalesce (> 0) or fail (-ve). */
+/* Note: we unlock the current bucket if fail (-ve), or coalesce (-ve) and
+ * need to blatt either of the *protect records (which is set to an error). */
 static tdb_len_t coalesce(struct tdb_context *tdb,
                          tdb_off_t off, tdb_off_t b_off,
-                         tdb_len_t data_len)
+                         tdb_len_t data_len,
+                         tdb_off_t *protect1,
+                         tdb_off_t *protect2)
 {
        tdb_off_t end;
        struct tdb_free_record rec;
@@ -400,6 +404,10 @@ static tdb_len_t coalesce(struct tdb_context *tdb,
                        break;
                }
 
+               /* Did we just mess up a record you were hoping to use? */
+               if (end == *protect1 || end == *protect2)
+                       *protect1 = TDB_ERR_NOEXIST;
+
                ecode = remove_from_list(tdb, nb_off, end, &rec);
                check_list(tdb, nb_off);
                if (ecode != TDB_SUCCESS) {
@@ -416,6 +424,10 @@ static tdb_len_t coalesce(struct tdb_context *tdb,
        if (end == off + sizeof(struct tdb_used_record) + data_len)
                return 0;
 
+       /* Before we expand, check this isn't one you wanted protected? */
+       if (off == *protect1 || off == *protect2)
+               *protect1 = TDB_ERR_EXISTS;
+
        /* OK, expand initial record */
        ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
        if (ecode != TDB_SUCCESS) {
@@ -435,24 +447,37 @@ static tdb_len_t coalesce(struct tdb_context *tdb,
                goto err;
        }
 
-       /* We have to drop this to avoid deadlocks, so make sure record
-        * doesn't get coalesced by someone else! */
-       rec.ftable_and_len = (TDB_FTABLE_NONE << (64 - TDB_OFF_UPPER_STEAL))
-               | (end - off - sizeof(struct tdb_used_record));
-       ecode = tdb_write_off(tdb, off + offsetof(struct tdb_free_record,
-                                                 ftable_and_len),
-                             rec.ftable_and_len);
+       /* Try locking violation first... */
+       ecode = add_free_record(tdb, off, end - off, TDB_LOCK_NOWAIT);
        if (ecode != TDB_SUCCESS) {
-               goto err;
-       }
+               /* Need to drop lock.  Can't rely on anything stable. */
+               *protect1 = TDB_ERR_CORRUPT;
+
+               /* We have to drop this to avoid deadlocks, so make sure record
+                * doesn't get coalesced by someone else! */
+               rec.ftable_and_len = (TDB_FTABLE_NONE
+                                     << (64 - TDB_OFF_UPPER_STEAL))
+                       | (end - off - sizeof(struct tdb_used_record));
+               ecode = tdb_write_off(tdb,
+                                     off + offsetof(struct tdb_free_record,
+                                                    ftable_and_len),
+                                     rec.ftable_and_len);
+               if (ecode != TDB_SUCCESS) {
+                       goto err;
+               }
 
-       tdb->stats.alloc_coalesce_succeeded++;
-       tdb_unlock_free_bucket(tdb, b_off);
+               tdb->stats.alloc_coalesce_succeeded++;
+               tdb_unlock_free_bucket(tdb, b_off);
 
-       ecode = add_free_record(tdb, off, end - off);
-       if (ecode != TDB_SUCCESS) {
-               return ecode;
+               ecode = add_free_record(tdb, off, end - off, TDB_LOCK_WAIT);
+               if (ecode != TDB_SUCCESS) {
+                       return ecode;
+               }
+       } else if (TDB_OFF_IS_ERR(*protect1)) {
+               /* For simplicity, we always drop lock if they can't continue */
+               tdb_unlock_free_bucket(tdb, b_off);
        }
+
        /* Return usable length. */
        return end - off - sizeof(struct tdb_used_record);
 
@@ -474,6 +499,7 @@ static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
        tdb_off_t off, b_off,best_off;
        struct tdb_free_record best = { 0 };
        double multiplier;
+       bool coalesce_after_best = false; /* Damn GCC warning! */
        size_t size = adjust_size(keylen, datalen);
        enum TDB_ERROR ecode;
 
@@ -529,6 +555,7 @@ again:
                if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
                        best_off = off;
                        best = *r;
+                       coalesce_after_best = false;
                }
 
                if (frec_len(&best) <= size * multiplier && best_off) {
@@ -543,15 +570,17 @@ again:
                tdb_access_release(tdb, r);
 
                /* Since we're going slow anyway, try coalescing here. */
-               coal = coalesce(tdb, off, b_off, len);
+               coal = coalesce(tdb, off, b_off, len, &best_off, &next);
                if (TDB_OFF_IS_ERR(coal)) {
                        /* This has already unlocked on error. */
                        return coal;
                }
-               if (coal > 0) {
+               if (TDB_OFF_IS_ERR(best_off)) {
                        /* This has unlocked list, restart. */
                        goto again;
                }
+               if (coal > 0)
+                       coalesce_after_best = true;
                off = next;
        }
 
@@ -560,6 +589,14 @@ again:
                struct tdb_used_record rec;
                size_t leftover;
 
+               /* If we coalesced, we might have change prev/next ptrs. */
+               if (coalesce_after_best) {
+                       ecode = tdb_read_convert(tdb, best_off, &best,
+                                                sizeof(best));
+                       if (ecode != TDB_SUCCESS)
+                               goto unlock_err;
+               }
+
                /* We're happy with this size: take it. */
                ecode = remove_from_list(tdb, b_off, best_off, &best);
                check_list(tdb, b_off);
@@ -600,7 +637,7 @@ again:
                        ecode = add_free_record(tdb,
                                                best_off + sizeof(rec)
                                                + frec_len(&best) - leftover,
-                                               leftover);
+                                               leftover, TDB_LOCK_WAIT);
                        if (ecode != TDB_SUCCESS) {
                                best_off = ecode;
                        }
@@ -774,7 +811,7 @@ static enum TDB_ERROR tdb_expand(struct tdb_context *tdb, tdb_len_t size)
        tdb_unlock_expand(tdb, F_WRLCK);
 
        tdb->stats.expands++;
-       return add_free_record(tdb, old_size, wanted);
+       return add_free_record(tdb, old_size, wanted, TDB_LOCK_WAIT);
 }
 
 /* This won't fail: it will expand the database if it has to. */
index 14d319ce7aa6a128d9630b6676a517a2c0461887..213e83615a33cc00090175badb2b05e1c3aca24e 100644 (file)
@@ -465,7 +465,8 @@ tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
 
 /* Put this record in a free list. */
 enum TDB_ERROR add_free_record(struct tdb_context *tdb,
-                              tdb_off_t off, tdb_len_t len_with_header);
+                              tdb_off_t off, tdb_len_t len_with_header,
+                              enum tdb_lock_flags waitflag);
 
 /* Set up header for a used/ftable/htable/chain record. */
 enum TDB_ERROR set_header(struct tdb_context *tdb,
index b96c80103688fd299064b7558b556ab956560e6d..d7b5163be95eba66ac204f1be75dc239350c41f9 100644 (file)
@@ -41,7 +41,8 @@ static enum TDB_ERROR replace_data(struct tdb_context *tdb,
                tdb->stats.frees++;
                ecode = add_free_record(tdb, old_off,
                                        sizeof(struct tdb_used_record)
-                                       + key.dsize + old_room);
+                                       + key.dsize + old_room,
+                                       TDB_LOCK_WAIT);
                if (ecode == TDB_SUCCESS)
                        ecode = replace_in_hash(tdb, h, new_off);
        } else {
@@ -290,7 +291,8 @@ enum TDB_ERROR tdb_delete(struct tdb_context *tdb, struct tdb_data key)
                                sizeof(struct tdb_used_record)
                                + rec_key_length(&rec)
                                + rec_data_length(&rec)
-                               + rec_extra_padding(&rec));
+                               + rec_extra_padding(&rec),
+                               TDB_LOCK_WAIT);
 
        if (tdb->flags & TDB_SEQNUM)
                tdb_inc_seqnum(tdb);
index 7549798e76b39690e57e3a9bbf34c55c1be0ea45..be54fe977891b8ea6835c8d40516abb5289214bb 100644 (file)
@@ -149,7 +149,8 @@ static void add_to_freetable(struct tdb_context *tdb,
 {
        tdb->ftable_off = freetable->base.off;
        tdb->ftable = ftable;
-       add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen);
+       add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen,
+                       TDB_LOCK_WAIT);
 }
 
 static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned ingroup)
index cecd7eb2c250359db9c0dd8727e340cc8b6dbdac..e60e341eca46d9fe359a98602f144184f41271bc 100644 (file)
@@ -25,15 +25,16 @@ static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off)
 
 int main(int argc, char *argv[])
 {
-       tdb_off_t b_off;
+       tdb_off_t b_off, test;
        struct tdb_context *tdb;
        struct tdb_layout *layout;
        struct tdb_data data, key;
        tdb_len_t len;
 
        /* FIXME: Test TDB_CONVERT */
+       /* FIXME: Test lock order fail. */
 
-       plan_tests(38);
+       plan_tests(42);
        data = tdb_mkdata("world", 5);
        key = tdb_mkdata("hello", 5);
 
@@ -50,9 +51,12 @@ int main(int argc, char *argv[])
        b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
        /* Lock and fail to coalesce. */
        ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
-       ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len) == 0);
+       test = layout->elem[1].base.off;
+       ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len, &test, &test)
+           == 0);
        tdb_unlock_free_bucket(tdb, b_off);
        ok1(free_record_length(tdb, layout->elem[1].base.off) == len);
+       ok1(test == layout->elem[1].base.off);
        ok1(tdb_check(tdb, NULL, NULL) == 0);
        tdb_close(tdb);
        tdb_layout_free(layout);
@@ -70,9 +74,12 @@ int main(int argc, char *argv[])
        b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
        /* Lock and fail to coalesce. */
        ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
-       ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) == 0);
+       test = layout->elem[1].base.off;
+       ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test, &test)
+           == 0);
        tdb_unlock_free_bucket(tdb, b_off);
        ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
+       ok1(test == layout->elem[1].base.off);
        ok1(tdb_check(tdb, NULL, NULL) == 0);
        tdb_close(tdb);
        tdb_layout_free(layout);
@@ -91,8 +98,11 @@ int main(int argc, char *argv[])
        b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
        /* Lock and coalesce. */
        ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
-       ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024)
+       test = layout->elem[2].base.off;
+       ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test, &test)
            == 1024 + sizeof(struct tdb_used_record) + 2048);
+       /* Should tell us it's erased this one... */
+       ok1(test == TDB_ERR_NOEXIST);
        ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0);
        ok1(free_record_length(tdb, layout->elem[1].base.off)
            == 1024 + sizeof(struct tdb_used_record) + 2048);
@@ -115,11 +125,13 @@ int main(int argc, char *argv[])
        b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
        /* Lock and coalesce. */
        ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
-       ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024)
+       test = layout->elem[2].base.off;
+       ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test, &test)
            == 1024 + sizeof(struct tdb_used_record) + 512);
        ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0);
        ok1(free_record_length(tdb, layout->elem[1].base.off)
            == 1024 + sizeof(struct tdb_used_record) + 512);
+       ok1(test == TDB_ERR_NOEXIST);
        ok1(tdb_check(tdb, NULL, NULL) == 0);
        tdb_close(tdb);
        tdb_layout_free(layout);
@@ -140,8 +152,9 @@ int main(int argc, char *argv[])
        b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
        /* Lock and coalesce. */
        ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
-       ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024) ==
-           1024 + sizeof(struct tdb_used_record) + 512
+       test = layout->elem[2].base.off;
+       ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test, &test)
+           == 1024 + sizeof(struct tdb_used_record) + 512
            + sizeof(struct tdb_used_record) + 256);
        ok1(tdb->file->allrecord_lock.count == 0
            && tdb->file->num_lockrecs == 0);
index 50c7c77df119642cba9b62a91fab3e133b897fc3..b92b6bdde05a4d1928094576b03206e9b194864d 100644 (file)
@@ -176,7 +176,8 @@ int main(int argc, char *argv[])
                                    sizeof(struct tdb_used_record)
                                    + rec_key_length(&rec)
                                    + rec_data_length(&rec)
-                                   + rec_extra_padding(&rec)) == 0);
+                                   + rec_extra_padding(&rec),
+                                   TDB_LOCK_NOWAIT) == 0);
                ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
                                      F_WRLCK) == 0);
                ok1(tdb_check(tdb, NULL, NULL) == 0);
index dd2f0e2507d095cb367ca06f865baea26d6ed1a2..e6eb1c0b5c413670b49be51d99f579e5fdad3be0 100644 (file)
@@ -688,7 +688,8 @@ static enum TDB_ERROR tdb_recovery_allocate(struct tdb_context *tdb,
        if (recovery_head != 0) {
                tdb->stats.frees++;
                ecode = add_free_record(tdb, recovery_head,
-                                       sizeof(rec) + rec.max_len);
+                                       sizeof(rec) + rec.max_len,
+                                       TDB_LOCK_WAIT);
                if (ecode != TDB_SUCCESS) {
                        return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
                                          "tdb_recovery_allocate:"