tdb2: reduce transaction before writing to recovery area.
author Rusty Russell <rusty@rustcorp.com.au>
Wed, 27 Apr 2011 12:17:58 +0000 (21:47 +0930)
committer Rusty Russell <rusty@rustcorp.com.au>
Wed, 27 Apr 2011 12:17:58 +0000 (21:47 +0930)
We don't need to write the whole page to the recovery area if it
hasn't all changed.  Simply skipping the start and end of the pages
which are similar saves us about 20% on growtdb-bench 250000, and 45%
on tdbtorture.  The more thorough examination of page differences
gives us a saving of 90% on growtdb-bench and 98% on tdbtorture!

And we do win a bit on timings for transaction commit:

Before:
$ time ./growtdb-bench 250000 10 > /dev/null && ls -l /tmp/growtdb.tdb && time ./tdbtorture -s 0 && ls -l torture.tdb && ./speed --transaction 2000000
real 1m4.844s
user 0m15.537s
sys 0m3.796s
-rw------- 1 rusty rusty 626693096 2011-04-27 21:28 /tmp/growtdb.tdb
testing with 3 processes, 5000 loops, seed=0
OK

real 1m17.021s
user 0m0.272s
sys 0m0.540s
-rw------- 1 rusty rusty 458800 2011-04-27 21:29 torture.tdb
Adding 2000000 records:  894 ns (110556088 bytes)
Finding 2000000 records:  569 ns (110556088 bytes)
Missing 2000000 records:  390 ns (110556088 bytes)
Traversing 2000000 records:  403 ns (110556088 bytes)
Deleting 2000000 records:  710 ns (244003768 bytes)
Re-adding 2000000 records:  825 ns (244003768 bytes)
Appending 2000000 records:  1262 ns (268404160 bytes)
Churning 2000000 records:  2311 ns (268404160 bytes)

After:
$ time ./growtdb-bench 250000 10 > /dev/null && ls -l /tmp/growtdb.tdb && time ./tdbtorture -s 0 && ls -l torture.tdb && ./speed --transaction 2000000
real 0m50.366s
user 0m17.109s
sys 0m2.468s
-rw------- 1 rusty rusty 564215952 2011-04-27 21:31 /tmp/growtdb.tdb
testing with 3 processes, 5000 loops, seed=0
OK

real 1m23.818s
user 0m0.304s
sys 0m0.508s
-rw------- 1 rusty rusty 669856 2011-04-27 21:32 torture.tdb
Adding 2000000 records:  887 ns (110556088 bytes)
Finding 2000000 records:  556 ns (110556088 bytes)
Missing 2000000 records:  385 ns (110556088 bytes)
Traversing 2000000 records:  401 ns (110556088 bytes)
Deleting 2000000 records:  710 ns (244003768 bytes)
Re-adding 2000000 records:  825 ns (244003768 bytes)
Appending 2000000 records:  1255 ns (268404160 bytes)
Churning 2000000 records:  2299 ns (268404160 bytes)

ccan/tdb2/io.c
ccan/tdb2/transaction.c

index 31756de341d8fd43d17a41d57c41284cbb650227..8c5f45f30827c9e722227e3787797288713e056e 100644 (file)
@@ -130,6 +130,7 @@ static enum TDB_ERROR tdb_oob(struct tdb_context *tdb, tdb_off_t len,
 /* Endian conversion: we only ever deal with 8 byte quantities */
 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
 {
+       assert(size % 8 == 0);
        if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
                uint64_t i, *p = (uint64_t *)buf;
                for (i = 0; i < size / 8; i++)
index 73ceb9620265ee70b86320871c57aba0488c429f..4bdc3f32d019b6abe6e9519be298b027a7b804e6 100644 (file)
@@ -757,6 +757,44 @@ static void set_recovery_header(struct tdb_recovery_record *rec,
        rec->eof = oldsize;
 }
 
+static unsigned int same(const unsigned char *new,
+                        const unsigned char *old,
+                        unsigned int length)
+{
+       unsigned int i;
+
+       for (i = 0; i < length; i++) {
+               if (new[i] != old[i])
+                       break;
+       }
+       return i;
+}
+
+static unsigned int different(const unsigned char *new,
+                             const unsigned char *old,
+                             unsigned int length,
+                             unsigned int min_same,
+                             unsigned int *samelen)
+{
+       unsigned int i;
+
+       *samelen = 0;
+       for (i = 0; i < length; i++) {
+               if (new[i] == old[i]) {
+                       (*samelen)++;
+               } else {
+                       if (*samelen >= min_same) {
+                               return i - *samelen;
+                       }
+                       *samelen = 0;
+               }
+       }
+
+       if (*samelen < min_same)
+               *samelen = 0;
+       return length - *samelen;
+}
+
 /*
   setup the recovery data that will be used on a crash during commit
 */
@@ -791,9 +829,6 @@ static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb,
        }
 
        rec = (struct tdb_recovery_record *)data;
-       set_recovery_header(rec, TDB_RECOVERY_INVALID_MAGIC,
-                           recovery_size, recovery_max_size, old_map_size);
-       tdb_convert(tdb, rec, sizeof(*rec));
 
        /* build the recovery data into a single blob to allow us to do a single
           large write, which should be more efficient */
@@ -801,6 +836,8 @@ static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb,
        for (i=0;i<tdb->transaction->num_blocks;i++) {
                tdb_off_t offset;
                tdb_len_t length;
+               unsigned int off;
+               unsigned char buffer[PAGESIZE];
 
                if (tdb->transaction->blocks[i] == NULL) {
                        continue;
@@ -823,50 +860,60 @@ static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb,
                                          " transaction data over new region"
                                          " boundary");
                }
-               memcpy(p, &offset, sizeof(offset));
-               memcpy(p + sizeof(offset), &length, sizeof(length));
-               tdb_convert(tdb, p, sizeof(offset) + sizeof(length));
-
-               /* the recovery area contains the old data, not the
-                  new data, so we have to call the original tdb_read
-                  method to get it */
                if (offset + length > old_map_size) {
-                       /* Short read at EOF, and zero fill. */
-                       unsigned int len = old_map_size - offset;
-                       ecode = methods->tread(tdb, offset,
-                                              p + sizeof(offset) + sizeof(length),
-                                              len);
-                       memset(p + sizeof(offset) + sizeof(length) + len, 0,
-                              length - len);
-               } else {
-                       ecode = methods->tread(tdb, offset,
-                                              p + sizeof(offset) + sizeof(length),
-                                              length);
+                       /* Short read at EOF. */
+                       length = old_map_size - offset;
                }
+               ecode = methods->tread(tdb, offset, buffer, length);
                if (ecode != TDB_SUCCESS) {
                        free(data);
                        return ecode;
                }
-               p += sizeof(offset) + sizeof(length) + length;
+
+               /* Skip over anything the same at the start. */
+               off = same(tdb->transaction->blocks[i], buffer, length);
+               offset += off;
+
+               while (off < length) {
+                       tdb_len_t len;
+                       unsigned int samelen;
+
+                       len = different(tdb->transaction->blocks[i] + off,
+                                       buffer + off, length - off,
+                                       sizeof(offset) + sizeof(len) + 1,
+                                       &samelen);
+
+                       memcpy(p, &offset, sizeof(offset));
+                       memcpy(p + sizeof(offset), &len, sizeof(len));
+                       tdb_convert(tdb, p, sizeof(offset) + sizeof(len));
+                       p += sizeof(offset) + sizeof(len);
+                       memcpy(p, buffer + off, len);
+                       p += len;
+                       off += len + samelen;
+                       offset += len + samelen;
+               }
        }
 
+       /* Now we know size, set up rec header. */
+       set_recovery_header(rec, TDB_RECOVERY_INVALID_MAGIC,
+                           p - data - sizeof(*rec),
+                           recovery_max_size, old_map_size);
+       tdb_convert(tdb, rec, sizeof(*rec));
+
        /* write the recovery data to the recovery area */
-       ecode = methods->twrite(tdb, recovery_offset, data,
-                               sizeof(*rec) + recovery_size);
+       ecode = methods->twrite(tdb, recovery_offset, data, p - data);
        if (ecode != TDB_SUCCESS) {
                free(data);
                return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
                                  "tdb_transaction_setup_recovery:"
                                  " failed to write recovery data");
        }
-       transaction_write_existing(tdb, recovery_offset, data,
-                                  sizeof(*rec) + recovery_size);
+       transaction_write_existing(tdb, recovery_offset, data, p - data);
 
        /* as we don't have ordered writes, we have to sync the recovery
           data before we update the magic to indicate that the recovery
           data is present */
-       ecode = transaction_sync(tdb, recovery_offset,
-                                sizeof(*rec) + recovery_size);
+       ecode = transaction_sync(tdb, recovery_offset, p - data);
        if (ecode != TDB_SUCCESS) {
                free(data);
                return ecode;