From: Rusty Russell
Date: Wed, 27 Apr 2011 12:17:58 +0000 (+0930)
Subject: tdb2: reduce transaction before writing to recovery area.
X-Git-Url: http://git.ozlabs.org/?p=ccan;a=commitdiff_plain;h=cfc7d301da49dd2b2ce346b08bf5fbff3acfae67

tdb2: reduce transaction before writing to recovery area.

We don't need to write the whole page to the recovery area if it
hasn't all changed.  Simply skipping the start and end of the pages
which are the same saves us about 20% on growtdb-bench 250000, and
45% on tdbtorture.

The more thorough examination of page differences gives us a saving
of 90% on growtdb-bench and 98% on tdbtorture!

And we do win a bit on timings for transaction commit:

Before:
$ time ./growtdb-bench 250000 10 > /dev/null && ls -l /tmp/growtdb.tdb && time ./tdbtorture -s 0 && ls -l torture.tdb && ./speed --transaction 2000000

real	1m4.844s
user	0m15.537s
sys	0m3.796s
-rw------- 1 rusty rusty 626693096 2011-04-27 21:28 /tmp/growtdb.tdb
testing with 3 processes, 5000 loops, seed=0
OK

real	1m17.021s
user	0m0.272s
sys	0m0.540s
-rw------- 1 rusty rusty 458800 2011-04-27 21:29 torture.tdb
Adding 2000000 records:  894 ns (110556088 bytes)
Finding 2000000 records:  569 ns (110556088 bytes)
Missing 2000000 records:  390 ns (110556088 bytes)
Traversing 2000000 records:  403 ns (110556088 bytes)
Deleting 2000000 records:  710 ns (244003768 bytes)
Re-adding 2000000 records:  825 ns (244003768 bytes)
Appending 2000000 records:  1262 ns (268404160 bytes)
Churning 2000000 records:  2311 ns (268404160 bytes)

After:
$ time ./growtdb-bench 250000 10 > /dev/null && ls -l /tmp/growtdb.tdb && time ./tdbtorture -s 0 && ls -l torture.tdb && ./speed --transaction 2000000

real	0m50.366s
user	0m17.109s
sys	0m2.468s
-rw------- 1 rusty rusty 564215952 2011-04-27 21:31 /tmp/growtdb.tdb
testing with 3 processes, 5000 loops, seed=0
OK

real	1m23.818s
user	0m0.304s
sys	0m0.508s
-rw------- 1 rusty rusty 669856 2011-04-27 21:32 torture.tdb
Adding 2000000 records:  887 ns (110556088 bytes)
Finding 2000000 records:  556 ns (110556088 bytes)
Missing 2000000 records:  385 ns (110556088 bytes)
Traversing 2000000 records:  401 ns (110556088 bytes)
Deleting 2000000 records:  710 ns (244003768 bytes)
Re-adding 2000000 records:  825 ns (244003768 bytes)
Appending 2000000 records:  1255 ns (268404160 bytes)
Churning 2000000 records:  2299 ns (268404160 bytes)
---
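(Standalone illustration, not part of the patch: the sketch below runs two
small pages through the run-splitting scan this patch introduces.  It copies
same() and different() from the transaction.c hunk below, and assumes 8-byte
offset and length fields, i.e. a 16-byte per-record header, so min_same is
17; the HEADER_OVERHEAD name and the printf harness are illustrative only.)

#include <stdio.h>
#include <string.h>

/* Length of the identical prefix of new[] and old[] (as in the patch). */
static unsigned int same(const unsigned char *new,
			 const unsigned char *old,
			 unsigned int length)
{
	unsigned int i;

	for (i = 0; i < length; i++) {
		if (new[i] != old[i])
			break;
	}
	return i;
}

/* Length of the differing run; stops once at least min_same identical
 * bytes follow, returning that identical tail in *samelen (as in the
 * patch). */
static unsigned int different(const unsigned char *new,
			      const unsigned char *old,
			      unsigned int length,
			      unsigned int min_same,
			      unsigned int *samelen)
{
	unsigned int i;

	*samelen = 0;
	for (i = 0; i < length; i++) {
		if (new[i] == old[i]) {
			(*samelen)++;
		} else {
			if (*samelen >= min_same)
				return i - *samelen;
			*samelen = 0;
		}
	}

	if (*samelen < min_same)
		*samelen = 0;
	return length - *samelen;
}

#define HEADER_OVERHEAD 16	/* assumed: 8-byte offset + 8-byte length */

int main(void)
{
	unsigned char old[32], new[32];
	unsigned int off, offset = 0;

	memset(old, 0, sizeof(old));
	memcpy(new, old, sizeof(new));
	new[3] = new[4] = 1;	/* a small change near the start... */
	new[30] = 1;		/* ...and another near the end */

	/* Skip the identical prefix, then emit one record per differing
	 * run, mirroring the loop in transaction_setup_recovery(). */
	off = same(new, old, sizeof(new));
	offset += off;
	while (off < sizeof(new)) {
		unsigned int samelen;
		unsigned int len = different(new + off, old + off,
					     sizeof(new) - off,
					     HEADER_OVERHEAD + 1, &samelen);

		printf("record: offset %u, %u bytes\n", offset, len);
		off += len + samelen;
		offset += len + samelen;
	}
	return 0;
}

On this input it prints two records (offset 3, 2 bytes; offset 30, 2 bytes)
rather than saving the whole 32-byte page.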
diff --git a/ccan/tdb2/io.c b/ccan/tdb2/io.c
index 31756de3..8c5f45f3 100644
--- a/ccan/tdb2/io.c
+++ b/ccan/tdb2/io.c
@@ -130,6 +130,7 @@ static enum TDB_ERROR tdb_oob(struct tdb_context *tdb, tdb_off_t len,
 /* Endian conversion: we only ever deal with 8 byte quantities */
 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
 {
+	assert(size % 8 == 0);
 	if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
 		uint64_t i, *p = (uint64_t *)buf;
 		for (i = 0; i < size / 8; i++)
diff --git a/ccan/tdb2/transaction.c b/ccan/tdb2/transaction.c
index 73ceb962..4bdc3f32 100644
--- a/ccan/tdb2/transaction.c
+++ b/ccan/tdb2/transaction.c
@@ -757,6 +757,44 @@ static void set_recovery_header(struct tdb_recovery_record *rec,
 	rec->eof = oldsize;
 }
 
+static unsigned int same(const unsigned char *new,
+			 const unsigned char *old,
+			 unsigned int length)
+{
+	unsigned int i;
+
+	for (i = 0; i < length; i++) {
+		if (new[i] != old[i])
+			break;
+	}
+	return i;
+}
+
+static unsigned int different(const unsigned char *new,
+			      const unsigned char *old,
+			      unsigned int length,
+			      unsigned int min_same,
+			      unsigned int *samelen)
+{
+	unsigned int i;
+
+	*samelen = 0;
+	for (i = 0; i < length; i++) {
+		if (new[i] == old[i]) {
+			(*samelen)++;
+		} else {
+			if (*samelen >= min_same) {
+				return i - *samelen;
+			}
+			*samelen = 0;
+		}
+	}
+
+	if (*samelen < min_same)
+		*samelen = 0;
+	return length - *samelen;
+}
+
 /*
   setup the recovery data that will be used on a crash during commit
 */
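(A note on min_same, not from the patch: ending one record and starting
another costs a fresh offset+length header in the recovery blob, 16 bytes
with tdb2's 8-byte tdb_off_t and tdb_len_t.  The call in the final hunk
below therefore passes sizeof(offset) + sizeof(len) + 1, i.e. 17: an
identical run of 16 bytes or fewer is cheaper to carry along as data than
to split the record over.)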
@@ -791,9 +829,6 @@ static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb,
 	}
 	rec = (struct tdb_recovery_record *)data;
-	set_recovery_header(rec, TDB_RECOVERY_INVALID_MAGIC,
-			    recovery_size, recovery_max_size, old_map_size);
-	tdb_convert(tdb, rec, sizeof(*rec));
 
 	/* build the recovery data into a single blob to allow us to do a single
 	   large write, which should be more efficient */
@@ -801,6 +836,8 @@
 	for (i=0;i<tdb->transaction->num_blocks;i++) {
 		tdb_off_t offset;
 		tdb_len_t length;
+		unsigned int off;
+		unsigned char buffer[PAGESIZE];
 
 		if (tdb->transaction->blocks[i] == NULL) {
 			continue;
@@ -823,50 +860,60 @@ static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb,
 					  " transaction data over new region"
 					  " boundary");
 		}
-		memcpy(p, &offset, sizeof(offset));
-		memcpy(p + sizeof(offset), &length, sizeof(length));
-		tdb_convert(tdb, p, sizeof(offset) + sizeof(length));
-
-		/* the recovery area contains the old data, not the
-		   new data, so we have to call the original tdb_read
-		   method to get it */
 		if (offset + length > old_map_size) {
-			/* Short read at EOF, and zero fill. */
-			unsigned int len = old_map_size - offset;
-			ecode = methods->tread(tdb, offset,
-					       p + sizeof(offset) + sizeof(length),
-					       len);
-			memset(p + sizeof(offset) + sizeof(length) + len, 0,
-			       length - len);
-		} else {
-			ecode = methods->tread(tdb, offset,
-					       p + sizeof(offset) + sizeof(length),
-					       length);
+			/* Short read at EOF. */
+			length = old_map_size - offset;
 		}
+		ecode = methods->tread(tdb, offset, buffer, length);
 		if (ecode != TDB_SUCCESS) {
 			free(data);
 			return ecode;
 		}
-		p += sizeof(offset) + sizeof(length) + length;
+
+		/* Skip over anything the same at the start. */
+		off = same(tdb->transaction->blocks[i], buffer, length);
+		offset += off;
+
+		while (off < length) {
+			tdb_len_t len;
+			unsigned int samelen;
+
+			len = different(tdb->transaction->blocks[i] + off,
+					buffer + off, length - off,
+					sizeof(offset) + sizeof(len) + 1,
+					&samelen);
+
+			memcpy(p, &offset, sizeof(offset));
+			memcpy(p + sizeof(offset), &len, sizeof(len));
+			tdb_convert(tdb, p, sizeof(offset) + sizeof(len));
+			p += sizeof(offset) + sizeof(len);
+			memcpy(p, buffer + off, len);
+			p += len;
+			off += len + samelen;
+			offset += len + samelen;
+		}
 	}
+
+	/* Now we know size, set up rec header. */
+	set_recovery_header(rec, TDB_RECOVERY_INVALID_MAGIC,
+			    p - data - sizeof(*rec),
+			    recovery_max_size, old_map_size);
+	tdb_convert(tdb, rec, sizeof(*rec));
+
 	/* write the recovery data to the recovery area */
-	ecode = methods->twrite(tdb, recovery_offset, data,
-				sizeof(*rec) + recovery_size);
+	ecode = methods->twrite(tdb, recovery_offset, data, p - data);
 	if (ecode != TDB_SUCCESS) {
 		free(data);
 		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
 				  "tdb_transaction_setup_recovery:"
 				  " failed to write recovery data");
 	}
-	transaction_write_existing(tdb, recovery_offset, data,
-				   sizeof(*rec) + recovery_size);
+	transaction_write_existing(tdb, recovery_offset, data, p - data);
 
 	/* as we don't have ordered writes, we have to sync the recovery
 	   data before we update the magic to indicate that the recovery
 	   data is present */
-	ecode = transaction_sync(tdb, recovery_offset,
-				 sizeof(*rec) + recovery_size);
+	ecode = transaction_sync(tdb, recovery_offset, p - data);
 	if (ecode != TDB_SUCCESS) {
 		free(data);
 		return ecode;
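(To make the resulting on-disk layout concrete, not part of the patch: the
blob written above is a sequence of (offset, length, old-data) records,
with the total payload size stored in the recovery header by
set_recovery_header().  A sketch of how a reader could walk such a blob;
replay_recovery and restore are hypothetical names, the real recovery path
lives in transaction.c, and tdb_convert() byte-swapping is ignored here for
simplicity:)

#include <stdint.h>
#include <string.h>

typedef uint64_t tdb_off_t;
typedef uint64_t tdb_len_t;

/* Walk the recovery blob and hand each saved old-data range back to the
 * caller, which would write it to the database at the given offset. */
static int replay_recovery(const unsigned char *blob, tdb_len_t reclen,
			   int (*restore)(tdb_off_t off,
					  const unsigned char *d,
					  tdb_len_t len))
{
	const unsigned char *p = blob, *end = blob + reclen;

	while (p < end) {
		tdb_off_t offset;
		tdb_len_t len;

		/* Header layout matches the writer: offset, then length,
		 * then len bytes of pre-transaction data. */
		memcpy(&offset, p, sizeof(offset));
		memcpy(&len, p + sizeof(offset), sizeof(len));
		p += sizeof(offset) + sizeof(len);

		if (restore(offset, p, len) != 0)
			return -1;
		p += len;
	}
	return 0;
}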