We don't need to write the whole page to the recovery area if it
hasn't all changed. Simply skipping the start and end of the pages
which are similar saves us about 20% on growtdb-bench 250000, and 45%
on tdbtorture. The more thorough examination of page differences
gives us a saving of 90% on growtdb-bench and 98% on tdbtorture!
And we do win a bit on timings for transaction commit:
Before:
$ time ./growtdb-bench 250000 10 > /dev/null && ls -l /tmp/growtdb.tdb && time ./tdbtorture -s 0 && ls -l torture.tdb && ./speed --transaction
2000000
real 1m4.844s
user 0m15.537s
sys 0m3.796s
-rw------- 1 rusty rusty 626693096 2011-04-27 21:28 /tmp/growtdb.tdb
testing with 3 processes, 5000 loops, seed=0
OK
real 1m17.021s
user 0m0.272s
sys 0m0.540s
-rw------- 1 rusty rusty 458800 2011-04-27 21:29 torture.tdb
Adding 2000000 records: 894 ns (110556088 bytes)
Finding 2000000 records: 569 ns (110556088 bytes)
Missing 2000000 records: 390 ns (110556088 bytes)
Traversing 2000000 records: 403 ns (110556088 bytes)
Deleting 2000000 records: 710 ns (244003768 bytes)
Re-adding 2000000 records: 825 ns (244003768 bytes)
Appending 2000000 records: 1262 ns (268404160 bytes)
Churning 2000000 records: 2311 ns (268404160 bytes)
After:
$ time ./growtdb-bench 250000 10 > /dev/null && ls -l /tmp/growtdb.tdb && time ./tdbtorture -s 0 && ls -l torture.tdb && ./speed --transaction
2000000
real 0m50.366s
user 0m17.109s
sys 0m2.468s
-rw------- 1 rusty rusty 564215952 2011-04-27 21:31 /tmp/growtdb.tdb
testing with 3 processes, 5000 loops, seed=0
OK
real 1m23.818s
user 0m0.304s
sys 0m0.508s
-rw------- 1 rusty rusty 669856 2011-04-27 21:32 torture.tdb
Adding 2000000 records: 887 ns (110556088 bytes)
Finding 2000000 records: 556 ns (110556088 bytes)
Missing 2000000 records: 385 ns (110556088 bytes)
Traversing 2000000 records: 401 ns (110556088 bytes)
Deleting 2000000 records: 710 ns (244003768 bytes)
Re-adding 2000000 records: 825 ns (244003768 bytes)
Appending 2000000 records: 1255 ns (268404160 bytes)
Churning 2000000 records: 2299 ns (268404160 bytes)
/* Endian conversion: we only ever deal with 8 byte quantities */
void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
{
/* Endian conversion: we only ever deal with 8 byte quantities */
void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
{
if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
uint64_t i, *p = (uint64_t *)buf;
for (i = 0; i < size / 8; i++)
if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
uint64_t i, *p = (uint64_t *)buf;
for (i = 0; i < size / 8; i++)
/* Return the length (in bytes) of the identical prefix shared by
 * new and old; 0 if they differ at the first byte. */
static unsigned int same(const unsigned char *new,
			 const unsigned char *old,
			 unsigned int length)
{
	unsigned int matched = 0;

	while (matched < length && new[matched] == old[matched])
		matched++;

	return matched;
}
+
/* Return the number of leading bytes in which new differs from old.
 * A short matching run (fewer than min_same equal bytes) is not worth
 * splitting the region for, so it is swallowed into the differing
 * span.  On return *samelen holds the length of the matching run that
 * terminated the differing span (0 if no long-enough run was found). */
static unsigned int different(const unsigned char *new,
			      const unsigned char *old,
			      unsigned int length,
			      unsigned int min_same,
			      unsigned int *samelen)
{
	unsigned int pos;
	unsigned int run = 0;

	for (pos = 0; pos < length; pos++) {
		if (new[pos] != old[pos]) {
			/* A long matching run followed by a mismatch
			 * ends the differing region just before the run. */
			if (run >= min_same) {
				*samelen = run;
				return pos - run;
			}
			run = 0;
		} else {
			run++;
		}
	}

	/* A trailing run only counts if it is long enough to skip. */
	if (run < min_same)
		run = 0;
	*samelen = run;
	return length - run;
}
+
/*
setup the recovery data that will be used on a crash during commit
*/
/*
setup the recovery data that will be used on a crash during commit
*/
}
rec = (struct tdb_recovery_record *)data;
}
rec = (struct tdb_recovery_record *)data;
- set_recovery_header(rec, TDB_RECOVERY_INVALID_MAGIC,
- recovery_size, recovery_max_size, old_map_size);
- tdb_convert(tdb, rec, sizeof(*rec));
/* build the recovery data into a single blob to allow us to do a single
large write, which should be more efficient */
/* build the recovery data into a single blob to allow us to do a single
large write, which should be more efficient */
for (i=0;i<tdb->transaction->num_blocks;i++) {
tdb_off_t offset;
tdb_len_t length;
for (i=0;i<tdb->transaction->num_blocks;i++) {
tdb_off_t offset;
tdb_len_t length;
+ unsigned int off;
+ unsigned char buffer[PAGESIZE];
if (tdb->transaction->blocks[i] == NULL) {
continue;
if (tdb->transaction->blocks[i] == NULL) {
continue;
" transaction data over new region"
" boundary");
}
" transaction data over new region"
" boundary");
}
- memcpy(p, &offset, sizeof(offset));
- memcpy(p + sizeof(offset), &length, sizeof(length));
- tdb_convert(tdb, p, sizeof(offset) + sizeof(length));
-
- /* the recovery area contains the old data, not the
- new data, so we have to call the original tdb_read
- method to get it */
if (offset + length > old_map_size) {
if (offset + length > old_map_size) {
- /* Short read at EOF, and zero fill. */
- unsigned int len = old_map_size - offset;
- ecode = methods->tread(tdb, offset,
- p + sizeof(offset) + sizeof(length),
- len);
- memset(p + sizeof(offset) + sizeof(length) + len, 0,
- length - len);
- } else {
- ecode = methods->tread(tdb, offset,
- p + sizeof(offset) + sizeof(length),
- length);
+ /* Short read at EOF. */
+ length = old_map_size - offset;
+ ecode = methods->tread(tdb, offset, buffer, length);
if (ecode != TDB_SUCCESS) {
free(data);
return ecode;
}
if (ecode != TDB_SUCCESS) {
free(data);
return ecode;
}
- p += sizeof(offset) + sizeof(length) + length;
+
+ /* Skip over anything the same at the start. */
+ off = same(tdb->transaction->blocks[i], buffer, length);
+ offset += off;
+
+ while (off < length) {
+ tdb_len_t len;
+ unsigned int samelen;
+
+ len = different(tdb->transaction->blocks[i] + off,
+ buffer + off, length - off,
+ sizeof(offset) + sizeof(len) + 1,
+ &samelen);
+
+ memcpy(p, &offset, sizeof(offset));
+ memcpy(p + sizeof(offset), &len, sizeof(len));
+ tdb_convert(tdb, p, sizeof(offset) + sizeof(len));
+ p += sizeof(offset) + sizeof(len);
+ memcpy(p, buffer + off, len);
+ p += len;
+ off += len + samelen;
+ offset += len + samelen;
+ }
+ /* Now we know size, set up rec header. */
+ set_recovery_header(rec, TDB_RECOVERY_INVALID_MAGIC,
+ p - data - sizeof(*rec),
+ recovery_max_size, old_map_size);
+ tdb_convert(tdb, rec, sizeof(*rec));
+
/* write the recovery data to the recovery area */
/* write the recovery data to the recovery area */
- ecode = methods->twrite(tdb, recovery_offset, data,
- sizeof(*rec) + recovery_size);
+ ecode = methods->twrite(tdb, recovery_offset, data, p - data);
if (ecode != TDB_SUCCESS) {
free(data);
return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
"tdb_transaction_setup_recovery:"
" failed to write recovery data");
}
if (ecode != TDB_SUCCESS) {
free(data);
return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
"tdb_transaction_setup_recovery:"
" failed to write recovery data");
}
- transaction_write_existing(tdb, recovery_offset, data,
- sizeof(*rec) + recovery_size);
+ transaction_write_existing(tdb, recovery_offset, data, p - data);
/* as we don't have ordered writes, we have to sync the recovery
data before we update the magic to indicate that the recovery
data is present */
/* as we don't have ordered writes, we have to sync the recovery
data before we update the magic to indicate that the recovery
data is present */
- ecode = transaction_sync(tdb, recovery_offset,
- sizeof(*rec) + recovery_size);
+ ecode = transaction_sync(tdb, recovery_offset, p - data);
if (ecode != TDB_SUCCESS) {
free(data);
return ecode;
if (ecode != TDB_SUCCESS) {
free(data);
return ecode;