X-Git-Url: http://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Ftdb2%2Ftransaction.c;h=1af1c4acafa90b44de277e147cb5d6cfc84fdd6c;hp=73ceb9620265ee70b86320871c57aba0488c429f;hb=56023cca5f66a40646a1e807c3d10af6e5913623;hpb=ba7740e689b5791d79b95d2c5345870f9c29fb71 diff --git a/ccan/tdb2/transaction.c b/ccan/tdb2/transaction.c index 73ceb962..1af1c4ac 100644 --- a/ccan/tdb2/transaction.c +++ b/ccan/tdb2/transaction.c @@ -25,7 +25,7 @@ */ #include "private.h" -#define SAFE_FREE(x) do { if ((x) != NULL) {free(x); (x)=NULL;} } while(0) +#define SAFE_FREE(x) do { if ((x) != NULL) {free((void *)x); (x)=NULL;} } while(0) /* transaction design: @@ -88,7 +88,6 @@ fsync/msync calls are made. */ - /* hold the context of any current transaction */ @@ -121,7 +120,7 @@ struct tdb_transaction { }; /* This doesn't really need to be pagesize, but we use it for similar reasons. */ -#define PAGESIZE 4096 +#define PAGESIZE 65536 /* read while in a transaction. We need to check first if the data is in our list @@ -152,10 +151,10 @@ static enum TDB_ERROR transaction_read(struct tdb_context *tdb, tdb_off_t off, blk = off / PAGESIZE; /* see if we have it in the block list */ - if (tdb->transaction->num_blocks <= blk || - tdb->transaction->blocks[blk] == NULL) { + if (tdb->tdb2.transaction->num_blocks <= blk || + tdb->tdb2.transaction->blocks[blk] == NULL) { /* nope, do a real read */ - ecode = tdb->transaction->io_methods->tread(tdb, off, buf, len); + ecode = tdb->tdb2.transaction->io_methods->tread(tdb, off, buf, len); if (ecode != TDB_SUCCESS) { goto fail; } @@ -163,19 +162,19 @@ static enum TDB_ERROR transaction_read(struct tdb_context *tdb, tdb_off_t off, } /* it is in the block list. 
Now check for the last block */ - if (blk == tdb->transaction->num_blocks-1) { - if (len > tdb->transaction->last_block_size) { + if (blk == tdb->tdb2.transaction->num_blocks-1) { + if (len > tdb->tdb2.transaction->last_block_size) { ecode = TDB_ERR_IO; goto fail; } } /* now copy it out of this block */ - memcpy(buf, tdb->transaction->blocks[blk] + (off % PAGESIZE), len); + memcpy(buf, tdb->tdb2.transaction->blocks[blk] + (off % PAGESIZE), len); return TDB_SUCCESS; fail: - tdb->transaction->transaction_error = 1; + tdb->tdb2.transaction->transaction_error = 1; return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, "transaction_read: failed at off=%zu len=%zu", (size_t)off, (size_t)len); @@ -192,7 +191,7 @@ static enum TDB_ERROR transaction_write(struct tdb_context *tdb, tdb_off_t off, enum TDB_ERROR ecode; /* Only a commit is allowed on a prepared transaction */ - if (tdb->transaction->prepared) { + if (tdb->tdb2.transaction->prepared) { ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR, "transaction_write: transaction already" " prepared, write not allowed"); @@ -204,7 +203,7 @@ static enum TDB_ERROR transaction_write(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len2 = PAGESIZE - (off % PAGESIZE); ecode = transaction_write(tdb, off, buf, len2); if (ecode != TDB_SUCCESS) { - return -1; + return ecode; } len -= len2; off += len2; @@ -220,15 +219,15 @@ static enum TDB_ERROR transaction_write(struct tdb_context *tdb, tdb_off_t off, blk = off / PAGESIZE; off = off % PAGESIZE; - if (tdb->transaction->num_blocks <= blk) { + if (tdb->tdb2.transaction->num_blocks <= blk) { uint8_t **new_blocks; /* expand the blocks array */ - if (tdb->transaction->blocks == NULL) { + if (tdb->tdb2.transaction->blocks == NULL) { new_blocks = (uint8_t **)malloc( (blk+1)*sizeof(uint8_t *)); } else { new_blocks = (uint8_t **)realloc( - tdb->transaction->blocks, + tdb->tdb2.transaction->blocks, (blk+1)*sizeof(uint8_t *)); } if (new_blocks == NULL) { @@ -237,30 +236,30 @@ static enum TDB_ERROR 
transaction_write(struct tdb_context *tdb, tdb_off_t off, " failed to allocate"); goto fail; } - memset(&new_blocks[tdb->transaction->num_blocks], 0, - (1+(blk - tdb->transaction->num_blocks))*sizeof(uint8_t *)); - tdb->transaction->blocks = new_blocks; - tdb->transaction->num_blocks = blk+1; - tdb->transaction->last_block_size = 0; + memset(&new_blocks[tdb->tdb2.transaction->num_blocks], 0, + (1+(blk - tdb->tdb2.transaction->num_blocks))*sizeof(uint8_t *)); + tdb->tdb2.transaction->blocks = new_blocks; + tdb->tdb2.transaction->num_blocks = blk+1; + tdb->tdb2.transaction->last_block_size = 0; } /* allocate and fill a block? */ - if (tdb->transaction->blocks[blk] == NULL) { - tdb->transaction->blocks[blk] = (uint8_t *)calloc(PAGESIZE, 1); - if (tdb->transaction->blocks[blk] == NULL) { + if (tdb->tdb2.transaction->blocks[blk] == NULL) { + tdb->tdb2.transaction->blocks[blk] = (uint8_t *)calloc(PAGESIZE, 1); + if (tdb->tdb2.transaction->blocks[blk] == NULL) { ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, "transaction_write:" " failed to allocate"); goto fail; } - if (tdb->transaction->old_map_size > blk * PAGESIZE) { + if (tdb->tdb2.transaction->old_map_size > blk * PAGESIZE) { tdb_len_t len2 = PAGESIZE; - if (len2 + (blk * PAGESIZE) > tdb->transaction->old_map_size) { - len2 = tdb->transaction->old_map_size - (blk * PAGESIZE); + if (len2 + (blk * PAGESIZE) > tdb->tdb2.transaction->old_map_size) { + len2 = tdb->tdb2.transaction->old_map_size - (blk * PAGESIZE); } - ecode = tdb->transaction->io_methods->tread(tdb, + ecode = tdb->tdb2.transaction->io_methods->tread(tdb, blk * PAGESIZE, - tdb->transaction->blocks[blk], + tdb->tdb2.transaction->blocks[blk], len2); if (ecode != TDB_SUCCESS) { ecode = tdb_logerr(tdb, ecode, @@ -269,31 +268,31 @@ static enum TDB_ERROR transaction_write(struct tdb_context *tdb, tdb_off_t off, " failed to" " read old block: %s", strerror(errno)); - SAFE_FREE(tdb->transaction->blocks[blk]); + SAFE_FREE(tdb->tdb2.transaction->blocks[blk]); 
goto fail; } - if (blk == tdb->transaction->num_blocks-1) { - tdb->transaction->last_block_size = len2; + if (blk == tdb->tdb2.transaction->num_blocks-1) { + tdb->tdb2.transaction->last_block_size = len2; } } } /* overwrite part of an existing block */ if (buf == NULL) { - memset(tdb->transaction->blocks[blk] + off, 0, len); + memset(tdb->tdb2.transaction->blocks[blk] + off, 0, len); } else { - memcpy(tdb->transaction->blocks[blk] + off, buf, len); + memcpy(tdb->tdb2.transaction->blocks[blk] + off, buf, len); } - if (blk == tdb->transaction->num_blocks-1) { - if (len + off > tdb->transaction->last_block_size) { - tdb->transaction->last_block_size = len + off; + if (blk == tdb->tdb2.transaction->num_blocks-1) { + if (len + off > tdb->tdb2.transaction->last_block_size) { + tdb->tdb2.transaction->last_block_size = len + off; } } return TDB_SUCCESS; fail: - tdb->transaction->transaction_error = 1; + tdb->tdb2.transaction->transaction_error = 1; return ecode; } @@ -325,21 +324,21 @@ static void transaction_write_existing(struct tdb_context *tdb, tdb_off_t off, blk = off / PAGESIZE; off = off % PAGESIZE; - if (tdb->transaction->num_blocks <= blk || - tdb->transaction->blocks[blk] == NULL) { + if (tdb->tdb2.transaction->num_blocks <= blk || + tdb->tdb2.transaction->blocks[blk] == NULL) { return; } - if (blk == tdb->transaction->num_blocks-1 && - off + len > tdb->transaction->last_block_size) { - if (off >= tdb->transaction->last_block_size) { + if (blk == tdb->tdb2.transaction->num_blocks-1 && + off + len > tdb->tdb2.transaction->last_block_size) { + if (off >= tdb->tdb2.transaction->last_block_size) { return; } - len = tdb->transaction->last_block_size - off; + len = tdb->tdb2.transaction->last_block_size - off; } /* overwrite part of an existing block */ - memcpy(tdb->transaction->blocks[blk] + off, buf, len); + memcpy(tdb->tdb2.transaction->blocks[blk] + off, buf, len); } @@ -349,15 +348,14 @@ static void transaction_write_existing(struct tdb_context *tdb, tdb_off_t 
off, static enum TDB_ERROR transaction_oob(struct tdb_context *tdb, tdb_off_t len, bool probe) { - if (len <= tdb->file->map_size) { + if (len <= tdb->file->map_size || probe) { return TDB_SUCCESS; } - if (!probe) { - tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_oob len %lld beyond transaction size %lld", - (long long)len, - (long long)tdb->file->map_size); - } + + tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, + "tdb_oob len %lld beyond transaction size %lld", + (long long)len, + (long long)tdb->file->map_size); return TDB_ERR_IO; } @@ -388,30 +386,34 @@ static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off, /* Can only do direct if in single block and we've already copied. */ if (write_mode) { - if (blk != end_blk) - return NULL; - if (blk >= tdb->transaction->num_blocks) + tdb->stats.transaction_write_direct++; + if (blk != end_blk + || blk >= tdb->tdb2.transaction->num_blocks + || tdb->tdb2.transaction->blocks[blk] == NULL) { + tdb->stats.transaction_write_direct_fail++; return NULL; - if (tdb->transaction->blocks[blk] == NULL) - return NULL; - return tdb->transaction->blocks[blk] + off % PAGESIZE; + } + return tdb->tdb2.transaction->blocks[blk] + off % PAGESIZE; } + tdb->stats.transaction_read_direct++; /* Single which we have copied? */ if (blk == end_blk - && blk < tdb->transaction->num_blocks - && tdb->transaction->blocks[blk]) - return tdb->transaction->blocks[blk] + off % PAGESIZE; + && blk < tdb->tdb2.transaction->num_blocks + && tdb->tdb2.transaction->blocks[blk]) + return tdb->tdb2.transaction->blocks[blk] + off % PAGESIZE; /* Otherwise must be all not copied. 
*/ while (blk <= end_blk) { - if (blk >= tdb->transaction->num_blocks) + if (blk >= tdb->tdb2.transaction->num_blocks) break; - if (tdb->transaction->blocks[blk]) + if (tdb->tdb2.transaction->blocks[blk]) { + tdb->stats.transaction_read_direct_fail++; return NULL; + } blk++; } - return tdb->transaction->io_methods->direct(tdb, off, len, false); + return tdb->tdb2.transaction->io_methods->direct(tdb, off, len, false); } static const struct tdb_methods transaction_methods = { @@ -439,7 +441,7 @@ static enum TDB_ERROR transaction_sync(struct tdb_context *tdb, } #ifdef MS_SYNC if (tdb->file->map_ptr) { - tdb_off_t moffset = offset & ~(PAGESIZE-1); + tdb_off_t moffset = offset & ~(getpagesize()-1); if (msync(moffset + (char *)tdb->file->map_ptr, length + (offset - moffset), MS_SYNC) != 0) { return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, @@ -457,38 +459,38 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb) int i; enum TDB_ERROR ecode; - if (tdb->transaction == NULL) { + if (tdb->tdb2.transaction == NULL) { tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, "tdb_transaction_cancel: no transaction"); return; } - if (tdb->transaction->nesting != 0) { - tdb->transaction->transaction_error = 1; - tdb->transaction->nesting--; + if (tdb->tdb2.transaction->nesting != 0) { + tdb->tdb2.transaction->transaction_error = 1; + tdb->tdb2.transaction->nesting--; return; } - tdb->file->map_size = tdb->transaction->old_map_size; + tdb->file->map_size = tdb->tdb2.transaction->old_map_size; /* free all the transaction blocks */ - for (i=0;itransaction->num_blocks;i++) { - if (tdb->transaction->blocks[i] != NULL) { - free(tdb->transaction->blocks[i]); + for (i=0;itdb2.transaction->num_blocks;i++) { + if (tdb->tdb2.transaction->blocks[i] != NULL) { + free(tdb->tdb2.transaction->blocks[i]); } } - SAFE_FREE(tdb->transaction->blocks); + SAFE_FREE(tdb->tdb2.transaction->blocks); - if (tdb->transaction->magic_offset) { - const struct tdb_methods *methods = 
tdb->transaction->io_methods; + if (tdb->tdb2.transaction->magic_offset) { + const struct tdb_methods *methods = tdb->tdb2.transaction->io_methods; uint64_t invalid = TDB_RECOVERY_INVALID_MAGIC; /* remove the recovery marker */ - ecode = methods->twrite(tdb, tdb->transaction->magic_offset, + ecode = methods->twrite(tdb, tdb->tdb2.transaction->magic_offset, &invalid, sizeof(invalid)); if (ecode == TDB_SUCCESS) ecode = transaction_sync(tdb, - tdb->transaction->magic_offset, + tdb->tdb2.transaction->magic_offset, sizeof(invalid)); if (ecode != TDB_SUCCESS) { tdb_logerr(tdb, ecode, TDB_LOG_ERROR, @@ -501,14 +503,14 @@ static void _tdb_transaction_cancel(struct tdb_context *tdb) tdb_allrecord_unlock(tdb, tdb->file->allrecord_lock.ltype); /* restore the normal io methods */ - tdb->methods = tdb->transaction->io_methods; + tdb->tdb2.io = tdb->tdb2.transaction->io_methods; tdb_transaction_unlock(tdb, F_WRLCK); if (tdb_has_open_lock(tdb)) - tdb_unlock_open(tdb); + tdb_unlock_open(tdb, F_WRLCK); - SAFE_FREE(tdb->transaction); + SAFE_FREE(tdb->tdb2.transaction); } /* @@ -519,18 +521,34 @@ enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb) { enum TDB_ERROR ecode; + if (tdb->flags & TDB_VERSION1) { + if (tdb1_transaction_start(tdb) == -1) + return tdb->last_error; + return TDB_SUCCESS; + } + + tdb->stats.transactions++; /* some sanity checks */ - if (tdb->read_only || (tdb->flags & TDB_INTERNAL)) { + if (tdb->flags & TDB_INTERNAL) { return tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, + TDB_LOG_USE_ERROR, + "tdb_transaction_start:" + " cannot start a" + " transaction on an" + " internal tdb"); + } + + if (tdb->flags & TDB_RDONLY) { + return tdb->last_error = tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR, "tdb_transaction_start:" " cannot start a" " transaction on a " - "read-only or internal db"); + " read-only tdb"); } /* cope with nested tdb_transaction_start() calls */ - if (tdb->transaction != NULL) { + if (tdb->tdb2.transaction != NULL) { if 
(!(tdb->flags & TDB_ALLOW_NESTING)) { return tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, @@ -538,7 +556,8 @@ enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb) "tdb_transaction_start:" " already inside transaction"); } - tdb->transaction->nesting++; + tdb->tdb2.transaction->nesting++; + tdb->stats.transaction_nest++; return 0; } @@ -554,9 +573,9 @@ enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb) " held"); } - tdb->transaction = (struct tdb_transaction *) + tdb->tdb2.transaction = (struct tdb_transaction *) calloc(sizeof(struct tdb_transaction), 1); - if (tdb->transaction == NULL) { + if (tdb->tdb2.transaction == NULL) { return tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, "tdb_transaction_start:" @@ -568,8 +587,8 @@ enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb) make this async, which we will probably do in the future */ ecode = tdb_transaction_lock(tdb, F_WRLCK); if (ecode != TDB_SUCCESS) { - SAFE_FREE(tdb->transaction->blocks); - SAFE_FREE(tdb->transaction); + SAFE_FREE(tdb->tdb2.transaction->blocks); + SAFE_FREE(tdb->tdb2.transaction); return tdb->last_error = ecode; } @@ -582,19 +601,19 @@ enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb) /* make sure we know about any file expansions already done by anyone else */ - tdb->methods->oob(tdb, tdb->file->map_size + 1, true); - tdb->transaction->old_map_size = tdb->file->map_size; + tdb->tdb2.io->oob(tdb, tdb->file->map_size + 1, true); + tdb->tdb2.transaction->old_map_size = tdb->file->map_size; /* finally hook the io methods, replacing them with transaction specific methods */ - tdb->transaction->io_methods = tdb->methods; - tdb->methods = &transaction_methods; + tdb->tdb2.transaction->io_methods = tdb->tdb2.io; + tdb->tdb2.io = &transaction_methods; return tdb->last_error = TDB_SUCCESS; fail_allrecord_lock: tdb_transaction_unlock(tdb, F_WRLCK); - SAFE_FREE(tdb->transaction->blocks); - SAFE_FREE(tdb->transaction); + 
SAFE_FREE(tdb->tdb2.transaction->blocks); + SAFE_FREE(tdb->tdb2.transaction); return tdb->last_error = ecode; } @@ -604,11 +623,16 @@ fail_allrecord_lock: */ void tdb_transaction_cancel(struct tdb_context *tdb) { + if (tdb->flags & TDB_VERSION1) { + tdb1_transaction_cancel(tdb); + return; + } + tdb->stats.transaction_cancel++; _tdb_transaction_cancel(tdb); } /* - work out how much space the linearised recovery data will consume + work out how much space the linearised recovery data will consume (worst case) */ static tdb_len_t tdb_recovery_size(struct tdb_context *tdb) { @@ -616,16 +640,16 @@ static tdb_len_t tdb_recovery_size(struct tdb_context *tdb) int i; recovery_size = 0; - for (i=0;itransaction->num_blocks;i++) { - if (i * PAGESIZE >= tdb->transaction->old_map_size) { + for (i=0;itdb2.transaction->num_blocks;i++) { + if (i * PAGESIZE >= tdb->tdb2.transaction->old_map_size) { break; } - if (tdb->transaction->blocks[i] == NULL) { + if (tdb->tdb2.transaction->blocks[i] == NULL) { continue; } recovery_size += 2*sizeof(tdb_off_t); - if (i == tdb->transaction->num_blocks-1) { - recovery_size += tdb->transaction->last_block_size; + if (i == tdb->tdb2.transaction->num_blocks-1) { + recovery_size += tdb->tdb2.transaction->last_block_size; } else { recovery_size += PAGESIZE; } @@ -634,262 +658,326 @@ static tdb_len_t tdb_recovery_size(struct tdb_context *tdb) return recovery_size; } -/* - allocate the recovery area, or use an existing recovery area if it is - large enough -*/ -static enum TDB_ERROR tdb_recovery_allocate(struct tdb_context *tdb, - tdb_len_t *recovery_size, - tdb_off_t *recovery_offset, - tdb_len_t *recovery_max_size) +static enum TDB_ERROR tdb_recovery_area(struct tdb_context *tdb, + const struct tdb_methods *methods, + tdb_off_t *recovery_offset, + struct tdb_recovery_record *rec) { - struct tdb_recovery_record rec; - const struct tdb_methods *methods = tdb->transaction->io_methods; - tdb_off_t recovery_head; - size_t addition; enum TDB_ERROR ecode; - 
recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery)); - if (TDB_OFF_IS_ERR(recovery_head)) { - return tdb_logerr(tdb, recovery_head, TDB_LOG_ERROR, - "tdb_recovery_allocate:" - " failed to read recovery head"); + *recovery_offset = tdb_read_off(tdb, + offsetof(struct tdb_header, recovery)); + if (TDB_OFF_IS_ERR(*recovery_offset)) { + return TDB_OFF_TO_ERR(*recovery_offset); } - if (recovery_head != 0) { - ecode = methods->tread(tdb, recovery_head, &rec, sizeof(rec)); - if (ecode != TDB_SUCCESS) { - return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_recovery_allocate:" - " failed to read recovery record"); - } - tdb_convert(tdb, &rec, sizeof(rec)); - /* ignore invalid recovery regions: can happen in crash */ - if (rec.magic != TDB_RECOVERY_MAGIC && - rec.magic != TDB_RECOVERY_INVALID_MAGIC) { - recovery_head = 0; + if (*recovery_offset == 0) { + rec->max_len = 0; + return TDB_SUCCESS; + } + + ecode = methods->tread(tdb, *recovery_offset, rec, sizeof(*rec)); + if (ecode != TDB_SUCCESS) + return ecode; + + tdb_convert(tdb, rec, sizeof(*rec)); + /* ignore invalid recovery regions: can happen in crash */ + if (rec->magic != TDB_RECOVERY_MAGIC && + rec->magic != TDB_RECOVERY_INVALID_MAGIC) { + *recovery_offset = 0; + rec->max_len = 0; + } + return TDB_SUCCESS; +} + +static unsigned int same(const unsigned char *new, + const unsigned char *old, + unsigned int length) +{ + unsigned int i; + + for (i = 0; i < length; i++) { + if (new[i] != old[i]) + break; + } + return i; +} + +static unsigned int different(const unsigned char *new, + const unsigned char *old, + unsigned int length, + unsigned int min_same, + unsigned int *samelen) +{ + unsigned int i; + + *samelen = 0; + for (i = 0; i < length; i++) { + if (new[i] == old[i]) { + (*samelen)++; + } else { + if (*samelen >= min_same) { + return i - *samelen; + } + *samelen = 0; } } - *recovery_size = tdb_recovery_size(tdb); + if (*samelen < min_same) + *samelen = 0; + return length - *samelen; +} - if 
(recovery_head != 0 && *recovery_size <= rec.max_len) { - /* it fits in the existing area */ - *recovery_max_size = rec.max_len; - *recovery_offset = recovery_head; - return TDB_SUCCESS; +/* Allocates recovery blob, without tdb_recovery_record at head set up. */ +static struct tdb_recovery_record *alloc_recovery(struct tdb_context *tdb, + tdb_len_t *len) +{ + struct tdb_recovery_record *rec; + size_t i; + enum TDB_ERROR ecode; + unsigned char *p; + const struct tdb_methods *old_methods = tdb->tdb2.io; + + rec = malloc(sizeof(*rec) + tdb_recovery_size(tdb)); + if (!rec) { + tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, + "transaction_setup_recovery:" + " cannot allocate"); + return TDB_ERR_PTR(TDB_ERR_OOM); } - /* we need to free up the old recovery area, then allocate a - new one at the end of the file. Note that we cannot use - normal allocation to allocate the new one as that might return - us an area that is being currently used (as of the start of - the transaction) */ - if (recovery_head != 0) { - tdb->stats.frees++; - ecode = add_free_record(tdb, recovery_head, - sizeof(rec) + rec.max_len, - TDB_LOCK_WAIT, true); - if (ecode != TDB_SUCCESS) { - return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_recovery_allocate:" - " failed to free previous" - " recovery area"); + /* We temporarily revert to the old I/O methods, so we can use + * tdb_access_read */ + tdb->tdb2.io = tdb->tdb2.transaction->io_methods; + + /* build the recovery data into a single blob to allow us to do a single + large write, which should be more efficient */ + p = (unsigned char *)(rec + 1); + for (i=0;itdb2.transaction->num_blocks;i++) { + tdb_off_t offset; + tdb_len_t length; + unsigned int off; + const unsigned char *buffer; + + if (tdb->tdb2.transaction->blocks[i] == NULL) { + continue; + } + + offset = i * PAGESIZE; + length = PAGESIZE; + if (i == tdb->tdb2.transaction->num_blocks-1) { + length = tdb->tdb2.transaction->last_block_size; + } + + if (offset >= 
tdb->tdb2.transaction->old_map_size) { + continue; } + + if (offset + length > tdb->file->map_size) { + ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, + "tdb_transaction_setup_recovery:" + " transaction data over new region" + " boundary"); + goto fail; + } + if (offset + length > tdb->tdb2.transaction->old_map_size) { + /* Short read at EOF. */ + length = tdb->tdb2.transaction->old_map_size - offset; + } + buffer = tdb_access_read(tdb, offset, length, false); + if (TDB_PTR_IS_ERR(buffer)) { + ecode = TDB_PTR_ERR(buffer); + goto fail; + } + + /* Skip over anything the same at the start. */ + off = same(tdb->tdb2.transaction->blocks[i], buffer, length); + offset += off; + + while (off < length) { + tdb_len_t len; + unsigned int samelen; + + len = different(tdb->tdb2.transaction->blocks[i] + off, + buffer + off, length - off, + sizeof(offset) + sizeof(len) + 1, + &samelen); + + memcpy(p, &offset, sizeof(offset)); + memcpy(p + sizeof(offset), &len, sizeof(len)); + tdb_convert(tdb, p, sizeof(offset) + sizeof(len)); + p += sizeof(offset) + sizeof(len); + memcpy(p, buffer + off, len); + p += len; + off += len + samelen; + offset += len + samelen; + } + tdb_access_release(tdb, buffer); } - /* the tdb_free() call might have increased the recovery size */ - *recovery_size = tdb_recovery_size(tdb); + *len = p - (unsigned char *)(rec + 1); + tdb->tdb2.io = old_methods; + return rec; + +fail: + free(rec); + tdb->tdb2.io = old_methods; + return TDB_ERR_PTR(ecode); +} + +static tdb_off_t create_recovery_area(struct tdb_context *tdb, + tdb_len_t rec_length, + struct tdb_recovery_record *rec) +{ + tdb_off_t off, recovery_off; + tdb_len_t addition; + enum TDB_ERROR ecode; + const struct tdb_methods *methods = tdb->tdb2.transaction->io_methods; /* round up to a multiple of page size. Overallocate, since each * such allocation forces us to expand the file. 
*/ - *recovery_max_size - = (((sizeof(rec) + *recovery_size + *recovery_size / 2) + rec->max_len + = (((sizeof(*rec) + rec_length + rec_length / 2) + PAGESIZE-1) & ~(PAGESIZE-1)) - - sizeof(rec); - *recovery_offset = tdb->file->map_size; - recovery_head = *recovery_offset; + - sizeof(*rec); + off = tdb->file->map_size; /* Restore ->map_size before calling underlying expand_file. Also so that we don't try to expand the file again in the transaction commit, which would destroy the recovery area */ - addition = (tdb->file->map_size - tdb->transaction->old_map_size) + - sizeof(rec) + *recovery_max_size; - tdb->file->map_size = tdb->transaction->old_map_size; + addition = (tdb->file->map_size - tdb->tdb2.transaction->old_map_size) + + sizeof(*rec) + rec->max_len; + tdb->file->map_size = tdb->tdb2.transaction->old_map_size; + tdb->stats.transaction_expand_file++; ecode = methods->expand_file(tdb, addition); if (ecode != TDB_SUCCESS) { - return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_recovery_allocate:" - " failed to create recovery area"); + tdb_logerr(tdb, ecode, TDB_LOG_ERROR, + "tdb_recovery_allocate:" + " failed to create recovery area"); + return TDB_ERR_TO_OFF(ecode); } /* we have to reset the old map size so that we don't try to expand the file again in the transaction commit, which would destroy the recovery area */ - tdb->transaction->old_map_size = tdb->file->map_size; + tdb->tdb2.transaction->old_map_size = tdb->file->map_size; /* write the recovery header offset and sync - we can sync without a race here as the magic ptr in the recovery record has not been set */ - tdb_convert(tdb, &recovery_head, sizeof(recovery_head)); + recovery_off = off; + tdb_convert(tdb, &recovery_off, sizeof(recovery_off)); ecode = methods->twrite(tdb, offsetof(struct tdb_header, recovery), - &recovery_head, sizeof(tdb_off_t)); + &recovery_off, sizeof(tdb_off_t)); if (ecode != TDB_SUCCESS) { - return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_recovery_allocate:" - " failed to 
write recovery head"); + tdb_logerr(tdb, ecode, TDB_LOG_ERROR, + "tdb_recovery_allocate:" + " failed to write recovery head"); + return TDB_ERR_TO_OFF(ecode); } transaction_write_existing(tdb, offsetof(struct tdb_header, recovery), - &recovery_head, + &recovery_off, sizeof(tdb_off_t)); - return TDB_SUCCESS; -} - -/* Set up header for the recovery record. */ -static void set_recovery_header(struct tdb_recovery_record *rec, - uint64_t magic, - uint64_t datalen, uint64_t actuallen, - uint64_t oldsize) -{ - rec->magic = magic; - rec->max_len = actuallen; - rec->len = datalen; - rec->eof = oldsize; + return off; } /* setup the recovery data that will be used on a crash during commit */ -static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb, - tdb_off_t *magic_offset) +static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb) { - /* Initialized for GCC's 4.4.5 overzealous uninitialized warnings. */ tdb_len_t recovery_size = 0; - tdb_off_t recovery_offset = 0, recovery_max_size = 0; - unsigned char *data, *p; - const struct tdb_methods *methods = tdb->transaction->io_methods; - struct tdb_recovery_record *rec; - tdb_off_t old_map_size = tdb->transaction->old_map_size; + tdb_off_t recovery_off = 0; + tdb_off_t old_map_size = tdb->tdb2.transaction->old_map_size; + struct tdb_recovery_record *recovery; + const struct tdb_methods *methods = tdb->tdb2.transaction->io_methods; uint64_t magic; - int i; enum TDB_ERROR ecode; - /* - check that the recovery area has enough space - */ - ecode = tdb_recovery_allocate(tdb, &recovery_size, - &recovery_offset, &recovery_max_size); - if (ecode != TDB_SUCCESS) { - return ecode; - } + recovery = alloc_recovery(tdb, &recovery_size); + if (TDB_PTR_IS_ERR(recovery)) + return TDB_PTR_ERR(recovery); - data = (unsigned char *)malloc(recovery_size + sizeof(*rec)); - if (data == NULL) { - return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "transaction_setup_recovery:" - " cannot allocate"); + ecode = 
tdb_recovery_area(tdb, methods, &recovery_off, recovery); + if (ecode) { + free(recovery); + return ecode; } - rec = (struct tdb_recovery_record *)data; - set_recovery_header(rec, TDB_RECOVERY_INVALID_MAGIC, - recovery_size, recovery_max_size, old_map_size); - tdb_convert(tdb, rec, sizeof(*rec)); - - /* build the recovery data into a single blob to allow us to do a single - large write, which should be more efficient */ - p = data + sizeof(*rec); - for (i=0;itransaction->num_blocks;i++) { - tdb_off_t offset; - tdb_len_t length; - - if (tdb->transaction->blocks[i] == NULL) { - continue; - } - - offset = i * PAGESIZE; - length = PAGESIZE; - if (i == tdb->transaction->num_blocks-1) { - length = tdb->transaction->last_block_size; - } + if (recovery->max_len < recovery_size) { + /* Not large enough. Free up old recovery area. */ + if (recovery_off) { + tdb->stats.frees++; + ecode = add_free_record(tdb, recovery_off, + sizeof(*recovery) + + recovery->max_len, + TDB_LOCK_WAIT, true); + free(recovery); + if (ecode != TDB_SUCCESS) { + return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, + "tdb_recovery_allocate:" + " failed to free previous" + " recovery area"); + } - if (offset >= old_map_size) { - continue; + /* Refresh recovery after add_free_record above. */ + recovery = alloc_recovery(tdb, &recovery_size); + if (TDB_PTR_IS_ERR(recovery)) + return TDB_PTR_ERR(recovery); } - if (offset + length > tdb->file->map_size) { - free(data); - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_transaction_setup_recovery:" - " transaction data over new region" - " boundary"); - } - memcpy(p, &offset, sizeof(offset)); - memcpy(p + sizeof(offset), &length, sizeof(length)); - tdb_convert(tdb, p, sizeof(offset) + sizeof(length)); - - /* the recovery area contains the old data, not the - new data, so we have to call the original tdb_read - method to get it */ - if (offset + length > old_map_size) { - /* Short read at EOF, and zero fill. 
*/ - unsigned int len = old_map_size - offset; - ecode = methods->tread(tdb, offset, - p + sizeof(offset) + sizeof(length), - len); - memset(p + sizeof(offset) + sizeof(length) + len, 0, - length - len); - } else { - ecode = methods->tread(tdb, offset, - p + sizeof(offset) + sizeof(length), - length); + recovery_off = create_recovery_area(tdb, recovery_size, + recovery); + if (TDB_OFF_IS_ERR(recovery_off)) { + free(recovery); + return TDB_OFF_TO_ERR(recovery_off); } - if (ecode != TDB_SUCCESS) { - free(data); - return ecode; - } - p += sizeof(offset) + sizeof(length) + length; } + /* Now we know size, convert rec header. */ + recovery->magic = TDB_RECOVERY_INVALID_MAGIC; + recovery->len = recovery_size; + recovery->eof = old_map_size; + tdb_convert(tdb, recovery, sizeof(*recovery)); + /* write the recovery data to the recovery area */ - ecode = methods->twrite(tdb, recovery_offset, data, - sizeof(*rec) + recovery_size); + ecode = methods->twrite(tdb, recovery_off, recovery, recovery_size); if (ecode != TDB_SUCCESS) { - free(data); + free(recovery); return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, "tdb_transaction_setup_recovery:" " failed to write recovery data"); } - transaction_write_existing(tdb, recovery_offset, data, - sizeof(*rec) + recovery_size); + transaction_write_existing(tdb, recovery_off, recovery, recovery_size); + + free(recovery); /* as we don't have ordered writes, we have to sync the recovery data before we update the magic to indicate that the recovery data is present */ - ecode = transaction_sync(tdb, recovery_offset, - sizeof(*rec) + recovery_size); - if (ecode != TDB_SUCCESS) { - free(data); + ecode = transaction_sync(tdb, recovery_off, recovery_size); + if (ecode != TDB_SUCCESS) return ecode; - } - - free(data); magic = TDB_RECOVERY_MAGIC; tdb_convert(tdb, &magic, sizeof(magic)); - *magic_offset = recovery_offset + offsetof(struct tdb_recovery_record, - magic); + tdb->tdb2.transaction->magic_offset + = recovery_off + offsetof(struct 
tdb_recovery_record, magic); - ecode = methods->twrite(tdb, *magic_offset, &magic, sizeof(magic)); + ecode = methods->twrite(tdb, tdb->tdb2.transaction->magic_offset, + &magic, sizeof(magic)); if (ecode != TDB_SUCCESS) { return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, "tdb_transaction_setup_recovery:" " failed to write recovery magic"); } - transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic)); + transaction_write_existing(tdb, tdb->tdb2.transaction->magic_offset, + &magic, sizeof(magic)); /* ensure the recovery magic marker is on disk */ - return transaction_sync(tdb, *magic_offset, sizeof(magic)); + return transaction_sync(tdb, tdb->tdb2.transaction->magic_offset, + sizeof(magic)); } static enum TDB_ERROR _tdb_transaction_prepare_commit(struct tdb_context *tdb) @@ -897,20 +985,20 @@ static enum TDB_ERROR _tdb_transaction_prepare_commit(struct tdb_context *tdb) const struct tdb_methods *methods; enum TDB_ERROR ecode; - if (tdb->transaction == NULL) { + if (tdb->tdb2.transaction == NULL) { return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, "tdb_transaction_prepare_commit:" " no transaction"); } - if (tdb->transaction->prepared) { + if (tdb->tdb2.transaction->prepared) { _tdb_transaction_cancel(tdb); return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, "tdb_transaction_prepare_commit:" " transaction already prepared"); } - if (tdb->transaction->transaction_error) { + if (tdb->tdb2.transaction->transaction_error) { _tdb_transaction_cancel(tdb); return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR, "tdb_transaction_prepare_commit:" @@ -918,50 +1006,49 @@ static enum TDB_ERROR _tdb_transaction_prepare_commit(struct tdb_context *tdb) } - if (tdb->transaction->nesting != 0) { + if (tdb->tdb2.transaction->nesting != 0) { return TDB_SUCCESS; } /* check for a null transaction */ - if (tdb->transaction->blocks == NULL) { + if (tdb->tdb2.transaction->blocks == NULL) { return TDB_SUCCESS; } - methods = tdb->transaction->io_methods; + methods = 
tdb->tdb2.transaction->io_methods; /* upgrade the main transaction lock region to a write lock */ - ecode = tdb_allrecord_upgrade(tdb); + ecode = tdb_allrecord_upgrade(tdb, TDB_HASH_LOCK_START); if (ecode != TDB_SUCCESS) { return ecode; } /* get the open lock - this prevents new users attaching to the database during the commit */ - ecode = tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK); + ecode = tdb_lock_open(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK); if (ecode != TDB_SUCCESS) { return ecode; } /* Since we have whole db locked, we don't need the expansion lock. */ if (!(tdb->flags & TDB_NOSYNC)) { - /* write the recovery data to the end of the file */ - ecode = transaction_setup_recovery(tdb, - &tdb->transaction - ->magic_offset); + /* Sets up tdb->tdb2.transaction->recovery and + * tdb->tdb2.transaction->magic_offset. */ + ecode = transaction_setup_recovery(tdb); if (ecode != TDB_SUCCESS) { return ecode; } } - tdb->transaction->prepared = true; + tdb->tdb2.transaction->prepared = true; /* expand the file to the new size if needed */ - if (tdb->file->map_size != tdb->transaction->old_map_size) { + if (tdb->file->map_size != tdb->tdb2.transaction->old_map_size) { tdb_len_t add; - add = tdb->file->map_size - tdb->transaction->old_map_size; + add = tdb->file->map_size - tdb->tdb2.transaction->old_map_size; /* Restore original map size for tdb_expand_file */ - tdb->file->map_size = tdb->transaction->old_map_size; + tdb->file->map_size = tdb->tdb2.transaction->old_map_size; ecode = methods->expand_file(tdb, add); if (ecode != TDB_SUCCESS) { return ecode; @@ -977,7 +1064,12 @@ static enum TDB_ERROR _tdb_transaction_prepare_commit(struct tdb_context *tdb) */ enum TDB_ERROR tdb_transaction_prepare_commit(struct tdb_context *tdb) { - return _tdb_transaction_prepare_commit(tdb); + if (tdb->flags & TDB_VERSION1) { + if (tdb1_transaction_prepare_commit(tdb) == -1) + return tdb->last_error; + return TDB_SUCCESS; + } + return tdb->last_error = 
_tdb_transaction_prepare_commit(tdb); } /* @@ -989,7 +1081,13 @@ enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb) int i; enum TDB_ERROR ecode; - if (tdb->transaction == NULL) { + if (tdb->flags & TDB_VERSION1) { + if (tdb1_transaction_commit(tdb) == -1) + return tdb->last_error; + return TDB_SUCCESS; + } + + if (tdb->tdb2.transaction == NULL) { return tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, "tdb_transaction_commit:" @@ -998,18 +1096,18 @@ enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb) tdb_trace(tdb, "tdb_transaction_commit"); - if (tdb->transaction->nesting != 0) { - tdb->transaction->nesting--; + if (tdb->tdb2.transaction->nesting != 0) { + tdb->tdb2.transaction->nesting--; return tdb->last_error = TDB_SUCCESS; } /* check for a null transaction */ - if (tdb->transaction->blocks == NULL) { + if (tdb->tdb2.transaction->blocks == NULL) { _tdb_transaction_cancel(tdb); return tdb->last_error = TDB_SUCCESS; } - if (!tdb->transaction->prepared) { + if (!tdb->tdb2.transaction->prepared) { ecode = _tdb_transaction_prepare_commit(tdb); if (ecode != TDB_SUCCESS) { _tdb_transaction_cancel(tdb); @@ -1017,41 +1115,41 @@ enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb) } } - methods = tdb->transaction->io_methods; + methods = tdb->tdb2.transaction->io_methods; /* perform all the writes */ - for (i=0;i<tdb->transaction->num_blocks;i++) { + for (i=0;i<tdb->tdb2.transaction->num_blocks;i++) { tdb_off_t offset; tdb_len_t length; - if (tdb->transaction->blocks[i] == NULL) { + if (tdb->tdb2.transaction->blocks[i] == NULL) { continue; } offset = i * PAGESIZE; length = PAGESIZE; - if (i == tdb->transaction->num_blocks-1) { - length = tdb->transaction->last_block_size; + if (i == tdb->tdb2.transaction->num_blocks-1) { + length = tdb->tdb2.transaction->last_block_size; } ecode = methods->twrite(tdb, offset, - tdb->transaction->blocks[i], length); + tdb->tdb2.transaction->blocks[i], length); if (ecode != TDB_SUCCESS) { /* we've
overwritten part of the data and possibly expanded the file, so we need to run the crash recovery code */ - tdb->methods = methods; + tdb->tdb2.io = methods; tdb_transaction_recover(tdb); _tdb_transaction_cancel(tdb); return tdb->last_error = ecode; } - SAFE_FREE(tdb->transaction->blocks[i]); + SAFE_FREE(tdb->tdb2.transaction->blocks[i]); } - SAFE_FREE(tdb->transaction->blocks); - tdb->transaction->num_blocks = 0; + SAFE_FREE(tdb->tdb2.transaction->blocks); + tdb->tdb2.transaction->num_blocks = 0; /* ensure the new data is on disk */ ecode = transaction_sync(tdb, 0, tdb->file->map_size); @@ -1076,7 +1174,7 @@ enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb) /* use a transaction cancel to free memory and remove the transaction locks: it "restores" map_size, too. */ - tdb->transaction->old_map_size = tdb->file->map_size; + tdb->tdb2.transaction->old_map_size = tdb->file->map_size; _tdb_transaction_cancel(tdb); return tdb->last_error = TDB_SUCCESS; @@ -1098,7 +1196,8 @@ enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb) /* find the recovery area */ recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery)); if (TDB_OFF_IS_ERR(recovery_head)) { - return tdb_logerr(tdb, recovery_head, TDB_LOG_ERROR, + ecode = TDB_OFF_TO_ERR(recovery_head); + return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, "tdb_transaction_recover:" " failed to read recovery head"); } @@ -1121,7 +1220,7 @@ enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb) return TDB_SUCCESS; } - if (tdb->read_only) { + if (tdb->flags & TDB_RDONLY) { return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, "tdb_transaction_recover:" " attempt to recover read only database"); @@ -1137,7 +1236,7 @@ enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb) } /* read the full recovery data */ - ecode = tdb->methods->tread(tdb, recovery_head + sizeof(rec), data, + ecode = tdb->tdb2.io->tread(tdb, recovery_head + sizeof(rec), data, rec.len); if (ecode != TDB_SUCCESS) { 
return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, @@ -1155,7 +1254,7 @@ enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb) memcpy(&len, p + sizeof(ofs), sizeof(len)); p += sizeof(ofs) + sizeof(len); - ecode = tdb->methods->twrite(tdb, ofs, p, len); + ecode = tdb->tdb2.io->twrite(tdb, ofs, p, len); if (ecode != TDB_SUCCESS) { free(data); return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, @@ -1234,7 +1333,7 @@ tdb_bool_err tdb_needs_recovery(struct tdb_context *tdb) /* read the recovery record */ ecode = tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec)); if (ecode != TDB_SUCCESS) { - return ecode; + return TDB_ERR_TO_OFF(ecode); } return (rec.magic == TDB_RECOVERY_MAGIC);