- don't allow any locks to be held when a transaction starts,
otherwise we can end up with deadlock (plus lack of lock nesting
- in posix locks would mean the lock is lost)
+ in POSIX locks would mean the lock is lost)
- if the caller gains a lock during the transaction but doesn't
release it then fail the commit
- allow for nested calls to tdb_transaction_start(), re-using the
- existing transaction record. If the inner transaction is cancelled
+ existing transaction record. If the inner transaction is canceled
then a subsequent commit will fail
- keep a mirrored copy of the tdb hash chain heads to allow for the
- allow callers to mix transaction and non-transaction use of tdb,
although once a transaction is started then an exclusive lock is
- gained until the transaction is committed or cancelled
+ gained until the transaction is committed or canceled
- the commit stategy involves first saving away all modified data
into a linearised buffer in the transaction recovery area, then
/* when inside a transaction we need to keep track of any
nested tdb_transaction_start() calls, as these are allowed,
but don't create a new transaction */
- int nesting;
+ unsigned int nesting;
/* set when a prepare has already occurred */
bool prepared;
tdb_len_t old_map_size;
};
+/* Block granularity for copy-on-write tracking. It need not match the
+ * system page size, but page-sized blocks keep I/O and bookkeeping cheap
+ * for the same locality reasons the VM uses pages. */
+#define PAGESIZE 4096
/*
read while in a transaction. We need to check first if the data is in our list
enum TDB_ERROR ecode;
/* break it down into block sized ops */
- while (len + (off % getpagesize()) > getpagesize()) {
- tdb_len_t len2 = getpagesize() - (off % getpagesize());
+ while (len + (off % PAGESIZE) > PAGESIZE) {
+ tdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
ecode = transaction_read(tdb, off, buf, len2);
if (ecode != TDB_SUCCESS) {
return ecode;
return TDB_SUCCESS;
}
- blk = off / getpagesize();
+ blk = off / PAGESIZE;
/* see if we have it in the block list */
if (tdb->transaction->num_blocks <= blk ||
}
/* now copy it out of this block */
- memcpy(buf, tdb->transaction->blocks[blk] + (off % getpagesize()), len);
+ memcpy(buf, tdb->transaction->blocks[blk] + (off % PAGESIZE), len);
return TDB_SUCCESS;
fail:
}
/* break it up into block sized chunks */
- while (len + (off % getpagesize()) > getpagesize()) {
- tdb_len_t len2 = getpagesize() - (off % getpagesize());
+ while (len + (off % PAGESIZE) > PAGESIZE) {
+ tdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
ecode = transaction_write(tdb, off, buf, len2);
if (ecode != TDB_SUCCESS) {
return -1;
return TDB_SUCCESS;
}
- blk = off / getpagesize();
- off = off % getpagesize();
+ blk = off / PAGESIZE;
+ off = off % PAGESIZE;
if (tdb->transaction->num_blocks <= blk) {
uint8_t **new_blocks;
/* allocate and fill a block? */
if (tdb->transaction->blocks[blk] == NULL) {
- tdb->transaction->blocks[blk] = (uint8_t *)calloc(getpagesize(), 1);
+ tdb->transaction->blocks[blk] = (uint8_t *)calloc(PAGESIZE, 1);
if (tdb->transaction->blocks[blk] == NULL) {
ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
"transaction_write:"
" failed to allocate");
goto fail;
}
- if (tdb->transaction->old_map_size > blk * getpagesize()) {
- tdb_len_t len2 = getpagesize();
- if (len2 + (blk * getpagesize()) > tdb->transaction->old_map_size) {
- len2 = tdb->transaction->old_map_size - (blk * getpagesize());
+ if (tdb->transaction->old_map_size > blk * PAGESIZE) {
+ tdb_len_t len2 = PAGESIZE;
+ if (len2 + (blk * PAGESIZE) > tdb->transaction->old_map_size) {
+ len2 = tdb->transaction->old_map_size - (blk * PAGESIZE);
}
ecode = tdb->transaction->io_methods->tread(tdb,
- blk * getpagesize(),
+ blk * PAGESIZE,
tdb->transaction->blocks[blk],
len2);
if (ecode != TDB_SUCCESS) {
/*
- write while in a transaction - this varient never expands the transaction blocks, it only
+ write while in a transaction - this variant never expands the transaction blocks, it only
updates existing blocks. This means it cannot change the recovery size
*/
static void transaction_write_existing(struct tdb_context *tdb, tdb_off_t off,
size_t blk;
/* break it up into block sized chunks */
- while (len + (off % getpagesize()) > getpagesize()) {
- tdb_len_t len2 = getpagesize() - (off % getpagesize());
+ while (len + (off % PAGESIZE) > PAGESIZE) {
+ tdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
transaction_write_existing(tdb, off, buf, len2);
len -= len2;
off += len2;
return;
}
- blk = off / getpagesize();
- off = off % getpagesize();
+ blk = off / PAGESIZE;
+ off = off % PAGESIZE;
if (tdb->transaction->num_blocks <= blk ||
tdb->transaction->blocks[blk] == NULL) {
static enum TDB_ERROR transaction_oob(struct tdb_context *tdb, tdb_off_t len,
bool probe)
{
- if (len <= tdb->map_size) {
+ if (len <= tdb->file->map_size) {
return TDB_SUCCESS;
}
if (!probe) {
tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
"tdb_oob len %lld beyond transaction size %lld",
(long long)len,
- (long long)tdb->map_size);
+ (long long)tdb->file->map_size);
}
return TDB_ERR_IO;
}
/* add a write to the transaction elements, so subsequent
reads see the zero data */
- ecode = transaction_write(tdb, tdb->map_size, NULL, addition);
+ ecode = transaction_write(tdb, tdb->file->map_size, NULL, addition);
if (ecode == TDB_SUCCESS) {
- tdb->map_size += addition;
+ tdb->file->map_size += addition;
}
return ecode;
}
static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off,
size_t len, bool write_mode)
{
- size_t blk = off / getpagesize(), end_blk;
+ size_t blk = off / PAGESIZE, end_blk;
/* This is wrong for zero-length blocks, but will fail gracefully */
- end_blk = (off + len - 1) / getpagesize();
+ end_blk = (off + len - 1) / PAGESIZE;
/* Can only do direct if in single block and we've already copied. */
if (write_mode) {
return NULL;
if (tdb->transaction->blocks[blk] == NULL)
return NULL;
- return tdb->transaction->blocks[blk] + off % getpagesize();
+ return tdb->transaction->blocks[blk] + off % PAGESIZE;
}
/* Single which we have copied? */
if (blk == end_blk
&& blk < tdb->transaction->num_blocks
&& tdb->transaction->blocks[blk])
- return tdb->transaction->blocks[blk] + off % getpagesize();
+ return tdb->transaction->blocks[blk] + off % PAGESIZE;
/* Otherwise must be all not copied. */
- while (blk < end_blk) {
+ while (blk <= end_blk) {
if (blk >= tdb->transaction->num_blocks)
break;
if (tdb->transaction->blocks[blk])
return TDB_SUCCESS;
}
- if (fsync(tdb->fd) != 0) {
+ if (fsync(tdb->file->fd) != 0) {
return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
"tdb_transaction: fsync failed: %s",
strerror(errno));
}
#ifdef MS_SYNC
- if (tdb->map_ptr) {
- tdb_off_t moffset = offset & ~(getpagesize()-1);
- if (msync(moffset + (char *)tdb->map_ptr,
+ if (tdb->file->map_ptr) {
+ tdb_off_t moffset = offset & ~(PAGESIZE-1);
+ if (msync(moffset + (char *)tdb->file->map_ptr,
length + (offset - moffset), MS_SYNC) != 0) {
return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
"tdb_transaction: msync failed: %s",
return;
}
- tdb->map_size = tdb->transaction->old_map_size;
+ tdb->file->map_size = tdb->transaction->old_map_size;
/* free all the transaction blocks */
for (i=0;i<tdb->transaction->num_blocks;i++) {
}
}
- if (tdb->allrecord_lock.count)
- tdb_allrecord_unlock(tdb, tdb->allrecord_lock.ltype);
+ if (tdb->file->allrecord_lock.count)
+ tdb_allrecord_unlock(tdb, tdb->file->allrecord_lock.ltype);
/* restore the normal io methods */
tdb->methods = tdb->transaction->io_methods;
start a tdb transaction. No token is returned, as only a single
transaction is allowed to be pending per tdb_context
*/
-int tdb_transaction_start(struct tdb_context *tdb)
+enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb)
{
enum TDB_ERROR ecode;
/* some sanity checks */
if (tdb->read_only || (tdb->flags & TDB_INTERNAL)) {
- tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
- "tdb_transaction_start: cannot start a transaction"
- " on a read-only or internal db");
- return -1;
+ return tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+ TDB_LOG_USE_ERROR,
+ "tdb_transaction_start:"
+ " cannot start a"
+ " transaction on a "
+ "read-only or internal db");
}
/* cope with nested tdb_transaction_start() calls */
if (tdb->transaction != NULL) {
- tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_USE_ERROR,
- "tdb_transaction_start:"
- " already inside transaction");
- return -1;
+ if (!(tdb->flags & TDB_ALLOW_NESTING)) {
+ return tdb->last_error
+ = tdb_logerr(tdb, TDB_ERR_IO,
+ TDB_LOG_USE_ERROR,
+ "tdb_transaction_start:"
+ " already inside transaction");
+ }
+ tdb->transaction->nesting++;
+ return 0;
}
if (tdb_has_hash_locks(tdb)) {
/* the caller must not have any locks when starting a
transaction as otherwise we'll be screwed by lack
- of nested locks in posix */
- tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
- "tdb_transaction_start: cannot start a transaction"
- " with locks held");
- return -1;
+ of nested locks in POSIX */
+ return tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK,
+ TDB_LOG_USE_ERROR,
+ "tdb_transaction_start:"
+ " cannot start a"
+ " transaction with locks"
+ " held");
}
tdb->transaction = (struct tdb_transaction *)
calloc(sizeof(struct tdb_transaction), 1);
if (tdb->transaction == NULL) {
- tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
- "tdb_transaction_start: cannot allocate");
- return -1;
+ return tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM,
+ TDB_LOG_ERROR,
+ "tdb_transaction_start:"
+ " cannot allocate");
}
/* get the transaction write lock. This is a blocking lock. As
make this async, which we will probably do in the future */
ecode = tdb_transaction_lock(tdb, F_WRLCK);
if (ecode != TDB_SUCCESS) {
- tdb->ecode = ecode;
SAFE_FREE(tdb->transaction->blocks);
SAFE_FREE(tdb->transaction);
- return -1;
+ return tdb->last_error = ecode;
}
/* get a read lock over entire file. This is upgraded to a write
lock during the commit */
ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true);
if (ecode != TDB_SUCCESS) {
- tdb->ecode = ecode;
goto fail_allrecord_lock;
}
/* make sure we know about any file expansions already done by
anyone else */
- tdb->methods->oob(tdb, tdb->map_size + 1, true);
- tdb->transaction->old_map_size = tdb->map_size;
+ tdb->methods->oob(tdb, tdb->file->map_size + 1, true);
+ tdb->transaction->old_map_size = tdb->file->map_size;
/* finally hook the io methods, replacing them with
transaction specific methods */
tdb->transaction->io_methods = tdb->methods;
tdb->methods = &transaction_methods;
- return 0;
+ return tdb->last_error = TDB_SUCCESS;
fail_allrecord_lock:
tdb_transaction_unlock(tdb, F_WRLCK);
SAFE_FREE(tdb->transaction->blocks);
SAFE_FREE(tdb->transaction);
- return -1;
+ return tdb->last_error = ecode;
}
tdb_len_t recovery_size = 0;
int i;
- recovery_size = sizeof(tdb_len_t);
+ recovery_size = 0;
for (i=0;i<tdb->transaction->num_blocks;i++) {
- if (i * getpagesize() >= tdb->transaction->old_map_size) {
+ if (i * PAGESIZE >= tdb->transaction->old_map_size) {
break;
}
if (tdb->transaction->blocks[i] == NULL) {
if (i == tdb->transaction->num_blocks-1) {
recovery_size += tdb->transaction->last_block_size;
} else {
- recovery_size += getpagesize();
+ recovery_size += PAGESIZE;
}
}
us an area that is being currently used (as of the start of
the transaction) */
if (recovery_head != 0) {
- add_stat(tdb, frees, 1);
+ tdb->stats.frees++;
ecode = add_free_record(tdb, recovery_head,
- sizeof(rec) + rec.max_len);
+ sizeof(rec) + rec.max_len,
+ TDB_LOCK_WAIT, true);
if (ecode != TDB_SUCCESS) {
return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
"tdb_recovery_allocate:"
/* the tdb_free() call might have increased the recovery size */
*recovery_size = tdb_recovery_size(tdb);
- /* round up to a multiple of page size */
+ /* round up to a multiple of page size. Overallocate, since each
+ * such allocation forces us to expand the file. */
*recovery_max_size
- = (((sizeof(rec) + *recovery_size) + getpagesize()-1)
- & ~(getpagesize()-1))
+ = (((sizeof(rec) + *recovery_size + *recovery_size / 2)
+ + PAGESIZE-1) & ~(PAGESIZE-1))
- sizeof(rec);
- *recovery_offset = tdb->map_size;
+ *recovery_offset = tdb->file->map_size;
recovery_head = *recovery_offset;
/* Restore ->map_size before calling underlying expand_file.
Also so that we don't try to expand the file again in the
transaction commit, which would destroy the recovery
area */
- addition = (tdb->map_size - tdb->transaction->old_map_size) +
+ addition = (tdb->file->map_size - tdb->transaction->old_map_size) +
sizeof(rec) + *recovery_max_size;
- tdb->map_size = tdb->transaction->old_map_size;
+ tdb->file->map_size = tdb->transaction->old_map_size;
ecode = methods->expand_file(tdb, addition);
if (ecode != TDB_SUCCESS) {
return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
/* we have to reset the old map size so that we don't try to
expand the file again in the transaction commit, which
would destroy the recovery area */
- tdb->transaction->old_map_size = tdb->map_size;
+ tdb->transaction->old_map_size = tdb->file->map_size;
/* write the recovery header offset and sync - we can sync without a race here
as the magic ptr in the recovery record has not been set */
static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb,
tdb_off_t *magic_offset)
{
- tdb_len_t recovery_size;
+ /* Initialized only to quiet gcc 4.4.5's overzealous "may be used
+ uninitialized" warnings. */
+ tdb_len_t recovery_size = 0;
+ tdb_off_t recovery_offset = 0, recovery_max_size = 0;
unsigned char *data, *p;
const struct tdb_methods *methods = tdb->transaction->io_methods;
struct tdb_recovery_record *rec;
- tdb_off_t recovery_offset, recovery_max_size;
tdb_off_t old_map_size = tdb->transaction->old_map_size;
- uint64_t magic, tailer;
+ uint64_t magic;
int i;
enum TDB_ERROR ecode;
continue;
}
- offset = i * getpagesize();
- length = getpagesize();
+ offset = i * PAGESIZE;
+ length = PAGESIZE;
if (i == tdb->transaction->num_blocks-1) {
length = tdb->transaction->last_block_size;
}
if (offset >= old_map_size) {
continue;
}
- if (offset + length > tdb->map_size) {
+
+ if (offset + length > tdb->file->map_size) {
free(data);
return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
"tdb_transaction_setup_recovery:"
/* the recovery area contains the old data, not the
new data, so we have to call the original tdb_read
method to get it */
- ecode = methods->tread(tdb, offset,
- p + sizeof(offset) + sizeof(length),
- length);
+ if (offset + length > old_map_size) {
+ /* Short read at EOF; zero-fill the remainder of the block. */
+ unsigned int len = old_map_size - offset;
+ ecode = methods->tread(tdb, offset,
+ p + sizeof(offset) + sizeof(length),
+ len);
+ memset(p + sizeof(offset) + sizeof(length) + len, 0,
+ length - len);
+ } else {
+ ecode = methods->tread(tdb, offset,
+ p + sizeof(offset) + sizeof(length),
+ length);
+ }
if (ecode != TDB_SUCCESS) {
free(data);
return ecode;
p += sizeof(offset) + sizeof(length) + length;
}
- /* and the tailer */
- tailer = sizeof(*rec) + recovery_max_size;
- memcpy(p, &tailer, sizeof(tailer));
- tdb_convert(tdb, p, sizeof(tailer));
-
/* write the recovery data to the recovery area */
ecode = methods->twrite(tdb, recovery_offset, data,
sizeof(*rec) + recovery_size);
if (tdb->transaction->nesting != 0) {
- tdb->transaction->nesting--;
return TDB_SUCCESS;
}
/* upgrade the main transaction lock region to a write lock */
ecode = tdb_allrecord_upgrade(tdb);
if (ecode != TDB_SUCCESS) {
- tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
- "tdb_transaction_prepare_commit:"
- " failed to upgrade hash locks");
- _tdb_transaction_cancel(tdb);
return ecode;
}
during the commit */
ecode = tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK);
if (ecode != TDB_SUCCESS) {
- tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
- "tdb_transaction_prepare_commit:"
- " failed to get open lock");
- _tdb_transaction_cancel(tdb);
return ecode;
}
&tdb->transaction
->magic_offset);
if (ecode != TDB_SUCCESS) {
- tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
- "tdb_transaction_prepare_commit:"
- " failed to setup recovery data");
- _tdb_transaction_cancel(tdb);
return ecode;
}
}
tdb->transaction->prepared = true;
/* expand the file to the new size if needed */
- if (tdb->map_size != tdb->transaction->old_map_size) {
- tdb_len_t add = tdb->map_size - tdb->transaction->old_map_size;
+ if (tdb->file->map_size != tdb->transaction->old_map_size) {
+ tdb_len_t add;
+
+ add = tdb->file->map_size - tdb->transaction->old_map_size;
/* Restore original map size for tdb_expand_file */
- tdb->map_size = tdb->transaction->old_map_size;
+ tdb->file->map_size = tdb->transaction->old_map_size;
ecode = methods->expand_file(tdb, add);
if (ecode != TDB_SUCCESS) {
- tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
- "tdb_transaction_prepare_commit:"
- " expansion failed");
- _tdb_transaction_cancel(tdb);
return ecode;
}
}
/*
prepare to commit the current transaction
*/
-int tdb_transaction_prepare_commit(struct tdb_context *tdb)
+enum TDB_ERROR tdb_transaction_prepare_commit(struct tdb_context *tdb)
{
- tdb->ecode = _tdb_transaction_prepare_commit(tdb);
- if (tdb->ecode != TDB_SUCCESS)
- return -1;
- return 0;
+ return _tdb_transaction_prepare_commit(tdb);
}
/*
commit the current transaction
*/
-int tdb_transaction_commit(struct tdb_context *tdb)
+enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb)
{
const struct tdb_methods *methods;
int i;
enum TDB_ERROR ecode;
if (tdb->transaction == NULL) {
- tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
- "tdb_transaction_commit: no transaction");
- return -1;
+ return tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+ TDB_LOG_USE_ERROR,
+ "tdb_transaction_commit:"
+ " no transaction");
}
tdb_trace(tdb, "tdb_transaction_commit");
if (tdb->transaction->nesting != 0) {
tdb->transaction->nesting--;
- return 0;
+ return tdb->last_error = TDB_SUCCESS;
}
/* check for a null transaction */
if (tdb->transaction->blocks == NULL) {
_tdb_transaction_cancel(tdb);
- return 0;
+ return tdb->last_error = TDB_SUCCESS;
}
if (!tdb->transaction->prepared) {
- tdb->ecode = _tdb_transaction_prepare_commit(tdb);
- if (tdb->ecode != TDB_SUCCESS)
- return -1;
+ ecode = _tdb_transaction_prepare_commit(tdb);
+ if (ecode != TDB_SUCCESS) {
+ _tdb_transaction_cancel(tdb);
+ return tdb->last_error = ecode;
+ }
}
methods = tdb->transaction->io_methods;
continue;
}
- offset = i * getpagesize();
- length = getpagesize();
+ offset = i * PAGESIZE;
+ length = PAGESIZE;
if (i == tdb->transaction->num_blocks-1) {
length = tdb->transaction->last_block_size;
}
ecode = methods->twrite(tdb, offset,
tdb->transaction->blocks[i], length);
if (ecode != TDB_SUCCESS) {
- tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
- "tdb_transaction_commit:"
- " write failed during commit");
-
/* we've overwritten part of the data and
possibly expanded the file, so we need to
run the crash recovery code */
_tdb_transaction_cancel(tdb);
- return -1;
+ return tdb->last_error = ecode;
}
SAFE_FREE(tdb->transaction->blocks[i]);
}
tdb->transaction->num_blocks = 0;
/* ensure the new data is on disk */
- ecode = transaction_sync(tdb, 0, tdb->map_size);
+ ecode = transaction_sync(tdb, 0, tdb->file->map_size);
if (ecode != TDB_SUCCESS) {
- tdb->ecode = ecode;
- return -1;
+ return tdb->last_error = ecode;
}
/*
#endif
/* use a transaction cancel to free memory and remove the
- transaction locks */
+ transaction locks: it "restores" map_size, too. */
+ tdb->transaction->old_map_size = tdb->file->map_size;
_tdb_transaction_cancel(tdb);
- return 0;
+ return tdb->last_error = TDB_SUCCESS;
}
free(data);
- ecode = transaction_sync(tdb, 0, tdb->map_size);
+ ecode = transaction_sync(tdb, 0, tdb->file->map_size);
if (ecode != TDB_SUCCESS) {
return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
"tdb_transaction_recover:"