+++ /dev/null
-ntdb_add_flag: void (struct ntdb_context *, unsigned int)
-ntdb_append: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA)
-ntdb_chainlock: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
-ntdb_chainlock_read: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
-ntdb_chainunlock: void (struct ntdb_context *, NTDB_DATA)
-ntdb_chainunlock_read: void (struct ntdb_context *, NTDB_DATA)
-ntdb_check_: enum NTDB_ERROR (struct ntdb_context *, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
-ntdb_close: int (struct ntdb_context *)
-ntdb_delete: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
-ntdb_errorstr: const char *(enum NTDB_ERROR)
-ntdb_exists: bool (struct ntdb_context *, NTDB_DATA)
-ntdb_fd: int (const struct ntdb_context *)
-ntdb_fetch: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA *)
-ntdb_firstkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
-ntdb_foreach_: void (int (*)(struct ntdb_context *, void *), void *)
-ntdb_get_attribute: enum NTDB_ERROR (struct ntdb_context *, union ntdb_attribute *)
-ntdb_get_flags: unsigned int (struct ntdb_context *)
-ntdb_get_seqnum: int64_t (struct ntdb_context *)
-ntdb_lockall: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_lockall_read: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_name: const char *(const struct ntdb_context *)
-ntdb_nextkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
-ntdb_open: struct ntdb_context *(const char *, int, int, mode_t, union ntdb_attribute *)
-ntdb_parse_record_: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
-ntdb_remove_flag: void (struct ntdb_context *, unsigned int)
-ntdb_repack: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_set_attribute: enum NTDB_ERROR (struct ntdb_context *, const union ntdb_attribute *)
-ntdb_store: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA, int)
-ntdb_summary: enum NTDB_ERROR (struct ntdb_context *, enum ntdb_summary_flags, char **)
-ntdb_transaction_cancel: void (struct ntdb_context *)
-ntdb_transaction_commit: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_transaction_prepare_commit: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_transaction_start: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_traverse_: int64_t (struct ntdb_context *, int (*)(struct ntdb_context *, NTDB_DATA, NTDB_DATA, void *), void *)
-ntdb_unlockall: void (struct ntdb_context *)
-ntdb_unlockall_read: void (struct ntdb_context *)
-ntdb_unset_attribute: void (struct ntdb_context *, enum ntdb_attribute_type)
-ntdb_wipe_all: enum NTDB_ERROR (struct ntdb_context *)
+++ /dev/null
-../../licenses/LGPL-3
\ No newline at end of file
+++ /dev/null
-CC=gcc
-CFLAGS=-g -O0 -Wall -W -I../../ -I./
-LIBS=
-
-LIBNTDB_OBJ = ccan_hash.o ccan_tally.o check.o free.o hash.o io.o lock.o open.o summary.o ntdb.o transaction.o traverse.o
-
-all: ntdbtorture ntdbtool ntdbdump ntdbrestore ntdbbackup
-
-ntdbtorture: tools/ntdbtorture.c libntdb.a
- $(CC) $(CFLAGS) -o tools/$@ tools/$@.c libntdb.a $(LIBS)
-
-ntdbtool: tools/ntdbtool.c libntdb.a
- $(CC) $(CFLAGS) -o tools/$@ tools/$@.c libntdb.a $(LIBS)
-
-ntdbdump: tools/ntdbdump.c libntdb.a
- $(CC) $(CFLAGS) -o tools/$@ tools/$@.c libntdb.a $(LIBS)
-
-ntdbrestore: tools/ntdbrestore.c libntdb.a
- $(CC) $(CFLAGS) -o tools/$@ tools/$@.c libntdb.a $(LIBS)
-
-ntdbbackup: tools/ntdbbackup.c libntdb.a
- $(CC) $(CFLAGS) -o tools/$@ tools/$@.c libntdb.a $(LIBS)
-
-libntdb.a: $(LIBNTDB_OBJ)
- @echo Creating library $@
- ar r libntdb.a $(LIBNTDB_OBJ)
- ranlib libntdb.a
-
-check.o: check.c
- @echo Compiling $@
- $(CC) $(CFLAGS) -c check.c -o $@
-
-free.o: free.c
- @echo Compiling $@
- $(CC) $(CFLAGS) -c free.c -o $@
-
-hash.o: hash.c
- @echo Compiling $@
- $(CC) $(CFLAGS) -c hash.c -o $@
-
-io.o: io.c
- @echo Compiling $@
- $(CC) $(CFLAGS) -c io.c -o $@
-
-lock.o: lock.c
- @echo Compiling $@
- $(CC) $(CFLAGS) -c lock.c -o $@
-
-open.o: open.c
- @echo Compiling $@
- $(CC) $(CFLAGS) -c open.c -o $@
-
-summary.o: summary.c
- @echo Compiling $@
- $(CC) $(CFLAGS) -c summary.c -o $@
-
-ntdb.o: ntdb.c
- @echo Compiling $@
- $(CC) $(CFLAGS) -c ntdb.c -o $@
-
-transaction.o: transaction.c
- @echo Compiling $@
- $(CC) $(CFLAGS) -c transaction.c -o $@
-
-traverse.o: traverse.c
- @echo Compiling $@
- $(CC) $(CFLAGS) -c traverse.c -o $@
-
-ccan_hash.o: ../hash/hash.c
- @echo Compiling $@
- $(CC) $(CFLAGS) -c ../hash/hash.c -o $@
-
-ccan_tally.o: ../tally/tally.c
- @echo Compiling $@
- $(CC) $(CFLAGS) -c ../tally/tally.c -o $@
-
-clean:
- rm -f *.o
- rm -f *.a
- rm -f tools/ntdbtorture tools/ntdbtool tools/ntdbdump tools/ntdbrestore tools/ntdbbackup
+++ /dev/null
-#include "config.h"
-#include <stdio.h>
-#include <string.h>
-
-/**
- * ntdb - Next Generation Trivial Database
- *
- * This package provides an experimental persistent key/data store.
- * Its main advantage over tdb is that it's 64-bit.
- *
- * Example:
- * #include <stdio.h>
- * #include <err.h>
- * #include <unistd.h>
- * #include <fcntl.h>
- * #include <ccan/ntdb/ntdb.h>
- *
- * int main(int argc, char *argv[])
- * {
- * NTDB_DATA key = ntdb_mkdata("key", 3);
- * NTDB_DATA val = ntdb_mkdata("val", 3);
- * struct ntdb_context *ntdb;
- *
- * ntdb = ntdb_open("example.ntdb", NTDB_DEFAULT,
- * O_RDWR | O_CREAT | O_TRUNC, 0600, NULL);
- * if (ntdb == NULL)
- * errx(1, "failed to open database file");
- *
- * ntdb_store(ntdb, key, val, NTDB_INSERT);
- *
- * ntdb_close(ntdb);
- *
- * return 0;
- * }
- *
- * License: LGPL (v3 or any later version)
- * Authors: Rusty Russell
- * Andrew Tridgell
- * Jeremy Allison
- * Jelmer Vernooij
- * Volker Lendecke
- * Andrew Esh
- * Simon McVittie
- * Tim Potter
- * Maintainer: Rusty Russell <rusty@rustcorp.com.au>
- */
-int main(int argc, char *argv[])
-{
- if (argc != 2)
- return 1;
-
- if (strcmp(argv[1], "depends") == 0) {
- printf("ccan/asearch\n");
- printf("ccan/build_assert\n");
- printf("ccan/cast\n");
- printf("ccan/compiler\n");
- printf("ccan/endian\n");
- printf("ccan/hash\n");
- printf("ccan/ilog\n");
- printf("ccan/likely\n");
- printf("ccan/tally\n");
- printf("ccan/typesafe_cb\n");
- return 0;
- }
-
- if (strcmp(argv[1], "testdepends") == 0) {
- printf("ccan/failtest\n");
- printf("ccan/err\n");
- return 0;
- }
-
- return 1;
-}
+++ /dev/null
- /*
- Trivial Database 2: free list/block handling
- Copyright (C) Rusty Russell 2010
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <ccan/likely/likely.h>
-#include <ccan/asearch/asearch.h>
-
-/* We keep an ordered array of offsets. */
-static bool append(struct ntdb_context *ntdb,
- ntdb_off_t **arr, size_t *num, ntdb_off_t off)
-{
- ntdb_off_t *new;
-
- if (*num == 0) {
- new = ntdb->alloc_fn(ntdb, sizeof(ntdb_off_t), ntdb->alloc_data);
- } else {
- new = ntdb->expand_fn(*arr, (*num + 1) * sizeof(ntdb_off_t),
- ntdb->alloc_data);
- }
- if (!new)
- return false;
- new[(*num)++] = off;
- *arr = new;
- return true;
-}
-
-static enum NTDB_ERROR check_header(struct ntdb_context *ntdb,
- ntdb_off_t *recovery,
- uint64_t *features,
- size_t *num_capabilities)
-{
- uint64_t hash_test;
- struct ntdb_header hdr;
- enum NTDB_ERROR ecode;
- ntdb_off_t off, next;
-
- ecode = ntdb_read_convert(ntdb, 0, &hdr, sizeof(hdr));
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
- /* magic food should not be converted, so convert back. */
- ntdb_convert(ntdb, hdr.magic_food, sizeof(hdr.magic_food));
-
- hash_test = NTDB_HASH_MAGIC;
- hash_test = ntdb_hash(ntdb, &hash_test, sizeof(hash_test));
- if (hdr.hash_test != hash_test) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "check: hash test %llu should be %llu",
- (long long)hdr.hash_test,
- (long long)hash_test);
- }
-
- if (strcmp(hdr.magic_food, NTDB_MAGIC_FOOD) != 0) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "check: bad magic '%.*s'",
- (unsigned)sizeof(hdr.magic_food),
- hdr.magic_food);
- }
-
- /* Features which are used must be a subset of features offered. */
- if (hdr.features_used & ~hdr.features_offered) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "check: features used (0x%llx) which"
- " are not offered (0x%llx)",
- (long long)hdr.features_used,
- (long long)hdr.features_offered);
- }
-
- *features = hdr.features_offered;
- *recovery = hdr.recovery;
- if (*recovery) {
- if (*recovery < sizeof(hdr)
- || *recovery > ntdb->file->map_size) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check:"
- " invalid recovery offset %zu",
- (size_t)*recovery);
- }
- }
-
- for (off = hdr.capabilities; off && ecode == NTDB_SUCCESS; off = next) {
- const struct ntdb_capability *cap;
- enum NTDB_ERROR e;
-
- cap = ntdb_access_read(ntdb, off, sizeof(*cap), true);
- if (NTDB_PTR_IS_ERR(cap)) {
- return NTDB_PTR_ERR(cap);
- }
-
- /* All capabilities are unknown. */
- e = unknown_capability(ntdb, "ntdb_check", cap->type);
- next = cap->next;
- ntdb_access_release(ntdb, cap);
- if (e)
- return e;
- (*num_capabilities)++;
- }
-
- /* Don't check reserved: they *can* be used later. */
- return NTDB_SUCCESS;
-}
-
-static int off_cmp(const ntdb_off_t *a, const ntdb_off_t *b, void *ctx)
-{
- /* Can overflow an int. */
- return *a > *b ? 1
- : *a < *b ? -1
- : 0;
-}
-
-static enum NTDB_ERROR check_entry(struct ntdb_context *ntdb,
- ntdb_off_t off_and_hash,
- ntdb_len_t bucket,
- ntdb_off_t used[],
- size_t num_used,
- size_t *num_found,
- enum NTDB_ERROR (*check)(NTDB_DATA,
- NTDB_DATA,
- void *),
- void *data)
-{
- enum NTDB_ERROR ecode;
- const struct ntdb_used_record *r;
- const unsigned char *kptr;
- ntdb_len_t klen, dlen;
- uint32_t hash;
- ntdb_off_t off = off_and_hash & NTDB_OFF_MASK;
- ntdb_off_t *p;
-
- /* Empty bucket is fine. */
- if (!off_and_hash) {
- return NTDB_SUCCESS;
- }
-
- /* This can't point to a chain, we handled those at toplevel. */
- if (off_and_hash & (1ULL << NTDB_OFF_CHAIN_BIT)) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: Invalid chain bit in offset "
- " %llu", (long long)off_and_hash);
- }
-
- p = asearch(&off, used, num_used, off_cmp, NULL);
- if (!p) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: Invalid offset"
- " %llu in hash", (long long)off);
- }
- /* Mark it invalid. */
- *p ^= 1;
- (*num_found)++;
-
- r = ntdb_access_read(ntdb, off, sizeof(*r), true);
- if (NTDB_PTR_IS_ERR(r)) {
- return NTDB_PTR_ERR(r);
- }
- klen = rec_key_length(r);
- dlen = rec_data_length(r);
- ntdb_access_release(ntdb, r);
-
- kptr = ntdb_access_read(ntdb, off + sizeof(*r), klen + dlen, false);
- if (NTDB_PTR_IS_ERR(kptr)) {
- return NTDB_PTR_ERR(kptr);
- }
-
- hash = ntdb_hash(ntdb, kptr, klen);
-
- /* Are we in the right chain? */
- if (bits_from(hash, 0, ntdb->hash_bits) != bucket) {
- ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "ntdb_check: Bad bucket %u vs %llu",
- bits_from(hash, 0, ntdb->hash_bits),
- (long long)bucket);
- /* Next 8 bits should be the same as top bits of bucket. */
- } else if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL)
- != bits_from(off_and_hash, 64-NTDB_OFF_UPPER_STEAL,
- NTDB_OFF_UPPER_STEAL)) {
- ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "ntdb_check: Bad hash bits %llu vs %llu",
- (long long)off_and_hash,
- (long long)hash);
- } else if (check) {
- NTDB_DATA k, d;
-
- k = ntdb_mkdata(kptr, klen);
- d = ntdb_mkdata(kptr + klen, dlen);
- ecode = check(k, d, data);
- } else {
- ecode = NTDB_SUCCESS;
- }
- ntdb_access_release(ntdb, kptr);
-
- return ecode;
-}
-
-static enum NTDB_ERROR check_hash_chain(struct ntdb_context *ntdb,
- ntdb_off_t off,
- ntdb_len_t bucket,
- ntdb_off_t used[],
- size_t num_used,
- size_t *num_found,
- enum NTDB_ERROR (*check)(NTDB_DATA,
- NTDB_DATA,
- void *),
- void *data)
-{
- struct ntdb_used_record rec;
- enum NTDB_ERROR ecode;
- const ntdb_off_t *entries;
- ntdb_len_t i, num;
-
- /* This is a used entry. */
- (*num_found)++;
-
- ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- if (rec_magic(&rec) != NTDB_CHAIN_MAGIC) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: Bad hash chain magic %llu",
- (long long)rec_magic(&rec));
- }
-
- if (rec_data_length(&rec) % sizeof(ntdb_off_t)) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: Bad hash chain data length %llu",
- (long long)rec_data_length(&rec));
- }
-
- if (rec_key_length(&rec) != 0) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: Bad hash chain key length %llu",
- (long long)rec_key_length(&rec));
- }
-
- off += sizeof(rec);
- num = rec_data_length(&rec) / sizeof(ntdb_off_t);
- entries = ntdb_access_read(ntdb, off, rec_data_length(&rec), true);
- if (NTDB_PTR_IS_ERR(entries)) {
- return NTDB_PTR_ERR(entries);
- }
-
- /* Check each non-deleted entry in chain. */
- for (i = 0; i < num; i++) {
- ecode = check_entry(ntdb, entries[i], bucket,
- used, num_used, num_found, check, data);
- if (ecode) {
- break;
- }
- }
-
- ntdb_access_release(ntdb, entries);
- return ecode;
-}
-
-static enum NTDB_ERROR check_hash(struct ntdb_context *ntdb,
- ntdb_off_t used[],
- size_t num_used,
- size_t num_other_used,
- enum NTDB_ERROR (*check)(NTDB_DATA,
- NTDB_DATA,
- void *),
- void *data)
-{
- enum NTDB_ERROR ecode;
- struct ntdb_used_record rec;
- const ntdb_off_t *entries;
- ntdb_len_t i;
- /* Free tables and capabilities also show up as used, as do we. */
- size_t num_found = num_other_used + 1;
-
- ecode = ntdb_read_convert(ntdb, NTDB_HASH_OFFSET, &rec, sizeof(rec));
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- if (rec_magic(&rec) != NTDB_HTABLE_MAGIC) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: Bad hash table magic %llu",
- (long long)rec_magic(&rec));
- }
-
- if (rec_data_length(&rec) != (sizeof(ntdb_off_t) << ntdb->hash_bits)) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: Bad hash table data length %llu",
- (long long)rec_data_length(&rec));
- }
-
- if (rec_key_length(&rec) != 0) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: Bad hash table key length %llu",
- (long long)rec_key_length(&rec));
- }
-
- entries = ntdb_access_read(ntdb, NTDB_HASH_OFFSET + sizeof(rec),
- rec_data_length(&rec), true);
- if (NTDB_PTR_IS_ERR(entries)) {
- return NTDB_PTR_ERR(entries);
- }
-
- for (i = 0; i < (1 << ntdb->hash_bits); i++) {
- ntdb_off_t off = entries[i] & NTDB_OFF_MASK;
- if (entries[i] & (1ULL << NTDB_OFF_CHAIN_BIT)) {
- ecode = check_hash_chain(ntdb, off, i,
- used, num_used, &num_found,
- check, data);
- } else {
- ecode = check_entry(ntdb, entries[i], i,
- used, num_used, &num_found,
- check, data);
- }
- if (ecode) {
- break;
- }
- }
- ntdb_access_release(ntdb, entries);
-
- if (ecode == NTDB_SUCCESS && num_found != num_used) {
- ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: Not all entries are in hash");
- }
- return ecode;
-}
-
-static enum NTDB_ERROR check_free(struct ntdb_context *ntdb,
- ntdb_off_t off,
- const struct ntdb_free_record *frec,
- ntdb_off_t prev, unsigned int ftable,
- unsigned int bucket)
-{
- enum NTDB_ERROR ecode;
-
- if (frec_magic(frec) != NTDB_FREE_MAGIC) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: offset %llu bad magic 0x%llx",
- (long long)off,
- (long long)frec->magic_and_prev);
- }
- if (frec_ftable(frec) != ftable) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: offset %llu bad freetable %u",
- (long long)off, frec_ftable(frec));
-
- }
-
- ecode = ntdb_oob(ntdb, off,
- frec_len(frec) + sizeof(struct ntdb_used_record),
- false);
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
- if (size_to_bucket(frec_len(frec)) != bucket) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: offset %llu in wrong bucket"
- " (%u vs %u)",
- (long long)off,
- bucket, size_to_bucket(frec_len(frec)));
- }
- if (prev && prev != frec_prev(frec)) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: offset %llu bad prev"
- " (%llu vs %llu)",
- (long long)off,
-				   (long long)prev, (long long)frec_prev(frec));
- }
- return NTDB_SUCCESS;
-}
-
-static enum NTDB_ERROR check_free_table(struct ntdb_context *ntdb,
- ntdb_off_t ftable_off,
- unsigned ftable_num,
- ntdb_off_t fr[],
- size_t num_free,
- size_t *num_found)
-{
- struct ntdb_freetable ft;
- ntdb_off_t h;
- unsigned int i;
- enum NTDB_ERROR ecode;
-
- ecode = ntdb_read_convert(ntdb, ftable_off, &ft, sizeof(ft));
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- if (rec_magic(&ft.hdr) != NTDB_FTABLE_MAGIC
- || rec_key_length(&ft.hdr) != 0
- || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: Invalid header on free table");
- }
-
- for (i = 0; i < NTDB_FREE_BUCKETS; i++) {
- ntdb_off_t off, prev = 0, *p, first = 0;
- struct ntdb_free_record f;
-
- h = bucket_off(ftable_off, i);
- for (off = ntdb_read_off(ntdb, h); off; off = f.next) {
- if (NTDB_OFF_IS_ERR(off)) {
- return NTDB_OFF_TO_ERR(off);
- }
- if (!first) {
- off &= NTDB_OFF_MASK;
- first = off;
- }
- ecode = ntdb_read_convert(ntdb, off, &f, sizeof(f));
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
- ecode = check_free(ntdb, off, &f, prev, ftable_num, i);
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- /* FIXME: Check hash bits */
- p = asearch(&off, fr, num_free, off_cmp, NULL);
- if (!p) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "ntdb_check: Invalid offset"
- " %llu in free table",
- (long long)off);
- }
- /* Mark it invalid. */
- *p ^= 1;
- (*num_found)++;
- prev = off;
- }
-
- if (first) {
- /* Now we can check first back pointer. */
- ecode = ntdb_read_convert(ntdb, first, &f, sizeof(f));
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
- ecode = check_free(ntdb, first, &f, prev, ftable_num, i);
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
- }
- }
- return NTDB_SUCCESS;
-}
-
-/* Slow, but should be very rare. */
-ntdb_off_t dead_space(struct ntdb_context *ntdb, ntdb_off_t off)
-{
- size_t len;
- enum NTDB_ERROR ecode;
-
- for (len = 0; off + len < ntdb->file->map_size; len++) {
- char c;
- ecode = ntdb->io->tread(ntdb, off, &c, 1);
- if (ecode != NTDB_SUCCESS) {
- return NTDB_ERR_TO_OFF(ecode);
- }
- if (c != 0 && c != 0x43)
- break;
- }
- return len;
-}
-
-static enum NTDB_ERROR check_linear(struct ntdb_context *ntdb,
- ntdb_off_t **used, size_t *num_used,
- ntdb_off_t **fr, size_t *num_free,
- uint64_t features, ntdb_off_t recovery)
-{
- ntdb_off_t off;
- ntdb_len_t len;
- enum NTDB_ERROR ecode;
- bool found_recovery = false;
-
- for (off = sizeof(struct ntdb_header);
- off < ntdb->file->map_size;
- off += len) {
- union {
- struct ntdb_used_record u;
- struct ntdb_free_record f;
- struct ntdb_recovery_record r;
- } rec;
- /* r is larger: only get that if we need to. */
- ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.f));
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- /* If we crash after ftruncate, we can get zeroes or fill. */
- if (rec.r.magic == NTDB_RECOVERY_INVALID_MAGIC
- || rec.r.magic == 0x4343434343434343ULL) {
- ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.r));
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
- if (recovery == off) {
- found_recovery = true;
- len = sizeof(rec.r) + rec.r.max_len;
- } else {
- len = dead_space(ntdb, off);
- if (NTDB_OFF_IS_ERR(len)) {
- return NTDB_OFF_TO_ERR(len);
- }
- if (len < sizeof(rec.r)) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "ntdb_check: invalid"
- " dead space at %zu",
- (size_t)off);
- }
-
- ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
- "Dead space at %zu-%zu (of %zu)",
- (size_t)off, (size_t)(off + len),
- (size_t)ntdb->file->map_size);
- }
- } else if (rec.r.magic == NTDB_RECOVERY_MAGIC) {
- ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.r));
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
- if (recovery != off) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "ntdb_check: unexpected"
- " recovery record at offset"
- " %zu",
- (size_t)off);
- }
- if (rec.r.len > rec.r.max_len) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "ntdb_check: invalid recovery"
- " length %zu",
- (size_t)rec.r.len);
- }
- if (rec.r.eof > ntdb->file->map_size) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "ntdb_check: invalid old EOF"
- " %zu", (size_t)rec.r.eof);
- }
- found_recovery = true;
- len = sizeof(rec.r) + rec.r.max_len;
- } else if (frec_magic(&rec.f) == NTDB_FREE_MAGIC) {
- len = sizeof(rec.u) + frec_len(&rec.f);
- if (off + len > ntdb->file->map_size) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "ntdb_check: free overlength"
- " %llu at offset %llu",
- (long long)len,
- (long long)off);
- }
- /* This record should be in free lists. */
- if (frec_ftable(&rec.f) != NTDB_FTABLE_NONE
- && !append(ntdb, fr, num_free, off)) {
- return ntdb_logerr(ntdb, NTDB_ERR_OOM,
- NTDB_LOG_ERROR,
- "ntdb_check: tracking %zu'th"
- " free record.", *num_free);
- }
- } else if (rec_magic(&rec.u) == NTDB_USED_MAGIC
- || rec_magic(&rec.u) == NTDB_CHAIN_MAGIC
- || rec_magic(&rec.u) == NTDB_HTABLE_MAGIC
- || rec_magic(&rec.u) == NTDB_FTABLE_MAGIC
- || rec_magic(&rec.u) == NTDB_CAP_MAGIC) {
- uint64_t klen, dlen, extra;
-
- /* This record is used! */
- if (!append(ntdb, used, num_used, off)) {
- return ntdb_logerr(ntdb, NTDB_ERR_OOM,
- NTDB_LOG_ERROR,
- "ntdb_check: tracking %zu'th"
- " used record.", *num_used);
- }
-
- klen = rec_key_length(&rec.u);
- dlen = rec_data_length(&rec.u);
- extra = rec_extra_padding(&rec.u);
-
- len = sizeof(rec.u) + klen + dlen + extra;
- if (off + len > ntdb->file->map_size) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "ntdb_check: used overlength"
- " %llu at offset %llu",
- (long long)len,
- (long long)off);
- }
-
- if (len < sizeof(rec.f)) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "ntdb_check: too short record"
- " %llu at %llu",
- (long long)len,
- (long long)off);
- }
-
- /* Check that records have correct 0 at end (but may
- * not in future). */
- if (extra && !features
- && rec_magic(&rec.u) != NTDB_CAP_MAGIC) {
- const char *p;
- char c;
- p = ntdb_access_read(ntdb, off + sizeof(rec.u)
- + klen + dlen, 1, false);
- if (NTDB_PTR_IS_ERR(p))
- return NTDB_PTR_ERR(p);
- c = *p;
- ntdb_access_release(ntdb, p);
-
- if (c != '\0') {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "ntdb_check:"
- " non-zero extra"
- " at %llu",
- (long long)off);
- }
- }
- } else {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "ntdb_check: Bad magic 0x%llx"
- " at offset %zu",
- (long long)rec_magic(&rec.u),
- (size_t)off);
- }
- }
-
- /* We must have found recovery area if there was one. */
- if (recovery != 0 && !found_recovery) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: expected a recovery area at %zu",
- (size_t)recovery);
- }
-
- return NTDB_SUCCESS;
-}
-
-_PUBLIC_ enum NTDB_ERROR ntdb_check_(struct ntdb_context *ntdb,
- enum NTDB_ERROR (*check)(NTDB_DATA, NTDB_DATA, void *),
- void *data)
-{
- ntdb_off_t *fr = NULL, *used = NULL;
- ntdb_off_t ft = 0, recovery = 0;
- size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0,
- num_capabilities = 0;
- uint64_t features = 0;
- enum NTDB_ERROR ecode;
-
- if (ntdb->flags & NTDB_CANT_CHECK) {
- return ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
- "ntdb_check: database has unknown capability,"
- " cannot check.");
- }
-
- ecode = ntdb_allrecord_lock(ntdb, F_RDLCK, NTDB_LOCK_WAIT, false);
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- ecode = ntdb_lock_expand(ntdb, F_RDLCK);
- if (ecode != NTDB_SUCCESS) {
- ntdb_allrecord_unlock(ntdb, F_RDLCK);
- return ecode;
- }
-
- ecode = check_header(ntdb, &recovery, &features, &num_capabilities);
- if (ecode != NTDB_SUCCESS)
- goto out;
-
- /* First we do a linear scan, checking all records. */
- ecode = check_linear(ntdb, &used, &num_used, &fr, &num_free, features,
- recovery);
- if (ecode != NTDB_SUCCESS)
- goto out;
-
- for (ft = first_ftable(ntdb); ft; ft = next_ftable(ntdb, ft)) {
- if (NTDB_OFF_IS_ERR(ft)) {
- ecode = NTDB_OFF_TO_ERR(ft);
- goto out;
- }
- ecode = check_free_table(ntdb, ft, num_ftables, fr, num_free,
- &num_found);
- if (ecode != NTDB_SUCCESS)
- goto out;
- num_ftables++;
- }
-
- /* FIXME: Check key uniqueness? */
- ecode = check_hash(ntdb, used, num_used, num_ftables + num_capabilities,
- check, data);
- if (ecode != NTDB_SUCCESS)
- goto out;
-
- if (num_found != num_free) {
- ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "ntdb_check: Not all entries are in"
- " free table");
- }
-
-out:
- ntdb_allrecord_unlock(ntdb, F_RDLCK);
- ntdb_unlock_expand(ntdb, F_RDLCK);
- ntdb->free_fn(fr, ntdb->alloc_data);
- ntdb->free_fn(used, ntdb->alloc_data);
- return ecode;
-}
+++ /dev/null
-Interface differences between TDB and NTDB.
-
-- ntdb shares 'struct TDB_DATA' with tdb, but TDB defines the TDB_DATA
- typedef, whereas ntdb defines NTDB_DATA (ie. both are compatible).
- If you include both ntdb.h and tdb.h, #include tdb.h first,
-  otherwise you'll get a compile error when tdb.h redefines struct
- TDB_DATA.
-
- Example:
- #include <tdb.h>
- #include <ntdb.h>
-
-- ntdb functions return NTDB_SUCCESS (ie 0) on success, and a negative
- error on failure, whereas tdb functions returned 0 on success, and
- -1 on failure. tdb then used tdb_error() to determine the error;
-  this API is nasty if we ever want to support threads, so ntdb does not provide it.
-
- Example:
- #include <tdb.h>
- #include <ntdb.h>
-
- void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
- {
-		if (tdb_store(tdb, key, d, TDB_REPLACE) == -1) {
- printf("store failed: %s\n", tdb_errorstr(tdb));
- }
- }
-
- void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
- {
- enum NTDB_ERROR e;
-
-		e = ntdb_store(ntdb, key, d, NTDB_REPLACE);
- if (e) {
- printf("store failed: %s\n", ntdb_errorstr(e));
- }
- }
-
-- ntdb's ntdb_fetch() returns an error, tdb's returned the data directly
- (or tdb_null, and you were supposed to check tdb_error() to find out why).
-
- Example:
- #include <tdb.h>
- #include <ntdb.h>
-
- void tdb_example(struct tdb_context *tdb, TDB_DATA key)
- {
- TDB_DATA data;
-
- data = tdb_fetch(tdb, key);
- if (!data.dptr) {
- printf("fetch failed: %s\n", tdb_errorstr(tdb));
- }
- }
-
- void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key)
- {
- NTDB_DATA data;
- enum NTDB_ERROR e;
-
- e = ntdb_fetch(ntdb, key, &data);
- if (e) {
- printf("fetch failed: %s\n", ntdb_errorstr(e));
- }
- }
-
-- ntdb's ntdb_nextkey() frees the old key's dptr; in tdb you needed to do
- this manually.
-
- Example:
- #include <tdb.h>
- #include <ntdb.h>
-
- void tdb_example(struct tdb_context *tdb)
- {
- TDB_DATA key, next, data;
-
- for (key = tdb_firstkey(tdb); key.dptr; key = next) {
- printf("Got key!\n");
- next = tdb_nextkey(tdb, key);
- free(key.dptr);
- }
- }
-
-
- void ntdb_example(struct ntdb_context *ntdb)
- {
- NTDB_DATA k, data;
- enum NTDB_ERROR e;
-
- for (e = ntdb_firstkey(ntdb,&k); !e; e = ntdb_nextkey(ntdb,&k))
- printf("Got key!\n");
- }
-
-- Unlike tdb_open/tdb_open_ex, ntdb_open does not allow NULL names,
- even for NTDB_INTERNAL dbs, and thus ntdb_name() never returns NULL.
-
- Example:
- #include <tdb.h>
- #include <ntdb.h>
-
- struct tdb_context *tdb_example(void)
- {
- return tdb_open(NULL, 0, TDB_INTERNAL, O_RDWR, 0);
- }
-
- struct ntdb_context *ntdb_example(void)
- {
-		return ntdb_open("example", NTDB_INTERNAL, O_RDWR, 0, NULL);
- }
-
-- ntdb uses a linked list of attribute structures to implement logging and
- alternate hashes. tdb used tdb_open_ex, which was not extensible.
-
- Example:
- #include <tdb.h>
- #include <ntdb.h>
-
- /* Custom hash function */
- static unsigned int my_tdb_hash_func(TDB_DATA *key)
- {
- return key->dsize;
- }
-
- struct tdb_context *tdb_example(void)
- {
- return tdb_open_ex("example.tdb", 0, TDB_DEFAULT,
-				   O_CREAT|O_RDWR, 0600, NULL, my_tdb_hash_func);
- }
-
- /* Custom hash function */
- static unsigned int my_ntdb_hash_func(const void *key, size_t len,
- uint32_t seed, void *data)
- {
- return len;
- }
-
- struct ntdb_context *ntdb_example(void)
- {
- union ntdb_attribute hash;
-
- hash.base.attr = NTDB_ATTRIBUTE_HASH;
- hash.base.next = NULL;
- hash.hash.fn = my_ntdb_hash_func;
- return ntdb_open("example.ntdb", NTDB_DEFAULT,
- O_CREAT|O_RDWR, 0600, &hash);
- }
-
-- tdb's tdb_open/tdb_open_ex took an explicit hash size, defaulting to
-  131. ntdb uses an attribute for this, defaulting to 8192.
-
- Example:
- #include <tdb.h>
- #include <ntdb.h>
-
- struct tdb_context *tdb_example(void)
- {
- return tdb_open("example.tdb", 10007, TDB_DEFAULT,
- O_CREAT|O_RDWR, 0600);
- }
-
- struct ntdb_context *ntdb_example(void)
- {
- union ntdb_attribute hashsize;
-
- hashsize.base.attr = NTDB_ATTRIBUTE_HASHSIZE;
- hashsize.base.next = NULL;
- hashsize.hashsize.size = 16384;
- return ntdb_open("example.ntdb", NTDB_DEFAULT,
- O_CREAT|O_RDWR, 0600, &hashsize);
- }
-
-- ntdb's log function is simpler than tdb's log function. The string
- is already formatted, is not terminated by a '\n', and it takes an
-  enum ntdb_log_level, not a tdb_debug_level, which has only three
- values: NTDB_LOG_ERROR, NTDB_LOG_USE_ERROR and NTDB_LOG_WARNING.
-
- #include <tdb.h>
- #include <ntdb.h>
-
- static void tdb_log(struct tdb_context *tdb,
- enum tdb_debug_level level, const char *fmt, ...)
- {
- va_list ap;
- const char *name;
-
- switch (level) {
- case TDB_DEBUG_FATAL:
- fprintf(stderr, "FATAL: ");
- break;
- case TDB_DEBUG_ERROR:
- fprintf(stderr, "ERROR: ");
- break;
- case TDB_DEBUG_WARNING:
- fprintf(stderr, "WARNING: ");
- break;
- case TDB_DEBUG_TRACE:
- /* Don't print out tracing. */
- return;
- }
-
- name = tdb_name(tdb);
- if (!name) {
- name = "unnamed";
- }
-
- fprintf(stderr, "tdb(%s):", name);
-
- va_start(ap, fmt);
- vfprintf(stderr, fmt, ap);
- va_end(ap);
- }
-
- struct tdb_context *tdb_example(void)
- {
- struct tdb_logging_context lctx;
-
- lctx.log_fn = tdb_log;
- return tdb_open_ex("example.tdb", 0, TDB_DEFAULT,
- O_CREAT|O_RDWR, 0600, &lctx, NULL);
- }
-
- static void ntdb_log(struct ntdb_context *ntdb,
- enum ntdb_log_level level,
- enum NTDB_ERROR ecode,
- const char *message,
- void *data)
- {
- switch (level) {
- case NTDB_LOG_ERROR:
- fprintf(stderr, "ERROR: ");
- break;
- case NTDB_LOG_USE_ERROR:
- /* We made a mistake, so abort. */
- abort();
- break;
- case NTDB_LOG_WARNING:
- fprintf(stderr, "WARNING: ");
- break;
- }
-
- fprintf(stderr, "ntdb(%s):%s:%s\n",
- ntdb_name(ntdb), ntdb_errorstr(ecode), message);
- }
-
- struct ntdb_context *ntdb_example(void)
- {
- union ntdb_attribute log;
-
- log.base.attr = NTDB_ATTRIBUTE_LOG;
- log.base.next = NULL;
- log.log.fn = ntdb_log;
- return ntdb_open("example.ntdb", NTDB_DEFAULT,
- O_CREAT|O_RDWR, 0600, &log);
- }
-
-- ntdb provides ntdb_deq() for comparing two NTDB_DATA, and ntdb_mkdata() for
- creating an NTDB_DATA.
-
- #include <tdb.h>
- #include <ntdb.h>
-
-	void tdb_example(struct tdb_context *tdb)
-	{
-		TDB_DATA data, key;
-
-		key.dsize = strlen("hello");
-		key.dptr = (unsigned char *)"hello";
-		data = tdb_fetch(tdb, key);
-		if (data.dsize == key.dsize
-		    && !memcmp(data.dptr, key.dptr, key.dsize)) {
-			printf("key is same as data\n");
-		}
-		free(data.dptr);
-	}
-
- void ntdb_example(struct ntdb_context *ntdb)
- {
- NTDB_DATA data, key;
-
- key = ntdb_mkdata("hello", strlen("hello"));
- if (ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS) {
- if (ntdb_deq(key, data)) {
- printf("key is same as data\n");
- }
- free(data.dptr);
- }
- }
-
-- ntdb's ntdb_parse_record() takes a type-checked callback data
- pointer, not a void * (though a void * pointer still works). The
- callback function is allowed to do read operations on the database,
- or write operations if you first call ntdb_lockall(). TDB's
- tdb_parse_record() did not allow any database access within the
-  callback, and could crash if you tried.
-
- Example:
- #include <tdb.h>
- #include <ntdb.h>
-
- static int tdb_parser(TDB_DATA key, TDB_DATA data, void *private_data)
- {
- TDB_DATA *expect = private_data;
-
- return data.dsize == expect->dsize
- && !memcmp(data.dptr, expect->dptr, data.dsize);
- }
-
-	void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
- {
- switch (tdb_parse_record(tdb, key, tdb_parser, &d)) {
- case -1:
- printf("parse failed: %s\n", tdb_errorstr(tdb));
- break;
- case 0:
- printf("data was different!\n");
- break;
- case 1:
- printf("data was same!\n");
- break;
- }
- }
-
-	static enum NTDB_ERROR ntdb_parser(NTDB_DATA key, NTDB_DATA data,
-					   NTDB_DATA *expect)
- {
- return ntdb_deq(data, *expect);
- }
-
- void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
- {
- enum NTDB_ERROR e;
-
-		e = ntdb_parse_record(ntdb, key, ntdb_parser, &d);
- switch (e) {
- case 0:
- printf("data was different!\n");
- break;
- case 1:
- printf("data was same!\n");
- break;
- default:
- printf("parse failed: %s\n", ntdb_errorstr(e));
- break;
- }
- }
-
-- ntdb does locking on read-only databases (ie. O_RDONLY passed to ntdb_open).
- tdb did not: use the NTDB_NOLOCK flag if you want to suppress locking.
-
- Example:
- #include <tdb.h>
- #include <ntdb.h>
-
- struct tdb_context *tdb_example(void)
- {
- return tdb_open("example.tdb", 0, TDB_DEFAULT, O_RDONLY, 0);
- }
-
- struct ntdb_context *ntdb_example(void)
- {
-		return ntdb_open("example.ntdb", NTDB_NOLOCK, O_RDONLY, 0, NULL);
- }
-
-- Failure inside a transaction (such as a lock function failing) does
- not implicitly cancel the transaction; you still need to call
- ntdb_transaction_cancel().
-
- #include <tdb.h>
- #include <ntdb.h>
-
- void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
- {
- if (tdb_transaction_start(tdb) == -1) {
- printf("transaction failed: %s\n", tdb_errorstr(tdb));
- return;
- }
-
-		if (tdb_store(tdb, key, d, TDB_REPLACE) == -1) {
- printf("store failed: %s\n", tdb_errorstr(tdb));
- return;
- }
- if (tdb_transaction_commit(tdb) == -1) {
- printf("commit failed: %s\n", tdb_errorstr(tdb));
- }
- }
-
- void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
- {
- enum NTDB_ERROR e;
-
- e = ntdb_transaction_start(ntdb);
- if (e) {
- printf("transaction failed: %s\n", ntdb_errorstr(e));
- return;
- }
-
-		e = ntdb_store(ntdb, key, d, NTDB_REPLACE);
-		if (e) {
-			printf("store failed: %s\n", ntdb_errorstr(e));
-			ntdb_transaction_cancel(ntdb);
-			return;
-		}
-
- e = ntdb_transaction_commit(ntdb);
- if (e) {
- printf("commit failed: %s\n", ntdb_errorstr(e));
- }
- }
-
-- There is no NTDB_CLEAR_IF_FIRST flag; it has severe scalability and
- API problems. If necessary, you can emulate this by using the open
- hook and placing a 1-byte lock at offset 4. If your program forks
- and exits, you will need to place this lock again in the child before
- the parent exits.
-
- Example:
-
- #include <tdb.h>
- #include <ntdb.h>
-
- struct tdb_context *tdb_example(void)
- {
- return tdb_open("example.tdb", 0, TDB_CLEAR_IF_FIRST,
- O_CREAT|O_RDWR, 0600);
- }
-
- static enum NTDB_ERROR clear_if_first(int fd, void *unused)
- {
- /* We hold a lock offset 4 always, so we can tell if
- * anyone else is. */
- struct flock fl;
-
- fl.l_type = F_WRLCK;
- fl.l_whence = SEEK_SET;
- fl.l_start = 4; /* ACTIVE_LOCK */
- fl.l_len = 1;
-
- if (fcntl(fd, F_SETLK, &fl) == 0) {
- /* We must be first ones to open it! Clear it. */
- if (ftruncate(fd, 0) != 0) {
- return NTDB_ERR_IO;
- }
- }
- fl.l_type = F_RDLCK;
- if (fcntl(fd, F_SETLKW, &fl) != 0) {
- return NTDB_ERR_IO;
- }
- return NTDB_SUCCESS;
- }
-
- struct ntdb_context *ntdb_example(void)
- {
- union ntdb_attribute open_attr;
-
- open_attr.openhook.base.attr = NTDB_ATTRIBUTE_OPENHOOK;
- open_attr.openhook.base.next = NULL;
- open_attr.openhook.fn = clear_if_first;
-
- return ntdb_open("example.ntdb", NTDB_DEFAULT,
- O_CREAT|O_RDWR, 0600, &open_attr);
- }
-
-- ntdb traversals are not reliable if the database is changed during
- the traversal, ie your traversal may not cover all elements, or may
- cover elements multiple times. As a special exception, deleting the
- current record within ntdb_traverse() is reliable.
-
-- There is no ntdb_traverse_read, since ntdb_traverse does not hold
- a lock across the entire traversal anyway. If you want to make sure
- that your traversal function does not write to the database, you can
- set and clear the NTDB_RDONLY flag around the traversal.
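-
-  Example (a sketch of the NTDB_RDONLY trick; the callback here merely
-  counts records):
-
-	#include <ntdb.h>
-
-	static int count_record(struct ntdb_context *ntdb,
-				NTDB_DATA key, NTDB_DATA data, void *p)
-	{
-		(*(size_t *)p)++;
-		return 0;
-	}
-
-	size_t ntdb_example(struct ntdb_context *ntdb)
-	{
-		size_t count = 0;
-
-		ntdb_add_flag(ntdb, NTDB_RDONLY);
-		ntdb_traverse(ntdb, count_record, &count);
-		ntdb_remove_flag(ntdb, NTDB_RDONLY);
-		return count;
-	}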
-
-- ntdb does not need tdb_reopen() or tdb_reopen_all(). If you call
-  fork() during certain operations, the child should close the ntdb,
-  or complete the operations before continuing to use the ntdb (see
-  the sketch after this list):
-
- ntdb_transaction_start(): child must ntdb_transaction_cancel()
- ntdb_lockall(): child must call ntdb_unlockall()
- ntdb_lockall_read(): child must call ntdb_unlockall_read()
- ntdb_chainlock(): child must call ntdb_chainunlock()
- ntdb_parse() callback: child must return from ntdb_parse()
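-
-  Example (a minimal sketch of the transaction case from the list
-  above):
-
-	#include <ntdb.h>
-	#include <unistd.h>
-
-	void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
-	{
-		if (ntdb_transaction_start(ntdb) != NTDB_SUCCESS)
-			return;
-
-		if (fork() == 0) {
-			/* Child: cancel rather than inherit the transaction. */
-			ntdb_transaction_cancel(ntdb);
-			ntdb_close(ntdb);
-			_exit(0);
-		}
-
-		if (ntdb_store(ntdb, key, d, NTDB_REPLACE) != NTDB_SUCCESS) {
-			ntdb_transaction_cancel(ntdb);
-			return;
-		}
-		ntdb_transaction_commit(ntdb);
-	}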
-
-- ntdb will not open a non-ntdb file, even if O_CREAT is specified. tdb
- will overwrite an unknown file in that case.
+++ /dev/null
-#LyX 2.0 created this file. For more info see http://www.lyx.org/
-\lyxformat 413
-\begin_document
-\begin_header
-\textclass article
-\use_default_options true
-\maintain_unincluded_children false
-\language english
-\language_package default
-\inputencoding auto
-\fontencoding global
-\font_roman default
-\font_sans default
-\font_typewriter default
-\font_default_family default
-\use_non_tex_fonts false
-\font_sc false
-\font_osf false
-\font_sf_scale 100
-\font_tt_scale 100
-
-\graphics default
-\default_output_format default
-\output_sync 0
-\bibtex_command default
-\index_command default
-\paperfontsize default
-\use_hyperref false
-\papersize default
-\use_geometry false
-\use_amsmath 1
-\use_esint 1
-\use_mhchem 1
-\use_mathdots 1
-\cite_engine basic
-\use_bibtopic false
-\use_indices false
-\paperorientation portrait
-\suppress_date false
-\use_refstyle 0
-\index Index
-\shortcut idx
-\color #008000
-\end_index
-\secnumdepth 3
-\tocdepth 3
-\paragraph_separation indent
-\paragraph_indentation default
-\quotes_language english
-\papercolumns 1
-\papersides 1
-\paperpagestyle default
-\tracking_changes true
-\output_changes true
-\html_math_output 0
-\html_css_as_file 0
-\html_be_strict false
-\end_header
-
-\begin_body
-
-\begin_layout Title
-NTDB: Redesigning The Trivial DataBase
-\end_layout
-
-\begin_layout Author
-Rusty Russell, IBM Corporation
-\end_layout
-
-\begin_layout Date
-19 June 2012
-\end_layout
-
-\begin_layout Abstract
-The Trivial DataBase on-disk format is 32 bits; with usage cases heading
- towards the 4G limit, that must change.
- This required breakage provides an opportunity to revisit TDB's other design
- decisions and reassess them.
-\end_layout
-
-\begin_layout Section
-Introduction
-\end_layout
-
-\begin_layout Standard
-The Trivial DataBase was originally written by Andrew Tridgell as a simple
- key/data pair storage system with the same API as dbm, but allowing multiple
- readers and writers while being small enough (< 1000 lines of C) to include
- in SAMBA.
- The simple design created in 1999 has proven surprisingly robust and performant
-, used in Samba versions 3 and 4 as well as numerous other projects.
- Its useful life was greatly increased by the (backwards-compatible!) addition
- of transaction support in 2005.
-\end_layout
-
-\begin_layout Standard
-The wider variety and greater demands of TDB-using code have led to some
- organic growth of the API, as well as some compromises on the implementation.
- None of these, by themselves, are seen as show-stoppers, but the cumulative
- effect is a loss of elegance over the initial, simple TDB implementation.
- Here is a table of the approximate number of lines of implementation code
- and number of API functions at the end of each year:
-\end_layout
-
-\begin_layout Standard
-\begin_inset Tabular
-<lyxtabular version="3" rows="12" columns="3">
-<features tabularvalignment="middle">
-<column alignment="center" valignment="top" width="0">
-<column alignment="center" valignment="top" width="0">
-<column alignment="center" valignment="top" width="0">
-<row>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-Year End
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-API Functions
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-Lines of C Code Implementation
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-1999
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-13
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-1195
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2000
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-24
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-1725
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2001
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-32
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2228
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2002
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-35
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2481
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2003
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-35
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2552
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2004
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-40
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2584
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2005
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-38
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2647
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2006
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-52
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-3754
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2007
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-66
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-4398
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2008
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-71
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-4768
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2009
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-73
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-5715
-\end_layout
-
-\end_inset
-</cell>
-</row>
-</lyxtabular>
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Standard
-This review is an attempt to catalog and address all the known issues with
- TDB and create solutions which address the problems without significantly
- increasing complexity; all involved are far too aware of the dangers of
- second system syndrome in rewriting a successful project like this.
-\end_layout
-
-\begin_layout Standard
-Note: the final decision was to make ntdb a separate library, with a separate
- 'ntdb' namespace so both can potentially be linked together.
- This document still refers to
-\begin_inset Quotes eld
-\end_inset
-
-tdb
-\begin_inset Quotes erd
-\end_inset
-
- everywhere, for simplicity.
-\end_layout
-
-\begin_layout Section
-API Issues
-\end_layout
-
-\begin_layout Subsection
-tdb_open_ex Is Not Expandable
-\end_layout
-
-\begin_layout Standard
-The tdb_open() call was expanded to tdb_open_ex(), which added an optional
- hashing function and an optional logging function argument.
- Additional arguments to open would require the introduction of a tdb_open_ex2
- call etc.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\begin_inset CommandInset label
-LatexCommand label
-name "attributes"
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Standard
-tdb_open() will take a linked-list of attributes:
-\end_layout
-
-\begin_layout LyX-Code
-enum tdb_attribute {
-\end_layout
-
-\begin_layout LyX-Code
- TDB_ATTRIBUTE_LOG = 0,
-\end_layout
-
-\begin_layout LyX-Code
- TDB_ATTRIBUTE_HASH = 1
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_attribute_base {
-\end_layout
-
-\begin_layout LyX-Code
- enum tdb_attribute attr;
-\end_layout
-
-\begin_layout LyX-Code
- union tdb_attribute *next;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_attribute_log {
-\end_layout
-
-\begin_layout LyX-Code
- struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
-\end_layout
-
-\begin_layout LyX-Code
- tdb_log_func log_fn;
-\end_layout
-
-\begin_layout LyX-Code
- void *log_private;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_attribute_hash {
-\end_layout
-
-\begin_layout LyX-Code
- struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
-\end_layout
-
-\begin_layout LyX-Code
- tdb_hash_func hash_fn;
-\end_layout
-
-\begin_layout LyX-Code
- void *hash_private;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-union tdb_attribute {
-\end_layout
-
-\begin_layout LyX-Code
- struct tdb_attribute_base base;
-\end_layout
-
-\begin_layout LyX-Code
- struct tdb_attribute_log log;
-\end_layout
-
-\begin_layout LyX-Code
- struct tdb_attribute_hash hash;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-This allows future attributes to be added, even if this expands the size
- of the union.
-\end_layout
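-
-\begin_layout Standard
-For example, a caller wanting both a custom log function and a custom
- hash function would chain the two attributes via their next pointers
- and pass the head of the chain to tdb_open() (a sketch only; my_log_fn
- and my_hash_fn stand for the caller's own functions):
-\end_layout
-
-\begin_layout LyX-Code
-union tdb_attribute log, hash;
-\end_layout
-
-\begin_layout LyX-Code
-log.base.attr = TDB_ATTRIBUTE_LOG;
-\end_layout
-
-\begin_layout LyX-Code
-log.log.log_fn = my_log_fn;
-\end_layout
-
-\begin_layout LyX-Code
-log.base.next = &hash;
-\end_layout
-
-\begin_layout LyX-Code
-hash.base.attr = TDB_ATTRIBUTE_HASH;
-\end_layout
-
-\begin_layout LyX-Code
-hash.hash.hash_fn = my_hash_fn;
-\end_layout
-
-\begin_layout LyX-Code
-hash.base.next = NULL;
-\end_layout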
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-tdb_traverse Makes Impossible Guarantees
-\end_layout
-
-\begin_layout Standard
-tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
- was thought that it was important to guarantee that all records which exist
- at the start and end of the traversal would be included, and no record
- would be included twice.
-\end_layout
-
-\begin_layout Standard
-This adds complexity (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "Reliable-Traversal-Adds"
-
-\end_inset
-
-) and does not work anyway for records which are altered (in particular,
- those which are expanded may be effectively deleted and re-added behind
- the traversal).
-\end_layout
-
-\begin_layout Subsubsection
-\begin_inset CommandInset label
-LatexCommand label
-name "traverse-Proposed-Solution"
-
-\end_inset
-
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Abandon the guarantee.
- You will see every record if no changes occur during your traversal, otherwise
- you will see some subset.
- You can prevent changes by using a transaction or the locking API.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
- Delete-during-traverse will still delete every record, too (assuming no
- other changes).
-\end_layout
-
-\begin_layout Subsection
-Nesting of Transactions Is Fraught
-\end_layout
-
-\begin_layout Standard
-TDB has alternated between allowing nested transactions and not allowing
- them.
- Various paths in the Samba codebase assume that transactions will nest,
- and in a sense they can: the operation is only committed to disk when the
- outer transaction is committed.
- There are two problems, however:
-\end_layout
-
-\begin_layout Enumerate
-Canceling the inner transaction will cause the outer transaction commit
- to fail, and will not undo any operations since the inner transaction began.
- This problem is soluble with some additional internal code.
-\end_layout
-
-\begin_layout Enumerate
-An inner transaction commit can be cancelled by the outer transaction.
- This is desirable in the way which Samba's database initialization code
- uses transactions, but could be a surprise to any users expecting a successful
- transaction commit to expose changes to others.
-\end_layout
-
-\begin_layout Standard
-The current solution is to specify the behavior at tdb_open(), with the
- default currently that nested transactions are allowed.
- This flag can also be changed at runtime.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Given the usage patterns, it seems that the
-\begin_inset Quotes eld
-\end_inset
-
-least-surprise
-\begin_inset Quotes erd
-\end_inset
-
- behavior of disallowing nested transactions should become the default.
- Additionally, it seems the outer transaction is the only code which knows
- whether inner transactions should be allowed, so a flag to indicate this
- could be added to tdb_transaction_start.
- However, this behavior can be simulated with a wrapper which uses tdb_add_flags
-() and tdb_remove_flags(), so the API should not be expanded for this relatively
--obscure case.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete; the nesting flag has been removed.
-\end_layout
-
-\begin_layout Subsection
-Incorrect Hash Function is Not Detected
-\end_layout
-
-\begin_layout Standard
-tdb_open_ex() allows the calling code to specify a different hash function
- to use, but does not check that all other processes accessing this tdb
- are using the same hash function.
- The result is that records are missing from tdb_fetch().
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The header should contain an example hash result (eg.
- the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
- hash function produces the same answer, or fail the tdb_open call.
-\end_layout
-
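-\begin_layout Standard
-As a rough sketch (the structure and function names here are purely illustrative,
- not the actual TDB internals), the open-time check might look like this:
-\end_layout
-
-\begin_layout LyX-Code
-/* Illustrative sketch: verify the caller's hash function at open time. */
-\end_layout
-
-\begin_layout LyX-Code
-uint32_t example = 0xdeadbeef;
-\end_layout
-
-\begin_layout LyX-Code
-if (hash_fn(&example, sizeof(example), hash_private) != hdr->example_hash) {
-\end_layout
-
-\begin_layout LyX-Code
-        /* Caller is using a different hash function: fail the open. */
-\end_layout
-
-\begin_layout LyX-Code
-        errno = EINVAL;
-\end_layout
-
-\begin_layout LyX-Code
-        return NULL;
-\end_layout
-
-\begin_layout LyX-Code
-}
-\end_layout
-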
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-tdb_set_max_dead/TDB_VOLATILE Expose Implementation
-\end_layout
-
-\begin_layout Standard
-In response to scalability issues with the free list (
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "TDB-Freelist-Is"
-
-\end_inset
-
-) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
- and the TDB_VOLATILE flag to tdb_open.
- The latter actually calls the former with an argument of
-\begin_inset Quotes eld
-\end_inset
-
-5
-\begin_inset Quotes erd
-\end_inset
-
-.
-\end_layout
-
-\begin_layout Standard
-This code allows deleted records to accumulate without putting them in the
- free list.
- On delete we iterate through each chain and free them in a batch if there
- are more than max_dead entries.
- These are never otherwise recycled except as a side-effect of a tdb_repack.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-With the scalability problems of the freelist solved, this API can be removed.
- The TDB_VOLATILE flag may still be useful as a hint that store and delete
- of records will be at least as common as fetch in order to allow some internal
- tuning, but initially will become a no-op.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
- Unknown flags cause tdb_open() to fail as well, so they can be detected
- at runtime.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB-Files-Cannot"
-
-\end_inset
-
-TDB Files Cannot Be Opened Multiple Times In The Same Process
-\end_layout
-
-\begin_layout Standard
-No process can open the same TDB twice; we check and disallow it.
- This is an unfortunate side-effect of fcntl locks, which operate on a per-file
- rather than per-file-descriptor basis, and do not nest.
- Thus, closing any file descriptor on a file clears all the locks obtained
- by this process, even if they were placed using a different file descriptor!
-\end_layout
-
-\begin_layout Standard
-Note that even if this were solved, deadlock could occur if operations were
- nested: this is a more manageable programming error in most cases.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-We could lobby POSIX to fix the perverse rules, or at least lobby Linux
- to violate them so that the most common implementation does not have this
- restriction.
- This would be a generally good idea for other fcntl lock users.
-\end_layout
-
-\begin_layout Standard
-Samba uses a wrapper which hands out the same tdb_context to multiple callers
- if this happens, and does simple reference counting.
- We should do this inside the tdb library, which already emulates lock nesting
- internally; it would need to recognize when deadlock occurs within a single
- process.
- This would create a new failure mode for tdb operations (while we currently
- handle locking failures, they are impossible in normal use and a process
- encountering them can do little but give up).
-\end_layout
-
-\begin_layout Standard
-I do not see benefit in an additional tdb_open flag to indicate whether
- re-opening is allowed, though there may be some benefit to adding a
- call to detect when a tdb_context is shared, to allow others to create such
- an API.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB API Is Not POSIX Thread-safe
-\end_layout
-
-\begin_layout Standard
-The TDB API uses an error code which can be queried after an operation to
- determine what went wrong.
- This programming model does not work with threads, unless specific additional
- guarantees are given by the implementation.
- In addition, even otherwise-independent threads cannot open the same TDB
- (as in
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "TDB-Files-Cannot"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Rearchitecting the API to include a tdb_errcode pointer would be a great
- deal of churn, but fortunately most functions return 0 on success and -1
- on error: we can change these to return 0 on success and a negative error
- code on error, and the API remains similar to before.
- The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA
- pointer and return an error code.
- It is also simpler to have tdb_nextkey replace its key argument in place,
- freeing up any old .dptr.
-\end_layout
-
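-\begin_layout Standard
-For example, a caller of the revised tdb_fetch() might look like the following
- sketch (keyname, keylen, log_error and use_data are stand-ins, and it is
- assumed that tdb_errorstr() takes the error code; the caller still frees
- data.dptr after a successful fetch):
-\end_layout
-
-\begin_layout LyX-Code
-TDB_DATA key = { (unsigned char *)keyname, keylen };
-\end_layout
-
-\begin_layout LyX-Code
-TDB_DATA data;
-\end_layout
-
-\begin_layout LyX-Code
-enum TDB_ERROR err = tdb_fetch(tdb, key, &data);
-\end_layout
-
-\begin_layout LyX-Code
-if (err != TDB_SUCCESS) {
-\end_layout
-
-\begin_layout LyX-Code
-        log_error(tdb_errorstr(err));   /* report the failure */
-\end_layout
-
-\begin_layout LyX-Code
-} else {
-\end_layout
-
-\begin_layout LyX-Code
-        use_data(data.dptr, data.dsize);
-\end_layout
-
-\begin_layout LyX-Code
-        free(data.dptr);
-\end_layout
-
-\begin_layout LyX-Code
-}
-\end_layout
-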
-\begin_layout Standard
-Internal locking is required to make sure that fcntl locks do not overlap
- between threads, and also that the global list of tdbs is maintained.
-\end_layout
-
-\begin_layout Standard
-The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
- version of the library, and otherwise no overhead will exist.
- Alternatively, a hooking mechanism similar to that proposed for
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "Proposed-Solution-locking-hook"
-
-\end_inset
-
- could be used to enable pthread locking at runtime.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Incomplete; API has been changed but thread safety has not been implemented.
-\end_layout
-
-\begin_layout Subsection
-*_nonblock Functions And *_mark Functions Expose Implementation
-\end_layout
-
-\begin_layout Standard
-CTDB
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-Clustered TDB, see http://ctdb.samba.org
-\end_layout
-
-\end_inset
-
- wishes to operate on TDB in a non-blocking manner.
- This is currently done as follows:
-\end_layout
-
-\begin_layout Enumerate
-Call the _nonblock variant of an API function (eg.
- tdb_lockall_nonblock).
- If this fails:
-\end_layout
-
-\begin_layout Enumerate
-Fork a child process, and wait for it to call the normal variant (eg.
- tdb_lockall).
-\end_layout
-
-\begin_layout Enumerate
-If the child succeeds, call the _mark variant to indicate we already have
- the locks (eg.
- tdb_lockall_mark).
-\end_layout
-
-\begin_layout Enumerate
-Upon completion, tell the child to release the locks (eg.
- tdb_unlockall).
-\end_layout
-
-\begin_layout Enumerate
-Indicate to tdb that it should consider the locks removed (eg.
- tdb_unlockall_mark).
-\end_layout
-
-\begin_layout Standard
-There are several issues with this approach.
- Firstly, adding two new variants of each function clutters the API for
- an obscure use, and so not all functions have three variants.
- Secondly, it assumes that all paths of the functions ask for the same locks,
- otherwise the parent process will have to get a lock which the child doesn't
- have under some circumstances.
- I don't believe this is currently the case, but it constrains the implementatio
-n.
-\end_layout
-
-\begin_layout Subsubsection
-\begin_inset CommandInset label
-LatexCommand label
-name "Proposed-Solution-locking-hook"
-
-\end_inset
-
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Implement a hook for locking methods, so that the caller can control the
- calls to create and remove fcntl locks.
- In this scenario, ctdbd would operate as follows:
-\end_layout
-
-\begin_layout Enumerate
-Call the normal API function, eg tdb_lockall().
-\end_layout
-
-\begin_layout Enumerate
-When the lock callback comes in, check if the child has the lock.
- Initially, this is always false.
- If so, return 0.
- Otherwise, try to obtain it in non-blocking mode.
- If that fails, return EWOULDBLOCK.
-\end_layout
-
-\begin_layout Enumerate
-Release locks in the unlock callback as normal.
-\end_layout
-
-\begin_layout Enumerate
-If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
- child to repeat the operation.
-\end_layout
-
-\begin_layout Enumerate
-The child records what locks it obtains, and returns that information to
- the parent.
-\end_layout
-
-\begin_layout Enumerate
-When the child has succeeded, goto 1.
-\end_layout
-
-\begin_layout Standard
-This is flexible enough to handle any potential locking scenario, even when
- lock requirements change.
- It can be optimized so that the parent does not release locks, just tells
- the child which locks it doesn't need to obtain.
-\end_layout
-
-\begin_layout Standard
-It also keeps the complexity out of the API, and in ctdbd where it is needed.
-\end_layout
-
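-\begin_layout Standard
-Such a hook fits naturally into the attribute mechanism; one possible shape
- (the names and signatures are illustrative, not a fixed API) is:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_attribute_flock {
-\end_layout
-
-\begin_layout LyX-Code
-        struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_FLOCK */
-\end_layout
-
-\begin_layout LyX-Code
-        int (*lock)(int fd, int rw, off_t off, off_t len, bool waitflag, void *data);
-\end_layout
-
-\begin_layout LyX-Code
-        int (*unlock)(int fd, int rw, off_t off, off_t len, void *data);
-\end_layout
-
-\begin_layout LyX-Code
-        void *data;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-ctdbd would supply lock and unlock callbacks implementing the non-blocking
- scheme above, without any new API variants.
-\end_layout
-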
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-tdb_chainlock Functions Expose Implementation
-\end_layout
-
-\begin_layout Standard
-tdb_chainlock locks some number of records, including the record indicated
- by the given key.
- This gave atomicity guarantees; no-one can start a transaction, alter,
- read or delete that key while the lock is held.
-\end_layout
-
-\begin_layout Standard
-It also makes the same guarantee for any other key in the chain, which is
- an internal implementation detail and potentially a cause for deadlock.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
- It would be nice to have an explicit single entry lock which affected no
- other keys.
- Unfortunately, this won't work for an entry which doesn't exist.
- Thus while chainlock may be implemented more efficiently for the existing
- case, it will still have overlap issues with the non-existing case.
- So it is best to keep the current (lack of) guarantee about which records
- will be affected to avoid constraining our implementation.
-\end_layout
-
-\begin_layout Subsection
-Signal Handling is Not Race-Free
-\end_layout
-
-\begin_layout Standard
-The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
- that the tdb locking code should return with a failure, rather than trying
- again when a signal is received (and errno == EAGAIN).
- This is usually used to implement timeouts.
-\end_layout
-
-\begin_layout Standard
-Unfortunately, this does not work in the case where the signal is received
- before the tdb code enters the fcntl() call to place the lock: the code
- will sleep within the fcntl() code, unaware that the signal wants it to
- exit.
- In the case of long timeouts, this does not happen in practice.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The locking hooks proposed in
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "Proposed-Solution-locking-hook"
-
-\end_inset
-
- would allow the user to decide on whether to fail the lock acquisition
- on a signal.
- This allows the caller to choose their own compromise: they could narrow
- the race by checking immediately before the fcntl call.
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-It may be possible to make this race-free in some implementations by having
- the signal handler alter the struct flock to make it invalid.
- This will cause the fcntl() lock call to fail with EINVAL if the signal
- occurs before the kernel is entered, otherwise EAGAIN.
-\end_layout
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-The API Uses Gratuitous Typedefs, Capitals
-\end_layout
-
-\begin_layout Standard
-typedefs are useful for providing source compatibility when types can differ
- across implementations, or arguably in the case of function pointer definitions
- which are hard for humans to parse.
- Otherwise it is simply obfuscation and pollutes the namespace.
-\end_layout
-
-\begin_layout Standard
-Capitalization is usually reserved for compile-time constants and macros.
-\end_layout
-
-\begin_layout Description
-TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
- definition isn't visible to the API user anyway.
-\end_layout
-
-\begin_layout Description
-TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
- needs to be understood by the API user.
-\end_layout
-
-\begin_layout Description
-struct
-\begin_inset space ~
-\end_inset
-
-TDB_DATA This would normally be called 'struct tdb_data'.
-\end_layout
-
-\begin_layout Description
-enum
-\begin_inset space ~
-\end_inset
-
-TDB_ERROR Similarly, this would normally be enum tdb_error.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
- Introducing lower case variants would please pedants like myself, but if
- it were done the existing ones should be kept.
- There is little point forcing a purely cosmetic change upon tdb users.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "tdb_log_func-Doesnt-Take"
-
-\end_inset
-
-tdb_log_func Doesn't Take The Private Pointer
-\end_layout
-
-\begin_layout Standard
-For API compatibility reasons, the logging function needs to call tdb_get_loggin
-g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-It should simply take an extra argument, since we are prepared to break
- the API/ABI.
-\end_layout
-
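-\begin_layout Standard
-For illustration only, the revised callback type would simply gain a final
- private-pointer argument, along these lines (the other parameters are sketched,
- not final):
-\end_layout
-
-\begin_layout LyX-Code
-typedef void (*tdb_log_func)(struct tdb_context *tdb,
-\end_layout
-
-\begin_layout LyX-Code
-                             enum tdb_log_level level,
-\end_layout
-
-\begin_layout LyX-Code
-                             const char *message,
-\end_layout
-
-\begin_layout LyX-Code
-                             void *log_private);
-\end_layout
-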
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Various Callback Functions Are Not Typesafe
-\end_layout
-
-\begin_layout Standard
-The callback functions in tdb_set_logging_function (after
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "tdb_log_func-Doesnt-Take"
-
-\end_inset
-
- is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
- all take void * and must internally convert it to the argument type they
- were expecting.
-\end_layout
-
-\begin_layout Standard
-If this type changes, the compiler will not produce warnings on the callers,
- since it only sees void *.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-With careful use of macros, we can create callback functions which give
- a warning when used on gcc and the types of the callback and its private
- argument differ.
- Unsupported compilers will not give a warning, which is no worse than now.
- In addition, the callbacks become clearer, as they need not use void *
- for their parameter.
-\end_layout
-
-\begin_layout Standard
-See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
-\end_layout
-
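-\begin_layout Standard
-The core of the trick can be sketched as a pair of macros (simplified here;
- the real implementation is CCAN's typesafe_cb): on gcc, mixing incompatible
- pointer types in a conditional expression produces a warning, so passing
- a callback whose private argument type does not match arg gets flagged at
- the call site.
-\end_layout
-
-\begin_layout LyX-Code
-#define TYPECHECK_CB(fn, arg) (1 ? (fn) : (int (*)(struct tdb_context *, TDB_DATA, TDB_DATA, typeof(arg)))0)
-\end_layout
-
-\begin_layout LyX-Code
-#define tdb_traverse_typed(tdb, fn, arg) tdb_traverse((tdb), (tdb_traverse_func)TYPECHECK_CB(fn, arg), (arg))
-\end_layout
-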
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
-\end_layout
-
-\begin_layout Standard
-The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
- be cleared if the caller discovers it is the only process with the TDB
- open.
- However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
- be detected, so will have the TDB erased underneath them (usually resulting
- in a crash).
-\end_layout
-
-\begin_layout Standard
-There is a similar issue on fork(); if the parent exits (or otherwise closes
- the tdb) before the child calls tdb_reopen_all() to establish the lock
- used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
- at that moment will believe it alone has opened the TDB and will erase
- it.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Remove TDB_CLEAR_IF_FIRST.
- Other workarounds are possible, but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
-
-\end_inset
-
-.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
- An open hook is provided to replicate this functionality if required.
-\end_layout
-
-\begin_layout Subsection
-Extending The Header Is Difficult
-\end_layout
-
-\begin_layout Standard
-We have reserved (zeroed) words in the TDB header, which can be used for
- future features.
- If the future features are compulsory, the version number must be updated
- to prevent old code from accessing the database.
- But if the future feature is optional, we have no way of telling if older
- code is accessing the database or not.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The header should contain a
-\begin_inset Quotes eld
-\end_inset
-
-format variant
-\begin_inset Quotes erd
-\end_inset
-
- value (64-bit).
- This is divided into two 32-bit parts:
-\end_layout
-
-\begin_layout Enumerate
-The lower part reflects the format variant understood by code accessing
- the database.
-\end_layout
-
-\begin_layout Enumerate
-The upper part reflects the format variant you must understand to write
- to the database (otherwise you can only open for reading).
-\end_layout
-
-\begin_layout Standard
-The latter field can only be written at creation time; the former should
- be written under the OPEN_LOCK when opening the database for writing, if
- the variant of the code is lower than the current lowest variant.
-\end_layout
-
-\begin_layout Standard
-This should allow backwards-compatible features to be added, and detection
- if older code (which doesn't understand the feature) writes to the database.
-\end_layout
-
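-\begin_layout Standard
-A sketch of the open-time logic, using hypothetical names (hdr, writing and
- MY_VARIANT are stand-ins; MY_VARIANT is the variant this code understands):
-\end_layout
-
-\begin_layout LyX-Code
-uint32_t write_needs = hdr->format_variant >> 32;        /* upper part */
-\end_layout
-
-\begin_layout LyX-Code
-uint32_t lowest_seen = hdr->format_variant & 0xffffffff; /* lower part */
-\end_layout
-
-\begin_layout LyX-Code
-if (writing && MY_VARIANT < write_needs)
-\end_layout
-
-\begin_layout LyX-Code
-        return open_read_only_instead();
-\end_layout
-
-\begin_layout LyX-Code
-if (writing && MY_VARIANT < lowest_seen)  /* under the OPEN_LOCK */
-\end_layout
-
-\begin_layout LyX-Code
-        hdr->format_variant = ((uint64_t)write_needs << 32) | MY_VARIANT;
-\end_layout
-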
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Record Headers Are Not Expandible
-\end_layout
-
-\begin_layout Standard
-If we later want to add (say) checksums on keys and data, it would require
- another format change, which we'd like to avoid.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-We often have extra padding at the tail of a record.
- If we ensure that the first byte (if any) of this padding is zero, we will
- have a way for future changes to detect code which doesn't understand a
- new format: the new code would write (say) a 1 at the tail, and thus if
- there is no tail or the first byte is 0, we would know the extension is
- not present on that record.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB Does Not Use Talloc
-\end_layout
-
-\begin_layout Standard
-Many users of TDB (particularly Samba) use the talloc allocator, and thus
- have to wrap TDB in a talloc context to use it conveniently.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The allocation within TDB is not complicated enough to justify the use of
- talloc, and I am reluctant to force another (excellent) library on TDB
- users.
- Nonetheless a compromise is possible.
- An attribute (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "attributes"
-
-\end_inset
-
-) can be added later to tdb_open() to provide an alternate allocation mechanism,
- specifically for talloc but usable by any other allocator (which would
- ignore the
-\begin_inset Quotes eld
-\end_inset
-
-context
-\begin_inset Quotes erd
-\end_inset
-
- argument).
-\end_layout
-
-\begin_layout Standard
-This would form a talloc hierarchy as expected, but the caller would still
- have to attach a destructor to the tdb context returned from tdb_open to
- close it.
- All TDB_DATA fields would be children of the tdb_context, and the caller
- would still have to manage them (using talloc_free() or talloc_steal()).
-\end_layout
-
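-\begin_layout Standard
-Such an attribute might look roughly like this (a sketch only; the callback
- names and signatures are illustrative rather than the final API):
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_attribute_allocator {
-\end_layout
-
-\begin_layout LyX-Code
-        struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_ALLOCATOR */
-\end_layout
-
-\begin_layout LyX-Code
-        void *(*alloc)(const void *owner, size_t len, void *priv_data);
-\end_layout
-
-\begin_layout LyX-Code
-        void *(*expand)(void *old, size_t newlen, void *priv_data);
-\end_layout
-
-\begin_layout LyX-Code
-        void (*free)(void *old, void *priv_data);
-\end_layout
-
-\begin_layout LyX-Code
-        void *priv_data;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-A talloc user would pass callbacks wrapping talloc(), talloc_realloc() and
- talloc_free(), using the owner pointer as the talloc parent; other allocators
- would simply ignore it.
-\end_layout
-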
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete, using the NTDB_ATTRIBUTE_ALLOCATOR attribute.
-\end_layout
-
-\begin_layout Section
-Performance And Scalability Issues
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
-
-\end_inset
-
-TDB_CLEAR_IF_FIRST Imposes Performance Penalty
-\end_layout
-
-\begin_layout Standard
-When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
- 4 (aka.
- the ACTIVE_LOCK).
- While these locks never conflict in normal tdb usage, they do add substantial
- overhead for most fcntl lock implementations when the kernel scans to detect
- if a lock conflict exists.
- This is often a single linked list, making the time to acquire and release
- a fcntl lock O(N) where N is the number of processes with the TDB open,
- not the number actually doing work.
-\end_layout
-
-\begin_layout Standard
-In a Samba server it is common to have huge numbers of clients sitting idle,
- and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-There is a flag to tdb_reopen_all() which is used for this optimization:
- if the parent process will outlive the child, the child does not need the
- ACTIVE_LOCK.
- This is a workaround for this very performance issue.
-\end_layout
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Remove the flag.
- It was a neat idea, but even trivial servers tend to know when they are
- initializing for the first time and can simply unlink the old tdb at that
- point.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB Files Have a 4G Limit
-\end_layout
-
-\begin_layout Standard
-This seems to be becoming an issue (so much for
-\begin_inset Quotes eld
-\end_inset
-
-trivial
-\begin_inset Quotes erd
-\end_inset
-
-!), particularly for ldb.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-A new, incompatible TDB format which uses 64 bit offsets internally rather
- than 32 bit as now.
- For simplicity of endian conversion (which TDB does on the fly if required),
- all values will be 64 bit on disk.
- In practice, some upper bits may be used for other purposes, but at least
- 56 bits will be available for file offsets.
-\end_layout
-
-\begin_layout Standard
-tdb_open() will automatically detect the old version, and even create them
- if TDB_VERSION6 is specified to tdb_open.
-\end_layout
-
-\begin_layout Standard
-32 bit processes will still be able to access TDBs larger than 4G (assuming
- that their off_t allows them to seek to 64 bits); they will gracefully
- fall back as they fail to mmap.
- This can happen already with large TDBs.
-\end_layout
-
-\begin_layout Standard
-Old versions of tdb will fail to open the new TDB files (since 28 August
- 2009, commit 398d0c29290: prior to that any unrecognized file format would
- be erased and initialized as a fresh tdb!)
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB Records Have a 4G Limit
-\end_layout
-
-\begin_layout Standard
-This has not been a reported problem, and the API uses size_t which can
- be 64 bit on 64 bit platforms.
- However, other limits may have made such an issue moot.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Record sizes will be 64 bit, with an error returned on 32 bit platforms
- which try to access such records (the current implementation would return
- TDB_ERR_OOM in a similar case).
- It seems unlikely that 32 bit keys will be a limitation, so the implementation
- may not support this (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Records-Incur-A"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Hash Size Is Determined At TDB Creation Time
-\end_layout
-
-\begin_layout Standard
-TDB contains a number of hash chains in the header; the number is specified
- at creation time, and defaults to 131.
- This is such a bottleneck on large databases (as each hash chain gets quite
- long), that LDB uses 10,000 for this hash.
- In general it is impossible to know what the 'right' answer is at database
- creation time.
-\end_layout
-
-\begin_layout Subsubsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:Hash-Size-Solution"
-
-\end_inset
-
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-After comprehensive performance testing on various scalable hash variants
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
- because I was previously convinced that an expanding tree of hashes would
- be very close to optimal.
-\end_layout
-
-\end_inset
-
-, it became clear that it is hard to beat a straight linear hash table which
- doubles in size when it reaches saturation.
- Unfortunately, altering the hash table introduces serious locking complications
-: the entire hash table needs to be locked to enlarge the hash table, and
- others might be holding locks.
- Particularly insidious are insertions done under tdb_chainlock.
-\end_layout
-
-\begin_layout Standard
-Thus an expanding layered hash will be used: an array of hash groups, with
- each hash group exploding into pointers to lower hash groups once it fills,
- turning into a hash tree.
- This has implications for locking: we must lock the entire group in case
- we need to expand it, yet we don't know how deep the tree is at that point.
-\end_layout
-
-\begin_layout Standard
-Note that bits from the hash table entries should be stolen to hold more
- hash bits to reduce the penalty of collisions.
- We can use the otherwise-unused lower 3 bits.
- If we limit the size of the database to 64 exabytes, we can use the top
- 8 bits of the hash entry as well.
- These 11 bits would reduce false positives down to 1 in 2000 which is more
- than we need: we can use one of the bits to indicate that the extra hash
- bits are valid.
- This means we can choose not to re-hash all entries when we expand a hash
- group; simply use the next bits we need and mark them invalid.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Ignore.
- Scaling the hash automatically proved inefficient at small hash sizes;
- we default to an 8192-element hash (changeable via NTDB_ATTRIBUTE_HASHSIZE),
- and when buckets clash we expand to an array of hash entries.
- This scales slightly better than the tdb chain (due to the 8 top bits containin
-g extra hash).
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB-Freelist-Is"
-
-\end_inset
-
-TDB Freelist Is Highly Contended
-\end_layout
-
-\begin_layout Standard
-TDB uses a single linked list for the free list.
- Allocation occurs as follows, using heuristics which have evolved over
- time:
-\end_layout
-
-\begin_layout Enumerate
-Get the free list lock for this whole operation.
-\end_layout
-
-\begin_layout Enumerate
-Multiply length by 1.25, so we always over-allocate by 25%.
-\end_layout
-
-\begin_layout Enumerate
-Set the slack multiplier to 1.
-\end_layout
-
-\begin_layout Enumerate
-Examine the current freelist entry: if it is > length but < the current
- best case, remember it as the best case.
-\end_layout
-
-\begin_layout Enumerate
-Multiply the slack multiplier by 1.05.
-\end_layout
-
-\begin_layout Enumerate
-If our best fit so far is less than length * slack multiplier, return it.
- The slack will be turned into a new free record if it's large enough.
-\end_layout
-
-\begin_layout Enumerate
-Otherwise, go onto the next freelist entry.
-\end_layout
-
-\begin_layout Standard
-Deleting a record occurs as follows:
-\end_layout
-
-\begin_layout Enumerate
-Lock the hash chain for this whole operation.
-\end_layout
-
-\begin_layout Enumerate
-Walk the chain to find the record, keeping the prev pointer offset.
-\end_layout
-
-\begin_layout Enumerate
-If max_dead is non-zero:
-\end_layout
-
-\begin_deeper
-\begin_layout Enumerate
-Walk the hash chain again and count the dead records.
-\end_layout
-
-\begin_layout Enumerate
-If it's more than max_dead, bulk free all the dead ones (similar to steps
- 4 and below, but the lock is only obtained once).
-\end_layout
-
-\begin_layout Enumerate
-Simply mark this record as dead and return.
-\end_layout
-
-\end_deeper
-\begin_layout Enumerate
-Get the free list lock for the remainder of this operation.
-\end_layout
-
-\begin_layout Enumerate
-\begin_inset CommandInset label
-LatexCommand label
-name "right-merging"
-
-\end_inset
-
-Examine the following block to see if it is free; if so, enlarge the current
- block and remove that block from the free list.
- This was disabled, as removal from the free list was O(entries-in-free-list).
-\end_layout
-
-\begin_layout Enumerate
-Examine the preceding block to see if it is free: for this reason, each
- block has a 32-bit tailer which indicates its length.
- If it is free, expand it to cover our new block and return.
-\end_layout
-
-\begin_layout Enumerate
-Otherwise, prepend ourselves to the free list.
-\end_layout
-
-\begin_layout Standard
-Disabling right-merging (step
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "right-merging"
-
-\end_inset
-
-) causes fragmentation; the other heuristics proved insufficient to address
- this, so the final answer to this was that when we expand the TDB file
- inside a transaction commit, we repack the entire tdb.
-\end_layout
-
-\begin_layout Standard
-The single list lock limits our allocation rate; due to the other issues
- this is not currently seen as a bottleneck.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The first step is to remove all the current heuristics, as they obviously
- interact, then examine them once the lock contention is addressed.
-\end_layout
-
-\begin_layout Standard
-The free list must be split to reduce contention.
- Assuming perfect free merging, we can at most have 1 free list entry for
- each entry.
- This implies that the number of free lists is related to the size of the
- hash table, but as it is rare to walk a large number of free list entries
- we can use far fewer, say 1/32 of the number of hash buckets.
-\end_layout
-
-\begin_layout Standard
-It seems tempting to try to reuse the hash implementation which we use for
- records here, but we have two ways of searching for free entries: for allocatio
-n we search by size (and possibly zone) which produces too many clashes
- for our hash table to handle well, and for coalescing we search by address.
- Thus an array of doubly-linked free lists seems preferable.
-\end_layout
-
-\begin_layout Standard
-There are various benefits in using per-size free lists (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Becomes-Fragmented"
-
-\end_inset
-
-) but it's not clear this would reduce contention in the common case where
- all processes are allocating/freeing the same size.
- Thus we almost certainly need to divide in other ways: the most obvious
- is to divide the file into zones, and using a free list (or table of free
- lists) for each.
- This approximates address ordering.
-\end_layout
-
-\begin_layout Standard
-Unfortunately it is difficult to know what heuristics should be used to
- determine zone sizes, and our transaction code relies on being able to
- create a
-\begin_inset Quotes eld
-\end_inset
-
-recovery area
-\begin_inset Quotes erd
-\end_inset
-
- by simply appending to the file (difficult if it would need to create a
- new zone header).
- Thus we use a linked-list of free tables; currently we only ever create
- one, but if there is more than one we choose one at random to use.
- In future we may use heuristics to add new free tables on contention.
- We only expand the file when all free tables are exhausted.
-\end_layout
-
-\begin_layout Standard
-The basic algorithm is as follows.
- Freeing is simple:
-\end_layout
-
-\begin_layout Enumerate
-Identify the correct free list.
-\end_layout
-
-\begin_layout Enumerate
-Lock the corresponding list.
-\end_layout
-
-\begin_layout Enumerate
-Re-check the list (we didn't have a lock, sizes could have changed): relock
- if necessary.
-\end_layout
-
-\begin_layout Enumerate
-Place the freed entry in the list.
-\end_layout
-
-\begin_layout Standard
-Allocation is a little more complicated, as we perform delayed coalescing
- at this point:
-\end_layout
-
-\begin_layout Enumerate
-Pick a free table; usually the previous one.
-\end_layout
-
-\begin_layout Enumerate
-Lock the corresponding list.
-\end_layout
-
-\begin_layout Enumerate
-If the top entry is large enough, remove it from the list and return it.
-\end_layout
-
-\begin_layout Enumerate
-Otherwise, coalesce entries in the list. If there was no entry large enough,
- unlock the list and try the next largest list.
-\end_layout
-
-\begin_layout Enumerate
-If no list has an entry which meets our needs, try the next free table.
-\end_layout
-
-\begin_layout Enumerate
-If no zone satisfies, expand the file.
-\end_layout
-
-\begin_layout Standard
-This optimizes rapid insert/delete of free list entries by not coalescing
- them all the time.
- First-fit address ordering seems to be fairly good for keeping
- fragmentation low (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Becomes-Fragmented"
-
-\end_inset
-
-).
- Note that address ordering does not need a tailer to coalesce, though if
- we needed one we could have one cheaply: see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Records-Incur-A"
-
-\end_inset
-
-.
-\end_layout
-
-\begin_layout Standard
-Each free entry has the free table number in the header: less than 255.
- It also contains a doubly-linked list for easy deletion.
-\end_layout
-
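-\begin_layout Standard
-For illustration, the array of free lists inside a free table could be indexed
- by a simple size-class function like the following sketch (the exact bucketing
- is an implementation detail; the allocation steps above walk to the next
- largest list on a miss):
-\end_layout
-
-\begin_layout LyX-Code
-/* Map a record length to a free-list bucket: one bucket per power of two. */
-\end_layout
-
-\begin_layout LyX-Code
-static unsigned int size_to_bucket(uint64_t len)
-\end_layout
-
-\begin_layout LyX-Code
-{
-\end_layout
-
-\begin_layout LyX-Code
-        unsigned int bucket = 0;
-\end_layout
-
-\begin_layout LyX-Code
-        len /= 8;                  /* records are 8-byte granular */
-\end_layout
-
-\begin_layout LyX-Code
-        while (len > 1) {
-\end_layout
-
-\begin_layout LyX-Code
-                len >>= 1;
-\end_layout
-
-\begin_layout LyX-Code
-                bucket++;
-\end_layout
-
-\begin_layout LyX-Code
-        }
-\end_layout
-
-\begin_layout LyX-Code
-        return bucket;
-\end_layout
-
-\begin_layout LyX-Code
-}
-\end_layout
-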
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:TDB-Becomes-Fragmented"
-
-\end_inset
-
-TDB Becomes Fragmented
-\end_layout
-
-\begin_layout Standard
-Much of this is a result of allocation strategy
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
-xas.edu/pub/garbage/malloc/ismm98.ps
-\end_layout
-
-\end_inset
-
- and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
-on) is deliberately set at 25%, and external fragmentation is only cured
- by the decision to repack the entire db when a transaction commit needs
- to enlarge the file.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The 25% overhead on allocation works in practice for ldb because indexes
- tend to expand by one record at a time.
- This internal fragmentation can be resolved by having an
-\begin_inset Quotes eld
-\end_inset
-
-expanded
-\begin_inset Quotes erd
-\end_inset
-
- bit in the header to note entries that have previously expanded, and allocating
- more space for them.
-\end_layout
-
-\begin_layout Standard
-There is a spectrum of possible solutions for external fragmentation:
- one is to use a fragmentation-avoiding allocation strategy such as best-fit
- address-order allocator.
- The other end of the spectrum would be to use a bump allocator (very fast
- and simple) and simply repack the file when we reach the end.
-\end_layout
-
-\begin_layout Standard
-There are three problems with efficient fragmentation-avoiding allocators:
- they are non-trivial, they tend to use a single free list for each size,
- and there's no evidence that tdb allocation patterns will match those recorded
- for general allocators (though it seems likely).
-\end_layout
-
-\begin_layout Standard
-Thus we don't spend too much effort on external fragmentation; we will be
- no worse than the current code if we need to repack on occasion.
- More effort is spent on reducing freelist contention, and reducing overhead.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:Records-Incur-A"
-
-\end_inset
-
-Records Incur A 28-Byte Overhead
-\end_layout
-
-\begin_layout Standard
-Each TDB record has a header as follows:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_record {
-\end_layout
-
-\begin_layout LyX-Code
- tdb_off_t next; /* offset of the next record in the list */
-\end_layout
-
-\begin_layout LyX-Code
- tdb_len_t rec_len; /* total byte length of record */
-\end_layout
-
-\begin_layout LyX-Code
- tdb_len_t key_len; /* byte length of key */
-\end_layout
-
-\begin_layout LyX-Code
- tdb_len_t data_len; /* byte length of data */
-\end_layout
-
-\begin_layout LyX-Code
- uint32_t full_hash; /* the full 32 bit hash of the key */
-\end_layout
-
-\begin_layout LyX-Code
- uint32_t magic; /* try to catch errors */
-\end_layout
-
-\begin_layout LyX-Code
- /* the following union is implied:
-\end_layout
-
-\begin_layout LyX-Code
- union {
-\end_layout
-
-\begin_layout LyX-Code
- char record[rec_len];
-\end_layout
-
-\begin_layout LyX-Code
- struct {
-\end_layout
-
-\begin_layout LyX-Code
- char key[key_len];
-\end_layout
-
-\begin_layout LyX-Code
- char data[data_len];
-\end_layout
-
-\begin_layout LyX-Code
- }
-\end_layout
-
-\begin_layout LyX-Code
- uint32_t totalsize; (tailer)
-\end_layout
-
-\begin_layout LyX-Code
- }
-\end_layout
-
-\begin_layout LyX-Code
- */
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-Naively, this would double to a 56-byte overhead on a 64 bit implementation.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-We can use various techniques to reduce this for an allocated block:
-\end_layout
-
-\begin_layout Enumerate
-The 'next' pointer is not required, as we are using a flat hash table.
-\end_layout
-
-\begin_layout Enumerate
-'rec_len' can instead be expressed as an addition to key_len and data_len
- (it accounts for wasted or overallocated length in the record).
- Since the record length is always a multiple of 8, we can conveniently
- fit it in 32 bits (representing up to 35 bits).
-\end_layout
-
-\begin_layout Enumerate
-'key_len' and 'data_len' can be reduced.
- I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
- the two into one 64-bit field, using a 5 bit value which indicates at
- what bit to divide the two.
- Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
- size of 32 bits.
-\end_layout
-
-\begin_layout Enumerate
-'full_hash' is used to avoid a memcmp on the
-\begin_inset Quotes eld
-\end_inset
-
-miss
-\begin_inset Quotes erd
-\end_inset
-
- case, but this is diminishing returns after a handful of bits (at 10 bits,
- it reduces 99.9% of false memcmp).
- As an aside, as the lower bits are already incorporated in the hash table
- resolution, the upper bits should be used here.
- Note that it's not clear that these bits will be a win, given the extra
- bits in the hash table itself (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Hash-Size-Solution"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Enumerate
-'magic' does not need to be enlarged: it currently reflects one of 5 values
- (used, free, dead, recovery, and unused_recovery).
- It is useful for quick sanity checking however, and should not be eliminated.
-\end_layout
-
-\begin_layout Enumerate
-'tailer' is only used to coalesce free blocks (so a block to the right can
- find the header to check if this block is free).
- This can be replaced by a single 'free' bit in the header of the following
- block (and the tailer only exists in free blocks).
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-This technique from Thomas Standish.
- Data Structure Techniques.
- Addison-Wesley, Reading, Massachusetts, 1980.
-\end_layout
-
-\end_inset
-
- The current proposed coalescing algorithm doesn't need this, however.
-\end_layout
-
-\begin_layout Standard
-This produces a 16 byte used header like this:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_used_record {
-\end_layout
-
-\begin_layout LyX-Code
- uint32_t used_magic : 16,
-\end_layout
-
-\begin_layout LyX-Code
-
-\end_layout
-
-\begin_layout LyX-Code
- key_data_divide: 5,
-\end_layout
-
-\begin_layout LyX-Code
- top_hash: 11;
-\end_layout
-
-\begin_layout LyX-Code
- uint32_t extra_octets;
-\end_layout
-
-\begin_layout LyX-Code
- uint64_t key_and_data_len;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-And a free record like this:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_free_record {
-\end_layout
-
-\begin_layout LyX-Code
- uint64_t free_magic: 8,
-\end_layout
-
-\begin_layout LyX-Code
- prev : 56;
-\end_layout
-
-\begin_layout LyX-Code
-
-\end_layout
-
-\begin_layout LyX-Code
- uint64_t free_table: 8,
-\end_layout
-
-\begin_layout LyX-Code
- total_length : 56;
-\end_layout
-
-\begin_layout LyX-Code
- uint64_t next;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-Note that by limiting valid offsets to 56 bits, we can pack everything we
- need into 3 64-bit words, meaning our minimum record size is 8 bytes.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Transaction Commit Requires 4 fdatasync
-\end_layout
-
-\begin_layout Standard
-The current transaction algorithm is:
-\end_layout
-
-\begin_layout Enumerate
-write_recovery_data();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Enumerate
-write_recovery_header();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Enumerate
-overwrite_with_new_data();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Enumerate
-remove_recovery_header();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Standard
-On current ext3, each sync flushes all data to disk, so the next 3 syncs
- are relatively expensive.
- But this could become a performance bottleneck on other filesystems such
- as ext4.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Neil Brown points out that this is overzealous, and only one sync is needed:
-\end_layout
-
-\begin_layout Enumerate
-Bundle the recovery data, a transaction counter and a strong checksum of
- the new data.
-\end_layout
-
-\begin_layout Enumerate
-Take a strong checksum of that whole bundle.
-\end_layout
-
-\begin_layout Enumerate
-Store the bundle in the database.
-\end_layout
-
-\begin_layout Enumerate
-Overwrite the oldest of the two recovery pointers in the header (identified
- using the transaction counter) with the offset of this bundle.
-\end_layout
-
-\begin_layout Enumerate
-sync.
-\end_layout
-
-\begin_layout Enumerate
-Write the new data to the file.
-\end_layout
-
-\begin_layout Standard
-Checking for recovery means identifying the latest bundle with a valid checksum
- and using the new data checksum to ensure that it has been applied.
- This is more expensive than the current check, but need only be done at
- open.
- For running databases, a separate header field can be used to indicate
- a transaction in progress; we need only check for recovery if this is set.
-\end_layout
-
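-\begin_layout Standard
-For concreteness, the start of such a bundle might be laid out as below;
- this is purely illustrative, since the change is deferred and no on-disk
- format has been defined:
-\end_layout
-
-\begin_layout LyX-Code
-struct recovery_bundle_header {
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t transaction_count;  /* identifies the newer of the two bundles */
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t new_data_checksum;  /* strong checksum of post-commit data */
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t recovery_data_len;  /* undo data follows this header */
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t bundle_checksum;    /* strong checksum of header and undo data */
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-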
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:TDB-Does-Not"
-
-\end_inset
-
-TDB Does Not Have Snapshot Support
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
- At some point you say
-\begin_inset Quotes eld
-\end_inset
-
-use a real database
-\begin_inset Quotes erd
-\end_inset
-
- (but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "replay-attribute"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Standard
-But as a thought experiment, if we implemented transactions to only overwrite
- free entries (this is tricky: there must not be a header in each entry
- which indicates whether it is free, but use of presence in metadata elsewhere),
- and a pointer to the hash table, we could create an entirely new commit
- without destroying existing data.
- Then it would be easy to implement snapshots in a similar way.
-\end_layout
-
-\begin_layout Standard
-This would not allow arbitrary changes to the database, such as tdb_repack
- does, and would require more space (since we have to preserve the current
- and future entries at once).
- If we used hash trees rather than one big hash table, we might only have
- to rewrite some sections of the hash, too.
-\end_layout
-
-\begin_layout Standard
-We could then implement snapshots using a similar method, using multiple
- different hash tables/free tables.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Subsection
-Transactions Cannot Operate in Parallel
-\end_layout
-
-\begin_layout Standard
-This would be useless for ldb, as it hits the index records with just about
- every update.
- It would add significant complexity in resolving clashes, and cause all
- transaction callers to write their code to loop in the case where the
- transactions spuriously failed.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None (but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "replay-attribute"
-
-\end_inset
-
-).
- We could solve a small part of the problem by providing read-only transactions.
- These would allow one write transaction to begin, but it could not commit
- until all r/o transactions are done.
- This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
- commit.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Subsection
-Default Hash Function Is Suboptimal
-\end_layout
-
-\begin_layout Standard
-The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
- if we expand it to 64 bits), and works best when the hash bucket size is
- a prime number (which also means a slow modulus).
- In addition, it is highly predictable which could potentially lead to a
- Denial of Service attack in some TDB uses.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The Jenkins lookup3 hash
-\begin_inset Foot
-status open
-
-\begin_layout Plain Layout
-http://burtleburtle.net/bob/c/lookup3.c
-\end_layout
-
-\end_inset
-
- is a fast and superbly-mixing hash.
- It's used by the Linux kernel and almost everything else.
- This has the particular properties that it takes an initial seed, and produces
- two 32 bit hash numbers, which we can combine into a 64-bit hash.
-\end_layout
-
-\begin_layout Standard
-The seed should be created at tdb-creation time from some random source,
- and placed in the header.
- This is far from foolproof, but adds a little bit of protection against
- hash bombing.
-\end_layout
-
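-\begin_layout Standard
-A sketch of combining the two 32-bit results into a 64-bit hash, seeded
- from the header (this assumes lookup3's hashlittle2() is available; the
- function name here is illustrative):
-\end_layout
-
-\begin_layout LyX-Code
-static uint64_t tdb_hash64(const void *key, size_t len, uint64_t seed)
-\end_layout
-
-\begin_layout LyX-Code
-{
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t h1 = (uint32_t)seed;
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t h2 = (uint32_t)(seed >> 32);
-\end_layout
-
-\begin_layout LyX-Code
-        hashlittle2(key, len, &h1, &h2);   /* from lookup3.c */
-\end_layout
-
-\begin_layout LyX-Code
-        return ((uint64_t)h2 << 32) | h1;
-\end_layout
-
-\begin_layout LyX-Code
-}
-\end_layout
-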
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "Reliable-Traversal-Adds"
-
-\end_inset
-
-Reliable Traversal Adds Complexity
-\end_layout
-
-\begin_layout Standard
-We lock a record during traversal iteration, and try to grab that lock in
- the delete code.
- If that grab on delete fails, we simply mark it deleted and continue onwards;
- traversal checks for this condition and does the delete when it moves off
- the record.
-\end_layout
-
-\begin_layout Standard
-If traversal terminates, the dead record may be left indefinitely.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Remove reliability guarantees; see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "traverse-Proposed-Solution"
-
-\end_inset
-
-.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Fcntl Locking Adds Overhead
-\end_layout
-
-\begin_layout Standard
-Placing a fcntl lock means a system call, as does removing one.
- This is actually one reason why transactions can be faster (everything
- is locked once at transaction start).
- In the uncontended case, this overhead can theoretically be eliminated.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
-\end_layout
-
-\begin_layout Standard
-We tried this before with spinlock support, in the early days of TDB, and
- it didn't make much difference except in manufactured benchmarks.
-\end_layout
-
-\begin_layout Standard
-We could use spinlocks (with futex kernel support under Linux), but it means
- that we lose automatic cleanup when a process dies with a lock.
- There is a method of auto-cleanup under Linux, but it's not supported by
- other operating systems.
- We could reintroduce a clear-if-first-style lock and sweep for dead futexes
- on open, but that wouldn't help the normal case of one concurrent opener
- dying.
- Increasingly elaborate repair schemes could be considered, but they require
- an ABI change (everyone must use them) anyway, so there's no need to do
- this at the same time as everything else.
-\end_layout
-
-\begin_layout Subsection
-Some Transactions Don't Require Durability
-\end_layout
-
-\begin_layout Standard
-Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
- usage, and occasionally empties the results into a transactional TDB.
- This kind of usage prioritizes performance over durability: as long as
- we are consistent, data can be lost.
-\end_layout
-
-\begin_layout Standard
-This would be more neatly implemented inside tdb: a
-\begin_inset Quotes eld
-\end_inset
-
-soft
-\begin_inset Quotes erd
-\end_inset
-
- transaction commit (ie.
- syncless) which meant that data may be reverted on a crash.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
-\end_layout
-
-\begin_layout Standard
-Unfortunately any transaction scheme which overwrites old data requires
- a sync before that overwrite to avoid the possibility of corruption.
-\end_layout
-
-\begin_layout Standard
-It seems possible to use a scheme similar to that described in
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Does-Not"
-
-\end_inset
-
-, where transactions are committed without overwriting existing data, and
- an array of top-level pointers were available in the header.
- If the transaction is
-\begin_inset Quotes eld
-\end_inset
-
-soft
-\begin_inset Quotes erd
-\end_inset
-
- then we would not need a sync at all: existing processes would pick up
- the new hash table and free list and work with that.
-\end_layout
-
-\begin_layout Standard
-At some later point, a sync would allow recovery of the old data into the
- free lists (perhaps when the array of top-level pointers filled).
- On crash, tdb_open() would examine the array of top levels, and apply the
- transactions until it encountered an invalid checksum.
-\end_layout
-
-\begin_layout Subsection
-Tracing Is Fragile, Replay Is External
-\end_layout
-
-\begin_layout Standard
-The current TDB has compile-time-enabled tracing code, but it often breaks
- as it is not enabled by default.
- In a similar way, the ctdb code has an external wrapper which does replay
- tracing so it can coordinate cluster-wide transactions.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\begin_inset CommandInset label
-LatexCommand label
-name "replay-attribute"
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Standard
-Tridge points out that an attribute can be later added to tdb_open (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "attributes"
-
-\end_inset
-
-) to provide replay/trace hooks, which could become the basis for this and
- future parallel transactions and snapshot support.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\end_body
-\end_document
+++ /dev/null
-NTDB: Redesigning The Trivial DataBase
-
-Rusty Russell, IBM Corporation
-
-19 June 2012
-
-Abstract
-
-The Trivial DataBase on-disk format is 32 bits; with usage cases
-heading towards the 4G limit, that must change. This required
-breakage provides an opportunity to revisit TDB's other design
-decisions and reassess them.
-
-1 Introduction
-
-The Trivial DataBase was originally written by Andrew Tridgell as
-a simple key/data pair storage system with the same API as dbm,
-but allowing multiple readers and writers while being small
-enough (< 1000 lines of C) to include in SAMBA. The simple design
-created in 1999 has proven surprisingly robust and performant,
-used in Samba versions 3 and 4 as well as numerous other
-projects. Its useful life was greatly increased by the
-(backwards-compatible!) addition of transaction support in 2005.
-
-The wider variety and greater demands of TDB-using code have led
-to some organic growth of the API, as well as some compromises on
-the implementation. None of these, by themselves, are seen as
-show-stoppers, but the cumulative effect is a loss of elegance
-over the initial, simple TDB implementation. Here is a table of
-the approximate number of lines of implementation code and number
-of API functions at the end of each year:
-
-
-+-----------+----------------+--------------------------------+
-| Year End | API Functions | Lines of C Code Implementation |
-+-----------+----------------+--------------------------------+
-+-----------+----------------+--------------------------------+
-| 1999 | 13 | 1195 |
-+-----------+----------------+--------------------------------+
-| 2000 | 24 | 1725 |
-+-----------+----------------+--------------------------------+
-| 2001 | 32 | 2228 |
-+-----------+----------------+--------------------------------+
-| 2002 | 35 | 2481 |
-+-----------+----------------+--------------------------------+
-| 2003 | 35 | 2552 |
-+-----------+----------------+--------------------------------+
-| 2004 | 40 | 2584 |
-+-----------+----------------+--------------------------------+
-| 2005 | 38 | 2647 |
-+-----------+----------------+--------------------------------+
-| 2006 | 52 | 3754 |
-+-----------+----------------+--------------------------------+
-| 2007 | 66 | 4398 |
-+-----------+----------------+--------------------------------+
-| 2008 | 71 | 4768 |
-+-----------+----------------+--------------------------------+
-| 2009 | 73 | 5715 |
-+-----------+----------------+--------------------------------+
-
-
-This review is an attempt to catalog and address all the known
-issues with TDB and create solutions which address the problems
-without significantly increasing complexity; all involved are far
-too aware of the dangers of second system syndrome in rewriting a
-successful project like this.
-
-Note: the final decision was to make ntdb a separate library,
-with a separate 'ntdb' namespace so both can potentially be
-linked together. This document still refers to “tdb” everywhere,
-for simplicity.
-
-2 API Issues
-
-2.1 tdb_open_ex Is Not Expandable
-
-The tdb_open() call was expanded to tdb_open_ex(), which added an
-optional hashing function and an optional logging function
-argument. Additional arguments to open would require the
-introduction of a tdb_open_ex2 call etc.
-
-2.1.1 Proposed Solution<attributes>
-
-tdb_open() will take a linked-list of attributes:
-
-enum tdb_attribute {
-	TDB_ATTRIBUTE_LOG = 0,
-	TDB_ATTRIBUTE_HASH = 1
-};
-
-struct tdb_attribute_base {
-	enum tdb_attribute attr;
-	union tdb_attribute *next;
-};
-
-struct tdb_attribute_log {
-	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
-	tdb_log_func log_fn;
-	void *log_private;
-};
-
-struct tdb_attribute_hash {
-	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
-	tdb_hash_func hash_fn;
-	void *hash_private;
-};
-
-union tdb_attribute {
-	struct tdb_attribute_base base;
-	struct tdb_attribute_log log;
-	struct tdb_attribute_hash hash;
-};
-
-This allows future attributes to be added, even if this expands
-the size of the union.
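-
-As an illustrative sketch (not taken from the final header), a
-caller chaining a logging attribute and a hash attribute might
-look like this, assuming tdb_open() gains a trailing attribute
-argument and my_log_fn/my_hash_fn are the caller's own callbacks:
-
-struct tdb_context *tdb;
-union tdb_attribute log_attr, hash_attr;
-
-log_attr.base.attr = TDB_ATTRIBUTE_LOG;
-log_attr.base.next = &hash_attr;
-log_attr.log.log_fn = my_log_fn;
-log_attr.log.log_private = NULL;
-
-hash_attr.base.attr = TDB_ATTRIBUTE_HASH;
-hash_attr.base.next = NULL;
-hash_attr.hash.hash_fn = my_hash_fn;
-hash_attr.hash.hash_private = NULL;
-
-tdb = tdb_open("example.tdb", TDB_DEFAULT, O_RDWR|O_CREAT, 0600,
-	       &log_attr);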
-
-2.1.2 Status
-
-Complete.
-
-2.2 tdb_traverse Makes Impossible Guarantees
-
-tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
-and it was thought that it was important to guarantee that all
-records which exist at the start and end of the traversal would
-be included, and no record would be included twice.
-
-This adds complexity (see [Reliable-Traversal-Adds]) and does not
-work anyway for records which are altered (in particular, those
-which are expanded may be effectively deleted and re-added behind
-the traversal).
-
-2.2.1 <traverse-Proposed-Solution>Proposed Solution
-
-Abandon the guarantee. You will see every record if no changes
-occur during your traversal, otherwise you will see some subset.
-You can prevent changes by using a transaction or the locking
-API.
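-
-As a sketch (using the current API names, error handling
-omitted, and count_record_fn standing in for the caller's own
-callback), a caller wanting the old every-record guarantee could
-simply hold the all-record read lock around the traversal:
-
-if (tdb_lockall_read(tdb) == 0) {
-	tdb_traverse_read(tdb, count_record_fn, &count);
-	tdb_unlockall_read(tdb);
-}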
-
-2.2.2 Status
-
-Complete. Delete-during-traverse will still delete every record,
-too (assuming no other changes).
-
-2.3 Nesting of Transactions Is Fraught
-
-TDB has alternated between allowing nested transactions and not
-allowing them. Various paths in the Samba codebase assume that
-transactions will nest, and in a sense they can: the operation is
-only committed to disk when the outer transaction is committed.
-There are two problems, however:
-
-1. Canceling the inner transaction will cause the outer
- transaction commit to fail, and will not undo any operations
- since the inner transaction began. This problem is soluble with
- some additional internal code.
-
-2. An inner transaction commit can be cancelled by the outer
- transaction. This is desirable in the way which Samba's
- database initialization code uses transactions, but could be a
- surprise to any users expecting a successful transaction commit
- to expose changes to others.
-
-The current solution is to specify the behavior at tdb_open(),
-with the default currently that nested transactions are allowed.
-This flag can also be changed at runtime.
-
-2.3.1 Proposed Solution
-
-Given the usage patterns, it seems that the “least-surprise”
-behavior of disallowing nested transactions should become the
-default. Additionally, it seems the outer transaction is the only
-code which knows whether inner transactions should be allowed, so
-a flag to indicate this could be added to tdb_transaction_start.
-However, this behavior can be simulated with a wrapper which uses
-tdb_add_flags() and tdb_remove_flags(), so the API should not be
-expanded for this relatively-obscure case.
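-
-A minimal sketch of such a wrapper, assuming the runtime flag
-controlling this behavior is TDB_ALLOW_NESTING:
-
-/* Outer code decides whether transactions may nest below it. */
-static int my_transaction_start(struct tdb_context *tdb,
-				bool allow_nesting)
-{
-	if (allow_nesting)
-		tdb_add_flags(tdb, TDB_ALLOW_NESTING);
-	else
-		tdb_remove_flags(tdb, TDB_ALLOW_NESTING);
-	return tdb_transaction_start(tdb);
-}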
-
-2.3.2 Status
-
-Complete; the nesting flag has been removed.
-
-2.4 Incorrect Hash Function is Not Detected
-
-tdb_open_ex() allows the calling code to specify a different hash
-function to use, but does not check that all other processes
-accessing this tdb are using the same hash function. The result
-is that records are missing from tdb_fetch().
-
-2.4.1 Proposed Solution
-
-The header should contain an example hash result (eg. the hash of
-0xdeadbeef), and tdb_open_ex() should check that the given hash
-function produces the same answer, or fail the tdb_open call.
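-
-A sketch of that open-time check; the header field name, the
-sample input, and the hash_fn signature (taking a buffer, length
-and the attribute's private pointer) are illustrative only:
-
-uint8_t sample[] = { 0xde, 0xad, 0xbe, 0xef };
-
-if (hash_fn(sample, sizeof(sample), hash_private)
-    != header.example_hash) {
-	/* Caller's hash disagrees with the one used to create
-	 * the file: refuse to open rather than lose records. */
-	tdb->ecode = TDB_ERR_CORRUPT;
-	goto fail;
-}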
-
-2.4.2 Status
-
-Complete.
-
-2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
-
-In response to scalability issues with the free list
-([TDB-Freelist-Is]), two API workarounds have been incorporated
-in TDB: tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open.
-The latter actually calls the former with an argument of “5”.
-
-This code allows deleted records to accumulate without putting
-them in the free list. On delete we iterate through each chain
-and free them in a batch if there are more than max_dead entries.
-These are never otherwise recycled except as a side-effect of a
-tdb_repack.
-
-2.5.1 Proposed Solution
-
-With the scalability problems of the freelist solved, this API
-can be removed. The TDB_VOLATILE flag may still be useful as a
-hint that store and delete of records will be at least as common
-as fetch in order to allow some internal tuning, but initially
-will become a no-op.
-
-2.5.2 Status
-
-Complete. Unknown flags cause tdb_open() to fail as well, so they
-can be detected at runtime.
-
-2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
- In The Same Process
-
-No process can open the same TDB twice; we check and disallow it.
-This is an unfortunate side-effect of fcntl locks, which operate
-on a per-file rather than per-file-descriptor basis, and do not
-nest. Thus, closing any file descriptor on a file clears all the
-locks obtained by this process, even if they were placed using a
-different file descriptor!
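-
-The underlying POSIX behaviour is easy to demonstrate: the
-close() on the second descriptor silently drops the lock that
-was taken through the first one.
-
-int fd1 = open("example.tdb", O_RDWR);
-int fd2 = open("example.tdb", O_RDWR);
-struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET,
-		    .l_start = 0, .l_len = 1 };
-
-fcntl(fd1, F_SETLKW, &fl);	/* lock byte 0 via fd1 */
-close(fd2);			/* POSIX: this also releases the fd1 lock */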
-
-Note that even if this were solved, deadlock could occur if
-operations were nested: this is a more manageable programming
-error in most cases.
-
-2.6.1 Proposed Solution
-
-We could lobby POSIX to fix the perverse rules, or at least lobby
-Linux to violate them so that the most common implementation does
-not have this restriction. This would be a generally good idea
-for other fcntl lock users.
-
-Samba uses a wrapper which hands out the same tdb_context to
-multiple callers if this happens, and does simple reference
-counting. We should do this inside the tdb library, which already
-emulates lock nesting internally; it would need to recognize when
-deadlock occurs within a single process. This would create a new
-failure mode for tdb operations (while we currently handle
-locking failures, they are impossible in normal use and a process
-encountering them can do little but give up).
-
-I do not see benefit in an additional tdb_open flag to indicate
-whether re-opening is allowed, though there may be some benefit
-in adding a call to detect when a tdb_context is shared, to allow
-others to build such an API on top.
-
-2.6.2 Status
-
-Complete.
-
-2.7 TDB API Is Not POSIX Thread-safe
-
-The TDB API uses an error code which can be queried after an
-operation to determine what went wrong. This programming model
-does not work with threads, unless specific additional guarantees
-are given by the implementation. In addition, even
-otherwise-independent threads cannot open the same TDB (as in
-[TDB-Files-Cannot]).
-
-2.7.1 Proposed Solution
-
-Rearchitecting the API to include a tdb_errcode pointer would be
-a great deal of churn, but fortunately most functions return 0 on
-success and -1 on error: we can change these to return 0 on
-success and a negative error code on error, and the API remains
-similar to before. The tdb_fetch, tdb_firstkey and tdb_nextkey
-functions need to take a TDB_DATA pointer and return an error
-code. It is also simpler to have tdb_nextkey replace its key
-argument in place, freeing up any old .dptr.
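-
-Under that convention a fetch would look roughly like this (a
-sketch, given an open struct tdb_context *tdb; it also assumes
-tdb_errorstr() is changed to take the error code rather than the
-context):
-
-TDB_DATA key, data;
-enum TDB_ERROR err;
-
-key.dptr = (unsigned char *)"hello";
-key.dsize = 5;
-
-err = tdb_fetch(tdb, key, &data);
-if (err == TDB_SUCCESS) {
-	/* use data.dptr/data.dsize, then release it */
-	free(data.dptr);
-} else {
-	fprintf(stderr, "fetch: %s\n", tdb_errorstr(err));
-}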
-
-Internal locking is required to make sure that fcntl locks do not
-overlap between threads, and also that the global list of tdbs is
-maintained.
-
-The aim is that building tdb with -DTDB_PTHREAD will result in a
-pthread-safe version of the library, and otherwise no overhead
-will exist. Alternatively, a hooking mechanism similar to that
-proposed for [Proposed-Solution-locking-hook] could be used to
-enable pthread locking at runtime.
-
-2.7.2 Status
-
-Incomplete; API has been changed but thread safety has not been
-implemented.
-
-2.8 *_nonblock Functions And *_mark Functions Expose
- Implementation
-
-CTDB[footnote:
-Clustered TDB, see http://ctdb.samba.org
-] wishes to operate on TDB in a non-blocking manner. This is
-currently done as follows:
-
-1. Call the _nonblock variant of an API function (eg.
- tdb_lockall_nonblock). If this fails:
-
-2. Fork a child process, and wait for it to call the normal
- variant (eg. tdb_lockall).
-
-3. If the child succeeds, call the _mark variant to indicate we
- already have the locks (eg. tdb_lockall_mark).
-
-4. Upon completion, tell the child to release the locks (eg.
- tdb_unlockall).
-
-5. Indicate to tdb that it should consider the locks removed (eg.
- tdb_unlockall_mark).
-
-There are several issues with this approach. Firstly, adding two
-new variants of each function clutters the API for an obscure
-use, and so not all functions have three variants. Secondly, it
-assumes that all paths of the functions ask for the same locks,
-otherwise the parent process will have to get a lock which the
-child doesn't have under some circumstances. I don't believe this
-is currently the case, but it constrains the implementation.
-
-2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
-
-Implement a hook for locking methods, so that the caller can
-control the calls to create and remove fcntl locks. In this
-scenario, ctdbd would operate as follows:
-
-1. Call the normal API function, eg tdb_lockall().
-
-2. When the lock callback comes in, check if the child has the
- lock. Initially, this is always false. If so, return 0.
- Otherwise, try to obtain it in non-blocking mode. If that
- fails, return EWOULDBLOCK.
-
-3. Release locks in the unlock callback as normal.
-
-4. If tdb_lockall() fails, see if we recorded a lock failure; if
- so, call the child to repeat the operation.
-
-5. The child records what locks it obtains, and returns that
- information to the parent.
-
-6. When the child has succeeded, goto 1.
-
-This is flexible enough to handle any potential locking scenario,
-even when lock requirements change. It can be optimized so that
-the parent does not release locks, just tells the child which
-locks it doesn't need to obtain.
-
-It also keeps the complexity out of the API, and in ctdbd where
-it is needed.
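-
-The hook itself could be little more than a pair of function
-pointers supplied as an attribute at open time; the layout below
-is a sketch, not the final one:
-
-struct tdb_attribute_flock {
-	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_FLOCK */
-	int (*lock)(int fd, int rw, off_t off, off_t len,
-		    bool waitflag, void *data);
-	int (*unlock)(int fd, int rw, off_t off, off_t len, void *data);
-	void *data;
-};
-
-ctdbd's lock callback would then implement steps 2 and 3 above,
-returning EWOULDBLOCK when the lock must be deferred to the
-child.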
-
-2.8.2 Status
-
-Complete.
-
-2.9 tdb_chainlock Functions Expose Implementation
-
-tdb_chainlock locks some number of records, including the record
-indicated by the given key. This gave atomicity guarantees;
-no-one can start a transaction, alter, read or delete that key
-while the lock is held.
-
-It also makes the same guarantee for any other key in the chain,
-which is an internal implementation detail and potentially a
-cause for deadlock.
-
-2.9.1 Proposed Solution
-
-None. It would be nice to have an explicit single entry lock
-which affected no other keys. Unfortunately, this won't work for
-an entry which doesn't exist. Thus while chainlock may be
-implemented more efficiently for the existing case, it will still
-have overlap issues with the non-existing case. So it is best to
-keep the current (lack of) guarantee about which records will be
-affected, to avoid constraining our implementation.
-
-2.10 Signal Handling is Not Race-Free
-
-The tdb_setalarm_sigptr() call allows the caller's signal handler
-to indicate that the tdb locking code should return with a
-failure, rather than trying again when a signal is received (and
-errno == EAGAIN). This is usually used to implement timeouts.
-
-Unfortunately, this does not work in the case where the signal is
-received before the tdb code enters the fcntl() call to place the
-lock: the code will sleep within the fcntl() code, unaware that
-the signal wants it to exit. In the case of long timeouts, this
-does not happen in practice.
-
-2.10.1 Proposed Solution
-
-The locking hooks proposed in [Proposed-Solution-locking-hook]
-would allow the user to decide on whether to fail the lock
-acquisition on a signal. This allows the caller to choose their
-own compromise: they could narrow the race by checking
-immediately before the fcntl call.[footnote:
-It may be possible to make this race-free in some implementations
-by having the signal handler alter the struct flock to make it
-invalid. This will cause the fcntl() lock call to fail with
-EINVAL if the signal occurs before the kernel is entered,
-otherwise EAGAIN.
-]
-
-2.10.2 Status
-
-Complete.
-
-2.11 The API Uses Gratuitous Typedefs, Capitals
-
-typedefs are useful for providing source compatibility when types
-can differ across implementations, or arguably in the case of
-function pointer definitions which are hard for humans to parse.
-Otherwise it is simply obfuscation and pollutes the namespace.
-
-Capitalization is usually reserved for compile-time constants and
-macros.
-
- TDB_CONTEXT There is no reason to use this over 'struct
- tdb_context'; the definition isn't visible to the API user
- anyway.
-
- TDB_DATA There is no reason to use this over struct TDB_DATA;
- the struct needs to be understood by the API user.
-
- struct TDB_DATA This would normally be called 'struct
- tdb_data'.
-
- enum TDB_ERROR Similarly, this would normally be enum
- tdb_error.
-
-2.11.1 Proposed Solution
-
-None. Introducing lower case variants would please pedants like
-myself, but if it were done the existing ones should be kept.
-There is little point forcing a purely cosmetic change upon tdb
-users.
-
-2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The
- Private Pointer
-
-For API compatibility reasons, the logging function needs to call
-tdb_get_logging_private() to retrieve the pointer registered via
-tdb_open_ex for logging.
-
-2.12.1 Proposed Solution
-
-It should simply take an extra argument, since we are prepared to
-break the API/ABI.
-
-2.12.2 Status
-
-Complete.
-
-2.13 Various Callback Functions Are Not Typesafe
-
-The callback functions in tdb_set_logging_function (after
-[tdb_log_func-Doesnt-Take] is resolved), tdb_parse_record,
-tdb_traverse, tdb_traverse_read
-and tdb_check all take void * and must internally convert it to
-the argument type they were expecting.
-
-If this type changes, the compiler will not produce warnings on
-the callers, since it only sees void *.
-
-2.13.1 Proposed Solution
-
-With careful use of macros, we can create callback functions
-which give a warning when used on gcc and the types of the
-callback and its private argument differ. Unsupported compilers
-will not give a warning, which is no worse than now. In addition,
-the callbacks become clearer, as they need not use void * for
-their parameter.
-
-See CCAN's typesafe_cb module at
-http://ccan.ozlabs.org/info/typesafe_cb.html
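-
-As a sketch of the technique (simplified from that module), the
-public entry point becomes a macro which only compiles cleanly
-when the callback's parameter type matches the private pointer
-it is given; the underscore-suffixed untyped variant is a naming
-choice assumed here:
-
-#define tdb_traverse(tdb, fn, private)				\
-	tdb_traverse_(tdb,					\
-		      typesafe_cb_preargs(int, void *,		\
-					  (fn), (private),	\
-					  struct tdb_context *,	\
-					  TDB_DATA, TDB_DATA),	\
-		      (private))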
-
-2.13.2 Status
-
-Complete.
-
-2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
- tdb_reopen_all Problematic
-
-The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB
-file should be cleared if the caller discovers it is the only
-process with the TDB open. However, if any caller does not
-specify TDB_CLEAR_IF_FIRST it will not be detected, so will have
-the TDB erased underneath them (usually resulting in a crash).
-
-There is a similar issue on fork(); if the parent exits (or
-otherwise closes the tdb) before the child calls tdb_reopen_all()
-to establish the lock used to indicate the TDB is opened by
-someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe
-it alone has opened the TDB and will erase it.
-
-2.14.1 Proposed Solution
-
-Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
-see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
-
-2.14.2 Status
-
-Complete. An open hook is provided to replicate this
-functionality if required.
-
-2.15 Extending The Header Is Difficult
-
-We have reserved (zeroed) words in the TDB header, which can be
-used for future features. If the future features are compulsory,
-the version number must be updated to prevent old code from
-accessing the database. But if the future feature is optional, we
-have no way of telling if older code is accessing the database or
-not.
-
-2.15.1 Proposed Solution
-
-The header should contain a “format variant” value (64-bit). This
-is divided into two 32-bit parts:
-
-1. The lower part reflects the format variant understood by code
- accessing the database.
-
-2. The upper part reflects the format variant you must understand
- to write to the database (otherwise you can only open for
- reading).
-
-The latter field can only be written at creation time, the former
-should be written under the OPEN_LOCK when opening the database
-for writing, if the variant of the code is lower than the current
-lowest variant.
-
-This should allow backwards-compatible features to be added, and
-detection if older code (which doesn't understand the feature)
-writes to the database.
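-
-A sketch of the open-for-write logic this implies (field and
-helper names here are illustrative, not the actual ones):
-
-uint32_t writer_variant = header.format_variant >> 32;
-uint32_t reader_variant = header.format_variant & 0xffffffff;
-
-if (MY_VARIANT < writer_variant) {
-	/* We don't understand enough to write safely. */
-	read_only = true;
-} else if (MY_VARIANT < reader_variant) {
-	/* Under the OPEN_LOCK: record that an older writer is
-	 * active, so optional features may not be maintained. */
-	reader_variant = MY_VARIANT;
-	write_format_variant(tdb, writer_variant, reader_variant);
-}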
-
-2.15.2 Status
-
-Complete.
-
-2.16 Record Headers Are Not Expandable
-
-If we later want to add (say) checksums on keys and data, it
-would require another format change, which we'd like to avoid.
-
-2.16.1 Proposed Solution
-
-We often have extra padding at the tail of a record. If we ensure
-that the first byte (if any) of this padding is zero, we will
-have a way for future changes to detect code which doesn't
-understand a new format: the new code would write (say) a 1 at
-the tail, and thus if there is no tail or the first byte is 0, we
-would know the extension is not present on that record.
-
-2.16.2 Status
-
-Complete.
-
-2.17 TDB Does Not Use Talloc
-
-Many users of TDB (particularly Samba) use the talloc allocator,
-and thus have to wrap TDB in a talloc context to use it
-conveniently.
-
-2.17.1 Proposed Solution
-
-The allocation within TDB is not complicated enough to justify
-the use of talloc, and I am reluctant to force another
-(excellent) library on TDB users. Nonetheless a compromise is
-possible. An attribute (see [attributes]) can be added later to
-tdb_open() to provide an alternate allocation mechanism,
-specifically for talloc but usable by any other allocator (which
-would ignore the “context” argument).
-
-This would form a talloc hierarchy as expected, but the caller
-would still have to attach a destructor to the tdb context
-returned from tdb_open to close it. All TDB_DATA fields would be
-children of the tdb_context, and the caller would still have to
-manage them (using talloc_free() or talloc_steal()).
-
-2.17.2 Status
-
-Complete, using the NTDB_ATTRIBUTE_ALLOCATOR attribute.
-
-3 Performance And Scalability Issues
-
-3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
- Imposes Performance Penalty
-
-When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is
-placed at offset 4 (aka. the ACTIVE_LOCK). While these locks
-never conflict in normal tdb usage, they do add substantial
-overhead for most fcntl lock implementations when the kernel
-scans to detect if a lock conflict exists. This is often a single
-linked list, making the time to acquire and release a fcntl lock
-O(N) where N is the number of processes with the TDB open, not
-the number actually doing work.
-
-In a Samba server it is common to have huge numbers of clients
-sitting idle, and thus Samba has weaned itself off the
-TDB_CLEAR_IF_FIRST flag.[footnote:
-There is a flag to tdb_reopen_all() which is used for this
-optimization: if the parent process will outlive the child, the
-child does not need the ACTIVE_LOCK. This is a workaround for
-this very performance issue.
-]
-
-3.1.1 Proposed Solution
-
-Remove the flag. It was a neat idea, but even trivial servers
-tend to know when they are initializing for the first time and
-can simply unlink the old tdb at that point.
-
-3.1.2 Status
-
-Complete.
-
-3.2 TDB Files Have a 4G Limit
-
-This seems to be becoming an issue (so much for “trivial”!),
-particularly for ldb.
-
-3.2.1 Proposed Solution
-
-A new, incompatible TDB format which uses 64 bit offsets
-internally rather than 32 bit as now. For simplicity of endian
-conversion (which TDB does on the fly if required), all values
-will be 64 bit on disk. In practice, some upper bits may be used
-for other purposes, but at least 56 bits will be available for
-file offsets.
-
-tdb_open() will automatically detect the old version, and even
-create old-version files if TDB_VERSION6 is specified to tdb_open.
-
-32 bit processes will still be able to access TDBs larger than 4G
-(assuming that their off_t allows them to seek to 64 bits); they
-will gracefully fall back as they fail to mmap. This can happen
-already with large TDBs.
-
-Old versions of tdb will fail to open the new TDB files (since 28
-August 2009, commit 398d0c29290: prior to that any unrecognized
-file format would be erased and initialized as a fresh tdb!)
-
-3.2.2 Status
-
-Complete.
-
-3.3 TDB Records Have a 4G Limit
-
-This has not been a reported problem, and the API uses size_t
-which can be 64 bit on 64 bit platforms. However, other limits
-may have made such an issue moot.
-
-3.3.1 Proposed Solution
-
-Record sizes will be 64 bit, with an error returned on 32 bit
-platforms which try to access such records (the current
-implementation would return TDB_ERR_OOM in a similar case). It
-seems unlikely that 32 bit keys will be a limitation, so the
-implementation may not support this (see [sub:Records-Incur-A]).
-
-3.3.2 Status
-
-Complete.
-
-3.4 Hash Size Is Determined At TDB Creation Time
-
-TDB contains a number of hash chains in the header; the number is
-specified at creation time, and defaults to 131. This is such a
-bottleneck on large databases (as each hash chain gets quite
-long), that LDB uses 10,000 for this hash. In general it is
-impossible to know what the 'right' answer is at database
-creation time.
-
-3.4.1 <sub:Hash-Size-Solution>Proposed Solution
-
-After comprehensive performance testing on various scalable hash
-variants[footnote:
-http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94
-This was annoying because I was previously convinced that an
-expanding tree of hashes would be very close to optimal.
-], it became clear that it is hard to beat a straight linear hash
-table which doubles in size when it reaches saturation.
-Unfortunately, altering the hash table introduces serious locking
-complications: the entire hash table needs to be locked to
-enlarge the hash table, and others might be holding locks.
-Particularly insidious are insertions done under tdb_chainlock.
-
-Thus an expanding layered hash will be used: an array of hash
-groups, with each hash group exploding into pointers to lower
-hash groups once it fills, turning into a hash tree. This has
-implications for locking: we must lock the entire group in case
-we need to expand it, yet we don't know how deep the tree is at
-that point.
-
-Note that bits from the hash table entries should be stolen to
-hold more hash bits to reduce the penalty of collisions. We can
-use the otherwise-unused lower 3 bits. If we limit the size of
-the database to 2^56 bytes (64 PiB), we can use the top 8 bits
-of the hash entry as well. These 11 bits would reduce false
-positives down to 1 in 2000, which is more than we need: we can
-use one of the bits to indicate that the extra hash bits are
-valid. This
-means we can choose not to re-hash all entries when we expand a
-hash group; simply use the next bits we need and mark them
-invalid.
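-
-A sketch of the resulting hash-table entry packing (the exact
-bit positions chosen here are illustrative):
-
-/* 56-bit offset; top 8 bits and bottom 3 bits carry extra hash,
- * one of which marks the extra hash bits as valid. */
-#define OFF_MASK	0x00fffffffffffff8ULL
-
-static inline uint64_t make_entry(uint64_t off, uint32_t hash)
-{
-	return (off & OFF_MASK)
-		| ((uint64_t)(hash >> 24) << 56)	/* top 8 hash bits */
-		| 4					/* extra hash valid */
-		| (hash & 3);				/* 2 more hash bits */
-}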
-
-3.4.2 Status
-
-Ignore. Scaling the hash automatically proved inefficient at
-small hash sizes; we default to an 8192-element hash (changeable
-via NTDB_ATTRIBUTE_HASHSIZE), and when buckets clash we expand to
-an array of hash entries. This scales slightly better than the
-tdb chain (due to the 8 top bits containing extra hash).
-
-3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
-
-TDB uses a single linked list for the free list. Allocation
-occurs as follows, using heuristics which have evolved over time:
-
-1. Get the free list lock for this whole operation.
-
-2. Multiply length by 1.25, so we always over-allocate by 25%.
-
-3. Set the slack multiplier to 1.
-
-4. Examine the current freelist entry: if it is > length but <
- the current best case, remember it as the best case.
-
-5. Multiply the slack multiplier by 1.05.
-
-6. If our best fit so far is less than length * slack multiplier,
- return it. The slack will be turned into a new free record if
- it's large enough.
-
-7. Otherwise, go onto the next freelist entry.
-
-Deleting a record occurs as follows:
-
-1. Lock the hash chain for this whole operation.
-
-2. Walk the chain to find the record, keeping the prev pointer
- offset.
-
-3. If max_dead is non-zero:
-
- (a) Walk the hash chain again and count the dead records.
-
- (b) If it's more than max_dead, bulk free all the dead ones
- (similar to steps 4 and below, but the lock is only obtained
- once).
-
- (c) Simply mark this record as dead and return.
-
-4. Get the free list lock for the remainder of this operation.
-
-5. <right-merging>Examine the following block to see if it is
- free; if so, enlarge the current block and remove that block
- from the free list. This was disabled, as removal from the free
- list was O(entries-in-free-list).
-
-6. Examine the preceding block to see if it is free: for this
- reason, each block has a 32-bit tailer which indicates its
- length. If it is free, expand it to cover our new block and
- return.
-
-7. Otherwise, prepend ourselves to the free list.
-
-Disabling right-merging (step [right-merging]) causes
-fragmentation; the other heuristics proved insufficient to
-address this, so the final answer to this was that when we expand
-the TDB file inside a transaction commit, we repack the entire
-tdb.
-
-The single list lock limits our allocation rate; due to the other
-issues this is not currently seen as a bottleneck.
-
-3.5.1 Proposed Solution
-
-The first step is to remove all the current heuristics, as they
-obviously interact, then examine them once the lock contention is
-addressed.
-
-The free list must be split to reduce contention. Assuming
-perfect free merging, we can at most have 1 free list entry for
-each entry. This implies that the number of free lists is related
-to the size of the hash table, but as it is rare to walk a large
-number of free list entries we can use far fewer, say 1/32 of the
-number of hash buckets.
-
-It seems tempting to try to reuse the hash implementation which
-we use for records here, but we have two ways of searching for
-free entries: for allocation we search by size (and possibly
-zone) which produces too many clashes for our hash table to
-handle well, and for coalescing we search by address. Thus an
-array of doubly-linked free lists seems preferable.
-
-There are various benefits in using per-size free lists (see
-[sub:TDB-Becomes-Fragmented]) but it's not clear this would
-reduce contention in the common
-case where all processes are allocating/freeing the same size.
-Thus we almost certainly need to divide in other ways: the most
-obvious is to divide the file into zones, and using a free list
-(or table of free lists) for each. This approximates address
-ordering.
-
-Unfortunately it is difficult to know what heuristics should be
-used to determine zone sizes, and our transaction code relies on
-being able to create a “recovery area” by simply appending to the
-file (difficult if it would need to create a new zone header).
-Thus we use a linked-list of free tables; currently we only ever
-create one, but if there is more than one we choose one at random
-to use. In future we may use heuristics to add new free tables on
-contention. We only expand the file when all free tables are
-exhausted.
-
-The basic algorithm is as follows. Freeing is simple:
-
-1. Identify the correct free list.
-
-2. Lock the corresponding list.
-
-3. Re-check the list (we didn't have a lock, sizes could have
- changed): relock if necessary.
-
-4. Place the freed entry in the list.
-
-Allocation is a little more complicated, as we perform delayed
-coalescing at this point:
-
-1. Pick a free table; usually the previous one.
-
-2. Lock the corresponding list.
-
-3. If the top entry is large enough, remove it from the list and
- return it.
-
-4. Otherwise, coalesce entries in the list. If there was no entry
- large enough, unlock the list and try the next largest list.
-
-5. If no list has an entry which meets our needs, try the next
- free table.
-
-6. If no zone satisfies, expand the file.
-
-This optimizes rapid insert/delete of free list entries by not
-coalescing them all the time. First-fit address ordering
-seems to be fairly good for keeping fragmentation low
-(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
-does not need a tailer to coalesce, though if we needed one we
-could have one cheaply: see [sub:Records-Incur-A].
-
-Each free entry has the free table number in the header: less
-than 255. It also contains a doubly-linked list for easy
-deletion.
-
-3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
-
-Much of this is a result of allocation strategy[footnote:
-The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
-ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
-] and deliberate hobbling of coalescing; internal fragmentation
-(aka overallocation) is deliberately set at 25%, and external
-fragmentation is only cured by the decision to repack the entire
-db when a transaction commit needs to enlarge the file.
-
-3.6.1 Proposed Solution
-
-The 25% overhead on allocation works in practice for ldb because
-indexes tend to expand by one record at a time. This internal
-fragmentation can be resolved by having an “expanded” bit in the
-header to note entries that have previously expanded, and
-allocating more space for them.
-
-There is a spectrum of possible solutions for external
-fragmentation: one is to use a fragmentation-avoiding allocation
-strategy such as a best-fit address-order allocator. The other
-end of the spectrum would be to use a bump allocator (very fast
-and simple) and simply repack the file when we reach the end.
-
-There are three problems with efficient fragmentation-avoiding
-allocators: they are non-trivial, they tend to use a single free
-list for each size, and there's no evidence that tdb allocation
-patterns will match those recorded for general allocators (though
-it seems likely).
-
-Thus we don't spend too much effort on external fragmentation; we
-will be no worse than the current code if we need to repack on
-occasion. More effort is spent on reducing freelist contention,
-and reducing overhead.
-
-3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
-
-Each TDB record has a header as follows:
-
-struct tdb_record {
-	tdb_off_t next;      /* offset of the next record in the list */
-	tdb_len_t rec_len;   /* total byte length of record */
-	tdb_len_t key_len;   /* byte length of key */
-	tdb_len_t data_len;  /* byte length of data */
-	uint32_t full_hash;  /* the full 32 bit hash of the key */
-	uint32_t magic;      /* try to catch errors */
-	/* the following union is implied:
-	   union {
-		char record[rec_len];
-		struct {
-			char key[key_len];
-			char data[data_len];
-		}
-		uint32_t totalsize; (tailer)
-	   }
-	*/
-};
-
-Naively, this would double to a 56-byte overhead on a 64 bit
-implementation.
-
-3.7.1 Proposed Solution
-
-We can use various techniques to reduce this for an allocated
-block:
-
-1. The 'next' pointer is not required, as we are using a flat
- hash table.
-
-2. 'rec_len' can instead be expressed as an addition to key_len
- and data_len (it accounts for wasted or overallocated length in
- the record). Since the record length is always a multiple of 8,
- we can conveniently fit it in 32 bits (representing up to 35
- bits).
-
-3. 'key_len' and 'data_len' can be reduced. I'm unwilling to
- restrict 'data_len' to 32 bits, but instead we can combine the
- two into one 64-bit field and use a 5 bit value which
- indicates at what bit to divide the two. Keys are unlikely to
- scale as fast as data, so I'm assuming a maximum key size of 32
- bits.
-
-4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
- this is diminishing returns after a handful of bits (at 10
- bits, it reduces 99.9% of false memcmp). As an aside, as the
- lower bits are already incorporated in the hash table
- resolution, the upper bits should be used here. Note that it's
- not clear that these bits will be a win, given the extra bits
- in the hash table itself (see [sub:Hash-Size-Solution]).
-
-5. 'magic' does not need to be enlarged: it currently reflects
- one of 5 values (used, free, dead, recovery, and
- unused_recovery). It is useful for quick sanity checking
- however, and should not be eliminated.
-
-6. 'tailer' is only used to coalesce free blocks (so a block to
- the right can find the header to check if this block is free).
- This can be replaced by a single 'free' bit in the header of
- the following block (and the tailer only exists in free
- blocks).[footnote:
-This technique from Thomas Standish. Data Structure Techniques.
-Addison-Wesley, Reading, Massachusetts, 1980.
-] The current proposed coalescing algorithm doesn't need this,
- however.
-
-This produces a 16 byte used header like this:
-
-struct tdb_used_record {
-	uint32_t used_magic : 16,
-		 key_data_divide : 5,
-		 top_hash : 11;
-	uint32_t extra_octets;
-	uint64_t key_and_data_len;
-};
-
-And a free record like this:
-
-struct tdb_free_record {
-	uint64_t free_magic : 8,
-		 prev : 56;
-	uint64_t free_table : 8,
-		 total_length : 56;
-	uint64_t next;
-};
-
-Note that by limiting valid offsets to 56 bits, we can pack
-everything we need into 3 64-bit words, meaning our minimum
-record size is 8 bytes.
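-
-One possible decoding of the combined length field, assuming the
-divide value gives the number of low bits holding the key length
-(illustrative only; not the final accessors):
-
-static inline uint64_t rec_key_length(const struct tdb_used_record *r)
-{
-	return r->key_and_data_len & ((1ULL << r->key_data_divide) - 1);
-}
-
-static inline uint64_t rec_data_length(const struct tdb_used_record *r)
-{
-	return r->key_and_data_len >> r->key_data_divide;
-}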
-
-3.7.2 Status
-
-Complete.
-
-3.8 Transaction Commit Requires 4 fdatasync
-
-The current transaction algorithm is:
-
-1. write_recovery_data();
-
-2. sync();
-
-3. write_recovery_header();
-
-4. sync();
-
-5. overwrite_with_new_data();
-
-6. sync();
-
-7. remove_recovery_header();
-
-8. sync();
-
-On current ext3, each sync flushes all data to disk, so the next
-3 syncs are relatively cheap. But this could become a
-performance bottleneck on other filesystems such as ext4.
-
-3.8.1 Proposed Solution
-
-Neil Brown points out that this is overzealous, and only one sync
-is needed:
-
-1. Bundle the recovery data, a transaction counter and a strong
- checksum of the new data.
-
-2. Strong checksum that whole bundle.
-
-3. Store the bundle in the database.
-
-4. Overwrite the oldest of the two recovery pointers in the
- header (identified using the transaction counter) with the
- offset of this bundle.
-
-5. sync.
-
-6. Write the new data to the file.
-
-Checking for recovery means identifying the latest bundle with a
-valid checksum and using the new data checksum to ensure that it
-has been applied. This is more expensive than the current check,
-but need only be done at open. For running databases, a separate
-header field can be used to indicate a transaction in progress;
-we need only check for recovery if this is set.
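-
-As a sketch, the bundle written in step 3 could be laid out like
-this (names and field sizes are illustrative):
-
-struct recovery_bundle {
-	uint64_t transaction_counter;
-	uint64_t new_data_checksum;	/* strong checksum of the new data */
-	uint64_t bundle_checksum;	/* strong checksum of this bundle */
-	uint64_t recovery_len;		/* length of old data that follows */
-	/* old (recovery) data follows */
-};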
-
-3.8.2 Status
-
-Deferred.
-
-3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
-
-3.9.1 Proposed Solution
-
-None. At some point you say “use a real database” (but see
-[replay-attribute]).
-
-But as a thought experiment, if we implemented transactions to
-only overwrite free entries (this is tricky: there must not be a
-header in each entry which indicates whether it is free, but use
-of presence in metadata elsewhere), and a pointer to the hash
-table, we could create an entirely new commit without destroying
-existing data. Then it would be easy to implement snapshots in a
-similar way.
-
-This would not allow arbitrary changes to the database, such as
-tdb_repack does, and would require more space (since we have to
-preserve the current and future entries at once). If we used hash
-trees rather than one big hash table, we might only have to
-rewrite some sections of the hash, too.
-
-We could then implement snapshots using a similar method, using
-multiple different hash tables/free tables.
-
-3.9.2 Status
-
-Deferred.
-
-3.10 Transactions Cannot Operate in Parallel
-
-This would be useless for ldb, as it hits the index records with
-just about every update. It would add significant complexity in
-resolving clashes, and cause all transaction callers to write
-their code to loop in the case where the transactions spuriously
-failed.
-
-3.10.1 Proposed Solution
-
-None (but see [replay-attribute]). We could solve a small part of
-the problem by providing read-only transactions. These would
-allow one write transaction to begin, but it could not commit
-until all r/o transactions are done. This would require a new
-RO_TRANSACTION_LOCK, which would be upgraded on commit.
-
-3.10.2 Status
-
-Deferred.
-
-3.11 Default Hash Function Is Suboptimal
-
-The Knuth-inspired multiplicative hash used by tdb is fairly slow
-(especially if we expand it to 64 bits), and works best when the
-hash bucket size is a prime number (which also means a slow
-modulus). In addition, it is highly predictable which could
-potentially lead to a Denial of Service attack in some TDB uses.
-
-3.11.1 Proposed Solution
-
-The Jenkins lookup3 hash[footnote:
-http://burtleburtle.net/bob/c/lookup3.c
-] is a fast and superbly-mixing hash. It's used by the Linux
-kernel and almost everything else. This has the particular
-properties that it takes an initial seed, and produces two 32 bit
-hash numbers, which we can combine into a 64-bit hash.
-
-The seed should be created at tdb-creation time from some random
-source, and placed in the header. This is far from foolproof, but
-adds a little bit of protection against hash bombing.
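-
-A sketch of combining the two 32-bit results, using the
-hashlittle2() entry point from lookup3.c and a 64-bit seed taken
-from the header:
-
-static uint64_t tdb_hash64(const void *key, size_t len, uint64_t seed)
-{
-	uint32_t pc = (uint32_t)seed, pb = (uint32_t)(seed >> 32);
-
-	hashlittle2(key, len, &pc, &pb);
-	return ((uint64_t)pb << 32) | pc;
-}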
-
-3.11.2 Status
-
-Complete.
-
-3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
-
-We lock a record during traversal iteration, and try to grab that
-lock in the delete code. If that grab on delete fails, we simply
-mark it deleted and continue onwards; traversal checks for this
-condition and does the delete when it moves off the record.
-
-If traversal terminates, the dead record may be left
-indefinitely.
-
-3.12.1 Proposed Solution
-
-Remove reliability guarantees; see [traverse-Proposed-Solution].
-
-3.12.2 Status
-
-Complete.
-
-3.13 Fcntl Locking Adds Overhead
-
-Placing a fcntl lock means a system call, as does removing one.
-This is actually one reason why transactions can be faster
-(everything is locked once at transaction start). In the
-uncontended case, this overhead can theoretically be eliminated.
-
-3.13.1 Proposed Solution
-
-None.
-
-We tried this before with spinlock support, in the early days of
-TDB, and it didn't make much difference except in manufactured
-benchmarks.
-
-We could use spinlocks (with futex kernel support under Linux),
-but it means that we lose automatic cleanup when a process dies
-with a lock. There is a method of auto-cleanup under Linux, but
-it's not supported by other operating systems. We could
-reintroduce a clear-if-first-style lock and sweep for dead
-futexes on open, but that wouldn't help the normal case of one
-concurrent opener dying. Increasingly elaborate repair schemes
-could be considered, but they require an ABI change (everyone
-must use them) anyway, so there's no need to do this at the same
-time as everything else.
-
-3.14 Some Transactions Don't Require Durability
-
-Volker points out that gencache uses a CLEAR_IF_FIRST tdb for
-normal (fast) usage, and occasionally empties the results into a
-transactional TDB. This kind of usage prioritizes performance
-over durability: as long as we are consistent, data can be lost.
-
-This would be more neatly implemented inside tdb: a “soft”
-transaction commit (ie. syncless) which meant that data may be
-reverted on a crash.
-
-3.14.1 Proposed Solution
-
-None.
-
-Unfortunately any transaction scheme which overwrites old data
-requires a sync before that overwrite to avoid the possibility of
-corruption.
-
-It seems possible to use a scheme similar to that described in
-[sub:TDB-Does-Not], where transactions are committed without
-overwriting existing data, and an array of top-level pointers is
-available in the header. If the transaction is “soft” then we
-would not need a sync at all: existing processes would pick up
-the new hash table and free list and work with that.
-
-At some later point, a sync would allow recovery of the old data
-into the free lists (perhaps when the array of top-level pointers
-filled). On crash, tdb_open() would examine the array of top
-levels, and apply the transactions until it encountered an
-invalid checksum.
-
-3.15 Tracing Is Fragile, Replay Is External
-
-The current TDB has compile-time-enabled tracing code, but it
-often breaks as it is not enabled by default. In a similar way,
-the ctdb code has an external wrapper which does replay tracing
-so it can coordinate cluster-wide transactions.
-
-3.15.1 Proposed Solution<replay-attribute>
-
-Tridge points out that an attribute can be later added to
-tdb_open (see [attributes]) to provide replay/trace hooks, which
-could become the basis for this and future parallel transactions
-and snapshot support.
-
-3.15.2 Status
-
-Deferred.
+++ /dev/null
- /*
- Trivial Database 2: free list/block handling
- Copyright (C) Rusty Russell 2010
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <ccan/likely/likely.h>
-#include <ccan/ilog/ilog.h>
-#include <time.h>
-#include <limits.h>
-
-static unsigned fls64(uint64_t val)
-{
- return ilog64(val);
-}
-
-/* In which bucket would we find a particular record size? (ignoring header) */
-unsigned int size_to_bucket(ntdb_len_t data_len)
-{
- unsigned int bucket;
-
- /* We can't have records smaller than this. */
- assert(data_len >= NTDB_MIN_DATA_LEN);
-
- /* Ignoring the header... */
- if (data_len - NTDB_MIN_DATA_LEN <= 64) {
- /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
- bucket = (data_len - NTDB_MIN_DATA_LEN) / 8;
- } else {
- /* After that we go power of 2. */
- bucket = fls64(data_len - NTDB_MIN_DATA_LEN) + 2;
- }
-
- if (unlikely(bucket >= NTDB_FREE_BUCKETS))
- bucket = NTDB_FREE_BUCKETS - 1;
- return bucket;
-}
-
-ntdb_off_t first_ftable(struct ntdb_context *ntdb)
-{
- return ntdb_read_off(ntdb, offsetof(struct ntdb_header, free_table));
-}
-
-ntdb_off_t next_ftable(struct ntdb_context *ntdb, ntdb_off_t ftable)
-{
- return ntdb_read_off(ntdb, ftable + offsetof(struct ntdb_freetable,next));
-}
-
-enum NTDB_ERROR ntdb_ftable_init(struct ntdb_context *ntdb)
-{
- /* Use reservoir sampling algorithm to select a free list at random. */
- unsigned int rnd, max = 0, count = 0;
- ntdb_off_t off;
-
- ntdb->ftable_off = off = first_ftable(ntdb);
- ntdb->ftable = 0;
-
- while (off) {
- if (NTDB_OFF_IS_ERR(off)) {
- return NTDB_OFF_TO_ERR(off);
- }
-
- rnd = random();
- if (rnd >= max) {
- ntdb->ftable_off = off;
- ntdb->ftable = count;
- max = rnd;
- }
-
- off = next_ftable(ntdb, off);
- count++;
- }
- return NTDB_SUCCESS;
-}
-
-/* Offset of a given bucket. */
-ntdb_off_t bucket_off(ntdb_off_t ftable_off, unsigned bucket)
-{
- return ftable_off + offsetof(struct ntdb_freetable, buckets)
- + bucket * sizeof(ntdb_off_t);
-}
-
-/* Returns free_buckets + 1, or list number to search, or -ve error. */
-static ntdb_off_t find_free_head(struct ntdb_context *ntdb,
- ntdb_off_t ftable_off,
- ntdb_off_t bucket)
-{
- /* Speculatively search for a non-zero bucket. */
- return ntdb_find_nonzero_off(ntdb, bucket_off(ftable_off, 0),
- bucket, NTDB_FREE_BUCKETS);
-}
-
-static void check_list(struct ntdb_context *ntdb, ntdb_off_t b_off)
-{
-#ifdef CCAN_NTDB_DEBUG
- ntdb_off_t off, prev = 0, first;
- struct ntdb_free_record r;
-
- first = off = (ntdb_read_off(ntdb, b_off) & NTDB_OFF_MASK);
- while (off != 0) {
- ntdb_read_convert(ntdb, off, &r, sizeof(r));
- if (frec_magic(&r) != NTDB_FREE_MAGIC)
- abort();
- if (prev && frec_prev(&r) != prev)
- abort();
- prev = off;
- off = r.next;
- }
-
- if (first) {
- ntdb_read_convert(ntdb, first, &r, sizeof(r));
- if (frec_prev(&r) != prev)
- abort();
- }
-#endif
-}
-
-/* Remove from free bucket. */
-static enum NTDB_ERROR remove_from_list(struct ntdb_context *ntdb,
- ntdb_off_t b_off, ntdb_off_t r_off,
- const struct ntdb_free_record *r)
-{
- ntdb_off_t off, prev_next, head;
- enum NTDB_ERROR ecode;
-
- /* Is this only element in list? Zero out bucket, and we're done. */
- if (frec_prev(r) == r_off)
- return ntdb_write_off(ntdb, b_off, 0);
-
- /* off = &r->prev->next */
- off = frec_prev(r) + offsetof(struct ntdb_free_record, next);
-
- /* Get prev->next */
- prev_next = ntdb_read_off(ntdb, off);
- if (NTDB_OFF_IS_ERR(prev_next))
- return NTDB_OFF_TO_ERR(prev_next);
-
- /* If prev->next == 0, we were head: update bucket to point to next. */
- if (prev_next == 0) {
- /* We must preserve upper bits. */
- head = ntdb_read_off(ntdb, b_off);
- if (NTDB_OFF_IS_ERR(head))
- return NTDB_OFF_TO_ERR(head);
-
- if ((head & NTDB_OFF_MASK) != r_off) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "remove_from_list:"
- " %llu head %llu on list %llu",
- (long long)r_off,
- (long long)head,
- (long long)b_off);
- }
- head = ((head & ~NTDB_OFF_MASK) | r->next);
- ecode = ntdb_write_off(ntdb, b_off, head);
- if (ecode != NTDB_SUCCESS)
- return ecode;
- } else {
- /* r->prev->next = r->next */
- ecode = ntdb_write_off(ntdb, off, r->next);
- if (ecode != NTDB_SUCCESS)
- return ecode;
- }
-
- /* If we were the tail, off = &head->prev. */
- if (r->next == 0) {
- head = ntdb_read_off(ntdb, b_off);
- if (NTDB_OFF_IS_ERR(head))
- return NTDB_OFF_TO_ERR(head);
- head &= NTDB_OFF_MASK;
- off = head + offsetof(struct ntdb_free_record, magic_and_prev);
- } else {
- /* off = &r->next->prev */
- off = r->next + offsetof(struct ntdb_free_record,
- magic_and_prev);
- }
-
-#ifdef CCAN_NTDB_DEBUG
- /* *off == r */
- if ((ntdb_read_off(ntdb, off) & NTDB_OFF_MASK) != r_off) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "remove_from_list:"
- " %llu bad prev in list %llu",
- (long long)r_off, (long long)b_off);
- }
-#endif
- /* r->next->prev = r->prev */
- return ntdb_write_off(ntdb, off, r->magic_and_prev);
-}
-
-/* Enqueue in this free bucket: sets coalesce if we've added 128
- * entries to it. */
-static enum NTDB_ERROR enqueue_in_free(struct ntdb_context *ntdb,
- ntdb_off_t b_off,
- ntdb_off_t off,
- ntdb_len_t len,
- bool *coalesce)
-{
- struct ntdb_free_record new;
- enum NTDB_ERROR ecode;
- ntdb_off_t prev, head;
- uint64_t magic = (NTDB_FREE_MAGIC << (64 - NTDB_OFF_UPPER_STEAL));
-
- head = ntdb_read_off(ntdb, b_off);
- if (NTDB_OFF_IS_ERR(head))
- return NTDB_OFF_TO_ERR(head);
-
- /* Set ftable_and_len now; the list pointers are filled in below. */
- new.ftable_and_len = ((uint64_t)ntdb->ftable
- << (64 - NTDB_OFF_UPPER_STEAL))
- | len;
-
- /* new->next = head. */
- new.next = (head & NTDB_OFF_MASK);
-
- /* First element? Prev points to ourselves. */
- if (!new.next) {
- new.magic_and_prev = (magic | off);
- } else {
- /* new->prev = next->prev */
- prev = ntdb_read_off(ntdb,
- new.next + offsetof(struct ntdb_free_record,
- magic_and_prev));
- new.magic_and_prev = prev;
- if (frec_magic(&new) != NTDB_FREE_MAGIC) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "enqueue_in_free: %llu bad head"
- " prev %llu",
- (long long)new.next,
- (long long)prev);
- }
- /* next->prev = new. */
- ecode = ntdb_write_off(ntdb, new.next
- + offsetof(struct ntdb_free_record,
- magic_and_prev),
- off | magic);
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
-#ifdef CCAN_NTDB_DEBUG
- prev = ntdb_read_off(ntdb, frec_prev(&new)
- + offsetof(struct ntdb_free_record, next));
- if (prev != 0) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "enqueue_in_free:"
- " %llu bad tail next ptr %llu",
- (long long)frec_prev(&new)
- + offsetof(struct ntdb_free_record,
- next),
- (long long)prev);
- }
-#endif
- }
-
- /* Update enqueue count, but don't set high bit: see NTDB_OFF_IS_ERR */
- if (*coalesce)
- head += (1ULL << (64 - NTDB_OFF_UPPER_STEAL));
- head &= ~(NTDB_OFF_MASK | (1ULL << 63));
- head |= off;
-
- ecode = ntdb_write_off(ntdb, b_off, head);
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- /* It's time to coalesce if counter wrapped. */
- if (*coalesce)
- *coalesce = ((head & ~NTDB_OFF_MASK) == 0);
-
- return ntdb_write_convert(ntdb, off, &new, sizeof(new));
-}
-
-static ntdb_off_t ftable_offset(struct ntdb_context *ntdb, unsigned int ftable)
-{
- ntdb_off_t off;
- unsigned int i;
-
- if (likely(ntdb->ftable == ftable))
- return ntdb->ftable_off;
-
- off = first_ftable(ntdb);
- for (i = 0; i < ftable; i++) {
- if (NTDB_OFF_IS_ERR(off)) {
- break;
- }
- off = next_ftable(ntdb, off);
- }
- return off;
-}
-
-/* Note: we unlock the current bucket if fail (-ve), or coalesce (+ve) and
- * need to blatt the *protect record (which is set to an error). */
-static ntdb_len_t coalesce(struct ntdb_context *ntdb,
- ntdb_off_t off, ntdb_off_t b_off,
- ntdb_len_t data_len,
- ntdb_off_t *protect)
-{
- ntdb_off_t end;
- struct ntdb_free_record rec;
- enum NTDB_ERROR ecode;
-
- ntdb->stats.alloc_coalesce_tried++;
- end = off + sizeof(struct ntdb_used_record) + data_len;
-
- while (end < ntdb->file->map_size) {
- const struct ntdb_free_record *r;
- ntdb_off_t nb_off;
- unsigned ftable, bucket;
-
- r = ntdb_access_read(ntdb, end, sizeof(*r), true);
- if (NTDB_PTR_IS_ERR(r)) {
- ecode = NTDB_PTR_ERR(r);
- goto err;
- }
-
- if (frec_magic(r) != NTDB_FREE_MAGIC
- || frec_ftable(r) == NTDB_FTABLE_NONE) {
- ntdb_access_release(ntdb, r);
- break;
- }
-
- ftable = frec_ftable(r);
- bucket = size_to_bucket(frec_len(r));
- nb_off = ftable_offset(ntdb, ftable);
- if (NTDB_OFF_IS_ERR(nb_off)) {
- ntdb_access_release(ntdb, r);
- ecode = NTDB_OFF_TO_ERR(nb_off);
- goto err;
- }
- nb_off = bucket_off(nb_off, bucket);
- ntdb_access_release(ntdb, r);
-
- /* We may be violating lock order here, so best effort. */
- if (ntdb_lock_free_bucket(ntdb, nb_off, NTDB_LOCK_NOWAIT)
- != NTDB_SUCCESS) {
- ntdb->stats.alloc_coalesce_lockfail++;
- break;
- }
-
- /* Now we have lock, re-check. */
- ecode = ntdb_read_convert(ntdb, end, &rec, sizeof(rec));
- if (ecode != NTDB_SUCCESS) {
- ntdb_unlock_free_bucket(ntdb, nb_off);
- goto err;
- }
-
- if (unlikely(frec_magic(&rec) != NTDB_FREE_MAGIC)) {
- ntdb->stats.alloc_coalesce_race++;
- ntdb_unlock_free_bucket(ntdb, nb_off);
- break;
- }
-
- if (unlikely(frec_ftable(&rec) != ftable)
- || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) {
- ntdb->stats.alloc_coalesce_race++;
- ntdb_unlock_free_bucket(ntdb, nb_off);
- break;
- }
-
- /* Did we just mess up a record you were hoping to use? */
- if (end == *protect) {
- ntdb->stats.alloc_coalesce_iterate_clash++;
- *protect = NTDB_ERR_TO_OFF(NTDB_ERR_NOEXIST);
- }
-
- ecode = remove_from_list(ntdb, nb_off, end, &rec);
- check_list(ntdb, nb_off);
- if (ecode != NTDB_SUCCESS) {
- ntdb_unlock_free_bucket(ntdb, nb_off);
- goto err;
- }
-
- end += sizeof(struct ntdb_used_record) + frec_len(&rec);
- ntdb_unlock_free_bucket(ntdb, nb_off);
- ntdb->stats.alloc_coalesce_num_merged++;
- }
-
- /* Didn't find any adjacent free? */
- if (end == off + sizeof(struct ntdb_used_record) + data_len)
- return 0;
-
- /* Before we expand, check this isn't one you wanted protected? */
- if (off == *protect) {
- *protect = NTDB_ERR_TO_OFF(NTDB_ERR_EXISTS);
- ntdb->stats.alloc_coalesce_iterate_clash++;
- }
-
- /* OK, expand initial record */
- ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
- if (ecode != NTDB_SUCCESS) {
- goto err;
- }
-
- if (frec_len(&rec) != data_len) {
- ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "coalesce: expected data len %zu not %zu",
- (size_t)data_len, (size_t)frec_len(&rec));
- goto err;
- }
-
- ecode = remove_from_list(ntdb, b_off, off, &rec);
- check_list(ntdb, b_off);
- if (ecode != NTDB_SUCCESS) {
- goto err;
- }
-
- /* Try locking violation first. We don't allow coalesce recursion! */
- ecode = add_free_record(ntdb, off, end - off, NTDB_LOCK_NOWAIT, false);
- if (ecode != NTDB_SUCCESS) {
- /* Need to drop lock. Can't rely on anything stable. */
- ntdb->stats.alloc_coalesce_lockfail++;
- *protect = NTDB_ERR_TO_OFF(NTDB_ERR_CORRUPT);
-
- /* We have to drop this to avoid deadlocks, so make sure record
- * doesn't get coalesced by someone else! */
- rec.ftable_and_len = (NTDB_FTABLE_NONE
- << (64 - NTDB_OFF_UPPER_STEAL))
- | (end - off - sizeof(struct ntdb_used_record));
- ecode = ntdb_write_off(ntdb,
- off + offsetof(struct ntdb_free_record,
- ftable_and_len),
- rec.ftable_and_len);
- if (ecode != NTDB_SUCCESS) {
- goto err;
- }
-
- ntdb_unlock_free_bucket(ntdb, b_off);
-
- ecode = add_free_record(ntdb, off, end - off, NTDB_LOCK_WAIT,
- false);
- if (ecode != NTDB_SUCCESS) {
- return NTDB_ERR_TO_OFF(ecode);
- }
- } else if (NTDB_OFF_IS_ERR(*protect)) {
- /* For simplicity, we always drop lock if they can't continue */
- ntdb_unlock_free_bucket(ntdb, b_off);
- }
- ntdb->stats.alloc_coalesce_succeeded++;
-
- /* Return usable length. */
- return end - off - sizeof(struct ntdb_used_record);
-
-err:
- /* To unify error paths, we *always* unlock bucket on error. */
- ntdb_unlock_free_bucket(ntdb, b_off);
- return NTDB_ERR_TO_OFF(ecode);
-}
-
-/* List is locked: we unlock it. */
-static enum NTDB_ERROR coalesce_list(struct ntdb_context *ntdb,
- ntdb_off_t ftable_off,
- ntdb_off_t b_off,
- unsigned int limit)
-{
- enum NTDB_ERROR ecode;
- ntdb_off_t off;
-
- off = ntdb_read_off(ntdb, b_off);
- if (NTDB_OFF_IS_ERR(off)) {
- ecode = NTDB_OFF_TO_ERR(off);
- goto unlock_err;
- }
- /* A little bit of paranoia: counter should be 0. */
- off &= NTDB_OFF_MASK;
-
- while (off && limit--) {
- struct ntdb_free_record rec;
- ntdb_len_t coal;
- ntdb_off_t next;
-
- ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
- if (ecode != NTDB_SUCCESS)
- goto unlock_err;
-
- next = rec.next;
- coal = coalesce(ntdb, off, b_off, frec_len(&rec), &next);
- if (NTDB_OFF_IS_ERR(coal)) {
- /* This has already unlocked on error. */
- return NTDB_OFF_TO_ERR(coal);
- }
- if (NTDB_OFF_IS_ERR(next)) {
- /* Coalescing had to unlock, so stop. */
- return NTDB_SUCCESS;
- }
- /* Keep going if we're doing well... */
- limit += size_to_bucket(coal / 16 + NTDB_MIN_DATA_LEN);
- off = next;
- }
-
- /* Now, move those elements to the tail of the list so we get something
- * else next time. */
- if (off) {
- struct ntdb_free_record oldhrec, newhrec, oldtrec, newtrec;
- ntdb_off_t oldhoff, oldtoff, newtoff;
-
- /* The record we were up to is the new head. */
- ecode = ntdb_read_convert(ntdb, off, &newhrec, sizeof(newhrec));
- if (ecode != NTDB_SUCCESS)
- goto unlock_err;
-
- /* Get the new tail. */
- newtoff = frec_prev(&newhrec);
- ecode = ntdb_read_convert(ntdb, newtoff, &newtrec,
- sizeof(newtrec));
- if (ecode != NTDB_SUCCESS)
- goto unlock_err;
-
- /* Get the old head. */
- oldhoff = ntdb_read_off(ntdb, b_off);
- if (NTDB_OFF_IS_ERR(oldhoff)) {
- ecode = NTDB_OFF_TO_ERR(oldhoff);
- goto unlock_err;
- }
-
- /* This could happen if they all coalesced away. */
- if (oldhoff == off)
- goto out;
-
- ecode = ntdb_read_convert(ntdb, oldhoff, &oldhrec,
- sizeof(oldhrec));
- if (ecode != NTDB_SUCCESS)
- goto unlock_err;
-
- /* Get the old tail. */
- oldtoff = frec_prev(&oldhrec);
- ecode = ntdb_read_convert(ntdb, oldtoff, &oldtrec,
- sizeof(oldtrec));
- if (ecode != NTDB_SUCCESS)
- goto unlock_err;
-
- /* Old tail's next points to old head. */
- oldtrec.next = oldhoff;
-
- /* Old head's prev points to old tail. */
- oldhrec.magic_and_prev
- = (NTDB_FREE_MAGIC << (64 - NTDB_OFF_UPPER_STEAL))
- | oldtoff;
-
- /* New tail's next is 0. */
- newtrec.next = 0;
-
- /* Write out the modified versions. */
- ecode = ntdb_write_convert(ntdb, oldtoff, &oldtrec,
- sizeof(oldtrec));
- if (ecode != NTDB_SUCCESS)
- goto unlock_err;
-
- ecode = ntdb_write_convert(ntdb, oldhoff, &oldhrec,
- sizeof(oldhrec));
- if (ecode != NTDB_SUCCESS)
- goto unlock_err;
-
- ecode = ntdb_write_convert(ntdb, newtoff, &newtrec,
- sizeof(newtrec));
- if (ecode != NTDB_SUCCESS)
- goto unlock_err;
-
- /* And finally link in new head. */
- ecode = ntdb_write_off(ntdb, b_off, off);
- if (ecode != NTDB_SUCCESS)
- goto unlock_err;
- }
-out:
- ntdb_unlock_free_bucket(ntdb, b_off);
- return NTDB_SUCCESS;
-
-unlock_err:
- ntdb_unlock_free_bucket(ntdb, b_off);
- return ecode;
-}
-
-/* List must not be locked if coalesce_ok is set. */
-enum NTDB_ERROR add_free_record(struct ntdb_context *ntdb,
- ntdb_off_t off, ntdb_len_t len_with_header,
- enum ntdb_lock_flags waitflag,
- bool coalesce_ok)
-{
- ntdb_off_t b_off;
- ntdb_len_t len;
- enum NTDB_ERROR ecode;
-
- assert(len_with_header >= sizeof(struct ntdb_free_record));
-
- len = len_with_header - sizeof(struct ntdb_used_record);
-
- b_off = bucket_off(ntdb->ftable_off, size_to_bucket(len));
- ecode = ntdb_lock_free_bucket(ntdb, b_off, waitflag);
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- ecode = enqueue_in_free(ntdb, b_off, off, len, &coalesce_ok);
- check_list(ntdb, b_off);
-
- /* Coalescing unlocks free list. */
- if (!ecode && coalesce_ok)
- ecode = coalesce_list(ntdb, ntdb->ftable_off, b_off, 2);
- else
- ntdb_unlock_free_bucket(ntdb, b_off);
- return ecode;
-}
-
-static size_t adjust_size(size_t keylen, size_t datalen)
-{
- size_t size = keylen + datalen;
-
- if (size < NTDB_MIN_DATA_LEN)
- size = NTDB_MIN_DATA_LEN;
-
- /* Round to next uint64_t boundary. */
- return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
-}
-
-/* If we have enough left over to be useful, split that off. */
-static size_t record_leftover(size_t keylen, size_t datalen,
- bool want_extra, size_t total_len)
-{
- ssize_t leftover;
-
- if (want_extra)
- datalen += datalen / 2;
- leftover = total_len - adjust_size(keylen, datalen);
-
- if (leftover < (ssize_t)sizeof(struct ntdb_free_record))
- return 0;
-
- return leftover;
-}
-
-/* We need size bytes to put our key and data in. */
-static ntdb_off_t lock_and_alloc(struct ntdb_context *ntdb,
- ntdb_off_t ftable_off,
- ntdb_off_t bucket,
- size_t keylen, size_t datalen,
- bool want_extra,
- unsigned magic)
-{
- ntdb_off_t off, b_off, best_off;
- struct ntdb_free_record best = { 0 };
- double multiplier;
- size_t size = adjust_size(keylen, datalen);
- enum NTDB_ERROR ecode;
-
- ntdb->stats.allocs++;
- b_off = bucket_off(ftable_off, bucket);
-
- /* FIXME: Try non-blocking wait first, to measure contention. */
- /* Lock this bucket. */
- ecode = ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT);
- if (ecode != NTDB_SUCCESS) {
- return NTDB_ERR_TO_OFF(ecode);
- }
-
- best.ftable_and_len = -1ULL;
- best_off = 0;
-
- /* Get slack if we're after extra. */
- if (want_extra)
- multiplier = 1.5;
- else
- multiplier = 1.0;
-
- /* Walk the list to see if any are large enough, getting less fussy
- * as we go. */
- off = ntdb_read_off(ntdb, b_off);
- if (NTDB_OFF_IS_ERR(off)) {
- ecode = NTDB_OFF_TO_ERR(off);
- goto unlock_err;
- }
- off &= NTDB_OFF_MASK;
-
- while (off) {
- const struct ntdb_free_record *r;
- ntdb_off_t next;
-
- r = ntdb_access_read(ntdb, off, sizeof(*r), true);
- if (NTDB_PTR_IS_ERR(r)) {
- ecode = NTDB_PTR_ERR(r);
- goto unlock_err;
- }
-
- if (frec_magic(r) != NTDB_FREE_MAGIC) {
- ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
- "lock_and_alloc:"
- " %llu non-free 0x%llx",
- (long long)off,
- (long long)r->magic_and_prev);
- ntdb_access_release(ntdb, r);
- goto unlock_err;
- }
-
- if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
- best_off = off;
- best = *r;
- }
-
- if (frec_len(&best) <= size * multiplier && best_off) {
- ntdb_access_release(ntdb, r);
- break;
- }
-
- multiplier *= 1.01;
-
- next = r->next;
- ntdb_access_release(ntdb, r);
- off = next;
- }
-
- /* If we found anything at all, use it. */
- if (best_off) {
- struct ntdb_used_record rec;
- size_t leftover;
-
- /* We're happy with this size: take it. */
- ecode = remove_from_list(ntdb, b_off, best_off, &best);
- check_list(ntdb, b_off);
- if (ecode != NTDB_SUCCESS) {
- goto unlock_err;
- }
-
- leftover = record_leftover(keylen, datalen, want_extra,
- frec_len(&best));
-
- assert(keylen + datalen + leftover <= frec_len(&best));
- /* We need to mark non-free before we drop lock, otherwise
- * coalesce() could try to merge it! */
- ecode = set_header(ntdb, &rec, magic, keylen, datalen,
- frec_len(&best) - leftover);
- if (ecode != NTDB_SUCCESS) {
- goto unlock_err;
- }
-
- ecode = ntdb_write_convert(ntdb, best_off, &rec, sizeof(rec));
- if (ecode != NTDB_SUCCESS) {
- goto unlock_err;
- }
-
- /* For futureproofing, we put a 0 in any unused space. */
- if (rec_extra_padding(&rec)) {
- ecode = ntdb->io->twrite(ntdb, best_off + sizeof(rec)
- + keylen + datalen, "", 1);
- if (ecode != NTDB_SUCCESS) {
- goto unlock_err;
- }
- }
-
- /* Bucket of leftover will be <= current bucket, so nested
- * locking is allowed. */
- if (leftover) {
- ntdb->stats.alloc_leftover++;
- ecode = add_free_record(ntdb,
- best_off + sizeof(rec)
- + frec_len(&best) - leftover,
- leftover, NTDB_LOCK_WAIT, false);
- if (ecode != NTDB_SUCCESS) {
- best_off = NTDB_ERR_TO_OFF(ecode);
- }
- }
- ntdb_unlock_free_bucket(ntdb, b_off);
-
- return best_off;
- }
-
- ntdb_unlock_free_bucket(ntdb, b_off);
- return 0;
-
-unlock_err:
- ntdb_unlock_free_bucket(ntdb, b_off);
- return NTDB_ERR_TO_OFF(ecode);
-}
-
-/* Get a free block from current free list, or 0 if none, -ve on error. */
-static ntdb_off_t get_free(struct ntdb_context *ntdb,
- size_t keylen, size_t datalen, bool want_extra,
- unsigned magic)
-{
- ntdb_off_t off, ftable_off;
- ntdb_off_t start_b, b, ftable;
- bool wrapped = false;
-
- /* If they are growing, add 50% to get to higher bucket. */
- if (want_extra)
- start_b = size_to_bucket(adjust_size(keylen,
- datalen + datalen / 2));
- else
- start_b = size_to_bucket(adjust_size(keylen, datalen));
-
- ftable_off = ntdb->ftable_off;
- ftable = ntdb->ftable;
- while (!wrapped || ftable_off != ntdb->ftable_off) {
- /* Start at exact size bucket, and search up... */
- for (b = find_free_head(ntdb, ftable_off, start_b);
- b < NTDB_FREE_BUCKETS;
- b = find_free_head(ntdb, ftable_off, b + 1)) {
- /* Try getting one from list. */
- off = lock_and_alloc(ntdb, ftable_off,
- b, keylen, datalen, want_extra,
- magic);
- if (NTDB_OFF_IS_ERR(off))
- return off;
- if (off != 0) {
- if (b == start_b)
- ntdb->stats.alloc_bucket_exact++;
- if (b == NTDB_FREE_BUCKETS - 1)
- ntdb->stats.alloc_bucket_max++;
- /* Worked? Stay using this list. */
- ntdb->ftable_off = ftable_off;
- ntdb->ftable = ftable;
- return off;
- }
- /* Didn't work. Try next bucket. */
- }
-
- if (NTDB_OFF_IS_ERR(b)) {
- return b;
- }
-
- /* Hmm, try next table. */
- ftable_off = next_ftable(ntdb, ftable_off);
- if (NTDB_OFF_IS_ERR(ftable_off)) {
- return ftable_off;
- }
- ftable++;
-
- if (ftable_off == 0) {
- wrapped = true;
- ftable_off = first_ftable(ntdb);
- if (NTDB_OFF_IS_ERR(ftable_off)) {
- return ftable_off;
- }
- ftable = 0;
- }
- }
-
- return 0;
-}
-
-enum NTDB_ERROR set_header(struct ntdb_context *ntdb,
- struct ntdb_used_record *rec,
- unsigned magic, uint64_t keylen, uint64_t datalen,
- uint64_t actuallen)
-{
- uint64_t keybits = (fls64(keylen) + 1) / 2;
-
- rec->magic_and_meta = ((actuallen - (keylen + datalen)) << 11)
- | (keybits << 43)
- | ((uint64_t)magic << 48);
- rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
-
- /* Encoding can fail on big values. */
- if (rec_key_length(rec) != keylen
- || rec_data_length(rec) != datalen
- || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
- return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
- "Could not encode k=%llu,d=%llu,a=%llu",
- (long long)keylen, (long long)datalen,
- (long long)actuallen);
- }
- return NTDB_SUCCESS;
-}
-
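As a quick illustration of the header encoding above, here is a minimal standalone sketch that applies the same packing formulas to hypothetical lengths and unpacks them again. The concrete values, the local fls64 stand-in and the magic constant are invented for the example; only the shift layout mirrors set_header().

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Position of the highest set bit, 1-based (0 for 0): stands in for fls64(). */
static unsigned example_fls64(uint64_t v)
{
	unsigned r = 0;
	while (v) {
		v >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	/* Hypothetical record: 5-byte key, 100-byte value, 8 bytes padding. */
	uint64_t keylen = 5, datalen = 100, actuallen = 113, magic = 0x3F;
	uint64_t keybits = (example_fls64(keylen) + 1) / 2;

	/* Same packing as set_header(): padding above bit 11, keybits at
	 * bit 43, magic in the top 16 bits. */
	uint64_t magic_and_meta = ((actuallen - (keylen + datalen)) << 11)
		| (keybits << 43)
		| (magic << 48);
	/* Key length sits in the low 2*keybits bits, data length above it. */
	uint64_t key_and_data_len = keylen | (datalen << (keybits * 2));

	/* Unpacking recovers both lengths for values that fit the fields. */
	assert((key_and_data_len & ((1ULL << (keybits * 2)) - 1)) == keylen);
	assert((key_and_data_len >> (keybits * 2)) == datalen);
	printf("keybits=%llu meta=%#llx\n",
	       (unsigned long long)keybits,
	       (unsigned long long)magic_and_meta);
	return 0;
}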
-/* You need 'size' bytes; this tells you how much you should expand by. */
-ntdb_off_t ntdb_expand_adjust(ntdb_off_t map_size, ntdb_off_t size)
-{
- ntdb_off_t new_size, top_size;
-
- /* limit size in order to avoid using up huge amounts of memory for
- * in memory tdbs if an oddball huge record creeps in */
- if (size > 100 * 1024) {
- top_size = map_size + size * 2;
- } else {
- top_size = map_size + size * 100;
- }
-
- /* always make room for at least top_size, and grow by at least 25%
- if the DB is smaller than 100MiB; otherwise grow it by 10% only. */
- if (map_size > 100 * 1024 * 1024) {
- new_size = map_size * 1.10;
- } else {
- new_size = map_size * 1.25;
- }
-
- if (new_size < top_size)
- new_size = top_size;
-
- /* We always make the file a multiple of transaction page
- * size. This guarantees that the transaction recovery area
- * is always aligned, otherwise the transaction code can overwrite
- * itself. */
- new_size = (new_size + NTDB_PGSIZE-1) & ~(NTDB_PGSIZE-1);
- return new_size - map_size;
-}
-
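A worked example of the growth rule above, as a standalone sketch: a 10 MiB map asked for a 4 KiB record grows by 25% (2.5 MiB), while a map over 100 MiB would grow by only 10%. The 16 KiB page size here is an assumed stand-in for NTDB_PGSIZE.

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_PGSIZE 16384ULL		/* assumed stand-in for NTDB_PGSIZE */

/* Same growth heuristic as ntdb_expand_adjust(). */
static uint64_t example_expand_adjust(uint64_t map_size, uint64_t size)
{
	uint64_t new_size, top_size;

	if (size > 100 * 1024)
		top_size = map_size + size * 2;
	else
		top_size = map_size + size * 100;

	if (map_size > 100 * 1024 * 1024)
		new_size = map_size * 1.10;
	else
		new_size = map_size * 1.25;

	if (new_size < top_size)
		new_size = top_size;

	/* Round to a whole number of pages, return the extra amount. */
	new_size = (new_size + EXAMPLE_PGSIZE - 1) & ~(EXAMPLE_PGSIZE - 1);
	return new_size - map_size;
}

int main(void)
{
	/* 10 MiB map, 4 KiB record: prints 2621440 (2.5 MiB of growth). */
	printf("%llu\n",
	       (unsigned long long)example_expand_adjust(10ULL << 20, 4096));
	return 0;
}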
-/* Expand the database. */
-static enum NTDB_ERROR ntdb_expand(struct ntdb_context *ntdb, ntdb_len_t size)
-{
- uint64_t old_size;
- ntdb_len_t wanted;
- enum NTDB_ERROR ecode;
-
- /* Need to hold a hash lock to expand DB: transactions rely on it. */
- if (!(ntdb->flags & NTDB_NOLOCK)
- && !ntdb->file->allrecord_lock.count && !ntdb_has_hash_locks(ntdb)) {
- return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
- "ntdb_expand: must hold lock during expand");
- }
-
- /* Only one person can expand file at a time. */
- ecode = ntdb_lock_expand(ntdb, F_WRLCK);
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- /* Someone else may have expanded the file, so retry. */
- old_size = ntdb->file->map_size;
- ntdb_oob(ntdb, ntdb->file->map_size, 1, true);
- if (ntdb->file->map_size != old_size) {
- ntdb_unlock_expand(ntdb, F_WRLCK);
- return NTDB_SUCCESS;
- }
-
- /* We need room for the record header too. */
- size = adjust_size(0, sizeof(struct ntdb_used_record) + size);
- /* Overallocate. */
- wanted = ntdb_expand_adjust(old_size, size);
-
- ecode = ntdb->io->expand_file(ntdb, wanted);
- if (ecode != NTDB_SUCCESS) {
- ntdb_unlock_expand(ntdb, F_WRLCK);
- return ecode;
- }
-
- /* We need to drop this lock before adding free record. */
- ntdb_unlock_expand(ntdb, F_WRLCK);
-
- ntdb->stats.expands++;
- return add_free_record(ntdb, old_size, wanted, NTDB_LOCK_WAIT, true);
-}
-
-/* This won't fail: it will expand the database if it has to. */
-ntdb_off_t alloc(struct ntdb_context *ntdb, size_t keylen, size_t datalen,
- unsigned magic, bool growing)
-{
- ntdb_off_t off;
-
- for (;;) {
- enum NTDB_ERROR ecode;
- off = get_free(ntdb, keylen, datalen, growing, magic);
- if (likely(off != 0))
- break;
-
- ecode = ntdb_expand(ntdb, adjust_size(keylen, datalen));
- if (ecode != NTDB_SUCCESS) {
- return NTDB_ERR_TO_OFF(ecode);
- }
- }
-
- return off;
-}
+++ /dev/null
- /*
- Trivial Database 2: hash handling
- Copyright (C) Rusty Russell 2010
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <ccan/hash/hash.h>
-
-/* Default hash function. */
-uint32_t ntdb_jenkins_hash(const void *key, size_t length, uint32_t seed,
- void *unused)
-{
- return hash_stable((const unsigned char *)key, length, seed);
-}
-
-uint32_t ntdb_hash(struct ntdb_context *ntdb, const void *ptr, size_t len)
-{
- return ntdb->hash_fn(ptr, len, ntdb->hash_seed, ntdb->hash_data);
-}
-
-static ntdb_bool_err key_matches(struct ntdb_context *ntdb,
- const struct ntdb_used_record *rec,
- ntdb_off_t off,
- const NTDB_DATA *key,
- const char **rptr)
-{
- ntdb_bool_err ret = false;
- const char *rkey;
-
- if (rec_key_length(rec) != key->dsize) {
- ntdb->stats.compare_wrong_keylen++;
- return ret;
- }
-
- rkey = ntdb_access_read(ntdb, off + sizeof(*rec),
- key->dsize + rec_data_length(rec), false);
- if (NTDB_PTR_IS_ERR(rkey)) {
- return (ntdb_bool_err)NTDB_PTR_ERR(rkey);
- }
- if (memcmp(rkey, key->dptr, key->dsize) == 0) {
- if (rptr) {
- *rptr = rkey;
- } else {
- ntdb_access_release(ntdb, rkey);
- }
- return true;
- }
- ntdb->stats.compare_wrong_keycmp++;
- ntdb_access_release(ntdb, rkey);
- return ret;
-}
-
-/* Does entry match? */
-static ntdb_bool_err match(struct ntdb_context *ntdb,
- uint32_t hash,
- const NTDB_DATA *key,
- ntdb_off_t val,
- struct ntdb_used_record *rec,
- const char **rptr)
-{
- ntdb_off_t off;
- enum NTDB_ERROR ecode;
-
- ntdb->stats.compares++;
-
- /* Top bits of offset == next bits of hash. */
- if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL)
- != bits_from(val, 64-NTDB_OFF_UPPER_STEAL, NTDB_OFF_UPPER_STEAL)) {
- ntdb->stats.compare_wrong_offsetbits++;
- return false;
- }
-
- off = val & NTDB_OFF_MASK;
- ecode = ntdb_read_convert(ntdb, off, rec, sizeof(*rec));
- if (ecode != NTDB_SUCCESS) {
- return (ntdb_bool_err)ecode;
- }
-
- return key_matches(ntdb, rec, off, key, rptr);
-}
-
-static bool is_chain(ntdb_off_t val)
-{
- return val & (1ULL << NTDB_OFF_CHAIN_BIT);
-}
-
-static ntdb_off_t hbucket_off(ntdb_off_t base, ntdb_len_t idx)
-{
- return base + sizeof(struct ntdb_used_record)
- + idx * sizeof(ntdb_off_t);
-}
-
-/* This is the core routine which searches the hashtable for an entry.
- * On error, no locks are held and -ve is returned.
- * Otherwise, hinfo is filled in.
- * If not found, the return value is 0.
- * If found, the return value is the offset, and *rec is the record. */
-ntdb_off_t find_and_lock(struct ntdb_context *ntdb,
- NTDB_DATA key,
- int ltype,
- struct hash_info *h,
- struct ntdb_used_record *rec,
- const char **rptr)
-{
- ntdb_off_t off, val;
- const ntdb_off_t *arr = NULL;
- ntdb_len_t i;
- bool found_empty;
- enum NTDB_ERROR ecode;
- struct ntdb_used_record chdr;
- ntdb_bool_err berr;
-
- h->h = ntdb_hash(ntdb, key.dptr, key.dsize);
-
- h->table = NTDB_HASH_OFFSET;
- h->table_size = 1 << ntdb->hash_bits;
- h->bucket = bits_from(h->h, 0, ntdb->hash_bits);
- h->old_val = 0;
-
- ecode = ntdb_lock_hash(ntdb, h->bucket, ltype);
- if (ecode != NTDB_SUCCESS) {
- return NTDB_ERR_TO_OFF(ecode);
- }
-
- off = hbucket_off(h->table, h->bucket);
- val = ntdb_read_off(ntdb, off);
- if (NTDB_OFF_IS_ERR(val)) {
- ecode = NTDB_OFF_TO_ERR(val);
- goto fail;
- }
-
- /* Directly in hash table? */
- if (!likely(is_chain(val))) {
- if (val) {
- berr = match(ntdb, h->h, &key, val, rec, rptr);
- if (berr < 0) {
- ecode = NTDB_OFF_TO_ERR(berr);
- goto fail;
- }
- if (berr) {
- return val & NTDB_OFF_MASK;
- }
- /* If you want to insert here, make a chain. */
- h->old_val = val;
- }
- return 0;
- }
-
- /* Nope? Iterate through chain. */
- h->table = val & NTDB_OFF_MASK;
-
- ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
- if (ecode != NTDB_SUCCESS) {
- goto fail;
- }
-
- if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
- ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "find_and_lock:"
- " corrupt record %#x at %llu",
- rec_magic(&chdr), (long long)off);
- goto fail;
- }
-
- h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
-
- arr = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
- rec_data_length(&chdr), true);
- if (NTDB_PTR_IS_ERR(arr)) {
- ecode = NTDB_PTR_ERR(arr);
- goto fail;
- }
-
- found_empty = false;
- for (i = 0; i < h->table_size; i++) {
- if (arr[i] == 0) {
- if (!found_empty) {
- h->bucket = i;
- found_empty = true;
- }
- } else {
- berr = match(ntdb, h->h, &key, arr[i], rec, rptr);
- if (berr < 0) {
- ecode = NTDB_OFF_TO_ERR(berr);
- ntdb_access_release(ntdb, arr);
- goto fail;
- }
- if (berr) {
- /* We found it! */
- h->bucket = i;
- off = arr[i] & NTDB_OFF_MASK;
- ntdb_access_release(ntdb, arr);
- return off;
- }
- }
- }
- if (!found_empty) {
- /* Set to any non-zero value */
- h->old_val = 1;
- h->bucket = i;
- }
-
- ntdb_access_release(ntdb, arr);
- return 0;
-
-fail:
- ntdb_unlock_hash(ntdb, h->bucket, ltype);
- return NTDB_ERR_TO_OFF(ecode);
-}
-
-static ntdb_off_t encode_offset(const struct ntdb_context *ntdb,
- ntdb_off_t new_off, uint32_t hash)
-{
- ntdb_off_t extra;
-
- assert((new_off & (1ULL << NTDB_OFF_CHAIN_BIT)) == 0);
- assert((new_off >> (64 - NTDB_OFF_UPPER_STEAL)) == 0);
- /* We pack extra hash bits into the upper bits of the offset. */
- extra = bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL);
- extra <<= (64 - NTDB_OFF_UPPER_STEAL);
-
- return new_off | extra;
-}
-
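The effect of this packing is that match() can reject most false bucket hits without touching the record. A small standalone sketch of the idea; the bit widths below (10 table bits, 15 stolen bits) are hypothetical, not the real hash_bits / NTDB_OFF_UPPER_STEAL values.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_HASH_BITS   10	/* hypothetical top-level table bits */
#define EXAMPLE_UPPER_STEAL 15	/* hypothetical bits stolen from the offset */

/* Extract 'count' bits of 'val' starting at bit 'start', like bits_from(). */
static uint64_t example_bits(uint64_t val, unsigned start, unsigned count)
{
	return (val >> start) & ((1ULL << count) - 1);
}

int main(void)
{
	uint64_t off = 0x12340;		/* record offset; top bits are clear */
	uint32_t hash = 0xdeadbeef;	/* full hash of the key */

	/* Steal the hash bits just above the table index into the offset. */
	uint64_t extra = example_bits(hash, EXAMPLE_HASH_BITS,
				      EXAMPLE_UPPER_STEAL);
	uint64_t packed = off | (extra << (64 - EXAMPLE_UPPER_STEAL));

	/* A lookup compares the stolen bits against the key's hash and only
	 * reads the record when they agree. */
	assert(example_bits(packed, 64 - EXAMPLE_UPPER_STEAL,
			    EXAMPLE_UPPER_STEAL)
	       == example_bits(hash, EXAMPLE_HASH_BITS, EXAMPLE_UPPER_STEAL));
	printf("packed entry: %#llx\n", (unsigned long long)packed);
	return 0;
}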
-/* Simply overwrite the hash entry we found before. */
-enum NTDB_ERROR replace_in_hash(struct ntdb_context *ntdb,
- const struct hash_info *h,
- ntdb_off_t new_off)
-{
- return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket),
- encode_offset(ntdb, new_off, h->h));
-}
-
-enum NTDB_ERROR delete_from_hash(struct ntdb_context *ntdb,
- const struct hash_info *h)
-{
- return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket), 0);
-}
-
-
-enum NTDB_ERROR add_to_hash(struct ntdb_context *ntdb,
- const struct hash_info *h,
- ntdb_off_t new_off)
-{
- enum NTDB_ERROR ecode;
- ntdb_off_t chain;
- struct ntdb_used_record chdr;
- const ntdb_off_t *old;
- ntdb_off_t *new;
-
- /* We hit an empty bucket during search? That's where it goes. */
- if (!h->old_val) {
- return replace_in_hash(ntdb, h, new_off);
- }
-
- /* Full at top-level? Create a 2-element chain. */
- if (h->table == NTDB_HASH_OFFSET) {
- ntdb_off_t pair[2];
-
- /* One element is old value, the other is the new value. */
- pair[0] = h->old_val;
- pair[1] = encode_offset(ntdb, new_off, h->h);
-
- chain = alloc(ntdb, 0, sizeof(pair), NTDB_CHAIN_MAGIC, true);
- if (NTDB_OFF_IS_ERR(chain)) {
- return NTDB_OFF_TO_ERR(chain);
- }
- ecode = ntdb_write_convert(ntdb,
- chain
- + sizeof(struct ntdb_used_record),
- pair, sizeof(pair));
- if (ecode == NTDB_SUCCESS) {
- ecode = ntdb_write_off(ntdb,
- hbucket_off(h->table, h->bucket),
- chain
- | (1ULL << NTDB_OFF_CHAIN_BIT));
- }
- return ecode;
- }
-
- /* Full bucket. Expand. */
- ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- if (rec_extra_padding(&chdr) >= sizeof(new_off)) {
- /* Expand in place. */
- uint64_t dlen = rec_data_length(&chdr);
-
- ecode = set_header(ntdb, &chdr, NTDB_CHAIN_MAGIC, 0,
- dlen + sizeof(new_off),
- dlen + rec_extra_padding(&chdr));
-
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
- /* find_and_lock set up h to point to last bucket. */
- ecode = replace_in_hash(ntdb, h, new_off);
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
- ecode = ntdb_write_convert(ntdb, h->table, &chdr, sizeof(chdr));
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
- /* For futureproofing, we always make the first byte of padding
- * a zero. */
- if (rec_extra_padding(&chdr)) {
- ecode = ntdb->io->twrite(ntdb, h->table + sizeof(chdr)
- + dlen + sizeof(new_off),
- "", 1);
- }
- return ecode;
- }
-
- /* We need to reallocate the chain. */
- chain = alloc(ntdb, 0, (h->table_size + 1) * sizeof(ntdb_off_t),
- NTDB_CHAIN_MAGIC, true);
- if (NTDB_OFF_IS_ERR(chain)) {
- return NTDB_OFF_TO_ERR(chain);
- }
-
- /* Map both and copy across old buckets. */
- old = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
- h->table_size*sizeof(ntdb_off_t), true);
- if (NTDB_PTR_IS_ERR(old)) {
- return NTDB_PTR_ERR(old);
- }
- new = ntdb_access_write(ntdb, hbucket_off(chain, 0),
- (h->table_size + 1)*sizeof(ntdb_off_t), true);
- if (NTDB_PTR_IS_ERR(new)) {
- ntdb_access_release(ntdb, old);
- return NTDB_PTR_ERR(new);
- }
-
- memcpy(new, old, h->bucket * sizeof(ntdb_off_t));
- new[h->bucket] = encode_offset(ntdb, new_off, h->h);
- ntdb_access_release(ntdb, old);
-
- ecode = ntdb_access_commit(ntdb, new);
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- /* Free the old chain. */
- ecode = add_free_record(ntdb, h->table,
- sizeof(struct ntdb_used_record)
- + rec_data_length(&chdr)
- + rec_extra_padding(&chdr),
- NTDB_LOCK_WAIT, true);
-
- /* Replace top-level to point to new chain */
- return ntdb_write_off(ntdb,
- hbucket_off(NTDB_HASH_OFFSET,
- bits_from(h->h, 0, ntdb->hash_bits)),
- chain | (1ULL << NTDB_OFF_CHAIN_BIT));
-}
-
-/* Traverse support: returns offset of record, or 0 or -ve error. */
-static ntdb_off_t iterate_chain(struct ntdb_context *ntdb,
- ntdb_off_t val,
- struct hash_info *h)
-{
- ntdb_off_t i;
- enum NTDB_ERROR ecode;
- struct ntdb_used_record chdr;
-
- /* First load up chain header. */
- h->table = val & NTDB_OFF_MASK;
- ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
- return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "get_table:"
- " corrupt record %#x at %llu",
- rec_magic(&chdr),
- (long long)h->table);
- }
-
- /* Chain length is implied by data length. */
- h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
-
- i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0), h->bucket,
- h->table_size);
- if (NTDB_OFF_IS_ERR(i)) {
- return i;
- }
-
- if (i != h->table_size) {
- /* Return to next bucket. */
- h->bucket = i + 1;
- val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
- if (NTDB_OFF_IS_ERR(val)) {
- return val;
- }
- return val & NTDB_OFF_MASK;
- }
-
- /* Go back up to hash table. */
- h->table = NTDB_HASH_OFFSET;
- h->table_size = 1 << ntdb->hash_bits;
- h->bucket = bits_from(h->h, 0, ntdb->hash_bits) + 1;
- return 0;
-}
-
-/* Keeps hash locked unless returns 0 or error. */
-static ntdb_off_t lock_and_iterate_hash(struct ntdb_context *ntdb,
- struct hash_info *h)
-{
- ntdb_off_t val, i;
- enum NTDB_ERROR ecode;
-
- if (h->table != NTDB_HASH_OFFSET) {
- /* We're in a chain. */
- i = bits_from(h->h, 0, ntdb->hash_bits);
- ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
- if (ecode != NTDB_SUCCESS) {
- return NTDB_ERR_TO_OFF(ecode);
- }
-
- /* We dropped lock, bucket might have moved! */
- val = ntdb_read_off(ntdb, hbucket_off(NTDB_HASH_OFFSET, i));
- if (NTDB_OFF_IS_ERR(val)) {
- goto unlock;
- }
-
- /* We don't remove chains: there should still be one there! */
- if (!val || !is_chain(val)) {
- ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "iterate_hash:"
- " vanished hchain %llu at %llu",
- (long long)val,
- (long long)i);
- val = NTDB_ERR_TO_OFF(ecode);
- goto unlock;
- }
-
- /* Find next bucket in the chain. */
- val = iterate_chain(ntdb, val, h);
- if (NTDB_OFF_IS_ERR(val)) {
- goto unlock;
- }
- if (val != 0) {
- return val;
- }
- ntdb_unlock_hash(ntdb, i, F_RDLCK);
-
- /* OK, we've reset h back to top level. */
- }
-
- /* We do this unlocked, then re-check. */
- for (i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
- h->bucket, h->table_size);
- i != h->table_size;
- i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
- i+1, h->table_size)) {
- ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
- if (ecode != NTDB_SUCCESS) {
- return NTDB_ERR_TO_OFF(ecode);
- }
-
- val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
- if (NTDB_OFF_IS_ERR(val)) {
- goto unlock;
- }
-
- /* Lost race, and it's empty? */
- if (!val) {
- ntdb->stats.traverse_val_vanished++;
- ntdb_unlock_hash(ntdb, i, F_RDLCK);
- continue;
- }
-
- if (!is_chain(val)) {
- /* So caller knows what lock to free. */
- h->h = i;
- /* Return to next bucket. */
- h->bucket = i + 1;
- val &= NTDB_OFF_MASK;
- return val;
- }
-
- /* Start at beginning of chain */
- h->bucket = 0;
- h->h = i;
-
- val = iterate_chain(ntdb, val, h);
- if (NTDB_OFF_IS_ERR(val)) {
- goto unlock;
- }
- if (val != 0) {
- return val;
- }
-
- /* Otherwise, bucket has been set to i+1 */
- ntdb_unlock_hash(ntdb, i, F_RDLCK);
- }
- return 0;
-
-unlock:
- ntdb_unlock_hash(ntdb, i, F_RDLCK);
- return val;
-}
-
-/* Return success if we find something, NTDB_ERR_NOEXIST if none. */
-enum NTDB_ERROR next_in_hash(struct ntdb_context *ntdb,
- struct hash_info *h,
- NTDB_DATA *kbuf, size_t *dlen)
-{
- ntdb_off_t off;
- struct ntdb_used_record rec;
- enum NTDB_ERROR ecode;
-
- off = lock_and_iterate_hash(ntdb, h);
-
- if (NTDB_OFF_IS_ERR(off)) {
- return NTDB_OFF_TO_ERR(off);
- } else if (off == 0) {
- return NTDB_ERR_NOEXIST;
- }
-
- /* The hash for this key is still locked. */
- ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
- if (ecode != NTDB_SUCCESS) {
- goto unlock;
- }
- if (rec_magic(&rec) != NTDB_USED_MAGIC) {
- ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
- NTDB_LOG_ERROR,
- "next_in_hash:"
- " corrupt record at %llu",
- (long long)off);
- goto unlock;
- }
-
- kbuf->dsize = rec_key_length(&rec);
-
- /* They want data as well? */
- if (dlen) {
- *dlen = rec_data_length(&rec);
- kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
- kbuf->dsize + *dlen);
- } else {
- kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
- kbuf->dsize);
- }
- if (NTDB_PTR_IS_ERR(kbuf->dptr)) {
- ecode = NTDB_PTR_ERR(kbuf->dptr);
- goto unlock;
- }
- ecode = NTDB_SUCCESS;
-
-unlock:
- ntdb_unlock_hash(ntdb, bits_from(h->h, 0, ntdb->hash_bits), F_RDLCK);
- return ecode;
-
-}
-
-enum NTDB_ERROR first_in_hash(struct ntdb_context *ntdb,
- struct hash_info *h,
- NTDB_DATA *kbuf, size_t *dlen)
-{
- h->table = NTDB_HASH_OFFSET;
- h->table_size = 1 << ntdb->hash_bits;
- h->bucket = 0;
-
- return next_in_hash(ntdb, h, kbuf, dlen);
-}
-
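first_in_hash()/next_in_hash() back the public ntdb_firstkey()/ntdb_nextkey() iteration. A minimal caller-side sketch of that loop; buffer ownership between iterations is left to the ntdb.h contract and not shown here.

#include "ntdb.h"
#include <stdint.h>
#include <stdio.h>

/* Count every key in the database using the public iteration API. */
static int64_t count_keys(struct ntdb_context *ntdb)
{
	NTDB_DATA key;
	enum NTDB_ERROR ecode;
	int64_t count = 0;

	for (ecode = ntdb_firstkey(ntdb, &key);
	     ecode == NTDB_SUCCESS;
	     ecode = ntdb_nextkey(ntdb, &key)) {
		count++;
	}

	/* The walk ends with NTDB_ERR_NOEXIST once the table is exhausted. */
	if (ecode != NTDB_ERR_NOEXIST) {
		fprintf(stderr, "iteration failed: %s\n", ntdb_errorstr(ecode));
		return -1;
	}
	return count;
}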
-/* Even if the entry isn't in this hash bucket, you'd have to lock this
- * bucket to find it. */
-static enum NTDB_ERROR chainlock(struct ntdb_context *ntdb,
- const NTDB_DATA *key, int ltype)
-{
- uint32_t h = ntdb_hash(ntdb, key->dptr, key->dsize);
-
- return ntdb_lock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), ltype);
-}
-
-/* lock/unlock one hash chain. This is meant to be used to reduce
- contention - it cannot guarantee how many records will be locked */
-_PUBLIC_ enum NTDB_ERROR ntdb_chainlock(struct ntdb_context *ntdb, NTDB_DATA key)
-{
- return chainlock(ntdb, &key, F_WRLCK);
-}
-
-_PUBLIC_ void ntdb_chainunlock(struct ntdb_context *ntdb, NTDB_DATA key)
-{
- uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
-
- ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_WRLCK);
-}
-
-_PUBLIC_ enum NTDB_ERROR ntdb_chainlock_read(struct ntdb_context *ntdb,
- NTDB_DATA key)
-{
- return chainlock(ntdb, &key, F_RDLCK);
-}
-
-_PUBLIC_ void ntdb_chainunlock_read(struct ntdb_context *ntdb, NTDB_DATA key)
-{
- uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
-
- ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_RDLCK);
-}
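These chain locks are typically wrapped around a fetch/store of the same key so that a read-modify-write cannot race with other writers. A minimal usage sketch; the helper name and the assumption that fetched data is released with free() under the default allocator are the example's, not the library's.

#include "ntdb.h"
#include <stdlib.h>

/* Read-modify-write one record while holding its chain lock. */
static enum NTDB_ERROR update_record(struct ntdb_context *ntdb, NTDB_DATA key)
{
	NTDB_DATA val;
	enum NTDB_ERROR ecode;

	ecode = ntdb_chainlock(ntdb, key);
	if (ecode != NTDB_SUCCESS)
		return ecode;

	ecode = ntdb_fetch(ntdb, key, &val);
	if (ecode == NTDB_SUCCESS) {
		/* ... modify val.dptr / val.dsize here ... */
		ecode = ntdb_store(ntdb, key, val, NTDB_REPLACE);
		free(val.dptr);		/* assumes the default allocator */
	}

	ntdb_chainunlock(ntdb, key);
	return ecode;
}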
+++ /dev/null
- /*
- Unix SMB/CIFS implementation.
-
- trivial database library
-
- Copyright (C) Andrew Tridgell 1999-2005
- Copyright (C) Paul `Rusty' Russell 2000
- Copyright (C) Jeremy Allison 2000-2003
- Copyright (C) Rusty Russell 2010
-
- ** NOTE! The following LGPL license applies to the ntdb
- ** library. This does NOT imply that all of Samba is released
- ** under the LGPL
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <ccan/likely/likely.h>
-
-static void free_old_mmaps(struct ntdb_context *ntdb)
-{
- struct ntdb_old_mmap *i;
-
- assert(ntdb->file->direct_count == 0);
-
- while ((i = ntdb->file->old_mmaps) != NULL) {
- ntdb->file->old_mmaps = i->next;
- if (ntdb->flags & NTDB_INTERNAL) {
- ntdb->free_fn(i->map_ptr, ntdb->alloc_data);
- } else {
- munmap(i->map_ptr, i->map_size);
- }
- ntdb->free_fn(i, ntdb->alloc_data);
- }
-}
-
-static enum NTDB_ERROR save_old_map(struct ntdb_context *ntdb)
-{
- struct ntdb_old_mmap *old;
-
- assert(ntdb->file->direct_count);
-
- old = ntdb->alloc_fn(ntdb->file, sizeof(*old), ntdb->alloc_data);
- if (!old) {
- return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
- "save_old_map alloc failed");
- }
- old->next = ntdb->file->old_mmaps;
- old->map_ptr = ntdb->file->map_ptr;
- old->map_size = ntdb->file->map_size;
- ntdb->file->old_mmaps = old;
-
- return NTDB_SUCCESS;
-}
-
-enum NTDB_ERROR ntdb_munmap(struct ntdb_context *ntdb)
-{
- if (ntdb->file->fd == -1) {
- return NTDB_SUCCESS;
- }
-
- if (!ntdb->file->map_ptr) {
- return NTDB_SUCCESS;
- }
-
- /* We can't unmap now if there are accessors. */
- if (ntdb->file->direct_count) {
- return save_old_map(ntdb);
- } else {
- munmap(ntdb->file->map_ptr, ntdb->file->map_size);
- ntdb->file->map_ptr = NULL;
- }
- return NTDB_SUCCESS;
-}
-
-enum NTDB_ERROR ntdb_mmap(struct ntdb_context *ntdb)
-{
- int mmap_flags;
-
- if (ntdb->flags & NTDB_INTERNAL)
- return NTDB_SUCCESS;
-
-#ifndef HAVE_INCOHERENT_MMAP
- if (ntdb->flags & NTDB_NOMMAP)
- return NTDB_SUCCESS;
-#endif
-
- if ((ntdb->open_flags & O_ACCMODE) == O_RDONLY)
- mmap_flags = PROT_READ;
- else
- mmap_flags = PROT_READ | PROT_WRITE;
-
- /* size_t can be smaller than off_t. */
- if ((size_t)ntdb->file->map_size == ntdb->file->map_size) {
- ntdb->file->map_ptr = mmap(NULL, ntdb->file->map_size,
- mmap_flags,
- MAP_SHARED, ntdb->file->fd, 0);
- } else
- ntdb->file->map_ptr = MAP_FAILED;
-
- /*
- * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
- */
- if (ntdb->file->map_ptr == MAP_FAILED) {
- ntdb->file->map_ptr = NULL;
-#ifdef HAVE_INCOHERENT_MMAP
- /* Incoherent mmap means everyone must mmap! */
- return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
- "ntdb_mmap failed for size %lld (%s)",
- (long long)ntdb->file->map_size,
- strerror(errno));
-#else
- ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
- "ntdb_mmap failed for size %lld (%s)",
- (long long)ntdb->file->map_size, strerror(errno));
-#endif
- }
- return NTDB_SUCCESS;
-}
-
-/* check for an out of bounds access - if it is out of bounds then
- see if the database has been expanded by someone else and expand
- if necessary
- note that "len" is the minimum length needed for the db.
-
- If probe is true, len being too large isn't a failure.
-*/
-static enum NTDB_ERROR ntdb_normal_oob(struct ntdb_context *ntdb,
- ntdb_off_t off, ntdb_len_t len,
- bool probe)
-{
- struct stat st;
- enum NTDB_ERROR ecode;
-
- if (len + off < len) {
- if (probe)
- return NTDB_SUCCESS;
-
- return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
- "ntdb_oob off %llu len %llu wrap\n",
- (long long)off, (long long)len);
- }
-
- if (ntdb->flags & NTDB_INTERNAL) {
- if (probe)
- return NTDB_SUCCESS;
-
- ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
- "ntdb_oob len %lld beyond internal"
- " alloc size %lld",
- (long long)(off + len),
- (long long)ntdb->file->map_size);
- return NTDB_ERR_IO;
- }
-
- ecode = ntdb_lock_expand(ntdb, F_RDLCK);
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- if (fstat(ntdb->file->fd, &st) != 0) {
- ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
- "Failed to fstat file: %s", strerror(errno));
- ntdb_unlock_expand(ntdb, F_RDLCK);
- return NTDB_ERR_IO;
- }
-
- ntdb_unlock_expand(ntdb, F_RDLCK);
-
- if (st.st_size < off + len) {
- if (probe)
- return NTDB_SUCCESS;
-
- ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
- "ntdb_oob len %llu beyond eof at %llu",
- (long long)(off + len), (long long)st.st_size);
- return NTDB_ERR_IO;
- }
-
- /* Unmap, update size, remap */
- ecode = ntdb_munmap(ntdb);
- if (ecode) {
- return ecode;
- }
-
- ntdb->file->map_size = st.st_size;
- return ntdb_mmap(ntdb);
-}
-
-/* Endian conversion: we only ever deal with 8 byte quantities */
-void *ntdb_convert(const struct ntdb_context *ntdb, void *buf, ntdb_len_t size)
-{
- assert(size % 8 == 0);
- if (unlikely((ntdb->flags & NTDB_CONVERT)) && buf) {
- uint64_t i, *p = (uint64_t *)buf;
- for (i = 0; i < size / 8; i++)
- p[i] = bswap_64(p[i]);
- }
- return buf;
-}
-
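On a database opened with NTDB_CONVERT every on-disk 8-byte word is byte-swapped on the way in and out; a tiny standalone illustration of that swap (the value is arbitrary, and the local swap helper is a portable stand-in for bswap_64()):

#include <stdint.h>
#include <stdio.h>

/* Portable 64-bit byte swap, equivalent to the bswap_64() used above. */
static uint64_t example_swap64(uint64_t v)
{
	v = ((v & 0x00000000ffffffffULL) << 32) | (v >> 32);
	v = ((v & 0x0000ffff0000ffffULL) << 16)
	    | ((v >> 16) & 0x0000ffff0000ffffULL);
	v = ((v & 0x00ff00ff00ff00ffULL) << 8)
	    | ((v >> 8) & 0x00ff00ff00ff00ffULL);
	return v;
}

int main(void)
{
	uint64_t off = 0x0000000000012340ULL;

	/* Converting twice restores the original, which ntdb_convert relies
	 * on when the same buffer is converted for write and then reused. */
	printf("%#llx -> %#llx -> %#llx\n",
	       (unsigned long long)off,
	       (unsigned long long)example_swap64(off),
	       (unsigned long long)example_swap64(example_swap64(off)));
	return 0;
}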
-/* Return index of first non-zero entry in offset array, or end, or -ve error. */
-/* FIXME: Return the off? */
-uint64_t ntdb_find_nonzero_off(struct ntdb_context *ntdb,
- ntdb_off_t base, uint64_t start, uint64_t end)
-{
- uint64_t i;
- const uint64_t *val;
-
- /* Zero vs non-zero is the same unconverted: minor optimization. */
- val = ntdb_access_read(ntdb, base + start * sizeof(ntdb_off_t),
- (end - start) * sizeof(ntdb_off_t), false);
- if (NTDB_PTR_IS_ERR(val)) {
- return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(val));
- }
-
- for (i = 0; i < (end - start); i++) {
- if (val[i])
- break;
- }
- ntdb_access_release(ntdb, val);
- return start + i;
-}
-
-/* Return index of first zero entry in an array of num offsets, or num, or -ve error. */
-uint64_t ntdb_find_zero_off(struct ntdb_context *ntdb, ntdb_off_t off,
- uint64_t num)
-{
- uint64_t i;
- const uint64_t *val;
-
- /* Zero vs non-zero is the same unconverted: minor optimization. */
- val = ntdb_access_read(ntdb, off, num * sizeof(ntdb_off_t), false);
- if (NTDB_PTR_IS_ERR(val)) {
- return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(val));
- }
-
- for (i = 0; i < num; i++) {
- if (!val[i])
- break;
- }
- ntdb_access_release(ntdb, val);
- return i;
-}
-
-enum NTDB_ERROR zero_out(struct ntdb_context *ntdb, ntdb_off_t off, ntdb_len_t len)
-{
- char buf[8192] = { 0 };
- void *p = ntdb->io->direct(ntdb, off, len, true);
- enum NTDB_ERROR ecode = NTDB_SUCCESS;
-
- assert(!(ntdb->flags & NTDB_RDONLY));
- if (NTDB_PTR_IS_ERR(p)) {
- return NTDB_PTR_ERR(p);
- }
- if (p) {
- memset(p, 0, len);
- return ecode;
- }
- while (len) {
- unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
- ecode = ntdb->io->twrite(ntdb, off, buf, todo);
- if (ecode != NTDB_SUCCESS) {
- break;
- }
- len -= todo;
- off += todo;
- }
- return ecode;
-}
-
-/* write a lump of data at a specified offset */
-static enum NTDB_ERROR ntdb_write(struct ntdb_context *ntdb, ntdb_off_t off,
- const void *buf, ntdb_len_t len)
-{
- enum NTDB_ERROR ecode;
-
- if (ntdb->flags & NTDB_RDONLY) {
- return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
- "Write to read-only database");
- }
-
- ecode = ntdb_oob(ntdb, off, len, false);
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- if (ntdb->file->map_ptr) {
- memcpy(off + (char *)ntdb->file->map_ptr, buf, len);
- } else {
-#ifdef HAVE_INCOHERENT_MMAP
- return NTDB_ERR_IO;
-#else
- ssize_t ret;
- ret = pwrite(ntdb->file->fd, buf, len, off);
- if (ret != len) {
- /* This shouldn't happen: we avoid sparse files. */
- if (ret >= 0)
- errno = ENOSPC;
-
- return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
- "ntdb_write: %zi at %zu len=%zu (%s)",
- ret, (size_t)off, (size_t)len,
- strerror(errno));
- }
-#endif
- }
- return NTDB_SUCCESS;
-}
-
-/* read a lump of data at a specified offset */
-static enum NTDB_ERROR ntdb_read(struct ntdb_context *ntdb, ntdb_off_t off,
- void *buf, ntdb_len_t len)
-{
- enum NTDB_ERROR ecode;
-
- ecode = ntdb_oob(ntdb, off, len, false);
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- if (ntdb->file->map_ptr) {
- memcpy(buf, off + (char *)ntdb->file->map_ptr, len);
- } else {
-#ifdef HAVE_INCOHERENT_MMAP
- return NTDB_ERR_IO;
-#else
- ssize_t r = pread(ntdb->file->fd, buf, len, off);
- if (r != len) {
- return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
- "ntdb_read failed with %zi at %zu "
- "len=%zu (%s) map_size=%zu",
- r, (size_t)off, (size_t)len,
- strerror(errno),
- (size_t)ntdb->file->map_size);
- }
-#endif
- }
- return NTDB_SUCCESS;
-}
-
-enum NTDB_ERROR ntdb_write_convert(struct ntdb_context *ntdb, ntdb_off_t off,
- const void *rec, size_t len)
-{
- enum NTDB_ERROR ecode;
-
- if (unlikely((ntdb->flags & NTDB_CONVERT))) {
- void *conv = ntdb->alloc_fn(ntdb, len, ntdb->alloc_data);
- if (!conv) {
- return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
- "ntdb_write: no memory converting"
- " %zu bytes", len);
- }
- memcpy(conv, rec, len);
- ecode = ntdb->io->twrite(ntdb, off,
- ntdb_convert(ntdb, conv, len), len);
- ntdb->free_fn(conv, ntdb->alloc_data);
- } else {
- ecode = ntdb->io->twrite(ntdb, off, rec, len);
- }
- return ecode;
-}
-
-enum NTDB_ERROR ntdb_read_convert(struct ntdb_context *ntdb, ntdb_off_t off,
- void *rec, size_t len)
-{
- enum NTDB_ERROR ecode = ntdb->io->tread(ntdb, off, rec, len);
- ntdb_convert(ntdb, rec, len);
- return ecode;
-}
-
-static void *_ntdb_alloc_read(struct ntdb_context *ntdb, ntdb_off_t offset,
- ntdb_len_t len, unsigned int prefix)
-{
- unsigned char *buf;
- enum NTDB_ERROR ecode;
-
- /* some systems don't like zero length malloc */
- buf = ntdb->alloc_fn(ntdb, prefix + len ? prefix + len : 1,
- ntdb->alloc_data);
- if (!buf) {
- ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
- "ntdb_alloc_read alloc failed len=%zu",
- (size_t)(prefix + len));
- return NTDB_ERR_PTR(NTDB_ERR_OOM);
- } else {
- ecode = ntdb->io->tread(ntdb, offset, buf+prefix, len);
- if (unlikely(ecode != NTDB_SUCCESS)) {
- ntdb->free_fn(buf, ntdb->alloc_data);
- return NTDB_ERR_PTR(ecode);
- }
- }
- return buf;
-}
-
-/* read a lump of data, allocating the space for it */
-void *ntdb_alloc_read(struct ntdb_context *ntdb, ntdb_off_t offset, ntdb_len_t len)
-{
- return _ntdb_alloc_read(ntdb, offset, len, 0);
-}
-
-static enum NTDB_ERROR fill(struct ntdb_context *ntdb,
- const void *buf, size_t size,
- ntdb_off_t off, ntdb_len_t len)
-{
- while (len) {
- size_t n = len > size ? size : len;
- ssize_t ret = pwrite(ntdb->file->fd, buf, n, off);
- if (ret != n) {
- if (ret >= 0)
- errno = ENOSPC;
-
- return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
- "fill failed:"
- " %zi at %zu len=%zu (%s)",
- ret, (size_t)off, (size_t)len,
- strerror(errno));
- }
- len -= n;
- off += n;
- }
- return NTDB_SUCCESS;
-}
-
-/* expand a file. we prefer to use ftruncate, as that is what posix
- says to use for mmap expansion */
-static enum NTDB_ERROR ntdb_expand_file(struct ntdb_context *ntdb,
- ntdb_len_t addition)
-{
- char buf[8192];
- enum NTDB_ERROR ecode;
-
- assert((ntdb->file->map_size + addition) % NTDB_PGSIZE == 0);
- if (ntdb->flags & NTDB_RDONLY) {
- return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
- "Expand on read-only database");
- }
-
- if (ntdb->flags & NTDB_INTERNAL) {
- char *new;
-
- /* Can't free it if we have direct accesses. */
- if (ntdb->file->direct_count) {
- ecode = save_old_map(ntdb);
- if (ecode) {
- return ecode;
- }
- new = ntdb->alloc_fn(ntdb->file,
- ntdb->file->map_size + addition,
- ntdb->alloc_data);
- if (new) {
- memcpy(new, ntdb->file->map_ptr,
- ntdb->file->map_size);
- }
- } else {
- new = ntdb->expand_fn(ntdb->file->map_ptr,
- ntdb->file->map_size + addition,
- ntdb->alloc_data);
- }
- if (!new) {
- return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
- "No memory to expand database");
- }
- ntdb->file->map_ptr = new;
- ntdb->file->map_size += addition;
- return NTDB_SUCCESS;
- } else {
- /* Unmap before trying to write; old NTDB claimed OpenBSD had
- * a problem with this otherwise. */
- ecode = ntdb_munmap(ntdb);
- if (ecode) {
- return ecode;
- }
-
- /* If this fails, we try to fill anyway. */
- if (ftruncate(ntdb->file->fd, ntdb->file->map_size + addition))
- ;
-
- /* now fill the file with something. This ensures that the
- file isn't sparse, which would be very bad if we ran out of
- disk. This must be done with write, not via mmap */
- memset(buf, 0x43, sizeof(buf));
- ecode = fill(ntdb, buf, sizeof(buf), ntdb->file->map_size,
- addition);
- if (ecode != NTDB_SUCCESS)
- return ecode;
- ntdb->file->map_size += addition;
- return ntdb_mmap(ntdb);
- }
-}
-
-const void *ntdb_access_read(struct ntdb_context *ntdb,
- ntdb_off_t off, ntdb_len_t len, bool convert)
-{
- void *ret = NULL;
-
- if (likely(!(ntdb->flags & NTDB_CONVERT))) {
- ret = ntdb->io->direct(ntdb, off, len, false);
-
- if (NTDB_PTR_IS_ERR(ret)) {
- return ret;
- }
- }
- if (!ret) {
- struct ntdb_access_hdr *hdr;
- hdr = _ntdb_alloc_read(ntdb, off, len, sizeof(*hdr));
- if (NTDB_PTR_IS_ERR(hdr)) {
- return hdr;
- }
- hdr->next = ntdb->access;
- ntdb->access = hdr;
- ret = hdr + 1;
- if (convert) {
- ntdb_convert(ntdb, (void *)ret, len);
- }
- } else {
- ntdb->file->direct_count++;
- }
-
- return ret;
-}
-
-void *ntdb_access_write(struct ntdb_context *ntdb,
- ntdb_off_t off, ntdb_len_t len, bool convert)
-{
- void *ret = NULL;
-
- if (ntdb->flags & NTDB_RDONLY) {
- ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
- "Write to read-only database");
- return NTDB_ERR_PTR(NTDB_ERR_RDONLY);
- }
-
- if (likely(!(ntdb->flags & NTDB_CONVERT))) {
- ret = ntdb->io->direct(ntdb, off, len, true);
-
- if (NTDB_PTR_IS_ERR(ret)) {
- return ret;
- }
- }
-
- if (!ret) {
- struct ntdb_access_hdr *hdr;
- hdr = _ntdb_alloc_read(ntdb, off, len, sizeof(*hdr));
- if (NTDB_PTR_IS_ERR(hdr)) {
- return hdr;
- }
- hdr->next = ntdb->access;
- ntdb->access = hdr;
- hdr->off = off;
- hdr->len = len;
- hdr->convert = convert;
- ret = hdr + 1;
- if (convert)
- ntdb_convert(ntdb, (void *)ret, len);
- } else {
- ntdb->file->direct_count++;
- }
- return ret;
-}
-
-static struct ntdb_access_hdr **find_hdr(struct ntdb_context *ntdb, const void *p)
-{
- struct ntdb_access_hdr **hp;
-
- for (hp = &ntdb->access; *hp; hp = &(*hp)->next) {
- if (*hp + 1 == p)
- return hp;
- }
- return NULL;
-}
-
-void ntdb_access_release(struct ntdb_context *ntdb, const void *p)
-{
- struct ntdb_access_hdr *hdr, **hp = find_hdr(ntdb, p);
-
- if (hp) {
- hdr = *hp;
- *hp = hdr->next;
- ntdb->free_fn(hdr, ntdb->alloc_data);
- } else {
- if (--ntdb->file->direct_count == 0) {
- free_old_mmaps(ntdb);
- }
- }
-}
-
-enum NTDB_ERROR ntdb_access_commit(struct ntdb_context *ntdb, void *p)
-{
- struct ntdb_access_hdr *hdr, **hp = find_hdr(ntdb, p);
- enum NTDB_ERROR ecode;
-
- if (hp) {
- hdr = *hp;
- if (hdr->convert)
- ecode = ntdb_write_convert(ntdb, hdr->off, p, hdr->len);
- else
- ecode = ntdb_write(ntdb, hdr->off, p, hdr->len);
- *hp = hdr->next;
- ntdb->free_fn(hdr, ntdb->alloc_data);
- } else {
- if (--ntdb->file->direct_count == 0) {
- free_old_mmaps(ntdb);
- }
- ecode = NTDB_SUCCESS;
- }
-
- return ecode;
-}
-
-static void *ntdb_direct(struct ntdb_context *ntdb, ntdb_off_t off, size_t len,
- bool write_mode)
-{
- enum NTDB_ERROR ecode;
-
- if (unlikely(!ntdb->file->map_ptr))
- return NULL;
-
- ecode = ntdb_oob(ntdb, off, len, false);
- if (unlikely(ecode != NTDB_SUCCESS))
- return NTDB_ERR_PTR(ecode);
- return (char *)ntdb->file->map_ptr + off;
-}
-
-static ntdb_off_t ntdb_read_normal_off(struct ntdb_context *ntdb,
- ntdb_off_t off)
-{
- ntdb_off_t ret;
- enum NTDB_ERROR ecode;
- ntdb_off_t *p;
-
- p = ntdb_direct(ntdb, off, sizeof(*p), false);
- if (NTDB_PTR_IS_ERR(p)) {
- return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(p));
- }
- if (likely(p)) {
- return *p;
- }
-
- ecode = ntdb_read(ntdb, off, &ret, sizeof(ret));
- if (ecode != NTDB_SUCCESS) {
- return NTDB_ERR_TO_OFF(ecode);
- }
- return ret;
-}
-
-static ntdb_off_t ntdb_read_convert_off(struct ntdb_context *ntdb,
- ntdb_off_t off)
-{
- ntdb_off_t ret;
- enum NTDB_ERROR ecode;
-
- ecode = ntdb_read_convert(ntdb, off, &ret, sizeof(ret));
- if (ecode != NTDB_SUCCESS) {
- return NTDB_ERR_TO_OFF(ecode);
- }
- return ret;
-}
-
-static enum NTDB_ERROR ntdb_write_normal_off(struct ntdb_context *ntdb,
- ntdb_off_t off, ntdb_off_t val)
-{
- ntdb_off_t *p;
-
- p = ntdb_direct(ntdb, off, sizeof(*p), true);
- if (NTDB_PTR_IS_ERR(p)) {
- return NTDB_PTR_ERR(p);
- }
- if (likely(p)) {
- *p = val;
- return NTDB_SUCCESS;
- }
- return ntdb_write(ntdb, off, &val, sizeof(val));
-}
-
-static enum NTDB_ERROR ntdb_write_convert_off(struct ntdb_context *ntdb,
- ntdb_off_t off, ntdb_off_t val)
-{
- return ntdb_write_convert(ntdb, off, &val, sizeof(val));
-}
-
-void ntdb_inc_seqnum(struct ntdb_context *ntdb)
-{
- ntdb_off_t seq;
-
- if (likely(!(ntdb->flags & NTDB_CONVERT))) {
- int64_t *direct;
-
- direct = ntdb->io->direct(ntdb,
- offsetof(struct ntdb_header, seqnum),
- sizeof(*direct), true);
- if (likely(direct)) {
- /* Don't let it go negative, even briefly */
- if (unlikely((*direct) + 1 < 0))
- *direct = 0;
- (*direct)++;
- return;
- }
- }
-
- seq = ntdb_read_off(ntdb, offsetof(struct ntdb_header, seqnum));
- if (!NTDB_OFF_IS_ERR(seq)) {
- seq++;
- if (unlikely((int64_t)seq < 0))
- seq = 0;
- ntdb_write_off(ntdb, offsetof(struct ntdb_header, seqnum), seq);
- }
-}
-
-static const struct ntdb_methods io_methods = {
- ntdb_read,
- ntdb_write,
- ntdb_normal_oob,
- ntdb_expand_file,
- ntdb_direct,
- ntdb_read_normal_off,
- ntdb_write_normal_off,
-};
-
-static const struct ntdb_methods io_convert_methods = {
- ntdb_read,
- ntdb_write,
- ntdb_normal_oob,
- ntdb_expand_file,
- ntdb_direct,
- ntdb_read_convert_off,
- ntdb_write_convert_off,
-};
-
-/*
- initialise the default methods table
-*/
-void ntdb_io_init(struct ntdb_context *ntdb)
-{
- if (ntdb->flags & NTDB_CONVERT)
- ntdb->io = &io_convert_methods;
- else
- ntdb->io = &io_methods;
-}
+++ /dev/null
- /*
- Unix SMB/CIFS implementation.
-
- trivial database library
-
- Copyright (C) Andrew Tridgell 1999-2005
- Copyright (C) Paul `Rusty' Russell 2000
- Copyright (C) Jeremy Allison 2000-2003
-
- ** NOTE! The following LGPL license applies to the ntdb
- ** library. This does NOT imply that all of Samba is released
- ** under the LGPL
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "private.h"
-#include <ccan/build_assert/build_assert.h>
-
-/* If we were threaded, we could wait for unlock, but we're not, so fail. */
-enum NTDB_ERROR owner_conflict(struct ntdb_context *ntdb, const char *call)
-{
- return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
- "%s: lock owned by another ntdb in this process.",
- call);
-}
-
-/* If we fork, we no longer really own locks. */
-bool check_lock_pid(struct ntdb_context *ntdb, const char *call, bool log)
-{
- /* No locks? No problem! */
- if (ntdb->file->allrecord_lock.count == 0
- && ntdb->file->num_lockrecs == 0) {
- return true;
- }
-
- /* No fork? No problem! */
- if (ntdb->file->locker == getpid()) {
- return true;
- }
-
- if (log) {
- ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
- "%s: fork() detected after lock acquisition!"
- " (%u vs %u)", call,
- (unsigned int)ntdb->file->locker,
- (unsigned int)getpid());
- }
- return false;
-}
-
-int ntdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
- void *unused)
-{
- struct flock fl;
- int ret;
-
- do {
- fl.l_type = rw;
- fl.l_whence = SEEK_SET;
- fl.l_start = off;
- fl.l_len = len;
-
- if (waitflag)
- ret = fcntl(fd, F_SETLKW, &fl);
- else
- ret = fcntl(fd, F_SETLK, &fl);
- } while (ret != 0 && errno == EINTR);
- return ret;
-}
-
-int ntdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *unused)
-{
- struct flock fl;
- int ret;
-
- do {
- fl.l_type = F_UNLCK;
- fl.l_whence = SEEK_SET;
- fl.l_start = off;
- fl.l_len = len;
-
- ret = fcntl(fd, F_SETLKW, &fl);
- } while (ret != 0 && errno == EINTR);
- return ret;
-}
-
-static int lock(struct ntdb_context *ntdb,
- int rw, off_t off, off_t len, bool waitflag)
-{
- int ret;
- if (ntdb->file->allrecord_lock.count == 0
- && ntdb->file->num_lockrecs == 0) {
- ntdb->file->locker = getpid();
- }
-
- ntdb->stats.lock_lowlevel++;
- ret = ntdb->lock_fn(ntdb->file->fd, rw, off, len, waitflag,
- ntdb->lock_data);
- if (!waitflag) {
- ntdb->stats.lock_nonblock++;
- if (ret != 0)
- ntdb->stats.lock_nonblock_fail++;
- }
- return ret;
-}
-
-static int unlock(struct ntdb_context *ntdb, int rw, off_t off, off_t len)
-{
-#if 0 /* Check they matched up locks and unlocks correctly. */
- char line[80];
- FILE *locks;
- bool found = false;
-
- locks = fopen("/proc/locks", "r");
-
- while (fgets(line, 80, locks)) {
- char *p;
- int type, start, l;
-
- /* eg. 1: POSIX ADVISORY WRITE 2440 08:01:2180826 0 EOF */
- p = strchr(line, ':') + 1;
- if (strncmp(p, " POSIX ADVISORY ", strlen(" POSIX ADVISORY ")))
- continue;
- p += strlen(" POSIX ADVISORY ");
- if (strncmp(p, "READ ", strlen("READ ")) == 0)
- type = F_RDLCK;
- else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
- type = F_WRLCK;
- else
- abort();
- p += 6;
- if (atoi(p) != getpid())
- continue;
- p = strchr(strchr(p, ' ') + 1, ' ') + 1;
- start = atoi(p);
- p = strchr(p, ' ') + 1;
- if (strncmp(p, "EOF", 3) == 0)
- l = 0;
- else
- l = atoi(p) - start + 1;
-
- if (off == start) {
- if (len != l) {
- fprintf(stderr, "Len %u should be %u: %s",
- (int)len, l, line);
- abort();
- }
- if (type != rw) {
- fprintf(stderr, "Type %s wrong: %s",
- rw == F_RDLCK ? "READ" : "WRITE", line);
- abort();
- }
- found = true;
- break;
- }
- }
-
- if (!found) {
- fprintf(stderr, "Unlock on %u@%u not found!",
- (int)off, (int)len);
- abort();
- }
-
- fclose(locks);
-#endif
-
- return ntdb->unlock_fn(ntdb->file->fd, rw, off, len, ntdb->lock_data);
-}
-
-/* a byte range locking function - returns NTDB_SUCCESS on success.
- this function locks len bytes at the specified offset.
-
- note that a len of zero means lock to end of file
-*/
-static enum NTDB_ERROR ntdb_brlock(struct ntdb_context *ntdb,
- int rw_type, ntdb_off_t offset, ntdb_off_t len,
- enum ntdb_lock_flags flags)
-{
- int ret;
-
- if (rw_type == F_WRLCK && (ntdb->flags & NTDB_RDONLY)) {
- return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
- "Write lock attempted on read-only database");
- }
-
- if (ntdb->flags & NTDB_NOLOCK) {
- return NTDB_SUCCESS;
- }
-
- /* A 32 bit system cannot open a 64-bit file, but it could have
- * expanded since then: check here. */
- if ((size_t)(offset + len) != offset + len) {
- return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
- "ntdb_brlock: lock on giant offset %llu",
- (long long)(offset + len));
- }
-
- ret = lock(ntdb, rw_type, offset, len, flags & NTDB_LOCK_WAIT);
- if (ret != 0) {
- /* Generic lock error. errno set by fcntl.
- * EAGAIN is an expected return from non-blocking
- * locks. */
- if (!(flags & NTDB_LOCK_PROBE)
- && (errno != EAGAIN && errno != EINTR)) {
- ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
- "ntdb_brlock failed (fd=%d) at"
- " offset %zu rw_type=%d flags=%d len=%zu:"
- " %s",
- ntdb->file->fd, (size_t)offset, rw_type,
- flags, (size_t)len, strerror(errno));
- }
- return NTDB_ERR_LOCK;
- }
- return NTDB_SUCCESS;
-}
-
-static enum NTDB_ERROR ntdb_brunlock(struct ntdb_context *ntdb,
- int rw_type, ntdb_off_t offset, size_t len)
-{
- if (ntdb->flags & NTDB_NOLOCK) {
- return NTDB_SUCCESS;
- }
-
- if (!check_lock_pid(ntdb, "ntdb_brunlock", false))
- return NTDB_ERR_LOCK;
-
- if (unlock(ntdb, rw_type, offset, len) == -1) {
- return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
- "ntdb_brunlock failed (fd=%d) at offset %zu"
- " rw_type=%d len=%zu: %s",
- ntdb->file->fd, (size_t)offset, rw_type,
- (size_t)len, strerror(errno));
- }
- return NTDB_SUCCESS;
-}
-
-/*
- upgrade a read lock to a write lock. This needs to be handled in a
- special way as some OSes (such as Solaris) have overly conservative
- deadlock detection and claim a deadlock when progress can be
- made. For those OSes we may loop for a while.
-*/
-enum NTDB_ERROR ntdb_allrecord_upgrade(struct ntdb_context *ntdb, off_t start)
-{
- int count = 1000;
-
- if (!check_lock_pid(ntdb, "ntdb_transaction_prepare_commit", true))
- return NTDB_ERR_LOCK;
-
- if (ntdb->file->allrecord_lock.count != 1) {
- return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
- "ntdb_allrecord_upgrade failed:"
- " count %u too high",
- ntdb->file->allrecord_lock.count);
- }
-
- if (ntdb->file->allrecord_lock.off != 1) {
- return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
- "ntdb_allrecord_upgrade failed:"
- " already upgraded?");
- }
-
- if (ntdb->file->allrecord_lock.owner != ntdb) {
- return owner_conflict(ntdb, "ntdb_allrecord_upgrade");
- }
-
- while (count--) {
- struct timeval tv;
- if (ntdb_brlock(ntdb, F_WRLCK, start, 0,
- NTDB_LOCK_WAIT|NTDB_LOCK_PROBE) == NTDB_SUCCESS) {
- ntdb->file->allrecord_lock.ltype = F_WRLCK;
- ntdb->file->allrecord_lock.off = 0;
- return NTDB_SUCCESS;
- }
- if (errno != EDEADLK) {
- break;
- }
- /* sleep for as short a time as we can - more portable than usleep() */
- tv.tv_sec = 0;
- tv.tv_usec = 1;
- select(0, NULL, NULL, NULL, &tv);
- }
-
- if (errno != EAGAIN && errno != EINTR)
- ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
- "ntdb_allrecord_upgrade failed");
- return NTDB_ERR_LOCK;
-}
-
-static struct ntdb_lock *find_nestlock(struct ntdb_context *ntdb, ntdb_off_t offset,
- const struct ntdb_context *owner)
-{
- unsigned int i;
-
- for (i=0; i<ntdb->file->num_lockrecs; i++) {
- if (ntdb->file->lockrecs[i].off == offset) {
- if (owner && ntdb->file->lockrecs[i].owner != owner)
- return NULL;
- return &ntdb->file->lockrecs[i];
- }
- }
- return NULL;
-}
-
-enum NTDB_ERROR ntdb_lock_and_recover(struct ntdb_context *ntdb)
-{
- enum NTDB_ERROR ecode;
-
- if (!check_lock_pid(ntdb, "ntdb_transaction_prepare_commit", true))
- return NTDB_ERR_LOCK;
-
- ecode = ntdb_allrecord_lock(ntdb, F_WRLCK, NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK,
- false);
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- ecode = ntdb_lock_open(ntdb, F_WRLCK, NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
- if (ecode != NTDB_SUCCESS) {
- ntdb_allrecord_unlock(ntdb, F_WRLCK);
- return ecode;
- }
- ecode = ntdb_transaction_recover(ntdb);
- ntdb_unlock_open(ntdb, F_WRLCK);
- ntdb_allrecord_unlock(ntdb, F_WRLCK);
-
- return ecode;
-}
-
-/* lock an offset in the database. */
-static enum NTDB_ERROR ntdb_nest_lock(struct ntdb_context *ntdb,
- ntdb_off_t offset, int ltype,
- enum ntdb_lock_flags flags)
-{
- struct ntdb_lock *new_lck;
- enum NTDB_ERROR ecode;
-
- assert(offset <= (NTDB_HASH_LOCK_START + (1 << ntdb->hash_bits)
- + ntdb->file->map_size / 8));
-
- if (ntdb->flags & NTDB_NOLOCK)
- return NTDB_SUCCESS;
-
- if (!check_lock_pid(ntdb, "ntdb_nest_lock", true)) {
- return NTDB_ERR_LOCK;
- }
-
- ntdb->stats.locks++;
-
- new_lck = find_nestlock(ntdb, offset, NULL);
- if (new_lck) {
- if (new_lck->owner != ntdb) {
- return owner_conflict(ntdb, "ntdb_nest_lock");
- }
-
- if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
- return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
- "ntdb_nest_lock:"
- " offset %zu has read lock",
- (size_t)offset);
- }
- /* Just increment the struct, posix locks don't stack. */
- new_lck->count++;
- return NTDB_SUCCESS;
- }
-
-#if 0
- if (ntdb->file->num_lockrecs
- && offset >= NTDB_HASH_LOCK_START
- && offset < NTDB_HASH_LOCK_START + NTDB_HASH_LOCK_RANGE) {
- return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
- "ntdb_nest_lock: already have a hash lock?");
- }
-#endif
- if (ntdb->file->lockrecs == NULL) {
- new_lck = ntdb->alloc_fn(ntdb->file, sizeof(*ntdb->file->lockrecs),
- ntdb->alloc_data);
- } else {
- new_lck = (struct ntdb_lock *)ntdb->expand_fn(
- ntdb->file->lockrecs,
- sizeof(*ntdb->file->lockrecs)
- * (ntdb->file->num_lockrecs+1),
- ntdb->alloc_data);
- }
- if (new_lck == NULL) {
- return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
- "ntdb_nest_lock:"
- " unable to allocate %zu lock struct",
- ntdb->file->num_lockrecs + 1);
- }
- ntdb->file->lockrecs = new_lck;
-
- /* Since fcntl locks don't nest, we do a lock for the first one,
- and simply bump the count for future ones */
- ecode = ntdb_brlock(ntdb, ltype, offset, 1, flags);
- if (ecode != NTDB_SUCCESS) {
- return ecode;
- }
-
- /* First time we grab a lock, perhaps someone died in commit? */
- if (!(flags & NTDB_LOCK_NOCHECK)
- && ntdb->file->num_lockrecs == 0) {
- ntdb_bool_err berr = ntdb_needs_recovery(ntdb);
- if (berr != false) {
- ntdb_brunlock(ntdb, ltype, offset, 1);
-