ccan/ntdb: demote to junkcode.
author Rusty Russell <rusty@rustcorp.com.au>
Mon, 3 Apr 2017 00:08:26 +0000 (09:38 +0930)
committer Rusty Russell <rusty@rustcorp.com.au>
Mon, 3 Apr 2017 00:08:26 +0000 (09:38 +0930)
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
220 files changed:
ccan/ntdb/ABI/ntdb-0.9.sigs [deleted file]
ccan/ntdb/ABI/ntdb-1.0.sigs [deleted file]
ccan/ntdb/LICENSE [deleted symlink]
ccan/ntdb/Makefile [deleted file]
ccan/ntdb/_info [deleted file]
ccan/ntdb/check.c [deleted file]
ccan/ntdb/doc/TDB_porting.txt [deleted file]
ccan/ntdb/doc/design.lyx [deleted file]
ccan/ntdb/doc/design.pdf [deleted file]
ccan/ntdb/doc/design.txt [deleted file]
ccan/ntdb/free.c [deleted file]
ccan/ntdb/hash.c [deleted file]
ccan/ntdb/io.c [deleted file]
ccan/ntdb/lock.c [deleted file]
ccan/ntdb/man/ntdb.3.xml [deleted file]
ccan/ntdb/man/ntdbbackup.8.xml [deleted file]
ccan/ntdb/man/ntdbdump.8.xml [deleted file]
ccan/ntdb/man/ntdbrestore.8.xml [deleted file]
ccan/ntdb/man/ntdbtool.8.xml [deleted file]
ccan/ntdb/ntdb.c [deleted file]
ccan/ntdb/ntdb.h [deleted file]
ccan/ntdb/ntdb.pc.in [deleted file]
ccan/ntdb/open.c [deleted file]
ccan/ntdb/private.h [deleted file]
ccan/ntdb/python/pyntdb.c [deleted file]
ccan/ntdb/summary.c [deleted file]
ccan/ntdb/test/api-12-store.c [deleted file]
ccan/ntdb/test/api-13-delete.c [deleted file]
ccan/ntdb/test/api-14-exists.c [deleted file]
ccan/ntdb/test/api-16-wipe_all.c [deleted file]
ccan/ntdb/test/api-20-alloc-attr.c [deleted file]
ccan/ntdb/test/api-21-parse_record.c [deleted file]
ccan/ntdb/test/api-55-transaction.c [deleted file]
ccan/ntdb/test/api-60-noop-transaction.c [deleted file]
ccan/ntdb/test/api-80-tdb_fd.c [deleted file]
ccan/ntdb/test/api-81-seqnum.c [deleted file]
ccan/ntdb/test/api-82-lockattr.c [deleted file]
ccan/ntdb/test/api-83-openhook.c [deleted file]
ccan/ntdb/test/api-91-get-stats.c [deleted file]
ccan/ntdb/test/api-92-get-set-readonly.c [deleted file]
ccan/ntdb/test/api-93-repack.c [deleted file]
ccan/ntdb/test/api-94-expand-during-parse.c [deleted file]
ccan/ntdb/test/api-95-read-only-during-parse.c [deleted file]
ccan/ntdb/test/api-add-remove-flags.c [deleted file]
ccan/ntdb/test/api-check-callback.c [deleted file]
ccan/ntdb/test/api-firstkey-nextkey.c [deleted file]
ccan/ntdb/test/api-fork-test.c [deleted file]
ccan/ntdb/test/api-locktimeout.c [deleted file]
ccan/ntdb/test/api-missing-entries.c [deleted file]
ccan/ntdb/test/api-open-multiple-times.c [deleted file]
ccan/ntdb/test/api-record-expand.c [deleted file]
ccan/ntdb/test/api-simple-delete.c [deleted file]
ccan/ntdb/test/api-summary.c [deleted file]
ccan/ntdb/test/external-agent.c [deleted file]
ccan/ntdb/test/external-agent.h [deleted file]
ccan/ntdb/test/failtest_helper.c [deleted file]
ccan/ntdb/test/failtest_helper.h [deleted file]
ccan/ntdb/test/helpapi-external-agent.h [deleted file]
ccan/ntdb/test/helprun-external-agent.h [deleted file]
ccan/ntdb/test/helprun-layout.h [deleted file]
ccan/ntdb/test/layout.h [deleted file]
ccan/ntdb/test/lock-tracking.c [deleted file]
ccan/ntdb/test/lock-tracking.h [deleted file]
ccan/ntdb/test/logging.c [deleted file]
ccan/ntdb/test/logging.h [deleted file]
ccan/ntdb/test/no-fsync.h [deleted file]
ccan/ntdb/test/ntdb-source.h [deleted file]
ccan/ntdb/test/python-api.py [deleted file]
ccan/ntdb/test/run-001-encode.c [deleted file]
ccan/ntdb/test/run-001-fls.c [deleted file]
ccan/ntdb/test/run-01-new_database.c [deleted file]
ccan/ntdb/test/run-02-expand.c [deleted file]
ccan/ntdb/test/run-03-coalesce.c [deleted file]
ccan/ntdb/test/run-04-basichash.c [deleted file]
ccan/ntdb/test/run-05-readonly-open.c [deleted file]
ccan/ntdb/test/run-10-simple-store.c [deleted file]
ccan/ntdb/test/run-11-simple-fetch.c [deleted file]
ccan/ntdb/test/run-12-check.c [deleted file]
ccan/ntdb/test/run-15-append.c [deleted file]
ccan/ntdb/test/run-25-hashoverload.c [deleted file]
ccan/ntdb/test/run-30-exhaust-before-expand.c [deleted file]
ccan/ntdb/test/run-35-convert.c [deleted file]
ccan/ntdb/test/run-50-multiple-freelists.c [deleted file]
ccan/ntdb/test/run-56-open-during-transaction.c [deleted file]
ccan/ntdb/test/run-57-die-during-transaction.c [deleted file]
ccan/ntdb/test/run-64-bit-tdb.c [deleted file]
ccan/ntdb/test/run-90-get-set-attributes.c [deleted file]
ccan/ntdb/test/run-capabilities.c [deleted file]
ccan/ntdb/test/run-expand-in-transaction.c [deleted file]
ccan/ntdb/test/run-features.c [deleted file]
ccan/ntdb/test/run-lockall.c [deleted file]
ccan/ntdb/test/run-remap-in-read_traverse.c [deleted file]
ccan/ntdb/test/run-seed.c [deleted file]
ccan/ntdb/test/run-tdb_errorstr.c [deleted file]
ccan/ntdb/test/run-tdb_foreach.c [deleted file]
ccan/ntdb/test/run-traverse.c [deleted file]
ccan/ntdb/test/tap-interface.c [deleted file]
ccan/ntdb/test/tap-interface.h [deleted file]
ccan/ntdb/tools/Makefile [deleted file]
ccan/ntdb/tools/growtdb-bench.c [deleted file]
ccan/ntdb/tools/mkntdb.c [deleted file]
ccan/ntdb/tools/ntdbbackup.c [deleted file]
ccan/ntdb/tools/ntdbdump.c [deleted file]
ccan/ntdb/tools/ntdbrestore.c [deleted file]
ccan/ntdb/tools/ntdbtool.c [deleted file]
ccan/ntdb/tools/ntdbtorture.c [deleted file]
ccan/ntdb/tools/speed.c [deleted file]
ccan/ntdb/transaction.c [deleted file]
ccan/ntdb/traverse.c [deleted file]
ccan/ntdb/wscript [deleted file]
junkcode/rusty@rustcorp.com.au-ntdb/ABI/ntdb-0.9.sigs [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/ABI/ntdb-1.0.sigs [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/LICENSE [new symlink]
junkcode/rusty@rustcorp.com.au-ntdb/Makefile [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/_info [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/check.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/doc/TDB_porting.txt [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/doc/design.lyx [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/doc/design.pdf [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/doc/design.txt [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/free.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/hash.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/io.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/lock.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/man/ntdb.3.xml [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/man/ntdbbackup.8.xml [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/man/ntdbdump.8.xml [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/man/ntdbrestore.8.xml [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/man/ntdbtool.8.xml [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/ntdb.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/ntdb.h [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/ntdb.pc.in [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/open.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/private.h [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/python/pyntdb.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/summary.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-12-store.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-13-delete.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-14-exists.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-16-wipe_all.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-20-alloc-attr.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-21-parse_record.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-55-transaction.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-60-noop-transaction.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-80-tdb_fd.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-81-seqnum.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-82-lockattr.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-83-openhook.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-91-get-stats.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-92-get-set-readonly.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-93-repack.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-94-expand-during-parse.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-95-read-only-during-parse.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-add-remove-flags.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-check-callback.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-firstkey-nextkey.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-fork-test.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-locktimeout.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-missing-entries.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-open-multiple-times.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-record-expand.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-simple-delete.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/api-summary.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/external-agent.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/external-agent.h [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/failtest_helper.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/failtest_helper.h [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/helpapi-external-agent.h [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/helprun-external-agent.h [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/helprun-layout.h [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/layout.h [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/lock-tracking.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/lock-tracking.h [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/logging.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/logging.h [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/no-fsync.h [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/ntdb-source.h [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/python-api.py [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-001-encode.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-001-fls.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-01-new_database.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-02-expand.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-03-coalesce.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-04-basichash.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-05-readonly-open.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-10-simple-store.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-11-simple-fetch.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-12-check.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-15-append.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-25-hashoverload.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-30-exhaust-before-expand.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-35-convert.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-50-multiple-freelists.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-56-open-during-transaction.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-57-die-during-transaction.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-64-bit-tdb.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-90-get-set-attributes.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-capabilities.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-expand-in-transaction.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-features.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-lockall.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-remap-in-read_traverse.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-seed.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-tdb_errorstr.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-tdb_foreach.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/run-traverse.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/tap-interface.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/test/tap-interface.h [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/tools/Makefile [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/tools/growtdb-bench.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/tools/mkntdb.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/tools/ntdbbackup.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/tools/ntdbdump.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/tools/ntdbrestore.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/tools/ntdbtool.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/tools/ntdbtorture.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/tools/speed.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/transaction.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/traverse.c [new file with mode: 0644]
junkcode/rusty@rustcorp.com.au-ntdb/wscript [new file with mode: 0644]

diff --git a/ccan/ntdb/ABI/ntdb-0.9.sigs b/ccan/ntdb/ABI/ntdb-0.9.sigs
deleted file mode 100644 (file)
index 6b12ddb..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-ntdb_add_flag: void (struct ntdb_context *, unsigned int)
-ntdb_append: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA)
-ntdb_chainlock: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
-ntdb_chainlock_read: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
-ntdb_chainunlock: void (struct ntdb_context *, NTDB_DATA)
-ntdb_chainunlock_read: void (struct ntdb_context *, NTDB_DATA)
-ntdb_check_: enum NTDB_ERROR (struct ntdb_context *, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
-ntdb_close: int (struct ntdb_context *)
-ntdb_delete: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
-ntdb_errorstr: const char *(enum NTDB_ERROR)
-ntdb_exists: bool (struct ntdb_context *, NTDB_DATA)
-ntdb_fd: int (const struct ntdb_context *)
-ntdb_fetch: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA *)
-ntdb_firstkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
-ntdb_foreach_: void (int (*)(struct ntdb_context *, void *), void *)
-ntdb_get_attribute: enum NTDB_ERROR (struct ntdb_context *, union ntdb_attribute *)
-ntdb_get_flags: unsigned int (struct ntdb_context *)
-ntdb_get_seqnum: int64_t (struct ntdb_context *)
-ntdb_lockall: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_lockall_read: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_name: const char *(const struct ntdb_context *)
-ntdb_nextkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
-ntdb_open: struct ntdb_context *(const char *, int, int, mode_t, union ntdb_attribute *)
-ntdb_parse_record_: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
-ntdb_remove_flag: void (struct ntdb_context *, unsigned int)
-ntdb_repack: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_set_attribute: enum NTDB_ERROR (struct ntdb_context *, const union ntdb_attribute *)
-ntdb_store: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA, int)
-ntdb_summary: enum NTDB_ERROR (struct ntdb_context *, enum ntdb_summary_flags, char **)
-ntdb_transaction_cancel: void (struct ntdb_context *)
-ntdb_transaction_commit: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_transaction_prepare_commit: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_transaction_start: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_traverse_: int64_t (struct ntdb_context *, int (*)(struct ntdb_context *, NTDB_DATA, NTDB_DATA, void *), void *)
-ntdb_unlockall: void (struct ntdb_context *)
-ntdb_unlockall_read: void (struct ntdb_context *)
-ntdb_unset_attribute: void (struct ntdb_context *, enum ntdb_attribute_type)
-ntdb_wipe_all: enum NTDB_ERROR (struct ntdb_context *)
diff --git a/ccan/ntdb/ABI/ntdb-1.0.sigs b/ccan/ntdb/ABI/ntdb-1.0.sigs
deleted file mode 100644 (file)
index 6b12ddb..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-ntdb_add_flag: void (struct ntdb_context *, unsigned int)
-ntdb_append: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA)
-ntdb_chainlock: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
-ntdb_chainlock_read: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
-ntdb_chainunlock: void (struct ntdb_context *, NTDB_DATA)
-ntdb_chainunlock_read: void (struct ntdb_context *, NTDB_DATA)
-ntdb_check_: enum NTDB_ERROR (struct ntdb_context *, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
-ntdb_close: int (struct ntdb_context *)
-ntdb_delete: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
-ntdb_errorstr: const char *(enum NTDB_ERROR)
-ntdb_exists: bool (struct ntdb_context *, NTDB_DATA)
-ntdb_fd: int (const struct ntdb_context *)
-ntdb_fetch: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA *)
-ntdb_firstkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
-ntdb_foreach_: void (int (*)(struct ntdb_context *, void *), void *)
-ntdb_get_attribute: enum NTDB_ERROR (struct ntdb_context *, union ntdb_attribute *)
-ntdb_get_flags: unsigned int (struct ntdb_context *)
-ntdb_get_seqnum: int64_t (struct ntdb_context *)
-ntdb_lockall: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_lockall_read: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_name: const char *(const struct ntdb_context *)
-ntdb_nextkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
-ntdb_open: struct ntdb_context *(const char *, int, int, mode_t, union ntdb_attribute *)
-ntdb_parse_record_: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
-ntdb_remove_flag: void (struct ntdb_context *, unsigned int)
-ntdb_repack: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_set_attribute: enum NTDB_ERROR (struct ntdb_context *, const union ntdb_attribute *)
-ntdb_store: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA, int)
-ntdb_summary: enum NTDB_ERROR (struct ntdb_context *, enum ntdb_summary_flags, char **)
-ntdb_transaction_cancel: void (struct ntdb_context *)
-ntdb_transaction_commit: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_transaction_prepare_commit: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_transaction_start: enum NTDB_ERROR (struct ntdb_context *)
-ntdb_traverse_: int64_t (struct ntdb_context *, int (*)(struct ntdb_context *, NTDB_DATA, NTDB_DATA, void *), void *)
-ntdb_unlockall: void (struct ntdb_context *)
-ntdb_unlockall_read: void (struct ntdb_context *)
-ntdb_unset_attribute: void (struct ntdb_context *, enum ntdb_attribute_type)
-ntdb_wipe_all: enum NTDB_ERROR (struct ntdb_context *)
diff --git a/ccan/ntdb/LICENSE b/ccan/ntdb/LICENSE
deleted file mode 120000 (symlink)
index 7455044..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../licenses/LGPL-3
\ No newline at end of file
diff --git a/ccan/ntdb/Makefile b/ccan/ntdb/Makefile
deleted file mode 100644 (file)
index 3ce5fd1..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-CC=gcc
-CFLAGS=-g -O0 -Wall -W -I../../ -I./
-LIBS=
-
-LIBNTDB_OBJ = ccan_hash.o ccan_tally.o check.o free.o hash.o io.o lock.o open.o summary.o ntdb.o transaction.o traverse.o
-
-all: ntdbtorture ntdbtool ntdbdump ntdbrestore ntdbbackup
-
-ntdbtorture: tools/ntdbtorture.c libntdb.a
-       $(CC) $(CFLAGS) -o tools/$@ tools/$@.c libntdb.a $(LIBS)
-
-ntdbtool: tools/ntdbtool.c libntdb.a
-       $(CC) $(CFLAGS) -o tools/$@ tools/$@.c libntdb.a $(LIBS)
-
-ntdbdump: tools/ntdbdump.c libntdb.a
-       $(CC) $(CFLAGS) -o tools/$@ tools/$@.c libntdb.a $(LIBS)
-
-ntdbrestore: tools/ntdbrestore.c libntdb.a
-       $(CC) $(CFLAGS) -o tools/$@ tools/$@.c libntdb.a $(LIBS)
-
-ntdbbackup: tools/ntdbbackup.c libntdb.a
-       $(CC) $(CFLAGS) -o tools/$@ tools/$@.c libntdb.a $(LIBS)
-
-libntdb.a: $(LIBNTDB_OBJ)
-       @echo Creating library $@
-       ar r libntdb.a $(LIBNTDB_OBJ)
-       ranlib libntdb.a
-
-check.o: check.c
-       @echo Compiling $@
-       $(CC) $(CFLAGS) -c check.c -o $@
-
-free.o: free.c
-       @echo Compiling $@
-       $(CC) $(CFLAGS) -c free.c -o $@
-
-hash.o: hash.c
-       @echo Compiling $@
-       $(CC) $(CFLAGS) -c hash.c -o $@
-
-io.o: io.c
-       @echo Compiling $@
-       $(CC) $(CFLAGS) -c io.c -o $@
-
-lock.o: lock.c
-       @echo Compiling $@
-       $(CC) $(CFLAGS) -c lock.c -o $@
-
-open.o: open.c
-       @echo Compiling $@
-       $(CC) $(CFLAGS) -c open.c -o $@
-
-summary.o: summary.c
-       @echo Compiling $@
-       $(CC) $(CFLAGS) -c summary.c -o $@
-
-ntdb.o: ntdb.c
-       @echo Compiling $@
-       $(CC) $(CFLAGS) -c ntdb.c -o $@
-
-transaction.o: transaction.c
-       @echo Compiling $@
-       $(CC) $(CFLAGS) -c transaction.c -o $@
-
-traverse.o: traverse.c
-       @echo Compiling $@
-       $(CC) $(CFLAGS) -c traverse.c -o $@
-
-ccan_hash.o: ../hash/hash.c
-       @echo Compiling $@
-       $(CC) $(CFLAGS) -c ../hash/hash.c -o $@
-
-ccan_tally.o: ../tally/tally.c
-       @echo Compiling $@
-       $(CC) $(CFLAGS) -c ../tally/tally.c -o $@
-
-clean:
-       rm -f *.o
-       rm -f *.a
-       rm -f tools/ntdbtorture tools/ntdbtool tools/ntdbdump tools/ntdbrestore tools/ntdbbackup
diff --git a/ccan/ntdb/_info b/ccan/ntdb/_info
deleted file mode 100644 (file)
index 5aedb81..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-#include "config.h"
-#include <stdio.h>
-#include <string.h>
-
-/**
- * ntdb - Next Generation Trivial Database
- *
- * This package provides an experimental persistent keyword/data store.
- * Its main advantage over tdb is that it's 64-bit.
- *
- * Example:
- *     #include <stdio.h>
- *      #include <err.h>
- *      #include <unistd.h>
- *      #include <ccan/ntdb/ntdb.h>
- *
- *     int main(int argc, char *argv[])
- *     {
- *             NTDB_DATA key = ntdb_mkdata("key", 3);
- *             NTDB_DATA val = ntdb_mkdata("val", 3);
- *             struct ntdb_context *ntdb;
- *
- *             ntdb = ntdb_open("example.ntdb", NTDB_DEFAULT,
- *                              O_RDWR | O_CREAT | O_TRUNC, 0600, NULL);
- *             if (ntdb == NULL)
- *                     errx(1, "failed to open database file");
- *
- *             ntdb_store(ntdb, key, val, NTDB_INSERT);
- *
- *             ntdb_close(ntdb);
- *
- *             return 0;
- *     }
- *
- * License: LGPL (v3 or any later version)
- * Authors: Rusty Russell
- *          Andrew Tridgell
- *          Jeremy Allison
- *          Jelmer Vernooij
- *          Volker Lendecke
- *          Andrew Esh
- *          Simon McVittie
- *          Tim Potter
- * Maintainer: Rusty Russell <rusty@rustcorp.com.au>
- */
-int main(int argc, char *argv[])
-{
-       if (argc != 2)
-               return 1;
-
-       if (strcmp(argv[1], "depends") == 0) {
-               printf("ccan/asearch\n");       
-               printf("ccan/build_assert\n");  
-               printf("ccan/cast\n");  
-               printf("ccan/compiler\n");      
-               printf("ccan/endian\n");        
-               printf("ccan/hash\n");  
-               printf("ccan/ilog\n");  
-               printf("ccan/likely\n");        
-               printf("ccan/tally\n"); 
-               printf("ccan/typesafe_cb\n");   
-               return 0;
-       }
-
-       if (strcmp(argv[1], "testdepends") == 0) {
-               printf("ccan/failtest\n");      
-               printf("ccan/err\n");   
-               return 0;
-       }
-
-       return 1;
-}
diff --git a/ccan/ntdb/check.c b/ccan/ntdb/check.c
deleted file mode 100644 (file)
index f242394..0000000
+++ /dev/null
@@ -1,726 +0,0 @@
- /*
-   Trivial Database 2: free list/block handling
-   Copyright (C) Rusty Russell 2010
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <ccan/likely/likely.h>
-#include <ccan/asearch/asearch.h>
-
-/* We keep an ordered array of offsets. */
-static bool append(struct ntdb_context *ntdb,
-                  ntdb_off_t **arr, size_t *num, ntdb_off_t off)
-{
-       ntdb_off_t *new;
-
-       if (*num == 0) {
-               new = ntdb->alloc_fn(ntdb, sizeof(ntdb_off_t), ntdb->alloc_data);
-       } else {
-               new = ntdb->expand_fn(*arr, (*num + 1) * sizeof(ntdb_off_t),
-                                 ntdb->alloc_data);
-       }
-       if (!new)
-               return false;
-       new[(*num)++] = off;
-       *arr = new;
-       return true;
-}
-
-static enum NTDB_ERROR check_header(struct ntdb_context *ntdb,
-                                   ntdb_off_t *recovery,
-                                   uint64_t *features,
-                                   size_t *num_capabilities)
-{
-       uint64_t hash_test;
-       struct ntdb_header hdr;
-       enum NTDB_ERROR ecode;
-       ntdb_off_t off, next;
-
-       ecode = ntdb_read_convert(ntdb, 0, &hdr, sizeof(hdr));
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-       /* magic food should not be converted, so convert back. */
-       ntdb_convert(ntdb, hdr.magic_food, sizeof(hdr.magic_food));
-
-       hash_test = NTDB_HASH_MAGIC;
-       hash_test = ntdb_hash(ntdb, &hash_test, sizeof(hash_test));
-       if (hdr.hash_test != hash_test) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "check: hash test %llu should be %llu",
-                                 (long long)hdr.hash_test,
-                                 (long long)hash_test);
-       }
-
-       if (strcmp(hdr.magic_food, NTDB_MAGIC_FOOD) != 0) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "check: bad magic '%.*s'",
-                                 (unsigned)sizeof(hdr.magic_food),
-                                 hdr.magic_food);
-       }
-
-       /* Features which are used must be a subset of features offered. */
-       if (hdr.features_used & ~hdr.features_offered) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "check: features used (0x%llx) which"
-                                 " are not offered (0x%llx)",
-                                 (long long)hdr.features_used,
-                                 (long long)hdr.features_offered);
-       }
-
-       *features = hdr.features_offered;
-       *recovery = hdr.recovery;
-       if (*recovery) {
-               if (*recovery < sizeof(hdr)
-                   || *recovery > ntdb->file->map_size) {
-                       return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                         "ntdb_check:"
-                                         " invalid recovery offset %zu",
-                                         (size_t)*recovery);
-               }
-       }
-
-       for (off = hdr.capabilities; off && ecode == NTDB_SUCCESS; off = next) {
-               const struct ntdb_capability *cap;
-               enum NTDB_ERROR e;
-
-               cap = ntdb_access_read(ntdb, off, sizeof(*cap), true);
-               if (NTDB_PTR_IS_ERR(cap)) {
-                       return NTDB_PTR_ERR(cap);
-               }
-
-               /* All capabilities are unknown. */
-               e = unknown_capability(ntdb, "ntdb_check", cap->type);
-               next = cap->next;
-               ntdb_access_release(ntdb, cap);
-               if (e)
-                       return e;
-               (*num_capabilities)++;
-       }
-
-       /* Don't check reserved: they *can* be used later. */
-       return NTDB_SUCCESS;
-}
-
-static int off_cmp(const ntdb_off_t *a, const ntdb_off_t *b, void *ctx)
-{
-       /* Can overflow an int. */
-       return *a > *b ? 1
-               : *a < *b ? -1
-               : 0;
-}
-
-static enum NTDB_ERROR check_entry(struct ntdb_context *ntdb,
-                                  ntdb_off_t off_and_hash,
-                                  ntdb_len_t bucket,
-                                  ntdb_off_t used[],
-                                  size_t num_used,
-                                  size_t *num_found,
-                                  enum NTDB_ERROR (*check)(NTDB_DATA,
-                                                           NTDB_DATA,
-                                                           void *),
-                                  void *data)
-{
-       enum NTDB_ERROR ecode;
-       const struct ntdb_used_record *r;
-       const unsigned char *kptr;
-       ntdb_len_t klen, dlen;
-       uint32_t hash;
-       ntdb_off_t off = off_and_hash & NTDB_OFF_MASK;
-       ntdb_off_t *p;
-
-       /* Empty bucket is fine. */
-       if (!off_and_hash) {
-               return NTDB_SUCCESS;
-       }
-
-       /* This can't point to a chain, we handled those at toplevel. */
-       if (off_and_hash & (1ULL << NTDB_OFF_CHAIN_BIT)) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                  "ntdb_check: Invalid chain bit in offset "
-                                  " %llu", (long long)off_and_hash);
-       }
-
-       p = asearch(&off, used, num_used, off_cmp, NULL);
-       if (!p) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                  "ntdb_check: Invalid offset"
-                                  " %llu in hash", (long long)off);
-       }
-       /* Mark it invalid. */
-       *p ^= 1;
-       (*num_found)++;
-
-       r = ntdb_access_read(ntdb, off, sizeof(*r), true);
-       if (NTDB_PTR_IS_ERR(r)) {
-               return NTDB_PTR_ERR(r);
-       }
-       klen = rec_key_length(r);
-       dlen = rec_data_length(r);
-       ntdb_access_release(ntdb, r);
-
-       kptr = ntdb_access_read(ntdb, off + sizeof(*r), klen + dlen, false);
-       if (NTDB_PTR_IS_ERR(kptr)) {
-               return NTDB_PTR_ERR(kptr);
-       }
-
-       hash = ntdb_hash(ntdb, kptr, klen);
-
-       /* Are we in the right chain? */
-       if (bits_from(hash, 0, ntdb->hash_bits) != bucket) {
-               ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                   NTDB_LOG_ERROR,
-                                   "ntdb_check: Bad bucket %u vs %llu",
-                                   bits_from(hash, 0, ntdb->hash_bits),
-                                   (long long)bucket);
-       /* Next 8 bits should be the same as top bits of bucket. */
-       } else if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL)
-                  != bits_from(off_and_hash, 64-NTDB_OFF_UPPER_STEAL,
-                               NTDB_OFF_UPPER_STEAL)) {
-               ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                   NTDB_LOG_ERROR,
-                                   "ntdb_check: Bad hash bits %llu vs %llu",
-                                   (long long)off_and_hash,
-                                   (long long)hash);
-       } else if (check) {
-               NTDB_DATA k, d;
-
-               k = ntdb_mkdata(kptr, klen);
-               d = ntdb_mkdata(kptr + klen, dlen);
-               ecode = check(k, d, data);
-       } else {
-               ecode = NTDB_SUCCESS;
-       }
-       ntdb_access_release(ntdb, kptr);
-
-       return ecode;
-}
-
-static enum NTDB_ERROR check_hash_chain(struct ntdb_context *ntdb,
-                                       ntdb_off_t off,
-                                       ntdb_len_t bucket,
-                                       ntdb_off_t used[],
-                                       size_t num_used,
-                                       size_t *num_found,
-                                       enum NTDB_ERROR (*check)(NTDB_DATA,
-                                                                NTDB_DATA,
-                                                                void *),
-                                       void *data)
-{
-       struct ntdb_used_record rec;
-       enum NTDB_ERROR ecode;
-       const ntdb_off_t *entries;
-       ntdb_len_t i, num;
-
-       /* This is a used entry. */
-       (*num_found)++;
-
-       ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-
-       if (rec_magic(&rec) != NTDB_CHAIN_MAGIC) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "ntdb_check: Bad hash chain magic %llu",
-                                 (long long)rec_magic(&rec));
-       }
-
-       if (rec_data_length(&rec) % sizeof(ntdb_off_t)) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "ntdb_check: Bad hash chain data length %llu",
-                                 (long long)rec_data_length(&rec));
-       }
-
-       if (rec_key_length(&rec) != 0) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "ntdb_check: Bad hash chain key length %llu",
-                                 (long long)rec_key_length(&rec));
-       }
-
-       off += sizeof(rec);
-       num = rec_data_length(&rec) / sizeof(ntdb_off_t);
-       entries = ntdb_access_read(ntdb, off, rec_data_length(&rec), true);
-       if (NTDB_PTR_IS_ERR(entries)) {
-               return NTDB_PTR_ERR(entries);
-       }
-
-       /* Check each non-deleted entry in chain. */
-       for (i = 0; i < num; i++) {
-               ecode = check_entry(ntdb, entries[i], bucket,
-                                   used, num_used, num_found, check, data);
-               if (ecode) {
-                       break;
-               }
-       }
-
-       ntdb_access_release(ntdb, entries);
-       return ecode;
-}
-
-static enum NTDB_ERROR check_hash(struct ntdb_context *ntdb,
-                                 ntdb_off_t used[],
-                                 size_t num_used,
-                                 size_t num_other_used,
-                                 enum NTDB_ERROR (*check)(NTDB_DATA,
-                                                          NTDB_DATA,
-                                                          void *),
-                                 void *data)
-{
-       enum NTDB_ERROR ecode;
-       struct ntdb_used_record rec;
-       const ntdb_off_t *entries;
-       ntdb_len_t i;
-       /* Free tables and capabilities also show up as used, as do we. */
-       size_t num_found = num_other_used + 1;
-
-       ecode = ntdb_read_convert(ntdb, NTDB_HASH_OFFSET, &rec, sizeof(rec));
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-
-       if (rec_magic(&rec) != NTDB_HTABLE_MAGIC) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "ntdb_check: Bad hash table magic %llu",
-                                 (long long)rec_magic(&rec));
-       }
-
-       if (rec_data_length(&rec) != (sizeof(ntdb_off_t) << ntdb->hash_bits)) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "ntdb_check: Bad hash table data length %llu",
-                                 (long long)rec_data_length(&rec));
-       }
-
-       if (rec_key_length(&rec) != 0) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "ntdb_check: Bad hash table key length %llu",
-                                 (long long)rec_key_length(&rec));
-       }
-
-       entries = ntdb_access_read(ntdb, NTDB_HASH_OFFSET + sizeof(rec),
-                                  rec_data_length(&rec), true);
-       if (NTDB_PTR_IS_ERR(entries)) {
-               return NTDB_PTR_ERR(entries);
-       }
-
-       for (i = 0; i < (1 << ntdb->hash_bits); i++) {
-               ntdb_off_t off = entries[i] & NTDB_OFF_MASK;
-               if (entries[i] & (1ULL << NTDB_OFF_CHAIN_BIT)) {
-                       ecode = check_hash_chain(ntdb, off, i,
-                                                used, num_used, &num_found,
-                                                check, data);
-               } else {
-                       ecode = check_entry(ntdb, entries[i], i,
-                                           used, num_used, &num_found,
-                                           check, data);
-               }
-               if (ecode) {
-                       break;
-               }
-       }
-       ntdb_access_release(ntdb, entries);
-
-       if (ecode == NTDB_SUCCESS && num_found != num_used) {
-               ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                   "ntdb_check: Not all entries are in hash");
-       }
-       return ecode;
-}
-
-static enum NTDB_ERROR check_free(struct ntdb_context *ntdb,
-                                ntdb_off_t off,
-                                const struct ntdb_free_record *frec,
-                                ntdb_off_t prev, unsigned int ftable,
-                                unsigned int bucket)
-{
-       enum NTDB_ERROR ecode;
-
-       if (frec_magic(frec) != NTDB_FREE_MAGIC) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "ntdb_check: offset %llu bad magic 0x%llx",
-                                 (long long)off,
-                                 (long long)frec->magic_and_prev);
-       }
-       if (frec_ftable(frec) != ftable) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "ntdb_check: offset %llu bad freetable %u",
-                                 (long long)off, frec_ftable(frec));
-
-       }
-
-       ecode = ntdb_oob(ntdb, off,
-                        frec_len(frec) + sizeof(struct ntdb_used_record),
-                        false);
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-       if (size_to_bucket(frec_len(frec)) != bucket) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "ntdb_check: offset %llu in wrong bucket"
-                                 " (%u vs %u)",
-                                 (long long)off,
-                                 bucket, size_to_bucket(frec_len(frec)));
-       }
-       if (prev && prev != frec_prev(frec)) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "ntdb_check: offset %llu bad prev"
-                                 " (%llu vs %llu)",
-                                 (long long)off,
-                                 (long long)prev, (long long)frec_len(frec));
-       }
-       return NTDB_SUCCESS;
-}
-
-static enum NTDB_ERROR check_free_table(struct ntdb_context *ntdb,
-                                      ntdb_off_t ftable_off,
-                                      unsigned ftable_num,
-                                      ntdb_off_t fr[],
-                                      size_t num_free,
-                                      size_t *num_found)
-{
-       struct ntdb_freetable ft;
-       ntdb_off_t h;
-       unsigned int i;
-       enum NTDB_ERROR ecode;
-
-       ecode = ntdb_read_convert(ntdb, ftable_off, &ft, sizeof(ft));
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-
-       if (rec_magic(&ft.hdr) != NTDB_FTABLE_MAGIC
-           || rec_key_length(&ft.hdr) != 0
-           || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "ntdb_check: Invalid header on free table");
-       }
-
-       for (i = 0; i < NTDB_FREE_BUCKETS; i++) {
-               ntdb_off_t off, prev = 0, *p, first = 0;
-               struct ntdb_free_record f;
-
-               h = bucket_off(ftable_off, i);
-               for (off = ntdb_read_off(ntdb, h); off; off = f.next) {
-                       if (NTDB_OFF_IS_ERR(off)) {
-                               return NTDB_OFF_TO_ERR(off);
-                       }
-                       if (!first) {
-                               off &= NTDB_OFF_MASK;
-                               first = off;
-                       }
-                       ecode = ntdb_read_convert(ntdb, off, &f, sizeof(f));
-                       if (ecode != NTDB_SUCCESS) {
-                               return ecode;
-                       }
-                       ecode = check_free(ntdb, off, &f, prev, ftable_num, i);
-                       if (ecode != NTDB_SUCCESS) {
-                               return ecode;
-                       }
-
-                       /* FIXME: Check hash bits */
-                       p = asearch(&off, fr, num_free, off_cmp, NULL);
-                       if (!p) {
-                               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                                 NTDB_LOG_ERROR,
-                                                 "ntdb_check: Invalid offset"
-                                                 " %llu in free table",
-                                                 (long long)off);
-                       }
-                       /* Mark it invalid. */
-                       *p ^= 1;
-                       (*num_found)++;
-                       prev = off;
-               }
-
-               if (first) {
-                       /* Now we can check first back pointer. */
-                       ecode = ntdb_read_convert(ntdb, first, &f, sizeof(f));
-                       if (ecode != NTDB_SUCCESS) {
-                               return ecode;
-                       }
-                       ecode = check_free(ntdb, first, &f, prev, ftable_num, i);
-                       if (ecode != NTDB_SUCCESS) {
-                               return ecode;
-                       }
-               }
-       }
-       return NTDB_SUCCESS;
-}
-
-/* Slow, but should be very rare. */
-ntdb_off_t dead_space(struct ntdb_context *ntdb, ntdb_off_t off)
-{
-       size_t len;
-       enum NTDB_ERROR ecode;
-
-       for (len = 0; off + len < ntdb->file->map_size; len++) {
-               char c;
-               ecode = ntdb->io->tread(ntdb, off, &c, 1);
-               if (ecode != NTDB_SUCCESS) {
-                       return NTDB_ERR_TO_OFF(ecode);
-               }
-               if (c != 0 && c != 0x43)
-                       break;
-       }
-       return len;
-}
-
-static enum NTDB_ERROR check_linear(struct ntdb_context *ntdb,
-                                  ntdb_off_t **used, size_t *num_used,
-                                  ntdb_off_t **fr, size_t *num_free,
-                                  uint64_t features, ntdb_off_t recovery)
-{
-       ntdb_off_t off;
-       ntdb_len_t len;
-       enum NTDB_ERROR ecode;
-       bool found_recovery = false;
-
-       for (off = sizeof(struct ntdb_header);
-            off < ntdb->file->map_size;
-            off += len) {
-               union {
-                       struct ntdb_used_record u;
-                       struct ntdb_free_record f;
-                       struct ntdb_recovery_record r;
-               } rec;
-               /* r is larger: only get that if we need to. */
-               ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.f));
-               if (ecode != NTDB_SUCCESS) {
-                       return ecode;
-               }
-
-               /* If we crash after ftruncate, we can get zeroes or fill. */
-               if (rec.r.magic == NTDB_RECOVERY_INVALID_MAGIC
-                   || rec.r.magic ==  0x4343434343434343ULL) {
-                       ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.r));
-                       if (ecode != NTDB_SUCCESS) {
-                               return ecode;
-                       }
-                       if (recovery == off) {
-                               found_recovery = true;
-                               len = sizeof(rec.r) + rec.r.max_len;
-                       } else {
-                               len = dead_space(ntdb, off);
-                               if (NTDB_OFF_IS_ERR(len)) {
-                                       return NTDB_OFF_TO_ERR(len);
-                               }
-                               if (len < sizeof(rec.r)) {
-                                       return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                                         NTDB_LOG_ERROR,
-                                                         "ntdb_check: invalid"
-                                                         " dead space at %zu",
-                                                         (size_t)off);
-                               }
-
-                               ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
-                                          "Dead space at %zu-%zu (of %zu)",
-                                          (size_t)off, (size_t)(off + len),
-                                          (size_t)ntdb->file->map_size);
-                       }
-               } else if (rec.r.magic == NTDB_RECOVERY_MAGIC) {
-                       ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.r));
-                       if (ecode != NTDB_SUCCESS) {
-                               return ecode;
-                       }
-                       if (recovery != off) {
-                               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                                 NTDB_LOG_ERROR,
-                                                 "ntdb_check: unexpected"
-                                                 " recovery record at offset"
-                                                 " %zu",
-                                                 (size_t)off);
-                       }
-                       if (rec.r.len > rec.r.max_len) {
-                               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                                 NTDB_LOG_ERROR,
-                                                 "ntdb_check: invalid recovery"
-                                                 " length %zu",
-                                                 (size_t)rec.r.len);
-                       }
-                       if (rec.r.eof > ntdb->file->map_size) {
-                               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                                 NTDB_LOG_ERROR,
-                                                 "ntdb_check: invalid old EOF"
-                                                 " %zu", (size_t)rec.r.eof);
-                       }
-                       found_recovery = true;
-                       len = sizeof(rec.r) + rec.r.max_len;
-               } else if (frec_magic(&rec.f) == NTDB_FREE_MAGIC) {
-                       len = sizeof(rec.u) + frec_len(&rec.f);
-                       if (off + len > ntdb->file->map_size) {
-                               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                                 NTDB_LOG_ERROR,
-                                                 "ntdb_check: free overlength"
-                                                 " %llu at offset %llu",
-                                                 (long long)len,
-                                                 (long long)off);
-                       }
-                       /* This record should be in free lists. */
-                       if (frec_ftable(&rec.f) != NTDB_FTABLE_NONE
-                           && !append(ntdb, fr, num_free, off)) {
-                               return ntdb_logerr(ntdb, NTDB_ERR_OOM,
-                                                 NTDB_LOG_ERROR,
-                                                 "ntdb_check: tracking %zu'th"
-                                                 " free record.", *num_free);
-                       }
-               } else if (rec_magic(&rec.u) == NTDB_USED_MAGIC
-                          || rec_magic(&rec.u) == NTDB_CHAIN_MAGIC
-                          || rec_magic(&rec.u) == NTDB_HTABLE_MAGIC
-                          || rec_magic(&rec.u) == NTDB_FTABLE_MAGIC
-                          || rec_magic(&rec.u) == NTDB_CAP_MAGIC) {
-                       uint64_t klen, dlen, extra;
-
-                       /* This record is used! */
-                       if (!append(ntdb, used, num_used, off)) {
-                               return ntdb_logerr(ntdb, NTDB_ERR_OOM,
-                                                 NTDB_LOG_ERROR,
-                                                 "ntdb_check: tracking %zu'th"
-                                                 " used record.", *num_used);
-                       }
-
-                       klen = rec_key_length(&rec.u);
-                       dlen = rec_data_length(&rec.u);
-                       extra = rec_extra_padding(&rec.u);
-
-                       len = sizeof(rec.u) + klen + dlen + extra;
-                       if (off + len > ntdb->file->map_size) {
-                               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                                 NTDB_LOG_ERROR,
-                                                 "ntdb_check: used overlength"
-                                                 " %llu at offset %llu",
-                                                 (long long)len,
-                                                 (long long)off);
-                       }
-
-                       if (len < sizeof(rec.f)) {
-                               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                                 NTDB_LOG_ERROR,
-                                                 "ntdb_check: too short record"
-                                                 " %llu at %llu",
-                                                 (long long)len,
-                                                 (long long)off);
-                       }
-
-                       /* Check that records have correct 0 at end (but may
-                        * not in future). */
-                       if (extra && !features
-                           && rec_magic(&rec.u) != NTDB_CAP_MAGIC) {
-                               const char *p;
-                               char c;
-                               p = ntdb_access_read(ntdb, off + sizeof(rec.u)
-                                                   + klen + dlen, 1, false);
-                               if (NTDB_PTR_IS_ERR(p))
-                                       return NTDB_PTR_ERR(p);
-                               c = *p;
-                               ntdb_access_release(ntdb, p);
-
-                               if (c != '\0') {
-                                       return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                                         NTDB_LOG_ERROR,
-                                                         "ntdb_check:"
-                                                         " non-zero extra"
-                                                         " at %llu",
-                                                         (long long)off);
-                               }
-                       }
-               } else {
-                       return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                         NTDB_LOG_ERROR,
-                                         "ntdb_check: Bad magic 0x%llx"
-                                         " at offset %zu",
-                                         (long long)rec_magic(&rec.u),
-                                         (size_t)off);
-               }
-       }
-
-       /* We must have found recovery area if there was one. */
-       if (recovery != 0 && !found_recovery) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "ntdb_check: expected a recovery area at %zu",
-                                 (size_t)recovery);
-       }
-
-       return NTDB_SUCCESS;
-}
-
-_PUBLIC_ enum NTDB_ERROR ntdb_check_(struct ntdb_context *ntdb,
-                         enum NTDB_ERROR (*check)(NTDB_DATA, NTDB_DATA, void *),
-                         void *data)
-{
-       ntdb_off_t *fr = NULL, *used = NULL;
-       ntdb_off_t ft = 0, recovery = 0;
-       size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0,
-               num_capabilities = 0;
-       uint64_t features = 0;
-       enum NTDB_ERROR ecode;
-
-       if (ntdb->flags & NTDB_CANT_CHECK) {
-               return ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
-                                 "ntdb_check: database has unknown capability,"
-                                 " cannot check.");
-       }
-
-       ecode = ntdb_allrecord_lock(ntdb, F_RDLCK, NTDB_LOCK_WAIT, false);
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-
-       ecode = ntdb_lock_expand(ntdb, F_RDLCK);
-       if (ecode != NTDB_SUCCESS) {
-               ntdb_allrecord_unlock(ntdb, F_RDLCK);
-               return ecode;
-       }
-
-       ecode = check_header(ntdb, &recovery, &features, &num_capabilities);
-       if (ecode != NTDB_SUCCESS)
-               goto out;
-
-       /* First we do a linear scan, checking all records. */
-       ecode = check_linear(ntdb, &used, &num_used, &fr, &num_free, features,
-                            recovery);
-       if (ecode != NTDB_SUCCESS)
-               goto out;
-
-       for (ft = first_ftable(ntdb); ft; ft = next_ftable(ntdb, ft)) {
-               if (NTDB_OFF_IS_ERR(ft)) {
-                       ecode = NTDB_OFF_TO_ERR(ft);
-                       goto out;
-               }
-               ecode = check_free_table(ntdb, ft, num_ftables, fr, num_free,
-                                        &num_found);
-               if (ecode != NTDB_SUCCESS)
-                       goto out;
-               num_ftables++;
-       }
-
-       /* FIXME: Check key uniqueness? */
-       ecode = check_hash(ntdb, used, num_used, num_ftables + num_capabilities,
-                          check, data);
-       if (ecode != NTDB_SUCCESS)
-               goto out;
-
-       if (num_found != num_free) {
-               ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                  "ntdb_check: Not all entries are in"
-                                  " free table");
-       }
-
-out:
-       ntdb_allrecord_unlock(ntdb, F_RDLCK);
-       ntdb_unlock_expand(ntdb, F_RDLCK);
-       ntdb->free_fn(fr, ntdb->alloc_data);
-       ntdb->free_fn(used, ntdb->alloc_data);
-       return ecode;
-}
diff --git a/ccan/ntdb/doc/TDB_porting.txt b/ccan/ntdb/doc/TDB_porting.txt
deleted file mode 100644 (file)
index 5daf94b..0000000
+++ /dev/null
@@ -1,483 +0,0 @@
-Interface differences between TDB and NTDB.
-
-- ntdb shares 'struct TDB_DATA' with tdb, but TDB defines the TDB_DATA
-  typedef, whereas ntdb defines NTDB_DATA (ie. both are compatible).
-  If you include both ntdb.h and tdb.h, #include tdb.h first,
-  otherwise you'll get a compile error when tdb.h re-defines struct
-  TDB_DATA.
-
-  Example:
-       #include <tdb.h>
-       #include <ntdb.h>
-
-- ntdb functions return NTDB_SUCCESS (ie 0) on success, and a negative
-  error on failure, whereas tdb functions returned 0 on success, and
-  -1 on failure.  tdb then used tdb_error() to determine the error;
-  this API is nasty if we ever want to support threads, so it is not supported.
-
-  Example:
-       #include <tdb.h>
-       #include <ntdb.h>
-
-       void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
-       {
-               if (tdb_store(tdb, key, d) == -1) {
-                       printf("store failed: %s\n", tdb_errorstr(tdb));
-               }
-       }
-
-       void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
-       {
-               enum NTDB_ERROR e;
-
-               e = ntdb_store(ntdb, key, d);
-               if (e) {
-                       printf("store failed: %s\n", ntdb_errorstr(e));
-               }
-       }
-
-- ntdb's ntdb_fetch() returns an error, tdb's returned the data directly
-  (or tdb_null, and you were supposed to check tdb_error() to find out why).
-
-  Example:
-       #include <tdb.h>
-       #include <ntdb.h>
-
-       void tdb_example(struct tdb_context *tdb, TDB_DATA key)
-       {
-               TDB_DATA data;
-
-               data = tdb_fetch(tdb, key);
-               if (!data.dptr) {
-                       printf("fetch failed: %s\n", tdb_errorstr(tdb));
-               }
-       }
-
-       void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key)
-       {
-               NTDB_DATA data;
-               enum NTDB_ERROR e;
-
-               e = ntdb_fetch(ntdb, key, &data);
-               if (e) {
-                       printf("fetch failed: %s\n", ntdb_errorstr(e));
-               }
-       }
-
-- ntdb's ntdb_nextkey() frees the old key's dptr; in tdb you needed to do
-  this manually.
-
-  Example:
-       #include <tdb.h>
-       #include <ntdb.h>
-
-       void tdb_example(struct tdb_context *tdb)
-       {
-               TDB_DATA key, next;
-
-               for (key = tdb_firstkey(tdb); key.dptr; key = next) {
-                       printf("Got key!\n");
-                       next = tdb_nextkey(tdb, key);
-                       free(key.dptr);
-               }
-       }
-
-
-       void ntdb_example(struct ntdb_context *ntdb)
-       {
-               NTDB_DATA k;
-               enum NTDB_ERROR e;
-
-               for (e = ntdb_firstkey(ntdb,&k); !e; e = ntdb_nextkey(ntdb,&k))
-                       printf("Got key!\n");
-       }
-
-- Unlike tdb_open/tdb_open_ex, ntdb_open does not allow NULL names,
-  even for NTDB_INTERNAL dbs, and thus ntdb_name() never returns NULL.
-
-  Example:
-       #include <tdb.h>
-       #include <ntdb.h>
-
-       struct tdb_context *tdb_example(void)
-       {
-               return tdb_open(NULL, 0, TDB_INTERNAL, O_RDWR, 0);
-       }
-
-       struct ntdb_context *ntdb_example(void)
-       {
-               return ntdb_open("example", NTDB_INTERNAL, O_RDWR, 0, NULL);
-       }
-
-- ntdb uses a linked list of attribute structures to implement logging and
-  alternate hashes.  tdb used tdb_open_ex, which was not extensible.
-
-  Example:
-       #include <tdb.h>
-       #include <ntdb.h>
-
-       /* Custom hash function */
-       static unsigned int my_tdb_hash_func(TDB_DATA *key)
-       {
-               return key->dsize;
-       }
-
-       struct tdb_context *tdb_example(void)
-       {
-               return tdb_open_ex("example.tdb", 0, TDB_DEFAULT,
-                                  O_CREAT|O_RDWR, 0600, NULL, my_tdb_hash_func);
-       }
-
-       /* Custom hash function */
-       static unsigned int my_ntdb_hash_func(const void *key, size_t len,
-                                             uint32_t seed, void *data)
-       {
-               return len;
-       }
-
-       struct ntdb_context *ntdb_example(void)
-       {
-               union ntdb_attribute hash;
-
-               hash.base.attr = NTDB_ATTRIBUTE_HASH;
-               hash.base.next = NULL;
-               hash.hash.fn = my_ntdb_hash_func;
-               return ntdb_open("example.ntdb", NTDB_DEFAULT,
-                                  O_CREAT|O_RDWR, 0600, &hash);
-       }
-
-- tdb's tdb_open/tdb_open_ex took an explicit hash size, defaulting to
-  131.  ntdb uses an attribute for this, defaulting to 8192.
-
-  Example:
-       #include <tdb.h>
-       #include <ntdb.h>
-
-       struct tdb_context *tdb_example(void)
-       {
-               return tdb_open("example.tdb", 10007, TDB_DEFAULT,
-                               O_CREAT|O_RDWR, 0600);
-       }
-
-       struct ntdb_context *ntdb_example(void)
-       {
-               union ntdb_attribute hashsize;
-
-               hashsize.base.attr = NTDB_ATTRIBUTE_HASHSIZE;
-               hashsize.base.next = NULL;
-               hashsize.hashsize.size = 16384;
-               return ntdb_open("example.ntdb", NTDB_DEFAULT,
-                                  O_CREAT|O_RDWR, 0600, &hashsize);
-       }
-
-- ntdb's log function is simpler than tdb's log function.  The string
-  is already formatted, is not terminated by a '\n', and it takes an
-  enum ntdb_log_level rather than a tdb_debug_level; the former has only
-  three values: NTDB_LOG_ERROR, NTDB_LOG_USE_ERROR and NTDB_LOG_WARNING.
-
-       #include <tdb.h>
-       #include <ntdb.h>
-
-       static void tdb_log(struct tdb_context *tdb,
-                           enum tdb_debug_level level, const char *fmt, ...)
-       {
-               va_list ap;
-               const char *name;
-
-               switch (level) {
-               case TDB_DEBUG_FATAL:
-                       fprintf(stderr, "FATAL: ");
-                       break;
-               case TDB_DEBUG_ERROR:
-                       fprintf(stderr, "ERROR: ");
-                       break;
-               case TDB_DEBUG_WARNING:
-                       fprintf(stderr, "WARNING: ");
-                       break;
-               case TDB_DEBUG_TRACE:
-                       /* Don't print out tracing. */
-                       return;
-               }
-
-               name = tdb_name(tdb);
-               if (!name) {
-                       name = "unnamed";
-               }
-
-               fprintf(stderr, "tdb(%s):", name);
-
-               va_start(ap, fmt);
-               vfprintf(stderr, fmt, ap);
-               va_end(ap);
-       }
-
-       struct tdb_context *tdb_example(void)
-       {
-               struct tdb_logging_context lctx;
-
-               lctx.log_fn = tdb_log;
-               return tdb_open_ex("example.tdb", 0, TDB_DEFAULT,
-                                  O_CREAT|O_RDWR, 0600, &lctx, NULL);
-       }
-
-       static void ntdb_log(struct ntdb_context *ntdb,
-                            enum ntdb_log_level level,
-                            enum NTDB_ERROR ecode,
-                            const char *message,
-                            void *data)
-       {
-               switch (level) {
-               case NTDB_LOG_ERROR:
-                       fprintf(stderr, "ERROR: ");
-                       break;
-               case NTDB_LOG_USE_ERROR:
-                       /* We made a mistake, so abort. */
-                       abort();
-                       break;
-               case NTDB_LOG_WARNING:
-                       fprintf(stderr, "WARNING: ");
-                       break;
-               }
-
-               fprintf(stderr, "ntdb(%s):%s:%s\n",
-                       ntdb_name(ntdb), ntdb_errorstr(ecode), message);
-       }
-
-       struct ntdb_context *ntdb_example(void)
-       {
-               union ntdb_attribute log;
-
-               log.base.attr = NTDB_ATTRIBUTE_LOG;
-               log.base.next = NULL;
-               log.log.fn = ntdb_log;
-               return ntdb_open("example.ntdb", NTDB_DEFAULT,
-                                O_CREAT|O_RDWR, 0600, &log);
-       }
-
-- ntdb provides ntdb_deq() for comparing two NTDB_DATA, and ntdb_mkdata() for
-  creating an NTDB_DATA.
-
-       #include <tdb.h>
-       #include <ntdb.h>
-
-       void tdb_example(struct tdb_context *tdb)
-       {
-               TDB_DATA data, key;
-
-               key.dsize = strlen("hello");
-               key.dptr = (unsigned char *)"hello";
-               data = tdb_fetch(tdb, key);
-               if (data.dsize == key.dsize
-                   && !memcmp(data.dptr, key.dptr, key.dsize)) {
-                       printf("key is same as data\n");
-               }
-               free(data.dptr);
-       }
-
-       void ntdb_example(struct ntdb_context *ntdb)
-       {
-               NTDB_DATA data, key;
-
-               key = ntdb_mkdata("hello", strlen("hello"));
-               if (ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS) {
-                       if (ntdb_deq(key, data)) {
-                               printf("key is same as data\n");
-                       }
-                       free(data.dptr);
-               }
-       }
-
-- ntdb's ntdb_parse_record() takes a type-checked callback data
-  pointer, not a void * (though a void * pointer still works).  The
-  callback function is allowed to do read operations on the database,
-  or write operations if you first call ntdb_lockall().  TDB's
-  tdb_parse_record() did not allow any database access within the
-  callback, and could crash if you tried.
-
-  Example:
-       #include <tdb.h>
-       #include <ntdb.h>
-
-       static int tdb_parser(TDB_DATA key, TDB_DATA data, void *private_data)
-       {
-               TDB_DATA *expect = private_data;
-
-               return data.dsize == expect->dsize
-                       && !memcmp(data.dptr, expect->dptr, data.dsize);
-       }
-
-       void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
-       {
-               switch (tdb_parse_record(tdb, key, tdb_parser, &d)) {
-               case -1:
-                       printf("parse failed: %s\n", tdb_errorstr(tdb));
-                       break;
-               case 0:
-                       printf("data was different!\n");
-                       break;
-               case 1:
-                       printf("data was same!\n");
-                       break;
-               }
-       }
-
-       static int ntdb_parser(NTDB_DATA key, NTDB_DATA data, NTDB_DATA *expect)
-       {
-               return ntdb_deq(data, *expect);
-       }
-
-       void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
-       {
-               enum NTDB_ERROR e;
-
-               e = ntdb_parse_record(ntdb, key, ntdb_parser, &d);
-               switch (e) {
-               case 0:
-                       printf("data was different!\n");
-                       break;
-               case 1:
-                       printf("data was same!\n");
-                       break;
-               default:
-                       printf("parse failed: %s\n", ntdb_errorstr(e));
-                       break;
-               }
-       }
-
-- ntdb does locking on read-only databases (ie. O_RDONLY passed to ntdb_open).
-  tdb did not: use the NTDB_NOLOCK flag if you want to suppress locking.
-
-  Example:
-       #include <tdb.h>
-       #include <ntdb.h>
-
-       struct tdb_context *tdb_example(void)
-       {
-               return tdb_open("example.tdb", 0, TDB_DEFAULT, O_RDONLY, 0);
-       }
-
-       struct ntdb_context *ntdb_example(void)
-       {
-               return ntdb_open("example.ntdb", NTDB_NOLOCK, O_RDONLY, 0, NULL);
-       }
-
-- Failure inside a transaction (such as a lock function failing) does
-  not implicitly cancel the transaction; you still need to call
-  ntdb_transaction_cancel().
-
-       #include <tdb.h>
-       #include <ntdb.h>
-
-       void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
-       {
-               if (tdb_transaction_start(tdb) == -1) {
-                       printf("transaction failed: %s\n", tdb_errorstr(tdb));
-                       return;
-               }
-
-               if (tdb_store(tdb, key, d) == -1) {
-                       printf("store failed: %s\n", tdb_errorstr(tdb));
-                       return;
-               }
-               if (tdb_transaction_commit(tdb) == -1) {
-                       printf("commit failed: %s\n", tdb_errorstr(tdb));
-               }
-       }
-
-       void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
-       {
-               enum NTDB_ERROR e;
-
-               e = ntdb_transaction_start(ntdb);
-               if (e) {
-                       printf("transaction failed: %s\n", ntdb_errorstr(e));
-                       return;
-               }
-
-               e = ntdb_store(ntdb, key, d);
-               if (e) {
-                       printf("store failed: %s\n", ntdb_errorstr(e));
-                       ntdb_transaction_cancel(ntdb);
-                       return;
-               }
-
-               e = ntdb_transaction_commit(ntdb);
-               if (e) {
-                       printf("commit failed: %s\n", ntdb_errorstr(e));
-               }
-       }
-
-- There is no NTDB_CLEAR_IF_FIRST flag; it has severe scalability and
-  API problems.  If necessary, you can emulate this by using the open
-  hook and placing a 1-byte lock at offset 4.  If your program forks
-  and exits, you will need to place this lock again in the child before
-  the parent exits.
-
-  Example:
-
-       #include <tdb.h>
-       #include <ntdb.h>
-
-       struct tdb_context *tdb_example(void)
-       {
-               return tdb_open("example.tdb", 0, TDB_CLEAR_IF_FIRST,
-                                  O_CREAT|O_RDWR, 0600);
-       }
-
-       static enum NTDB_ERROR clear_if_first(int fd, void *unused)
-       {
-               /* We always hold a 1-byte lock at offset 4, so we can tell
-                * if anyone else has the database open. */
-               struct flock fl;
-
-               fl.l_type = F_WRLCK;
-               fl.l_whence = SEEK_SET;
-               fl.l_start = 4; /* ACTIVE_LOCK */
-               fl.l_len = 1;
-
-               if (fcntl(fd, F_SETLK, &fl) == 0) {
-                       /* We must be first ones to open it!  Clear it. */
-                       if (ftruncate(fd, 0) != 0) {
-                               return NTDB_ERR_IO;
-                       }
-               }
-               fl.l_type = F_RDLCK;
-               if (fcntl(fd, F_SETLKW, &fl) != 0) {
-                       return NTDB_ERR_IO;
-               }
-               return NTDB_SUCCESS;
-       }
-
-       struct ntdb_context *ntdb_example(void)
-       {
-               union ntdb_attribute open_attr;
-
-               open_attr.openhook.base.attr = NTDB_ATTRIBUTE_OPENHOOK;
-               open_attr.openhook.base.next = NULL;
-               open_attr.openhook.fn = clear_if_first;
-
-               return ntdb_open("example.ntdb", NTDB_DEFAULT,
-                                O_CREAT|O_RDWR, 0600, &open_attr);
-       }
-
-- ntdb traversals are not reliable if the database is changed during
-  the traversal, ie your traversal may not cover all elements, or may
-  cover elements multiple times.  As a special exception, deleting the
-  current record within ntdb_traverse() is reliable.
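-
-  For example, a sketch (assuming ntdb_delete() and the usual
-  ntdb_traverse() callback signature from ntdb.h) which deletes
-  empty records from within the callback:
-
-       #include <ntdb.h>
-
-       static int delete_empty(struct ntdb_context *ntdb,
-                               NTDB_DATA key, NTDB_DATA data,
-                               void *unused)
-       {
-               if (data.dsize == 0) {
-                       /* Deleting the current record is the one safe
-                        * modification during a traversal. */
-                       ntdb_delete(ntdb, key);
-               }
-               return 0;
-       }
-
-       void ntdb_example(struct ntdb_context *ntdb)
-       {
-               ntdb_traverse(ntdb, delete_empty, NULL);
-       }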
-
-- There is no ntdb_traverse_read, since ntdb_traverse does not hold
-  a lock across the entire traversal anyway.  If you want to make sure
-  that your traversal function does not write to the database, you can
-  set and clear the NTDB_RDONLY flag around the traversal.
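-
-  For example, a sketch (assuming ntdb_add_flag()/ntdb_remove_flag()
-  as declared in ntdb.h) of a traversal guaranteed not to write:
-
-       #include <ntdb.h>
-       #include <stdio.h>
-
-       static int count_record(struct ntdb_context *ntdb,
-                               NTDB_DATA key, NTDB_DATA data,
-                               void *p)
-       {
-               (*(size_t *)p)++;
-               return 0;
-       }
-
-       void ntdb_example(struct ntdb_context *ntdb)
-       {
-               size_t count = 0;
-
-               ntdb_add_flag(ntdb, NTDB_RDONLY);
-               ntdb_traverse(ntdb, count_record, &count);
-               ntdb_remove_flag(ntdb, NTDB_RDONLY);
-               printf("%zu records\n", count);
-       }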
-
-- ntdb does not need tdb_reopen() or tdb_reopen_all().  If you call
-  fork() during certain operations, the child should close the ntdb, or
-  complete the operation before continuing to use the ntdb (see the
-  example after this list):
-
-       ntdb_transaction_start(): child must ntdb_transaction_cancel()
-       ntdb_lockall(): child must call ntdb_unlockall()
-       ntdb_lockall_read(): child must call ntdb_unlockall_read()
-       ntdb_chainlock(): child must call ntdb_chainunlock()
-       ntdb_parse_record() callback: child must return from ntdb_parse_record()
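-
-  For example, in the transaction case (a sketch only):
-
-       #include <ntdb.h>
-       #include <stdio.h>
-       #include <unistd.h>
-
-       void fork_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
-       {
-               enum NTDB_ERROR e;
-
-               e = ntdb_transaction_start(ntdb);
-               if (e) {
-                       printf("transaction failed: %s\n", ntdb_errorstr(e));
-                       return;
-               }
-
-               if (fork() == 0) {
-                       /* Child: must not inherit the transaction. */
-                       ntdb_transaction_cancel(ntdb);
-                       ntdb_close(ntdb);
-                       _exit(0);
-               }
-
-               e = ntdb_store(ntdb, key, d);
-               if (e) {
-                       printf("store failed: %s\n", ntdb_errorstr(e));
-                       ntdb_transaction_cancel(ntdb);
-                       return;
-               }
-
-               e = ntdb_transaction_commit(ntdb);
-               if (e) {
-                       printf("commit failed: %s\n", ntdb_errorstr(e));
-               }
-       }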
-
-- ntdb will not open a non-ntdb file, even if O_CREAT is specified.  tdb
-  will overwrite an unknown file in that case.
diff --git a/ccan/ntdb/doc/design.lyx b/ccan/ntdb/doc/design.lyx
deleted file mode 100644 (file)
index 5a10ee3..0000000
+++ /dev/null
@@ -1,2727 +0,0 @@
-#LyX 2.0 created this file. For more info see http://www.lyx.org/
-\lyxformat 413
-\begin_document
-\begin_header
-\textclass article
-\use_default_options true
-\maintain_unincluded_children false
-\language english
-\language_package default
-\inputencoding auto
-\fontencoding global
-\font_roman default
-\font_sans default
-\font_typewriter default
-\font_default_family default
-\use_non_tex_fonts false
-\font_sc false
-\font_osf false
-\font_sf_scale 100
-\font_tt_scale 100
-
-\graphics default
-\default_output_format default
-\output_sync 0
-\bibtex_command default
-\index_command default
-\paperfontsize default
-\use_hyperref false
-\papersize default
-\use_geometry false
-\use_amsmath 1
-\use_esint 1
-\use_mhchem 1
-\use_mathdots 1
-\cite_engine basic
-\use_bibtopic false
-\use_indices false
-\paperorientation portrait
-\suppress_date false
-\use_refstyle 0
-\index Index
-\shortcut idx
-\color #008000
-\end_index
-\secnumdepth 3
-\tocdepth 3
-\paragraph_separation indent
-\paragraph_indentation default
-\quotes_language english
-\papercolumns 1
-\papersides 1
-\paperpagestyle default
-\tracking_changes true
-\output_changes true
-\html_math_output 0
-\html_css_as_file 0
-\html_be_strict false
-\end_header
-
-\begin_body
-
-\begin_layout Title
-NTDB: Redesigning The Trivial DataBase
-\end_layout
-
-\begin_layout Author
-Rusty Russell, IBM Corporation
-\end_layout
-
-\begin_layout Date
-19 June 2012
-\end_layout
-
-\begin_layout Abstract
-The Trivial DataBase on-disk format is 32 bits; with usage cases heading
- towards the 4G limit, that must change.
- This required breakage provides an opportunity to revisit TDB's other design
- decisions and reassess them.
-\end_layout
-
-\begin_layout Section
-Introduction
-\end_layout
-
-\begin_layout Standard
-The Trivial DataBase was originally written by Andrew Tridgell as a simple
- key/data pair storage system with the same API as dbm, but allowing multiple
- readers and writers while being small enough (< 1000 lines of C) to include
- in SAMBA.
- The simple design created in 1999 has proven surprisingly robust and performant
-, used in Samba versions 3 and 4 as well as numerous other projects.
- Its useful life was greatly increased by the (backwards-compatible!) addition
- of transaction support in 2005.
-\end_layout
-
-\begin_layout Standard
-The wider variety and greater demands of TDB-using code have led to some
- organic growth of the API, as well as some compromises on the implementation.
- None of these, by themselves, are seen as show-stoppers, but the cumulative
- effect is a loss of elegance over the initial, simple TDB implementation.
- Here is a table of the approximate number of lines of implementation code
- and number of API functions at the end of each year:
-\end_layout
-
-\begin_layout Standard
-\begin_inset Tabular
-<lyxtabular version="3" rows="12" columns="3">
-<features tabularvalignment="middle">
-<column alignment="center" valignment="top" width="0">
-<column alignment="center" valignment="top" width="0">
-<column alignment="center" valignment="top" width="0">
-<row>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-Year End
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-API Functions
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-Lines of C Code Implementation
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-1999
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-13
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-1195
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2000
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-24
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-1725
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2001
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-32
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2228
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2002
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-35
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2481
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2003
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-35
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2552
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2004
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-40
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2584
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2005
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-38
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2647
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2006
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-52
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-3754
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2007
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-66
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-4398
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2008
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-71
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-4768
-\end_layout
-
-\end_inset
-</cell>
-</row>
-<row>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-2009
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-73
-\end_layout
-
-\end_inset
-</cell>
-<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
-\begin_inset Text
-
-\begin_layout Plain Layout
-5715
-\end_layout
-
-\end_inset
-</cell>
-</row>
-</lyxtabular>
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Standard
-This review is an attempt to catalog and address all the known issues with
- TDB and create solutions which address the problems without significantly
- increasing complexity; all involved are far too aware of the dangers of
- second system syndrome in rewriting a successful project like this.
-\end_layout
-
-\begin_layout Standard
-Note: the final decision was to make ntdb a separate library, with a separate
- 'ntdb' namespace so both can potentially be linked together.
- This document still refers to
-\begin_inset Quotes eld
-\end_inset
-
-tdb
-\begin_inset Quotes erd
-\end_inset
-
- everywhere, for simplicity.
-\end_layout
-
-\begin_layout Section
-API Issues
-\end_layout
-
-\begin_layout Subsection
-tdb_open_ex Is Not Expandable
-\end_layout
-
-\begin_layout Standard
-The tdb_open() call was expanded to tdb_open_ex(), which added an optional
- hashing function and an optional logging function argument.
- Additional arguments to open would require the introduction of a tdb_open_ex2
- call etc.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\begin_inset CommandInset label
-LatexCommand label
-name "attributes"
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Standard
-tdb_open() will take a linked-list of attributes:
-\end_layout
-
-\begin_layout LyX-Code
-enum tdb_attribute {
-\end_layout
-
-\begin_layout LyX-Code
-    TDB_ATTRIBUTE_LOG = 0,
-\end_layout
-
-\begin_layout LyX-Code
-    TDB_ATTRIBUTE_HASH = 1
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_attribute_base {
-\end_layout
-
-\begin_layout LyX-Code
-    enum tdb_attribute attr;
-\end_layout
-
-\begin_layout LyX-Code
-    union tdb_attribute *next;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_attribute_log {
-\end_layout
-
-\begin_layout LyX-Code
-    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
-\end_layout
-
-\begin_layout LyX-Code
-    tdb_log_func log_fn;
-\end_layout
-
-\begin_layout LyX-Code
-    void *log_private;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_attribute_hash {
-\end_layout
-
-\begin_layout LyX-Code
-    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
-\end_layout
-
-\begin_layout LyX-Code
-    tdb_hash_func hash_fn;
-\end_layout
-
-\begin_layout LyX-Code
-    void *hash_private;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout LyX-Code
-union tdb_attribute {
-\end_layout
-
-\begin_layout LyX-Code
-    struct tdb_attribute_base base;
-\end_layout
-
-\begin_layout LyX-Code
-    struct tdb_attribute_log log;
-\end_layout
-
-\begin_layout LyX-Code
-    struct tdb_attribute_hash hash;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-This allows future attributes to be added, even if this expands the size
- of the union.
-\end_layout
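-
-\begin_layout Standard
-For illustration only (my_log and my_hash stand for caller-supplied functions
- matching the prototypes above, and the final open call is left as a comment
- rather than guessed), chaining two attributes looks like this:
-\end_layout
-
-\begin_layout LyX-Code
-union tdb_attribute log, hash;
-\end_layout
-
-\begin_layout LyX-Code
-log.base.attr = TDB_ATTRIBUTE_LOG;
-\end_layout
-
-\begin_layout LyX-Code
-log.log.log_fn = my_log;
-\end_layout
-
-\begin_layout LyX-Code
-log.base.next = &hash;
-\end_layout
-
-\begin_layout LyX-Code
-hash.base.attr = TDB_ATTRIBUTE_HASH;
-\end_layout
-
-\begin_layout LyX-Code
-hash.hash.hash_fn = my_hash;
-\end_layout
-
-\begin_layout LyX-Code
-hash.base.next = NULL;
-\end_layout
-
-\begin_layout LyX-Code
-/* Pass &log (the head of the list) to tdb_open(). */
-\end_layout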
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-tdb_traverse Makes Impossible Guarantees
-\end_layout
-
-\begin_layout Standard
-tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
- was thought that it was important to guarantee that all records which exist
- at the start and end of the traversal would be included, and no record
- would be included twice.
-\end_layout
-
-\begin_layout Standard
-This adds complexity (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "Reliable-Traversal-Adds"
-
-\end_inset
-
-) and does not work anyway for records which are altered (in particular,
- those which are expanded may be effectively deleted and re-added behind
- the traversal).
-\end_layout
-
-\begin_layout Subsubsection
-\begin_inset CommandInset label
-LatexCommand label
-name "traverse-Proposed-Solution"
-
-\end_inset
-
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Abandon the guarantee.
- You will see every record if no changes occur during your traversal, otherwise
- you will see some subset.
- You can prevent changes by using a transaction or the locking API.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
- Delete-during-traverse will still delete every record, too (assuming no
- other changes).
-\end_layout
-
-\begin_layout Subsection
-Nesting of Transactions Is Fraught
-\end_layout
-
-\begin_layout Standard
-TDB has alternated between allowing nested transactions and not allowing
- them.
- Various paths in the Samba codebase assume that transactions will nest,
- and in a sense they can: the operation is only committed to disk when the
- outer transaction is committed.
- There are two problems, however:
-\end_layout
-
-\begin_layout Enumerate
-Canceling the inner transaction will cause the outer transaction commit
- to fail, and will not undo any operations since the inner transaction began.
- This problem is soluble with some additional internal code.
-\end_layout
-
-\begin_layout Enumerate
-An inner transaction commit can be cancelled by the outer transaction.
- This is desirable in the way which Samba's database initialization code
- uses transactions, but could be a surprise to any users expecting a successful
- transaction commit to expose changes to others.
-\end_layout
-
-\begin_layout Standard
-The current solution is to specify the behavior at tdb_open(), with the
- default currently that nested transactions are allowed.
- This flag can also be changed at runtime.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Given the usage patterns, it seems that the
-\begin_inset Quotes eld
-\end_inset
-
-least-surprise
-\begin_inset Quotes erd
-\end_inset
-
- behavior of disallowing nested transactions should become the default.
- Additionally, it seems the outer transaction is the only code which knows
- whether inner transactions should be allowed, so a flag to indicate this
- could be added to tdb_transaction_start.
- However, this behavior can be simulated with a wrapper which uses tdb_add_flags
-() and tdb_remove_flags(), so the API should not be expanded for this relatively
--obscure case.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete; the nesting flag has been removed.
-\end_layout
-
-\begin_layout Subsection
-Incorrect Hash Function is Not Detected
-\end_layout
-
-\begin_layout Standard
-tdb_open_ex() allows the calling code to specify a different hash function
- to use, but does not check that all other processes accessing this tdb
- are using the same hash function.
- The result is that records are missing from tdb_fetch().
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The header should contain an example hash result (eg.
- the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
- hash function produces the same answer, or fail the tdb_open call.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-tdb_set_max_dead/TDB_VOLATILE Expose Implementation
-\end_layout
-
-\begin_layout Standard
-In response to scalability issues with the free list (
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "TDB-Freelist-Is"
-
-\end_inset
-
-) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
- and the TDB_VOLATILE flag to tdb_open.
- The latter actually calls the former with an argument of
-\begin_inset Quotes eld
-\end_inset
-
-5
-\begin_inset Quotes erd
-\end_inset
-
-.
-\end_layout
-
-\begin_layout Standard
-This code allows deleted records to accumulate without putting them in the
- free list.
- On delete we iterate through each chain and free them in a batch if there
- are more than max_dead entries.
- These are never otherwise recycled except as a side-effect of a tdb_repack.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-With the scalability problems of the freelist solved, this API can be removed.
- The TDB_VOLATILE flag may still be useful as a hint that store and delete
- of records will be at least as common as fetch in order to allow some internal
- tuning, but initially will become a no-op.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
- Unknown flags cause tdb_open() to fail as well, so they can be detected
- at runtime.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB-Files-Cannot"
-
-\end_inset
-
-TDB Files Cannot Be Opened Multiple Times In The Same Process
-\end_layout
-
-\begin_layout Standard
-No process can open the same TDB twice; we check and disallow it.
- This is an unfortunate side-effect of fcntl locks, which operate on a per-file
- rather than per-file-descriptor basis, and do not nest.
- Thus, closing any file descriptor on a file clears all the locks obtained
- by this process, even if they were placed using a different file descriptor!
-\end_layout
-
-\begin_layout Standard
-Note that even if this were solved, deadlock could occur if operations were
- nested: this is a more manageable programming error in most cases.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-We could lobby POSIX to fix the perverse rules, or at least lobby Linux
- to violate them so that the most common implementation does not have this
- restriction.
- This would be a generally good idea for other fcntl lock users.
-\end_layout
-
-\begin_layout Standard
-Samba uses a wrapper which hands out the same tdb_context to multiple callers
- if this happens, and does simple reference counting.
- We should do this inside the tdb library, which already emulates lock nesting
- internally; it would need to recognize when deadlock occurs within a single
- process.
- This would create a new failure mode for tdb operations (while we currently
- handle locking failures, they are impossible in normal use and a process
- encountering them can do little but give up).
-\end_layout
-
-\begin_layout Standard
-I do not see benefit in an additional tdb_open flag to indicate whether
- re-opening is allowed, though there may be some benefit in adding a call
- to detect when a tdb_context is shared, to allow others to create such
- an API.
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB API Is Not POSIX Thread-safe
-\end_layout
-
-\begin_layout Standard
-The TDB API uses an error code which can be queried after an operation to
- determine what went wrong.
- This programming model does not work with threads, unless specific additional
- guarantees are given by the implementation.
- In addition, even otherwise-independent threads cannot open the same TDB
- (as in
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "TDB-Files-Cannot"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Rearchitecting the API to include a tdb_errcode pointer would be a great
- deal of churn, but fortunately most functions return 0 on success and -1
- on error: we can change these to return 0 on success and a negative error
- code on error, and the API remains similar to previous.
- The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA
- pointer and return an error code.
- It is also simpler to have tdb_nextkey replace its key argument in place,
- freeing up any old .dptr.
-\end_layout
-
-\begin_layout Standard
-Internal locking is required to make sure that fcntl locks do not overlap
- between threads, and also that the global list of tdbs is maintained.
-\end_layout
-
-\begin_layout Standard
-The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
- version of the library, and otherwise no overhead will exist.
- Alternatively, a hooking mechanism similar to that proposed for
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "Proposed-Solution-locking-hook"
-
-\end_inset
-
- could be used to enable pthread locking at runtime.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Incomplete; API has been changed but thread safety has not been implemented.
-\end_layout
-
-\begin_layout Subsection
-*_nonblock Functions And *_mark Functions Expose Implementation
-\end_layout
-
-\begin_layout Standard
-CTDB
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-Clustered TDB, see http://ctdb.samba.org
-\end_layout
-
-\end_inset
-
- wishes to operate on TDB in a non-blocking manner.
- This is currently done as follows:
-\end_layout
-
-\begin_layout Enumerate
-Call the _nonblock variant of an API function (eg.
- tdb_lockall_nonblock).
- If this fails:
-\end_layout
-
-\begin_layout Enumerate
-Fork a child process, and wait for it to call the normal variant (eg.
- tdb_lockall).
-\end_layout
-
-\begin_layout Enumerate
-If the child succeeds, call the _mark variant to indicate we already have
- the locks (eg.
- tdb_lockall_mark).
-\end_layout
-
-\begin_layout Enumerate
-Upon completion, tell the child to release the locks (eg.
- tdb_unlockall).
-\end_layout
-
-\begin_layout Enumerate
-Indicate to tdb that it should consider the locks removed (eg.
- tdb_unlockall_mark).
-\end_layout
-
-\begin_layout Standard
-There are several issues with this approach.
- Firstly, adding two new variants of each function clutters the API for
- an obscure use, and so not all functions have three variants.
- Secondly, it assumes that all paths of the functions ask for the same locks,
- otherwise the parent process will have to get a lock which the child doesn't
- have under some circumstances.
- I don't believe this is currently the case, but it constrains the implementatio
-n.
-\end_layout
-
-\begin_layout Subsubsection
-\begin_inset CommandInset label
-LatexCommand label
-name "Proposed-Solution-locking-hook"
-
-\end_inset
-
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Implement a hook for locking methods, so that the caller can control the
- calls to create and remove fcntl locks.
- In this scenario, ctdbd would operate as follows:
-\end_layout
-
-\begin_layout Enumerate
-Call the normal API function, eg tdb_lockall().
-\end_layout
-
-\begin_layout Enumerate
-When the lock callback comes in, check if the child has the lock.
- Initially, this is always false.
- If so, return 0.
- Otherwise, try to obtain it in non-blocking mode.
- If that fails, return EWOULDBLOCK.
-\end_layout
-
-\begin_layout Enumerate
-Release locks in the unlock callback as normal.
-\end_layout
-
-\begin_layout Enumerate
-If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
- child to repeat the operation.
-\end_layout
-
-\begin_layout Enumerate
-The child records what locks it obtains, and returns that information to
- the parent.
-\end_layout
-
-\begin_layout Enumerate
-When the child has succeeded, goto 1.
-\end_layout
-
-\begin_layout Standard
-This is flexible enough to handle any potential locking scenario, even when
- lock requirements change.
- It can be optimized so that the parent does not release locks, just tells
- the child which locks it doesn't need to obtain.
-\end_layout
-
-\begin_layout Standard
-It also keeps the complexity out of the API, and in ctdbd where it is needed.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-tdb_chainlock Functions Expose Implementation
-\end_layout
-
-\begin_layout Standard
-tdb_chainlock locks some number of records, including the record indicated
- by the given key.
- This gave atomicity guarantees; no-one can start a transaction, alter,
- read or delete that key while the lock is held.
-\end_layout
-
-\begin_layout Standard
-It also makes the same guarantee for any other key in the chain, which is
- an internal implementation detail and potentially a cause for deadlock.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
- It would be nice to have an explicit single entry lock which affected no
- other keys.
- Unfortunately, this won't work for an entry which doesn't exist.
- Thus while chainlock may be implemented more efficiently for the existing
- case, it will still have overlap issues with the non-existing case.
- So it is best to keep the current (lack of) guarantee about which records
- will be affected to avoid constraining our implementation.
-\end_layout
-
-\begin_layout Subsection
-Signal Handling is Not Race-Free
-\end_layout
-
-\begin_layout Standard
-The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
- that the tdb locking code should return with a failure, rather than trying
- again when a signal is received (and errno == EAGAIN).
- This is usually used to implement timeouts.
-\end_layout
-
-\begin_layout Standard
-Unfortunately, this does not work in the case where the signal is received
- before the tdb code enters the fcntl() call to place the lock: the code
- will sleep within the fcntl() code, unaware that the signal wants it to
- exit.
- In the case of long timeouts, this does not happen in practice.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The locking hooks proposed in
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "Proposed-Solution-locking-hook"
-
-\end_inset
-
- would allow the user to decide on whether to fail the lock acquisition
- on a signal.
- This allows the caller to choose their own compromise: they could narrow
- the race by checking immediately before the fcntl call.
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-It may be possible to make this race-free in some implementations by having
- the signal handler alter the struct flock to make it invalid.
- This will cause the fcntl() lock call to fail with EINVAL if the signal
- occurs before the kernel is entered, otherwise EAGAIN.
-\end_layout
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-The API Uses Gratuitous Typedefs, Capitals
-\end_layout
-
-\begin_layout Standard
-typedefs are useful for providing source compatibility when types can differ
- across implementations, or arguably in the case of function pointer definitions
- which are hard for humans to parse.
- Otherwise it is simply obfuscation and pollutes the namespace.
-\end_layout
-
-\begin_layout Standard
-Capitalization is usually reserved for compile-time constants and macros.
-\end_layout
-
-\begin_layout Description
-TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
- definition isn't visible to the API user anyway.
-\end_layout
-
-\begin_layout Description
-TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
- needs to be understood by the API user.
-\end_layout
-
-\begin_layout Description
-struct
-\begin_inset space ~
-\end_inset
-
-TDB_DATA This would normally be called 'struct tdb_data'.
-\end_layout
-
-\begin_layout Description
-enum
-\begin_inset space ~
-\end_inset
-
-TDB_ERROR Similarly, this would normally be enum tdb_error.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
- Introducing lower case variants would please pedants like myself, but if
- it were done the existing ones should be kept.
- There is little point forcing a purely cosmetic change upon tdb users.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "tdb_log_func-Doesnt-Take"
-
-\end_inset
-
-tdb_log_func Doesn't Take The Private Pointer
-\end_layout
-
-\begin_layout Standard
-For API compatibility reasons, the logging function needs to call tdb_get_loggin
-g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-It should simply take an extra argument, since we are prepared to break
- the API/ABI.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Various Callback Functions Are Not Typesafe
-\end_layout
-
-\begin_layout Standard
-The callback functions in tdb_set_logging_function (after
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "tdb_log_func-Doesnt-Take"
-
-\end_inset
-
- is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
- all take void * and must internally convert it to the argument type they
- were expecting.
-\end_layout
-
-\begin_layout Standard
-If this type changes, the compiler will not produce warnings on the callers,
- since it only sees void *.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-With careful use of macros, we can create callback functions which give
- a warning when used on gcc and the types of the callback and its private
- argument differ.
- Unsupported compilers will not give a warning, which is no worse than now.
- In addition, the callbacks become clearer, as they need not use void *
- for their parameter.
-\end_layout
-
-\begin_layout Standard
-See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
-\end_layout
-
-\begin_layout Standard
-The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
- be cleared if the caller discovers it is the only process with the TDB
- open.
- However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
- be detected, so will have the TDB erased underneath them (usually resulting
- in a crash).
-\end_layout
-
-\begin_layout Standard
-There is a similar issue on fork(); if the parent exits (or otherwise closes
- the tdb) before the child calls tdb_reopen_all() to establish the lock
- used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
- at that moment will believe it alone has opened the TDB and will erase
- it.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Remove TDB_CLEAR_IF_FIRST.
- Other workarounds are possible, but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
-
-\end_inset
-
-.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
- An open hook is provided to replicate this functionality if required.
-\end_layout
-
-\begin_layout Subsection
-Extending The Header Is Difficult
-\end_layout
-
-\begin_layout Standard
-We have reserved (zeroed) words in the TDB header, which can be used for
- future features.
- If the future features are compulsory, the version number must be updated
- to prevent old code from accessing the database.
- But if the future feature is optional, we have no way of telling if older
- code is accessing the database or not.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The header should contain a
-\begin_inset Quotes eld
-\end_inset
-
-format variant
-\begin_inset Quotes erd
-\end_inset
-
- value (64-bit).
- This is divided into two 32-bit parts:
-\end_layout
-
-\begin_layout Enumerate
-The lower part reflects the format variant understood by code accessing
- the database.
-\end_layout
-
-\begin_layout Enumerate
-The upper part reflects the format variant you must understand to write
- to the database (otherwise you can only open for reading).
-\end_layout
-
-\begin_layout Standard
-The latter field can only be written at creation time, the former should
- be written under the OPEN_LOCK when opening the database for writing, if
- the variant of the code is lower than the current lowest variant.
-\end_layout
-
-\begin_layout Standard
-This should allow backwards-compatible features to be added, and detection
- if older code (which doesn't understand the feature) writes to the database.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Record Headers Are Not Expandable
-\end_layout
-
-\begin_layout Standard
-If we later want to add (say) checksums on keys and data, it would require
- another format change, which we'd like to avoid.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-We often have extra padding at the tail of a record.
- If we ensure that the first byte (if any) of this padding is zero, we will
- have a way for future changes to detect code which doesn't understand a
- new format: the new code would write (say) a 1 at the tail, and thus if
- there is no tail or the first byte is 0, we would know the extension is
- not present on that record.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB Does Not Use Talloc
-\end_layout
-
-\begin_layout Standard
-Many users of TDB (particularly Samba) use the talloc allocator, and thus
- have to wrap TDB in a talloc context to use it conveniently.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The allocation within TDB is not complicated enough to justify the use of
- talloc, and I am reluctant to force another (excellent) library on TDB
- users.
- Nonetheless a compromise is possible.
- An attribute (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "attributes"
-
-\end_inset
-
-) can be added later to tdb_open() to provide an alternate allocation mechanism,
- specifically for talloc but usable by any other allocator (which would
- ignore the
-\begin_inset Quotes eld
-\end_inset
-
-context
-\begin_inset Quotes erd
-\end_inset
-
- argument).
-\end_layout
-
-\begin_layout Standard
-This would form a talloc hierarchy as expected, but the caller would still
- have to attach a destructor to the tdb context returned from tdb_open to
- close it.
- All TDB_DATA fields would be children of the tdb_context, and the caller
- would still have to manage them (using talloc_free() or talloc_steal()).
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete, using the NTDB_ATTRIBUTE_ALLOCATOR attribute.
-\end_layout
-
-\begin_layout Section
-Performance And Scalability Issues
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
-
-\end_inset
-
-TDB_CLEAR_IF_FIRST Imposes Performance Penalty
-\end_layout
-
-\begin_layout Standard
-When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
- 4 (aka.
- the ACTIVE_LOCK).
- While these locks never conflict in normal tdb usage, they do add substantial
- overhead for most fcntl lock implementations when the kernel scans to detect
- if a lock conflict exists.
- This is often a single linked list, making the time to acquire and release
- a fcntl lock O(N) where N is the number of processes with the TDB open,
- not the number actually doing work.
-\end_layout
-
-\begin_layout Standard
-In a Samba server it is common to have huge numbers of clients sitting idle,
- and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-There is a flag to tdb_reopen_all() which is used for this optimization:
- if the parent process will outlive the child, the child does not need the
- ACTIVE_LOCK.
- This is a workaround for this very performance issue.
-\end_layout
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Remove the flag.
- It was a neat idea, but even trivial servers tend to know when they are
- initializing for the first time and can simply unlink the old tdb at that
- point.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB Files Have a 4G Limit
-\end_layout
-
-\begin_layout Standard
-This seems to be becoming an issue (so much for
-\begin_inset Quotes eld
-\end_inset
-
-trivial
-\begin_inset Quotes erd
-\end_inset
-
-!), particularly for ldb.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-A new, incompatible TDB format which uses 64 bit offsets internally rather
- than 32 bit as now.
- For simplicity of endian conversion (which TDB does on the fly if required),
- all values will be 64 bit on disk.
- In practice, some upper bits may be used for other purposes, but at least
- 56 bits will be available for file offsets.
-\end_layout
-
-\begin_layout Standard
-tdb_open() will automatically detect the old version, and even create them
- if TDB_VERSION6 is specified to tdb_open.
-\end_layout
-
-\begin_layout Standard
-32 bit processes will still be able to access TDBs larger than 4G (assuming
- that their off_t allows them to seek to 64 bits); they will gracefully
- fall back as they fail to mmap.
- This can happen already with large TDBs.
-\end_layout
-
-\begin_layout Standard
-Old versions of tdb will fail to open the new TDB files (since 28 August
- 2009, commit 398d0c29290: prior to that any unrecognized file format would
- be erased and initialized as a fresh tdb!)
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-TDB Records Have a 4G Limit
-\end_layout
-
-\begin_layout Standard
-This has not been a reported problem, and the API uses size_t which can
- be 64 bit on 64 bit platforms.
- However, other limits may have made such an issue moot.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Record sizes will be 64 bit, with an error returned on 32 bit platforms
- which try to access such records (the current implementation would return
- TDB_ERR_OOM in a similar case).
- It seems unlikely that 32 bit keys will be a limitation, so the implementation
- may not support this (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Records-Incur-A"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Hash Size Is Determined At TDB Creation Time
-\end_layout
-
-\begin_layout Standard
-TDB contains a number of hash chains in the header; the number is specified
- at creation time, and defaults to 131.
- This is such a bottleneck on large databases (as each hash chain gets quite
- long), that LDB uses 10,000 for this hash.
- In general it is impossible to know what the 'right' answer is at database
- creation time.
-\end_layout
-
-\begin_layout Subsubsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:Hash-Size-Solution"
-
-\end_inset
-
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-After comprehensive performance testing on various scalable hash variants
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
- because I was previously convinced that an expanding tree of hashes would
- be very close to optimal.
-\end_layout
-
-\end_inset
-
-, it became clear that it is hard to beat a straight linear hash table which
- doubles in size when it reaches saturation.
- Unfortunately, altering the hash table introduces serious locking complications
-: the entire hash table needs to be locked to enlarge the hash table, and
- others might be holding locks.
- Particularly insidious are insertions done under tdb_chainlock.
-\end_layout
-
-\begin_layout Standard
-Thus an expanding layered hash will be used: an array of hash groups, with
- each hash group exploding into pointers to lower hash groups once it fills,
- turning into a hash tree.
- This has implications for locking: we must lock the entire group in case
- we need to expand it, yet we don't know how deep the tree is at that point.
-\end_layout
-
-\begin_layout Standard
-Note that bits from the hash table entries should be stolen to hold more
- hash bits to reduce the penalty of collisions.
- We can use the otherwise-unused lower 3 bits.
- If we limit the size of the database to 64 exabytes, we can use the top
- 8 bits of the hash entry as well.
- These 11 bits would reduce false positives down to 1 in 2000 which is more
- than we need: we can use one of the bits to indicate that the extra hash
- bits are valid.
- This means we can choose not to re-hash all entries when we expand a hash
- group; simply use the next bits we need and mark them invalid.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Ignore.
- Scaling the hash automatically proved inefficient at small hash sizes;
- we default to an 8192-element hash (changeable via NTDB_ATTRIBUTE_HASHSIZE),
- and when buckets clash we expand to an array of hash entries.
- This scales slightly better than the tdb chain (due to the 8 top bits containin
-g extra hash).
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB-Freelist-Is"
-
-\end_inset
-
-TDB Freelist Is Highly Contended
-\end_layout
-
-\begin_layout Standard
-TDB uses a single linked list for the free list.
- Allocation occurs as follows, using heuristics which have evolved over
- time:
-\end_layout
-
-\begin_layout Enumerate
-Get the free list lock for this whole operation.
-\end_layout
-
-\begin_layout Enumerate
-Multiply length by 1.25, so we always over-allocate by 25%.
-\end_layout
-
-\begin_layout Enumerate
-Set the slack multiplier to 1.
-\end_layout
-
-\begin_layout Enumerate
-Examine the current freelist entry: if it is > length but < the current
- best case, remember it as the best case.
-\end_layout
-
-\begin_layout Enumerate
-Multiply the slack multiplier by 1.05.
-\end_layout
-
-\begin_layout Enumerate
-If our best fit so far is less than length * slack multiplier, return it.
- The slack will be turned into a new free record if it's large enough.
-\end_layout
-
-\begin_layout Enumerate
-Otherwise, go onto the next freelist entry.
-\end_layout
-
-\begin_layout Standard
-Deleting a record occurs as follows:
-\end_layout
-
-\begin_layout Enumerate
-Lock the hash chain for this whole operation.
-\end_layout
-
-\begin_layout Enumerate
-Walk the chain to find the record, keeping the prev pointer offset.
-\end_layout
-
-\begin_layout Enumerate
-If max_dead is non-zero:
-\end_layout
-
-\begin_deeper
-\begin_layout Enumerate
-Walk the hash chain again and count the dead records.
-\end_layout
-
-\begin_layout Enumerate
-If it's more than max_dead, bulk free all the dead ones (similar to steps
- 4 and below, but the lock is only obtained once).
-\end_layout
-
-\begin_layout Enumerate
-Simply mark this record as dead and return.
-\end_layout
-
-\end_deeper
-\begin_layout Enumerate
-Get the free list lock for the remainder of this operation.
-\end_layout
-
-\begin_layout Enumerate
-\begin_inset CommandInset label
-LatexCommand label
-name "right-merging"
-
-\end_inset
-
-Examine the following block to see if it is free; if so, enlarge the current
- block and remove that block from the free list.
- This was disabled, as removal from the free list was O(entries-in-free-list).
-\end_layout
-
-\begin_layout Enumerate
-Examine the preceding block to see if it is free: for this reason, each
- block has a 32-bit tailer which indicates its length.
- If it is free, expand it to cover our new block and return.
-\end_layout
-
-\begin_layout Enumerate
-Otherwise, prepend ourselves to the free list.
-\end_layout
-
-\begin_layout Standard
-Disabling right-merging (step
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "right-merging"
-
-\end_inset
-
-) causes fragmentation; the other heuristics proved insufficient to address
- this, so the final answer to this was that when we expand the TDB file
- inside a transaction commit, we repack the entire tdb.
-\end_layout
-
-\begin_layout Standard
-The single list lock limits our allocation rate; due to the other issues
- this is not currently seen as a bottleneck.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The first step is to remove all the current heuristics, as they obviously
- interact, then examine them once the lock contention is addressed.
-\end_layout
-
-\begin_layout Standard
-The free list must be split to reduce contention.
- Assuming perfect free merging, we can at most have 1 free list entry for
- each used entry.
- This implies that the number of free lists is related to the size of the
- hash table, but as it is rare to walk a large number of free list entries
- we can use far fewer, say 1/32 of the number of hash buckets.
-\end_layout
-
-\begin_layout Standard
-It seems tempting to try to reuse the hash implementation which we use for
- records here, but we have two ways of searching for free entries: for allocatio
-n we search by size (and possibly zone) which produces too many clashes
- for our hash table to handle well, and for coalescing we search by address.
- Thus an array of doubly-linked free lists seems preferable.
-\end_layout
-
-\begin_layout Standard
-There are various benefits in using per-size free lists (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Becomes-Fragmented"
-
-\end_inset
-
-) but it's not clear this would reduce contention in the common case where
- all processes are allocating/freeing the same size.
- Thus we almost certainly need to divide in other ways: the most obvious
- is to divide the file into zones, and using a free list (or table of free
- lists) for each.
- This approximates address ordering.
-\end_layout
-
-\begin_layout Standard
-Unfortunately it is difficult to know what heuristics should be used to
- determine zone sizes, and our transaction code relies on being able to
- create a
-\begin_inset Quotes eld
-\end_inset
-
-recovery area
-\begin_inset Quotes erd
-\end_inset
-
- by simply appending to the file (difficult if it would need to create a
- new zone header).
- Thus we use a linked-list of free tables; currently we only ever create
- one, but if there is more than one we choose one at random to use.
- In future we may use heuristics to add new free tables on contention.
- We only expand the file when all free tables are exhausted.
-\end_layout
-
-\begin_layout Standard
-The basic algorithm is as follows.
- Freeing is simple:
-\end_layout
-
-\begin_layout Enumerate
-Identify the correct free list.
-\end_layout
-
-\begin_layout Enumerate
-Lock the corresponding list.
-\end_layout
-
-\begin_layout Enumerate
-Re-check the list (we didn't have a lock, sizes could have changed): relock
- if necessary.
-\end_layout
-
-\begin_layout Enumerate
-Place the freed entry in the list.
-\end_layout
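-
-\begin_layout Standard
-As an illustrative sketch (the helper names here are assumptions, not the
- ntdb source), the freeing steps above reduce to something like:
-\end_layout
-
-\begin_layout LyX-Code
-static void free_record(struct tdb *tdb, off_t off, size_t len)
-\end_layout
-
-\begin_layout LyX-Code
-{
-\end_layout
-
-\begin_layout LyX-Code
-        unsigned b = size_to_bucket(len);  /* 1. identify the list */
-\end_layout
-
-\begin_layout LyX-Code
-
-\end_layout
-
-\begin_layout LyX-Code
-        lock_bucket(tdb, b);               /* 2. lock it */
-\end_layout
-
-\begin_layout LyX-Code
-        b = recheck_bucket(tdb, b, len);   /* 3. relock if it moved */
-\end_layout
-
-\begin_layout LyX-Code
-        prepend_to_bucket(tdb, b, off);    /* 4. place the entry */
-\end_layout
-
-\begin_layout LyX-Code
-        unlock_bucket(tdb, b);
-\end_layout
-
-\begin_layout LyX-Code
-}
-\end_layout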
-
-\begin_layout Standard
-Allocation is a little more complicated, as we perform delayed coalescing
- at this point:
-\end_layout
-
-\begin_layout Enumerate
-Pick a free table; usually the previous one.
-\end_layout
-
-\begin_layout Enumerate
-Lock the corresponding list.
-\end_layout
-
-\begin_layout Enumerate
-If the top entry is large enough, remove it from the list and return it.
-\end_layout
-
-\begin_layout Enumerate
-Otherwise, coalesce entries in the list.
- If there was no entry large enough, unlock the list and try the next largest
- list.
-\end_layout
-
-\begin_layout Enumerate
-If no list has an entry which meets our needs, try the next free table.
-\end_layout
-
-\begin_layout Enumerate
-If no zone satisfies, expand the file.
-\end_layout
-
-\begin_layout Standard
-This optimizes rapid insert/delete of free list entries by not coalescing
- them all the time.
- First-fit address ordering seems to be fairly good for keeping
- fragmentation low (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Becomes-Fragmented"
-
-\end_inset
-
-).
- Note that address ordering does not need a tailer to coalesce, though if
- we needed one we could have one cheaply: see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Records-Incur-A"
-
-\end_inset
-
-.
-\end_layout
-
-\begin_layout Standard
-Each free entry has the free table number in the header: less than 255.
- It also contains a doubly-linked list for easy deletion.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:TDB-Becomes-Fragmented"
-
-\end_inset
-
-TDB Becomes Fragmented
-\end_layout
-
-\begin_layout Standard
-Much of this is a result of allocation strategy
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
-xas.edu/pub/garbage/malloc/ismm98.ps
-\end_layout
-
-\end_inset
-
- and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
-on) is deliberately set at 25%, and external fragmentation is only cured
- by the decision to repack the entire db when a transaction commit needs
- to enlarge the file.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The 25% overhead on allocation works in practice for ldb because indexes
- tend to expand by one record at a time.
- This internal fragmentation can be resolved by having an
-\begin_inset Quotes eld
-\end_inset
-
-expanded
-\begin_inset Quotes erd
-\end_inset
-
- bit in the header to note entries that have previously expanded, and allocating
- more space for them.
-\end_layout
-
-\begin_layout Standard
-There is a spectrum of possible solutions for external fragmentation:
- one is to use a fragmentation-avoiding allocation strategy such as a best-fit
- address-order allocator.
- The other end of the spectrum would be to use a bump allocator (very fast
- and simple) and simply repack the file when we reach the end.
-\end_layout
-
-\begin_layout Standard
-There are three problems with efficient fragmentation-avoiding allocators:
- they are non-trivial, they tend to use a single free list for each size,
- and there's no evidence that tdb allocation patterns will match those recorded
- for general allocators (though it seems likely).
-\end_layout
-
-\begin_layout Standard
-Thus we don't spend too much effort on external fragmentation; we will be
- no worse than the current code if we need to repack on occasion.
- More effort is spent on reducing freelist contention, and reducing overhead.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:Records-Incur-A"
-
-\end_inset
-
-Records Incur A 28-Byte Overhead
-\end_layout
-
-\begin_layout Standard
-Each TDB record has a header as follows:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_record {
-\end_layout
-
-\begin_layout LyX-Code
-        tdb_off_t next; /* offset of the next record in the list */
-\end_layout
-
-\begin_layout LyX-Code
-        tdb_len_t rec_len; /* total byte length of record */
-\end_layout
-
-\begin_layout LyX-Code
-        tdb_len_t key_len; /* byte length of key */
-\end_layout
-
-\begin_layout LyX-Code
-        tdb_len_t data_len; /* byte length of data */
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t full_hash; /* the full 32 bit hash of the key */
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t magic;   /* try to catch errors */
-\end_layout
-
-\begin_layout LyX-Code
-        /* the following union is implied:
-\end_layout
-
-\begin_layout LyX-Code
-                union {
-\end_layout
-
-\begin_layout LyX-Code
-                        char record[rec_len];
-\end_layout
-
-\begin_layout LyX-Code
-                        struct {
-\end_layout
-
-\begin_layout LyX-Code
-                                char key[key_len];
-\end_layout
-
-\begin_layout LyX-Code
-                                char data[data_len];
-\end_layout
-
-\begin_layout LyX-Code
-                        }
-\end_layout
-
-\begin_layout LyX-Code
-                        uint32_t totalsize; (tailer)
-\end_layout
-
-\begin_layout LyX-Code
-                }
-\end_layout
-
-\begin_layout LyX-Code
-        */
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-Naively, this would double to a 56-byte overhead on a 64 bit implementation.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-We can use various techniques to reduce this for an allocated block:
-\end_layout
-
-\begin_layout Enumerate
-The 'next' pointer is not required, as we are using a flat hash table.
-\end_layout
-
-\begin_layout Enumerate
-'rec_len' can instead be expressed as an addition to key_len and data_len
- (it accounts for wasted or overallocated length in the record).
- Since the record length is always a multiple of 8, we can conveniently
- fit it in 32 bits (representing up to 35 bits).
-\end_layout
-
-\begin_layout Enumerate
-'key_len' and 'data_len' can be reduced.
- I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
- the two into one 64-bit field, using a 5 bit value which indicates at
- what bit to divide the two.
- Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
- size of 32 bits.
-\end_layout
-
-\begin_layout Enumerate
-'full_hash' is used to avoid a memcmp on the
-\begin_inset Quotes eld
-\end_inset
-
-miss
-\begin_inset Quotes erd
-\end_inset
-
- case, but this gives diminishing returns after a handful of bits (at 10 bits,
- it eliminates 99.9% of false memcmps).
- As an aside, as the lower bits are already incorporated in the hash table
- resolution, the upper bits should be used here.
- Note that it's not clear that these bits will be a win, given the extra
- bits in the hash table itself (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Hash-Size-Solution"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Enumerate
-'magic' does not need to be enlarged: it currently reflects one of 5 values
- (used, free, dead, recovery, and unused_recovery).
- It is useful for quick sanity checking however, and should not be eliminated.
-\end_layout
-
-\begin_layout Enumerate
-'tailer' is only used to coalesce free blocks (so a block to the right can
- find the header to check if this block is free).
- This can be replaced by a single 'free' bit in the header of the following
- block (and the tailer only exists in free blocks).
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-This technique from Thomas Standish.
- Data Structure Techniques.
- Addison-Wesley, Reading, Massachusetts, 1980.
-\end_layout
-
-\end_inset
-
- The current proposed coalescing algorithm doesn't need this, however.
-\end_layout
-
-\begin_layout Standard
-This produces a 16 byte used header like this:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_used_record {
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t used_magic : 16,
-\end_layout
-
-\begin_layout LyX-Code
-
-\end_layout
-
-\begin_layout LyX-Code
-                 key_data_divide: 5,
-\end_layout
-
-\begin_layout LyX-Code
-                 top_hash: 11;
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t extra_octets;
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t key_and_data_len;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-And a free record like this:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_free_record {
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t free_magic: 8,
-\end_layout
-
-\begin_layout LyX-Code
-                 prev : 56;
-\end_layout
-
-\begin_layout LyX-Code
-
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t free_table: 8,
-\end_layout
-
-\begin_layout LyX-Code
-                 total_length : 56;
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t next;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-Note that by limiting valid offsets to 56 bits, we can pack everything we
- need into 3 64-bit words, meaning our minimum record size is 8 bytes.
-\end_layout
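-
-\begin_layout Standard
-For illustration, the combined length field unpacks as follows (a sketch
- only: the document does not fix which half holds the key, so this split
- is an assumption):
-\end_layout
-
-\begin_layout LyX-Code
-static uint64_t rec_key_length(const struct tdb_used_record *r)
-\end_layout
-
-\begin_layout LyX-Code
-{
-\end_layout
-
-\begin_layout LyX-Code
-        /* the low key_data_divide bits hold the key length */
-\end_layout
-
-\begin_layout LyX-Code
-        return r->key_and_data_len & ((1ULL << r->key_data_divide) - 1);
-\end_layout
-
-\begin_layout LyX-Code
-}
-\end_layout
-
-\begin_layout LyX-Code
-
-\end_layout
-
-\begin_layout LyX-Code
-static uint64_t rec_data_length(const struct tdb_used_record *r)
-\end_layout
-
-\begin_layout LyX-Code
-{
-\end_layout
-
-\begin_layout LyX-Code
-        return r->key_and_data_len >> r->key_data_divide;
-\end_layout
-
-\begin_layout LyX-Code
-}
-\end_layout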
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Transaction Commit Requires 4 fdatasync
-\end_layout
-
-\begin_layout Standard
-The current transaction algorithm is:
-\end_layout
-
-\begin_layout Enumerate
-write_recovery_data();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Enumerate
-write_recovery_header();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Enumerate
-overwrite_with_new_data();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Enumerate
-remove_recovery_header();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Standard
-On current ext3, each sync flushes all data to disk, so the next 3 syncs
- are relatively expensive.
- But this could become a performance bottleneck on other filesystems such
- as ext4.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Neil Brown points out that this is overzealous, and only one sync is needed:
-\end_layout
-
-\begin_layout Enumerate
-Bundle the recovery data, a transaction counter and a strong checksum of
- the new data.
-\end_layout
-
-\begin_layout Enumerate
-Strong checksum that whole bundle.
-\end_layout
-
-\begin_layout Enumerate
-Store the bundle in the database.
-\end_layout
-
-\begin_layout Enumerate
-Overwrite the oldest of the two recovery pointers in the header (identified
- using the transaction counter) with the offset of this bundle.
-\end_layout
-
-\begin_layout Enumerate
-sync.
-\end_layout
-
-\begin_layout Enumerate
-Write the new data to the file.
-\end_layout
-
-\begin_layout Standard
-Checking for recovery means identifying the latest bundle with a valid checksum
- and using the new data checksum to ensure that it has been applied.
- This is more expensive than the current check, but need only be done at
- open.
- For running databases, a separate header field can be used to indicate
- a transaction in progress; we need only check for recovery if this is set.
-\end_layout
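-
-\begin_layout Standard
-A sketch of what such a bundle header might contain (illustrative only;
- as the status below notes, this scheme has not been implemented):
-\end_layout
-
-\begin_layout LyX-Code
-struct recovery_bundle {
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t bundle_csum;   /* strong checksum of the rest */
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t txn_counter;   /* picks the newest of the two bundles */
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t new_data_csum; /* checksum of the new data written later */
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t len;           /* length of the recovery data below */
-\end_layout
-
-\begin_layout LyX-Code
-        /* recovery (old) data follows */
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout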
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:TDB-Does-Not"
-
-\end_inset
-
-TDB Does Not Have Snapshot Support
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
- At some point you say
-\begin_inset Quotes eld
-\end_inset
-
-use a real database
-\begin_inset Quotes erd
-\end_inset
-
- (but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "replay-attribute"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Standard
-But as a thought experiment, if we implemented transactions to only overwrite
- free entries (this is tricky: there must not be a header in each entry
- which indicates whether it is free; instead, freeness must be inferred from
- the entry's presence in metadata elsewhere),
- and a pointer to the hash table, we could create an entirely new commit
- without destroying existing data.
- Then it would be easy to implement snapshots in a similar way.
-\end_layout
-
-\begin_layout Standard
-This would not allow arbitrary changes to the database, such as tdb_repack
- does, and would require more space (since we have to preserve the current
- and future entries at once).
- If we used hash trees rather than one big hash table, we might only have
- to rewrite some sections of the hash, too.
-\end_layout
-
-\begin_layout Standard
-We could then implement snapshots using a similar method, using multiple
- different hash tables/free tables.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Subsection
-Transactions Cannot Operate in Parallel
-\end_layout
-
-\begin_layout Standard
-This would be useless for ldb, as it hits the index records with just about
- every update.
- It would add significant complexity in resolving clashes, and force all
- transaction callers to write their code to loop in case the transaction
- spuriously fails.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None (but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "replay-attribute"
-
-\end_inset
-
-).
- We could solve a small part of the problem by providing read-only transactions.
- These would allow one write transaction to begin, but it could not commit
- until all r/o transactions are done.
- This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
- commit.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Subsection
-Default Hash Function Is Suboptimal
-\end_layout
-
-\begin_layout Standard
-The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
- if we expand it to 64 bits), and works best when the hash bucket size is
- a prime number (which also means a slow modulus).
- In addition, it is highly predictable, which could potentially lead to a
- Denial of Service attack in some TDB uses.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The Jenkins lookup3 hash
-\begin_inset Foot
-status open
-
-\begin_layout Plain Layout
-http://burtleburtle.net/bob/c/lookup3.c
-\end_layout
-
-\end_inset
-
- is a fast and superbly-mixing hash.
- It's used by the Linux kernel and almost everything else.
- This has the particular properties that it takes an initial seed, and produces
- two 32 bit hash numbers, which we can combine into a 64-bit hash.
-\end_layout
-
-\begin_layout Standard
-The seed should be created at tdb-creation time from some random source,
- and placed in the header.
- This is far from foolproof, but adds a little bit of protection against
- hash bombing.
-\end_layout
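-
-\begin_layout Standard
-For illustration, the two 32-bit results can be seeded and combined like
- this (hashlittle2() is lookup3's two-result entry point; splitting the
- 64-bit seed this way is an assumption):
-\end_layout
-
-\begin_layout LyX-Code
-static uint64_t hash64(const void *key, size_t len, uint64_t seed)
-\end_layout
-
-\begin_layout LyX-Code
-{
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t pc = (uint32_t)seed;         /* primary initval */
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t pb = (uint32_t)(seed >> 32); /* secondary initval */
-\end_layout
-
-\begin_layout LyX-Code
-
-\end_layout
-
-\begin_layout LyX-Code
-        hashlittle2(key, len, &pc, &pb);
-\end_layout
-
-\begin_layout LyX-Code
-        return ((uint64_t)pb << 32) | pc;
-\end_layout
-
-\begin_layout LyX-Code
-}
-\end_layout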
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "Reliable-Traversal-Adds"
-
-\end_inset
-
-Reliable Traversal Adds Complexity
-\end_layout
-
-\begin_layout Standard
-We lock a record during traversal iteration, and try to grab that lock in
- the delete code.
- If that grab on delete fails, we simply mark it deleted and continue onwards;
- traversal checks for this condition and does the delete when it moves off
- the record.
-\end_layout
-
-\begin_layout Standard
-If traversal terminates, the dead record may be left indefinitely.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Remove reliability guarantees; see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "traverse-Proposed-Solution"
-
-\end_inset
-
-.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Fcntl Locking Adds Overhead
-\end_layout
-
-\begin_layout Standard
-Placing a fcntl lock means a system call, as does removing one.
- This is actually one reason why transactions can be faster (everything
- is locked once at transaction start).
- In the uncontended case, this overhead can theoretically be eliminated.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
-\end_layout
-
-\begin_layout Standard
-We tried this before with spinlock support, in the early days of TDB, and
- it didn't make much difference except in manufactured benchmarks.
-\end_layout
-
-\begin_layout Standard
-We could use spinlocks (with futex kernel support under Linux), but it means
- that we lose automatic cleanup when a process dies with a lock.
- There is a method of auto-cleanup under Linux, but it's not supported by
- other operating systems.
- We could reintroduce a clear-if-first-style lock and sweep for dead futexes
- on open, but that wouldn't help the normal case of one concurrent opener
- dying.
- Increasingly elaborate repair schemes could be considered, but they require
- an ABI change (everyone must use them) anyway, so there's no need to do
- this at the same time as everything else.
-\end_layout
-
-\begin_layout Subsection
-Some Transactions Don't Require Durability
-\end_layout
-
-\begin_layout Standard
-Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
- usage, and occasionally empties the results into a transactional TDB.
- This kind of usage prioritizes performance over durability: as long as
- we are consistent, data can be lost.
-\end_layout
-
-\begin_layout Standard
-This would be more neatly implemented inside tdb: a
-\begin_inset Quotes eld
-\end_inset
-
-soft
-\begin_inset Quotes erd
-\end_inset
-
- transaction commit (ie.
- syncless) which meant that data may be reverted on a crash.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
-\end_layout
-
-\begin_layout Standard
-Unfortunately any transaction scheme which overwrites old data requires
- a sync before that overwrite to avoid the possibility of corruption.
-\end_layout
-
-\begin_layout Standard
-It seems possible to use a scheme similar to that described in
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Does-Not"
-
-\end_inset
-
-, where transactions are committed without overwriting existing data, and
- an array of top-level pointers were available in the header.
- If the transaction is
-\begin_inset Quotes eld
-\end_inset
-
-soft
-\begin_inset Quotes erd
-\end_inset
-
- then we would not need a sync at all: existing processes would pick up
- the new hash table and free list and work with that.
-\end_layout
-
-\begin_layout Standard
-At some later point, a sync would allow recovery of the old data into the
- free lists (perhaps when the array of top-level pointers filled).
- On crash, tdb_open() would examine the array of top levels, and apply the
- transactions until it encountered an invalid checksum.
-\end_layout
-
-\begin_layout Subsection
-Tracing Is Fragile, Replay Is External
-\end_layout
-
-\begin_layout Standard
-The current TDB has compile-time-enabled tracing code, but it often breaks
- as it is not enabled by default.
- In a similar way, the ctdb code has an external wrapper which does replay
- tracing so it can coordinate cluster-wide transactions.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\begin_inset CommandInset label
-LatexCommand label
-name "replay-attribute"
-
-\end_inset
-
-
-\end_layout
-
-\begin_layout Standard
-Tridge points out that an attribute can be later added to tdb_open (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "attributes"
-
-\end_inset
-
-) to provide replay/trace hooks, which could become the basis for this and
- future parallel transactions and snapshot support.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\end_body
-\end_document
diff --git a/ccan/ntdb/doc/design.pdf b/ccan/ntdb/doc/design.pdf
deleted file mode 100644 (file)
index 8381914..0000000
Binary files a/ccan/ntdb/doc/design.pdf and /dev/null differ
diff --git a/ccan/ntdb/doc/design.txt b/ccan/ntdb/doc/design.txt
deleted file mode 100644 (file)
index bd680f0..0000000
+++ /dev/null
@@ -1,1270 +0,0 @@
-NTDB: Redesigning The Trivial DataBase
-
-Rusty Russell, IBM Corporation
-
-19 June 2012
-
-Abstract
-
-The Trivial DataBase on-disk format is 32 bits; with usage cases
-heading towards the 4G limit, that must change. This required
-breakage provides an opportunity to revisit TDB's other design
-decisions and reassess them.
-
-1 Introduction
-
-The Trivial DataBase was originally written by Andrew Tridgell as
-a simple key/data pair storage system with the same API as dbm,
-but allowing multiple readers and writers while being small
-enough (< 1000 lines of C) to include in SAMBA. The simple design
-created in 1999 has proven surprisingly robust and performant,
-used in Samba versions 3 and 4 as well as numerous other
-projects. Its useful life was greatly increased by the
-(backwards-compatible!) addition of transaction support in 2005.
-
-The wider variety and greater demands of TDB-using code have led
-to some organic growth of the API, as well as some compromises on
-the implementation. None of these, by themselves, are seen as
-show-stoppers, but the cumulative effect is a loss of elegance
-over the initial, simple TDB implementation. Here is a table of
-the approximate number of lines of implementation code and number
-of API functions at the end of each year:
-
-
-+-----------+----------------+--------------------------------+
-| Year End  | API Functions  | Lines of C Code Implementation |
-+-----------+----------------+--------------------------------+
-+-----------+----------------+--------------------------------+
-|   1999    |      13        |              1195              |
-+-----------+----------------+--------------------------------+
-|   2000    |      24        |              1725              |
-+-----------+----------------+--------------------------------+
-|   2001    |      32        |              2228              |
-+-----------+----------------+--------------------------------+
-|   2002    |      35        |              2481              |
-+-----------+----------------+--------------------------------+
-|   2003    |      35        |              2552              |
-+-----------+----------------+--------------------------------+
-|   2004    |      40        |              2584              |
-+-----------+----------------+--------------------------------+
-|   2005    |      38        |              2647              |
-+-----------+----------------+--------------------------------+
-|   2006    |      52        |              3754              |
-+-----------+----------------+--------------------------------+
-|   2007    |      66        |              4398              |
-+-----------+----------------+--------------------------------+
-|   2008    |      71        |              4768              |
-+-----------+----------------+--------------------------------+
-|   2009    |      73        |              5715              |
-+-----------+----------------+--------------------------------+
-
-
-This review is an attempt to catalog and address all the known
-issues with TDB and create solutions which address the problems
-without significantly increasing complexity; all involved are far
-too aware of the dangers of second system syndrome in rewriting a
-successful project like this.
-
-Note: the final decision was to make ntdb a separate library,
-with a separate 'ntdb' namespace so both can potentially be
-linked together. This document still refers to “tdb” everywhere,
-for simplicity.
-
-2 API Issues
-
-2.1 tdb_open_ex Is Not Expandable
-
-The tdb_open() call was expanded to tdb_open_ex(), which added an
-optional hashing function and an optional logging function
-argument. Additional arguments to open would require the
-introduction of a tdb_open_ex2 call etc.
-
-2.1.1 Proposed Solution<attributes>
-
-tdb_open() will take a linked-list of attributes:
-
-enum tdb_attribute {
-
-    TDB_ATTRIBUTE_LOG = 0,
-
-    TDB_ATTRIBUTE_HASH = 1
-
-};
-
-struct tdb_attribute_base {
-
-    enum tdb_attribute attr;
-
-    union tdb_attribute *next;
-
-};
-
-struct tdb_attribute_log {
-
-    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG
-*/
-
-    tdb_log_func log_fn;
-
-    void *log_private;
-
-};
-
-struct tdb_attribute_hash {
-
-    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH
-*/
-
-    tdb_hash_func hash_fn;
-
-    void *hash_private;
-
-};
-
-union tdb_attribute {
-
-    struct tdb_attribute_base base;
-
-    struct tdb_attribute_log log;
-
-    struct tdb_attribute_hash hash;
-
-};
-
-This allows future attributes to be added, even if this expands
-the size of the union.
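-
-As a usage sketch (the exact tdb_open() signature shown here is an
-assumption, not part of the proposal; my_hash and my_log are
-placeholder callbacks), a caller chains the attributes and passes
-the head of the list:
-
-struct tdb_context *tdb;
-union tdb_attribute hash, log;
-
-hash.base.attr = TDB_ATTRIBUTE_HASH;
-hash.base.next = &log;
-hash.hash.hash_fn = my_hash;
-hash.hash.hash_private = NULL;
-
-log.base.attr = TDB_ATTRIBUTE_LOG;
-log.base.next = NULL;
-log.log.log_fn = my_log;
-log.log.log_private = NULL;
-
-tdb = tdb_open("example.tdb", TDB_DEFAULT, O_RDWR|O_CREAT, 0600,
-               &hash);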
-
-2.1.2 Status
-
-Complete.
-
-2.2 tdb_traverse Makes Impossible Guarantees
-
-tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
-and it was thought that it was important to guarantee that all
-records which exist at the start and end of the traversal would
-be included, and no record would be included twice.
-
-This adds complexity (see[Reliable-Traversal-Adds]) and does not
-work anyway for records which are altered (in particular, those
-which are expanded may be effectively deleted and re-added behind
-the traversal).
-
-2.2.1 <traverse-Proposed-Solution>Proposed Solution
-
-Abandon the guarantee. You will see every record if no changes
-occur during your traversal, otherwise you will see some subset.
-You can prevent changes by using a transaction or the locking
-API.
-
-2.2.2 Status
-
-Complete. Delete-during-traverse will still delete every record,
-too (assuming no other changes).
-
-2.3 Nesting of Transactions Is Fraught
-
-TDB has alternated between allowing nested transactions and not
-allowing them. Various paths in the Samba codebase assume that
-transactions will nest, and in a sense they can: the operation is
-only committed to disk when the outer transaction is committed.
-There are two problems, however:
-
-1. Canceling the inner transaction will cause the outer
-  transaction commit to fail, and will not undo any operations
-  since the inner transaction began. This problem is soluble with
-  some additional internal code.
-
-2. An inner transaction commit can be cancelled by the outer
-  transaction. This is desirable in the way which Samba's
-  database initialization code uses transactions, but could be a
-  surprise to any users expecting a successful transaction commit
-  to expose changes to others.
-
-The current solution is to specify the behavior at tdb_open(),
-with the default currently that nested transactions are allowed.
-This flag can also be changed at runtime.
-
-2.3.1 Proposed Solution
-
-Given the usage patterns, it seems that the “least-surprise”
-behavior of disallowing nested transactions should become the
-default. Additionally, it seems the outer transaction is the only
-code which knows whether inner transactions should be allowed, so
-a flag to indicate this could be added to tdb_transaction_start.
-However, this behavior can be simulated with a wrapper which uses
-tdb_add_flags() and tdb_remove_flags(), so the API should not be
-expanded for this relatively-obscure case.
-
-2.3.2 Status
-
-Complete; the nesting flag has been removed.
-
-2.4 Incorrect Hash Function is Not Detected
-
-tdb_open_ex() allows the calling code to specify a different hash
-function to use, but does not check that all other processes
-accessing this tdb are using the same hash function. The result
-is that records are missing from tdb_fetch().
-
-2.4.1 Proposed Solution
-
-The header should contain an example hash result (eg. the hash of
-0xdeadbeef), and tdb_open_ex() should check that the given hash
-function produces the same answer, or fail the tdb_open call.
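-
-A minimal sketch of that check at open time (the hash function
-signature, field and helper names here are illustrative):
-
-uint32_t example = 0xdeadbeef;
-
-if (hash_fn(&example, sizeof(example), hash_private)
-    != hdr->example_hash)
-        return fail_open(TDB_ERR_CORRUPT); /* different hash fn */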
-
-2.4.2 Status
-
-Complete.
-
-2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
-
-In response to scalability issues with the free list
-([TDB-Freelist-Is]) two API workarounds have been incorporated in TDB:
-tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
-latter actually calls the former with an argument of “5”.
-
-This code allows deleted records to accumulate without putting
-them in the free list. On delete we iterate through each chain
-and free them in a batch if there are more than max_dead entries.
-These are never otherwise recycled except as a side-effect of a
-tdb_repack.
-
-2.5.1 Proposed Solution
-
-With the scalability problems of the freelist solved, this API
-can be removed. The TDB_VOLATILE flag may still be useful as a
-hint that store and delete of records will be at least as common
-as fetch in order to allow some internal tuning, but initially
-will become a no-op.
-
-2.5.2 Status
-
-Complete. Unknown flags cause tdb_open() to fail as well, so they
-can be detected at runtime.
-
-2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
-  In The Same Process
-
-No process can open the same TDB twice; we check and disallow it.
-This is an unfortunate side-effect of fcntl locks, which operate
-on a per-file rather than per-file-descriptor basis, and do not
-nest. Thus, closing any file descriptor on a file clears all the
-locks obtained by this process, even if they were placed using a
-different file descriptor!
-
-Note that even if this were solved, deadlock could occur if
-operations were nested: this is a more manageable programming
-error in most cases.
-
-2.6.1 Proposed Solution
-
-We could lobby POSIX to fix the perverse rules, or at least lobby
-Linux to violate them so that the most common implementation does
-not have this restriction. This would be a generally good idea
-for other fcntl lock users.
-
-Samba uses a wrapper which hands out the same tdb_context to
-multiple callers if this happens, and does simple reference
-counting. We should do this inside the tdb library, which already
-emulates lock nesting internally; it would need to recognize when
-deadlock occurs within a single process. This would create a new
-failure mode for tdb operations (while we currently handle
-locking failures, they are impossible in normal use and a process
-encountering them can do little but give up).
-
-I do not see benefit in an additional tdb_open flag to indicate
-whether re-opening is allowed, though there may be some benefit to
-adding a call to detect when a tdb_context is shared, to allow
-others to create such an API.
-
-2.6.2 Status
-
-Complete.
-
-2.7 TDB API Is Not POSIX Thread-safe
-
-The TDB API uses an error code which can be queried after an
-operation to determine what went wrong. This programming model
-does not work with threads, unless specific additional guarantees
-are given by the implementation. In addition, even
-otherwise-independent threads cannot open the same TDB (as in
-[TDB-Files-Cannot]).
-
-2.7.1 Proposed Solution
-
-Rearchitecting the API to include a tdb_errcode pointer would be a
-great deal of churn, but fortunately most functions return 0 on
-success and -1 on error: we can change these to return 0 on
-success and a negative error code on error, and the API remains
-similar to previous. The tdb_fetch, tdb_firstkey and tdb_nextkey
-functions need to take a TDB_DATA pointer and return an error
-code. It is also simpler to have tdb_nextkey replace its key
-argument in place, freeing up any old .dptr.
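-
-Under this proposal a fetch would look something like the sketch
-below (the exact names, such as TDB_SUCCESS for the zero success
-code, are illustrative):
-
-TDB_DATA data;
-enum TDB_ERROR err;
-
-err = tdb_fetch(tdb, key, &data);   /* key prepared earlier */
-if (err != TDB_SUCCESS)
-        return err;   /* negative error code; no tdb_error() call */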
-
-Internal locking is required to make sure that fcntl locks do not
-overlap between threads, and also that the global list of tdbs is
-maintained.
-
-The aim is that building tdb with -DTDB_PTHREAD will result in a
-pthread-safe version of the library, and otherwise no overhead
-will exist. Alternatively, a hooking mechanism similar to that
-proposed for [Proposed-Solution-locking-hook] could be used to
-enable pthread locking at runtime.
-
-2.7.2 Status
-
-Incomplete; API has been changed but thread safety has not been
-implemented.
-
-2.8 *_nonblock Functions And *_mark Functions Expose
-  Implementation
-
-CTDB[footnote:
-Clustered TDB, see http://ctdb.samba.org
-] wishes to operate on TDB in a non-blocking manner. This is
-currently done as follows:
-
-1. Call the _nonblock variant of an API function (eg.
-  tdb_lockall_nonblock). If this fails:
-
-2. Fork a child process, and wait for it to call the normal
-  variant (eg. tdb_lockall).
-
-3. If the child succeeds, call the _mark variant to indicate we
-  already have the locks (eg. tdb_lockall_mark).
-
-4. Upon completion, tell the child to release the locks (eg.
-  tdb_unlockall).
-
-5. Indicate to tdb that it should consider the locks removed (eg.
-  tdb_unlockall_mark).
-
-There are several issues with this approach. Firstly, adding two
-new variants of each function clutters the API for an obscure
-use, and so not all functions have three variants. Secondly, it
-assumes that all paths of the functions ask for the same locks,
-otherwise the parent process will have to get a lock which the
-child doesn't have under some circumstances. I don't believe this
-is currently the case, but it constrains the implementation.
-
-2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
-
-Implement a hook for locking methods, so that the caller can
-control the calls to create and remove fcntl locks. In this
-scenario, ctdbd would operate as follows:
-
-1. Call the normal API function, eg tdb_lockall().
-
-2. When the lock callback comes in, check if the child has the
-  lock. Initially, this is always false. If so, return 0.
-  Otherwise, try to obtain it in non-blocking mode. If that
-  fails, return EWOULDBLOCK.
-
-3. Release locks in the unlock callback as normal.
-
-4. If tdb_lockall() fails, see if we recorded a lock failure; if
-  so, call the child to repeat the operation.
-
-5. The child records what locks it obtains, and returns that
-  information to the parent.
-
-6. When the child has succeeded, goto 1.
-
-This is flexible enough to handle any potential locking scenario,
-even when lock requirements change. It can be optimized so that
-the parent does not release locks, just tells the child which
-locks it doesn't need to obtain.
-
-It also keeps the complexity out of the API, and in ctdbd where
-it is needed.
-
-2.8.2 Status
-
-Complete.
-
-2.9 tdb_chainlock Functions Expose Implementation
-
-tdb_chainlock locks some number of records, including the record
-indicated by the given key. This gave atomicity guarantees;
-no-one can start a transaction, alter, read or delete that key
-while the lock is held.
-
-It also makes the same guarantee for any other key in the chain,
-which is an internal implementation detail and potentially a
-cause for deadlock.
-
-2.9.1 Proposed Solution
-
-None. It would be nice to have an explicit single entry lock
-which affected no other keys. Unfortunately, this won't work for
-an entry which doesn't exist. Thus while chainlock may be
-implemented more efficiently for the existing case, it will still
-have overlap issues with the non-existing case. So it is best to
-keep the current (lack of) guarantee about which records will be
-affected to avoid constraining our implementation.
-
-2.10 Signal Handling is Not Race-Free
-
-The tdb_setalarm_sigptr() call allows the caller's signal handler
-to indicate that the tdb locking code should return with a
-failure, rather than trying again when a signal is received (and
-errno == EAGAIN). This is usually used to implement timeouts.
-
-Unfortunately, this does not work in the case where the signal is
-received before the tdb code enters the fcntl() call to place the
-lock: the code will sleep within the fcntl() code, unaware that
-the signal wants it to exit. In the case of long timeouts, this
-does not happen in practice.
-
-2.10.1 Proposed Solution
-
-The locking hooks proposed in [Proposed-Solution-locking-hook]
-would allow the user to decide on whether to fail the lock
-acquisition on a signal. This allows the caller to choose their
-own compromise: they could narrow the race by checking
-immediately before the fcntl call.[footnote:
-It may be possible to make this race-free in some implementations
-by having the signal handler alter the struct flock to make it
-invalid. This will cause the fcntl() lock call to fail with
-EINVAL if the signal occurs before the kernel is entered,
-otherwise EAGAIN.
-]
-
-2.10.2 Status
-
-Complete.
-
-2.11 The API Uses Gratuitous Typedefs, Capitals
-
-typedefs are useful for providing source compatibility when types
-can differ across implementations, or arguably in the case of
-function pointer definitions which are hard for humans to parse.
-Otherwise it is simply obfuscation and pollutes the namespace.
-
-Capitalization is usually reserved for compile-time constants and
-macros.
-
-  TDB_CONTEXT There is no reason to use this over 'struct
-  tdb_context'; the definition isn't visible to the API user
-  anyway.
-
-  TDB_DATA There is no reason to use this over struct TDB_DATA;
-  the struct needs to be understood by the API user.
-
-  struct TDB_DATA This would normally be called 'struct
-  tdb_data'.
-
-  enum TDB_ERROR Similarly, this would normally be enum
-  tdb_error.
-
-2.11.1 Proposed Solution
-
-None. Introducing lower case variants would please pedants like
-myself, but if it were done the existing ones should be kept.
-There is little point forcing a purely cosmetic change upon tdb
-users.
-
-2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The
-  Private Pointer
-
-For API compatibility reasons, the logging function needs to call
-tdb_get_logging_private() to retrieve the pointer registered by
-the tdb_open_ex for logging.
-
-2.12.1 Proposed Solution
-
-It should simply take an extra argument, since we are prepared to
-break the API/ABI.
-
-2.12.2 Status
-
-Complete.
-
-2.13 Various Callback Functions Are Not Typesafe
-
-The callback functions in tdb_set_logging_function (after
-[tdb_log_func-Doesnt-Take] is resolved), tdb_parse_record,
-tdb_traverse, tdb_traverse_read
-and tdb_check all take void * and must internally convert it to
-the argument type they were expecting.
-
-If this type changes, the compiler will not produce warnings on
-the callers, since it only sees void *.
-
-2.13.1 Proposed Solution
-
-With careful use of macros, we can create callback functions
-which give a warning when used on gcc and the types of the
-callback and its private argument differ. Unsupported compilers
-will not give a warning, which is no worse than now. In addition,
-the callbacks become clearer, as they need not use void * for
-their parameter.
-
-See CCAN's typesafe_cb module at
-http://ccan.ozlabs.org/info/typesafe_cb.html
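-
-The core of the trick is a conditional cast along these lines (a
-simplified sketch of the idea, not the module's exact macros):
-
-#define typesafe_cast(desttype, oktype, expr)                     \
-    __builtin_choose_expr(                                        \
-        __builtin_types_compatible_p(__typeof__(expr), oktype),   \
-        (desttype)(expr), (expr))
-
-A wrapper macro around (say) tdb_traverse() can then cast a
-correctly-typed callback to the generic void * form, while a
-mismatched callback is passed through unchanged and triggers the
-usual compiler warning at the call site.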
-
-2.13.2 Status
-
-Complete.
-
-2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
-  tdb_reopen_all Problematic
-
-The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB
-file should be cleared if the caller discovers it is the only
-process with the TDB open. However, if any caller does not
-specify TDB_CLEAR_IF_FIRST, this will not be detected, and that
-caller will have the TDB erased underneath it (usually resulting
-in a crash).
-
-There is a similar issue on fork(); if the parent exits (or
-otherwise closes the tdb) before the child calls tdb_reopen_all()
-to establish the lock used to indicate the TDB is opened by
-someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe
-it alone has opened the TDB and will erase it.
-
-2.14.1 Proposed Solution
-
-Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
-see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
-
-2.14.2 Status
-
-Complete. An open hook is provided to replicate this
-functionality if required.
-
-2.15 Extending The Header Is Difficult
-
-We have reserved (zeroed) words in the TDB header, which can be
-used for future features. If the future features are compulsory,
-the version number must be updated to prevent old code from
-accessing the database. But if the future feature is optional, we
-have no way of telling if older code is accessing the database or
-not.
-
-2.15.1 Proposed Solution
-
-The header should contain a “format variant” value (64-bit). This
-is divided into two 32-bit parts:
-
-1. The lower part reflects the format variant understood by code
-  accessing the database.
-
-2. The upper part reflects the format variant you must understand
-  to write to the database (otherwise you can only open for
-  reading).
-
-The latter field can only be written at creation time, the former
-should be written under the OPEN_LOCK when opening the database
-for writing, if the variant of the code is lower than the current
-lowest variant.
-
-This should allow backwards-compatible features to be added, and
-detection if older code (which doesn't understand the feature)
-writes to the database.
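-
-A sketch of the open-time check (the field and helper names here
-are illustrative, not the actual header layout):
-
-uint32_t write_needs = hdr->format_variant >> 32;
-uint32_t lowest_seen = hdr->format_variant & 0xFFFFFFFF;
-
-if (MY_FORMAT_VARIANT < write_needs)
-        read_only = true;  /* may read, must not write */
-else if (opening_rw && MY_FORMAT_VARIANT < lowest_seen)
-        set_lowest_variant(MY_FORMAT_VARIANT); /* under OPEN_LOCK */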
-
-2.15.2 Status
-
-Complete.
-
-2.16 Record Headers Are Not Expandible
-
-If we later want to add (say) checksums on keys and data, it
-would require another format change, which we'd like to avoid.
-
-2.16.1 Proposed Solution
-
-We often have extra padding at the tail of a record. If we ensure
-that the first byte (if any) of this padding is zero, we will
-have a way for future changes to detect code which doesn't
-understand a new format: the new code would write (say) a 1 at
-the tail, and thus if there is no tail or the first byte is 0, we
-would know the extension is not present on that record.
-
-2.16.2 Status
-
-Complete.
-
-2.17 TDB Does Not Use Talloc
-
-Many users of TDB (particularly Samba) use the talloc allocator,
-and thus have to wrap TDB in a talloc context to use it
-conveniently.
-
-2.17.1 Proposed Solution
-
-The allocation within TDB is not complicated enough to justify
-the use of talloc, and I am reluctant to force another
-(excellent) library on TDB users. Nonetheless a compromise is
-possible. An attribute (see [attributes]) can be added later to
-tdb_open() to provide an alternate allocation mechanism,
-specifically for talloc but usable by any other allocator (which
-would ignore the “context” argument).
-
-This would form a talloc hierarchy as expected, but the caller
-would still have to attach a destructor to the tdb context
-returned from tdb_open to close it. All TDB_DATA fields would be
-children of the tdb_context, and the caller would still have to
-manage them (using talloc_free() or talloc_steal()).
-
-2.17.2 Status
-
-Complete, using the NTDB_ATTRIBUTE_ALLOCATOR attribute.
-
-3 Performance And Scalability Issues
-
-3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
-  Imposes Performance Penalty
-
-When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is
-placed at offset 4 (aka. the ACTIVE_LOCK). While these locks
-never conflict in normal tdb usage, they do add substantial
-overhead for most fcntl lock implementations when the kernel
-scans to detect if a lock conflict exists. This is often a single
-linked list, making the time to acquire and release a fcntl lock
-O(N) where N is the number of processes with the TDB open, not
-the number actually doing work.
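-
-For illustration, the ACTIVE_LOCK is an ordinary fcntl read lock;
-a minimal sketch of placing it (not the tdb source, and the
-function name is made up) looks like this:
-
-#include <fcntl.h>
-
-static int grab_active_lock(int fd)
-{
-    struct flock fl;
-
-    fl.l_type = F_RDLCK;   /* read lock: all openers share it */
-    fl.l_whence = SEEK_SET;
-    fl.l_start = 4;        /* the ACTIVE_LOCK offset */
-    fl.l_len = 1;          /* a single byte */
-
-    /* the kernel scans its lock list here: O(N) in openers */
-    return fcntl(fd, F_SETLKW, &fl);
-}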
-
-In a Samba server it is common to have huge numbers of clients
-sitting idle, and thus they have weaned themselves off the
-TDB_CLEAR_IF_FIRST flag.[footnote:
-There is a flag to tdb_reopen_all() which is used for this
-optimization: if the parent process will outlive the child, the
-child does not need the ACTIVE_LOCK. This is a workaround for
-this very performance issue.
-]
-
-3.1.1 Proposed Solution
-
-Remove the flag. It was a neat idea, but even trivial servers
-tend to know when they are initializing for the first time and
-can simply unlink the old tdb at that point.
-
-3.1.2 Status
-
-Complete.
-
-3.2 TDB Files Have a 4G Limit
-
-This seems to be becoming an issue (so much for "trivial"!),
-particularly for ldb.
-
-3.2.1 Proposed Solution
-
-A new, incompatible TDB format which uses 64 bit offsets
-internally rather than 32 bit as now. For simplicity of endian
-conversion (which TDB does on the fly if required), all values
-will be 64 bit on disk. In practice, some upper bits may be used
-for other purposes, but at least 56 bits will be available for
-file offsets.
-
-tdb_open() will automatically detect the old version, and even
-create them if TDB_VERSION6 is specified to tdb_open.
-
-32 bit processes will still be able to access TDBs larger than 4G
-(assuming that their off_t allows them to seek to 64 bits), they
-will gracefully fall back as they fail to mmap. This can happen
-already with large TDBs.
-
-Old versions of tdb will fail to open the new TDB files (since 28
-August 2009, commit 398d0c29290: prior to that any unrecognized
-file format would be erased and initialized as a fresh tdb!)
-
-3.2.2 Status
-
-Complete.
-
-3.3 TDB Records Have a 4G Limit
-
-This has not been a reported problem, and the API uses size_t
-which can be 64 bit on 64 bit platforms. However, other limits
-may have made such an issue moot.
-
-3.3.1 Proposed Solution
-
-Record sizes will be 64 bit, with an error returned on 32 bit
-platforms which try to access such records (the current
-implementation would return TDB_ERR_OOM in a similar case). It
-seems unlikely that 32 bit keys will be a limitation, so the
-implementation may not support this (see [sub:Records-Incur-A]).
-
-3.3.2 Status
-
-Complete.
-
-3.4 Hash Size Is Determined At TDB Creation Time
-
-TDB contains a number of hash chains in the header; the number is
-specified at creation time, and defaults to 131. This is such a
-bottleneck on large databases (as each hash chain gets quite
-long), that LDB uses 10,000 for this hash. In general it is
-impossible to know what the 'right' answer is at database
-creation time.
-
-3.4.1 <sub:Hash-Size-Solution>Proposed Solution
-
-After comprehensive performance testing on various scalable hash
-variants[footnote:
-http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94
-This was annoying because I was previously convinced that an
-expanding tree of hashes would be very close to optimal.
-], it became clear that it is hard to beat a straight linear hash
-table which doubles in size when it reaches saturation.
-Unfortunately, altering the hash table introduces serious locking
-complications: the entire hash table needs to be locked to
-enlarge the hash table, and others might be holding locks.
-Particularly insidious are insertions done under tdb_chainlock.
-
-Thus an expanding layered hash will be used: an array of hash
-groups, with each hash group exploding into pointers to lower
-hash groups once it fills, turning into a hash tree. This has
-implications for locking: we must lock the entire group in case
-we need to expand it, yet we don't know how deep the tree is at
-that point.
-
-Note that bits from the hash table entries should be stolen to
-hold more hash bits to reduce the penalty of collisions. We can
-use the otherwise-unused lower 3 bits. If we limit the size of
-the database to 64 exabytes, we can use the top 8 bits of the
-hash entry as well. These 11 bits would reduce false positives
-down to 1 in 2000 which is more than we need: we can use one of
-the bits to indicate that the extra hash bits are valid. This
-means we can choose not to re-hash all entries when we expand a
-hash group; simply use the next bits we need and mark them
-invalid.
-
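-As a rough illustration of the bit-stealing idea (names and field
-widths are for exposition only, and the “extra bits valid” flag is
-omitted; the final layout differs):
-
-#include <stdint.h>
-#include <stdbool.h>
-
-#define EXTRA_TOP_BITS 8   /* top bits of a <= 2^56-byte offset */
-#define EXTRA_LOW_BITS 3   /* low bits of an 8-byte-aligned offset */
-#define OFF_MASK 0x00fffffffffffff8ULL
-
-static uint64_t encode_hash_entry(uint64_t off, uint32_t hash)
-{
-        uint64_t top = hash >> (32 - EXTRA_TOP_BITS);
-        uint64_t low = hash & ((1 << EXTRA_LOW_BITS) - 1);
-
-        return (top << 56) | (off & OFF_MASK) | low;
-}
-
-/* Cheap pre-check before reading the record and comparing keys. */
-static bool hash_entry_may_match(uint64_t entry, uint32_t hash)
-{
-        return (entry >> 56) == (hash >> (32 - EXTRA_TOP_BITS))
-                && (entry & ((1 << EXTRA_LOW_BITS) - 1))
-                   == (hash & ((1 << EXTRA_LOW_BITS) - 1));
-}
-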
-3.4.2 Status
-
-Ignore. Scaling the hash automatically proved inefficient at
-small hash sizes; we default to an 8192-element hash (changeable
-via NTDB_ATTRIBUTE_HASHSIZE), and when buckets clash we expand to
-an array of hash entries. This scales slightly better than the
-tdb chain (due to the 8 top bits containing extra hash).
-
-3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
-
-TDB uses a single linked list for the free list. Allocation
-occurs as follows, using heuristics which have evolved over time:
-
-1. Get the free list lock for this whole operation.
-
-2. Multiply length by 1.25, so we always over-allocate by 25%.
-
-3. Set the slack multiplier to 1.
-
-4. Examine the current freelist entry: if it is > length but <
-  the current best case, remember it as the best case.
-
-5. Multiply the slack multiplier by 1.05.
-
-6. If our best fit so far is less than length * slack multiplier,
-  return it. The slack will be turned into a new free record if
-  it's large enough.
-
-7. Otherwise, go onto the next freelist entry.
-
-Deleting a record occurs as follows:
-
-1. Lock the hash chain for this whole operation.
-
-2. Walk the chain to find the record, keeping the prev pointer
-  offset.
-
-3. If max_dead is non-zero:
-
-  (a) Walk the hash chain again and count the dead records.
-
-  (b) If it's more than max_dead, bulk free all the dead ones
-    (similar to steps 4 and below, but the lock is only obtained
-    once).
-
-  (c) Simply mark this record as dead and return.
-
-4. Get the free list lock for the remainder of this operation.
-
-5. <right-merging>Examine the following block to see if it is
-  free; if so, enlarge the current block and remove that block
-  from the free list. This was disabled, as removal from the free
-  list was O(entries-in-free-list).
-
-6. Examine the preceding block to see if it is free: for this
-  reason, each block has a 32-bit tailer which indicates its
-  length. If it is free, expand it to cover our new block and
-  return.
-
-7. Otherwise, prepend ourselves to the free list.
-
-Disabling right-merging (step [right-merging]) causes
-fragmentation; the other heuristics proved insufficient to
-address this, so the final answer to this was that when we expand
-the TDB file inside a transaction commit, we repack the entire
-tdb.
-
-The single list lock limits our allocation rate; due to the other
-issues this is not currently seen as a bottleneck.
-
-3.5.1 Proposed Solution
-
-The first step is to remove all the current heuristics, as they
-obviously interact, then examine them once the lock contention is
-addressed.
-
-The free list must be split to reduce contention. Assuming
-perfect free merging, we can at most have 1 free list entry for
-each entry. This implies that the number of free lists is related
-to the size of the hash table, but as it is rare to walk a large
-number of free list entries we can use far fewer, say 1/32 of the
-number of hash buckets.
-
-It seems tempting to try to reuse the hash implementation which
-we use for records here, but we have two ways of searching for
-free entries: for allocation we search by size (and possibly
-zone) which produces too many clashes for our hash table to
-handle well, and for coalescing we search by address. Thus an
-array of doubly-linked free lists seems preferable.
-
-There are various benefits in using per-size free lists (see
-[sub:TDB-Becomes-Fragmented]), but it's not clear this would
-reduce contention in the common case where all processes are
-allocating/freeing the same size.
-Thus we almost certainly need to divide in other ways: the most
-obvious is to divide the file into zones, and using a free list
-(or table of free lists) for each. This approximates address
-ordering.
-
-Unfortunately it is difficult to know what heuristics should be
-used to determine zone sizes, and our transaction code relies on
-being able to create a “recovery area” by simply appending to the
-file (difficult if it would need to create a new zone header).
-Thus we use a linked-list of free tables; currently we only ever
-create one, but if there is more than one we choose one at random
-to use. In future we may use heuristics to add new free tables on
-contention. We only expand the file when all free tables are
-exhausted.
-
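-For illustration, a free table can be chosen at random in a single
-pass over the linked list, in the style of ntdb_ftable_init() in
-free.c below (read_next_table() is a stand-in accessor, not the
-real API):
-
-#include <stdint.h>
-#include <stdlib.h>
-
-static uint64_t choose_free_table(uint64_t first_off,
-                                  uint64_t (*read_next_table)(uint64_t))
-{
-        uint64_t off, chosen = first_off;
-        unsigned int best = 0;
-
-        for (off = first_off; off; off = read_next_table(off)) {
-                unsigned int rnd = random();
-                /* Keep the entry which drew the largest random number. */
-                if (rnd >= best) {
-                        chosen = off;
-                        best = rnd;
-                }
-        }
-        return chosen;
-}
-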
-The basic algorithm is as follows. Freeing is simple:
-
-1. Identify the correct free list.
-
-2. Lock the corresponding list.
-
-3. Re-check the list (we didn't have a lock, sizes could have
-  changed): relock if necessary.
-
-4. Place the freed entry in the list.
-
-Allocation is a little more complicated, as we perform delayed
-coalescing at this point:
-
-1. Pick a free table; usually the previous one.
-
-2. Lock the corresponding list.
-
-3. If the top entry is large enough, remove it from the list and
-  return it.
-
-4. Otherwise, coalesce entries in the list. If there was no entry
-  large enough, unlock the list and try the next largest list.
-
-5. If no list has an entry which meets our needs, try the next
-  free table.
-
-6. If no zone satisfies, expand the file.
-
-This optimizes rapid insert/delete of free list entries by not
-coalescing them all the time. First-fit address ordering seems to
-be fairly good for keeping fragmentation low (see
-[sub:TDB-Becomes-Fragmented]). Note that address ordering does not
-need a tailer to coalesce, though if we needed one we could have
-one cheaply: see [sub:Records-Incur-A].
-
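-As a toy model of the search order just described (in-memory only,
-with illustrative names; the real walk is get_free() in free.c
-below):
-
-#include <stddef.h>
-
-#define NBUCKETS 8
-#define NTABLES  2
-
-/* Size of the entry at the head of each bucket's list; 0 = empty. */
-struct toy_ftable {
-        size_t head_size[NBUCKETS];
-};
-
-static size_t toy_bucket(size_t size)
-{
-        size_t b = 0;
-        while (b + 1 < NBUCKETS && (8u << b) < size)
-                b++;
-        return b;
-}
-
-/* Returns the table index used, or -1 meaning "expand the file". */
-static int toy_alloc(const struct toy_ftable tbl[NTABLES], size_t size,
-                     size_t *bucket)
-{
-        size_t b, start = toy_bucket(size);
-        int t;
-
-        for (t = 0; t < NTABLES; t++) {
-                for (b = start; b < NBUCKETS; b++) {
-                        if (tbl[t].head_size[b] >= size) {
-                                *bucket = b;  /* steps 2-3: lock it, take it */
-                                return t;
-                        }
-                        /* step 4 would coalesce here before moving on */
-                }
-        }
-        return -1;                            /* step 6: expand the file */
-}
-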
-Each free entry has the free table number in its header (always
-less than 255). It also contains doubly-linked list pointers for
-easy deletion.
-
-3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
-
-Much of this is a result of allocation strategy[footnote:
-The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
-ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
-] and deliberate hobbling of coalescing; internal fragmentation
-(aka overallocation) is deliberately set at 25%, and external
-fragmentation is only cured by the decision to repack the entire
-db when a transaction commit needs to enlarge the file.
-
-3.6.1 Proposed Solution
-
-The 25% overhead on allocation works in practice for ldb because
-indexes tend to expand by one record at a time. This internal
-fragmentation can be resolved by having an “expanded” bit in the
-header to note entries that have previously expanded, and
-allocating more space for them.
-
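-A minimal sketch of the idea (the flag name is illustrative; the
-50% figure mirrors the “growing” path in free.c below):
-
-#include <stddef.h>
-#include <stdbool.h>
-
-static size_t size_to_allocate(size_t keylen, size_t datalen,
-                               bool grew_before)
-{
-        /* 'grew_before' would come from an "expanded" bit in the
-         * record header. */
-        if (grew_before)
-                datalen += datalen / 2; /* leave slack for the next growth */
-        return keylen + datalen;  /* real code also rounds up and adds
-                                     the record header */
-}
-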
-There is a spectrum of possible solutions for external
-fragmentation: one end is to use a fragmentation-avoiding
-allocation strategy such as a best-fit address-order allocator.
-The other end
-of the spectrum would be to use a bump allocator (very fast and
-simple) and simply repack the file when we reach the end.
-
-There are three problems with efficient fragmentation-avoiding
-allocators: they are non-trivial, they tend to use a single free
-list for each size, and there's no evidence that tdb allocation
-patterns will match those recorded for general allocators (though
-it seems likely).
-
-Thus we don't spend too much effort on external fragmentation; we
-will be no worse than the current code if we need to repack on
-occasion. More effort is spent on reducing freelist contention,
-and reducing overhead.
-
-3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
-
-Each TDB record has a header as follows:
-
-struct tdb_record {
-        tdb_off_t next; /* offset of the next record in the list */
-        tdb_len_t rec_len; /* total byte length of record */
-        tdb_len_t key_len; /* byte length of key */
-        tdb_len_t data_len; /* byte length of data */
-        uint32_t full_hash; /* the full 32 bit hash of the key */
-        uint32_t magic;   /* try to catch errors */
-        /* the following union is implied:
-                union {
-                        char record[rec_len];
-                        struct {
-                                char key[key_len];
-                                char data[data_len];
-                        }
-                        uint32_t totalsize; (tailer)
-                }
-        */
-};
-
-Naively, this would double to a 56-byte overhead on a 64 bit
-implementation.
-
-3.7.1 Proposed Solution
-
-We can use various techniques to reduce this for an allocated
-block:
-
-1. The 'next' pointer is not required, as we are using a flat
-  hash table.
-
-2. 'rec_len' can instead be expressed as an addition to key_len
-  and data_len (it accounts for wasted or overallocated length in
-  the record). Since the record length is always a multiple of 8,
-  we can conveniently fit it in 32 bits (representing up to 35
-  bits).
-
-3. 'key_len' and 'data_len' can be reduced. I'm unwilling to
-  restrict 'data_len' to 32 bits, but instead we can combine the
-  two into one 64-bit field, using a 5-bit value which indicates
-  at what bit to divide the two (sketched in code after the
-  structures below). Keys are unlikely to scale as fast as data,
-  so I'm assuming a maximum key size of 32 bits.
-
-4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
-  this gives diminishing returns after a handful of bits (at 10
-  bits, it eliminates 99.9% of false memcmps). As an aside, as the
-  lower bits are already incorporated in the hash table
-  resolution, the upper bits should be used here. Note that it's
-  not clear that these bits will be a win, given the extra bits
-  in the hash table itself (see [sub:Hash-Size-Solution]).
-
-5. 'magic' does not need to be enlarged: it currently reflects
-  one of 5 values (used, free, dead, recovery, and
-  unused_recovery). It is useful for quick sanity checking
-  however, and should not be eliminated.
-
-6. 'tailer' is only used to coalesce free blocks (so a block to
-  the right can find the header to check if this block is free).
-  This can be replaced by a single 'free' bit in the header of
-  the following block (and the tailer only exists in free
-  blocks).[footnote:
-This technique from Thomas Standish. Data Structure Techniques.
-Addison-Wesley, Reading, Massachusetts, 1980.
-] The current proposed coalescing algorithm doesn't need this,
-  however.
-
-This produces a 16 byte used header like this:
-
-struct tdb_used_record {
-        uint32_t used_magic : 16,
-                 key_data_divide: 5,
-                 top_hash: 11;
-        uint32_t extra_octets;
-        uint64_t key_and_data_len;
-};
-
-And a free record like this:
-
-struct tdb_free_record {
-        uint64_t free_magic: 8,
-                 prev : 56;
-        uint64_t free_table: 8,
-                 total_length : 56;
-        uint64_t next;
-};
-
-Note that by limiting valid offsets to 56 bits, we can pack
-everything we need into 3 64-bit words, meaning our minimum
-record size is 8 bytes.
-
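-As a sketch of that length packing (illustrative only; the shipped
-encoding is set_header() in free.c below, which stores the divide
-value slightly differently):
-
-#include <stdint.h>
-
-struct packed_len {
-        uint8_t keybits;            /* the 5-bit divide value */
-        uint64_t key_and_data_len;  /* key in the low 2*keybits bits */
-};
-
-static struct packed_len pack_lengths(uint64_t keylen, uint64_t datalen)
-{
-        struct packed_len p;
-        uint8_t keybits = 0;
-
-        /* Smallest even number of bits which can hold the key length. */
-        while (keybits < 31 && (keylen >> (keybits * 2)))
-                keybits++;
-
-        p.keybits = keybits;
-        p.key_and_data_len = keylen | (datalen << (keybits * 2));
-        /* A real encoder must check that decoding round-trips, since a
-         * huge data length can overflow the remaining bits. */
-        return p;
-}
-
-static uint64_t unpack_keylen(const struct packed_len *p)
-{
-        return p->key_and_data_len & ((1ULL << (p->keybits * 2)) - 1);
-}
-
-static uint64_t unpack_datalen(const struct packed_len *p)
-{
-        return p->key_and_data_len >> (p->keybits * 2);
-}
-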
-3.7.2 Status
-
-Complete.
-
-3.8 Transaction Commit Requires 4 fdatasync
-
-The current transaction algorithm is:
-
-1. write_recovery_data();
-
-2. sync();
-
-3. write_recovery_header();
-
-4. sync();
-
-5. overwrite_with_new_data();
-
-6. sync();
-
-7. remove_recovery_header();
-
-8. sync();
-
-On current ext3, each sync flushes all data to disk, so the next
-3 syncs are relatively cheap. But this could become a
-performance bottleneck on other filesystems such as ext4.
-
-3.8.1 Proposed Solution
-
-Neil Brown points out that this is overzealous, and only one sync
-is needed:
-
-1. Bundle the recovery data, a transaction counter and a strong
-  checksum of the new data.
-
-2. Strong checksum that whole bundle.
-
-3. Store the bundle in the database.
-
-4. Overwrite the oldest of the two recovery pointers in the
-  header (identified using the transaction counter) with the
-  offset of this bundle.
-
-5. sync.
-
-6. Write the new data to the file.
-
-Checking for recovery means identifying the latest bundle with a
-valid checksum and using the new data checksum to ensure that it
-has been applied. This is more expensive than the current check,
-but need only be done at open. For running databases, a separate
-header field can be used to indicate a transaction in progress;
-we need only check for recovery if this is set.
-
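-A sketch of that ordering in code (the ops structure is purely
-illustrative scaffolding, not the ntdb transaction API):
-
-#include <stdint.h>
-#include <stddef.h>
-
-struct bundle_header {
-        uint64_t txn_counter;   /* identifies the newest bundle */
-        uint64_t new_data_csum; /* strong checksum of the data to come */
-        uint64_t bundle_csum;   /* strong checksum over the bundle */
-};
-
-struct commit_ops {
-        uint64_t (*checksum)(const void *p, size_t len);
-        uint64_t (*write_bundle)(const struct bundle_header *h,
-                                 const void *recovery, size_t len);
-        void (*write_recovery_pointer)(unsigned int slot, uint64_t off);
-        void (*sync)(void);
-        void (*overwrite)(const void *new_data, size_t len);
-};
-
-static void single_sync_commit(const struct commit_ops *ops,
-                               const void *recovery, size_t rec_len,
-                               const void *new_data, size_t new_len,
-                               uint64_t txn_counter)
-{
-        struct bundle_header hdr;
-        uint64_t off;
-
-        /* 1-2: bundle recovery data, counter and checksums (a real
-         * implementation would checksum the serialized bundle). */
-        hdr.txn_counter = txn_counter;
-        hdr.new_data_csum = ops->checksum(new_data, new_len);
-        hdr.bundle_csum = ops->checksum(recovery, rec_len)
-                ^ hdr.new_data_csum;
-
-        /* 3: store the bundle in the database. */
-        off = ops->write_bundle(&hdr, recovery, rec_len);
-        /* 4: overwrite the older of the two recovery pointers. */
-        ops->write_recovery_pointer((unsigned int)(txn_counter & 1), off);
-        /* 5: the only sync. */
-        ops->sync();
-        /* 6: write the new data into place. */
-        ops->overwrite(new_data, new_len);
-}
-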
-3.8.2 Status
-
-Deferred.
-
-3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
-
-3.9.1 Proposed Solution
-
-None. At some point you say “use a real database” (but see
-[replay-attribute]).
-
-But as a thought experiment: if we implemented transactions to
-only overwrite free entries (this is tricky: entries could not
-carry a header bit saying whether they are free; that would have
-to be recorded in metadata elsewhere), and kept a pointer to the
-hash table, we could create an entirely new commit without destroying
-existing data. Then it would be easy to implement snapshots in a
-similar way.
-
-This would not allow arbitrary changes to the database, such as
-tdb_repack does, and would require more space (since we have to
-preserve the current and future entries at once). If we used hash
-trees rather than one big hash table, we might only have to
-rewrite some sections of the hash, too.
-
-We could then implement snapshots using a similar method, using
-multiple different hash tables/free tables.
-
-3.9.2 Status
-
-Deferred.
-
-3.10 Transactions Cannot Operate in Parallel
-
-This would be useless for ldb, as it hits the index records with
-just about every update. It would add significant complexity in
-resolving clashes, and would require all transaction callers to
-write their code to loop in the case where transactions
-spuriously fail.
-
-3.10.1 Proposed Solution
-
-None (but see[replay-attribute]). We could solve a small part of
-the problem by providing read-only transactions. These would
-allow one write transaction to begin, but it could not commit
-until all r/o transactions are done. This would require a new
-RO_TRANSACTION_LOCK, which would be upgraded on commit.
-
-3.10.2 Status
-
-Deferred.
-
-3.11 Default Hash Function Is Suboptimal
-
-The Knuth-inspired multiplicative hash used by tdb is fairly slow
-(especially if we expand it to 64 bits), and works best when the
-hash bucket size is a prime number (which also means a slow
-modulus). In addition, it is highly predictable, which could
-potentially lead to a Denial of Service attack in some TDB uses.
-
-3.11.1 Proposed Solution
-
-The Jenkins lookup3 hash[footnote:
-http://burtleburtle.net/bob/c/lookup3.c
-] is a fast and superbly-mixing hash. It's used by the Linux
-kernel and almost everything else. This has the particular
-properties that it takes an initial seed, and produces two 32 bit
-hash numbers, which we can combine into a 64-bit hash.
-
-The seed should be created at tdb-creation time from some random
-source, and placed in the header. This is far from foolproof, but
-adds a little bit of protection against hash bombing.
-
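-For example, in the style of the default hash in hash.c below,
-which uses ccan/hash's hash_stable():
-
-#include <stdint.h>
-#include <stddef.h>
-#include <ccan/hash/hash.h>
-
-/* 'seed' is generated randomly at creation and kept in the header. */
-static uint32_t seeded_hash(const void *key, size_t len, uint32_t seed)
-{
-        return hash_stable((const unsigned char *)key, len, seed);
-}
-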
-3.11.2 Status
-
-Complete.
-
-3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
-
-We lock a record during traversal iteration, and try to grab that
-lock in the delete code. If that grab on delete fails, we simply
-mark it deleted and continue onwards; traversal checks for this
-condition and does the delete when it moves off the record.
-
-If traversal terminates, the dead record may be left
-indefinitely.
-
-3.12.1 Proposed Solution
-
-Remove reliability guarantees; see [traverse-Proposed-Solution].
-
-3.12.2 Status
-
-Complete.
-
-3.13 Fcntl Locking Adds Overhead
-
-Placing a fcntl lock means a system call, as does removing one.
-This is actually one reason why transactions can be faster
-(everything is locked once at transaction start). In the
-uncontended case, this overhead can theoretically be eliminated.
-
-3.13.1 Proposed Solution
-
-None.
-
-We tried this before with spinlock support, in the early days of
-TDB, and it didn't make much difference except in manufactured
-benchmarks.
-
-We could use spinlocks (with futex kernel support under Linux),
-but it means that we lose automatic cleanup when a process dies
-with a lock. There is a method of auto-cleanup under Linux, but
-it's not supported by other operating systems. We could
-reintroduce a clear-if-first-style lock and sweep for dead
-futexes on open, but that wouldn't help the normal case of one
-concurrent opener dying. Increasingly elaborate repair schemes
-could be considered, but they require an ABI change (everyone
-must use them) anyway, so there's no need to do this at the same
-time as everything else.
-
-3.14 Some Transactions Don't Require Durability
-
-Volker points out that gencache uses a CLEAR_IF_FIRST tdb for
-normal (fast) usage, and occasionally empties the results into a
-transactional TDB. This kind of usage prioritizes performance
-over durability: as long as we are consistent, data can be lost.
-
-This would be more neatly implemented inside tdb: a “soft”
-transaction commit (i.e. syncless) which means that data may be
-reverted on a crash.
-
-3.14.1 Proposed Solution
-
-None.
-
-Unfortunately any transaction scheme which overwrites old data
-requires a sync before that overwrite to avoid the possibility of
-corruption.
-
-It seems possible to use a scheme similar to that described in
-[sub:TDB-Does-Not], where transactions are committed without
-overwriting existing data, and an array of top-level pointers is
-available in the header. If the transaction is “soft” then we
-would not need a sync at all: existing processes would pick up
-the new hash table and
-free list and work with that.
-
-At some later point, a sync would allow recovery of the old data
-into the free lists (perhaps when the array of top-level pointers
-filled). On crash, tdb_open() would examine the array of top
-levels, and apply the transactions until it encountered an
-invalid checksum.
-
-3.15 Tracing Is Fragile, Replay Is External
-
-The current TDB has compile-time-enabled tracing code, but it
-often breaks as it is not enabled by default. In a similar way,
-the ctdb code has an external wrapper which does replay tracing
-so it can coordinate cluster-wide transactions.
-
-3.15.1 Proposed Solution<replay-attribute>
-
-Tridge points out that an attribute can be later added to
-tdb_open (see [attributes]) to provide replay/trace hooks, which
-could become the basis for this and future parallel transactions
-and snapshot support.
-
-3.15.2 Status
-
-Deferred.
diff --git a/ccan/ntdb/free.c b/ccan/ntdb/free.c
deleted file mode 100644 (file)
index 0d0e25f..0000000
+++ /dev/null
@@ -1,972 +0,0 @@
- /*
-   Trivial Database 2: free list/block handling
-   Copyright (C) Rusty Russell 2010
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <ccan/likely/likely.h>
-#include <ccan/ilog/ilog.h>
-#include <time.h>
-#include <limits.h>
-
-static unsigned fls64(uint64_t val)
-{
-       return ilog64(val);
-}
-
-/* In which bucket would we find a particular record size? (ignoring header) */
-unsigned int size_to_bucket(ntdb_len_t data_len)
-{
-       unsigned int bucket;
-
-       /* We can't have records smaller than this. */
-       assert(data_len >= NTDB_MIN_DATA_LEN);
-
-       /* Ignoring the header... */
-       if (data_len - NTDB_MIN_DATA_LEN <= 64) {
-               /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
-               bucket = (data_len - NTDB_MIN_DATA_LEN) / 8;
-       } else {
-               /* After that we go power of 2. */
-               bucket = fls64(data_len - NTDB_MIN_DATA_LEN) + 2;
-       }
-
-       if (unlikely(bucket >= NTDB_FREE_BUCKETS))
-               bucket = NTDB_FREE_BUCKETS - 1;
-       return bucket;
-}
-
-ntdb_off_t first_ftable(struct ntdb_context *ntdb)
-{
-       return ntdb_read_off(ntdb, offsetof(struct ntdb_header, free_table));
-}
-
-ntdb_off_t next_ftable(struct ntdb_context *ntdb, ntdb_off_t ftable)
-{
-       return ntdb_read_off(ntdb, ftable + offsetof(struct ntdb_freetable,next));
-}
-
-enum NTDB_ERROR ntdb_ftable_init(struct ntdb_context *ntdb)
-{
-       /* Use reservoir sampling algorithm to select a free list at random. */
-       unsigned int rnd, max = 0, count = 0;
-       ntdb_off_t off;
-
-       ntdb->ftable_off = off = first_ftable(ntdb);
-       ntdb->ftable = 0;
-
-       while (off) {
-               if (NTDB_OFF_IS_ERR(off)) {
-                       return NTDB_OFF_TO_ERR(off);
-               }
-
-               rnd = random();
-               if (rnd >= max) {
-                       ntdb->ftable_off = off;
-                       ntdb->ftable = count;
-                       max = rnd;
-               }
-
-               off = next_ftable(ntdb, off);
-               count++;
-       }
-       return NTDB_SUCCESS;
-}
-
-/* Offset of a given bucket. */
-ntdb_off_t bucket_off(ntdb_off_t ftable_off, unsigned bucket)
-{
-       return ftable_off + offsetof(struct ntdb_freetable, buckets)
-               + bucket * sizeof(ntdb_off_t);
-}
-
-/* Returns free_buckets + 1, or list number to search, or -ve error. */
-static ntdb_off_t find_free_head(struct ntdb_context *ntdb,
-                               ntdb_off_t ftable_off,
-                               ntdb_off_t bucket)
-{
-       /* Speculatively search for a non-zero bucket. */
-       return ntdb_find_nonzero_off(ntdb, bucket_off(ftable_off, 0),
-                                   bucket, NTDB_FREE_BUCKETS);
-}
-
-static void check_list(struct ntdb_context *ntdb, ntdb_off_t b_off)
-{
-#ifdef CCAN_NTDB_DEBUG
-       ntdb_off_t off, prev = 0, first;
-       struct ntdb_free_record r;
-
-       first = off = (ntdb_read_off(ntdb, b_off) & NTDB_OFF_MASK);
-       while (off != 0) {
-               ntdb_read_convert(ntdb, off, &r, sizeof(r));
-               if (frec_magic(&r) != NTDB_FREE_MAGIC)
-                       abort();
-               if (prev && frec_prev(&r) != prev)
-                       abort();
-               prev = off;
-               off = r.next;
-       }
-
-       if (first) {
-               ntdb_read_convert(ntdb, first, &r, sizeof(r));
-               if (frec_prev(&r) != prev)
-                       abort();
-       }
-#endif
-}
-
-/* Remove from free bucket. */
-static enum NTDB_ERROR remove_from_list(struct ntdb_context *ntdb,
-                                      ntdb_off_t b_off, ntdb_off_t r_off,
-                                      const struct ntdb_free_record *r)
-{
-       ntdb_off_t off, prev_next, head;
-       enum NTDB_ERROR ecode;
-
-       /* Is this only element in list?  Zero out bucket, and we're done. */
-       if (frec_prev(r) == r_off)
-               return ntdb_write_off(ntdb, b_off, 0);
-
-       /* off = &r->prev->next */
-       off = frec_prev(r) + offsetof(struct ntdb_free_record, next);
-
-       /* Get prev->next */
-       prev_next = ntdb_read_off(ntdb, off);
-       if (NTDB_OFF_IS_ERR(prev_next))
-               return NTDB_OFF_TO_ERR(prev_next);
-
-       /* If prev->next == 0, we were head: update bucket to point to next. */
-       if (prev_next == 0) {
-               /* We must preserve upper bits. */
-               head = ntdb_read_off(ntdb, b_off);
-               if (NTDB_OFF_IS_ERR(head))
-                       return NTDB_OFF_TO_ERR(head);
-
-               if ((head & NTDB_OFF_MASK) != r_off) {
-                       return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                         "remove_from_list:"
-                                         " %llu head %llu on list %llu",
-                                         (long long)r_off,
-                                         (long long)head,
-                                         (long long)b_off);
-               }
-               head = ((head & ~NTDB_OFF_MASK) | r->next);
-               ecode = ntdb_write_off(ntdb, b_off, head);
-               if (ecode != NTDB_SUCCESS)
-                       return ecode;
-       } else {
-               /* r->prev->next = r->next */
-               ecode = ntdb_write_off(ntdb, off, r->next);
-               if (ecode != NTDB_SUCCESS)
-                       return ecode;
-       }
-
-       /* If we were the tail, off = &head->prev. */
-       if (r->next == 0) {
-               head = ntdb_read_off(ntdb, b_off);
-               if (NTDB_OFF_IS_ERR(head))
-                       return NTDB_OFF_TO_ERR(head);
-               head &= NTDB_OFF_MASK;
-               off = head + offsetof(struct ntdb_free_record, magic_and_prev);
-       } else {
-               /* off = &r->next->prev */
-               off = r->next + offsetof(struct ntdb_free_record,
-                                        magic_and_prev);
-       }
-
-#ifdef CCAN_NTDB_DEBUG
-       /* *off == r */
-       if ((ntdb_read_off(ntdb, off) & NTDB_OFF_MASK) != r_off) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                 "remove_from_list:"
-                                 " %llu bad prev in list %llu",
-                                 (long long)r_off, (long long)b_off);
-       }
-#endif
-       /* r->next->prev = r->prev */
-       return ntdb_write_off(ntdb, off, r->magic_and_prev);
-}
-
-/* Enqueue in this free bucket: sets coalesce if we've added 128
- * entries to it. */
-static enum NTDB_ERROR enqueue_in_free(struct ntdb_context *ntdb,
-                                     ntdb_off_t b_off,
-                                     ntdb_off_t off,
-                                     ntdb_len_t len,
-                                     bool *coalesce)
-{
-       struct ntdb_free_record new;
-       enum NTDB_ERROR ecode;
-       ntdb_off_t prev, head;
-       uint64_t magic = (NTDB_FREE_MAGIC << (64 - NTDB_OFF_UPPER_STEAL));
-
-       head = ntdb_read_off(ntdb, b_off);
-       if (NTDB_OFF_IS_ERR(head))
-               return NTDB_OFF_TO_ERR(head);
-
-       /* Set ftable_and_len first; next and magic_and_prev are
-        * filled in below. */
-       new.ftable_and_len = ((uint64_t)ntdb->ftable
-                             << (64 - NTDB_OFF_UPPER_STEAL))
-               | len;
-
-       /* new->next = head. */
-       new.next = (head & NTDB_OFF_MASK);
-
-       /* First element?  Prev points to ourselves. */
-       if (!new.next) {
-               new.magic_and_prev = (magic | off);
-       } else {
-               /* new->prev = next->prev */
-               prev = ntdb_read_off(ntdb,
-                                   new.next + offsetof(struct ntdb_free_record,
-                                                       magic_and_prev));
-               new.magic_and_prev = prev;
-               if (frec_magic(&new) != NTDB_FREE_MAGIC) {
-                       return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                         "enqueue_in_free: %llu bad head"
-                                         " prev %llu",
-                                         (long long)new.next,
-                                         (long long)prev);
-               }
-               /* next->prev = new. */
-               ecode = ntdb_write_off(ntdb, new.next
-                                     + offsetof(struct ntdb_free_record,
-                                                magic_and_prev),
-                                     off | magic);
-               if (ecode != NTDB_SUCCESS) {
-                       return ecode;
-               }
-
-#ifdef CCAN_NTDB_DEBUG
-               prev = ntdb_read_off(ntdb, frec_prev(&new)
-                                   + offsetof(struct ntdb_free_record, next));
-               if (prev != 0) {
-                       return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                         "enqueue_in_free:"
-                                         " %llu bad tail next ptr %llu",
-                                         (long long)frec_prev(&new)
-                                         + offsetof(struct ntdb_free_record,
-                                                    next),
-                                         (long long)prev);
-               }
-#endif
-       }
-
-       /* Update enqueue count, but don't set high bit: see NTDB_OFF_IS_ERR */
-       if (*coalesce)
-               head += (1ULL << (64 - NTDB_OFF_UPPER_STEAL));
-       head &= ~(NTDB_OFF_MASK | (1ULL << 63));
-       head |= off;
-
-       ecode = ntdb_write_off(ntdb, b_off, head);
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-
-       /* It's time to coalesce if counter wrapped. */
-       if (*coalesce)
-               *coalesce = ((head & ~NTDB_OFF_MASK) == 0);
-
-       return ntdb_write_convert(ntdb, off, &new, sizeof(new));
-}
-
-static ntdb_off_t ftable_offset(struct ntdb_context *ntdb, unsigned int ftable)
-{
-       ntdb_off_t off;
-       unsigned int i;
-
-       if (likely(ntdb->ftable == ftable))
-               return ntdb->ftable_off;
-
-       off = first_ftable(ntdb);
-       for (i = 0; i < ftable; i++) {
-               if (NTDB_OFF_IS_ERR(off)) {
-                       break;
-               }
-               off = next_ftable(ntdb, off);
-       }
-       return off;
-}
-
-/* Note: we unlock the current bucket if fail (-ve), or coalesce (+ve) and
- * need to blatt the *protect record (which is set to an error). */
-static ntdb_len_t coalesce(struct ntdb_context *ntdb,
-                         ntdb_off_t off, ntdb_off_t b_off,
-                         ntdb_len_t data_len,
-                         ntdb_off_t *protect)
-{
-       ntdb_off_t end;
-       struct ntdb_free_record rec;
-       enum NTDB_ERROR ecode;
-
-       ntdb->stats.alloc_coalesce_tried++;
-       end = off + sizeof(struct ntdb_used_record) + data_len;
-
-       while (end < ntdb->file->map_size) {
-               const struct ntdb_free_record *r;
-               ntdb_off_t nb_off;
-               unsigned ftable, bucket;
-
-               r = ntdb_access_read(ntdb, end, sizeof(*r), true);
-               if (NTDB_PTR_IS_ERR(r)) {
-                       ecode = NTDB_PTR_ERR(r);
-                       goto err;
-               }
-
-               if (frec_magic(r) != NTDB_FREE_MAGIC
-                   || frec_ftable(r) == NTDB_FTABLE_NONE) {
-                       ntdb_access_release(ntdb, r);
-                       break;
-               }
-
-               ftable = frec_ftable(r);
-               bucket = size_to_bucket(frec_len(r));
-               nb_off = ftable_offset(ntdb, ftable);
-               if (NTDB_OFF_IS_ERR(nb_off)) {
-                       ntdb_access_release(ntdb, r);
-                       ecode = NTDB_OFF_TO_ERR(nb_off);
-                       goto err;
-               }
-               nb_off = bucket_off(nb_off, bucket);
-               ntdb_access_release(ntdb, r);
-
-               /* We may be violating lock order here, so best effort. */
-               if (ntdb_lock_free_bucket(ntdb, nb_off, NTDB_LOCK_NOWAIT)
-                   != NTDB_SUCCESS) {
-                       ntdb->stats.alloc_coalesce_lockfail++;
-                       break;
-               }
-
-               /* Now we have lock, re-check. */
-               ecode = ntdb_read_convert(ntdb, end, &rec, sizeof(rec));
-               if (ecode != NTDB_SUCCESS) {
-                       ntdb_unlock_free_bucket(ntdb, nb_off);
-                       goto err;
-               }
-
-               if (unlikely(frec_magic(&rec) != NTDB_FREE_MAGIC)) {
-                       ntdb->stats.alloc_coalesce_race++;
-                       ntdb_unlock_free_bucket(ntdb, nb_off);
-                       break;
-               }
-
-               if (unlikely(frec_ftable(&rec) != ftable)
-                   || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) {
-                       ntdb->stats.alloc_coalesce_race++;
-                       ntdb_unlock_free_bucket(ntdb, nb_off);
-                       break;
-               }
-
-               /* Did we just mess up a record you were hoping to use? */
-               if (end == *protect) {
-                       ntdb->stats.alloc_coalesce_iterate_clash++;
-                       *protect = NTDB_ERR_TO_OFF(NTDB_ERR_NOEXIST);
-               }
-
-               ecode = remove_from_list(ntdb, nb_off, end, &rec);
-               check_list(ntdb, nb_off);
-               if (ecode != NTDB_SUCCESS) {
-                       ntdb_unlock_free_bucket(ntdb, nb_off);
-                       goto err;
-               }
-
-               end += sizeof(struct ntdb_used_record) + frec_len(&rec);
-               ntdb_unlock_free_bucket(ntdb, nb_off);
-               ntdb->stats.alloc_coalesce_num_merged++;
-       }
-
-       /* Didn't find any adjacent free? */
-       if (end == off + sizeof(struct ntdb_used_record) + data_len)
-               return 0;
-
-       /* Before we expand, check this isn't one you wanted protected? */
-       if (off == *protect) {
-               *protect = NTDB_ERR_TO_OFF(NTDB_ERR_EXISTS);
-               ntdb->stats.alloc_coalesce_iterate_clash++;
-       }
-
-       /* OK, expand initial record */
-       ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
-       if (ecode != NTDB_SUCCESS) {
-               goto err;
-       }
-
-       if (frec_len(&rec) != data_len) {
-               ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                  "coalesce: expected data len %zu not %zu",
-                                  (size_t)data_len, (size_t)frec_len(&rec));
-               goto err;
-       }
-
-       ecode = remove_from_list(ntdb, b_off, off, &rec);
-       check_list(ntdb, b_off);
-       if (ecode != NTDB_SUCCESS) {
-               goto err;
-       }
-
-       /* Try locking violation first.  We don't allow coalesce recursion! */
-       ecode = add_free_record(ntdb, off, end - off, NTDB_LOCK_NOWAIT, false);
-       if (ecode != NTDB_SUCCESS) {
-               /* Need to drop lock.  Can't rely on anything stable. */
-               ntdb->stats.alloc_coalesce_lockfail++;
-               *protect = NTDB_ERR_TO_OFF(NTDB_ERR_CORRUPT);
-
-               /* We have to drop this to avoid deadlocks, so make sure record
-                * doesn't get coalesced by someone else! */
-               rec.ftable_and_len = (NTDB_FTABLE_NONE
-                                     << (64 - NTDB_OFF_UPPER_STEAL))
-                       | (end - off - sizeof(struct ntdb_used_record));
-               ecode = ntdb_write_off(ntdb,
-                                     off + offsetof(struct ntdb_free_record,
-                                                    ftable_and_len),
-                                     rec.ftable_and_len);
-               if (ecode != NTDB_SUCCESS) {
-                       goto err;
-               }
-
-               ntdb_unlock_free_bucket(ntdb, b_off);
-
-               ecode = add_free_record(ntdb, off, end - off, NTDB_LOCK_WAIT,
-                                       false);
-               if (ecode != NTDB_SUCCESS) {
-                       return NTDB_ERR_TO_OFF(ecode);
-               }
-       } else if (NTDB_OFF_IS_ERR(*protect)) {
-               /* For simplicity, we always drop lock if they can't continue */
-               ntdb_unlock_free_bucket(ntdb, b_off);
-       }
-       ntdb->stats.alloc_coalesce_succeeded++;
-
-       /* Return usable length. */
-       return end - off - sizeof(struct ntdb_used_record);
-
-err:
-       /* To unify error paths, we *always* unlock bucket on error. */
-       ntdb_unlock_free_bucket(ntdb, b_off);
-       return NTDB_ERR_TO_OFF(ecode);
-}
-
-/* List is locked: we unlock it. */
-static enum NTDB_ERROR coalesce_list(struct ntdb_context *ntdb,
-                                   ntdb_off_t ftable_off,
-                                   ntdb_off_t b_off,
-                                   unsigned int limit)
-{
-       enum NTDB_ERROR ecode;
-       ntdb_off_t off;
-
-       off = ntdb_read_off(ntdb, b_off);
-       if (NTDB_OFF_IS_ERR(off)) {
-               ecode = NTDB_OFF_TO_ERR(off);
-               goto unlock_err;
-       }
-       /* A little bit of paranoia: counter should be 0. */
-       off &= NTDB_OFF_MASK;
-
-       while (off && limit--) {
-               struct ntdb_free_record rec;
-               ntdb_len_t coal;
-               ntdb_off_t next;
-
-               ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
-               if (ecode != NTDB_SUCCESS)
-                       goto unlock_err;
-
-               next = rec.next;
-               coal = coalesce(ntdb, off, b_off, frec_len(&rec), &next);
-               if (NTDB_OFF_IS_ERR(coal)) {
-                       /* This has already unlocked on error. */
-                       return NTDB_OFF_TO_ERR(coal);
-               }
-               if (NTDB_OFF_IS_ERR(next)) {
-                       /* Coalescing had to unlock, so stop. */
-                       return NTDB_SUCCESS;
-               }
-               /* Keep going if we're doing well... */
-               limit += size_to_bucket(coal / 16 + NTDB_MIN_DATA_LEN);
-               off = next;
-       }
-
-       /* Now, move those elements to the tail of the list so we get something
-        * else next time. */
-       if (off) {
-               struct ntdb_free_record oldhrec, newhrec, oldtrec, newtrec;
-               ntdb_off_t oldhoff, oldtoff, newtoff;
-
-               /* The record we were up to is the new head. */
-               ecode = ntdb_read_convert(ntdb, off, &newhrec, sizeof(newhrec));
-               if (ecode != NTDB_SUCCESS)
-                       goto unlock_err;
-
-               /* Get the new tail. */
-               newtoff = frec_prev(&newhrec);
-               ecode = ntdb_read_convert(ntdb, newtoff, &newtrec,
-                                        sizeof(newtrec));
-               if (ecode != NTDB_SUCCESS)
-                       goto unlock_err;
-
-               /* Get the old head. */
-               oldhoff = ntdb_read_off(ntdb, b_off);
-               if (NTDB_OFF_IS_ERR(oldhoff)) {
-                       ecode = NTDB_OFF_TO_ERR(oldhoff);
-                       goto unlock_err;
-               }
-
-               /* This could happen if they all coalesced away. */
-               if (oldhoff == off)
-                       goto out;
-
-               ecode = ntdb_read_convert(ntdb, oldhoff, &oldhrec,
-                                        sizeof(oldhrec));
-               if (ecode != NTDB_SUCCESS)
-                       goto unlock_err;
-
-               /* Get the old tail. */
-               oldtoff = frec_prev(&oldhrec);
-               ecode = ntdb_read_convert(ntdb, oldtoff, &oldtrec,
-                                        sizeof(oldtrec));
-               if (ecode != NTDB_SUCCESS)
-                       goto unlock_err;
-
-               /* Old tail's next points to old head. */
-               oldtrec.next = oldhoff;
-
-               /* Old head's prev points to old tail. */
-               oldhrec.magic_and_prev
-                       = (NTDB_FREE_MAGIC << (64 - NTDB_OFF_UPPER_STEAL))
-                       | oldtoff;
-
-               /* New tail's next is 0. */
-               newtrec.next = 0;
-
-               /* Write out the modified versions. */
-               ecode = ntdb_write_convert(ntdb, oldtoff, &oldtrec,
-                                         sizeof(oldtrec));
-               if (ecode != NTDB_SUCCESS)
-                       goto unlock_err;
-
-               ecode = ntdb_write_convert(ntdb, oldhoff, &oldhrec,
-                                         sizeof(oldhrec));
-               if (ecode != NTDB_SUCCESS)
-                       goto unlock_err;
-
-               ecode = ntdb_write_convert(ntdb, newtoff, &newtrec,
-                                         sizeof(newtrec));
-               if (ecode != NTDB_SUCCESS)
-                       goto unlock_err;
-
-               /* And finally link in new head. */
-               ecode = ntdb_write_off(ntdb, b_off, off);
-               if (ecode != NTDB_SUCCESS)
-                       goto unlock_err;
-       }
-out:
-       ntdb_unlock_free_bucket(ntdb, b_off);
-       return NTDB_SUCCESS;
-
-unlock_err:
-       ntdb_unlock_free_bucket(ntdb, b_off);
-       return ecode;
-}
-
-/* List must not be locked if coalesce_ok is set. */
-enum NTDB_ERROR add_free_record(struct ntdb_context *ntdb,
-                              ntdb_off_t off, ntdb_len_t len_with_header,
-                              enum ntdb_lock_flags waitflag,
-                              bool coalesce_ok)
-{
-       ntdb_off_t b_off;
-       ntdb_len_t len;
-       enum NTDB_ERROR ecode;
-
-       assert(len_with_header >= sizeof(struct ntdb_free_record));
-
-       len = len_with_header - sizeof(struct ntdb_used_record);
-
-       b_off = bucket_off(ntdb->ftable_off, size_to_bucket(len));
-       ecode = ntdb_lock_free_bucket(ntdb, b_off, waitflag);
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-
-       ecode = enqueue_in_free(ntdb, b_off, off, len, &coalesce_ok);
-       check_list(ntdb, b_off);
-
-       /* Coalescing unlocks free list. */
-       if (!ecode && coalesce_ok)
-               ecode = coalesce_list(ntdb, ntdb->ftable_off, b_off, 2);
-       else
-               ntdb_unlock_free_bucket(ntdb, b_off);
-       return ecode;
-}
-
-static size_t adjust_size(size_t keylen, size_t datalen)
-{
-       size_t size = keylen + datalen;
-
-       if (size < NTDB_MIN_DATA_LEN)
-               size = NTDB_MIN_DATA_LEN;
-
-       /* Round to next uint64_t boundary. */
-       return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
-}
-
-/* If we have enough left over to be useful, split that off. */
-static size_t record_leftover(size_t keylen, size_t datalen,
-                             bool want_extra, size_t total_len)
-{
-       ssize_t leftover;
-
-       if (want_extra)
-               datalen += datalen / 2;
-       leftover = total_len - adjust_size(keylen, datalen);
-
-       if (leftover < (ssize_t)sizeof(struct ntdb_free_record))
-               return 0;
-
-       return leftover;
-}
-
-/* We need size bytes to put our key and data in. */
-static ntdb_off_t lock_and_alloc(struct ntdb_context *ntdb,
-                               ntdb_off_t ftable_off,
-                               ntdb_off_t bucket,
-                               size_t keylen, size_t datalen,
-                               bool want_extra,
-                               unsigned magic)
-{
-       ntdb_off_t off, b_off, best_off;
-       struct ntdb_free_record best = { 0 };
-       double multiplier;
-       size_t size = adjust_size(keylen, datalen);
-       enum NTDB_ERROR ecode;
-
-       ntdb->stats.allocs++;
-       b_off = bucket_off(ftable_off, bucket);
-
-       /* FIXME: Try non-blocking wait first, to measure contention. */
-       /* Lock this bucket. */
-       ecode = ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT);
-       if (ecode != NTDB_SUCCESS) {
-               return NTDB_ERR_TO_OFF(ecode);
-       }
-
-       best.ftable_and_len = -1ULL;
-       best_off = 0;
-
-       /* Get slack if we're after extra. */
-       if (want_extra)
-               multiplier = 1.5;
-       else
-               multiplier = 1.0;
-
-       /* Walk the list to see if any are large enough, getting less fussy
-        * as we go. */
-       off = ntdb_read_off(ntdb, b_off);
-       if (NTDB_OFF_IS_ERR(off)) {
-               ecode = NTDB_OFF_TO_ERR(off);
-               goto unlock_err;
-       }
-       off &= NTDB_OFF_MASK;
-
-       while (off) {
-               const struct ntdb_free_record *r;
-               ntdb_off_t next;
-
-               r = ntdb_access_read(ntdb, off, sizeof(*r), true);
-               if (NTDB_PTR_IS_ERR(r)) {
-                       ecode = NTDB_PTR_ERR(r);
-                       goto unlock_err;
-               }
-
-               if (frec_magic(r) != NTDB_FREE_MAGIC) {
-                       ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
-                                          "lock_and_alloc:"
-                                          " %llu non-free 0x%llx",
-                                          (long long)off,
-                                          (long long)r->magic_and_prev);
-                       ntdb_access_release(ntdb, r);
-                       goto unlock_err;
-               }
-
-               if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
-                       best_off = off;
-                       best = *r;
-               }
-
-               if (frec_len(&best) <= size * multiplier && best_off) {
-                       ntdb_access_release(ntdb, r);
-                       break;
-               }
-
-               multiplier *= 1.01;
-
-               next = r->next;
-               ntdb_access_release(ntdb, r);
-               off = next;
-       }
-
-       /* If we found anything at all, use it. */
-       if (best_off) {
-               struct ntdb_used_record rec;
-               size_t leftover;
-
-               /* We're happy with this size: take it. */
-               ecode = remove_from_list(ntdb, b_off, best_off, &best);
-               check_list(ntdb, b_off);
-               if (ecode != NTDB_SUCCESS) {
-                       goto unlock_err;
-               }
-
-               leftover = record_leftover(keylen, datalen, want_extra,
-                                          frec_len(&best));
-
-               assert(keylen + datalen + leftover <= frec_len(&best));
-               /* We need to mark non-free before we drop lock, otherwise
-                * coalesce() could try to merge it! */
-               ecode = set_header(ntdb, &rec, magic, keylen, datalen,
-                                  frec_len(&best) - leftover);
-               if (ecode != NTDB_SUCCESS) {
-                       goto unlock_err;
-               }
-
-               ecode = ntdb_write_convert(ntdb, best_off, &rec, sizeof(rec));
-               if (ecode != NTDB_SUCCESS) {
-                       goto unlock_err;
-               }
-
-               /* For futureproofing, we put a 0 in any unused space. */
-               if (rec_extra_padding(&rec)) {
-                       ecode = ntdb->io->twrite(ntdb, best_off + sizeof(rec)
-                                               + keylen + datalen, "", 1);
-                       if (ecode != NTDB_SUCCESS) {
-                               goto unlock_err;
-                       }
-               }
-
-               /* Bucket of leftover will be <= current bucket, so nested
-                * locking is allowed. */
-               if (leftover) {
-                       ntdb->stats.alloc_leftover++;
-                       ecode = add_free_record(ntdb,
-                                               best_off + sizeof(rec)
-                                               + frec_len(&best) - leftover,
-                                               leftover, NTDB_LOCK_WAIT, false);
-                       if (ecode != NTDB_SUCCESS) {
-                               best_off = NTDB_ERR_TO_OFF(ecode);
-                       }
-               }
-               ntdb_unlock_free_bucket(ntdb, b_off);
-
-               return best_off;
-       }
-
-       ntdb_unlock_free_bucket(ntdb, b_off);
-       return 0;
-
-unlock_err:
-       ntdb_unlock_free_bucket(ntdb, b_off);
-       return NTDB_ERR_TO_OFF(ecode);
-}
-
-/* Get a free block from current free list, or 0 if none, -ve on error. */
-static ntdb_off_t get_free(struct ntdb_context *ntdb,
-                         size_t keylen, size_t datalen, bool want_extra,
-                         unsigned magic)
-{
-       ntdb_off_t off, ftable_off;
-       ntdb_off_t start_b, b, ftable;
-       bool wrapped = false;
-
-       /* If they are growing, add 50% to get to higher bucket. */
-       if (want_extra)
-               start_b = size_to_bucket(adjust_size(keylen,
-                                                    datalen + datalen / 2));
-       else
-               start_b = size_to_bucket(adjust_size(keylen, datalen));
-
-       ftable_off = ntdb->ftable_off;
-       ftable = ntdb->ftable;
-       while (!wrapped || ftable_off != ntdb->ftable_off) {
-               /* Start at exact size bucket, and search up... */
-               for (b = find_free_head(ntdb, ftable_off, start_b);
-                    b < NTDB_FREE_BUCKETS;
-                    b = find_free_head(ntdb, ftable_off, b + 1)) {
-                       /* Try getting one from list. */
-                       off = lock_and_alloc(ntdb, ftable_off,
-                                            b, keylen, datalen, want_extra,
-                                            magic);
-                       if (NTDB_OFF_IS_ERR(off))
-                               return off;
-                       if (off != 0) {
-                               if (b == start_b)
-                                       ntdb->stats.alloc_bucket_exact++;
-                               if (b == NTDB_FREE_BUCKETS - 1)
-                                       ntdb->stats.alloc_bucket_max++;
-                               /* Worked?  Stay using this list. */
-                               ntdb->ftable_off = ftable_off;
-                               ntdb->ftable = ftable;
-                               return off;
-                       }
-                       /* Didn't work.  Try next bucket. */
-               }
-
-               if (NTDB_OFF_IS_ERR(b)) {
-                       return b;
-               }
-
-               /* Hmm, try next table. */
-               ftable_off = next_ftable(ntdb, ftable_off);
-               if (NTDB_OFF_IS_ERR(ftable_off)) {
-                       return ftable_off;
-               }
-               ftable++;
-
-               if (ftable_off == 0) {
-                       wrapped = true;
-                       ftable_off = first_ftable(ntdb);
-                       if (NTDB_OFF_IS_ERR(ftable_off)) {
-                               return ftable_off;
-                       }
-                       ftable = 0;
-               }
-       }
-
-       return 0;
-}
-
-enum NTDB_ERROR set_header(struct ntdb_context *ntdb,
-                         struct ntdb_used_record *rec,
-                         unsigned magic, uint64_t keylen, uint64_t datalen,
-                         uint64_t actuallen)
-{
-       uint64_t keybits = (fls64(keylen) + 1) / 2;
-
-       rec->magic_and_meta = ((actuallen - (keylen + datalen)) << 11)
-               | (keybits << 43)
-               | ((uint64_t)magic << 48);
-       rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
-
-       /* Encoding can fail on big values. */
-       if (rec_key_length(rec) != keylen
-           || rec_data_length(rec) != datalen
-           || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
-               return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
-                                 "Could not encode k=%llu,d=%llu,a=%llu",
-                                 (long long)keylen, (long long)datalen,
-                                 (long long)actuallen);
-       }
-       return NTDB_SUCCESS;
-}
-
-/* You need 'size', this tells you how much you should expand by. */
-ntdb_off_t ntdb_expand_adjust(ntdb_off_t map_size, ntdb_off_t size)
-{
-       ntdb_off_t new_size, top_size;
-
-       /* limit size in order to avoid using up huge amounts of memory for
-        * in memory tdbs if an oddball huge record creeps in */
-       if (size > 100 * 1024) {
-               top_size = map_size + size * 2;
-       } else {
-               top_size = map_size + size * 100;
-       }
-
-       /* Always make room for at least top_size, and at least 25%
-          more space if the DB is smaller than 100MiB; otherwise grow
-          it by 10% only. */
-       if (map_size > 100 * 1024 * 1024) {
-               new_size = map_size * 1.10;
-       } else {
-               new_size = map_size * 1.25;
-       }
-
-       if (new_size < top_size)
-               new_size = top_size;
-
-       /* We always make the file a multiple of transaction page
-        * size.  This guarantees that the transaction recovery area
-        * is always aligned, otherwise the transaction code can overwrite
-        * itself. */
-       new_size = (new_size + NTDB_PGSIZE-1) & ~(NTDB_PGSIZE-1);
-       return new_size - map_size;
-}
-
-/* Expand the database. */
-static enum NTDB_ERROR ntdb_expand(struct ntdb_context *ntdb, ntdb_len_t size)
-{
-       uint64_t old_size;
-       ntdb_len_t wanted;
-       enum NTDB_ERROR ecode;
-
-       /* Need to hold a hash lock to expand DB: transactions rely on it. */
-       if (!(ntdb->flags & NTDB_NOLOCK)
-           && !ntdb->file->allrecord_lock.count && !ntdb_has_hash_locks(ntdb)) {
-               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
-                                 "ntdb_expand: must hold lock during expand");
-       }
-
-       /* Only one person can expand file at a time. */
-       ecode = ntdb_lock_expand(ntdb, F_WRLCK);
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-
-       /* Someone else may have expanded the file, so retry. */
-       old_size = ntdb->file->map_size;
-       ntdb_oob(ntdb, ntdb->file->map_size, 1, true);
-       if (ntdb->file->map_size != old_size) {
-               ntdb_unlock_expand(ntdb, F_WRLCK);
-               return NTDB_SUCCESS;
-       }
-
-       /* We need room for the record header too. */
-       size = adjust_size(0, sizeof(struct ntdb_used_record) + size);
-       /* Overallocate. */
-       wanted = ntdb_expand_adjust(old_size, size);
-
-       ecode = ntdb->io->expand_file(ntdb, wanted);
-       if (ecode != NTDB_SUCCESS) {
-               ntdb_unlock_expand(ntdb, F_WRLCK);
-               return ecode;
-       }
-
-       /* We need to drop this lock before adding free record. */
-       ntdb_unlock_expand(ntdb, F_WRLCK);
-
-       ntdb->stats.expands++;
-       return add_free_record(ntdb, old_size, wanted, NTDB_LOCK_WAIT, true);
-}
-
-/* This won't fail: it will expand the database if it has to. */
-ntdb_off_t alloc(struct ntdb_context *ntdb, size_t keylen, size_t datalen,
-                unsigned magic, bool growing)
-{
-       ntdb_off_t off;
-
-       for (;;) {
-               enum NTDB_ERROR ecode;
-               off = get_free(ntdb, keylen, datalen, growing, magic);
-               if (likely(off != 0))
-                       break;
-
-               ecode = ntdb_expand(ntdb, adjust_size(keylen, datalen));
-               if (ecode != NTDB_SUCCESS) {
-                       return NTDB_ERR_TO_OFF(ecode);
-               }
-       }
-
-       return off;
-}
diff --git a/ccan/ntdb/hash.c b/ccan/ntdb/hash.c
deleted file mode 100644 (file)
index b223668..0000000
+++ /dev/null
@@ -1,624 +0,0 @@
- /*
-   Trivial Database 2: hash handling
-   Copyright (C) Rusty Russell 2010
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <ccan/hash/hash.h>
-
-/* Default hash function. */
-uint32_t ntdb_jenkins_hash(const void *key, size_t length, uint32_t seed,
-                         void *unused)
-{
-       return hash_stable((const unsigned char *)key, length, seed);
-}
-
-uint32_t ntdb_hash(struct ntdb_context *ntdb, const void *ptr, size_t len)
-{
-       return ntdb->hash_fn(ptr, len, ntdb->hash_seed, ntdb->hash_data);
-}
-
-static ntdb_bool_err key_matches(struct ntdb_context *ntdb,
-                                const struct ntdb_used_record *rec,
-                                ntdb_off_t off,
-                                const NTDB_DATA *key,
-                                const char **rptr)
-{
-       ntdb_bool_err ret = false;
-       const char *rkey;
-
-       if (rec_key_length(rec) != key->dsize) {
-               ntdb->stats.compare_wrong_keylen++;
-               return ret;
-       }
-
-       rkey = ntdb_access_read(ntdb, off + sizeof(*rec),
-                               key->dsize + rec_data_length(rec), false);
-       if (NTDB_PTR_IS_ERR(rkey)) {
-               return (ntdb_bool_err)NTDB_PTR_ERR(rkey);
-       }
-       if (memcmp(rkey, key->dptr, key->dsize) == 0) {
-               if (rptr) {
-                       *rptr = rkey;
-               } else {
-                       ntdb_access_release(ntdb, rkey);
-               }
-               return true;
-       }
-       ntdb->stats.compare_wrong_keycmp++;
-       ntdb_access_release(ntdb, rkey);
-       return ret;
-}
-
-/* Does entry match? */
-static ntdb_bool_err match(struct ntdb_context *ntdb,
-                          uint32_t hash,
-                          const NTDB_DATA *key,
-                          ntdb_off_t val,
-                          struct ntdb_used_record *rec,
-                          const char **rptr)
-{
-       ntdb_off_t off;
-       enum NTDB_ERROR ecode;
-
-       ntdb->stats.compares++;
-
-       /* Top bits of offset == next bits of hash. */
-       if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL)
-           != bits_from(val, 64-NTDB_OFF_UPPER_STEAL, NTDB_OFF_UPPER_STEAL)) {
-               ntdb->stats.compare_wrong_offsetbits++;
-               return false;
-       }
-
-       off = val & NTDB_OFF_MASK;
-       ecode = ntdb_read_convert(ntdb, off, rec, sizeof(*rec));
-       if (ecode != NTDB_SUCCESS) {
-               return (ntdb_bool_err)ecode;
-       }
-
-       return key_matches(ntdb, rec, off, key, rptr);
-}
-
-static bool is_chain(ntdb_off_t val)
-{
-       return val & (1ULL << NTDB_OFF_CHAIN_BIT);
-}
-
-static ntdb_off_t hbucket_off(ntdb_off_t base, ntdb_len_t idx)
-{
-       return base + sizeof(struct ntdb_used_record)
-               + idx * sizeof(ntdb_off_t);
-}
-
-/* This is the core routine which searches the hashtable for an entry.
- * On error, no locks are held and -ve is returned.
- * Otherwise, hinfo is filled in.
- * If not found, the return value is 0.
- * If found, the return value is the offset, and *rec is the record. */
-ntdb_off_t find_and_lock(struct ntdb_context *ntdb,
-                        NTDB_DATA key,
-                        int ltype,
-                        struct hash_info *h,
-                        struct ntdb_used_record *rec,
-                        const char **rptr)
-{
-       ntdb_off_t off, val;
-       const ntdb_off_t *arr = NULL;
-       ntdb_len_t i;
-       bool found_empty;
-       enum NTDB_ERROR ecode;
-       struct ntdb_used_record chdr;
-       ntdb_bool_err berr;
-
-       h->h = ntdb_hash(ntdb, key.dptr, key.dsize);
-
-       h->table = NTDB_HASH_OFFSET;
-       h->table_size = 1 << ntdb->hash_bits;
-       h->bucket = bits_from(h->h, 0, ntdb->hash_bits);
-       h->old_val = 0;
-
-       ecode = ntdb_lock_hash(ntdb, h->bucket, ltype);
-       if (ecode != NTDB_SUCCESS) {
-               return NTDB_ERR_TO_OFF(ecode);
-       }
-
-       off = hbucket_off(h->table, h->bucket);
-       val = ntdb_read_off(ntdb, off);
-       if (NTDB_OFF_IS_ERR(val)) {
-               ecode = NTDB_OFF_TO_ERR(val);
-               goto fail;
-       }
-
-       /* Directly in hash table? */
-       if (!likely(is_chain(val))) {
-               if (val) {
-                       berr = match(ntdb, h->h, &key, val, rec, rptr);
-                       if (berr < 0) {
-                               ecode = NTDB_OFF_TO_ERR(berr);
-                               goto fail;
-                       }
-                       if (berr) {
-                               return val & NTDB_OFF_MASK;
-                       }
-                       /* If you want to insert here, make a chain. */
-                       h->old_val = val;
-               }
-               return 0;
-       }
-
-       /* Nope?  Iterate through chain. */
-       h->table = val & NTDB_OFF_MASK;
-
-       ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
-       if (ecode != NTDB_SUCCESS) {
-               goto fail;
-       }
-
-       if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
-               ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                   NTDB_LOG_ERROR,
-                                   "find_and_lock:"
-                                   " corrupt record %#x at %llu",
-                                   rec_magic(&chdr), (long long)h->table);
-               goto fail;
-       }
-
-       h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
-
-       arr = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
-                              rec_data_length(&chdr), true);
-       if (NTDB_PTR_IS_ERR(arr)) {
-               ecode = NTDB_PTR_ERR(arr);
-               goto fail;
-       }
-
-       found_empty = false;
-       for (i = 0; i < h->table_size; i++) {
-               if (arr[i] == 0) {
-                       if (!found_empty) {
-                               h->bucket = i;
-                               found_empty = true;
-                       }
-               } else {
-                       berr = match(ntdb, h->h, &key, arr[i], rec, rptr);
-                       if (berr < 0) {
-                               ecode = NTDB_OFF_TO_ERR(berr);
-                               ntdb_access_release(ntdb, arr);
-                               goto fail;
-                       }
-                       if (berr) {
-                               /* We found it! */
-                               h->bucket = i;
-                               off = arr[i] & NTDB_OFF_MASK;
-                               ntdb_access_release(ntdb, arr);
-                               return off;
-                       }
-               }
-       }
-       if (!found_empty) {
-               /* Set to any non-zero value */
-               h->old_val = 1;
-               h->bucket = i;
-       }
-
-       ntdb_access_release(ntdb, arr);
-       return 0;
-
-fail:
-       ntdb_unlock_hash(ntdb, h->bucket, ltype);
-       return NTDB_ERR_TO_OFF(ecode);
-}
-
-static ntdb_off_t encode_offset(const struct ntdb_context *ntdb,
-                               ntdb_off_t new_off, uint32_t hash)
-{
-       ntdb_off_t extra;
-
-       assert((new_off & (1ULL << NTDB_OFF_CHAIN_BIT)) == 0);
-       assert((new_off >> (64 - NTDB_OFF_UPPER_STEAL)) == 0);
-       /* We pack extra hash bits into the upper bits of the offset. */
-       extra = bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL);
-       extra <<= (64 - NTDB_OFF_UPPER_STEAL);
-
-       return new_off | extra;
-}
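
/* Illustrative, self-contained sketch of the bit-packing scheme used by
 * match() and encode_offset() above: the otherwise-unused top bits of a
 * bucket value carry the hash bits just above the bucket index, so most
 * mismatches are rejected without reading the record.  The constants and
 * the bits_from()-style helper are stand-ins chosen for this example,
 * not the library's private definitions. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EX_UPPER_STEAL 4	/* example: top offset bits reused for hash */
#define EX_HASH_BITS   10	/* example: bottom hash bits pick the bucket */

static uint64_t ex_bits_from(uint64_t val, unsigned start, unsigned num)
{
	return (val >> start) & ((UINT64_C(1) << num) - 1);
}

int main(void)
{
	uint64_t off = 0x12340;		/* record offset; top bits clear */
	uint32_t hash = 0xdeadbeef;	/* full hash of the key */
	uint64_t extra, val;

	/* encode_offset(): stash the hash bits above the bucket index in
	 * the top bits of the stored offset. */
	extra = ex_bits_from(hash, EX_HASH_BITS, EX_UPPER_STEAL);
	val = off | (extra << (64 - EX_UPPER_STEAL));

	/* match(): cheap rejection test before touching the record. */
	assert(ex_bits_from(hash, EX_HASH_BITS, EX_UPPER_STEAL)
	       == ex_bits_from(val, 64 - EX_UPPER_STEAL, EX_UPPER_STEAL));

	printf("stored value %#llx carries extra hash bits %#llx\n",
	       (unsigned long long)val, (unsigned long long)extra);
	return 0;
}
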
-
-/* Simply overwrite the hash entry we found before. */
-enum NTDB_ERROR replace_in_hash(struct ntdb_context *ntdb,
-                               const struct hash_info *h,
-                               ntdb_off_t new_off)
-{
-       return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket),
-                             encode_offset(ntdb, new_off, h->h));
-}
-
-enum NTDB_ERROR delete_from_hash(struct ntdb_context *ntdb,
-                                const struct hash_info *h)
-{
-       return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket), 0);
-}
-
-
-enum NTDB_ERROR add_to_hash(struct ntdb_context *ntdb,
-                           const struct hash_info *h,
-                           ntdb_off_t new_off)
-{
-       enum NTDB_ERROR ecode;
-       ntdb_off_t chain;
-       struct ntdb_used_record chdr;
-       const ntdb_off_t *old;
-       ntdb_off_t *new;
-
-       /* We hit an empty bucket during search?  That's where it goes. */
-       if (!h->old_val) {
-               return replace_in_hash(ntdb, h, new_off);
-       }
-
-       /* Full at top-level?  Create a 2-element chain. */
-       if (h->table == NTDB_HASH_OFFSET) {
-               ntdb_off_t pair[2];
-
-               /* One element is old value, the other is the new value. */
-               pair[0] = h->old_val;
-               pair[1] = encode_offset(ntdb, new_off, h->h);
-
-               chain = alloc(ntdb, 0, sizeof(pair), NTDB_CHAIN_MAGIC, true);
-               if (NTDB_OFF_IS_ERR(chain)) {
-                       return NTDB_OFF_TO_ERR(chain);
-               }
-               ecode = ntdb_write_convert(ntdb,
-                                          chain
-                                          + sizeof(struct ntdb_used_record),
-                                          pair, sizeof(pair));
-               if (ecode == NTDB_SUCCESS) {
-                       ecode = ntdb_write_off(ntdb,
-                                              hbucket_off(h->table, h->bucket),
-                                              chain
-                                              | (1ULL << NTDB_OFF_CHAIN_BIT));
-               }
-               return ecode;
-       }
-
-       /* Full bucket.  Expand. */
-       ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-
-       if (rec_extra_padding(&chdr) >= sizeof(new_off)) {
-               /* Expand in place. */
-               uint64_t dlen = rec_data_length(&chdr);
-
-               ecode = set_header(ntdb, &chdr, NTDB_CHAIN_MAGIC, 0,
-                                  dlen + sizeof(new_off),
-                                  dlen + rec_extra_padding(&chdr));
-
-               if (ecode != NTDB_SUCCESS) {
-                       return ecode;
-               }
-               /* find_and_lock set up h to point to last bucket. */
-               ecode = replace_in_hash(ntdb, h, new_off);
-               if (ecode != NTDB_SUCCESS) {
-                       return ecode;
-               }
-               ecode = ntdb_write_convert(ntdb, h->table, &chdr, sizeof(chdr));
-               if (ecode != NTDB_SUCCESS) {
-                       return ecode;
-               }
-               /* For futureproofing, we always make the first byte of padding
-                * a zero. */
-               if (rec_extra_padding(&chdr)) {
-                       ecode = ntdb->io->twrite(ntdb, h->table + sizeof(chdr)
-                                                + dlen + sizeof(new_off),
-                                                "", 1);
-               }
-               return ecode;
-       }
-
-       /* We need to reallocate the chain. */
-       chain = alloc(ntdb, 0, (h->table_size + 1) * sizeof(ntdb_off_t),
-                     NTDB_CHAIN_MAGIC, true);
-       if (NTDB_OFF_IS_ERR(chain)) {
-               return NTDB_OFF_TO_ERR(chain);
-       }
-
-       /* Map both and copy across old buckets. */
-       old = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
-                              h->table_size*sizeof(ntdb_off_t), true);
-       if (NTDB_PTR_IS_ERR(old)) {
-               return NTDB_PTR_ERR(old);
-       }
-       new = ntdb_access_write(ntdb, hbucket_off(chain, 0),
-                               (h->table_size + 1)*sizeof(ntdb_off_t), true);
-       if (NTDB_PTR_IS_ERR(new)) {
-               ntdb_access_release(ntdb, old);
-               return NTDB_PTR_ERR(new);
-       }
-
-       memcpy(new, old, h->bucket * sizeof(ntdb_off_t));
-       new[h->bucket] = encode_offset(ntdb, new_off, h->h);
-       ntdb_access_release(ntdb, old);
-
-       ecode = ntdb_access_commit(ntdb, new);
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-
-       /* Free the old chain. */
-       ecode = add_free_record(ntdb, h->table,
-                               sizeof(struct ntdb_used_record)
-                               + rec_data_length(&chdr)
-                               + rec_extra_padding(&chdr),
-                               NTDB_LOCK_WAIT, true);
-
-       /* Replace top-level to point to new chain */
-       return ntdb_write_off(ntdb,
-                             hbucket_off(NTDB_HASH_OFFSET,
-                                         bits_from(h->h, 0, ntdb->hash_bits)),
-                             chain | (1ULL << NTDB_OFF_CHAIN_BIT));
-}
-
-/* Traverse support: returns offset of record, or 0 or -ve error. */
-static ntdb_off_t iterate_chain(struct ntdb_context *ntdb,
-                               ntdb_off_t val,
-                               struct hash_info *h)
-{
-       ntdb_off_t i;
-       enum NTDB_ERROR ecode;
-       struct ntdb_used_record chdr;
-
-       /* First load up chain header. */
-       h->table = val & NTDB_OFF_MASK;
-       ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-
-       if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
-               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                  NTDB_LOG_ERROR,
-                                  "iterate_chain:"
-                                  " corrupt record %#x at %llu",
-                                  rec_magic(&chdr),
-                                  (long long)h->table);
-       }
-
-       /* Chain length is implied by data length. */
-       h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
-
-       i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0), h->bucket,
-                                 h->table_size);
-       if (NTDB_OFF_IS_ERR(i)) {
-               return i;
-       }
-
-       if (i != h->table_size) {
-               /* Return to next bucket. */
-               h->bucket = i + 1;
-               val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
-               if (NTDB_OFF_IS_ERR(val)) {
-                       return val;
-               }
-               return val & NTDB_OFF_MASK;
-       }
-
-       /* Go back up to hash table. */
-       h->table = NTDB_HASH_OFFSET;
-       h->table_size = 1 << ntdb->hash_bits;
-       h->bucket = bits_from(h->h, 0, ntdb->hash_bits) + 1;
-       return 0;
-}
-
-/* Keeps hash locked unless returns 0 or error. */
-static ntdb_off_t lock_and_iterate_hash(struct ntdb_context *ntdb,
-                                       struct hash_info *h)
-{
-       ntdb_off_t val, i;
-       enum NTDB_ERROR ecode;
-
-       if (h->table != NTDB_HASH_OFFSET) {
-               /* We're in a chain. */
-               i = bits_from(h->h, 0, ntdb->hash_bits);
-               ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
-               if (ecode != NTDB_SUCCESS) {
-                       return NTDB_ERR_TO_OFF(ecode);
-               }
-
-               /* We dropped lock, bucket might have moved! */
-               val = ntdb_read_off(ntdb, hbucket_off(NTDB_HASH_OFFSET, i));
-               if (NTDB_OFF_IS_ERR(val)) {
-                       goto unlock;
-               }
-
-               /* We don't remove chains: there should still be one there! */
-               if (!val || !is_chain(val)) {
-                       ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                           NTDB_LOG_ERROR,
-                                           "iterate_hash:"
-                                           " vanished hchain %llu at %llu",
-                                           (long long)val,
-                                           (long long)i);
-                       val = NTDB_ERR_TO_OFF(ecode);
-                       goto unlock;
-               }
-
-               /* Find next bucket in the chain. */
-               val = iterate_chain(ntdb, val, h);
-               if (NTDB_OFF_IS_ERR(val)) {
-                       goto unlock;
-               }
-               if (val != 0) {
-                       return val;
-               }
-               ntdb_unlock_hash(ntdb, i, F_RDLCK);
-
-               /* OK, we've reset h back to top level. */
-       }
-
-       /* We do this unlocked, then re-check. */
-       for (i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
-                                      h->bucket, h->table_size);
-            i != h->table_size;
-            i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
-                                      i+1, h->table_size)) {
-               ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
-               if (ecode != NTDB_SUCCESS) {
-                       return NTDB_ERR_TO_OFF(ecode);
-               }
-
-               val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
-               if (NTDB_OFF_IS_ERR(val)) {
-                       goto unlock;
-               }
-
-               /* Lost race, and it's empty? */
-               if (!val) {
-                       ntdb->stats.traverse_val_vanished++;
-                       ntdb_unlock_hash(ntdb, i, F_RDLCK);
-                       continue;
-               }
-
-               if (!is_chain(val)) {
-                       /* So caller knows what lock to free. */
-                       h->h = i;
-                       /* Return to next bucket. */
-                       h->bucket = i + 1;
-                       val &= NTDB_OFF_MASK;
-                       return val;
-               }
-
-               /* Start at beginning of chain */
-               h->bucket = 0;
-               h->h = i;
-
-               val = iterate_chain(ntdb, val, h);
-               if (NTDB_OFF_IS_ERR(val)) {
-                       goto unlock;
-               }
-               if (val != 0) {
-                       return val;
-               }
-
-               /* Otherwise, bucket has been set to i+1 */
-               ntdb_unlock_hash(ntdb, i, F_RDLCK);
-       }
-       return 0;
-
-unlock:
-       ntdb_unlock_hash(ntdb, i, F_RDLCK);
-       return val;
-}
-
-/* Return success if we find something, NTDB_ERR_NOEXIST if none. */
-enum NTDB_ERROR next_in_hash(struct ntdb_context *ntdb,
-                            struct hash_info *h,
-                            NTDB_DATA *kbuf, size_t *dlen)
-{
-       ntdb_off_t off;
-       struct ntdb_used_record rec;
-       enum NTDB_ERROR ecode;
-
-       off = lock_and_iterate_hash(ntdb, h);
-
-       if (NTDB_OFF_IS_ERR(off)) {
-               return NTDB_OFF_TO_ERR(off);
-       } else if (off == 0) {
-               return NTDB_ERR_NOEXIST;
-       }
-
-       /* The hash for this key is still locked. */
-       ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
-       if (ecode != NTDB_SUCCESS) {
-               goto unlock;
-       }
-       if (rec_magic(&rec) != NTDB_USED_MAGIC) {
-               ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
-                                   NTDB_LOG_ERROR,
-                                   "next_in_hash:"
-                                   " corrupt record at %llu",
-                                   (long long)off);
-               goto unlock;
-       }
-
-       kbuf->dsize = rec_key_length(&rec);
-
-       /* They want data as well? */
-       if (dlen) {
-               *dlen = rec_data_length(&rec);
-               kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
-                                            kbuf->dsize + *dlen);
-       } else {
-               kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
-                                            kbuf->dsize);
-       }
-       if (NTDB_PTR_IS_ERR(kbuf->dptr)) {
-               ecode = NTDB_PTR_ERR(kbuf->dptr);
-               goto unlock;
-       }
-       ecode = NTDB_SUCCESS;
-
-unlock:
-       ntdb_unlock_hash(ntdb, bits_from(h->h, 0, ntdb->hash_bits), F_RDLCK);
-       return ecode;
-
-}
-
-enum NTDB_ERROR first_in_hash(struct ntdb_context *ntdb,
-                            struct hash_info *h,
-                            NTDB_DATA *kbuf, size_t *dlen)
-{
-       h->table = NTDB_HASH_OFFSET;
-       h->table_size = 1 << ntdb->hash_bits;
-       h->bucket = 0;
-
-       return next_in_hash(ntdb, h, kbuf, dlen);
-}
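
/* Illustrative sketch of how the two traversal helpers above are driven:
 * repeated calls yield every key in turn, each buffer allocated with the
 * context's allocator and returned with its hash lock already dropped.
 * This is roughly the shape of the public firstkey/nextkey loop; the
 * hash_info type and free_fn/alloc_data fields come from private.h. */
#include "private.h"
#include <stdint.h>

static enum NTDB_ERROR count_keys(struct ntdb_context *ntdb, uint64_t *n)
{
	struct hash_info h;
	NTDB_DATA key;
	enum NTDB_ERROR ecode;

	*n = 0;
	for (ecode = first_in_hash(ntdb, &h, &key, NULL);
	     ecode == NTDB_SUCCESS;
	     ecode = next_in_hash(ntdb, &h, &key, NULL)) {
		(*n)++;
		/* Key buffers come from the context's allocator. */
		ntdb->free_fn(key.dptr, ntdb->alloc_data);
	}
	return ecode == NTDB_ERR_NOEXIST ? NTDB_SUCCESS : ecode;
}
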
-
-/* Even if the entry isn't in this hash bucket, you'd have to lock this
- * bucket to find it. */
-static enum NTDB_ERROR chainlock(struct ntdb_context *ntdb,
-                                const NTDB_DATA *key, int ltype)
-{
-       uint32_t h = ntdb_hash(ntdb, key->dptr, key->dsize);
-
-       return ntdb_lock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), ltype);
-}
-
-/* lock/unlock one hash chain. This is meant to be used to reduce
-   contention - it cannot guarantee how many records will be locked */
-_PUBLIC_ enum NTDB_ERROR ntdb_chainlock(struct ntdb_context *ntdb, NTDB_DATA key)
-{
-       return chainlock(ntdb, &key, F_WRLCK);
-}
-
-_PUBLIC_ void ntdb_chainunlock(struct ntdb_context *ntdb, NTDB_DATA key)
-{
-       uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
-
-       ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_WRLCK);
-}
-
-_PUBLIC_ enum NTDB_ERROR ntdb_chainlock_read(struct ntdb_context *ntdb,
-                                            NTDB_DATA key)
-{
-       return chainlock(ntdb, &key, F_RDLCK);
-}
-
-_PUBLIC_ void ntdb_chainunlock_read(struct ntdb_context *ntdb, NTDB_DATA key)
-{
-       uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
-
-       ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_RDLCK);
-}
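
/* Illustrative application-side use of the chainlock API above: hold the
 * single hash lock covering "key" across a fetch/modify/store sequence.
 * Assumes the public ntdb_fetch()/ntdb_store() calls and the default
 * allocator (fetched data released with free()); error handling is
 * trimmed for brevity. */
#include <ccan/ntdb/ntdb.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static enum NTDB_ERROR bump_counter(struct ntdb_context *db)
{
	NTDB_DATA key, val;
	uint64_t count = 0;
	enum NTDB_ERROR ecode;

	key.dptr = (unsigned char *)"counter";
	key.dsize = strlen("counter");

	ecode = ntdb_chainlock(db, key);
	if (ecode != NTDB_SUCCESS)
		return ecode;

	if (ntdb_fetch(db, key, &val) == NTDB_SUCCESS) {
		if (val.dsize == sizeof(count))
			memcpy(&count, val.dptr, sizeof(count));
		free(val.dptr);
	}
	count++;

	val.dptr = (unsigned char *)&count;
	val.dsize = sizeof(count);
	ecode = ntdb_store(db, key, val, NTDB_REPLACE);

	ntdb_chainunlock(db, key);
	return ecode;
}
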
diff --git a/ccan/ntdb/io.c b/ccan/ntdb/io.c
deleted file mode 100644 (file)
index 7645cdd..0000000
+++ /dev/null
@@ -1,750 +0,0 @@
- /*
-   Unix SMB/CIFS implementation.
-
-   trivial database library
-
-   Copyright (C) Andrew Tridgell              1999-2005
-   Copyright (C) Paul `Rusty' Russell             2000
-   Copyright (C) Jeremy Allison                           2000-2003
-   Copyright (C) Rusty Russell                    2010
-
-     ** NOTE! The following LGPL license applies to the ntdb
-     ** library. This does NOT imply that all of Samba is released
-     ** under the LGPL
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include <ccan/likely/likely.h>
-
-static void free_old_mmaps(struct ntdb_context *ntdb)
-{
-       struct ntdb_old_mmap *i;
-
-       assert(ntdb->file->direct_count == 0);
-
-       while ((i = ntdb->file->old_mmaps) != NULL) {
-               ntdb->file->old_mmaps = i->next;
-               if (ntdb->flags & NTDB_INTERNAL) {
-                       ntdb->free_fn(i->map_ptr, ntdb->alloc_data);
-               } else {
-                       munmap(i->map_ptr, i->map_size);
-               }
-               ntdb->free_fn(i, ntdb->alloc_data);
-       }
-}
-
-static enum NTDB_ERROR save_old_map(struct ntdb_context *ntdb)
-{
-       struct ntdb_old_mmap *old;
-
-       assert(ntdb->file->direct_count);
-
-       old = ntdb->alloc_fn(ntdb->file, sizeof(*old), ntdb->alloc_data);
-       if (!old) {
-               return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
-                                  "save_old_map alloc failed");
-       }
-       old->next = ntdb->file->old_mmaps;
-       old->map_ptr = ntdb->file->map_ptr;
-       old->map_size = ntdb->file->map_size;
-       ntdb->file->old_mmaps = old;
-
-       return NTDB_SUCCESS;
-}
-
-enum NTDB_ERROR ntdb_munmap(struct ntdb_context *ntdb)
-{
-       if (ntdb->file->fd == -1) {
-               return NTDB_SUCCESS;
-       }
-
-       if (!ntdb->file->map_ptr) {
-               return NTDB_SUCCESS;
-       }
-
-       /* We can't unmap now if there are accessors. */
-       if (ntdb->file->direct_count) {
-               return save_old_map(ntdb);
-       } else {
-               munmap(ntdb->file->map_ptr, ntdb->file->map_size);
-               ntdb->file->map_ptr = NULL;
-       }
-       return NTDB_SUCCESS;
-}
-
-enum NTDB_ERROR ntdb_mmap(struct ntdb_context *ntdb)
-{
-       int mmap_flags;
-
-       if (ntdb->flags & NTDB_INTERNAL)
-               return NTDB_SUCCESS;
-
-#ifndef HAVE_INCOHERENT_MMAP
-       if (ntdb->flags & NTDB_NOMMAP)
-               return NTDB_SUCCESS;
-#endif
-
-       if ((ntdb->open_flags & O_ACCMODE) == O_RDONLY)
-               mmap_flags = PROT_READ;
-       else
-               mmap_flags = PROT_READ | PROT_WRITE;
-
-       /* size_t can be smaller than off_t. */
-       if ((size_t)ntdb->file->map_size == ntdb->file->map_size) {
-               ntdb->file->map_ptr = mmap(NULL, ntdb->file->map_size,
-                                         mmap_flags,
-                                         MAP_SHARED, ntdb->file->fd, 0);
-       } else
-               ntdb->file->map_ptr = MAP_FAILED;
-
-       /*
-        * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
-        */
-       if (ntdb->file->map_ptr == MAP_FAILED) {
-               ntdb->file->map_ptr = NULL;
-#ifdef HAVE_INCOHERENT_MMAP
-               /* Incoherent mmap means everyone must mmap! */
-               return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
-                                 "ntdb_mmap failed for size %lld (%s)",
-                                 (long long)ntdb->file->map_size,
-                                 strerror(errno));
-#else
-               ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
-                          "ntdb_mmap failed for size %lld (%s)",
-                          (long long)ntdb->file->map_size, strerror(errno));
-#endif
-       }
-       return NTDB_SUCCESS;
-}
-
-/* check for an out of bounds access - if it is out of bounds then
-   see if the database has been expanded by someone else and expand
-   if necessary
-   note that "len" is the minimum length needed for the db.
-
-   If probe is true, len being too large isn't a failure.
-*/
-static enum NTDB_ERROR ntdb_normal_oob(struct ntdb_context *ntdb,
-                                      ntdb_off_t off, ntdb_len_t len,
-                                      bool probe)
-{
-       struct stat st;
-       enum NTDB_ERROR ecode;
-
-       if (len + off < len) {
-               if (probe)
-                       return NTDB_SUCCESS;
-
-               return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
-                                 "ntdb_oob off %llu len %llu wrap\n",
-                                 (long long)off, (long long)len);
-       }
-
-       if (ntdb->flags & NTDB_INTERNAL) {
-               if (probe)
-                       return NTDB_SUCCESS;
-
-               ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
-                          "ntdb_oob len %lld beyond internal"
-                          " alloc size %lld",
-                          (long long)(off + len),
-                          (long long)ntdb->file->map_size);
-               return NTDB_ERR_IO;
-       }
-
-       ecode = ntdb_lock_expand(ntdb, F_RDLCK);
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-
-       if (fstat(ntdb->file->fd, &st) != 0) {
-               ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
-                          "Failed to fstat file: %s", strerror(errno));
-               ntdb_unlock_expand(ntdb, F_RDLCK);
-               return NTDB_ERR_IO;
-       }
-
-       ntdb_unlock_expand(ntdb, F_RDLCK);
-
-       if (st.st_size < off + len) {
-               if (probe)
-                       return NTDB_SUCCESS;
-
-               ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
-                          "ntdb_oob len %llu beyond eof at %llu",
-                          (long long)(off + len), (long long)st.st_size);
-               return NTDB_ERR_IO;
-       }
-
-       /* Unmap, update size, remap */
-       ecode = ntdb_munmap(ntdb);
-       if (ecode) {
-               return ecode;
-       }
-
-       ntdb->file->map_size = st.st_size;
-       return ntdb_mmap(ntdb);
-}
-
-/* Endian conversion: we only ever deal with 8 byte quantities */
-void *ntdb_convert(const struct ntdb_context *ntdb, void *buf, ntdb_len_t size)
-{
-       assert(size % 8 == 0);
-       if (unlikely((ntdb->flags & NTDB_CONVERT)) && buf) {
-               uint64_t i, *p = (uint64_t *)buf;
-               for (i = 0; i < size / 8; i++)
-                       p[i] = bswap_64(p[i]);
-       }
-       return buf;
-}
-
-/* Return first non-zero offset in offset array, or end, or -ve error. */
-/* FIXME: Return the off? */
-uint64_t ntdb_find_nonzero_off(struct ntdb_context *ntdb,
-                             ntdb_off_t base, uint64_t start, uint64_t end)
-{
-       uint64_t i;
-       const uint64_t *val;
-
-       /* Zero vs non-zero is the same unconverted: minor optimization. */
-       val = ntdb_access_read(ntdb, base + start * sizeof(ntdb_off_t),
-                             (end - start) * sizeof(ntdb_off_t), false);
-       if (NTDB_PTR_IS_ERR(val)) {
-               return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(val));
-       }
-
-       for (i = 0; i < (end - start); i++) {
-               if (val[i])
-                       break;
-       }
-       ntdb_access_release(ntdb, val);
-       return start + i;
-}
-
-/* Return first zero offset in num offset array, or num, or -ve error. */
-uint64_t ntdb_find_zero_off(struct ntdb_context *ntdb, ntdb_off_t off,
-                          uint64_t num)
-{
-       uint64_t i;
-       const uint64_t *val;
-
-       /* Zero vs non-zero is the same unconverted: minor optimization. */
-       val = ntdb_access_read(ntdb, off, num * sizeof(ntdb_off_t), false);
-       if (NTDB_PTR_IS_ERR(val)) {
-               return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(val));
-       }
-
-       for (i = 0; i < num; i++) {
-               if (!val[i])
-                       break;
-       }
-       ntdb_access_release(ntdb, val);
-       return i;
-}
-
-enum NTDB_ERROR zero_out(struct ntdb_context *ntdb, ntdb_off_t off, ntdb_len_t len)
-{
-       char buf[8192] = { 0 };
-       void *p = ntdb->io->direct(ntdb, off, len, true);
-       enum NTDB_ERROR ecode = NTDB_SUCCESS;
-
-       assert(!(ntdb->flags & NTDB_RDONLY));
-       if (NTDB_PTR_IS_ERR(p)) {
-               return NTDB_PTR_ERR(p);
-       }
-       if (p) {
-               memset(p, 0, len);
-               return ecode;
-       }
-       while (len) {
-               unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
-               ecode = ntdb->io->twrite(ntdb, off, buf, todo);
-               if (ecode != NTDB_SUCCESS) {
-                       break;
-               }
-               len -= todo;
-               off += todo;
-       }
-       return ecode;
-}
-
-/* write a lump of data at a specified offset */
-static enum NTDB_ERROR ntdb_write(struct ntdb_context *ntdb, ntdb_off_t off,
-                               const void *buf, ntdb_len_t len)
-{
-       enum NTDB_ERROR ecode;
-
-       if (ntdb->flags & NTDB_RDONLY) {
-               return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
-                                 "Write to read-only database");
-       }
-
-       ecode = ntdb_oob(ntdb, off, len, false);
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-
-       if (ntdb->file->map_ptr) {
-               memcpy(off + (char *)ntdb->file->map_ptr, buf, len);
-       } else {
-#ifdef HAVE_INCOHERENT_MMAP
-               return NTDB_ERR_IO;
-#else
-               ssize_t ret;
-               ret = pwrite(ntdb->file->fd, buf, len, off);
-               if (ret != len) {
-                       /* This shouldn't happen: we avoid sparse files. */
-                       if (ret >= 0)
-                               errno = ENOSPC;
-
-                       return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
-                                         "ntdb_write: %zi at %zu len=%zu (%s)",
-                                         ret, (size_t)off, (size_t)len,
-                                         strerror(errno));
-               }
-#endif
-       }
-       return NTDB_SUCCESS;
-}
-
-/* read a lump of data at a specified offset */
-static enum NTDB_ERROR ntdb_read(struct ntdb_context *ntdb, ntdb_off_t off,
-                              void *buf, ntdb_len_t len)
-{
-       enum NTDB_ERROR ecode;
-
-       ecode = ntdb_oob(ntdb, off, len, false);
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-
-       if (ntdb->file->map_ptr) {
-               memcpy(buf, off + (char *)ntdb->file->map_ptr, len);
-       } else {
-#ifdef HAVE_INCOHERENT_MMAP
-               return NTDB_ERR_IO;
-#else
-               ssize_t r = pread(ntdb->file->fd, buf, len, off);
-               if (r != len) {
-                       return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
-                                         "ntdb_read failed with %zi at %zu "
-                                         "len=%zu (%s) map_size=%zu",
-                                         r, (size_t)off, (size_t)len,
-                                         strerror(errno),
-                                         (size_t)ntdb->file->map_size);
-               }
-#endif
-       }
-       return NTDB_SUCCESS;
-}
-
-enum NTDB_ERROR ntdb_write_convert(struct ntdb_context *ntdb, ntdb_off_t off,
-                                const void *rec, size_t len)
-{
-       enum NTDB_ERROR ecode;
-
-       if (unlikely((ntdb->flags & NTDB_CONVERT))) {
-               void *conv = ntdb->alloc_fn(ntdb, len, ntdb->alloc_data);
-               if (!conv) {
-                       return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
-                                         "ntdb_write: no memory converting"
-                                         " %zu bytes", len);
-               }
-               memcpy(conv, rec, len);
-               ecode = ntdb->io->twrite(ntdb, off,
-                                        ntdb_convert(ntdb, conv, len), len);
-               ntdb->free_fn(conv, ntdb->alloc_data);
-       } else {
-               ecode = ntdb->io->twrite(ntdb, off, rec, len);
-       }
-       return ecode;
-}
-
-enum NTDB_ERROR ntdb_read_convert(struct ntdb_context *ntdb, ntdb_off_t off,
-                               void *rec, size_t len)
-{
-       enum NTDB_ERROR ecode = ntdb->io->tread(ntdb, off, rec, len);
-       ntdb_convert(ntdb, rec, len);
-       return ecode;
-}
-
-static void *_ntdb_alloc_read(struct ntdb_context *ntdb, ntdb_off_t offset,
-                            ntdb_len_t len, unsigned int prefix)
-{
-       unsigned char *buf;
-       enum NTDB_ERROR ecode;
-
-       /* some systems don't like zero length malloc */
-       buf = ntdb->alloc_fn(ntdb, prefix + len ? prefix + len : 1,
-                         ntdb->alloc_data);
-       if (!buf) {
-               ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
-                          "ntdb_alloc_read alloc failed len=%zu",
-                          (size_t)(prefix + len));
-               return NTDB_ERR_PTR(NTDB_ERR_OOM);
-       } else {
-               ecode = ntdb->io->tread(ntdb, offset, buf+prefix, len);
-               if (unlikely(ecode != NTDB_SUCCESS)) {
-                       ntdb->free_fn(buf, ntdb->alloc_data);
-                       return NTDB_ERR_PTR(ecode);
-               }
-       }
-       return buf;
-}
-
-/* read a lump of data, allocating the space for it */
-void *ntdb_alloc_read(struct ntdb_context *ntdb, ntdb_off_t offset, ntdb_len_t len)
-{
-       return _ntdb_alloc_read(ntdb, offset, len, 0);
-}
-
-static enum NTDB_ERROR fill(struct ntdb_context *ntdb,
-                          const void *buf, size_t size,
-                          ntdb_off_t off, ntdb_len_t len)
-{
-       while (len) {
-               size_t n = len > size ? size : len;
-               ssize_t ret = pwrite(ntdb->file->fd, buf, n, off);
-               if (ret != n) {
-                       if (ret >= 0)
-                               errno = ENOSPC;
-
-                       return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
-                                         "fill failed:"
-                                         " %zi at %zu len=%zu (%s)",
-                                         ret, (size_t)off, (size_t)len,
-                                         strerror(errno));
-               }
-               len -= n;
-               off += n;
-       }
-       return NTDB_SUCCESS;
-}
-
-/* expand a file.  we prefer to use ftruncate, as that is what posix
-  says to use for mmap expansion */
-static enum NTDB_ERROR ntdb_expand_file(struct ntdb_context *ntdb,
-                                     ntdb_len_t addition)
-{
-       char buf[8192];
-       enum NTDB_ERROR ecode;
-
-       assert((ntdb->file->map_size + addition) % NTDB_PGSIZE == 0);
-       if (ntdb->flags & NTDB_RDONLY) {
-               return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
-                                 "Expand on read-only database");
-       }
-
-       if (ntdb->flags & NTDB_INTERNAL) {
-               char *new;
-
-               /* Can't free it if we have direct accesses. */
-               if (ntdb->file->direct_count) {
-                       ecode = save_old_map(ntdb);
-                       if (ecode) {
-                               return ecode;
-                       }
-                       new = ntdb->alloc_fn(ntdb->file,
-                                            ntdb->file->map_size + addition,
-                                            ntdb->alloc_data);
-                       if (new) {
-                               memcpy(new, ntdb->file->map_ptr,
-                                      ntdb->file->map_size);
-                       }
-               } else {
-                       new = ntdb->expand_fn(ntdb->file->map_ptr,
-                                             ntdb->file->map_size + addition,
-                                             ntdb->alloc_data);
-               }
-               if (!new) {
-                       return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
-                                         "No memory to expand database");
-               }
-               ntdb->file->map_ptr = new;
-               ntdb->file->map_size += addition;
-               return NTDB_SUCCESS;
-       } else {
-               /* Unmap before trying to write; old NTDB claimed OpenBSD had
-                * problem with this otherwise. */
-               ecode = ntdb_munmap(ntdb);
-               if (ecode) {
-                       return ecode;
-               }
-
-               /* If this fails, we try to fill anyway. */
-               if (ftruncate(ntdb->file->fd, ntdb->file->map_size + addition))
-                       ;
-
-               /* now fill the file with something. This ensures that the
-                  file isn't sparse, which would be very bad if we ran out of
-                  disk. This must be done with write, not via mmap */
-               memset(buf, 0x43, sizeof(buf));
-               ecode = fill(ntdb, buf, sizeof(buf), ntdb->file->map_size,
-                            addition);
-               if (ecode != NTDB_SUCCESS)
-                       return ecode;
-               ntdb->file->map_size += addition;
-               return ntdb_mmap(ntdb);
-       }
-}
-
-const void *ntdb_access_read(struct ntdb_context *ntdb,
-                           ntdb_off_t off, ntdb_len_t len, bool convert)
-{
-       void *ret = NULL;
-
-       if (likely(!(ntdb->flags & NTDB_CONVERT))) {
-               ret = ntdb->io->direct(ntdb, off, len, false);
-
-               if (NTDB_PTR_IS_ERR(ret)) {
-                       return ret;
-               }
-       }
-       if (!ret) {
-               struct ntdb_access_hdr *hdr;
-               hdr = _ntdb_alloc_read(ntdb, off, len, sizeof(*hdr));
-               if (NTDB_PTR_IS_ERR(hdr)) {
-                       return hdr;
-               }
-               hdr->next = ntdb->access;
-               ntdb->access = hdr;
-               ret = hdr + 1;
-               if (convert) {
-                       ntdb_convert(ntdb, (void *)ret, len);
-               }
-       } else {
-               ntdb->file->direct_count++;
-       }
-
-       return ret;
-}
-
-void *ntdb_access_write(struct ntdb_context *ntdb,
-                      ntdb_off_t off, ntdb_len_t len, bool convert)
-{
-       void *ret = NULL;
-
-       if (ntdb->flags & NTDB_RDONLY) {
-               ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
-                          "Write to read-only database");
-               return NTDB_ERR_PTR(NTDB_ERR_RDONLY);
-       }
-
-       if (likely(!(ntdb->flags & NTDB_CONVERT))) {
-               ret = ntdb->io->direct(ntdb, off, len, true);
-
-               if (NTDB_PTR_IS_ERR(ret)) {
-                       return ret;
-               }
-       }
-
-       if (!ret) {
-               struct ntdb_access_hdr *hdr;
-               hdr = _ntdb_alloc_read(ntdb, off, len, sizeof(*hdr));
-               if (NTDB_PTR_IS_ERR(hdr)) {
-                       return hdr;
-               }
-               hdr->next = ntdb->access;
-               ntdb->access = hdr;
-               hdr->off = off;
-               hdr->len = len;
-               hdr->convert = convert;
-               ret = hdr + 1;
-               if (convert)
-                       ntdb_convert(ntdb, (void *)ret, len);
-       } else {
-               ntdb->file->direct_count++;
-       }
-       return ret;
-}
-
-static struct ntdb_access_hdr **find_hdr(struct ntdb_context *ntdb, const void *p)
-{
-       struct ntdb_access_hdr **hp;
-
-       for (hp = &ntdb->access; *hp; hp = &(*hp)->next) {
-               if (*hp + 1 == p)
-                       return hp;
-       }
-       return NULL;
-}
-
-void ntdb_access_release(struct ntdb_context *ntdb, const void *p)
-{
-       struct ntdb_access_hdr *hdr, **hp = find_hdr(ntdb, p);
-
-       if (hp) {
-               hdr = *hp;
-               *hp = hdr->next;
-               ntdb->free_fn(hdr, ntdb->alloc_data);
-       } else {
-               if (--ntdb->file->direct_count == 0) {
-                       free_old_mmaps(ntdb);
-               }
-       }
-}
-
-enum NTDB_ERROR ntdb_access_commit(struct ntdb_context *ntdb, void *p)
-{
-       struct ntdb_access_hdr *hdr, **hp = find_hdr(ntdb, p);
-       enum NTDB_ERROR ecode;
-
-       if (hp) {
-               hdr = *hp;
-               if (hdr->convert)
-                       ecode = ntdb_write_convert(ntdb, hdr->off, p, hdr->len);
-               else
-                       ecode = ntdb_write(ntdb, hdr->off, p, hdr->len);
-               *hp = hdr->next;
-               ntdb->free_fn(hdr, ntdb->alloc_data);
-       } else {
-               if (--ntdb->file->direct_count == 0) {
-                       free_old_mmaps(ntdb);
-               }
-               ecode = NTDB_SUCCESS;
-       }
-
-       return ecode;
-}
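
/* Illustrative internal-style use of the accessor pair above: the pointer
 * may come straight from the mmap (direct_count bumped) or from a
 * temporary copy, but the caller releases it the same way in both cases. */
#include "private.h"
#include <stdint.h>

static enum NTDB_ERROR sum_bytes(struct ntdb_context *ntdb, ntdb_off_t off,
				 ntdb_len_t len, uint64_t *sum)
{
	const unsigned char *p;
	ntdb_len_t i;

	p = ntdb_access_read(ntdb, off, len, false);
	if (NTDB_PTR_IS_ERR(p))
		return NTDB_PTR_ERR(p);

	*sum = 0;
	for (i = 0; i < len; i++)
		*sum += p[i];

	/* Frees the temporary copy, or lets saved old mmaps be reclaimed
	 * once the last direct accessor is gone. */
	ntdb_access_release(ntdb, p);
	return NTDB_SUCCESS;
}
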
-
-static void *ntdb_direct(struct ntdb_context *ntdb, ntdb_off_t off, size_t len,
-                       bool write_mode)
-{
-       enum NTDB_ERROR ecode;
-
-       if (unlikely(!ntdb->file->map_ptr))
-               return NULL;
-
-       ecode = ntdb_oob(ntdb, off, len, false);
-       if (unlikely(ecode != NTDB_SUCCESS))
-               return NTDB_ERR_PTR(ecode);
-       return (char *)ntdb->file->map_ptr + off;
-}
-
-static ntdb_off_t ntdb_read_normal_off(struct ntdb_context *ntdb,
-                                      ntdb_off_t off)
-{
-       ntdb_off_t ret;
-       enum NTDB_ERROR ecode;
-       ntdb_off_t *p;
-
-       p = ntdb_direct(ntdb, off, sizeof(*p), false);
-       if (NTDB_PTR_IS_ERR(p)) {
-               return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(p));
-       }
-       if (likely(p)) {
-               return *p;
-       }
-
-       ecode = ntdb_read(ntdb, off, &ret, sizeof(ret));
-       if (ecode != NTDB_SUCCESS) {
-               return NTDB_ERR_TO_OFF(ecode);
-       }
-       return ret;
-}
-
-static ntdb_off_t ntdb_read_convert_off(struct ntdb_context *ntdb,
-                                       ntdb_off_t off)
-{
-       ntdb_off_t ret;
-       enum NTDB_ERROR ecode;
-
-       ecode = ntdb_read_convert(ntdb, off, &ret, sizeof(ret));
-       if (ecode != NTDB_SUCCESS) {
-               return NTDB_ERR_TO_OFF(ecode);
-       }
-       return ret;
-}
-
-static enum NTDB_ERROR ntdb_write_normal_off(struct ntdb_context *ntdb,
-                                            ntdb_off_t off, ntdb_off_t val)
-{
-       ntdb_off_t *p;
-
-       p = ntdb_direct(ntdb, off, sizeof(*p), true);
-       if (NTDB_PTR_IS_ERR(p)) {
-               return NTDB_PTR_ERR(p);
-       }
-       if (likely(p)) {
-               *p = val;
-               return NTDB_SUCCESS;
-       }
-       return ntdb_write(ntdb, off, &val, sizeof(val));
-}
-
-static enum NTDB_ERROR ntdb_write_convert_off(struct ntdb_context *ntdb,
-                                             ntdb_off_t off, ntdb_off_t val)
-{
-       return ntdb_write_convert(ntdb, off, &val, sizeof(val));
-}
-
-void ntdb_inc_seqnum(struct ntdb_context *ntdb)
-{
-       ntdb_off_t seq;
-
-       if (likely(!(ntdb->flags & NTDB_CONVERT))) {
-               int64_t *direct;
-
-               direct = ntdb->io->direct(ntdb,
-                                        offsetof(struct ntdb_header, seqnum),
-                                        sizeof(*direct), true);
-               if (likely(direct)) {
-                       /* Don't let it go negative, even briefly */
-                       if (unlikely((*direct) + 1 < 0))
-                               *direct = 0;
-                       (*direct)++;
-                       return;
-               }
-       }
-
-       seq = ntdb_read_off(ntdb, offsetof(struct ntdb_header, seqnum));
-       if (!NTDB_OFF_IS_ERR(seq)) {
-               seq++;
-               if (unlikely((int64_t)seq < 0))
-                       seq = 0;
-               ntdb_write_off(ntdb, offsetof(struct ntdb_header, seqnum), seq);
-       }
-}
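
/* Illustrative reader-side counterpart: cache the sequence number and
 * re-check it later to detect any intervening write.  This assumes the
 * database was opened with the NTDB_SEQNUM flag and that the public
 * ntdb_get_seqnum() accessor declared in ntdb.h is available. */
#include <ccan/ntdb/ntdb.h>
#include <stdbool.h>
#include <stdint.h>

static bool cache_still_valid(struct ntdb_context *db, int64_t cached_seqnum)
{
	return ntdb_get_seqnum(db) == cached_seqnum;
}
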
-
-static const struct ntdb_methods io_methods = {
-       ntdb_read,
-       ntdb_write,
-       ntdb_normal_oob,
-       ntdb_expand_file,
-       ntdb_direct,
-       ntdb_read_normal_off,
-       ntdb_write_normal_off,
-};
-
-static const struct ntdb_methods io_convert_methods = {
-       ntdb_read,
-       ntdb_write,
-       ntdb_normal_oob,
-       ntdb_expand_file,
-       ntdb_direct,
-       ntdb_read_convert_off,
-       ntdb_write_convert_off,
-};
-
-/*
-  initialise the default methods table
-*/
-void ntdb_io_init(struct ntdb_context *ntdb)
-{
-       if (ntdb->flags & NTDB_CONVERT)
-               ntdb->io = &io_convert_methods;
-       else
-               ntdb->io = &io_methods;
-}
diff --git a/ccan/ntdb/lock.c b/ccan/ntdb/lock.c
deleted file mode 100644 (file)
index 71d6d85..0000000
+++ /dev/null
@@ -1,886 +0,0 @@
- /*
-   Unix SMB/CIFS implementation.
-
-   trivial database library
-
-   Copyright (C) Andrew Tridgell              1999-2005
-   Copyright (C) Paul `Rusty' Russell             2000
-   Copyright (C) Jeremy Allison                           2000-2003
-
-     ** NOTE! The following LGPL license applies to the ntdb
-     ** library. This does NOT imply that all of Samba is released
-     ** under the LGPL
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "private.h"
-#include <ccan/build_assert/build_assert.h>
-
-/* If we were threaded, we could wait for unlock, but we're not, so fail. */
-enum NTDB_ERROR owner_conflict(struct ntdb_context *ntdb, const char *call)
-{
-       return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
-                         "%s: lock owned by another ntdb in this process.",
-                         call);
-}
-
-/* If we fork, we no longer really own locks. */
-bool check_lock_pid(struct ntdb_context *ntdb, const char *call, bool log)
-{
-       /* No locks?  No problem! */
-       if (ntdb->file->allrecord_lock.count == 0
-           && ntdb->file->num_lockrecs == 0) {
-               return true;
-       }
-
-       /* No fork?  No problem! */
-       if (ntdb->file->locker == getpid()) {
-               return true;
-       }
-
-       if (log) {
-               ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
-                           "%s: fork() detected after lock acquisition!"
-                           " (%u vs %u)", call,
-                           (unsigned int)ntdb->file->locker,
-                           (unsigned int)getpid());
-       }
-       return false;
-}
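
/* Illustrative consequence of the check above: fcntl locks belong to a
 * single process, so a child forked while the parent holds ntdb locks
 * must not keep using the inherited handle for locked work; the safe
 * pattern is to open its own handle.  The path, flags and mode below
 * are placeholders, and ntdb_open()/ntdb_close() are assumed to be the
 * public calls from ntdb.h. */
#include <ccan/ntdb/ntdb.h>
#include <fcntl.h>

static void child_after_fork(void)
{
	struct ntdb_context *db;

	/* A fresh handle in the child means fresh lock ownership. */
	db = ntdb_open("example.ntdb", NTDB_DEFAULT, O_RDWR, 0600, NULL);
	if (db) {
		/* ... the child's own locking and I/O ... */
		ntdb_close(db);
	}
}
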
-
-int ntdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
-                  void *unused)
-{
-       struct flock fl;
-       int ret;
-
-       do {
-               fl.l_type = rw;
-               fl.l_whence = SEEK_SET;
-               fl.l_start = off;
-               fl.l_len = len;
-
-               if (waitflag)
-                       ret = fcntl(fd, F_SETLKW, &fl);
-               else
-                       ret = fcntl(fd, F_SETLK, &fl);
-       } while (ret != 0 && errno == EINTR);
-       return ret;
-}
-
-int ntdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *unused)
-{
-       struct flock fl;
-       int ret;
-
-       do {
-               fl.l_type = F_UNLCK;
-               fl.l_whence = SEEK_SET;
-               fl.l_start = off;
-               fl.l_len = len;
-
-               ret = fcntl(fd, F_SETLKW, &fl);
-       } while (ret != 0 && errno == EINTR);
-       return ret;
-}
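
/* The two helpers above are the stock implementations behind the
 * context's lock_fn/unlock_fn hooks.  A caller can supply wrappers with
 * the same signatures, e.g. to count lock traffic; wiring them in goes
 * through the locking attribute declared in ntdb.h (not shown here). */
#include "private.h"

static unsigned int lock_calls;

static int counting_lock(int fd, int rw, off_t off, off_t len,
			 bool waitflag, void *unused)
{
	lock_calls++;
	return ntdb_fcntl_lock(fd, rw, off, len, waitflag, unused);
}
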
-
-static int lock(struct ntdb_context *ntdb,
-                     int rw, off_t off, off_t len, bool waitflag)
-{
-       int ret;
-       if (ntdb->file->allrecord_lock.count == 0
-           && ntdb->file->num_lockrecs == 0) {
-               ntdb->file->locker = getpid();
-       }
-
-       ntdb->stats.lock_lowlevel++;
-       ret = ntdb->lock_fn(ntdb->file->fd, rw, off, len, waitflag,
-                          ntdb->lock_data);
-       if (!waitflag) {
-               ntdb->stats.lock_nonblock++;
-               if (ret != 0)
-                       ntdb->stats.lock_nonblock_fail++;
-       }
-       return ret;
-}
-
-static int unlock(struct ntdb_context *ntdb, int rw, off_t off, off_t len)
-{
-#if 0 /* Check they matched up locks and unlocks correctly. */
-       char line[80];
-       FILE *locks;
-       bool found = false;
-
-       locks = fopen("/proc/locks", "r");
-
-       while (fgets(line, 80, locks)) {
-               char *p;
-               int type, start, l;
-
-               /* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
-               p = strchr(line, ':') + 1;
-               if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
-                       continue;
-               p += strlen(" POSIX  ADVISORY  ");
-               if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
-                       type = F_RDLCK;
-               else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
-                       type = F_WRLCK;
-               else
-                       abort();
-               p += 6;
-               if (atoi(p) != getpid())
-                       continue;
-               p = strchr(strchr(p, ' ') + 1, ' ') + 1;
-               start = atoi(p);
-               p = strchr(p, ' ') + 1;
-               if (strncmp(p, "EOF", 3) == 0)
-                       l = 0;
-               else
-                       l = atoi(p) - start + 1;
-
-               if (off == start) {
-                       if (len != l) {
-                               fprintf(stderr, "Len %u should be %u: %s",
-                                       (int)len, l, line);
-                               abort();
-                       }
-                       if (type != rw) {
-                               fprintf(stderr, "Type %s wrong: %s",
-                                       rw == F_RDLCK ? "READ" : "WRITE", line);
-                               abort();
-                       }
-                       found = true;
-                       break;
-               }
-       }
-
-       if (!found) {
-               fprintf(stderr, "Unlock on %u@%u not found!",
-                       (int)off, (int)len);
-               abort();
-       }
-
-       fclose(locks);
-#endif
-
-       return ntdb->unlock_fn(ntdb->file->fd, rw, off, len, ntdb->lock_data);
-}
-
-/* a byte range locking function - return 0 on success
-   this functions locks len bytes at the specified offset.
-
-   note that a len of zero means lock to end of file
-*/
-static enum NTDB_ERROR ntdb_brlock(struct ntdb_context *ntdb,
-                                int rw_type, ntdb_off_t offset, ntdb_off_t len,
-                                enum ntdb_lock_flags flags)
-{
-       int ret;
-
-       if (rw_type == F_WRLCK && (ntdb->flags & NTDB_RDONLY)) {
-               return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
-                                 "Write lock attempted on read-only database");
-       }
-
-       if (ntdb->flags & NTDB_NOLOCK) {
-               return NTDB_SUCCESS;
-       }
-
-       /* A 32 bit system cannot open a 64-bit file, but it could have
-        * expanded since then: check here. */
-       if ((size_t)(offset + len) != offset + len) {
-               return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
-                                 "ntdb_brlock: lock on giant offset %llu",
-                                 (long long)(offset + len));
-       }
-
-       ret = lock(ntdb, rw_type, offset, len, flags & NTDB_LOCK_WAIT);
-       if (ret != 0) {
-               /* Generic lock error. errno set by fcntl.
-                * EAGAIN is an expected return from non-blocking
-                * locks. */
-               if (!(flags & NTDB_LOCK_PROBE)
-                   && (errno != EAGAIN && errno != EINTR)) {
-                       ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
-                                  "ntdb_brlock failed (fd=%d) at"
-                                  " offset %zu rw_type=%d flags=%d len=%zu:"
-                                  " %s",
-                                  ntdb->file->fd, (size_t)offset, rw_type,
-                                  flags, (size_t)len, strerror(errno));
-               }
-               return NTDB_ERR_LOCK;
-       }
-       return NTDB_SUCCESS;
-}
-
-static enum NTDB_ERROR ntdb_brunlock(struct ntdb_context *ntdb,
-                                  int rw_type, ntdb_off_t offset, size_t len)
-{
-       if (ntdb->flags & NTDB_NOLOCK) {
-               return NTDB_SUCCESS;
-       }
-
-       if (!check_lock_pid(ntdb, "ntdb_brunlock", false))
-               return NTDB_ERR_LOCK;
-
-       if (unlock(ntdb, rw_type, offset, len) == -1) {
-               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
-                                 "ntdb_brunlock failed (fd=%d) at offset %zu"
-                                 " rw_type=%d len=%zu: %s",
-                                 ntdb->file->fd, (size_t)offset, rw_type,
-                                 (size_t)len, strerror(errno));
-       }
-       return NTDB_SUCCESS;
-}
-
-/*
-  upgrade a read lock to a write lock. This needs to be handled in a
-  special way as some OSes (such as solaris) have too conservative
-  deadlock detection and claim a deadlock when progress can be
-  made. For those OSes we may loop for a while.
-*/
-enum NTDB_ERROR ntdb_allrecord_upgrade(struct ntdb_context *ntdb, off_t start)
-{
-       int count = 1000;
-
-       if (!check_lock_pid(ntdb, "ntdb_transaction_prepare_commit", true))
-               return NTDB_ERR_LOCK;
-
-       if (ntdb->file->allrecord_lock.count != 1) {
-               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
-                                 "ntdb_allrecord_upgrade failed:"
-                                 " count %u too high",
-                                 ntdb->file->allrecord_lock.count);
-       }
-
-       if (ntdb->file->allrecord_lock.off != 1) {
-               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
-                                 "ntdb_allrecord_upgrade failed:"
-                                 " already upgraded?");
-       }
-
-       if (ntdb->file->allrecord_lock.owner != ntdb) {
-               return owner_conflict(ntdb, "ntdb_allrecord_upgrade");
-       }
-
-       while (count--) {
-               struct timeval tv;
-               if (ntdb_brlock(ntdb, F_WRLCK, start, 0,
-                              NTDB_LOCK_WAIT|NTDB_LOCK_PROBE) == NTDB_SUCCESS) {
-                       ntdb->file->allrecord_lock.ltype = F_WRLCK;
-                       ntdb->file->allrecord_lock.off = 0;
-                       return NTDB_SUCCESS;
-               }
-               if (errno != EDEADLK) {
-                       break;
-               }
-               /* sleep for as short a time as we can - more portable than usleep() */
-               tv.tv_sec = 0;
-               tv.tv_usec = 1;
-               select(0, NULL, NULL, NULL, &tv);
-       }
-
-       if (errno != EAGAIN && errno != EINTR)
-               ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
-                          "ntdb_allrecord_upgrade failed");
-       return NTDB_ERR_LOCK;
-}
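
As a standalone illustration of the retry pattern above (not ntdb code; the function name and parameters are made up for this sketch): with F_SETLKW the kernel may report EDEADLK even when the upgrade could eventually succeed, so the caller backs off for about a microsecond using select() -- more portable than usleep() -- and tries again a bounded number of times.

    #include <errno.h>
    #include <fcntl.h>
    #include <sys/select.h>
    #include <sys/time.h>

    /* Try to take a write lock from `start` to end of file, retrying a
     * limited number of times when the OS claims a (possibly spurious)
     * deadlock.  Returns 0 on success, -1 on failure. */
    static int upgrade_with_retry(int fd, off_t start, int attempts)
    {
            struct flock fl = {
                    .l_type = F_WRLCK,
                    .l_whence = SEEK_SET,
                    .l_start = start,
                    .l_len = 0,             /* to end of file */
            };

            while (attempts-- > 0) {
                    if (fcntl(fd, F_SETLKW, &fl) == 0)
                            return 0;       /* upgraded */
                    if (errno != EDEADLK)
                            return -1;      /* real failure: give up */
                    struct timeval tv = { .tv_sec = 0, .tv_usec = 1 };
                    select(0, NULL, NULL, NULL, &tv);
            }
            return -1;                      /* still "deadlocked" after retries */
    }
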
-
-static struct ntdb_lock *find_nestlock(struct ntdb_context *ntdb, ntdb_off_t offset,
-                                     const struct ntdb_context *owner)
-{
-       unsigned int i;
-
-       for (i=0; i<ntdb->file->num_lockrecs; i++) {
-               if (ntdb->file->lockrecs[i].off == offset) {
-                       if (owner && ntdb->file->lockrecs[i].owner != owner)
-                               return NULL;
-                       return &ntdb->file->lockrecs[i];
-               }
-       }
-       return NULL;
-}
-
-enum NTDB_ERROR ntdb_lock_and_recover(struct ntdb_context *ntdb)
-{
-       enum NTDB_ERROR ecode;
-
-       if (!check_lock_pid(ntdb, "ntdb_transaction_prepare_commit", true))
-               return NTDB_ERR_LOCK;
-
-       ecode = ntdb_allrecord_lock(ntdb, F_WRLCK, NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK,
-                                  false);
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-
-       ecode = ntdb_lock_open(ntdb, F_WRLCK, NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
-       if (ecode != NTDB_SUCCESS) {
-               ntdb_allrecord_unlock(ntdb, F_WRLCK);
-               return ecode;
-       }
-       ecode = ntdb_transaction_recover(ntdb);
-       ntdb_unlock_open(ntdb, F_WRLCK);
-       ntdb_allrecord_unlock(ntdb, F_WRLCK);
-
-       return ecode;
-}
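
Before reading ntdb_nest_lock below, it helps to see why a per-offset count is kept at all: POSIX fcntl locks do not nest within a process, so "taking the same lock twice" merges into a single lock and one unlock releases the byte entirely. A hypothetical standalone demonstration (not ntdb code) follows.

    #include <fcntl.h>

    /* Lock the same byte twice, then unlock once: afterwards the byte is
     * completely unlocked, because a process holds at most one fcntl lock
     * on a given region of a file.  This is why ntdb_nest_lock keeps its
     * own count and only calls ntdb_brlock for the first holder. */
    static void nesting_demo(int fd)
    {
            struct flock fl = {
                    .l_type = F_WRLCK,
                    .l_whence = SEEK_SET,
                    .l_start = 0,
                    .l_len = 1,
            };

            fcntl(fd, F_SETLKW, &fl);       /* first lock: granted */
            fcntl(fd, F_SETLKW, &fl);       /* "second" lock: merged, no-op */

            fl.l_type = F_UNLCK;
            fcntl(fd, F_SETLK, &fl);        /* one unlock drops it entirely */
    }
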
-
-/* lock an offset in the database. */
-static enum NTDB_ERROR ntdb_nest_lock(struct ntdb_context *ntdb,
-                                   ntdb_off_t offset, int ltype,
-                                   enum ntdb_lock_flags flags)
-{
-       struct ntdb_lock *new_lck;
-       enum NTDB_ERROR ecode;
-
-       assert(offset <= (NTDB_HASH_LOCK_START + (1 << ntdb->hash_bits)
-                         + ntdb->file->map_size / 8));
-
-       if (ntdb->flags & NTDB_NOLOCK)
-               return NTDB_SUCCESS;
-
-       if (!check_lock_pid(ntdb, "ntdb_nest_lock", true)) {
-               return NTDB_ERR_LOCK;
-       }
-
-       ntdb->stats.locks++;
-
-       new_lck = find_nestlock(ntdb, offset, NULL);
-       if (new_lck) {
-               if (new_lck->owner != ntdb) {
-                       return owner_conflict(ntdb, "ntdb_nest_lock");
-               }
-
-               if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
-                       return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
-                                         "ntdb_nest_lock:"
-                                         " offset %zu has read lock",
-                                         (size_t)offset);
-               }
-               /* Just increment the count in our struct: POSIX locks don't stack. */
-               new_lck->count++;
-               return NTDB_SUCCESS;
-       }
-
-#if 0
-       if (ntdb->file->num_lockrecs
-           && offset >= NTDB_HASH_LOCK_START
-           && offset < NTDB_HASH_LOCK_START + NTDB_HASH_LOCK_RANGE) {
-               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
-                                 "ntdb_nest_lock: already have a hash lock?");
-       }
-#endif
-       if (ntdb->file->lockrecs == NULL) {
-               new_lck = ntdb->alloc_fn(ntdb->file, sizeof(*ntdb->file->lockrecs),
-                                    ntdb->alloc_data);
-       } else {
-               new_lck = (struct ntdb_lock *)ntdb->expand_fn(
-                       ntdb->file->lockrecs,
-                       sizeof(*ntdb->file->lockrecs)
-                       * (ntdb->file->num_lockrecs+1),
-                       ntdb->alloc_data);
-       }
-       if (new_lck == NULL) {
-               return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
-                                 "ntdb_nest_lock:"
-                                 " unable to allocate %zu lock struct",
-                                 ntdb->file->num_lockrecs + 1);
-       }
-       ntdb->file->lockrecs = new_lck;
-
-       /* Since fcntl locks don't nest, we do a lock for the first one,
-          and simply bump the count for future ones */
-       ecode = ntdb_brlock(ntdb, ltype, offset, 1, flags);
-       if (ecode != NTDB_SUCCESS) {
-               return ecode;
-       }
-
-       /* First time we grab a lock, perhaps someone died in commit? */
-       if (!(flags & NTDB_LOCK_NOCHECK)
-           && ntdb->file->num_lockrecs == 0) {
-               ntdb_bool_err berr = ntdb_needs_recovery(ntdb);
-               if (berr != false) {
-                       ntdb_brunlock(ntdb, ltype, offset, 1);
-