ntdb: next-generation trivial key-value database
author: David Disseldorp <ddiss@suse.de>
Fri, 13 Mar 2015 16:45:16 +0000 (17:45 +0100)
committer: David Disseldorp <ddiss@suse.de>
Mon, 16 Mar 2015 13:34:53 +0000 (14:34 +0100)
ntdb is a partial rewrite of Samba's Trivial Database, providing > 4GB
database scalability and an improved API.
Obtained from the Samba repository at git://git.samba.org/samba.git, as
of bd13405e8570e9a5942165a8c52a2bc3fdc9d96e.

See Samba's Git changelog for authorship details.

Signed-off-by: David Disseldorp <ddiss@suse.de>
110 files changed:
ccan/ntdb/ABI/ntdb-0.9.sigs [new file with mode: 0644]
ccan/ntdb/ABI/ntdb-1.0.sigs [new file with mode: 0644]
ccan/ntdb/LICENSE [new file with mode: 0644]
ccan/ntdb/Makefile [new file with mode: 0644]
ccan/ntdb/check.c [new file with mode: 0644]
ccan/ntdb/configure [new file with mode: 0755]
ccan/ntdb/doc/TDB_porting.txt [new file with mode: 0644]
ccan/ntdb/doc/design.lyx [new file with mode: 0644]
ccan/ntdb/doc/design.pdf [new file with mode: 0644]
ccan/ntdb/doc/design.txt [new file with mode: 0644]
ccan/ntdb/free.c [new file with mode: 0644]
ccan/ntdb/hash.c [new file with mode: 0644]
ccan/ntdb/io.c [new file with mode: 0644]
ccan/ntdb/lock.c [new file with mode: 0644]
ccan/ntdb/man/ntdb.3.xml [new file with mode: 0644]
ccan/ntdb/man/ntdbbackup.8.xml [new file with mode: 0644]
ccan/ntdb/man/ntdbdump.8.xml [new file with mode: 0644]
ccan/ntdb/man/ntdbrestore.8.xml [new file with mode: 0644]
ccan/ntdb/man/ntdbtool.8.xml [new file with mode: 0644]
ccan/ntdb/ntdb.c [new file with mode: 0644]
ccan/ntdb/ntdb.h [new file with mode: 0644]
ccan/ntdb/ntdb.pc.in [new file with mode: 0644]
ccan/ntdb/open.c [new file with mode: 0644]
ccan/ntdb/private.h [new file with mode: 0644]
ccan/ntdb/pyntdb.c [new file with mode: 0644]
ccan/ntdb/summary.c [new file with mode: 0644]
ccan/ntdb/test/api-12-store.c [new file with mode: 0644]
ccan/ntdb/test/api-13-delete.c [new file with mode: 0644]
ccan/ntdb/test/api-14-exists.c [new file with mode: 0644]
ccan/ntdb/test/api-16-wipe_all.c [new file with mode: 0644]
ccan/ntdb/test/api-20-alloc-attr.c [new file with mode: 0644]
ccan/ntdb/test/api-21-parse_record.c [new file with mode: 0644]
ccan/ntdb/test/api-55-transaction.c [new file with mode: 0644]
ccan/ntdb/test/api-60-noop-transaction.c [new file with mode: 0644]
ccan/ntdb/test/api-80-tdb_fd.c [new file with mode: 0644]
ccan/ntdb/test/api-81-seqnum.c [new file with mode: 0644]
ccan/ntdb/test/api-82-lockattr.c [new file with mode: 0644]
ccan/ntdb/test/api-83-openhook.c [new file with mode: 0644]
ccan/ntdb/test/api-91-get-stats.c [new file with mode: 0644]
ccan/ntdb/test/api-92-get-set-readonly.c [new file with mode: 0644]
ccan/ntdb/test/api-93-repack.c [new file with mode: 0644]
ccan/ntdb/test/api-94-expand-during-parse.c [new file with mode: 0644]
ccan/ntdb/test/api-95-read-only-during-parse.c [new file with mode: 0644]
ccan/ntdb/test/api-add-remove-flags.c [new file with mode: 0644]
ccan/ntdb/test/api-check-callback.c [new file with mode: 0644]
ccan/ntdb/test/api-firstkey-nextkey.c [new file with mode: 0644]
ccan/ntdb/test/api-fork-test.c [new file with mode: 0644]
ccan/ntdb/test/api-locktimeout.c [new file with mode: 0644]
ccan/ntdb/test/api-missing-entries.c [new file with mode: 0644]
ccan/ntdb/test/api-open-multiple-times.c [new file with mode: 0644]
ccan/ntdb/test/api-record-expand.c [new file with mode: 0644]
ccan/ntdb/test/api-simple-delete.c [new file with mode: 0644]
ccan/ntdb/test/api-summary.c [new file with mode: 0644]
ccan/ntdb/test/external-agent.c [new file with mode: 0644]
ccan/ntdb/test/external-agent.h [new file with mode: 0644]
ccan/ntdb/test/failtest_helper.c [new file with mode: 0644]
ccan/ntdb/test/failtest_helper.h [new file with mode: 0644]
ccan/ntdb/test/helpapi-external-agent.c [new file with mode: 0644]
ccan/ntdb/test/helprun-external-agent.c [new file with mode: 0644]
ccan/ntdb/test/helprun-layout.c [new file with mode: 0644]
ccan/ntdb/test/layout.h [new file with mode: 0644]
ccan/ntdb/test/lock-tracking.c [new file with mode: 0644]
ccan/ntdb/test/lock-tracking.h [new file with mode: 0644]
ccan/ntdb/test/logging.c [new file with mode: 0644]
ccan/ntdb/test/logging.h [new file with mode: 0644]
ccan/ntdb/test/no-fsync.h [new file with mode: 0644]
ccan/ntdb/test/ntdb-source.h [new file with mode: 0644]
ccan/ntdb/test/python-api.py [new file with mode: 0644]
ccan/ntdb/test/run-001-encode.c [new file with mode: 0644]
ccan/ntdb/test/run-001-fls.c [new file with mode: 0644]
ccan/ntdb/test/run-01-new_database.c [new file with mode: 0644]
ccan/ntdb/test/run-02-expand.c [new file with mode: 0644]
ccan/ntdb/test/run-03-coalesce.c [new file with mode: 0644]
ccan/ntdb/test/run-04-basichash.c [new file with mode: 0644]
ccan/ntdb/test/run-05-readonly-open.c [new file with mode: 0644]
ccan/ntdb/test/run-10-simple-store.c [new file with mode: 0644]
ccan/ntdb/test/run-11-simple-fetch.c [new file with mode: 0644]
ccan/ntdb/test/run-12-check.c [new file with mode: 0644]
ccan/ntdb/test/run-15-append.c [new file with mode: 0644]
ccan/ntdb/test/run-25-hashoverload.c [new file with mode: 0644]
ccan/ntdb/test/run-30-exhaust-before-expand.c [new file with mode: 0644]
ccan/ntdb/test/run-35-convert.c [new file with mode: 0644]
ccan/ntdb/test/run-50-multiple-freelists.c [new file with mode: 0644]
ccan/ntdb/test/run-56-open-during-transaction.c [new file with mode: 0644]
ccan/ntdb/test/run-57-die-during-transaction.c [new file with mode: 0644]
ccan/ntdb/test/run-64-bit-tdb.c [new file with mode: 0644]
ccan/ntdb/test/run-90-get-set-attributes.c [new file with mode: 0644]
ccan/ntdb/test/run-capabilities.c [new file with mode: 0644]
ccan/ntdb/test/run-expand-in-transaction.c [new file with mode: 0644]
ccan/ntdb/test/run-features.c [new file with mode: 0644]
ccan/ntdb/test/run-lockall.c [new file with mode: 0644]
ccan/ntdb/test/run-remap-in-read_traverse.c [new file with mode: 0644]
ccan/ntdb/test/run-seed.c [new file with mode: 0644]
ccan/ntdb/test/run-tdb_errorstr.c [new file with mode: 0644]
ccan/ntdb/test/run-tdb_foreach.c [new file with mode: 0644]
ccan/ntdb/test/run-traverse.c [new file with mode: 0644]
ccan/ntdb/test/tap-interface.c [new file with mode: 0644]
ccan/ntdb/test/tap-interface.h [new file with mode: 0644]
ccan/ntdb/tools/Makefile [new file with mode: 0644]
ccan/ntdb/tools/growtdb-bench.c [new file with mode: 0644]
ccan/ntdb/tools/mkntdb.c [new file with mode: 0644]
ccan/ntdb/tools/ntdbbackup.c [new file with mode: 0644]
ccan/ntdb/tools/ntdbdump.c [new file with mode: 0644]
ccan/ntdb/tools/ntdbrestore.c [new file with mode: 0644]
ccan/ntdb/tools/ntdbtool.c [new file with mode: 0644]
ccan/ntdb/tools/ntdbtorture.c [new file with mode: 0644]
ccan/ntdb/tools/speed.c [new file with mode: 0644]
ccan/ntdb/transaction.c [new file with mode: 0644]
ccan/ntdb/traverse.c [new file with mode: 0644]
ccan/ntdb/wscript [new file with mode: 0644]

diff --git a/ccan/ntdb/ABI/ntdb-0.9.sigs b/ccan/ntdb/ABI/ntdb-0.9.sigs
new file mode 100644 (file)
index 0000000..6b12ddb
--- /dev/null
@@ -0,0 +1,38 @@
+ntdb_add_flag: void (struct ntdb_context *, unsigned int)
+ntdb_append: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA)
+ntdb_chainlock: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
+ntdb_chainlock_read: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
+ntdb_chainunlock: void (struct ntdb_context *, NTDB_DATA)
+ntdb_chainunlock_read: void (struct ntdb_context *, NTDB_DATA)
+ntdb_check_: enum NTDB_ERROR (struct ntdb_context *, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
+ntdb_close: int (struct ntdb_context *)
+ntdb_delete: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
+ntdb_errorstr: const char *(enum NTDB_ERROR)
+ntdb_exists: bool (struct ntdb_context *, NTDB_DATA)
+ntdb_fd: int (const struct ntdb_context *)
+ntdb_fetch: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA *)
+ntdb_firstkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
+ntdb_foreach_: void (int (*)(struct ntdb_context *, void *), void *)
+ntdb_get_attribute: enum NTDB_ERROR (struct ntdb_context *, union ntdb_attribute *)
+ntdb_get_flags: unsigned int (struct ntdb_context *)
+ntdb_get_seqnum: int64_t (struct ntdb_context *)
+ntdb_lockall: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_lockall_read: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_name: const char *(const struct ntdb_context *)
+ntdb_nextkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
+ntdb_open: struct ntdb_context *(const char *, int, int, mode_t, union ntdb_attribute *)
+ntdb_parse_record_: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
+ntdb_remove_flag: void (struct ntdb_context *, unsigned int)
+ntdb_repack: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_set_attribute: enum NTDB_ERROR (struct ntdb_context *, const union ntdb_attribute *)
+ntdb_store: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA, int)
+ntdb_summary: enum NTDB_ERROR (struct ntdb_context *, enum ntdb_summary_flags, char **)
+ntdb_transaction_cancel: void (struct ntdb_context *)
+ntdb_transaction_commit: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_transaction_prepare_commit: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_transaction_start: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_traverse_: int64_t (struct ntdb_context *, int (*)(struct ntdb_context *, NTDB_DATA, NTDB_DATA, void *), void *)
+ntdb_unlockall: void (struct ntdb_context *)
+ntdb_unlockall_read: void (struct ntdb_context *)
+ntdb_unset_attribute: void (struct ntdb_context *, enum ntdb_attribute_type)
+ntdb_wipe_all: enum NTDB_ERROR (struct ntdb_context *)
diff --git a/ccan/ntdb/ABI/ntdb-1.0.sigs b/ccan/ntdb/ABI/ntdb-1.0.sigs
new file mode 100644 (file)
index 0000000..6b12ddb
--- /dev/null
@@ -0,0 +1,38 @@
+ntdb_add_flag: void (struct ntdb_context *, unsigned int)
+ntdb_append: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA)
+ntdb_chainlock: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
+ntdb_chainlock_read: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
+ntdb_chainunlock: void (struct ntdb_context *, NTDB_DATA)
+ntdb_chainunlock_read: void (struct ntdb_context *, NTDB_DATA)
+ntdb_check_: enum NTDB_ERROR (struct ntdb_context *, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
+ntdb_close: int (struct ntdb_context *)
+ntdb_delete: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA)
+ntdb_errorstr: const char *(enum NTDB_ERROR)
+ntdb_exists: bool (struct ntdb_context *, NTDB_DATA)
+ntdb_fd: int (const struct ntdb_context *)
+ntdb_fetch: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA *)
+ntdb_firstkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
+ntdb_foreach_: void (int (*)(struct ntdb_context *, void *), void *)
+ntdb_get_attribute: enum NTDB_ERROR (struct ntdb_context *, union ntdb_attribute *)
+ntdb_get_flags: unsigned int (struct ntdb_context *)
+ntdb_get_seqnum: int64_t (struct ntdb_context *)
+ntdb_lockall: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_lockall_read: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_name: const char *(const struct ntdb_context *)
+ntdb_nextkey: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA *)
+ntdb_open: struct ntdb_context *(const char *, int, int, mode_t, union ntdb_attribute *)
+ntdb_parse_record_: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, enum NTDB_ERROR (*)(NTDB_DATA, NTDB_DATA, void *), void *)
+ntdb_remove_flag: void (struct ntdb_context *, unsigned int)
+ntdb_repack: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_set_attribute: enum NTDB_ERROR (struct ntdb_context *, const union ntdb_attribute *)
+ntdb_store: enum NTDB_ERROR (struct ntdb_context *, NTDB_DATA, NTDB_DATA, int)
+ntdb_summary: enum NTDB_ERROR (struct ntdb_context *, enum ntdb_summary_flags, char **)
+ntdb_transaction_cancel: void (struct ntdb_context *)
+ntdb_transaction_commit: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_transaction_prepare_commit: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_transaction_start: enum NTDB_ERROR (struct ntdb_context *)
+ntdb_traverse_: int64_t (struct ntdb_context *, int (*)(struct ntdb_context *, NTDB_DATA, NTDB_DATA, void *), void *)
+ntdb_unlockall: void (struct ntdb_context *)
+ntdb_unlockall_read: void (struct ntdb_context *)
+ntdb_unset_attribute: void (struct ntdb_context *, enum ntdb_attribute_type)
+ntdb_wipe_all: enum NTDB_ERROR (struct ntdb_context *)
diff --git a/ccan/ntdb/LICENSE b/ccan/ntdb/LICENSE
new file mode 100644 (file)
index 0000000..cca7fc2
--- /dev/null
@@ -0,0 +1,165 @@
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+  This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+  0. Additional Definitions.
+
+  As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+  "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+  An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+  A "Combined Work" is a work produced by combining or linking an
+Application with the Library.  The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+  The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+  The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+  1. Exception to Section 3 of the GNU GPL.
+
+  You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+  2. Conveying Modified Versions.
+
+  If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+   a) under this License, provided that you make a good faith effort to
+   ensure that, in the event an Application does not supply the
+   function or data, the facility still operates, and performs
+   whatever part of its purpose remains meaningful, or
+
+   b) under the GNU GPL, with none of the additional permissions of
+   this License applicable to that copy.
+
+  3. Object Code Incorporating Material from Library Header Files.
+
+  The object code form of an Application may incorporate material from
+a header file that is part of the Library.  You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+   a) Give prominent notice with each copy of the object code that the
+   Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the object code with a copy of the GNU GPL and this license
+   document.
+
+  4. Combined Works.
+
+  You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+   a) Give prominent notice with each copy of the Combined Work that
+   the Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the Combined Work with a copy of the GNU GPL and this license
+   document.
+
+   c) For a Combined Work that displays copyright notices during
+   execution, include the copyright notice for the Library among
+   these notices, as well as a reference directing the user to the
+   copies of the GNU GPL and this license document.
+
+   d) Do one of the following:
+
+       0) Convey the Minimal Corresponding Source under the terms of this
+       License, and the Corresponding Application Code in a form
+       suitable for, and under terms that permit, the user to
+       recombine or relink the Application with a modified version of
+       the Linked Version to produce a modified Combined Work, in the
+       manner specified by section 6 of the GNU GPL for conveying
+       Corresponding Source.
+
+       1) Use a suitable shared library mechanism for linking with the
+       Library.  A suitable mechanism is one that (a) uses at run time
+       a copy of the Library already present on the user's computer
+       system, and (b) will operate properly with a modified version
+       of the Library that is interface-compatible with the Linked
+       Version.
+
+   e) Provide Installation Information, but only if you would otherwise
+   be required to provide such information under section 6 of the
+   GNU GPL, and only to the extent that such information is
+   necessary to install and execute a modified version of the
+   Combined Work produced by recombining or relinking the
+   Application with a modified version of the Linked Version. (If
+   you use option 4d0, the Installation Information must accompany
+   the Minimal Corresponding Source and Corresponding Application
+   Code. If you use option 4d1, you must provide the Installation
+   Information in the manner specified by section 6 of the GNU GPL
+   for conveying Corresponding Source.)
+
+  5. Combined Libraries.
+
+  You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+   a) Accompany the combined library with a copy of the same work based
+   on the Library, uncombined with any other library facilities,
+   conveyed under the terms of this License.
+
+   b) Give prominent notice with the combined library that part of it
+   is a work based on the Library, and explaining where to find the
+   accompanying uncombined form of the same work.
+
+  6. Revised Versions of the GNU Lesser General Public License.
+
+  The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+  Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+  If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/ccan/ntdb/Makefile b/ccan/ntdb/Makefile
new file mode 100644 (file)
index 0000000..ddd439d
--- /dev/null
@@ -0,0 +1,67 @@
+# simple makefile wrapper to run waf
+
+# Prepend the buildtools dirs so the bundled waf is found first.
+WAF=WAF_MAKE=1 PATH=buildtools/bin:../../buildtools/bin:$$PATH waf
+
+all:
+       $(WAF) build
+
+install:
+       $(WAF) install
+
+uninstall:
+       $(WAF) uninstall
+
+test: FORCE
+       $(WAF) test $(TEST_OPTIONS)
+
+testenv:
+       $(WAF) test --testenv $(TEST_OPTIONS)
+
+quicktest:
+       $(WAF) test --quick $(TEST_OPTIONS)
+
+# NOTE(review): dist/distcheck point WAFLOCK at a throwaway lockfile,
+# presumably so they can run without (or without disturbing) an existing
+# configured build — verify against the waf docs if changing this.
+dist:
+       touch .tmplock
+       WAFLOCK=.tmplock $(WAF) dist
+
+distcheck:
+       touch .tmplock
+       WAFLOCK=.tmplock $(WAF) distcheck
+
+clean:
+       $(WAF) clean
+
+distclean:
+       $(WAF) distclean
+
+reconfigure: configure
+       $(WAF) reconfigure
+
+show_waf_options:
+       $(WAF) --help
+
+# some compatibility make targets
+everything: all
+
+testsuite: all
+
+.PHONY: check
+check: test
+
+torture: all
+
+# this should do an install as well, once install is finished
+installcheck: test
+
+etags:
+       $(WAF) etags
+
+ctags:
+       $(WAF) ctags
+
+pydoctor:
+       $(WAF) pydoctor
+
+# Build a single binary by name, e.g. "make bin/ntdbtool":
+# the target basename is handed to waf as --targets.
+bin/%:: FORCE
+       $(WAF) --targets=`basename $@`
+FORCE:
diff --git a/ccan/ntdb/check.c b/ccan/ntdb/check.c
new file mode 100644 (file)
index 0000000..5b6e905
--- /dev/null
@@ -0,0 +1,726 @@
+ /*
+   Trivial Database 2: free list/block handling
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+#include <ccan/asearch/asearch.h>
+
+/* We keep an ordered array of offsets.
+ *
+ * Grow *arr by one element and store @off at the end.  Callers append
+ * offsets in order, so the array stays sorted for later binary search
+ * (see off_cmp()/asearch() in check_entry()).
+ *
+ * Returns false on allocation failure; in that case *arr and *num are
+ * left unchanged, so the caller can still free the old array.
+ */
+static bool append(struct ntdb_context *ntdb,
+                  ntdb_off_t **arr, size_t *num, ntdb_off_t off)
+{
+       ntdb_off_t *new;
+
+       if (*num == 0) {
+               /* First element: fresh allocation via the ntdb allocator hook. */
+               new = ntdb->alloc_fn(ntdb, sizeof(ntdb_off_t), ntdb->alloc_data);
+       } else {
+               /* Grow the existing array (realloc-style hook). */
+               new = ntdb->expand_fn(*arr, (*num + 1) * sizeof(ntdb_off_t),
+                                 ntdb->alloc_data);
+       }
+       if (!new)
+               return false;
+       new[(*num)++] = off;
+       *arr = new;
+       return true;
+}
+
+/* Validate the ntdb file header at offset 0.
+ *
+ * On success fills in:
+ *   *recovery         - recovery-area offset from the header (0 if none),
+ *   *features         - the features_offered bitmap from the header,
+ *   *num_capabilities - incremented once per capability record walked.
+ *
+ * Checks performed: the stored hash of NTDB_HASH_MAGIC matches what the
+ * current hash function produces (rejects a database written with a
+ * different hash), the magic string is intact, features used are a
+ * subset of features offered, any recovery offset lies within the
+ * mapped file, and the capability list is walkable.
+ *
+ * Returns NTDB_SUCCESS or an NTDB_ERR_* code (logged via ntdb_logerr).
+ */
+static enum NTDB_ERROR check_header(struct ntdb_context *ntdb,
+                                   ntdb_off_t *recovery,
+                                   uint64_t *features,
+                                   size_t *num_capabilities)
+{
+       uint64_t hash_test;
+       struct ntdb_header hdr;
+       enum NTDB_ERROR ecode;
+       ntdb_off_t off, next;
+
+       ecode = ntdb_read_convert(ntdb, 0, &hdr, sizeof(hdr));
+       if (ecode != NTDB_SUCCESS) {
+               return ecode;
+       }
+       /* magic food should not be converted, so convert back. */
+       ntdb_convert(ntdb, hdr.magic_food, sizeof(hdr.magic_food));
+
+       /* Recompute the header's hash sanity value with our hash fn. */
+       hash_test = NTDB_HASH_MAGIC;
+       hash_test = ntdb_hash(ntdb, &hash_test, sizeof(hash_test));
+       if (hdr.hash_test != hash_test) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                 "check: hash test %llu should be %llu",
+                                 (long long)hdr.hash_test,
+                                 (long long)hash_test);
+       }
+
+       if (strcmp(hdr.magic_food, NTDB_MAGIC_FOOD) != 0) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                 "check: bad magic '%.*s'",
+                                 (unsigned)sizeof(hdr.magic_food),
+                                 hdr.magic_food);
+       }
+
+       /* Features which are used must be a subset of features offered. */
+       if (hdr.features_used & ~hdr.features_offered) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                 "check: features used (0x%llx) which"
+                                 " are not offered (0x%llx)",
+                                 (long long)hdr.features_used,
+                                 (long long)hdr.features_offered);
+       }
+
+       *features = hdr.features_offered;
+       *recovery = hdr.recovery;
+       if (*recovery) {
+               /* A non-zero recovery offset must fall inside the file. */
+               if (*recovery < sizeof(hdr)
+                   || *recovery > ntdb->file->map_size) {
+                       return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                         "ntdb_check:"
+                                         " invalid recovery offset %zu",
+                                         (size_t)*recovery);
+               }
+       }
+
+       /* Walk the singly-linked capability list.  Every type is treated
+        * as unknown here; unknown_capability() decides whether that is
+        * fatal for this open. */
+       for (off = hdr.capabilities; off && ecode == NTDB_SUCCESS; off = next) {
+               const struct ntdb_capability *cap;
+               enum NTDB_ERROR e;
+
+               cap = ntdb_access_read(ntdb, off, sizeof(*cap), true);
+               if (NTDB_PTR_IS_ERR(cap)) {
+                       return NTDB_PTR_ERR(cap);
+               }
+
+               /* All capabilities are unknown. */
+               e = unknown_capability(ntdb, "ntdb_check", cap->type);
+               next = cap->next;
+               ntdb_access_release(ntdb, cap);
+               if (e)
+                       return e;
+               (*num_capabilities)++;
+       }
+
+       /* Don't check reserved: they *can* be used later. */
+       return NTDB_SUCCESS;
+}
+
+/* Three-way comparison of 64-bit offsets for asearch().  Written with
+ * explicit comparisons rather than "*a - *b" because the difference of
+ * two ntdb_off_t values does not fit in the int return type. */
+static int off_cmp(const ntdb_off_t *a, const ntdb_off_t *b)
+{
+       /* Can overflow an int. */
+       return *a > *b ? 1
+               : *a < *b ? -1
+               : 0;
+}
+
+/* Check a single hash-bucket entry.
+ *
+ * @off_and_hash: the raw bucket word — record offset in the low bits
+ *     (NTDB_OFF_MASK) with upper hash bits stolen into the top; 0 means
+ *     an empty bucket.
+ * @bucket: which hash bucket this entry came from.
+ * @used/@num_used: sorted array of all used-record offsets found
+ *     elsewhere in the check; each record located here has its array
+ *     slot "spoiled" (low bit flipped) so a second hash entry pointing
+ *     at the same record will fail the lookup.
+ * @num_found: incremented for every record matched.
+ * @check/@data: optional user callback invoked with each key/data pair.
+ *
+ * Verifies the offset is a known used record, that the record's key
+ * actually hashes into this bucket, and that the stolen upper hash bits
+ * stored in the bucket word match the key's hash.
+ */
+static enum NTDB_ERROR check_entry(struct ntdb_context *ntdb,
+                                  ntdb_off_t off_and_hash,
+                                  ntdb_len_t bucket,
+                                  ntdb_off_t used[],
+                                  size_t num_used,
+                                  size_t *num_found,
+                                  enum NTDB_ERROR (*check)(NTDB_DATA,
+                                                           NTDB_DATA,
+                                                           void *),
+                                  void *data)
+{
+       enum NTDB_ERROR ecode;
+       const struct ntdb_used_record *r;
+       const unsigned char *kptr;
+       ntdb_len_t klen, dlen;
+       uint32_t hash;
+       ntdb_off_t off = off_and_hash & NTDB_OFF_MASK;
+       ntdb_off_t *p;
+
+       /* Empty bucket is fine. */
+       if (!off_and_hash) {
+               return NTDB_SUCCESS;
+       }
+
+       /* This can't point to a chain, we handled those at toplevel. */
+       if (off_and_hash & (1ULL << NTDB_OFF_CHAIN_BIT)) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                  "ntdb_check: Invalid chain bit in offset "
+                                  " %llu", (long long)off_and_hash);
+       }
+
+       /* The offset must be one of the used records we already found. */
+       p = asearch(&off, used, num_used, off_cmp);
+       if (!p) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                  "ntdb_check: Invalid offset"
+                                  " %llu in hash", (long long)off);
+       }
+       /* Mark it invalid: a duplicate reference to this record will now
+        * miss in the asearch above and be reported as corruption. */
+       *p ^= 1;
+       (*num_found)++;
+
+       /* Read the record header first to learn the key/data lengths... */
+       r = ntdb_access_read(ntdb, off, sizeof(*r), true);
+       if (NTDB_PTR_IS_ERR(r)) {
+               return NTDB_PTR_ERR(r);
+       }
+       klen = rec_key_length(r);
+       dlen = rec_data_length(r);
+       ntdb_access_release(ntdb, r);
+
+       /* ...then map the key+data payload that follows the header. */
+       kptr = ntdb_access_read(ntdb, off + sizeof(*r), klen + dlen, false);
+       if (NTDB_PTR_IS_ERR(kptr)) {
+               return NTDB_PTR_ERR(kptr);
+       }
+
+       hash = ntdb_hash(ntdb, kptr, klen);
+
+       /* Are we in the right chain? */
+       if (bits_from(hash, 0, ntdb->hash_bits) != bucket) {
+               ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+                                   NTDB_LOG_ERROR,
+                                   "ntdb_check: Bad bucket %u vs %llu",
+                                   bits_from(hash, 0, ntdb->hash_bits),
+                                   (long long)bucket);
+       /* Next 8 bits should be the same as top bits of bucket. */
+       } else if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL)
+                  != bits_from(off_and_hash, 64-NTDB_OFF_UPPER_STEAL,
+                               NTDB_OFF_UPPER_STEAL)) {
+               ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+                                   NTDB_LOG_ERROR,
+                                   "ntdb_check: Bad hash bits %llu vs %llu",
+                                   (long long)off_and_hash,
+                                   (long long)hash);
+       } else if (check) {
+               NTDB_DATA k, d;
+
+               k = ntdb_mkdata(kptr, klen);
+               d = ntdb_mkdata(kptr + klen, dlen);
+               ecode = check(k, d, data);
+       } else {
+               ecode = NTDB_SUCCESS;
+       }
+       ntdb_access_release(ntdb, kptr);
+
+       return ecode;
+}
+
+/* Validate one hash-chain record at @off for bucket @bucket.
+ *
+ * A chain is a used record with NTDB_CHAIN_MAGIC, zero key length, and
+ * a data payload that is an array of ntdb_off_t bucket entries; every
+ * entry is validated via check_entry() with the same @used bookkeeping.
+ * The chain record itself counts as a used record, so *num_found is
+ * bumped for it as well as for each entry found.
+ */
+static enum NTDB_ERROR check_hash_chain(struct ntdb_context *ntdb,
+                                       ntdb_off_t off,
+                                       ntdb_len_t bucket,
+                                       ntdb_off_t used[],
+                                       size_t num_used,
+                                       size_t *num_found,
+                                       enum NTDB_ERROR (*check)(NTDB_DATA,
+                                                                NTDB_DATA,
+                                                                void *),
+                                       void *data)
+{
+       struct ntdb_used_record rec;
+       enum NTDB_ERROR ecode;
+       const ntdb_off_t *entries;
+       ntdb_len_t i, num;
+
+       /* This is a used entry. */
+       (*num_found)++;
+
+       ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
+       if (ecode != NTDB_SUCCESS) {
+               return ecode;
+       }
+
+       if (rec_magic(&rec) != NTDB_CHAIN_MAGIC) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                 "ntdb_check: Bad hash chain magic %llu",
+                                 (long long)rec_magic(&rec));
+       }
+
+       /* Payload must be a whole number of offset entries... */
+       if (rec_data_length(&rec) % sizeof(ntdb_off_t)) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                 "ntdb_check: Bad hash chain data length %llu",
+                                 (long long)rec_data_length(&rec));
+       }
+
+       /* ...and chain records carry no key of their own. */
+       if (rec_key_length(&rec) != 0) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                 "ntdb_check: Bad hash chain key length %llu",
+                                 (long long)rec_key_length(&rec));
+       }
+
+       off += sizeof(rec);
+       num = rec_data_length(&rec) / sizeof(ntdb_off_t);
+       entries = ntdb_access_read(ntdb, off, rec_data_length(&rec), true);
+       if (NTDB_PTR_IS_ERR(entries)) {
+               return NTDB_PTR_ERR(entries);
+       }
+
+       /* Check each non-deleted entry in chain. */
+       for (i = 0; i < num; i++) {
+               ecode = check_entry(ntdb, entries[i], bucket,
+                                   used, num_used, num_found, check, data);
+               if (ecode) {
+                       break;
+               }
+       }
+
+       ntdb_access_release(ntdb, entries);
+       return ecode;
+}
+
+static enum NTDB_ERROR check_hash(struct ntdb_context *ntdb,
+                                 ntdb_off_t used[],
+                                 size_t num_used,
+                                 size_t num_other_used,
+                                 enum NTDB_ERROR (*check)(NTDB_DATA,
+                                                          NTDB_DATA,
+                                                          void *),
+                                 void *data)
+{
+       enum NTDB_ERROR ecode;
+       struct ntdb_used_record rec;
+       const ntdb_off_t *entries;
+       ntdb_len_t i;
+       /* Free tables and capabilities also show up as used, as do we. */
+       size_t num_found = num_other_used + 1;
+
+       ecode = ntdb_read_convert(ntdb, NTDB_HASH_OFFSET, &rec, sizeof(rec));
+       if (ecode != NTDB_SUCCESS) {
+               return ecode;
+       }
+
+       if (rec_magic(&rec) != NTDB_HTABLE_MAGIC) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                 "ntdb_check: Bad hash table magic %llu",
+                                 (long long)rec_magic(&rec));
+       }
+
+       if (rec_data_length(&rec) != (sizeof(ntdb_off_t) << ntdb->hash_bits)) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                 "ntdb_check: Bad hash table data length %llu",
+                                 (long long)rec_data_length(&rec));
+       }
+
+       if (rec_key_length(&rec) != 0) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                 "ntdb_check: Bad hash table key length %llu",
+                                 (long long)rec_key_length(&rec));
+       }
+
+       entries = ntdb_access_read(ntdb, NTDB_HASH_OFFSET + sizeof(rec),
+                                  rec_data_length(&rec), true);
+       if (NTDB_PTR_IS_ERR(entries)) {
+               return NTDB_PTR_ERR(entries);
+       }
+
+       for (i = 0; i < (1 << ntdb->hash_bits); i++) {
+               ntdb_off_t off = entries[i] & NTDB_OFF_MASK;
+               if (entries[i] & (1ULL << NTDB_OFF_CHAIN_BIT)) {
+                       ecode = check_hash_chain(ntdb, off, i,
+                                                used, num_used, &num_found,
+                                                check, data);
+               } else {
+                       ecode = check_entry(ntdb, entries[i], i,
+                                           used, num_used, &num_found,
+                                           check, data);
+               }
+               if (ecode) {
+                       break;
+               }
+       }
+       ntdb_access_release(ntdb, entries);
+
+       if (ecode == NTDB_SUCCESS && num_found != num_used) {
+               ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                   "ntdb_check: Not all entries are in hash");
+       }
+       return ecode;
+}
+
+static enum NTDB_ERROR check_free(struct ntdb_context *ntdb,
+                                ntdb_off_t off,
+                                const struct ntdb_free_record *frec,
+                                ntdb_off_t prev, unsigned int ftable,
+                                unsigned int bucket)
+{
+       enum NTDB_ERROR ecode;
+
+       if (frec_magic(frec) != NTDB_FREE_MAGIC) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                 "ntdb_check: offset %llu bad magic 0x%llx",
+                                 (long long)off,
+                                 (long long)frec->magic_and_prev);
+       }
+       if (frec_ftable(frec) != ftable) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                 "ntdb_check: offset %llu bad freetable %u",
+                                 (long long)off, frec_ftable(frec));
+
+       }
+
+       ecode = ntdb_oob(ntdb, off,
+                        frec_len(frec) + sizeof(struct ntdb_used_record),
+                        false);
+       if (ecode != NTDB_SUCCESS) {
+               return ecode;
+       }
+       if (size_to_bucket(frec_len(frec)) != bucket) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                 "ntdb_check: offset %llu in wrong bucket"
+                                 " (%u vs %u)",
+                                 (long long)off,
+                                 bucket, size_to_bucket(frec_len(frec)));
+       }
+       if (prev && prev != frec_prev(frec)) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                 "ntdb_check: offset %llu bad prev"
+                                 " (%llu vs %llu)",
+                                 (long long)off,
+                                 (long long)prev, (long long)frec_len(frec));
+       }
+       return NTDB_SUCCESS;
+}
+
/* Walk one free table at @ftable_off (table number @ftable_num), checking
 * the free list hanging off each of its size buckets.
 *
 * fr[]/num_free: offsets of all free records found by the linear scan.
 * Each record found here is looked up in fr[] and marked (low bit
 * flipped) so a duplicate appearance is caught, and *num_found is
 * incremented; the caller compares the final count against num_free.
 */
static enum NTDB_ERROR check_free_table(struct ntdb_context *ntdb,
				       ntdb_off_t ftable_off,
				       unsigned ftable_num,
				       ntdb_off_t fr[],
				       size_t num_free,
				       size_t *num_found)
{
	struct ntdb_freetable ft;
	ntdb_off_t h;
	unsigned int i;
	enum NTDB_ERROR ecode;

	ecode = ntdb_read_convert(ntdb, ftable_off, &ft, sizeof(ft));
	if (ecode != NTDB_SUCCESS) {
		return ecode;
	}

	/* The free table itself is a used record of a fixed shape. */
	if (rec_magic(&ft.hdr) != NTDB_FTABLE_MAGIC
	    || rec_key_length(&ft.hdr) != 0
	    || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				  "ntdb_check: Invalid header on free table");
	}

	for (i = 0; i < NTDB_FREE_BUCKETS; i++) {
		ntdb_off_t off, prev = 0, *p, first = 0;
		struct ntdb_free_record f;

		h = bucket_off(ftable_off, i);
		for (off = ntdb_read_off(ntdb, h); off; off = f.next) {
			if (NTDB_OFF_IS_ERR(off)) {
				return NTDB_OFF_TO_ERR(off);
			}
			/* Mask any flag bits off the bucket-head word and
			 * remember it: its back pointer can only be checked
			 * once we know the list tail (see below). */
			if (!first) {
				off &= NTDB_OFF_MASK;
				first = off;
			}
			ecode = ntdb_read_convert(ntdb, off, &f, sizeof(f));
			if (ecode != NTDB_SUCCESS) {
				return ecode;
			}
			/* prev == 0 on the first entry suppresses the
			 * back-pointer comparison inside check_free(). */
			ecode = check_free(ntdb, off, &f, prev, ftable_num, i);
			if (ecode != NTDB_SUCCESS) {
				return ecode;
			}

			/* FIXME: Check hash bits */
			p = asearch(&off, fr, num_free, off_cmp);
			if (!p) {
				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
						  NTDB_LOG_ERROR,
						  "ntdb_check: Invalid offset"
						  " %llu in free table",
						  (long long)off);
			}
			/* Mark it invalid. */
			*p ^= 1;
			(*num_found)++;
			prev = off;
		}

		if (first) {
			/* Now we can check first back pointer. */
			ecode = ntdb_read_convert(ntdb, first, &f, sizeof(f));
			if (ecode != NTDB_SUCCESS) {
				return ecode;
			}
			ecode = check_free(ntdb, first, &f, prev, ftable_num, i);
			if (ecode != NTDB_SUCCESS) {
				return ecode;
			}
		}
	}
	return NTDB_SUCCESS;
}
+
+/* Slow, but should be very rare. */
+ntdb_off_t dead_space(struct ntdb_context *ntdb, ntdb_off_t off)
+{
+       size_t len;
+       enum NTDB_ERROR ecode;
+
+       for (len = 0; off + len < ntdb->file->map_size; len++) {
+               char c;
+               ecode = ntdb->io->tread(ntdb, off, &c, 1);
+               if (ecode != NTDB_SUCCESS) {
+                       return NTDB_ERR_TO_OFF(ecode);
+               }
+               if (c != 0 && c != 0x43)
+                       break;
+       }
+       return len;
+}
+
/* Linear scan of every record from just past the header to end of file,
 * classifying each by its magic:
 *
 *   - used/chain/htable/ftable/capability records -> offset appended to
 *     *used (checked against the hash later),
 *   - free records belonging to a free table -> offset appended to *fr,
 *   - the recovery area (which must sit exactly at @recovery) and any
 *     post-crash dead space (zero or 0x43 fill) are validated and skipped.
 *
 * features: feature bits from the header; when zero, each record's extra
 * padding must end in a 0 byte (future features may relax this).
 * Returns NTDB_SUCCESS or the first error found.
 */
static enum NTDB_ERROR check_linear(struct ntdb_context *ntdb,
				   ntdb_off_t **used, size_t *num_used,
				   ntdb_off_t **fr, size_t *num_free,
				   uint64_t features, ntdb_off_t recovery)
{
	ntdb_off_t off;
	ntdb_len_t len;
	enum NTDB_ERROR ecode;
	bool found_recovery = false;

	for (off = sizeof(struct ntdb_header);
	     off < ntdb->file->map_size;
	     off += len) {
		union {
			struct ntdb_used_record u;
			struct ntdb_free_record f;
			struct ntdb_recovery_record r;
		} rec;
		/* r is larger: only get that if we need to. */
		ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.f));
		if (ecode != NTDB_SUCCESS) {
			return ecode;
		}

		/* If we crash after ftruncate, we can get zeroes or fill. */
		if (rec.r.magic == NTDB_RECOVERY_INVALID_MAGIC
		    || rec.r.magic ==  0x4343434343434343ULL) {
			/* Re-read the full (larger) recovery record. */
			ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.r));
			if (ecode != NTDB_SUCCESS) {
				return ecode;
			}
			if (recovery == off) {
				/* An invalidated recovery area: skip over it. */
				found_recovery = true;
				len = sizeof(rec.r) + rec.r.max_len;
			} else {
				len = dead_space(ntdb, off);
				if (NTDB_OFF_IS_ERR(len)) {
					return NTDB_OFF_TO_ERR(len);
				}
				/* Shorter than a recovery record: it could not
				 * have produced the magic we just read. */
				if (len < sizeof(rec.r)) {
					return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
							  NTDB_LOG_ERROR,
							  "ntdb_check: invalid"
							  " dead space at %zu",
							  (size_t)off);
				}

				/* Dead space is tolerated, but worth a note. */
				ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
					   "Dead space at %zu-%zu (of %zu)",
					   (size_t)off, (size_t)(off + len),
					   (size_t)ntdb->file->map_size);
			}
		} else if (rec.r.magic == NTDB_RECOVERY_MAGIC) {
			/* A live recovery record: re-read in full. */
			ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.r));
			if (ecode != NTDB_SUCCESS) {
				return ecode;
			}
			/* It may only appear where the header says it is. */
			if (recovery != off) {
				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
						  NTDB_LOG_ERROR,
						  "ntdb_check: unexpected"
						  " recovery record at offset"
						  " %zu",
						  (size_t)off);
			}
			if (rec.r.len > rec.r.max_len) {
				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
						  NTDB_LOG_ERROR,
						  "ntdb_check: invalid recovery"
						  " length %zu",
						  (size_t)rec.r.len);
			}
			if (rec.r.eof > ntdb->file->map_size) {
				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
						  NTDB_LOG_ERROR,
						  "ntdb_check: invalid old EOF"
						  " %zu", (size_t)rec.r.eof);
			}
			found_recovery = true;
			len = sizeof(rec.r) + rec.r.max_len;
		} else if (frec_magic(&rec.f) == NTDB_FREE_MAGIC) {
			len = sizeof(rec.u) + frec_len(&rec.f);
			if (off + len > ntdb->file->map_size) {
				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
						  NTDB_LOG_ERROR,
						  "ntdb_check: free overlength"
						  " %llu at offset %llu",
						  (long long)len,
						  (long long)off);
			}
			/* This record should be in free lists. */
			if (frec_ftable(&rec.f) != NTDB_FTABLE_NONE
			    && !append(ntdb, fr, num_free, off)) {
				return ntdb_logerr(ntdb, NTDB_ERR_OOM,
						  NTDB_LOG_ERROR,
						  "ntdb_check: tracking %zu'th"
						  " free record.", *num_free);
			}
		} else if (rec_magic(&rec.u) == NTDB_USED_MAGIC
			   || rec_magic(&rec.u) == NTDB_CHAIN_MAGIC
			   || rec_magic(&rec.u) == NTDB_HTABLE_MAGIC
			   || rec_magic(&rec.u) == NTDB_FTABLE_MAGIC
			   || rec_magic(&rec.u) == NTDB_CAP_MAGIC) {
			uint64_t klen, dlen, extra;

			/* This record is used! */
			if (!append(ntdb, used, num_used, off)) {
				return ntdb_logerr(ntdb, NTDB_ERR_OOM,
						  NTDB_LOG_ERROR,
						  "ntdb_check: tracking %zu'th"
						  " used record.", *num_used);
			}

			klen = rec_key_length(&rec.u);
			dlen = rec_data_length(&rec.u);
			extra = rec_extra_padding(&rec.u);

			len = sizeof(rec.u) + klen + dlen + extra;
			if (off + len > ntdb->file->map_size) {
				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
						  NTDB_LOG_ERROR,
						  "ntdb_check: used overlength"
						  " %llu at offset %llu",
						  (long long)len,
						  (long long)off);
			}

			/* Any record must be at least as big as the free
			 * record it would become when deleted. */
			if (len < sizeof(rec.f)) {
				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
						  NTDB_LOG_ERROR,
						  "ntdb_check: too short record"
						  " %llu at %llu",
						  (long long)len,
						  (long long)off);
			}

			/* Check that records have correct 0 at end (but may
			 * not in future). */
			if (extra && !features
			    && rec_magic(&rec.u) != NTDB_CAP_MAGIC) {
				const char *p;
				char c;
				p = ntdb_access_read(ntdb, off + sizeof(rec.u)
						    + klen + dlen, 1, false);
				if (NTDB_PTR_IS_ERR(p))
					return NTDB_PTR_ERR(p);
				c = *p;
				ntdb_access_release(ntdb, p);

				if (c != '\0') {
					return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
							  NTDB_LOG_ERROR,
							  "ntdb_check:"
							  " non-zero extra"
							  " at %llu",
							  (long long)off);
				}
			}
		} else {
			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
					  NTDB_LOG_ERROR,
					  "ntdb_check: Bad magic 0x%llx"
					  " at offset %zu",
					  (long long)rec_magic(&rec.u),
					  (size_t)off);
		}
	}

	/* We must have found recovery area if there was one. */
	if (recovery != 0 && !found_recovery) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				  "ntdb_check: expected a recovery area at %zu",
				  (size_t)recovery);
	}

	return NTDB_SUCCESS;
}
+
/* Full database consistency check, with an optional per-record callback.
 *
 * Under the allrecord (read) and expansion locks it:
 *   1. validates the header (check_header),
 *   2. linearly scans all records (check_linear), collecting the offsets
 *      of every used and every free record,
 *   3. walks each free table, checking that it accounts for exactly the
 *      free records found in step 2,
 *   4. walks the hash (check_hash), checking every used record is
 *      reachable and invoking check(key, data, private) on each, if set.
 *
 * Returns NTDB_SUCCESS or the first error/corruption encountered.
 */
_PUBLIC_ enum NTDB_ERROR ntdb_check_(struct ntdb_context *ntdb,
			  enum NTDB_ERROR (*check)(NTDB_DATA, NTDB_DATA, void *),
			  void *data)
{
	ntdb_off_t *fr = NULL, *used = NULL;
	ntdb_off_t ft = 0, recovery = 0;
	size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0,
		num_capabilities = 0;
	uint64_t features = 0;
	enum NTDB_ERROR ecode;

	/* An unknown capability flagged "can't check" would make any verdict
	 * meaningless: warn and return without checking. */
	if (ntdb->flags & NTDB_CANT_CHECK) {
		return ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
				  "ntdb_check: database has unknown capability,"
				  " cannot check.");
	}

	ecode = ntdb_allrecord_lock(ntdb, F_RDLCK, NTDB_LOCK_WAIT, false);
	if (ecode != NTDB_SUCCESS) {
		return ecode;
	}

	/* Also block expansion while we walk the file. */
	ecode = ntdb_lock_expand(ntdb, F_RDLCK);
	if (ecode != NTDB_SUCCESS) {
		ntdb_allrecord_unlock(ntdb, F_RDLCK);
		return ecode;
	}

	ecode = check_header(ntdb, &recovery, &features, &num_capabilities);
	if (ecode != NTDB_SUCCESS)
		goto out;

	/* First we do a linear scan, checking all records. */
	ecode = check_linear(ntdb, &used, &num_used, &fr, &num_free, features,
			     recovery);
	if (ecode != NTDB_SUCCESS)
		goto out;

	for (ft = first_ftable(ntdb); ft; ft = next_ftable(ntdb, ft)) {
		if (NTDB_OFF_IS_ERR(ft)) {
			ecode = NTDB_OFF_TO_ERR(ft);
			goto out;
		}
		ecode = check_free_table(ntdb, ft, num_ftables, fr, num_free,
					 &num_found);
		if (ecode != NTDB_SUCCESS)
			goto out;
		num_ftables++;
	}

	/* FIXME: Check key uniqueness? */
	ecode = check_hash(ntdb, used, num_used, num_ftables + num_capabilities,
			   check, data);
	if (ecode != NTDB_SUCCESS)
		goto out;

	/* Every free record seen in the linear scan must have appeared in
	 * exactly one free table. */
	if (num_found != num_free) {
		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "ntdb_check: Not all entries are in"
				   " free table");
	}

out:
	/* Both locks were held from here on; release and free scan arrays. */
	ntdb_allrecord_unlock(ntdb, F_RDLCK);
	ntdb_unlock_expand(ntdb, F_RDLCK);
	ntdb->free_fn(fr, ntdb->alloc_data);
	ntdb->free_fn(used, ntdb->alloc_data);
	return ecode;
}
diff --git a/ccan/ntdb/configure b/ccan/ntdb/configure
new file mode 100755 (executable)
index 0000000..6a9f875
--- /dev/null
@@ -0,0 +1,21 @@
#!/bin/sh

# Thin wrapper around Samba's waf build system: find waf relative to the
# location of this script and run "waf configure" with any user options.
#
# Paths are quoted so the script also works when invoked from a directory
# whose path contains spaces (unquoted $0/$PREVPATH would word-split).

PREVPATH=`dirname "$0"`

if [ -f "$PREVPATH/../../buildtools/bin/waf" ]; then
	WAF=../../buildtools/bin/waf
elif [ -f "$PREVPATH/buildtools/bin/waf" ]; then
	WAF=./buildtools/bin/waf
else
	echo "replace: Unable to find waf"
	exit 1
fi

# using JOBS=1 gives maximum compatibility with
# systems like AIX which have broken threading in python
JOBS=1
export JOBS

cd . || exit 1
$WAF configure "$@" || exit 1
cd "$PREVPATH"
diff --git a/ccan/ntdb/doc/TDB_porting.txt b/ccan/ntdb/doc/TDB_porting.txt
new file mode 100644 (file)
index 0000000..5daf94b
--- /dev/null
@@ -0,0 +1,483 @@
+Interface differences between TDB and NTDB.
+
+- ntdb shares 'struct TDB_DATA' with tdb, but TDB defines the TDB_DATA
+  typedef, whereas ntdb defines NTDB_DATA (ie. both are compatible).
+  If you include both ntdb.h and tdb.h, #include tdb.h first,
+  otherwise you'll get a compile error when tdb.h re-defines struct
+  TDB_DATA.
+
+  Example:
+       #include <tdb.h>
+       #include <ntdb.h>
+
+- ntdb functions return NTDB_SUCCESS (ie 0) on success, and a negative
+  error on failure, whereas tdb functions returned 0 on success, and
+  -1 on failure.  tdb then used tdb_error() to determine the error;
+  this API is nasty if we ever want to support threads, so is not supported.
+
+  Example:
+       #include <tdb.h>
+       #include <ntdb.h>
+
+       void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
+       {
+               if (tdb_store(tdb, key, d) == -1) {
+                       printf("store failed: %s\n", tdb_errorstr(tdb));
+               }
+       }
+
+       void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
+       {
+               enum NTDB_ERROR e;
+
+               e = ntdb_store(ntdb, key, d);
+               if (e) {
+                       printf("store failed: %s\n", ntdb_errorstr(e));
+               }
+       }
+
+- ntdb's ntdb_fetch() returns an error, tdb's returned the data directly
+  (or tdb_null, and you were supposed to check tdb_error() to find out why).
+
+  Example:
+       #include <tdb.h>
+       #include <ntdb.h>
+
+       void tdb_example(struct tdb_context *tdb, TDB_DATA key)
+       {
+               TDB_DATA data;
+
+               data = tdb_fetch(tdb, key);
+               if (!data.dptr) {
+                       printf("fetch failed: %s\n", tdb_errorstr(tdb));
+               }
+       }
+
+       void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key)
+       {
+               NTDB_DATA data;
+               enum NTDB_ERROR e;
+
+               e = ntdb_fetch(ntdb, key, &data);
+               if (e) {
+                       printf("fetch failed: %s\n", ntdb_errorstr(e));
+               }
+       }
+
+- ntdb's ntdb_nextkey() frees the old key's dptr, in tdb you needed to do
+  this manually.
+
+  Example:
+       #include <tdb.h>
+       #include <ntdb.h>
+
+       void tdb_example(struct tdb_context *tdb)
+       {
+               TDB_DATA key, next, data;
+
+               for (key = tdb_firstkey(tdb); key.dptr; key = next) {
+                       printf("Got key!\n");
+                       next = tdb_nextkey(tdb, key);
+                       free(key.dptr);
+               }
+       }
+
+
+       void ntdb_example(struct ntdb_context *ntdb)
+       {
+               NTDB_DATA k, data;
+               enum NTDB_ERROR e;
+
+               for (e = ntdb_firstkey(ntdb,&k); !e; e = ntdb_nextkey(ntdb,&k))
+                       printf("Got key!\n");
+       }
+
+- Unlike tdb_open/tdb_open_ex, ntdb_open does not allow NULL names,
+  even for NTDB_INTERNAL dbs, and thus ntdb_name() never returns NULL.
+
+  Example:
+       #include <tdb.h>
+       #include <ntdb.h>
+
+       struct tdb_context *tdb_example(void)
+       {
+               return tdb_open(NULL, 0, TDB_INTERNAL, O_RDWR, 0);
+       }
+
+       struct ntdb_context *ntdb_example(void)
+       {
+               return ntdb_open("example", NTDB_INTERNAL, O_RDWR, 0, NULL);
+       }
+
+- ntdb uses a linked list of attribute structures to implement logging and
+  alternate hashes.  tdb used tdb_open_ex, which was not extensible.
+
+  Example:
+       #include <tdb.h>
+       #include <ntdb.h>
+
+       /* Custom hash function */
+       static unsigned int my_tdb_hash_func(TDB_DATA *key)
+       {
+               return key->dsize;
+       }
+
+       struct tdb_context *tdb_example(void)
+       {
+               return tdb_open_ex("example.tdb", 0, TDB_DEFAULT,
+                                  O_CREAT|O_RDWR, 0600, NULL,
+                                  my_tdb_hash_func);
+       }
+
+       /* Custom hash function */
+       static unsigned int my_ntdb_hash_func(const void *key, size_t len,
+                                             uint32_t seed, void *data)
+       {
+               return len;
+       }
+
+       struct ntdb_context *ntdb_example(void)
+       {
+               union ntdb_attribute hash;
+
+               hash.base.attr = NTDB_ATTRIBUTE_HASH;
+               hash.base.next = NULL;
+               hash.hash.fn = my_ntdb_hash_func;
+               return ntdb_open("example.ntdb", NTDB_DEFAULT,
+                                  O_CREAT|O_RDWR, 0600, &hash);
+       }
+
+- tdb's tdb_open/tdb_open_ex took an explicit hash size, defaulting to
+  131.  ntdb's uses an attribute for this, defaulting to 8192.
+
+  Example:
+       #include <tdb.h>
+       #include <ntdb.h>
+
+       struct tdb_context *tdb_example(void)
+       {
+               return tdb_open("example.tdb", 10007, TDB_DEFAULT,
+                               O_CREAT|O_RDWR, 0600);
+       }
+
+       struct ntdb_context *ntdb_example(void)
+       {
+               union ntdb_attribute hashsize;
+
+               hashsize.base.attr = NTDB_ATTRIBUTE_HASHSIZE;
+               hashsize.base.next = NULL;
+               hashsize.hashsize.size = 16384;
+               return ntdb_open("example.ntdb", NTDB_DEFAULT,
+                                  O_CREAT|O_RDWR, 0600, &hashsize);
+       }
+
+- ntdb's log function is simpler than tdb's log function.  The string
+  is already formatted, is not terminated by a '\n', and it takes an
+  enum ntdb_log_level not a tdb_debug_level, and which has only three
+  values: NTDB_LOG_ERROR, NTDB_LOG_USE_ERROR and NTDB_LOG_WARNING.
+
+       #include <tdb.h>
+       #include <ntdb.h>
+
+       static void tdb_log(struct tdb_context *tdb,
+                           enum tdb_debug_level level, const char *fmt, ...)
+       {
+               va_list ap;
+               const char *name;
+
+               switch (level) {
+               case TDB_DEBUG_FATAL:
+                       fprintf(stderr, "FATAL: ");
+                       break;
+               case TDB_DEBUG_ERROR:
+                       fprintf(stderr, "ERROR: ");
+                       break;
+               case TDB_DEBUG_WARNING:
+                       fprintf(stderr, "WARNING: ");
+                       break;
+               case TDB_DEBUG_TRACE:
+                       /* Don't print out tracing. */
+                       return;
+               }
+
+               name = tdb_name(tdb);
+               if (!name) {
+                       name = "unnamed";
+               }
+
+               fprintf(stderr, "tdb(%s):", name);
+
+               va_start(ap, fmt);
+               vfprintf(stderr, fmt, ap);
+               va_end(ap);
+       }
+
+       struct tdb_context *tdb_example(void)
+       {
+               struct tdb_logging_context lctx;
+
+               lctx.log_fn = tdb_log;
+               return tdb_open_ex("example.tdb", 0, TDB_DEFAULT,
+                                  O_CREAT|O_RDWR, 0600, &lctx, NULL);
+       }
+
+       static void ntdb_log(struct ntdb_context *ntdb,
+                            enum ntdb_log_level level,
+                            enum NTDB_ERROR ecode,
+                            const char *message,
+                            void *data)
+       {
+               switch (level) {
+               case NTDB_LOG_ERROR:
+                       fprintf(stderr, "ERROR: ");
+                       break;
+               case NTDB_LOG_USE_ERROR:
+                       /* We made a mistake, so abort. */
+                       abort();
+                       break;
+               case NTDB_LOG_WARNING:
+                       fprintf(stderr, "WARNING: ");
+                       break;
+               }
+
+               fprintf(stderr, "ntdb(%s):%s:%s\n",
+                       ntdb_name(ntdb), ntdb_errorstr(ecode), message);
+       }
+
+       struct ntdb_context *ntdb_example(void)
+       {
+               union ntdb_attribute log;
+
+               log.base.attr = NTDB_ATTRIBUTE_LOG;
+               log.base.next = NULL;
+               log.log.fn = ntdb_log;
+               return ntdb_open("example.ntdb", NTDB_DEFAULT,
+                                O_CREAT|O_RDWR, 0600, &log);
+       }
+
+- ntdb provides ntdb_deq() for comparing two NTDB_DATA, and ntdb_mkdata() for
+  creating an NTDB_DATA.
+
+       #include <tdb.h>
+       #include <ntdb.h>
+
+       void tdb_example(struct tdb_context *tdb)
+       {
+               TDB_DATA data, key;
+
+               key.dsize = strlen("hello");
+               key.dptr = "hello";
+               data = tdb_fetch(tdb, key);
+               if (data.dsize == key.dsize
+                   && !memcmp(data.dptr, key.dptr, key.dsize)) {
+                       printf("key is same as data\n");
+               }
+               free(data.dptr);
+       }
+
+       void ntdb_example(struct ntdb_context *ntdb)
+       {
+               NTDB_DATA data, key;
+
+               key = ntdb_mkdata("hello", strlen("hello"));
+               if (ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS) {
+                       if (ntdb_deq(key, data)) {
+                               printf("key is same as data\n");
+                       }
+                       free(data.dptr);
+               }
+       }
+
+- ntdb's ntdb_parse_record() takes a type-checked callback data
+  pointer, not a void * (though a void * pointer still works).  The
+  callback function is allowed to do read operations on the database,
+  or write operations if you first call ntdb_lockall().  TDB's
+  tdb_parse_record() did not allow any database access within the
+  callback, could crash if you tried.
+
+  Example:
+       #include <tdb.h>
+       #include <ntdb.h>
+
+       static int tdb_parser(TDB_DATA key, TDB_DATA data, void *private_data)
+       {
+               TDB_DATA *expect = private_data;
+
+               return data.dsize == expect->dsize
+                       && !memcmp(data.dptr, expect->dptr, data.dsize);
+       }
+
+       void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
+       {
+               switch (tdb_parse_record(tdb, key, tdb_parser, &d)) {
+               case -1:
+                       printf("parse failed: %s\n", tdb_errorstr(tdb));
+                       break;
+               case 0:
+                       printf("data was different!\n");
+                       break;
+               case 1:
+                       printf("data was same!\n");
+                       break;
+               }
+       }
+
+       static int ntdb_parser(NTDB_DATA key, NTDB_DATA data, NTDB_DATA *expect)
+       {
+               return ntdb_deq(data, *expect);
+       }
+
+       void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
+       {
+               enum NTDB_ERROR e;
+
+               e = ntdb_parse_record(ntdb, key, ntdb_parser, &d);
+               switch (e) {
+               case 0:
+                       printf("data was different!\n");
+                       break;
+               case 1:
+                       printf("data was same!\n");
+                       break;
+               default:
+                       printf("parse failed: %s\n", ntdb_errorstr(e));
+                       break;
+               }
+       }
+
+- ntdb does locking on read-only databases (ie. O_RDONLY passed to ntdb_open).
+  tdb did not: use the NTDB_NOLOCK flag if you want to suppress locking.
+
+  Example:
+       #include <tdb.h>
+       #include <ntdb.h>
+
+       struct tdb_context *tdb_example(void)
+       {
+               return tdb_open("example.tdb", 0, TDB_DEFAULT, O_RDONLY, 0);
+       }
+
+       struct ntdb_context *ntdb_example(void)
+       {
+               return ntdb_open("example.ntdb", NTDB_NOLOCK, O_RDONLY, 0, NULL);
+       }
+
+- Failure inside a transaction (such as a lock function failing) does
+  not implicitly cancel the transaction; you still need to call
+  ntdb_transaction_cancel().
+
+       #include <tdb.h>
+       #include <ntdb.h>
+
+       void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
+       {
+               if (tdb_transaction_start(tdb) == -1) {
+                       printf("transaction failed: %s\n", tdb_errorstr(tdb));
+                       return;
+               }
+
+               if (tdb_store(tdb, key, d) == -1) {
+                       printf("store failed: %s\n", tdb_errorstr(tdb));
+                       return;
+               }
+               if (tdb_transaction_commit(tdb) == -1) {
+                       printf("commit failed: %s\n", tdb_errorstr(tdb));
+               }
+       }
+
+       void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
+       {
+               enum NTDB_ERROR e;
+
+               e = ntdb_transaction_start(ntdb);
+               if (e) {
+                       printf("transaction failed: %s\n", ntdb_errorstr(e));
+                       return;
+               }
+
+               e = ntdb_store(ntdb, key, d);
+               if (e) {
+                       printf("store failed: %s\n", ntdb_errorstr(e));
+                       ntdb_transaction_cancel(ntdb);
+                       return;
+               }
+
+               e = ntdb_transaction_commit(ntdb);
+               if (e) {
+                       printf("commit failed: %s\n", ntdb_errorstr(e));
+               }
+       }
+
+- There is no NTDB_CLEAR_IF_FIRST flag; it has severe scalability and
+  API problems.  If necessary, you can emulate this by using the open
+  hook and placing a 1-byte lock at offset 4.  If your program forks
+  and exits, you will need to place this lock again in the child before
+  the parent exits.
+
+  Example:
+
+       #include <tdb.h>
+       #include <ntdb.h>
+
+       struct tdb_context *tdb_example(void)
+       {
+               return tdb_open("example.tdb", 0, TDB_CLEAR_IF_FIRST,
+                                  O_CREAT|O_RDWR, 0600);
+       }
+
+       static enum NTDB_ERROR clear_if_first(int fd, void *unused)
+       {
+               /* We always hold a lock at offset 4, so we can tell
+                * if anyone else is holding it. */
+               struct flock fl;
+
+               fl.l_type = F_WRLCK;
+               fl.l_whence = SEEK_SET;
+               fl.l_start = 4; /* ACTIVE_LOCK */
+               fl.l_len = 1;
+
+               if (fcntl(fd, F_SETLK, &fl) == 0) {
+                       /* We must be first ones to open it!  Clear it. */
+                       if (ftruncate(fd, 0) != 0) {
+                               return NTDB_ERR_IO;
+                       }
+               }
+               fl.l_type = F_RDLCK;
+               if (fcntl(fd, F_SETLKW, &fl) != 0) {
+                       return NTDB_ERR_IO;
+               }
+               return NTDB_SUCCESS;
+       }
+
+       struct ntdb_context *ntdb_example(void)
+       {
+               union ntdb_attribute open_attr;
+
+               open_attr.openhook.base.attr = NTDB_ATTRIBUTE_OPENHOOK;
+               open_attr.openhook.base.next = NULL;
+               open_attr.openhook.fn = clear_if_first;
+
+               return ntdb_open("example.ntdb", NTDB_DEFAULT,
+                                O_CREAT|O_RDWR, 0600, &open_attr);
+       }
+
+- ntdb traversals are not reliable if the database is changed during
+  the traversal, ie your traversal may not cover all elements, or may
+  cover elements multiple times.  As a special exception, deleting the
+  current record within ntdb_traverse() is reliable.
+
+- There is no ntdb_traverse_read, since ntdb_traverse does not hold
+  a lock across the entire traversal anyway.  If you want to make sure
+  that your traversal function does not write to the database, you can
+  set and clear the NTDB_RDONLY flag around the traversal.
+
+- ntdb does not need tdb_reopen() or tdb_reopen_all().  If you call
+  fork() during certain operations, the child should close the
+  ntdb, or complete the operations before continuing to use the ntdb:
+
+       ntdb_transaction_start(): child must ntdb_transaction_cancel()
+       ntdb_lockall(): child must call ntdb_unlockall()
+       ntdb_lockall_read(): child must call ntdb_unlockall_read()
+       ntdb_chainlock(): child must call ntdb_chainunlock()
+       ntdb_parse() callback: child must return from ntdb_parse()
+
+- ntdb will not open a non-ntdb file, even if O_CREAT is specified.  tdb
+  will overwrite an unknown file in that case.
diff --git a/ccan/ntdb/doc/design.lyx b/ccan/ntdb/doc/design.lyx
new file mode 100644 (file)
index 0000000..5a10ee3
--- /dev/null
@@ -0,0 +1,2727 @@
+#LyX 2.0 created this file. For more info see http://www.lyx.org/
+\lyxformat 413
+\begin_document
+\begin_header
+\textclass article
+\use_default_options true
+\maintain_unincluded_children false
+\language english
+\language_package default
+\inputencoding auto
+\fontencoding global
+\font_roman default
+\font_sans default
+\font_typewriter default
+\font_default_family default
+\use_non_tex_fonts false
+\font_sc false
+\font_osf false
+\font_sf_scale 100
+\font_tt_scale 100
+
+\graphics default
+\default_output_format default
+\output_sync 0
+\bibtex_command default
+\index_command default
+\paperfontsize default
+\use_hyperref false
+\papersize default
+\use_geometry false
+\use_amsmath 1
+\use_esint 1
+\use_mhchem 1
+\use_mathdots 1
+\cite_engine basic
+\use_bibtopic false
+\use_indices false
+\paperorientation portrait
+\suppress_date false
+\use_refstyle 0
+\index Index
+\shortcut idx
+\color #008000
+\end_index
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation indent
+\paragraph_indentation default
+\quotes_language english
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+\tracking_changes true
+\output_changes true
+\html_math_output 0
+\html_css_as_file 0
+\html_be_strict false
+\end_header
+
+\begin_body
+
+\begin_layout Title
+NTDB: Redesigning The Trivial DataBase
+\end_layout
+
+\begin_layout Author
+Rusty Russell, IBM Corporation
+\end_layout
+
+\begin_layout Date
+19 June 2012
+\end_layout
+
+\begin_layout Abstract
+The Trivial DataBase on-disk format is 32 bits; with usage cases heading
+ towards the 4G limit, that must change.
+ This required breakage provides an opportunity to revisit TDB's other design
+ decisions and reassess them.
+\end_layout
+
+\begin_layout Section
+Introduction
+\end_layout
+
+\begin_layout Standard
+The Trivial DataBase was originally written by Andrew Tridgell as a simple
+ key/data pair storage system with the same API as dbm, but allowing multiple
+ readers and writers while being small enough (< 1000 lines of C) to include
+ in SAMBA.
+ The simple design created in 1999 has proven surprisingly robust and performant
+, used in Samba versions 3 and 4 as well as numerous other projects.
+ Its useful life was greatly increased by the (backwards-compatible!) addition
+ of transaction support in 2005.
+\end_layout
+
+\begin_layout Standard
+The wider variety and greater demands of TDB-using code have led to some
+ organic growth of the API, as well as some compromises on the implementation.
+ None of these, by themselves, are seen as show-stoppers, but the cumulative
+ effect is a loss of elegance over the initial, simple TDB implementation.
+ Here is a table of the approximate number of lines of implementation code
+ and number of API functions at the end of each year:
+\end_layout
+
+\begin_layout Standard
+\begin_inset Tabular
+<lyxtabular version="3" rows="12" columns="3">
+<features tabularvalignment="middle">
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Year End
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+API Functions
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Lines of C Code Implementation
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1999
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+13
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1195
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2000
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+24
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1725
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2001
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+32
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2228
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2002
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2481
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2003
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2552
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2004
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+40
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2584
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2005
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+38
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2647
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2006
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+52
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+3754
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2007
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+66
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4398
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2008
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+71
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4768
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2009
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+73
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+5715
+\end_layout
+
+\end_inset
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+This review is an attempt to catalog and address all the known issues with
+ TDB and create solutions which address the problems without significantly
+ increasing complexity; all involved are far too aware of the dangers of
+ second system syndrome in rewriting a successful project like this.
+\end_layout
+
+\begin_layout Standard
+Note: the final decision was to make ntdb a separate library, with a separate
+ 'ntdb' namespace so both can potentially be linked together.
+ This document still refers to
+\begin_inset Quotes eld
+\end_inset
+
+tdb
+\begin_inset Quotes erd
+\end_inset
+
+ everywhere, for simplicity.
+\end_layout
+
+\begin_layout Section
+API Issues
+\end_layout
+
+\begin_layout Subsection
+tdb_open_ex Is Not Expandable
+\end_layout
+
+\begin_layout Standard
+The tdb_open() call was expanded to tdb_open_ex(), which added an optional
+ hashing function and an optional logging function argument.
+ Additional arguments to open would require the introduction of a tdb_open_ex2
+ call etc.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "attributes"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+tdb_open() will take a linked-list of attributes:
+\end_layout
+
+\begin_layout LyX-Code
+enum tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+    TDB_ATTRIBUTE_LOG = 0,
+\end_layout
+
+\begin_layout LyX-Code
+    TDB_ATTRIBUTE_HASH = 1
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_base {
+\end_layout
+
+\begin_layout LyX-Code
+    enum tdb_attribute attr;
+\end_layout
+
+\begin_layout LyX-Code
+    union tdb_attribute *next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_log {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
+\end_layout
+
+\begin_layout LyX-Code
+    tdb_log_func log_fn;
+\end_layout
+
+\begin_layout LyX-Code
+    void *log_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_hash {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
+\end_layout
+
+\begin_layout LyX-Code
+    tdb_hash_func hash_fn;
+\end_layout
+
+\begin_layout LyX-Code
+    void *hash_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+union tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_base base;
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_log log;
+\end_layout
+
+\begin_layout LyX-Code
+    struct tdb_attribute_hash hash;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+This allows future attributes to be added, even if this expands the size
+ of the union.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+tdb_traverse Makes Impossible Guarantees
+\end_layout
+
+\begin_layout Standard
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
+ was thought that it was important to guarantee that all records which exist
+ at the start and end of the traversal would be included, and no record
+ would be included twice.
+\end_layout
+
+\begin_layout Standard
+This adds complexity (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Reliable-Traversal-Adds"
+
+\end_inset
+
+) and does not work anyway for records which are altered (in particular,
+ those which are expanded may be effectively deleted and re-added behind
+ the traversal).
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "traverse-Proposed-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Abandon the guarantee.
+ You will see every record if no changes occur during your traversal, otherwise
+ you will see some subset.
+ You can prevent changes by using a transaction or the locking API.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+ Delete-during-traverse will still delete every record, too (assuming no
+ other changes).
+\end_layout
+
+\begin_layout Subsection
+Nesting of Transactions Is Fraught
+\end_layout
+
+\begin_layout Standard
+TDB has alternated between allowing nested transactions and not allowing
+ them.
+ Various paths in the Samba codebase assume that transactions will nest,
+ and in a sense they can: the operation is only committed to disk when the
+ outer transaction is committed.
+ There are two problems, however:
+\end_layout
+
+\begin_layout Enumerate
+Canceling the inner transaction will cause the outer transaction commit
+ to fail, and will not undo any operations since the inner transaction began.
+ This problem is soluble with some additional internal code.
+\end_layout
+
+\begin_layout Enumerate
+An inner transaction commit can be cancelled by the outer transaction.
+ This is desirable in the way which Samba's database initialization code
+ uses transactions, but could be a surprise to any users expecting a successful
+ transaction commit to expose changes to others.
+\end_layout
+
+\begin_layout Standard
+The current solution is to specify the behavior at tdb_open(), with the
+ default currently that nested transactions are allowed.
+ This flag can also be changed at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Given the usage patterns, it seems that the
+\begin_inset Quotes eld
+\end_inset
+
+least-surprise
+\begin_inset Quotes erd
+\end_inset
+
+ behavior of disallowing nested transactions should become the default.
+ Additionally, it seems the outer transaction is the only code which knows
+ whether inner transactions should be allowed, so a flag to indicate this
+ could be added to tdb_transaction_start.
+ However, this behavior can be simulated with a wrapper which uses tdb_add_flags
+() and tdb_remove_flags(), so the API should not be expanded for this relatively
+-obscure case.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete; the nesting flag has been removed.
+\end_layout
+
+\begin_layout Subsection
+Incorrect Hash Function is Not Detected
+\end_layout
+
+\begin_layout Standard
+tdb_open_ex() allows the calling code to specify a different hash function
+ to use, but does not check that all other processes accessing this tdb
+ are using the same hash function.
+ The result is that records are missing from tdb_fetch().
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain an example hash result (eg.
+ the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
+ hash function produces the same answer, or fail the tdb_open call.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+\end_layout
+
+\begin_layout Standard
+In response to scalability issues with the free list (
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Freelist-Is"
+
+\end_inset
+
+) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
+ and the TDB_VOLATILE flag to tdb_open.
+ The latter actually calls the former with an argument of
+\begin_inset Quotes eld
+\end_inset
+
+5
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+This code allows deleted records to accumulate without putting them in the
+ free list.
+ On delete we iterate through each chain and free them in a batch if there
+ are more than max_dead entries.
+ These are never otherwise recycled except as a side-effect of a tdb_repack.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With the scalability problems of the freelist solved, this API can be removed.
+ The TDB_VOLATILE flag may still be useful as a hint that store and delete
+ of records will be at least as common as fetch in order to allow some internal
+ tuning, but initially will become a no-op.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+ Unknown flags cause tdb_open() to fail as well, so they can be detected
+ at runtime.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Files-Cannot"
+
+\end_inset
+
+TDB Files Cannot Be Opened Multiple Times In The Same Process
+\end_layout
+
+\begin_layout Standard
+No process can open the same TDB twice; we check and disallow it.
+ This is an unfortunate side-effect of fcntl locks, which operate on a per-file
+ rather than per-file-descriptor basis, and do not nest.
+ Thus, closing any file descriptor on a file clears all the locks obtained
+ by this process, even if they were placed using a different file descriptor!
+\end_layout
+
+\begin_layout Standard
+Note that even if this were solved, deadlock could occur if operations were
+ nested: this is a more manageable programming error in most cases.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We could lobby POSIX to fix the perverse rules, or at least lobby Linux
+ to violate them so that the most common implementation does not have this
+ restriction.
+ This would be a generally good idea for other fcntl lock users.
+\end_layout
+
+\begin_layout Standard
+Samba uses a wrapper which hands out the same tdb_context to multiple callers
+ if this happens, and does simple reference counting.
+ We should do this inside the tdb library, which already emulates lock nesting
+ internally; it would need to recognize when deadlock occurs within a single
+ process.
+ This would create a new failure mode for tdb operations (while we currently
+ handle locking failures, they are impossible in normal use and a process
+ encountering them can do little but give up).
+\end_layout
+
+\begin_layout Standard
+I do not see benefit in an additional tdb_open flag to indicate whether
+ re-opening is allowed, although there may be some benefit to adding a
+ call to detect when a tdb_context is shared, to allow others to create such
+ an API.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB API Is Not POSIX Thread-safe
+\end_layout
+
+\begin_layout Standard
+The TDB API uses an error code which can be queried after an operation to
+ determine what went wrong.
+ This programming model does not work with threads, unless specific additional
+ guarantees are given by the implementation.
+ In addition, even otherwise-independent threads cannot open the same TDB
+ (as in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Files-Cannot"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Rearchitecting the API to include a tdb_errcode pointer would be a great
+ deal of churn, but fortunately most functions return 0 on success and -1
+ on error: we can change these to return 0 on success and a negative error
+ code on error, and the API remains similar to previous.
+ The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA
+ pointer and return an error code.
+ It is also simpler to have tdb_nextkey replace its key argument in place,
+ freeing up any old .dptr.
+\end_layout
+
+\begin_layout Standard
+Internal locking is required to make sure that fcntl locks do not overlap
+ between threads, and also that the global list of tdbs is maintained.
+\end_layout
+
+\begin_layout Standard
+The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
+ version of the library, and otherwise no overhead will exist.
+ Alternatively, a hooking mechanism similar to that proposed for
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ could be used to enable pthread locking at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete; API has been changed but thread safety has not been implemented.
+\end_layout
+
+\begin_layout Subsection
+*_nonblock Functions And *_mark Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+CTDB
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Clustered TDB, see http://ctdb.samba.org
+\end_layout
+
+\end_inset
+
+ wishes to operate on TDB in a non-blocking manner.
+ This is currently done as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the _nonblock variant of an API function (eg.
+ tdb_lockall_nonblock).
+ If this fails:
+\end_layout
+
+\begin_layout Enumerate
+Fork a child process, and wait for it to call the normal variant (eg.
+ tdb_lockall).
+\end_layout
+
+\begin_layout Enumerate
+If the child succeeds, call the _mark variant to indicate we already have
+ the locks (eg.
+ tdb_lockall_mark).
+\end_layout
+
+\begin_layout Enumerate
+Upon completion, tell the child to release the locks (eg.
+ tdb_unlockall).
+\end_layout
+
+\begin_layout Enumerate
+Indicate to tdb that it should consider the locks removed (eg.
+ tdb_unlockall_mark).
+\end_layout
+
+\begin_layout Standard
+There are several issues with this approach.
+ Firstly, adding two new variants of each function clutters the API for
+ an obscure use, and so not all functions have three variants.
+ Secondly, it assumes that all paths of the functions ask for the same locks,
+ otherwise the parent process will have to get a lock which the child doesn't
+ have under some circumstances.
+ I don't believe this is currently the case, but it constrains the implementatio
+n.
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Proposed-Solution-locking-hook"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Implement a hook for locking methods, so that the caller can control the
+ calls to create and remove fcntl locks.
+ In this scenario, ctdbd would operate as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the normal API function, eg tdb_lockall().
+\end_layout
+
+\begin_layout Enumerate
+When the lock callback comes in, check if the child has the lock.
+ Initially, this is always false.
+ If so, return 0.
+ Otherwise, try to obtain it in non-blocking mode.
+ If that fails, return EWOULDBLOCK.
+\end_layout
+
+\begin_layout Enumerate
+Release locks in the unlock callback as normal.
+\end_layout
+
+\begin_layout Enumerate
+If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
+ child to repeat the operation.
+\end_layout
+
+\begin_layout Enumerate
+The child records what locks it obtains, and returns that information to
+ the parent.
+\end_layout
+
+\begin_layout Enumerate
+When the child has succeeded, goto 1.
+\end_layout
+
+\begin_layout Standard
+This is flexible enough to handle any potential locking scenario, even when
+ lock requirements change.
+ It can be optimized so that the parent does not release locks, just tells
+ the child which locks it doesn't need to obtain.
+\end_layout
+
+\begin_layout Standard
+It also keeps the complexity out of the API, and in ctdbd where it is needed.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+tdb_chainlock Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+tdb_chainlock locks some number of records, including the record indicated
+ by the given key.
+ This gave atomicity guarantees; no-one can start a transaction, alter,
+ read or delete that key while the lock is held.
+\end_layout
+
+\begin_layout Standard
+It also makes the same guarantee for any other key in the chain, which is
+ an internal implementation detail and potentially a cause for deadlock.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ It would be nice to have an explicit single entry lock which affected no
+ other keys.
+ Unfortunately, this won't work for an entry which doesn't exist.
+ Thus while chainlock may be implemented more efficiently for the existing
+ case, it will still have overlap issues with the non-existing case.
+ So it is best to keep the current (lack of) guarantee about which records
+ will be affected to avoid constraining our implementation.
+\end_layout
+
+\begin_layout Subsection
+Signal Handling is Not Race-Free
+\end_layout
+
+\begin_layout Standard
+The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
+ that the tdb locking code should return with a failure, rather than trying
+ again when a signal is received (and errno == EAGAIN).
+ This is usually used to implement timeouts.
+\end_layout
+
+\begin_layout Standard
+Unfortunately, this does not work in the case where the signal is received
+ before the tdb code enters the fcntl() call to place the lock: the code
+ will sleep within the fcntl() code, unaware that the signal wants it to
+ exit.
+ In the case of long timeouts, this does not happen in practice.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The locking hooks proposed in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ would allow the user to decide on whether to fail the lock acquisition
+ on a signal.
+ This allows the caller to choose their own compromise: they could narrow
+ the race by checking immediately before the fcntl call.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+It may be possible to make this race-free in some implementations by having
+ the signal handler alter the struct flock to make it invalid.
+ This will cause the fcntl() lock call to fail with EINVAL if the signal
+ occurs before the kernel is entered, otherwise EAGAIN.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+The API Uses Gratuitous Typedefs, Capitals
+\end_layout
+
+\begin_layout Standard
+typedefs are useful for providing source compatibility when types can differ
+ across implementations, or arguably in the case of function pointer definitions
+ which are hard for humans to parse.
+ Otherwise it is simply obfuscation and pollutes the namespace.
+\end_layout
+
+\begin_layout Standard
+Capitalization is usually reserved for compile-time constants and macros.
+\end_layout
+
+\begin_layout Description
+TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
+ definition isn't visible to the API user anyway.
+\end_layout
+
+\begin_layout Description
+TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
+ needs to be understood by the API user.
+\end_layout
+
+\begin_layout Description
+struct
+\begin_inset space ~
+\end_inset
+
+TDB_DATA This would normally be called 'struct tdb_data'.
+\end_layout
+
+\begin_layout Description
+enum
+\begin_inset space ~
+\end_inset
+
+TDB_ERROR Similarly, this would normally be enum tdb_error.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ Introducing lower case variants would please pedants like myself, but if
+ it were done the existing ones should be kept.
+ There is little point forcing a purely cosmetic change upon tdb users.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+tdb_log_func Doesn't Take The Private Pointer
+\end_layout
+
+\begin_layout Standard
+For API compatibility reasons, the logging function needs to call tdb_get_loggin
+g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+It should simply take an extra argument, since we are prepared to break
+ the API/ABI.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Various Callback Functions Are Not Typesafe
+\end_layout
+
+\begin_layout Standard
+The callback functions in tdb_set_logging_function (after
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
+ all take void * and must internally convert it to the argument type they
+ were expecting.
+\end_layout
+
+\begin_layout Standard
+If this type changes, the compiler will not produce warnings on the callers,
+ since it only sees void *.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With careful use of macros, we can create callback functions which give
+ a warning when used on gcc and the types of the callback and its private
+ argument differ.
+ Unsupported compilers will not give a warning, which is no worse than now.
+ In addition, the callbacks become clearer, as they need not use void *
+ for their parameter.
+\end_layout
+
+\begin_layout Standard
+See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
+\end_layout
+
+\begin_layout Standard
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
+ be cleared if the caller discovers it is the only process with the TDB
+ open.
+ However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
+ be detected, so will have the TDB erased underneath them (usually resulting
+ in a crash).
+\end_layout
+
+\begin_layout Standard
+There is a similar issue on fork(); if the parent exits (or otherwise closes
+ the tdb) before the child calls tdb_reopen_all() to establish the lock
+ used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
+ at that moment will believe it alone has opened the TDB and will erase
+ it.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove TDB_CLEAR_IF_FIRST.
+ Other workarounds are possible, but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+ An open hook is provided to replicate this functionality if required.
+\end_layout
+
+\begin_layout Subsection
+Extending The Header Is Difficult
+\end_layout
+
+\begin_layout Standard
+We have reserved (zeroed) words in the TDB header, which can be used for
+ future features.
+ If the future features are compulsory, the version number must be updated
+ to prevent old code from accessing the database.
+ But if the future feature is optional, we have no way of telling if older
+ code is accessing the database or not.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain a
+\begin_inset Quotes eld
+\end_inset
+
+format variant
+\begin_inset Quotes erd
+\end_inset
+
+ value (64-bit).
+ This is divided into two 32-bit parts:
+\end_layout
+
+\begin_layout Enumerate
+The lower part reflects the format variant understood by code accessing
+ the database.
+\end_layout
+
+\begin_layout Enumerate
+The upper part reflects the format variant you must understand to write
+ to the database (otherwise you can only open for reading).
+\end_layout
+
+\begin_layout Standard
+The latter field can only be written at creation time, the former should
+ be written under the OPEN_LOCK when opening the database for writing, if
+ the variant of the code is lower than the current lowest variant.
+\end_layout
+
+\begin_layout Standard
+This should allow backwards-compatible features to be added, and detection
+ if older code (which doesn't understand the feature) writes to the database.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Record Headers Are Not Expandible
+\end_layout
+
+\begin_layout Standard
+If we later want to add (say) checksums on keys and data, it would require
+ another format change, which we'd like to avoid.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We often have extra padding at the tail of a record.
+ If we ensure that the first byte (if any) of this padding is zero, we will
+ have a way for future changes to detect code which doesn't understand a
+ new format: the new code would write (say) a 1 at the tail, and thus if
+ there is no tail or the first byte is 0, we would know the extension is
+ not present on that record.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB Does Not Use Talloc
+\end_layout
+
+\begin_layout Standard
+Many users of TDB (particularly Samba) use the talloc allocator, and thus
+ have to wrap TDB in a talloc context to use it conveniently.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The allocation within TDB is not complicated enough to justify the use of
+ talloc, and I am reluctant to force another (excellent) library on TDB
+ users.
+ Nonetheless a compromise is possible.
+ An attribute (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) can be added later to tdb_open() to provide an alternate allocation mechanism,
+ specifically for talloc but usable by any other allocator (which would
+ ignore the
+\begin_inset Quotes eld
+\end_inset
+
+context
+\begin_inset Quotes erd
+\end_inset
+
+ argument).
+\end_layout
+
+\begin_layout Standard
+This would form a talloc hierarchy as expected, but the caller would still
+ have to attach a destructor to the tdb context returned from tdb_open to
+ close it.
+ All TDB_DATA fields would be children of the tdb_context, and the caller
+ would still have to manage them (using talloc_free() or talloc_steal()).
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete, using the NTDB_ATTRIBUTE_ALLOCATOR attribute.
+\end_layout
+
+\begin_layout Section
+Performance And Scalability Issues
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+TDB_CLEAR_IF_FIRST Imposes Performance Penalty
+\end_layout
+
+\begin_layout Standard
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
+ 4 (aka.
+ the ACTIVE_LOCK).
+ While these locks never conflict in normal tdb usage, they do add substantial
+ overhead for most fcntl lock implementations when the kernel scans to detect
+ if a lock conflict exists.
+ This is often a single linked list, making the time to acquire and release
+ a fcntl lock O(N) where N is the number of processes with the TDB open,
+ not the number actually doing work.
+\end_layout
+
+\begin_layout Standard
+In a Samba server it is common to have huge numbers of clients sitting idle,
+ and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+There is a flag to tdb_reopen_all() which is used for this optimization:
+ if the parent process will outlive the child, the child does not need the
+ ACTIVE_LOCK.
+ This is a workaround for this very performance issue.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove the flag.
+ It was a neat idea, but even trivial servers tend to know when they are
+ initializing for the first time and can simply unlink the old tdb at that
+ point.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB Files Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This seems to be becoming an issue (so much for
+\begin_inset Quotes eld
+\end_inset
+
+trivial
+\begin_inset Quotes erd
+\end_inset
+
+!), particularly for ldb.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+A new, incompatible TDB format which uses 64 bit offsets internally rather
+ than 32 bit as now.
+ For simplicity of endian conversion (which TDB does on the fly if required),
+ all values will be 64 bit on disk.
+ In practice, some upper bits may be used for other purposes, but at least
+ 56 bits will be available for file offsets.
+\end_layout
+
+\begin_layout Standard
+tdb_open() will automatically detect the old version, and even create them
+ if TDB_VERSION6 is specified to tdb_open.
+\end_layout
+
+\begin_layout Standard
+32 bit processes will still be able to access TDBs larger than 4G (assuming
+ that their off_t allows them to seek to 64 bits), they will gracefully
+ fall back as they fail to mmap.
+ This can happen already with large TDBs.
+\end_layout
+
+\begin_layout Standard
+Old versions of tdb will fail to open the new TDB files (since 28 August
+ 2009, commit 398d0c29290: prior to that any unrecognized file format would
+ be erased and initialized as a fresh tdb!)
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB Records Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This has not been a reported problem, and the API uses size_t which can
+ be 64 bit on 64 bit platforms.
+ However, other limits may have made such an issue moot.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Record sizes will be 64 bit, with an error returned on 32 bit platforms
+ which try to access such records (the current implementation would return
+ TDB_ERR_OOM in a similar case).
+ It seems unlikely that 32 bit keys will be a limitation, so the implementation
+ may not support this (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Hash Size Is Determined At TDB Creation Time
+\end_layout
+
+\begin_layout Standard
+TDB contains a number of hash chains in the header; the number is specified
+ at creation time, and defaults to 131.
+ This is such a bottleneck on large databases (as each hash chain gets quite
+ long), that LDB uses 10,000 for this hash.
+ In general it is impossible to know what the 'right' answer is at database
+ creation time.
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Hash-Size-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+After comprehensive performance testing on various scalable hash variants
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
+ because I was previously convinced that an expanding tree of hashes would
+ be very close to optimal.
+\end_layout
+
+\end_inset
+
+, it became clear that it is hard to beat a straight linear hash table which
+ doubles in size when it reaches saturation.
+ Unfortunately, altering the hash table introduces serious locking complications
+: the entire hash table needs to be locked to enlarge the hash table, and
+ others might be holding locks.
+ Particularly insidious are insertions done under tdb_chainlock.
+\end_layout
+
+\begin_layout Standard
+Thus an expanding layered hash will be used: an array of hash groups, with
+ each hash group exploding into pointers to lower hash groups once it fills,
+ turning into a hash tree.
+ This has implications for locking: we must lock the entire group in case
+ we need to expand it, yet we don't know how deep the tree is at that point.
+\end_layout
+
+\begin_layout Standard
+Note that bits from the hash table entries should be stolen to hold more
+ hash bits to reduce the penalty of collisions.
+ We can use the otherwise-unused lower 3 bits.
+ If we limit the size of the database to 64 exabytes, we can use the top
+ 8 bits of the hash entry as well.
+ These 11 bits would reduce false positives down to 1 in 2000 which is more
+ than we need: we can use one of the bits to indicate that the extra hash
+ bits are valid.
+ This means we can choose not to re-hash all entries when we expand a hash
+ group; simply use the next bits we need and mark them invalid.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Ignore.
+ Scaling the hash automatically proved inefficient at small hash sizes;
+ we default to an 8192-element hash (changeable via NTDB_ATTRIBUTE_HASHSIZE),
+ and when buckets clash we expand to an array of hash entries.
+ This scales slightly better than the tdb chain (due to the 8 top bits containin
+g extra hash).
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Freelist-Is"
+
+\end_inset
+
+TDB Freelist Is Highly Contended
+\end_layout
+
+\begin_layout Standard
+TDB uses a single linked list for the free list.
+ Allocation occurs as follows, using heuristics which have evolved over
+ time:
+\end_layout
+
+\begin_layout Enumerate
+Get the free list lock for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Multiply length by 1.25, so we always over-allocate by 25%.
+\end_layout
+
+\begin_layout Enumerate
+Set the slack multiplier to 1.
+\end_layout
+
+\begin_layout Enumerate
+Examine the current freelist entry: if it is > length but < the current
+ best case, remember it as the best case.
+\end_layout
+
+\begin_layout Enumerate
+Multiply the slack multiplier by 1.05.
+\end_layout
+
+\begin_layout Enumerate
+If our best fit so far is less than length * slack multiplier, return it.
+ The slack will be turned into a new free record if it's large enough.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, go onto the next freelist entry.
+\end_layout
+
+\begin_layout Standard
+Deleting a record occurs as follows:
+\end_layout
+
+\begin_layout Enumerate
+Lock the hash chain for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Walk the chain to find the record, keeping the prev pointer offset.
+\end_layout
+
+\begin_layout Enumerate
+If max_dead is non-zero:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Walk the hash chain again and count the dead records.
+\end_layout
+
+\begin_layout Enumerate
+If it's more than max_dead, bulk free all the dead ones (similar to steps
+ 4 and below, but the lock is only obtained once).
+\end_layout
+
+\begin_layout Enumerate
+Simply mark this record as dead and return.
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+Get the free list lock for the remainder of this operation.
+\end_layout
+
+\begin_layout Enumerate
+\begin_inset CommandInset label
+LatexCommand label
+name "right-merging"
+
+\end_inset
+
+Examine the following block to see if it is free; if so, enlarge the current
+ block and remove that block from the free list.
+ This was disabled, as removal from the free list was O(entries-in-free-list).
+\end_layout
+
+\begin_layout Enumerate
+Examine the preceding block to see if it is free: for this reason, each
+ block has a 32-bit tailer which indicates its length.
+ If it is free, expand it to cover our new block and return.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, prepend ourselves to the free list.
+\end_layout
+
+\begin_layout Standard
+Disabling right-merging (step
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "right-merging"
+
+\end_inset
+
+) causes fragmentation; the other heuristics proved insufficient to address
+ this, so the final answer to this was that when we expand the TDB file
+ inside a transaction commit, we repack the entire tdb.
+\end_layout
+
+\begin_layout Standard
+The single list lock limits our allocation rate; due to the other issues
+ this is not currently seen as a bottleneck.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The first step is to remove all the current heuristics, as they obviously
+ interact, then examine them once the lock contention is addressed.
+\end_layout
+
+\begin_layout Standard
+The free list must be split to reduce contention.
+ Assuming perfect free merging, we can at most have 1 free list entry for
+ each entry.
+ This implies that the number of free lists is related to the size of the
+ hash table, but as it is rare to walk a large number of free list entries
+ we can use far fewer, say 1/32 of the number of hash buckets.
+\end_layout
+
+\begin_layout Standard
+It seems tempting to try to reuse the hash implementation which we use for
+ records here, but we have two ways of searching for free entries: for allocatio
+n we search by size (and possibly zone) which produces too many clashes
+ for our hash table to handle well, and for coalescing we search by address.
+ Thus an array of doubly-linked free lists seems preferable.
+\end_layout
+
+\begin_layout Standard
+There are various benefits in using per-size free lists (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+) but it's not clear this would reduce contention in the common case where
+ all processes are allocating/freeing the same size.
+ Thus we almost certainly need to divide in other ways: the most obvious
+ is to divide the file into zones, and using a free list (or table of free
+ lists) for each.
+ This approximates address ordering.
+\end_layout
+
+\begin_layout Standard
+Unfortunately it is difficult to know what heuristics should be used to
+ determine zone sizes, and our transaction code relies on being able to
+ create a
+\begin_inset Quotes eld
+\end_inset
+
+recovery area
+\begin_inset Quotes erd
+\end_inset
+
+ by simply appending to the file (difficult if it would need to create a
+ new zone header).
+ Thus we use a linked-list of free tables; currently we only ever create
+ one, but if there is more than one we choose one at random to use.
+ In future we may use heuristics to add new free tables on contention.
+ We only expand the file when all free tables are exhausted.
+\end_layout
+
+\begin_layout Standard
+The basic algorithm is as follows.
+ Freeing is simple:
+\end_layout
+
+\begin_layout Enumerate
+Identify the correct free list.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+Re-check the list (we didn't have a lock, sizes could have changed): relock
+ if necessary.
+\end_layout
+
+\begin_layout Enumerate
+Place the freed entry in the list.
+\end_layout
+
+\begin_layout Standard
+Allocation is a little more complicated, as we perform delayed coalescing
+ at this point:
+\end_layout
+
+\begin_layout Enumerate
+Pick a free table; usually the previous one.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+If the top entry is large enough, remove it from the list and return it.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, coalesce entries in the list.
+ If there was no entry large enough, unlock the list and try the next largest list.
+\end_layout
+
+\begin_layout Enumerate
+If no list has an entry which meets our needs, try the next free table.
+\end_layout
+
+\begin_layout Enumerate
+If no zone satisfies, expand the file.
+\end_layout
+
+\begin_layout Standard
+This optimizes rapid insert/delete of free list entries by not coalescing
+ them all the time.
+ First-fit address ordering seems to be fairly good for keeping
+ fragmentation low (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+).
+ Note that address ordering does not need a tailer to coalesce, though if
+ we needed one we could have one cheaply: see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+Each free entry has the free table number in the header: less than 255.
+ It also contains a doubly-linked list for easy deletion.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+TDB Becomes Fragmented
+\end_layout
+
+\begin_layout Standard
+Much of this is a result of allocation strategy
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
+xas.edu/pub/garbage/malloc/ismm98.ps
+\end_layout
+
+\end_inset
+
+ and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
+on) is deliberately set at 25%, and external fragmentation is only cured
+ by the decision to repack the entire db when a transaction commit needs
+ to enlarge the file.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The 25% overhead on allocation works in practice for ldb because indexes
+ tend to expand by one record at a time.
+ This internal fragmentation can be resolved by having an
+\begin_inset Quotes eld
+\end_inset
+
+expanded
+\begin_inset Quotes erd
+\end_inset
+
+ bit in the header to note entries that have previously expanded, and allocating
+ more space for them.
+\end_layout
+
+\begin_layout Standard
+There is a spectrum of possible solutions for external fragmentation:
+ one is to use a fragmentation-avoiding allocation strategy such as best-fit
+ address-order allocator.
+ The other end of the spectrum would be to use a bump allocator (very fast
+ and simple) and simply repack the file when we reach the end.
+\end_layout
+
+\begin_layout Standard
+There are three problems with efficient fragmentation-avoiding allocators:
+ they are non-trivial, they tend to use a single free list for each size,
+ and there's no evidence that tdb allocation patterns will match those recorded
+ for general allocators (though it seems likely).
+\end_layout
+
+\begin_layout Standard
+Thus we don't spend too much effort on external fragmentation; we will be
+ no worse than the current code if we need to repack on occasion.
+ More effort is spent on reducing freelist contention, and reducing overhead.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Records-Incur-A"
+
+\end_inset
+
+Records Incur A 28-Byte Overhead
+\end_layout
+
+\begin_layout Standard
+Each TDB record has a header as follows:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_record {
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_off_t next; /* offset of the next record in the list */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t rec_len; /* total byte length of record */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t key_len; /* byte length of key */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t data_len; /* byte length of data */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t magic;   /* try to catch errors */
+\end_layout
+
+\begin_layout LyX-Code
+        /* the following union is implied:
+\end_layout
+
+\begin_layout LyX-Code
+                union {
+\end_layout
+
+\begin_layout LyX-Code
+                        char record[rec_len];
+\end_layout
+
+\begin_layout LyX-Code
+                        struct {
+\end_layout
+
+\begin_layout LyX-Code
+                                char key[key_len];
+\end_layout
+
+\begin_layout LyX-Code
+                                char data[data_len];
+\end_layout
+
+\begin_layout LyX-Code
+                        }
+\end_layout
+
+\begin_layout LyX-Code
+                        uint32_t totalsize; (tailer)
+\end_layout
+
+\begin_layout LyX-Code
+                }
+\end_layout
+
+\begin_layout LyX-Code
+        */
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+Naively, this would double to a 56-byte overhead on a 64 bit implementation.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We can use various techniques to reduce this for an allocated block:
+\end_layout
+
+\begin_layout Enumerate
+The 'next' pointer is not required, as we are using a flat hash table.
+\end_layout
+
+\begin_layout Enumerate
+'rec_len' can instead be expressed as an addition to key_len and data_len
+ (it accounts for wasted or overallocated length in the record).
+ Since the record length is always a multiple of 8, we can conveniently
+ fit it in 32 bits (representing up to 35 bits).
+\end_layout
+
+\begin_layout Enumerate
+'key_len' and 'data_len' can be reduced.
+ I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
+ the two into one 64-bit field and using a 5 bit value which indicates at
+ what bit to divide the two.
+ Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
+ size of 32 bits.
+\end_layout
+
+\begin_layout Enumerate
+'full_hash' is used to avoid a memcmp on the
+\begin_inset Quotes eld
+\end_inset
+
+miss
+\begin_inset Quotes erd
+\end_inset
+
+ case, but this is diminishing returns after a handful of bits (at 10 bits,
+ it reduces 99.9% of false memcmp).
+ As an aside, as the lower bits are already incorporated in the hash table
+ resolution, the upper bits should be used here.
+ Note that it's not clear that these bits will be a win, given the extra
+ bits in the hash table itself (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Hash-Size-Solution"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Enumerate
+'magic' does not need to be enlarged: it currently reflects one of 5 values
+ (used, free, dead, recovery, and unused_recovery).
+ It is useful for quick sanity checking however, and should not be eliminated.
+\end_layout
+
+\begin_layout Enumerate
+'tailer' is only used to coalesce free blocks (so a block to the right can
+ find the header to check if this block is free).
+ This can be replaced by a single 'free' bit in the header of the following
+ block (and the tailer only exists in free blocks).
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+This technique from Thomas Standish.
+ Data Structure Techniques.
+ Addison-Wesley, Reading, Massachusetts, 1980.
+\end_layout
+
+\end_inset
+
+ The current proposed coalescing algorithm doesn't need this, however.
+\end_layout
+
+\begin_layout Standard
+This produces a 16 byte used header like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_used_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t used_magic : 16,
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout LyX-Code
+                 key_data_divide: 5,
+\end_layout
+
+\begin_layout LyX-Code
+                 top_hash: 11;
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t extra_octets;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t key_and_data_len;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+And a free record like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_free_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t free_magic: 8,
+\end_layout
+
+\begin_layout LyX-Code
+                   prev : 56;
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t free_table: 8,
+\end_layout
+
+\begin_layout LyX-Code
+                 total_length : 56;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+Note that by limiting valid offsets to 56 bits, we can pack everything we
+ need into 3 64-bit words, meaning our minimum record size is 8 bytes.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Transaction Commit Requires 4 fdatasync
+\end_layout
+
+\begin_layout Standard
+The current transaction algorithm is:
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+overwrite_with_new_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+remove_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Standard
+On current ext3, each sync flushes all data to disk, so the next 3 syncs
+ are relatively expensive.
+ But this could become a performance bottleneck on other filesystems such
+ as ext4.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Neil Brown points out that this is overzealous, and only one sync is needed:
+\end_layout
+
+\begin_layout Enumerate
+Bundle the recovery data, a transaction counter and a strong checksum of
+ the new data.
+\end_layout
+
+\begin_layout Enumerate
+Compute a strong checksum of that whole bundle.
+\end_layout
+
+\begin_layout Enumerate
+Store the bundle in the database.
+\end_layout
+
+\begin_layout Enumerate
+Overwrite the oldest of the two recovery pointers in the header (identified
+ using the transaction counter) with the offset of this bundle.
+\end_layout
+
+\begin_layout Enumerate
+sync.
+\end_layout
+
+\begin_layout Enumerate
+Write the new data to the file.
+\end_layout
+
+\begin_layout Standard
+Checking for recovery means identifying the latest bundle with a valid checksum
+ and using the new data checksum to ensure that it has been applied.
+ This is more expensive than the current check, but need only be done at
+ open.
+ For running databases, a separate header field can be used to indicate
+ a transaction in progress; we need only check for recovery if this is set.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Does-Not"
+
+\end_inset
+
+TDB Does Not Have Snapshot Support
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ At some point you say
+\begin_inset Quotes eld
+\end_inset
+
+use a real database
+\begin_inset Quotes erd
+\end_inset
+
+ (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+But as a thought experiment, if we implemented transactions to only overwrite
+ free entries (this is tricky: there must not be a header in each entry
+ which indicates whether it is free, but use of presence in metadata elsewhere),
+ and a pointer to the hash table, we could create an entirely new commit
+ without destroying existing data.
+ Then it would be easy to implement snapshots in a similar way.
+\end_layout
+
+\begin_layout Standard
+This would not allow arbitrary changes to the database, such as tdb_repack
+ does, and would require more space (since we have to preserve the current
+ and future entries at once).
+ If we used hash trees rather than one big hash table, we might only have
+ to rewrite some sections of the hash, too.
+\end_layout
+
+\begin_layout Standard
+We could then implement snapshots using a similar method, using multiple
+ different hash tables/free tables.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+Transactions Cannot Operate in Parallel
+\end_layout
+
+\begin_layout Standard
+This would be useless for ldb, as it hits the index records with just about
+ every update.
+ It would add significant complexity in resolving clashes, and cause all
+ transaction callers to write their code to loop in the case where the
+ transactions spuriously failed.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+ We could solve a small part of the problem by providing read-only transactions.
+ These would allow one write transaction to begin, but it could not commit
+ until all r/o transactions are done.
+ This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
+ commit.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+Default Hash Function Is Suboptimal
+\end_layout
+
+\begin_layout Standard
+The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
+ if we expand it to 64 bits), and works best when the hash bucket size is
+ a prime number (which also means a slow modulus).
+ In addition, it is highly predictable which could potentially lead to a
+ Denial of Service attack in some TDB uses.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The Jenkins lookup3 hash
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+http://burtleburtle.net/bob/c/lookup3.c
+\end_layout
+
+\end_inset
+
+ is a fast and superbly-mixing hash.
+ It's used by the Linux kernel and almost everything else.
+ This has the particular properties that it takes an initial seed, and produces
+ two 32 bit hash numbers, which we can combine into a 64-bit hash.
+\end_layout
+
+\begin_layout Standard
+The seed should be created at tdb-creation time from some random source,
+ and placed in the header.
+ This is far from foolproof, but adds a little bit of protection against
+ hash bombing.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Reliable-Traversal-Adds"
+
+\end_inset
+
+Reliable Traversal Adds Complexity
+\end_layout
+
+\begin_layout Standard
+We lock a record during traversal iteration, and try to grab that lock in
+ the delete code.
+ If that grab on delete fails, we simply mark it deleted and continue onwards;
+ traversal checks for this condition and does the delete when it moves off
+ the record.
+\end_layout
+
+\begin_layout Standard
+If traversal terminates, the dead record may be left indefinitely.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove reliability guarantees; see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "traverse-Proposed-Solution"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Fcntl Locking Adds Overhead
+\end_layout
+
+\begin_layout Standard
+Placing a fcntl lock means a system call, as does removing one.
+ This is actually one reason why transactions can be faster (everything
+ is locked once at transaction start).
+ In the uncontended case, this overhead can theoretically be eliminated.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+We tried this before with spinlock support, in the early days of TDB, and
+ it didn't make much difference except in manufactured benchmarks.
+\end_layout
+
+\begin_layout Standard
+We could use spinlocks (with futex kernel support under Linux), but it means
+ that we lose automatic cleanup when a process dies with a lock.
+ There is a method of auto-cleanup under Linux, but it's not supported by
+ other operating systems.
+ We could reintroduce a clear-if-first-style lock and sweep for dead futexes
+ on open, but that wouldn't help the normal case of one concurrent opener
+ dying.
+ Increasingly elaborate repair schemes could be considered, but they require
+ an ABI change (everyone must use them) anyway, so there's no need to do
+ this at the same time as everything else.
+\end_layout
+
+\begin_layout Subsection
+Some Transactions Don't Require Durability
+\end_layout
+
+\begin_layout Standard
+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
+ usage, and occasionally empties the results into a transactional TDB.
+ This kind of usage prioritizes performance over durability: as long as
+ we are consistent, data can be lost.
+\end_layout
+
+\begin_layout Standard
+This would be more neatly implemented inside tdb: a
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ transaction commit (ie.
+ syncless) which meant that data may be reverted on a crash.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+Unfortunately any transaction scheme which overwrites old data requires
+ a sync before that overwrite to avoid the possibility of corruption.
+\end_layout
+
+\begin_layout Standard
+It seems possible to use a scheme similar to that described in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Does-Not"
+
+\end_inset
+
+, where transactions are committed without overwriting existing data, and
+ an array of top-level pointers were available in the header.
+ If the transaction is
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ then we would not need a sync at all: existing processes would pick up
+ the new hash table and free list and work with that.
+\end_layout
+
+\begin_layout Standard
+At some later point, a sync would allow recovery of the old data into the
+ free lists (perhaps when the array of top-level pointers filled).
+ On crash, tdb_open() would examine the array of top levels, and apply the
+ transactions until it encountered an invalid checksum.
+\end_layout
+
+\begin_layout Subsection
+Tracing Is Fragile, Replay Is External
+\end_layout
+
+\begin_layout Standard
+The current TDB has compile-time-enabled tracing code, but it often breaks
+ as it is not enabled by default.
+ In a similar way, the ctdb code has an external wrapper which does replay
+ tracing so it can coordinate cluster-wide transactions.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "replay-attribute"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Tridge points out that an attribute can be later added to tdb_open (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) to provide replay/trace hooks, which could become the basis for this and
+ future parallel transactions and snapshot support.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\end_body
+\end_document
diff --git a/ccan/ntdb/doc/design.pdf b/ccan/ntdb/doc/design.pdf
new file mode 100644 (file)
index 0000000..8381914
Binary files /dev/null and b/ccan/ntdb/doc/design.pdf differ
diff --git a/ccan/ntdb/doc/design.txt b/ccan/ntdb/doc/design.txt
new file mode 100644 (file)
index 0000000..bd680f0
--- /dev/null
@@ -0,0 +1,1270 @@
+NTDB: Redesigning The Trivial DataBase
+
+Rusty Russell, IBM Corporation
+
+19 June 2012
+
+Abstract
+
+The Trivial DataBase on-disk format is 32 bits; with usage cases
+heading towards the 4G limit, that must change. This required
+breakage provides an opportunity to revisit TDB's other design
+decisions and reassess them.
+
+1 Introduction
+
+The Trivial DataBase was originally written by Andrew Tridgell as
+a simple key/data pair storage system with the same API as dbm,
+but allowing multiple readers and writers while being small
+enough (< 1000 lines of C) to include in SAMBA. The simple design
+created in 1999 has proven surprisingly robust and performant,
+used in Samba versions 3 and 4 as well as numerous other
+projects. Its useful life was greatly increased by the
+(backwards-compatible!) addition of transaction support in 2005.
+
+The wider variety and greater demands of TDB-using code have led
+to some organic growth of the API, as well as some compromises on
+the implementation. None of these, by themselves, are seen as
+show-stoppers, but the cumulative effect is a loss of elegance
+over the initial, simple TDB implementation. Here is a table of
+the approximate number of lines of implementation code and number
+of API functions at the end of each year:
+
+
++-----------+----------------+--------------------------------+
+| Year End  | API Functions  | Lines of C Code Implementation |
++-----------+----------------+--------------------------------+
++-----------+----------------+--------------------------------+
+|   1999    |      13        |              1195              |
++-----------+----------------+--------------------------------+
+|   2000    |      24        |              1725              |
++-----------+----------------+--------------------------------+
+|   2001    |      32        |              2228              |
++-----------+----------------+--------------------------------+
+|   2002    |      35        |              2481              |
++-----------+----------------+--------------------------------+
+|   2003    |      35        |              2552              |
++-----------+----------------+--------------------------------+
+|   2004    |      40        |              2584              |
++-----------+----------------+--------------------------------+
+|   2005    |      38        |              2647              |
++-----------+----------------+--------------------------------+
+|   2006    |      52        |              3754              |
++-----------+----------------+--------------------------------+
+|   2007    |      66        |              4398              |
++-----------+----------------+--------------------------------+
+|   2008    |      71        |              4768              |
++-----------+----------------+--------------------------------+
+|   2009    |      73        |              5715              |
++-----------+----------------+--------------------------------+
+
+
+This review is an attempt to catalog and address all the known
+issues with TDB and create solutions which address the problems
+without significantly increasing complexity; all involved are far
+too aware of the dangers of second system syndrome in rewriting a
+successful project like this.
+
+Note: the final decision was to make ntdb a separate library,
+with a separate 'ntdb' namespace so both can potentially be
+linked together. This document still refers to “tdb” everywhere,
+for simplicity.
+
+2 API Issues
+
+2.1 tdb_open_ex Is Not Expandable
+
+The tdb_open() call was expanded to tdb_open_ex(), which added an
+optional hashing function and an optional logging function
+argument. Additional arguments to open would require the
+introduction of a tdb_open_ex2 call etc.
+
+2.1.1 Proposed Solution<attributes>
+
+tdb_open() will take a linked-list of attributes:
+
+enum tdb_attribute {
+
+    TDB_ATTRIBUTE_LOG = 0,
+
+    TDB_ATTRIBUTE_HASH = 1
+
+};
+
+struct tdb_attribute_base {
+
+    enum tdb_attribute attr;
+
+    union tdb_attribute *next;
+
+};
+
+struct tdb_attribute_log {
+
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG
+*/
+
+    tdb_log_func log_fn;
+
+    void *log_private;
+
+};
+
+struct tdb_attribute_hash {
+
+    struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH
+*/
+
+    tdb_hash_func hash_fn;
+
+    void *hash_private;
+
+};
+
+union tdb_attribute {
+
+    struct tdb_attribute_base base;
+
+    struct tdb_attribute_log log;
+
+    struct tdb_attribute_hash hash;
+
+};
+
+This allows future attributes to be added, even if this expands
+the size of the union.
+
+2.1.2 Status
+
+Complete.
+
+2.2 tdb_traverse Makes Impossible Guarantees
+
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
+and it was thought that it was important to guarantee that all
+records which exist at the start and end of the traversal would
+be included, and no record would be included twice.
+
+This adds complexity (see[Reliable-Traversal-Adds]) and does not
+work anyway for records which are altered (in particular, those
+which are expanded may be effectively deleted and re-added behind
+the traversal).
+
+2.2.1 <traverse-Proposed-Solution>Proposed Solution
+
+Abandon the guarantee. You will see every record if no changes
+occur during your traversal, otherwise you will see some subset.
+You can prevent changes by using a transaction or the locking
+API.
+
+2.2.2 Status
+
+Complete. Delete-during-traverse will still delete every record,
+too (assuming no other changes).
+
+2.3 Nesting of Transactions Is Fraught
+
+TDB has alternated between allowing nested transactions and not
+allowing them. Various paths in the Samba codebase assume that
+transactions will nest, and in a sense they can: the operation is
+only committed to disk when the outer transaction is committed.
+There are two problems, however:
+
+1. Canceling the inner transaction will cause the outer
+  transaction commit to fail, and will not undo any operations
+  since the inner transaction began. This problem is soluble with
+  some additional internal code.
+
+2. An inner transaction commit can be cancelled by the outer
+  transaction. This is desirable in the way which Samba's
+  database initialization code uses transactions, but could be a
+  surprise to any users expecting a successful transaction commit
+  to expose changes to others.
+
+The current solution is to specify the behavior at tdb_open(),
+with the default currently that nested transactions are allowed.
+This flag can also be changed at runtime.
+
+2.3.1 Proposed Solution
+
+Given the usage patterns, it seems that the “least-surprise”
+behavior of disallowing nested transactions should become the
+default. Additionally, it seems the outer transaction is the only
+code which knows whether inner transactions should be allowed, so
+a flag to indicate this could be added to tdb_transaction_start.
+However, this behavior can be simulated with a wrapper which uses
+tdb_add_flags() and tdb_remove_flags(), so the API should not be
+expanded for this relatively-obscure case.
+
+2.3.2 Status
+
+Complete; the nesting flag has been removed.
+
+2.4 Incorrect Hash Function is Not Detected
+
+tdb_open_ex() allows the calling code to specify a different hash
+function to use, but does not check that all other processes
+accessing this tdb are using the same hash function. The result
+is that records are missing from tdb_fetch().
+
+2.4.1 Proposed Solution
+
+The header should contain an example hash result (eg. the hash of
+0xdeadbeef), and tdb_open_ex() should check that the given hash
+function produces the same answer, or fail the tdb_open call.
+
+2.4.2 Status
+
+Complete.
+
+2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+
+In response to scalability issues with the free list ([TDB-Freelist-Is]
+) two API workarounds have been incorporated in TDB:
+tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
+latter actually calls the former with an argument of “5”.
+
+This code allows deleted records to accumulate without putting
+them in the free list. On delete we iterate through each chain
+and free them in a batch if there are more than max_dead entries.
+These are never otherwise recycled except as a side-effect of a
+tdb_repack.
+
+2.5.1 Proposed Solution
+
+With the scalability problems of the freelist solved, this API
+can be removed. The TDB_VOLATILE flag may still be useful as a
+hint that store and delete of records will be at least as common
+as fetch in order to allow some internal tuning, but initially
+will become a no-op.
+
+2.5.2 Status
+
+Complete. Unknown flags cause tdb_open() to fail as well, so they
+can be detected at runtime.
+
+2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
+  In The Same Process
+
+No process can open the same TDB twice; we check and disallow it.
+This is an unfortunate side-effect of fcntl locks, which operate
+on a per-file rather than per-file-descriptor basis, and do not
+nest. Thus, closing any file descriptor on a file clears all the
+locks obtained by this process, even if they were placed using a
+different file descriptor!
+
+Note that even if this were solved, deadlock could occur if
+operations were nested: this is a more manageable programming
+error in most cases.
+
+2.6.1 Proposed Solution
+
+We could lobby POSIX to fix the perverse rules, or at least lobby
+Linux to violate them so that the most common implementation does
+not have this restriction. This would be a generally good idea
+for other fcntl lock users.
+
+Samba uses a wrapper which hands out the same tdb_context to
+multiple callers if this happens, and does simple reference
+counting. We should do this inside the tdb library, which already
+emulates lock nesting internally; it would need to recognize when
+deadlock occurs within a single process. This would create a new
+failure mode for tdb operations (while we currently handle
+locking failures, they are impossible in normal use and a process
+encountering them can do little but give up).
+
+I do not see benefit in an additional tdb_open flag to indicate
+whether re-opening is allowed, though there may be some
+benefit to adding a call to detect when a tdb_context is shared,
+to allow others to create such an API.
+
+2.6.2 Status
+
+Complete.
+
+2.7 TDB API Is Not POSIX Thread-safe
+
+The TDB API uses an error code which can be queried after an
+operation to determine what went wrong. This programming model
+does not work with threads, unless specific additional guarantees
+are given by the implementation. In addition, even
+otherwise-independent threads cannot open the same TDB (as in[TDB-Files-Cannot]
+).
+
+2.7.1 Proposed Solution
+
+Rearchitecting the API to include a tdb_errcode pointer would be a
+great deal of churn, but fortunately most functions return 0 on
+success and -1 on error: we can change these to return 0 on
+success and a negative error code on error, and the API remains
+similar to previous. The tdb_fetch, tdb_firstkey and tdb_nextkey
+functions need to take a TDB_DATA pointer and return an error
+code. It is also simpler to have tdb_nextkey replace its key
+argument in place, freeing up any old .dptr.
+
+Internal locking is required to make sure that fcntl locks do not
+overlap between threads, and also that the global list of tdbs is
+maintained.
+
+The aim is that building tdb with -DTDB_PTHREAD will result in a
+pthread-safe version of the library, and otherwise no overhead
+will exist. Alternatively, a hooking mechanism similar to that
+proposed for[Proposed-Solution-locking-hook] could be used to
+enable pthread locking at runtime.
+
+2.7.2 Status
+
+Incomplete; API has been changed but thread safety has not been
+implemented.
+
+2.8 *_nonblock Functions And *_mark Functions Expose
+  Implementation
+
+CTDB[footnote:
+Clustered TDB, see http://ctdb.samba.org
+] wishes to operate on TDB in a non-blocking manner. This is
+currently done as follows:
+
+1. Call the _nonblock variant of an API function (eg.
+  tdb_lockall_nonblock). If this fails:
+
+2. Fork a child process, and wait for it to call the normal
+  variant (eg. tdb_lockall).
+
+3. If the child succeeds, call the _mark variant to indicate we
+  already have the locks (eg. tdb_lockall_mark).
+
+4. Upon completion, tell the child to release the locks (eg.
+  tdb_unlockall).
+
+5. Indicate to tdb that it should consider the locks removed (eg.
+  tdb_unlockall_mark).
+
+There are several issues with this approach. Firstly, adding two
+new variants of each function clutters the API for an obscure
+use, and so not all functions have three variants. Secondly, it
+assumes that all paths of the functions ask for the same locks,
+otherwise the parent process will have to get a lock which the
+child doesn't have under some circumstances. I don't believe this
+is currently the case, but it constrains the implementation.
+
+2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
+
+Implement a hook for locking methods, so that the caller can
+control the calls to create and remove fcntl locks. In this
+scenario, ctdbd would operate as follows:
+
+1. Call the normal API function, eg tdb_lockall().
+
+2. When the lock callback comes in, check if the child has the
+  lock. Initially, this is always false. If so, return 0.
+  Otherwise, try to obtain it in non-blocking mode. If that
+  fails, return EWOULDBLOCK.
+
+3. Release locks in the unlock callback as normal.
+
+4. If tdb_lockall() fails, see if we recorded a lock failure; if
+  so, call the child to repeat the operation.
+
+5. The child records what locks it obtains, and returns that
+  information to the parent.
+
+6. When the child has succeeded, goto 1.
+
+This is flexible enough to handle any potential locking scenario,
+even when lock requirements change. It can be optimized so that
+the parent does not release locks, just tells the child which
+locks it doesn't need to obtain.
+
+It also keeps the complexity out of the API, and in ctdbd where
+it is needed.
+
+2.8.2 Status
+
+Complete.
+
+2.9 tdb_chainlock Functions Expose Implementation
+
+tdb_chainlock locks some number of records, including the record
+indicated by the given key. This gave atomicity guarantees;
+no-one can start a transaction, alter, read or delete that key
+while the lock is held.
+
+It also makes the same guarantee for any other key in the chain,
+which is an internal implementation detail and potentially a
+cause for deadlock.
+
+2.9.1 Proposed Solution
+
+None. It would be nice to have an explicit single entry lock
+which affected no other keys. Unfortunately, this won't work for
+an entry which doesn't exist. Thus while chainlock may be
+implemented more efficiently for the existing case, it will still
+have overlap issues with the non-existing case. So it is best to
+keep the current (lack of) guarantee about which records will be
+affected to avoid constraining our implementation.
+
+2.10 Signal Handling is Not Race-Free
+
+The tdb_setalarm_sigptr() call allows the caller's signal handler
+to indicate that the tdb locking code should return with a
+failure, rather than trying again when a signal is received (and
+errno == EAGAIN). This is usually used to implement timeouts.
+
+Unfortunately, this does not work in the case where the signal is
+received before the tdb code enters the fcntl() call to place the
+lock: the code will sleep within the fcntl() code, unaware that
+the signal wants it to exit. In the case of long timeouts, this
+does not happen in practice.
+
+2.10.1 Proposed Solution
+
+The locking hooks proposed in[Proposed-Solution-locking-hook]
+would allow the user to decide on whether to fail the lock
+acquisition on a signal. This allows the caller to choose their
+own compromise: they could narrow the race by checking
+immediately before the fcntl call.[footnote:
+It may be possible to make this race-free in some implementations
+by having the signal handler alter the struct flock to make it
+invalid. This will cause the fcntl() lock call to fail with
+EINVAL if the signal occurs before the kernel is entered,
+otherwise EAGAIN.
+]
+
+2.10.2 Status
+
+Complete.
+
+2.11 The API Uses Gratuitous Typedefs, Capitals
+
+typedefs are useful for providing source compatibility when types
+can differ across implementations, or arguably in the case of
+function pointer definitions which are hard for humans to parse.
+Otherwise it is simply obfuscation and pollutes the namespace.
+
+Capitalization is usually reserved for compile-time constants and
+macros.
+
+  TDB_CONTEXT There is no reason to use this over 'struct
+  tdb_context'; the definition isn't visible to the API user
+  anyway.
+
+  TDB_DATA There is no reason to use this over struct TDB_DATA;
+  the struct needs to be understood by the API user.
+
+  struct TDB_DATA This would normally be called 'struct
+  tdb_data'.
+
+  enum TDB_ERROR Similarly, this would normally be enum
+  tdb_error.
+
+2.11.1 Proposed Solution
+
+None. Introducing lower case variants would please pedants like
+myself, but if it were done the existing ones should be kept.
+There is little point forcing a purely cosmetic change upon tdb
+users.
+
+2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The
+  Private Pointer
+
+For API compatibility reasons, the logging function needs to call
+tdb_get_logging_private() to retrieve the pointer registered by
+the tdb_open_ex for logging.
+
+2.12.1 Proposed Solution
+
+It should simply take an extra argument, since we are prepared to
+break the API/ABI.
+
+2.12.2 Status
+
+Complete.
+
+2.13 Various Callback Functions Are Not Typesafe
+
+The callback functions in tdb_set_logging_function (after[tdb_log_func-Doesnt-Take]
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read
+and tdb_check all take void * and must internally convert it to
+the argument type they were expecting.
+
+If this type changes, the compiler will not produce warnings on
+the callers, since it only sees void *.
+
+2.13.1 Proposed Solution
+
+With careful use of macros, we can create callback functions
+which give a warning when used on gcc and the types of the
+callback and its private argument differ. Unsupported compilers
+will not give a warning, which is no worse than now. In addition,
+the callbacks become clearer, as they need not use void * for
+their parameter.
+
+See CCAN's typesafe_cb module at
+http://ccan.ozlabs.org/info/typesafe_cb.html
+
+2.13.2 Status
+
+Complete.
+
+2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
+  tdb_reopen_all Problematic
+
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB
+file should be cleared if the caller discovers it is the only
+process with the TDB open. However, if any caller does not
+specify TDB_CLEAR_IF_FIRST it will not be detected, so will have
+the TDB erased underneath them (usually resulting in a crash).
+
+There is a similar issue on fork(); if the parent exits (or
+otherwise closes the tdb) before the child calls tdb_reopen_all()
+to establish the lock used to indicate the TDB is opened by
+someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe
+it alone has opened the TDB and will erase it.
+
+2.14.1 Proposed Solution
+
+Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
+see[TDB_CLEAR_IF_FIRST-Imposes-Performance].
+
+2.14.2 Status
+
+Complete. An open hook is provided to replicate this
+functionality if required.
+
+2.15 Extending The Header Is Difficult
+
+We have reserved (zeroed) words in the TDB header, which can be
+used for future features. If the future features are compulsory,
+the version number must be updated to prevent old code from
+accessing the database. But if the future feature is optional, we
+have no way of telling if older code is accessing the database or
+not.
+
+2.15.1 Proposed Solution
+
+The header should contain a“format variant” value (64-bit). This
+is divided into two 32-bit parts:
+
+1. The lower part reflects the format variant understood by code
+  accessing the database.
+
+2. The upper part reflects the format variant you must understand
+  to write to the database (otherwise you can only open for
+  reading).
+
+The latter field can only be written at creation time, the former
+should be written under the OPEN_LOCK when opening the database
+for writing, if the variant of the code is lower than the current
+lowest variant.
+
+This should allow backwards-compatible features to be added, and
+detection if older code (which doesn't understand the feature)
+writes to the database.
+
+2.15.2 Status
+
+Complete.
+
+2.16 Record Headers Are Not Expandable
+
+If we later want to add (say) checksums on keys and data, it
+would require another format change, which we'd like to avoid.
+
+2.16.1 Proposed Solution
+
+We often have extra padding at the tail of a record. If we ensure
+that the first byte (if any) of this padding is zero, we will
+have a way for future changes to detect code which doesn't
+understand a new format: the new code would write (say) a 1 at
+the tail, and thus if there is no tail or the first byte is 0, we
+would know the extension is not present on that record.
+
+2.16.2 Status
+
+Complete.
+
+2.17 TDB Does Not Use Talloc
+
+Many users of TDB (particularly Samba) use the talloc allocator,
+and thus have to wrap TDB in a talloc context to use it
+conveniently.
+
+2.17.1 Proposed Solution
+
+The allocation within TDB is not complicated enough to justify
+the use of talloc, and I am reluctant to force another
+(excellent) library on TDB users. Nonetheless a compromise is
+possible. An attribute (see[attributes]) can be added later to
+tdb_open() to provide an alternate allocation mechanism,
+specifically for talloc but usable by any other allocator (which
+would ignore the“context” argument).
+
+This would form a talloc hierarchy as expected, but the caller
+would still have to attach a destructor to the tdb context
+returned from tdb_open to close it. All TDB_DATA fields would be
+children of the tdb_context, and the caller would still have to
+manage them (using talloc_free() or talloc_steal()).
+
+2.17.2 Status
+
+Complete, using the NTDB_ATTRIBUTE_ALLOCATOR attribute.
+
+3 Performance And Scalability Issues
+
+3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
+  Imposes Performance Penalty
+
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is
+placed at offset 4 (aka. the ACTIVE_LOCK). While these locks
+never conflict in normal tdb usage, they do add substantial
+overhead for most fcntl lock implementations when the kernel
+scans to detect if a lock conflict exists. This is often a single
+linked list, making the time to acquire and release a fcntl lock
+O(N) where N is the number of processes with the TDB open, not
+the number actually doing work.
+
+In a Samba server it is common to have huge numbers of clients
+sitting idle, and thus they have weaned themselves off the
+TDB_CLEAR_IF_FIRST flag.[footnote:
+There is a flag to tdb_reopen_all() which is used for this
+optimization: if the parent process will outlive the child, the
+child does not need the ACTIVE_LOCK. This is a workaround for
+this very performance issue.
+]
+
+3.1.1 Proposed Solution
+
+Remove the flag. It was a neat idea, but even trivial servers
+tend to know when they are initializing for the first time and
+can simply unlink the old tdb at that point.
+
+3.1.2 Status
+
+Complete.
+
+3.2 TDB Files Have a 4G Limit
+
+This seems to be becoming an issue (so much for“trivial”!),
+particularly for ldb.
+
+3.2.1 Proposed Solution
+
+A new, incompatible TDB format which uses 64 bit offsets
+internally rather than 32 bit as now. For simplicity of endian
+conversion (which TDB does on the fly if required), all values
+will be 64 bit on disk. In practice, some upper bits may be used
+for other purposes, but at least 56 bits will be available for
+file offsets.
+
+tdb_open() will automatically detect the old version, and even
+create them if TDB_VERSION6 is specified to tdb_open.
+
+32 bit processes will still be able to access TDBs larger than 4G
+(assuming that their off_t allows them to seek to 64 bits), they
+will gracefully fall back as they fail to mmap. This can happen
+already with large TDBs.
+
+Old versions of tdb will fail to open the new TDB files (since 28
+August 2009, commit 398d0c29290: prior to that any unrecognized
+file format would be erased and initialized as a fresh tdb!)
+
+3.2.2 Status
+
+Complete.
+
+3.3 TDB Records Have a 4G Limit
+
+This has not been a reported problem, and the API uses size_t
+which can be 64 bit on 64 bit platforms. However, other limits
+may have made such an issue moot.
+
+3.3.1 Proposed Solution
+
+Record sizes will be 64 bit, with an error returned on 32 bit
+platforms which try to access such records (the current
+implementation would return TDB_ERR_OOM in a similar case). It
+seems unlikely that 32 bit keys will be a limitation, so the
+implementation may not support this (see[sub:Records-Incur-A]).
+
+3.3.2 Status
+
+Complete.
+
+3.4 Hash Size Is Determined At TDB Creation Time
+
+TDB contains a number of hash chains in the header; the number is
+specified at creation time, and defaults to 131. This is such a
+bottleneck on large databases (as each hash chain gets quite
+long), that LDB uses 10,000 for this hash. In general it is
+impossible to know what the 'right' answer is at database
+creation time.
+
+3.4.1 <sub:Hash-Size-Solution>Proposed Solution
+
+After comprehensive performance testing on various scalable hash
+variants[footnote:
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94
+This was annoying because I was previously convinced that an
+expanding tree of hashes would be very close to optimal.
+], it became clear that it is hard to beat a straight linear hash
+table which doubles in size when it reaches saturation.
+Unfortunately, altering the hash table introduces serious locking
+complications: the entire hash table needs to be locked to
+enlarge the hash table, and others might be holding locks.
+Particularly insidious are insertions done under tdb_chainlock.
+
+Thus an expanding layered hash will be used: an array of hash
+groups, with each hash group exploding into pointers to lower
+hash groups once it fills, turning into a hash tree. This has
+implications for locking: we must lock the entire group in case
+we need to expand it, yet we don't know how deep the tree is at
+that point.
+
+Note that bits from the hash table entries should be stolen to
+hold more hash bits to reduce the penalty of collisions. We can
+use the otherwise-unused lower 3 bits. If we limit the size of
+the database to 64 exabytes, we can use the top 8 bits of the
+hash entry as well. These 11 bits would reduce false positives
+down to 1 in 2000 which is more than we need: we can use one of
+the bits to indicate that the extra hash bits are valid. This
+means we can choose not to re-hash all entries when we expand a
+hash group; simply use the next bits we need and mark them
+invalid.
+
+3.4.2 Status
+
+Ignore. Scaling the hash automatically proved inefficient at
+small hash sizes; we default to an 8192-element hash (changeable
+via NTDB_ATTRIBUTE_HASHSIZE), and when buckets clash we expand to
+an array of hash entries. This scales slightly better than the
+tdb chain (due to the 8 top bits containing extra hash).
+
+3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
+
+TDB uses a single linked list for the free list. Allocation
+occurs as follows, using heuristics which have evolved over time:
+
+1. Get the free list lock for this whole operation.
+
+2. Multiply length by 1.25, so we always over-allocate by 25%.
+
+3. Set the slack multiplier to 1.
+
+4. Examine the current freelist entry: if it is > length but <
+  the current best case, remember it as the best case.
+
+5. Multiply the slack multiplier by 1.05.
+
+6. If our best fit so far is less than length * slack multiplier,
+  return it. The slack will be turned into a new free record if
+  it's large enough.
+
+7. Otherwise, go onto the next freelist entry.
+
+Deleting a record occurs as follows:
+
+1. Lock the hash chain for this whole operation.
+
+2. Walk the chain to find the record, keeping the prev pointer
+  offset.
+
+3. If max_dead is non-zero:
+
+  (a) Walk the hash chain again and count the dead records.
+
+  (b) If it's more than max_dead, bulk free all the dead ones
+    (similar to steps 4 and below, but the lock is only obtained
+    once).
+
+  (c) Simply mark this record as dead and return.
+
+4. Get the free list lock for the remainder of this operation.
+
+5. <right-merging>Examine the following block to see if it is
+  free; if so, enlarge the current block and remove that block
+  from the free list. This was disabled, as removal from the free
+  list was O(entries-in-free-list).
+
+6. Examine the preceding block to see if it is free: for this
+  reason, each block has a 32-bit tailer which indicates its
+  length. If it is free, expand it to cover our new block and
+  return.
+
+7. Otherwise, prepend ourselves to the free list.
+
+Disabling right-merging (step[right-merging]) causes
+fragmentation; the other heuristics proved insufficient to
+address this, so the final answer to this was that when we expand
+the TDB file inside a transaction commit, we repack the entire
+tdb.
+
+The single list lock limits our allocation rate; due to the other
+issues this is not currently seen as a bottleneck.
+
+3.5.1 Proposed Solution
+
+The first step is to remove all the current heuristics, as they
+obviously interact, then examine them once the lock contention is
+addressed.
+
+The free list must be split to reduce contention. Assuming
+perfect free merging, we can at most have 1 free list entry for
+each entry. This implies that the number of free lists is related
+to the size of the hash table, but as it is rare to walk a large
+number of free list entries we can use far fewer, say 1/32 of the
+number of hash buckets.
+
+It seems tempting to try to reuse the hash implementation which
+we use for records here, but we have two ways of searching for
+free entries: for allocation we search by size (and possibly
+zone) which produces too many clashes for our hash table to
+handle well, and for coalescing we search by address. Thus an
+array of doubly-linked free lists seems preferable.
+
+There are various benefits in using per-size free lists (see[sub:TDB-Becomes-Fragmented]
+) but it's not clear this would reduce contention in the common
+case where all processes are allocating/freeing the same size.
+Thus we almost certainly need to divide in other ways: the most
+obvious is to divide the file into zones, and using a free list
+(or table of free lists) for each. This approximates address
+ordering.
+
+Unfortunately it is difficult to know what heuristics should be
+used to determine zone sizes, and our transaction code relies on
+being able to create a“recovery area” by simply appending to the
+file (difficult if it would need to create a new zone header).
+Thus we use a linked-list of free tables; currently we only ever
+create one, but if there is more than one we choose one at random
+to use. In future we may use heuristics to add new free tables on
+contention. We only expand the file when all free tables are
+exhausted.
+
+The basic algorithm is as follows. Freeing is simple:
+
+1. Identify the correct free list.
+
+2. Lock the corresponding list.
+
+3. Re-check the list (we didn't have a lock, sizes could have
+  changed): relock if necessary.
+
+4. Place the freed entry in the list.
+
+Allocation is a little more complicated, as we perform delayed
+coalescing at this point:
+
+1. Pick a free table; usually the previous one.
+
+2. Lock the corresponding list.
+
+3. If the top entry is large enough, remove it from the list and
+  return it.
+
+4. Otherwise, coalesce entries in the list. If there was no entry
+  large enough, unlock the list and try the next largest list.
+
+5. If no list has an entry which meets our needs, try the next
+  free table.
+
+6. If no zone satisfies, expand the file.
+
+This optimizes rapid insert/delete of free list entries by not
+coalescing them all the time. First-fit address ordering
+seems to be fairly good for keeping fragmentation low
+(see[sub:TDB-Becomes-Fragmented]). Note that address ordering
+does not need a tailer to coalesce, though if we needed one we
+could have one cheaply: see[sub:Records-Incur-A].
+
+Each free entry has the free table number in the header: less
+than 255. It also contains a doubly-linked list for easy
+deletion.
+
+3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
+
+Much of this is a result of allocation strategy[footnote:
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
+ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
+] and deliberate hobbling of coalescing; internal fragmentation
+(aka overallocation) is deliberately set at 25%, and external
+fragmentation is only cured by the decision to repack the entire
+db when a transaction commit needs to enlarge the file.
+
+3.6.1 Proposed Solution
+
+The 25% overhead on allocation works in practice for ldb because
+indexes tend to expand by one record at a time. This internal
+fragmentation can be resolved by having an“expanded” bit in the
+header to note entries that have previously expanded, and
+allocating more space for them.
+
+There is a spectrum of possible solutions for external
+fragmentation: one is to use a fragmentation-avoiding allocation
+strategy such as best-fit address-order allocator. The other end
+of the spectrum would be to use a bump allocator (very fast and
+simple) and simply repack the file when we reach the end.
+
+There are three problems with efficient fragmentation-avoiding
+allocators: they are non-trivial, they tend to use a single free
+list for each size, and there's no evidence that tdb allocation
+patterns will match those recorded for general allocators (though
+it seems likely).
+
+Thus we don't spend too much effort on external fragmentation; we
+will be no worse than the current code if we need to repack on
+occasion. More effort is spent on reducing freelist contention,
+and reducing overhead.
+
+3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
+
+Each TDB record has a header as follows:
+
+struct tdb_record {
+
+        tdb_off_t next; /* offset of the next record in the list
+*/
+
+        tdb_len_t rec_len; /* total byte length of record */
+
+        tdb_len_t key_len; /* byte length of key */
+
+        tdb_len_t data_len; /* byte length of data */
+
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+
+        uint32_t magic;   /* try to catch errors */
+
+        /* the following union is implied:
+
+                union {
+
+                        char record[rec_len];
+
+                        struct {
+
+                                char key[key_len];
+
+                                char data[data_len];
+
+                        }
+
+                        uint32_t totalsize; (tailer)
+
+                }
+
+        */
+
+};
+
+Naively, this would double to a 56-byte overhead on a 64 bit
+implementation.
+
+3.7.1 Proposed Solution
+
+We can use various techniques to reduce this for an allocated
+block:
+
+1. The 'next' pointer is not required, as we are using a flat
+  hash table.
+
+2. 'rec_len' can instead be expressed as an addition to key_len
+  and data_len (it accounts for wasted or overallocated length in
+  the record). Since the record length is always a multiple of 8,
+  we can conveniently fit it in 32 bits (representing up to 35
+  bits).
+
+3. 'key_len' and 'data_len' can be reduced. I'm unwilling to
+  restrict 'data_len' to 32 bits, but instead we can combine the
+  two into one 64-bit field and using a 5 bit value which
+  indicates at what bit to divide the two. Keys are unlikely to
+  scale as fast as data, so I'm assuming a maximum key size of 32
+  bits.
+
+4. 'full_hash' is used to avoid a memcmp on the“miss” case, but
+  this is diminishing returns after a handful of bits (at 10
+  bits, it reduces 99.9% of false memcmp). As an aside, as the
+  lower bits are already incorporated in the hash table
+  resolution, the upper bits should be used here. Note that it's
+  not clear that these bits will be a win, given the extra bits
+  in the hash table itself (see[sub:Hash-Size-Solution]).
+
+5. 'magic' does not need to be enlarged: it currently reflects
+  one of 5 values (used, free, dead, recovery, and
+  unused_recovery). It is useful for quick sanity checking
+  however, and should not be eliminated.
+
+6. 'tailer' is only used to coalesce free blocks (so a block to
+  the right can find the header to check if this block is free).
+  This can be replaced by a single 'free' bit in the header of
+  the following block (and the tailer only exists in free
+  blocks).[footnote:
+This technique from Thomas Standish. Data Structure Techniques.
+Addison-Wesley, Reading, Massachusetts, 1980.
+] The current proposed coalescing algorithm doesn't need this,
+  however.
+
+This produces a 16 byte used header like this:
+
+struct tdb_used_record {
+
+        uint32_t used_magic : 16,
+
+
+
+                 key_data_divide: 5,
+
+                 top_hash: 11;
+
+        uint32_t extra_octets;
+
+        uint64_t key_and_data_len;
+
+};
+
+And a free record like this:
+
+struct tdb_free_record {
+
+        uint64_t free_magic: 8,
+
+                   prev : 56;
+
+
+
+        uint64_t free_table: 8,
+
+                 total_length : 56;
+
+        uint64_t next;
+
+};
+
+Note that by limiting valid offsets to 56 bits, we can pack
+everything we need into 3 64-bit words, meaning our minimum
+record size is 8 bytes.
+
+3.7.2 Status
+
+Complete.
+
+3.8 Transaction Commit Requires 4 fdatasync
+
+The current transaction algorithm is:
+
+1. write_recovery_data();
+
+2. sync();
+
+3. write_recovery_header();
+
+4. sync();
+
+5. overwrite_with_new_data();
+
+6. sync();
+
+7. remove_recovery_header();
+
+8. sync();
+
+On current ext3, each sync flushes all data to disk, so the next
+3 syncs are relatively expensive. But this could become a
+performance bottleneck on other filesystems such as ext4.
+
+3.8.1 Proposed Solution
+
+Neil Brown points out that this is overzealous, and only one sync
+is needed:
+
+1. Bundle the recovery data, a transaction counter and a strong
+  checksum of the new data.
+
+2. Strong checksum that whole bundle.
+
+3. Store the bundle in the database.
+
+4. Overwrite the oldest of the two recovery pointers in the
+  header (identified using the transaction counter) with the
+  offset of this bundle.
+
+5. sync.
+
+6. Write the new data to the file.
+
+Checking for recovery means identifying the latest bundle with a
+valid checksum and using the new data checksum to ensure that it
+has been applied. This is more expensive than the current check,
+but need only be done at open. For running databases, a separate
+header field can be used to indicate a transaction in progress;
+we need only check for recovery if this is set.
+
+3.8.2 Status
+
+Deferred.
+
+3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
+
+3.9.1 Proposed Solution
+
+None. At some point you say“use a real database” (but see[replay-attribute]
+).
+
+But as a thought experiment, if we implemented transactions to
+only overwrite free entries (this is tricky: there must not be a
+header in each entry which indicates whether it is free, but use
+of presence in metadata elsewhere), and a pointer to the hash
+table, we could create an entirely new commit without destroying
+existing data. Then it would be easy to implement snapshots in a
+similar way.
+
+This would not allow arbitrary changes to the database, such as
+tdb_repack does, and would require more space (since we have to
+preserve the current and future entries at once). If we used hash
+trees rather than one big hash table, we might only have to
+rewrite some sections of the hash, too.
+
+We could then implement snapshots using a similar method, using
+multiple different hash tables/free tables.
+
+3.9.2 Status
+
+Deferred.
+
+3.10 Transactions Cannot Operate in Parallel
+
+This would be useless for ldb, as it hits the index records with
+just about every update. It would add significant complexity in
+resolving clashes, and cause the all transaction callers to write
+their code to loop in the case where the transactions spuriously
+failed.
+
+3.10.1 Proposed Solution
+
+None (but see[replay-attribute]). We could solve a small part of
+the problem by providing read-only transactions. These would
+allow one write transaction to begin, but it could not commit
+until all r/o transactions are done. This would require a new
+RO_TRANSACTION_LOCK, which would be upgraded on commit.
+
+3.10.2 Status
+
+Deferred.
+
+3.11 Default Hash Function Is Suboptimal
+
+The Knuth-inspired multiplicative hash used by tdb is fairly slow
+(especially if we expand it to 64 bits), and works best when the
+hash bucket size is a prime number (which also means a slow
+modulus). In addition, it is highly predictable which could
+potentially lead to a Denial of Service attack in some TDB uses.
+
+3.11.1 Proposed Solution
+
+The Jenkins lookup3 hash[footnote:
+http://burtleburtle.net/bob/c/lookup3.c
+] is a fast and superbly-mixing hash. It's used by the Linux
+kernel and almost everything else. This has the particular
+properties that it takes an initial seed, and produces two 32 bit
+hash numbers, which we can combine into a 64-bit hash.
+
+The seed should be created at tdb-creation time from some random
+source, and placed in the header. This is far from foolproof, but
+adds a little bit of protection against hash bombing.
+
+3.11.2 Status
+
+Complete.
+
+3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
+
+We lock a record during traversal iteration, and try to grab that
+lock in the delete code. If that grab on delete fails, we simply
+mark it deleted and continue onwards; traversal checks for this
+condition and does the delete when it moves off the record.
+
+If traversal terminates, the dead record may be left
+indefinitely.
+
+3.12.1 Proposed Solution
+
+Remove reliability guarantees; see[traverse-Proposed-Solution].
+
+3.12.2 Status
+
+Complete.
+
+3.13 Fcntl Locking Adds Overhead
+
+Placing a fcntl lock means a system call, as does removing one.
+This is actually one reason why transactions can be faster
+(everything is locked once at transaction start). In the
+uncontended case, this overhead can theoretically be eliminated.
+
+3.13.1 Proposed Solution
+
+None.
+
+We tried this before with spinlock support, in the early days of
+TDB, and it didn't make much difference except in manufactured
+benchmarks.
+
+We could use spinlocks (with futex kernel support under Linux),
+but it means that we lose automatic cleanup when a process dies
+with a lock. There is a method of auto-cleanup under Linux, but
+it's not supported by other operating systems. We could
+reintroduce a clear-if-first-style lock and sweep for dead
+futexes on open, but that wouldn't help the normal case of one
+concurrent opener dying. Increasingly elaborate repair schemes
+could be considered, but they require an ABI change (everyone
+must use them) anyway, so there's no need to do this at the same
+time as everything else.
+
+3.14 Some Transactions Don't Require Durability
+
+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for
+normal (fast) usage, and occasionally empties the results into a
+transactional TDB. This kind of usage prioritizes performance
+over durability: as long as we are consistent, data can be lost.
+
+This would be more neatly implemented inside tdb: a“soft”
+transaction commit (ie. syncless) which meant that data may be
+reverted on a crash.
+
+3.14.1 Proposed Solution
+
+None.
+
+Unfortunately any transaction scheme which overwrites old data
+requires a sync before that overwrite to avoid the possibility of
+corruption.
+
+It seems possible to use a scheme similar to that described in[sub:TDB-Does-Not]
+, where transactions are committed without overwriting existing
+data, and an array of top-level pointers were available in the
+header. If the transaction is“soft” then we would not need a sync
+at all: existing processes would pick up the new hash table and
+free list and work with that.
+
+At some later point, a sync would allow recovery of the old data
+into the free lists (perhaps when the array of top-level pointers
+filled). On crash, tdb_open() would examine the array of top
+levels, and apply the transactions until it encountered an
+invalid checksum.
+
+3.15 Tracing Is Fragile, Replay Is External
+
+The current TDB has compile-time-enabled tracing code, but it
+often breaks as it is not enabled by default. In a similar way,
+the ctdb code has an external wrapper which does replay tracing
+so it can coordinate cluster-wide transactions.
+
+3.15.1 Proposed Solution<replay-attribute>
+
+Tridge points out that an attribute can be later added to
+tdb_open (see[attributes]) to provide replay/trace hooks, which
+could become the basis for this and future parallel transactions
+and snapshot support.
+
+3.15.2 Status
+
+Deferred.
diff --git a/ccan/ntdb/free.c b/ccan/ntdb/free.c
new file mode 100644 (file)
index 0000000..0d0e25f
--- /dev/null
@@ -0,0 +1,972 @@
+ /*
+   Trivial Database 2: free list/block handling
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+#include <ccan/ilog/ilog.h>
+#include <time.h>
+#include <limits.h>
+
+/* "Find last set": position of the highest set bit, via ccan's ilog64. */
+static unsigned fls64(uint64_t val)
+{
+       return ilog64(val);
+}
+
+/* In which bucket would we find a particular record size? (ignoring header) */
+unsigned int size_to_bucket(ntdb_len_t data_len)
+{
+       unsigned int bucket;
+
+       /* We can't have records smaller than this. */
+       assert(data_len >= NTDB_MIN_DATA_LEN);
+
+       /* Ignoring the header... */
+       if (data_len - NTDB_MIN_DATA_LEN <= 64) {
+               /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
+               bucket = (data_len - NTDB_MIN_DATA_LEN) / 8;
+       } else {
+               /* After that we go power of 2. */
+               bucket = fls64(data_len - NTDB_MIN_DATA_LEN) + 2;
+       }
+
+       if (unlikely(bucket >= NTDB_FREE_BUCKETS))
+               bucket = NTDB_FREE_BUCKETS - 1;
+       return bucket;
+}
+
+/* Offset of the first free table, read from the database header.
+ * May return an error-encoded offset (check NTDB_OFF_IS_ERR). */
+ntdb_off_t first_ftable(struct ntdb_context *ntdb)
+{
+       return ntdb_read_off(ntdb, offsetof(struct ntdb_header, free_table));
+}
+
+/* Offset of the free table after @ftable in the chain (0 == end).
+ * May return an error-encoded offset (check NTDB_OFF_IS_ERR). */
+ntdb_off_t next_ftable(struct ntdb_context *ntdb, ntdb_off_t ftable)
+{
+       return ntdb_read_off(ntdb, ftable + offsetof(struct ntdb_freetable,next));
+}
+
+/* Choose which free table this context uses: walk the chain once and
+ * pick one at random (reservoir-style, biased by random() draws),
+ * caching its offset and index in ntdb->ftable_off / ntdb->ftable.
+ * Returns NTDB_SUCCESS, or an error if a chain read fails. */
+enum NTDB_ERROR ntdb_ftable_init(struct ntdb_context *ntdb)
+{
+       /* Use reservoir sampling algorithm to select a free list at random. */
+       unsigned int rnd, max = 0, count = 0;
+       ntdb_off_t off;
+
+       ntdb->ftable_off = off = first_ftable(ntdb);
+       ntdb->ftable = 0;
+
+       while (off) {
+               /* Offsets read from the file can encode an error. */
+               if (NTDB_OFF_IS_ERR(off)) {
+                       return NTDB_OFF_TO_ERR(off);
+               }
+
+               rnd = random();
+               if (rnd >= max) {
+                       ntdb->ftable_off = off;
+                       ntdb->ftable = count;
+                       max = rnd;
+               }
+
+               off = next_ftable(ntdb, off);
+               count++;
+       }
+       return NTDB_SUCCESS;
+}
+
+/* Offset of a given bucket: the buckets array lives inside
+ * struct ntdb_freetable, one ntdb_off_t head word per bucket. */
+ntdb_off_t bucket_off(ntdb_off_t ftable_off, unsigned bucket)
+{
+       return ftable_off + offsetof(struct ntdb_freetable, buckets)
+               + bucket * sizeof(ntdb_off_t);
+}
+
+/* Returns free_buckets + 1, or list number to search, or -ve error. */
+static ntdb_off_t find_free_head(struct ntdb_context *ntdb,
+                               ntdb_off_t ftable_off,
+                               ntdb_off_t bucket)
+{
+       /* Speculatively search for a non-zero bucket. */
+       return ntdb_find_nonzero_off(ntdb, bucket_off(ftable_off, 0),
+                                   bucket, NTDB_FREE_BUCKETS);
+}
+
+/* Debug-only consistency check of one free list: every record must
+ * carry NTDB_FREE_MAGIC, each record's prev must match the walk, and
+ * the head's prev must point back at the tail.  abort()s on breakage;
+ * compiles to nothing unless CCAN_NTDB_DEBUG is defined. */
+static void check_list(struct ntdb_context *ntdb, ntdb_off_t b_off)
+{
+#ifdef CCAN_NTDB_DEBUG
+       ntdb_off_t off, prev = 0, first;
+       struct ntdb_free_record r;
+
+       /* Bucket head word carries flags in upper bits; mask to an offset. */
+       first = off = (ntdb_read_off(ntdb, b_off) & NTDB_OFF_MASK);
+       while (off != 0) {
+               ntdb_read_convert(ntdb, off, &r, sizeof(r));
+               if (frec_magic(&r) != NTDB_FREE_MAGIC)
+                       abort();
+               if (prev && frec_prev(&r) != prev)
+                       abort();
+               prev = off;
+               off = r.next;
+       }
+
+       if (first) {
+               /* Head's prev must be the last record we saw (the tail). */
+               ntdb_read_convert(ntdb, first, &r, sizeof(r));
+               if (frec_prev(&r) != prev)
+                       abort();
+       }
+#endif
+}
+
+/* Remove from free bucket. */
+static enum NTDB_ERROR remove_from_list(struct ntdb_context *ntdb,
+                                      ntdb_off_t b_off, ntdb_off_t r_off,
+                                      const struct ntdb_free_record *r)
+{
+       ntdb_off_t off, prev_next, head;
+       enum NTDB_ERROR ecode;
+
+       /* Is this only element in list?  Zero out bucket, and we're done. */
+       if (frec_prev(r) == r_off)
+               return ntdb_write_off(ntdb, b_off, 0);
+
+       /* off = &r->prev->next */
+       off = frec_prev(r) + offsetof(struct ntdb_free_record, next);
+
+       /* Get prev->next */
+       prev_next = ntdb_read_off(ntdb, off);
+       if (NTDB_OFF_IS_ERR(prev_next))
+               return NTDB_OFF_TO_ERR(prev_next);
+
+       /* If prev->next == 0, we were head: update bucket to point to next. */
+       if (prev_next == 0) {
+               /* We must preserve upper bits. */
+               head = ntdb_read_off(ntdb, b_off);
+               if (NTDB_OFF_IS_ERR(head))
+                       return NTDB_OFF_TO_ERR(head);
+
+               if ((head & NTDB_OFF_MASK) != r_off) {
+                       return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                         "remove_from_list:"
+                                         " %llu head %llu on list %llu",
+                                         (long long)r_off,
+                                         (long long)head,
+                                         (long long)b_off);
+               }
+               head = ((head & ~NTDB_OFF_MASK) | r->next);
+               ecode = ntdb_write_off(ntdb, b_off, head);
+               if (ecode != NTDB_SUCCESS)
+                       return ecode;
+       } else {
+               /* r->prev->next = r->next */
+               ecode = ntdb_write_off(ntdb, off, r->next);
+               if (ecode != NTDB_SUCCESS)
+                       return ecode;
+       }
+
+       /* If we were the tail, off = &head->prev. */
+       if (r->next == 0) {
+               head = ntdb_read_off(ntdb, b_off);
+               if (NTDB_OFF_IS_ERR(head))
+                       return NTDB_OFF_TO_ERR(head);
+               head &= NTDB_OFF_MASK;
+               off = head + offsetof(struct ntdb_free_record, magic_and_prev);
+       } else {
+               /* off = &r->next->prev */
+               off = r->next + offsetof(struct ntdb_free_record,
+                                        magic_and_prev);
+       }
+
+#ifdef CCAN_NTDB_DEBUG
+       /* *off == r */
+       if ((ntdb_read_off(ntdb, off) & NTDB_OFF_MASK) != r_off) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                 "remove_from_list:"
+                                 " %llu bad prev in list %llu",
+                                 (long long)r_off, (long long)b_off);
+       }
+#endif
+       /* r->next->prev = r->prev */
+       return ntdb_write_off(ntdb, off, r->magic_and_prev);
+}
+
+/* Enqueue in this free bucket: sets coalesce if we've added 128
+ * entries to it. */
+static enum NTDB_ERROR enqueue_in_free(struct ntdb_context *ntdb,
+                                     ntdb_off_t b_off,
+                                     ntdb_off_t off,
+                                     ntdb_len_t len,
+                                     bool *coalesce)
+{
+       struct ntdb_free_record new;
+       enum NTDB_ERROR ecode;
+       ntdb_off_t prev, head;
+       uint64_t magic = (NTDB_FREE_MAGIC << (64 - NTDB_OFF_UPPER_STEAL));
+
+       head = ntdb_read_off(ntdb, b_off);
+       if (NTDB_OFF_IS_ERR(head))
+               return NTDB_OFF_TO_ERR(head);
+
+       /* We only need to set ftable_and_len; rest is set in enqueue_in_free */
+       new.ftable_and_len = ((uint64_t)ntdb->ftable
+                             << (64 - NTDB_OFF_UPPER_STEAL))
+               | len;
+
+       /* new->next = head. */
+       new.next = (head & NTDB_OFF_MASK);
+
+       /* First element?  Prev points to ourselves. */
+       if (!new.next) {
+               new.magic_and_prev = (magic | off);
+       } else {
+               /* new->prev = next->prev */
+               prev = ntdb_read_off(ntdb,
+                                   new.next + offsetof(struct ntdb_free_record,
+                                                       magic_and_prev));
+               new.magic_and_prev = prev;
+               if (frec_magic(&new) != NTDB_FREE_MAGIC) {
+                       return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                         "enqueue_in_free: %llu bad head"
+                                         " prev %llu",
+                                         (long long)new.next,
+                                         (long long)prev);
+               }
+               /* next->prev = new. */
+               ecode = ntdb_write_off(ntdb, new.next
+                                     + offsetof(struct ntdb_free_record,
+                                                magic_and_prev),
+                                     off | magic);
+               if (ecode != NTDB_SUCCESS) {
+                       return ecode;
+               }
+
+#ifdef CCAN_NTDB_DEBUG
+               prev = ntdb_read_off(ntdb, frec_prev(&new)
+                                   + offsetof(struct ntdb_free_record, next));
+               if (prev != 0) {
+                       return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                         "enqueue_in_free:"
+                                         " %llu bad tail next ptr %llu",
+                                         (long long)frec_prev(&new)
+                                         + offsetof(struct ntdb_free_record,
+                                                    next),
+                                         (long long)prev);
+               }
+#endif
+       }
+
+       /* Update enqueue count, but don't set high bit: see NTDB_OFF_IS_ERR */
+       if (*coalesce)
+               head += (1ULL << (64 - NTDB_OFF_UPPER_STEAL));
+       head &= ~(NTDB_OFF_MASK | (1ULL << 63));
+       head |= off;
+
+       ecode = ntdb_write_off(ntdb, b_off, head);
+       if (ecode != NTDB_SUCCESS) {
+               return ecode;
+       }
+
+       /* It's time to coalesce if counter wrapped. */
+       if (*coalesce)
+               *coalesce = ((head & ~NTDB_OFF_MASK) == 0);
+
+       return ntdb_write_convert(ntdb, off, &new, sizeof(new));
+}
+
+/* Offset of free table number @ftable.  Fast path: the cached current
+ * table.  Otherwise walk the chain from the start; the result may be
+ * an error-encoded offset if a chain read failed along the way. */
+static ntdb_off_t ftable_offset(struct ntdb_context *ntdb, unsigned int ftable)
+{
+       ntdb_off_t off;
+       unsigned int i;
+
+       if (likely(ntdb->ftable == ftable))
+               return ntdb->ftable_off;
+
+       off = first_ftable(ntdb);
+       for (i = 0; i < ftable; i++) {
+               if (NTDB_OFF_IS_ERR(off)) {
+                       break;
+               }
+               off = next_ftable(ntdb, off);
+       }
+       return off;
+}
+
+/* Note: we unlock the current bucket if fail (-ve), or coalesce (+ve) and
+ * need to blatt the *protect record (which is set to an error). */
+static ntdb_len_t coalesce(struct ntdb_context *ntdb,
+                         ntdb_off_t off, ntdb_off_t b_off,
+                         ntdb_len_t data_len,
+                         ntdb_off_t *protect)
+{
+       ntdb_off_t end;
+       struct ntdb_free_record rec;
+       enum NTDB_ERROR ecode;
+
+       ntdb->stats.alloc_coalesce_tried++;
+       end = off + sizeof(struct ntdb_used_record) + data_len;
+
+       while (end < ntdb->file->map_size) {
+               const struct ntdb_free_record *r;
+               ntdb_off_t nb_off;
+               unsigned ftable, bucket;
+
+               r = ntdb_access_read(ntdb, end, sizeof(*r), true);
+               if (NTDB_PTR_IS_ERR(r)) {
+                       ecode = NTDB_PTR_ERR(r);
+                       goto err;
+               }
+
+               if (frec_magic(r) != NTDB_FREE_MAGIC
+                   || frec_ftable(r) == NTDB_FTABLE_NONE) {
+                       ntdb_access_release(ntdb, r);
+                       break;
+               }
+
+               ftable = frec_ftable(r);
+               bucket = size_to_bucket(frec_len(r));
+               nb_off = ftable_offset(ntdb, ftable);
+               if (NTDB_OFF_IS_ERR(nb_off)) {
+                       ntdb_access_release(ntdb, r);
+                       ecode = NTDB_OFF_TO_ERR(nb_off);
+                       goto err;
+               }
+               nb_off = bucket_off(nb_off, bucket);
+               ntdb_access_release(ntdb, r);
+
+               /* We may be violating lock order here, so best effort. */
+               if (ntdb_lock_free_bucket(ntdb, nb_off, NTDB_LOCK_NOWAIT)
+                   != NTDB_SUCCESS) {
+                       ntdb->stats.alloc_coalesce_lockfail++;
+                       break;
+               }
+
+               /* Now we have lock, re-check. */
+               ecode = ntdb_read_convert(ntdb, end, &rec, sizeof(rec));
+               if (ecode != NTDB_SUCCESS) {
+                       ntdb_unlock_free_bucket(ntdb, nb_off);
+                       goto err;
+               }
+
+               if (unlikely(frec_magic(&rec) != NTDB_FREE_MAGIC)) {
+                       ntdb->stats.alloc_coalesce_race++;
+                       ntdb_unlock_free_bucket(ntdb, nb_off);
+                       break;
+               }
+
+               if (unlikely(frec_ftable(&rec) != ftable)
+                   || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) {
+                       ntdb->stats.alloc_coalesce_race++;
+                       ntdb_unlock_free_bucket(ntdb, nb_off);
+                       break;
+               }
+
+               /* Did we just mess up a record you were hoping to use? */
+               if (end == *protect) {
+                       ntdb->stats.alloc_coalesce_iterate_clash++;
+                       *protect = NTDB_ERR_TO_OFF(NTDB_ERR_NOEXIST);
+               }
+
+               ecode = remove_from_list(ntdb, nb_off, end, &rec);
+               check_list(ntdb, nb_off);
+               if (ecode != NTDB_SUCCESS) {
+                       ntdb_unlock_free_bucket(ntdb, nb_off);
+                       goto err;
+               }
+
+               end += sizeof(struct ntdb_used_record) + frec_len(&rec);
+               ntdb_unlock_free_bucket(ntdb, nb_off);
+               ntdb->stats.alloc_coalesce_num_merged++;
+       }
+
+       /* Didn't find any adjacent free? */
+       if (end == off + sizeof(struct ntdb_used_record) + data_len)
+               return 0;
+
+       /* Before we expand, check this isn't one you wanted protected? */
+       if (off == *protect) {
+               *protect = NTDB_ERR_TO_OFF(NTDB_ERR_EXISTS);
+               ntdb->stats.alloc_coalesce_iterate_clash++;
+       }
+
+       /* OK, expand initial record */
+       ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
+       if (ecode != NTDB_SUCCESS) {
+               goto err;
+       }
+
+       if (frec_len(&rec) != data_len) {
+               ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                  "coalesce: expected data len %zu not %zu",
+                                  (size_t)data_len, (size_t)frec_len(&rec));
+               goto err;
+       }
+
+       ecode = remove_from_list(ntdb, b_off, off, &rec);
+       check_list(ntdb, b_off);
+       if (ecode != NTDB_SUCCESS) {
+               goto err;
+       }
+
+       /* Try locking violation first.  We don't allow coalesce recursion! */
+       ecode = add_free_record(ntdb, off, end - off, NTDB_LOCK_NOWAIT, false);
+       if (ecode != NTDB_SUCCESS) {
+               /* Need to drop lock.  Can't rely on anything stable. */
+               ntdb->stats.alloc_coalesce_lockfail++;
+               *protect = NTDB_ERR_TO_OFF(NTDB_ERR_CORRUPT);
+
+               /* We have to drop this to avoid deadlocks, so make sure record
+                * doesn't get coalesced by someone else! */
+               rec.ftable_and_len = (NTDB_FTABLE_NONE
+                                     << (64 - NTDB_OFF_UPPER_STEAL))
+                       | (end - off - sizeof(struct ntdb_used_record));
+               ecode = ntdb_write_off(ntdb,
+                                     off + offsetof(struct ntdb_free_record,
+                                                    ftable_and_len),
+                                     rec.ftable_and_len);
+               if (ecode != NTDB_SUCCESS) {
+                       goto err;
+               }
+
+               ntdb_unlock_free_bucket(ntdb, b_off);
+
+               ecode = add_free_record(ntdb, off, end - off, NTDB_LOCK_WAIT,
+                                       false);
+               if (ecode != NTDB_SUCCESS) {
+                       return NTDB_ERR_TO_OFF(ecode);
+               }
+       } else if (NTDB_OFF_IS_ERR(*protect)) {
+               /* For simplicity, we always drop lock if they can't continue */
+               ntdb_unlock_free_bucket(ntdb, b_off);
+       }
+       ntdb->stats.alloc_coalesce_succeeded++;
+
+       /* Return usable length. */
+       return end - off - sizeof(struct ntdb_used_record);
+
+err:
+       /* To unify error paths, we *always* unlock bucket on error. */
+       ntdb_unlock_free_bucket(ntdb, b_off);
+       return NTDB_ERR_TO_OFF(ecode);
+}
+
+/* List is locked: we unlock it. */
+static enum NTDB_ERROR coalesce_list(struct ntdb_context *ntdb,
+                                   ntdb_off_t ftable_off,
+                                   ntdb_off_t b_off,
+                                   unsigned int limit)
+{
+       enum NTDB_ERROR ecode;
+       ntdb_off_t off;
+
+       off = ntdb_read_off(ntdb, b_off);
+       if (NTDB_OFF_IS_ERR(off)) {
+               ecode = NTDB_OFF_TO_ERR(off);
+               goto unlock_err;
+       }
+       /* A little bit of paranoia: counter should be 0. */
+       off &= NTDB_OFF_MASK;
+
+       while (off && limit--) {
+               struct ntdb_free_record rec;
+               ntdb_len_t coal;
+               ntdb_off_t next;
+
+               ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
+               if (ecode != NTDB_SUCCESS)
+                       goto unlock_err;
+
+               next = rec.next;
+               coal = coalesce(ntdb, off, b_off, frec_len(&rec), &next);
+               if (NTDB_OFF_IS_ERR(coal)) {
+                       /* This has already unlocked on error. */
+                       return NTDB_OFF_TO_ERR(coal);
+               }
+               if (NTDB_OFF_IS_ERR(next)) {
+                       /* Coalescing had to unlock, so stop. */
+                       return NTDB_SUCCESS;
+               }
+               /* Keep going if we're doing well... */
+               limit += size_to_bucket(coal / 16 + NTDB_MIN_DATA_LEN);
+               off = next;
+       }
+
+       /* Now, move those elements to the tail of the list so we get something
+        * else next time. */
+       if (off) {
+               struct ntdb_free_record oldhrec, newhrec, oldtrec, newtrec;
+               ntdb_off_t oldhoff, oldtoff, newtoff;
+
+               /* The record we were up to is the new head. */
+               ecode = ntdb_read_convert(ntdb, off, &newhrec, sizeof(newhrec));
+               if (ecode != NTDB_SUCCESS)
+                       goto unlock_err;
+
+               /* Get the new tail. */
+               newtoff = frec_prev(&newhrec);
+               ecode = ntdb_read_convert(ntdb, newtoff, &newtrec,
+                                        sizeof(newtrec));
+               if (ecode != NTDB_SUCCESS)
+                       goto unlock_err;
+
+               /* Get the old head. */
+               oldhoff = ntdb_read_off(ntdb, b_off);
+               if (NTDB_OFF_IS_ERR(oldhoff)) {
+                       ecode = NTDB_OFF_TO_ERR(oldhoff);
+                       goto unlock_err;
+               }
+
+               /* This could happen if they all coalesced away. */
+               if (oldhoff == off)
+                       goto out;
+
+               ecode = ntdb_read_convert(ntdb, oldhoff, &oldhrec,
+                                        sizeof(oldhrec));
+               if (ecode != NTDB_SUCCESS)
+                       goto unlock_err;
+
+               /* Get the old tail. */
+               oldtoff = frec_prev(&oldhrec);
+               ecode = ntdb_read_convert(ntdb, oldtoff, &oldtrec,
+                                        sizeof(oldtrec));
+               if (ecode != NTDB_SUCCESS)
+                       goto unlock_err;
+
+               /* Old tail's next points to old head. */
+               oldtrec.next = oldhoff;
+
+               /* Old head's prev points to old tail. */
+               oldhrec.magic_and_prev
+                       = (NTDB_FREE_MAGIC << (64 - NTDB_OFF_UPPER_STEAL))
+                       | oldtoff;
+
+               /* New tail's next is 0. */
+               newtrec.next = 0;
+
+               /* Write out the modified versions. */
+               ecode = ntdb_write_convert(ntdb, oldtoff, &oldtrec,
+                                         sizeof(oldtrec));
+               if (ecode != NTDB_SUCCESS)
+                       goto unlock_err;
+
+               ecode = ntdb_write_convert(ntdb, oldhoff, &oldhrec,
+                                         sizeof(oldhrec));
+               if (ecode != NTDB_SUCCESS)
+                       goto unlock_err;
+
+               ecode = ntdb_write_convert(ntdb, newtoff, &newtrec,
+                                         sizeof(newtrec));
+               if (ecode != NTDB_SUCCESS)
+                       goto unlock_err;
+
+               /* And finally link in new head. */
+               ecode = ntdb_write_off(ntdb, b_off, off);
+               if (ecode != NTDB_SUCCESS)
+                       goto unlock_err;
+       }
+out:
+       ntdb_unlock_free_bucket(ntdb, b_off);
+       return NTDB_SUCCESS;
+
+unlock_err:
+       ntdb_unlock_free_bucket(ntdb, b_off);
+       return ecode;
+}
+
+/* List must not be locked if coalesce_ok is set. */
+enum NTDB_ERROR add_free_record(struct ntdb_context *ntdb,
+                              ntdb_off_t off, ntdb_len_t len_with_header,
+                              enum ntdb_lock_flags waitflag,
+                              bool coalesce_ok)
+{
+       ntdb_off_t b_off;
+       ntdb_len_t len;
+       enum NTDB_ERROR ecode;
+
+       assert(len_with_header >= sizeof(struct ntdb_free_record));
+
+       len = len_with_header - sizeof(struct ntdb_used_record);
+
+       b_off = bucket_off(ntdb->ftable_off, size_to_bucket(len));
+       ecode = ntdb_lock_free_bucket(ntdb, b_off, waitflag);
+       if (ecode != NTDB_SUCCESS) {
+               return ecode;
+       }
+
+       ecode = enqueue_in_free(ntdb, b_off, off, len, &coalesce_ok);
+       check_list(ntdb, b_off);
+
+       /* Coalescing unlocks free list. */
+       if (!ecode && coalesce_ok)
+               ecode = coalesce_list(ntdb, ntdb->ftable_off, b_off, 2);
+       else
+               ntdb_unlock_free_bucket(ntdb, b_off);
+       return ecode;
+}
+
+/* Space actually needed to store keylen + datalen bytes of payload:
+ * at least NTDB_MIN_DATA_LEN, rounded up to a uint64_t boundary. */
+static size_t adjust_size(size_t keylen, size_t datalen)
+{
+       size_t size = keylen + datalen;
+
+       if (size < NTDB_MIN_DATA_LEN)
+               size = NTDB_MIN_DATA_LEN;
+
+       /* Round to next uint64_t boundary. */
+       return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
+}
+
+/* If we have enough left over to be useful, split that off. */
+static size_t record_leftover(size_t keylen, size_t datalen,
+                             bool want_extra, size_t total_len)
+{
+       ssize_t leftover;
+
+       if (want_extra)
+               datalen += datalen / 2;
+       leftover = total_len - adjust_size(keylen, datalen);
+
+       if (leftover < (ssize_t)sizeof(struct ntdb_free_record))
+               return 0;
+
+       return leftover;
+}
+
+/* We need size bytes to put our key and data in. */
+static ntdb_off_t lock_and_alloc(struct ntdb_context *ntdb,
+                               ntdb_off_t ftable_off,
+                               ntdb_off_t bucket,
+                               size_t keylen, size_t datalen,
+                               bool want_extra,
+                               unsigned magic)
+{
+       ntdb_off_t off, b_off,best_off;
+       struct ntdb_free_record best = { 0 };
+       double multiplier;
+       size_t size = adjust_size(keylen, datalen);
+       enum NTDB_ERROR ecode;
+
+       ntdb->stats.allocs++;
+       b_off = bucket_off(ftable_off, bucket);
+
+       /* FIXME: Try non-blocking wait first, to measure contention. */
+       /* Lock this bucket. */
+       ecode = ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT);
+       if (ecode != NTDB_SUCCESS) {
+               return NTDB_ERR_TO_OFF(ecode);
+       }
+
+       best.ftable_and_len = -1ULL;
+       best_off = 0;
+
+       /* Get slack if we're after extra. */
+       if (want_extra)
+               multiplier = 1.5;
+       else
+               multiplier = 1.0;
+
+       /* Walk the list to see if any are large enough, getting less fussy
+        * as we go. */
+       off = ntdb_read_off(ntdb, b_off);
+       if (NTDB_OFF_IS_ERR(off)) {
+               ecode = NTDB_OFF_TO_ERR(off);
+               goto unlock_err;
+       }
+       off &= NTDB_OFF_MASK;
+
+       while (off) {
+               const struct ntdb_free_record *r;
+               ntdb_off_t next;
+
+               r = ntdb_access_read(ntdb, off, sizeof(*r), true);
+               if (NTDB_PTR_IS_ERR(r)) {
+                       ecode = NTDB_PTR_ERR(r);
+                       goto unlock_err;
+               }
+
+               if (frec_magic(r) != NTDB_FREE_MAGIC) {
+                       ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
+                                          "lock_and_alloc:"
+                                          " %llu non-free 0x%llx",
+                                          (long long)off,
+                                          (long long)r->magic_and_prev);
+                       ntdb_access_release(ntdb, r);
+                       goto unlock_err;
+               }
+
+               if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
+                       best_off = off;
+                       best = *r;
+               }
+
+               if (frec_len(&best) <= size * multiplier && best_off) {
+                       ntdb_access_release(ntdb, r);
+                       break;
+               }
+
+               multiplier *= 1.01;
+
+               next = r->next;
+               ntdb_access_release(ntdb, r);
+               off = next;
+       }
+
+       /* If we found anything at all, use it. */
+       if (best_off) {
+               struct ntdb_used_record rec;
+               size_t leftover;
+
+               /* We're happy with this size: take it. */
+               ecode = remove_from_list(ntdb, b_off, best_off, &best);
+               check_list(ntdb, b_off);
+               if (ecode != NTDB_SUCCESS) {
+                       goto unlock_err;
+               }
+
+               leftover = record_leftover(keylen, datalen, want_extra,
+                                          frec_len(&best));
+
+               assert(keylen + datalen + leftover <= frec_len(&best));
+               /* We need to mark non-free before we drop lock, otherwise
+                * coalesce() could try to merge it! */
+               ecode = set_header(ntdb, &rec, magic, keylen, datalen,
+                                  frec_len(&best) - leftover);
+               if (ecode != NTDB_SUCCESS) {
+                       goto unlock_err;
+               }
+
+               ecode = ntdb_write_convert(ntdb, best_off, &rec, sizeof(rec));
+               if (ecode != NTDB_SUCCESS) {
+                       goto unlock_err;
+               }
+
+               /* For futureproofing, we put a 0 in any unused space. */
+               if (rec_extra_padding(&rec)) {
+                       ecode = ntdb->io->twrite(ntdb, best_off + sizeof(rec)
+                                               + keylen + datalen, "", 1);
+                       if (ecode != NTDB_SUCCESS) {
+                               goto unlock_err;
+                       }
+               }
+
+               /* Bucket of leftover will be <= current bucket, so nested
+                * locking is allowed. */
+               if (leftover) {
+                       ntdb->stats.alloc_leftover++;
+                       ecode = add_free_record(ntdb,
+                                               best_off + sizeof(rec)
+                                               + frec_len(&best) - leftover,
+                                               leftover, NTDB_LOCK_WAIT, false);
+                       if (ecode != NTDB_SUCCESS) {
+                               best_off = NTDB_ERR_TO_OFF(ecode);
+                       }
+               }
+               ntdb_unlock_free_bucket(ntdb, b_off);
+
+               return best_off;
+       }
+
+       ntdb_unlock_free_bucket(ntdb, b_off);
+       return 0;
+
+unlock_err:
+       ntdb_unlock_free_bucket(ntdb, b_off);
+       return NTDB_ERR_TO_OFF(ecode);
+}
+
+/* Get a free block from current free list, or 0 if none, -ve on error. */
+static ntdb_off_t get_free(struct ntdb_context *ntdb,
+                         size_t keylen, size_t datalen, bool want_extra,
+                         unsigned magic)
+{
+       ntdb_off_t off, ftable_off;
+       ntdb_off_t start_b, b, ftable;
+       bool wrapped = false;
+
+       /* If they are growing, add 50% to get to higher bucket. */
+       if (want_extra)
+               start_b = size_to_bucket(adjust_size(keylen,
+                                                    datalen + datalen / 2));
+       else
+               start_b = size_to_bucket(adjust_size(keylen, datalen));
+
+       ftable_off = ntdb->ftable_off;
+       ftable = ntdb->ftable;
+       while (!wrapped || ftable_off != ntdb->ftable_off) {
+               /* Start at exact size bucket, and search up... */
+               for (b = find_free_head(ntdb, ftable_off, start_b);
+                    b < NTDB_FREE_BUCKETS;
+                    b = find_free_head(ntdb, ftable_off, b + 1)) {
+                       /* Try getting one from list. */
+                       off = lock_and_alloc(ntdb, ftable_off,
+                                            b, keylen, datalen, want_extra,
+                                            magic);
+                       if (NTDB_OFF_IS_ERR(off))
+                               return off;
+                       if (off != 0) {
+                               if (b == start_b)
+                                       ntdb->stats.alloc_bucket_exact++;
+                               if (b == NTDB_FREE_BUCKETS - 1)
+                                       ntdb->stats.alloc_bucket_max++;
+                               /* Worked?  Stay using this list. */
+                               ntdb->ftable_off = ftable_off;
+                               ntdb->ftable = ftable;
+                               return off;
+                       }
+                       /* Didn't work.  Try next bucket. */
+               }
+
+               if (NTDB_OFF_IS_ERR(b)) {
+                       return b;
+               }
+
+               /* Hmm, try next table. */
+               ftable_off = next_ftable(ntdb, ftable_off);
+               if (NTDB_OFF_IS_ERR(ftable_off)) {
+                       return ftable_off;
+               }
+               ftable++;
+
+               if (ftable_off == 0) {
+                       wrapped = true;
+                       ftable_off = first_ftable(ntdb);
+                       if (NTDB_OFF_IS_ERR(ftable_off)) {
+                               return ftable_off;
+                       }
+                       ftable = 0;
+               }
+       }
+
+       return 0;
+}
+
/* Encode a used-record header in place.
 *
 * Layout produced (see the rec_* accessors):
 *   magic_and_meta: bits 11.. = extra padding (actuallen - keylen - datalen),
 *                   bits 43.. = keybits (half the bit-width of keylen),
 *                   bits 48.. = magic.
 *   key_and_data_len: keylen in the low 2*keybits bits, datalen above it.
 *
 * Returns NTDB_SUCCESS, or NTDB_ERR_IO if the values are too large to
 * round-trip through this packed encoding. */
enum NTDB_ERROR set_header(struct ntdb_context *ntdb,
			  struct ntdb_used_record *rec,
			  unsigned magic, uint64_t keylen, uint64_t datalen,
			  uint64_t actuallen)
{
	/* Number of bit-pairs needed for keylen, rounded up. */
	uint64_t keybits = (fls64(keylen) + 1) / 2;

	rec->magic_and_meta = ((actuallen - (keylen + datalen)) << 11)
		| (keybits << 43)
		| ((uint64_t)magic << 48);
	rec->key_and_data_len = (keylen | (datalen << (keybits*2)));

	/* Encoding can fail on big values. */
	if (rec_key_length(rec) != keylen
	    || rec_data_length(rec) != datalen
	    || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
				  "Could not encode k=%llu,d=%llu,a=%llu",
				  (long long)keylen, (long long)datalen,
				  (long long)actuallen);
	}
	return NTDB_SUCCESS;
}
+
+/* You need 'size', this tells you how much you should expand by. */
+ntdb_off_t ntdb_expand_adjust(ntdb_off_t map_size, ntdb_off_t size)
+{
+       ntdb_off_t new_size, top_size;
+
+       /* limit size in order to avoid using up huge amounts of memory for
+        * in memory tdbs if an oddball huge record creeps in */
+       if (size > 100 * 1024) {
+               top_size = map_size + size * 2;
+       } else {
+               top_size = map_size + size * 100;
+       }
+
+       /* always make room for at least top_size more records, and at
+          least 25% more space. if the DB is smaller than 100MiB,
+          otherwise grow it by 10% only. */
+       if (map_size > 100 * 1024 * 1024) {
+               new_size = map_size * 1.10;
+       } else {
+               new_size = map_size * 1.25;
+       }
+
+       if (new_size < top_size)
+               new_size = top_size;
+
+       /* We always make the file a multiple of transaction page
+        * size.  This guarantees that the transaction recovery area
+        * is always aligned, otherwise the transaction code can overwrite
+        * itself. */
+       new_size = (new_size + NTDB_PGSIZE-1) & ~(NTDB_PGSIZE-1);
+       return new_size - map_size;
+}
+
/* Expand the database.
 *
 * Caller must already hold a hash lock (or the allrecord lock, or be in
 * NTDB_NOLOCK mode); the function takes the exclusive expand lock, grows
 * the file via io->expand_file, then publishes the new space as a free
 * record.  Returns NTDB_SUCCESS or a negative NTDB_ERR_*. */
static enum NTDB_ERROR ntdb_expand(struct ntdb_context *ntdb, ntdb_len_t size)
{
	uint64_t old_size;
	ntdb_len_t wanted;
	enum NTDB_ERROR ecode;

	/* Need to hold a hash lock to expand DB: transactions rely on it. */
	if (!(ntdb->flags & NTDB_NOLOCK)
	    && !ntdb->file->allrecord_lock.count && !ntdb_has_hash_locks(ntdb)) {
		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				  "ntdb_expand: must hold lock during expand");
	}

	/* Only one person can expand file at a time. */
	ecode = ntdb_lock_expand(ntdb, F_WRLCK);
	if (ecode != NTDB_SUCCESS) {
		return ecode;
	}

	/* Someone else may have expanded the file, so retry. */
	old_size = ntdb->file->map_size;
	/* NOTE(review): ntdb_oob()'s return value is ignored here; the call
	 * is used only for its side effect of refreshing map_size — confirm
	 * a failure here is benign (we re-check map_size below). */
	ntdb_oob(ntdb, ntdb->file->map_size, 1, true);
	if (ntdb->file->map_size != old_size) {
		/* Another process already grew the file: nothing to do. */
		ntdb_unlock_expand(ntdb, F_WRLCK);
		return NTDB_SUCCESS;
	}

	/* We need room for the record header too. */
	size = adjust_size(0, sizeof(struct ntdb_used_record) + size);
	/* Overallocate. */
	wanted = ntdb_expand_adjust(old_size, size);

	ecode = ntdb->io->expand_file(ntdb, wanted);
	if (ecode != NTDB_SUCCESS) {
		ntdb_unlock_expand(ntdb, F_WRLCK);
		return ecode;
	}

	/* We need to drop this lock before adding free record. */
	ntdb_unlock_expand(ntdb, F_WRLCK);

	ntdb->stats.expands++;
	return add_free_record(ntdb, old_size, wanted, NTDB_LOCK_WAIT, true);
}
+
+/* This won't fail: it will expand the database if it has to. */
+ntdb_off_t alloc(struct ntdb_context *ntdb, size_t keylen, size_t datalen,
+                unsigned magic, bool growing)
+{
+       ntdb_off_t off;
+
+       for (;;) {
+               enum NTDB_ERROR ecode;
+               off = get_free(ntdb, keylen, datalen, growing, magic);
+               if (likely(off != 0))
+                       break;
+
+               ecode = ntdb_expand(ntdb, adjust_size(keylen, datalen));
+               if (ecode != NTDB_SUCCESS) {
+                       return NTDB_ERR_TO_OFF(ecode);
+               }
+       }
+
+       return off;
+}
diff --git a/ccan/ntdb/hash.c b/ccan/ntdb/hash.c
new file mode 100644 (file)
index 0000000..b223668
--- /dev/null
@@ -0,0 +1,624 @@
+ /*
+   Trivial Database 2: hash handling
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/hash/hash.h>
+
+/* Default hash function. */
+uint32_t ntdb_jenkins_hash(const void *key, size_t length, uint32_t seed,
+                         void *unused)
+{
+       return hash_stable((const unsigned char *)key, length, seed);
+}
+
+uint32_t ntdb_hash(struct ntdb_context *ntdb, const void *ptr, size_t len)
+{
+       return ntdb->hash_fn(ptr, len, ntdb->hash_seed, ntdb->hash_data);
+}
+
/* Compare the key stored in the record at 'off' against 'key'.
 *
 * Returns true on a match, false on a mismatch, or a negative
 * ntdb_bool_err on I/O error.  On a match with rptr non-NULL, *rptr is
 * left pointing at the mapped key+data and the caller must release it
 * with ntdb_access_release(); in every other case the access is
 * released here. */
static ntdb_bool_err key_matches(struct ntdb_context *ntdb,
				 const struct ntdb_used_record *rec,
				 ntdb_off_t off,
				 const NTDB_DATA *key,
				 const char **rptr)
{
	ntdb_bool_err ret = false;
	const char *rkey;

	/* Cheap length check first avoids mapping the record at all. */
	if (rec_key_length(rec) != key->dsize) {
		ntdb->stats.compare_wrong_keylen++;
		return ret;
	}

	/* Map key and data together, so a match can hand both to caller. */
	rkey = ntdb_access_read(ntdb, off + sizeof(*rec),
				key->dsize + rec_data_length(rec), false);
	if (NTDB_PTR_IS_ERR(rkey)) {
		return (ntdb_bool_err)NTDB_PTR_ERR(rkey);
	}
	if (memcmp(rkey, key->dptr, key->dsize) == 0) {
		if (rptr) {
			*rptr = rkey;
		} else {
			ntdb_access_release(ntdb, rkey);
		}
		return true;
	}
	ntdb->stats.compare_wrong_keycmp++;
	ntdb_access_release(ntdb, rkey);
	return ret;
}
+
/* Does entry match?
 *
 * 'val' is a packed hash-table entry: extra hash bits in the top
 * NTDB_OFF_UPPER_STEAL bits, record offset below (see encode_offset()).
 * Returns true/false, or a negative ntdb_bool_err on read failure; on
 * any non-error return, *rec holds the record header read from disk. */
static ntdb_bool_err match(struct ntdb_context *ntdb,
			   uint32_t hash,
			   const NTDB_DATA *key,
			   ntdb_off_t val,
			   struct ntdb_used_record *rec,
			   const char **rptr)
{
	ntdb_off_t off;
	enum NTDB_ERROR ecode;

	ntdb->stats.compares++;

	/* Top bits of offset == next bits of hash: rejects most
	 * mismatches without touching the record itself. */
	if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL)
	    != bits_from(val, 64-NTDB_OFF_UPPER_STEAL, NTDB_OFF_UPPER_STEAL)) {
		ntdb->stats.compare_wrong_offsetbits++;
		return false;
	}

	off = val & NTDB_OFF_MASK;
	ecode = ntdb_read_convert(ntdb, off, rec, sizeof(*rec));
	if (ecode != NTDB_SUCCESS) {
		return (ntdb_bool_err)ecode;
	}

	/* Full key comparison as the final arbiter. */
	return key_matches(ntdb, rec, off, key, rptr);
}
+
+static bool is_chain(ntdb_off_t val)
+{
+       return val & (1ULL << NTDB_OFF_CHAIN_BIT);
+}
+
+static ntdb_off_t hbucket_off(ntdb_off_t base, ntdb_len_t idx)
+{
+       return base + sizeof(struct ntdb_used_record)
+               + idx * sizeof(ntdb_off_t);
+}
+
/* This is the core routine which searches the hashtable for an entry.
 * On error, no locks are held and -ve is returned.
 * Otherwise, hinfo is filled in.
 * If not found, the return value is 0.
 * If found, the return value is the offset, and *rec is the record.
 *
 * On any non-error return the hash bucket lock (ltype) is still held;
 * the caller unlocks it.  h is left describing where the key was found,
 * or (when returning 0) where an insert should go: h->old_val is
 * non-zero iff the insert must go via/into a chain (see add_to_hash). */
ntdb_off_t find_and_lock(struct ntdb_context *ntdb,
			 NTDB_DATA key,
			 int ltype,
			 struct hash_info *h,
			 struct ntdb_used_record *rec,
			 const char **rptr)
{
	ntdb_off_t off, val;
	const ntdb_off_t *arr = NULL;
	ntdb_len_t i;
	bool found_empty;
	enum NTDB_ERROR ecode;
	struct ntdb_used_record chdr;
	ntdb_bool_err berr;

	h->h = ntdb_hash(ntdb, key.dptr, key.dsize);

	/* Start the cursor at the top-level table bucket for this hash. */
	h->table = NTDB_HASH_OFFSET;
	h->table_size = 1 << ntdb->hash_bits;
	h->bucket = bits_from(h->h, 0, ntdb->hash_bits);
	h->old_val = 0;

	ecode = ntdb_lock_hash(ntdb, h->bucket, ltype);
	if (ecode != NTDB_SUCCESS) {
		return NTDB_ERR_TO_OFF(ecode);
	}

	off = hbucket_off(h->table, h->bucket);
	val = ntdb_read_off(ntdb, off);
	if (NTDB_OFF_IS_ERR(val)) {
		ecode = NTDB_OFF_TO_ERR(val);
		goto fail;
	}

	/* Directly in hash table? */
	if (!likely(is_chain(val))) {
		if (val) {
			berr = match(ntdb, h->h, &key, val, rec, rptr);
			if (berr < 0) {
				ecode = NTDB_OFF_TO_ERR(berr);
				goto fail;
			}
			if (berr) {
				return val & NTDB_OFF_MASK;
			}
			/* If you want to insert here, make a chain. */
			h->old_val = val;
		}
		return 0;
	}

	/* Nope?  Iterate through chain. */
	h->table = val & NTDB_OFF_MASK;

	ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
	if (ecode != NTDB_SUCCESS) {
		goto fail;
	}

	if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
				    NTDB_LOG_ERROR,
				    "find_and_lock:"
				    " corrupt record %#x at %llu",
				    rec_magic(&chdr), (long long)off);
		goto fail;
	}

	/* Chain length is implied by its data length. */
	h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);

	arr = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
			       rec_data_length(&chdr), true);
	if (NTDB_PTR_IS_ERR(arr)) {
		ecode = NTDB_PTR_ERR(arr);
		goto fail;
	}

	/* Scan the whole chain, remembering the first empty slot in case
	 * the caller wants to insert. */
	found_empty = false;
	for (i = 0; i < h->table_size; i++) {
		if (arr[i] == 0) {
			if (!found_empty) {
				h->bucket = i;
				found_empty = true;
			}
		} else {
			berr = match(ntdb, h->h, &key, arr[i], rec, rptr);
			if (berr < 0) {
				ecode = NTDB_OFF_TO_ERR(berr);
				ntdb_access_release(ntdb, arr);
				goto fail;
			}
			if (berr) {
				/* We found it! */
				h->bucket = i;
				off = arr[i] & NTDB_OFF_MASK;
				ntdb_access_release(ntdb, arr);
				return off;
			}
		}
	}
	if (!found_empty) {
		/* Chain is full: h->bucket is one-past-the-end and
		 * old_val is set to any non-zero value so add_to_hash
		 * knows it must grow the chain. */
		h->old_val = 1;
		h->bucket = i;
	}

	ntdb_access_release(ntdb, arr);
	return 0;

fail:
	ntdb_unlock_hash(ntdb, h->bucket, ltype);
	return NTDB_ERR_TO_OFF(ecode);
}
+
+static ntdb_off_t encode_offset(const struct ntdb_context *ntdb,
+                               ntdb_off_t new_off, uint32_t hash)
+{
+       ntdb_off_t extra;
+
+       assert((new_off & (1ULL << NTDB_OFF_CHAIN_BIT)) == 0);
+       assert((new_off >> (64 - NTDB_OFF_UPPER_STEAL)) == 0);
+       /* We pack extra hash bits into the upper bits of the offset. */
+       extra = bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL);
+       extra <<= (64 - NTDB_OFF_UPPER_STEAL);
+
+       return new_off | extra;
+}
+
+/* Simply overwrite the hash entry we found before. */
+enum NTDB_ERROR replace_in_hash(struct ntdb_context *ntdb,
+                               const struct hash_info *h,
+                               ntdb_off_t new_off)
+{
+       return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket),
+                             encode_offset(ntdb, new_off, h->h));
+}
+
+enum NTDB_ERROR delete_from_hash(struct ntdb_context *ntdb,
+                                const struct hash_info *h)
+{
+       return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket), 0);
+}
+
+
+enum NTDB_ERROR add_to_hash(struct ntdb_context *ntdb,
+                           const struct hash_info *h,
+                           ntdb_off_t new_off)
+{
+       enum NTDB_ERROR ecode;
+       ntdb_off_t chain;
+       struct ntdb_used_record chdr;
+       const ntdb_off_t *old;
+       ntdb_off_t *new;
+
+       /* We hit an empty bucket during search?  That's where it goes. */
+       if (!h->old_val) {
+               return replace_in_hash(ntdb, h, new_off);
+       }
+
+       /* Full at top-level?  Create a 2-element chain. */
+       if (h->table == NTDB_HASH_OFFSET) {
+               ntdb_off_t pair[2];
+
+               /* One element is old value, the other is the new value. */
+               pair[0] = h->old_val;
+               pair[1] = encode_offset(ntdb, new_off, h->h);
+
+               chain = alloc(ntdb, 0, sizeof(pair), NTDB_CHAIN_MAGIC, true);
+               if (NTDB_OFF_IS_ERR(chain)) {
+                       return NTDB_OFF_TO_ERR(chain);
+               }
+               ecode = ntdb_write_convert(ntdb,
+                                          chain
+                                          + sizeof(struct ntdb_used_record),
+                                          pair, sizeof(pair));
+               if (ecode == NTDB_SUCCESS) {
+                       ecode = ntdb_write_off(ntdb,
+                                              hbucket_off(h->table, h->bucket),
+                                              chain
+                                              | (1ULL << NTDB_OFF_CHAIN_BIT));
+               }
+               return ecode;
+       }
+
+       /* Full bucket.  Expand. */
+       ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
+       if (ecode != NTDB_SUCCESS) {
+               return ecode;
+       }
+
+       if (rec_extra_padding(&chdr) >= sizeof(new_off)) {
+               /* Expand in place. */
+               uint64_t dlen = rec_data_length(&chdr);
+
+               ecode = set_header(ntdb, &chdr, NTDB_CHAIN_MAGIC, 0,
+                                  dlen + sizeof(new_off),
+                                  dlen + rec_extra_padding(&chdr));
+
+               if (ecode != NTDB_SUCCESS) {
+                       return ecode;
+               }
+               /* find_and_lock set up h to point to last bucket. */
+               ecode = replace_in_hash(ntdb, h, new_off);
+               if (ecode != NTDB_SUCCESS) {
+                       return ecode;
+               }
+               ecode = ntdb_write_convert(ntdb, h->table, &chdr, sizeof(chdr));
+               if (ecode != NTDB_SUCCESS) {
+                       return ecode;
+               }
+               /* For futureproofing, we always make the first byte of padding
+                * a zero. */
+               if (rec_extra_padding(&chdr)) {
+                       ecode = ntdb->io->twrite(ntdb, h->table + sizeof(chdr)
+                                                + dlen + sizeof(new_off),
+                                                "", 1);
+               }
+               return ecode;
+       }
+
+       /* We need to reallocate the chain. */
+       chain = alloc(ntdb, 0, (h->table_size + 1) * sizeof(ntdb_off_t),
+                     NTDB_CHAIN_MAGIC, true);
+       if (NTDB_OFF_IS_ERR(chain)) {
+               return NTDB_OFF_TO_ERR(chain);
+       }
+
+       /* Map both and copy across old buckets. */
+       old = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
+                              h->table_size*sizeof(ntdb_off_t), true);
+       if (NTDB_PTR_IS_ERR(old)) {
+               return NTDB_PTR_ERR(old);
+       }
+       new = ntdb_access_write(ntdb, hbucket_off(chain, 0),
+                               (h->table_size + 1)*sizeof(ntdb_off_t), true);
+       if (NTDB_PTR_IS_ERR(new)) {
+               ntdb_access_release(ntdb, old);
+               return NTDB_PTR_ERR(new);
+       }
+
+       memcpy(new, old, h->bucket * sizeof(ntdb_off_t));
+       new[h->bucket] = encode_offset(ntdb, new_off, h->h);
+       ntdb_access_release(ntdb, old);
+
+       ecode = ntdb_access_commit(ntdb, new);
+       if (ecode != NTDB_SUCCESS) {
+               return ecode;
+       }
+
+       /* Free the old chain. */
+       ecode = add_free_record(ntdb, h->table,
+                               sizeof(struct ntdb_used_record)
+                               + rec_data_length(&chdr)
+                               + rec_extra_padding(&chdr),
+                               NTDB_LOCK_WAIT, true);
+
+       /* Replace top-level to point to new chain */
+       return ntdb_write_off(ntdb,
+                             hbucket_off(NTDB_HASH_OFFSET,
+                                         bits_from(h->h, 0, ntdb->hash_bits)),
+                             chain | (1ULL << NTDB_OFF_CHAIN_BIT));
+}
+
+/* Traverse support: returns offset of record, or 0 or -ve error. */
+static ntdb_off_t iterate_chain(struct ntdb_context *ntdb,
+                               ntdb_off_t val,
+                               struct hash_info *h)
+{
+       ntdb_off_t i;
+       enum NTDB_ERROR ecode;
+       struct ntdb_used_record chdr;
+
+       /* First load up chain header. */
+       h->table = val & NTDB_OFF_MASK;
+       ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
+       if (ecode != NTDB_SUCCESS) {
+               return ecode;
+       }
+
+       if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
+               return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
+                                  NTDB_LOG_ERROR,
+                                  "get_table:"
+                                  " corrupt record %#x at %llu",
+                                  rec_magic(&chdr),
+                                  (long long)h->table);
+       }
+
+       /* Chain length is implied by data length. */
+       h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
+
+       i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0), h->bucket,
+                                 h->table_size);
+       if (NTDB_OFF_IS_ERR(i)) {
+               return i;
+       }
+
+       if (i != h->table_size) {
+               /* Return to next bucket. */
+               h->bucket = i + 1;
+               val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
+               if (NTDB_OFF_IS_ERR(val)) {
+                       return val;
+               }
+               return val & NTDB_OFF_MASK;
+       }
+
+       /* Go back up to hash table. */
+       h->table = NTDB_HASH_OFFSET;
+       h->table_size = 1 << ntdb->hash_bits;
+       h->bucket = bits_from(h->h, 0, ntdb->hash_bits) + 1;
+       return 0;
+}
+
/* Keeps hash locked unless returns 0 or error.
 *
 * Resumes a traversal from the cursor in h: first finishes any chain we
 * were in the middle of, then scans the top-level table.  On returning
 * a record offset, the bucket lock named by h->h is held (read lock);
 * the caller releases it. */
static ntdb_off_t lock_and_iterate_hash(struct ntdb_context *ntdb,
					struct hash_info *h)
{
	ntdb_off_t val, i;
	enum NTDB_ERROR ecode;

	if (h->table != NTDB_HASH_OFFSET) {
		/* We're in a chain. */
		i = bits_from(h->h, 0, ntdb->hash_bits);
		ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
		if (ecode != NTDB_SUCCESS) {
			return NTDB_ERR_TO_OFF(ecode);
		}

		/* We dropped lock, bucket might have moved! */
		val = ntdb_read_off(ntdb, hbucket_off(NTDB_HASH_OFFSET, i));
		if (NTDB_OFF_IS_ERR(val)) {
			goto unlock;
		}

		/* We don't remove chains: there should still be one there! */
		if (!val || !is_chain(val)) {
			ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
					    NTDB_LOG_ERROR,
					    "iterate_hash:"
					    " vanished hchain %llu at %llu",
					    (long long)val,
					    (long long)i);
			val = NTDB_ERR_TO_OFF(ecode);
			goto unlock;
		}

		/* Find next bucket in the chain. */
		val = iterate_chain(ntdb, val, h);
		if (NTDB_OFF_IS_ERR(val)) {
			goto unlock;
		}
		if (val != 0) {
			/* Got one: keep the bucket lock for the caller. */
			return val;
		}
		ntdb_unlock_hash(ntdb, i, F_RDLCK);

		/* OK, we've reset h back to top level. */
	}

	/* We do this unlocked, then re-check. */
	for (i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
				       h->bucket, h->table_size);
	     i != h->table_size;
	     i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
				       i+1, h->table_size)) {
		ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
		if (ecode != NTDB_SUCCESS) {
			return NTDB_ERR_TO_OFF(ecode);
		}

		/* Re-read now that we hold the lock. */
		val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
		if (NTDB_OFF_IS_ERR(val)) {
			goto unlock;
		}

		/* Lost race, and it's empty? */
		if (!val) {
			ntdb->stats.traverse_val_vanished++;
			ntdb_unlock_hash(ntdb, i, F_RDLCK);
			continue;
		}

		if (!is_chain(val)) {
			/* So caller knows what lock to free. */
			h->h = i;
			/* Return to next bucket. */
			h->bucket = i + 1;
			val &= NTDB_OFF_MASK;
			return val;
		}

		/* Start at beginning of chain */
		h->bucket = 0;
		h->h = i;

		val = iterate_chain(ntdb, val, h);
		if (NTDB_OFF_IS_ERR(val)) {
			goto unlock;
		}
		if (val != 0) {
			return val;
		}

		/* Otherwise, bucket has been set to i+1 */
		ntdb_unlock_hash(ntdb, i, F_RDLCK);
	}
	return 0;

unlock:
	ntdb_unlock_hash(ntdb, i, F_RDLCK);
	return val;
}
+
/* Return success if we find something, NTDB_ERR_NOEXIST if none.
 *
 * Advances the traversal cursor in h and copies out the next record's
 * key (and data, if dlen is non-NULL: key and data are returned in one
 * allocation, key first) into kbuf; the caller owns kbuf->dptr. */
enum NTDB_ERROR next_in_hash(struct ntdb_context *ntdb,
			     struct hash_info *h,
			     NTDB_DATA *kbuf, size_t *dlen)
{
	ntdb_off_t off;
	struct ntdb_used_record rec;
	enum NTDB_ERROR ecode;

	off = lock_and_iterate_hash(ntdb, h);

	if (NTDB_OFF_IS_ERR(off)) {
		return NTDB_OFF_TO_ERR(off);
	} else if (off == 0) {
		return NTDB_ERR_NOEXIST;
	}

	/* The hash for this key is still locked. */
	ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
	if (ecode != NTDB_SUCCESS) {
		goto unlock;
	}
	if (rec_magic(&rec) != NTDB_USED_MAGIC) {
		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
				    NTDB_LOG_ERROR,
				    "next_in_hash:"
				    " corrupt record at %llu",
				    (long long)off);
		goto unlock;
	}

	kbuf->dsize = rec_key_length(&rec);

	/* They want data as well? */
	if (dlen) {
		*dlen = rec_data_length(&rec);
		kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
					     kbuf->dsize + *dlen);
	} else {
		kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
					     kbuf->dsize);
	}
	if (NTDB_PTR_IS_ERR(kbuf->dptr)) {
		ecode = NTDB_PTR_ERR(kbuf->dptr);
		goto unlock;
	}
	ecode = NTDB_SUCCESS;

unlock:
	/* Release the bucket lock lock_and_iterate_hash left held. */
	ntdb_unlock_hash(ntdb, bits_from(h->h, 0, ntdb->hash_bits), F_RDLCK);
	return ecode;

}
+
+enum NTDB_ERROR first_in_hash(struct ntdb_context *ntdb,
+                            struct hash_info *h,
+                            NTDB_DATA *kbuf, size_t *dlen)
+{
+       h->table = NTDB_HASH_OFFSET;
+       h->table_size = 1 << ntdb->hash_bits;
+       h->bucket = 0;
+
+       return next_in_hash(ntdb, h, kbuf, dlen);
+}
+
+/* Even if the entry isn't in this hash bucket, you'd have to lock this
+ * bucket to find it. */
+static enum NTDB_ERROR chainlock(struct ntdb_context *ntdb,
+                                const NTDB_DATA *key, int ltype)
+{
+       uint32_t h = ntdb_hash(ntdb, key->dptr, key->dsize);
+
+       return ntdb_lock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), ltype);
+}
+
+/* lock/unlock one hash chain. This is meant to be used to reduce
+   contention - it cannot guarantee how many records will be locked */
+_PUBLIC_ enum NTDB_ERROR ntdb_chainlock(struct ntdb_context *ntdb, NTDB_DATA key)
+{
+       return chainlock(ntdb, &key, F_WRLCK);
+}
+
+_PUBLIC_ void ntdb_chainunlock(struct ntdb_context *ntdb, NTDB_DATA key)
+{
+       uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
+
+       ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_WRLCK);
+}
+
+_PUBLIC_ enum NTDB_ERROR ntdb_chainlock_read(struct ntdb_context *ntdb,
+                                            NTDB_DATA key)
+{
+       return chainlock(ntdb, &key, F_RDLCK);
+}
+
+_PUBLIC_ void ntdb_chainunlock_read(struct ntdb_context *ntdb, NTDB_DATA key)
+{
+       uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
+
+       ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_RDLCK);
+}
diff --git a/ccan/ntdb/io.c b/ccan/ntdb/io.c
new file mode 100644 (file)
index 0000000..7645cdd
--- /dev/null
@@ -0,0 +1,750 @@
+ /*
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell             2000
+   Copyright (C) Jeremy Allison                           2000-2003
+   Copyright (C) Rusty Russell                    2010
+
+     ** NOTE! The following LGPL license applies to the ntdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+
+/* Drop every mapping (or, for NTDB_INTERNAL, every allocation) that was
+ * kept alive while direct accessors were outstanding.  Only legal once
+ * the last accessor is gone. */
+static void free_old_mmaps(struct ntdb_context *ntdb)
+{
+       struct ntdb_old_mmap *node;
+
+       assert(ntdb->file->direct_count == 0);
+
+       while ((node = ntdb->file->old_mmaps) != NULL) {
+               ntdb->file->old_mmaps = node->next;
+               if (!(ntdb->flags & NTDB_INTERNAL)) {
+                       munmap(node->map_ptr, node->map_size);
+               } else {
+                       ntdb->free_fn(node->map_ptr, ntdb->alloc_data);
+               }
+               ntdb->free_fn(node, ntdb->alloc_data);
+       }
+}
+
+/* Remember the current mapping on the old_mmaps list so it can be torn
+ * down later, once the outstanding direct accessors have finished. */
+static enum NTDB_ERROR save_old_map(struct ntdb_context *ntdb)
+{
+       struct ntdb_old_mmap *keep;
+
+       assert(ntdb->file->direct_count);
+
+       keep = ntdb->alloc_fn(ntdb->file, sizeof(*keep), ntdb->alloc_data);
+       if (keep == NULL) {
+               return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+                                  "save_old_map alloc failed");
+       }
+
+       keep->map_ptr = ntdb->file->map_ptr;
+       keep->map_size = ntdb->file->map_size;
+       keep->next = ntdb->file->old_mmaps;
+       ntdb->file->old_mmaps = keep;
+
+       return NTDB_SUCCESS;
+}
+
+/* Tear down the current file mapping, if any.  If direct accessors are
+ * still using it, the mapping is stashed via save_old_map() instead of
+ * being unmapped right now. */
+enum NTDB_ERROR ntdb_munmap(struct ntdb_context *ntdb)
+{
+       if (ntdb->file->fd == -1 || ntdb->file->map_ptr == NULL) {
+               return NTDB_SUCCESS;
+       }
+
+       /* We can't unmap now if there are accessors. */
+       if (ntdb->file->direct_count) {
+               return save_old_map(ntdb);
+       }
+
+       munmap(ntdb->file->map_ptr, ntdb->file->map_size);
+       ntdb->file->map_ptr = NULL;
+       return NTDB_SUCCESS;
+}
+
+/* Establish (or refresh) the mmap of the database file.  NTDB_INTERNAL
+ * databases are pure in-memory allocations, so there is nothing to map.
+ * Without HAVE_INCOHERENT_MMAP, NTDB_NOMMAP skips mapping and a failed
+ * mmap is only a warning (callers fall back to pread/pwrite); with
+ * incoherent mmap everyone must map, so failure is fatal. */
+enum NTDB_ERROR ntdb_mmap(struct ntdb_context *ntdb)
+{
+       int mmap_flags;
+
+       if (ntdb->flags & NTDB_INTERNAL)
+               return NTDB_SUCCESS;
+
+#ifndef HAVE_INCOHERENT_MMAP
+       if (ntdb->flags & NTDB_NOMMAP)
+               return NTDB_SUCCESS;
+#endif
+
+       /* Match the mapping protection to how the file was opened. */
+       if ((ntdb->open_flags & O_ACCMODE) == O_RDONLY)
+               mmap_flags = PROT_READ;
+       else
+               mmap_flags = PROT_READ | PROT_WRITE;
+
+       /* size_t can be smaller than off_t. */
+       if ((size_t)ntdb->file->map_size == ntdb->file->map_size) {
+               ntdb->file->map_ptr = mmap(NULL, ntdb->file->map_size,
+                                         mmap_flags,
+                                         MAP_SHARED, ntdb->file->fd, 0);
+       } else
+               ntdb->file->map_ptr = MAP_FAILED;
+
+       /*
+        * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
+        */
+       if (ntdb->file->map_ptr == MAP_FAILED) {
+               ntdb->file->map_ptr = NULL;
+#ifdef HAVE_INCOHERENT_MMAP
+               /* Incoherent mmap means everyone must mmap! */
+               return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+                                 "ntdb_mmap failed for size %lld (%s)",
+                                 (long long)ntdb->file->map_size,
+                                 strerror(errno));
+#else
+               ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
+                          "ntdb_mmap failed for size %lld (%s)",
+                          (long long)ntdb->file->map_size, strerror(errno));
+#endif
+       }
+       return NTDB_SUCCESS;
+}
+
+/* check for an out of bounds access - if it is out of bounds then
+   see if the database has been expanded by someone else and expand
+   if necessary
+   note that "len" is the minimum length needed for the db.
+
+   If probe is true, len being too large isn't a failure.
+*/
+static enum NTDB_ERROR ntdb_normal_oob(struct ntdb_context *ntdb,
+                                      ntdb_off_t off, ntdb_len_t len,
+                                      bool probe)
+{
+       struct stat st;
+       enum NTDB_ERROR ecode;
+
+       /* off + len wrapped around: can never be a valid range. */
+       if (len + off < len) {
+               if (probe)
+                       return NTDB_SUCCESS;
+
+               return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+                                 "ntdb_oob off %llu len %llu wrap\n",
+                                 (long long)off, (long long)len);
+       }
+
+       /* An internal (in-memory) db can't have been grown by another
+        * process, so out-of-range is simply an error. */
+       if (ntdb->flags & NTDB_INTERNAL) {
+               if (probe)
+                       return NTDB_SUCCESS;
+
+               ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+                          "ntdb_oob len %lld beyond internal"
+                          " alloc size %lld",
+                          (long long)(off + len),
+                          (long long)ntdb->file->map_size);
+               return NTDB_ERR_IO;
+       }
+
+       /* Take the expand lock so the size we fstat() is stable. */
+       ecode = ntdb_lock_expand(ntdb, F_RDLCK);
+       if (ecode != NTDB_SUCCESS) {
+               return ecode;
+       }
+
+       if (fstat(ntdb->file->fd, &st) != 0) {
+               ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+                          "Failed to fstat file: %s", strerror(errno));
+               ntdb_unlock_expand(ntdb, F_RDLCK);
+               return NTDB_ERR_IO;
+       }
+
+       ntdb_unlock_expand(ntdb, F_RDLCK);
+
+       /* Still beyond end of file: genuinely out of bounds. */
+       if (st.st_size < off + len) {
+               if (probe)
+                       return NTDB_SUCCESS;
+
+               ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+                          "ntdb_oob len %llu beyond eof at %llu",
+                          (long long)(off + len), (long long)st.st_size);
+               return NTDB_ERR_IO;
+       }
+
+       /* Unmap, update size, remap */
+       ecode = ntdb_munmap(ntdb);
+       if (ecode) {
+               return ecode;
+       }
+
+       ntdb->file->map_size = st.st_size;
+       return ntdb_mmap(ntdb);
+}
+
+/* Endian conversion: we only ever deal with 8 byte quantities */
+void *ntdb_convert(const struct ntdb_context *ntdb, void *buf, ntdb_len_t size)
+{
+       assert(size % 8 == 0);
+       if (unlikely((ntdb->flags & NTDB_CONVERT)) && buf) {
+               uint64_t *p = (uint64_t *)buf;
+               uint64_t *end = p + size / 8;
+
+               while (p < end) {
+                       *p = bswap_64(*p);
+                       p++;
+               }
+       }
+       return buf;
+}
+
+/* Return first non-zero offset in offset array, or end, or -ve error. */
+/* FIXME: Return the off? */
+uint64_t ntdb_find_nonzero_off(struct ntdb_context *ntdb,
+                             ntdb_off_t base, uint64_t start, uint64_t end)
+{
+       const uint64_t num = end - start;
+       uint64_t idx;
+       const uint64_t *arr;
+
+       /* Zero vs non-zero is the same unconverted: minor optimization. */
+       arr = ntdb_access_read(ntdb, base + start * sizeof(ntdb_off_t),
+                             num * sizeof(ntdb_off_t), false);
+       if (NTDB_PTR_IS_ERR(arr)) {
+               return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(arr));
+       }
+
+       idx = 0;
+       while (idx < num && !arr[idx]) {
+               idx++;
+       }
+       ntdb_access_release(ntdb, arr);
+       return start + idx;
+}
+
+/* Return first zero offset in num offset array, or num, or -ve error. */
+uint64_t ntdb_find_zero_off(struct ntdb_context *ntdb, ntdb_off_t off,
+                          uint64_t num)
+{
+       uint64_t idx = 0;
+       const uint64_t *arr;
+
+       /* Zero vs non-zero is the same unconverted: minor optimization. */
+       arr = ntdb_access_read(ntdb, off, num * sizeof(ntdb_off_t), false);
+       if (NTDB_PTR_IS_ERR(arr)) {
+               return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(arr));
+       }
+
+       while (idx < num && arr[idx]) {
+               idx++;
+       }
+       ntdb_access_release(ntdb, arr);
+       return idx;
+}
+
+/* Zero [off, off+len) in the database: straight into the map when a
+ * direct pointer is available, otherwise in buffered chunks via
+ * twrite(). */
+enum NTDB_ERROR zero_out(struct ntdb_context *ntdb, ntdb_off_t off, ntdb_len_t len)
+{
+       char zeros[8192] = { 0 };
+       void *direct = ntdb->io->direct(ntdb, off, len, true);
+       enum NTDB_ERROR ecode = NTDB_SUCCESS;
+
+       assert(!(ntdb->flags & NTDB_RDONLY));
+       if (NTDB_PTR_IS_ERR(direct)) {
+               return NTDB_PTR_ERR(direct);
+       }
+       if (direct != NULL) {
+               memset(direct, 0, len);
+               return ecode;
+       }
+
+       /* No direct map: push zero-filled chunks through twrite. */
+       while (len != 0) {
+               unsigned chunk = len < sizeof(zeros) ? len : sizeof(zeros);
+
+               ecode = ntdb->io->twrite(ntdb, off, zeros, chunk);
+               if (ecode != NTDB_SUCCESS) {
+                       break;
+               }
+               off += chunk;
+               len -= chunk;
+       }
+       return ecode;
+}
+
+/* write a lump of data at a specified offset */
+static enum NTDB_ERROR ntdb_write(struct ntdb_context *ntdb, ntdb_off_t off,
+                               const void *buf, ntdb_len_t len)
+{
+       enum NTDB_ERROR ecode;
+
+       if (ntdb->flags & NTDB_RDONLY) {
+               return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
+                                 "Write to read-only database");
+       }
+
+       /* Check the range lies inside the file, remapping if it grew. */
+       ecode = ntdb_oob(ntdb, off, len, false);
+       if (ecode != NTDB_SUCCESS) {
+               return ecode;
+       }
+
+       if (ntdb->file->map_ptr) {
+               memcpy(off + (char *)ntdb->file->map_ptr, buf, len);
+       } else {
+#ifdef HAVE_INCOHERENT_MMAP
+               /* With incoherent mmap everyone must map, so an unmapped
+                * file here is an I/O error. */
+               return NTDB_ERR_IO;
+#else
+               ssize_t ret;
+               ret = pwrite(ntdb->file->fd, buf, len, off);
+               if (ret != len) {
+                       /* This shouldn't happen: we avoid sparse files. */
+                       if (ret >= 0)
+                               errno = ENOSPC;
+
+                       return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+                                         "ntdb_write: %zi at %zu len=%zu (%s)",
+                                         ret, (size_t)off, (size_t)len,
+                                         strerror(errno));
+               }
+#endif
+       }
+       return NTDB_SUCCESS;
+}
+
+/* read a lump of data at a specified offset */
+static enum NTDB_ERROR ntdb_read(struct ntdb_context *ntdb, ntdb_off_t off,
+                              void *buf, ntdb_len_t len)
+{
+       enum NTDB_ERROR ecode;
+
+       /* Check the range lies inside the file, remapping if it grew. */
+       ecode = ntdb_oob(ntdb, off, len, false);
+       if (ecode != NTDB_SUCCESS) {
+               return ecode;
+       }
+
+       if (ntdb->file->map_ptr) {
+               memcpy(buf, off + (char *)ntdb->file->map_ptr, len);
+       } else {
+#ifdef HAVE_INCOHERENT_MMAP
+               /* With incoherent mmap everyone must map, so an unmapped
+                * file here is an I/O error. */
+               return NTDB_ERR_IO;
+#else
+               ssize_t r = pread(ntdb->file->fd, buf, len, off);
+               if (r != len) {
+                       return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+                                         "ntdb_read failed with %zi at %zu "
+                                         "len=%zu (%s) map_size=%zu",
+                                         r, (size_t)off, (size_t)len,
+                                         strerror(errno),
+                                         (size_t)ntdb->file->map_size);
+               }
+#endif
+       }
+       return NTDB_SUCCESS;
+}
+
+/* Write @len bytes at @off.  For an opposite-endian (NTDB_CONVERT)
+ * database, a temporary copy is byte-swapped first so the caller's
+ * buffer is left untouched. */
+enum NTDB_ERROR ntdb_write_convert(struct ntdb_context *ntdb, ntdb_off_t off,
+                                const void *rec, size_t len)
+{
+       enum NTDB_ERROR ecode;
+       void *conv;
+
+       if (likely(!(ntdb->flags & NTDB_CONVERT))) {
+               return ntdb->io->twrite(ntdb, off, rec, len);
+       }
+
+       conv = ntdb->alloc_fn(ntdb, len, ntdb->alloc_data);
+       if (conv == NULL) {
+               return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+                                 "ntdb_write: no memory converting"
+                                 " %zu bytes", len);
+       }
+       memcpy(conv, rec, len);
+       ecode = ntdb->io->twrite(ntdb, off, ntdb_convert(ntdb, conv, len), len);
+       ntdb->free_fn(conv, ntdb->alloc_data);
+       return ecode;
+}
+
+/* Read @len bytes at @off into @rec, byte-swapping in place afterwards
+ * if the database is opposite-endian.  (Conversion runs even if the
+ * read failed; the caller must check the returned code.) */
+enum NTDB_ERROR ntdb_read_convert(struct ntdb_context *ntdb, ntdb_off_t off,
+                               void *rec, size_t len)
+{
+       enum NTDB_ERROR ecode;
+
+       ecode = ntdb->io->tread(ntdb, off, rec, len);
+       ntdb_convert(ntdb, rec, len);
+       return ecode;
+}
+
+/* Allocate prefix + len bytes, and read len bytes at @offset into the
+ * buffer just past the prefix (the prefix bytes are left uninitialized
+ * for the caller's own header).  Returns NTDB_ERR_PTR() on failure;
+ * check with NTDB_PTR_IS_ERR(). */
+static void *_ntdb_alloc_read(struct ntdb_context *ntdb, ntdb_off_t offset,
+                            ntdb_len_t len, unsigned int prefix)
+{
+       unsigned char *buf;
+       enum NTDB_ERROR ecode;
+
+       /* some systems don't like zero length malloc */
+       buf = ntdb->alloc_fn(ntdb, prefix + len ? prefix + len : 1,
+                         ntdb->alloc_data);
+       if (!buf) {
+               ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+                          "ntdb_alloc_read alloc failed len=%zu",
+                          (size_t)(prefix + len));
+               return NTDB_ERR_PTR(NTDB_ERR_OOM);
+       } else {
+               ecode = ntdb->io->tread(ntdb, offset, buf+prefix, len);
+               if (unlikely(ecode != NTDB_SUCCESS)) {
+                       ntdb->free_fn(buf, ntdb->alloc_data);
+                       return NTDB_ERR_PTR(ecode);
+               }
+       }
+       return buf;
+}
+
+/* read a lump of data, allocating the space for it.  Returns an
+ * NTDB_ERR_PTR() on failure, so check with NTDB_PTR_IS_ERR(); the
+ * caller owns the buffer and frees it with ntdb->free_fn. */
+void *ntdb_alloc_read(struct ntdb_context *ntdb, ntdb_off_t offset, ntdb_len_t len)
+{
+       return _ntdb_alloc_read(ntdb, offset, len, 0);
+}
+
+/* Repeatedly pwrite() the pattern buffer @buf (of @size bytes) until
+ * @len bytes starting at @off have been written. */
+static enum NTDB_ERROR fill(struct ntdb_context *ntdb,
+                          const void *buf, size_t size,
+                          ntdb_off_t off, ntdb_len_t len)
+{
+       while (len != 0) {
+               size_t chunk = (len > size) ? size : len;
+               ssize_t written = pwrite(ntdb->file->fd, buf, chunk, off);
+
+               if (written != chunk) {
+                       /* A short write means we ran out of space. */
+                       if (written >= 0)
+                               errno = ENOSPC;
+
+                       return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+                                         "fill failed:"
+                                         " %zi at %zu len=%zu (%s)",
+                                         written, (size_t)off, (size_t)len,
+                                         strerror(errno));
+               }
+               off += chunk;
+               len -= chunk;
+       }
+       return NTDB_SUCCESS;
+}
+
+/* expand a file.  we prefer to use ftruncate, as that is what posix
+  says to use for mmap expansion */
+static enum NTDB_ERROR ntdb_expand_file(struct ntdb_context *ntdb,
+                                     ntdb_len_t addition)
+{
+       char buf[8192];
+       enum NTDB_ERROR ecode;
+
+       /* The database always grows in whole NTDB_PGSIZE pages. */
+       assert((ntdb->file->map_size + addition) % NTDB_PGSIZE == 0);
+       if (ntdb->flags & NTDB_RDONLY) {
+               return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
+                                 "Expand on read-only database");
+       }
+
+       if (ntdb->flags & NTDB_INTERNAL) {
+               char *new;
+
+               /* Can't free it if we have direct accesses. */
+               if (ntdb->file->direct_count) {
+                       ecode = save_old_map(ntdb);
+                       if (ecode) {
+                               return ecode;
+                       }
+                       /* Fresh allocation plus copy, leaving the old
+                        * buffer alive for the direct accessors. */
+                       new = ntdb->alloc_fn(ntdb->file,
+                                            ntdb->file->map_size + addition,
+                                            ntdb->alloc_data);
+                       if (new) {
+                               memcpy(new, ntdb->file->map_ptr,
+                                      ntdb->file->map_size);
+                       }
+               } else {
+                       new = ntdb->expand_fn(ntdb->file->map_ptr,
+                                             ntdb->file->map_size + addition,
+                                             ntdb->alloc_data);
+               }
+               if (!new) {
+                       return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+                                         "No memory to expand database");
+               }
+               ntdb->file->map_ptr = new;
+               ntdb->file->map_size += addition;
+               return NTDB_SUCCESS;
+       } else {
+               /* Unmap before trying to write; old NTDB claimed OpenBSD had
+                * problem with this otherwise. */
+               ecode = ntdb_munmap(ntdb);
+               if (ecode) {
+                       return ecode;
+               }
+
+               /* If this fails, we try to fill anyway. */
+               if (ftruncate(ntdb->file->fd, ntdb->file->map_size + addition))
+                       ;
+
+               /* now fill the file with something. This ensures that the
+                  file isn't sparse, which would be very bad if we ran out of
+                  disk. This must be done with write, not via mmap */
+               memset(buf, 0x43, sizeof(buf));
+               ecode = fill(ntdb, buf, sizeof(buf), ntdb->file->map_size,
+                            addition);
+               if (ecode != NTDB_SUCCESS)
+                       return ecode;
+               ntdb->file->map_size += addition;
+               return ntdb_mmap(ntdb);
+       }
+}
+
+/* Get read access to [off, off+len).  For native-endian databases this
+ * tries a direct pointer into the map, bumping direct_count so the
+ * mapping is kept alive; otherwise a malloced copy is made, tracked on
+ * ntdb->access, and converted if @convert.  Returns NTDB_ERR_PTR() on
+ * failure; release with ntdb_access_release(). */
+const void *ntdb_access_read(struct ntdb_context *ntdb,
+                           ntdb_off_t off, ntdb_len_t len, bool convert)
+{
+       void *ret = NULL;
+
+       if (likely(!(ntdb->flags & NTDB_CONVERT))) {
+               ret = ntdb->io->direct(ntdb, off, len, false);
+
+               if (NTDB_PTR_IS_ERR(ret)) {
+                       return ret;
+               }
+       }
+       if (!ret) {
+               /* Copy path: the header sits just before the data we
+                * hand back, so release can find it again. */
+               struct ntdb_access_hdr *hdr;
+               hdr = _ntdb_alloc_read(ntdb, off, len, sizeof(*hdr));
+               if (NTDB_PTR_IS_ERR(hdr)) {
+                       return hdr;
+               }
+               hdr->next = ntdb->access;
+               ntdb->access = hdr;
+               ret = hdr + 1;
+               if (convert) {
+                       ntdb_convert(ntdb, (void *)ret, len);
+               }
+       } else {
+               /* Direct pointer: pin the current mapping. */
+               ntdb->file->direct_count++;
+       }
+
+       return ret;
+}
+
+/* Like ntdb_access_read(), but for modification.  A malloced copy
+ * records off/len/convert so that ntdb_access_commit() can write the
+ * changes back; a direct pointer writes straight into the map.
+ * Returns NTDB_ERR_PTR() on failure. */
+void *ntdb_access_write(struct ntdb_context *ntdb,
+                      ntdb_off_t off, ntdb_len_t len, bool convert)
+{
+       void *ret = NULL;
+
+       if (ntdb->flags & NTDB_RDONLY) {
+               ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
+                          "Write to read-only database");
+               return NTDB_ERR_PTR(NTDB_ERR_RDONLY);
+       }
+
+       if (likely(!(ntdb->flags & NTDB_CONVERT))) {
+               ret = ntdb->io->direct(ntdb, off, len, true);
+
+               if (NTDB_PTR_IS_ERR(ret)) {
+                       return ret;
+               }
+       }
+
+       if (!ret) {
+               /* Copy path: remember where the data came from so
+                * commit can write it back. */
+               struct ntdb_access_hdr *hdr;
+               hdr = _ntdb_alloc_read(ntdb, off, len, sizeof(*hdr));
+               if (NTDB_PTR_IS_ERR(hdr)) {
+                       return hdr;
+               }
+               hdr->next = ntdb->access;
+               ntdb->access = hdr;
+               hdr->off = off;
+               hdr->len = len;
+               hdr->convert = convert;
+               ret = hdr + 1;
+               if (convert)
+                       ntdb_convert(ntdb, (void *)ret, len);
+       } else {
+               /* Direct pointer: pin the current mapping. */
+               ntdb->file->direct_count++;
+       }
+       return ret;
+}
+
+/* Find the list slot pointing at the access header whose payload is @p
+ * (the payload sits immediately after its header), or NULL if @p came
+ * from a direct map rather than a malloced copy. */
+static struct ntdb_access_hdr **find_hdr(struct ntdb_context *ntdb, const void *p)
+{
+       struct ntdb_access_hdr **slot = &ntdb->access;
+
+       while (*slot != NULL) {
+               if (*slot + 1 == p)
+                       return slot;
+               slot = &(*slot)->next;
+       }
+       return NULL;
+}
+
+/* Release a pointer from ntdb_access_read/write.  A malloced copy is
+ * unlinked and freed (any modifications discarded); a direct pointer
+ * drops direct_count, and the last accessor out frees the mappings
+ * that were kept alive on its behalf. */
+void ntdb_access_release(struct ntdb_context *ntdb, const void *p)
+{
+       struct ntdb_access_hdr *hdr, **hp = find_hdr(ntdb, p);
+
+       if (hp) {
+               hdr = *hp;
+               *hp = hdr->next;
+               ntdb->free_fn(hdr, ntdb->alloc_data);
+       } else {
+               if (--ntdb->file->direct_count == 0) {
+                       free_old_mmaps(ntdb);
+               }
+       }
+}
+
+/* Finish a write access: a malloced copy from ntdb_access_write() is
+ * written back to the database (byte-swapped again if it was
+ * converted); a direct pointer already modified the map in place, so
+ * it is simply released. */
+enum NTDB_ERROR ntdb_access_commit(struct ntdb_context *ntdb, void *p)
+{
+       struct ntdb_access_hdr *hdr, **hp = find_hdr(ntdb, p);
+       enum NTDB_ERROR ecode;
+
+       if (hp) {
+               hdr = *hp;
+               if (hdr->convert)
+                       ecode = ntdb_write_convert(ntdb, hdr->off, p, hdr->len);
+               else
+                       ecode = ntdb_write(ntdb, hdr->off, p, hdr->len);
+               *hp = hdr->next;
+               ntdb->free_fn(hdr, ntdb->alloc_data);
+       } else {
+               if (--ntdb->file->direct_count == 0) {
+                       free_old_mmaps(ntdb);
+               }
+               ecode = NTDB_SUCCESS;
+       }
+
+       return ecode;
+}
+
+/* Return a pointer into the mmap for [off, off+len): NULL if the file
+ * isn't mapped, an NTDB_ERR_PTR() if the range is out of bounds. */
+static void *ntdb_direct(struct ntdb_context *ntdb, ntdb_off_t off, size_t len,
+                       bool write_mode)
+{
+       enum NTDB_ERROR ecode;
+
+       if (unlikely(ntdb->file->map_ptr == NULL))
+               return NULL;
+
+       ecode = ntdb_oob(ntdb, off, len, false);
+       if (unlikely(ecode != NTDB_SUCCESS))
+               return NTDB_ERR_PTR(ecode);
+
+       return (char *)ntdb->file->map_ptr + off;
+}
+
+/* Read a single native-endian ntdb_off_t at @off: straight from the
+ * map when possible, via ntdb_read() otherwise.  Errors come back
+ * through NTDB_ERR_TO_OFF(). */
+static ntdb_off_t ntdb_read_normal_off(struct ntdb_context *ntdb,
+                                      ntdb_off_t off)
+{
+       ntdb_off_t val;
+       enum NTDB_ERROR ecode;
+       ntdb_off_t *direct;
+
+       direct = ntdb_direct(ntdb, off, sizeof(*direct), false);
+       if (NTDB_PTR_IS_ERR(direct)) {
+               return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(direct));
+       }
+       if (likely(direct != NULL)) {
+               return *direct;
+       }
+
+       ecode = ntdb_read(ntdb, off, &val, sizeof(val));
+       if (ecode != NTDB_SUCCESS) {
+               return NTDB_ERR_TO_OFF(ecode);
+       }
+       return val;
+}
+
+/* Read a single ntdb_off_t at @off, byte-swapping for opposite-endian
+ * databases.  Errors come back through NTDB_ERR_TO_OFF(). */
+static ntdb_off_t ntdb_read_convert_off(struct ntdb_context *ntdb,
+                                       ntdb_off_t off)
+{
+       ntdb_off_t val;
+       enum NTDB_ERROR ecode = ntdb_read_convert(ntdb, off, &val, sizeof(val));
+
+       return (ecode == NTDB_SUCCESS) ? val : NTDB_ERR_TO_OFF(ecode);
+}
+
+/* Write a single native-endian ntdb_off_t at @off: straight into the
+ * map when possible, via ntdb_write() otherwise. */
+static enum NTDB_ERROR ntdb_write_normal_off(struct ntdb_context *ntdb,
+                                            ntdb_off_t off, ntdb_off_t val)
+{
+       ntdb_off_t *direct = ntdb_direct(ntdb, off, sizeof(*direct), true);
+
+       if (NTDB_PTR_IS_ERR(direct)) {
+               return NTDB_PTR_ERR(direct);
+       }
+       if (likely(direct != NULL)) {
+               *direct = val;
+               return NTDB_SUCCESS;
+       }
+       return ntdb_write(ntdb, off, &val, sizeof(val));
+}
+
+/* Write a single ntdb_off_t at @off, byte-swapping for opposite-endian
+ * databases. */
+static enum NTDB_ERROR ntdb_write_convert_off(struct ntdb_context *ntdb,
+                                             ntdb_off_t off, ntdb_off_t val)
+{
+       return ntdb_write_convert(ntdb, off, &val, sizeof(val));
+}
+
+/* Bump the header sequence number, never letting readers see a
+ * negative value.  Native-endian databases go straight through the
+ * map when possible; otherwise fall back to read-modify-write. */
+void ntdb_inc_seqnum(struct ntdb_context *ntdb)
+{
+       ntdb_off_t seq;
+
+       if (likely(!(ntdb->flags & NTDB_CONVERT))) {
+               int64_t *direct;
+
+               direct = ntdb->io->direct(ntdb,
+                                        offsetof(struct ntdb_header, seqnum),
+                                        sizeof(*direct), true);
+               /* direct() can return an error pointer; treat that like
+                * "no direct access" and use the slow path below. */
+               if (NTDB_PTR_IS_ERR(direct)) {
+                       direct = NULL;
+               }
+               if (likely(direct)) {
+                       /* Don't let it go negative, even briefly.
+                        * BUG FIX: the comparison must be inside
+                        * unlikely(); "unlikely(x + 1) < 0" is always
+                        * false, making the wrap guard dead code. */
+                       if (unlikely((*direct) + 1 < 0))
+                               *direct = 0;
+                       (*direct)++;
+                       return;
+               }
+       }
+
+       seq = ntdb_read_off(ntdb, offsetof(struct ntdb_header, seqnum));
+       if (!NTDB_OFF_IS_ERR(seq)) {
+               seq++;
+               if (unlikely((int64_t)seq < 0))
+                       seq = 0;
+               ntdb_write_off(ntdb, offsetof(struct ntdb_header, seqnum), seq);
+       }
+}
+
+/* I/O vtable for native-endian databases.  Slots, in order: raw read,
+ * raw write, out-of-bounds check, file expansion, direct map access,
+ * and single ntdb_off_t read/write. */
+static const struct ntdb_methods io_methods = {
+       ntdb_read,
+       ntdb_write,
+       ntdb_normal_oob,
+       ntdb_expand_file,
+       ntdb_direct,
+       ntdb_read_normal_off,
+       ntdb_write_normal_off,
+};
+
+/* As io_methods, but the single-offset accessors byte-swap, for
+ * databases whose on-disk endianness differs from the host
+ * (NTDB_CONVERT). */
+static const struct ntdb_methods io_convert_methods = {
+       ntdb_read,
+       ntdb_write,
+       ntdb_normal_oob,
+       ntdb_expand_file,
+       ntdb_direct,
+       ntdb_read_convert_off,
+       ntdb_write_convert_off,
+};
+
+/*
+  initialise the default methods table
+*/
+void ntdb_io_init(struct ntdb_context *ntdb)
+{
+       ntdb->io = (ntdb->flags & NTDB_CONVERT) ? &io_convert_methods
+                                               : &io_methods;
+}
diff --git a/ccan/ntdb/lock.c b/ccan/ntdb/lock.c
new file mode 100644 (file)
index 0000000..71d6d85
--- /dev/null
@@ -0,0 +1,886 @@
+ /*
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell             2000
+   Copyright (C) Jeremy Allison                           2000-2003
+
+     ** NOTE! The following LGPL license applies to the ntdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "private.h"
+#include <ccan/build_assert/build_assert.h>
+
+/* If we were threaded, we could wait for unlock, but we're not, so fail.
+ * Reports that another ntdb_context in this same process owns the lock. */
+enum NTDB_ERROR owner_conflict(struct ntdb_context *ntdb, const char *call)
+{
+       enum NTDB_ERROR err;
+
+       err = ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+                         "%s: lock owned by another ntdb in this process.",
+                         call);
+       return err;
+}
+
+/* If we fork, we no longer really own locks: fcntl locks belong to the
+ * process that took them.  Returns true when it is still safe to use the
+ * locks we recorded (or we hold none at all). */
+bool check_lock_pid(struct ntdb_context *ntdb, const char *call, bool log)
+{
+       pid_t pid = getpid();
+
+       /* Nothing locked?  Then nothing can have been lost. */
+       if (ntdb->file->num_lockrecs == 0
+           && ntdb->file->allrecord_lock.count == 0)
+               return true;
+
+       /* Still the process that took the locks?  Fine. */
+       if (ntdb->file->locker == pid)
+               return true;
+
+       if (log) {
+               ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+                           "%s: fork() detected after lock acquisition!"
+                           " (%u vs %u)", call,
+                           (unsigned int)ntdb->file->locker,
+                           (unsigned int)pid);
+       }
+       return false;
+}
+
+/* Default byte-range lock callback: take an fcntl() lock, retrying if a
+ * signal interrupts us.  waitflag selects blocking (F_SETLKW) behaviour. */
+int ntdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
+                  void *unused)
+{
+       const int cmd = waitflag ? F_SETLKW : F_SETLK;
+
+       for (;;) {
+               struct flock fl;
+               int rc;
+
+               fl.l_type = rw;
+               fl.l_whence = SEEK_SET;
+               fl.l_start = off;
+               fl.l_len = len;
+
+               rc = fcntl(fd, cmd, &fl);
+               if (rc == 0 || errno != EINTR)
+                       return rc;
+       }
+}
+
+/* Default byte-range unlock callback: drop an fcntl() lock, retrying on
+ * EINTR. */
+int ntdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *unused)
+{
+       for (;;) {
+               struct flock fl;
+               int rc;
+
+               fl.l_type = F_UNLCK;
+               fl.l_whence = SEEK_SET;
+               fl.l_start = off;
+               fl.l_len = len;
+
+               rc = fcntl(fd, F_SETLKW, &fl);
+               if (rc == 0 || errno != EINTR)
+                       return rc;
+       }
+}
+
+/* Low-level lock wrapper: remembers which pid took the first lock on the
+ * file (for fork detection) and keeps the lock statistics up to date. */
+static int lock(struct ntdb_context *ntdb,
+                     int rw, off_t off, off_t len, bool waitflag)
+{
+       int rc;
+
+       /* First lock on this file?  Record the owner pid. */
+       if (ntdb->file->num_lockrecs == 0
+           && ntdb->file->allrecord_lock.count == 0)
+               ntdb->file->locker = getpid();
+
+       ntdb->stats.lock_lowlevel++;
+       rc = ntdb->lock_fn(ntdb->file->fd, rw, off, len, waitflag,
+                          ntdb->lock_data);
+       if (!waitflag) {
+               ntdb->stats.lock_nonblock++;
+               if (rc != 0)
+                       ntdb->stats.lock_nonblock_fail++;
+       }
+       return rc;
+}
+
+/* Drop a byte-range lock via the pluggable unlock_fn (fcntl by default).
+ * The disabled block below cross-checks /proc/locks to verify that locks
+ * and unlocks were matched up correctly (Linux-only debugging aid). */
+static int unlock(struct ntdb_context *ntdb, int rw, off_t off, off_t len)
+{
+#if 0 /* Check they matched up locks and unlocks correctly. */
+       char line[80];
+       FILE *locks;
+       bool found = false;
+
+       locks = fopen("/proc/locks", "r");
+
+       while (fgets(line, 80, locks)) {
+               char *p;
+               int type, start, l;
+
+               /* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
+               p = strchr(line, ':') + 1;
+               if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
+                       continue;
+               p += strlen(" FLOCK  ADVISORY  ");
+               if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
+                       type = F_RDLCK;
+               else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
+                       type = F_WRLCK;
+               else
+                       abort();
+               p += 6;
+               if (atoi(p) != getpid())
+                       continue;
+               p = strchr(strchr(p, ' ') + 1, ' ') + 1;
+               start = atoi(p);
+               p = strchr(p, ' ') + 1;
+               if (strncmp(p, "EOF", 3) == 0)
+                       l = 0;
+               else
+                       l = atoi(p) - start + 1;
+
+               if (off == start) {
+                       if (len != l) {
+                               fprintf(stderr, "Len %u should be %u: %s",
+                                       (int)len, l, line);
+                               abort();
+                       }
+                       if (type != rw) {
+                               fprintf(stderr, "Type %s wrong: %s",
+                                       rw == F_RDLCK ? "READ" : "WRITE", line);
+                               abort();
+                       }
+                       found = true;
+                       break;
+               }
+       }
+
+       if (!found) {
+               fprintf(stderr, "Unlock on %u@%u not found!",
+                       (int)off, (int)len);
+               abort();
+       }
+
+       fclose(locks);
+#endif
+
+       return ntdb->unlock_fn(ntdb->file->fd, rw, off, len, ntdb->lock_data);
+}
+
+/* a byte range locking function - return 0 on success
+   this functions locks len bytes at the specified offset.
+
+   note that a len of zero means lock to end of file
+*/
+/* Take a byte-range lock; len of zero means lock to end of file.
+ * Returns NTDB_SUCCESS, or NTDB_ERR_LOCK (logged unless probing, or when
+ * errno is EAGAIN/EINTR, the expected non-blocking outcomes). */
+static enum NTDB_ERROR ntdb_brlock(struct ntdb_context *ntdb,
+                                int rw_type, ntdb_off_t offset, ntdb_off_t len,
+                                enum ntdb_lock_flags flags)
+{
+       int ret;
+
+       if (rw_type == F_WRLCK && (ntdb->flags & NTDB_RDONLY)) {
+               return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
+                                 "Write lock attempted on read-only database");
+       }
+
+       /* NTDB_NOLOCK databases pretend every lock succeeds. */
+       if (ntdb->flags & NTDB_NOLOCK) {
+               return NTDB_SUCCESS;
+       }
+
+       /* A 32 bit system cannot open a 64-bit file, but it could have
+        * expanded since then: check here. */
+       if ((size_t)(offset + len) != offset + len) {
+               return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
+                                 "ntdb_brlock: lock on giant offset %llu",
+                                 (long long)(offset + len));
+       }
+
+       ret = lock(ntdb, rw_type, offset, len, flags & NTDB_LOCK_WAIT);
+       if (ret != 0) {
+               /* Generic lock error. errno set by fcntl.
+                * EAGAIN is an expected return from non-blocking
+                * locks. */
+               if (!(flags & NTDB_LOCK_PROBE)
+                   && (errno != EAGAIN && errno != EINTR)) {
+                       ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+                                  "ntdb_brlock failed (fd=%d) at"
+                                  " offset %zu rw_type=%d flags=%d len=%zu:"
+                                  " %s",
+                                  ntdb->file->fd, (size_t)offset, rw_type,
+                                  flags, (size_t)len, strerror(errno));
+               }
+               return NTDB_ERR_LOCK;
+       }
+       return NTDB_SUCCESS;
+}
+
+/* Release a byte-range lock previously taken with ntdb_brlock().
+ * A no-op under NTDB_NOLOCK; refuses to touch locks after a fork. */
+static enum NTDB_ERROR ntdb_brunlock(struct ntdb_context *ntdb,
+                                  int rw_type, ntdb_off_t offset, size_t len)
+{
+       if (ntdb->flags & NTDB_NOLOCK)
+               return NTDB_SUCCESS;
+
+       if (!check_lock_pid(ntdb, "ntdb_brunlock", false))
+               return NTDB_ERR_LOCK;
+
+       if (unlock(ntdb, rw_type, offset, len) != -1)
+               return NTDB_SUCCESS;
+
+       return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+                         "ntdb_brunlock failed (fd=%d) at offset %zu"
+                         " rw_type=%d len=%zu: %s",
+                         ntdb->file->fd, (size_t)offset, rw_type,
+                         (size_t)len, strerror(errno));
+}
+
+/*
+  upgrade a read lock to a write lock. This needs to be handled in a
+  special way as some OSes (such as solaris) have too conservative
+  deadlock detection and claim a deadlock when progress can be
+  made. For those OSes we may loop for a while.
+*/
+enum NTDB_ERROR ntdb_allrecord_upgrade(struct ntdb_context *ntdb, off_t start)
+{
+       /* Bounded retries in case the OS keeps reporting spurious EDEADLK. */
+       int count = 1000;
+
+       if (!check_lock_pid(ntdb, "ntdb_transaction_prepare_commit", true))
+               return NTDB_ERR_LOCK;
+
+       /* A nested allrecord lock cannot be upgraded atomically. */
+       if (ntdb->file->allrecord_lock.count != 1) {
+               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+                                 "ntdb_allrecord_upgrade failed:"
+                                 " count %u too high",
+                                 ntdb->file->allrecord_lock.count);
+       }
+
+       /* off == 1 marks a lock taken "upgradable" by ntdb_allrecord_lock();
+        * 0 means it was not upgradable (or was already upgraded). */
+       if (ntdb->file->allrecord_lock.off != 1) {
+               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+                                 "ntdb_allrecord_upgrade failed:"
+                                 " already upgraded?");
+       }
+
+       if (ntdb->file->allrecord_lock.owner != ntdb) {
+               return owner_conflict(ntdb, "ntdb_allrecord_upgrade");
+       }
+
+       while (count--) {
+               struct timeval tv;
+               if (ntdb_brlock(ntdb, F_WRLCK, start, 0,
+                              NTDB_LOCK_WAIT|NTDB_LOCK_PROBE) == NTDB_SUCCESS) {
+                       ntdb->file->allrecord_lock.ltype = F_WRLCK;
+                       ntdb->file->allrecord_lock.off = 0;
+                       return NTDB_SUCCESS;
+               }
+               if (errno != EDEADLK) {
+                       break;
+               }
+               /* sleep for as short a time as we can - more portable than usleep() */
+               tv.tv_sec = 0;
+               tv.tv_usec = 1;
+               select(0, NULL, NULL, NULL, &tv);
+       }
+
+       if (errno != EAGAIN && errno != EINTR)
+               ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+                          "ntdb_allrecord_upgrade failed");
+       return NTDB_ERR_LOCK;
+}
+
+/* Find the lock record for "offset" in the per-file array, or NULL.
+ * If "owner" is non-NULL, a record held by a different context also
+ * yields NULL (owner mismatch). */
+static struct ntdb_lock *find_nestlock(struct ntdb_context *ntdb, ntdb_off_t offset,
+                                     const struct ntdb_context *owner)
+{
+       unsigned int idx;
+
+       for (idx = 0; idx < ntdb->file->num_lockrecs; idx++) {
+               struct ntdb_lock *lck = &ntdb->file->lockrecs[idx];
+
+               if (lck->off != offset)
+                       continue;
+               if (owner && lck->owner != owner)
+                       return NULL;
+               return lck;
+       }
+       return NULL;
+}
+
+/* Take the allrecord and open locks, run transaction recovery, then drop
+ * both.  NTDB_LOCK_NOCHECK prevents the lock paths recursing back into
+ * the recovery check. */
+enum NTDB_ERROR ntdb_lock_and_recover(struct ntdb_context *ntdb)
+{
+       enum NTDB_ERROR ecode;
+
+       /* Fix: diagnostic previously named "ntdb_transaction_prepare_commit"
+        * (copy-paste); report the actual caller. */
+       if (!check_lock_pid(ntdb, "ntdb_lock_and_recover", true))
+               return NTDB_ERR_LOCK;
+
+       ecode = ntdb_allrecord_lock(ntdb, F_WRLCK, NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK,
+                                  false);
+       if (ecode != NTDB_SUCCESS) {
+               return ecode;
+       }
+
+       ecode = ntdb_lock_open(ntdb, F_WRLCK, NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
+       if (ecode != NTDB_SUCCESS) {
+               /* Undo the allrecord lock before bailing out. */
+               ntdb_allrecord_unlock(ntdb, F_WRLCK);
+               return ecode;
+       }
+       ecode = ntdb_transaction_recover(ntdb);
+       ntdb_unlock_open(ntdb, F_WRLCK);
+       ntdb_allrecord_unlock(ntdb, F_WRLCK);
+
+       return ecode;
+}
+
+/* lock an offset in the database. */
+static enum NTDB_ERROR ntdb_nest_lock(struct ntdb_context *ntdb,
+                                   ntdb_off_t offset, int ltype,
+                                   enum ntdb_lock_flags flags)
+{
+       struct ntdb_lock *new_lck;
+       enum NTDB_ERROR ecode;
+
+       assert(offset <= (NTDB_HASH_LOCK_START + (1 << ntdb->hash_bits)
+                         + ntdb->file->map_size / 8));
+
+       if (ntdb->flags & NTDB_NOLOCK)
+               return NTDB_SUCCESS;
+
+       if (!check_lock_pid(ntdb, "ntdb_nest_lock", true)) {
+               return NTDB_ERR_LOCK;
+       }
+
+       ntdb->stats.locks++;
+
+       new_lck = find_nestlock(ntdb, offset, NULL);
+       if (new_lck) {
+               if (new_lck->owner != ntdb) {
+                       return owner_conflict(ntdb, "ntdb_nest_lock");
+               }
+
+               if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
+                       return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+                                         "ntdb_nest_lock:"
+                                         " offset %zu has read lock",
+                                         (size_t)offset);
+               }
+               /* Just increment the struct, posix locks don't stack. */
+               new_lck->count++;
+               return NTDB_SUCCESS;
+       }
+
+#if 0
+       if (ntdb->file->num_lockrecs
+           && offset >= NTDB_HASH_LOCK_START
+           && offset < NTDB_HASH_LOCK_START + NTDB_HASH_LOCK_RANGE) {
+               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+                                 "ntdb_nest_lock: already have a hash lock?");
+       }
+#endif
+       if (ntdb->file->lockrecs == NULL) {
+               new_lck = ntdb->alloc_fn(ntdb->file, sizeof(*ntdb->file->lockrecs),
+                                    ntdb->alloc_data);
+       } else {
+               new_lck = (struct ntdb_lock *)ntdb->expand_fn(
+                       ntdb->file->lockrecs,
+                       sizeof(*ntdb->file->lockrecs)
+                       * (ntdb->file->num_lockrecs+1),
+                       ntdb->alloc_data);
+       }
+       if (new_lck == NULL) {
+               return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
+                                 "ntdb_nest_lock:"
+                                 " unable to allocate %zu lock struct",
+                                 ntdb->file->num_lockrecs + 1);
+       }
+       ntdb->file->lockrecs = new_lck;
+
+       /* Since fcntl locks don't nest, we do a lock for the first one,
+          and simply bump the count for future ones */
+       ecode = ntdb_brlock(ntdb, ltype, offset, 1, flags);
+       if (ecode != NTDB_SUCCESS) {
+               return ecode;
+       }
+
+       /* First time we grab a lock, perhaps someone died in commit? */
+       if (!(flags & NTDB_LOCK_NOCHECK)
+           && ntdb->file->num_lockrecs == 0) {
+               ntdb_bool_err berr = ntdb_needs_recovery(ntdb);
+               if (berr != false) {
+                       ntdb_brunlock(ntdb, ltype, offset, 1);
+
+                       if (berr < 0)
+                               return NTDB_OFF_TO_ERR(berr);
+                       ecode = ntdb_lock_and_recover(ntdb);
+                       if (ecode == NTDB_SUCCESS) {
+                               ecode = ntdb_brlock(ntdb, ltype, offset, 1,
+                                                  flags);
+                       }
+                       if (ecode != NTDB_SUCCESS) {
+                               return ecode;
+                       }
+               }
+       }
+
+       ntdb->file->lockrecs[ntdb->file->num_lockrecs].owner = ntdb;
+       ntdb->file->lockrecs[ntdb->file->num_lockrecs].off = offset;
+       ntdb->file->lockrecs[ntdb->file->num_lockrecs].count = 1;
+       ntdb->file->lockrecs[ntdb->file->num_lockrecs].ltype = ltype;
+       ntdb->file->num_lockrecs++;
+
+       return NTDB_SUCCESS;
+}
+
+/* Release one nesting level of the lock at "off"; the kernel lock is only
+ * dropped when the last nested holder unlocks. */
+static enum NTDB_ERROR ntdb_nest_unlock(struct ntdb_context *ntdb,
+                                     ntdb_off_t off, int ltype)
+{
+       struct ntdb_lock *lck;
+       enum NTDB_ERROR ecode;
+
+       if (ntdb->flags & NTDB_NOLOCK)
+               return NTDB_SUCCESS;
+
+       /* Only unlock records that this context actually owns. */
+       lck = find_nestlock(ntdb, off, ntdb);
+       if ((lck == NULL) || (lck->count == 0)) {
+               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+                                 "ntdb_nest_unlock: no lock for %zu",
+                                 (size_t)off);
+       }
+
+       if (lck->count > 1) {
+               lck->count--;
+               return NTDB_SUCCESS;
+       }
+
+       /*
+        * This lock has count==1 left, so we need to unlock it in the
+        * kernel. We don't bother with decrementing the in-memory array
+        * element, we're about to overwrite it with the last array element
+        * anyway.
+        */
+       ecode = ntdb_brunlock(ntdb, ltype, off, 1);
+
+       /*
+        * Shrink the array by overwriting the element just unlocked with the
+        * last array element.
+        */
+       *lck = ntdb->file->lockrecs[--ntdb->file->num_lockrecs];
+
+       return ecode;
+}
+
+/* Take the global transaction lock (blocking). */
+enum NTDB_ERROR ntdb_transaction_lock(struct ntdb_context *ntdb, int ltype)
+{
+       enum NTDB_ERROR ecode;
+
+       ecode = ntdb_nest_lock(ntdb, NTDB_TRANSACTION_LOCK, ltype,
+                             NTDB_LOCK_WAIT);
+       return ecode;
+}
+
+/* Release the global transaction lock taken by ntdb_transaction_lock(). */
+void ntdb_transaction_unlock(struct ntdb_context *ntdb, int ltype)
+{
+       (void)ntdb_nest_unlock(ntdb, NTDB_TRANSACTION_LOCK, ltype);
+}
+
+/* We only need to lock individual bytes, but Linux merges consecutive locks
+ * so we lock in contiguous ranges.  Tries the whole range non-blocking
+ * first, then recursively halves it with blocking locks (avoids stalling
+ * other lockers while we wait for one contested byte). */
+static enum NTDB_ERROR ntdb_lock_gradual(struct ntdb_context *ntdb,
+                                      int ltype, enum ntdb_lock_flags flags,
+                                      ntdb_off_t off, ntdb_off_t len)
+{
+       enum NTDB_ERROR ecode;
+       /* The caller's flags, with blocking stripped for the fast attempt. */
+       enum ntdb_lock_flags nb_flags = (flags & ~NTDB_LOCK_WAIT);
+
+       if (len <= 1) {
+               /* 0 would mean to end-of-file... */
+               assert(len != 0);
+               /* Single hash.  Just do blocking lock. */
+               return ntdb_brlock(ntdb, ltype, off, len, flags);
+       }
+
+       /* First we try non-blocking. */
+       ecode = ntdb_brlock(ntdb, ltype, off, len, nb_flags);
+       if (ecode != NTDB_ERR_LOCK) {
+               return ecode;
+       }
+
+       /* Try locking first half, then second. */
+       ecode = ntdb_lock_gradual(ntdb, ltype, flags, off, len / 2);
+       if (ecode != NTDB_SUCCESS)
+               return ecode;
+
+       ecode = ntdb_lock_gradual(ntdb, ltype, flags,
+                                off + len / 2, len - len / 2);
+       if (ecode != NTDB_SUCCESS) {
+               /* Second half failed: undo the first half before returning. */
+               ntdb_brunlock(ntdb, ltype, off, len / 2);
+       }
+       return ecode;
+}
+
+/* lock/unlock entire database.  It can only be upgradable if you have some
+ * other way of guaranteeing exclusivity (ie. transaction write lock). */
+enum NTDB_ERROR ntdb_allrecord_lock(struct ntdb_context *ntdb, int ltype,
+                                 enum ntdb_lock_flags flags, bool upgradable)
+{
+       enum NTDB_ERROR ecode;
+       ntdb_bool_err berr;
+
+       if (ntdb->flags & NTDB_NOLOCK) {
+               return NTDB_SUCCESS;
+       }
+
+       if (!check_lock_pid(ntdb, "ntdb_allrecord_lock", true)) {
+               return NTDB_ERR_LOCK;
+       }
+
+       /* Already hold an allrecord lock: nest it if compatible. */
+       if (ntdb->file->allrecord_lock.count) {
+               if (ntdb->file->allrecord_lock.owner != ntdb) {
+                       return owner_conflict(ntdb, "ntdb_allrecord_lock");
+               }
+
+               /* A read request, or an existing write lock, covers us. */
+               if (ltype == F_RDLCK
+                   || ntdb->file->allrecord_lock.ltype == F_WRLCK) {
+                       ntdb->file->allrecord_lock.count++;
+                       return NTDB_SUCCESS;
+               }
+
+               /* a global lock of a different type exists */
+               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+                                 "ntdb_allrecord_lock: already have %s lock",
+                                 ntdb->file->allrecord_lock.ltype == F_RDLCK
+                                 ? "read" : "write");
+       }
+
+       if (ntdb_has_hash_locks(ntdb)) {
+               /* can't combine global and chain locks */
+               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+                                 "ntdb_allrecord_lock:"
+                                 " already have chain lock");
+       }
+
+       if (upgradable && ltype != F_RDLCK) {
+               /* ntdb error: you can't upgrade a write lock! */
+               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+                                 "ntdb_allrecord_lock:"
+                                 " can't upgrade a write lock");
+       }
+
+       ntdb->stats.locks++;
+again:
+       /* Lock hashes, gradually. */
+       ecode = ntdb_lock_gradual(ntdb, ltype, flags, NTDB_HASH_LOCK_START,
+                                 1 << ntdb->hash_bits);
+       if (ecode != NTDB_SUCCESS)
+               return ecode;
+
+       /* Lock free tables: there to end of file. */
+       ecode = ntdb_brlock(ntdb, ltype,
+                           NTDB_HASH_LOCK_START + (1 << ntdb->hash_bits),
+                           0, flags);
+       if (ecode != NTDB_SUCCESS) {
+               /* Roll back the hash locks taken above. */
+               ntdb_brunlock(ntdb, ltype, NTDB_HASH_LOCK_START,
+                             1 << ntdb->hash_bits);
+               return ecode;
+       }
+
+       ntdb->file->allrecord_lock.owner = ntdb;
+       ntdb->file->allrecord_lock.count = 1;
+       /* If it's upgradable, it's actually exclusive so we can treat
+        * it as a write lock. */
+       ntdb->file->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
+       /* off doubles as the "upgradable" marker: 1 = can be upgraded. */
+       ntdb->file->allrecord_lock.off = upgradable;
+
+       /* Now check for needing recovery. */
+       if (flags & NTDB_LOCK_NOCHECK)
+               return NTDB_SUCCESS;
+
+       berr = ntdb_needs_recovery(ntdb);
+       if (likely(berr == false))
+               return NTDB_SUCCESS;
+
+       /* Recovery needed: drop everything, recover, and retry from scratch. */
+       ntdb_allrecord_unlock(ntdb, ltype);
+       if (berr < 0)
+               return NTDB_OFF_TO_ERR(berr);
+       ecode = ntdb_lock_and_recover(ntdb);
+       if (ecode != NTDB_SUCCESS) {
+               return ecode;
+       }
+       goto again;
+}
+
+/* Take the NTDB_OPEN_LOCK byte (serialises open and recovery). */
+enum NTDB_ERROR ntdb_lock_open(struct ntdb_context *ntdb,
+                            int ltype, enum ntdb_lock_flags flags)
+{
+       enum NTDB_ERROR ecode;
+
+       ecode = ntdb_nest_lock(ntdb, NTDB_OPEN_LOCK, ltype, flags);
+       return ecode;
+}
+
+/* Release the NTDB_OPEN_LOCK byte taken by ntdb_lock_open(). */
+void ntdb_unlock_open(struct ntdb_context *ntdb, int ltype)
+{
+       (void)ntdb_nest_unlock(ntdb, NTDB_OPEN_LOCK, ltype);
+}
+
+/* Do we (this context) currently hold the open lock? */
+bool ntdb_has_open_lock(struct ntdb_context *ntdb)
+{
+       if (ntdb->flags & NTDB_NOLOCK)
+               return false;
+       return find_nestlock(ntdb, NTDB_OPEN_LOCK, ntdb) != NULL;
+}
+
+/* Take the file-expansion lock (blocking). */
+enum NTDB_ERROR ntdb_lock_expand(struct ntdb_context *ntdb, int ltype)
+{
+       /* Lock doesn't protect data, so don't check (we recurse if we do!) */
+       enum NTDB_ERROR ecode;
+
+       ecode = ntdb_nest_lock(ntdb, NTDB_EXPANSION_LOCK, ltype,
+                             NTDB_LOCK_WAIT | NTDB_LOCK_NOCHECK);
+       return ecode;
+}
+
+/* Release the file-expansion lock taken by ntdb_lock_expand(). */
+void ntdb_unlock_expand(struct ntdb_context *ntdb, int ltype)
+{
+       (void)ntdb_nest_unlock(ntdb, NTDB_EXPANSION_LOCK, ltype);
+}
+
+/* unlock entire db: drops one nesting level, releasing the kernel lock
+ * only when the count reaches zero. */
+void ntdb_allrecord_unlock(struct ntdb_context *ntdb, int ltype)
+{
+       if (ntdb->flags & NTDB_NOLOCK)
+               return;
+
+       if (ntdb->file->allrecord_lock.count == 0) {
+               ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+                          "ntdb_allrecord_unlock: not locked!");
+               return;
+       }
+
+       if (ntdb->file->allrecord_lock.owner != ntdb) {
+               ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+                          "ntdb_allrecord_unlock: not locked by us!");
+               return;
+       }
+
+       /* Upgradable locks are marked as write locks.
+        * (off != 0 means "taken upgradable": a caller releasing F_RDLCK is
+        * then also acceptable, since they asked for a read lock.) */
+       if (ntdb->file->allrecord_lock.ltype != ltype
+           && (!ntdb->file->allrecord_lock.off || ltype != F_RDLCK)) {
+               ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+                          "ntdb_allrecord_unlock: have %s lock",
+                          ntdb->file->allrecord_lock.ltype == F_RDLCK
+                          ? "read" : "write");
+               return;
+       }
+
+       /* Still nested?  Just drop one level. */
+       if (ntdb->file->allrecord_lock.count > 1) {
+               ntdb->file->allrecord_lock.count--;
+               return;
+       }
+
+       ntdb->file->allrecord_lock.count = 0;
+       ntdb->file->allrecord_lock.ltype = 0;
+
+       /* One range covers hashes and everything beyond (len 0 = to EOF). */
+       ntdb_brunlock(ntdb, ltype, NTDB_HASH_LOCK_START, 0);
+}
+
+/* Do we (this context) currently hold the expansion lock? */
+bool ntdb_has_expansion_lock(struct ntdb_context *ntdb)
+{
+       const struct ntdb_lock *lck;
+
+       lck = find_nestlock(ntdb, NTDB_EXPANSION_LOCK, ntdb);
+       return lck != NULL;
+}
+
+/* Is any per-chain (hash) byte in this file currently locked? */
+bool ntdb_has_hash_locks(struct ntdb_context *ntdb)
+{
+       const ntdb_off_t start = NTDB_HASH_LOCK_START;
+       const ntdb_off_t end = NTDB_HASH_LOCK_START + (1 << ntdb->hash_bits);
+       unsigned int idx;
+
+       for (idx = 0; idx < ntdb->file->num_lockrecs; idx++) {
+               ntdb_off_t off = ntdb->file->lockrecs[idx].off;
+
+               if (off >= start && off < end)
+                       return true;
+       }
+       return false;
+}
+
+/* Is any free-bucket byte (offsets strictly past the hash range) locked? */
+static bool ntdb_has_free_lock(struct ntdb_context *ntdb)
+{
+       const ntdb_off_t limit = NTDB_HASH_LOCK_START + (1 << ntdb->hash_bits);
+       unsigned int idx;
+
+       if (ntdb->flags & NTDB_NOLOCK)
+               return false;
+
+       for (idx = 0; idx < ntdb->file->num_lockrecs; idx++) {
+               if (ntdb->file->lockrecs[idx].off > limit)
+                       return true;
+       }
+       return false;
+}
+
+/* Lock one hash chain (bucket h).  A held allrecord lock of a compatible
+ * type already covers us; otherwise take the per-chain byte lock. */
+enum NTDB_ERROR ntdb_lock_hash(struct ntdb_context *ntdb,
+                              unsigned int h,
+                              int ltype)
+{
+       unsigned l = NTDB_HASH_LOCK_START + h;
+
+       assert(h < (1 << ntdb->hash_bits));
+
+       /* a allrecord lock allows us to avoid per chain locks */
+       if (ntdb->file->allrecord_lock.count) {
+               if (!check_lock_pid(ntdb, "ntdb_lock_hashes", true))
+                       return NTDB_ERR_LOCK;
+
+               if (ntdb->file->allrecord_lock.owner != ntdb)
+                       return owner_conflict(ntdb, "ntdb_lock_hashes");
+               /* Same type, or we only wanted a read lock: covered. */
+               if (ltype == ntdb->file->allrecord_lock.ltype
+                   || ltype == F_RDLCK) {
+                       return NTDB_SUCCESS;
+               }
+
+               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+                                 "ntdb_lock_hashes:"
+                                 " already have %s allrecordlock",
+                                 ntdb->file->allrecord_lock.ltype == F_RDLCK
+                                 ? "read" : "write");
+       }
+
+       /* Lock-ordering: chain locks must come before free/expansion locks. */
+       if (ntdb_has_free_lock(ntdb)) {
+               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+                                 "ntdb_lock_hashes: already have free lock");
+       }
+
+       if (ntdb_has_expansion_lock(ntdb)) {
+               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+                                 "ntdb_lock_hashes:"
+                                 " already have expansion lock");
+       }
+
+       return ntdb_nest_lock(ntdb, l, ltype, NTDB_LOCK_WAIT);
+}
+
+/* Unlock one hash chain; a no-op (after sanity checks) when covered by an
+ * allrecord lock. */
+enum NTDB_ERROR ntdb_unlock_hash(struct ntdb_context *ntdb,
+                                unsigned int h, int ltype)
+{
+       unsigned l = NTDB_HASH_LOCK_START + (h & ((1 << ntdb->hash_bits)-1));
+
+       if (ntdb->flags & NTDB_NOLOCK)
+               return 0;
+
+       /* a allrecord lock allows us to avoid per chain locks */
+       if (ntdb->file->allrecord_lock.count) {
+               /* Can't have had a write claim under a read-only
+                * allrecord lock. */
+               if (ntdb->file->allrecord_lock.ltype == F_RDLCK
+                   && ltype == F_WRLCK) {
+                       return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+                                         "ntdb_unlock_hashes RO allrecord!");
+               }
+               if (ntdb->file->allrecord_lock.owner != ntdb) {
+                       return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
+                                         "ntdb_unlock_hashes:"
+                                         " not locked by us!");
+               }
+               return NTDB_SUCCESS;
+       }
+
+       return ntdb_nest_unlock(ntdb, l, ltype);
+}
+
+/* Hash locks use NTDB_HASH_LOCK_START + <number of hash entries>..
+ * Then we begin; bucket offsets are sizeof(ntdb_len_t) apart, so we divide.
+ * The result is that on 32 bit systems we don't use lock values > 2^31 on
+ * files that are less than 4GB.
+ */
+static ntdb_off_t free_lock_off(const struct ntdb_context *ntdb,
+                               ntdb_off_t b_off)
+{
+       ntdb_off_t base = NTDB_HASH_LOCK_START + (1 << ntdb->hash_bits);
+
+       return base + b_off / sizeof(ntdb_off_t);
+}
+
+/* Write-lock the free-list bucket whose header is at b_off.  A held
+ * write-mode allrecord lock already covers us; a read-mode one cannot. */
+enum NTDB_ERROR ntdb_lock_free_bucket(struct ntdb_context *ntdb, ntdb_off_t b_off,
+                                   enum ntdb_lock_flags waitflag)
+{
+       assert(b_off >= sizeof(struct ntdb_header));
+
+       if (ntdb->flags & NTDB_NOLOCK)
+               return 0;
+
+       /* a allrecord lock allows us to avoid per chain locks */
+       if (ntdb->file->allrecord_lock.count) {
+               if (!check_lock_pid(ntdb, "ntdb_lock_free_bucket", true))
+                       return NTDB_ERR_LOCK;
+
+               if (ntdb->file->allrecord_lock.owner != ntdb) {
+                       return owner_conflict(ntdb, "ntdb_lock_free_bucket");
+               }
+
+               if (ntdb->file->allrecord_lock.ltype == F_WRLCK)
+                       return 0;
+               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+                                 "ntdb_lock_free_bucket with"
+                                 " read-only allrecordlock!");
+       }
+
+#if 0 /* FIXME */
+       if (ntdb_has_expansion_lock(ntdb)) {
+               return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
+                                 "ntdb_lock_free_bucket:"
+                                 " already have expansion lock");
+       }
+#endif
+
+       return ntdb_nest_lock(ntdb, free_lock_off(ntdb, b_off), F_WRLCK,
+                             waitflag);
+}
+
+/* Release a free-list bucket lock; a no-op when an allrecord lock covered
+ * the take in the first place. */
+void ntdb_unlock_free_bucket(struct ntdb_context *ntdb, ntdb_off_t b_off)
+{
+       if (ntdb->file->allrecord_lock.count != 0)
+               return;
+
+       (void)ntdb_nest_unlock(ntdb, free_lock_off(ntdb, b_off), F_WRLCK);
+}
+
+/* Public API: take an exclusive (write) lock over the whole database. */
+_PUBLIC_ enum NTDB_ERROR ntdb_lockall(struct ntdb_context *ntdb)
+{
+       return ntdb_allrecord_lock(ntdb, F_WRLCK, NTDB_LOCK_WAIT, false);
+}
+
+/* Public API: release the write lock taken by ntdb_lockall(). */
+_PUBLIC_ void ntdb_unlockall(struct ntdb_context *ntdb)
+{
+       ntdb_allrecord_unlock(ntdb, F_WRLCK);
+}
+
+/* Public API: take a shared (read) lock over the whole database. */
+_PUBLIC_ enum NTDB_ERROR ntdb_lockall_read(struct ntdb_context *ntdb)
+{
+       return ntdb_allrecord_lock(ntdb, F_RDLCK, NTDB_LOCK_WAIT, false);
+}
+
+/* Public API: release the read lock taken by ntdb_lockall_read(). */
+_PUBLIC_ void ntdb_unlockall_read(struct ntdb_context *ntdb)
+{
+       ntdb_allrecord_unlock(ntdb, F_RDLCK);
+}
+
+/* Drop every lock this context still holds (used when closing an ntdb). */
+void ntdb_lock_cleanup(struct ntdb_context *ntdb)
+{
+       unsigned int i;
+
+       /* We don't want to warn: they're allowed to close ntdb after fork. */
+       if (!check_lock_pid(ntdb, "ntdb_close", false))
+               return;
+
+       while (ntdb->file->allrecord_lock.count
+              && ntdb->file->allrecord_lock.owner == ntdb) {
+               ntdb_allrecord_unlock(ntdb, ntdb->file->allrecord_lock.ltype);
+       }
+
+       for (i=0; i<ntdb->file->num_lockrecs; i++) {
+               if (ntdb->file->lockrecs[i].owner == ntdb) {
+                       ntdb_nest_unlock(ntdb,
+                                       ntdb->file->lockrecs[i].off,
+                                       ntdb->file->lockrecs[i].ltype);
+                       /* ntdb_nest_unlock() swapped the last record into
+                        * slot i, so revisit this index. */
+                       i--;
+               }
+       }
+}
diff --git a/ccan/ntdb/man/ntdb.3.xml b/ccan/ntdb/man/ntdb.3.xml
new file mode 100644 (file)
index 0000000..3bac974
--- /dev/null
@@ -0,0 +1,132 @@
+<?xml version="1.0"?>
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
+<refentry>
+  <refmeta>
+    <refentrytitle>ntdb</refentrytitle>
+    <manvolnum>3</manvolnum>
+    <refmiscinfo class="source">Samba</refmiscinfo>
+    <refmiscinfo class="manual">System Administration tools</refmiscinfo>
+    <refmiscinfo class="version">4.1</refmiscinfo>
+  </refmeta>
+  <refnamediv>
+    <refname>ntdb</refname>
+<refpurpose>A not-so trivial keyword/data database system</refpurpose>
+  </refnamediv>
+  <refsynopsisdiv>
+<synopsis>#include &lt;ntdb.h&gt;</synopsis>
+  </refsynopsisdiv>
+  <refsect1><title>DESCRIPTION</title>
+    <para>
+      If you have previously used the tdb library from Samba, much of
+      this will seem familiar, but there are some API changes which a
+      compiler will warn you about if you simply replace 'tdb' with
+      'ntdb' in your code!  The on-disk format for ntdb is
+      incompatible with tdb.
+    </para>
+    <para>
+      tdb's API was based on gdbm, and ntdb continues this tradition,
+      with enhancements.  A differences guide is available in the text
+      file <filename>lib/ntdb/doc/TDB_porting.txt</filename> in the
+      SAMBA source tree.
+    </para>
+  </refsect1>
+  <refsect1><title>NTDB API OVERVIEW</title>
+    <para>
+      The complete API is documented in the ntdb.h header, which is
+      kept up-to-date and recommended reading.
+    </para>
+    <para>
+      Normal usage is to call ntdb_open() to create or open an ntdb
+      file.  ntdb_store() is used to add records, ntdb_fetch() is used
+      to fetch them.  Traversals are supported via callback
+      (ntdb_traverse()) or iteration (ntdb_firstkey() and
+      ntdb_nextkey()).  Transactions are supported for batching
+      updates or reads atomically, using ntdb_transaction_start() and
+      ntdb_transaction_commit().
+    </para>
+    <refsect2><title>Use With Talloc</title>
+      <para>
+       ntdb_open() takes an optional linked list of attributes:
+       in particular you can specify an alternate allocator (such as
+       talloc):
+      </para>
+      <programlisting>
+#include &lt;talloc.h&gt;
+#include &lt;ntdb.h&gt;
+
+static void *my_alloc(const void *owner, size_t len, void *priv)
+{
+    return talloc_size(owner, len);
+}
+
+static void *my_expand(void *old, size_t newlen, void *priv)
+{
+    return talloc_realloc_size(NULL, old, newlen);
+}
+
+static void my_free(void *old, void *priv)
+{
+    talloc_free(old);
+}
+
+/* This opens an ntdb file as a talloc object with given parent. */
+struct ntdb_context *ntdb_open_talloc(const void *parent,
+                                      const char *filename)
+{
+     struct ntdb_context *ntdb;
+     union ntdb_attribute alloc;
+
+     alloc.base.attr = NTDB_ATTRIBUTE_ALLOCATOR;
+     alloc.base.next = NULL;
+     alloc.alloc.alloc = my_alloc;
+     alloc.alloc.expand = my_expand;
+     alloc.alloc.free = my_free;
+
+     ntdb = ntdb_open(filename, NTDB_DEFAULT, O_RDWR|O_CREAT, 0600,
+                      &amp;alloc);
+     if (ntdb) {
+         talloc_steal(parent, ntdb);
+         talloc_set_name(ntdb, "%s", filename);
+     }
+     return ntdb;
+}
+</programlisting>
+    </refsect2>
+  </refsect1>
+  <refsect1><title>SEE ALSO</title>
+    <para>
+      <ulink url="http://tdb.samba.org/"/>
+    </para>
+  </refsect1>
+
+  <refsect1><title>AUTHOR</title>
+    <para> The original tdb software was created by Andrew Tridgell, and
+    is now developed by the
+      Samba Team as an Open Source project similar to the way the
+      Linux kernel is developed.  ntdb was derived from tdb, but mostly
+      rewritten by Rusty Russell.
+    </para>
+  </refsect1>
+
+  <refsect1><title>COPYRIGHT/LICENSE</title>
+    <para>
+      Copyright (C) Rusty Russell 2013, IBM Corporation
+    </para>
+    <para>
+      This program is free software; you can redistribute it and/or modify
+      it under the terms of the GNU Lesser General Public License as
+      published by the Free&n