From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 26 Aug 2010 03:22:59 +0000 (+0930)
Subject: tdb2: initial commit (doesn't work, still writing tests)
X-Git-Url: https://git.ozlabs.org/?a=commitdiff_plain;h=39f01834db9b6a21d076e67d1e3143ab99aaf43e;p=ccan-lca-2011.git

tdb2: initial commit (doesn't work, still writing tests)
---

diff --git a/ccan/tdb2/_info b/ccan/tdb2/_info
new file mode 100644
index 0000000..cd7412c
--- /dev/null
+++ b/ccan/tdb2/_info
@@ -0,0 +1,81 @@
+#include <string.h>
+#include <stdio.h>
+
+/**
+ * tdb2 - [[WORK IN PROGRESS!]] The trivial (64bit transactional) database
+ *
+ * The tdb2 module provides an efficient keyed data mapping (usually
+ * within a file). It supports transactions, so the contents of the
+ * database are reliable even across crashes.
+ *
+ * Example:
+ *	#include <ccan/tdb2/tdb2.h>
+ *	#include <ccan/str/str.h>
+ *	#include <err.h>
+ *	#include <fcntl.h>
+ *	#include <stdio.h>
+ *	#include <stdlib.h>
+ *	#include <string.h>
+ *
+ *	static void usage(const char *prog)
+ *	{
+ *		errx(1, "Usage: %s fetch <dbfile> <key>\n"
+ *		     "OR %s store <dbfile> <key> <data>", prog, prog);
+ *	}
+ *
+ *	int main(int argc, char *argv[])
+ *	{
+ *		struct tdb_context *tdb;
+ *		TDB_DATA key, value;
+ *
+ *		if (argc < 4)
+ *			usage(argv[0]);
+ *
+ *		tdb = tdb_open(argv[2], 1024, TDB_DEFAULT, O_CREAT|O_RDWR,
+ *			       0600);
+ *		if (!tdb)
+ *			err(1, "Opening %s", argv[2]);
+ *
+ *		key.dptr = (void *)argv[3];
+ *		key.dsize = strlen(argv[3]);
+ *
+ *		if (streq(argv[1], "fetch")) {
+ *			if (argc != 4)
+ *				usage(argv[0]);
+ *			value = tdb_fetch(tdb, key);
+ *			if (!value.dptr)
+ *				errx(1, "fetch %s: %s",
+ *				     argv[3], tdb_errorstr(tdb));
+ *			printf("%.*s\n", (int)value.dsize, (char *)value.dptr);
+ *			free(value.dptr);
+ *		} else if (streq(argv[1], "store")) {
+ *			if (argc != 5)
+ *				usage(argv[0]);
+ *			value.dptr = (void *)argv[4];
+ *			value.dsize = strlen(argv[4]);
+ *			if (tdb_store(tdb, key, value, 0) != 0)
+ *				errx(1, "store %s: %s",
+ *				     argv[3], tdb_errorstr(tdb));
+ *		} else
+ *			usage(argv[0]);
+ *
+ *		return 0;
+ *	}
+ *
+ * Maintainer: Rusty Russell <rusty@rustcorp.com.au>
+ *
+ * Author: Rusty Russell <rusty@rustcorp.com.au>
+ *
+ * Licence: LGPLv3 (or later)
+ */
+int main(int argc, char *argv[])
+{
+	if (argc != 2)
+		return 1;
+
+	if (strcmp(argv[1], "depends") == 0) {
+		printf("ccan/hash\n");
+		printf("ccan/likely\n");
+		printf("ccan/asearch\n");
+		return 0;
+	}
+
+	return 1;
+}
diff --git a/ccan/tdb2/check.c b/ccan/tdb2/check.c
new file mode 100644
index 0000000..f005a48
--- /dev/null
+++ b/ccan/tdb2/check.c
@@ -0,0 +1,411 @@
+ /*
+   Trivial Database 2: free list/block handling
+   Copyright (C) Rusty Russell 2010
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+#include <ccan/asearch/asearch.h>
+
+/* We keep an ordered array of offsets.
*/ +static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off) +{ + tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t)); + if (!new) + return false; + new[(*num)++] = off; + *arr = new; + return true; +} + +static bool check_header(struct tdb_context *tdb) +{ + uint64_t hash_test; + + hash_test = TDB_HASH_MAGIC; + hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test)); + if (tdb->header.hash_test != hash_test) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "check: hash test %llu should be %llu\n", + tdb->header.hash_test, hash_test); + return false; + } + if (strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "check: bad magic '%.*s'\n", + sizeof(tdb->header.magic_food), + tdb->header.magic_food); + return false; + } + if (tdb->header.v.hash_bits < INITIAL_HASH_BITS) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "check: bad hash bits %llu\n", + (long long)tdb->header.v.hash_bits); + return false; + } + if (tdb->header.v.zone_bits < INITIAL_ZONE_BITS) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "check: bad zone_bits %llu\n", + (long long)tdb->header.v.zone_bits); + return false; + } + if (tdb->header.v.free_buckets < INITIAL_FREE_BUCKETS) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "check: bad free_buckets %llu\n", + (long long)tdb->header.v.free_buckets); + return false; + } + if ((1ULL << tdb->header.v.zone_bits) * tdb->header.v.num_zones + < tdb->map_size) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "check: %llu zones size %llu don't cover %llu\n", + (long long)(1ULL << tdb->header.v.zone_bits), + (long long)tdb->header.v.num_zones, + (long long)tdb->map_size); + return false; + } + + /* We check hash_off and free_off later. */ + + /* Don't check reserved: they *can* be used later. */ + return true; +} + +static int off_cmp(const tdb_off_t *a, const tdb_off_t *b) +{ + /* Can overflow an int. */ + return a > b ? 1 + : a < b ? -1 + : 0; +} + +static bool check_hash_list(struct tdb_context *tdb, + tdb_off_t used[], + size_t num_used) +{ + struct tdb_used_record rec; + tdb_len_t hashlen, i, num_nonzero; + tdb_off_t h; + size_t num_found; + + hashlen = sizeof(tdb_off_t) << tdb->header.v.hash_bits; + + if (tdb_read_convert(tdb, tdb->header.v.hash_off - sizeof(rec), + &rec, sizeof(rec)) == -1) + return false; + + if (rec_data_length(&rec) != hashlen) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: Bad hash table length %llu vs %llu\n", + (long long)rec_data_length(&rec), + (long long)hashlen); + return false; + } + if (rec_key_length(&rec) != 0) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: Bad hash table key length %llu\n", + (long long)rec_key_length(&rec)); + return false; + } + if (rec_hash(&rec) != 0) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: Bad hash table hash value %llu\n", + (long long)rec_hash(&rec)); + return false; + } + + num_found = 0; + num_nonzero = 0; + for (i = 0, h = tdb->header.v.hash_off; + i < (1ULL << tdb->header.v.hash_bits); + i++, h += sizeof(tdb_off_t)) { + tdb_off_t off, *p, pos; + struct tdb_used_record rec; + uint64_t hash; + + off = tdb_read_off(tdb, h); + if (off == TDB_OFF_ERR) + return false; + if (!off) { + num_nonzero = 0; + continue; + } + /* FIXME: Check hash bits */ + p = asearch(&off, used, num_used, off_cmp); + if (!p) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: Invalid offset %llu in hash\n", + (long long)off); + return false; + } + /* Mark it invalid. 
*/ + *p ^= 1; + num_found++; + + if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1) + return false; + + /* Check it is hashed correctly. */ + hash = hash_record(tdb, off); + + /* Top bits must match header. */ + if (hash >> (64 - 11) != rec_hash(&rec)) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: Bad hash magic at offset %llu" + " (0x%llx vs 0x%llx)\n", + (long long)off, + (long long)hash, (long long)rec_hash(&rec)); + return false; + } + + /* It must be in the right place in hash array. */ + pos = hash & ((1ULL << tdb->header.v.hash_bits)-1); + if (pos < i - num_nonzero || pos > i) { + /* Could be wrap from end of array? FIXME: check? */ + if (i != num_nonzero) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: Bad hash position %llu at" + " offset %llu hash 0x%llx\n", + (long long)i, + (long long)off, + (long long)hash); + return false; + } + } + num_nonzero++; + } + + if (num_found != num_used) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: Not all entries are in hash\n"); + return false; + } + return true; +} + +static bool check_free(struct tdb_context *tdb, + tdb_off_t off, + const struct tdb_free_record *frec, + tdb_off_t prev, + tdb_off_t zone, unsigned int bucket) +{ + if (frec->magic != TDB_FREE_MAGIC) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: offset %llu bad magic 0x%llx\n", + (long long)off, (long long)frec->magic); + return false; + } + if (tdb->methods->oob(tdb, off + + frec->data_len-sizeof(struct tdb_used_record), + true)) + return false; + if (zone_of(tdb, off) != zone) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: offset %llu in wrong zone %llu vs %llu\n", + (long long)off, + (long long)zone, (long long)zone_of(tdb, off)); + return false; + } + if (size_to_bucket(tdb, frec->data_len) != bucket) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: offset %llu in wrong bucket %u vs %u\n", + (long long)off, + bucket, size_to_bucket(tdb, frec->data_len)); + return false; + } + if (prev != frec->prev) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: offset %llu bad prev %llu vs %llu\n", + (long long)off, + (long long)prev, (long long)frec->prev); + return false; + } + return true; +} + +static bool check_free_list(struct tdb_context *tdb, + tdb_off_t free[], + size_t num_free) +{ + struct tdb_used_record rec; + tdb_len_t freelen, i, j; + tdb_off_t h; + size_t num_found; + + freelen = sizeof(tdb_off_t) * tdb->header.v.num_zones + * (tdb->header.v.free_buckets + 1); + + if (tdb_read_convert(tdb, tdb->header.v.free_off - sizeof(rec), + &rec, sizeof(rec)) == -1) + return false; + + if (rec_data_length(&rec) != freelen) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: Bad free table length %llu vs %llu\n", + (long long)rec_data_length(&rec), + (long long)freelen); + return false; + } + if (rec_key_length(&rec) != 0) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: Bad free table key length %llu\n", + (long long)rec_key_length(&rec)); + return false; + } + if (rec_hash(&rec) != 0) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: Bad free table hash value %llu\n", + (long long)rec_hash(&rec)); + return false; + } + + num_found = 0; + h = tdb->header.v.free_off; + for (i = 0; i < tdb->header.v.num_zones; i++) { + for (j = 0; j <= tdb->header.v.free_buckets; + j++, h += sizeof(tdb_off_t)) { + tdb_off_t off, prev = 0, *p; + struct tdb_free_record f; + + for (off = tdb_read_off(tdb, h); off; off = f.next) { + if (off == 
TDB_OFF_ERR) + return false; + if (tdb_read_convert(tdb, off, &f, sizeof(f))) + return false; + if (!check_free(tdb, off, &f, prev, i, j)) + return false; + + /* FIXME: Check hash bits */ + p = asearch(&off, free, num_free, off_cmp); + if (!p) { + tdb->log(tdb, TDB_DEBUG_ERROR, + tdb->log_priv, + "tdb_check: Invalid offset" + " %llu in free table\n", + (long long)off); + return false; + } + /* Mark it invalid. */ + *p ^= 1; + num_found++; + prev = off; + } + } + } + if (num_found != num_free) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: Not all entries are in free table\n"); + return false; + } + return true; +} + +/* FIXME: call check() function. */ +int tdb_check(struct tdb_context *tdb, + int (*check)(TDB_DATA key, TDB_DATA data, void *private_data), + void *private_data) +{ + tdb_off_t *free = NULL, *used = NULL, off; + tdb_len_t len; + size_t num_free = 0, num_used = 0; + bool hash_found = false, free_found = false; + + if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0) + return -1; + + update_header(tdb); + + if (!check_header(tdb)) + goto fail; + + /* First we do a linear scan, checking all records. */ + for (off = sizeof(struct tdb_header); + off < tdb->map_size; + off += len) { + union { + struct tdb_used_record u; + struct tdb_free_record f; + } pad, *p; + p = tdb_get(tdb, off, &pad, sizeof(pad)); + if (!p) + goto fail; + if (p->f.magic == TDB_FREE_MAGIC) { + /* This record is free! */ + if (!append(&free, &num_free, off)) + goto fail; + len = sizeof(p->u) + p->f.data_len; + if (tdb->methods->oob(tdb, off + len, false)) + goto fail; + } else { + uint64_t klen, dlen, extra; + + /* This record is used! */ + if (rec_magic(&p->u) != TDB_MAGIC) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: Bad magic 0x%llx" + " at offset %llu\n", + (long long)rec_magic(&p->u), + (long long)off); + goto fail; + } + + if (!append(&used, &num_used, off)) + goto fail; + + klen = rec_key_length(&p->u); + dlen = rec_data_length(&p->u); + extra = rec_extra_padding(&p->u); + + len = sizeof(p->u) + klen + dlen + extra; + if (tdb->methods->oob(tdb, off + len, false)) + goto fail; + + if (off + sizeof(p->u) == tdb->header.v.hash_off) { + hash_found = true; + } else if (off + sizeof(p->u) + == tdb->header.v.free_off) { + free_found = true; + } + } + } + + if (!hash_found) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: hash table not found at %llu\n", + (long long)tdb->header.v.hash_off); + goto fail; + } + + if (!free_found) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_check: free table not found at %llu\n", + (long long)tdb->header.v.free_off); + goto fail; + } + + /* FIXME: Check key uniqueness? */ + if (!check_hash_list(tdb, used, num_used)) + goto fail; + + if (!check_free_list(tdb, free, num_free)) + goto fail; + + tdb_allrecord_unlock(tdb, F_RDLCK); + return true; + +fail: + tdb_allrecord_unlock(tdb, F_RDLCK); + return false; +} diff --git a/ccan/tdb2/doc/design-1.3.txt b/ccan/tdb2/doc/design-1.3.txt new file mode 100644 index 0000000..651ada0 --- /dev/null +++ b/ccan/tdb2/doc/design-1.3.txt @@ -0,0 +1,1050 @@ +TDB2: A Redesigning The Trivial DataBase + +Rusty Russell, IBM Corporation + +27-April-2010 + +Abstract + +The Trivial DataBase on-disk format is 32 bits; with usage cases +heading towards the 4G limit, that must change. This required +breakage provides an opportunity to revisit TDB's other design +decisions and reassess them. 
+
+1 Introduction
+
+The Trivial DataBase was originally written by Andrew Tridgell as
+a simple key/data pair storage system with the same API as dbm,
+but allowing multiple readers and writers while being small
+enough (< 1000 lines of C) to include in SAMBA. The simple design
+created in 1999 has proven surprisingly robust and performant,
+used in Samba versions 3 and 4 as well as numerous other
+projects. Its useful life was greatly increased by the
+(backwards-compatible!) addition of transaction support in 2005.
+
+The wider variety and greater demands of TDB-using code have led
+to some organic growth of the API, as well as some compromises in
+the implementation. None of these, by themselves, are seen as
+show-stoppers, but the cumulative effect is a loss of elegance
+over the initial, simple TDB implementation. Here is a table of
+the approximate number of lines of implementation code and number
+of API functions at the end of each year:
+
+
++-----------+----------------+--------------------------------+
+| Year End  | API Functions  | Lines of C Code Implementation |
++-----------+----------------+--------------------------------+
+| 1999      | 13             | 1195                           |
++-----------+----------------+--------------------------------+
+| 2000      | 24             | 1725                           |
++-----------+----------------+--------------------------------+
+| 2001      | 32             | 2228                           |
++-----------+----------------+--------------------------------+
+| 2002      | 35             | 2481                           |
++-----------+----------------+--------------------------------+
+| 2003      | 35             | 2552                           |
++-----------+----------------+--------------------------------+
+| 2004      | 40             | 2584                           |
++-----------+----------------+--------------------------------+
+| 2005      | 38             | 2647                           |
++-----------+----------------+--------------------------------+
+| 2006      | 52             | 3754                           |
++-----------+----------------+--------------------------------+
+| 2007      | 66             | 4398                           |
++-----------+----------------+--------------------------------+
+| 2008      | 71             | 4768                           |
++-----------+----------------+--------------------------------+
+| 2009      | 73             | 5715                           |
++-----------+----------------+--------------------------------+
+
+
+This review is an attempt to catalog and address all the known
+issues with TDB and create solutions which address the problems
+without significantly increasing complexity; all involved are far
+too aware of the dangers of second system syndrome in rewriting a
+successful project like this.
+
+2 API Issues
+
+2.1 tdb_open_ex Is Not Expandable
+
+The tdb_open() call was expanded to tdb_open_ex(), which added an
+optional hashing function and an optional logging function
+argument. Additional arguments to open would require the
+introduction of a tdb_open_ex2 call, etc.
+ +2.1.1 Proposed Solution + +tdb_open() will take a linked-list of attributes: + +enum tdb_attribute { + + TDB_ATTRIBUTE_LOG = 0, + + TDB_ATTRIBUTE_HASH = 1 + +}; + +struct tdb_attribute_base { + + enum tdb_attribute attr; + + union tdb_attribute *next; + +}; + +struct tdb_attribute_log { + + struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG +*/ + + tdb_log_func log_fn; + + void *log_private; + +}; + +struct tdb_attribute_hash { + + struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH +*/ + + tdb_hash_func hash_fn; + + void *hash_private; + +}; + +union tdb_attribute { + + struct tdb_attribute_base base; + + struct tdb_attribute_log log; + + struct tdb_attribute_hash hash; + +}; + +This allows future attributes to be added, even if this expands +the size of the union. + +2.2 tdb_traverse Makes Impossible Guarantees + +tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, +and it was thought that it was important to guarantee that all +records which exist at the start and end of the traversal would +be included, and no record would be included twice. + +This adds complexity (see[Reliable-Traversal-Adds]) and does not +work anyway for records which are altered (in particular, those +which are expanded may be effectively deleted and re-added behind +the traversal). + +2.2.1 Proposed Solution + +Abandon the guarantee. You will see every record if no changes +occur during your traversal, otherwise you will see some subset. +You can prevent changes by using a transaction or the locking +API. + +2.3 Nesting of Transactions Is Fraught + +TDB has alternated between allowing nested transactions and not +allowing them. Various paths in the Samba codebase assume that +transactions will nest, and in a sense they can: the operation is +only committed to disk when the outer transaction is committed. +There are two problems, however: + +1. Canceling the inner transaction will cause the outer + transaction commit to fail, and will not undo any operations + since the inner transaction began. This problem is soluble with + some additional internal code. + +2. An inner transaction commit can be cancelled by the outer + transaction. This is desirable in the way which Samba's + database initialization code uses transactions, but could be a + surprise to any users expecting a successful transaction commit + to expose changes to others. + +The current solution is to specify the behavior at tdb_open(), +with the default currently that nested transactions are allowed. +This flag can also be changed at runtime. + +2.3.1 Proposed Solution + +Given the usage patterns, it seems that the “least-surprise” +behavior of disallowing nested transactions should become the +default. Additionally, it seems the outer transaction is the only +code which knows whether inner transactions should be allowed, so +a flag to indicate this could be added to tdb_transaction_start. +However, this behavior can be simulated with a wrapper which uses +tdb_add_flags() and tdb_remove_flags(), so the API should not be +expanded for this relatively-obscure case. + +2.4 Incorrect Hash Function is Not Detected + +tdb_open_ex() allows the calling code to specify a different hash +function to use, but does not check that all other processes +accessing this tdb are using the same hash function. The result +is that records are missing from tdb_fetch(). + +2.4.1 Proposed Solution + +The header should contain an example hash result (eg. 
the hash of
+0xdeadbeef), and tdb_open_ex() should check that the given hash
+function produces the same answer, or fail the tdb_open call.
+
+2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+
+In response to scalability issues with the free list ([TDB-Freelist-Is]
+), two API workarounds have been incorporated in TDB:
+tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
+latter actually calls the former with an argument of “5”.
+
+This code allows deleted records to accumulate without putting
+them in the free list. On delete we iterate through each chain
+and free them in a batch if there are more than max_dead entries.
+These are never otherwise recycled except as a side-effect of a
+tdb_repack.
+
+2.5.1 Proposed Solution
+
+With the scalability problems of the freelist solved, this API
+can be removed. The TDB_VOLATILE flag may still be useful as a
+hint that store and delete of records will be at least as common
+as fetch, in order to allow some internal tuning, but initially
+will become a no-op.
+
+2.6 TDB Files Cannot Be Opened Multiple Times
+ In The Same Process
+
+No process can open the same TDB twice; we check and disallow it.
+This is an unfortunate side-effect of fcntl locks, which operate
+on a per-file rather than per-file-descriptor basis, and do not
+nest. Thus, closing any file descriptor on a file clears all the
+locks obtained by this process, even if they were placed using a
+different file descriptor!
+
+Note that even if this were solved, deadlock could occur if
+operations were nested: this is a more manageable programming
+error in most cases.
+
+2.6.1 Proposed Solution
+
+We could lobby POSIX to fix the perverse rules, or at least lobby
+Linux to violate them so that the most common implementation does
+not have this restriction. This would be a generally good idea
+for other fcntl lock users.
+
+Samba uses a wrapper which hands out the same tdb_context to
+multiple callers if this happens, and does simple reference
+counting. We should do this inside the tdb library, which already
+emulates lock nesting internally; it would need to recognize when
+deadlock occurs within a single process. This would create a new
+failure mode for tdb operations (while we currently handle
+locking failures, they are impossible in normal use and a process
+encountering them can do little but give up).
+
+I do not see benefit in an additional tdb_open flag to indicate
+whether re-opening is allowed, though there may be some benefit
+in adding a call to detect when a tdb_context is shared, to allow
+others to create such an API.
+
+2.7 TDB API Is Not POSIX Thread-safe
+
+The TDB API uses an error code which can be queried after an
+operation to determine what went wrong. This programming model
+does not work with threads, unless specific additional guarantees
+are given by the implementation. In addition, even
+otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
+).
+
+2.7.1 Proposed Solution
+
+Rearchitecting the API to include a tdb_errcode pointer would be
+a great deal of churn; we would do better to guarantee that the
+tdb_errcode is per-thread so the current programming model can be
+maintained.
+
+This requires dynamic per-thread allocations, which is awkward
+with POSIX threads (pthread_key_create space is limited and we
+cannot simply allocate a key for every TDB).
+
+Internal locking is required to make sure that fcntl locks do not
+overlap between threads, and also that the global list of tdbs is
+maintained.
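+
+As a rough sketch only (the names tdb_errcode_key and tdb_errcode()
+are invented for illustration, not settled API), the per-thread
+error slot could be kept in POSIX thread-specific data, with one
+key shared by every tdb since key space is limited:
+
+#include <pthread.h>
+#include <stdlib.h>
+
+static pthread_key_t tdb_errcode_key;
+static pthread_once_t tdb_errcode_once = PTHREAD_ONCE_INIT;
+
+static void tdb_errcode_init(void)
+{
+        /* free() each thread's slot when that thread exits. */
+        pthread_key_create(&tdb_errcode_key, free);
+}
+
+static int *tdb_errcode(void)
+{
+        int *e;
+
+        pthread_once(&tdb_errcode_once, tdb_errcode_init);
+        e = pthread_getspecific(tdb_errcode_key);
+        if (!e) {
+                /* First use in this thread: allocate its slot
+                 * (allocation-failure handling elided here). */
+                e = calloc(1, sizeof(*e));
+                pthread_setspecific(tdb_errcode_key, e);
+        }
+        return e;
+}
+
+The library would then record failures via *tdb_errcode() = code,
+and the existing error query would simply read it back, so the
+current programming model survives unchanged.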
+ +The aim is that building tdb with -DTDB_PTHREAD will result in a +pthread-safe version of the library, and otherwise no overhead +will exist. + +2.8 *_nonblock Functions And *_mark Functions Expose + Implementation + +CTDB[footnote: +Clustered TDB, see http://ctdb.samba.org +] wishes to operate on TDB in a non-blocking manner. This is +currently done as follows: + +1. Call the _nonblock variant of an API function (eg. + tdb_lockall_nonblock). If this fails: + +2. Fork a child process, and wait for it to call the normal + variant (eg. tdb_lockall). + +3. If the child succeeds, call the _mark variant to indicate we + already have the locks (eg. tdb_lockall_mark). + +4. Upon completion, tell the child to release the locks (eg. + tdb_unlockall). + +5. Indicate to tdb that it should consider the locks removed (eg. + tdb_unlockall_mark). + +There are several issues with this approach. Firstly, adding two +new variants of each function clutters the API for an obscure +use, and so not all functions have three variants. Secondly, it +assumes that all paths of the functions ask for the same locks, +otherwise the parent process will have to get a lock which the +child doesn't have under some circumstances. I don't believe this +is currently the case, but it constrains the implementation. + +2.8.1 Proposed Solution + +Implement a hook for locking methods, so that the caller can +control the calls to create and remove fcntl locks. In this +scenario, ctdbd would operate as follows: + +1. Call the normal API function, eg tdb_lockall(). + +2. When the lock callback comes in, check if the child has the + lock. Initially, this is always false. If so, return 0. + Otherwise, try to obtain it in non-blocking mode. If that + fails, return EWOULDBLOCK. + +3. Release locks in the unlock callback as normal. + +4. If tdb_lockall() fails, see if we recorded a lock failure; if + so, call the child to repeat the operation. + +5. The child records what locks it obtains, and returns that + information to the parent. + +6. When the child has succeeded, goto 1. + +This is flexible enough to handle any potential locking scenario, +even when lock requirements change. It can be optimized so that +the parent does not release locks, just tells the child which +locks it doesn't need to obtain. + +It also keeps the complexity out of the API, and in ctdbd where +it is needed. + +2.9 tdb_chainlock Functions Expose Implementation + +tdb_chainlock locks some number of records, including the record +indicated by the given key. This gave atomicity guarantees; +no-one can start a transaction, alter, read or delete that key +while the lock is held. + +It also makes the same guarantee for any other key in the chain, +which is an internal implementation detail and potentially a +cause for deadlock. + +2.9.1 Proposed Solution + +None. It would be nice to have an explicit single entry lock +which effected no other keys. Unfortunately, this won't work for +an entry which doesn't exist. Thus while chainlock may be +implemented more efficiently for the existing case, it will still +have overlap issues with the non-existing case. So it is best to +keep the current (lack of) guarantee about which records will be +effected to avoid constraining our implementation. + +2.10 Signal Handling is Not Race-Free + +The tdb_setalarm_sigptr() call allows the caller's signal handler +to indicate that the tdb locking code should return with a +failure, rather than trying again when a signal is received (and +errno == EAGAIN). 
This is usually used to implement timeouts. + +Unfortunately, this does not work in the case where the signal is +received before the tdb code enters the fcntl() call to place the +lock: the code will sleep within the fcntl() code, unaware that +the signal wants it to exit. In the case of long timeouts, this +does not happen in practice. + +2.10.1 Proposed Solution + +The locking hooks proposed in[Proposed-Solution-locking-hook] +would allow the user to decide on whether to fail the lock +acquisition on a signal. This allows the caller to choose their +own compromise: they could narrow the race by checking +immediately before the fcntl call.[footnote: +It may be possible to make this race-free in some implementations +by having the signal handler alter the struct flock to make it +invalid. This will cause the fcntl() lock call to fail with +EINVAL if the signal occurs before the kernel is entered, +otherwise EAGAIN. +] + +2.11 The API Uses Gratuitous Typedefs, Capitals + +typedefs are useful for providing source compatibility when types +can differ across implementations, or arguably in the case of +function pointer definitions which are hard for humans to parse. +Otherwise it is simply obfuscation and pollutes the namespace. + +Capitalization is usually reserved for compile-time constants and +macros. + + TDB_CONTEXT There is no reason to use this over 'struct + tdb_context'; the definition isn't visible to the API user + anyway. + + TDB_DATA There is no reason to use this over struct TDB_DATA; + the struct needs to be understood by the API user. + + struct TDB_DATA This would normally be called 'struct + tdb_data'. + + enum TDB_ERROR Similarly, this would normally be enum + tdb_error. + +2.11.1 Proposed Solution + +None. Introducing lower case variants would please pedants like +myself, but if it were done the existing ones should be kept. +There is little point forcing a purely cosmetic change upon tdb +users. + +2.12 tdb_log_func Doesn't Take The + Private Pointer + +For API compatibility reasons, the logging function needs to call +tdb_get_logging_private() to retrieve the pointer registered by +the tdb_open_ex for logging. + +2.12.1 Proposed Solution + +It should simply take an extra argument, since we are prepared to +break the API/ABI. + +2.13 Various Callback Functions Are Not Typesafe + +The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take] + is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read +and tdb_check all take void * and must internally convert it to +the argument type they were expecting. + +If this type changes, the compiler will not produce warnings on +the callers, since it only sees void *. + +2.13.1 Proposed Solution + +With careful use of macros, we can create callback functions +which give a warning when used on gcc and the types of the +callback and its private argument differ. Unsupported compilers +will not give a warning, which is no worse than now. In addition, +the callbacks become clearer, as they need not use void * for +their parameter. + +See CCAN's typesafe_cb module at +http://ccan.ozlabs.org/info/typesafe_cb.html + +2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, + tdb_reopen_all Problematic + +The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB +file should be cleared if the caller discovers it is the only +process with the TDB open. 
However, if any caller does not +specify TDB_CLEAR_IF_FIRST it will not be detected, so will have +the TDB erased underneath them (usually resulting in a crash). + +There is a similar issue on fork(); if the parent exits (or +otherwise closes the tdb) before the child calls tdb_reopen_all() +to establish the lock used to indicate the TDB is opened by +someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe +it alone has opened the TDB and will erase it. + +2.14.1 Proposed Solution + +Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but +see [TDB_CLEAR_IF_FIRST-Imposes-Performance]. + +3 Performance And Scalability Issues + +3.1 TDB_CLEAR_IF_FIRST + Imposes Performance Penalty + +When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is +placed at offset 4 (aka. the ACTIVE_LOCK). While these locks +never conflict in normal tdb usage, they do add substantial +overhead for most fcntl lock implementations when the kernel +scans to detect if a lock conflict exists. This is often a single +linked list, making the time to acquire and release a fcntl lock +O(N) where N is the number of processes with the TDB open, not +the number actually doing work. + +In a Samba server it is common to have huge numbers of clients +sitting idle, and thus they have weaned themselves off the +TDB_CLEAR_IF_FIRST flag.[footnote: +There is a flag to tdb_reopen_all() which is used for this +optimization: if the parent process will outlive the child, the +child does not need the ACTIVE_LOCK. This is a workaround for +this very performance issue. +] + +3.1.1 Proposed Solution + +Remove the flag. It was a neat idea, but even trivial servers +tend to know when they are initializing for the first time and +can simply unlink the old tdb at that point. + +3.2 TDB Files Have a 4G Limit + +This seems to be becoming an issue (so much for “trivial”!), +particularly for ldb. + +3.2.1 Proposed Solution + +A new, incompatible TDB format which uses 64 bit offsets +internally rather than 32 bit as now. For simplicity of endian +conversion (which TDB does on the fly if required), all values +will be 64 bit on disk. In practice, some upper bits may be used +for other purposes, but at least 56 bits will be available for +file offsets. + +tdb_open() will automatically detect the old version, and even +create them if TDB_VERSION6 is specified to tdb_open. + +32 bit processes will still be able to access TDBs larger than 4G +(assuming that their off_t allows them to seek to 64 bits), they +will gracefully fall back as they fail to mmap. This can happen +already with large TDBs. + +Old versions of tdb will fail to open the new TDB files (since 28 +August 2009, commit 398d0c29290: prior to that any unrecognized +file format would be erased and initialized as a fresh tdb!) + +3.3 TDB Records Have a 4G Limit + +This has not been a reported problem, and the API uses size_t +which can be 64 bit on 64 bit platforms. However, other limits +may have made such an issue moot. + +3.3.1 Proposed Solution + +Record sizes will be 64 bit, with an error returned on 32 bit +platforms which try to access such records (the current +implementation would return TDB_ERR_OOM in a similar case). It +seems unlikely that 32 bit keys will be a limitation, so the +implementation may not support this (see [sub:Records-Incur-A]). + +3.4 Hash Size Is Determined At TDB Creation Time + +TDB contains a number of hash chains in the header; the number is +specified at creation time, and defaults to 131. 
This is such a +bottleneck on large databases (as each hash chain gets quite +long), that LDB uses 10,000 for this hash. In general it is +impossible to know what the 'right' answer is at database +creation time. + +3.4.1 Proposed Solution + +After comprehensive performance testing on various scalable hash +variants[footnote: +http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 +This was annoying because I was previously convinced that an +expanding tree of hashes would be very close to optimal. +], it became clear that it is hard to beat a straight linear hash +table which doubles in size when it reaches saturation. There are +three details which become important: + +1. On encountering a full bucket, we use the next bucket. + +2. Extra hash bits are stored with the offset, to reduce + comparisons. + +3. A marker entry is used on deleting an entry. + +The doubling of the table must be done under a transaction; we +will not reduce it on deletion, so it will be an unusual case. It +will either be placed at the head (other entries will be moved +out the way so we can expand). We could have a pointer in the +header to the current hashtable location, but that pointer would +have to be read frequently to check for hashtable moves. + +The locking for this is slightly more complex than the chained +case; we currently have one lock per bucket, and that means we +would need to expand the lock if we overflow to the next bucket. +The frequency of such collisions will effect our locking +heuristics: we can always lock more buckets than we need. + +One possible optimization is to only re-check the hash size on an +insert or a lookup miss. + +3.5 TDB Freelist Is Highly Contended + +TDB uses a single linked list for the free list. Allocation +occurs as follows, using heuristics which have evolved over time: + +1. Get the free list lock for this whole operation. + +2. Multiply length by 1.25, so we always over-allocate by 25%. + +3. Set the slack multiplier to 1. + +4. Examine the current freelist entry: if it is > length but < + the current best case, remember it as the best case. + +5. Multiply the slack multiplier by 1.05. + +6. If our best fit so far is less than length * slack multiplier, + return it. The slack will be turned into a new free record if + it's large enough. + +7. Otherwise, go onto the next freelist entry. + +Deleting a record occurs as follows: + +1. Lock the hash chain for this whole operation. + +2. Walk the chain to find the record, keeping the prev pointer + offset. + +3. If max_dead is non-zero: + + (a) Walk the hash chain again and count the dead records. + + (b) If it's more than max_dead, bulk free all the dead ones + (similar to steps 4 and below, but the lock is only obtained + once). + + (c) Simply mark this record as dead and return. + +4. Get the free list lock for the remainder of this operation. + +5. Examine the following block to see if it is + free; if so, enlarge the current block and remove that block + from the free list. This was disabled, as removal from the free + list was O(entries-in-free-list). + +6. Examine the preceeding block to see if it is free: for this + reason, each block has a 32-bit tailer which indicates its + length. If it is free, expand it to cover our new block and + return. + +7. Otherwise, prepend ourselves to the free list. 
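+
+For illustration, the allocation heuristic above (over-allocate by
+25%, then best-fit with a growing slack multiplier) amounts to the
+sketch below. The helpers freelist_first(), freelist_next() and
+entry_len() are invented names for this sketch; the real code walks
+on-disk offsets while holding the free list lock:
+
+extern tdb_off_t freelist_first(void);
+extern tdb_off_t freelist_next(tdb_off_t off);
+extern tdb_off_t entry_len(tdb_off_t off);
+
+tdb_off_t alloc_best_fit(tdb_off_t want)
+{
+        tdb_off_t off, best = 0, best_len = (tdb_off_t)-1;
+        double slack = 1.0;
+
+        want += want / 4;       /* step 2: over-allocate by 25% */
+
+        for (off = freelist_first(); off; off = freelist_next(off)) {
+                tdb_off_t len = entry_len(off);
+
+                /* step 4: big enough, and the best fit so far? */
+                if (len >= want && len < best_len) {
+                        best = off;
+                        best_len = len;
+                }
+                slack *= 1.05;  /* step 5: loosen the target fit */
+
+                /* step 6: take the best fit once it is within the
+                 * slack; any large-enough excess becomes a new
+                 * free record. */
+                if (best && best_len < want * slack)
+                        return best;
+        }
+        return best;            /* 0 means no fit: expand the file */
+}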
+
+Disabling right-merging (step [right-merging]) causes
+fragmentation; the other heuristics proved insufficient to
+address this, so the final answer was that when we expand the TDB
+file inside a transaction commit, we repack the entire tdb.
+
+The single list lock limits our allocation rate; due to the other
+issues this is not currently seen as a bottleneck.
+
+3.5.1 Proposed Solution
+
+The first step is to remove all the current heuristics, as they
+obviously interact, then examine them once the lock contention is
+addressed.
+
+The free list must be split to reduce contention. Assuming
+perfect free merging, we can at most have 1 free list entry for
+each entry. This implies that the number of free lists is related
+to the size of the hash table, but as it is rare to walk a large
+number of free list entries we can use far fewer, say 1/32 of the
+number of hash buckets.
+
+There are various benefits in using per-size free lists (see
+[sub:TDB-Becomes-Fragmented]) but it's not clear this would
+reduce contention in the common case where all processes are
+allocating/freeing the same size. Thus we almost certainly need
+to divide in other ways: the most obvious is to divide the file
+into zones, and use a free list (or set of free lists) for each.
+This approximates address ordering.
+
+Note that this means we need to split the free lists when we
+expand the file; this is probably acceptable when we double the
+hash table size, since that is such an expensive operation
+already. In the case of increasing the file size, there is an
+optimization we can use: if we use M in the formula above as the
+file size rounded up to the next power of 2, we only need to
+reshuffle free lists when the file size crosses a power of 2
+boundary, and reshuffling the free lists is trivial: we simply
+merge every consecutive pair of free lists.
+
+The basic algorithm is as follows. Freeing is simple:
+
+1. Identify the correct zone.
+
+2. Lock the corresponding list.
+
+3. Re-check the zone (we didn't have a lock, sizes could have
+   changed): relock if necessary.
+
+4. Place the freed entry in the list for that zone.
+
+Allocation is a little more complicated, as we perform delayed
+coalescing at this point:
+
+1. Pick a zone: either the zone we last freed into, or one based
+   on a “random” number.
+
+2. Lock the corresponding list.
+
+3. Re-check the zone: relock if necessary.
+
+4. If the top entry is large enough, remove it from the list and
+   return it.
+
+5. Otherwise, coalesce entries in the list.
+
+ (a)
+
+ (b)
+
+ (c)
+
+ (d)
+
+6. If there was no entry large enough, unlock the list and try
+   the next zone.
+
+7.
+
+8.
+
+9. If no zone satisfies, expand the file.
+
+This optimizes rapid insert/delete of free list entries by not
+coalescing them all the time. First-fit address ordering seems to
+be fairly good for keeping fragmentation low (see
+[sub:TDB-Becomes-Fragmented]). Note that address ordering does
+not need a tailer to coalesce, though if we needed one we could
+have one cheaply: see [sub:Records-Incur-A].
+
+I anticipate that the number of entries in each free zone would
+be small, but it might be worth using one free entry to hold
+pointers to the others for cache efficiency.
+
+3.6 TDB Becomes Fragmented
+
+Much of this is a result of allocation strategy[footnote:
+The Memory Fragmentation Problem: Solved?
Johnstone & Wilson 1995 +ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps +] and deliberate hobbling of coalescing; internal fragmentation +(aka overallocation) is deliberately set at 25%, and external +fragmentation is only cured by the decision to repack the entire +db when a transaction commit needs to enlarge the file. + +3.6.1 Proposed Solution + +The 25% overhead on allocation works in practice for ldb because +indexes tend to expand by one record at a time. This internal +fragmentation can be resolved by having an “expanded” bit in the +header to note entries that have previously expanded, and +allocating more space for them. + +There are is a spectrum of possible solutions for external +fragmentation: one is to use a fragmentation-avoiding allocation +strategy such as best-fit address-order allocator. The other end +of the spectrum would be to use a bump allocator (very fast and +simple) and simply repack the file when we reach the end. + +There are three problems with efficient fragmentation-avoiding +allocators: they are non-trivial, they tend to use a single free +list for each size, and there's no evidence that tdb allocation +patterns will match those recorded for general allocators (though +it seems likely). + +Thus we don't spend too much effort on external fragmentation; we +will be no worse than the current code if we need to repack on +occasion. More effort is spent on reducing freelist contention, +and reducing overhead. + +3.7 Records Incur A 28-Byte Overhead + +Each TDB record has a header as follows: + +struct tdb_record { + + tdb_off_t next; /* offset of the next record in the list +*/ + + tdb_len_t rec_len; /* total byte length of record */ + + tdb_len_t key_len; /* byte length of key */ + + tdb_len_t data_len; /* byte length of data */ + + uint32_t full_hash; /* the full 32 bit hash of the key */ + + uint32_t magic; /* try to catch errors */ + + /* the following union is implied: + + union { + + char record[rec_len]; + + struct { + + char key[key_len]; + + char data[data_len]; + + } + + uint32_t totalsize; (tailer) + + } + + */ + +}; + +Naively, this would double to a 56-byte overhead on a 64 bit +implementation. + +3.7.1 Proposed Solution + +We can use various techniques to reduce this for an allocated +block: + +1. The 'next' pointer is not required, as we are using a flat + hash table. + +2. 'rec_len' can instead be expressed as an addition to key_len + and data_len (it accounts for wasted or overallocated length in + the record). Since the record length is always a multiple of 8, + we can conveniently fit it in 32 bits (representing up to 35 + bits). + +3. 'key_len' and 'data_len' can be reduced. I'm unwilling to + restrict 'data_len' to 32 bits, but instead we can combine the + two into one 64-bit field and using a 5 bit value which + indicates at what bit to divide the two. Keys are unlikely to + scale as fast as data, so I'm assuming a maximum key size of 32 + bits. + +4. 'full_hash' is used to avoid a memcmp on the “miss” case, but + this is diminishing returns after a handful of bits (at 10 + bits, it reduces 99.9% of false memcmp). As an aside, as the + lower bits are already incorporated in the hash table + resolution, the upper bits should be used here. + +5. 'magic' does not need to be enlarged: it currently reflects + one of 5 values (used, free, dead, recovery, and + unused_recovery). It is useful for quick sanity checking + however, and should not be eliminated. + +6. 
'tailer' is only used to coalesce free blocks (so a block to + the right can find the header to check if this block is free). + This can be replaced by a single 'free' bit in the header of + the following block (and the tailer only exists in free + blocks).[footnote: +This technique from Thomas Standish. Data Structure Techniques. +Addison-Wesley, Reading, Massachusetts, 1980. +] The current proposed coalescing algorithm doesn't need this, + however. + +This produces a 16 byte used header like this: + +struct tdb_used_record { + + uint32_t magic : 16, + + prev_is_free: 1, + + key_data_divide: 5, + + top_hash: 10; + + uint32_t extra_octets; + + uint64_t key_and_data_len; + +}; + +And a free record like this: + +struct tdb_free_record { + + uint32_t free_magic; + + uint64_t total_length; + + ... + + uint64_t tailer; + +}; + + + +3.8 Transaction Commit Requires 4 fdatasync + +The current transaction algorithm is: + +1. write_recovery_data(); + +2. sync(); + +3. write_recovery_header(); + +4. sync(); + +5. overwrite_with_new_data(); + +6. sync(); + +7. remove_recovery_header(); + +8. sync(); + +On current ext3, each sync flushes all data to disk, so the next +3 syncs are relatively expensive. But this could become a +performance bottleneck on other filesystems such as ext4. + +3.8.1 Proposed Solution + + + + + + + + + +Neil Brown points out that this is overzealous, and only one sync +is needed: + +1. Bundle the recovery data, a transaction counter and a strong + checksum of the new data. + +2. Strong checksum that whole bundle. + +3. Store the bundle in the database. + +4. Overwrite the oldest of the two recovery pointers in the + header (identified using the transaction counter) with the + offset of this bundle. + +5. sync. + +6. Write the new data to the file. + +Checking for recovery means identifying the latest bundle with a +valid checksum and using the new data checksum to ensure that it +has been applied. This is more expensive than the current check, +but need only be done at open. For running databases, a separate +header field can be used to indicate a transaction in progress; +we need only check for recovery if this is set. + +3.9 TDB Does Not Have Snapshot Support + +3.9.1 Proposed Solution + +None. At some point you say “use a real database”. + +But as a thought experiment, if we implemented transactions to +only overwrite free entries (this is tricky: there must not be a +header in each entry which indicates whether it is free, but use +of presence in metadata elsewhere), and a pointer to the hash +table, we could create an entirely new commit without destroying +existing data. Then it would be easy to implement snapshots in a +similar way. + +This would not allow arbitrary changes to the database, such as +tdb_repack does, and would require more space (since we have to +preserve the current and future entries at once). If we used hash +trees rather than one big hash table, we might only have to +rewrite some sections of the hash, too. + +We could then implement snapshots using a similar method, using +multiple different hash tables/free tables. + +3.10 Transactions Cannot Operate in Parallel + +This would be useless for ldb, as it hits the index records with +just about every update. It would add significant complexity in +resolving clashes, and cause the all transaction callers to write +their code to loop in the case where the transactions spuriously +failed. + +3.10.1 Proposed Solution + +We could solve a small part of the problem by providing read-only +transactions. 
These would allow one write transaction to begin, +but it could not commit until all r/o transactions are done. This +would require a new RO_TRANSACTION_LOCK, which would be upgraded +on commit. + +3.11 Default Hash Function Is Suboptimal + +The Knuth-inspired multiplicative hash used by tdb is fairly slow +(especially if we expand it to 64 bits), and works best when the +hash bucket size is a prime number (which also means a slow +modulus). In addition, it is highly predictable which could +potentially lead to a Denial of Service attack in some TDB uses. + +3.11.1 Proposed Solution + +The Jenkins lookup3 hash[footnote: +http://burtleburtle.net/bob/c/lookup3.c +] is a fast and superbly-mixing hash. It's used by the Linux +kernel and almost everything else. This has the particular +properties that it takes an initial seed, and produces two 32 bit +hash numbers, which we can combine into a 64-bit hash. + +The seed should be created at tdb-creation time from some random +source, and placed in the header. This is far from foolproof, but +adds a little bit of protection against hash bombing. + +3.12 Reliable Traversal Adds Complexity + +We lock a record during traversal iteration, and try to grab that +lock in the delete code. If that grab on delete fails, we simply +mark it deleted and continue onwards; traversal checks for this +condition and does the delete when it moves off the record. + +If traversal terminates, the dead record may be left +indefinitely. + +3.12.1 Proposed Solution + +Remove reliability guarantees; see [traverse-Proposed-Solution]. + +3.13 Fcntl Locking Adds Overhead + +Placing a fcntl lock means a system call, as does removing one. +This is actually one reason why transactions can be faster +(everything is locked once at transaction start). In the +uncontended case, this overhead can theoretically be eliminated. + +3.13.1 Proposed Solution + +None. + +We tried this before with spinlock support, in the early days of +TDB, and it didn't make much difference except in manufactured +benchmarks. + +We could use spinlocks (with futex kernel support under Linux), +but it means that we lose automatic cleanup when a process dies +with a lock. There is a method of auto-cleanup under Linux, but +it's not supported by other operating systems. We could +reintroduce a clear-if-first-style lock and sweep for dead +futexes on open, but that wouldn't help the normal case of one +concurrent opener dying. Increasingly elaborate repair schemes +could be considered, but they require an ABI change (everyone +must use them) anyway, so there's no need to do this at the same +time as everything else. + diff --git a/ccan/tdb2/doc/design.lyx b/ccan/tdb2/doc/design.lyx new file mode 100644 index 0000000..51378c3 --- /dev/null +++ b/ccan/tdb2/doc/design.lyx @@ -0,0 +1,2282 @@ +#LyX 1.6.5 created this file. 
For more info see http://www.lyx.org/ +\lyxformat 345 +\begin_document +\begin_header +\textclass article +\use_default_options true +\language english +\inputencoding auto +\font_roman default +\font_sans default +\font_typewriter default +\font_default_family default +\font_sc false +\font_osf false +\font_sf_scale 100 +\font_tt_scale 100 + +\graphics default +\paperfontsize default +\use_hyperref false +\papersize default +\use_geometry false +\use_amsmath 1 +\use_esint 1 +\cite_engine basic +\use_bibtopic false +\paperorientation portrait +\secnumdepth 3 +\tocdepth 3 +\paragraph_separation indent +\defskip medskip +\quotes_language english +\papercolumns 1 +\papersides 1 +\paperpagestyle default +\tracking_changes true +\output_changes true +\author "" +\author "" +\end_header + +\begin_body + +\begin_layout Title +TDB2: A Redesigning The Trivial DataBase +\end_layout + +\begin_layout Author +Rusty Russell, IBM Corporation +\end_layout + +\begin_layout Date +26-July-2010 +\end_layout + +\begin_layout Abstract +The Trivial DataBase on-disk format is 32 bits; with usage cases heading + towards the 4G limit, that must change. + This required breakage provides an opportunity to revisit TDB's other design + decisions and reassess them. +\end_layout + +\begin_layout Section +Introduction +\end_layout + +\begin_layout Standard +The Trivial DataBase was originally written by Andrew Tridgell as a simple + key/data pair storage system with the same API as dbm, but allowing multiple + readers and writers while being small enough (< 1000 lines of C) to include + in SAMBA. + The simple design created in 1999 has proven surprisingly robust and performant +, used in Samba versions 3 and 4 as well as numerous other projects. + Its useful life was greatly increased by the (backwards-compatible!) addition + of transaction support in 2005. +\end_layout + +\begin_layout Standard +The wider variety and greater demands of TDB-using code has lead to some + organic growth of the API, as well as some compromises on the implementation. + None of these, by themselves, are seen as show-stoppers, but the cumulative + effect is to a loss of elegance over the initial, simple TDB implementation. 
+ Here is a table of the approximate number of lines of implementation code + and number of API functions at the end of each year: +\end_layout + +\begin_layout Standard +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout +Year End +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +API Functions +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +Lines of C Code Implementation +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +1999 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +13 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1195 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2000 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +24 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1725 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2001 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +32 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2228 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2002 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +35 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2481 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2003 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +35 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2552 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2004 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +40 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2584 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2005 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +38 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2647 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2006 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +52 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +3754 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2007 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +66 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +4398 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2008 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +71 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +4768 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2009 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +73 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +5715 +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +This review is an attempt to catalog and address all the known issues with + TDB and create solutions which address the problems 
without significantly + increasing complexity; all involved are far too aware of the dangers of + second system syndrome in rewriting a successful project like this. +\end_layout + +\begin_layout Section +API Issues +\end_layout + +\begin_layout Subsection +tdb_open_ex Is Not Expandable +\end_layout + +\begin_layout Standard +The tdb_open() call was expanded to tdb_open_ex(), which added an optional + hashing function and an optional logging function argument. + Additional arguments to open would require the introduction of a tdb_open_ex2 + call etc. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +tdb_open() will take a linked-list of attributes: +\end_layout + +\begin_layout LyX-Code +enum tdb_attribute { +\end_layout + +\begin_layout LyX-Code + TDB_ATTRIBUTE_LOG = 0, +\end_layout + +\begin_layout LyX-Code + TDB_ATTRIBUTE_HASH = 1 +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout LyX-Code +struct tdb_attribute_base { +\end_layout + +\begin_layout LyX-Code + enum tdb_attribute attr; +\end_layout + +\begin_layout LyX-Code + union tdb_attribute *next; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout LyX-Code +struct tdb_attribute_log { +\end_layout + +\begin_layout LyX-Code + struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */ +\end_layout + +\begin_layout LyX-Code + tdb_log_func log_fn; +\end_layout + +\begin_layout LyX-Code + void *log_private; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout LyX-Code +struct tdb_attribute_hash { +\end_layout + +\begin_layout LyX-Code + struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */ +\end_layout + +\begin_layout LyX-Code + tdb_hash_func hash_fn; +\end_layout + +\begin_layout LyX-Code + void *hash_private; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout LyX-Code +union tdb_attribute { +\end_layout + +\begin_layout LyX-Code + struct tdb_attribute_base base; +\end_layout + +\begin_layout LyX-Code + struct tdb_attribute_log log; +\end_layout + +\begin_layout LyX-Code + struct tdb_attribute_hash hash; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout Standard +This allows future attributes to be added, even if this expands the size + of the union. +\end_layout + +\begin_layout Subsection +tdb_traverse Makes Impossible Guarantees +\end_layout + +\begin_layout Standard +tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it + was thought that it was important to guarantee that all records which exist + at the start and end of the traversal would be included, and no record + would be included twice. +\end_layout + +\begin_layout Standard +This adds complexity (see +\begin_inset CommandInset ref +LatexCommand ref +reference "Reliable-Traversal-Adds" + +\end_inset + +) and does not work anyway for records which are altered (in particular, + those which are expanded may be effectively deleted and re-added behind + the traversal). +\end_layout + +\begin_layout Subsubsection +\begin_inset CommandInset label +LatexCommand label +name "traverse-Proposed-Solution" + +\end_inset + +Proposed Solution +\end_layout + +\begin_layout Standard +Abandon the guarantee. + You will see every record if no changes occur during your traversal, otherwise + you will see some subset. + You can prevent changes by using a transaction or the locking API. 
+\end_layout + +\begin_layout Subsection +Nesting of Transactions Is Fraught +\end_layout + +\begin_layout Standard +TDB has alternated between allowing nested transactions and not allowing + them. + Various paths in the Samba codebase assume that transactions will nest, + and in a sense they can: the operation is only committed to disk when the + outer transaction is committed. + There are two problems, however: +\end_layout + +\begin_layout Enumerate +Canceling the inner transaction will cause the outer transaction commit + to fail, and will not undo any operations since the inner transaction began. + This problem is soluble with some additional internal code. +\end_layout + +\begin_layout Enumerate +An inner transaction commit can be cancelled by the outer transaction. + This is desirable in the way which Samba's database initialization code + uses transactions, but could be a surprise to any users expecting a successful + transaction commit to expose changes to others. +\end_layout + +\begin_layout Standard +The current solution is to specify the behavior at tdb_open(), with the + default currently that nested transactions are allowed. + This flag can also be changed at runtime. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Given the usage patterns, it seems that the +\begin_inset Quotes eld +\end_inset + +least-surprise +\begin_inset Quotes erd +\end_inset + + behavior of disallowing nested transactions should become the default. + Additionally, it seems the outer transaction is the only code which knows + whether inner transactions should be allowed, so a flag to indicate this + could be added to tdb_transaction_start. + However, this behavior can be simulated with a wrapper which uses tdb_add_flags +() and tdb_remove_flags(), so the API should not be expanded for this relatively +-obscure case. +\end_layout + +\begin_layout Subsection +Incorrect Hash Function is Not Detected +\end_layout + +\begin_layout Standard +tdb_open_ex() allows the calling code to specify a different hash function + to use, but does not check that all other processes accessing this tdb + are using the same hash function. + The result is that records are missing from tdb_fetch(). +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The header should contain an example hash result (eg. + the hash of 0xdeadbeef), and tdb_open_ex() should check that the given + hash function produces the same answer, or fail the tdb_open call. +\end_layout + +\begin_layout Subsection +tdb_set_max_dead/TDB_VOLATILE Expose Implementation +\end_layout + +\begin_layout Standard +In response to scalability issues with the free list ( +\begin_inset CommandInset ref +LatexCommand ref +reference "TDB-Freelist-Is" + +\end_inset + +) two API workarounds have been incorporated in TDB: tdb_set_max_dead() + and the TDB_VOLATILE flag to tdb_open. + The latter actually calls the former with an argument of +\begin_inset Quotes eld +\end_inset + +5 +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Standard +This code allows deleted records to accumulate without putting them in the + free list. + On delete we iterate through each chain and free them in a batch if there + are more than max_dead entries. + These are never otherwise recycled except as a side-effect of a tdb_repack. 
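\end_layout

\begin_layout Standard
For reference, the two spellings of this workaround are equivalent in the
 current code; a sketch of an opener which relies on that:
\end_layout

\begin_layout LyX-Code
#include <fcntl.h>
\end_layout

\begin_layout LyX-Code
#include <tdb.h>
\end_layout

\begin_layout LyX-Code
/* Opening with TDB_VOLATILE is today the same as calling
\end_layout

\begin_layout LyX-Code
 * tdb_set_max_dead(tdb, 5) on a normally-opened tdb. */
\end_layout

\begin_layout LyX-Code
static struct tdb_context *open_cache(const char *path)
\end_layout

\begin_layout LyX-Code
{
\end_layout

\begin_layout LyX-Code
        struct tdb_context *tdb;
\end_layout

\begin_layout LyX-Code
        tdb = tdb_open(path, 0, TDB_DEFAULT, O_RDWR|O_CREAT, 0600);
\end_layout

\begin_layout LyX-Code
        if (tdb)
\end_layout

\begin_layout LyX-Code
                tdb_set_max_dead(tdb, 5);
\end_layout

\begin_layout LyX-Code
        return tdb;
\end_layout

\begin_layout LyX-Code
}
\end_layout

\begin_layout Standard
Code like this is what the proposal below must keep working, even if the
 calls become hints or no-ops.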
\end_layout

\begin_layout Subsubsection
Proposed Solution
\end_layout

\begin_layout Standard
With the scalability problems of the freelist solved, this API can be removed.
 The TDB_VOLATILE flag may still be useful as a hint that store and delete
 of records will be at least as common as fetch, allowing some internal
 tuning; initially it will become a no-op.
\end_layout

\begin_layout Subsection
\begin_inset CommandInset label
LatexCommand label
name "TDB-Files-Cannot"

\end_inset

TDB Files Cannot Be Opened Multiple Times In The Same Process
\end_layout

\begin_layout Standard
No process can open the same TDB twice; we check and disallow it.
 This is an unfortunate side-effect of fcntl locks, which operate on a per-file
 rather than per-file-descriptor basis, and do not nest.
 Thus, closing any file descriptor on a file clears all the locks obtained
 by this process, even if they were placed using a different file descriptor!
\end_layout

\begin_layout Standard
Note that even if this were solved, deadlock could occur if operations were
 nested: this is a more manageable programming error in most cases.
\end_layout

\begin_layout Subsubsection
Proposed Solution
\end_layout

\begin_layout Standard
We could lobby POSIX to fix the perverse rules, or at least lobby Linux
 to violate them so that the most common implementation does not have this
 restriction.
 This would be a generally good idea for other fcntl lock users.
\end_layout

\begin_layout Standard
Samba uses a wrapper which hands out the same tdb_context to multiple callers
 if this happens, and does simple reference counting.
 We should do this inside the tdb library, which already emulates lock nesting
 internally; it would need to recognize when deadlock occurs within a single
 process.
 This would create a new failure mode for tdb operations (while we currently
 handle locking failures, they are impossible in normal use and a process
 encountering them can do little but give up).
\end_layout

\begin_layout Standard
I do not see benefit in an additional tdb_open flag to indicate whether
 re-opening is allowed, although there may be some benefit to adding a
 call to detect when a tdb_context is shared, to allow others to create
 such an API.
\end_layout

\begin_layout Subsection
TDB API Is Not POSIX Thread-safe
\end_layout

\begin_layout Standard
The TDB API uses an error code which can be queried after an operation to
 determine what went wrong.
 This programming model does not work with threads, unless specific additional
 guarantees are given by the implementation.
 In addition, even otherwise-independent threads cannot open the same TDB
 (as in
\begin_inset CommandInset ref
LatexCommand ref
reference "TDB-Files-Cannot"

\end_inset

).
\end_layout

\begin_layout Subsubsection
Proposed Solution
\end_layout

\begin_layout Standard
Rearchitecting the API to include a tdb_errcode pointer would be a great
 deal of churn; we would do better to guarantee that the tdb_errcode is
 per-thread so the current programming model can be maintained.
\end_layout

\begin_layout Standard
This requires dynamic per-thread allocations, which is awkward with POSIX
 threads (pthread_key_create space is limited and we cannot simply allocate
 a key for every TDB).
\end_layout

\begin_layout Standard
Internal locking is required to make sure that fcntl locks do not overlap
 between threads, and also that the global list of tdbs is maintained.
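\end_layout

\begin_layout Standard
To illustrate the shape of a solution (a sketch only, not a commitment to
 this implementation): a single library-wide pthread key can serve every
 TDB, with the per-thread slot allocated lazily:
\end_layout

\begin_layout LyX-Code
#include <pthread.h>
\end_layout

\begin_layout LyX-Code
#include <stdlib.h>
\end_layout

\begin_layout LyX-Code
/* One key for the whole library: pthread keys are scarce, so
\end_layout

\begin_layout LyX-Code
 * we must not allocate one per tdb_context. */
\end_layout

\begin_layout LyX-Code
static pthread_key_t err_key;
\end_layout

\begin_layout LyX-Code
static pthread_once_t err_once = PTHREAD_ONCE_INIT;
\end_layout

\begin_layout LyX-Code
static void err_key_init(void)
\end_layout

\begin_layout LyX-Code
{
\end_layout

\begin_layout LyX-Code
        pthread_key_create(&err_key, free);
\end_layout

\begin_layout LyX-Code
}
\end_layout

\begin_layout LyX-Code
/* Return this thread's error slot, allocating on first use. */
\end_layout

\begin_layout LyX-Code
static int *tdb_errcode_location(void)
\end_layout

\begin_layout LyX-Code
{
\end_layout

\begin_layout LyX-Code
        int *err;
\end_layout

\begin_layout LyX-Code
        pthread_once(&err_once, err_key_init);
\end_layout

\begin_layout LyX-Code
        err = pthread_getspecific(err_key);
\end_layout

\begin_layout LyX-Code
        if (!err) {
\end_layout

\begin_layout LyX-Code
                err = calloc(1, sizeof(*err));
\end_layout

\begin_layout LyX-Code
                if (err)
\end_layout

\begin_layout LyX-Code
                        pthread_setspecific(err_key, err);
\end_layout

\begin_layout LyX-Code
        }
\end_layout

\begin_layout LyX-Code
        return err;
\end_layout

\begin_layout LyX-Code
}
\end_layout

\begin_layout Standard
tdb_error() could then read through this pointer rather than a field in
 the context structure.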
+\end_layout + +\begin_layout Standard +The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe + version of the library, and otherwise no overhead will exist. +\end_layout + +\begin_layout Subsection +*_nonblock Functions And *_mark Functions Expose Implementation +\end_layout + +\begin_layout Standard +CTDB +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +Clustered TDB, see http://ctdb.samba.org +\end_layout + +\end_inset + + wishes to operate on TDB in a non-blocking manner. + This is currently done as follows: +\end_layout + +\begin_layout Enumerate +Call the _nonblock variant of an API function (eg. + tdb_lockall_nonblock). + If this fails: +\end_layout + +\begin_layout Enumerate +Fork a child process, and wait for it to call the normal variant (eg. + tdb_lockall). +\end_layout + +\begin_layout Enumerate +If the child succeeds, call the _mark variant to indicate we already have + the locks (eg. + tdb_lockall_mark). +\end_layout + +\begin_layout Enumerate +Upon completion, tell the child to release the locks (eg. + tdb_unlockall). +\end_layout + +\begin_layout Enumerate +Indicate to tdb that it should consider the locks removed (eg. + tdb_unlockall_mark). +\end_layout + +\begin_layout Standard +There are several issues with this approach. + Firstly, adding two new variants of each function clutters the API for + an obscure use, and so not all functions have three variants. + Secondly, it assumes that all paths of the functions ask for the same locks, + otherwise the parent process will have to get a lock which the child doesn't + have under some circumstances. + I don't believe this is currently the case, but it constrains the implementatio +n. + +\end_layout + +\begin_layout Subsubsection +\begin_inset CommandInset label +LatexCommand label +name "Proposed-Solution-locking-hook" + +\end_inset + +Proposed Solution +\end_layout + +\begin_layout Standard +Implement a hook for locking methods, so that the caller can control the + calls to create and remove fcntl locks. + In this scenario, ctdbd would operate as follows: +\end_layout + +\begin_layout Enumerate +Call the normal API function, eg tdb_lockall(). +\end_layout + +\begin_layout Enumerate +When the lock callback comes in, check if the child has the lock. + Initially, this is always false. + If so, return 0. + Otherwise, try to obtain it in non-blocking mode. + If that fails, return EWOULDBLOCK. +\end_layout + +\begin_layout Enumerate +Release locks in the unlock callback as normal. +\end_layout + +\begin_layout Enumerate +If tdb_lockall() fails, see if we recorded a lock failure; if so, call the + child to repeat the operation. +\end_layout + +\begin_layout Enumerate +The child records what locks it obtains, and returns that information to + the parent. +\end_layout + +\begin_layout Enumerate +When the child has succeeded, goto 1. +\end_layout + +\begin_layout Standard +This is flexible enough to handle any potential locking scenario, even when + lock requirements change. + It can be optimized so that the parent does not release locks, just tells + the child which locks it doesn't need to obtain. +\end_layout + +\begin_layout Standard +It also keeps the complexity out of the API, and in ctdbd where it is needed. +\end_layout + +\begin_layout Subsection +tdb_chainlock Functions Expose Implementation +\end_layout + +\begin_layout Standard +tdb_chainlock locks some number of records, including the record indicated + by the given key. 
 This gives atomicity guarantees; no-one can start a transaction, alter,
 read or delete that key while the lock is held.
\end_layout

\begin_layout Standard
It also makes the same guarantee for any other key in the chain, which is
 an internal implementation detail and potentially a cause for deadlock.
\end_layout

\begin_layout Subsubsection
Proposed Solution
\end_layout

\begin_layout Standard
None.
 It would be nice to have an explicit single entry lock which affected no
 other keys.
 Unfortunately, this won't work for an entry which doesn't exist.
 Thus while chainlock may be implemented more efficiently for the existing
 case, it will still have overlap issues with the non-existing case.
 So it is best to keep the current (lack of) guarantee about which records
 will be affected to avoid constraining our implementation.
\end_layout

\begin_layout Subsection
Signal Handling is Not Race-Free
\end_layout

\begin_layout Standard
The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
 that the tdb locking code should return with a failure, rather than trying
 again when a signal is received (and errno == EAGAIN).
 This is usually used to implement timeouts.
\end_layout

\begin_layout Standard
Unfortunately, this does not work in the case where the signal is received
 before the tdb code enters the fcntl() call to place the lock: the code
 will sleep within the fcntl() code, unaware that the signal wants it to
 exit.
 In the case of long timeouts, this does not happen in practice.
\end_layout

\begin_layout Subsubsection
Proposed Solution
\end_layout

\begin_layout Standard
The locking hooks proposed in
\begin_inset CommandInset ref
LatexCommand ref
reference "Proposed-Solution-locking-hook"

\end_inset

 would allow the user to decide on whether to fail the lock acquisition
 on a signal.
 This allows the caller to choose their own compromise: they could narrow
 the race by checking immediately before the fcntl call.
\begin_inset Foot
status collapsed

\begin_layout Plain Layout
It may be possible to make this race-free in some implementations by having
 the signal handler alter the struct flock to make it invalid.
 This will cause the fcntl() lock call to fail with EINVAL if the signal
 occurs before the kernel is entered, otherwise EAGAIN.
\end_layout

\end_inset


\end_layout

\begin_layout Subsection
The API Uses Gratuitous Typedefs, Capitals
\end_layout

\begin_layout Standard
Typedefs are useful for providing source compatibility when types can differ
 across implementations, or arguably in the case of function pointer
 definitions which are hard for humans to parse.
 Otherwise it is simply obfuscation and pollutes the namespace.
\end_layout

\begin_layout Standard
Capitalization is usually reserved for compile-time constants and macros.
\end_layout

\begin_layout Description
TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
 definition isn't visible to the API user anyway.
\end_layout

\begin_layout Description
TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
 needs to be understood by the API user.
\end_layout

\begin_layout Description
struct
\begin_inset space ~
\end_inset

TDB_DATA This would normally be called 'struct tdb_data'.
\end_layout

\begin_layout Description
enum
\begin_inset space ~
\end_inset

TDB_ERROR Similarly, this would normally be enum tdb_error.
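\end_layout

\begin_layout Standard
Concretely, the current public header contains definitions approximately
 like these; the lower-case struct shows the conventional spelling:
\end_layout

\begin_layout LyX-Code
/* As in the current tdb.h (abbreviated): */
\end_layout

\begin_layout LyX-Code
typedef struct TDB_DATA {
\end_layout

\begin_layout LyX-Code
        unsigned char *dptr;
\end_layout

\begin_layout LyX-Code
        size_t dsize;
\end_layout

\begin_layout LyX-Code
} TDB_DATA;
\end_layout

\begin_layout LyX-Code
typedef struct tdb_context TDB_CONTEXT;
\end_layout

\begin_layout LyX-Code
/* The conventional spelling would simply be: */
\end_layout

\begin_layout LyX-Code
struct tdb_data {
\end_layout

\begin_layout LyX-Code
        unsigned char *dptr;
\end_layout

\begin_layout LyX-Code
        size_t dsize;
\end_layout

\begin_layout LyX-Code
};
\end_layout

\begin_layout Standard
Nothing about the members would change; only the names.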
+\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +None. + Introducing lower case variants would please pedants like myself, but if + it were done the existing ones should be kept. + There is little point forcing a purely cosmetic change upon tdb users. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "tdb_log_func-Doesnt-Take" + +\end_inset + +tdb_log_func Doesn't Take The Private Pointer +\end_layout + +\begin_layout Standard +For API compatibility reasons, the logging function needs to call tdb_get_loggin +g_private() to retrieve the pointer registered by the tdb_open_ex for logging. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +It should simply take an extra argument, since we are prepared to break + the API/ABI. +\end_layout + +\begin_layout Subsection +Various Callback Functions Are Not Typesafe +\end_layout + +\begin_layout Standard +The callback functions in tdb_set_logging_function (after +\begin_inset CommandInset ref +LatexCommand ref +reference "tdb_log_func-Doesnt-Take" + +\end_inset + + is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check + all take void * and must internally convert it to the argument type they + were expecting. +\end_layout + +\begin_layout Standard +If this type changes, the compiler will not produce warnings on the callers, + since it only sees void *. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +With careful use of macros, we can create callback functions which give + a warning when used on gcc and the types of the callback and its private + argument differ. + Unsupported compilers will not give a warning, which is no worse than now. + In addition, the callbacks become clearer, as they need not use void * + for their parameter. +\end_layout + +\begin_layout Standard +See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html +\end_layout + +\begin_layout Subsection +TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic +\end_layout + +\begin_layout Standard +The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should + be cleared if the caller discovers it is the only process with the TDB + open. + However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not + be detected, so will have the TDB erased underneath them (usually resulting + in a crash). +\end_layout + +\begin_layout Standard +There is a similar issue on fork(); if the parent exits (or otherwise closes + the tdb) before the child calls tdb_reopen_all() to establish the lock + used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener + at that moment will believe it alone has opened the TDB and will erase + it. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Remove TDB_CLEAR_IF_FIRST. + Other workarounds are possible, but see +\begin_inset CommandInset ref +LatexCommand ref +reference "TDB_CLEAR_IF_FIRST-Imposes-Performance" + +\end_inset + +. 
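\end_layout

\begin_layout Standard
The usual replacement fits in the caller.
 A sketch, assuming the application has its own way of knowing it is the
 first instance:
\end_layout

\begin_layout LyX-Code
#include <stdbool.h>
\end_layout

\begin_layout LyX-Code
#include <fcntl.h>
\end_layout

\begin_layout LyX-Code
#include <unistd.h>
\end_layout

\begin_layout LyX-Code
#include <tdb.h>
\end_layout

\begin_layout LyX-Code
static struct tdb_context *open_db(const char *path, bool first)
\end_layout

\begin_layout LyX-Code
{
\end_layout

\begin_layout LyX-Code
        /* A server which knows it is initializing discards the
\end_layout

\begin_layout LyX-Code
         * old file rather than using TDB_CLEAR_IF_FIRST. */
\end_layout

\begin_layout LyX-Code
        if (first)
\end_layout

\begin_layout LyX-Code
                unlink(path);
\end_layout

\begin_layout LyX-Code
        return tdb_open(path, 0, TDB_DEFAULT, O_RDWR|O_CREAT, 0600);
\end_layout

\begin_layout LyX-Code
}
\end_layout

\begin_layout Standard
This is the same observation made in the referenced section: servers tend
 to know when they are initializing for the first time.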
+\end_layout + +\begin_layout Section +Performance And Scalability Issues +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "TDB_CLEAR_IF_FIRST-Imposes-Performance" + +\end_inset + +TDB_CLEAR_IF_FIRST Imposes Performance Penalty +\end_layout + +\begin_layout Standard +When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset + 4 (aka. + the ACTIVE_LOCK). + While these locks never conflict in normal tdb usage, they do add substantial + overhead for most fcntl lock implementations when the kernel scans to detect + if a lock conflict exists. + This is often a single linked list, making the time to acquire and release + a fcntl lock O(N) where N is the number of processes with the TDB open, + not the number actually doing work. +\end_layout + +\begin_layout Standard +In a Samba server it is common to have huge numbers of clients sitting idle, + and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag. +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +There is a flag to tdb_reopen_all() which is used for this optimization: + if the parent process will outlive the child, the child does not need the + ACTIVE_LOCK. + This is a workaround for this very performance issue. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Remove the flag. + It was a neat idea, but even trivial servers tend to know when they are + initializing for the first time and can simply unlink the old tdb at that + point. +\end_layout + +\begin_layout Subsection +TDB Files Have a 4G Limit +\end_layout + +\begin_layout Standard +This seems to be becoming an issue (so much for +\begin_inset Quotes eld +\end_inset + +trivial +\begin_inset Quotes erd +\end_inset + +!), particularly for ldb. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +A new, incompatible TDB format which uses 64 bit offsets internally rather + than 32 bit as now. + For simplicity of endian conversion (which TDB does on the fly if required), + all values will be 64 bit on disk. + In practice, some upper bits may be used for other purposes, but at least + 56 bits will be available for file offsets. +\end_layout + +\begin_layout Standard +tdb_open() will automatically detect the old version, and even create them + if TDB_VERSION6 is specified to tdb_open. +\end_layout + +\begin_layout Standard +32 bit processes will still be able to access TDBs larger than 4G (assuming + that their off_t allows them to seek to 64 bits), they will gracefully + fall back as they fail to mmap. + This can happen already with large TDBs. +\end_layout + +\begin_layout Standard +Old versions of tdb will fail to open the new TDB files (since 28 August + 2009, commit 398d0c29290: prior to that any unrecognized file format would + be erased and initialized as a fresh tdb!) +\end_layout + +\begin_layout Subsection +TDB Records Have a 4G Limit +\end_layout + +\begin_layout Standard +This has not been a reported problem, and the API uses size_t which can + be 64 bit on 64 bit platforms. + However, other limits may have made such an issue moot. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Record sizes will be 64 bit, with an error returned on 32 bit platforms + which try to access such records (the current implementation would return + TDB_ERR_OOM in a similar case). 
+ It seems unlikely that 32 bit keys will be a limitation, so the implementation + may not support this (see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Records-Incur-A" + +\end_inset + +). +\end_layout + +\begin_layout Subsection +Hash Size Is Determined At TDB Creation Time +\end_layout + +\begin_layout Standard +TDB contains a number of hash chains in the header; the number is specified + at creation time, and defaults to 131. + This is such a bottleneck on large databases (as each hash chain gets quite + long), that LDB uses 10,000 for this hash. + In general it is impossible to know what the 'right' answer is at database + creation time. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +After comprehensive performance testing on various scalable hash variants +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying + because I was previously convinced that an expanding tree of hashes would + be very close to optimal. +\end_layout + +\end_inset + +, it became clear that it is hard to beat a straight linear hash table which + doubles in size when it reaches saturation. + There are three details which become important: +\end_layout + +\begin_layout Enumerate +On encountering a full bucket, we use the next bucket. +\end_layout + +\begin_layout Enumerate +Extra hash bits are stored with the offset, to reduce comparisons. +\end_layout + +\begin_layout Enumerate +A marker entry is used on deleting an entry. +\end_layout + +\begin_layout Standard +The doubling of the table must be done under a transaction; we will not + reduce it on deletion, so it will be an unusual case. + It will either be placed at the head (other entries will be moved out the + way so we can expand). + We could have a pointer in the header to the current hashtable location, + but that pointer would have to be read frequently to check for hashtable + moves. +\end_layout + +\begin_layout Standard +The locking for this is slightly more complex than the chained case; we + currently have one lock per bucket, and that means we would need to expand + the lock if we overflow to the next bucket. + The frequency of such collisions will effect our locking heuristics: we + can always lock more buckets than we need. +\end_layout + +\begin_layout Standard +One possible optimization is to only re-check the hash size on an insert + or a lookup miss. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "TDB-Freelist-Is" + +\end_inset + +TDB Freelist Is Highly Contended +\end_layout + +\begin_layout Standard +TDB uses a single linked list for the free list. + Allocation occurs as follows, using heuristics which have evolved over + time: +\end_layout + +\begin_layout Enumerate +Get the free list lock for this whole operation. +\end_layout + +\begin_layout Enumerate +Multiply length by 1.25, so we always over-allocate by 25%. +\end_layout + +\begin_layout Enumerate +Set the slack multiplier to 1. +\end_layout + +\begin_layout Enumerate +Examine the current freelist entry: if it is > length but < the current + best case, remember it as the best case. +\end_layout + +\begin_layout Enumerate +Multiply the slack multiplier by 1.05. +\end_layout + +\begin_layout Enumerate +If our best fit so far is less than length * slack multiplier, return it. + The slack will be turned into a new free record if it's large enough. 
+\end_layout + +\begin_layout Enumerate +Otherwise, go onto the next freelist entry. +\end_layout + +\begin_layout Standard +Deleting a record occurs as follows: +\end_layout + +\begin_layout Enumerate +Lock the hash chain for this whole operation. +\end_layout + +\begin_layout Enumerate +Walk the chain to find the record, keeping the prev pointer offset. +\end_layout + +\begin_layout Enumerate +If max_dead is non-zero: +\end_layout + +\begin_deeper +\begin_layout Enumerate +Walk the hash chain again and count the dead records. +\end_layout + +\begin_layout Enumerate +If it's more than max_dead, bulk free all the dead ones (similar to steps + 4 and below, but the lock is only obtained once). +\end_layout + +\begin_layout Enumerate +Simply mark this record as dead and return. + +\end_layout + +\end_deeper +\begin_layout Enumerate +Get the free list lock for the remainder of this operation. +\end_layout + +\begin_layout Enumerate +\begin_inset CommandInset label +LatexCommand label +name "right-merging" + +\end_inset + +Examine the following block to see if it is free; if so, enlarge the current + block and remove that block from the free list. + This was disabled, as removal from the free list was O(entries-in-free-list). +\end_layout + +\begin_layout Enumerate +Examine the preceeding block to see if it is free: for this reason, each + block has a 32-bit tailer which indicates its length. + If it is free, expand it to cover our new block and return. +\end_layout + +\begin_layout Enumerate +Otherwise, prepend ourselves to the free list. +\end_layout + +\begin_layout Standard +Disabling right-merging (step +\begin_inset CommandInset ref +LatexCommand ref +reference "right-merging" + +\end_inset + +) causes fragmentation; the other heuristics proved insufficient to address + this, so the final answer to this was that when we expand the TDB file + inside a transaction commit, we repack the entire tdb. +\end_layout + +\begin_layout Standard +The single list lock limits our allocation rate; due to the other issues + this is not currently seen as a bottleneck. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The first step is to remove all the current heuristics, as they obviously + interact, then examine them once the lock contention is addressed. +\end_layout + +\begin_layout Standard +The free list must be split to reduce contention. + Assuming perfect free merging, we can at most have 1 free list entry for + each entry. + This implies that the number of free lists is related to the size of the + hash table, but as it is rare to walk a large number of free list entries + we can use far fewer, say 1/32 of the number of hash buckets. +\end_layout + +\begin_layout Standard +There are various benefits in using per-size free lists (see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:TDB-Becomes-Fragmented" + +\end_inset + +) but it's not clear this would reduce contention in the common case where + all processes are allocating/freeing the same size. + Thus we almost certainly need to divide in other ways: the most obvious + is to divide the file into zones, and using a free list (or set of free + lists) for each. + This approximates address ordering. +\end_layout + +\begin_layout Standard +Note that this means we need to split the free lists when we expand the + file; this is probably acceptable when we double the hash table size, since + that is such an expensive operation already. 
 In the case of increasing the file size, there is an optimization we can
 use: if we use M in the formula above as the file size rounded up to the
 next power of 2, we only need to reshuffle free lists when the file size
 crosses a power of 2 boundary,
\emph on
and
\emph default
 reshuffling the free lists is trivial: we simply merge every consecutive
 pair of free lists.
\end_layout

\begin_layout Standard
The basic algorithm is as follows.
 Freeing is simple:
\end_layout

\begin_layout Enumerate
Identify the correct zone.
\end_layout

\begin_layout Enumerate
Lock the corresponding list.
\end_layout

\begin_layout Enumerate
Re-check the zone (we didn't have a lock, sizes could have changed): relock
 if necessary.
\end_layout

\begin_layout Enumerate
Place the freed entry in the list for that zone.
\end_layout

\begin_layout Standard
Allocation is a little more complicated, as we perform delayed coalescing
 at this point:
\end_layout

\begin_layout Enumerate
Pick a zone: either the zone we last freed into, or based on a
\begin_inset Quotes eld
\end_inset

random
\begin_inset Quotes erd
\end_inset

 number.
\end_layout

\begin_layout Enumerate
Lock the corresponding list.
\end_layout

\begin_layout Enumerate
Re-check the zone: relock if necessary.
\end_layout

\begin_layout Enumerate
If the top entry is large enough, remove it from the list and return it.
\end_layout

\begin_layout Enumerate
Otherwise, coalesce entries in the list.
 If there was no entry large enough, unlock the list and try the next zone.
\end_layout

\begin_layout Enumerate
If no zone satisfies, expand the file.
\end_layout

\begin_layout Standard
This optimizes rapid insert/delete of free list entries by not coalescing
 them all the time.
 First-fit address ordering seems to be fairly good for keeping fragmentation
 low (see
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:TDB-Becomes-Fragmented"

\end_inset

).
 Note that address ordering does not need a tailer to coalesce, though if
 we needed one we could have one cheaply: see
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:Records-Incur-A"

\end_inset

.
\end_layout

\begin_layout Standard
I anticipate that the number of entries in each free zone would be small,
 but it might be worth using one free entry to hold pointers to the others
 for cache efficiency.
\end_layout

\begin_layout Subsection
\begin_inset CommandInset label
LatexCommand label
name "sub:TDB-Becomes-Fragmented"

\end_inset

TDB Becomes Fragmented
\end_layout

\begin_layout Standard
Much of this is a result of allocation strategy
\begin_inset Foot
status collapsed

\begin_layout Plain Layout
The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
 ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
\end_layout

\end_inset

 and deliberate hobbling of coalescing; internal fragmentation (aka
 overallocation) is deliberately set at 25%, and external fragmentation
 is only cured by the decision to repack the entire db when a transaction
 commit needs to enlarge the file.
\end_layout

\begin_layout Subsubsection
Proposed Solution
\end_layout

\begin_layout Standard
The 25% overhead on allocation works in practice for ldb because indexes
 tend to expand by one record at a time.
+ This internal fragmentation can be resolved by having an +\begin_inset Quotes eld +\end_inset + +expanded +\begin_inset Quotes erd +\end_inset + + bit in the header to note entries that have previously expanded, and allocating + more space for them. +\end_layout + +\begin_layout Standard +There are is a spectrum of possible solutions for external fragmentation: + one is to use a fragmentation-avoiding allocation strategy such as best-fit + address-order allocator. + The other end of the spectrum would be to use a bump allocator (very fast + and simple) and simply repack the file when we reach the end. +\end_layout + +\begin_layout Standard +There are three problems with efficient fragmentation-avoiding allocators: + they are non-trivial, they tend to use a single free list for each size, + and there's no evidence that tdb allocation patterns will match those recorded + for general allocators (though it seems likely). +\end_layout + +\begin_layout Standard +Thus we don't spend too much effort on external fragmentation; we will be + no worse than the current code if we need to repack on occasion. + More effort is spent on reducing freelist contention, and reducing overhead. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "sub:Records-Incur-A" + +\end_inset + +Records Incur A 28-Byte Overhead +\end_layout + +\begin_layout Standard +Each TDB record has a header as follows: +\end_layout + +\begin_layout LyX-Code +struct tdb_record { +\end_layout + +\begin_layout LyX-Code + tdb_off_t next; /* offset of the next record in the list */ +\end_layout + +\begin_layout LyX-Code + tdb_len_t rec_len; /* total byte length of record */ +\end_layout + +\begin_layout LyX-Code + tdb_len_t key_len; /* byte length of key */ +\end_layout + +\begin_layout LyX-Code + tdb_len_t data_len; /* byte length of data */ +\end_layout + +\begin_layout LyX-Code + uint32_t full_hash; /* the full 32 bit hash of the key */ +\end_layout + +\begin_layout LyX-Code + uint32_t magic; /* try to catch errors */ +\end_layout + +\begin_layout LyX-Code + /* the following union is implied: +\end_layout + +\begin_layout LyX-Code + union { +\end_layout + +\begin_layout LyX-Code + char record[rec_len]; +\end_layout + +\begin_layout LyX-Code + struct { +\end_layout + +\begin_layout LyX-Code + char key[key_len]; +\end_layout + +\begin_layout LyX-Code + char data[data_len]; +\end_layout + +\begin_layout LyX-Code + } +\end_layout + +\begin_layout LyX-Code + uint32_t totalsize; (tailer) +\end_layout + +\begin_layout LyX-Code + } +\end_layout + +\begin_layout LyX-Code + */ +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout Standard +Naively, this would double to a 56-byte overhead on a 64 bit implementation. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +We can use various techniques to reduce this for an allocated block: +\end_layout + +\begin_layout Enumerate +The 'next' pointer is not required, as we are using a flat hash table. +\end_layout + +\begin_layout Enumerate +'rec_len' can instead be expressed as an addition to key_len and data_len + (it accounts for wasted or overallocated length in the record). + Since the record length is always a multiple of 8, we can conveniently + fit it in 32 bits (representing up to 35 bits). +\end_layout + +\begin_layout Enumerate +'key_len' and 'data_len' can be reduced. 
+ I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine + the two into one 64-bit field and using a 5 bit value which indicates at + what bit to divide the two. + Keys are unlikely to scale as fast as data, so I'm assuming a maximum key + size of 32 bits. +\end_layout + +\begin_layout Enumerate +'full_hash' is used to avoid a memcmp on the +\begin_inset Quotes eld +\end_inset + +miss +\begin_inset Quotes erd +\end_inset + + case, but this is diminishing returns after a handful of bits (at 10 bits, + it reduces 99.9% of false memcmp). + As an aside, as the lower bits are already incorporated in the hash table + resolution, the upper bits should be used here. +\end_layout + +\begin_layout Enumerate +'magic' does not need to be enlarged: it currently reflects one of 5 values + (used, free, dead, recovery, and unused_recovery). + It is useful for quick sanity checking however, and should not be eliminated. +\end_layout + +\begin_layout Enumerate +'tailer' is only used to coalesce free blocks (so a block to the right can + find the header to check if this block is free). + This can be replaced by a single 'free' bit in the header of the following + block (and the tailer only exists in free blocks). +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +This technique from Thomas Standish. + Data Structure Techniques. + Addison-Wesley, Reading, Massachusetts, 1980. +\end_layout + +\end_inset + + The current proposed coalescing algorithm doesn't need this, however. +\end_layout + +\begin_layout Standard +This produces a 16 byte used header like this: +\end_layout + +\begin_layout LyX-Code +struct tdb_used_record { +\end_layout + +\begin_layout LyX-Code + uint32_t magic : 16, +\end_layout + +\begin_layout LyX-Code + prev_is_free: 1, +\end_layout + +\begin_layout LyX-Code + key_data_divide: 5, +\end_layout + +\begin_layout LyX-Code + top_hash: 10; +\end_layout + +\begin_layout LyX-Code + uint32_t extra_octets; +\end_layout + +\begin_layout LyX-Code + uint64_t key_and_data_len; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout Standard +And a free record like this: +\end_layout + +\begin_layout LyX-Code +struct tdb_free_record { +\end_layout + +\begin_layout LyX-Code + uint32_t free_magic; +\end_layout + +\begin_layout LyX-Code + uint64_t total_length; +\end_layout + +\begin_layout LyX-Code + ... +\end_layout + +\begin_layout LyX-Code + uint64_t tailer; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout LyX-Code + +\end_layout + +\begin_layout Subsection +Transaction Commit Requires 4 fdatasync +\end_layout + +\begin_layout Standard +The current transaction algorithm is: +\end_layout + +\begin_layout Enumerate +write_recovery_data(); +\end_layout + +\begin_layout Enumerate +sync(); +\end_layout + +\begin_layout Enumerate +write_recovery_header(); +\end_layout + +\begin_layout Enumerate +sync(); +\end_layout + +\begin_layout Enumerate +overwrite_with_new_data(); +\end_layout + +\begin_layout Enumerate +sync(); +\end_layout + +\begin_layout Enumerate +remove_recovery_header(); +\end_layout + +\begin_layout Enumerate +sync(); +\end_layout + +\begin_layout Standard +On current ext3, each sync flushes all data to disk, so the next 3 syncs + are relatively expensive. + But this could become a performance bottleneck on other filesystems such + as ext4. 
+\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Neil Brown points out that this is overzealous, and only one sync is needed: +\end_layout + +\begin_layout Enumerate +Bundle the recovery data, a transaction counter and a strong checksum of + the new data. +\end_layout + +\begin_layout Enumerate +Strong checksum that whole bundle. +\end_layout + +\begin_layout Enumerate +Store the bundle in the database. +\end_layout + +\begin_layout Enumerate +Overwrite the oldest of the two recovery pointers in the header (identified + using the transaction counter) with the offset of this bundle. +\end_layout + +\begin_layout Enumerate +sync. +\end_layout + +\begin_layout Enumerate +Write the new data to the file. +\end_layout + +\begin_layout Standard +Checking for recovery means identifying the latest bundle with a valid checksum + and using the new data checksum to ensure that it has been applied. + This is more expensive than the current check, but need only be done at + open. + For running databases, a separate header field can be used to indicate + a transaction in progress; we need only check for recovery if this is set. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "sub:TDB-Does-Not" + +\end_inset + +TDB Does Not Have Snapshot Support +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +None. + At some point you say +\begin_inset Quotes eld +\end_inset + +use a real database +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Standard +But as a thought experiment, if we implemented transactions to only overwrite + free entries (this is tricky: there must not be a header in each entry + which indicates whether it is free, but use of presence in metadata elsewhere), + and a pointer to the hash table, we could create an entirely new commit + without destroying existing data. + Then it would be easy to implement snapshots in a similar way. +\end_layout + +\begin_layout Standard +This would not allow arbitrary changes to the database, such as tdb_repack + does, and would require more space (since we have to preserve the current + and future entries at once). + If we used hash trees rather than one big hash table, we might only have + to rewrite some sections of the hash, too. +\end_layout + +\begin_layout Standard +We could then implement snapshots using a similar method, using multiple + different hash tables/free tables. +\end_layout + +\begin_layout Subsection +Transactions Cannot Operate in Parallel +\end_layout + +\begin_layout Standard +This would be useless for ldb, as it hits the index records with just about + every update. + It would add significant complexity in resolving clashes, and cause the + all transaction callers to write their code to loop in the case where the + transactions spuriously failed. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +We could solve a small part of the problem by providing read-only transactions. + These would allow one write transaction to begin, but it could not commit + until all r/o transactions are done. + This would require a new RO_TRANSACTION_LOCK, which would be upgraded on + commit. 
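\end_layout

\begin_layout Standard
A hypothetical interface for this (nothing like it exists in tdb today;
 purely a sketch of the idea):
\end_layout

\begin_layout LyX-Code
/* Hypothetical: take RO_TRANSACTION_LOCK shared, giving this
\end_layout

\begin_layout LyX-Code
 * process a stable view.  A writer may prepare a transaction
\end_layout

\begin_layout LyX-Code
 * concurrently, but its commit blocks until readers finish. */
\end_layout

\begin_layout LyX-Code
int tdb_ro_transaction_start(struct tdb_context *tdb);
\end_layout

\begin_layout LyX-Code
int tdb_ro_transaction_end(struct tdb_context *tdb);
\end_layout

\begin_layout Standard
The commit path would upgrade the lock to exclusive, so slow readers can
 stall a writer but never see a half-applied commit.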
+\end_layout + +\begin_layout Subsection +Default Hash Function Is Suboptimal +\end_layout + +\begin_layout Standard +The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially + if we expand it to 64 bits), and works best when the hash bucket size is + a prime number (which also means a slow modulus). + In addition, it is highly predictable which could potentially lead to a + Denial of Service attack in some TDB uses. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The Jenkins lookup3 hash +\begin_inset Foot +status open + +\begin_layout Plain Layout +http://burtleburtle.net/bob/c/lookup3.c +\end_layout + +\end_inset + + is a fast and superbly-mixing hash. + It's used by the Linux kernel and almost everything else. + This has the particular properties that it takes an initial seed, and produces + two 32 bit hash numbers, which we can combine into a 64-bit hash. +\end_layout + +\begin_layout Standard +The seed should be created at tdb-creation time from some random source, + and placed in the header. + This is far from foolproof, but adds a little bit of protection against + hash bombing. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "Reliable-Traversal-Adds" + +\end_inset + +Reliable Traversal Adds Complexity +\end_layout + +\begin_layout Standard +We lock a record during traversal iteration, and try to grab that lock in + the delete code. + If that grab on delete fails, we simply mark it deleted and continue onwards; + traversal checks for this condition and does the delete when it moves off + the record. +\end_layout + +\begin_layout Standard +If traversal terminates, the dead record may be left indefinitely. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Remove reliability guarantees; see +\begin_inset CommandInset ref +LatexCommand ref +reference "traverse-Proposed-Solution" + +\end_inset + +. +\end_layout + +\begin_layout Subsection +Fcntl Locking Adds Overhead +\end_layout + +\begin_layout Standard +Placing a fcntl lock means a system call, as does removing one. + This is actually one reason why transactions can be faster (everything + is locked once at transaction start). + In the uncontended case, this overhead can theoretically be eliminated. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +None. +\end_layout + +\begin_layout Standard +We tried this before with spinlock support, in the early days of TDB, and + it didn't make much difference except in manufactured benchmarks. +\end_layout + +\begin_layout Standard +We could use spinlocks (with futex kernel support under Linux), but it means + that we lose automatic cleanup when a process dies with a lock. + There is a method of auto-cleanup under Linux, but it's not supported by + other operating systems. + We could reintroduce a clear-if-first-style lock and sweep for dead futexes + on open, but that wouldn't help the normal case of one concurrent opener + dying. + Increasingly elaborate repair schemes could be considered, but they require + an ABI change (everyone must use them) anyway, so there's no need to do + this at the same time as everything else. 
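\end_layout

\begin_layout Standard
For scale, this is the per-operation cost under discussion: every one-byte
 lock tdb takes costs one system call, and dropping it costs another.
 A sketch of the underlying call:
\end_layout

\begin_layout LyX-Code
#include <fcntl.h>
\end_layout

\begin_layout LyX-Code
#include <unistd.h>
\end_layout

\begin_layout LyX-Code
/* type is F_RDLCK, F_WRLCK or F_UNLCK. */
\end_layout

\begin_layout LyX-Code
static int brlock(int fd, off_t off, short type)
\end_layout

\begin_layout LyX-Code
{
\end_layout

\begin_layout LyX-Code
        struct flock fl;
\end_layout

\begin_layout LyX-Code
        fl.l_type = type;
\end_layout

\begin_layout LyX-Code
        fl.l_whence = SEEK_SET;
\end_layout

\begin_layout LyX-Code
        fl.l_start = off;
\end_layout

\begin_layout LyX-Code
        fl.l_len = 1;
\end_layout

\begin_layout LyX-Code
        return fcntl(fd, F_SETLKW, &fl);
\end_layout

\begin_layout LyX-Code
}
\end_layout

\begin_layout Standard
The uncontended round trip is brlock(fd, off, F_WRLCK) followed by
 brlock(fd, off, F_UNLCK): two kernel entries which a futex-based scheme
 would avoid.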
\end_layout

\begin_layout Subsection
Some Transactions Don't Require Durability
\end_layout

\begin_layout Standard
Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
 usage, and occasionally empties the results into a transactional TDB.
 This kind of usage prioritizes performance over durability: as long as
 we are consistent, data can be lost.
\end_layout

\begin_layout Standard
This would be more neatly implemented inside tdb: a
\begin_inset Quotes eld
\end_inset

soft
\begin_inset Quotes erd
\end_inset

 transaction commit (ie.
 syncless) which means that data may be reverted on a crash.
\end_layout

\begin_layout Subsubsection
Proposed Solution
\end_layout

\begin_layout Standard
None.
\end_layout

\begin_layout Standard
Unfortunately any transaction scheme which overwrites old data requires
 a sync before that overwrite to avoid the possibility of corruption.
\end_layout

\begin_layout Standard
It seems possible to use a scheme similar to that described in
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:TDB-Does-Not"

\end_inset

, where transactions are committed without overwriting existing data, and
 an array of top-level pointers is available in the header.
 If the transaction is
\begin_inset Quotes eld
\end_inset

soft
\begin_inset Quotes erd
\end_inset

 then we would not need a sync at all: existing processes would pick up
 the new hash table and free list and work with that.
\end_layout

\begin_layout Standard
At some later point, a sync would allow recovery of the old data into the
 free lists (perhaps when the array of top-level pointers filled).
 On crash, tdb_open() would examine the array of top levels, and apply the
 transactions until it encountered an invalid checksum.
\end_layout

\end_body
\end_document
For more info see http://www.lyx.org/ +\lyxformat 345 +\begin_document +\begin_header +\textclass article +\use_default_options true +\language english +\inputencoding auto +\font_roman default +\font_sans default +\font_typewriter default +\font_default_family default +\font_sc false +\font_osf false +\font_sf_scale 100 +\font_tt_scale 100 + +\graphics default +\paperfontsize default +\use_hyperref false +\papersize default +\use_geometry false +\use_amsmath 1 +\use_esint 1 +\cite_engine basic +\use_bibtopic false +\paperorientation portrait +\secnumdepth 3 +\tocdepth 3 +\paragraph_separation indent +\defskip medskip +\quotes_language english +\papercolumns 1 +\papersides 1 +\paperpagestyle default +\tracking_changes true +\output_changes true +\author "" +\author "" +\end_header + +\begin_body + +\begin_layout Title +TDB2: A Redesigning The Trivial DataBase +\end_layout + +\begin_layout Author +Rusty Russell, IBM Corporation +\end_layout + +\begin_layout Date +26-July-2010 +\end_layout + +\begin_layout Abstract +The Trivial DataBase on-disk format is 32 bits; with usage cases heading + towards the 4G limit, that must change. + This required breakage provides an opportunity to revisit TDB's other design + decisions and reassess them. +\end_layout + +\begin_layout Section +Introduction +\end_layout + +\begin_layout Standard +The Trivial DataBase was originally written by Andrew Tridgell as a simple + key/data pair storage system with the same API as dbm, but allowing multiple + readers and writers while being small enough (< 1000 lines of C) to include + in SAMBA. + The simple design created in 1999 has proven surprisingly robust and performant +, used in Samba versions 3 and 4 as well as numerous other projects. + Its useful life was greatly increased by the (backwards-compatible!) addition + of transaction support in 2005. +\end_layout + +\begin_layout Standard +The wider variety and greater demands of TDB-using code has lead to some + organic growth of the API, as well as some compromises on the implementation. + None of these, by themselves, are seen as show-stoppers, but the cumulative + effect is to a loss of elegance over the initial, simple TDB implementation. 
+ Here is a table of the approximate number of lines of implementation code + and number of API functions at the end of each year: +\end_layout + +\begin_layout Standard +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\begin_layout Plain Layout +Year End +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +API Functions +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +Lines of C Code Implementation +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +1999 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +13 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1195 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2000 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +24 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +1725 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2001 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +32 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2228 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2002 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +35 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2481 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2003 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +35 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2552 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2004 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +40 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2584 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2005 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +38 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +2647 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2006 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +52 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +3754 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2007 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +66 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +4398 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2008 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +71 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +4768 +\end_layout + +\end_inset + + + + +\begin_inset Text + +\begin_layout Plain Layout +2009 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +73 +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +5715 +\end_layout + +\end_inset + + + + +\end_inset + + +\end_layout + +\begin_layout Standard +This review is an attempt to catalog and address all the known issues with + TDB and create solutions which address the problems 
without significantly + increasing complexity; all involved are far too aware of the dangers of + second system syndrome in rewriting a successful project like this. +\end_layout + +\begin_layout Section +API Issues +\end_layout + +\begin_layout Subsection +tdb_open_ex Is Not Expandable +\end_layout + +\begin_layout Standard +The tdb_open() call was expanded to tdb_open_ex(), which added an optional + hashing function and an optional logging function argument. + Additional arguments to open would require the introduction of a tdb_open_ex2 + call etc. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +tdb_open() will take a linked-list of attributes: +\end_layout + +\begin_layout LyX-Code +enum tdb_attribute { +\end_layout + +\begin_layout LyX-Code + TDB_ATTRIBUTE_LOG = 0, +\end_layout + +\begin_layout LyX-Code + TDB_ATTRIBUTE_HASH = 1 +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout LyX-Code +struct tdb_attribute_base { +\end_layout + +\begin_layout LyX-Code + enum tdb_attribute attr; +\end_layout + +\begin_layout LyX-Code + union tdb_attribute *next; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout LyX-Code +struct tdb_attribute_log { +\end_layout + +\begin_layout LyX-Code + struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */ +\end_layout + +\begin_layout LyX-Code + tdb_log_func log_fn; +\end_layout + +\begin_layout LyX-Code + void *log_private; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout LyX-Code +struct tdb_attribute_hash { +\end_layout + +\begin_layout LyX-Code + struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */ +\end_layout + +\begin_layout LyX-Code + tdb_hash_func hash_fn; +\end_layout + +\begin_layout LyX-Code + void *hash_private; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout LyX-Code +union tdb_attribute { +\end_layout + +\begin_layout LyX-Code + struct tdb_attribute_base base; +\end_layout + +\begin_layout LyX-Code + struct tdb_attribute_log log; +\end_layout + +\begin_layout LyX-Code + struct tdb_attribute_hash hash; +\end_layout + +\begin_layout LyX-Code +}; +\end_layout + +\begin_layout Standard +This allows future attributes to be added, even if this expands the size + of the union. +\end_layout + +\begin_layout Subsection +tdb_traverse Makes Impossible Guarantees +\end_layout + +\begin_layout Standard +tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it + was thought that it was important to guarantee that all records which exist + at the start and end of the traversal would be included, and no record + would be included twice. +\end_layout + +\begin_layout Standard +This adds complexity (see +\begin_inset CommandInset ref +LatexCommand ref +reference "Reliable-Traversal-Adds" + +\end_inset + +) and does not work anyway for records which are altered (in particular, + those which are expanded may be effectively deleted and re-added behind + the traversal). +\end_layout + +\begin_layout Subsubsection +\begin_inset CommandInset label +LatexCommand label +name "traverse-Proposed-Solution" + +\end_inset + +Proposed Solution +\end_layout + +\begin_layout Standard +Abandon the guarantee. + You will see every record if no changes occur during your traversal, otherwise + you will see some subset. + You can prevent changes by using a transaction or the locking API. 
+\end_layout + +\begin_layout Subsection +Nesting of Transactions Is Fraught +\end_layout + +\begin_layout Standard +TDB has alternated between allowing nested transactions and not allowing + them. + Various paths in the Samba codebase assume that transactions will nest, + and in a sense they can: the operation is only committed to disk when the + outer transaction is committed. + There are two problems, however: +\end_layout + +\begin_layout Enumerate +Canceling the inner transaction will cause the outer transaction commit + to fail, and will not undo any operations since the inner transaction began. + This problem is soluble with some additional internal code. +\end_layout + +\begin_layout Enumerate +An inner transaction commit can be cancelled by the outer transaction. + This is desirable in the way which Samba's database initialization code + uses transactions, but could be a surprise to any users expecting a successful + transaction commit to expose changes to others. +\end_layout + +\begin_layout Standard +The current solution is to specify the behavior at tdb_open(), with the + default currently that nested transactions are allowed. + This flag can also be changed at runtime. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Given the usage patterns, it seems that the +\begin_inset Quotes eld +\end_inset + +least-surprise +\begin_inset Quotes erd +\end_inset + + behavior of disallowing nested transactions should become the default. + Additionally, it seems the outer transaction is the only code which knows + whether inner transactions should be allowed, so a flag to indicate this + could be added to tdb_transaction_start. + However, this behavior can be simulated with a wrapper which uses tdb_add_flags +() and tdb_remove_flags(), so the API should not be expanded for this relatively +-obscure case. +\end_layout + +\begin_layout Subsection +Incorrect Hash Function is Not Detected +\end_layout + +\begin_layout Standard +tdb_open_ex() allows the calling code to specify a different hash function + to use, but does not check that all other processes accessing this tdb + are using the same hash function. + The result is that records are missing from tdb_fetch(). +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The header should contain an example hash result (eg. + the hash of 0xdeadbeef), and tdb_open_ex() should check that the given + hash function produces the same answer, or fail the tdb_open call. +\end_layout + +\begin_layout Subsection +tdb_set_max_dead/TDB_VOLATILE Expose Implementation +\end_layout + +\begin_layout Standard +In response to scalability issues with the free list ( +\begin_inset CommandInset ref +LatexCommand ref +reference "TDB-Freelist-Is" + +\end_inset + +) two API workarounds have been incorporated in TDB: tdb_set_max_dead() + and the TDB_VOLATILE flag to tdb_open. + The latter actually calls the former with an argument of +\begin_inset Quotes eld +\end_inset + +5 +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Standard +This code allows deleted records to accumulate without putting them in the + free list. + On delete we iterate through each chain and free them in a batch if there + are more than max_dead entries. + These are never otherwise recycled except as a side-effect of a tdb_repack. 
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With the scalability problems of the freelist solved, this API can be removed.
+ The TDB_VOLATILE flag may still be useful as a hint that store and delete
+ of records will be at least as common as fetch in order to allow some internal
+ tuning, but initially will become a no-op.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Files-Cannot"
+
+\end_inset
+
+TDB Files Cannot Be Opened Multiple Times In The Same Process
+\end_layout
+
+\begin_layout Standard
+No process can open the same TDB twice; we check and disallow it.
+ This is an unfortunate side-effect of fcntl locks, which operate on a per-file
+ rather than per-file-descriptor basis, and do not nest.
+ Thus, closing any file descriptor on a file clears all the locks obtained
+ by this process, even if they were placed using a different file descriptor!
+\end_layout
+
+\begin_layout Standard
+Note that even if this were solved, deadlock could occur if operations were
+ nested: this is a more manageable programming error in most cases.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We could lobby POSIX to fix the perverse rules, or at least lobby Linux
+ to violate them so that the most common implementation does not have this
+ restriction.
+ This would be a generally good idea for other fcntl lock users.
+\end_layout
+
+\begin_layout Standard
+Samba uses a wrapper which hands out the same tdb_context to multiple callers
+ if this happens, and does simple reference counting.
+ We should do this inside the tdb library, which already emulates lock nesting
+ internally; it would need to recognize when deadlock occurs within a single
+ process.
+ This would create a new failure mode for tdb operations (while we currently
+ handle locking failures, they are impossible in normal use and a process
+ encountering them can do little but give up).
+\end_layout
+
+\begin_layout Standard
+I do not see benefit in an additional tdb_open flag to indicate whether
+ re-opening is allowed, although there may be some benefit to adding a
+ call to detect when a tdb_context is shared, to allow others to create
+ such an API.
+\end_layout
+
+\begin_layout Subsection
+TDB API Is Not POSIX Thread-safe
+\end_layout
+
+\begin_layout Standard
+The TDB API uses an error code which can be queried after an operation to
+ determine what went wrong.
+ This programming model does not work with threads, unless specific additional
+ guarantees are given by the implementation.
+ In addition, even otherwise-independent threads cannot open the same TDB
+ (as in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Files-Cannot"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Rearchitecting the API to include a tdb_errcode pointer would be a great
+ deal of churn; we are better off guaranteeing that the tdb_errcode is
+ per-thread so the current programming model can be maintained.
+\end_layout
+
+\begin_layout Standard
+This requires dynamic per-thread allocations, which is awkward with POSIX
+ threads (pthread_key_create space is limited and we cannot simply allocate
+ a key for every TDB).
+\end_layout
+
+\begin_layout Standard
+Internal locking is required to make sure that fcntl locks do not overlap
+ between threads, and also that the global list of tdbs is maintained.
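+\end_layout
+
+\begin_layout Standard
+A minimal sketch of the per-thread error slot, assuming a single
+ process-wide pthread key rather than one key per TDB (all names here
+ are illustrative, not settled API):
+\end_layout
+
+\begin_layout LyX-Code
+#include <pthread.h>
+\end_layout
+
+\begin_layout LyX-Code
+#include <stdlib.h>
+\end_layout
+
+\begin_layout LyX-Code
+static pthread_key_t ecode_key;
+\end_layout
+
+\begin_layout LyX-Code
+static pthread_once_t ecode_once = PTHREAD_ONCE_INIT;
+\end_layout
+
+\begin_layout LyX-Code
+static void ecode_init(void) { pthread_key_create(&ecode_key, free); }
+\end_layout
+
+\begin_layout LyX-Code
+/* Return this thread's error slot, allocated on first use. */
+\end_layout
+
+\begin_layout LyX-Code
+static int *tdb_ecode_ptr(void)
+\end_layout
+
+\begin_layout LyX-Code
+{
+\end_layout
+
+\begin_layout LyX-Code
+        int *e;
+\end_layout
+
+\begin_layout LyX-Code
+        pthread_once(&ecode_once, ecode_init);
+\end_layout
+
+\begin_layout LyX-Code
+        e = pthread_getspecific(ecode_key);
+\end_layout
+
+\begin_layout LyX-Code
+        if (!e) {
+\end_layout
+
+\begin_layout LyX-Code
+                e = calloc(1, sizeof(*e));
+\end_layout
+
+\begin_layout LyX-Code
+                pthread_setspecific(ecode_key, e);
+\end_layout
+
+\begin_layout LyX-Code
+        }
+\end_layout
+
+\begin_layout LyX-Code
+        return e;
+\end_layout
+
+\begin_layout LyX-Code
+}
+\end_layout
+
+\begin_layout Standard
+Operations would store their result through such a slot, preserving the
+ existing query-after-operation programming model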
+\end_layout + +\begin_layout Standard +The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe + version of the library, and otherwise no overhead will exist. +\end_layout + +\begin_layout Subsection +*_nonblock Functions And *_mark Functions Expose Implementation +\end_layout + +\begin_layout Standard +CTDB +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +Clustered TDB, see http://ctdb.samba.org +\end_layout + +\end_inset + + wishes to operate on TDB in a non-blocking manner. + This is currently done as follows: +\end_layout + +\begin_layout Enumerate +Call the _nonblock variant of an API function (eg. + tdb_lockall_nonblock). + If this fails: +\end_layout + +\begin_layout Enumerate +Fork a child process, and wait for it to call the normal variant (eg. + tdb_lockall). +\end_layout + +\begin_layout Enumerate +If the child succeeds, call the _mark variant to indicate we already have + the locks (eg. + tdb_lockall_mark). +\end_layout + +\begin_layout Enumerate +Upon completion, tell the child to release the locks (eg. + tdb_unlockall). +\end_layout + +\begin_layout Enumerate +Indicate to tdb that it should consider the locks removed (eg. + tdb_unlockall_mark). +\end_layout + +\begin_layout Standard +There are several issues with this approach. + Firstly, adding two new variants of each function clutters the API for + an obscure use, and so not all functions have three variants. + Secondly, it assumes that all paths of the functions ask for the same locks, + otherwise the parent process will have to get a lock which the child doesn't + have under some circumstances. + I don't believe this is currently the case, but it constrains the implementatio +n. + +\end_layout + +\begin_layout Subsubsection +\begin_inset CommandInset label +LatexCommand label +name "Proposed-Solution-locking-hook" + +\end_inset + +Proposed Solution +\end_layout + +\begin_layout Standard +Implement a hook for locking methods, so that the caller can control the + calls to create and remove fcntl locks. + In this scenario, ctdbd would operate as follows: +\end_layout + +\begin_layout Enumerate +Call the normal API function, eg tdb_lockall(). +\end_layout + +\begin_layout Enumerate +When the lock callback comes in, check if the child has the lock. + Initially, this is always false. + If so, return 0. + Otherwise, try to obtain it in non-blocking mode. + If that fails, return EWOULDBLOCK. +\end_layout + +\begin_layout Enumerate +Release locks in the unlock callback as normal. +\end_layout + +\begin_layout Enumerate +If tdb_lockall() fails, see if we recorded a lock failure; if so, call the + child to repeat the operation. +\end_layout + +\begin_layout Enumerate +The child records what locks it obtains, and returns that information to + the parent. +\end_layout + +\begin_layout Enumerate +When the child has succeeded, goto 1. +\end_layout + +\begin_layout Standard +This is flexible enough to handle any potential locking scenario, even when + lock requirements change. + It can be optimized so that the parent does not release locks, just tells + the child which locks it doesn't need to obtain. +\end_layout + +\begin_layout Standard +It also keeps the complexity out of the API, and in ctdbd where it is needed. +\end_layout + +\begin_layout Subsection +tdb_chainlock Functions Expose Implementation +\end_layout + +\begin_layout Standard +tdb_chainlock locks some number of records, including the record indicated + by the given key. 
+ This gave atomicity guarantees; no-one can start a transaction, alter,
+ read or delete that key while the lock is held.
+\end_layout
+
+\begin_layout Standard
+It also makes the same guarantee for any other key in the chain, which is
+ an internal implementation detail and potentially a cause for deadlock.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ It would be nice to have an explicit single entry lock which affected no
+ other keys.
+ Unfortunately, this won't work for an entry which doesn't exist.
+ Thus while chainlock may be implemented more efficiently for the existing
+ case, it will still have overlap issues with the non-existing case.
+ So it is best to keep the current (lack of) guarantee about which records
+ will be affected to avoid constraining our implementation.
+\end_layout
+
+\begin_layout Subsection
+Signal Handling is Not Race-Free
+\end_layout
+
+\begin_layout Standard
+The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
+ that the tdb locking code should return with a failure, rather than trying
+ again when a signal is received (and errno == EAGAIN).
+ This is usually used to implement timeouts.
+\end_layout
+
+\begin_layout Standard
+Unfortunately, this does not work in the case where the signal is received
+ before the tdb code enters the fcntl() call to place the lock: the code
+ will sleep within the fcntl() code, unaware that the signal wants it to
+ exit.
+ In the case of long timeouts, this does not happen in practice.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The locking hooks proposed in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ would allow the user to decide on whether to fail the lock acquisition
+ on a signal.
+ This allows the caller to choose their own compromise: they could narrow
+ the race by checking immediately before the fcntl call.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+It may be possible to make this race-free in some implementations by having
+ the signal handler alter the struct flock to make it invalid.
+ This will cause the fcntl() lock call to fail with EINVAL if the signal
+ occurs before the kernel is entered, otherwise EAGAIN.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+The API Uses Gratuitous Typedefs, Capitals
+\end_layout
+
+\begin_layout Standard
+typedefs are useful for providing source compatibility when types can differ
+ across implementations, or arguably in the case of function pointer definitions
+ which are hard for humans to parse.
+ Otherwise it is simply obfuscation and pollutes the namespace.
+\end_layout
+
+\begin_layout Standard
+Capitalization is usually reserved for compile-time constants and macros.
+\end_layout
+
+\begin_layout Description
+TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
+ definition isn't visible to the API user anyway.
+\end_layout
+
+\begin_layout Description
+TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
+ needs to be understood by the API user.
+\end_layout
+
+\begin_layout Description
+struct
+\begin_inset space ~
+\end_inset
+
+TDB_DATA This would normally be called 'struct tdb_data'.
+\end_layout
+
+\begin_layout Description
+enum
+\begin_inset space ~
+\end_inset
+
+TDB_ERROR Similarly, this would normally be enum tdb_error.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ Introducing lower case variants would please pedants like myself, but if
+ it were done the existing ones should be kept.
+ There is little point forcing a purely cosmetic change upon tdb users.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+tdb_log_func Doesn't Take The Private Pointer
+\end_layout
+
+\begin_layout Standard
+For API compatibility reasons, the logging function needs to call
+ tdb_get_logging_private() to retrieve the pointer registered by
+ tdb_open_ex for logging.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+It should simply take an extra argument, since we are prepared to break
+ the API/ABI.
+\end_layout
+
+\begin_layout Subsection
+Various Callback Functions Are Not Typesafe
+\end_layout
+
+\begin_layout Standard
+The callback functions in tdb_set_logging_function (after
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
+ all take void * and must internally convert it to the argument type they
+ were expecting.
+\end_layout
+
+\begin_layout Standard
+If this type changes, the compiler will not produce warnings on the callers,
+ since it only sees void *.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With careful use of macros, we can create callback functions which give
+ a warning when used on gcc and the types of the callback and its private
+ argument differ.
+ Unsupported compilers will not give a warning, which is no worse than now.
+ In addition, the callbacks become clearer, as they need not use void *
+ for their parameter.
+\end_layout
+
+\begin_layout Standard
+See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
+\end_layout
+
+\begin_layout Subsection
+TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
+\end_layout
+
+\begin_layout Standard
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
+ be cleared if the caller discovers it is the only process with the TDB
+ open.
+ However, if any caller does not specify TDB_CLEAR_IF_FIRST, its open will
+ not be detected, and it may have the TDB erased underneath it (usually
+ resulting in a crash).
+\end_layout
+
+\begin_layout Standard
+There is a similar issue on fork(); if the parent exits (or otherwise closes
+ the tdb) before the child calls tdb_reopen_all() to establish the lock
+ used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
+ at that moment will believe it alone has opened the TDB and will erase
+ it.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove TDB_CLEAR_IF_FIRST.
+ Other workarounds are possible, but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+.
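+\end_layout
+
+\begin_layout Standard
+As a sketch of the replacement idiom (the initialization test is
+ hypothetical; the point is expanded in the next section):
+\end_layout
+
+\begin_layout LyX-Code
+/* Sketch: explicit clearing instead of TDB_CLEAR_IF_FIRST. */
+\end_layout
+
+\begin_layout LyX-Code
+if (we_are_initializing)        /* hypothetical condition */
+\end_layout
+
+\begin_layout LyX-Code
+        unlink("volatile.tdb");
+\end_layout
+
+\begin_layout LyX-Code
+tdb = tdb_open("volatile.tdb", 0, TDB_DEFAULT, O_RDWR|O_CREAT, 0600);
+\end_layout
+
+\begin_layout Standard
+With this idiom, no other opener needs to know anything special about
+ the file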
+\end_layout
+
+\begin_layout Section
+Performance And Scalability Issues
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+TDB_CLEAR_IF_FIRST Imposes Performance Penalty
+\end_layout
+
+\begin_layout Standard
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
+ 4 (aka.
+ the ACTIVE_LOCK).
+ While these locks never conflict in normal tdb usage, they do add substantial
+ overhead for most fcntl lock implementations when the kernel scans to detect
+ if a lock conflict exists.
+ This is often a single linked list, making the time to acquire and release
+ a fcntl lock O(N) where N is the number of processes with the TDB open,
+ not the number actually doing work.
+\end_layout
+
+\begin_layout Standard
+In a Samba server it is common to have huge numbers of clients sitting
+ idle, and thus Samba has weaned itself off the TDB_CLEAR_IF_FIRST flag.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+There is a flag to tdb_reopen_all() which is used for this optimization:
+ if the parent process will outlive the child, the child does not need the
+ ACTIVE_LOCK.
+ This is a workaround for this very performance issue.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove the flag.
+ It was a neat idea, but even trivial servers tend to know when they are
+ initializing for the first time and can simply unlink the old tdb at that
+ point.
+\end_layout
+
+\begin_layout Subsection
+TDB Files Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This seems to be becoming an issue (so much for
+\begin_inset Quotes eld
+\end_inset
+
+trivial
+\begin_inset Quotes erd
+\end_inset
+
+!), particularly for ldb.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+A new, incompatible TDB format which uses 64 bit offsets internally rather
+ than 32 bit as now.
+ For simplicity of endian conversion (which TDB does on the fly if required),
+ all values will be 64 bit on disk.
+ In practice, some upper bits may be used for other purposes, but at least
+ 56 bits will be available for file offsets.
+\end_layout
+
+\begin_layout Standard
+tdb_open() will automatically detect the old version, and can even create
+ old-format files if TDB_VERSION6 is specified to tdb_open.
+\end_layout
+
+\begin_layout Standard
+32 bit processes will still be able to access TDBs larger than 4G (assuming
+ that their off_t allows them to seek to 64 bits); they will gracefully
+ fall back as they fail to mmap.
+ This can happen already with large TDBs.
+\end_layout
+
+\begin_layout Standard
+Old versions of tdb will fail to open the new TDB files (since 28 August
+ 2009, commit 398d0c29290: prior to that any unrecognized file format would
+ be erased and initialized as a fresh tdb!).
+\end_layout
+
+\begin_layout Subsection
+TDB Records Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This has not been a reported problem, and the API uses size_t which can
+ be 64 bit on 64 bit platforms.
+ However, other limits may have made such an issue moot.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Record sizes will be 64 bit, with an error returned on 32 bit platforms
+ which try to access such records (the current implementation would return
+ TDB_ERR_OOM in a similar case).
+ It seems unlikely that 32 bit keys will be a limitation, so the implementation
+ may not support this (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsection
+Hash Size Is Determined At TDB Creation Time
+\end_layout
+
+\begin_layout Standard
+TDB contains a number of hash chains in the header; the number is specified
+ at creation time, and defaults to 131.
+ This is such a bottleneck on large databases (as each hash chain gets quite
+ long) that LDB uses 10,000 for this hash.
+ In general it is impossible to know what the 'right' answer is at database
+ creation time.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+After comprehensive performance testing on various scalable hash variants
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94.
+ This was annoying because I was previously convinced that an expanding
+ tree of hashes would be very close to optimal.
+\end_layout
+
+\end_inset
+
+, it became clear that it is hard to beat a straight linear hash table which
+ doubles in size when it reaches saturation.
+ There are three details which become important:
+\end_layout
+
+\begin_layout Enumerate
+On encountering a full bucket, we use the next bucket.
+\end_layout
+
+\begin_layout Enumerate
+Extra hash bits are stored with the offset, to reduce comparisons.
+\end_layout
+
+\begin_layout Enumerate
+A marker entry is used on deleting an entry.
+\end_layout
+
+\begin_layout Standard
+The doubling of the table must be done under a transaction; we will not
+ reduce it on deletion, so it will be an unusual case.
+ The new table will be placed at the head (other entries will be moved out
+ of the way so we can expand).
+ We could have a pointer in the header to the current hashtable location,
+ but that pointer would have to be read frequently to check for hashtable
+ moves.
+\end_layout
+
+\begin_layout Standard
+The locking for this is slightly more complex than the chained case; we
+ currently have one lock per bucket, and that means we would need to expand
+ the lock if we overflow to the next bucket.
+ The frequency of such collisions will affect our locking heuristics: we
+ can always lock more buckets than we need.
+\end_layout
+
+\begin_layout Standard
+One possible optimization is to only re-check the hash size on an insert
+ or a lookup miss.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Freelist-Is"
+
+\end_inset
+
+TDB Freelist Is Highly Contended
+\end_layout
+
+\begin_layout Standard
+TDB uses a single linked list for the free list.
+ Allocation occurs as follows, using heuristics which have evolved over
+ time:
+\end_layout
+
+\begin_layout Enumerate
+Get the free list lock for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Multiply length by 1.25, so we always over-allocate by 25%.
+\end_layout
+
+\begin_layout Enumerate
+Set the slack multiplier to 1.
+\end_layout
+
+\begin_layout Enumerate
+Examine the current freelist entry: if it is > length but < the current
+ best case, remember it as the best case.
+\end_layout
+
+\begin_layout Enumerate
+Multiply the slack multiplier by 1.05.
+\end_layout
+
+\begin_layout Enumerate
+If our best fit so far is less than length * slack multiplier, return it.
+ The slack will be turned into a new free record if it's large enough.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, go onto the next freelist entry.
+\end_layout
+
+\begin_layout Standard
+Deleting a record occurs as follows:
+\end_layout
+
+\begin_layout Enumerate
+Lock the hash chain for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Walk the chain to find the record, keeping the prev pointer offset.
+\end_layout
+
+\begin_layout Enumerate
+If max_dead is non-zero:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Walk the hash chain again and count the dead records.
+\end_layout
+
+\begin_layout Enumerate
+If it's more than max_dead, bulk free all the dead ones (similar to steps
+ 4 and below, but the lock is only obtained once).
+\end_layout
+
+\begin_layout Enumerate
+Simply mark this record as dead and return.
+
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+Get the free list lock for the remainder of this operation.
+\end_layout
+
+\begin_layout Enumerate
+\begin_inset CommandInset label
+LatexCommand label
+name "right-merging"
+
+\end_inset
+
+Examine the following block to see if it is free; if so, enlarge the current
+ block and remove that block from the free list.
+ This was disabled, as removal from the free list was O(entries-in-free-list).
+\end_layout
+
+\begin_layout Enumerate
+Examine the preceding block to see if it is free: for this reason, each
+ block has a 32-bit tailer which indicates its length.
+ If it is free, expand it to cover our new block and return.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, prepend ourselves to the free list.
+\end_layout
+
+\begin_layout Standard
+Disabling right-merging (step
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "right-merging"
+
+\end_inset
+
+) causes fragmentation; the other heuristics proved insufficient to address
+ this, so the final answer to this was that when we expand the TDB file
+ inside a transaction commit, we repack the entire tdb.
+\end_layout
+
+\begin_layout Standard
+The single list lock limits our allocation rate; due to the other issues
+ this is not currently seen as a bottleneck.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The first step is to remove all the current heuristics, as they obviously
+ interact, then examine them once the lock contention is addressed.
+\end_layout
+
+\begin_layout Standard
+The free list must be split to reduce contention.
+ Assuming perfect free merging, we can at most have 1 free list entry for
+ each used entry.
+ This implies that the number of free lists is related to the size of the
+ hash table, but as it is rare to walk a large number of free list entries
+ we can use far fewer, say 1/32 of the number of hash buckets.
+\end_layout
+
+\begin_layout Standard
+There are various benefits in using per-size free lists (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+) but it's not clear this would reduce contention in the common case where
+ all processes are allocating/freeing the same size.
+ Thus we almost certainly need to divide in other ways: the most obvious
+ is to divide the file into zones, and use a free list (or set of free
+ lists) for each.
+ This approximates address ordering.
+\end_layout
+
+\begin_layout Standard
+Note that this means we need to split the free lists when we expand the
+ file; this is probably acceptable when we double the hash table size, since
+ that is such an expensive operation already.
+ In the case of increasing the file size, there is an optimization we can
+ use: if we define M as the file size rounded up to the next power of 2,
+ we only need to reshuffle free lists when the file size crosses a power
+ of 2 boundary,
+\emph on
+ and
+\emph default
+ reshuffling the free lists is trivial: we simply merge every consecutive
+ pair of free lists.
+\end_layout
+
+\begin_layout Standard
+The basic algorithm is as follows.
+ Freeing is simple:
+\end_layout
+
+\begin_layout Enumerate
+Identify the correct zone.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+Re-check the zone (we didn't have a lock, sizes could have changed): relock
+ if necessary.
+\end_layout
+
+\begin_layout Enumerate
+Place the freed entry in the list for that zone.
+\end_layout
+
+\begin_layout Standard
+Allocation is a little more complicated, as we perform delayed coalescing
+ at this point:
+\end_layout
+
+\begin_layout Enumerate
+Pick a zone: either the zone we last freed into, or one based on a
+\begin_inset Quotes eld
+\end_inset
+
+random
+\begin_inset Quotes erd
+\end_inset
+
+ number.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+Re-check the zone: relock if necessary.
+\end_layout
+
+\begin_layout Enumerate
+If the top entry is large enough, remove it from the list and return it.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, coalesce entries in the list.
+\end_layout
+
+\begin_layout Enumerate
+If there was no entry large enough, unlock the list and try the next zone.
+\end_layout
+
+\begin_layout Enumerate
+If no zone satisfies, expand the file.
+\end_layout
+
+\begin_layout Standard
+This optimizes rapid insert/delete of free list entries by not coalescing
+ them all the time.
+ First-fit address ordering seems to be fairly good for keeping
+ fragmentation low (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+).
+ Note that address ordering does not need a tailer to coalesce, though if
+ we needed one we could have one cheaply: see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+.
+
+\end_layout
+
+\begin_layout Standard
+I anticipate that the number of entries in each free zone would be small,
+ but it might be worth using one free entry to hold pointers to the others
+ for cache efficiency.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+TDB Becomes Fragmented
+\end_layout
+
+\begin_layout Standard
+Much of this is a result of allocation strategy
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson, 1995.
+ ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
+\end_layout
+
+\end_inset
+
+ and deliberate hobbling of coalescing; internal fragmentation (aka
+ overallocation) is deliberately set at 25%, and external fragmentation
+ is only cured by the decision to repack the entire db when a transaction
+ commit needs to enlarge the file.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The 25% overhead on allocation works in practice for ldb because indexes
+ tend to expand by one record at a time.
+ This internal fragmentation can be resolved by having an
+\begin_inset Quotes eld
+\end_inset
+
+expanded
+\begin_inset Quotes erd
+\end_inset
+
+ bit in the header to note entries that have previously expanded, and allocating
+ more space for them.
+\end_layout
+
+\begin_layout Standard
+There is a spectrum of possible solutions for external fragmentation:
+ one is to use a fragmentation-avoiding allocation strategy such as a
+ best-fit address-order allocator.
+ The other end of the spectrum would be to use a bump allocator (very fast
+ and simple) and simply repack the file when we reach the end.
+\end_layout
+
+\begin_layout Standard
+There are three problems with efficient fragmentation-avoiding allocators:
+ they are non-trivial, they tend to use a single free list for each size,
+ and there's no evidence that tdb allocation patterns will match those recorded
+ for general allocators (though it seems likely).
+\end_layout
+
+\begin_layout Standard
+Thus we don't spend too much effort on external fragmentation; we will be
+ no worse than the current code if we need to repack on occasion.
+ More effort is spent on reducing freelist contention, and reducing overhead.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Records-Incur-A"
+
+\end_inset
+
+Records Incur A 28-Byte Overhead
+\end_layout
+
+\begin_layout Standard
+Each TDB record has a header as follows:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_record {
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_off_t next; /* offset of the next record in the list */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t rec_len; /* total byte length of record */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t key_len; /* byte length of key */
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_len_t data_len; /* byte length of data */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t magic; /* try to catch errors */
+\end_layout
+
+\begin_layout LyX-Code
+        /* the following union is implied:
+\end_layout
+
+\begin_layout LyX-Code
+        union {
+\end_layout
+
+\begin_layout LyX-Code
+            char record[rec_len];
+\end_layout
+
+\begin_layout LyX-Code
+            struct {
+\end_layout
+
+\begin_layout LyX-Code
+                char key[key_len];
+\end_layout
+
+\begin_layout LyX-Code
+                char data[data_len];
+\end_layout
+
+\begin_layout LyX-Code
+            }
+\end_layout
+
+\begin_layout LyX-Code
+            uint32_t totalsize; (tailer)
+\end_layout
+
+\begin_layout LyX-Code
+        }
+\end_layout
+
+\begin_layout LyX-Code
+        */
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+Naively, this would double to a 56-byte overhead on a 64 bit implementation.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We can use various techniques to reduce this for an allocated block:
+\end_layout
+
+\begin_layout Enumerate
+The 'next' pointer is not required, as we are using a flat hash table.
+\end_layout
+
+\begin_layout Enumerate
+'rec_len' can instead be expressed as an addition to key_len and data_len
+ (it accounts for wasted or overallocated length in the record).
+ Since the record length is always a multiple of 8, we can conveniently
+ fit it in 32 bits (representing up to 35 bits).
+\end_layout
+
+\begin_layout Enumerate
+'key_len' and 'data_len' can be reduced.
+ I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
+ the two into one 64-bit field and use a 5 bit value which indicates at
+ what bit to divide the two.
+ Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
+ size of 32 bits.
+\end_layout
+
+\begin_layout Enumerate
+'full_hash' is used to avoid a memcmp on the
+\begin_inset Quotes eld
+\end_inset
+
+miss
+\begin_inset Quotes erd
+\end_inset
+
+ case, but this is diminishing returns after a handful of bits (at 10 bits,
+ it avoids 99.9% of false memcmps).
+ As an aside, as the lower bits are already incorporated in the hash table
+ resolution, the upper bits should be used here.
+\end_layout
+
+\begin_layout Enumerate
+'magic' does not need to be enlarged: it currently reflects one of 5 values
+ (used, free, dead, recovery, and unused_recovery).
+ It is useful for quick sanity checking, however, and should not be eliminated.
+\end_layout
+
+\begin_layout Enumerate
+'tailer' is only used to coalesce free blocks (so a block to the right can
+ find the header to check if this block is free).
+ This can be replaced by a single 'free' bit in the header of the following
+ block (and the tailer only exists in free blocks).
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+This technique is from Thomas Standish.
+ Data Structure Techniques.
+ Addison-Wesley, Reading, Massachusetts, 1980.
+\end_layout
+
+\end_inset
+
+ The current proposed coalescing algorithm doesn't need this, however.
+\end_layout
+
+\begin_layout Standard
+This produces a 16 byte used header like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_used_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t magic : 16,
+\end_layout
+
+\begin_layout LyX-Code
+                 prev_is_free: 1,
+\end_layout
+
+\begin_layout LyX-Code
+                 key_data_divide: 5,
+\end_layout
+
+\begin_layout LyX-Code
+                 top_hash: 10;
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t extra_octets;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t key_and_data_len;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+And a free record like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_free_record {
+\end_layout
+
+\begin_layout LyX-Code
+        uint32_t free_magic;
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t total_length;
+\end_layout
+
+\begin_layout LyX-Code
+        ...
+\end_layout
+
+\begin_layout LyX-Code
+        uint64_t tailer;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout Subsection
+Transaction Commit Requires 4 fdatasync
+\end_layout
+
+\begin_layout Standard
+The current transaction algorithm is:
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+overwrite_with_new_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+remove_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Standard
+On current ext3, each sync flushes all data to disk, so the next 3 syncs
+ are relatively cheap.
+ But this could become a performance bottleneck on other filesystems such
+ as ext4.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Neil Brown points out that this is overzealous, and only one sync is needed:
+\end_layout
+
+\begin_layout Enumerate
+Bundle the recovery data, a transaction counter and a strong checksum of
+ the new data.
+\end_layout
+
+\begin_layout Enumerate
+Take a strong checksum of that whole bundle.
+\end_layout
+
+\begin_layout Enumerate
+Store the bundle in the database.
+\end_layout
+
+\begin_layout Enumerate
+Overwrite the oldest of the two recovery pointers in the header (identified
+ using the transaction counter) with the offset of this bundle.
+\end_layout
+
+\begin_layout Enumerate
+sync.
+\end_layout
+
+\begin_layout Enumerate
+Write the new data to the file.
+\end_layout
+
+\begin_layout Standard
+Checking for recovery means identifying the latest bundle with a valid checksum
+ and using the new data checksum to ensure that it has been applied.
+ This is more expensive than the current check, but need only be done at
+ open.
+ For running databases, a separate header field can be used to indicate
+ a transaction in progress; we need only check for recovery if this is set.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Does-Not"
+
+\end_inset
+
+TDB Does Not Have Snapshot Support
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ At some point you say
+\begin_inset Quotes eld
+\end_inset
+
+use a real database
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+But as a thought experiment, if we implemented transactions to only overwrite
+ free entries (this is tricky: no entry could carry a header bit indicating
+ whether it is free; free status would instead have to be derived from its
+ presence in metadata elsewhere), and a pointer to the hash table, we could
+ create an entirely new commit without destroying existing data.
+\end_layout
+
+\begin_layout Standard
+This would not allow arbitrary changes to the database, such as tdb_repack
+ makes, and would require more space (since we have to preserve the current
+ and future entries at once).
+ If we used hash trees rather than one big hash table, we might only have
+ to rewrite some sections of the hash, too.
+\end_layout
+
+\begin_layout Standard
+We could then implement snapshots using a similar method, using multiple
+ different hash tables/free tables.
+\end_layout
+
+\begin_layout Subsection
+Transactions Cannot Operate in Parallel
+\end_layout
+
+\begin_layout Standard
+Allowing parallel transactions would be useless for ldb, as it hits the
+ index records with just about every update.
+ It would add significant complexity in resolving clashes, and cause all
+ transaction callers to write their code to loop in the case where the
+ transactions spuriously failed.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We could solve a small part of the problem by providing read-only transactions.
+ These would allow one write transaction to begin, but it could not commit
+ until all r/o transactions are done.
+ This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
+ commit.
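+\end_layout
+
+\begin_layout Standard
+A sketch of how such an API might look to callers; these function names
+ are hypothetical, since nothing here exists yet:
+\end_layout
+
+\begin_layout LyX-Code
+if (tdb_ro_transaction_start(tdb) == 0) {   /* hypothetical */
+\end_layout
+
+\begin_layout LyX-Code
+        TDB_DATA v = tdb_fetch(tdb, key);
+\end_layout
+
+\begin_layout LyX-Code
+        /* ...consistent reads here; a writer may start, but
+\end_layout
+
+\begin_layout LyX-Code
+           cannot commit until we finish... */
+\end_layout
+
+\begin_layout LyX-Code
+        free(v.dptr);
+\end_layout
+
+\begin_layout LyX-Code
+        tdb_ro_transaction_end(tdb);        /* hypothetical */
+\end_layout
+
+\begin_layout LyX-Code
+}
+\end_layout
+
+\begin_layout Standard
+Multiple such readers could run in parallel with each other and with one
+ (non-committing) writer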
+\end_layout + +\begin_layout Subsection +Default Hash Function Is Suboptimal +\end_layout + +\begin_layout Standard +The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially + if we expand it to 64 bits), and works best when the hash bucket size is + a prime number (which also means a slow modulus). + In addition, it is highly predictable which could potentially lead to a + Denial of Service attack in some TDB uses. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The Jenkins lookup3 hash +\begin_inset Foot +status open + +\begin_layout Plain Layout +http://burtleburtle.net/bob/c/lookup3.c +\end_layout + +\end_inset + + is a fast and superbly-mixing hash. + It's used by the Linux kernel and almost everything else. + This has the particular properties that it takes an initial seed, and produces + two 32 bit hash numbers, which we can combine into a 64-bit hash. +\end_layout + +\begin_layout Standard +The seed should be created at tdb-creation time from some random source, + and placed in the header. + This is far from foolproof, but adds a little bit of protection against + hash bombing. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "Reliable-Traversal-Adds" + +\end_inset + +Reliable Traversal Adds Complexity +\end_layout + +\begin_layout Standard +We lock a record during traversal iteration, and try to grab that lock in + the delete code. + If that grab on delete fails, we simply mark it deleted and continue onwards; + traversal checks for this condition and does the delete when it moves off + the record. +\end_layout + +\begin_layout Standard +If traversal terminates, the dead record may be left indefinitely. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +Remove reliability guarantees; see +\begin_inset CommandInset ref +LatexCommand ref +reference "traverse-Proposed-Solution" + +\end_inset + +. +\end_layout + +\begin_layout Subsection +Fcntl Locking Adds Overhead +\end_layout + +\begin_layout Standard +Placing a fcntl lock means a system call, as does removing one. + This is actually one reason why transactions can be faster (everything + is locked once at transaction start). + In the uncontended case, this overhead can theoretically be eliminated. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +None. +\end_layout + +\begin_layout Standard +We tried this before with spinlock support, in the early days of TDB, and + it didn't make much difference except in manufactured benchmarks. +\end_layout + +\begin_layout Standard +We could use spinlocks (with futex kernel support under Linux), but it means + that we lose automatic cleanup when a process dies with a lock. + There is a method of auto-cleanup under Linux, but it's not supported by + other operating systems. + We could reintroduce a clear-if-first-style lock and sweep for dead futexes + on open, but that wouldn't help the normal case of one concurrent opener + dying. + Increasingly elaborate repair schemes could be considered, but they require + an ABI change (everyone must use them) anyway, so there's no need to do + this at the same time as everything else. 
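+\end_layout
+
+\begin_layout Standard
+For concreteness, a sketch of the Linux auto-cleanup mechanism alluded to
+ above: a robust, process-shared pthread mutex (initialized with
+ PTHREAD_MUTEX_ROBUST and PTHREAD_PROCESS_SHARED, and living in memory
+ shared between the processes, eg.
+ an mmaped file) tells the next locker when an owner died:
+\end_layout
+
+\begin_layout LyX-Code
+#include <pthread.h>
+\end_layout
+
+\begin_layout LyX-Code
+#include <errno.h>
+\end_layout
+
+\begin_layout LyX-Code
+int lock_robust(pthread_mutex_t *m)
+\end_layout
+
+\begin_layout LyX-Code
+{
+\end_layout
+
+\begin_layout LyX-Code
+        int r = pthread_mutex_lock(m);
+\end_layout
+
+\begin_layout LyX-Code
+        if (r == EOWNERDEAD) {
+\end_layout
+
+\begin_layout LyX-Code
+                /* Owner died with the lock: repair state, then: */
+\end_layout
+
+\begin_layout LyX-Code
+                pthread_mutex_consistent(m);
+\end_layout
+
+\begin_layout LyX-Code
+                r = 0;
+\end_layout
+
+\begin_layout LyX-Code
+        }
+\end_layout
+
+\begin_layout LyX-Code
+        return r;
+\end_layout
+
+\begin_layout LyX-Code
+}
+\end_layout
+
+\begin_layout Standard
+As noted above, this is not portable to all operating systems, so it is
+ not pursued further here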
+\end_layout
+
+\begin_layout Subsection
+Some Transactions Don't Require Durability
+\end_layout
+
+\begin_layout Standard
+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
+ usage, and occasionally empties the results into a transactional TDB.
+ This kind of usage prioritizes performance over durability: as long as
+ we are consistent, data can be lost.
+\end_layout
+
+\begin_layout Standard
+This would be more neatly implemented inside tdb: a
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ transaction commit (ie.
+ syncless) which means that data may be reverted on a crash.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+Unfortunately any transaction scheme which overwrites old data requires
+ a sync before that overwrite to avoid the possibility of corruption.
+\end_layout
+
+\begin_layout Standard
+It seems possible to use a scheme similar to that described in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Does-Not"
+
+\end_inset
+
+, where transactions are committed without overwriting existing data, and
+ an array of top-level pointers is kept in the header.
+ If the transaction is
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ then we would not need a sync at all: existing processes would pick up
+ the new hash table and free list and work with that.
+\end_layout
+
+\begin_layout Standard
+At some later point, a sync would allow recovery of the old data into the
+ free lists (perhaps when the array of top-level pointers filled).
+ On crash, tdb_open() would examine the array of top levels, and apply the
+ transactions until it encountered an invalid checksum.
+\end_layout
+
+\end_body
+\end_document
+@
+d1731 2 +d1737 2 +d1743 2 +d1749 2 +d1756 8 +d1765 2 +d1770 2 +d1773 2 +d1778 7 +a1784 1 +If no list satisfies, expand the file. +d1788 28 +a1815 2 +This optimizes rapid insert/delete of free list entries, and allows us to + get rid of the tailer altogether. +d1819 2 +d1851 1 +a1851 1 +\change_inserted 0 1272941474 +d1857 303 +a2159 18 +\change_inserted 0 1272942759 +There are various ways to organize these lists, but because we want to be + able to quickly identify which free list an entry is in, and reduce the + number of locks required for merging, we will use zoning (eg. + each of the N free lists in a tdb file of size M covers a fixed fraction + M/N). + Note that this means we need to reshuffle the free lists when we expand + the file; this is probably acceptable when we double the hash table size, + since that is such an expensive operation already. + In the case of increasing the file size, there is an optimization we can + use: if we use M in the formula above as the file size rounded up to the + next power of 2, we only need reshuffle free lists when the file size crosses + a power of 2 boundary, +\emph on +and +\emph default +reshuffling the free lists is trivial: we simply merge every consecutive + pair of free lists. +d2164 107 +d2276 2 +d2280 59 +d2346 2 +d2363 2 +d2366 2 +d2371 2 +d2382 2 +d2389 57 +d2458 13 +d2474 32 +a2505 2 +We could implement snapshots using a similar method to the above, only using + multiple different hash tables/free tables. +@ + + +1.1 +log +@Initial revision +@ +text +@d1 1 +a1 1 +#LyX 1.6.4 created this file. For more info see http://www.lyx.org/ +d36 3 +a38 3 +\tracking_changes false +\output_changes false +\author "" +d662 5 +a666 1 + behavior of disallowing transactions should become the default. +d1215 21 +d1527 2 +d1533 3 +a1535 1 + The algorithm for freeing is simple: +d1642 26 +@ diff --git a/ccan/tdb2/doc/design.pdf b/ccan/tdb2/doc/design.pdf new file mode 100644 index 0000000..bfe3350 Binary files /dev/null and b/ccan/tdb2/doc/design.pdf differ diff --git a/ccan/tdb2/doc/design.txt b/ccan/tdb2/doc/design.txt new file mode 100644 index 0000000..2f0d22c --- /dev/null +++ b/ccan/tdb2/doc/design.txt @@ -0,0 +1,1058 @@ +TDB2: A Redesigning The Trivial DataBase + +Rusty Russell, IBM Corporation + +26-July-2010 + +Abstract + +The Trivial DataBase on-disk format is 32 bits; with usage cases +heading towards the 4G limit, that must change. This required +breakage provides an opportunity to revisit TDB's other design +decisions and reassess them. + +1 Introduction + +The Trivial DataBase was originally written by Andrew Tridgell as +a simple key/data pair storage system with the same API as dbm, +but allowing multiple readers and writers while being small +enough (< 1000 lines of C) to include in SAMBA. The simple design +created in 1999 has proven surprisingly robust and performant, +used in Samba versions 3 and 4 as well as numerous other +projects. Its useful life was greatly increased by the +(backwards-compatible!) addition of transaction support in 2005. + +The wider variety and greater demands of TDB-using code has lead +to some organic growth of the API, as well as some compromises on +the implementation. None of these, by themselves, are seen as +show-stoppers, but the cumulative effect is to a loss of elegance +over the initial, simple TDB implementation. 
Here is a table of +the approximate number of lines of implementation code and number +of API functions at the end of each year: + + ++-----------+----------------+--------------------------------+ +| Year End | API Functions | Lines of C Code Implementation | ++-----------+----------------+--------------------------------+ ++-----------+----------------+--------------------------------+ +| 1999 | 13 | 1195 | ++-----------+----------------+--------------------------------+ +| 2000 | 24 | 1725 | ++-----------+----------------+--------------------------------+ +| 2001 | 32 | 2228 | ++-----------+----------------+--------------------------------+ +| 2002 | 35 | 2481 | ++-----------+----------------+--------------------------------+ +| 2003 | 35 | 2552 | ++-----------+----------------+--------------------------------+ +| 2004 | 40 | 2584 | ++-----------+----------------+--------------------------------+ +| 2005 | 38 | 2647 | ++-----------+----------------+--------------------------------+ +| 2006 | 52 | 3754 | ++-----------+----------------+--------------------------------+ +| 2007 | 66 | 4398 | ++-----------+----------------+--------------------------------+ +| 2008 | 71 | 4768 | ++-----------+----------------+--------------------------------+ +| 2009 | 73 | 5715 | ++-----------+----------------+--------------------------------+ + + +This review is an attempt to catalog and address all the known +issues with TDB and create solutions which address the problems +without significantly increasing complexity; all involved are far +too aware of the dangers of second system syndrome in rewriting a +successful project like this. + +2 API Issues + +2.1 tdb_open_ex Is Not Expandable + +The tdb_open() call was expanded to tdb_open_ex(), which added an +optional hashing function and an optional logging function +argument. Additional arguments to open would require the +introduction of a tdb_open_ex2 call etc. + +2.1.1 Proposed Solution + +tdb_open() will take a linked-list of attributes: + +enum tdb_attribute { + + TDB_ATTRIBUTE_LOG = 0, + + TDB_ATTRIBUTE_HASH = 1 + +}; + +struct tdb_attribute_base { + + enum tdb_attribute attr; + + union tdb_attribute *next; + +}; + +struct tdb_attribute_log { + + struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG +*/ + + tdb_log_func log_fn; + + void *log_private; + +}; + +struct tdb_attribute_hash { + + struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH +*/ + + tdb_hash_func hash_fn; + + void *hash_private; + +}; + +union tdb_attribute { + + struct tdb_attribute_base base; + + struct tdb_attribute_log log; + + struct tdb_attribute_hash hash; + +}; + +This allows future attributes to be added, even if this expands +the size of the union. + +2.2 tdb_traverse Makes Impossible Guarantees + +tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, +and it was thought that it was important to guarantee that all +records which exist at the start and end of the traversal would +be included, and no record would be included twice. + +This adds complexity (see[Reliable-Traversal-Adds]) and does not +work anyway for records which are altered (in particular, those +which are expanded may be effectively deleted and re-added behind +the traversal). + +2.2.1 Proposed Solution + +Abandon the guarantee. You will see every record if no changes +occur during your traversal, otherwise you will see some subset. +You can prevent changes by using a transaction or the locking +API. 
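+
+For example, a caller who needs a consistent view can take one
+explicitly with the locking API; a sketch (count_keys is a
+hypothetical tdb_traverse callback which simply counts records):
+
+int count = 0;
+
+tdb_lockall(tdb);
+
+tdb_traverse(tdb, count_keys, &count);
+
+tdb_unlockall(tdb);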
+ +2.3 Nesting of Transactions Is Fraught + +TDB has alternated between allowing nested transactions and not +allowing them. Various paths in the Samba codebase assume that +transactions will nest, and in a sense they can: the operation is +only committed to disk when the outer transaction is committed. +There are two problems, however: + +1. Canceling the inner transaction will cause the outer + transaction commit to fail, and will not undo any operations + since the inner transaction began. This problem is soluble with + some additional internal code. + +2. An inner transaction commit can be cancelled by the outer + transaction. This is desirable in the way which Samba's + database initialization code uses transactions, but could be a + surprise to any users expecting a successful transaction commit + to expose changes to others. + +The current solution is to specify the behavior at tdb_open(), +with the default currently that nested transactions are allowed. +This flag can also be changed at runtime. + +2.3.1 Proposed Solution + +Given the usage patterns, it seems that the “least-surprise” +behavior of disallowing nested transactions should become the +default. Additionally, it seems the outer transaction is the only +code which knows whether inner transactions should be allowed, so +a flag to indicate this could be added to tdb_transaction_start. +However, this behavior can be simulated with a wrapper which uses +tdb_add_flags() and tdb_remove_flags(), so the API should not be +expanded for this relatively-obscure case. + +2.4 Incorrect Hash Function is Not Detected + +tdb_open_ex() allows the calling code to specify a different hash +function to use, but does not check that all other processes +accessing this tdb are using the same hash function. The result +is that records are missing from tdb_fetch(). + +2.4.1 Proposed Solution + +The header should contain an example hash result (eg. the hash of +0xdeadbeef), and tdb_open_ex() should check that the given hash +function produces the same answer, or fail the tdb_open call. + +2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation + +In response to scalability issues with the free list ([TDB-Freelist-Is] +) two API workarounds have been incorporated in TDB: +tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The +latter actually calls the former with an argument of “5”. + +This code allows deleted records to accumulate without putting +them in the free list. On delete we iterate through each chain +and free them in a batch if there are more than max_dead entries. +These are never otherwise recycled except as a side-effect of a +tdb_repack. + +2.5.1 Proposed Solution + +With the scalability problems of the freelist solved, this API +can be removed. The TDB_VOLATILE flag may still be useful as a +hint that store and delete of records will be at least as common +as fetch in order to allow some internal tuning, but initially +will become a no-op. + +2.6 TDB Files Cannot Be Opened Multiple Times + In The Same Process + +No process can open the same TDB twice; we check and disallow it. +This is an unfortunate side-effect of fcntl locks, which operate +on a per-file rather than per-file-descriptor basis, and do not +nest. Thus, closing any file descriptor on a file clears all the +locks obtained by this process, even if they were placed using a +different file descriptor! + +Note that even if this were solved, deadlock could occur if +operations were nested: this is a more manageable programming +error in most cases. 
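+
+The fcntl behavior itself is easy to demonstrate without tdb at
+all; a sketch:
+
+int fd1 = open("test.tdb", O_RDWR);
+
+int fd2 = open("test.tdb", O_RDWR);
+
+struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };
+
+fcntl(fd1, F_SETLKW, &fl); /* locks the whole file via fd1 */
+
+close(fd2); /* silently drops the lock taken via fd1! */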
+
+2.6.1 Proposed Solution
+
+We could lobby POSIX to fix the perverse rules, or at least lobby
+Linux to violate them so that the most common implementation does
+not have this restriction. This would be a generally good idea
+for other fcntl lock users.
+
+Samba uses a wrapper which hands out the same tdb_context to
+multiple callers if this happens, and does simple reference
+counting. We should do this inside the tdb library, which already
+emulates lock nesting internally; it would need to recognize when
+deadlock occurs within a single process. This would create a new
+failure mode for tdb operations (while we currently handle
+locking failures, they are impossible in normal use and a process
+encountering them can do little but give up).
+
+I do not see benefit in an additional tdb_open flag to indicate
+whether re-opening is allowed, though there may be some benefit
+in adding a call to detect when a tdb_context is shared, to allow
+others to create such an API.
+
+2.7 TDB API Is Not POSIX Thread-safe
+
+The TDB API uses an error code which can be queried after an
+operation to determine what went wrong. This programming model
+does not work with threads, unless specific additional guarantees
+are given by the implementation. In addition, even
+otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
+).
+
+2.7.1 Proposed Solution
+
+Rearchitecting the API to include a tdb_errcode pointer would be
+a great deal of churn; we would do better to guarantee that the
+tdb_errcode is per-thread so the current programming model can be
+maintained.
+
+This requires dynamic per-thread allocations, which is awkward
+with POSIX threads (pthread_key_create space is limited and we
+cannot simply allocate a key for every TDB).
+
+Internal locking is required to make sure that fcntl locks do not
+overlap between threads, and also that the global list of tdbs is
+maintained.
+
+The aim is that building tdb with -DTDB_PTHREAD will result in a
+pthread-safe version of the library, and otherwise no overhead
+will exist.
+
+2.8 *_nonblock Functions And *_mark Functions Expose
+ Implementation
+
+CTDB[footnote:
+Clustered TDB, see http://ctdb.samba.org
+] wishes to operate on TDB in a non-blocking manner. This is
+currently done as follows:
+
+1. Call the _nonblock variant of an API function (eg.
+  tdb_lockall_nonblock). If this fails:
+
+2. Fork a child process, and wait for it to call the normal
+  variant (eg. tdb_lockall).
+
+3. If the child succeeds, call the _mark variant to indicate we
+  already have the locks (eg. tdb_lockall_mark).
+
+4. Upon completion, tell the child to release the locks (eg.
+  tdb_unlockall).
+
+5. Indicate to tdb that it should consider the locks removed (eg.
+  tdb_unlockall_mark).
+
+There are several issues with this approach. Firstly, adding two
+new variants of each function clutters the API for an obscure
+use, and so not all functions have three variants. Secondly, it
+assumes that all paths of the functions ask for the same locks,
+otherwise the parent process will have to get a lock which the
+child doesn't have under some circumstances. I don't believe this
+is currently the case, but it constrains the implementation.
+
+2.8.1 Proposed Solution
+
+Implement a hook for locking methods, so that the caller can
+control the calls to create and remove fcntl locks. In this
+scenario, ctdbd would operate as follows:
+
+1. Call the normal API function, eg tdb_lockall().
+
+2. When the lock callback comes in, check if the child has the
+  lock: initially, this is always false. If so, return 0;
+  otherwise, try to obtain it in non-blocking mode. If that
+  fails, return EWOULDBLOCK.
+
+3. Release locks in the unlock callback as normal.
+
+4. If tdb_lockall() fails, see if we recorded a lock failure; if
+  so, call the child to repeat the operation.
+
+5. The child records what locks it obtains, and returns that
+  information to the parent.
+
+6. When the child has succeeded, goto 1.
+
+This is flexible enough to handle any potential locking scenario,
+even when lock requirements change. It can be optimized so that
+the parent does not release locks, just tells the child which
+locks it doesn't need to obtain.
+
+It also keeps the complexity out of the API, and in ctdbd where
+it is needed.
+
+2.9 tdb_chainlock Functions Expose Implementation
+
+tdb_chainlock locks some number of records, including the record
+indicated by the given key. This gives atomicity guarantees:
+no-one can start a transaction, alter, read or delete that key
+while the lock is held.
+
+It also makes the same guarantee for any other key in the chain,
+which is an internal implementation detail and potentially a
+cause for deadlock.
+
+2.9.1 Proposed Solution
+
+None. It would be nice to have an explicit single entry lock
+which affected no other keys. Unfortunately, this won't work for
+an entry which doesn't exist. Thus while chainlock may be
+implemented more efficiently for the existing case, it will still
+have overlap issues with the non-existing case. So it is best to
+keep the current (lack of) guarantee about which records will be
+affected, to avoid constraining our implementation.
+
+2.10 Signal Handling is Not Race-Free
+
+The tdb_setalarm_sigptr() call allows the caller's signal handler
+to indicate that the tdb locking code should return with a
+failure, rather than trying again when a signal is received (and
+errno == EAGAIN). This is usually used to implement timeouts.
+
+Unfortunately, this does not work in the case where the signal is
+received before the tdb code enters the fcntl() call to place the
+lock: the code will sleep within the fcntl() code, unaware that
+the signal wants it to exit. In the case of long timeouts, this
+does not happen in practice.
+
+2.10.1 Proposed Solution
+
+The locking hooks proposed in [Proposed-Solution-locking-hook]
+would allow the user to decide on whether to fail the lock
+acquisition on a signal. This allows the caller to choose their
+own compromise: they could narrow the race by checking
+immediately before the fcntl call.[footnote:
+It may be possible to make this race-free in some implementations
+by having the signal handler alter the struct flock to make it
+invalid. This will cause the fcntl() lock call to fail with
+EINVAL if the signal occurs before the kernel is entered,
+otherwise EAGAIN.
+]
+
+2.11 The API Uses Gratuitous Typedefs, Capitals
+
+typedefs are useful for providing source compatibility when types
+can differ across implementations, or arguably in the case of
+function pointer definitions which are hard for humans to parse.
+Otherwise they are simply obfuscation and pollute the namespace.
+
+Capitalization is usually reserved for compile-time constants and
+macros.
+
+  TDB_CONTEXT There is no reason to use this over 'struct
+  tdb_context'; the definition isn't visible to the API user
+  anyway.
+
+  TDB_DATA There is no reason to use this over struct TDB_DATA;
+  the struct needs to be understood by the API user.
+
+  struct TDB_DATA This would normally be called 'struct
+  tdb_data'.
+ + enum TDB_ERROR Similarly, this would normally be enum + tdb_error. + +2.11.1 Proposed Solution + +None. Introducing lower case variants would please pedants like +myself, but if it were done the existing ones should be kept. +There is little point forcing a purely cosmetic change upon tdb +users. + +2.12 tdb_log_func Doesn't Take The + Private Pointer + +For API compatibility reasons, the logging function needs to call +tdb_get_logging_private() to retrieve the pointer registered by +the tdb_open_ex for logging. + +2.12.1 Proposed Solution + +It should simply take an extra argument, since we are prepared to +break the API/ABI. + +2.13 Various Callback Functions Are Not Typesafe + +The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take] + is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read +and tdb_check all take void * and must internally convert it to +the argument type they were expecting. + +If this type changes, the compiler will not produce warnings on +the callers, since it only sees void *. + +2.13.1 Proposed Solution + +With careful use of macros, we can create callback functions +which give a warning when used on gcc and the types of the +callback and its private argument differ. Unsupported compilers +will not give a warning, which is no worse than now. In addition, +the callbacks become clearer, as they need not use void * for +their parameter. + +See CCAN's typesafe_cb module at +http://ccan.ozlabs.org/info/typesafe_cb.html + +2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, + tdb_reopen_all Problematic + +The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB +file should be cleared if the caller discovers it is the only +process with the TDB open. However, if any caller does not +specify TDB_CLEAR_IF_FIRST it will not be detected, so will have +the TDB erased underneath them (usually resulting in a crash). + +There is a similar issue on fork(); if the parent exits (or +otherwise closes the tdb) before the child calls tdb_reopen_all() +to establish the lock used to indicate the TDB is opened by +someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe +it alone has opened the TDB and will erase it. + +2.14.1 Proposed Solution + +Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but +see [TDB_CLEAR_IF_FIRST-Imposes-Performance]. + +3 Performance And Scalability Issues + +3.1 TDB_CLEAR_IF_FIRST + Imposes Performance Penalty + +When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is +placed at offset 4 (aka. the ACTIVE_LOCK). While these locks +never conflict in normal tdb usage, they do add substantial +overhead for most fcntl lock implementations when the kernel +scans to detect if a lock conflict exists. This is often a single +linked list, making the time to acquire and release a fcntl lock +O(N) where N is the number of processes with the TDB open, not +the number actually doing work. + +In a Samba server it is common to have huge numbers of clients +sitting idle, and thus they have weaned themselves off the +TDB_CLEAR_IF_FIRST flag.[footnote: +There is a flag to tdb_reopen_all() which is used for this +optimization: if the parent process will outlive the child, the +child does not need the ACTIVE_LOCK. This is a workaround for +this very performance issue. +] + +3.1.1 Proposed Solution + +Remove the flag. It was a neat idea, but even trivial servers +tend to know when they are initializing for the first time and +can simply unlink the old tdb at that point. 
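+
+A sketch of such an initialization path (first_invocation is a
+hypothetical flag the server derives however it likes):
+
+if (first_invocation)
+
+        unlink("server.tdb");
+
+tdb = tdb_open("server.tdb", 0 /* default hash size */,
+
+               TDB_DEFAULT, O_CREAT|O_RDWR, 0600);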
+
+3.2 TDB Files Have a 4G Limit
+
+This seems to be becoming an issue (so much for “trivial”!),
+particularly for ldb.
+
+3.2.1 Proposed Solution
+
+A new, incompatible TDB format which uses 64 bit offsets
+internally rather than 32 bit as now. For simplicity of endian
+conversion (which TDB does on the fly if required), all values
+will be 64 bit on disk. In practice, some upper bits may be used
+for other purposes, but at least 56 bits will be available for
+file offsets.
+
+tdb_open() will automatically detect the old version, and even
+create one if TDB_VERSION6 is specified to tdb_open.
+
+32 bit processes will still be able to access TDBs larger than 4G
+(assuming that their off_t allows them to seek to 64 bits); they
+will gracefully fall back as they fail to mmap. This can happen
+already with large TDBs.
+
+Old versions of tdb will fail to open the new TDB files (since 28
+August 2009, commit 398d0c29290: prior to that any unrecognized
+file format would be erased and initialized as a fresh tdb!).
+
+3.3 TDB Records Have a 4G Limit
+
+This has not been a reported problem, and the API uses size_t,
+which can be 64 bit on 64 bit platforms. However, other limits
+may have made such an issue moot.
+
+3.3.1 Proposed Solution
+
+Record sizes will be 64 bit, with an error returned on 32 bit
+platforms which try to access such records (the current
+implementation would return TDB_ERR_OOM in a similar case). It
+seems unlikely that 32 bit keys will be a limitation, so the
+implementation may not support this (see [sub:Records-Incur-A]).
+
+3.4 Hash Size Is Determined At TDB Creation Time
+
+TDB contains a number of hash chains in the header; the number is
+specified at creation time, and defaults to 131. This is such a
+bottleneck on large databases (as each hash chain gets quite
+long) that LDB uses 10,000 for this hash. In general it is
+impossible to know what the 'right' answer is at database
+creation time.
+
+3.4.1 Proposed Solution
+
+After comprehensive performance testing on various scalable hash
+variants[footnote:
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94
+This was annoying because I was previously convinced that an
+expanding tree of hashes would be very close to optimal.
+], it became clear that it is hard to beat a straight linear hash
+table which doubles in size when it reaches saturation. There are
+three details which become important:
+
+1. On encountering a full bucket, we use the next bucket.
+
+2. Extra hash bits are stored with the offset, to reduce
+  comparisons.
+
+3. A marker entry is used on deleting an entry.
+
+The doubling of the table must be done under a transaction; we
+will not reduce it on deletion, so it will be an unusual case. It
+will either be placed at the head (other entries will be moved
+out of the way so we can expand), or allocated elsewhere in the
+file. We could have a pointer in the header to the current
+hashtable location, but that pointer would have to be read
+frequently to check for hashtable moves.
+
+The locking for this is slightly more complex than the chained
+case; we currently have one lock per bucket, and that means we
+would need to expand the lock if we overflow to the next bucket.
+The frequency of such collisions will affect our locking
+heuristics: we can always lock more buckets than we need.
+
+One possible optimization is to only re-check the hash size on an
+insert or a lookup miss.
+
+3.5 TDB Freelist Is Highly Contended
+
+TDB uses a single linked list for the free list.
+Allocation occurs as follows, using heuristics which have evolved
+over time:
+
+1. Get the free list lock for this whole operation.
+
+2. Multiply length by 1.25, so we always over-allocate by 25%.
+
+3. Set the slack multiplier to 1.
+
+4. Examine the current freelist entry: if it is > length but <
+  the current best case, remember it as the best case.
+
+5. Multiply the slack multiplier by 1.05.
+
+6. If our best fit so far is less than length * slack multiplier,
+  return it. The slack will be turned into a new free record if
+  it's large enough.
+
+7. Otherwise, go onto the next freelist entry.
+
+Deleting a record occurs as follows:
+
+1. Lock the hash chain for this whole operation.
+
+2. Walk the chain to find the record, keeping the prev pointer
+  offset.
+
+3. If max_dead is non-zero:
+
+  (a) Walk the hash chain again and count the dead records.
+
+  (b) If it's more than max_dead, bulk free all the dead ones
+    (similar to steps 4 and below, but the lock is only obtained
+    once).
+
+  (c) Simply mark this record as dead and return.
+
+4. Get the free list lock for the remainder of this operation.
+
+5. Examine the following block to see if it is
+  free; if so, enlarge the current block and remove that block
+  from the free list. This was disabled, as removal from the free
+  list was O(entries-in-free-list).
+
+6. Examine the preceding block to see if it is free: for this
+  reason, each block has a 32-bit tailer which indicates its
+  length. If it is free, expand it to cover our new block and
+  return.
+
+7. Otherwise, prepend ourselves to the free list.
+
+Disabling right-merging (step [right-merging]) causes
+fragmentation; the other heuristics proved insufficient to
+address this, so the final answer to this was that when we expand
+the TDB file inside a transaction commit, we repack the entire
+tdb.
+
+The single list lock limits our allocation rate; due to the other
+issues this is not currently seen as a bottleneck.
+
+3.5.1 Proposed Solution
+
+The first step is to remove all the current heuristics, as they
+obviously interact, then examine them once the lock contention is
+addressed.
+
+The free list must be split to reduce contention. Assuming
+perfect free merging, we can at most have 1 free list entry for
+each entry. This implies that the number of free lists is related
+to the size of the hash table, but as it is rare to walk a large
+number of free list entries we can use far fewer, say 1/32 of the
+number of hash buckets.
+
+There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
+) but it's not clear this would reduce contention in the common
+case where all processes are allocating/freeing the same size.
+Thus we almost certainly need to divide in other ways: the most
+obvious is to divide the file into zones, and use a free list
+(or set of free lists) for each. This approximates address
+ordering.
+
+Note that this means we need to split the free lists when we
+expand the file; this is probably acceptable when we double the
+hash table size, since that is such an expensive operation
+already. In the case of increasing the file size, there is an
+optimization we can use: if each of the N free lists covers a
+fixed fraction M/N of the file, where M is the file size rounded
+up to the next power of 2, we only need to reshuffle free lists
+when the file size crosses a power of 2 boundary, and
+reshuffling the free lists is trivial: we simply merge every
+consecutive pair of free lists.
+
+The basic algorithm is as follows. Freeing is simple:
+
+1. Identify the correct zone.
+
+2. Lock the corresponding list.
+
+3. Re-check the zone (we didn't have a lock, sizes could have
+  changed): relock if necessary.
+
+4. Place the freed entry in the list for that zone.
+
+Allocation is a little more complicated, as we perform delayed
+coalescing at this point:
+
+1. Pick a zone: either the zone we last freed into, or one based
+  on a “random” number.
+
+2. Lock the corresponding list.
+
+3. Re-check the zone: relock if necessary.
+
+4. If the top entry is large enough, remove it from the list and
+  return it.
+
+5. Otherwise, coalesce entries in the list. If there was no entry
+  large enough, unlock the list and try the next zone.
+
+6. If no zone satisfies, expand the file.
+
+This optimizes rapid insert/delete of free list entries by not
+coalescing them all the time. First-fit address ordering
+seems to be fairly good for keeping fragmentation low
+(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
+does not need a tailer to coalesce, though if we needed one we
+could have one cheaply: see [sub:Records-Incur-A].
+
+I anticipate that the number of entries in each free zone would
+be small, but it might be worth using one free entry to hold
+pointers to the others for cache efficiency.
+
+3.6 TDB Becomes Fragmented
+
+Much of this is a result of allocation strategy[footnote:
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
+ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
+] and deliberate hobbling of coalescing; internal fragmentation
+(aka overallocation) is deliberately set at 25%, and external
+fragmentation is only cured by the decision to repack the entire
+db when a transaction commit needs to enlarge the file.
+
+3.6.1 Proposed Solution
+
+The 25% overhead on allocation works in practice for ldb because
+indexes tend to expand by one record at a time. This internal
+fragmentation can be resolved by having an “expanded” bit in the
+header to note entries that have previously expanded, and
+allocating more space for them.
+
+There is a spectrum of possible solutions for external
+fragmentation: one is to use a fragmentation-avoiding allocation
+strategy such as a best-fit address-order allocator. The other
+end of the spectrum would be to use a bump allocator (very fast
+and simple) and simply repack the file when we reach the end.
+
+There are three problems with efficient fragmentation-avoiding
+allocators: they are non-trivial, they tend to use a single free
+list for each size, and there's no evidence that tdb allocation
+patterns will match those recorded for general allocators (though
+it seems likely).
+
+Thus we don't spend too much effort on external fragmentation; we
+will be no worse than the current code if we need to repack on
+occasion. More effort is spent on reducing freelist contention,
+and reducing overhead.
+
+3.7 Records Incur A 28-Byte Overhead
+
+Each TDB record has a header as follows:
+
+struct tdb_record {
+
+        tdb_off_t next; /* offset of the next record in the list
+*/
+
+        tdb_len_t rec_len; /* total byte length of record */
+
+        tdb_len_t key_len; /* byte length of key */
+
+        tdb_len_t data_len; /* byte length of data */
+
+        uint32_t full_hash; /* the full 32 bit hash of the key */
+
+        uint32_t magic; /* try to catch errors */
+
+        /* the following union is implied:
+
+                union {
+
+                        char record[rec_len];
+
+                        struct {
+
+                                char key[key_len];
+
+                                char data[data_len];
+
+                        }
+
+                        uint32_t totalsize; (tailer)
+
+                }
+
+        */
+
+};
+
+Naively, this would double to a 56-byte overhead on a 64 bit
+implementation.
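+
+(For concreteness: next, rec_len, key_len, data_len, full_hash
+and magic are six 4-byte fields, i.e. a 24-byte header, and the
+32-bit tailer brings the total to 28 bytes. With every field
+widened to 64 bits, 6 * 8 + 8 = 56 bytes.)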
+
+3.7.1 Proposed Solution
+
+We can use various techniques to reduce this for an allocated
+block:
+
+1. The 'next' pointer is not required, as we are using a flat
+  hash table.
+
+2. 'rec_len' can instead be expressed as an addition to key_len
+  and data_len (it accounts for wasted or overallocated length in
+  the record). Since the record length is always a multiple of 8,
+  we can conveniently fit it in 32 bits (representing up to 35
+  bits).
+
+3. 'key_len' and 'data_len' can be reduced. I'm unwilling to
+  restrict 'data_len' to 32 bits, but instead we can combine the
+  two into one 64-bit field, using a 5 bit value which indicates
+  at what bit to divide the two. Keys are unlikely to scale as
+  fast as data, so I'm assuming a maximum key size of 32 bits.
+
+4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
+  this is diminishing returns after a handful of bits (at 10
+  bits, it eliminates 99.9% of false memcmps). As an aside, as
+  the lower bits are already incorporated in the hash table
+  resolution, the upper bits should be used here.
+
+5. 'magic' does not need to be enlarged: it currently reflects
+  one of 5 values (used, free, dead, recovery, and
+  unused_recovery). It is useful for quick sanity checking
+  however, and should not be eliminated.
+
+6. 'tailer' is only used to coalesce free blocks (so a block to
+  the right can find the header to check if this block is free).
+  This can be replaced by a single 'free' bit in the header of
+  the following block (and the tailer only exists in free
+  blocks).[footnote:
+This technique from Thomas Standish. Data Structure Techniques.
+Addison-Wesley, Reading, Massachusetts, 1980.
+] The current proposed coalescing algorithm doesn't need this,
+  however.
+
+This produces a 16 byte used header like this:
+
+struct tdb_used_record {
+
+        uint32_t magic : 16,
+
+                 prev_is_free: 1,
+
+                 key_data_divide: 5,
+
+                 top_hash: 10;
+
+        uint32_t extra_octets;
+
+        uint64_t key_and_data_len;
+
+};
+
+And a free record like this:
+
+struct tdb_free_record {
+
+        uint32_t free_magic;
+
+        uint64_t total_length;
+
+        ...
+
+        uint64_t tailer;
+
+};
+
+
+
+3.8 Transaction Commit Requires 4 fdatasync
+
+The current transaction algorithm is:
+
+1. write_recovery_data();
+
+2. sync();
+
+3. write_recovery_header();
+
+4. sync();
+
+5. overwrite_with_new_data();
+
+6. sync();
+
+7. remove_recovery_header();
+
+8. sync();
+
+On current ext3, each sync flushes all data to disk, so the next
+3 syncs are relatively expensive. But this could become a
+performance bottleneck on other filesystems such as ext4.
+
+3.8.1 Proposed Solution
+
+Neil Brown points out that this is overzealous, and only one sync
+is needed:
+
+1. Bundle the recovery data, a transaction counter and a strong
+  checksum of the new data.
+
+2. Take a strong checksum of that whole bundle.
+
+3. Store the bundle in the database.
+
+4. Overwrite the oldest of the two recovery pointers in the
+  header (identified using the transaction counter) with the
+  offset of this bundle.
+
+5. sync.
+
+6. Write the new data to the file.
+
+Checking for recovery means identifying the latest bundle with a
+valid checksum and using the new data checksum to ensure that it
+has been applied. This is more expensive than the current check,
+but need only be done at open. For running databases, a separate
+header field can be used to indicate a transaction in progress;
+we need only check for recovery if this is set.
+
+3.9 TDB Does Not Have Snapshot Support
+
+3.9.1 Proposed Solution
+
+None.
At some point you say “use a real database”.
+
+But as a thought experiment, if we implemented transactions to
+only overwrite free entries (this is tricky: there must not be a
+header in each entry which indicates whether it is free; that
+would have to be derived from metadata kept elsewhere, such as
+which list the entry is on), and a pointer to the hash
+table, we could create an entirely new commit without destroying
+existing data. Then it would be easy to implement snapshots in a
+similar way.
+
+This would not allow arbitrary changes to the database, such as
+tdb_repack does, and would require more space (since we have to
+preserve the current and future entries at once). If we used hash
+trees rather than one big hash table, we might only have to
+rewrite some sections of the hash, too.
+
+We could then implement snapshots using a similar method, using
+multiple different hash tables/free tables.
+
+3.10 Transactions Cannot Operate in Parallel
+
+This would be useless for ldb, as it hits the index records with
+just about every update. It would add significant complexity in
+resolving clashes, and cause all transaction callers to write
+their code to loop in the case where the transactions spuriously
+failed.
+
+3.10.1 Proposed Solution
+
+We could solve a small part of the problem by providing read-only
+transactions. These would allow one write transaction to begin,
+but it could not commit until all r/o transactions are done. This
+would require a new RO_TRANSACTION_LOCK, which would be upgraded
+on commit.
+
+3.11 Default Hash Function Is Suboptimal
+
+The Knuth-inspired multiplicative hash used by tdb is fairly slow
+(especially if we expand it to 64 bits), and works best when the
+hash bucket size is a prime number (which also means a slow
+modulus). In addition, it is highly predictable, which could
+potentially lead to a Denial of Service attack in some TDB uses.
+
+3.11.1 Proposed Solution
+
+The Jenkins lookup3 hash[footnote:
+http://burtleburtle.net/bob/c/lookup3.c
+] is a fast and superbly-mixing hash. It's used by the Linux
+kernel and almost everything else. This has the particular
+properties that it takes an initial seed, and produces two 32 bit
+hash numbers, which we can combine into a 64-bit hash.
+
+The seed should be created at tdb-creation time from some random
+source, and placed in the header. This is far from foolproof, but
+adds a little bit of protection against hash bombing.
+
+3.12 Reliable Traversal Adds Complexity
+
+We lock a record during traversal iteration, and try to grab that
+lock in the delete code. If that grab on delete fails, we simply
+mark it deleted and continue onwards; traversal checks for this
+condition and does the delete when it moves off the record.
+
+If traversal terminates early, the dead record may be left
+indefinitely.
+
+3.12.1 Proposed Solution
+
+Remove reliability guarantees; see [traverse-Proposed-Solution].
+
+3.13 Fcntl Locking Adds Overhead
+
+Placing a fcntl lock means a system call, as does removing one.
+This is actually one reason why transactions can be faster
+(everything is locked once at transaction start). In the
+uncontended case, this overhead can theoretically be eliminated.
+
+3.13.1 Proposed Solution
+
+None.
+
+We tried this before with spinlock support, in the early days of
+TDB, and it didn't make much difference except in manufactured
+benchmarks.
+
+We could use spinlocks (with futex kernel support under Linux),
+but it means that we lose automatic cleanup when a process dies
+with a lock.
There is a method of auto-cleanup under Linux, but +it's not supported by other operating systems. We could +reintroduce a clear-if-first-style lock and sweep for dead +futexes on open, but that wouldn't help the normal case of one +concurrent opener dying. Increasingly elaborate repair schemes +could be considered, but they require an ABI change (everyone +must use them) anyway, so there's no need to do this at the same +time as everything else. + +3.14 Some Transactions Don't Require Durability + +Volker points out that gencache uses a CLEAR_IF_FIRST tdb for +normal (fast) usage, and occasionally empties the results into a +transactional TDB. This kind of usage prioritizes performance +over durability: as long as we are consistent, data can be lost. + +This would be more neatly implemented inside tdb: a “soft” +transaction commit (ie. syncless) which meant that data may be +reverted on a crash. + +3.14.1 Proposed Solution + +None. + +Unfortunately any transaction scheme which overwrites old data +requires a sync before that overwrite to avoid the possibility of +corruption. + +It seems possible to use a scheme similar to that described in [sub:TDB-Does-Not] +,where transactions are committed without overwriting existing +data, and an array of top-level pointers were available in the +header. If the transaction is “soft” then we would not need a +sync at all: existing processes would pick up the new hash table +and free list and work with that. + +At some later point, a sync would allow recovery of the old data +into the free lists (perhaps when the array of top-level pointers +filled). On crash, tdb_open() would examine the array of top +levels, and apply the transactions until it encountered an +invalid checksum. + diff --git a/ccan/tdb2/free.c b/ccan/tdb2/free.c new file mode 100644 index 0000000..83d916f --- /dev/null +++ b/ccan/tdb2/free.c @@ -0,0 +1,710 @@ + /* + Trivial Database 2: free list/block handling + Copyright (C) Rusty Russell 2010 + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 3 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, see . +*/ +#include "private.h" +#include +#include +#include +#include + +/* We have to be able to fit a free record here. */ +#define MIN_DATA_LEN \ + (sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record)) + +/* We have a series of free lists, each one covering a "zone" of the file. + * + * For each zone we have a series of per-size buckets, and a final bucket for + * "too big". + * + * It's possible to move the free_list_head, but *only* under the allrecord + * lock. */ +static tdb_off_t free_list_off(struct tdb_context *tdb, unsigned int list) +{ + return tdb->header.v.free_off + list * sizeof(tdb_off_t); +} + +/* We're a library: playing with srandom() is unfriendly. srandom_r + * probably lacks portability. We don't need very random here. */ +static unsigned int quick_random(struct tdb_context *tdb) +{ + return getpid() + time(NULL) + (unsigned long)tdb; +} + +/* Start by using a random zone to spread the load. 
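+   (Without a random start, every opener would begin allocating
+   from zone 0 and contend on the same free lists.)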
*/ +uint64_t random_free_zone(struct tdb_context *tdb) +{ + /* num_zones might be out of date, but can only increase */ + return quick_random(tdb) % tdb->header.v.num_zones; +} + +static unsigned fls64(uint64_t val) +{ +#if HAVE_BUILTIN_CLZL + if (val <= ULONG_MAX) { + /* This is significantly faster! */ + return val ? sizeof(long) * CHAR_BIT - __builtin_clzl(val) : 0; + } else { +#endif + uint64_t r = 64; + + if (!val) + return 0; + if (!(val & 0xffffffff00000000ull)) { + val <<= 32; + r -= 32; + } + if (!(val & 0xffff000000000000ull)) { + val <<= 16; + r -= 16; + } + if (!(val & 0xff00000000000000ull)) { + val <<= 8; + r -= 8; + } + if (!(val & 0xf000000000000000ull)) { + val <<= 4; + r -= 4; + } + if (!(val & 0xc000000000000000ull)) { + val <<= 2; + r -= 2; + } + if (!(val & 0x8000000000000000ull)) { + val <<= 1; + r -= 1; + } + return r; +#if HAVE_BUILTIN_CLZL + } +#endif +} + +/* In which bucket would we find a particular record size? (ignoring header) */ +unsigned int size_to_bucket(struct tdb_context *tdb, tdb_len_t data_len) +{ + unsigned int bucket; + + /* We can't have records smaller than this. */ + assert(data_len >= MIN_DATA_LEN); + + /* Ignoring the header... */ + if (data_len - MIN_DATA_LEN <= 64) { + /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 6. */ + bucket = (data_len - MIN_DATA_LEN) / 8; + } else { + /* After that we go power of 2. */ + bucket = fls64(data_len - MIN_DATA_LEN) + 2; + } + + if (unlikely(bucket > tdb->header.v.free_buckets)) + bucket = tdb->header.v.free_buckets; + return bucket; +} + +/* What zone does a block belong in? */ +tdb_off_t zone_of(struct tdb_context *tdb, tdb_off_t off) +{ + assert(tdb->header_uptodate); + + return off >> tdb->header.v.zone_bits; +} + +/* Returns fl->max_bucket + 1, or list number to search. */ +static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket) +{ + tdb_off_t first, off; + + /* Speculatively search for a non-zero bucket. */ + first = tdb->last_zone * (tdb->header.v.free_buckets+1) + bucket; + off = tdb_find_nonzero_off(tdb, free_list_off(tdb, first), + tdb->header.v.free_buckets - bucket); + return bucket + off; +} + +static int remove_from_list(struct tdb_context *tdb, + tdb_off_t list, struct tdb_free_record *r) +{ + tdb_off_t off; + + /* Front of list? */ + if (r->prev == 0) { + off = free_list_off(tdb, list); + } else { + off = r->prev + offsetof(struct tdb_free_record, next); + } + /* r->prev->next = r->next */ + if (tdb_write_off(tdb, off, r->next)) { + return -1; + } + + if (r->next != 0) { + off = r->next + offsetof(struct tdb_free_record, prev); + /* r->next->prev = r->prev */ + if (tdb_write_off(tdb, off, r->prev)) { + return -1; + } + } + return 0; +} + +/* Enqueue in this free list. */ +static int enqueue_in_free(struct tdb_context *tdb, + tdb_off_t list, + tdb_off_t off, + struct tdb_free_record *new) +{ + new->prev = 0; + /* new->next = head. */ + new->next = tdb_read_off(tdb, free_list_off(tdb, list)); + if (new->next == TDB_OFF_ERR) + return -1; + + if (new->next) { + /* next->prev = new. */ + if (tdb_write_off(tdb, new->next + + offsetof(struct tdb_free_record, prev), + off) != 0) + return -1; + } + /* head = new */ + if (tdb_write_off(tdb, free_list_off(tdb, list), off) != 0) + return -1; + + return tdb_write_convert(tdb, off, new, sizeof(*new)); +} + +/* List isn't locked. 
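+   The caller must not hold any free list lock: we work out the
+   zone and bucket here and take that one list lock ourselves.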
*/ +int add_free_record(struct tdb_context *tdb, + tdb_off_t off, tdb_len_t len_with_header) +{ + struct tdb_free_record new; + tdb_off_t list; + int ret; + + assert(len_with_header >= sizeof(new)); + + new.magic = TDB_FREE_MAGIC; + new.data_len = len_with_header - sizeof(struct tdb_used_record); + + tdb->last_zone = zone_of(tdb, off); + list = tdb->last_zone * (tdb->header.v.free_buckets+1) + + size_to_bucket(tdb, new.data_len); + + if (tdb_lock_free_list(tdb, list, TDB_LOCK_WAIT) != 0) + return -1; + + ret = enqueue_in_free(tdb, list, off, &new); + tdb_unlock_free_list(tdb, list); + return ret; +} + +/* If we have enough left over to be useful, split that off. */ +static int to_used_record(struct tdb_context *tdb, + tdb_off_t off, + tdb_len_t needed, + tdb_len_t total_len, + tdb_len_t *actual) +{ + struct tdb_used_record used; + tdb_len_t leftover; + + leftover = total_len - needed; + if (leftover < sizeof(struct tdb_free_record)) + leftover = 0; + + *actual = total_len - leftover; + + if (leftover) { + if (add_free_record(tdb, off + sizeof(used) + *actual, + total_len - needed)) + return -1; + } + return 0; +} + +/* Note: we unlock the current list if we coalesce or fail. */ +static int coalesce(struct tdb_context *tdb, tdb_off_t off, + tdb_off_t list, tdb_len_t data_len) +{ + struct tdb_free_record pad, *r; + tdb_off_t end = off + sizeof(struct tdb_used_record) + data_len; + + while (!tdb->methods->oob(tdb, end + sizeof(*r), 1)) { + tdb_off_t nlist; + + r = tdb_get(tdb, end, &pad, sizeof(pad)); + if (!r) + goto err; + + if (r->magic != TDB_FREE_MAGIC) + break; + + nlist = zone_of(tdb, end) * (tdb->header.v.free_buckets+1) + + size_to_bucket(tdb, r->data_len); + + /* We may be violating lock order here, so best effort. */ + if (tdb_lock_free_list(tdb, nlist, TDB_LOCK_NOWAIT) == -1) + break; + + /* Now we have lock, re-check. */ + r = tdb_get(tdb, end, &pad, sizeof(pad)); + if (!r) { + tdb_unlock_free_list(tdb, nlist); + goto err; + } + + if (unlikely(r->magic != TDB_FREE_MAGIC)) { + tdb_unlock_free_list(tdb, nlist); + break; + } + + if (remove_from_list(tdb, list, r) == -1) { + tdb_unlock_free_list(tdb, nlist); + goto err; + } + + end += sizeof(struct tdb_used_record) + r->data_len; + tdb_unlock_free_list(tdb, nlist); + } + + /* Didn't find any adjacent free? */ + if (end == off + sizeof(struct tdb_used_record) + data_len) + return 0; + + /* OK, expand record */ + r = tdb_get(tdb, off, &pad, sizeof(pad)); + if (!r) + goto err; + + if (remove_from_list(tdb, list, r) == -1) + goto err; + + /* We have to drop this to avoid deadlocks. */ + tdb_unlock_free_list(tdb, list); + + if (add_free_record(tdb, off, end - off) == -1) + return -1; + return 1; + +err: + /* To unify error paths, we *always* unlock list. */ + tdb_unlock_free_list(tdb, list); + return -1; +} + +/* We need size bytes to put our key and data in. */ +static tdb_off_t lock_and_alloc(struct tdb_context *tdb, + tdb_off_t bucket, size_t size, + tdb_len_t *actual) +{ + tdb_off_t list; + tdb_off_t off, prev, best_off; + struct tdb_free_record pad, best = { 0 }, *r; + double multiplier; + +again: + list = tdb->last_zone * (tdb->header.v.free_buckets+1) + bucket; + + /* Lock this list. 
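+	   (TDB_LOCK_WAIT: we may block, but the caller holds no
+	   other free list locks, so there is no ordering problem;
+	   coalesce() below uses TDB_LOCK_NOWAIT precisely because
+	   it may violate lock order.)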
*/ + if (tdb_lock_free_list(tdb, list, TDB_LOCK_WAIT) == -1) { + return TDB_OFF_ERR; + } + + prev = free_list_off(tdb, list); + off = tdb_read_off(tdb, prev); + + if (unlikely(off == TDB_OFF_ERR)) + goto unlock_err; + + best.data_len = -1ULL; + best_off = 0; + multiplier = 1.0; + + /* Walk the list to see if any are large enough, getting less fussy + * as we go. */ + while (off) { + prev = off; + off = tdb_read_off(tdb, prev); + if (unlikely(off == TDB_OFF_ERR)) + goto unlock_err; + + r = tdb_get(tdb, off, &pad, sizeof(*r)); + if (!r) + goto unlock_err; + if (r->magic != TDB_FREE_MAGIC) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "lock_and_alloc: %llu non-free 0x%llx\n", + (long long)off, (long long)r->magic); + goto unlock_err; + } + + if (r->data_len >= size && r->data_len < best.data_len) { + best_off = off; + best = *r; + } + + if (best.data_len < size * multiplier && best_off) { + /* We're happy with this size: take it. */ + if (remove_from_list(tdb, list, &best) != 0) + goto unlock_err; + tdb_unlock_free_list(tdb, list); + + if (to_used_record(tdb, best_off, size, best.data_len, + actual)) { + return -1; + } + return best_off; + } + multiplier *= 1.01; + + /* Since we're going slow anyway, try coalescing here. */ + switch (coalesce(tdb, off, list, r->data_len)) { + case -1: + /* This has already unlocked on error. */ + return -1; + case 1: + /* This has unlocked list, restart. */ + goto again; + } + } + + tdb_unlock_free_list(tdb, list); + return 0; + +unlock_err: + tdb_unlock_free_list(tdb, list); + return TDB_OFF_ERR; +} + +/* We want a really big chunk. Look through every zone's oversize bucket */ +static tdb_off_t huge_alloc(struct tdb_context *tdb, size_t size, + tdb_len_t *actual) +{ + tdb_off_t i, off; + + do { + for (i = 0; i < tdb->header.v.num_zones; i++) { + /* Try getting one from list. */ + off = lock_and_alloc(tdb, tdb->header.v.free_buckets, + size, actual); + if (off == TDB_OFF_ERR) + return TDB_OFF_ERR; + if (off != 0) + return off; + /* FIXME: Coalesce! */ + } + } while (tdb_expand(tdb, 0, size, false) == 0); + + return TDB_OFF_ERR; +} + +static tdb_off_t get_free(struct tdb_context *tdb, size_t size, + tdb_len_t *actual) +{ + tdb_off_t off, bucket; + unsigned int num_empty, step = 0; + + bucket = size_to_bucket(tdb, size); + + /* If we're after something bigger than a single zone, handle + * specially. */ + if (unlikely(sizeof(struct tdb_used_record) + size + >= (1ULL << tdb->header.v.zone_bits))) { + return huge_alloc(tdb, size, actual); + } + + /* Number of zones we search is proportional to the log of them. */ + for (num_empty = 0; num_empty < fls64(tdb->header.v.num_zones); + num_empty++) { + tdb_off_t b; + + /* Start at exact size bucket, and search up... */ + for (b = bucket; b <= tdb->header.v.num_zones; b++) { + b = find_free_head(tdb, b); + + /* Non-empty list? Try getting block. */ + if (b <= tdb->header.v.num_zones) { + /* Try getting one from list. */ + off = lock_and_alloc(tdb, b, size, actual); + if (off == TDB_OFF_ERR) + return TDB_OFF_ERR; + if (off != 0) + return off; + /* Didn't work. Try next bucket. */ + } + } + + /* Try another zone, at pseudo random. Avoid duplicates by + using an odd step. 
*/ + if (step == 0) + step = ((quick_random(tdb)) % 65536) * 2 + 1; + tdb->last_zone = (tdb->last_zone + step) + % tdb->header.v.num_zones; + } + return 0; +} + +int set_header(struct tdb_context *tdb, + struct tdb_used_record *rec, + uint64_t keylen, uint64_t datalen, + uint64_t actuallen, uint64_t hash) +{ + uint64_t keybits = (fls64(keylen) + 1) / 2; + + /* Use top bits of hash, so it's independent of hash table size. */ + rec->magic_and_meta + = (actuallen - (keylen + datalen)) + | ((hash >> 53) << 32) + | (keybits << 43) + | (TDB_MAGIC << 48); + rec->key_and_data_len = (keylen | (datalen << (keybits*2))); + + /* Encoding can fail on big values. */ + if (rec_key_length(rec) != keylen + || rec_data_length(rec) != datalen + || rec_extra_padding(rec) != actuallen - (keylen + datalen)) { + tdb->ecode = TDB_ERR_IO; + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "Could not encode k=%llu,d=%llu,a=%llu\n", + (long long)keylen, (long long)datalen, + (long long)actuallen); + return -1; + } + return 0; +} + +static tdb_len_t adjust_size(size_t keylen, size_t datalen, bool growing) +{ + tdb_len_t size = keylen + datalen; + + if (size < MIN_DATA_LEN) + size = MIN_DATA_LEN; + + /* Overallocate if this is coming from an enlarging store. */ + if (growing) + size += datalen / 2; + + /* Round to next uint64_t boundary. */ + return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL); +} + +/* If this fails, try tdb_expand. */ +tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen, + uint64_t hash, bool growing) +{ + tdb_off_t off; + tdb_len_t size, actual; + struct tdb_used_record rec; + + /* We don't want header to change during this! */ + assert(tdb->header_uptodate); + + size = adjust_size(keylen, datalen, growing); + + off = get_free(tdb, size, &actual); + if (unlikely(off == TDB_OFF_ERR || off == 0)) + return off; + + /* Some supergiant values can't be encoded. */ + if (set_header(tdb, &rec, keylen, datalen, actual, hash) != 0) { + add_free_record(tdb, off, sizeof(rec) + actual); + return TDB_OFF_ERR; + } + + if (tdb_write_convert(tdb, off, &rec, sizeof(rec)) != 0) + return TDB_OFF_ERR; + + return off; +} + +static bool larger_buckets_might_help(struct tdb_context *tdb) +{ + /* If our buckets are already covering 1/8 of a zone, don't + * bother (note: might become an 1/16 of a zone if we double + * zone size). */ + tdb_len_t size = (1ULL << tdb->header.v.zone_bits) / 8; + + if (size >= MIN_DATA_LEN + && size_to_bucket(tdb, size) < tdb->header.v.free_buckets) { + return false; + } + + /* FIXME: Put stats in tdb_context or examine db itself! */ + /* It's fairly cheap to do as we expand database. */ + return true; +} + +static bool zones_happy(struct tdb_context *tdb) +{ + /* FIXME: look at distribution of zones. */ + return true; +} + +/* Expand the database. */ +int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen, + bool growing) +{ + uint64_t new_num_buckets, new_num_zones, new_zone_bits; + uint64_t old_num_total, i; + tdb_len_t add, freebucket_size, needed; + tdb_off_t off, old_free_off; + const tdb_off_t *oldf; + struct tdb_used_record fhdr; + + /* We need room for the record header too. */ + needed = sizeof(struct tdb_used_record) + + adjust_size(klen, dlen, growing); + + /* FIXME: this is overkill. An expand lock? */ + if (tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false) == -1) + return -1; + + /* Someone may have expanded for us. */ + if (update_header(tdb)) + goto success; + + /* Make sure we have the latest size. 
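+	   (The oob probe fstats the file and remaps it if another
+	   process has grown it since we mapped it.)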
*/ + tdb->methods->oob(tdb, tdb->map_size + 1, true); + + /* Did we enlarge zones without enlarging file? */ + if (tdb->map_size < tdb->header.v.num_zones<header.v.zone_bits) { + add = (tdb->header.v.num_zones<header.v.zone_bits) + - tdb->map_size; + /* Updates tdb->map_size. */ + if (tdb->methods->expand_file(tdb, tdb->map_size, add) == -1) + goto fail; + if (add_free_record(tdb, tdb->map_size - add, add) == -1) + goto fail; + if (add >= needed) { + /* Allocate from this zone. */ + tdb->last_zone = zone_of(tdb, tdb->map_size - add); + goto success; + } + } + + /* Slow path. Should we increase the number of buckets? */ + new_num_buckets = tdb->header.v.free_buckets; + if (larger_buckets_might_help(tdb)) + new_num_buckets++; + + /* Now we'll need room for the new free buckets, too. Assume + * worst case (zones expand). */ + needed += sizeof(fhdr) + + ((tdb->header.v.num_zones+1) + * (new_num_buckets+1) * sizeof(tdb_off_t)); + + /* If we need less that one zone, and they're working well, just add + * another one. */ + if (needed < (1UL<header.v.zone_bits) && zones_happy(tdb)) { + new_num_zones = tdb->header.v.num_zones+1; + new_zone_bits = tdb->header.v.zone_bits; + add = 1ULL << tdb->header.v.zone_bits; + } else { + /* Increase the zone size. */ + new_num_zones = tdb->header.v.num_zones; + new_zone_bits = tdb->header.v.zone_bits+1; + while ((new_num_zones << new_zone_bits) - tdb->map_size + < needed) { + new_zone_bits++; + } + + /* We expand by enough zones to meet the need. */ + add = (needed + (1ULL << new_zone_bits)-1) + & ~((1ULL << new_zone_bits)-1); + } + + /* Updates tdb->map_size. */ + if (tdb->methods->expand_file(tdb, tdb->map_size, add) == -1) + goto fail; + + /* Use first part as new free bucket array. */ + off = tdb->map_size - add; + freebucket_size = new_num_zones + * (new_num_buckets + 1) * sizeof(tdb_off_t); + + /* Write header. */ + if (set_header(tdb, &fhdr, 0, freebucket_size, freebucket_size, 0)) + goto fail; + if (tdb_write_convert(tdb, off, &fhdr, sizeof(fhdr)) == -1) + goto fail; + + /* Adjust off to point to start of buckets, add to be remainder. */ + add -= freebucket_size + sizeof(fhdr); + off += sizeof(fhdr); + + /* Access the old zones. */ + old_num_total = tdb->header.v.num_zones*(tdb->header.v.free_buckets+1); + old_free_off = tdb->header.v.free_off; + oldf = tdb_access_read(tdb, old_free_off, + old_num_total * sizeof(tdb_off_t)); + if (!oldf) + goto fail; + + /* Switch to using our new zone. */ + if (zero_out(tdb, off, new_num_zones * (new_num_buckets + 1)) == -1) + goto fail_release; + tdb->header.v.free_off = off; + tdb->header.v.num_zones = new_num_zones; + tdb->header.v.free_buckets = new_num_buckets; + + /* FIXME: If zone size hasn't changed, can simply copy pointers. */ + /* FIXME: Coalesce? */ + for (i = 0; i < old_num_total; i++) { + tdb_off_t next; + struct tdb_free_record rec; + tdb_off_t list; + + for (off = oldf[i]; off; off = next) { + if (tdb_read_convert(tdb, off, &rec, sizeof(rec))) + goto fail_release; + + list = zone_of(tdb, off) + * (tdb->header.v.free_buckets+1) + + size_to_bucket(tdb, rec.data_len); + next = rec.next; + + if (enqueue_in_free(tdb, list, off, &rec) == -1) + goto fail_release; + } + } + + + /* Free up the old free buckets. */ + old_free_off -= sizeof(fhdr); + if (tdb_read_convert(tdb, old_free_off, &fhdr, sizeof(fhdr)) == -1) + goto fail_release; + if (add_free_record(tdb, old_free_off, + rec_data_length(&fhdr)+rec_extra_padding(&fhdr))) + goto fail_release; + + /* Add the rest as a new free record. 
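+	   ('add' was reduced above by the space taken by the new
+	   free bucket array, so this covers exactly the remainder
+	   of the expansion.)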
*/ + if (add_free_record(tdb, tdb->map_size - add, add) == -1) + goto fail_release; + + /* Start allocating from where the new space is. */ + tdb->last_zone = zone_of(tdb, tdb->map_size - add); + tdb_access_release(tdb, oldf); +success: + tdb_allrecord_unlock(tdb, F_WRLCK); + return 0; + +fail_release: + tdb_access_release(tdb, oldf); +fail: + tdb_allrecord_unlock(tdb, F_WRLCK); + return -1; +} diff --git a/ccan/tdb2/io.c b/ccan/tdb2/io.c new file mode 100644 index 0000000..5910fc5 --- /dev/null +++ b/ccan/tdb2/io.c @@ -0,0 +1,662 @@ + /* + Unix SMB/CIFS implementation. + + trivial database library + + Copyright (C) Andrew Tridgell 1999-2005 + Copyright (C) Paul `Rusty' Russell 2000 + Copyright (C) Jeremy Allison 2000-2003 + Copyright (C) Rusty Russell 2010 + + ** NOTE! The following LGPL license applies to the tdb + ** library. This does NOT imply that all of Samba is released + ** under the LGPL + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 3 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, see . +*/ +#include "private.h" +#include + +void tdb_munmap(struct tdb_context *tdb) +{ + if (tdb->flags & TDB_INTERNAL) + return; + + if (tdb->map_ptr) { + munmap(tdb->map_ptr, tdb->map_size); + tdb->map_ptr = NULL; + } +} + +void tdb_mmap(struct tdb_context *tdb) +{ + if (tdb->flags & TDB_INTERNAL) + return; + + if (tdb->flags & TDB_NOMMAP) + return; + + tdb->map_ptr = mmap(NULL, tdb->map_size, + PROT_READ|(tdb->read_only? 0:PROT_WRITE), + MAP_SHARED, tdb->fd, 0); + + /* + * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!! + */ + if (tdb->map_ptr == MAP_FAILED) { + tdb->map_ptr = NULL; + tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv, + "tdb_mmap failed for size %lld (%s)\n", + (long long)tdb->map_size, strerror(errno)); + } +} + +/* check for an out of bounds access - if it is out of bounds then + see if the database has been expanded by someone else and expand + if necessary + note that "len" is the minimum length needed for the db +*/ +static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe) +{ + struct stat st; + if (len <= tdb->map_size) + return 0; + if (tdb->flags & TDB_INTERNAL) { + if (!probe) { + /* Ensure ecode is set for log fn. */ + tdb->ecode = TDB_ERR_IO; + tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, + "tdb_oob len %lld beyond internal" + " malloc size %lld\n", + (long long)len, + (long long)tdb->map_size); + } + return -1; + } + + if (fstat(tdb->fd, &st) == -1) { + tdb->ecode = TDB_ERR_IO; + return -1; + } + + if (st.st_size < (size_t)len) { + if (!probe) { + /* Ensure ecode is set for log fn. 
*/ + tdb->ecode = TDB_ERR_IO; + tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, + "tdb_oob len %lld beyond eof at %lld\n", + (long long)len, (long long)st.st_size); + } + return -1; + } + + /* Unmap, update size, remap */ + tdb_munmap(tdb); + tdb->map_size = st.st_size; + tdb_mmap(tdb); + return 0; +} + +static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len) +{ + if (unlikely(!tdb->map_ptr)) + return NULL; + + /* FIXME: We can do a subset of this! */ + if (tdb->transaction) + return NULL; + + if (unlikely(tdb_oob(tdb, off + len, true) == -1)) + return NULL; + return (char *)tdb->map_ptr + off; +} + +/* Either make a copy into pad and return that, or return ptr into mmap. */ +/* Note: pad has to be a real object, so we can't get here if len + * overflows size_t */ +/* FIXME: Transaction */ +void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len) +{ + ssize_t r; + + if (likely(!(tdb->flags & TDB_CONVERT))) { + void *ret = tdb_direct(tdb, off, len); + if (ret) + return ret; + } + + if (unlikely(tdb_oob(tdb, off + len, false) == -1)) + return NULL; + + r = pread(tdb->fd, pad, len, off); + if (r != (ssize_t)len) { + /* Ensure ecode is set for log fn. */ + tdb->ecode = TDB_ERR_IO; + tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, + "tdb_read failed at %llu " + "len=%lld ret=%lld (%s) map_size=%lld\n", + (long long)off, (long long)len, + (long long)r, strerror(errno), + (long long)tdb->map_size); + return NULL; + } + return tdb_convert(tdb, pad, len); +} + +/* Endian conversion: we only ever deal with 8 byte quantities */ +void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size) +{ + if (unlikely((tdb->flags & TDB_CONVERT))) { + uint64_t i, *p = (uint64_t *)buf; + for (i = 0; i < size / 8; i++) + p[i] = bswap_64(p[i]); + } + return buf; +} + +/* Return first non-zero offset in num offset array, or num. */ +/* FIXME: Return the off? */ +uint64_t tdb_find_nonzero_off(struct tdb_context *tdb, tdb_off_t off, + uint64_t num) +{ + uint64_t i, *val; + bool alloc = false; + + val = tdb_direct(tdb, off, num * sizeof(tdb_off_t)); + if (!unlikely(val)) { + val = tdb_alloc_read(tdb, off, num * sizeof(tdb_off_t)); + if (!val) + return num; + alloc = true; + } + + for (i = 0; i < num; i++) { + if (val[i]) + break; + } + if (unlikely(alloc)) + free(val); + return i; +} + +/* Return first zero offset in num offset array, or num. */ +uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off, + uint64_t num) +{ + uint64_t i, *val; + bool alloc = false; + + val = tdb_direct(tdb, off, num * sizeof(tdb_off_t)); + if (!unlikely(val)) { + val = tdb_alloc_read(tdb, off, num * sizeof(tdb_off_t)); + if (!val) + return num; + alloc = true; + } + + for (i = 0; i < num; i++) { + if (!val[i]) + break; + } + if (unlikely(alloc)) + free(val); + return i; +} + +static int fill(struct tdb_context *tdb, + const void *buf, size_t size, + tdb_off_t off, tdb_len_t len) +{ + while (len) { + size_t n = len > size ? 
size : len;
+
+		if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
+			tdb->ecode = TDB_ERR_IO;
+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+				 "fill write failed: giving up!\n");
+			return -1;
+		}
+		len -= n;
+		off += n;
+	}
+	return 0;
+}
+
+int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
+{
+	void *p = tdb_direct(tdb, off, len);
+	if (p) {
+		memset(p, 0, len);
+		return 0;
+	} else {
+		char buf[8192] = { 0 };
+		return fill(tdb, buf, sizeof(buf), off, len);
+	}
+}
+
+tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
+{
+	tdb_off_t pad, *ret;
+
+	ret = tdb_get(tdb, off, &pad, sizeof(pad));
+	if (!ret) {
+		return TDB_OFF_ERR;
+	}
+	return *ret;
+}
+
+/* Even on files, we can get partial writes due to signals. */
+bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off)
+{
+	while (len) {
+		ssize_t ret;
+		ret = pwrite(fd, buf, len, off);
+		if (ret < 0)
+			return false;
+		if (ret == 0) {
+			errno = ENOSPC;
+			return false;
+		}
+		buf += ret;
+		off += ret;
+		len -= ret;
+	}
+	return true;
+}
+
+/* write a lump of data at a specified offset */
+static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
+		     const void *buf, tdb_len_t len)
+{
+	if (len == 0) {
+		return 0;
+	}
+
+	if (tdb->read_only) {
+		tdb->ecode = TDB_ERR_RDONLY;
+		return -1;
+	}
+
+	if (tdb->methods->oob(tdb, off + len, 0) != 0)
+		return -1;
+
+	if (tdb->map_ptr) {
+		memcpy(off + (char *)tdb->map_ptr, buf, len);
+	} else {
+		if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
+			tdb->ecode = TDB_ERR_IO;
+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+				 "tdb_write failed at %llu len=%llu (%s)\n",
+				 off, len, strerror(errno));
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/* read a lump of data at a specified offset */
+static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
+		    tdb_len_t len)
+{
+	if (tdb->methods->oob(tdb, off + len, 0) != 0) {
+		return -1;
+	}
+
+	if (tdb->map_ptr) {
+		memcpy(buf, off + (char *)tdb->map_ptr, len);
+	} else {
+		ssize_t ret = pread(tdb->fd, buf, len, off);
+		if (ret != (ssize_t)len) {
+			/* Ensure ecode is set for log fn. */
+			tdb->ecode = TDB_ERR_IO;
+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+				 "tdb_read failed at %lld "
+				 "len=%lld ret=%lld (%s) map_size=%lld\n",
+				 (long long)off, (long long)len,
+				 (long long)ret, strerror(errno),
+				 (long long)tdb->map_size);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
+		      void *rec, size_t len)
+{
+	return tdb->methods->write(tdb, off, tdb_convert(tdb, rec, len), len);
+}
+
+int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
+		     void *rec, size_t len)
+{
+	int ret = tdb->methods->read(tdb, off, rec, len);
+	tdb_convert(tdb, rec, len);
+	return ret;
+}
+
+int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
+{
+	return tdb_write_convert(tdb, off, &val, sizeof(val));
+}
+
+/* read a lump of data, allocating the space for it */
+void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
+{
+	void *buf;
+
+	/* some systems don't like zero length malloc */
+	buf = malloc(len ? len : 1);
+	if (unlikely(!buf)) {
+		tdb->ecode = TDB_ERR_OOM;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_alloc_read malloc failed len=%lld\n",
+			 (long long)len);
+	} else if (unlikely(tdb->methods->read(tdb, offset, buf, len))) {
+		free(buf);
+		buf = NULL;
+	}
+	return buf;
+}
+
+uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off)
+{
+	struct tdb_used_record pad, *r;
+	void *key;
+	uint64_t klen, hash;
+
+	r = tdb_get(tdb, off, &pad, sizeof(*r));
+	if (!r)
+		/* FIXME */
+		return 0;
+
+	klen = rec_key_length(r);
+	key = tdb_direct(tdb, off + sizeof(*r), klen);
+	if (likely(key))
+		return tdb_hash(tdb, key, klen);
+
+	key = tdb_alloc_read(tdb, off + sizeof(*r), klen);
+	if (unlikely(!key))
+		return 0;
+	hash = tdb_hash(tdb, key, klen);
+	free(key);
+	return hash;
+}
+
+/* Give a piece of tdb data to a parser */
+int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
+		   tdb_off_t offset, tdb_len_t len,
+		   int (*parser)(TDB_DATA key, TDB_DATA data,
+				 void *private_data),
+		   void *private_data)
+{
+	TDB_DATA data;
+	int result;
+	bool allocated = false;
+
+	data.dsize = len;
+	data.dptr = tdb_direct(tdb, offset, len);
+	if (unlikely(!data.dptr)) {
+		if (!(data.dptr = tdb_alloc_read(tdb, offset, len))) {
+			return -1;
+		}
+		allocated = true;
+	}
+	result = parser(key, data, private_data);
+	if (unlikely(allocated))
+		free(data.dptr);
+	return result;
+}
+
+/* expand a file. we prefer to use ftruncate, as that is what posix
+   says to use for mmap expansion */
+static int tdb_expand_file(struct tdb_context *tdb,
+			   tdb_len_t size, tdb_len_t addition)
+{
+	char buf[8192];
+
+	if (tdb->read_only) {
+		tdb->ecode = TDB_ERR_RDONLY;
+		return -1;
+	}
+
+	/* If this fails, we try to fill anyway. */
+	if (ftruncate(tdb->fd, size+addition))
+		;
+
+	/* now fill the file with something. This ensures that the
+	   file isn't sparse, which would be very bad if we ran out of
+	   disk. This must be done with write, not via mmap */
+	memset(buf, 0x43, sizeof(buf));
+	return fill(tdb, buf, sizeof(buf), size, addition);
+}
+
+const void *tdb_access_read(struct tdb_context *tdb,
+			    tdb_off_t off, tdb_len_t len)
+{
+	const void *ret = tdb_direct(tdb, off, len);
+
+	if (!ret)
+		ret = tdb_alloc_read(tdb, off, len);
+	return ret;
+}
+
+void tdb_access_release(struct tdb_context *tdb, const void *p)
+{
+	if (!tdb->map_ptr
+	    || (char *)p < (char *)tdb->map_ptr
+	    || (char *)p >= (char *)tdb->map_ptr + tdb->map_size)
+		free((void *)p);
+}
+
+#if 0
+/* write a lump of data at a specified offset */
+static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
+		     const void *buf, tdb_len_t len)
+{
+	if (len == 0) {
+		return 0;
+	}
+
+	if (tdb->read_only || tdb->traverse_read) {
+		tdb->ecode = TDB_ERR_RDONLY;
+		return -1;
+	}
+
+	if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0)
+		return -1;
+
+	if (tdb->map_ptr) {
+		memcpy(off + (char *)tdb->map_ptr, buf, len);
+	} else {
+		ssize_t written = pwrite(tdb->fd, buf, len, off);
+		if ((written != (ssize_t)len) && (written != -1)) {
+			/* try once more */
+			tdb->ecode = TDB_ERR_IO;
+			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only "
+				 "%d of %d bytes at %d, trying once more\n",
+				 (int)written, len, off));
+			written = pwrite(tdb->fd, (const char *)buf+written,
+					 len-written,
+					 off+written);
+		}
+		if (written == -1) {
+			/* Ensure ecode is set for log fn.
*/ + tdb->ecode = TDB_ERR_IO; + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d " + "len=%d (%s)\n", off, len, strerror(errno))); + return -1; + } else if (written != (ssize_t)len) { + tdb->ecode = TDB_ERR_IO; + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to " + "write %d bytes at %d in two attempts\n", + len, off)); + return -1; + } + } + return 0; +} + + + +/* + do an unlocked scan of the hash table heads to find the next non-zero head. The value + will then be confirmed with the lock held +*/ +static void tdb_next_hash_chain(struct tdb_context *tdb, uint32_t *chain) +{ + uint32_t h = *chain; + if (tdb->map_ptr) { + for (;h < tdb->header.hash_size;h++) { + if (0 != *(uint32_t *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) { + break; + } + } + } else { + uint32_t off=0; + for (;h < tdb->header.hash_size;h++) { + if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) { + break; + } + } + } + (*chain) = h; +} + + +/* expand the database by expanding the underlying file and doing the + mmap again if necessary */ +int tdb_expand(struct tdb_context *tdb) +{ + struct tdb_record rec; + tdb_off_t offset, new_size; + + /* We have to lock every hash bucket and every free list. */ + do { + + + if (tdb_lock(tdb, -1, F_WRLCK) == -1) { + TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n")); + return -1; + } + + /* must know about any previous expansions by another process */ + tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1); + + /* always make room for at least 100 more records, and at + least 25% more space. Round the database up to a multiple + of the page size */ + new_size = MAX(tdb->map_size + size*100, tdb->map_size * 1.25); + size = TDB_ALIGN(new_size, tdb->page_size) - tdb->map_size; + + if (!(tdb->flags & TDB_INTERNAL)) + tdb_munmap(tdb); + + /* + * We must ensure the file is unmapped before doing this + * to ensure consistency with systems like OpenBSD where + * writes and mmaps are not consistent. + */ + + /* expand the file itself */ + if (!(tdb->flags & TDB_INTERNAL)) { + if (tdb->methods->tdb_expand_file(tdb, tdb->map_size, size) != 0) + goto fail; + } + + tdb->map_size += size; + + if (tdb->flags & TDB_INTERNAL) { + char *new_map_ptr = (char *)realloc(tdb->map_ptr, + tdb->map_size); + if (!new_map_ptr) { + tdb->map_size -= size; + goto fail; + } + tdb->map_ptr = new_map_ptr; + } else { + /* + * We must ensure the file is remapped before adding the space + * to ensure consistency with systems like OpenBSD where + * writes and mmaps are not consistent. 
+ */ + + /* We're ok if the mmap fails as we'll fallback to read/write */ + tdb_mmap(tdb); + } + + /* form a new freelist record */ + memset(&rec,'\0',sizeof(rec)); + rec.rec_len = size - sizeof(rec); + + /* link it into the free list */ + offset = tdb->map_size - size; + if (tdb_free(tdb, offset, &rec) == -1) + goto fail; + + tdb_unlock(tdb, -1, F_WRLCK); + return 0; + fail: + tdb_unlock(tdb, -1, F_WRLCK); + return -1; +} + +/* read/write a tdb_off_t */ +int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d) +{ + return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV()); +} + +int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d) +{ + tdb_off_t off = *d; + return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d)); +} + + +/* read/write a record */ +int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec) +{ + if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1) + return -1; + if (TDB_BAD_MAGIC(rec)) { + /* Ensure ecode is set for log fn. */ + tdb->ecode = TDB_ERR_CORRUPT; + TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset)); + return -1; + } + return tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0); +} + +int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec) +{ + struct tdb_record r = *rec; + return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r)); +} +#endif + +static const struct tdb_methods io_methods = { + tdb_read, + tdb_write, + tdb_oob, + tdb_expand_file, +}; + +/* + initialise the default methods table +*/ +void tdb_io_init(struct tdb_context *tdb) +{ + tdb->methods = &io_methods; +} diff --git a/ccan/tdb2/lock.c b/ccan/tdb2/lock.c new file mode 100644 index 0000000..dca526c --- /dev/null +++ b/ccan/tdb2/lock.c @@ -0,0 +1,848 @@ + /* + Unix SMB/CIFS implementation. + + trivial database library + + Copyright (C) Andrew Tridgell 1999-2005 + Copyright (C) Paul `Rusty' Russell 2000 + Copyright (C) Jeremy Allison 2000-2003 + + ** NOTE! The following LGPL license applies to the tdb + ** library. This does NOT imply that all of Samba is released + ** under the LGPL + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 3 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, see . +*/ + +#include "private.h" + +static int fcntl_lock(struct tdb_context *tdb, + int rw, off_t off, off_t len, bool waitflag) +{ + struct flock fl; + + fl.l_type = rw; + fl.l_whence = SEEK_SET; + fl.l_start = off; + fl.l_len = len; + fl.l_pid = 0; + + if (waitflag) + return fcntl(tdb->fd, F_SETLKW, &fl); + else + return fcntl(tdb->fd, F_SETLK, &fl); +} + +static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len) +{ + struct flock fl; +#if 0 /* Check they matched up locks and unlocks correctly. */ + char line[80]; + FILE *locks; + bool found = false; + + locks = fopen("/proc/locks", "r"); + + while (fgets(line, 80, locks)) { + char *p; + int type, start, l; + + /* eg. 
1: FLOCK ADVISORY WRITE 2440 08:01:2180826 0 EOF */ + p = strchr(line, ':') + 1; + if (strncmp(p, " POSIX ADVISORY ", strlen(" POSIX ADVISORY "))) + continue; + p += strlen(" FLOCK ADVISORY "); + if (strncmp(p, "READ ", strlen("READ ")) == 0) + type = F_RDLCK; + else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0) + type = F_WRLCK; + else + abort(); + p += 6; + if (atoi(p) != getpid()) + continue; + p = strchr(strchr(p, ' ') + 1, ' ') + 1; + start = atoi(p); + p = strchr(p, ' ') + 1; + if (strncmp(p, "EOF", 3) == 0) + l = 0; + else + l = atoi(p) - start + 1; + + if (off == start) { + if (len != l) { + fprintf(stderr, "Len %u should be %u: %s", + (int)len, l, line); + abort(); + } + if (type != rw) { + fprintf(stderr, "Type %s wrong: %s", + rw == F_RDLCK ? "READ" : "WRITE", line); + abort(); + } + found = true; + break; + } + } + + if (!found) { + fprintf(stderr, "Unlock on %u@%u not found!\n", + (int)off, (int)len); + abort(); + } + + fclose(locks); +#endif + + fl.l_type = F_UNLCK; + fl.l_whence = SEEK_SET; + fl.l_start = off; + fl.l_len = len; + fl.l_pid = 0; + + return fcntl(tdb->fd, F_SETLKW, &fl); +} + +/* a byte range locking function - return 0 on success + this functions locks/unlocks 1 byte at the specified offset. + + note that a len of zero means lock to end of file +*/ +static int tdb_brlock(struct tdb_context *tdb, + int rw_type, tdb_off_t offset, tdb_off_t len, + enum tdb_lock_flags flags) +{ + int ret; + + if (tdb->flags & TDB_NOLOCK) { + return 0; + } + + if (rw_type == F_WRLCK && tdb->read_only) { + tdb->ecode = TDB_ERR_RDONLY; + return -1; + } + + /* A 32 bit system cannot open a 64-bit file, but it could have + * expanded since then: check here. */ + if ((size_t)(offset + len) != offset + len) { + tdb->ecode = TDB_ERR_IO; + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_brlock: lock on giant offset %llu\n", + (long long)(offset + len)); + return -1; + } + + do { + ret = fcntl_lock(tdb, rw_type, offset, len, + flags & TDB_LOCK_WAIT); + } while (ret == -1 && errno == EINTR); + + if (ret == -1) { + tdb->ecode = TDB_ERR_LOCK; + /* Generic lock error. errno set by fcntl. + * EAGAIN is an expected return from non-blocking + * locks. */ + if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_brlock failed (fd=%d) at" + " offset %llu rw_type=%d flags=%d len=%llu\n", + tdb->fd, (long long)offset, rw_type, + flags, (long long)len); + } + return -1; + } + return 0; +} + +static int tdb_brunlock(struct tdb_context *tdb, + int rw_type, tdb_off_t offset, size_t len) +{ + int ret; + + if (tdb->flags & TDB_NOLOCK) { + return 0; + } + + do { + ret = fcntl_unlock(tdb, rw_type, offset, len); + } while (ret == -1 && errno == EINTR); + + if (ret == -1) { + tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv, + "tdb_brunlock failed (fd=%d) at offset %llu" + " rw_type=%d len=%llu\n", + tdb->fd, (long long)offset, rw_type, (long long)len); + } + return ret; +} + +#if 0 +/* + upgrade a read lock to a write lock. This needs to be handled in a + special way as some OSes (such as solaris) have too conservative + deadlock detection and claim a deadlock when progress can be + made. For those OSes we may loop for a while. 
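+  (e.g. Solaris can return EDEADLK for a read-to-write upgrade even
+  though the conflicting reader will shortly release its lock, so on
+  EDEADLK we sleep for a microsecond via select() and retry, up to 1000
+  times, rather than giving up on the first refusal.)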
+*/ +int tdb_allrecord_upgrade(struct tdb_context *tdb) +{ + int count = 1000; + + if (tdb->allrecord_lock.count != 1) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_allrecord_upgrade failed: count %u too high\n", + tdb->allrecord_lock.count); + return -1; + } + + if (tdb->allrecord_lock.off != 1) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_allrecord_upgrade failed: already upgraded?\n"); + return -1; + } + + while (count--) { + struct timeval tv; + if (tdb_brlock(tdb, F_WRLCK, + TDB_HASH_LOCK_START + + (1ULL << tdb->header.v.hash_bits), 0, + TDB_LOCK_WAIT|TDB_LOCK_PROBE) == 0) { + tdb->allrecord_lock.ltype = F_WRLCK; + tdb->allrecord_lock.off = 0; + return 0; + } + if (errno != EDEADLK) { + break; + } + /* sleep for as short a time as we can - more portable than usleep() */ + tv.tv_sec = 0; + tv.tv_usec = 1; + select(0, NULL, NULL, NULL, &tv); + } + tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv, + "tdb_allrecord_upgrade failed\n"); + return -1; +} +#endif + +static struct tdb_lock_type *find_nestlock(struct tdb_context *tdb, + tdb_off_t offset) +{ + unsigned int i; + + for (i=0; inum_lockrecs; i++) { + if (tdb->lockrecs[i].off == offset) { + return &tdb->lockrecs[i]; + } + } + return NULL; +} + +/* lock an offset in the database. */ +static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype, + enum tdb_lock_flags flags) +{ + struct tdb_lock_type *new_lck; + + if (offset >= TDB_HASH_LOCK_START + (1ULL << tdb->header.v.hash_bits) + + (tdb->header.v.num_zones * (tdb->header.v.free_buckets+1))) { + tdb->ecode = TDB_ERR_LOCK; + tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, + "tdb_lock: invalid offset %llu for ltype=%d\n", + (long long)offset, ltype); + return -1; + } + if (tdb->flags & TDB_NOLOCK) + return 0; + + new_lck = find_nestlock(tdb, offset); + if (new_lck) { + /* + * Just increment the in-memory struct, posix locks + * don't stack. + */ + new_lck->count++; + return 0; + } + + new_lck = (struct tdb_lock_type *)realloc( + tdb->lockrecs, + sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1)); + if (new_lck == NULL) { + tdb->ecode = TDB_ERR_OOM; + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_lock: unable to allocate %llu lock structure", + (long long)(tdb->num_lockrecs + 1)); + errno = ENOMEM; + return -1; + } + tdb->lockrecs = new_lck; + + /* Since fcntl locks don't nest, we do a lock for the first one, + and simply bump the count for future ones */ + if (tdb_brlock(tdb, ltype, offset, 1, flags)) { + return -1; + } + + tdb->lockrecs[tdb->num_lockrecs].off = offset; + tdb->lockrecs[tdb->num_lockrecs].count = 1; + tdb->lockrecs[tdb->num_lockrecs].ltype = ltype; + tdb->num_lockrecs++; + + return 0; +} + +static int tdb_lock_and_recover(struct tdb_context *tdb) +{ +#if 0 /* FIXME */ + + int ret; + + /* We need to match locking order in transaction commit. 
*/ + if (tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0, TDB_LOCK_WAIT)) { + return -1; + } + + if (tdb_brlock(tdb, F_WRLCK, OPEN_LOCK, 1, TDB_LOCK_WAIT)) { + tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0); + return -1; + } + + ret = tdb_transaction_recover(tdb); + + tdb_brunlock(tdb, F_WRLCK, OPEN_LOCK, 1); + tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0); + + return ret; +#else + abort(); + return -1; +#endif +} + +static bool tdb_needs_recovery(struct tdb_context *tdb) +{ + /* FIXME */ + return false; +} + +static int tdb_nest_unlock(struct tdb_context *tdb, tdb_off_t off, int ltype) +{ + int ret = -1; + struct tdb_lock_type *lck; + + if (tdb->flags & TDB_NOLOCK) + return 0; + + lck = find_nestlock(tdb, off); + if ((lck == NULL) || (lck->count == 0)) { + tdb->ecode = TDB_ERR_LOCK; + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_unlock: no lock for %llu\n", (long long)off); + return -1; + } + + if (lck->count > 1) { + lck->count--; + return 0; + } + + /* + * This lock has count==1 left, so we need to unlock it in the + * kernel. We don't bother with decrementing the in-memory array + * element, we're about to overwrite it with the last array element + * anyway. + */ + ret = tdb_brunlock(tdb, ltype, off, 1); + + /* + * Shrink the array by overwriting the element just unlocked with the + * last array element. + */ + *lck = tdb->lockrecs[--tdb->num_lockrecs]; + + if (tdb->num_lockrecs == 0) { + /* If we're not holding any locks, header can change. */ + tdb->header_uptodate = false; + } + + return ret; +} + +#if 0 +/* + get the transaction lock + */ +int tdb_transaction_lock(struct tdb_context *tdb, int ltype, + enum tdb_lock_flags lockflags) +{ + return tdb_nest_lock(tdb, TRANSACTION_LOCK, ltype, lockflags); +} + +/* + release the transaction lock + */ +int tdb_transaction_unlock(struct tdb_context *tdb, int ltype) +{ + return tdb_nest_unlock(tdb, TRANSACTION_LOCK, ltype, false); +} +#endif + +/* We only need to lock individual bytes, but Linux merges consecutive locks + * so we lock in contiguous ranges. */ +static int tdb_lock_gradual(struct tdb_context *tdb, + int ltype, enum tdb_lock_flags flags, + tdb_off_t off, tdb_off_t len) +{ + int ret; + enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT); + + if (len <= 4) { + /* Single record. Just do blocking lock. */ + return tdb_brlock(tdb, ltype, off, len, flags); + } + + /* First we try non-blocking. */ + ret = tdb_brlock(tdb, ltype, off, len, nb_flags); + if (ret == 0) { + return 0; + } + + /* Try locking first half, then second. */ + ret = tdb_lock_gradual(tdb, ltype, flags, off, len / 2); + if (ret == -1) + return -1; + + ret = tdb_lock_gradual(tdb, ltype, flags, + off + len / 2, len - len / 2); + if (ret == -1) { + tdb_brunlock(tdb, ltype, off, len / 2); + return -1; + } + return 0; +} + +/* lock/unlock entire database. It can only be upgradable if you have some + * other way of guaranteeing exclusivity (ie. transaction write lock). + * Note that we don't lock the free chains: noone can get those locks + * without a hash chain lock first. 
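+ * The hash range is taken with tdb_lock_gradual(): try the whole range
+ * without blocking first, and only on contention fall back to taking
+ * the two halves (recursively) with blocking locks; the kernel merges
+ * the pieces back into a single lock once we hold them all.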
*/
+int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
+		       enum tdb_lock_flags flags, bool upgradable)
+{
+	tdb_off_t hash_size;
+
+	/* FIXME: There are no locks on read-only dbs */
+	if (tdb->read_only) {
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_lock: read-only\n");
+		return -1;
+	}
+
+	if (tdb->allrecord_lock.count && tdb->allrecord_lock.ltype == ltype) {
+		tdb->allrecord_lock.count++;
+		return 0;
+	}
+
+	if (tdb->allrecord_lock.count) {
+		/* a global lock of a different type exists */
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_lock: already have %s lock\n",
+			 tdb->allrecord_lock.ltype == F_RDLCK
+			 ? "read" : "write");
+		return -1;
+	}
+
+	if (tdb_has_locks(tdb)) {
+		/* can't combine global and chain locks */
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_lock: already have chain lock\n");
+		return -1;
+	}
+
+	if (upgradable && ltype != F_RDLCK) {
+		/* tdb error: you can't upgrade a write lock! */
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_lock: can't upgrade a write lock\n");
+		return -1;
+	}
+
+	/* Lock all the hash buckets. */
+again:
+	hash_size = (1ULL << tdb->header.v.hash_bits);
+	if (tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START,
+			     hash_size)) {
+		if (!(flags & TDB_LOCK_PROBE)) {
+			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+				 "tdb_lockall hashes failed (%s)\n",
+				 strerror(errno));
+		}
+		return -1;
+	}
+
+	/* Now we re-check header, holding lock. */
+	if (unlikely(update_header(tdb))) {
+		tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, hash_size);
+		goto again;
+	}
+
+	/* Now check for needing recovery. */
+	if (unlikely(tdb_needs_recovery(tdb))) {
+		tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, hash_size);
+		if (tdb_lock_and_recover(tdb) == -1) {
+			return -1;
+		}
+		goto again;
+	}
+
+
+	tdb->allrecord_lock.count = 1;
+	/* If it's upgradable, it's actually exclusive so we can treat
+	 * it as a write lock. */
+	tdb->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
+	tdb->allrecord_lock.off = upgradable;
+	return 0;
+}
+
+int tdb_lock_open(struct tdb_context *tdb)
+{
+	return tdb_nest_lock(tdb, TDB_OPEN_LOCK, F_WRLCK, TDB_LOCK_WAIT);
+}
+
+void tdb_unlock_open(struct tdb_context *tdb)
+{
+	tdb_nest_unlock(tdb, TDB_OPEN_LOCK, F_WRLCK);
+}
+
+/* unlock entire db */
+int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype)
+{
+	tdb_off_t hash_size;
+
+	/* FIXME: There are no locks on read-only dbs */
+	if (tdb->read_only) {
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_unlock: read-only\n");
+		return -1;
+	}
+
+	if (tdb->allrecord_lock.count == 0) {
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_unlock: not locked!\n");
+		return -1;
+	}
+
+	/* Upgradable locks are marked as write locks. */
+	if (tdb->allrecord_lock.ltype != ltype
+	    && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_allrecord_unlock: have %s lock\n",
+			 tdb->allrecord_lock.ltype == F_RDLCK
+			 ? "read" : "write");
+		return -1;
+	}
+
+	if (tdb->allrecord_lock.count > 1) {
+		tdb->allrecord_lock.count--;
+		return 0;
+	}
+
+	tdb->allrecord_lock.count = 0;
+	tdb->allrecord_lock.ltype = 0;
+
+	hash_size = (1ULL << tdb->header.v.hash_bits);
+
+	return tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, hash_size);
+}
+
+bool tdb_has_locks(struct tdb_context *tdb)
+{
+	return tdb->allrecord_lock.count || tdb->num_lockrecs;
+}
+
+#if 0
+/* lock entire database with write lock */
+int tdb_lockall(struct tdb_context *tdb)
+{
+	tdb_trace(tdb, "tdb_lockall");
+	return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
+}
+
+/* lock entire database with write lock - nonblocking variant */
+int tdb_lockall_nonblock(struct tdb_context *tdb)
+{
+	int ret = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_NOWAIT, false);
+	tdb_trace_ret(tdb, "tdb_lockall_nonblock", ret);
+	return ret;
+}
+
+/* unlock entire database with write lock */
+int tdb_unlockall(struct tdb_context *tdb)
+{
+	tdb_trace(tdb, "tdb_unlockall");
+	return tdb_allrecord_unlock(tdb, F_WRLCK);
+}
+
+/* lock entire database with read lock */
+int tdb_lockall_read(struct tdb_context *tdb)
+{
+	tdb_trace(tdb, "tdb_lockall_read");
+	return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
+}
+
+/* lock entire database with read lock - nonblocking variant */
+int tdb_lockall_read_nonblock(struct tdb_context *tdb)
+{
+	int ret = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_NOWAIT, false);
+	tdb_trace_ret(tdb, "tdb_lockall_read_nonblock", ret);
+	return ret;
+}
+
+/* unlock entire database with read lock */
+int tdb_unlockall_read(struct tdb_context *tdb)
+{
+	tdb_trace(tdb, "tdb_unlockall_read");
+	return tdb_allrecord_unlock(tdb, F_RDLCK);
+}
+#endif
+
+int tdb_lock_list(struct tdb_context *tdb, tdb_off_t list,
+		  int ltype, enum tdb_lock_flags waitflag)
+{
+	/* an allrecord lock allows us to avoid per chain locks */
+	if (tdb->allrecord_lock.count &&
+	    (ltype == tdb->allrecord_lock.ltype || ltype == F_RDLCK)) {
+		return 0;
+	}
+
+	if (tdb->allrecord_lock.count) {
+		tdb->ecode = TDB_ERR_LOCK;
+		tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
+			 "tdb_lock_list: have %s allrecordlock\n",
+			 tdb->allrecord_lock.ltype == F_RDLCK
+			 ? "read" : "write");
+		return -1;
+	}
+
+	/* FIXME: Should we do header_uptodate and return retry here? */
+	return tdb_nest_lock(tdb, TDB_HASH_LOCK_START + list, ltype, waitflag);
+}
+
+int tdb_unlock_list(struct tdb_context *tdb, tdb_off_t list, int ltype)
+{
+	/* an allrecord lock allows us to avoid per chain locks */
+	if (tdb->allrecord_lock.count) {
+		if (tdb->allrecord_lock.ltype == F_RDLCK
+		    && ltype == F_WRLCK) {
+			tdb->ecode = TDB_ERR_LOCK;
+			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+				 "tdb_unlock_list RO allrecord!\n");
+			return -1;
+		}
+		return 0;
+	} else {
+		return tdb_nest_unlock(tdb, TDB_HASH_LOCK_START + list, ltype);
+	}
+}
+
+/* Free list locks come after hash locks */
+int tdb_lock_free_list(struct tdb_context *tdb, tdb_off_t flist,
+		       enum tdb_lock_flags waitflag)
+{
+	/* You're supposed to have a hash lock first!
*/ + if (!tdb_has_locks(tdb)) { + tdb->ecode = TDB_ERR_LOCK; + tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, + "tdb_lock_free_list without lock!\n"); + return -1; + } + + /* a allrecord lock allows us to avoid per chain locks */ + if (tdb->allrecord_lock.count) { + if (tdb->allrecord_lock.ltype == F_WRLCK) + return 0; + tdb->ecode = TDB_ERR_LOCK; + tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, + "tdb_lock_free_list with RO allrecordlock!\n"); + return -1; + } + + return tdb_nest_lock(tdb, TDB_HASH_LOCK_START + + (1ULL << tdb->header.v.hash_bits) + + flist, F_WRLCK, waitflag); +} + +void tdb_unlock_free_list(struct tdb_context *tdb, tdb_off_t flist) +{ + if (tdb->allrecord_lock.count) + return; + + tdb_nest_unlock(tdb, TDB_HASH_LOCK_START + + (1ULL << tdb->header.v.hash_bits) + + flist, F_WRLCK); +} + +#if 0 +static int chainlock_loop(struct tdb_context *tdb, const TDB_DATA *key, + int ltype, enum tdb_lock_flags waitflag, + const char *func) +{ + int ret; + uint64_t h = tdb_hash(tdb, key->dptr, key->dsize); + +again: + ret = tdb_lock_list(tdb, + h & ((1ULL << tdb->header.v.hash_bits) - 1), + ltype, waitflag); + if (likely(ret == 0) && unlikely(update_header(tdb))) { + tdb_unlock_list(tdb, h & ((1ULL << tdb->header.v.hash_bits)-1), + ltype); + goto again; + } + + tdb_trace_1rec(tdb, func, *key); + return ret; +} + +/* lock/unlock one hash chain. This is meant to be used to reduce + contention - it cannot guarantee how many records will be locked */ +int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key) +{ + return chainlock_loop(tdb, &key, F_WRLCK, TDB_LOCK_WAIT, + "tdb_chainlock"); +} + +/* lock/unlock one hash chain, non-blocking. This is meant to be used + to reduce contention - it cannot guarantee how many records will be + locked */ +int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key) +{ + return chainlock_loop(tdb, &key, F_WRLCK, TDB_LOCK_NOWAIT, + "tdb_chainlock_nonblock"); +} + +int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key) +{ + uint64_t h = tdb_hash(tdb, key.dptr, key.dsize); + tdb_trace_1rec(tdb, "tdb_chainunlock", key); + return tdb_unlock_list(tdb, h & ((1ULL << tdb->header.v.hash_bits)-1), + F_WRLCK); +} + +int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key) +{ + return chainlock_loop(tdb, &key, F_RDLCK, TDB_LOCK_WAIT, + "tdb_chainlock_read"); +} + +int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key) +{ + uint64_t h = tdb_hash(tdb, key.dptr, key.dsize); + tdb_trace_1rec(tdb, "tdb_chainunlock_read", key); + return tdb_unlock_list(tdb, h & ((1ULL << tdb->header.v.hash_bits)-1), + F_RDLCK); +} + +/* record lock stops delete underneath */ +int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off) +{ + if (tdb->allrecord_lock.count) { + return 0; + } + return off ? tdb_brlock(tdb, F_RDLCK, off, 1, TDB_LOCK_WAIT) : 0; +} + +/* + Write locks override our own fcntl readlocks, so check it here. + Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not + an error to fail to get the lock here. 
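+  (The travlocks walk below serves the same end: if any traverse is
+  paused on this record, we must not write-lock it for deletion.)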
+*/ +int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off) +{ + struct tdb_traverse_lock *i; + for (i = &tdb->travlocks; i; i = i->next) + if (i->off == off) + return -1; + if (tdb->allrecord_lock.count) { + if (tdb->allrecord_lock.ltype == F_WRLCK) { + return 0; + } + return -1; + } + return tdb_brlock(tdb, F_WRLCK, off, 1, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE); +} + +int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off) +{ + if (tdb->allrecord_lock.count) { + return 0; + } + return tdb_brunlock(tdb, F_WRLCK, off, 1); +} + +/* fcntl locks don't stack: avoid unlocking someone else's */ +int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off) +{ + struct tdb_traverse_lock *i; + uint32_t count = 0; + + if (tdb->allrecord_lock.count) { + return 0; + } + + if (off == 0) + return 0; + for (i = &tdb->travlocks; i; i = i->next) + if (i->off == off) + count++; + return (count == 1 ? tdb_brunlock(tdb, F_RDLCK, off, 1) : 0); +} + +/* The transaction code uses this to remove all locks. */ +void tdb_release_transaction_locks(struct tdb_context *tdb) +{ + unsigned int i; + + if (tdb->allrecord_lock.count != 0) { + tdb_off_t hash_size, free_size; + + hash_size = (1ULL << tdb->header.v.hash_bits) + * sizeof(tdb_off_t); + free_size = tdb->header.v.free_zones + * (tdb->header.v.free_buckets + 1) * sizeof(tdb_off_t); + + tdb_brunlock(tdb, tdb->allrecord_lock.ltype, + tdb->header.v.hash_off, hash_size); + tdb_brunlock(tdb, tdb->allrecord_lock.ltype, + tdb->header.v.free_off, free_size); + tdb->allrecord_lock.count = 0; + tdb->allrecord_lock.ltype = 0; + } + + for (i = 0; inum_lockrecs; i++) { + struct tdb_lock_type *lck = &tdb->lockrecs[i]; + + tdb_brunlock(tdb, lck->ltype, lck->off, 1); + } + tdb->num_lockrecs = 0; + SAFE_FREE(tdb->lockrecs); + tdb->header_uptodate = false; +} +#endif diff --git a/ccan/tdb2/private.h b/ccan/tdb2/private.h new file mode 100644 index 0000000..1fe1563 --- /dev/null +++ b/ccan/tdb2/private.h @@ -0,0 +1,456 @@ +#ifndef TDB_PRIVATE_H +#define TDB_PRIVATE_H + /* + Trivial Database 2: private types and prototypes + Copyright (C) Rusty Russell 2010 + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 3 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, see . 
+*/ + +#define _XOPEN_SOURCE 500 +#define _FILE_OFFSET_BITS 64 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "config.h" +#include +#include +#ifdef HAVE_BYTESWAP_H +#include +#endif + +#ifndef TEST_IT +#define TEST_IT(cond) +#endif + +/* #define TDB_TRACE 1 */ + +#ifndef __STRING +#define __STRING(x) #x +#endif + +#ifndef __STRINGSTRING +#define __STRINGSTRING(x) __STRING(x) +#endif + +#ifndef __location__ +#define __location__ __FILE__ ":" __STRINGSTRING(__LINE__) +#endif + +typedef uint64_t tdb_len_t; +typedef uint64_t tdb_off_t; + +#ifndef offsetof +#define offsetof(t,f) ((unsigned int)&((t *)0)->f) +#endif + +#define TDB_MAGIC_FOOD "TDB file\n" +#define TDB_VERSION ((uint64_t)(0x26011967 + 7)) +#define TDB_MAGIC ((uint64_t)0x1999) +#define TDB_FREE_MAGIC (~(uint64_t)TDB_MAGIC) +#define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL) +#define TDB_RECOVERY_MAGIC (0xf53bc0e7U) +#define TDB_RECOVERY_INVALID_MAGIC (0x0) +#define TDB_EXTRA_HASHBITS (11) /* We steal 11 bits to stash hash info. */ +#define TDB_EXTRA_HASHBITS_NUM (3) + +#define TDB_OFF_ERR ((tdb_off_t)-1) + +/* Prevent others from opening the file. */ +#define TDB_OPEN_LOCK 0 +/* Doing a transaction. */ +#define TDB_TRANSACTION_LOCK 1 +/* Hash chain locks. */ +#define TDB_HASH_LOCK_START 2 + +/* We start wih 256 hash buckets, 10 free buckets. A 1k-sized zone. */ +#define INITIAL_HASH_BITS 8 +#define INITIAL_FREE_BUCKETS 10 +#define INITIAL_ZONE_BITS 10 + +#if !HAVE_BSWAP_64 +static inline uint64_t bswap_64(uint64_t x) +{ + return (((x&0x000000FFULL)<<56) + | ((x&0x0000FF00ULL)<<48) + | ((x&0x00FF0000ULL)<<40) + | ((x&0xFF000000ULL)<<32) + | ((x>>8)&0xFF000000ULL) + | ((x>>16)&0x00FF0000ULL) + | ((x>>24)&0x0000FF00ULL) + | ((x>>32)&0x000000FFULL)); +} +#endif + +struct tdb_used_record { + /* For on-disk compatibility, we avoid bitfields: + magic: 16, (highest) + key_len_bits: 5, + hash:11, + extra_padding: 32 (lowest) + */ + uint64_t magic_and_meta; + /* The bottom key_len_bits*2 are key length, rest is data length. */ + uint64_t key_and_data_len; +}; + +static inline unsigned rec_key_bits(const struct tdb_used_record *r) +{ + return ((r->magic_and_meta >> 43) & ((1 << 5)-1)) * 2; +} + +static inline uint64_t rec_key_length(const struct tdb_used_record *r) +{ + return r->key_and_data_len & ((1ULL << rec_key_bits(r)) - 1); +} + +static inline uint64_t rec_data_length(const struct tdb_used_record *r) +{ + return r->key_and_data_len >> rec_key_bits(r); +} + +static inline uint64_t rec_extra_padding(const struct tdb_used_record *r) +{ + return r->magic_and_meta & 0xFFFFFFFF; +} + +static inline uint64_t rec_hash(const struct tdb_used_record *r) +{ + return ((r->magic_and_meta >> 32) & ((1ULL << 11) - 1)) << (64 - 11); +} + +static inline uint16_t rec_magic(const struct tdb_used_record *r) +{ + return (r->magic_and_meta >> 48); +} + +struct tdb_free_record { + uint64_t magic; + uint64_t data_len; /* Not counting these two fields. */ + /* This is why the minimum record size is 16 bytes. */ + uint64_t next, prev; +}; + +/* These parts can change while we have db open. */ +struct tdb_header_volatile { + uint64_t generation; /* Makes sure it changes on every update. */ + uint64_t hash_bits; /* Entries in hash table. */ + uint64_t hash_off; /* Offset of hash table. */ + uint64_t num_zones; /* How many zones in the file. */ + uint64_t zone_bits; /* Size of zones. */ + uint64_t free_buckets; /* How many buckets in each zone. 
*/ + uint64_t free_off; /* Arrays of free entries. */ +}; + +/* this is stored at the front of every database */ +struct tdb_header { + char magic_food[32]; /* for /etc/magic */ + uint64_t version; /* version of the code */ + uint64_t hash_test; /* result of hashing HASH_MAGIC. */ + uint64_t hash_seed; /* "random" seed written at creation time. */ + + struct tdb_header_volatile v; + + tdb_off_t reserved[19]; +}; + +enum tdb_lock_flags { + /* WAIT == F_SETLKW, NOWAIT == F_SETLK */ + TDB_LOCK_NOWAIT = 0, + TDB_LOCK_WAIT = 1, + /* If set, don't log an error on failure. */ + TDB_LOCK_PROBE = 2, +}; + +struct tdb_lock_type { + uint32_t off; + uint32_t count; + uint32_t ltype; +}; + +struct tdb_context { + /* Filename of the database. */ + const char *name; + + /* Mmap (if any), or malloc (for TDB_INTERNAL). */ + void *map_ptr; + + /* Open file descriptor (undefined for TDB_INTERNAL). */ + int fd; + + /* How much space has been mapped (<= current file size) */ + tdb_len_t map_size; + + /* Opened read-only? */ + bool read_only; + + /* Error code for last tdb error. */ + enum TDB_ERROR ecode; + + /* A cached copy of the header */ + struct tdb_header header; + /* (for debugging). */ + bool header_uptodate; + + /* the flags passed to tdb_open, for tdb_reopen. */ + uint32_t flags; + + /* Logging function */ + tdb_logfn_t log; + void *log_priv; + + /* Hash function. */ + tdb_hashfn_t khash; + void *hash_priv; + + /* What zone of the tdb to use, for spreading load. */ + uint64_t last_zone; + + /* IO methods: changes for transactions. */ + const struct tdb_methods *methods; + + /* Lock information */ + struct tdb_lock_type allrecord_lock; + uint64_t num_lockrecs; + struct tdb_lock_type *lockrecs; + + /* Set if we are in a transaction. */ + struct tdb_transaction *transaction; + + /* Single list of all TDBs, to avoid multiple opens. */ + struct tdb_context *next; + dev_t device; + ino_t inode; +}; + +struct tdb_methods { + int (*read)(struct tdb_context *, tdb_off_t, void *, tdb_len_t); + int (*write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t); + int (*oob)(struct tdb_context *, tdb_off_t, bool); + int (*expand_file)(struct tdb_context *, tdb_len_t, tdb_len_t); +}; + +/* + internal prototypes +*/ +/* tdb.c: */ +/* Returns true if header changed. */ +bool update_header(struct tdb_context *tdb); + +/* Hash random memory. */ +uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len); + + +/* free.c: */ +uint64_t random_free_zone(struct tdb_context *tdb); + +/* If this fails, try tdb_expand. */ +tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen, + uint64_t hash, bool growing); + +/* Put this record in a free list. */ +int add_free_record(struct tdb_context *tdb, + tdb_off_t off, tdb_len_t len_with_header); + +/* Set up header for a used record. */ +int set_header(struct tdb_context *tdb, + struct tdb_used_record *rec, + uint64_t keylen, uint64_t datalen, + uint64_t actuallen, uint64_t hash); + +/* Used by tdb_check to verify. */ +unsigned int size_to_bucket(struct tdb_context *tdb, tdb_len_t data_len); +tdb_off_t zone_of(struct tdb_context *tdb, tdb_off_t off); + +/* io.c: */ +/* Initialize tdb->methods. */ +void tdb_io_init(struct tdb_context *tdb); + +/* Convert endian of the buffer if required. */ +void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size); + +/* Unmap and try to map the tdb. 
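+ * (Mapping is best-effort: with TDB_NOMMAP, or if mmap() fails,
+ * map_ptr stays NULL and all access falls back to pread/pwrite.)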
*/ +void tdb_munmap(struct tdb_context *tdb); +void tdb_mmap(struct tdb_context *tdb); + +/* Hand data to a function, direct if possible */ +int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key, + tdb_off_t offset, tdb_len_t len, + int (*parser)(TDB_DATA key, TDB_DATA data, + void *private_data), + void *private_data); + +/* Either make a copy into pad and return that, or return ptr into mmap. + * Converts endian (ie. will use pad in that case). */ +void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len); + +/* Either alloc a copy, or give direct access. Release frees or noop. */ +const void *tdb_access_read(struct tdb_context *tdb, + tdb_off_t off, tdb_len_t len); +void tdb_access_release(struct tdb_context *tdb, const void *p); + +/* Convenience routine to get an offset. */ +tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off); + +/* Write an offset at an offset. */ +int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val); + +/* Clear an ondisk area. */ +int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len); + +/* Return a non-zero offset in this array, or num. */ +tdb_off_t tdb_find_nonzero_off(struct tdb_context *tdb, tdb_off_t off, + uint64_t num); + +/* Return a zero offset in this array, or num. */ +tdb_off_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off, + uint64_t num); + +/* Even on files, we can get partial writes due to signals. */ +bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off); + +/* Allocate and make a copy of some offset. */ +void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len); + +/* Munges record and writes it */ +int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off, + void *rec, size_t len); + +/* Reads record and converts it */ +int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off, + void *rec, size_t len); + +/* Hash on disk. */ +uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off); + +/* lock.c: */ +/* Lock/unlock a particular hash list. */ +int tdb_lock_list(struct tdb_context *tdb, tdb_off_t list, + int ltype, enum tdb_lock_flags waitflag); +int tdb_unlock_list(struct tdb_context *tdb, tdb_off_t list, int ltype); + +/* Lock/unlock a particular free list. */ +int tdb_lock_free_list(struct tdb_context *tdb, tdb_off_t flist, + enum tdb_lock_flags waitflag); +void tdb_unlock_free_list(struct tdb_context *tdb, tdb_off_t flist); + +/* Do we have any locks? */ +bool tdb_has_locks(struct tdb_context *tdb); + +/* Lock entire database. */ +int tdb_allrecord_lock(struct tdb_context *tdb, int ltype, + enum tdb_lock_flags flags, bool upgradable); +int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype); + +/* Serialize db open. */ +int tdb_lock_open(struct tdb_context *tdb); +void tdb_unlock_open(struct tdb_context *tdb); +/* Expand the file. */ +int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen, + bool growing); + +#if 0 +/* Low-level locking primitives. 
*/ +int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype, + enum tdb_lock_flags flags); +int tdb_nest_unlock(struct tdb_context *tdb, tdb_off_t offset, int ltype); + +int tdb_munmap(struct tdb_context *tdb); +void tdb_mmap(struct tdb_context *tdb); +int tdb_lock(struct tdb_context *tdb, int list, int ltype); +int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype); +bool tdb_have_locks(struct tdb_context *tdb); +int tdb_unlock(struct tdb_context *tdb, int list, int ltype); +int tdb_brlock(struct tdb_context *tdb, + int rw_type, tdb_off_t offset, size_t len, + enum tdb_lock_flags flags); +int tdb_brunlock(struct tdb_context *tdb, + int rw_type, tdb_off_t offset, size_t len); +bool tdb_have_extra_locks(struct tdb_context *tdb); +void tdb_release_extra_locks(struct tdb_context *tdb); +int tdb_transaction_lock(struct tdb_context *tdb, int ltype); +int tdb_transaction_unlock(struct tdb_context *tdb, int ltype); +int tdb_allrecord_lock(struct tdb_context *tdb, int ltype, + enum tdb_lock_flags flags, bool upgradable); +int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype); +int tdb_allrecord_upgrade(struct tdb_context *tdb); +int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off); +int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off); +int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d); +int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d); +int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec); +tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_record *rec); +int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d); +int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d); +int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off); +int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off); +bool tdb_needs_recovery(struct tdb_context *tdb); +int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec); +int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec); +int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct tdb_record *rec); +unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len); +int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key, + tdb_off_t offset, tdb_len_t len, + int (*parser)(TDB_DATA key, TDB_DATA data, + void *private_data), + void *private_data); +tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype, + struct tdb_record *rec); +void tdb_io_init(struct tdb_context *tdb); +int tdb_expand(struct tdb_context *tdb, tdb_off_t size); +int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off, + struct tdb_record *rec); +#endif + +#ifdef TDB_TRACE +void tdb_trace(struct tdb_context *tdb, const char *op); +void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op); +void tdb_trace_open(struct tdb_context *tdb, const char *op, + unsigned hash_size, unsigned tdb_flags, unsigned open_flags); +void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret); +void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret); +void tdb_trace_1rec(struct tdb_context *tdb, const char *op, + TDB_DATA rec); +void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op, + TDB_DATA rec, int ret); +void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op, + TDB_DATA rec, TDB_DATA ret); +void 
tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op, + TDB_DATA rec1, TDB_DATA rec2, unsigned flag, + int ret); +void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op, + TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret); +#else +#define tdb_trace(tdb, op) +#define tdb_trace_seqnum(tdb, seqnum, op) +#define tdb_trace_open(tdb, op, hash_size, tdb_flags, open_flags) +#define tdb_trace_ret(tdb, op, ret) +#define tdb_trace_retrec(tdb, op, ret) +#define tdb_trace_1rec(tdb, op, rec) +#define tdb_trace_1rec_ret(tdb, op, rec, ret) +#define tdb_trace_1rec_retrec(tdb, op, rec, ret) +#define tdb_trace_2rec_flag_ret(tdb, op, rec1, rec2, flag, ret) +#define tdb_trace_2rec_retrec(tdb, op, rec1, rec2, ret) +#endif /* !TDB_TRACE */ + +#endif diff --git a/ccan/tdb2/tdb.c b/ccan/tdb2/tdb.c new file mode 100644 index 0000000..3cee472 --- /dev/null +++ b/ccan/tdb2/tdb.c @@ -0,0 +1,875 @@ +#include "private.h" +#include +#include +#include +#include + +/* The null return. */ +struct tdb_data tdb_null = { .dptr = NULL, .dsize = 0 }; + +/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */ +static struct tdb_context *tdbs = NULL; + +PRINTF_ATTRIBUTE(4, 5) static void +null_log_fn(struct tdb_context *tdb, + enum tdb_debug_level level, void *priv, + const char *fmt, ...) +{ +} + +/* We do a lot of work assuming our copy of the header volatile area + * is uptodate, and usually it is. However, once we grab a lock, we have to + * re-check it. */ +bool update_header(struct tdb_context *tdb) +{ + struct tdb_header_volatile pad, *v; + + if (tdb->header_uptodate) { + tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv, + "warning: header uptodate already\n"); + } + + /* We could get a partial update if we're not holding any locks. */ + assert(tdb_has_locks(tdb)); + + v = tdb_get(tdb, offsetof(struct tdb_header, v), &pad, sizeof(*v)); + if (!v) { + /* On failure, imply we updated header so they retry. */ + return true; + } + tdb->header_uptodate = true; + if (likely(memcmp(&tdb->header.v, v, sizeof(*v)) == 0)) { + return false; + } + tdb->header.v = *v; + return true; +} + +static uint64_t jenkins_hash(const void *key, size_t length, uint64_t seed, + void *arg) +{ + return hash64_any(key, length, seed); +} + +uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len) +{ + return tdb->khash(ptr, len, tdb->header.hash_seed, tdb->hash_priv); +} + +static bool tdb_already_open(dev_t device, ino_t ino) +{ + struct tdb_context *i; + + for (i = tdbs; i; i = i->next) { + if (i->device == device && i->inode == ino) { + return true; + } + } + + return false; +} + +static uint64_t random_number(struct tdb_context *tdb) +{ + int fd; + uint64_t ret = 0; + struct timeval now; + + fd = open("/dev/urandom", O_RDONLY); + if (fd >= 0) { + if (read(fd, &ret, sizeof(ret)) == sizeof(ret)) { + tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv, + "tdb_open: random from /dev/urandom\n"); + close(fd); + return ret; + } + close(fd); + } + /* FIXME: Untested! Based on Wikipedia protocol description! */ + fd = open("/dev/egd-pool", O_RDWR); + if (fd >= 0) { + /* Command is 1, next byte is size we want to read. */ + char cmd[2] = { 1, sizeof(uint64_t) }; + if (write(fd, cmd, sizeof(cmd)) == sizeof(cmd)) { + char reply[1 + sizeof(uint64_t)]; + int r = read(fd, reply, sizeof(reply)); + if (r > 1) { + tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv, + "tdb_open: %u random bytes from" + " /dev/egd-pool\n", r-1); + /* Copy at least some bytes. 
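+				 * (An EGD reply can carry fewer than the
+				 * eight bytes asked for: reply[0] says how
+				 * many follow, roughly
+				 *   cmd:   { 0x01, nbytes }
+				 *   reply: { count, byte0, byte1, ... }
+				 * so we only return early on a full
+				 * uint64_t's worth.)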
*/ + memcpy(&ret, reply+1, r - 1); + if (reply[0] == sizeof(uint64_t) + && r == sizeof(reply)) { + close(fd); + return ret; + } + } + } + close(fd); + } + + /* Fallback: pid and time. */ + gettimeofday(&now, NULL); + ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec; + tdb->log(tdb, TDB_DEBUG_TRACE, tdb->log_priv, + "tdb_open: random from getpid and time\n"); + return ret; +} + +struct new_database { + struct tdb_header hdr; + struct tdb_used_record hrec; + tdb_off_t hash[1ULL << INITIAL_HASH_BITS]; + struct tdb_used_record frec; + tdb_off_t free[INITIAL_FREE_BUCKETS + 1]; /* One overflow bucket */ +}; + +/* initialise a new database */ +static int tdb_new_database(struct tdb_context *tdb) +{ + /* We make it up in memory, then write it out if not internal */ + struct new_database newdb; + + /* Fill in the header */ + newdb.hdr.version = TDB_VERSION; + newdb.hdr.hash_seed = random_number(tdb); + newdb.hdr.hash_test = TDB_HASH_MAGIC; + newdb.hdr.hash_test = tdb->khash(&newdb.hdr.hash_test, + sizeof(newdb.hdr.hash_test), + newdb.hdr.hash_seed, + tdb->hash_priv); + + newdb.hdr.v.generation = 0; + + /* Free array has 1 zone, 10 buckets. All buckets empty. */ + newdb.hdr.v.num_zones = 1; + newdb.hdr.v.zone_bits = INITIAL_ZONE_BITS; + newdb.hdr.v.free_buckets = INITIAL_FREE_BUCKETS; + newdb.hdr.v.free_off = offsetof(struct new_database, free); + set_header(tdb, &newdb.frec, 0, + sizeof(newdb.free), sizeof(newdb.free), 0); + memset(newdb.free, 0, sizeof(newdb.free)); + + /* Initial hashes are empty. */ + newdb.hdr.v.hash_bits = INITIAL_HASH_BITS; + newdb.hdr.v.hash_off = offsetof(struct new_database, hash); + set_header(tdb, &newdb.hrec, 0, + sizeof(newdb.hash), sizeof(newdb.hash), 0); + memset(newdb.hash, 0, sizeof(newdb.hash)); + + if (tdb->flags & TDB_INTERNAL) { + tdb->map_size = sizeof(newdb); + tdb->map_ptr = malloc(tdb->map_size); + if (!tdb->map_ptr) { + tdb->ecode = TDB_ERR_OOM; + return -1; + } + memcpy(tdb->map_ptr, &newdb, tdb->map_size); + tdb->header = newdb.hdr; + /* Convert the `ondisk' version if asked. */ + tdb_convert(tdb, tdb->map_ptr, sizeof(newdb)); + return 0; + } + if (lseek(tdb->fd, 0, SEEK_SET) == -1) + return -1; + + if (ftruncate(tdb->fd, 0) == -1) + return -1; + + /* This creates an endian-converted header, as if read from disk */ + tdb->header = newdb.hdr; + tdb_convert(tdb, &tdb->header, sizeof(tdb->header)); + + /* Don't endian-convert the magic food! 
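+	 * (It must read the same on either endian machine, so file(1) and
+	 * /etc/magic can always recognise a tdb; byte order is instead
+	 * detected from the version field, which tdb_open accepts in
+	 * either byte order.)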
*/ + memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food)); + strcpy(newdb.hdr.magic_food, TDB_MAGIC_FOOD); + + if (!tdb_pwrite_all(tdb->fd, &newdb, sizeof(newdb), 0)) { + tdb->ecode = TDB_ERR_IO; + return -1; + } + return 0; +} + +struct tdb_context *tdb_open(const char *name, int tdb_flags, + int open_flags, mode_t mode, + union tdb_attribute *attr) +{ + struct tdb_context *tdb; + struct stat st; + int save_errno; + uint64_t hash_test; + unsigned v; + + if (!(tdb = (struct tdb_context *)calloc(1, sizeof *tdb))) { + /* Can't log this */ + errno = ENOMEM; + goto fail; + } + tdb->fd = -1; + tdb->name = NULL; + tdb->map_ptr = NULL; + tdb->flags = tdb_flags; + tdb->log = null_log_fn; + tdb->log_priv = NULL; + tdb->khash = jenkins_hash; + tdb->hash_priv = NULL; + + /* FIXME */ + if (attr) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_open: attributes not yet supported\n"); + errno = EINVAL; + goto fail; + } + + if ((open_flags & O_ACCMODE) == O_WRONLY) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_open: can't open tdb %s write-only\n", name); + errno = EINVAL; + goto fail; + } + + if ((open_flags & O_ACCMODE) == O_RDONLY) { + tdb->read_only = 1; + /* read only databases don't do locking */ + tdb->flags |= TDB_NOLOCK; + } + + /* internal databases don't mmap or lock */ + if (tdb->flags & TDB_INTERNAL) { + tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP); + if (tdb_new_database(tdb) != 0) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_open: tdb_new_database failed!"); + goto fail; + } + TEST_IT(tdb->flags & TDB_CONVERT); + goto internal; + } + + if ((tdb->fd = open(name, open_flags, mode)) == -1) { + tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv, + "tdb_open: could not open file %s: %s\n", + name, strerror(errno)); + goto fail; /* errno set by open(2) */ + } + + /* on exec, don't inherit the fd */ + v = fcntl(tdb->fd, F_GETFD, 0); + fcntl(tdb->fd, F_SETFD, v | FD_CLOEXEC); + + /* ensure there is only one process initialising at once */ + if (tdb_lock_open(tdb) == -1) { + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_open: failed to get open lock on %s: %s\n", + name, strerror(errno)); + goto fail; /* errno set by tdb_brlock */ + } + + errno = 0; + if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header) + || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0) { + if (!(open_flags & O_CREAT) || tdb_new_database(tdb) == -1) { + if (errno == 0) { + errno = EIO; /* ie bad format or something */ + } + goto fail; + } + } else if (tdb->header.version != TDB_VERSION) { + if (tdb->header.version == bswap_64(TDB_VERSION)) + tdb->flags |= TDB_CONVERT; + else { + /* wrong version */ + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_open: %s is unknown version 0x%llx\n", + name, (long long)tdb->header.version); + errno = EIO; + goto fail; + } + } + + tdb_convert(tdb, &tdb->header, sizeof(tdb->header)); + hash_test = TDB_HASH_MAGIC; + hash_test = tdb->khash(&hash_test, sizeof(hash_test), + tdb->header.hash_seed, tdb->hash_priv); + if (tdb->header.hash_test != hash_test) { + /* wrong hash variant */ + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_open: %s uses a different hash function\n", + name); + errno = EIO; + goto fail; + } + + if (fstat(tdb->fd, &st) == -1) + goto fail; + + /* Is it already in the open list? If so, fail. 
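+	 * (fcntl locks are per-process: a second tdb_context on the same
+	 * file would share them, and closing either fd drops the locks
+	 * held through the other, too.)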
*/ + if (tdb_already_open(st.st_dev, st.st_ino)) { + /* FIXME */ + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_open: %s (%d,%d) is already open in this process\n", + name, (int)st.st_dev, (int)st.st_ino); + errno = EBUSY; + goto fail; + } + + tdb->name = strdup(name); + if (!tdb->name) { + errno = ENOMEM; + goto fail; + } + + tdb->map_size = st.st_size; + tdb->device = st.st_dev; + tdb->inode = st.st_ino; + tdb_io_init(tdb); + tdb_mmap(tdb); + + internal: + /* Internal (memory-only) databases skip all the code above to + * do with disk files, and resume here by releasing their + * open lock and hooking into the active list. */ + tdb_unlock_open(tdb); + tdb->last_zone = random_free_zone(tdb); + tdb->next = tdbs; + tdbs = tdb; + return tdb; + + fail: + save_errno = errno; + + if (!tdb) + return NULL; + +#ifdef TDB_TRACE + close(tdb->tracefd); +#endif + if (tdb->map_ptr) { + if (tdb->flags & TDB_INTERNAL) { + free(tdb->map_ptr); + } else + tdb_munmap(tdb); + } + free((char *)tdb->name); + if (tdb->fd != -1) + if (close(tdb->fd) != 0) + tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, + "tdb_open: failed to close tdb->fd" + " on error!\n"); + free(tdb); + errno = save_errno; + return NULL; +} + +static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data) +{ + return memcmp(data.dptr, key.dptr, data.dsize) == 0; +} + +static void unlock_lists(struct tdb_context *tdb, + uint64_t start, uint64_t end, int ltype) +{ + do { + tdb_unlock_list(tdb, start, ltype); + start = (start + ((1ULL << tdb->header.v.hash_bits) - 1)) + & ((1ULL << tdb->header.v.hash_bits) - 1); + } while (start != end); +} + +/* FIXME: Return header copy? */ +/* Returns -1 or offset of entry (0 if not found). + * Locks hash entried from *start to *end (where the entry was found). */ +static tdb_off_t find_bucket_and_lock(struct tdb_context *tdb, + const struct tdb_data *key, + uint64_t hash, + uint64_t *start, + uint64_t *end, + uint64_t *room, + int ltype) +{ + uint64_t hextra; + tdb_off_t off; + + /* hash_bits might be out of date... */ +again: + *start = *end = hash & ((1ULL << tdb->header.v.hash_bits) - 1); + hextra = hash >> tdb->header.v.hash_bits; + + /* FIXME: can we avoid locks for some fast paths? */ + if (tdb_lock_list(tdb, *end, ltype, TDB_LOCK_WAIT) == -1) + return TDB_OFF_ERR; + + /* We only need to check this for first lock. */ + if (unlikely(update_header(tdb))) { + tdb_unlock_list(tdb, *end, ltype); + goto again; + } + + while ((off = tdb_read_off(tdb, tdb->header.v.hash_off + + *end * sizeof(tdb_off_t))) + != TDB_OFF_ERR) { + struct tdb_used_record pad, *r; + uint64_t keylen, next; + + /* Didn't find it? */ + if (!off) + return 0; + +#if 0 /* FIXME: Check other bits. */ + unsigned int bits, bitmask, hoffextra; + /* Bottom three bits show how many extra hash bits. */ + bits = (off & ((1 << TDB_EXTRA_HASHBITS_NUM) - 1)) + 1; + bitmask = (1 << bits)-1; + hoffextra = ((off >> TDB_EXTRA_HASHBITS_NUM) & bitmask); + if ((hextra & bitmask) != hoffextra) + goto lock_next; +#endif + + r = tdb_get(tdb, off, &pad, sizeof(*r)); + if (!r) + goto unlock_err; + + if (rec_magic(r) != TDB_MAGIC) { + tdb->ecode = TDB_ERR_CORRUPT; + tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, + "find_bucket_and_lock: bad magic 0x%llx" + " at offset %llu!\n", + (long long)rec_magic(r), (long long)off); + goto unlock_err; + } + + /* FIXME: check extra bits in header! 
*/ + keylen = rec_key_length(r); + if (keylen != key->dsize) + goto lock_next; + + switch (tdb_parse_data(tdb, *key, off + sizeof(*r), key->dsize, + tdb_key_compare, NULL)) { + case 1: + /* Match! */ + *room = rec_data_length(r) + rec_extra_padding(r); + return off >> TDB_EXTRA_HASHBITS_NUM; + case 0: + break; + default: + goto unlock_err; + } + + lock_next: + /* Lock next bucket. */ + /* FIXME: We can deadlock if this wraps! */ + next = (*end + 1) & ((1ULL << tdb->header.v.hash_bits) - 1); + if (next == *start) { + tdb->ecode = TDB_ERR_CORRUPT; + tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, + "find_bucket_and_lock: full hash table!\n"); + goto unlock_err; + } + if (tdb_lock_list(tdb, next, ltype, TDB_LOCK_WAIT) == -1) + goto unlock_err; + *end = next; + } + +unlock_err: + TEST_IT(*end < *start); + unlock_lists(tdb, *start, *end, ltype); + return TDB_OFF_ERR; +} + +static int update_rec_hdr(struct tdb_context *tdb, + tdb_off_t off, + tdb_len_t keylen, + tdb_len_t datalen, + tdb_len_t room, + uint64_t h) +{ + struct tdb_used_record rec; + + if (set_header(tdb, &rec, keylen, datalen, room - datalen, h)) + return -1; + + return tdb_write_convert(tdb, off, &rec, sizeof(rec)); +} + +/* If we fail, others will try after us. */ +static void enlarge_hash(struct tdb_context *tdb) +{ + tdb_off_t newoff, i; + uint64_t h, num = 1ULL << tdb->header.v.hash_bits; + struct tdb_used_record pad, *r; + + /* FIXME: We should do this without holding locks throughout. */ + if (tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false) == -1) + return; + + if (unlikely(update_header(tdb))) { + /* Someone else enlarged for us? Nothing to do. */ + if ((1ULL << tdb->header.v.hash_bits) != num) + goto unlock; + } + + newoff = alloc(tdb, 0, num * 2, 0, false); + if (unlikely(newoff == TDB_OFF_ERR)) + goto unlock; + if (unlikely(newoff == 0)) { + if (tdb_expand(tdb, 0, num * 2, false) == -1) + goto unlock; + newoff = alloc(tdb, 0, num * 2, 0, false); + if (newoff == TDB_OFF_ERR || newoff == 0) + goto unlock; + } + + /* FIXME: If the space before is empty, we know this is in its ideal + * location. We can steal a bit from the pointer to avoid rehash. */ + for (i = tdb_find_nonzero_off(tdb, tdb->header.v.hash_off, num); + i < num; + i += tdb_find_nonzero_off(tdb, tdb->header.v.hash_off + + i*sizeof(tdb_off_t), num - i)) { + tdb_off_t off; + off = tdb_read_off(tdb, tdb->header.v.hash_off + + i*sizeof(tdb_off_t)); + if (unlikely(off == TDB_OFF_ERR)) + goto unlock; + if (unlikely(!off)) { + tdb->ecode = TDB_ERR_CORRUPT; + tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv, + "find_bucket_and_lock: zero hash bucket!\n"); + goto unlock; + } + h = hash_record(tdb, off); + /* FIXME: Encode extra hash bits! */ + if (tdb_write_off(tdb, newoff + + (h & ((num * 2) - 1)) * sizeof(uint64_t), + off) == -1) + goto unlock; + } + + /* Free up old hash. */ + r = tdb_get(tdb, tdb->header.v.hash_off, &pad, sizeof(*r)); + if (!r) + goto unlock; + add_free_record(tdb, tdb->header.v.hash_off, + rec_data_length(r) + rec_extra_padding(r)); + + /* Now we write the modified header. 
*/ + tdb->header.v.generation++; + tdb->header.v.hash_bits++; + tdb->header.v.hash_off = newoff; + tdb_write_convert(tdb, offsetof(struct tdb_header, v), + &tdb->header.v, sizeof(tdb->header.v)); +unlock: + tdb_allrecord_unlock(tdb, F_WRLCK); +} + +int tdb_store(struct tdb_context *tdb, + struct tdb_data key, struct tdb_data dbuf, int flag) +{ + tdb_off_t new_off, off, start, end, room; + uint64_t h; + bool growing = false; + + h = tdb_hash(tdb, key.dptr, key.dsize); + off = find_bucket_and_lock(tdb, &key, h, &start, &end, &room, F_WRLCK); + if (off == TDB_OFF_ERR) + return -1; + + /* Now we have lock on this hash bucket. */ + if (flag == TDB_INSERT) { + if (off) { + tdb->ecode = TDB_ERR_EXISTS; + goto fail; + } + } else { + if (off) { + if (room >= key.dsize + dbuf.dsize) { + new_off = off; + if (update_rec_hdr(tdb, off, + key.dsize, dbuf.dsize, + room, h)) + goto fail; + goto write; + } + /* FIXME: See if right record is free? */ + /* Hint to allocator that we've realloced. */ + growing = true; + } else { + if (flag == TDB_MODIFY) { + /* if the record doesn't exist and we + are in TDB_MODIFY mode then we should fail + the store */ + tdb->ecode = TDB_ERR_NOEXIST; + goto fail; + } + } + } + + /* Allocate a new record. */ + new_off = alloc(tdb, key.dsize, dbuf.dsize, h, growing); + if (new_off == 0) { + unlock_lists(tdb, start, end, F_WRLCK); + /* Expand, then try again... */ + if (tdb_expand(tdb, key.dsize, dbuf.dsize, growing) == -1) + return -1; + return tdb_store(tdb, key, dbuf, flag); + } + + /* We didn't like the existing one: remove it. */ + if (off) { + add_free_record(tdb, off, sizeof(struct tdb_used_record) + + key.dsize + room); + } + +write: + off = tdb->header.v.hash_off + end * sizeof(tdb_off_t); + /* FIXME: Encode extra hash bits! */ + if (tdb_write_off(tdb, off, new_off) == -1) + goto fail; + + off = new_off + sizeof(struct tdb_used_record); + if (tdb->methods->write(tdb, off, key.dptr, key.dsize) == -1) + goto fail; + off += key.dsize; + if (tdb->methods->write(tdb, off, dbuf.dptr, dbuf.dsize) == -1) + goto fail; + + /* FIXME: tdb_increment_seqnum(tdb); */ + unlock_lists(tdb, start, end, F_WRLCK); + + /* By simple trial and error, this roughly approximates a 60% + * full measure. */ + if (unlikely(end - start > 4 * tdb->header.v.hash_bits - 32)) + enlarge_hash(tdb); + + return 0; + +fail: + unlock_lists(tdb, start, end, F_WRLCK); + return -1; +} + +struct tdb_data tdb_fetch(struct tdb_context *tdb, struct tdb_data key) +{ + tdb_off_t off, start, end, room; + uint64_t h; + struct tdb_used_record pad, *r; + struct tdb_data ret; + + h = tdb_hash(tdb, key.dptr, key.dsize); + off = find_bucket_and_lock(tdb, &key, h, &start, &end, &room, F_RDLCK); + if (off == TDB_OFF_ERR) + return tdb_null; + + if (!off) { + unlock_lists(tdb, start, end, F_RDLCK); + tdb->ecode = TDB_SUCCESS; + return tdb_null; + } + + r = tdb_get(tdb, off, &pad, sizeof(*r)); + if (!r) { + unlock_lists(tdb, start, end, F_RDLCK); + return tdb_null; + } + + ret.dsize = rec_data_length(r); + ret.dptr = tdb_alloc_read(tdb, off + sizeof(*r) + key.dsize, + ret.dsize); + unlock_lists(tdb, start, end, F_RDLCK); + return ret; +} + +static int hash_add(struct tdb_context *tdb, uint64_t h, tdb_off_t off) +{ + tdb_off_t i, hoff, len, num; + + i = (h & ((1ULL << tdb->header.v.hash_bits) - 1)); + hoff = tdb->header.v.hash_off + i * sizeof(tdb_off_t); + len = (1ULL << tdb->header.v.hash_bits) - i; + + /* Look for next space. 
+	 */
+	num = tdb_find_zero_off(tdb, hoff, len);
+	if (unlikely(num == len)) {
+		hoff = tdb->header.v.hash_off;
+		len = (1ULL << tdb->header.v.hash_bits);
+		num = tdb_find_zero_off(tdb, hoff, len);
+		/* No free slot in the whole table: it's full. */
+		if (num == len)
+			return -1;
+	}
+	/* FIXME: Encode extra hash bits! */
+	return tdb_write_off(tdb, hoff + num * sizeof(tdb_off_t), off);
+}
+
+static int unlink_used_record(struct tdb_context *tdb, tdb_off_t chain,
+			      uint64_t *extra_locks)
+{
+	tdb_off_t num, len, i, hoff;
+
+	/* FIXME: Maybe lock more in search?  Maybe don't lock if scan
+	 * finds none? */
+again:
+	len = (1ULL << tdb->header.v.hash_bits) - (chain + 1);
+	hoff = tdb->header.v.hash_off + (chain + 1) * sizeof(tdb_off_t);
+	num = tdb_find_zero_off(tdb, hoff, len);
+
+	/* We want to lock the zero entry, too.  In the wrap case,
+	 * this locks one extra.  That's harmless. */
+	num++;
+
+	for (i = chain + 1; i < chain + 1 + num; i++) {
+		if (tdb_lock_list(tdb, i, F_WRLCK, TDB_LOCK_WAIT) == -1) {
+			if (i != chain + 1)
+				unlock_lists(tdb, chain + 1, i-1, F_WRLCK);
+			return -1;
+		}
+	}
+
+	/* The wrap case: we need those locks out of order! */
+	if (unlikely(num == len + 1)) {
+		*extra_locks = tdb_find_zero_off(tdb, tdb->header.v.hash_off,
+						 1ULL << tdb->header.v.hash_bits);
+		(*extra_locks)++;
+		for (i = 0; i < *extra_locks; i++) {
+			if (tdb_lock_list(tdb, i, F_WRLCK, TDB_LOCK_NOWAIT)) {
+				/* Failed.  Caller must lock in order. */
+				if (i)
+					unlock_lists(tdb, 0, i-1, F_WRLCK);
+				unlock_lists(tdb, chain + 1, chain + num,
+					     F_WRLCK);
+				return 1;
+			}
+		}
+		num += *extra_locks;
+	}
+
+	/* Now we have the locks, be certain that offset is still 0! */
+	hoff = tdb->header.v.hash_off
+		+ (((chain + num) * sizeof(tdb_off_t))
+		   & ((1ULL << tdb->header.v.hash_bits) - 1));
+
+	if (unlikely(tdb_read_off(tdb, hoff) != 0)) {
+		unlock_lists(tdb, chain + 1, chain + num, F_WRLCK);
+		goto again;
+	}
+
+	/* OK, all locked.  Unlink first one. */
+	hoff = tdb->header.v.hash_off + chain * sizeof(tdb_off_t);
+	if (tdb_write_off(tdb, hoff, 0) == -1)
+		goto unlock_err;
+
+	/* Rehash the rest. */
+	for (i = 1; i < num; i++) {
+		tdb_off_t off;
+		uint64_t h;
+
+		hoff = tdb->header.v.hash_off
+			+ (((chain + i) * sizeof(tdb_off_t))
+			   & ((1ULL << tdb->header.v.hash_bits) - 1));
+		off = tdb_read_off(tdb, hoff);
+		if (unlikely(off == TDB_OFF_ERR))
+			goto unlock_err;
+
+		/* Maybe use a bit to indicate it is in ideal place? */
+		h = hash_record(tdb, off);
+		/* Is it happy where it is? */
+		if ((h & ((1ULL << tdb->header.v.hash_bits)-1)) == (chain + i))
+			continue;
+
+		/* Remove it. */
+		if (tdb_write_off(tdb, hoff, 0) == -1)
+			goto unlock_err;
+
+		/* Rehash it. */
+		if (hash_add(tdb, h, off) == -1)
+			goto unlock_err;
+	}
+	unlock_lists(tdb, chain + 1, chain + num, F_WRLCK);
+	return 0;
+
+unlock_err:
+	unlock_lists(tdb, chain + 1, chain + num, F_WRLCK);
+	return -1;
+}
+
+int tdb_delete(struct tdb_context *tdb, struct tdb_data key)
+{
+	tdb_off_t off, start, end, room, extra_locks = 0;
+	uint64_t h;
+	int ret;
+
+	h = tdb_hash(tdb, key.dptr, key.dsize);
+	off = find_bucket_and_lock(tdb, &key, h, &start, &end, &room, F_WRLCK);
+	if (off == TDB_OFF_ERR)
+		return -1;
+
+	if (off == 0) {
+		unlock_lists(tdb, start, end, F_WRLCK);
+		tdb->ecode = TDB_ERR_NOEXIST;
+		return -1;
+	}
+
+	ret = unlink_used_record(tdb, end, &extra_locks);
+	if (unlikely(ret == 1)) {
+		unsigned int i;
+
+		unlock_lists(tdb, start, end, F_WRLCK);
+
+		/* We need extra locks at the start.
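+		 * The probe run wrapped past the end of the table, and
+		 * unlink_used_record() may not grab bucket 0 after
+		 * bucket N-1 without risking deadlock, so it backed
+		 * off (returned 1).  Take the low buckets first, in
+		 * order, then redo the whole delete holding them.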
+		 */
+		for (i = 0; i < extra_locks; i++) {
+			if (tdb_lock_list(tdb, i, F_WRLCK, TDB_LOCK_WAIT)) {
+				if (i)
+					unlock_lists(tdb, 0, i-1, F_WRLCK);
+				return -1;
+			}
+		}
+		/* Try again now we're holding more locks. */
+		ret = tdb_delete(tdb, key);
+		unlock_lists(tdb, 0, i, F_WRLCK);
+		return ret;
+	}
+	unlock_lists(tdb, start, end, F_WRLCK);
+	return ret;
+}
+
+int tdb_close(struct tdb_context *tdb)
+{
+	struct tdb_context **i;
+	int ret = 0;
+
+	/* FIXME:
+	if (tdb->transaction) {
+		tdb_transaction_cancel(tdb);
+	}
+	*/
+	tdb_trace(tdb, "tdb_close");
+
+	if (tdb->map_ptr) {
+		if (tdb->flags & TDB_INTERNAL)
+			free(tdb->map_ptr);
+		else
+			tdb_munmap(tdb);
+	}
+	free((char *)tdb->name);
+	if (tdb->fd != -1) {
+		ret = close(tdb->fd);
+		tdb->fd = -1;
+	}
+	free(tdb->lockrecs);
+
+	/* Remove from contexts list */
+	for (i = &tdbs; *i; i = &(*i)->next) {
+		if (*i == tdb) {
+			*i = tdb->next;
+			break;
+		}
+	}
+
+#ifdef TDB_TRACE
+	close(tdb->tracefd);
+#endif
+	free(tdb);
+
+	return ret;
+}
diff --git a/ccan/tdb2/tdb2.h b/ccan/tdb2/tdb2.h
new file mode 100644
index 0000000..48c5ba6
--- /dev/null
+++ b/ccan/tdb2/tdb2.h
@@ -0,0 +1,143 @@
+#ifndef CCAN_TDB2_H
+#define CCAN_TDB2_H
+
+/*
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell 1999-2004
+
+   ** NOTE! The following LGPL license applies to the tdb
+   ** library. This does NOT imply that all of Samba is released
+   ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#ifndef _SAMBA_BUILD_
+/* For mode_t */
+#include <sys/types.h>
+/* For O_* flags. */
+#include <fcntl.h>
+/* For sig_atomic_t. */
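+/* (sig_atomic_t is not used in this header; presumably kept from tdb1,
+ * whose tdb_setalarm_sigptr() took a volatile sig_atomic_t pointer.) */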
+#include <signal.h>
+/* For uint64_t */
+#include <stdint.h>
+#endif
+
+/* flags to tdb_store() */
+#define TDB_REPLACE 1		/* Unused */
+#define TDB_INSERT 2		/* Don't overwrite an existing entry */
+#define TDB_MODIFY 3		/* Don't create a new entry; it must exist */
+
+/* flags for tdb_open() */
+#define TDB_DEFAULT 0 /* just a readability place holder */
+#define TDB_CLEAR_IF_FIRST 1
+#define TDB_INTERNAL 2 /* don't store on disk */
+#define TDB_NOLOCK 4 /* don't do any locking */
+#define TDB_NOMMAP 8 /* don't use mmap */
+#define TDB_CONVERT 16 /* convert endian (internal use) */
+#define TDB_BIGENDIAN 32 /* header is big-endian (internal use) */
+#define TDB_NOSYNC 64 /* don't use synchronous transactions */
+#define TDB_SEQNUM 128 /* maintain a sequence number */
+#define TDB_VOLATILE 256 /* Activate the per-hashchain freelist, default 5 */
+#define TDB_ALLOW_NESTING 512 /* Allow transactions to nest */
+#define TDB_DISALLOW_NESTING 1024 /* Disallow transactions to nest */
+
+/* error codes */
+enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK,
+		TDB_ERR_OOM, TDB_ERR_EXISTS, TDB_ERR_NOLOCK, TDB_ERR_LOCK_TIMEOUT,
+		TDB_ERR_NOEXIST, TDB_ERR_EINVAL, TDB_ERR_RDONLY,
+		TDB_ERR_NESTING};
+
+/* debugging uses one of the following levels */
+enum tdb_debug_level {TDB_DEBUG_FATAL = 0, TDB_DEBUG_ERROR,
+		      TDB_DEBUG_WARNING, TDB_DEBUG_TRACE};
+
+typedef struct tdb_data {
+	unsigned char *dptr;
+	size_t dsize;
+} TDB_DATA;
+
+#ifndef PRINTF_ATTRIBUTE
+#if (__GNUC__ >= 3)
+/** Use gcc attribute to check printf fns.  a1 is the 1-based index of
+ * the parameter containing the format, and a2 the index of the first
+ * argument.  Note that some gcc 2.x versions don't handle this
+ * properly **/
+#define PRINTF_ATTRIBUTE(a1, a2) __attribute__ ((format (__printf__, a1, a2)))
+#else
+#define PRINTF_ATTRIBUTE(a1, a2)
+#endif
+#endif
+
+struct tdb_context;
+
+/* FIXME: Make typesafe */
+typedef void (*tdb_logfn_t)(struct tdb_context *, enum tdb_debug_level, void *priv, const char *, ...) PRINTF_ATTRIBUTE(4, 5);
+typedef uint64_t (*tdb_hashfn_t)(const void *key, size_t len, uint64_t seed,
+				 void *priv);
+
+enum tdb_attribute_type {
+	TDB_ATTRIBUTE_LOG = 0,
+	TDB_ATTRIBUTE_HASH = 1
+};
+
+struct tdb_attribute_base {
+	enum tdb_attribute_type attr;
+	union tdb_attribute *next;
+};
+
+struct tdb_attribute_log {
+	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
+	tdb_logfn_t log_fn;
+	void *log_private;
+};
+
+struct tdb_attribute_hash {
+	struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
+	tdb_hashfn_t hash_fn;
+	void *hash_private;
+};
+
+union tdb_attribute {
+	struct tdb_attribute_base base;
+	struct tdb_attribute_log log;
+	struct tdb_attribute_hash hash;
+};
+
+struct tdb_context *tdb_open(const char *name, int tdb_flags,
+			     int open_flags, mode_t mode,
+			     union tdb_attribute *attributes);
+
+struct tdb_data tdb_fetch(struct tdb_context *tdb, struct tdb_data key);
+int tdb_delete(struct tdb_context *tdb, struct tdb_data key);
+int tdb_store(struct tdb_context *tdb, struct tdb_data key, struct tdb_data dbuf, int flag);
+int tdb_close(struct tdb_context *tdb);
+int tdb_check(struct tdb_context *tdb,
+	      int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
+	      void *private_data);
+
+extern struct tdb_data tdb_null;
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif /* tdb2.h */
diff --git a/ccan/tdb2/test/run-encode.c b/ccan/tdb2/test/run-encode.c
new file mode 100644
index 0000000..a6253fe
--- /dev/null
+++ b/ccan/tdb2/test/run-encode.c
@@ -0,0 +1,40 @@
+#include
+#include
+#include
+#include
+#include
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_used_record rec;
+	struct tdb_context tdb = { .log = null_log_fn, .log_priv = NULL };
+
+	plan_tests(64 + 32 + 48*6);
+
+	/* We should be able to encode any data value. */
+	for (i = 0; i < 64; i++)
+		ok1(set_header(&tdb, &rec, 0, 1ULL << i, 1ULL << i, 0) == 0);
+
+	/* And any key and data with < 64 bits between them. */
+	for (i = 0; i < 32; i++) {
+		tdb_len_t dlen = 1ULL << (63 - i), klen = 1ULL << i;
+		ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen, 0) == 0);
+	}
+
+	/* We should neatly encode all values. */
+	for (i = 0; i < 48; i++) {
+		uint64_t h = 1ULL << (i < 11 ? 63 - i : 63 - 10);
+		uint64_t klen = 1ULL << (i < 16 ? i : 15);
+		uint64_t dlen = 1ULL << i;
+		uint64_t xlen = 1ULL << (i < 32 ? i : 31);
+		ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen + xlen, h)
+		    == 0);
+		ok1(rec_key_length(&rec) == klen);
+		ok1(rec_data_length(&rec) == dlen);
+		ok1(rec_extra_padding(&rec) == xlen);
+		ok1(rec_hash(&rec) == h);
+		ok1(rec_magic(&rec) == TDB_MAGIC);
+	}
+	return exit_status();
+}
diff --git a/ccan/tdb2/test/run-fls.c b/ccan/tdb2/test/run-fls.c
new file mode 100644
index 0000000..4ecc6f8
--- /dev/null
+++ b/ccan/tdb2/test/run-fls.c
@@ -0,0 +1,36 @@
+#include
+#include
+#include
+#include
+#include
+
+static unsigned int dumb_fls(uint64_t num)
+{
+	int i;
+
+	for (i = 63; i >= 0; i--) {
+		if (num & (1ULL << i))
+			break;
+	}
+	return i + 1;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+
+	plan_tests(64 * 64 + 2);
+
+	ok1(fls64(0) == 0);
+	ok1(dumb_fls(0) == 0);
+
+	for (i = 0; i < 64; i++) {
+		for (j = 0; j < 64; j++) {
+			uint64_t val = (1ULL << i) | (1ULL << j);
+			ok(fls64(val) == dumb_fls(val),
+			   "%llu -> %u should be %u", (long long)val,
+			   fls64(val), dumb_fls(val));
+		}
+	}
+	return exit_status();
+}