From: Rusty Russell Date: Fri, 9 Mar 2012 02:55:03 +0000 (+1030) Subject: tdb2: remove: it's now in SAMBA where it belongs. X-Git-Url: http://git.ozlabs.org/?p=ccan;a=commitdiff_plain;h=b91dafc6ab5075355d508f051977a15dd7794b0e tdb2: remove: it's now in SAMBA where it belongs. --- diff --git a/ccan/tdb2/LICENSE b/ccan/tdb2/LICENSE deleted file mode 120000 index 74550445..00000000 --- a/ccan/tdb2/LICENSE +++ /dev/null @@ -1 +0,0 @@ -../../licenses/LGPL-3 \ No newline at end of file diff --git a/ccan/tdb2/_info b/ccan/tdb2/_info deleted file mode 100644 index d26e06ba..00000000 --- a/ccan/tdb2/_info +++ /dev/null @@ -1,95 +0,0 @@ -#include -#include - -/** - * tdb2 - [[WORK IN PROGRESS!]] The trivial (64bit transactional) database - * - * The tdb2 module provides an efficient keyword data mapping (usually - * within a file). It supports transactions, so the contents of the - * database is reliable even across crashes. - * - * Example: - * #include - * #include - * #include - * #include - * - * static void usage(const char *argv0) - * { - * errx(1, "Usage: %s fetch \n" - * "OR %s store ", argv0, argv0); - * } - * - * int main(int argc, char *argv[]) - * { - * struct tdb_context *tdb; - * TDB_DATA key, value; - * enum TDB_ERROR error; - * - * if (argc < 4) - * usage(argv[0]); - * - * tdb = tdb_open(argv[2], TDB_DEFAULT, O_CREAT|O_RDWR,0600, NULL); - * if (!tdb) - * err(1, "Opening %s", argv[2]); - * - * key.dptr = (void *)argv[3]; - * key.dsize = strlen(argv[3]); - * - * if (streq(argv[1], "fetch")) { - * if (argc != 4) - * usage(argv[0]); - * error = tdb_fetch(tdb, key, &value); - * if (error) - * errx(1, "fetch %s: %s", - * argv[3], tdb_errorstr(error)); - * printf("%.*s\n", value.dsize, (char *)value.dptr); - * free(value.dptr); - * } else if (streq(argv[1], "store")) { - * if (argc != 5) - * usage(argv[0]); - * value.dptr = (void *)argv[4]; - * value.dsize = strlen(argv[4]); - * error = tdb_store(tdb, key, value, 0); - * if (error) - * errx(1, "store %s: %s", - * argv[3], tdb_errorstr(error)); - * } else - * usage(argv[0]); - * - * return 0; - * } - * - * Maintainer: Rusty Russell - * - * Author: Rusty Russell - * - * License: LGPL (v3 or any later version) - * - * Ccanlint: - * // valgrind breaks fcntl locks. - * tests_pass_valgrind test/api-83-openhook.c:FAIL - */ -int main(int argc, char *argv[]) -{ - if (argc != 2) - return 1; - - if (strcmp(argv[1], "depends") == 0) { - printf("ccan/asprintf\n"); - printf("ccan/hash\n"); - printf("ccan/likely\n"); - printf("ccan/asearch\n"); - printf("ccan/compiler\n"); - printf("ccan/build_assert\n"); - printf("ccan/ilog\n"); - printf("ccan/failtest\n"); - printf("ccan/tally\n"); - printf("ccan/typesafe_cb\n"); - printf("ccan/cast\n"); - printf("ccan/endian\n"); - return 0; - } - - return 1; -} diff --git a/ccan/tdb2/check.c b/ccan/tdb2/check.c deleted file mode 100644 index ecd6c13c..00000000 --- a/ccan/tdb2/check.c +++ /dev/null @@ -1,870 +0,0 @@ - /* - Trivial Database 2: free list/block handling - Copyright (C) Rusty Russell 2010 - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ -#include "private.h" -#include -#include - -/* We keep an ordered array of offsets. */ -static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off) -{ - tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t)); - if (!new) - return false; - new[(*num)++] = off; - *arr = new; - return true; -} - -static enum TDB_ERROR check_header(struct tdb_context *tdb, tdb_off_t *recovery, - uint64_t *features, size_t *num_capabilities) -{ - uint64_t hash_test; - struct tdb_header hdr; - enum TDB_ERROR ecode; - tdb_off_t off, next; - - ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr)); - if (ecode != TDB_SUCCESS) { - return ecode; - } - /* magic food should not be converted, so convert back. */ - tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food)); - - hash_test = TDB_HASH_MAGIC; - hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test)); - if (hdr.hash_test != hash_test) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "check: hash test %llu should be %llu", - (long long)hdr.hash_test, - (long long)hash_test); - } - - if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "check: bad magic '%.*s'", - (unsigned)sizeof(hdr.magic_food), - hdr.magic_food); - } - - /* Features which are used must be a subset of features offered. */ - if (hdr.features_used & ~hdr.features_offered) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "check: features used (0x%llx) which" - " are not offered (0x%llx)", - (long long)hdr.features_used, - (long long)hdr.features_offered); - } - - *features = hdr.features_offered; - *recovery = hdr.recovery; - if (*recovery) { - if (*recovery < sizeof(hdr) - || *recovery > tdb->file->map_size) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check:" - " invalid recovery offset %zu", - (size_t)*recovery); - } - } - - for (off = hdr.capabilities; off && ecode == TDB_SUCCESS; off = next) { - const struct tdb_capability *cap; - enum TDB_ERROR err; - - cap = tdb_access_read(tdb, off, sizeof(*cap), true); - if (TDB_PTR_IS_ERR(cap)) { - return TDB_PTR_ERR(cap); - } - - /* All capabilities are unknown. */ - err = unknown_capability(tdb, "tdb_check", cap->type); - next = cap->next; - tdb_access_release(tdb, cap); - if (err) - return err; - (*num_capabilities)++; - } - - /* Don't check reserved: they *can* be used later. 
*/ - return TDB_SUCCESS; -} - -static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb, - tdb_off_t off, unsigned int group_bits, - uint64_t hprefix, - unsigned hprefix_bits, - tdb_off_t used[], - size_t num_used, - size_t *num_found, - enum TDB_ERROR (*check)(TDB_DATA, - TDB_DATA, void *), - void *data); - -static enum TDB_ERROR check_hash_chain(struct tdb_context *tdb, - tdb_off_t off, - uint64_t hash, - tdb_off_t used[], - size_t num_used, - size_t *num_found, - enum TDB_ERROR (*check)(TDB_DATA, - TDB_DATA, - void *), - void *data) -{ - struct tdb_used_record rec; - enum TDB_ERROR ecode; - - ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec)); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - if (rec_magic(&rec) != TDB_CHAIN_MAGIC) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check: Bad hash chain magic %llu", - (long long)rec_magic(&rec)); - } - - if (rec_data_length(&rec) != sizeof(struct tdb_chain)) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check:" - " Bad hash chain length %llu vs %zu", - (long long)rec_data_length(&rec), - sizeof(struct tdb_chain)); - } - if (rec_key_length(&rec) != 0) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check: Bad hash chain key length %llu", - (long long)rec_key_length(&rec)); - } - if (rec_hash(&rec) != 0) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check: Bad hash chain hash value %llu", - (long long)rec_hash(&rec)); - } - - off += sizeof(rec); - ecode = check_hash_tree(tdb, off, 0, hash, 64, - used, num_used, num_found, check, data); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next)); - if (TDB_OFF_IS_ERR(off)) { - return TDB_OFF_TO_ERR(off); - } - if (off == 0) - return TDB_SUCCESS; - (*num_found)++; - return check_hash_chain(tdb, off, hash, used, num_used, num_found, - check, data); -} - -static enum TDB_ERROR check_hash_record(struct tdb_context *tdb, - tdb_off_t off, - uint64_t hprefix, - unsigned hprefix_bits, - tdb_off_t used[], - size_t num_used, - size_t *num_found, - enum TDB_ERROR (*check)(TDB_DATA, - TDB_DATA, - void *), - void *data) -{ - struct tdb_used_record rec; - enum TDB_ERROR ecode; - - if (hprefix_bits >= 64) - return check_hash_chain(tdb, off, hprefix, used, num_used, - num_found, check, data); - - ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec)); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - if (rec_magic(&rec) != TDB_HTABLE_MAGIC) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check: Bad hash table magic %llu", - (long long)rec_magic(&rec)); - } - if (rec_data_length(&rec) - != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check:" - " Bad hash table length %llu vs %llu", - (long long)rec_data_length(&rec), - (long long)sizeof(tdb_off_t) - << TDB_SUBLEVEL_HASH_BITS); - } - if (rec_key_length(&rec) != 0) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check: Bad hash table key length %llu", - (long long)rec_key_length(&rec)); - } - if (rec_hash(&rec) != 0) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check: Bad hash table hash value %llu", - (long long)rec_hash(&rec)); - } - - off += sizeof(rec); - return check_hash_tree(tdb, off, - TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS, - hprefix, hprefix_bits, - used, num_used, num_found, check, data); -} - -static int off_cmp(const tdb_off_t *a, const tdb_off_t *b) -{ - /* Can overflow an 
int. */ - return *a > *b ? 1 - : *a < *b ? -1 - : 0; -} - -static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used) -{ - *used += num; - - return (h >> (64 - *used)) & ((1U << num) - 1); -} - -static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb, - tdb_off_t off, unsigned int group_bits, - uint64_t hprefix, - unsigned hprefix_bits, - tdb_off_t used[], - size_t num_used, - size_t *num_found, - enum TDB_ERROR (*check)(TDB_DATA, - TDB_DATA, void *), - void *data) -{ - unsigned int g, b; - const tdb_off_t *hash; - struct tdb_used_record rec; - enum TDB_ERROR ecode; - - hash = tdb_access_read(tdb, off, - sizeof(tdb_off_t) - << (group_bits + TDB_HASH_GROUP_BITS), - true); - if (TDB_PTR_IS_ERR(hash)) { - return TDB_PTR_ERR(hash); - } - - for (g = 0; g < (1 << group_bits); g++) { - const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS); - for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) { - unsigned int bucket, i, used_bits; - uint64_t h; - tdb_off_t *p; - if (group[b] == 0) - continue; - - off = group[b] & TDB_OFF_MASK; - p = asearch(&off, used, num_used, off_cmp); - if (!p) { - ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "tdb_check: Invalid offset" - " %llu in hash", - (long long)off); - goto fail; - } - /* Mark it invalid. */ - *p ^= 1; - (*num_found)++; - - if (hprefix_bits == 64) { - /* Chained entries are unordered. */ - if (is_subhash(group[b])) { - ecode = TDB_ERR_CORRUPT; - tdb_logerr(tdb, ecode, - TDB_LOG_ERROR, - "tdb_check: Invalid chain" - " entry subhash"); - goto fail; - } - h = hash_record(tdb, off); - if (h != hprefix) { - ecode = TDB_ERR_CORRUPT; - tdb_logerr(tdb, ecode, - TDB_LOG_ERROR, - "check: bad hash chain" - " placement" - " 0x%llx vs 0x%llx", - (long long)h, - (long long)hprefix); - goto fail; - } - ecode = tdb_read_convert(tdb, off, &rec, - sizeof(rec)); - if (ecode != TDB_SUCCESS) { - goto fail; - } - goto check; - } - - if (is_subhash(group[b])) { - uint64_t subprefix; - subprefix = (hprefix - << (group_bits + TDB_HASH_GROUP_BITS)) - + g * (1 << TDB_HASH_GROUP_BITS) + b; - - ecode = check_hash_record(tdb, - group[b] & TDB_OFF_MASK, - subprefix, - hprefix_bits - + group_bits - + TDB_HASH_GROUP_BITS, - used, num_used, num_found, - check, data); - if (ecode != TDB_SUCCESS) { - goto fail; - } - continue; - } - /* A normal entry */ - - /* Does it belong here at all? */ - h = hash_record(tdb, off); - used_bits = 0; - if (get_bits(h, hprefix_bits, &used_bits) != hprefix - && hprefix_bits) { - ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "check: bad hash placement" - " 0x%llx vs 0x%llx", - (long long)h, - (long long)hprefix); - goto fail; - } - - /* Does it belong in this group? */ - if (get_bits(h, group_bits, &used_bits) != g) { - ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "check: bad group %llu" - " vs %u", - (long long)h, g); - goto fail; - } - - /* Are bucket bits correct? */ - bucket = group[b] & TDB_OFF_HASH_GROUP_MASK; - if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits) - != bucket) { - used_bits -= TDB_HASH_GROUP_BITS; - ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "check: bad bucket %u vs %u", - (unsigned)get_bits(h, - TDB_HASH_GROUP_BITS, - &used_bits), - bucket); - goto fail; - } - - /* There must not be any zero entries between - * the bucket it belongs in and this one! 
*/ - for (i = bucket; - i != b; - i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) { - if (group[i] == 0) { - ecode = TDB_ERR_CORRUPT; - tdb_logerr(tdb, ecode, - TDB_LOG_ERROR, - "check: bad group placement" - " %u vs %u", - b, bucket); - goto fail; - } - } - - ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec)); - if (ecode != TDB_SUCCESS) { - goto fail; - } - - /* Bottom bits must match header. */ - if ((h & ((1 << 11)-1)) != rec_hash(&rec)) { - ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "tdb_check: Bad hash magic" - " at offset %llu" - " (0x%llx vs 0x%llx)", - (long long)off, - (long long)h, - (long long)rec_hash(&rec)); - goto fail; - } - - check: - if (check) { - TDB_DATA k, d; - const unsigned char *kptr; - - kptr = tdb_access_read(tdb, - off + sizeof(rec), - rec_key_length(&rec) - + rec_data_length(&rec), - false); - if (TDB_PTR_IS_ERR(kptr)) { - ecode = TDB_PTR_ERR(kptr); - goto fail; - } - - k = tdb_mkdata(kptr, rec_key_length(&rec)); - d = tdb_mkdata(kptr + k.dsize, - rec_data_length(&rec)); - ecode = check(k, d, data); - tdb_access_release(tdb, kptr); - if (ecode != TDB_SUCCESS) { - goto fail; - } - } - } - } - tdb_access_release(tdb, hash); - return TDB_SUCCESS; - -fail: - tdb_access_release(tdb, hash); - return ecode; -} - -static enum TDB_ERROR check_hash(struct tdb_context *tdb, - tdb_off_t used[], - size_t num_used, size_t num_other_used, - enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA, void *), - void *data) -{ - /* Free tables and capabilities also show up as used. */ - size_t num_found = num_other_used; - enum TDB_ERROR ecode; - - ecode = check_hash_tree(tdb, offsetof(struct tdb_header, hashtable), - TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS, - 0, 0, used, num_used, &num_found, - check, data); - if (ecode == TDB_SUCCESS) { - if (num_found != num_used) { - ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check: Not all entries" - " are in hash"); - } - } - return ecode; -} - -static enum TDB_ERROR check_free(struct tdb_context *tdb, - tdb_off_t off, - const struct tdb_free_record *frec, - tdb_off_t prev, unsigned int ftable, - unsigned int bucket) -{ - enum TDB_ERROR ecode; - - if (frec_magic(frec) != TDB_FREE_MAGIC) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check: offset %llu bad magic 0x%llx", - (long long)off, - (long long)frec->magic_and_prev); - } - if (frec_ftable(frec) != ftable) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check: offset %llu bad freetable %u", - (long long)off, frec_ftable(frec)); - - } - - ecode = tdb->tdb2.io->oob(tdb, off, - frec_len(frec) - + sizeof(struct tdb_used_record), - false); - if (ecode != TDB_SUCCESS) { - return ecode; - } - if (size_to_bucket(frec_len(frec)) != bucket) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check: offset %llu in wrong bucket" - " (%u vs %u)", - (long long)off, - bucket, size_to_bucket(frec_len(frec))); - } - if (prev && prev != frec_prev(frec)) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check: offset %llu bad prev" - " (%llu vs %llu)", - (long long)off, - (long long)prev, (long long)frec_len(frec)); - } - return TDB_SUCCESS; -} - -static enum TDB_ERROR check_free_table(struct tdb_context *tdb, - tdb_off_t ftable_off, - unsigned ftable_num, - tdb_off_t fr[], - size_t num_free, - size_t *num_found) -{ - struct tdb_freetable ft; - tdb_off_t h; - unsigned int i; - enum TDB_ERROR ecode; - - ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft)); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - 
if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC - || rec_key_length(&ft.hdr) != 0 - || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr) - || rec_hash(&ft.hdr) != 0) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check: Invalid header on free table"); - } - - for (i = 0; i < TDB_FREE_BUCKETS; i++) { - tdb_off_t off, prev = 0, *p, first = 0; - struct tdb_free_record f; - - h = bucket_off(ftable_off, i); - for (off = tdb_read_off(tdb, h); off; off = f.next) { - if (TDB_OFF_IS_ERR(off)) { - return TDB_OFF_TO_ERR(off); - } - if (!first) { - off &= TDB_OFF_MASK; - first = off; - } - ecode = tdb_read_convert(tdb, off, &f, sizeof(f)); - if (ecode != TDB_SUCCESS) { - return ecode; - } - ecode = check_free(tdb, off, &f, prev, ftable_num, i); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - /* FIXME: Check hash bits */ - p = asearch(&off, fr, num_free, off_cmp); - if (!p) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "tdb_check: Invalid offset" - " %llu in free table", - (long long)off); - } - /* Mark it invalid. */ - *p ^= 1; - (*num_found)++; - prev = off; - } - - if (first) { - /* Now we can check first back pointer. */ - ecode = tdb_read_convert(tdb, first, &f, sizeof(f)); - if (ecode != TDB_SUCCESS) { - return ecode; - } - ecode = check_free(tdb, first, &f, prev, ftable_num, i); - if (ecode != TDB_SUCCESS) { - return ecode; - } - } - } - return TDB_SUCCESS; -} - -/* Slow, but should be very rare. */ -tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off) -{ - size_t len; - enum TDB_ERROR ecode; - - for (len = 0; off + len < tdb->file->map_size; len++) { - char c; - ecode = tdb->tdb2.io->tread(tdb, off, &c, 1); - if (ecode != TDB_SUCCESS) { - return TDB_ERR_TO_OFF(ecode); - } - if (c != 0 && c != 0x43) - break; - } - return len; -} - -static enum TDB_ERROR check_linear(struct tdb_context *tdb, - tdb_off_t **used, size_t *num_used, - tdb_off_t **fr, size_t *num_free, - uint64_t features, tdb_off_t recovery) -{ - tdb_off_t off; - tdb_len_t len; - enum TDB_ERROR ecode; - bool found_recovery = false; - - for (off = sizeof(struct tdb_header); - off < tdb->file->map_size; - off += len) { - union { - struct tdb_used_record u; - struct tdb_free_record f; - struct tdb_recovery_record r; - } rec; - /* r is larger: only get that if we need to. */ - ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f)); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - /* If we crash after ftruncate, we can get zeroes or fill. 
*/ - if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC - || rec.r.magic == 0x4343434343434343ULL) { - ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r)); - if (ecode != TDB_SUCCESS) { - return ecode; - } - if (recovery == off) { - found_recovery = true; - len = sizeof(rec.r) + rec.r.max_len; - } else { - len = dead_space(tdb, off); - if (TDB_OFF_IS_ERR(len)) { - return TDB_OFF_TO_ERR(len); - } - if (len < sizeof(rec.r)) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "tdb_check: invalid" - " dead space at %zu", - (size_t)off); - } - - tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING, - "Dead space at %zu-%zu (of %zu)", - (size_t)off, (size_t)(off + len), - (size_t)tdb->file->map_size); - } - } else if (rec.r.magic == TDB_RECOVERY_MAGIC) { - ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r)); - if (ecode != TDB_SUCCESS) { - return ecode; - } - if (recovery != off) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "tdb_check: unexpected" - " recovery record at offset" - " %zu", - (size_t)off); - } - if (rec.r.len > rec.r.max_len) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "tdb_check: invalid recovery" - " length %zu", - (size_t)rec.r.len); - } - if (rec.r.eof > tdb->file->map_size) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "tdb_check: invalid old EOF" - " %zu", (size_t)rec.r.eof); - } - found_recovery = true; - len = sizeof(rec.r) + rec.r.max_len; - } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) { - len = sizeof(rec.u) + frec_len(&rec.f); - if (off + len > tdb->file->map_size) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "tdb_check: free overlength" - " %llu at offset %llu", - (long long)len, - (long long)off); - } - /* This record should be in free lists. */ - if (frec_ftable(&rec.f) != TDB_FTABLE_NONE - && !append(fr, num_free, off)) { - return tdb_logerr(tdb, TDB_ERR_OOM, - TDB_LOG_ERROR, - "tdb_check: tracking %zu'th" - " free record.", *num_free); - } - } else if (rec_magic(&rec.u) == TDB_USED_MAGIC - || rec_magic(&rec.u) == TDB_CHAIN_MAGIC - || rec_magic(&rec.u) == TDB_HTABLE_MAGIC - || rec_magic(&rec.u) == TDB_FTABLE_MAGIC - || rec_magic(&rec.u) == TDB_CAP_MAGIC) { - uint64_t klen, dlen, extra; - - /* This record is used! */ - if (!append(used, num_used, off)) { - return tdb_logerr(tdb, TDB_ERR_OOM, - TDB_LOG_ERROR, - "tdb_check: tracking %zu'th" - " used record.", *num_used); - } - - klen = rec_key_length(&rec.u); - dlen = rec_data_length(&rec.u); - extra = rec_extra_padding(&rec.u); - - len = sizeof(rec.u) + klen + dlen + extra; - if (off + len > tdb->file->map_size) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "tdb_check: used overlength" - " %llu at offset %llu", - (long long)len, - (long long)off); - } - - if (len < sizeof(rec.f)) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "tdb_check: too short record" - " %llu at %llu", - (long long)len, - (long long)off); - } - - /* Check that records have correct 0 at end (but may - * not in future). 
*/ - if (extra && !features - && rec_magic(&rec.u) != TDB_CAP_MAGIC) { - const char *p; - char c; - p = tdb_access_read(tdb, off + sizeof(rec.u) - + klen + dlen, 1, false); - if (TDB_PTR_IS_ERR(p)) - return TDB_PTR_ERR(p); - c = *p; - tdb_access_release(tdb, p); - - if (c != '\0') { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "tdb_check:" - " non-zero extra" - " at %llu", - (long long)off); - } - } - } else { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "tdb_check: Bad magic 0x%llx" - " at offset %zu", - (long long)rec_magic(&rec.u), - (size_t)off); - } - } - - /* We must have found recovery area if there was one. */ - if (recovery != 0 && !found_recovery) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check: expected a recovery area at %zu", - (size_t)recovery); - } - - return TDB_SUCCESS; -} - -enum TDB_ERROR tdb_check_(struct tdb_context *tdb, - enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA, void *), - void *data) -{ - tdb_off_t *fr = NULL, *used = NULL, ft, recovery; - size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0, - num_capabilities = 0; - uint64_t features; - enum TDB_ERROR ecode; - - if (tdb->flags & TDB_CANT_CHECK) { - return tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING, - "tdb_check: database has unknown capability," - " cannot check."); - } - - if (tdb->flags & TDB_VERSION1) { - if (tdb1_check(tdb, check, data) == -1) - return tdb->last_error; - return TDB_SUCCESS; - } - - ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false); - if (ecode != TDB_SUCCESS) { - return tdb->last_error = ecode; - } - - ecode = tdb_lock_expand(tdb, F_RDLCK); - if (ecode != TDB_SUCCESS) { - tdb_allrecord_unlock(tdb, F_RDLCK); - return tdb->last_error = ecode; - } - - ecode = check_header(tdb, &recovery, &features, &num_capabilities); - if (ecode != TDB_SUCCESS) - goto out; - - /* First we do a linear scan, checking all records. */ - ecode = check_linear(tdb, &used, &num_used, &fr, &num_free, features, - recovery); - if (ecode != TDB_SUCCESS) - goto out; - - for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) { - if (TDB_OFF_IS_ERR(ft)) { - ecode = TDB_OFF_TO_ERR(ft); - goto out; - } - ecode = check_free_table(tdb, ft, num_ftables, fr, num_free, - &num_found); - if (ecode != TDB_SUCCESS) - goto out; - num_ftables++; - } - - /* FIXME: Check key uniqueness? */ - ecode = check_hash(tdb, used, num_used, num_ftables + num_capabilities, - check, data); - if (ecode != TDB_SUCCESS) - goto out; - - if (num_found != num_free) { - ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_check: Not all entries are in" - " free table"); - } - -out: - tdb_allrecord_unlock(tdb, F_RDLCK); - tdb_unlock_expand(tdb, F_RDLCK); - free(fr); - free(used); - return tdb->last_error = ecode; -} diff --git a/ccan/tdb2/doc/TDB1_porting.txt b/ccan/tdb2/doc/TDB1_porting.txt deleted file mode 100644 index ef305cab..00000000 --- a/ccan/tdb2/doc/TDB1_porting.txt +++ /dev/null @@ -1,72 +0,0 @@ -Interface differences between TDB1 and TDB2. - -- tdb2 uses 'struct tdb_data', tdb1 uses 'struct TDB_DATA'. Use the - TDB_DATA typedef if you want portability between the two. - -- tdb2 functions return 0 on success, and a negative error on failure, - whereas tdb1 functions returned 0 on success, and -1 on failure. - tdb1 then used tdb_error() to determine the error; this is also - supported in tdb2 to ease backwards compatibility, though the other - form is preferred. 
- -- tdb2's tdb_fetch() returns an error, tdb1's returned the data directly - (or tdb_null, and you were supposed to check tdb_error() to find out why). - -- tdb2's tdb_nextkey() frees the old key's dptr, in tdb2 you needed to do - this manually. - -- tdb1's tdb_open/tdb_open_ex took an explicit hash size. tdb2's hash table - resizes as required. - -- tdb2 uses a linked list of attribute structures to implement logging and - alternate hashes. tdb1 used tdb_open_ex, which was not extensible. - -- tdb2 does locking on read-only databases (ie. O_RDONLY passed to tdb_open). - tdb1 did not: use the TDB_NOLOCK flag if you want to suppress locking. - -- tdb2's log function is simpler than tdb1's log function. The string is - already formatted, and it takes an enum tdb_log_level not a tdb_debug_level, - and which has only three values: TDB_LOG_ERROR, TDB_LOG_USE_ERROR and - TDB_LOG_WARNING. - -- tdb2 provides tdb_deq() for comparing two struct tdb_data. - -- tdb2's tdb_name() returns a copy of the name even for TDB_INTERNAL dbs. - -- tdb2 does not need tdb_reopen() or tdb_reopen_all(). If you call - fork() after during certain operations the child should close the - tdb, or complete the operations before continuing to use the tdb: - - tdb_transaction_start(): child must tdb_transaction_cancel() - tdb_lockall(): child must call tdb_unlockall() - tdb_lockall_read(): child must call tdb_unlockall_read() - tdb_chainlock(): child must call tdb_chainunlock() - tdb_parse() callback: child must return from tdb_parse() - -- tdb2 will not open a non-tdb file, even if O_CREAT is specified. - -- There is no tdb_traverse_read. For operating on TDB1 files, you can - simulate it by tdb_add_flag(tdb, TDB_RDONLY); tdb_traverse(); - tdb_remove_flag(tdb, TDB_RDONLY). This may be desirable because - traverse on TDB1 files use a write lock on the entire database - unless it's read-only. - -- Failure inside a transaction (such as a lock function failing) does - not implicitly cancel the transaction; you still need to call - tdb_transaction_cancel(). - -TDB1 Compatibility: - -- tdb2's offers a tdb1_incompatible_hash function, which is the same - as the default hash with the TDB_INCOMPATIBLE_HASH flag. There is - no way of marking an old TDB incompatible with versions < 1.2.6 - while using any other hash. - -- The TDB_ATTRIBUTE_TDB1_HASHSIZE attribute can be used to control the - hash size, but only when creating (ie. O_CREAT) a TDB1 - (ie. TDB_VERSION1). - -- There is no TDB_CLEAR_IF_FIRST flag; it has severe scalability and - API problems. If necessary, you can emulate this by using the open - hook and placing a 1-byte lock at offset 4. If your program forks, - you will need to place this lock again in the child. diff --git a/ccan/tdb2/doc/design-1.3.txt b/ccan/tdb2/doc/design-1.3.txt deleted file mode 100644 index 651ada08..00000000 --- a/ccan/tdb2/doc/design-1.3.txt +++ /dev/null @@ -1,1050 +0,0 @@ -TDB2: A Redesigning The Trivial DataBase - -Rusty Russell, IBM Corporation - -27-April-2010 - -Abstract - -The Trivial DataBase on-disk format is 32 bits; with usage cases -heading towards the 4G limit, that must change. This required -breakage provides an opportunity to revisit TDB's other design -decisions and reassess them. - -1 Introduction - -The Trivial DataBase was originally written by Andrew Tridgell as -a simple key/data pair storage system with the same API as dbm, -but allowing multiple readers and writers while being small -enough (< 1000 lines of C) to include in SAMBA. 
The simple design -created in 1999 has proven surprisingly robust and performant, -used in Samba versions 3 and 4 as well as numerous other -projects. Its useful life was greatly increased by the -(backwards-compatible!) addition of transaction support in 2005. - -The wider variety and greater demands of TDB-using code has lead -to some organic growth of the API, as well as some compromises on -the implementation. None of these, by themselves, are seen as -show-stoppers, but the cumulative effect is to a loss of elegance -over the initial, simple TDB implementation. Here is a table of -the approximate number of lines of implementation code and number -of API functions at the end of each year: - - -+-----------+----------------+--------------------------------+ -| Year End | API Functions | Lines of C Code Implementation | -+-----------+----------------+--------------------------------+ -+-----------+----------------+--------------------------------+ -| 1999 | 13 | 1195 | -+-----------+----------------+--------------------------------+ -| 2000 | 24 | 1725 | -+-----------+----------------+--------------------------------+ -| 2001 | 32 | 2228 | -+-----------+----------------+--------------------------------+ -| 2002 | 35 | 2481 | -+-----------+----------------+--------------------------------+ -| 2003 | 35 | 2552 | -+-----------+----------------+--------------------------------+ -| 2004 | 40 | 2584 | -+-----------+----------------+--------------------------------+ -| 2005 | 38 | 2647 | -+-----------+----------------+--------------------------------+ -| 2006 | 52 | 3754 | -+-----------+----------------+--------------------------------+ -| 2007 | 66 | 4398 | -+-----------+----------------+--------------------------------+ -| 2008 | 71 | 4768 | -+-----------+----------------+--------------------------------+ -| 2009 | 73 | 5715 | -+-----------+----------------+--------------------------------+ - - -This review is an attempt to catalog and address all the known -issues with TDB and create solutions which address the problems -without significantly increasing complexity; all involved are far -too aware of the dangers of second system syndrome in rewriting a -successful project like this. - -2 API Issues - -2.1 tdb_open_ex Is Not Expandable - -The tdb_open() call was expanded to tdb_open_ex(), which added an -optional hashing function and an optional logging function -argument. Additional arguments to open would require the -introduction of a tdb_open_ex2 call etc. - -2.1.1 Proposed Solution - -tdb_open() will take a linked-list of attributes: - -enum tdb_attribute { - - TDB_ATTRIBUTE_LOG = 0, - - TDB_ATTRIBUTE_HASH = 1 - -}; - -struct tdb_attribute_base { - - enum tdb_attribute attr; - - union tdb_attribute *next; - -}; - -struct tdb_attribute_log { - - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG -*/ - - tdb_log_func log_fn; - - void *log_private; - -}; - -struct tdb_attribute_hash { - - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH -*/ - - tdb_hash_func hash_fn; - - void *hash_private; - -}; - -union tdb_attribute { - - struct tdb_attribute_base base; - - struct tdb_attribute_log log; - - struct tdb_attribute_hash hash; - -}; - -This allows future attributes to be added, even if this expands -the size of the union. 
- -2.2 tdb_traverse Makes Impossible Guarantees - -tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, -and it was thought that it was important to guarantee that all -records which exist at the start and end of the traversal would -be included, and no record would be included twice. - -This adds complexity (see[Reliable-Traversal-Adds]) and does not -work anyway for records which are altered (in particular, those -which are expanded may be effectively deleted and re-added behind -the traversal). - -2.2.1 Proposed Solution - -Abandon the guarantee. You will see every record if no changes -occur during your traversal, otherwise you will see some subset. -You can prevent changes by using a transaction or the locking -API. - -2.3 Nesting of Transactions Is Fraught - -TDB has alternated between allowing nested transactions and not -allowing them. Various paths in the Samba codebase assume that -transactions will nest, and in a sense they can: the operation is -only committed to disk when the outer transaction is committed. -There are two problems, however: - -1. Canceling the inner transaction will cause the outer - transaction commit to fail, and will not undo any operations - since the inner transaction began. This problem is soluble with - some additional internal code. - -2. An inner transaction commit can be cancelled by the outer - transaction. This is desirable in the way which Samba's - database initialization code uses transactions, but could be a - surprise to any users expecting a successful transaction commit - to expose changes to others. - -The current solution is to specify the behavior at tdb_open(), -with the default currently that nested transactions are allowed. -This flag can also be changed at runtime. - -2.3.1 Proposed Solution - -Given the usage patterns, it seems that the “least-surprise” -behavior of disallowing nested transactions should become the -default. Additionally, it seems the outer transaction is the only -code which knows whether inner transactions should be allowed, so -a flag to indicate this could be added to tdb_transaction_start. -However, this behavior can be simulated with a wrapper which uses -tdb_add_flags() and tdb_remove_flags(), so the API should not be -expanded for this relatively-obscure case. - -2.4 Incorrect Hash Function is Not Detected - -tdb_open_ex() allows the calling code to specify a different hash -function to use, but does not check that all other processes -accessing this tdb are using the same hash function. The result -is that records are missing from tdb_fetch(). - -2.4.1 Proposed Solution - -The header should contain an example hash result (eg. the hash of -0xdeadbeef), and tdb_open_ex() should check that the given hash -function produces the same answer, or fail the tdb_open call. - -2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation - -In response to scalability issues with the free list ([TDB-Freelist-Is] -) two API workarounds have been incorporated in TDB: -tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The -latter actually calls the former with an argument of “5”. - -This code allows deleted records to accumulate without putting -them in the free list. On delete we iterate through each chain -and free them in a batch if there are more than max_dead entries. -These are never otherwise recycled except as a side-effect of a -tdb_repack. - -2.5.1 Proposed Solution - -With the scalability problems of the freelist solved, this API -can be removed. 
The TDB_VOLATILE flag may still be useful as a -hint that store and delete of records will be at least as common -as fetch in order to allow some internal tuning, but initially -will become a no-op. - -2.6 TDB Files Cannot Be Opened Multiple Times - In The Same Process - -No process can open the same TDB twice; we check and disallow it. -This is an unfortunate side-effect of fcntl locks, which operate -on a per-file rather than per-file-descriptor basis, and do not -nest. Thus, closing any file descriptor on a file clears all the -locks obtained by this process, even if they were placed using a -different file descriptor! - -Note that even if this were solved, deadlock could occur if -operations were nested: this is a more manageable programming -error in most cases. - -2.6.1 Proposed Solution - -We could lobby POSIX to fix the perverse rules, or at least lobby -Linux to violate them so that the most common implementation does -not have this restriction. This would be a generally good idea -for other fcntl lock users. - -Samba uses a wrapper which hands out the same tdb_context to -multiple callers if this happens, and does simple reference -counting. We should do this inside the tdb library, which already -emulates lock nesting internally; it would need to recognize when -deadlock occurs within a single process. This would create a new -failure mode for tdb operations (while we currently handle -locking failures, they are impossible in normal use and a process -encountering them can do little but give up). - -I do not see benefit in an additional tdb_open flag to indicate -whether re-opening is allowed, as though there may be some -benefit to adding a call to detect when a tdb_context is shared, -to allow other to create such an API. - -2.7 TDB API Is Not POSIX Thread-safe - -The TDB API uses an error code which can be queried after an -operation to determine what went wrong. This programming model -does not work with threads, unless specific additional guarantees -are given by the implementation. In addition, even -otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot] -). - -2.7.1 Proposed Solution - -Reachitecting the API to include a tdb_errcode pointer would be a -great deal of churn; we are better to guarantee that the -tdb_errcode is per-thread so the current programming model can be -maintained. - -This requires dynamic per-thread allocations, which is awkward -with POSIX threads (pthread_key_create space is limited and we -cannot simply allocate a key for every TDB). - -Internal locking is required to make sure that fcntl locks do not -overlap between threads, and also that the global list of tdbs is -maintained. - -The aim is that building tdb with -DTDB_PTHREAD will result in a -pthread-safe version of the library, and otherwise no overhead -will exist. - -2.8 *_nonblock Functions And *_mark Functions Expose - Implementation - -CTDB[footnote: -Clustered TDB, see http://ctdb.samba.org -] wishes to operate on TDB in a non-blocking manner. This is -currently done as follows: - -1. Call the _nonblock variant of an API function (eg. - tdb_lockall_nonblock). If this fails: - -2. Fork a child process, and wait for it to call the normal - variant (eg. tdb_lockall). - -3. If the child succeeds, call the _mark variant to indicate we - already have the locks (eg. tdb_lockall_mark). - -4. Upon completion, tell the child to release the locks (eg. - tdb_unlockall). - -5. Indicate to tdb that it should consider the locks removed (eg. - tdb_unlockall_mark). 
- -There are several issues with this approach. Firstly, adding two -new variants of each function clutters the API for an obscure -use, and so not all functions have three variants. Secondly, it -assumes that all paths of the functions ask for the same locks, -otherwise the parent process will have to get a lock which the -child doesn't have under some circumstances. I don't believe this -is currently the case, but it constrains the implementation. - -2.8.1 Proposed Solution - -Implement a hook for locking methods, so that the caller can -control the calls to create and remove fcntl locks. In this -scenario, ctdbd would operate as follows: - -1. Call the normal API function, eg tdb_lockall(). - -2. When the lock callback comes in, check if the child has the - lock. Initially, this is always false. If so, return 0. - Otherwise, try to obtain it in non-blocking mode. If that - fails, return EWOULDBLOCK. - -3. Release locks in the unlock callback as normal. - -4. If tdb_lockall() fails, see if we recorded a lock failure; if - so, call the child to repeat the operation. - -5. The child records what locks it obtains, and returns that - information to the parent. - -6. When the child has succeeded, goto 1. - -This is flexible enough to handle any potential locking scenario, -even when lock requirements change. It can be optimized so that -the parent does not release locks, just tells the child which -locks it doesn't need to obtain. - -It also keeps the complexity out of the API, and in ctdbd where -it is needed. - -2.9 tdb_chainlock Functions Expose Implementation - -tdb_chainlock locks some number of records, including the record -indicated by the given key. This gave atomicity guarantees; -no-one can start a transaction, alter, read or delete that key -while the lock is held. - -It also makes the same guarantee for any other key in the chain, -which is an internal implementation detail and potentially a -cause for deadlock. - -2.9.1 Proposed Solution - -None. It would be nice to have an explicit single entry lock -which effected no other keys. Unfortunately, this won't work for -an entry which doesn't exist. Thus while chainlock may be -implemented more efficiently for the existing case, it will still -have overlap issues with the non-existing case. So it is best to -keep the current (lack of) guarantee about which records will be -effected to avoid constraining our implementation. - -2.10 Signal Handling is Not Race-Free - -The tdb_setalarm_sigptr() call allows the caller's signal handler -to indicate that the tdb locking code should return with a -failure, rather than trying again when a signal is received (and -errno == EAGAIN). This is usually used to implement timeouts. - -Unfortunately, this does not work in the case where the signal is -received before the tdb code enters the fcntl() call to place the -lock: the code will sleep within the fcntl() code, unaware that -the signal wants it to exit. In the case of long timeouts, this -does not happen in practice. - -2.10.1 Proposed Solution - -The locking hooks proposed in[Proposed-Solution-locking-hook] -would allow the user to decide on whether to fail the lock -acquisition on a signal. This allows the caller to choose their -own compromise: they could narrow the race by checking -immediately before the fcntl call.[footnote: -It may be possible to make this race-free in some implementations -by having the signal handler alter the struct flock to make it -invalid. 
This will cause the fcntl() lock call to fail with -EINVAL if the signal occurs before the kernel is entered, -otherwise EAGAIN. -] - -2.11 The API Uses Gratuitous Typedefs, Capitals - -typedefs are useful for providing source compatibility when types -can differ across implementations, or arguably in the case of -function pointer definitions which are hard for humans to parse. -Otherwise it is simply obfuscation and pollutes the namespace. - -Capitalization is usually reserved for compile-time constants and -macros. - - TDB_CONTEXT There is no reason to use this over 'struct - tdb_context'; the definition isn't visible to the API user - anyway. - - TDB_DATA There is no reason to use this over struct TDB_DATA; - the struct needs to be understood by the API user. - - struct TDB_DATA This would normally be called 'struct - tdb_data'. - - enum TDB_ERROR Similarly, this would normally be enum - tdb_error. - -2.11.1 Proposed Solution - -None. Introducing lower case variants would please pedants like -myself, but if it were done the existing ones should be kept. -There is little point forcing a purely cosmetic change upon tdb -users. - -2.12 tdb_log_func Doesn't Take The - Private Pointer - -For API compatibility reasons, the logging function needs to call -tdb_get_logging_private() to retrieve the pointer registered by -the tdb_open_ex for logging. - -2.12.1 Proposed Solution - -It should simply take an extra argument, since we are prepared to -break the API/ABI. - -2.13 Various Callback Functions Are Not Typesafe - -The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take] - is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read -and tdb_check all take void * and must internally convert it to -the argument type they were expecting. - -If this type changes, the compiler will not produce warnings on -the callers, since it only sees void *. - -2.13.1 Proposed Solution - -With careful use of macros, we can create callback functions -which give a warning when used on gcc and the types of the -callback and its private argument differ. Unsupported compilers -will not give a warning, which is no worse than now. In addition, -the callbacks become clearer, as they need not use void * for -their parameter. - -See CCAN's typesafe_cb module at -http://ccan.ozlabs.org/info/typesafe_cb.html - -2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, - tdb_reopen_all Problematic - -The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB -file should be cleared if the caller discovers it is the only -process with the TDB open. However, if any caller does not -specify TDB_CLEAR_IF_FIRST it will not be detected, so will have -the TDB erased underneath them (usually resulting in a crash). - -There is a similar issue on fork(); if the parent exits (or -otherwise closes the tdb) before the child calls tdb_reopen_all() -to establish the lock used to indicate the TDB is opened by -someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe -it alone has opened the TDB and will erase it. - -2.14.1 Proposed Solution - -Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but -see [TDB_CLEAR_IF_FIRST-Imposes-Performance]. - -3 Performance And Scalability Issues - -3.1 TDB_CLEAR_IF_FIRST - Imposes Performance Penalty - -When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is -placed at offset 4 (aka. the ACTIVE_LOCK). 
While these locks -never conflict in normal tdb usage, they do add substantial -overhead for most fcntl lock implementations when the kernel -scans to detect if a lock conflict exists. This is often a single -linked list, making the time to acquire and release a fcntl lock -O(N) where N is the number of processes with the TDB open, not -the number actually doing work. - -In a Samba server it is common to have huge numbers of clients -sitting idle, and thus they have weaned themselves off the -TDB_CLEAR_IF_FIRST flag.[footnote: -There is a flag to tdb_reopen_all() which is used for this -optimization: if the parent process will outlive the child, the -child does not need the ACTIVE_LOCK. This is a workaround for -this very performance issue. -] - -3.1.1 Proposed Solution - -Remove the flag. It was a neat idea, but even trivial servers -tend to know when they are initializing for the first time and -can simply unlink the old tdb at that point. - -3.2 TDB Files Have a 4G Limit - -This seems to be becoming an issue (so much for “trivial”!), -particularly for ldb. - -3.2.1 Proposed Solution - -A new, incompatible TDB format which uses 64 bit offsets -internally rather than 32 bit as now. For simplicity of endian -conversion (which TDB does on the fly if required), all values -will be 64 bit on disk. In practice, some upper bits may be used -for other purposes, but at least 56 bits will be available for -file offsets. - -tdb_open() will automatically detect the old version, and even -create them if TDB_VERSION6 is specified to tdb_open. - -32 bit processes will still be able to access TDBs larger than 4G -(assuming that their off_t allows them to seek to 64 bits), they -will gracefully fall back as they fail to mmap. This can happen -already with large TDBs. - -Old versions of tdb will fail to open the new TDB files (since 28 -August 2009, commit 398d0c29290: prior to that any unrecognized -file format would be erased and initialized as a fresh tdb!) - -3.3 TDB Records Have a 4G Limit - -This has not been a reported problem, and the API uses size_t -which can be 64 bit on 64 bit platforms. However, other limits -may have made such an issue moot. - -3.3.1 Proposed Solution - -Record sizes will be 64 bit, with an error returned on 32 bit -platforms which try to access such records (the current -implementation would return TDB_ERR_OOM in a similar case). It -seems unlikely that 32 bit keys will be a limitation, so the -implementation may not support this (see [sub:Records-Incur-A]). - -3.4 Hash Size Is Determined At TDB Creation Time - -TDB contains a number of hash chains in the header; the number is -specified at creation time, and defaults to 131. This is such a -bottleneck on large databases (as each hash chain gets quite -long), that LDB uses 10,000 for this hash. In general it is -impossible to know what the 'right' answer is at database -creation time. - -3.4.1 Proposed Solution - -After comprehensive performance testing on various scalable hash -variants[footnote: -http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 -This was annoying because I was previously convinced that an -expanding tree of hashes would be very close to optimal. -], it became clear that it is hard to beat a straight linear hash -table which doubles in size when it reaches saturation. There are -three details which become important: - -1. On encountering a full bucket, we use the next bucket. - -2. Extra hash bits are stored with the offset, to reduce - comparisons. - -3. 
A marker entry is used on deleting an entry. - -The doubling of the table must be done under a transaction; we -will not reduce it on deletion, so it will be an unusual case. It -will either be placed at the head (other entries will be moved -out the way so we can expand). We could have a pointer in the -header to the current hashtable location, but that pointer would -have to be read frequently to check for hashtable moves. - -The locking for this is slightly more complex than the chained -case; we currently have one lock per bucket, and that means we -would need to expand the lock if we overflow to the next bucket. -The frequency of such collisions will effect our locking -heuristics: we can always lock more buckets than we need. - -One possible optimization is to only re-check the hash size on an -insert or a lookup miss. - -3.5 TDB Freelist Is Highly Contended - -TDB uses a single linked list for the free list. Allocation -occurs as follows, using heuristics which have evolved over time: - -1. Get the free list lock for this whole operation. - -2. Multiply length by 1.25, so we always over-allocate by 25%. - -3. Set the slack multiplier to 1. - -4. Examine the current freelist entry: if it is > length but < - the current best case, remember it as the best case. - -5. Multiply the slack multiplier by 1.05. - -6. If our best fit so far is less than length * slack multiplier, - return it. The slack will be turned into a new free record if - it's large enough. - -7. Otherwise, go onto the next freelist entry. - -Deleting a record occurs as follows: - -1. Lock the hash chain for this whole operation. - -2. Walk the chain to find the record, keeping the prev pointer - offset. - -3. If max_dead is non-zero: - - (a) Walk the hash chain again and count the dead records. - - (b) If it's more than max_dead, bulk free all the dead ones - (similar to steps 4 and below, but the lock is only obtained - once). - - (c) Simply mark this record as dead and return. - -4. Get the free list lock for the remainder of this operation. - -5. Examine the following block to see if it is - free; if so, enlarge the current block and remove that block - from the free list. This was disabled, as removal from the free - list was O(entries-in-free-list). - -6. Examine the preceeding block to see if it is free: for this - reason, each block has a 32-bit tailer which indicates its - length. If it is free, expand it to cover our new block and - return. - -7. Otherwise, prepend ourselves to the free list. - -Disabling right-merging (step [right-merging]) causes -fragmentation; the other heuristics proved insufficient to -address this, so the final answer to this was that when we expand -the TDB file inside a transaction commit, we repack the entire -tdb. - -The single list lock limits our allocation rate; due to the other -issues this is not currently seen as a bottleneck. - -3.5.1 Proposed Solution - -The first step is to remove all the current heuristics, as they -obviously interact, then examine them once the lock contention is -addressed. - -The free list must be split to reduce contention. Assuming -perfect free merging, we can at most have 1 free list entry for -each entry. This implies that the number of free lists is related -to the size of the hash table, but as it is rare to walk a large -number of free list entries we can use far fewer, say 1/32 of the -number of hash buckets. 
- -There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented] -) but it's not clear this would reduce contention in the common -case where all processes are allocating/freeing the same size. -Thus we almost certainly need to divide in other ways: the most -obvious is to divide the file into zones, and using a free list -(or set of free lists) for each. This approximates address -ordering. - -Note that this means we need to split the free lists when we -expand the file; this is probably acceptable when we double the -hash table size, since that is such an expensive operation -already. In the case of increasing the file size, there is an -optimization we can use: if we use M in the formula above as the -file size rounded up to the next power of 2, we only need -reshuffle free lists when the file size crosses a power of 2 -boundary, and reshuffling the free lists is trivial: we simply -merge every consecutive pair of free lists. - -The basic algorithm is as follows. Freeing is simple: - -1. Identify the correct zone. - -2. Lock the corresponding list. - -3. Re-check the zone (we didn't have a lock, sizes could have - changed): relock if necessary. - -4. Place the freed entry in the list for that zone. - -Allocation is a little more complicated, as we perform delayed -coalescing at this point: - -1. Pick a zone either the zone we last freed into, or based on a “ - random” number. - -2. Lock the corresponding list. - -3. Re-check the zone: relock if necessary. - -4. If the top entry is -large enough, remove it from the list and - return it. - -5. Otherwise, coalesce entries in the list. - - (a) - - (b) - - (c) - - (d) - -6. If there was no entry large enough, unlock the list and try - the next zone. - -7. - -8. - -9. If no zone satisfies, expand the file. - -This optimizes rapid insert/delete of free list entries by not -coalescing them all the time.. First-fit address ordering -ordering seems to be fairly good for keeping fragmentation low -(see [sub:TDB-Becomes-Fragmented]). Note that address ordering -does not need a tailer to coalesce, though if we needed one we -could have one cheaply: see [sub:Records-Incur-A]. - - - -I anticipate that the number of entries in each free zone would -be small, but it might be worth using one free entry to hold -pointers to the others for cache efficiency. - -3.6 TDB Becomes Fragmented - -Much of this is a result of allocation strategy[footnote: -The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 -ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps -] and deliberate hobbling of coalescing; internal fragmentation -(aka overallocation) is deliberately set at 25%, and external -fragmentation is only cured by the decision to repack the entire -db when a transaction commit needs to enlarge the file. - -3.6.1 Proposed Solution - -The 25% overhead on allocation works in practice for ldb because -indexes tend to expand by one record at a time. This internal -fragmentation can be resolved by having an “expanded” bit in the -header to note entries that have previously expanded, and -allocating more space for them. - -There are is a spectrum of possible solutions for external -fragmentation: one is to use a fragmentation-avoiding allocation -strategy such as best-fit address-order allocator. The other end -of the spectrum would be to use a bump allocator (very fast and -simple) and simply repack the file when we reach the end. 
- -There are three problems with efficient fragmentation-avoiding -allocators: they are non-trivial, they tend to use a single free -list for each size, and there's no evidence that tdb allocation -patterns will match those recorded for general allocators (though -it seems likely). - -Thus we don't spend too much effort on external fragmentation; we -will be no worse than the current code if we need to repack on -occasion. More effort is spent on reducing freelist contention, -and reducing overhead. - -3.7 Records Incur A 28-Byte Overhead - -Each TDB record has a header as follows: - -struct tdb_record { - - tdb_off_t next; /* offset of the next record in the list -*/ - - tdb_len_t rec_len; /* total byte length of record */ - - tdb_len_t key_len; /* byte length of key */ - - tdb_len_t data_len; /* byte length of data */ - - uint32_t full_hash; /* the full 32 bit hash of the key */ - - uint32_t magic; /* try to catch errors */ - - /* the following union is implied: - - union { - - char record[rec_len]; - - struct { - - char key[key_len]; - - char data[data_len]; - - } - - uint32_t totalsize; (tailer) - - } - - */ - -}; - -Naively, this would double to a 56-byte overhead on a 64 bit -implementation. - -3.7.1 Proposed Solution - -We can use various techniques to reduce this for an allocated -block: - -1. The 'next' pointer is not required, as we are using a flat - hash table. - -2. 'rec_len' can instead be expressed as an addition to key_len - and data_len (it accounts for wasted or overallocated length in - the record). Since the record length is always a multiple of 8, - we can conveniently fit it in 32 bits (representing up to 35 - bits). - -3. 'key_len' and 'data_len' can be reduced. I'm unwilling to - restrict 'data_len' to 32 bits, but instead we can combine the - two into one 64-bit field and using a 5 bit value which - indicates at what bit to divide the two. Keys are unlikely to - scale as fast as data, so I'm assuming a maximum key size of 32 - bits. - -4. 'full_hash' is used to avoid a memcmp on the “miss” case, but - this is diminishing returns after a handful of bits (at 10 - bits, it reduces 99.9% of false memcmp). As an aside, as the - lower bits are already incorporated in the hash table - resolution, the upper bits should be used here. - -5. 'magic' does not need to be enlarged: it currently reflects - one of 5 values (used, free, dead, recovery, and - unused_recovery). It is useful for quick sanity checking - however, and should not be eliminated. - -6. 'tailer' is only used to coalesce free blocks (so a block to - the right can find the header to check if this block is free). - This can be replaced by a single 'free' bit in the header of - the following block (and the tailer only exists in free - blocks).[footnote: -This technique from Thomas Standish. Data Structure Techniques. -Addison-Wesley, Reading, Massachusetts, 1980. -] The current proposed coalescing algorithm doesn't need this, - however. - -This produces a 16 byte used header like this: - -struct tdb_used_record { - - uint32_t magic : 16, - - prev_is_free: 1, - - key_data_divide: 5, - - top_hash: 10; - - uint32_t extra_octets; - - uint64_t key_and_data_len; - -}; - -And a free record like this: - -struct tdb_free_record { - - uint32_t free_magic; - - uint64_t total_length; - - ... - - uint64_t tailer; - -}; - - - -3.8 Transaction Commit Requires 4 fdatasync - -The current transaction algorithm is: - -1. write_recovery_data(); - -2. sync(); - -3. write_recovery_header(); - -4. sync(); - -5. 
overwrite_with_new_data();

6. sync();

7. remove_recovery_header();

8. sync();

On current ext3, each sync flushes all data to disk, so the next
3 syncs are relatively expensive. But this could become a
performance bottleneck on other filesystems such as ext4.

3.8.1 Proposed Solution

Neil Brown points out that this is overzealous, and only one sync
is needed:

1. Bundle the recovery data, a transaction counter and a strong
   checksum of the new data.

2. Strong checksum that whole bundle.

3. Store the bundle in the database.

4. Overwrite the oldest of the two recovery pointers in the
   header (identified using the transaction counter) with the
   offset of this bundle.

5. sync.

6. Write the new data to the file.

Checking for recovery means identifying the latest bundle with a
valid checksum and using the new data checksum to ensure that it
has been applied. This is more expensive than the current check,
but need only be done at open. For running databases, a separate
header field can be used to indicate a transaction in progress;
we need only check for recovery if this is set.

3.9 TDB Does Not Have Snapshot Support

3.9.1 Proposed Solution

None. At some point you say “use a real database”.

But as a thought experiment, if we implemented transactions to
overwrite only free entries (this is tricky: there must not be a
header in each entry which indicates whether it is free; free
status must instead be implied by metadata elsewhere) and a
pointer to the hash table, we could create an entirely new commit
without destroying existing data. Then it would be easy to
implement snapshots in a similar way.

This would not allow arbitrary changes to the database, such as
tdb_repack does, and would require more space (since we have to
preserve the current and future entries at once). If we used hash
trees rather than one big hash table, we might only have to
rewrite some sections of the hash, too.

We could then implement snapshots using a similar method, using
multiple different hash tables/free tables.

3.10 Transactions Cannot Operate in Parallel

This would be useless for ldb, as it hits the index records with
just about every update. It would add significant complexity in
resolving clashes, and cause all transaction callers to write
their code to loop in the case where a transaction spuriously
fails.

3.10.1 Proposed Solution

We could solve a small part of the problem by providing read-only
transactions. These would allow one write transaction to begin,
but it could not commit until all r/o transactions are done. This
would require a new RO_TRANSACTION_LOCK, which would be upgraded
on commit.

3.11 Default Hash Function Is Suboptimal

The Knuth-inspired multiplicative hash used by tdb is fairly slow
(especially if we expand it to 64 bits), and works best when the
hash bucket size is a prime number (which also means a slow
modulus). In addition, it is highly predictable, which could
potentially lead to a Denial of Service attack in some TDB uses.

3.11.1 Proposed Solution

The Jenkins lookup3 hash[footnote:
http://burtleburtle.net/bob/c/lookup3.c
] is a fast and superbly-mixing hash. It's used by the Linux
kernel and almost everything else. This has the particular
properties that it takes an initial seed, and produces two 32 bit
hash numbers, which we can combine into a 64-bit hash.

The seed should be created at tdb-creation time from some random
source, and placed in the header. 
This is far from foolproof, but -adds a little bit of protection against hash bombing. - -3.12 Reliable Traversal Adds Complexity - -We lock a record during traversal iteration, and try to grab that -lock in the delete code. If that grab on delete fails, we simply -mark it deleted and continue onwards; traversal checks for this -condition and does the delete when it moves off the record. - -If traversal terminates, the dead record may be left -indefinitely. - -3.12.1 Proposed Solution - -Remove reliability guarantees; see [traverse-Proposed-Solution]. - -3.13 Fcntl Locking Adds Overhead - -Placing a fcntl lock means a system call, as does removing one. -This is actually one reason why transactions can be faster -(everything is locked once at transaction start). In the -uncontended case, this overhead can theoretically be eliminated. - -3.13.1 Proposed Solution - -None. - -We tried this before with spinlock support, in the early days of -TDB, and it didn't make much difference except in manufactured -benchmarks. - -We could use spinlocks (with futex kernel support under Linux), -but it means that we lose automatic cleanup when a process dies -with a lock. There is a method of auto-cleanup under Linux, but -it's not supported by other operating systems. We could -reintroduce a clear-if-first-style lock and sweep for dead -futexes on open, but that wouldn't help the normal case of one -concurrent opener dying. Increasingly elaborate repair schemes -could be considered, but they require an ABI change (everyone -must use them) anyway, so there's no need to do this at the same -time as everything else. - diff --git a/ccan/tdb2/doc/design.lyx b/ccan/tdb2/doc/design.lyx deleted file mode 100644 index ba3f9cc6..00000000 --- a/ccan/tdb2/doc/design.lyx +++ /dev/null @@ -1,2689 +0,0 @@ -#LyX 1.6.7 created this file. For more info see http://www.lyx.org/ -\lyxformat 345 -\begin_document -\begin_header -\textclass article -\use_default_options true -\language english -\inputencoding auto -\font_roman default -\font_sans default -\font_typewriter default -\font_default_family default -\font_sc false -\font_osf false -\font_sf_scale 100 -\font_tt_scale 100 - -\graphics default -\paperfontsize default -\use_hyperref false -\papersize default -\use_geometry false -\use_amsmath 1 -\use_esint 1 -\cite_engine basic -\use_bibtopic false -\paperorientation portrait -\secnumdepth 3 -\tocdepth 3 -\paragraph_separation indent -\defskip medskip -\quotes_language english -\papercolumns 1 -\papersides 1 -\paperpagestyle default -\tracking_changes true -\output_changes true -\author "" -\author "" -\end_header - -\begin_body - -\begin_layout Title -TDB2: A Redesigning The Trivial DataBase -\end_layout - -\begin_layout Author -Rusty Russell, IBM Corporation -\end_layout - -\begin_layout Date -17-March-2011 -\end_layout - -\begin_layout Abstract -The Trivial DataBase on-disk format is 32 bits; with usage cases heading - towards the 4G limit, that must change. - This required breakage provides an opportunity to revisit TDB's other design - decisions and reassess them. -\end_layout - -\begin_layout Section -Introduction -\end_layout - -\begin_layout Standard -The Trivial DataBase was originally written by Andrew Tridgell as a simple - key/data pair storage system with the same API as dbm, but allowing multiple - readers and writers while being small enough (< 1000 lines of C) to include - in SAMBA. 
- The simple design created in 1999 has proven surprisingly robust and performant -, used in Samba versions 3 and 4 as well as numerous other projects. - Its useful life was greatly increased by the (backwards-compatible!) addition - of transaction support in 2005. -\end_layout - -\begin_layout Standard -The wider variety and greater demands of TDB-using code has lead to some - organic growth of the API, as well as some compromises on the implementation. - None of these, by themselves, are seen as show-stoppers, but the cumulative - effect is to a loss of elegance over the initial, simple TDB implementation. - Here is a table of the approximate number of lines of implementation code - and number of API functions at the end of each year: -\end_layout - -\begin_layout Standard -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\begin_layout Plain Layout -Year End -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -API Functions -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -Lines of C Code Implementation -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -1999 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -13 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -1195 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2000 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -24 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -1725 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2001 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -32 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -2228 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2002 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -35 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -2481 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2003 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -35 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -2552 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2004 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -40 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -2584 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2005 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -38 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -2647 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2006 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -52 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -3754 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2007 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -66 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -4398 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2008 -\end_layout - -\end_inset - - 
-\begin_inset Text - -\begin_layout Plain Layout -71 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -4768 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2009 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -73 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -5715 -\end_layout - -\end_inset - - - - -\end_inset - - -\end_layout - -\begin_layout Standard -This review is an attempt to catalog and address all the known issues with - TDB and create solutions which address the problems without significantly - increasing complexity; all involved are far too aware of the dangers of - second system syndrome in rewriting a successful project like this. -\end_layout - -\begin_layout Section -API Issues -\end_layout - -\begin_layout Subsection -tdb_open_ex Is Not Expandable -\end_layout - -\begin_layout Standard -The tdb_open() call was expanded to tdb_open_ex(), which added an optional - hashing function and an optional logging function argument. - Additional arguments to open would require the introduction of a tdb_open_ex2 - call etc. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\begin_inset CommandInset label -LatexCommand label -name "attributes" - -\end_inset - - -\end_layout - -\begin_layout Standard -tdb_open() will take a linked-list of attributes: -\end_layout - -\begin_layout LyX-Code -enum tdb_attribute { -\end_layout - -\begin_layout LyX-Code - TDB_ATTRIBUTE_LOG = 0, -\end_layout - -\begin_layout LyX-Code - TDB_ATTRIBUTE_HASH = 1 -\end_layout - -\begin_layout LyX-Code -}; -\end_layout - -\begin_layout LyX-Code -struct tdb_attribute_base { -\end_layout - -\begin_layout LyX-Code - enum tdb_attribute attr; -\end_layout - -\begin_layout LyX-Code - union tdb_attribute *next; -\end_layout - -\begin_layout LyX-Code -}; -\end_layout - -\begin_layout LyX-Code -struct tdb_attribute_log { -\end_layout - -\begin_layout LyX-Code - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */ -\end_layout - -\begin_layout LyX-Code - tdb_log_func log_fn; -\end_layout - -\begin_layout LyX-Code - void *log_private; -\end_layout - -\begin_layout LyX-Code -}; -\end_layout - -\begin_layout LyX-Code -struct tdb_attribute_hash { -\end_layout - -\begin_layout LyX-Code - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */ -\end_layout - -\begin_layout LyX-Code - tdb_hash_func hash_fn; -\end_layout - -\begin_layout LyX-Code - void *hash_private; -\end_layout - -\begin_layout LyX-Code -}; -\end_layout - -\begin_layout LyX-Code -union tdb_attribute { -\end_layout - -\begin_layout LyX-Code - struct tdb_attribute_base base; -\end_layout - -\begin_layout LyX-Code - struct tdb_attribute_log log; -\end_layout - -\begin_layout LyX-Code - struct tdb_attribute_hash hash; -\end_layout - -\begin_layout LyX-Code -}; -\end_layout - -\begin_layout Standard -This allows future attributes to be added, even if this expands the size - of the union. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -tdb_traverse Makes Impossible Guarantees -\end_layout - -\begin_layout Standard -tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it - was thought that it was important to guarantee that all records which exist - at the start and end of the traversal would be included, and no record - would be included twice. 
-\end_layout - -\begin_layout Standard -This adds complexity (see -\begin_inset CommandInset ref -LatexCommand ref -reference "Reliable-Traversal-Adds" - -\end_inset - -) and does not work anyway for records which are altered (in particular, - those which are expanded may be effectively deleted and re-added behind - the traversal). -\end_layout - -\begin_layout Subsubsection -\begin_inset CommandInset label -LatexCommand label -name "traverse-Proposed-Solution" - -\end_inset - -Proposed Solution -\end_layout - -\begin_layout Standard -Abandon the guarantee. - You will see every record if no changes occur during your traversal, otherwise - you will see some subset. - You can prevent changes by using a transaction or the locking API. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. - Delete-during-traverse will still delete every record, too (assuming no - other changes). -\end_layout - -\begin_layout Subsection -Nesting of Transactions Is Fraught -\end_layout - -\begin_layout Standard -TDB has alternated between allowing nested transactions and not allowing - them. - Various paths in the Samba codebase assume that transactions will nest, - and in a sense they can: the operation is only committed to disk when the - outer transaction is committed. - There are two problems, however: -\end_layout - -\begin_layout Enumerate -Canceling the inner transaction will cause the outer transaction commit - to fail, and will not undo any operations since the inner transaction began. - This problem is soluble with some additional internal code. -\end_layout - -\begin_layout Enumerate -An inner transaction commit can be cancelled by the outer transaction. - This is desirable in the way which Samba's database initialization code - uses transactions, but could be a surprise to any users expecting a successful - transaction commit to expose changes to others. -\end_layout - -\begin_layout Standard -The current solution is to specify the behavior at tdb_open(), with the - default currently that nested transactions are allowed. - This flag can also be changed at runtime. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -Given the usage patterns, it seems that the -\begin_inset Quotes eld -\end_inset - -least-surprise -\begin_inset Quotes erd -\end_inset - - behavior of disallowing nested transactions should become the default. - Additionally, it seems the outer transaction is the only code which knows - whether inner transactions should be allowed, so a flag to indicate this - could be added to tdb_transaction_start. - However, this behavior can be simulated with a wrapper which uses tdb_add_flags -() and tdb_remove_flags(), so the API should not be expanded for this relatively --obscure case. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete; the nesting flag has been removed. -\end_layout - -\begin_layout Subsection -Incorrect Hash Function is Not Detected -\end_layout - -\begin_layout Standard -tdb_open_ex() allows the calling code to specify a different hash function - to use, but does not check that all other processes accessing this tdb - are using the same hash function. - The result is that records are missing from tdb_fetch(). -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -The header should contain an example hash result (eg. 
- the hash of 0xdeadbeef), and tdb_open_ex() should check that the given - hash function produces the same answer, or fail the tdb_open call. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -tdb_set_max_dead/TDB_VOLATILE Expose Implementation -\end_layout - -\begin_layout Standard -In response to scalability issues with the free list ( -\begin_inset CommandInset ref -LatexCommand ref -reference "TDB-Freelist-Is" - -\end_inset - -) two API workarounds have been incorporated in TDB: tdb_set_max_dead() - and the TDB_VOLATILE flag to tdb_open. - The latter actually calls the former with an argument of -\begin_inset Quotes eld -\end_inset - -5 -\begin_inset Quotes erd -\end_inset - -. -\end_layout - -\begin_layout Standard -This code allows deleted records to accumulate without putting them in the - free list. - On delete we iterate through each chain and free them in a batch if there - are more than max_dead entries. - These are never otherwise recycled except as a side-effect of a tdb_repack. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -With the scalability problems of the freelist solved, this API can be removed. - The TDB_VOLATILE flag may still be useful as a hint that store and delete - of records will be at least as common as fetch in order to allow some internal - tuning, but initially will become a no-op. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. - Unknown flags cause tdb_open() to fail as well, so they can be detected - at runtime. -\end_layout - -\begin_layout Subsection -\begin_inset CommandInset label -LatexCommand label -name "TDB-Files-Cannot" - -\end_inset - -TDB Files Cannot Be Opened Multiple Times In The Same Process -\end_layout - -\begin_layout Standard -No process can open the same TDB twice; we check and disallow it. - This is an unfortunate side-effect of fcntl locks, which operate on a per-file - rather than per-file-descriptor basis, and do not nest. - Thus, closing any file descriptor on a file clears all the locks obtained - by this process, even if they were placed using a different file descriptor! -\end_layout - -\begin_layout Standard -Note that even if this were solved, deadlock could occur if operations were - nested: this is a more manageable programming error in most cases. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -We could lobby POSIX to fix the perverse rules, or at least lobby Linux - to violate them so that the most common implementation does not have this - restriction. - This would be a generally good idea for other fcntl lock users. -\end_layout - -\begin_layout Standard -Samba uses a wrapper which hands out the same tdb_context to multiple callers - if this happens, and does simple reference counting. - We should do this inside the tdb library, which already emulates lock nesting - internally; it would need to recognize when deadlock occurs within a single - process. - This would create a new failure mode for tdb operations (while we currently - handle locking failures, they are impossible in normal use and a process - encountering them can do little but give up). 
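\end_layout

\begin_layout Standard
As a sketch of that reference counting (the structure and helper below are
 hypothetical illustrations, not the eventual tdb2 internals), the library
 could keep a per-process list of open databases keyed by device and inode:
\end_layout

\begin_layout LyX-Code
#include <sys/stat.h>
\end_layout

\begin_layout LyX-Code
 
\end_layout

\begin_layout LyX-Code
struct open_tdb {
\end_layout

\begin_layout LyX-Code
        struct open_tdb *next;
\end_layout

\begin_layout LyX-Code
        dev_t device;
\end_layout

\begin_layout LyX-Code
        ino_t inode;
\end_layout

\begin_layout LyX-Code
        unsigned int refcount;
\end_layout

\begin_layout LyX-Code
        struct tdb_context *tdb;
\end_layout

\begin_layout LyX-Code
};
\end_layout

\begin_layout LyX-Code
 
\end_layout

\begin_layout LyX-Code
static struct open_tdb *open_tdbs;
\end_layout

\begin_layout LyX-Code
 
\end_layout

\begin_layout LyX-Code
/* Called from open: reuse an existing context for the same file. */
\end_layout

\begin_layout LyX-Code
static struct tdb_context *find_open_tdb(const struct stat *st)
\end_layout

\begin_layout LyX-Code
{
\end_layout

\begin_layout LyX-Code
        struct open_tdb *i;
\end_layout

\begin_layout LyX-Code
        for (i = open_tdbs; i; i = i->next) {
\end_layout

\begin_layout LyX-Code
                if (i->device == st->st_dev && i->inode == st->st_ino) {
\end_layout

\begin_layout LyX-Code
                        i->refcount++;
\end_layout

\begin_layout LyX-Code
                        return i->tdb;
\end_layout

\begin_layout LyX-Code
                }
\end_layout

\begin_layout LyX-Code
        }
\end_layout

\begin_layout LyX-Code
        return NULL;
\end_layout

\begin_layout LyX-Code
}
\end_layout

\begin_layout Standard
Close would then decrement the count and only really close the file when
 it reaches zero; the tricky part, as noted above, is recognizing when deadlock
 occurs between users of the same context within a single process.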
-\end_layout - -\begin_layout Standard -I do not see benefit in an additional tdb_open flag to indicate whether - re-opening is allowed, as though there may be some benefit to adding a - call to detect when a tdb_context is shared, to allow other to create such - an API. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -TDB API Is Not POSIX Thread-safe -\end_layout - -\begin_layout Standard -The TDB API uses an error code which can be queried after an operation to - determine what went wrong. - This programming model does not work with threads, unless specific additional - guarantees are given by the implementation. - In addition, even otherwise-independent threads cannot open the same TDB - (as in -\begin_inset CommandInset ref -LatexCommand ref -reference "TDB-Files-Cannot" - -\end_inset - -). -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -Reachitecting the API to include a tdb_errcode pointer would be a great - deal of churn, but fortunately most functions return 0 on success and -1 - on error: we can change these to return 0 on success and a negative error - code on error, and the API remains similar to previous. - The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA - pointer and return an error code. - It is also simpler to have tdb_nextkey replace its key argument in place, - freeing up any old .dptr. -\end_layout - -\begin_layout Standard -Internal locking is required to make sure that fcntl locks do not overlap - between threads, and also that the global list of tdbs is maintained. -\end_layout - -\begin_layout Standard -The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe - version of the library, and otherwise no overhead will exist. - Alternatively, a hooking mechanism similar to that proposed for -\begin_inset CommandInset ref -LatexCommand ref -reference "Proposed-Solution-locking-hook" - -\end_inset - - could be used to enable pthread locking at runtime. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Incomplete; API has been changed but thread safety has not been implemented. -\end_layout - -\begin_layout Subsection -*_nonblock Functions And *_mark Functions Expose Implementation -\end_layout - -\begin_layout Standard -CTDB -\begin_inset Foot -status collapsed - -\begin_layout Plain Layout -Clustered TDB, see http://ctdb.samba.org -\end_layout - -\end_inset - - wishes to operate on TDB in a non-blocking manner. - This is currently done as follows: -\end_layout - -\begin_layout Enumerate -Call the _nonblock variant of an API function (eg. - tdb_lockall_nonblock). - If this fails: -\end_layout - -\begin_layout Enumerate -Fork a child process, and wait for it to call the normal variant (eg. - tdb_lockall). -\end_layout - -\begin_layout Enumerate -If the child succeeds, call the _mark variant to indicate we already have - the locks (eg. - tdb_lockall_mark). -\end_layout - -\begin_layout Enumerate -Upon completion, tell the child to release the locks (eg. - tdb_unlockall). -\end_layout - -\begin_layout Enumerate -Indicate to tdb that it should consider the locks removed (eg. - tdb_unlockall_mark). -\end_layout - -\begin_layout Standard -There are several issues with this approach. - Firstly, adding two new variants of each function clutters the API for - an obscure use, and so not all functions have three variants. 
- Secondly, it assumes that all paths of the functions ask for the same locks, - otherwise the parent process will have to get a lock which the child doesn't - have under some circumstances. - I don't believe this is currently the case, but it constrains the implementatio -n. - -\end_layout - -\begin_layout Subsubsection -\begin_inset CommandInset label -LatexCommand label -name "Proposed-Solution-locking-hook" - -\end_inset - -Proposed Solution -\end_layout - -\begin_layout Standard -Implement a hook for locking methods, so that the caller can control the - calls to create and remove fcntl locks. - In this scenario, ctdbd would operate as follows: -\end_layout - -\begin_layout Enumerate -Call the normal API function, eg tdb_lockall(). -\end_layout - -\begin_layout Enumerate -When the lock callback comes in, check if the child has the lock. - Initially, this is always false. - If so, return 0. - Otherwise, try to obtain it in non-blocking mode. - If that fails, return EWOULDBLOCK. -\end_layout - -\begin_layout Enumerate -Release locks in the unlock callback as normal. -\end_layout - -\begin_layout Enumerate -If tdb_lockall() fails, see if we recorded a lock failure; if so, call the - child to repeat the operation. -\end_layout - -\begin_layout Enumerate -The child records what locks it obtains, and returns that information to - the parent. -\end_layout - -\begin_layout Enumerate -When the child has succeeded, goto 1. -\end_layout - -\begin_layout Standard -This is flexible enough to handle any potential locking scenario, even when - lock requirements change. - It can be optimized so that the parent does not release locks, just tells - the child which locks it doesn't need to obtain. -\end_layout - -\begin_layout Standard -It also keeps the complexity out of the API, and in ctdbd where it is needed. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Incomplete. -\end_layout - -\begin_layout Subsection -tdb_chainlock Functions Expose Implementation -\end_layout - -\begin_layout Standard -tdb_chainlock locks some number of records, including the record indicated - by the given key. - This gave atomicity guarantees; no-one can start a transaction, alter, - read or delete that key while the lock is held. -\end_layout - -\begin_layout Standard -It also makes the same guarantee for any other key in the chain, which is - an internal implementation detail and potentially a cause for deadlock. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -None. - It would be nice to have an explicit single entry lock which effected no - other keys. - Unfortunately, this won't work for an entry which doesn't exist. - Thus while chainlock may be implemented more efficiently for the existing - case, it will still have overlap issues with the non-existing case. - So it is best to keep the current (lack of) guarantee about which records - will be effected to avoid constraining our implementation. -\end_layout - -\begin_layout Subsection -Signal Handling is Not Race-Free -\end_layout - -\begin_layout Standard -The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate - that the tdb locking code should return with a failure, rather than trying - again when a signal is received (and errno == EAGAIN). - This is usually used to implement timeouts. 
-\end_layout - -\begin_layout Standard -Unfortunately, this does not work in the case where the signal is received - before the tdb code enters the fcntl() call to place the lock: the code - will sleep within the fcntl() code, unaware that the signal wants it to - exit. - In the case of long timeouts, this does not happen in practice. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -The locking hooks proposed in -\begin_inset CommandInset ref -LatexCommand ref -reference "Proposed-Solution-locking-hook" - -\end_inset - - would allow the user to decide on whether to fail the lock acquisition - on a signal. - This allows the caller to choose their own compromise: they could narrow - the race by checking immediately before the fcntl call. -\begin_inset Foot -status collapsed - -\begin_layout Plain Layout -It may be possible to make this race-free in some implementations by having - the signal handler alter the struct flock to make it invalid. - This will cause the fcntl() lock call to fail with EINVAL if the signal - occurs before the kernel is entered, otherwise EAGAIN. -\end_layout - -\end_inset - - -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Incomplete. -\end_layout - -\begin_layout Subsection -The API Uses Gratuitous Typedefs, Capitals -\end_layout - -\begin_layout Standard -typedefs are useful for providing source compatibility when types can differ - across implementations, or arguably in the case of function pointer definitions - which are hard for humans to parse. - Otherwise it is simply obfuscation and pollutes the namespace. -\end_layout - -\begin_layout Standard -Capitalization is usually reserved for compile-time constants and macros. -\end_layout - -\begin_layout Description -TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the - definition isn't visible to the API user anyway. -\end_layout - -\begin_layout Description -TDB_DATA There is no reason to use this over struct TDB_DATA; the struct - needs to be understood by the API user. -\end_layout - -\begin_layout Description -struct -\begin_inset space ~ -\end_inset - -TDB_DATA This would normally be called 'struct tdb_data'. -\end_layout - -\begin_layout Description -enum -\begin_inset space ~ -\end_inset - -TDB_ERROR Similarly, this would normally be enum tdb_error. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -None. - Introducing lower case variants would please pedants like myself, but if - it were done the existing ones should be kept. - There is little point forcing a purely cosmetic change upon tdb users. -\end_layout - -\begin_layout Subsection -\begin_inset CommandInset label -LatexCommand label -name "tdb_log_func-Doesnt-Take" - -\end_inset - -tdb_log_func Doesn't Take The Private Pointer -\end_layout - -\begin_layout Standard -For API compatibility reasons, the logging function needs to call tdb_get_loggin -g_private() to retrieve the pointer registered by the tdb_open_ex for logging. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -It should simply take an extra argument, since we are prepared to break - the API/ABI. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. 
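\end_layout

\begin_layout Standard
For illustration, a callback under the new scheme might look like the following.
 The exact prototype here is only a sketch; the point is the extra private-point
er argument, together with the tdb_attribute_log fields described earlier.
\end_layout

\begin_layout LyX-Code
static void my_log(struct tdb_context *tdb, enum TDB_ERROR ecode,
\end_layout

\begin_layout LyX-Code
                   const char *message, void *private)
\end_layout

\begin_layout LyX-Code
{
\end_layout

\begin_layout LyX-Code
        fprintf((FILE *)private, "tdb: %s", message);
\end_layout

\begin_layout LyX-Code
}
\end_layout

\begin_layout LyX-Code
 
\end_layout

\begin_layout LyX-Code
union tdb_attribute log_attr;
\end_layout

\begin_layout LyX-Code
log_attr.log.base.attr = TDB_ATTRIBUTE_LOG;
\end_layout

\begin_layout LyX-Code
log_attr.log.base.next = NULL;
\end_layout

\begin_layout LyX-Code
log_attr.log.log_fn = my_log;
\end_layout

\begin_layout LyX-Code
log_attr.log.log_private = stderr;
\end_layout

\begin_layout LyX-Code
/* ... then pass &log_attr to tdb_open(). */
\end_layout

\begin_layout Standard
The logging function no longer needs to call tdb_get_logging_private(); the
 registered pointer simply arrives as its final argument.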
-\end_layout - -\begin_layout Subsection -Various Callback Functions Are Not Typesafe -\end_layout - -\begin_layout Standard -The callback functions in tdb_set_logging_function (after -\begin_inset CommandInset ref -LatexCommand ref -reference "tdb_log_func-Doesnt-Take" - -\end_inset - - is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check - all take void * and must internally convert it to the argument type they - were expecting. -\end_layout - -\begin_layout Standard -If this type changes, the compiler will not produce warnings on the callers, - since it only sees void *. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -With careful use of macros, we can create callback functions which give - a warning when used on gcc and the types of the callback and its private - argument differ. - Unsupported compilers will not give a warning, which is no worse than now. - In addition, the callbacks become clearer, as they need not use void * - for their parameter. -\end_layout - -\begin_layout Standard -See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic -\end_layout - -\begin_layout Standard -The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should - be cleared if the caller discovers it is the only process with the TDB - open. - However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not - be detected, so will have the TDB erased underneath them (usually resulting - in a crash). -\end_layout - -\begin_layout Standard -There is a similar issue on fork(); if the parent exits (or otherwise closes - the tdb) before the child calls tdb_reopen_all() to establish the lock - used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener - at that moment will believe it alone has opened the TDB and will erase - it. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -Remove TDB_CLEAR_IF_FIRST. - Other workarounds are possible, but see -\begin_inset CommandInset ref -LatexCommand ref -reference "TDB_CLEAR_IF_FIRST-Imposes-Performance" - -\end_inset - -. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -Extending The Header Is Difficult -\end_layout - -\begin_layout Standard -We have reserved (zeroed) words in the TDB header, which can be used for - future features. - If the future features are compulsory, the version number must be updated - to prevent old code from accessing the database. - But if the future feature is optional, we have no way of telling if older - code is accessing the database or not. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -The header should contain a -\begin_inset Quotes eld -\end_inset - -format variant -\begin_inset Quotes erd -\end_inset - - value (64-bit). - This is divided into two 32-bit parts: -\end_layout - -\begin_layout Enumerate -The lower part reflects the format variant understood by code accessing - the database. -\end_layout - -\begin_layout Enumerate -The upper part reflects the format variant you must understand to write - to the database (otherwise you can only open for reading). 
-\end_layout - -\begin_layout Standard -The latter field can only be written at creation time, the former should - be written under the OPEN_LOCK when opening the database for writing, if - the variant of the code is lower than the current lowest variant. -\end_layout - -\begin_layout Standard -This should allow backwards-compatible features to be added, and detection - if older code (which doesn't understand the feature) writes to the database. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -Record Headers Are Not Expandible -\end_layout - -\begin_layout Standard -If we later want to add (say) checksums on keys and data, it would require - another format change, which we'd like to avoid. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -We often have extra padding at the tail of a record. - If we ensure that the first byte (if any) of this padding is zero, we will - have a way for future changes to detect code which doesn't understand a - new format: the new code would write (say) a 1 at the tail, and thus if - there is no tail or the first byte is 0, we would know the extension is - not present on that record. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -TDB Does Not Use Talloc -\end_layout - -\begin_layout Standard -Many users of TDB (particularly Samba) use the talloc allocator, and thus - have to wrap TDB in a talloc context to use it conveniently. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -The allocation within TDB is not complicated enough to justify the use of - talloc, and I am reluctant to force another (excellent) library on TDB - users. - Nonetheless a compromise is possible. - An attribute (see -\begin_inset CommandInset ref -LatexCommand ref -reference "attributes" - -\end_inset - -) can be added later to tdb_open() to provide an alternate allocation mechanism, - specifically for talloc but usable by any other allocator (which would - ignore the -\begin_inset Quotes eld -\end_inset - -context -\begin_inset Quotes erd -\end_inset - - argument). -\end_layout - -\begin_layout Standard -This would form a talloc heirarchy as expected, but the caller would still - have to attach a destructor to the tdb context returned from tdb_open to - close it. - All TDB_DATA fields would be children of the tdb_context, and the caller - would still have to manage them (using talloc_free() or talloc_steal()). -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Deferred. -\end_layout - -\begin_layout Section -Performance And Scalability Issues -\end_layout - -\begin_layout Subsection -\begin_inset CommandInset label -LatexCommand label -name "TDB_CLEAR_IF_FIRST-Imposes-Performance" - -\end_inset - -TDB_CLEAR_IF_FIRST Imposes Performance Penalty -\end_layout - -\begin_layout Standard -When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset - 4 (aka. - the ACTIVE_LOCK). - While these locks never conflict in normal tdb usage, they do add substantial - overhead for most fcntl lock implementations when the kernel scans to detect - if a lock conflict exists. 
- This is often a single linked list, making the time to acquire and release - a fcntl lock O(N) where N is the number of processes with the TDB open, - not the number actually doing work. -\end_layout - -\begin_layout Standard -In a Samba server it is common to have huge numbers of clients sitting idle, - and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag. -\begin_inset Foot -status collapsed - -\begin_layout Plain Layout -There is a flag to tdb_reopen_all() which is used for this optimization: - if the parent process will outlive the child, the child does not need the - ACTIVE_LOCK. - This is a workaround for this very performance issue. -\end_layout - -\end_inset - - -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -Remove the flag. - It was a neat idea, but even trivial servers tend to know when they are - initializing for the first time and can simply unlink the old tdb at that - point. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -TDB Files Have a 4G Limit -\end_layout - -\begin_layout Standard -This seems to be becoming an issue (so much for -\begin_inset Quotes eld -\end_inset - -trivial -\begin_inset Quotes erd -\end_inset - -!), particularly for ldb. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -A new, incompatible TDB format which uses 64 bit offsets internally rather - than 32 bit as now. - For simplicity of endian conversion (which TDB does on the fly if required), - all values will be 64 bit on disk. - In practice, some upper bits may be used for other purposes, but at least - 56 bits will be available for file offsets. -\end_layout - -\begin_layout Standard -tdb_open() will automatically detect the old version, and even create them - if TDB_VERSION6 is specified to tdb_open. -\end_layout - -\begin_layout Standard -32 bit processes will still be able to access TDBs larger than 4G (assuming - that their off_t allows them to seek to 64 bits), they will gracefully - fall back as they fail to mmap. - This can happen already with large TDBs. -\end_layout - -\begin_layout Standard -Old versions of tdb will fail to open the new TDB files (since 28 August - 2009, commit 398d0c29290: prior to that any unrecognized file format would - be erased and initialized as a fresh tdb!) -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -TDB Records Have a 4G Limit -\end_layout - -\begin_layout Standard -This has not been a reported problem, and the API uses size_t which can - be 64 bit on 64 bit platforms. - However, other limits may have made such an issue moot. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -Record sizes will be 64 bit, with an error returned on 32 bit platforms - which try to access such records (the current implementation would return - TDB_ERR_OOM in a similar case). - It seems unlikely that 32 bit keys will be a limitation, so the implementation - may not support this (see -\begin_inset CommandInset ref -LatexCommand ref -reference "sub:Records-Incur-A" - -\end_inset - -). -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. 
-\end_layout - -\begin_layout Subsection -Hash Size Is Determined At TDB Creation Time -\end_layout - -\begin_layout Standard -TDB contains a number of hash chains in the header; the number is specified - at creation time, and defaults to 131. - This is such a bottleneck on large databases (as each hash chain gets quite - long), that LDB uses 10,000 for this hash. - In general it is impossible to know what the 'right' answer is at database - creation time. -\end_layout - -\begin_layout Subsubsection -\begin_inset CommandInset label -LatexCommand label -name "sub:Hash-Size-Solution" - -\end_inset - -Proposed Solution -\end_layout - -\begin_layout Standard -After comprehensive performance testing on various scalable hash variants -\begin_inset Foot -status collapsed - -\begin_layout Plain Layout -http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying - because I was previously convinced that an expanding tree of hashes would - be very close to optimal. -\end_layout - -\end_inset - -, it became clear that it is hard to beat a straight linear hash table which - doubles in size when it reaches saturation. - Unfortunately, altering the hash table introduces serious locking complications -: the entire hash table needs to be locked to enlarge the hash table, and - others might be holding locks. - Particularly insidious are insertions done under tdb_chainlock. -\end_layout - -\begin_layout Standard -Thus an expanding layered hash will be used: an array of hash groups, with - each hash group exploding into pointers to lower hash groups once it fills, - turning into a hash tree. - This has implications for locking: we must lock the entire group in case - we need to expand it, yet we don't know how deep the tree is at that point. -\end_layout - -\begin_layout Standard -Note that bits from the hash table entries should be stolen to hold more - hash bits to reduce the penalty of collisions. - We can use the otherwise-unused lower 3 bits. - If we limit the size of the database to 64 exabytes, we can use the top - 8 bits of the hash entry as well. - These 11 bits would reduce false positives down to 1 in 2000 which is more - than we need: we can use one of the bits to indicate that the extra hash - bits are valid. - This means we can choose not to re-hash all entries when we expand a hash - group; simply use the next bits we need and mark them invalid. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -\begin_inset CommandInset label -LatexCommand label -name "TDB-Freelist-Is" - -\end_inset - -TDB Freelist Is Highly Contended -\end_layout - -\begin_layout Standard -TDB uses a single linked list for the free list. - Allocation occurs as follows, using heuristics which have evolved over - time: -\end_layout - -\begin_layout Enumerate -Get the free list lock for this whole operation. -\end_layout - -\begin_layout Enumerate -Multiply length by 1.25, so we always over-allocate by 25%. -\end_layout - -\begin_layout Enumerate -Set the slack multiplier to 1. -\end_layout - -\begin_layout Enumerate -Examine the current freelist entry: if it is > length but < the current - best case, remember it as the best case. -\end_layout - -\begin_layout Enumerate -Multiply the slack multiplier by 1.05. -\end_layout - -\begin_layout Enumerate -If our best fit so far is less than length * slack multiplier, return it. - The slack will be turned into a new free record if it's large enough. 
-\end_layout - -\begin_layout Enumerate -Otherwise, go onto the next freelist entry. -\end_layout - -\begin_layout Standard -Deleting a record occurs as follows: -\end_layout - -\begin_layout Enumerate -Lock the hash chain for this whole operation. -\end_layout - -\begin_layout Enumerate -Walk the chain to find the record, keeping the prev pointer offset. -\end_layout - -\begin_layout Enumerate -If max_dead is non-zero: -\end_layout - -\begin_deeper -\begin_layout Enumerate -Walk the hash chain again and count the dead records. -\end_layout - -\begin_layout Enumerate -If it's more than max_dead, bulk free all the dead ones (similar to steps - 4 and below, but the lock is only obtained once). -\end_layout - -\begin_layout Enumerate -Simply mark this record as dead and return. - -\end_layout - -\end_deeper -\begin_layout Enumerate -Get the free list lock for the remainder of this operation. -\end_layout - -\begin_layout Enumerate -\begin_inset CommandInset label -LatexCommand label -name "right-merging" - -\end_inset - -Examine the following block to see if it is free; if so, enlarge the current - block and remove that block from the free list. - This was disabled, as removal from the free list was O(entries-in-free-list). -\end_layout - -\begin_layout Enumerate -Examine the preceeding block to see if it is free: for this reason, each - block has a 32-bit tailer which indicates its length. - If it is free, expand it to cover our new block and return. -\end_layout - -\begin_layout Enumerate -Otherwise, prepend ourselves to the free list. -\end_layout - -\begin_layout Standard -Disabling right-merging (step -\begin_inset CommandInset ref -LatexCommand ref -reference "right-merging" - -\end_inset - -) causes fragmentation; the other heuristics proved insufficient to address - this, so the final answer to this was that when we expand the TDB file - inside a transaction commit, we repack the entire tdb. -\end_layout - -\begin_layout Standard -The single list lock limits our allocation rate; due to the other issues - this is not currently seen as a bottleneck. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -The first step is to remove all the current heuristics, as they obviously - interact, then examine them once the lock contention is addressed. -\end_layout - -\begin_layout Standard -The free list must be split to reduce contention. - Assuming perfect free merging, we can at most have 1 free list entry for - each entry. - This implies that the number of free lists is related to the size of the - hash table, but as it is rare to walk a large number of free list entries - we can use far fewer, say 1/32 of the number of hash buckets. -\end_layout - -\begin_layout Standard -It seems tempting to try to reuse the hash implementation which we use for - records here, but we have two ways of searching for free entries: for allocatio -n we search by size (and possibly zone) which produces too many clashes - for our hash table to handle well, and for coalescing we search by address. - Thus an array of doubly-linked free lists seems preferable. -\end_layout - -\begin_layout Standard -There are various benefits in using per-size free lists (see -\begin_inset CommandInset ref -LatexCommand ref -reference "sub:TDB-Becomes-Fragmented" - -\end_inset - -) but it's not clear this would reduce contention in the common case where - all processes are allocating/freeing the same size. 
- Thus we almost certainly need to divide in other ways: the most obvious - is to divide the file into zones, and using a free list (or table of free - lists) for each. - This approximates address ordering. -\end_layout - -\begin_layout Standard -Unfortunately it is difficult to know what heuristics should be used to - determine zone sizes, and our transaction code relies on being able to - create a -\begin_inset Quotes eld -\end_inset - -recovery area -\begin_inset Quotes erd -\end_inset - - by simply appending to the file (difficult if it would need to create a - new zone header). - Thus we use a linked-list of free tables; currently we only ever create - one, but if there is more than one we choose one at random to use. - In future we may use heuristics to add new free tables on contention. - We only expand the file when all free tables are exhausted. -\end_layout - -\begin_layout Standard -The basic algorithm is as follows. - Freeing is simple: -\end_layout - -\begin_layout Enumerate -Identify the correct free list. -\end_layout - -\begin_layout Enumerate -Lock the corresponding list. -\end_layout - -\begin_layout Enumerate -Re-check the list (we didn't have a lock, sizes could have changed): relock - if necessary. -\end_layout - -\begin_layout Enumerate -Place the freed entry in the list. -\end_layout - -\begin_layout Standard -Allocation is a little more complicated, as we perform delayed coalescing - at this point: -\end_layout - -\begin_layout Enumerate -Pick a free table; usually the previous one. -\end_layout - -\begin_layout Enumerate -Lock the corresponding list. -\end_layout - -\begin_layout Enumerate -If the top entry is -large enough, remove it from the list and return it. -\end_layout - -\begin_layout Enumerate -Otherwise, coalesce entries in the list.If there was no entry large enough, - unlock the list and try the next largest list -\end_layout - -\begin_layout Enumerate -If no list has an entry which meets our needs, try the next free table. -\end_layout - -\begin_layout Enumerate -If no zone satisfies, expand the file. -\end_layout - -\begin_layout Standard -This optimizes rapid insert/delete of free list entries by not coalescing - them all the time.. - First-fit address ordering ordering seems to be fairly good for keeping - fragmentation low (see -\begin_inset CommandInset ref -LatexCommand ref -reference "sub:TDB-Becomes-Fragmented" - -\end_inset - -). - Note that address ordering does not need a tailer to coalesce, though if - we needed one we could have one cheaply: see -\begin_inset CommandInset ref -LatexCommand ref -reference "sub:Records-Incur-A" - -\end_inset - -. - -\end_layout - -\begin_layout Standard -Each free entry has the free table number in the header: less than 255. - It also contains a doubly-linked list for easy deletion. -\end_layout - -\begin_layout Subsection -\begin_inset CommandInset label -LatexCommand label -name "sub:TDB-Becomes-Fragmented" - -\end_inset - -TDB Becomes Fragmented -\end_layout - -\begin_layout Standard -Much of this is a result of allocation strategy -\begin_inset Foot -status collapsed - -\begin_layout Plain Layout -The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute -xas.edu/pub/garbage/malloc/ismm98.ps -\end_layout - -\end_inset - - and deliberate hobbling of coalescing; internal fragmentation (aka overallocati -on) is deliberately set at 25%, and external fragmentation is only cured - by the decision to repack the entire db when a transaction commit needs - to enlarge the file. 
-\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -The 25% overhead on allocation works in practice for ldb because indexes - tend to expand by one record at a time. - This internal fragmentation can be resolved by having an -\begin_inset Quotes eld -\end_inset - -expanded -\begin_inset Quotes erd -\end_inset - - bit in the header to note entries that have previously expanded, and allocating - more space for them. -\end_layout - -\begin_layout Standard -There are is a spectrum of possible solutions for external fragmentation: - one is to use a fragmentation-avoiding allocation strategy such as best-fit - address-order allocator. - The other end of the spectrum would be to use a bump allocator (very fast - and simple) and simply repack the file when we reach the end. -\end_layout - -\begin_layout Standard -There are three problems with efficient fragmentation-avoiding allocators: - they are non-trivial, they tend to use a single free list for each size, - and there's no evidence that tdb allocation patterns will match those recorded - for general allocators (though it seems likely). -\end_layout - -\begin_layout Standard -Thus we don't spend too much effort on external fragmentation; we will be - no worse than the current code if we need to repack on occasion. - More effort is spent on reducing freelist contention, and reducing overhead. -\end_layout - -\begin_layout Subsection -\begin_inset CommandInset label -LatexCommand label -name "sub:Records-Incur-A" - -\end_inset - -Records Incur A 28-Byte Overhead -\end_layout - -\begin_layout Standard -Each TDB record has a header as follows: -\end_layout - -\begin_layout LyX-Code -struct tdb_record { -\end_layout - -\begin_layout LyX-Code - tdb_off_t next; /* offset of the next record in the list */ -\end_layout - -\begin_layout LyX-Code - tdb_len_t rec_len; /* total byte length of record */ -\end_layout - -\begin_layout LyX-Code - tdb_len_t key_len; /* byte length of key */ -\end_layout - -\begin_layout LyX-Code - tdb_len_t data_len; /* byte length of data */ -\end_layout - -\begin_layout LyX-Code - uint32_t full_hash; /* the full 32 bit hash of the key */ -\end_layout - -\begin_layout LyX-Code - uint32_t magic; /* try to catch errors */ -\end_layout - -\begin_layout LyX-Code - /* the following union is implied: -\end_layout - -\begin_layout LyX-Code - union { -\end_layout - -\begin_layout LyX-Code - char record[rec_len]; -\end_layout - -\begin_layout LyX-Code - struct { -\end_layout - -\begin_layout LyX-Code - char key[key_len]; -\end_layout - -\begin_layout LyX-Code - char data[data_len]; -\end_layout - -\begin_layout LyX-Code - } -\end_layout - -\begin_layout LyX-Code - uint32_t totalsize; (tailer) -\end_layout - -\begin_layout LyX-Code - } -\end_layout - -\begin_layout LyX-Code - */ -\end_layout - -\begin_layout LyX-Code -}; -\end_layout - -\begin_layout Standard -Naively, this would double to a 56-byte overhead on a 64 bit implementation. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -We can use various techniques to reduce this for an allocated block: -\end_layout - -\begin_layout Enumerate -The 'next' pointer is not required, as we are using a flat hash table. -\end_layout - -\begin_layout Enumerate -'rec_len' can instead be expressed as an addition to key_len and data_len - (it accounts for wasted or overallocated length in the record). 
- Since the record length is always a multiple of 8, we can conveniently - fit it in 32 bits (representing up to 35 bits). -\end_layout - -\begin_layout Enumerate -'key_len' and 'data_len' can be reduced. - I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine - the two into one 64-bit field and using a 5 bit value which indicates at - what bit to divide the two. - Keys are unlikely to scale as fast as data, so I'm assuming a maximum key - size of 32 bits. -\end_layout - -\begin_layout Enumerate -'full_hash' is used to avoid a memcmp on the -\begin_inset Quotes eld -\end_inset - -miss -\begin_inset Quotes erd -\end_inset - - case, but this is diminishing returns after a handful of bits (at 10 bits, - it reduces 99.9% of false memcmp). - As an aside, as the lower bits are already incorporated in the hash table - resolution, the upper bits should be used here. - Note that it's not clear that these bits will be a win, given the extra - bits in the hash table itself (see -\begin_inset CommandInset ref -LatexCommand ref -reference "sub:Hash-Size-Solution" - -\end_inset - -). -\end_layout - -\begin_layout Enumerate -'magic' does not need to be enlarged: it currently reflects one of 5 values - (used, free, dead, recovery, and unused_recovery). - It is useful for quick sanity checking however, and should not be eliminated. -\end_layout - -\begin_layout Enumerate -'tailer' is only used to coalesce free blocks (so a block to the right can - find the header to check if this block is free). - This can be replaced by a single 'free' bit in the header of the following - block (and the tailer only exists in free blocks). -\begin_inset Foot -status collapsed - -\begin_layout Plain Layout -This technique from Thomas Standish. - Data Structure Techniques. - Addison-Wesley, Reading, Massachusetts, 1980. -\end_layout - -\end_inset - - The current proposed coalescing algorithm doesn't need this, however. -\end_layout - -\begin_layout Standard -This produces a 16 byte used header like this: -\end_layout - -\begin_layout LyX-Code -struct tdb_used_record { -\end_layout - -\begin_layout LyX-Code - uint32_t used_magic : 16, -\end_layout - -\begin_layout LyX-Code - -\end_layout - -\begin_layout LyX-Code - key_data_divide: 5, -\end_layout - -\begin_layout LyX-Code - top_hash: 11; -\end_layout - -\begin_layout LyX-Code - uint32_t extra_octets; -\end_layout - -\begin_layout LyX-Code - uint64_t key_and_data_len; -\end_layout - -\begin_layout LyX-Code -}; -\end_layout - -\begin_layout Standard -And a free record like this: -\end_layout - -\begin_layout LyX-Code -struct tdb_free_record { -\end_layout - -\begin_layout LyX-Code - uint64_t free_magic: 8, -\end_layout - -\begin_layout LyX-Code - prev : 56; -\end_layout - -\begin_layout LyX-Code - -\end_layout - -\begin_layout LyX-Code - uint64_t free_table: 8, -\end_layout - -\begin_layout LyX-Code - total_length : 56 -\end_layout - -\begin_layout LyX-Code - uint64_t next;; -\end_layout - -\begin_layout LyX-Code -}; -\end_layout - -\begin_layout Standard -Note that by limiting valid offsets to 56 bits, we can pack everything we - need into 3 64-byte words, meaning our minimum record size is 8 bytes. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. 
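\end_layout

\begin_layout Standard
To make the combined length field concrete, one possible decoding is shown
 below. This is a sketch only: the accessor names and the direction of the
 split are assumptions, and the real accessors may differ.
\end_layout

\begin_layout LyX-Code
/* key_data_divide gives the bit at which key length ends and data
\end_layout

\begin_layout LyX-Code
   length begins within the single 64-bit key_and_data_len word. */
\end_layout

\begin_layout LyX-Code
static uint64_t key_length(const struct tdb_used_record *r)
\end_layout

\begin_layout LyX-Code
{
\end_layout

\begin_layout LyX-Code
        return r->key_and_data_len & ((1ULL << r->key_data_divide) - 1);
\end_layout

\begin_layout LyX-Code
}
\end_layout

\begin_layout LyX-Code
 
\end_layout

\begin_layout LyX-Code
static uint64_t data_length(const struct tdb_used_record *r)
\end_layout

\begin_layout LyX-Code
{
\end_layout

\begin_layout LyX-Code
        return r->key_and_data_len >> r->key_data_divide;
\end_layout

\begin_layout LyX-Code
}
\end_layout

\begin_layout Standard
However the split is chosen, both lengths share one 64-bit word, which is
 what keeps the used-record header down to 16 bytes.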
-\end_layout - -\begin_layout Subsection -Transaction Commit Requires 4 fdatasync -\end_layout - -\begin_layout Standard -The current transaction algorithm is: -\end_layout - -\begin_layout Enumerate -write_recovery_data(); -\end_layout - -\begin_layout Enumerate -sync(); -\end_layout - -\begin_layout Enumerate -write_recovery_header(); -\end_layout - -\begin_layout Enumerate -sync(); -\end_layout - -\begin_layout Enumerate -overwrite_with_new_data(); -\end_layout - -\begin_layout Enumerate -sync(); -\end_layout - -\begin_layout Enumerate -remove_recovery_header(); -\end_layout - -\begin_layout Enumerate -sync(); -\end_layout - -\begin_layout Standard -On current ext3, each sync flushes all data to disk, so the next 3 syncs - are relatively expensive. - But this could become a performance bottleneck on other filesystems such - as ext4. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -Neil Brown points out that this is overzealous, and only one sync is needed: -\end_layout - -\begin_layout Enumerate -Bundle the recovery data, a transaction counter and a strong checksum of - the new data. -\end_layout - -\begin_layout Enumerate -Strong checksum that whole bundle. -\end_layout - -\begin_layout Enumerate -Store the bundle in the database. -\end_layout - -\begin_layout Enumerate -Overwrite the oldest of the two recovery pointers in the header (identified - using the transaction counter) with the offset of this bundle. -\end_layout - -\begin_layout Enumerate -sync. -\end_layout - -\begin_layout Enumerate -Write the new data to the file. -\end_layout - -\begin_layout Standard -Checking for recovery means identifying the latest bundle with a valid checksum - and using the new data checksum to ensure that it has been applied. - This is more expensive than the current check, but need only be done at - open. - For running databases, a separate header field can be used to indicate - a transaction in progress; we need only check for recovery if this is set. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Deferred. -\end_layout - -\begin_layout Subsection -\begin_inset CommandInset label -LatexCommand label -name "sub:TDB-Does-Not" - -\end_inset - -TDB Does Not Have Snapshot Support -\end_layout - -\begin_layout Subsubsection -Proposed SolutionNone. - At some point you say -\begin_inset Quotes eld -\end_inset - -use a real database -\begin_inset Quotes erd -\end_inset - - (but see -\begin_inset CommandInset ref -LatexCommand ref -reference "replay-attribute" - -\end_inset - -). -\end_layout - -\begin_layout Standard -But as a thought experiment, if we implemented transactions to only overwrite - free entries (this is tricky: there must not be a header in each entry - which indicates whether it is free, but use of presence in metadata elsewhere), - and a pointer to the hash table, we could create an entirely new commit - without destroying existing data. - Then it would be easy to implement snapshots in a similar way. -\end_layout - -\begin_layout Standard -This would not allow arbitrary changes to the database, such as tdb_repack - does, and would require more space (since we have to preserve the current - and future entries at once). - If we used hash trees rather than one big hash table, we might only have - to rewrite some sections of the hash, too. -\end_layout - -\begin_layout Standard -We could then implement snapshots using a similar method, using multiple - different hash tables/free tables. 
-\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Deferred. -\end_layout - -\begin_layout Subsection -Transactions Cannot Operate in Parallel -\end_layout - -\begin_layout Standard -This would be useless for ldb, as it hits the index records with just about - every update. - It would add significant complexity in resolving clashes, and cause the - all transaction callers to write their code to loop in the case where the - transactions spuriously failed. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -None (but see -\begin_inset CommandInset ref -LatexCommand ref -reference "replay-attribute" - -\end_inset - -). - We could solve a small part of the problem by providing read-only transactions. - These would allow one write transaction to begin, but it could not commit - until all r/o transactions are done. - This would require a new RO_TRANSACTION_LOCK, which would be upgraded on - commit. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Deferred. -\end_layout - -\begin_layout Subsection -Default Hash Function Is Suboptimal -\end_layout - -\begin_layout Standard -The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially - if we expand it to 64 bits), and works best when the hash bucket size is - a prime number (which also means a slow modulus). - In addition, it is highly predictable which could potentially lead to a - Denial of Service attack in some TDB uses. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -The Jenkins lookup3 hash -\begin_inset Foot -status open - -\begin_layout Plain Layout -http://burtleburtle.net/bob/c/lookup3.c -\end_layout - -\end_inset - - is a fast and superbly-mixing hash. - It's used by the Linux kernel and almost everything else. - This has the particular properties that it takes an initial seed, and produces - two 32 bit hash numbers, which we can combine into a 64-bit hash. -\end_layout - -\begin_layout Standard -The seed should be created at tdb-creation time from some random source, - and placed in the header. - This is far from foolproof, but adds a little bit of protection against - hash bombing. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -\begin_inset CommandInset label -LatexCommand label -name "Reliable-Traversal-Adds" - -\end_inset - -Reliable Traversal Adds Complexity -\end_layout - -\begin_layout Standard -We lock a record during traversal iteration, and try to grab that lock in - the delete code. - If that grab on delete fails, we simply mark it deleted and continue onwards; - traversal checks for this condition and does the delete when it moves off - the record. -\end_layout - -\begin_layout Standard -If traversal terminates, the dead record may be left indefinitely. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -Remove reliability guarantees; see -\begin_inset CommandInset ref -LatexCommand ref -reference "traverse-Proposed-Solution" - -\end_inset - -. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -Fcntl Locking Adds Overhead -\end_layout - -\begin_layout Standard -Placing a fcntl lock means a system call, as does removing one. 
- This is actually one reason why transactions can be faster (everything - is locked once at transaction start). - In the uncontended case, this overhead can theoretically be eliminated. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -None. -\end_layout - -\begin_layout Standard -We tried this before with spinlock support, in the early days of TDB, and - it didn't make much difference except in manufactured benchmarks. -\end_layout - -\begin_layout Standard -We could use spinlocks (with futex kernel support under Linux), but it means - that we lose automatic cleanup when a process dies with a lock. - There is a method of auto-cleanup under Linux, but it's not supported by - other operating systems. - We could reintroduce a clear-if-first-style lock and sweep for dead futexes - on open, but that wouldn't help the normal case of one concurrent opener - dying. - Increasingly elaborate repair schemes could be considered, but they require - an ABI change (everyone must use them) anyway, so there's no need to do - this at the same time as everything else. -\end_layout - -\begin_layout Subsection -Some Transactions Don't Require Durability -\end_layout - -\begin_layout Standard -Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast) - usage, and occasionally empties the results into a transactional TDB. - This kind of usage prioritizes performance over durability: as long as - we are consistent, data can be lost. -\end_layout - -\begin_layout Standard -This would be more neatly implemented inside tdb: a -\begin_inset Quotes eld -\end_inset - -soft -\begin_inset Quotes erd -\end_inset - - transaction commit (ie. - syncless) which meant that data may be reverted on a crash. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -None. -\end_layout - -\begin_layout Standard -Unfortunately any transaction scheme which overwrites old data requires - a sync before that overwrite to avoid the possibility of corruption. -\end_layout - -\begin_layout Standard -It seems possible to use a scheme similar to that described in -\begin_inset CommandInset ref -LatexCommand ref -reference "sub:TDB-Does-Not" - -\end_inset - -,where transactions are committed without overwriting existing data, and - an array of top-level pointers were available in the header. - If the transaction is -\begin_inset Quotes eld -\end_inset - -soft -\begin_inset Quotes erd -\end_inset - - then we would not need a sync at all: existing processes would pick up - the new hash table and free list and work with that. -\end_layout - -\begin_layout Standard -At some later point, a sync would allow recovery of the old data into the - free lists (perhaps when the array of top-level pointers filled). - On crash, tdb_open() would examine the array of top levels, and apply the - transactions until it encountered an invalid checksum. -\end_layout - -\begin_layout Subsection -Tracing Is Fragile, Replay Is External -\end_layout - -\begin_layout Standard -The current TDB has compile-time-enabled tracing code, but it often breaks - as it is not enabled by default. - In a similar way, the ctdb code has an external wrapper which does replay - tracing so it can coordinate cluster-wide transactions. 
-\end_layout - -\begin_layout Subsubsection -Proposed Solution -\begin_inset CommandInset label -LatexCommand label -name "replay-attribute" - -\end_inset - - -\end_layout - -\begin_layout Standard -Tridge points out that an attribute can be later added to tdb_open (see - -\begin_inset CommandInset ref -LatexCommand ref -reference "attributes" - -\end_inset - -) to provide replay/trace hooks, which could become the basis for this and - future parallel transactions and snapshot support. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Deferred. -\end_layout - -\end_body -\end_document diff --git a/ccan/tdb2/doc/design.lyx,v b/ccan/tdb2/doc/design.lyx,v deleted file mode 100644 index 68e5ed27..00000000 --- a/ccan/tdb2/doc/design.lyx,v +++ /dev/null @@ -1,4679 +0,0 @@ -head 1.13; -access; -symbols; -locks; strict; -comment @# @; - - -1.13 -date 2011.03.01.11.46.54; author rusty; state Exp; -branches; -next 1.12; - -1.12 -date 2010.12.01.12.20.49; author rusty; state Exp; -branches; -next 1.11; - -1.11 -date 2010.12.01.11.55.20; author rusty; state Exp; -branches; -next 1.10; - -1.10 -date 2010.09.14.00.33.57; author rusty; state Exp; -branches; -next 1.9; - -1.9 -date 2010.09.09.07.25.12; author rusty; state Exp; -branches; -next 1.8; - -1.8 -date 2010.09.02.02.29.05; author rusty; state Exp; -branches; -next 1.7; - -1.7 -date 2010.09.01.10.58.12; author rusty; state Exp; -branches; -next 1.6; - -1.6 -date 2010.08.02.00.21.43; author rusty; state Exp; -branches; -next 1.5; - -1.5 -date 2010.08.02.00.21.16; author rusty; state Exp; -branches; -next 1.4; - -1.4 -date 2010.05.10.13.09.11; author rusty; state Exp; -branches; -next 1.3; - -1.3 -date 2010.05.10.11.58.37; author rusty; state Exp; -branches; -next 1.2; - -1.2 -date 2010.05.10.05.35.13; author rusty; state Exp; -branches; -next 1.1; - -1.1 -date 2010.05.04.02.29.16; author rusty; state Exp; -branches; -next ; - - -desc -@First draft -@ - - -1.13 -log -@Thread-safe API -@ -text -@#LyX 1.6.7 created this file. For more info see http://www.lyx.org/ -\lyxformat 345 -\begin_document -\begin_header -\textclass article -\use_default_options true -\language english -\inputencoding auto -\font_roman default -\font_sans default -\font_typewriter default -\font_default_family default -\font_sc false -\font_osf false -\font_sf_scale 100 -\font_tt_scale 100 - -\graphics default -\paperfontsize default -\use_hyperref false -\papersize default -\use_geometry false -\use_amsmath 1 -\use_esint 1 -\cite_engine basic -\use_bibtopic false -\paperorientation portrait -\secnumdepth 3 -\tocdepth 3 -\paragraph_separation indent -\defskip medskip -\quotes_language english -\papercolumns 1 -\papersides 1 -\paperpagestyle default -\tracking_changes true -\output_changes true -\author "Rusty Russell,,," -\author "" -\end_header - -\begin_body - -\begin_layout Title -TDB2: A Redesigning The Trivial DataBase -\end_layout - -\begin_layout Author -Rusty Russell, IBM Corporation -\end_layout - -\begin_layout Date -1-December-2010 -\end_layout - -\begin_layout Abstract -The Trivial DataBase on-disk format is 32 bits; with usage cases heading - towards the 4G limit, that must change. - This required breakage provides an opportunity to revisit TDB's other design - decisions and reassess them. 
-\end_layout - -\begin_layout Section -Introduction -\end_layout - -\begin_layout Standard -The Trivial DataBase was originally written by Andrew Tridgell as a simple - key/data pair storage system with the same API as dbm, but allowing multiple - readers and writers while being small enough (< 1000 lines of C) to include - in SAMBA. - The simple design created in 1999 has proven surprisingly robust and performant -, used in Samba versions 3 and 4 as well as numerous other projects. - Its useful life was greatly increased by the (backwards-compatible!) addition - of transaction support in 2005. -\end_layout - -\begin_layout Standard -The wider variety and greater demands of TDB-using code has lead to some - organic growth of the API, as well as some compromises on the implementation. - None of these, by themselves, are seen as show-stoppers, but the cumulative - effect is to a loss of elegance over the initial, simple TDB implementation. - Here is a table of the approximate number of lines of implementation code - and number of API functions at the end of each year: -\end_layout - -\begin_layout Standard -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\begin_layout Plain Layout -Year End -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -API Functions -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -Lines of C Code Implementation -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -1999 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -13 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -1195 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2000 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -24 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -1725 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2001 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -32 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -2228 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2002 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -35 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -2481 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2003 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -35 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -2552 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2004 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -40 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -2584 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2005 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -38 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -2647 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2006 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -52 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -3754 -\end_layout - -\end_inset - - - - -\begin_inset 
Text - -\begin_layout Plain Layout -2007 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -66 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -4398 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2008 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -71 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -4768 -\end_layout - -\end_inset - - - - -\begin_inset Text - -\begin_layout Plain Layout -2009 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -73 -\end_layout - -\end_inset - - -\begin_inset Text - -\begin_layout Plain Layout -5715 -\end_layout - -\end_inset - - - - -\end_inset - - -\end_layout - -\begin_layout Standard -This review is an attempt to catalog and address all the known issues with - TDB and create solutions which address the problems without significantly - increasing complexity; all involved are far too aware of the dangers of - second system syndrome in rewriting a successful project like this. -\end_layout - -\begin_layout Section -API Issues -\end_layout - -\begin_layout Subsection -tdb_open_ex Is Not Expandable -\end_layout - -\begin_layout Standard -The tdb_open() call was expanded to tdb_open_ex(), which added an optional - hashing function and an optional logging function argument. - Additional arguments to open would require the introduction of a tdb_open_ex2 - call etc. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\begin_inset CommandInset label -LatexCommand label -name "attributes" - -\end_inset - - -\end_layout - -\begin_layout Standard -tdb_open() will take a linked-list of attributes: -\end_layout - -\begin_layout LyX-Code -enum tdb_attribute { -\end_layout - -\begin_layout LyX-Code - TDB_ATTRIBUTE_LOG = 0, -\end_layout - -\begin_layout LyX-Code - TDB_ATTRIBUTE_HASH = 1 -\end_layout - -\begin_layout LyX-Code -}; -\end_layout - -\begin_layout LyX-Code -struct tdb_attribute_base { -\end_layout - -\begin_layout LyX-Code - enum tdb_attribute attr; -\end_layout - -\begin_layout LyX-Code - union tdb_attribute *next; -\end_layout - -\begin_layout LyX-Code -}; -\end_layout - -\begin_layout LyX-Code -struct tdb_attribute_log { -\end_layout - -\begin_layout LyX-Code - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */ -\end_layout - -\begin_layout LyX-Code - tdb_log_func log_fn; -\end_layout - -\begin_layout LyX-Code - void *log_private; -\end_layout - -\begin_layout LyX-Code -}; -\end_layout - -\begin_layout LyX-Code -struct tdb_attribute_hash { -\end_layout - -\begin_layout LyX-Code - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */ -\end_layout - -\begin_layout LyX-Code - tdb_hash_func hash_fn; -\end_layout - -\begin_layout LyX-Code - void *hash_private; -\end_layout - -\begin_layout LyX-Code -}; -\end_layout - -\begin_layout LyX-Code -union tdb_attribute { -\end_layout - -\begin_layout LyX-Code - struct tdb_attribute_base base; -\end_layout - -\begin_layout LyX-Code - struct tdb_attribute_log log; -\end_layout - -\begin_layout LyX-Code - struct tdb_attribute_hash hash; -\end_layout - -\begin_layout LyX-Code -}; -\end_layout - -\begin_layout Standard -This allows future attributes to be added, even if this expands the size - of the union. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. 
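-\end_layout
-
-\begin_layout Standard
-As a usage sketch (assuming the attribute list is passed as the final
- argument to tdb_open(), and with my_log_fn and my_hash_fn standing in
- for caller-supplied functions), chaining a logging and a hashing attribute
- might look like this:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_context *tdb;
-\end_layout
-
-\begin_layout LyX-Code
-union tdb_attribute log_attr, hash_attr;
-\end_layout
-
-\begin_layout LyX-Code
-log_attr.base.attr = TDB_ATTRIBUTE_LOG;
-\end_layout
-
-\begin_layout LyX-Code
-log_attr.base.next = &hash_attr;
-\end_layout
-
-\begin_layout LyX-Code
-log_attr.log.log_fn = my_log_fn;         /* caller-supplied */
-\end_layout
-
-\begin_layout LyX-Code
-log_attr.log.log_private = NULL;
-\end_layout
-
-\begin_layout LyX-Code
-hash_attr.base.attr = TDB_ATTRIBUTE_HASH;
-\end_layout
-
-\begin_layout LyX-Code
-hash_attr.base.next = NULL;
-\end_layout
-
-\begin_layout LyX-Code
-hash_attr.hash.hash_fn = my_hash_fn;     /* caller-supplied */
-\end_layout
-
-\begin_layout LyX-Code
-hash_attr.hash.hash_private = NULL;
-\end_layout
-
-\begin_layout LyX-Code
-tdb = tdb_open("example.tdb", TDB_DEFAULT, O_RDWR|O_CREAT, 0600, &log_attr);
-\end_layout
-
-\begin_layout Standard
-Because each element carries its own type tag and the list is passed by
- pointer, new attribute types can be added later without changing the
- signature of tdb_open().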
-\end_layout - -\begin_layout Subsection -tdb_traverse Makes Impossible Guarantees -\end_layout - -\begin_layout Standard -tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it - was thought that it was important to guarantee that all records which exist - at the start and end of the traversal would be included, and no record - would be included twice. -\end_layout - -\begin_layout Standard -This adds complexity (see -\begin_inset CommandInset ref -LatexCommand ref -reference "Reliable-Traversal-Adds" - -\end_inset - -) and does not work anyway for records which are altered (in particular, - those which are expanded may be effectively deleted and re-added behind - the traversal). -\end_layout - -\begin_layout Subsubsection -\begin_inset CommandInset label -LatexCommand label -name "traverse-Proposed-Solution" - -\end_inset - -Proposed Solution -\end_layout - -\begin_layout Standard -Abandon the guarantee. - You will see every record if no changes occur during your traversal, otherwise - you will see some subset. - You can prevent changes by using a transaction or the locking API. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. - Delete-during-traverse will still delete every record, too (assuming no - other changes). -\end_layout - -\begin_layout Subsection -Nesting of Transactions Is Fraught -\end_layout - -\begin_layout Standard -TDB has alternated between allowing nested transactions and not allowing - them. - Various paths in the Samba codebase assume that transactions will nest, - and in a sense they can: the operation is only committed to disk when the - outer transaction is committed. - There are two problems, however: -\end_layout - -\begin_layout Enumerate -Canceling the inner transaction will cause the outer transaction commit - to fail, and will not undo any operations since the inner transaction began. - This problem is soluble with some additional internal code. -\end_layout - -\begin_layout Enumerate -An inner transaction commit can be cancelled by the outer transaction. - This is desirable in the way which Samba's database initialization code - uses transactions, but could be a surprise to any users expecting a successful - transaction commit to expose changes to others. -\end_layout - -\begin_layout Standard -The current solution is to specify the behavior at tdb_open(), with the - default currently that nested transactions are allowed. - This flag can also be changed at runtime. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -Given the usage patterns, it seems that the -\begin_inset Quotes eld -\end_inset - -least-surprise -\begin_inset Quotes erd -\end_inset - - behavior of disallowing nested transactions should become the default. - Additionally, it seems the outer transaction is the only code which knows - whether inner transactions should be allowed, so a flag to indicate this - could be added to tdb_transaction_start. - However, this behavior can be simulated with a wrapper which uses tdb_add_flags -() and tdb_remove_flags(), so the API should not be expanded for this relatively --obscure case. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard - -\change_deleted 0 1298979572 -Incomplete; nesting flag is still defined as per tdb1. -\change_inserted 0 1298979584 -Complete; the nesting flag has been removed. 
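-\change_unchanged
-
-\end_layout
-
-\begin_layout Standard
-For the tdb1-era interface just discussed, the wrapper simulation might
- look like the sketch below.
- TDB_ALLOW_NESTING is the tdb1 flag name and is an assumption here; the
- new design removes the caller-visible flag entirely.
-\end_layout
-
-\begin_layout LyX-Code
-/* Sketch: the outer caller decides whether nesting is allowed. */
-\end_layout
-
-\begin_layout LyX-Code
-static int transaction_start_ex(struct tdb_context *tdb, bool allow_nesting)
-\end_layout
-
-\begin_layout LyX-Code
-{
-\end_layout
-
-\begin_layout LyX-Code
-        if (allow_nesting)
-\end_layout
-
-\begin_layout LyX-Code
-                tdb_add_flags(tdb, TDB_ALLOW_NESTING);
-\end_layout
-
-\begin_layout LyX-Code
-        else
-\end_layout
-
-\begin_layout LyX-Code
-                tdb_remove_flags(tdb, TDB_ALLOW_NESTING);
-\end_layout
-
-\begin_layout LyX-Code
-        return tdb_transaction_start(tdb);
-\end_layout
-
-\begin_layout LyX-Code
-}
-\end_layout
-
-\begin_layout Standard
-This keeps the policy with the outer transaction, which is the only code
- that knows whether nesting should be permitted, without widening the
- library API.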
-\change_unchanged - -\end_layout - -\begin_layout Subsection -Incorrect Hash Function is Not Detected -\end_layout - -\begin_layout Standard -tdb_open_ex() allows the calling code to specify a different hash function - to use, but does not check that all other processes accessing this tdb - are using the same hash function. - The result is that records are missing from tdb_fetch(). -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -The header should contain an example hash result (eg. - the hash of 0xdeadbeef), and tdb_open_ex() should check that the given - hash function produces the same answer, or fail the tdb_open call. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -tdb_set_max_dead/TDB_VOLATILE Expose Implementation -\end_layout - -\begin_layout Standard -In response to scalability issues with the free list ( -\begin_inset CommandInset ref -LatexCommand ref -reference "TDB-Freelist-Is" - -\end_inset - -) two API workarounds have been incorporated in TDB: tdb_set_max_dead() - and the TDB_VOLATILE flag to tdb_open. - The latter actually calls the former with an argument of -\begin_inset Quotes eld -\end_inset - -5 -\begin_inset Quotes erd -\end_inset - -. -\end_layout - -\begin_layout Standard -This code allows deleted records to accumulate without putting them in the - free list. - On delete we iterate through each chain and free them in a batch if there - are more than max_dead entries. - These are never otherwise recycled except as a side-effect of a tdb_repack. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -With the scalability problems of the freelist solved, this API can be removed. - The TDB_VOLATILE flag may still be useful as a hint that store and delete - of records will be at least as common as fetch in order to allow some internal - tuning, but initially will become a no-op. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Incomplete. - TDB_VOLATILE still defined, but implementation should fail on unknown flags - to be future-proof. -\end_layout - -\begin_layout Subsection -\begin_inset CommandInset label -LatexCommand label -name "TDB-Files-Cannot" - -\end_inset - -TDB Files Cannot Be Opened Multiple Times In The Same Process -\end_layout - -\begin_layout Standard -No process can open the same TDB twice; we check and disallow it. - This is an unfortunate side-effect of fcntl locks, which operate on a per-file - rather than per-file-descriptor basis, and do not nest. - Thus, closing any file descriptor on a file clears all the locks obtained - by this process, even if they were placed using a different file descriptor! -\end_layout - -\begin_layout Standard -Note that even if this were solved, deadlock could occur if operations were - nested: this is a more manageable programming error in most cases. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -We could lobby POSIX to fix the perverse rules, or at least lobby Linux - to violate them so that the most common implementation does not have this - restriction. - This would be a generally good idea for other fcntl lock users. -\end_layout - -\begin_layout Standard -Samba uses a wrapper which hands out the same tdb_context to multiple callers - if this happens, and does simple reference counting. 
- We should do this inside the tdb library, which already emulates lock nesting - internally; it would need to recognize when deadlock occurs within a single - process. - This would create a new failure mode for tdb operations (while we currently - handle locking failures, they are impossible in normal use and a process - encountering them can do little but give up). -\end_layout - -\begin_layout Standard -I do not see benefit in an additional tdb_open flag to indicate whether - re-opening is allowed, as though there may be some benefit to adding a - call to detect when a tdb_context is shared, to allow other to create such - an API. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Incomplete. -\end_layout - -\begin_layout Subsection -TDB API Is Not POSIX Thread-safe -\end_layout - -\begin_layout Standard -The TDB API uses an error code which can be queried after an operation to - determine what went wrong. - This programming model does not work with threads, unless specific additional - guarantees are given by the implementation. - In addition, even otherwise-independent threads cannot open the same TDB - (as in -\begin_inset CommandInset ref -LatexCommand ref -reference "TDB-Files-Cannot" - -\end_inset - -). -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -Reachitecting the API to include a tdb_errcode pointer would be a great - deal of churn -\change_inserted 0 1298979557 -, but fortunately most functions return 0 on success and -1 on error: we - can change these to return 0 on success and a negative error code on error, - and the API remains similar to previous. - The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA - pointer and return an error code. - It is also simpler to have tdb_nextkey replace its key argument in place, - freeing up any old .dptr. -\end_layout - -\begin_layout Standard - -\change_deleted 0 1298979438 -; we are better to guarantee that the tdb_errcode is per-thread so the current - programming model can be maintained. -\end_layout - -\begin_layout Standard - -\change_deleted 0 1298979438 -This requires dynamic per-thread allocations, which is awkward with POSIX - threads (pthread_key_create space is limited and we cannot simply allocate - a key for every TDB). -\change_unchanged - -\end_layout - -\begin_layout Standard -Internal locking is required to make sure that fcntl locks do not overlap - between threads, and also that the global list of tdbs is maintained. -\end_layout - -\begin_layout Standard -The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe - version of the library, and otherwise no overhead will exist. - Alternatively, a hooking mechanism similar to that proposed for -\begin_inset CommandInset ref -LatexCommand ref -reference "Proposed-Solution-locking-hook" - -\end_inset - - could be used to enable pthread locking at runtime. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Incomplete -\change_inserted 0 1298979681 -; API has been changed but thread safety has not been implemented. -\change_deleted 0 1298979669 -. 
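-\change_unchanged
-
-\end_layout
-
-\begin_layout Standard
-As an illustration of the error-code style described in this section,
- a traversal with the changed API might look like the following sketch;
- TDB_SUCCESS is the zero success value, and the end-of-iteration code is
- assumed here to be TDB_ERR_NOEXIST.
-\end_layout
-
-\begin_layout LyX-Code
-TDB_DATA key, data;
-\end_layout
-
-\begin_layout LyX-Code
-enum TDB_ERROR err;
-\end_layout
-
-\begin_layout LyX-Code
-for (err = tdb_firstkey(tdb, &key);
-\end_layout
-
-\begin_layout LyX-Code
-     err == TDB_SUCCESS;
-\end_layout
-
-\begin_layout LyX-Code
-     err = tdb_nextkey(tdb, &key)) {
-\end_layout
-
-\begin_layout LyX-Code
-        if (tdb_fetch(tdb, key, &data) != TDB_SUCCESS)
-\end_layout
-
-\begin_layout LyX-Code
-                continue;
-\end_layout
-
-\begin_layout LyX-Code
-        /* ... use data ... */
-\end_layout
-
-\begin_layout LyX-Code
-        free(data.dptr);
-\end_layout
-
-\begin_layout LyX-Code
-}
-\end_layout
-
-\begin_layout LyX-Code
-if (err != TDB_ERR_NOEXIST)
-\end_layout
-
-\begin_layout LyX-Code
-        fprintf(stderr, "traverse: %s\n", tdb_errorstr(err));
-\end_layout
-
-\begin_layout Standard
-Checking the return value directly replaces querying a per-context error
- code after the fact, so no per-thread error state is needed.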
-\change_unchanged - -\end_layout - -\begin_layout Subsection -*_nonblock Functions And *_mark Functions Expose Implementation -\end_layout - -\begin_layout Standard -CTDB -\begin_inset Foot -status collapsed - -\begin_layout Plain Layout -Clustered TDB, see http://ctdb.samba.org -\end_layout - -\end_inset - - wishes to operate on TDB in a non-blocking manner. - This is currently done as follows: -\end_layout - -\begin_layout Enumerate -Call the _nonblock variant of an API function (eg. - tdb_lockall_nonblock). - If this fails: -\end_layout - -\begin_layout Enumerate -Fork a child process, and wait for it to call the normal variant (eg. - tdb_lockall). -\end_layout - -\begin_layout Enumerate -If the child succeeds, call the _mark variant to indicate we already have - the locks (eg. - tdb_lockall_mark). -\end_layout - -\begin_layout Enumerate -Upon completion, tell the child to release the locks (eg. - tdb_unlockall). -\end_layout - -\begin_layout Enumerate -Indicate to tdb that it should consider the locks removed (eg. - tdb_unlockall_mark). -\end_layout - -\begin_layout Standard -There are several issues with this approach. - Firstly, adding two new variants of each function clutters the API for - an obscure use, and so not all functions have three variants. - Secondly, it assumes that all paths of the functions ask for the same locks, - otherwise the parent process will have to get a lock which the child doesn't - have under some circumstances. - I don't believe this is currently the case, but it constrains the implementatio -n. - -\end_layout - -\begin_layout Subsubsection -\begin_inset CommandInset label -LatexCommand label -name "Proposed-Solution-locking-hook" - -\end_inset - -Proposed Solution -\end_layout - -\begin_layout Standard -Implement a hook for locking methods, so that the caller can control the - calls to create and remove fcntl locks. - In this scenario, ctdbd would operate as follows: -\end_layout - -\begin_layout Enumerate -Call the normal API function, eg tdb_lockall(). -\end_layout - -\begin_layout Enumerate -When the lock callback comes in, check if the child has the lock. - Initially, this is always false. - If so, return 0. - Otherwise, try to obtain it in non-blocking mode. - If that fails, return EWOULDBLOCK. -\end_layout - -\begin_layout Enumerate -Release locks in the unlock callback as normal. -\end_layout - -\begin_layout Enumerate -If tdb_lockall() fails, see if we recorded a lock failure; if so, call the - child to repeat the operation. -\end_layout - -\begin_layout Enumerate -The child records what locks it obtains, and returns that information to - the parent. -\end_layout - -\begin_layout Enumerate -When the child has succeeded, goto 1. -\end_layout - -\begin_layout Standard -This is flexible enough to handle any potential locking scenario, even when - lock requirements change. - It can be optimized so that the parent does not release locks, just tells - the child which locks it doesn't need to obtain. -\end_layout - -\begin_layout Standard -It also keeps the complexity out of the API, and in ctdbd where it is needed. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Incomplete. -\end_layout - -\begin_layout Subsection -tdb_chainlock Functions Expose Implementation -\end_layout - -\begin_layout Standard -tdb_chainlock locks some number of records, including the record indicated - by the given key. 
- This gave atomicity guarantees; no-one can start a transaction, alter, - read or delete that key while the lock is held. -\end_layout - -\begin_layout Standard -It also makes the same guarantee for any other key in the chain, which is - an internal implementation detail and potentially a cause for deadlock. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -None. - It would be nice to have an explicit single entry lock which effected no - other keys. - Unfortunately, this won't work for an entry which doesn't exist. - Thus while chainlock may be implemented more efficiently for the existing - case, it will still have overlap issues with the non-existing case. - So it is best to keep the current (lack of) guarantee about which records - will be effected to avoid constraining our implementation. -\end_layout - -\begin_layout Subsection -Signal Handling is Not Race-Free -\end_layout - -\begin_layout Standard -The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate - that the tdb locking code should return with a failure, rather than trying - again when a signal is received (and errno == EAGAIN). - This is usually used to implement timeouts. -\end_layout - -\begin_layout Standard -Unfortunately, this does not work in the case where the signal is received - before the tdb code enters the fcntl() call to place the lock: the code - will sleep within the fcntl() code, unaware that the signal wants it to - exit. - In the case of long timeouts, this does not happen in practice. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -The locking hooks proposed in -\begin_inset CommandInset ref -LatexCommand ref -reference "Proposed-Solution-locking-hook" - -\end_inset - - would allow the user to decide on whether to fail the lock acquisition - on a signal. - This allows the caller to choose their own compromise: they could narrow - the race by checking immediately before the fcntl call. -\begin_inset Foot -status collapsed - -\begin_layout Plain Layout -It may be possible to make this race-free in some implementations by having - the signal handler alter the struct flock to make it invalid. - This will cause the fcntl() lock call to fail with EINVAL if the signal - occurs before the kernel is entered, otherwise EAGAIN. -\end_layout - -\end_inset - - -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Incomplete. -\end_layout - -\begin_layout Subsection -The API Uses Gratuitous Typedefs, Capitals -\end_layout - -\begin_layout Standard -typedefs are useful for providing source compatibility when types can differ - across implementations, or arguably in the case of function pointer definitions - which are hard for humans to parse. - Otherwise it is simply obfuscation and pollutes the namespace. -\end_layout - -\begin_layout Standard -Capitalization is usually reserved for compile-time constants and macros. -\end_layout - -\begin_layout Description -TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the - definition isn't visible to the API user anyway. -\end_layout - -\begin_layout Description -TDB_DATA There is no reason to use this over struct TDB_DATA; the struct - needs to be understood by the API user. -\end_layout - -\begin_layout Description -struct -\begin_inset space ~ -\end_inset - -TDB_DATA This would normally be called 'struct tdb_data'. 
-\end_layout - -\begin_layout Description -enum -\begin_inset space ~ -\end_inset - -TDB_ERROR Similarly, this would normally be enum tdb_error. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -None. - Introducing lower case variants would please pedants like myself, but if - it were done the existing ones should be kept. - There is little point forcing a purely cosmetic change upon tdb users. -\end_layout - -\begin_layout Subsection -\begin_inset CommandInset label -LatexCommand label -name "tdb_log_func-Doesnt-Take" - -\end_inset - -tdb_log_func Doesn't Take The Private Pointer -\end_layout - -\begin_layout Standard -For API compatibility reasons, the logging function needs to call tdb_get_loggin -g_private() to retrieve the pointer registered by the tdb_open_ex for logging. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -It should simply take an extra argument, since we are prepared to break - the API/ABI. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -Various Callback Functions Are Not Typesafe -\end_layout - -\begin_layout Standard -The callback functions in tdb_set_logging_function (after -\begin_inset CommandInset ref -LatexCommand ref -reference "tdb_log_func-Doesnt-Take" - -\end_inset - - is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check - all take void * and must internally convert it to the argument type they - were expecting. -\end_layout - -\begin_layout Standard -If this type changes, the compiler will not produce warnings on the callers, - since it only sees void *. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -With careful use of macros, we can create callback functions which give - a warning when used on gcc and the types of the callback and its private - argument differ. - Unsupported compilers will not give a warning, which is no worse than now. - In addition, the callbacks become clearer, as they need not use void * - for their parameter. -\end_layout - -\begin_layout Standard -See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Incomplete. -\end_layout - -\begin_layout Subsection -TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic -\end_layout - -\begin_layout Standard -The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should - be cleared if the caller discovers it is the only process with the TDB - open. - However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not - be detected, so will have the TDB erased underneath them (usually resulting - in a crash). -\end_layout - -\begin_layout Standard -There is a similar issue on fork(); if the parent exits (or otherwise closes - the tdb) before the child calls tdb_reopen_all() to establish the lock - used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener - at that moment will believe it alone has opened the TDB and will erase - it. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -Remove TDB_CLEAR_IF_FIRST. - Other workarounds are possible, but see -\begin_inset CommandInset ref -LatexCommand ref -reference "TDB_CLEAR_IF_FIRST-Imposes-Performance" - -\end_inset - -. 
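-\end_layout
-
-\begin_layout Standard
-A minimal sketch of one such workaround, under the assumption that the
- server itself knows when it is initialising for the first time (the file
- name and the first_time_init flag are placeholders):
-\end_layout
-
-\begin_layout LyX-Code
-if (first_time_init)
-\end_layout
-
-\begin_layout LyX-Code
-        unlink("example.tdb");   /* start from an empty database */
-\end_layout
-
-\begin_layout LyX-Code
-tdb = tdb_open("example.tdb", TDB_DEFAULT, O_CREAT|O_RDWR, 0600, NULL);
-\end_layout
-
-\begin_layout Standard
-This keeps the decision in the application, where the information actually
- lives, instead of encoding it as a locking convention inside the library.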
-\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard - -\change_deleted 0 1298979699 -Incomplete, TDB_CLEAR_IF_FIRST still defined, but not implemented. -\change_inserted 0 1298979700 -Complete. -\change_unchanged - -\end_layout - -\begin_layout Subsection -Extending The Header Is Difficult -\end_layout - -\begin_layout Standard -We have reserved (zeroed) words in the TDB header, which can be used for - future features. - If the future features are compulsory, the version number must be updated - to prevent old code from accessing the database. - But if the future feature is optional, we have no way of telling if older - code is accessing the database or not. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -The header should contain a -\begin_inset Quotes eld -\end_inset - -format variant -\begin_inset Quotes erd -\end_inset - - value (64-bit). - This is divided into two 32-bit parts: -\end_layout - -\begin_layout Enumerate -The lower part reflects the format variant understood by code accessing - the database. -\end_layout - -\begin_layout Enumerate -The upper part reflects the format variant you must understand to write - to the database (otherwise you can only open for reading). -\end_layout - -\begin_layout Standard -The latter field can only be written at creation time, the former should - be written under the OPEN_LOCK when opening the database for writing, if - the variant of the code is lower than the current lowest variant. -\end_layout - -\begin_layout Standard -This should allow backwards-compatible features to be added, and detection - if older code (which doesn't understand the feature) writes to the database. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Incomplete. -\end_layout - -\begin_layout Subsection -Record Headers Are Not Expandible -\end_layout - -\begin_layout Standard -If we later want to add (say) checksums on keys and data, it would require - another format change, which we'd like to avoid. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -We often have extra padding at the tail of a record. - If we ensure that the first byte (if any) of this padding is zero, we will - have a way for future changes to detect code which doesn't understand a - new format: the new code would write (say) a 1 at the tail, and thus if - there is no tail or the first byte is 0, we would know the extension is - not present on that record. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Incomplete. -\end_layout - -\begin_layout Subsection -TDB Does Not Use Talloc -\end_layout - -\begin_layout Standard -Many users of TDB (particularly Samba) use the talloc allocator, and thus - have to wrap TDB in a talloc context to use it conveniently. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -The allocation within TDB is not complicated enough to justify the use of - talloc, and I am reluctant to force another (excellent) library on TDB - users. - Nonetheless a compromise is possible. 
- An attribute (see -\begin_inset CommandInset ref -LatexCommand ref -reference "attributes" - -\end_inset - -) can be added later to tdb_open() to provide an alternate allocation mechanism, - specifically for talloc but usable by any other allocator (which would - ignore the -\begin_inset Quotes eld -\end_inset - -context -\begin_inset Quotes erd -\end_inset - - argument). -\end_layout - -\begin_layout Standard -This would form a talloc heirarchy as expected, but the caller would still - have to attach a destructor to the tdb context returned from tdb_open to - close it. - All TDB_DATA fields would be children of the tdb_context, and the caller - would still have to manage them (using talloc_free() or talloc_steal()). -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Deferred. -\end_layout - -\begin_layout Section -Performance And Scalability Issues -\end_layout - -\begin_layout Subsection -\begin_inset CommandInset label -LatexCommand label -name "TDB_CLEAR_IF_FIRST-Imposes-Performance" - -\end_inset - -TDB_CLEAR_IF_FIRST Imposes Performance Penalty -\end_layout - -\begin_layout Standard -When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset - 4 (aka. - the ACTIVE_LOCK). - While these locks never conflict in normal tdb usage, they do add substantial - overhead for most fcntl lock implementations when the kernel scans to detect - if a lock conflict exists. - This is often a single linked list, making the time to acquire and release - a fcntl lock O(N) where N is the number of processes with the TDB open, - not the number actually doing work. -\end_layout - -\begin_layout Standard -In a Samba server it is common to have huge numbers of clients sitting idle, - and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag. -\begin_inset Foot -status collapsed - -\begin_layout Plain Layout -There is a flag to tdb_reopen_all() which is used for this optimization: - if the parent process will outlive the child, the child does not need the - ACTIVE_LOCK. - This is a workaround for this very performance issue. -\end_layout - -\end_inset - - -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -Remove the flag. - It was a neat idea, but even trivial servers tend to know when they are - initializing for the first time and can simply unlink the old tdb at that - point. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard - -\change_deleted 0 1298979837 -Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing. -\change_inserted 0 1298979837 -Complete. -\change_unchanged - -\end_layout - -\begin_layout Subsection -TDB Files Have a 4G Limit -\end_layout - -\begin_layout Standard -This seems to be becoming an issue (so much for -\begin_inset Quotes eld -\end_inset - -trivial -\begin_inset Quotes erd -\end_inset - -!), particularly for ldb. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -A new, incompatible TDB format which uses 64 bit offsets internally rather - than 32 bit as now. - For simplicity of endian conversion (which TDB does on the fly if required), - all values will be 64 bit on disk. - In practice, some upper bits may be used for other purposes, but at least - 56 bits will be available for file offsets. -\end_layout - -\begin_layout Standard -tdb_open() will automatically detect the old version, and even create them - if TDB_VERSION6 is specified to tdb_open. 
-\end_layout - -\begin_layout Standard -32 bit processes will still be able to access TDBs larger than 4G (assuming - that their off_t allows them to seek to 64 bits), they will gracefully - fall back as they fail to mmap. - This can happen already with large TDBs. -\end_layout - -\begin_layout Standard -Old versions of tdb will fail to open the new TDB files (since 28 August - 2009, commit 398d0c29290: prior to that any unrecognized file format would - be erased and initialized as a fresh tdb!) -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -TDB Records Have a 4G Limit -\end_layout - -\begin_layout Standard -This has not been a reported problem, and the API uses size_t which can - be 64 bit on 64 bit platforms. - However, other limits may have made such an issue moot. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -Record sizes will be 64 bit, with an error returned on 32 bit platforms - which try to access such records (the current implementation would return - TDB_ERR_OOM in a similar case). - It seems unlikely that 32 bit keys will be a limitation, so the implementation - may not support this (see -\begin_inset CommandInset ref -LatexCommand ref -reference "sub:Records-Incur-A" - -\end_inset - -). -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -Hash Size Is Determined At TDB Creation Time -\end_layout - -\begin_layout Standard -TDB contains a number of hash chains in the header; the number is specified - at creation time, and defaults to 131. - This is such a bottleneck on large databases (as each hash chain gets quite - long), that LDB uses 10,000 for this hash. - In general it is impossible to know what the 'right' answer is at database - creation time. -\end_layout - -\begin_layout Subsubsection -\begin_inset CommandInset label -LatexCommand label -name "sub:Hash-Size-Solution" - -\end_inset - -Proposed Solution -\end_layout - -\begin_layout Standard -After comprehensive performance testing on various scalable hash variants -\begin_inset Foot -status collapsed - -\begin_layout Plain Layout -http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying - because I was previously convinced that an expanding tree of hashes would - be very close to optimal. -\end_layout - -\end_inset - -, it became clear that it is hard to beat a straight linear hash table which - doubles in size when it reaches saturation. - Unfortunately, altering the hash table introduces serious locking complications -: the entire hash table needs to be locked to enlarge the hash table, and - others might be holding locks. - Particularly insidious are insertions done under tdb_chainlock. -\end_layout - -\begin_layout Standard -Thus an expanding layered hash will be used: an array of hash groups, with - each hash group exploding into pointers to lower hash groups once it fills, - turning into a hash tree. - This has implications for locking: we must lock the entire group in case - we need to expand it, yet we don't know how deep the tree is at that point. -\end_layout - -\begin_layout Standard -Note that bits from the hash table entries should be stolen to hold more - hash bits to reduce the penalty of collisions. - We can use the otherwise-unused lower 3 bits. 
- If we limit the size of the database to 64 exabytes, we can use the top - 8 bits of the hash entry as well. - These 11 bits would reduce false positives down to 1 in 2000 which is more - than we need: we can use one of the bits to indicate that the extra hash - bits are valid. - This means we can choose not to re-hash all entries when we expand a hash - group; simply use the next bits we need and mark them invalid. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -\begin_inset CommandInset label -LatexCommand label -name "TDB-Freelist-Is" - -\end_inset - -TDB Freelist Is Highly Contended -\end_layout - -\begin_layout Standard -TDB uses a single linked list for the free list. - Allocation occurs as follows, using heuristics which have evolved over - time: -\end_layout - -\begin_layout Enumerate -Get the free list lock for this whole operation. -\end_layout - -\begin_layout Enumerate -Multiply length by 1.25, so we always over-allocate by 25%. -\end_layout - -\begin_layout Enumerate -Set the slack multiplier to 1. -\end_layout - -\begin_layout Enumerate -Examine the current freelist entry: if it is > length but < the current - best case, remember it as the best case. -\end_layout - -\begin_layout Enumerate -Multiply the slack multiplier by 1.05. -\end_layout - -\begin_layout Enumerate -If our best fit so far is less than length * slack multiplier, return it. - The slack will be turned into a new free record if it's large enough. -\end_layout - -\begin_layout Enumerate -Otherwise, go onto the next freelist entry. -\end_layout - -\begin_layout Standard -Deleting a record occurs as follows: -\end_layout - -\begin_layout Enumerate -Lock the hash chain for this whole operation. -\end_layout - -\begin_layout Enumerate -Walk the chain to find the record, keeping the prev pointer offset. -\end_layout - -\begin_layout Enumerate -If max_dead is non-zero: -\end_layout - -\begin_deeper -\begin_layout Enumerate -Walk the hash chain again and count the dead records. -\end_layout - -\begin_layout Enumerate -If it's more than max_dead, bulk free all the dead ones (similar to steps - 4 and below, but the lock is only obtained once). -\end_layout - -\begin_layout Enumerate -Simply mark this record as dead and return. - -\end_layout - -\end_deeper -\begin_layout Enumerate -Get the free list lock for the remainder of this operation. -\end_layout - -\begin_layout Enumerate -\begin_inset CommandInset label -LatexCommand label -name "right-merging" - -\end_inset - -Examine the following block to see if it is free; if so, enlarge the current - block and remove that block from the free list. - This was disabled, as removal from the free list was O(entries-in-free-list). -\end_layout - -\begin_layout Enumerate -Examine the preceeding block to see if it is free: for this reason, each - block has a 32-bit tailer which indicates its length. - If it is free, expand it to cover our new block and return. -\end_layout - -\begin_layout Enumerate -Otherwise, prepend ourselves to the free list. -\end_layout - -\begin_layout Standard -Disabling right-merging (step -\begin_inset CommandInset ref -LatexCommand ref -reference "right-merging" - -\end_inset - -) causes fragmentation; the other heuristics proved insufficient to address - this, so the final answer to this was that when we expand the TDB file - inside a transaction commit, we repack the entire tdb. 
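-\end_layout
-
-\begin_layout Standard
-To make the allocation heuristic above concrete, here is a self-contained
- sketch of the best-fit search with a growing slack multiplier, applied
- to an in-memory array of free entries; all names are invented, and the
- real code walks the on-disk list while holding the free list lock.
-\end_layout
-
-\begin_layout LyX-Code
-#include <stdint.h>
-\end_layout
-
-\begin_layout LyX-Code
-#include <stddef.h>
-\end_layout
-
-\begin_layout LyX-Code
-struct free_ent { uint64_t off, len; };
-\end_layout
-
-\begin_layout LyX-Code
-static uint64_t pick_entry(const struct free_ent *fl, size_t n, uint64_t want)
-\end_layout
-
-\begin_layout LyX-Code
-{
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t best_off = 0, best_len = UINT64_MAX;
-\end_layout
-
-\begin_layout LyX-Code
-        double slack = 1.0;
-\end_layout
-
-\begin_layout LyX-Code
-        want += want / 4;                  /* over-allocate by 25% */
-\end_layout
-
-\begin_layout LyX-Code
-        for (size_t i = 0; i < n; i++) {
-\end_layout
-
-\begin_layout LyX-Code
-                if (fl[i].len > want && fl[i].len < best_len) {
-\end_layout
-
-\begin_layout LyX-Code
-                        best_off = fl[i].off;  /* remember the best case */
-\end_layout
-
-\begin_layout LyX-Code
-                        best_len = fl[i].len;
-\end_layout
-
-\begin_layout LyX-Code
-                }
-\end_layout
-
-\begin_layout LyX-Code
-                slack *= 1.05;
-\end_layout
-
-\begin_layout LyX-Code
-                if (best_off && best_len < want * slack)
-\end_layout
-
-\begin_layout LyX-Code
-                        break;             /* best fit is close enough */
-\end_layout
-
-\begin_layout LyX-Code
-        }
-\end_layout
-
-\begin_layout LyX-Code
-        return best_off;                   /* 0 means expand the file */
-\end_layout
-
-\begin_layout LyX-Code
-}
-\end_layout
-
-\begin_layout Standard
-The sketch ignores locking and on-disk layout entirely; it only shows how
- the slack multiplier trades search time against fit quality.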
-\end_layout - -\begin_layout Standard -The single list lock limits our allocation rate; due to the other issues - this is not currently seen as a bottleneck. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -The first step is to remove all the current heuristics, as they obviously - interact, then examine them once the lock contention is addressed. -\end_layout - -\begin_layout Standard -The free list must be split to reduce contention. - Assuming perfect free merging, we can at most have 1 free list entry for - each entry. - This implies that the number of free lists is related to the size of the - hash table, but as it is rare to walk a large number of free list entries - we can use far fewer, say 1/32 of the number of hash buckets. -\end_layout - -\begin_layout Standard -It seems tempting to try to reuse the hash implementation which we use for - records here, but we have two ways of searching for free entries: for allocatio -n we search by size (and possibly zone) which produces too many clashes - for our hash table to handle well, and for coalescing we search by address. - Thus an array of doubly-linked free lists seems preferable. -\end_layout - -\begin_layout Standard -There are various benefits in using per-size free lists (see -\begin_inset CommandInset ref -LatexCommand ref -reference "sub:TDB-Becomes-Fragmented" - -\end_inset - -) but it's not clear this would reduce contention in the common case where - all processes are allocating/freeing the same size. - Thus we almost certainly need to divide in other ways: the most obvious - is to divide the file into zones, and using a free list (or table of free - lists) for each. - This approximates address ordering. -\end_layout - -\begin_layout Standard -Unfortunately it is difficult to know what heuristics should be used to - determine zone sizes, and our transaction code relies on being able to - create a -\begin_inset Quotes eld -\end_inset - -recovery area -\begin_inset Quotes erd -\end_inset - - by simply appending to the file (difficult if it would need to create a - new zone header). - Thus we use a linked-list of free tables; currently we only ever create - one, but if there is more than one we choose one at random to use. - In future we may use heuristics to add new free tables on contention. - We only expand the file when all free tables are exhausted. -\end_layout - -\begin_layout Standard -The basic algorithm is as follows. - Freeing is simple: -\end_layout - -\begin_layout Enumerate -Identify the correct free list. -\end_layout - -\begin_layout Enumerate -Lock the corresponding list. -\end_layout - -\begin_layout Enumerate -Re-check the list (we didn't have a lock, sizes could have changed): relock - if necessary. -\end_layout - -\begin_layout Enumerate -Place the freed entry in the list. -\end_layout - -\begin_layout Standard -Allocation is a little more complicated, as we perform delayed coalescing - at this point: -\end_layout - -\begin_layout Enumerate -Pick a free table; usually the previous one. -\end_layout - -\begin_layout Enumerate -Lock the corresponding list. -\end_layout - -\begin_layout Enumerate -If the top entry is -large enough, remove it from the list and return it. -\end_layout - -\begin_layout Enumerate -Otherwise, coalesce entries in the list.If there was no entry large enough, - unlock the list and try the next largest list -\end_layout - -\begin_layout Enumerate -If no list has an entry which meets our needs, try the next free table. 
-\end_layout
-
-\begin_layout Enumerate
-If no zone satisfies, expand the file.
-\end_layout
-
-\begin_layout Standard
-This optimizes rapid insert/delete of free list entries by not coalescing
- them all the time.
- First-fit address ordering seems to be fairly good for keeping
- fragmentation low (see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:TDB-Becomes-Fragmented"
-
-\end_inset
-
-).
- Note that address ordering does not need a tailer to coalesce, though if
- we needed one we could have one cheaply: see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "sub:Records-Incur-A"
-
-\end_inset
-
-.
-
-\end_layout
-
-\begin_layout Standard
-Each free entry has the free table number in the header: less than 255.
- It also contains a doubly-linked list for easy deletion.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:TDB-Becomes-Fragmented"
-
-\end_inset
-
-TDB Becomes Fragmented
-\end_layout
-
-\begin_layout Standard
-Much of this is a result of allocation strategy
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
-xas.edu/pub/garbage/malloc/ismm98.ps
-\end_layout
-
-\end_inset
-
- and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
-on) is deliberately set at 25%, and external fragmentation is only cured
- by the decision to repack the entire db when a transaction commit needs
- to enlarge the file.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-The 25% overhead on allocation works in practice for ldb because indexes
- tend to expand by one record at a time.
- This internal fragmentation can be resolved by having an
-\begin_inset Quotes eld
-\end_inset
-
-expanded
-\begin_inset Quotes erd
-\end_inset
-
- bit in the header to note entries that have previously expanded, and allocating
- more space for them.
-\end_layout
-
-\begin_layout Standard
-There is a spectrum of possible solutions for external fragmentation:
- one is to use a fragmentation-avoiding allocation strategy such as a best-fit
- address-order allocator.
- The other end of the spectrum would be to use a bump allocator (very fast
- and simple) and simply repack the file when we reach the end.
-\end_layout
-
-\begin_layout Standard
-There are three problems with efficient fragmentation-avoiding allocators:
- they are non-trivial, they tend to use a single free list for each size,
- and there's no evidence that tdb allocation patterns will match those recorded
- for general allocators (though it seems likely).
-\end_layout
-
-\begin_layout Standard
-Thus we don't spend too much effort on external fragmentation; we will be
- no worse than the current code if we need to repack on occasion.
- More effort is spent on reducing freelist contention, and reducing overhead.
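-\end_layout
-
-\begin_layout Standard
-To make the allocation algorithm enumerated in the previous section concrete,
- the intended search order can be sketched as below.
- The helper names are invented for illustration; the real code must also
- take and drop the per-list locks and re-check sizes, as the freeing steps
- above note.
-\end_layout
-
-\begin_layout LyX-Code
-unsigned int ftable = tdb->last_ftable, b;  /* usually the previous table */
-\end_layout
-
-\begin_layout LyX-Code
-tdb_off_t off;
-\end_layout
-
-\begin_layout LyX-Code
-do {
-\end_layout
-
-\begin_layout LyX-Code
-        for (b = size_to_bucket(len); b < NUM_BUCKETS; b++) {
-\end_layout
-
-\begin_layout LyX-Code
-                off = pop_if_large_enough(tdb, ftable, b, len);
-\end_layout
-
-\begin_layout LyX-Code
-                if (!off) {           /* delayed coalescing happens here */
-\end_layout
-
-\begin_layout LyX-Code
-                        coalesce_list(tdb, ftable, b);
-\end_layout
-
-\begin_layout LyX-Code
-                        off = pop_if_large_enough(tdb, ftable, b, len);
-\end_layout
-
-\begin_layout LyX-Code
-                }
-\end_layout
-
-\begin_layout LyX-Code
-                if (off)
-\end_layout
-
-\begin_layout LyX-Code
-                        return off;
-\end_layout
-
-\begin_layout LyX-Code
-        }
-\end_layout
-
-\begin_layout LyX-Code
-        ftable = next_free_table(tdb, ftable);
-\end_layout
-
-\begin_layout LyX-Code
-} while (ftable != tdb->last_ftable);
-\end_layout
-
-\begin_layout LyX-Code
-return expand_file_and_alloc(tdb, len);     /* every free table exhausted */
-\end_layout
-
-\begin_layout Standard
-Only the search order is shown here; which entries get coalesced, and how
- the chosen entry's slack is returned to a free list, are separate decisions.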
-\end_layout - -\begin_layout Subsection -\begin_inset CommandInset label -LatexCommand label -name "sub:Records-Incur-A" - -\end_inset - -Records Incur A 28-Byte Overhead -\end_layout - -\begin_layout Standard -Each TDB record has a header as follows: -\end_layout - -\begin_layout LyX-Code -struct tdb_record { -\end_layout - -\begin_layout LyX-Code - tdb_off_t next; /* offset of the next record in the list */ -\end_layout - -\begin_layout LyX-Code - tdb_len_t rec_len; /* total byte length of record */ -\end_layout - -\begin_layout LyX-Code - tdb_len_t key_len; /* byte length of key */ -\end_layout - -\begin_layout LyX-Code - tdb_len_t data_len; /* byte length of data */ -\end_layout - -\begin_layout LyX-Code - uint32_t full_hash; /* the full 32 bit hash of the key */ -\end_layout - -\begin_layout LyX-Code - uint32_t magic; /* try to catch errors */ -\end_layout - -\begin_layout LyX-Code - /* the following union is implied: -\end_layout - -\begin_layout LyX-Code - union { -\end_layout - -\begin_layout LyX-Code - char record[rec_len]; -\end_layout - -\begin_layout LyX-Code - struct { -\end_layout - -\begin_layout LyX-Code - char key[key_len]; -\end_layout - -\begin_layout LyX-Code - char data[data_len]; -\end_layout - -\begin_layout LyX-Code - } -\end_layout - -\begin_layout LyX-Code - uint32_t totalsize; (tailer) -\end_layout - -\begin_layout LyX-Code - } -\end_layout - -\begin_layout LyX-Code - */ -\end_layout - -\begin_layout LyX-Code -}; -\end_layout - -\begin_layout Standard -Naively, this would double to a 56-byte overhead on a 64 bit implementation. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -We can use various techniques to reduce this for an allocated block: -\end_layout - -\begin_layout Enumerate -The 'next' pointer is not required, as we are using a flat hash table. -\end_layout - -\begin_layout Enumerate -'rec_len' can instead be expressed as an addition to key_len and data_len - (it accounts for wasted or overallocated length in the record). - Since the record length is always a multiple of 8, we can conveniently - fit it in 32 bits (representing up to 35 bits). -\end_layout - -\begin_layout Enumerate -'key_len' and 'data_len' can be reduced. - I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine - the two into one 64-bit field and using a 5 bit value which indicates at - what bit to divide the two. - Keys are unlikely to scale as fast as data, so I'm assuming a maximum key - size of 32 bits. -\end_layout - -\begin_layout Enumerate -'full_hash' is used to avoid a memcmp on the -\begin_inset Quotes eld -\end_inset - -miss -\begin_inset Quotes erd -\end_inset - - case, but this is diminishing returns after a handful of bits (at 10 bits, - it reduces 99.9% of false memcmp). - As an aside, as the lower bits are already incorporated in the hash table - resolution, the upper bits should be used here. - Note that it's not clear that these bits will be a win, given the extra - bits in the hash table itself (see -\begin_inset CommandInset ref -LatexCommand ref -reference "sub:Hash-Size-Solution" - -\end_inset - -). -\end_layout - -\begin_layout Enumerate -'magic' does not need to be enlarged: it currently reflects one of 5 values - (used, free, dead, recovery, and unused_recovery). - It is useful for quick sanity checking however, and should not be eliminated. 
-\end_layout
-
-\begin_layout Enumerate
-'tailer' is only used to coalesce free blocks (so a block to the right can
- find the header to check if this block is free).
- This can be replaced by a single 'free' bit in the header of the following
- block (and the tailer only exists in free blocks).
-\begin_inset Foot
-status collapsed
-
-\begin_layout Plain Layout
-This technique from Thomas Standish.
- Data Structure Techniques.
- Addison-Wesley, Reading, Massachusetts, 1980.
-\end_layout
-
-\end_inset
-
- The current proposed coalescing algorithm doesn't need this, however.
-\end_layout
-
-\begin_layout Standard
-This produces a 16-byte used header like this:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_used_record {
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t used_magic : 16,
-\end_layout
-
-\begin_layout LyX-Code
-
-\end_layout
-
-\begin_layout LyX-Code
-                 key_data_divide: 5,
-\end_layout
-
-\begin_layout LyX-Code
-                 top_hash: 11;
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t extra_octets;
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t key_and_data_len;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-And a free record like this:
-\end_layout
-
-\begin_layout LyX-Code
-struct tdb_free_record {
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t free_magic: 8,
-\end_layout
-
-\begin_layout LyX-Code
-                 prev : 56;
-\end_layout
-
-\begin_layout LyX-Code
-
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t free_table: 8,
-\end_layout
-
-\begin_layout LyX-Code
-                 total_length : 56;
-\end_layout
-
-\begin_layout LyX-Code
-        uint64_t next;
-\end_layout
-
-\begin_layout LyX-Code
-};
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1291206079
-
-\change_unchanged
-Note that by limiting valid offsets to 56 bits, we can pack everything we
- need into 3 64-bit words, meaning our minimum record size is 8 bytes.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Complete.
-\end_layout
-
-\begin_layout Subsection
-Transaction Commit Requires 4 fdatasync
-\end_layout
-
-\begin_layout Standard
-The current transaction algorithm is:
-\end_layout
-
-\begin_layout Enumerate
-write_recovery_data();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Enumerate
-write_recovery_header();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Enumerate
-overwrite_with_new_data();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Enumerate
-remove_recovery_header();
-\end_layout
-
-\begin_layout Enumerate
-sync();
-\end_layout
-
-\begin_layout Standard
-On current ext3, each sync flushes all data to disk, so the next 3 syncs
- are relatively expensive.
- But this could become a performance bottleneck on other filesystems such
- as ext4.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-Neil Brown points out that this is overzealous, and only one sync is needed:
-\end_layout
-
-\begin_layout Enumerate
-Bundle the recovery data, a transaction counter and a strong checksum of
- the new data.
-\end_layout
-
-\begin_layout Enumerate
-Strong checksum that whole bundle.
-\end_layout
-
-\begin_layout Enumerate
-Store the bundle in the database.
-\end_layout
-
-\begin_layout Enumerate
-Overwrite the oldest of the two recovery pointers in the header (identified
- using the transaction counter) with the offset of this bundle.
-\end_layout
-
-\begin_layout Enumerate
-sync.
-\end_layout
-
-\begin_layout Enumerate
-Write the new data to the file.
-\end_layout
-
-\begin_layout Standard
-Checking for recovery means identifying the latest bundle with a valid checksum
- and using the new data checksum to ensure that it has been applied.
- This is more expensive than the current check, but need only be done at
- open.
- For running databases, a separate header field can be used to indicate
- a transaction in progress; we need only check for recovery if this is set.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "sub:TDB-Does-Not"
-
-\end_inset
-
-TDB Does Not Have Snapshot Support
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
- At some point you say
-\begin_inset Quotes eld
-\end_inset
-
-use a real database
-\begin_inset Quotes erd
-\end_inset
-
- (but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "replay-attribute"
-
-\end_inset
-
-).
-\end_layout
-
-\begin_layout Standard
-But as a thought experiment, if we implemented transactions to only overwrite
- free entries (this is tricky: there must not be a header in each entry
- which indicates whether it is free; that information must instead come
- from its presence in metadata elsewhere), and a pointer to the hash table,
- we could create an entirely new commit without destroying existing data.
- Then it would be easy to implement snapshots in a similar way.
-\end_layout
-
-\begin_layout Standard
-This would not allow arbitrary changes to the database, such as tdb_repack
- does, and would require more space (since we have to preserve the current
- and future entries at once).
- If we used hash trees rather than one big hash table, we might only have
- to rewrite some sections of the hash, too.
-\end_layout
-
-\begin_layout Standard
-We could then implement snapshots using a similar method, using multiple
- different hash tables/free tables.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Subsection
-Transactions Cannot Operate in Parallel
-\end_layout
-
-\begin_layout Standard
-This would be useless for ldb, as it hits the index records with just about
- every update.
- It would add significant complexity in resolving clashes, and cause all
- transaction callers to write their code to loop in the case where the
- transactions spuriously failed.
-\end_layout
-
-\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None (but see
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "replay-attribute"
-
-\end_inset
-
-).
- We could solve a small part of the problem by providing read-only transactions.
- These would allow one write transaction to begin, but it could not commit
- until all r/o transactions are done.
- This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
- commit.
-\end_layout
-
-\begin_layout Subsubsection
-Status
-\end_layout
-
-\begin_layout Standard
-Deferred.
-\end_layout
-
-\begin_layout Subsection
-Default Hash Function Is Suboptimal
-\end_layout
-
-\begin_layout Standard
-The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
- if we expand it to 64 bits), and works best when the hash bucket size is
- a prime number (which also means a slow modulus).
- In addition, it is highly predictable, which could potentially lead to
- a Denial of Service attack in some TDB uses.
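-\end_layout
-
-\begin_layout Standard
-For illustration, a multiplicative hash of this general shape (not the
- exact tdb function, just a representative example of the style) looks
- like:
-\end_layout
-
-\begin_layout LyX-Code
-/* Representative Knuth-style multiplicative hash: illustration only. */
-\end_layout
-
-\begin_layout LyX-Code
-static uint32_t mult_hash(const unsigned char *key, size_t len)
-\end_layout
-
-\begin_layout LyX-Code
-{
-\end_layout
-
-\begin_layout LyX-Code
-        uint32_t h = 0;
-\end_layout
-
-\begin_layout LyX-Code
-        while (len--)
-\end_layout
-
-\begin_layout LyX-Code
-                h = h * 2654435761u + *key++;  /* ~2^32 divided by phi */
-\end_layout
-
-\begin_layout LyX-Code
-        return h;
-\end_layout
-
-\begin_layout LyX-Code
-}
-\end_layout
-
-\begin_layout Standard
-Because there is no per-database seed, anyone who knows the function can
- construct keys which collide, which is the predictability problem noted
- above.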
-\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -The Jenkins lookup3 hash -\begin_inset Foot -status open - -\begin_layout Plain Layout -http://burtleburtle.net/bob/c/lookup3.c -\end_layout - -\end_inset - - is a fast and superbly-mixing hash. - It's used by the Linux kernel and almost everything else. - This has the particular properties that it takes an initial seed, and produces - two 32 bit hash numbers, which we can combine into a 64-bit hash. -\end_layout - -\begin_layout Standard -The seed should be created at tdb-creation time from some random source, - and placed in the header. - This is far from foolproof, but adds a little bit of protection against - hash bombing. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -\begin_inset CommandInset label -LatexCommand label -name "Reliable-Traversal-Adds" - -\end_inset - -Reliable Traversal Adds Complexity -\end_layout - -\begin_layout Standard -We lock a record during traversal iteration, and try to grab that lock in - the delete code. - If that grab on delete fails, we simply mark it deleted and continue onwards; - traversal checks for this condition and does the delete when it moves off - the record. -\end_layout - -\begin_layout Standard -If traversal terminates, the dead record may be left indefinitely. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -Remove reliability guarantees; see -\begin_inset CommandInset ref -LatexCommand ref -reference "traverse-Proposed-Solution" - -\end_inset - -. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Complete. -\end_layout - -\begin_layout Subsection -Fcntl Locking Adds Overhead -\end_layout - -\begin_layout Standard -Placing a fcntl lock means a system call, as does removing one. - This is actually one reason why transactions can be faster (everything - is locked once at transaction start). - In the uncontended case, this overhead can theoretically be eliminated. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -None. -\end_layout - -\begin_layout Standard -We tried this before with spinlock support, in the early days of TDB, and - it didn't make much difference except in manufactured benchmarks. -\end_layout - -\begin_layout Standard -We could use spinlocks (with futex kernel support under Linux), but it means - that we lose automatic cleanup when a process dies with a lock. - There is a method of auto-cleanup under Linux, but it's not supported by - other operating systems. - We could reintroduce a clear-if-first-style lock and sweep for dead futexes - on open, but that wouldn't help the normal case of one concurrent opener - dying. - Increasingly elaborate repair schemes could be considered, but they require - an ABI change (everyone must use them) anyway, so there's no need to do - this at the same time as everything else. -\end_layout - -\begin_layout Subsection -Some Transactions Don't Require Durability -\end_layout - -\begin_layout Standard -Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast) - usage, and occasionally empties the results into a transactional TDB. - This kind of usage prioritizes performance over durability: as long as - we are consistent, data can be lost. 
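-\end_layout
-
-\begin_layout Standard
-From the application's side the pattern looks roughly like the sketch below,
- written against the existing tdb1 API; the two helper functions are
- invented for illustration and error handling is kept minimal.
-\end_layout
-
-\begin_layout LyX-Code
-/* A non-zero return from this callback stops the traverse. */
-\end_layout
-
-\begin_layout LyX-Code
-static int copy_one(struct tdb_context *src, TDB_DATA key, TDB_DATA val,
-\end_layout
-
-\begin_layout LyX-Code
-                    void *durable)
-\end_layout
-
-\begin_layout LyX-Code
-{
-\end_layout
-
-\begin_layout LyX-Code
-        return tdb_store(durable, key, val, TDB_REPLACE);
-\end_layout
-
-\begin_layout LyX-Code
-}
-\end_layout
-
-\begin_layout LyX-Code
-
-\end_layout
-
-\begin_layout LyX-Code
-/* Occasionally flush the fast CLEAR_IF_FIRST tdb into the durable one. */
-\end_layout
-
-\begin_layout LyX-Code
-static void flush_cache(struct tdb_context *fast, struct tdb_context *durable)
-\end_layout
-
-\begin_layout LyX-Code
-{
-\end_layout
-
-\begin_layout LyX-Code
-        if (tdb_transaction_start(durable) != 0)
-\end_layout
-
-\begin_layout LyX-Code
-                return;
-\end_layout
-
-\begin_layout LyX-Code
-        if (tdb_traverse(fast, copy_one, durable) == -1)
-\end_layout
-
-\begin_layout LyX-Code
-                tdb_transaction_cancel(durable);
-\end_layout
-
-\begin_layout LyX-Code
-        else
-\end_layout
-
-\begin_layout LyX-Code
-                tdb_transaction_commit(durable);
-\end_layout
-
-\begin_layout LyX-Code
-}
-\end_layout
-
-\begin_layout Standard
-Everything stored since the last flush is lost on a crash; as long as the
- durable tdb stays consistent, that is precisely the trade-off this usage
- wants.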
-\end_layout - -\begin_layout Standard -This would be more neatly implemented inside tdb: a -\begin_inset Quotes eld -\end_inset - -soft -\begin_inset Quotes erd -\end_inset - - transaction commit (ie. - syncless) which meant that data may be reverted on a crash. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -None. -\end_layout - -\begin_layout Standard -Unfortunately any transaction scheme which overwrites old data requires - a sync before that overwrite to avoid the possibility of corruption. -\end_layout - -\begin_layout Standard -It seems possible to use a scheme similar to that described in -\begin_inset CommandInset ref -LatexCommand ref -reference "sub:TDB-Does-Not" - -\end_inset - -,where transactions are committed without overwriting existing data, and - an array of top-level pointers were available in the header. - If the transaction is -\begin_inset Quotes eld -\end_inset - -soft -\begin_inset Quotes erd -\end_inset - - then we would not need a sync at all: existing processes would pick up - the new hash table and free list and work with that. -\end_layout - -\begin_layout Standard -At some later point, a sync would allow recovery of the old data into the - free lists (perhaps when the array of top-level pointers filled). - On crash, tdb_open() would examine the array of top levels, and apply the - transactions until it encountered an invalid checksum. -\end_layout - -\begin_layout Subsection -Tracing Is Fragile, Replay Is External -\end_layout - -\begin_layout Standard -The current TDB has compile-time-enabled tracing code, but it often breaks - as it is not enabled by default. - In a similar way, the ctdb code has an external wrapper which does replay - tracing so it can coordinate cluster-wide transactions. -\end_layout - -\begin_layout Subsubsection -Proposed Solution -\begin_inset CommandInset label -LatexCommand label -name "replay-attribute" - -\end_inset - - -\end_layout - -\begin_layout Standard -Tridge points out that an attribute can be later added to tdb_open (see - -\begin_inset CommandInset ref -LatexCommand ref -reference "attributes" - -\end_inset - -) to provide replay/trace hooks, which could become the basis for this and - future parallel transactions and snapshot support. -\end_layout - -\begin_layout Subsubsection -Status -\end_layout - -\begin_layout Standard -Deferred. -\end_layout - -\end_body -\end_document -@ - - -1.12 -log -@Add status, some fixes, linked freelists. 
-@ -text -@d53 1 -a53 7 - -\change_deleted 0 1291204535 -14-September -\change_inserted 0 1291204533 -1-December -\change_unchanged --2010 -a580 2 -\change_inserted 0 1291204563 - -a583 2 - -\change_inserted 0 1291204572 -a587 2 - -\change_inserted 0 1291204573 -a588 2 -\change_unchanged - -a629 2 -\change_inserted 0 1291204588 - -a632 2 - -\change_inserted 0 1291204588 -a636 2 - -\change_inserted 0 1291204631 -a639 2 -\change_unchanged - -a693 2 -\change_inserted 0 1291204639 - -a696 2 - -\change_inserted 0 1291204640 -d702 1 -a702 1 -\change_inserted 0 1291204665 -d704 2 -a728 2 -\change_inserted 0 1291204671 - -a731 2 - -\change_inserted 0 1291204671 -a735 2 - -\change_inserted 0 1291204673 -a736 2 -\change_unchanged - -a780 2 -\change_inserted 0 1291204731 - -a783 2 - -\change_inserted 0 1291204732 -a787 2 - -\change_inserted 0 1291204779 -a790 2 -\change_unchanged - -a842 2 -\change_inserted 0 1291204830 - -a845 2 - -\change_inserted 0 1291204831 -a849 2 - -\change_inserted 0 1291204834 -a850 2 -\change_unchanged - -d879 9 -a887 2 - deal of churn; we are better to guarantee that the tdb_errcode is per-thread - so the current programming model can be maintained. -d891 9 -d903 2 -a922 2 -\change_inserted 0 1291204847 - -a925 2 - -\change_inserted 0 1291204847 -d930 5 -a934 3 - -\change_inserted 0 1291204852 -Incomplete. -a1051 2 -\change_inserted 0 1291204881 - -a1054 2 - -\change_inserted 0 1291204881 -a1058 2 - -\change_inserted 0 1291204885 -a1059 2 -\change_unchanged - -a1140 2 -\change_inserted 0 1291204898 - -a1143 2 - -\change_inserted 0 1291204898 -a1147 2 - -\change_inserted 0 1291204901 -a1148 2 -\change_unchanged - -a1224 2 -\change_inserted 0 1291204908 - -a1227 2 - -\change_inserted 0 1291204908 -a1231 2 - -\change_inserted 0 1291204908 -a1232 2 -\change_unchanged - -a1271 2 -\change_inserted 0 1291204917 - -a1274 2 - -\change_inserted 0 1291204917 -a1278 2 - -\change_inserted 0 1291204920 -a1279 2 -\change_unchanged - -a1316 2 -\change_inserted 0 1291204927 - -a1319 2 - -\change_inserted 0 1291204928 -d1325 1 -a1325 1 -\change_inserted 0 1291204942 -d1327 2 -a1381 2 -\change_inserted 0 1291205003 - -a1384 2 - -\change_inserted 0 1291205004 -a1388 2 - -\change_inserted 0 1291205007 -a1411 2 -\change_inserted 0 1291205019 - -a1414 2 - -\change_inserted 0 1291205019 -a1418 2 - -\change_inserted 0 1291205023 -a1419 2 -\change_unchanged - -a1465 2 -\change_inserted 0 1291205029 - -a1468 2 - -\change_inserted 0 1291205029 -a1472 2 - -\change_inserted 0 1291206020 -a1473 2 -\change_unchanged - -a1528 2 -\change_inserted 0 1291205043 - -a1531 2 - -\change_inserted 0 1291205043 -d1537 1 -a1537 1 -\change_inserted 0 1291205057 -d1539 2 -a1589 2 -\change_inserted 0 1291205062 - -a1592 2 - -\change_inserted 0 1291205062 -a1596 2 - -\change_inserted 0 1291205062 -a1597 2 -\change_unchanged - -a1626 2 -\change_inserted 0 1291205072 - -a1629 2 - -\change_inserted 0 1291205073 -a1633 2 - -\change_inserted 0 1291205073 -a1634 2 -\change_unchanged - -a1674 4 - -\change_deleted 0 1291204504 - -\change_unchanged -a1699 2 -\change_inserted 0 1291205079 - -a1702 2 - -\change_inserted 0 1291205080 -a1706 2 - -\change_inserted 0 1291205080 -a1707 2 -\change_unchanged - -a1833 2 -\change_inserted 0 1291205090 - -d1869 2 -a1870 7 - is to divide the file into zones, and using a free list (or -\change_inserted 0 1291205498 -table -\change_deleted 0 1291205497 -set -\change_unchanged - of free lists) for each. 
-a1871 2 -\change_inserted 0 1291205203 - -a1874 2 - -\change_inserted 0 1291205358 -a1890 21 -\change_unchanged - -\end_layout - -\begin_layout Standard - -\change_deleted 0 1291205198 -Note that this means we need to split the free lists when we expand the - file; this is probably acceptable when we double the hash table size, since - that is such an expensive operation already. - In the case of increasing the file size, there is an optimization we can - use: if we use M in the formula above as the file size rounded up to the - next power of 2, we only need reshuffle free lists when the file size crosses - a power of 2 boundary, -\emph on -and -\emph default -reshuffling the free lists is trivial: we simply merge every consecutive - pair of free lists. -\change_unchanged - -d1899 1 -a1899 7 -Identify the correct -\change_inserted 0 1291205366 -free list -\change_deleted 0 1291205364 -zone -\change_unchanged -. -d1907 2 -a1908 7 -Re-check the -\change_inserted 0 1291205372 -list -\change_deleted 0 1291205371 -zone -\change_unchanged - (we didn't have a lock, sizes could have changed): relock if necessary. -d1912 1 -a1912 5 -Place the freed entry in the list -\change_deleted 0 1291205382 - for that zone -\change_unchanged -. -d1921 1 -a1921 15 -Pick a -\change_deleted 0 1291205403 -zone either the zone we last freed into, or based on a -\begin_inset Quotes eld -\end_inset - -random -\begin_inset Quotes erd -\end_inset - - number. -\change_inserted 0 1291205411 -free table; usually the previous one. -\change_unchanged - -a1925 10 -\change_deleted 0 1291205432 - -\end_layout - -\begin_layout Enumerate - -\change_deleted 0 1291205428 -Re-check the zone: relock if necessary. -\change_unchanged - -d1934 1 -a1934 7 - unlock the list and try the next -\change_inserted 0 1291205455 -largest list -\change_deleted 0 1291205452 -zone. -\change_inserted 0 1291205457 - -a1937 2 - -\change_inserted 0 1291205476 -a1938 2 -\change_unchanged - -a1966 2 -\change_inserted 0 1291205542 - -a1969 2 - -\change_inserted 0 1291205591 -a1971 70 -\change_unchanged - -\end_layout - -\begin_layout Standard - -\change_deleted 0 1291205539 -I anticipate that the number of entries in each free zone would be small, - but it might be worth using one free entry to hold pointers to the others - for cache efficiency. -\change_unchanged - -\end_layout - -\begin_layout Standard - -\change_deleted 0 1291205534 -\begin_inset CommandInset label -LatexCommand label -name "freelist-in-zone" - -\end_inset - -If we want to avoid locking complexity (enlarging the free lists when we - enlarge the file) we could place the array of free lists at the beginning - of each zone. - This means existing array lists never move, but means that a record cannot - be larger than a zone. - That in turn implies that zones should be variable sized (say, power of - 2), which makes the question -\begin_inset Quotes eld -\end_inset - -what zone is this record in? -\begin_inset Quotes erd -\end_inset - - much harder (and -\begin_inset Quotes eld -\end_inset - -pick a random zone -\begin_inset Quotes erd -\end_inset - -, but that's less common). - It could be done with as few as 4 bits from the record header. -\begin_inset Foot -status collapsed - -\begin_layout Plain Layout -Using -\begin_inset Formula $2^{16+N*3}$ -\end_inset - -means 0 gives a minimal 65536-byte zone, 15 gives the maximal -\begin_inset Formula $2^{61}$ -\end_inset - - byte zone. - Zones range in factor of 8 steps. 
- Given the zone size for the zone the current record is in, we can determine - the start of the zone. -\end_layout - -\end_inset - - -\change_inserted 0 1291205139 - -d2218 1 -a2218 5 - uint32_t -\change_inserted 0 1291205758 -used_ -\change_unchanged -magic : 16, -a2222 4 -\change_deleted 0 1291205693 - prev_is_free: 1, -\change_unchanged - -d2230 1 -a2230 7 - top_hash: 1 -\change_inserted 0 1291205704 -1 -\change_deleted 0 1291205704 -0 -\change_unchanged -; -d2254 1 -a2254 9 - uint -\change_inserted 0 1291205725 -64 -\change_deleted 0 1291205723 -32 -\change_unchanged -_t -\change_inserted 0 1291205753 -free_magic: 8, -a2257 2 - -\change_inserted 0 1291205746 -a2262 24 -\change_deleted 0 1291205749 -free_magic; -\change_unchanged - -\end_layout - -\begin_layout LyX-Code - uint64_t -\change_inserted 0 1291205786 -free_table: 8, -\end_layout - -\begin_layout LyX-Code - -\change_inserted 0 1291205788 - -\change_unchanged -total_length -\change_inserted 0 1291205792 - : 56 -\change_deleted 0 1291205790 -; -\change_unchanged - -d2266 1 -a2266 7 - uint64_t -\change_deleted 0 1291205801 -prev, -\change_unchanged -next; -\change_deleted 0 1291205811 - -d2270 1 -a2270 3 - -\change_deleted 0 1291205811 - ... -d2274 1 -a2274 5 - -\change_deleted 0 1291205808 - uint64_t tailer -\change_unchanged -; -d2283 5 -a2287 16 -\change_deleted 0 1291205827 -We might want to take some bits from the used record's top_hash (and the - free record which has 32 bits of padding to spare anyway) if we use variable - sized zones. - See -\begin_inset CommandInset ref -LatexCommand ref -reference "freelist-in-zone" - -\end_inset - -. - -\change_inserted 0 1291205885 - Note that by limiting valid offsets to 56 bits, we can pack everything - we need into 3 64-byte words, meaning our minimum record size is 8 bytes. -a2290 2 - -\change_inserted 0 1291205886 -a2294 2 - -\change_inserted 0 1291205886 -a2295 2 -\change_unchanged - -a2385 2 -\change_inserted 0 1291205894 - -a2388 2 - -\change_inserted 0 1291205894 -a2392 2 - -\change_inserted 0 1291205902 -a2393 2 -\change_unchanged - -a2415 4 - -\change_deleted 0 1291204504 - -\change_unchanged -a2445 2 -\change_inserted 0 1291205910 - -a2448 2 - -\change_inserted 0 1291205910 -a2452 2 - -\change_inserted 0 1291205914 -a2453 2 -\change_unchanged - -a2485 2 -\change_inserted 0 1291205919 - -a2488 2 - -\change_inserted 0 1291205919 -a2492 2 - -\change_inserted 0 1291205922 -a2493 2 -\change_unchanged - -a2533 2 -\change_inserted 0 1291205929 - -a2536 2 - -\change_inserted 0 1291205929 -a2540 2 - -\change_inserted 0 1291205929 -a2541 2 -\change_unchanged - -a2578 2 -\change_inserted 0 1291205932 - -a2581 2 - -\change_inserted 0 1291205933 -a2585 2 - -\change_inserted 0 1291205933 -a2586 2 -\change_unchanged - -a2724 2 -\change_inserted 0 1291205944 - -a2727 2 - -\change_inserted 0 1291205945 -a2731 2 - -\change_inserted 0 1291205948 -a2732 2 -\change_unchanged - -@ - - -1.11 -log -@Merge changes -@ -text -@d53 7 -a59 1 -14-September-2010 -d587 16 -d644 18 -d716 16 -d753 16 -d813 18 -d883 16 -d953 16 -d1084 16 -d1181 16 -d1273 16 -d1328 16 -d1381 16 -d1447 19 -a1465 2 - if older code (which doesn't understand the feature) writes to the database.Reco -rd Headers Are Not Expandible -d1484 16 -d1546 16 -d1617 16 -d1680 16 -d1725 16 -d1810 16 -d1951 8 -a1958 3 -Proposed SolutionThe first step is to remove all the current heuristics, - as they obviously interact, then examine them once the lock contention - is addressed. 
-d1989 7 -a1995 2 - is to divide the file into zones, and using a free list (or set of free - lists) for each. -d1997 2 -d2002 25 -d2039 2 -d2049 7 -a2055 1 -Identify the correct zone. -d2063 7 -a2069 2 -Re-check the zone (we didn't have a lock, sizes could have changed): relock - if necessary. -d2073 5 -a2077 1 -Place the freed entry in the list for that zone. -d2086 3 -a2088 1 -Pick a zone either the zone we last freed into, or based on a -d2097 4 -d2105 2 -d2110 2 -d2113 2 -d2123 15 -a2137 1 - unlock the list and try the next zone. -d2166 11 -d2180 2 -d2185 2 -d2190 2 -d2223 1 -a2223 1 -status open -d2243 2 -d2491 5 -a2495 1 - uint32_t magic : 16, -d2499 2 -d2502 2 -d2511 7 -a2517 1 - top_hash: 10; -d2541 29 -a2569 1 - uint32_t free_magic; -d2573 11 -a2583 1 - uint64_t total_length; -d2587 7 -a2593 1 - uint64_t prev, next; -d2597 2 -d2603 5 -a2607 1 - uint64_t tailer; -d2615 2 -d2628 18 -d2736 16 -d2808 16 -d2856 16 -d2912 16 -d2965 16 -d3119 16 -@ - - -1.10 -log -@Tracing attribute, talloc support. -@ -text -@d1 1 -a1 1 -#LyX 1.6.5 created this file. For more info see http://www.lyx.org/ -d53 1 -a53 7 - -\change_deleted 0 1283307542 -26-July -\change_inserted 0 1284423485 -14-September -\change_unchanged --2010 -a472 2 -\change_inserted 0 1284422789 - -a479 2 -\change_unchanged - -a838 2 - -\change_inserted 0 1284016998 -a846 2 -\change_unchanged - -a1194 2 -\change_inserted 0 1284015637 - -a1197 2 - -\change_inserted 0 1284015716 -a1201 2 - -\change_inserted 0 1284015906 -a1210 2 - -\change_inserted 0 1284015637 -a1214 2 - -\change_inserted 0 1284016114 -a1227 2 - -\change_inserted 0 1284016149 -a1232 2 - -\change_inserted 0 1284016639 -a1237 2 - -\change_inserted 0 1284016821 -a1243 2 - -\change_inserted 0 1284016803 -d1245 2 -a1246 9 - if older code (which doesn't understand the feature) writes to the database. -\change_deleted 0 1284016101 - -\end_layout - -\begin_layout Subsection - -\change_inserted 0 1284015634 -Record Headers Are Not Expandible -a1249 2 - -\change_inserted 0 1284015634 -a1254 2 - -\change_inserted 0 1284015634 -a1258 2 - -\change_inserted 0 1284422552 -a1267 2 - -\change_inserted 0 1284422568 -a1271 2 - -\change_inserted 0 1284422646 -a1276 2 - -\change_inserted 0 1284422656 -a1280 2 - -\change_inserted 0 1284423065 -a1305 2 - -\change_inserted 0 1284423042 -a1310 2 -\change_unchanged - -a1457 2 - -\change_inserted 0 1283336713 -a1463 2 - -\change_unchanged -d1482 2 -d1485 1 -a1485 51 -\change_deleted 0 1283307675 -There are three details which become important: -\end_layout - -\begin_layout Enumerate - -\change_deleted 0 1283307675 -On encountering a full bucket, we use the next bucket. -\end_layout - -\begin_layout Enumerate - -\change_deleted 0 1283307675 -Extra hash bits are stored with the offset, to reduce comparisons. -\end_layout - -\begin_layout Enumerate - -\change_deleted 0 1283307675 -A marker entry is used on deleting an entry. -\end_layout - -\begin_layout Standard - -\change_deleted 0 1283307675 -The doubling of the table must be done under a transaction; we will not - reduce it on deletion, so it will be an unusual case. - It will either be placed at the head (other entries will be moved out the - way so we can expand). - We could have a pointer in the header to the current hashtable location, - but that pointer would have to be read frequently to check for hashtable - moves. 
-\end_layout - -\begin_layout Standard - -\change_deleted 0 1283307675 -The locking for this is slightly more complex than the chained case; we - currently have one lock per bucket, and that means we would need to expand - the lock if we overflow to the next bucket. - The frequency of such collisions will effect our locking heuristics: we - can always lock more buckets than we need. -\end_layout - -\begin_layout Standard - -\change_deleted 0 1283307675 -One possible optimization is to only re-check the hash size on an insert - or a lookup miss. - -\change_inserted 0 1283307770 -a1492 2 - -\change_inserted 0 1283336187 -a1500 2 - -\change_inserted 0 1283336586 -a1510 2 -\change_unchanged - -d1636 3 -a1638 8 -Proposed Solution -\change_deleted 0 1283336858 - -\end_layout - -\begin_layout Standard -The first step is to remove all the current heuristics, as they obviously - interact, then examine them once the lock contention is addressed. -a1647 2 -\change_inserted 0 1283336910 - -a1650 2 - -\change_inserted 0 1283337052 -a1655 2 -\change_unchanged - -a1776 2 -\change_inserted 0 1283309850 - -a1779 2 - -\change_inserted 0 1283337216 -a1813 2 - -\change_inserted 0 1284424151 -a1825 2 -\change_unchanged - -a1830 2 -\change_unchanged - -a2031 2 - -\change_inserted 0 1283336739 -a2040 2 -\change_unchanged - -a2117 2 -\change_inserted 0 1283337133 - -a2120 2 - -\change_inserted 0 1283337139 -a2121 2 -\change_unchanged - -a2136 2 - -\change_inserted 0 1283337235 -a2147 2 -\change_unchanged - -d2251 1 -a2251 7 -Proposed Solution -\change_deleted 0 1284423472 - -\end_layout - -\begin_layout Standard -None. -d2261 1 -a2261 1 -\change_inserted 0 1284423891 -d2263 1 -a2263 4 -\change_deleted 0 1284423891 -. - -\change_inserted 0 1284423901 -a2271 2 -\change_unchanged - -a2293 2 -\change_inserted 0 1284423495 - -a2312 2 - -\change_inserted 0 1284424201 -d2321 1 -a2321 3 - -\change_unchanged -We could solve a small part of the problem by providing read-only transactions. -a2505 2 -\change_inserted 0 1284423555 - -a2508 2 - -\change_inserted 0 1284423617 -a2512 2 - -\change_inserted 0 1284423719 -a2519 2 - -\change_inserted 0 1284423864 -a2530 2 - -\change_inserted 0 1284423850 -a2540 2 -\change_unchanged - -@ - - -1.9 -log -@Extension mechanism. -@ -text -@d56 2 -a57 2 -\change_inserted 0 1284016854 -9-September -d479 11 -d1303 1 -a1303 1 -\change_inserted 0 1284016847 -d1310 56 -d1945 1 -a1945 1 -\change_inserted 0 1283310945 -d1956 2 -d2402 2 -d2416 4 -d2421 12 -d2455 2 -d2476 12 -d2673 47 -@ - - -1.8 -log -@Remove bogus footnote -@ -text -@d56 2 -a57 2 -\change_inserted 0 1283307544 -1-September -d838 12 -d1198 103 -@ - - -1.7 -log -@Moving hash table does not work. -@ -text -@a1436 12 -\begin_inset Foot -status collapsed - -\begin_layout Plain Layout - -\change_inserted 0 1283336450 -If we make the hash offsets zone-relative, then this only restricts the - zone size, not the overall database size. 
-\end_layout - -\end_inset - -@ - - -1.6 -log -@Commit changes -@ -text -@d38 1 -a38 1 -\author "" -d53 7 -a59 1 -26-July-2010 -d1333 10 -d1361 3 -a1363 1 - There are three details which become important: -d1367 2 -d1373 2 -d1379 2 -d1385 2 -d1397 2 -d1407 2 -d1411 45 -d1582 2 -d1598 14 -d1733 62 -d1996 13 -d2086 10 -d2110 15 -a2124 1 -\begin_layout LyX-Code -@ - - -1.5 -log -@Soft transaction commit -@ -text -@d38 1 -a38 1 -\author "Rusty Russell,,," -a52 4 - -\change_deleted 0 1280141199 -10-May-2010 -\change_inserted 0 1280141202 -a53 2 -\change_unchanged - -a2028 2 - -\change_inserted 0 1280140902 -a2034 2 - -\change_unchanged -a2212 2 -\change_inserted 0 1280140661 - -a2215 2 - -\change_inserted 0 1280140703 -a2219 2 - -\change_inserted 0 1280708312 -a2226 2 - -\change_inserted 0 1280708400 -a2239 2 - -\change_inserted 0 1280140836 -a2243 2 - -\change_inserted 0 1280708255 -a2247 2 - -\change_inserted 0 1280708374 -a2252 2 - -\change_inserted 0 1280141181 -a2274 2 - -\change_inserted 0 1280141345 -@ - - -1.4 -log -@Merge changes -@ -text -@d38 1 -a38 1 -\author "" -d53 2 -d56 4 -d2035 10 -d2223 84 -@ - - -1.3 -log -@Transaction and freelist rethink. -@ -text -@d38 1 -a38 1 -\author "Rusty Russell,,," -d53 1 -a53 1 -27-April-2010 -d662 1 -a662 5 - behavior of disallowing -\change_inserted 0 1272940179 -nested -\change_unchanged -transactions should become the default. -a1210 2 -\change_inserted 0 1272944650 - -a1214 2 - -\change_inserted 0 1272944763 -a1218 2 -\change_unchanged - -a1223 2 -\change_unchanged - -a1301 2 - -\change_inserted 0 1273478114 -a1310 2 -\change_unchanged - -d1515 1 -a1515 11 -The free list -\change_deleted 0 1273469807 -should -\change_inserted 0 1273469810 -must -\change_unchanged - be split -\change_deleted 0 1273469815 -into multiple lists -\change_unchanged -to reduce contention. -a1520 2 -\change_inserted 0 1273470006 - -a1523 2 - -\change_inserted 0 1273492055 -a1539 2 - -\change_inserted 0 1273483888 -a1551 2 -\change_unchanged - -a1554 8 - -\change_deleted 0 1272942055 -There are various ways to organize these lisys, but because we want to be - able to quickly identify which free list an entry is in, and reduce the - number of locks required for merging, we will use zoning (eg. - each free list covers some fixed fraction of the file). - -\change_inserted 0 1273484187 -d1556 1 -a1556 7 - -\change_deleted 0 1273484194 -The algorithm for f -\change_inserted 0 1273484194 -F -\change_unchanged -reeing is simple: -d1560 1 -a1560 7 -Identify the correct -\change_deleted 0 1273482856 -free list -\change_inserted 0 1273482857 -zone -\change_unchanged -. -d1564 1 -a1564 7 -Lock the -\change_inserted 0 1273482895 -corresponding -\change_unchanged -list -\change_inserted 0 1273482863 -. -a1567 2 - -\change_inserted 0 1273482909 -d1573 1 -a1573 13 - -\change_deleted 0 1273482885 -, and p -\change_inserted 0 1273482888 -P -\change_unchanged -lace the freed entry -\change_deleted 0 1273492415 -at the head -\change_inserted 0 1273492415 -in the list for that zone -\change_unchanged -. 
-d1577 2 -a1578 7 -Allocation is a little more complicated, as we -\change_deleted 0 1273483240 -merge entries as we walk the list: -\change_inserted 0 1273484250 -perform delayed coalescing at this point: -\change_unchanged - -d1582 1 -a1582 19 -Pick a -\change_deleted 0 1273482955 -free list; -\change_inserted 0 1273482957 -zone -\change_unchanged - either the -\change_deleted 0 1273482962 -list -\change_inserted 0 1273482962 -zone -\change_unchanged - we last freed -\change_deleted 0 1273482966 -o -\change_inserted 0 1273482966 -i -\change_unchanged -nto, or based on a -d1594 1 -a1594 9 -Lock th -\change_inserted 0 1273482980 -e corresponding -\change_deleted 0 1273482973 -at -\change_unchanged - list. -\change_inserted 0 1273482982 - -a1597 2 - -\change_inserted 0 1273483084 -a1598 53 -\change_unchanged - -\end_layout - -\begin_layout Enumerate -If the top entry is -\change_deleted 0 1273492155 -well-sized, -\change_inserted 0 1273492159 --large enough, -\change_unchanged -remove it from the list and return it. -\end_layout - -\begin_layout Enumerate -Otherwise, -\change_inserted 0 1273492206 -coalesce entries in the list. -\change_deleted 0 1273492200 -examine the entry to the right of it in the file. - If it is free: -\end_layout - -\begin_deeper -\begin_layout Enumerate - -\change_deleted 0 1273492200 -If that entry is in a different list, lock that list too. -\end_layout - -\begin_layout Enumerate - -\change_deleted 0 1273492200 -If we had to place a new lock, re-check that the entry is free. -\end_layout - -\begin_layout Enumerate - -\change_deleted 0 1273492200 -Remove that entry from its free list and expand this entry to cover it. -\end_layout - -\begin_layout Enumerate - -\change_deleted 0 1273485554 -Goto step 3. -\end_layout - -\end_deeper -\begin_layout Enumerate - -\change_inserted 0 1273485311 -If there was no entry large enough, unlock the list and try the next zone. -d1602 1 -a1602 5 - -\change_deleted 0 1273483646 -Repeat step 3 with each entry in the list. -\change_unchanged - -d1606 2 -a1607 5 - -\change_deleted 0 1273483668 -Unlock the list and repeat step 2 with the next list. -\change_unchanged - -d1611 1 -a1611 7 -If no -\change_deleted 0 1273483671 -list -\change_inserted 0 1273483671 -zone -\change_unchanged - satisfies, expand the file. -d1615 2 -a1616 9 -This optimizes rapid insert/delete of free list entries -\change_inserted 0 1273485794 - by not coalescing them all the time. -\change_deleted 0 1273483685 -, and allows us to get rid of the tailer altogether -\change_unchanged -. - -\change_inserted 0 1273492299 -a1638 39 - -\change_deleted 0 1273476840 -The question of -\begin_inset Quotes eld -\end_inset - -well-sized -\begin_inset Quotes erd -\end_inset - - free entries is more difficult: the 25% overhead works in practice for - ldb because indexes tend to expand by one record at a time. - This can be resolved by having an -\begin_inset Quotes eld -\end_inset - -expanded -\begin_inset Quotes erd -\end_inset - - bit in the header to note entries that have previously expanded, and allocating - more space for them. - Whether the -\begin_inset Quotes eld -\end_inset - -increasing slack -\begin_inset Quotes erd -\end_inset - - algorithm should be implemented or first-fit used is still unknown: we - will determine this once these other ideas are implemented. 
-\change_inserted 0 1273483750 - -\end_layout - -\begin_layout Standard - -\change_inserted 0 1273492450 -a1644 2 - -\change_inserted 0 1273470441 -a1654 2 - -\change_inserted 0 1273476556 -a1659 2 - -\change_inserted 0 1273470423 -a1661 2 -\change_unchanged - -a1672 2 - -\change_inserted 0 1273476847 -a1676 2 - -\change_inserted 0 1273476886 -a1691 2 - -\change_inserted 0 1273477233 -a1699 2 - -\change_inserted 0 1273477534 -a1706 2 - -\change_inserted 0 1273482700 -a1712 2 - -\change_inserted 0 1273478079 -a1722 2 - -\change_inserted 0 1273477839 -a1726 2 - -\change_inserted 0 1273477925 -a1730 2 - -\change_inserted 0 1273477925 -a1734 2 - -\change_inserted 0 1273477925 -a1738 2 - -\change_inserted 0 1273477925 -a1742 2 - -\change_inserted 0 1273477925 -a1746 2 - -\change_inserted 0 1273477925 -a1750 2 - -\change_inserted 0 1273477925 -a1754 2 - -\change_inserted 0 1273477925 -a1758 2 - -\change_inserted 0 1273477925 -a1762 2 - -\change_inserted 0 1273477925 -a1766 2 - -\change_inserted 0 1273477925 -a1770 2 - -\change_inserted 0 1273477925 -a1774 2 - -\change_inserted 0 1273477925 -a1778 2 - -\change_inserted 0 1273477925 -a1782 2 - -\change_inserted 0 1273477925 -a1786 2 - -\change_inserted 0 1273477925 -a1790 2 - -\change_inserted 0 1273477925 -a1794 2 - -\change_inserted 0 1273477925 -a1798 2 - -\change_inserted 0 1273492522 -a1802 2 - -\change_inserted 0 1273492530 -a1806 2 - -\change_inserted 0 1273492546 -a1810 2 - -\change_inserted 0 1273478239 -a1814 2 - -\change_inserted 0 1273479960 -a1821 2 - -\change_inserted 0 1273480265 -a1830 2 - -\change_inserted 0 1273480354 -a1845 2 - -\change_inserted 0 1273478968 -a1851 2 - -\change_inserted 0 1273492604 -a1859 2 - -\change_inserted 0 1273479572 -a1862 2 -\change_unchanged - -a1870 2 - -\change_inserted 0 1273480282 -a1874 2 - -\change_inserted 0 1273478931 -a1878 2 - -\change_inserted 0 1273481549 -a1882 2 - -\change_inserted 0 1273481557 -a1886 2 - -\change_inserted 0 1273480307 -a1890 2 - -\change_inserted 0 1273480335 -a1894 2 - -\change_inserted 0 1273479897 -a1898 2 - -\change_inserted 0 1273479653 -a1902 2 - -\change_inserted 0 1273480371 -a1906 2 - -\change_inserted 0 1273480464 -a1910 2 - -\change_inserted 0 1273480399 -a1914 2 - -\change_inserted 0 1273480425 -a1918 2 - -\change_inserted 0 1273480453 -a1922 2 - -\change_inserted 0 1273480455 -a1926 2 - -\change_inserted 0 1273480450 -a1930 2 - -\change_inserted 0 1273480452 -a1935 2 -\change_inserted 0 1273478830 - -a1942 5 - -\change_deleted 0 1273481604 -In theory, we could get away with 2: one after we write the new data, and - one to somehow atomically change over to it. -\change_inserted 0 1273481632 -a1946 2 - -\change_inserted 0 1273481724 -a1950 2 - -\change_inserted 0 1273481713 -a1954 2 - -\change_inserted 0 1273481717 -a1958 2 - -\change_inserted 0 1273481730 -a1962 2 - -\change_inserted 0 1273481736 -a1966 2 - -\change_inserted 0 1273481744 -a1970 2 - -\change_inserted 0 1273481748 -a1974 2 - -\change_inserted 0 1273482185 -a1978 2 - -\change_inserted 0 1273482259 -a1989 50 - -\change_deleted 0 1273481848 -None. - Trying to rewrite the transaction code is a separate experiment, which - I encourage someone else to do. - At some point you say -\begin_inset Quotes eld -\end_inset - -use a real database -\begin_inset Quotes erd -\end_inset - -. 
-\end_layout - -\begin_layout Standard - -\change_deleted 0 1273481848 -But as a thought experiment: -\change_unchanged - -\end_layout - -\begin_layout Standard - -\change_deleted 0 1273481788 -Say there was a pointer in the header which said where the hash table and - free list tables were, and that no blocks were labeled with whether they - were free or not (it had to be derived from what list they were in). - We could create new hash table and free list in some free space, and populate - it as we want the post-committed state to look. - Then we sync, then we switch the offset in the header, then we sync again. -\end_layout - -\begin_layout Standard - -\change_deleted 0 1273481788 -This would not allow arbitrary changes to the database, such as tdb_repack - does, and would require more space (since we have to preserve the current - and future entries at once). - If we used hash trees rather than one big hash table, we might only have - to rewrite some sections of the hash, too. -\change_inserted 0 1273481854 - -\end_layout - -\begin_layout Standard - -\change_inserted 0 1273482102 -a1993 2 - -\change_inserted 0 1273482061 -a1998 2 - -\change_inserted 0 1273482063 -a2002 2 - -\change_inserted 0 1273482072 -a2006 2 - -\change_inserted 0 1273482139 -a2011 2 - -\change_inserted 0 1273482364 -a2015 2 - -\change_inserted 0 1273482163 -a2019 2 - -\change_inserted 0 1273482493 -a2037 2 - -\change_inserted 0 1273482536 -a2046 2 -\change_unchanged - -a2049 2 - -\change_inserted 0 1273482641 -a2058 2 - -\change_inserted 0 1273481827 -d2067 2 -a2068 11 -We could -\change_inserted 0 1273481829 -then -\change_unchanged -implement snapshots using a similar method -\change_deleted 0 1273481838 - to the above, only -\change_inserted 0 1273481840 -, -\change_unchanged - using multiple different hash tables/free tables. -@ - - -1.2 -log -@After first feedback (Ronnie & Volker) -@ -text -@d1314 13 -d1531 11 -a1541 1 -The free list should be split into multiple lists to reduce contention. -d1547 39 -d1596 7 -d1604 1 -a1604 1 -The algorithm for freeing is simple: -d1608 7 -a1614 1 -Identify the correct free list. -d1618 30 -a1647 1 -Lock the list, and place the freed entry at the head. -d1651 7 -a1657 2 -Allocation is a little more complicated, as we merge entries as we walk - the list: -d1661 19 -a1679 1 -Pick a free list; either the list we last freed onto, or based on a -d1691 17 -a1707 1 -Lock that list. -d1711 7 -a1717 1 -If the top entry is well-sized, remove it from the list and return it. -d1721 5 -a1725 1 -Otherwise, examine the entry to the right of it in the file. -d1731 2 -d1737 2 -d1743 2 -d1749 2 -d1756 8 -d1765 2 -d1770 2 -d1773 2 -d1778 7 -a1784 1 -If no list satisfies, expand the file. -d1788 28 -a1815 2 -This optimizes rapid insert/delete of free list entries, and allows us to - get rid of the tailer altogether. -d1819 2 -d1851 1 -a1851 1 -\change_inserted 0 1272941474 -d1857 303 -a2159 18 -\change_inserted 0 1272942759 -There are various ways to organize these lists, but because we want to be - able to quickly identify which free list an entry is in, and reduce the - number of locks required for merging, we will use zoning (eg. - each of the N free lists in a tdb file of size M covers a fixed fraction - M/N). - Note that this means we need to reshuffle the free lists when we expand - the file; this is probably acceptable when we double the hash table size, - since that is such an expensive operation already. 
- In the case of increasing the file size, there is an optimization we can - use: if we use M in the formula above as the file size rounded up to the - next power of 2, we only need reshuffle free lists when the file size crosses - a power of 2 boundary, -\emph on -and -\emph default -reshuffling the free lists is trivial: we simply merge every consecutive - pair of free lists. -d2164 107 -d2276 2 -d2280 59 -d2346 2 -d2363 2 -d2366 2 -d2371 2 -d2382 2 -d2389 57 -d2458 13 -d2474 32 -a2505 2 -We could implement snapshots using a similar method to the above, only using - multiple different hash tables/free tables. -@ - - -1.1 -log -@Initial revision -@ -text -@d1 1 -a1 1 -#LyX 1.6.4 created this file. For more info see http://www.lyx.org/ -d36 3 -a38 3 -\tracking_changes false -\output_changes false -\author "" -d662 5 -a666 1 - behavior of disallowing transactions should become the default. -d1215 21 -d1527 2 -d1533 3 -a1535 1 - The algorithm for freeing is simple: -d1642 26 -@ diff --git a/ccan/tdb2/doc/design.pdf b/ccan/tdb2/doc/design.pdf deleted file mode 100644 index 558dc1f8..00000000 Binary files a/ccan/tdb2/doc/design.pdf and /dev/null differ diff --git a/ccan/tdb2/doc/design.txt b/ccan/tdb2/doc/design.txt deleted file mode 100644 index c2994a4c..00000000 --- a/ccan/tdb2/doc/design.txt +++ /dev/null @@ -1,1259 +0,0 @@ -TDB2: A Redesigning The Trivial DataBase - -Rusty Russell, IBM Corporation - -1-December-2010 - -Abstract - -The Trivial DataBase on-disk format is 32 bits; with usage cases -heading towards the 4G limit, that must change. This required -breakage provides an opportunity to revisit TDB's other design -decisions and reassess them. - -1 Introduction - -The Trivial DataBase was originally written by Andrew Tridgell as -a simple key/data pair storage system with the same API as dbm, -but allowing multiple readers and writers while being small -enough (< 1000 lines of C) to include in SAMBA. The simple design -created in 1999 has proven surprisingly robust and performant, -used in Samba versions 3 and 4 as well as numerous other -projects. Its useful life was greatly increased by the -(backwards-compatible!) addition of transaction support in 2005. - -The wider variety and greater demands of TDB-using code has lead -to some organic growth of the API, as well as some compromises on -the implementation. None of these, by themselves, are seen as -show-stoppers, but the cumulative effect is to a loss of elegance -over the initial, simple TDB implementation. 
Here is a table of -the approximate number of lines of implementation code and number -of API functions at the end of each year: - - -+-----------+----------------+--------------------------------+ -| Year End | API Functions | Lines of C Code Implementation | -+-----------+----------------+--------------------------------+ -+-----------+----------------+--------------------------------+ -| 1999 | 13 | 1195 | -+-----------+----------------+--------------------------------+ -| 2000 | 24 | 1725 | -+-----------+----------------+--------------------------------+ -| 2001 | 32 | 2228 | -+-----------+----------------+--------------------------------+ -| 2002 | 35 | 2481 | -+-----------+----------------+--------------------------------+ -| 2003 | 35 | 2552 | -+-----------+----------------+--------------------------------+ -| 2004 | 40 | 2584 | -+-----------+----------------+--------------------------------+ -| 2005 | 38 | 2647 | -+-----------+----------------+--------------------------------+ -| 2006 | 52 | 3754 | -+-----------+----------------+--------------------------------+ -| 2007 | 66 | 4398 | -+-----------+----------------+--------------------------------+ -| 2008 | 71 | 4768 | -+-----------+----------------+--------------------------------+ -| 2009 | 73 | 5715 | -+-----------+----------------+--------------------------------+ - - -This review is an attempt to catalog and address all the known -issues with TDB and create solutions which address the problems -without significantly increasing complexity; all involved are far -too aware of the dangers of second system syndrome in rewriting a -successful project like this. - -2 API Issues - -2.1 tdb_open_ex Is Not Expandable - -The tdb_open() call was expanded to tdb_open_ex(), which added an -optional hashing function and an optional logging function -argument. Additional arguments to open would require the -introduction of a tdb_open_ex2 call etc. - -2.1.1 Proposed Solution - -tdb_open() will take a linked-list of attributes: - -enum tdb_attribute { - - TDB_ATTRIBUTE_LOG = 0, - - TDB_ATTRIBUTE_HASH = 1 - -}; - -struct tdb_attribute_base { - - enum tdb_attribute attr; - - union tdb_attribute *next; - -}; - -struct tdb_attribute_log { - - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG -*/ - - tdb_log_func log_fn; - - void *log_private; - -}; - -struct tdb_attribute_hash { - - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH -*/ - - tdb_hash_func hash_fn; - - void *hash_private; - -}; - -union tdb_attribute { - - struct tdb_attribute_base base; - - struct tdb_attribute_log log; - - struct tdb_attribute_hash hash; - -}; - -This allows future attributes to be added, even if this expands -the size of the union. - -2.1.2 Status - -Complete. - -2.2 tdb_traverse Makes Impossible Guarantees - -tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, -and it was thought that it was important to guarantee that all -records which exist at the start and end of the traversal would -be included, and no record would be included twice. - -This adds complexity (see[Reliable-Traversal-Adds]) and does not -work anyway for records which are altered (in particular, those -which are expanded may be effectively deleted and re-added behind -the traversal). - -2.2.1 Proposed Solution - -Abandon the guarantee. You will see every record if no changes -occur during your traversal, otherwise you will see some subset. -You can prevent changes by using a transaction or the locking -API. - -2.2.2 Status - -Complete. 
Delete-during-traverse will still delete every record, -too (assuming no other changes). - -2.3 Nesting of Transactions Is Fraught - -TDB has alternated between allowing nested transactions and not -allowing them. Various paths in the Samba codebase assume that -transactions will nest, and in a sense they can: the operation is -only committed to disk when the outer transaction is committed. -There are two problems, however: - -1. Canceling the inner transaction will cause the outer - transaction commit to fail, and will not undo any operations - since the inner transaction began. This problem is soluble with - some additional internal code. - -2. An inner transaction commit can be cancelled by the outer - transaction. This is desirable in the way which Samba's - database initialization code uses transactions, but could be a - surprise to any users expecting a successful transaction commit - to expose changes to others. - -The current solution is to specify the behavior at tdb_open(), -with the default currently that nested transactions are allowed. -This flag can also be changed at runtime. - -2.3.1 Proposed Solution - -Given the usage patterns, it seems that the “least-surprise” -behavior of disallowing nested transactions should become the -default. Additionally, it seems the outer transaction is the only -code which knows whether inner transactions should be allowed, so -a flag to indicate this could be added to tdb_transaction_start. -However, this behavior can be simulated with a wrapper which uses -tdb_add_flags() and tdb_remove_flags(), so the API should not be -expanded for this relatively-obscure case. - -2.3.2 Status - -Incomplete; nesting flag is still defined as per tdb1. - -2.4 Incorrect Hash Function is Not Detected - -tdb_open_ex() allows the calling code to specify a different hash -function to use, but does not check that all other processes -accessing this tdb are using the same hash function. The result -is that records are missing from tdb_fetch(). - -2.4.1 Proposed Solution - -The header should contain an example hash result (eg. the hash of -0xdeadbeef), and tdb_open_ex() should check that the given hash -function produces the same answer, or fail the tdb_open call. - -2.4.2 Status - -Complete. - -2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation - -In response to scalability issues with the free list ([TDB-Freelist-Is] -) two API workarounds have been incorporated in TDB: -tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The -latter actually calls the former with an argument of “5”. - -This code allows deleted records to accumulate without putting -them in the free list. On delete we iterate through each chain -and free them in a batch if there are more than max_dead entries. -These are never otherwise recycled except as a side-effect of a -tdb_repack. - -2.5.1 Proposed Solution - -With the scalability problems of the freelist solved, this API -can be removed. The TDB_VOLATILE flag may still be useful as a -hint that store and delete of records will be at least as common -as fetch in order to allow some internal tuning, but initially -will become a no-op. - -2.5.2 Status - -Incomplete. TDB_VOLATILE still defined, but implementation should -fail on unknown flags to be future-proof. - -2.6 TDB Files Cannot Be Opened Multiple Times - In The Same Process - -No process can open the same TDB twice; we check and disallow it. 
-This is an unfortunate side-effect of fcntl locks, which operate -on a per-file rather than per-file-descriptor basis, and do not -nest. Thus, closing any file descriptor on a file clears all the -locks obtained by this process, even if they were placed using a -different file descriptor! - -Note that even if this were solved, deadlock could occur if -operations were nested: this is a more manageable programming -error in most cases. - -2.6.1 Proposed Solution - -We could lobby POSIX to fix the perverse rules, or at least lobby -Linux to violate them so that the most common implementation does -not have this restriction. This would be a generally good idea -for other fcntl lock users. - -Samba uses a wrapper which hands out the same tdb_context to -multiple callers if this happens, and does simple reference -counting. We should do this inside the tdb library, which already -emulates lock nesting internally; it would need to recognize when -deadlock occurs within a single process. This would create a new -failure mode for tdb operations (while we currently handle -locking failures, they are impossible in normal use and a process -encountering them can do little but give up). - -I do not see benefit in an additional tdb_open flag to indicate -whether re-opening is allowed, as though there may be some -benefit to adding a call to detect when a tdb_context is shared, -to allow other to create such an API. - -2.6.2 Status - -Incomplete. - -2.7 TDB API Is Not POSIX Thread-safe - -The TDB API uses an error code which can be queried after an -operation to determine what went wrong. This programming model -does not work with threads, unless specific additional guarantees -are given by the implementation. In addition, even -otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot] -). - -2.7.1 Proposed Solution - -Reachitecting the API to include a tdb_errcode pointer would be a -great deal of churn; we are better to guarantee that the -tdb_errcode is per-thread so the current programming model can be -maintained. - -This requires dynamic per-thread allocations, which is awkward -with POSIX threads (pthread_key_create space is limited and we -cannot simply allocate a key for every TDB). - -Internal locking is required to make sure that fcntl locks do not -overlap between threads, and also that the global list of tdbs is -maintained. - -The aim is that building tdb with -DTDB_PTHREAD will result in a -pthread-safe version of the library, and otherwise no overhead -will exist. Alternatively, a hooking mechanism similar to that -proposed for [Proposed-Solution-locking-hook] could be used to -enable pthread locking at runtime. - -2.7.2 Status - -Incomplete. - -2.8 *_nonblock Functions And *_mark Functions Expose - Implementation - -CTDB[footnote: -Clustered TDB, see http://ctdb.samba.org -] wishes to operate on TDB in a non-blocking manner. This is -currently done as follows: - -1. Call the _nonblock variant of an API function (eg. - tdb_lockall_nonblock). If this fails: - -2. Fork a child process, and wait for it to call the normal - variant (eg. tdb_lockall). - -3. If the child succeeds, call the _mark variant to indicate we - already have the locks (eg. tdb_lockall_mark). - -4. Upon completion, tell the child to release the locks (eg. - tdb_unlockall). - -5. Indicate to tdb that it should consider the locks removed (eg. - tdb_unlockall_mark). - -There are several issues with this approach. 
Firstly, adding two -new variants of each function clutters the API for an obscure -use, and so not all functions have three variants. Secondly, it -assumes that all paths of the functions ask for the same locks, -otherwise the parent process will have to get a lock which the -child doesn't have under some circumstances. I don't believe this -is currently the case, but it constrains the implementation. - -2.8.1 Proposed Solution - -Implement a hook for locking methods, so that the caller can -control the calls to create and remove fcntl locks. In this -scenario, ctdbd would operate as follows: - -1. Call the normal API function, eg tdb_lockall(). - -2. When the lock callback comes in, check if the child has the - lock. Initially, this is always false. If so, return 0. - Otherwise, try to obtain it in non-blocking mode. If that - fails, return EWOULDBLOCK. - -3. Release locks in the unlock callback as normal. - -4. If tdb_lockall() fails, see if we recorded a lock failure; if - so, call the child to repeat the operation. - -5. The child records what locks it obtains, and returns that - information to the parent. - -6. When the child has succeeded, goto 1. - -This is flexible enough to handle any potential locking scenario, -even when lock requirements change. It can be optimized so that -the parent does not release locks, just tells the child which -locks it doesn't need to obtain. - -It also keeps the complexity out of the API, and in ctdbd where -it is needed. - -2.8.2 Status - -Incomplete. - -2.9 tdb_chainlock Functions Expose Implementation - -tdb_chainlock locks some number of records, including the record -indicated by the given key. This gave atomicity guarantees; -no-one can start a transaction, alter, read or delete that key -while the lock is held. - -It also makes the same guarantee for any other key in the chain, -which is an internal implementation detail and potentially a -cause for deadlock. - -2.9.1 Proposed Solution - -None. It would be nice to have an explicit single entry lock -which effected no other keys. Unfortunately, this won't work for -an entry which doesn't exist. Thus while chainlock may be -implemented more efficiently for the existing case, it will still -have overlap issues with the non-existing case. So it is best to -keep the current (lack of) guarantee about which records will be -effected to avoid constraining our implementation. - -2.10 Signal Handling is Not Race-Free - -The tdb_setalarm_sigptr() call allows the caller's signal handler -to indicate that the tdb locking code should return with a -failure, rather than trying again when a signal is received (and -errno == EAGAIN). This is usually used to implement timeouts. - -Unfortunately, this does not work in the case where the signal is -received before the tdb code enters the fcntl() call to place the -lock: the code will sleep within the fcntl() code, unaware that -the signal wants it to exit. In the case of long timeouts, this -does not happen in practice. - -2.10.1 Proposed Solution - -The locking hooks proposed in[Proposed-Solution-locking-hook] -would allow the user to decide on whether to fail the lock -acquisition on a signal. This allows the caller to choose their -own compromise: they could narrow the race by checking -immediately before the fcntl call.[footnote: -It may be possible to make this race-free in some implementations -by having the signal handler alter the struct flock to make it -invalid. 
This will cause the fcntl() lock call to fail with -EINVAL if the signal occurs before the kernel is entered, -otherwise EAGAIN. -] - -2.10.2 Status - -Incomplete. - -2.11 The API Uses Gratuitous Typedefs, Capitals - -typedefs are useful for providing source compatibility when types -can differ across implementations, or arguably in the case of -function pointer definitions which are hard for humans to parse. -Otherwise it is simply obfuscation and pollutes the namespace. - -Capitalization is usually reserved for compile-time constants and -macros. - - TDB_CONTEXT There is no reason to use this over 'struct - tdb_context'; the definition isn't visible to the API user - anyway. - - TDB_DATA There is no reason to use this over struct TDB_DATA; - the struct needs to be understood by the API user. - - struct TDB_DATA This would normally be called 'struct - tdb_data'. - - enum TDB_ERROR Similarly, this would normally be enum - tdb_error. - -2.11.1 Proposed Solution - -None. Introducing lower case variants would please pedants like -myself, but if it were done the existing ones should be kept. -There is little point forcing a purely cosmetic change upon tdb -users. - -2.12 tdb_log_func Doesn't Take The - Private Pointer - -For API compatibility reasons, the logging function needs to call -tdb_get_logging_private() to retrieve the pointer registered by -the tdb_open_ex for logging. - -2.12.1 Proposed Solution - -It should simply take an extra argument, since we are prepared to -break the API/ABI. - -2.12.2 Status - -Complete. - -2.13 Various Callback Functions Are Not Typesafe - -The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take] - is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read -and tdb_check all take void * and must internally convert it to -the argument type they were expecting. - -If this type changes, the compiler will not produce warnings on -the callers, since it only sees void *. - -2.13.1 Proposed Solution - -With careful use of macros, we can create callback functions -which give a warning when used on gcc and the types of the -callback and its private argument differ. Unsupported compilers -will not give a warning, which is no worse than now. In addition, -the callbacks become clearer, as they need not use void * for -their parameter. - -See CCAN's typesafe_cb module at -http://ccan.ozlabs.org/info/typesafe_cb.html - -2.13.2 Status - -Incomplete. - -2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, - tdb_reopen_all Problematic - -The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB -file should be cleared if the caller discovers it is the only -process with the TDB open. However, if any caller does not -specify TDB_CLEAR_IF_FIRST it will not be detected, so will have -the TDB erased underneath them (usually resulting in a crash). - -There is a similar issue on fork(); if the parent exits (or -otherwise closes the tdb) before the child calls tdb_reopen_all() -to establish the lock used to indicate the TDB is opened by -someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe -it alone has opened the TDB and will erase it. - -2.14.1 Proposed Solution - -Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but -see [TDB_CLEAR_IF_FIRST-Imposes-Performance]. - -2.14.2 Status - -Incomplete, TDB_CLEAR_IF_FIRST still defined, but not -implemented. - -2.15 Extending The Header Is Difficult - -We have reserved (zeroed) words in the TDB header, which can be -used for future features. 
If the future features are compulsory, -the version number must be updated to prevent old code from -accessing the database. But if the future feature is optional, we -have no way of telling if older code is accessing the database or -not. - -2.15.1 Proposed Solution - -The header should contain a “format variant” value (64-bit). This -is divided into two 32-bit parts: - -1. The lower part reflects the format variant understood by code - accessing the database. - -2. The upper part reflects the format variant you must understand - to write to the database (otherwise you can only open for - reading). - -The latter field can only be written at creation time, the former -should be written under the OPEN_LOCK when opening the database -for writing, if the variant of the code is lower than the current -lowest variant. - -This should allow backwards-compatible features to be added, and -detection if older code (which doesn't understand the feature) -writes to the database. - -2.15.2 Status - -Incomplete. - -2.16 Record Headers Are Not Expandible - -If we later want to add (say) checksums on keys and data, it -would require another format change, which we'd like to avoid. - -2.16.1 Proposed Solution - -We often have extra padding at the tail of a record. If we ensure -that the first byte (if any) of this padding is zero, we will -have a way for future changes to detect code which doesn't -understand a new format: the new code would write (say) a 1 at -the tail, and thus if there is no tail or the first byte is 0, we -would know the extension is not present on that record. - -2.16.2 Status - -Incomplete. - -2.17 TDB Does Not Use Talloc - -Many users of TDB (particularly Samba) use the talloc allocator, -and thus have to wrap TDB in a talloc context to use it -conveniently. - -2.17.1 Proposed Solution - -The allocation within TDB is not complicated enough to justify -the use of talloc, and I am reluctant to force another -(excellent) library on TDB users. Nonetheless a compromise is -possible. An attribute (see [attributes]) can be added later to -tdb_open() to provide an alternate allocation mechanism, -specifically for talloc but usable by any other allocator (which -would ignore the “context” argument). - -This would form a talloc heirarchy as expected, but the caller -would still have to attach a destructor to the tdb context -returned from tdb_open to close it. All TDB_DATA fields would be -children of the tdb_context, and the caller would still have to -manage them (using talloc_free() or talloc_steal()). - -2.17.2 Status - -Deferred. - -3 Performance And Scalability Issues - -3.1 TDB_CLEAR_IF_FIRST - Imposes Performance Penalty - -When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is -placed at offset 4 (aka. the ACTIVE_LOCK). While these locks -never conflict in normal tdb usage, they do add substantial -overhead for most fcntl lock implementations when the kernel -scans to detect if a lock conflict exists. This is often a single -linked list, making the time to acquire and release a fcntl lock -O(N) where N is the number of processes with the TDB open, not -the number actually doing work. - -In a Samba server it is common to have huge numbers of clients -sitting idle, and thus they have weaned themselves off the -TDB_CLEAR_IF_FIRST flag.[footnote: -There is a flag to tdb_reopen_all() which is used for this -optimization: if the parent process will outlive the child, the -child does not need the ACTIVE_LOCK. This is a workaround for -this very performance issue. 
-] - -3.1.1 Proposed Solution - -Remove the flag. It was a neat idea, but even trivial servers -tend to know when they are initializing for the first time and -can simply unlink the old tdb at that point. - -3.1.2 Status - -Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing. - -3.2 TDB Files Have a 4G Limit - -This seems to be becoming an issue (so much for “trivial”!), -particularly for ldb. - -3.2.1 Proposed Solution - -A new, incompatible TDB format which uses 64 bit offsets -internally rather than 32 bit as now. For simplicity of endian -conversion (which TDB does on the fly if required), all values -will be 64 bit on disk. In practice, some upper bits may be used -for other purposes, but at least 56 bits will be available for -file offsets. - -tdb_open() will automatically detect the old version, and even -create them if TDB_VERSION6 is specified to tdb_open. - -32 bit processes will still be able to access TDBs larger than 4G -(assuming that their off_t allows them to seek to 64 bits), they -will gracefully fall back as they fail to mmap. This can happen -already with large TDBs. - -Old versions of tdb will fail to open the new TDB files (since 28 -August 2009, commit 398d0c29290: prior to that any unrecognized -file format would be erased and initialized as a fresh tdb!) - -3.2.2 Status - -Complete. - -3.3 TDB Records Have a 4G Limit - -This has not been a reported problem, and the API uses size_t -which can be 64 bit on 64 bit platforms. However, other limits -may have made such an issue moot. - -3.3.1 Proposed Solution - -Record sizes will be 64 bit, with an error returned on 32 bit -platforms which try to access such records (the current -implementation would return TDB_ERR_OOM in a similar case). It -seems unlikely that 32 bit keys will be a limitation, so the -implementation may not support this (see [sub:Records-Incur-A]). - -3.3.2 Status - -Complete. - -3.4 Hash Size Is Determined At TDB Creation Time - -TDB contains a number of hash chains in the header; the number is -specified at creation time, and defaults to 131. This is such a -bottleneck on large databases (as each hash chain gets quite -long), that LDB uses 10,000 for this hash. In general it is -impossible to know what the 'right' answer is at database -creation time. - -3.4.1 Proposed Solution - -After comprehensive performance testing on various scalable hash -variants[footnote: -http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 -This was annoying because I was previously convinced that an -expanding tree of hashes would be very close to optimal. -], it became clear that it is hard to beat a straight linear hash -table which doubles in size when it reaches saturation. -Unfortunately, altering the hash table introduces serious locking -complications: the entire hash table needs to be locked to -enlarge the hash table, and others might be holding locks. -Particularly insidious are insertions done under tdb_chainlock. - -Thus an expanding layered hash will be used: an array of hash -groups, with each hash group exploding into pointers to lower -hash groups once it fills, turning into a hash tree. This has -implications for locking: we must lock the entire group in case -we need to expand it, yet we don't know how deep the tree is at -that point. - -Note that bits from the hash table entries should be stolen to -hold more hash bits to reduce the penalty of collisions. We can -use the otherwise-unused lower 3 bits. 
If we limit the size of -the database to 64 exabytes, we can use the top 8 bits of the -hash entry as well. These 11 bits would reduce false positives -down to 1 in 2000 which is more than we need: we can use one of -the bits to indicate that the extra hash bits are valid. This -means we can choose not to re-hash all entries when we expand a -hash group; simply use the next bits we need and mark them -invalid. - -3.4.2 Status - -Complete. - -3.5 TDB Freelist Is Highly Contended - -TDB uses a single linked list for the free list. Allocation -occurs as follows, using heuristics which have evolved over time: - -1. Get the free list lock for this whole operation. - -2. Multiply length by 1.25, so we always over-allocate by 25%. - -3. Set the slack multiplier to 1. - -4. Examine the current freelist entry: if it is > length but < - the current best case, remember it as the best case. - -5. Multiply the slack multiplier by 1.05. - -6. If our best fit so far is less than length * slack multiplier, - return it. The slack will be turned into a new free record if - it's large enough. - -7. Otherwise, go onto the next freelist entry. - -Deleting a record occurs as follows: - -1. Lock the hash chain for this whole operation. - -2. Walk the chain to find the record, keeping the prev pointer - offset. - -3. If max_dead is non-zero: - - (a) Walk the hash chain again and count the dead records. - - (b) If it's more than max_dead, bulk free all the dead ones - (similar to steps 4 and below, but the lock is only obtained - once). - - (c) Simply mark this record as dead and return. - -4. Get the free list lock for the remainder of this operation. - -5. Examine the following block to see if it is - free; if so, enlarge the current block and remove that block - from the free list. This was disabled, as removal from the free - list was O(entries-in-free-list). - -6. Examine the preceeding block to see if it is free: for this - reason, each block has a 32-bit tailer which indicates its - length. If it is free, expand it to cover our new block and - return. - -7. Otherwise, prepend ourselves to the free list. - -Disabling right-merging (step [right-merging]) causes -fragmentation; the other heuristics proved insufficient to -address this, so the final answer to this was that when we expand -the TDB file inside a transaction commit, we repack the entire -tdb. - -The single list lock limits our allocation rate; due to the other -issues this is not currently seen as a bottleneck. - -3.5.1 Proposed Solution - -The first step is to remove all the current heuristics, as they -obviously interact, then examine them once the lock contention is -addressed. - -The free list must be split to reduce contention. Assuming -perfect free merging, we can at most have 1 free list entry for -each entry. This implies that the number of free lists is related -to the size of the hash table, but as it is rare to walk a large -number of free list entries we can use far fewer, say 1/32 of the -number of hash buckets. - -It seems tempting to try to reuse the hash implementation which -we use for records here, but we have two ways of searching for -free entries: for allocation we search by size (and possibly -zone) which produces too many clashes for our hash table to -handle well, and for coalescing we search by address. Thus an -array of doubly-linked free lists seems preferable. 
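Concretely, the shape this implies is roughly the following (an
illustrative sketch only; the free-table grouping is introduced
just below, and free.c later in this diff refers to the real
structure's next and buckets fields, though the exact declaration
may differ):

/* Illustrative sketch, not the verbatim tdb2 declaration. */
struct tdb_freetable {
        tdb_off_t next;                      /* next free table, 0 if last */
        tdb_off_t buckets[TDB_FREE_BUCKETS]; /* head of a doubly-linked
                                              * free list per size class */
};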
There are various benefits in using per-size free lists (see
[sub:TDB-Becomes-Fragmented]) but it's not clear this would
reduce contention in the common case where all processes are
allocating/freeing the same size. Thus we almost certainly need
to divide in other ways: the most obvious is to divide the file
into zones, and use a free list (or table of free lists) for
each. This approximates address ordering.

Unfortunately it is difficult to know what heuristics should be
used to determine zone sizes, and our transaction code relies on
being able to create a “recovery area” by simply appending to the
file (difficult if it would need to create a new zone header).
Thus we use a linked-list of free tables; currently we only ever
create one, but if there is more than one we choose one at random
to use. In future we may use heuristics to add new free tables on
contention. We only expand the file when all free tables are
exhausted.

The basic algorithm is as follows. Freeing is simple:

1. Identify the correct free list.

2. Lock the corresponding list.

3. Re-check the list (we didn't have a lock, sizes could have
   changed): relock if necessary.

4. Place the freed entry in the list.

Allocation is a little more complicated, as we perform delayed
coalescing at this point:

1. Pick a free table; usually the previous one.

2. Lock the corresponding list.

3. If the top entry is large enough, remove it from the list and
   return it.

4. Otherwise, coalesce entries in the list. If there was no entry
   large enough, unlock the list and try the next largest list.

5. If no list has an entry which meets our needs, try the next
   free table.

6. If no zone satisfies, expand the file.

This optimizes rapid insert/delete of free list entries by not
coalescing them all the time. First-fit address ordering seems to
be fairly good for keeping fragmentation low (see
[sub:TDB-Becomes-Fragmented]). Note that address ordering does
not need a tailer to coalesce, though if we needed one we could
have one cheaply: see [sub:Records-Incur-A].

Each free entry has the free table number in the header: less
than 255. It also contains a doubly-linked list for easy
deletion.

3.6 TDB Becomes Fragmented

Much of this is a result of allocation strategy[footnote:
The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
] and deliberate hobbling of coalescing; internal fragmentation
(aka overallocation) is deliberately set at 25%, and external
fragmentation is only cured by the decision to repack the entire
db when a transaction commit needs to enlarge the file.

3.6.1 Proposed Solution

The 25% overhead on allocation works in practice for ldb because
indexes tend to expand by one record at a time. This internal
fragmentation can be resolved by having an “expanded” bit in the
header to note entries that have previously expanded, and
allocating more space for them.

There is a spectrum of possible solutions for external
fragmentation: one is to use a fragmentation-avoiding allocation
strategy such as a best-fit, address-ordered allocator. The other
end of the spectrum would be to use a bump allocator (very fast
and simple) and simply repack the file when we reach the end.
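The allocator described in the previous section sits between those
extremes: it walks a size bucket looking for the best fit, but
relaxes its acceptance threshold as it walks, so long searches are
cut short. A simplified, in-memory version of that loop follows
(it mirrors lock_and_alloc() in free.c later in this diff; the
list type and names are illustrative):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct free_entry {
        uint64_t len;
        struct free_entry *next;
};

/* Walk the list remembering the smallest adequate entry; accept it once
 * it is within a window that relaxes by 1% per entry examined. */
static struct free_entry *relaxed_best_fit(struct free_entry *head,
                                           uint64_t size, bool want_extra)
{
        /* Accept up to 50% slack straight away if the record may grow. */
        double multiplier = want_extra ? 1.5 : 1.0;
        struct free_entry *best = NULL;

        for (struct free_entry *e = head; e; e = e->next) {
                if (e->len >= size && (!best || e->len < best->len))
                        best = e;       /* best (smallest) fit so far */
                if (best && best->len <= size * multiplier)
                        break;          /* close enough: stop walking */
                multiplier *= 1.01;     /* get less fussy as we go */
        }
        return best;
}

The 1% relaxation bounds how far a search wanders from the front
of the list before an adequate entry is accepted.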
- -There are three problems with efficient fragmentation-avoiding -allocators: they are non-trivial, they tend to use a single free -list for each size, and there's no evidence that tdb allocation -patterns will match those recorded for general allocators (though -it seems likely). - -Thus we don't spend too much effort on external fragmentation; we -will be no worse than the current code if we need to repack on -occasion. More effort is spent on reducing freelist contention, -and reducing overhead. - -3.7 Records Incur A 28-Byte Overhead - -Each TDB record has a header as follows: - -struct tdb_record { - - tdb_off_t next; /* offset of the next record in the list -*/ - - tdb_len_t rec_len; /* total byte length of record */ - - tdb_len_t key_len; /* byte length of key */ - - tdb_len_t data_len; /* byte length of data */ - - uint32_t full_hash; /* the full 32 bit hash of the key */ - - uint32_t magic; /* try to catch errors */ - - /* the following union is implied: - - union { - - char record[rec_len]; - - struct { - - char key[key_len]; - - char data[data_len]; - - } - - uint32_t totalsize; (tailer) - - } - - */ - -}; - -Naively, this would double to a 56-byte overhead on a 64 bit -implementation. - -3.7.1 Proposed Solution - -We can use various techniques to reduce this for an allocated -block: - -1. The 'next' pointer is not required, as we are using a flat - hash table. - -2. 'rec_len' can instead be expressed as an addition to key_len - and data_len (it accounts for wasted or overallocated length in - the record). Since the record length is always a multiple of 8, - we can conveniently fit it in 32 bits (representing up to 35 - bits). - -3. 'key_len' and 'data_len' can be reduced. I'm unwilling to - restrict 'data_len' to 32 bits, but instead we can combine the - two into one 64-bit field and using a 5 bit value which - indicates at what bit to divide the two. Keys are unlikely to - scale as fast as data, so I'm assuming a maximum key size of 32 - bits. - -4. 'full_hash' is used to avoid a memcmp on the “miss” case, but - this is diminishing returns after a handful of bits (at 10 - bits, it reduces 99.9% of false memcmp). As an aside, as the - lower bits are already incorporated in the hash table - resolution, the upper bits should be used here. Note that it's - not clear that these bits will be a win, given the extra bits - in the hash table itself (see [sub:Hash-Size-Solution]). - -5. 'magic' does not need to be enlarged: it currently reflects - one of 5 values (used, free, dead, recovery, and - unused_recovery). It is useful for quick sanity checking - however, and should not be eliminated. - -6. 'tailer' is only used to coalesce free blocks (so a block to - the right can find the header to check if this block is free). - This can be replaced by a single 'free' bit in the header of - the following block (and the tailer only exists in free - blocks).[footnote: -This technique from Thomas Standish. Data Structure Techniques. -Addison-Wesley, Reading, Massachusetts, 1980. -] The current proposed coalescing algorithm doesn't need this, - however. 
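Technique 3 above is easiest to see as arithmetic. The sketch
below mirrors what set_header() does in free.c later in this
diff; the struct and function names here are illustrative, and
keys are assumed to fit in 32 bits as noted above:

#include <assert.h>
#include <stdint.h>
#include <ccan/ilog/ilog.h>     /* ilog64(), as used by free.c's fls64() */

struct packed_len {
        uint8_t divide;            /* the 5-bit value kept in the header */
        uint64_t key_and_data_len; /* key in the low 2*divide bits */
};

static struct packed_len pack_lengths(uint64_t keylen, uint64_t datalen)
{
        struct packed_len p;

        p.divide = (ilog64(keylen) + 1) / 2;
        p.key_and_data_len = keylen | (datalen << (p.divide * 2));

        /* Encoding can fail for huge values; the real code checks by
         * decoding and comparing, much as rec_key_length() and
         * rec_data_length() recover the two lengths. */
        assert((p.key_and_data_len & ((1ULL << (p.divide * 2)) - 1)) == keylen);
        assert((p.key_and_data_len >> (p.divide * 2)) == datalen);
        return p;
}

The divide value itself is stored alongside the magic and the top
hash bits, as the struct below shows.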
Taken together, these techniques produce a 16-byte used header
like this:

struct tdb_used_record {
        uint32_t used_magic : 16,
                 key_data_divide: 5,
                 top_hash: 11;
        uint32_t extra_octets;
        uint64_t key_and_data_len;
};

And a free record like this:

struct tdb_free_record {
        uint64_t free_magic: 8,
                 prev : 56;
        uint64_t free_table: 8,
                 total_length : 56;
        uint64_t next;
};

Note that by limiting valid offsets to 56 bits, we can pack
everything we need into three 64-bit words, meaning our minimum
record size is 8 bytes.

3.7.2 Status

Complete.

3.8 Transaction Commit Requires 4 fdatasync

The current transaction algorithm is:

1. write_recovery_data();

2. sync();

3. write_recovery_header();

4. sync();

5. overwrite_with_new_data();

6. sync();

7. remove_recovery_header();

8. sync();

On current ext3, each sync flushes all data to disk, so the next
3 syncs are relatively quick. But this could become a
performance bottleneck on other filesystems such as ext4.

3.8.1 Proposed Solution

Neil Brown points out that this is overzealous, and only one sync
is needed:

1. Bundle the recovery data, a transaction counter and a strong
   checksum of the new data.

2. Take a strong checksum of that whole bundle.

3. Store the bundle in the database.

4. Overwrite the oldest of the two recovery pointers in the
   header (identified using the transaction counter) with the
   offset of this bundle.

5. sync.

6. Write the new data to the file.

Checking for recovery means identifying the latest bundle with a
valid checksum and using the new data checksum to ensure that it
has been applied. This is more expensive than the current check,
but need only be done at open. For running databases, a separate
header field can be used to indicate a transaction in progress;
we need only check for recovery if this is set.

3.8.2 Status

Deferred.

3.9 TDB Does Not Have Snapshot Support

3.9.1 Proposed Solution

None. At some point you say “use a real database” (but see
[replay-attribute]).

But as a thought experiment, if we implemented transactions to
only overwrite free entries (this is tricky: there must not be a
header in each entry which indicates whether it is free, but use
of presence in metadata elsewhere), and a pointer to the hash
table, we could create an entirely new commit without destroying
existing data. Then it would be easy to implement snapshots in a
similar way.

This would not allow arbitrary changes to the database, such as
tdb_repack does, and would require more space (since we have to
preserve the current and future entries at once). If we used hash
trees rather than one big hash table, we might only have to
rewrite some sections of the hash, too.

We could then implement snapshots using a similar method, using
multiple different hash tables/free tables.

3.9.2 Status

Deferred.

3.10 Transactions Cannot Operate in Parallel

This would be useless for ldb, as it hits the index records with
just about every update. It would add significant complexity in
resolving clashes, and cause all transaction callers to write
their code to loop in the case where the transactions spuriously
failed.

3.10.1 Proposed Solution

None (but see [replay-attribute]). We could solve a small part of
the problem by providing read-only transactions. These would
allow one write transaction to begin, but it could not commit
until all r/o transactions are done.
This would require a new -RO_TRANSACTION_LOCK, which would be upgraded on commit. - -3.10.2 Status - -Deferred. - -3.11 Default Hash Function Is Suboptimal - -The Knuth-inspired multiplicative hash used by tdb is fairly slow -(especially if we expand it to 64 bits), and works best when the -hash bucket size is a prime number (which also means a slow -modulus). In addition, it is highly predictable which could -potentially lead to a Denial of Service attack in some TDB uses. - -3.11.1 Proposed Solution - -The Jenkins lookup3 hash[footnote: -http://burtleburtle.net/bob/c/lookup3.c -] is a fast and superbly-mixing hash. It's used by the Linux -kernel and almost everything else. This has the particular -properties that it takes an initial seed, and produces two 32 bit -hash numbers, which we can combine into a 64-bit hash. - -The seed should be created at tdb-creation time from some random -source, and placed in the header. This is far from foolproof, but -adds a little bit of protection against hash bombing. - -3.11.2 Status - -Complete. - -3.12 Reliable Traversal Adds Complexity - -We lock a record during traversal iteration, and try to grab that -lock in the delete code. If that grab on delete fails, we simply -mark it deleted and continue onwards; traversal checks for this -condition and does the delete when it moves off the record. - -If traversal terminates, the dead record may be left -indefinitely. - -3.12.1 Proposed Solution - -Remove reliability guarantees; see [traverse-Proposed-Solution]. - -3.12.2 Status - -Complete. - -3.13 Fcntl Locking Adds Overhead - -Placing a fcntl lock means a system call, as does removing one. -This is actually one reason why transactions can be faster -(everything is locked once at transaction start). In the -uncontended case, this overhead can theoretically be eliminated. - -3.13.1 Proposed Solution - -None. - -We tried this before with spinlock support, in the early days of -TDB, and it didn't make much difference except in manufactured -benchmarks. - -We could use spinlocks (with futex kernel support under Linux), -but it means that we lose automatic cleanup when a process dies -with a lock. There is a method of auto-cleanup under Linux, but -it's not supported by other operating systems. We could -reintroduce a clear-if-first-style lock and sweep for dead -futexes on open, but that wouldn't help the normal case of one -concurrent opener dying. Increasingly elaborate repair schemes -could be considered, but they require an ABI change (everyone -must use them) anyway, so there's no need to do this at the same -time as everything else. - -3.14 Some Transactions Don't Require Durability - -Volker points out that gencache uses a CLEAR_IF_FIRST tdb for -normal (fast) usage, and occasionally empties the results into a -transactional TDB. This kind of usage prioritizes performance -over durability: as long as we are consistent, data can be lost. - -This would be more neatly implemented inside tdb: a “soft” -transaction commit (ie. syncless) which meant that data may be -reverted on a crash. - -3.14.1 Proposed Solution - -None. - -Unfortunately any transaction scheme which overwrites old data -requires a sync before that overwrite to avoid the possibility of -corruption. - -It seems possible to use a scheme similar to that described in [sub:TDB-Does-Not] -,where transactions are committed without overwriting existing -data, and an array of top-level pointers were available in the -header. 
If the transaction is “soft” then we would not need a -sync at all: existing processes would pick up the new hash table -and free list and work with that. - -At some later point, a sync would allow recovery of the old data -into the free lists (perhaps when the array of top-level pointers -filled). On crash, tdb_open() would examine the array of top -levels, and apply the transactions until it encountered an -invalid checksum. - -3.15 Tracing Is Fragile, Replay Is External - -The current TDB has compile-time-enabled tracing code, but it -often breaks as it is not enabled by default. In a similar way, -the ctdb code has an external wrapper which does replay tracing -so it can coordinate cluster-wide transactions. - -3.15.1 Proposed Solution - -Tridge points out that an attribute can be later added to -tdb_open (see [attributes]) to provide replay/trace hooks, which -could become the basis for this and future parallel transactions -and snapshot support. - -3.15.2 Status - -Deferred. - diff --git a/ccan/tdb2/free.c b/ccan/tdb2/free.c deleted file mode 100644 index e693fe82..00000000 --- a/ccan/tdb2/free.c +++ /dev/null @@ -1,975 +0,0 @@ - /* - Trivial Database 2: free list/block handling - Copyright (C) Rusty Russell 2010 - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ -#include "private.h" -#include -#include -#include -#include -#include - -static unsigned fls64(uint64_t val) -{ - return ilog64(val); -} - -/* In which bucket would we find a particular record size? (ignoring header) */ -unsigned int size_to_bucket(tdb_len_t data_len) -{ - unsigned int bucket; - - /* We can't have records smaller than this. */ - assert(data_len >= TDB_MIN_DATA_LEN); - - /* Ignoring the header... */ - if (data_len - TDB_MIN_DATA_LEN <= 64) { - /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */ - bucket = (data_len - TDB_MIN_DATA_LEN) / 8; - } else { - /* After that we go power of 2. */ - bucket = fls64(data_len - TDB_MIN_DATA_LEN) + 2; - } - - if (unlikely(bucket >= TDB_FREE_BUCKETS)) - bucket = TDB_FREE_BUCKETS - 1; - return bucket; -} - -tdb_off_t first_ftable(struct tdb_context *tdb) -{ - return tdb_read_off(tdb, offsetof(struct tdb_header, free_table)); -} - -tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable) -{ - return tdb_read_off(tdb, ftable + offsetof(struct tdb_freetable,next)); -} - -enum TDB_ERROR tdb_ftable_init(struct tdb_context *tdb) -{ - /* Use reservoir sampling algorithm to select a free list at random. */ - unsigned int rnd, max = 0, count = 0; - tdb_off_t off; - - tdb->tdb2.ftable_off = off = first_ftable(tdb); - tdb->tdb2.ftable = 0; - - while (off) { - if (TDB_OFF_IS_ERR(off)) { - return TDB_OFF_TO_ERR(off); - } - - rnd = random(); - if (rnd >= max) { - tdb->tdb2.ftable_off = off; - tdb->tdb2.ftable = count; - max = rnd; - } - - off = next_ftable(tdb, off); - count++; - } - return TDB_SUCCESS; -} - -/* Offset of a given bucket. 
*/ -tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket) -{ - return ftable_off + offsetof(struct tdb_freetable, buckets) - + bucket * sizeof(tdb_off_t); -} - -/* Returns free_buckets + 1, or list number to search, or -ve error. */ -static tdb_off_t find_free_head(struct tdb_context *tdb, - tdb_off_t ftable_off, - tdb_off_t bucket) -{ - /* Speculatively search for a non-zero bucket. */ - return tdb_find_nonzero_off(tdb, bucket_off(ftable_off, 0), - bucket, TDB_FREE_BUCKETS); -} - -static void check_list(struct tdb_context *tdb, tdb_off_t b_off) -{ -#ifdef CCAN_TDB2_DEBUG - tdb_off_t off, prev = 0, first; - struct tdb_free_record r; - - first = off = (tdb_read_off(tdb, b_off) & TDB_OFF_MASK); - while (off != 0) { - tdb_read_convert(tdb, off, &r, sizeof(r)); - if (frec_magic(&r) != TDB_FREE_MAGIC) - abort(); - if (prev && frec_prev(&r) != prev) - abort(); - prev = off; - off = r.next; - } - - if (first) { - tdb_read_convert(tdb, first, &r, sizeof(r)); - if (frec_prev(&r) != prev) - abort(); - } -#endif -} - -/* Remove from free bucket. */ -static enum TDB_ERROR remove_from_list(struct tdb_context *tdb, - tdb_off_t b_off, tdb_off_t r_off, - const struct tdb_free_record *r) -{ - tdb_off_t off, prev_next, head; - enum TDB_ERROR ecode; - - /* Is this only element in list? Zero out bucket, and we're done. */ - if (frec_prev(r) == r_off) - return tdb_write_off(tdb, b_off, 0); - - /* off = &r->prev->next */ - off = frec_prev(r) + offsetof(struct tdb_free_record, next); - - /* Get prev->next */ - prev_next = tdb_read_off(tdb, off); - if (TDB_OFF_IS_ERR(prev_next)) - return TDB_OFF_TO_ERR(prev_next); - - /* If prev->next == 0, we were head: update bucket to point to next. */ - if (prev_next == 0) { - /* We must preserve upper bits. */ - head = tdb_read_off(tdb, b_off); - if (TDB_OFF_IS_ERR(head)) - return TDB_OFF_TO_ERR(head); - - if ((head & TDB_OFF_MASK) != r_off) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "remove_from_list:" - " %llu head %llu on list %llu", - (long long)r_off, - (long long)head, - (long long)b_off); - } - head = ((head & ~TDB_OFF_MASK) | r->next); - ecode = tdb_write_off(tdb, b_off, head); - if (ecode != TDB_SUCCESS) - return ecode; - } else { - /* r->prev->next = r->next */ - ecode = tdb_write_off(tdb, off, r->next); - if (ecode != TDB_SUCCESS) - return ecode; - } - - /* If we were the tail, off = &head->prev. */ - if (r->next == 0) { - head = tdb_read_off(tdb, b_off); - if (TDB_OFF_IS_ERR(head)) - return TDB_OFF_TO_ERR(head); - head &= TDB_OFF_MASK; - off = head + offsetof(struct tdb_free_record, magic_and_prev); - } else { - /* off = &r->next->prev */ - off = r->next + offsetof(struct tdb_free_record, - magic_and_prev); - } - -#ifdef CCAN_TDB2_DEBUG - /* *off == r */ - if ((tdb_read_off(tdb, off) & TDB_OFF_MASK) != r_off) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "remove_from_list:" - " %llu bad prev in list %llu", - (long long)r_off, (long long)b_off); - } -#endif - /* r->next->prev = r->prev */ - return tdb_write_off(tdb, off, r->magic_and_prev); -} - -/* Enqueue in this free bucket: sets coalesce if we've added 128 - * entries to it. 
*/ -static enum TDB_ERROR enqueue_in_free(struct tdb_context *tdb, - tdb_off_t b_off, - tdb_off_t off, - tdb_len_t len, - bool *coalesce) -{ - struct tdb_free_record new; - enum TDB_ERROR ecode; - tdb_off_t prev, head; - uint64_t magic = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL)); - - head = tdb_read_off(tdb, b_off); - if (TDB_OFF_IS_ERR(head)) - return TDB_OFF_TO_ERR(head); - - /* We only need to set ftable_and_len; rest is set in enqueue_in_free */ - new.ftable_and_len = ((uint64_t)tdb->tdb2.ftable << (64 - TDB_OFF_UPPER_STEAL)) - | len; - - /* new->next = head. */ - new.next = (head & TDB_OFF_MASK); - - /* First element? Prev points to ourselves. */ - if (!new.next) { - new.magic_and_prev = (magic | off); - } else { - /* new->prev = next->prev */ - prev = tdb_read_off(tdb, - new.next + offsetof(struct tdb_free_record, - magic_and_prev)); - new.magic_and_prev = prev; - if (frec_magic(&new) != TDB_FREE_MAGIC) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "enqueue_in_free: %llu bad head" - " prev %llu", - (long long)new.next, - (long long)prev); - } - /* next->prev = new. */ - ecode = tdb_write_off(tdb, new.next - + offsetof(struct tdb_free_record, - magic_and_prev), - off | magic); - if (ecode != TDB_SUCCESS) { - return ecode; - } - -#ifdef CCAN_TDB2_DEBUG - prev = tdb_read_off(tdb, frec_prev(&new) - + offsetof(struct tdb_free_record, next)); - if (prev != 0) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "enqueue_in_free:" - " %llu bad tail next ptr %llu", - (long long)frec_prev(&new) - + offsetof(struct tdb_free_record, - next), - (long long)prev); - } -#endif - } - - /* Update enqueue count, but don't set high bit: see TDB_OFF_IS_ERR */ - if (*coalesce) - head += (1ULL << (64 - TDB_OFF_UPPER_STEAL)); - head &= ~(TDB_OFF_MASK | (1ULL << 63)); - head |= off; - - ecode = tdb_write_off(tdb, b_off, head); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - /* It's time to coalesce if counter wrapped. */ - if (*coalesce) - *coalesce = ((head & ~TDB_OFF_MASK) == 0); - - return tdb_write_convert(tdb, off, &new, sizeof(new)); -} - -static tdb_off_t ftable_offset(struct tdb_context *tdb, unsigned int ftable) -{ - tdb_off_t off; - unsigned int i; - - if (likely(tdb->tdb2.ftable == ftable)) - return tdb->tdb2.ftable_off; - - off = first_ftable(tdb); - for (i = 0; i < ftable; i++) { - if (TDB_OFF_IS_ERR(off)) { - break; - } - off = next_ftable(tdb, off); - } - return off; -} - -/* Note: we unlock the current bucket if fail (-ve), or coalesce (+ve) and - * need to blatt the *protect record (which is set to an error). 
*/ -static tdb_len_t coalesce(struct tdb_context *tdb, - tdb_off_t off, tdb_off_t b_off, - tdb_len_t data_len, - tdb_off_t *protect) -{ - tdb_off_t end; - struct tdb_free_record rec; - enum TDB_ERROR ecode; - - tdb->stats.alloc_coalesce_tried++; - end = off + sizeof(struct tdb_used_record) + data_len; - - while (end < tdb->file->map_size) { - const struct tdb_free_record *r; - tdb_off_t nb_off; - unsigned ftable, bucket; - - r = tdb_access_read(tdb, end, sizeof(*r), true); - if (TDB_PTR_IS_ERR(r)) { - ecode = TDB_PTR_ERR(r); - goto err; - } - - if (frec_magic(r) != TDB_FREE_MAGIC - || frec_ftable(r) == TDB_FTABLE_NONE) { - tdb_access_release(tdb, r); - break; - } - - ftable = frec_ftable(r); - bucket = size_to_bucket(frec_len(r)); - nb_off = ftable_offset(tdb, ftable); - if (TDB_OFF_IS_ERR(nb_off)) { - tdb_access_release(tdb, r); - ecode = TDB_OFF_TO_ERR(nb_off); - goto err; - } - nb_off = bucket_off(nb_off, bucket); - tdb_access_release(tdb, r); - - /* We may be violating lock order here, so best effort. */ - if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) - != TDB_SUCCESS) { - tdb->stats.alloc_coalesce_lockfail++; - break; - } - - /* Now we have lock, re-check. */ - ecode = tdb_read_convert(tdb, end, &rec, sizeof(rec)); - if (ecode != TDB_SUCCESS) { - tdb_unlock_free_bucket(tdb, nb_off); - goto err; - } - - if (unlikely(frec_magic(&rec) != TDB_FREE_MAGIC)) { - tdb->stats.alloc_coalesce_race++; - tdb_unlock_free_bucket(tdb, nb_off); - break; - } - - if (unlikely(frec_ftable(&rec) != ftable) - || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) { - tdb->stats.alloc_coalesce_race++; - tdb_unlock_free_bucket(tdb, nb_off); - break; - } - - /* Did we just mess up a record you were hoping to use? */ - if (end == *protect) { - tdb->stats.alloc_coalesce_iterate_clash++; - *protect = TDB_ERR_TO_OFF(TDB_ERR_NOEXIST); - } - - ecode = remove_from_list(tdb, nb_off, end, &rec); - check_list(tdb, nb_off); - if (ecode != TDB_SUCCESS) { - tdb_unlock_free_bucket(tdb, nb_off); - goto err; - } - - end += sizeof(struct tdb_used_record) + frec_len(&rec); - tdb_unlock_free_bucket(tdb, nb_off); - tdb->stats.alloc_coalesce_num_merged++; - } - - /* Didn't find any adjacent free? */ - if (end == off + sizeof(struct tdb_used_record) + data_len) - return 0; - - /* Before we expand, check this isn't one you wanted protected? */ - if (off == *protect) { - *protect = TDB_ERR_TO_OFF(TDB_ERR_EXISTS); - tdb->stats.alloc_coalesce_iterate_clash++; - } - - /* OK, expand initial record */ - ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec)); - if (ecode != TDB_SUCCESS) { - goto err; - } - - if (frec_len(&rec) != data_len) { - ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "coalesce: expected data len %zu not %zu", - (size_t)data_len, (size_t)frec_len(&rec)); - goto err; - } - - ecode = remove_from_list(tdb, b_off, off, &rec); - check_list(tdb, b_off); - if (ecode != TDB_SUCCESS) { - goto err; - } - - /* Try locking violation first. We don't allow coalesce recursion! */ - ecode = add_free_record(tdb, off, end - off, TDB_LOCK_NOWAIT, false); - if (ecode != TDB_SUCCESS) { - /* Need to drop lock. Can't rely on anything stable. */ - tdb->stats.alloc_coalesce_lockfail++; - *protect = TDB_ERR_TO_OFF(TDB_ERR_CORRUPT); - - /* We have to drop this to avoid deadlocks, so make sure record - * doesn't get coalesced by someone else! 
*/ - rec.ftable_and_len = (TDB_FTABLE_NONE - << (64 - TDB_OFF_UPPER_STEAL)) - | (end - off - sizeof(struct tdb_used_record)); - ecode = tdb_write_off(tdb, - off + offsetof(struct tdb_free_record, - ftable_and_len), - rec.ftable_and_len); - if (ecode != TDB_SUCCESS) { - goto err; - } - - tdb_unlock_free_bucket(tdb, b_off); - - ecode = add_free_record(tdb, off, end - off, TDB_LOCK_WAIT, - false); - if (ecode != TDB_SUCCESS) { - return TDB_ERR_TO_OFF(ecode); - } - } else if (TDB_OFF_IS_ERR(*protect)) { - /* For simplicity, we always drop lock if they can't continue */ - tdb_unlock_free_bucket(tdb, b_off); - } - tdb->stats.alloc_coalesce_succeeded++; - - /* Return usable length. */ - return end - off - sizeof(struct tdb_used_record); - -err: - /* To unify error paths, we *always* unlock bucket on error. */ - tdb_unlock_free_bucket(tdb, b_off); - return TDB_ERR_TO_OFF(ecode); -} - -/* List is locked: we unlock it. */ -static enum TDB_ERROR coalesce_list(struct tdb_context *tdb, - tdb_off_t ftable_off, - tdb_off_t b_off, - unsigned int limit) -{ - enum TDB_ERROR ecode; - tdb_off_t off; - - off = tdb_read_off(tdb, b_off); - if (TDB_OFF_IS_ERR(off)) { - ecode = TDB_OFF_TO_ERR(off); - goto unlock_err; - } - /* A little bit of paranoia: counter should be 0. */ - off &= TDB_OFF_MASK; - - while (off && limit--) { - struct tdb_free_record rec; - tdb_len_t coal; - tdb_off_t next; - - ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec)); - if (ecode != TDB_SUCCESS) - goto unlock_err; - - next = rec.next; - coal = coalesce(tdb, off, b_off, frec_len(&rec), &next); - if (TDB_OFF_IS_ERR(coal)) { - /* This has already unlocked on error. */ - return TDB_OFF_TO_ERR(coal); - } - if (TDB_OFF_IS_ERR(next)) { - /* Coalescing had to unlock, so stop. */ - return TDB_SUCCESS; - } - /* Keep going if we're doing well... */ - limit += size_to_bucket(coal / 16 + TDB_MIN_DATA_LEN); - off = next; - } - - /* Now, move those elements to the tail of the list so we get something - * else next time. */ - if (off) { - struct tdb_free_record oldhrec, newhrec, oldtrec, newtrec; - tdb_off_t oldhoff, oldtoff, newtoff; - - /* The record we were up to is the new head. */ - ecode = tdb_read_convert(tdb, off, &newhrec, sizeof(newhrec)); - if (ecode != TDB_SUCCESS) - goto unlock_err; - - /* Get the new tail. */ - newtoff = frec_prev(&newhrec); - ecode = tdb_read_convert(tdb, newtoff, &newtrec, - sizeof(newtrec)); - if (ecode != TDB_SUCCESS) - goto unlock_err; - - /* Get the old head. */ - oldhoff = tdb_read_off(tdb, b_off); - if (TDB_OFF_IS_ERR(oldhoff)) { - ecode = TDB_OFF_TO_ERR(oldhoff); - goto unlock_err; - } - - /* This could happen if they all coalesced away. */ - if (oldhoff == off) - goto out; - - ecode = tdb_read_convert(tdb, oldhoff, &oldhrec, - sizeof(oldhrec)); - if (ecode != TDB_SUCCESS) - goto unlock_err; - - /* Get the old tail. */ - oldtoff = frec_prev(&oldhrec); - ecode = tdb_read_convert(tdb, oldtoff, &oldtrec, - sizeof(oldtrec)); - if (ecode != TDB_SUCCESS) - goto unlock_err; - - /* Old tail's next points to old head. */ - oldtrec.next = oldhoff; - - /* Old head's prev points to old tail. */ - oldhrec.magic_and_prev - = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL)) - | oldtoff; - - /* New tail's next is 0. */ - newtrec.next = 0; - - /* Write out the modified versions. 
*/ - ecode = tdb_write_convert(tdb, oldtoff, &oldtrec, - sizeof(oldtrec)); - if (ecode != TDB_SUCCESS) - goto unlock_err; - - ecode = tdb_write_convert(tdb, oldhoff, &oldhrec, - sizeof(oldhrec)); - if (ecode != TDB_SUCCESS) - goto unlock_err; - - ecode = tdb_write_convert(tdb, newtoff, &newtrec, - sizeof(newtrec)); - if (ecode != TDB_SUCCESS) - goto unlock_err; - - /* And finally link in new head. */ - ecode = tdb_write_off(tdb, b_off, off); - if (ecode != TDB_SUCCESS) - goto unlock_err; - } -out: - tdb_unlock_free_bucket(tdb, b_off); - return TDB_SUCCESS; - -unlock_err: - tdb_unlock_free_bucket(tdb, b_off); - return ecode; -} - -/* List must not be locked if coalesce_ok is set. */ -enum TDB_ERROR add_free_record(struct tdb_context *tdb, - tdb_off_t off, tdb_len_t len_with_header, - enum tdb_lock_flags waitflag, - bool coalesce) -{ - tdb_off_t b_off; - tdb_len_t len; - enum TDB_ERROR ecode; - - assert(len_with_header >= sizeof(struct tdb_free_record)); - - len = len_with_header - sizeof(struct tdb_used_record); - - b_off = bucket_off(tdb->tdb2.ftable_off, size_to_bucket(len)); - ecode = tdb_lock_free_bucket(tdb, b_off, waitflag); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - ecode = enqueue_in_free(tdb, b_off, off, len, &coalesce); - check_list(tdb, b_off); - - /* Coalescing unlocks free list. */ - if (!ecode && coalesce) - ecode = coalesce_list(tdb, tdb->tdb2.ftable_off, b_off, 2); - else - tdb_unlock_free_bucket(tdb, b_off); - return ecode; -} - -static size_t adjust_size(size_t keylen, size_t datalen) -{ - size_t size = keylen + datalen; - - if (size < TDB_MIN_DATA_LEN) - size = TDB_MIN_DATA_LEN; - - /* Round to next uint64_t boundary. */ - return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL); -} - -/* If we have enough left over to be useful, split that off. */ -static size_t record_leftover(size_t keylen, size_t datalen, - bool want_extra, size_t total_len) -{ - ssize_t leftover; - - if (want_extra) - datalen += datalen / 2; - leftover = total_len - adjust_size(keylen, datalen); - - if (leftover < (ssize_t)sizeof(struct tdb_free_record)) - return 0; - - return leftover; -} - -/* We need size bytes to put our key and data in. */ -static tdb_off_t lock_and_alloc(struct tdb_context *tdb, - tdb_off_t ftable_off, - tdb_off_t bucket, - size_t keylen, size_t datalen, - bool want_extra, - unsigned magic, - unsigned hashlow) -{ - tdb_off_t off, b_off,best_off; - struct tdb_free_record best = { 0 }; - double multiplier; - size_t size = adjust_size(keylen, datalen); - enum TDB_ERROR ecode; - - tdb->stats.allocs++; - b_off = bucket_off(ftable_off, bucket); - - /* FIXME: Try non-blocking wait first, to measure contention. */ - /* Lock this bucket. */ - ecode = tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT); - if (ecode != TDB_SUCCESS) { - return TDB_ERR_TO_OFF(ecode); - } - - best.ftable_and_len = -1ULL; - best_off = 0; - - /* Get slack if we're after extra. */ - if (want_extra) - multiplier = 1.5; - else - multiplier = 1.0; - - /* Walk the list to see if any are large enough, getting less fussy - * as we go. 
*/ - off = tdb_read_off(tdb, b_off); - if (TDB_OFF_IS_ERR(off)) { - ecode = TDB_OFF_TO_ERR(off); - goto unlock_err; - } - off &= TDB_OFF_MASK; - - while (off) { - const struct tdb_free_record *r; - tdb_len_t len; - tdb_off_t next; - - r = tdb_access_read(tdb, off, sizeof(*r), true); - if (TDB_PTR_IS_ERR(r)) { - ecode = TDB_PTR_ERR(r); - goto unlock_err; - } - - if (frec_magic(r) != TDB_FREE_MAGIC) { - ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "lock_and_alloc:" - " %llu non-free 0x%llx", - (long long)off, - (long long)r->magic_and_prev); - tdb_access_release(tdb, r); - goto unlock_err; - } - - if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) { - best_off = off; - best = *r; - } - - if (frec_len(&best) <= size * multiplier && best_off) { - tdb_access_release(tdb, r); - break; - } - - multiplier *= 1.01; - - next = r->next; - len = frec_len(r); - tdb_access_release(tdb, r); - off = next; - } - - /* If we found anything at all, use it. */ - if (best_off) { - struct tdb_used_record rec; - size_t leftover; - - /* We're happy with this size: take it. */ - ecode = remove_from_list(tdb, b_off, best_off, &best); - check_list(tdb, b_off); - if (ecode != TDB_SUCCESS) { - goto unlock_err; - } - - leftover = record_leftover(keylen, datalen, want_extra, - frec_len(&best)); - - assert(keylen + datalen + leftover <= frec_len(&best)); - /* We need to mark non-free before we drop lock, otherwise - * coalesce() could try to merge it! */ - ecode = set_header(tdb, &rec, magic, keylen, datalen, - frec_len(&best) - leftover, hashlow); - if (ecode != TDB_SUCCESS) { - goto unlock_err; - } - - ecode = tdb_write_convert(tdb, best_off, &rec, sizeof(rec)); - if (ecode != TDB_SUCCESS) { - goto unlock_err; - } - - /* For futureproofing, we put a 0 in any unused space. */ - if (rec_extra_padding(&rec)) { - ecode = tdb->tdb2.io->twrite(tdb, best_off + sizeof(rec) - + keylen + datalen, "", 1); - if (ecode != TDB_SUCCESS) { - goto unlock_err; - } - } - - /* Bucket of leftover will be <= current bucket, so nested - * locking is allowed. */ - if (leftover) { - tdb->stats.alloc_leftover++; - ecode = add_free_record(tdb, - best_off + sizeof(rec) - + frec_len(&best) - leftover, - leftover, TDB_LOCK_WAIT, false); - if (ecode != TDB_SUCCESS) { - best_off = TDB_ERR_TO_OFF(ecode); - } - } - tdb_unlock_free_bucket(tdb, b_off); - - return best_off; - } - - tdb_unlock_free_bucket(tdb, b_off); - return 0; - -unlock_err: - tdb_unlock_free_bucket(tdb, b_off); - return TDB_ERR_TO_OFF(ecode); -} - -/* Get a free block from current free list, or 0 if none, -ve on error. */ -static tdb_off_t get_free(struct tdb_context *tdb, - size_t keylen, size_t datalen, bool want_extra, - unsigned magic, unsigned hashlow) -{ - tdb_off_t off, ftable_off; - tdb_off_t start_b, b, ftable; - bool wrapped = false; - - /* If they are growing, add 50% to get to higher bucket. */ - if (want_extra) - start_b = size_to_bucket(adjust_size(keylen, - datalen + datalen / 2)); - else - start_b = size_to_bucket(adjust_size(keylen, datalen)); - - ftable_off = tdb->tdb2.ftable_off; - ftable = tdb->tdb2.ftable; - while (!wrapped || ftable_off != tdb->tdb2.ftable_off) { - /* Start at exact size bucket, and search up... */ - for (b = find_free_head(tdb, ftable_off, start_b); - b < TDB_FREE_BUCKETS; - b = find_free_head(tdb, ftable_off, b + 1)) { - /* Try getting one from list. 
*/ - off = lock_and_alloc(tdb, ftable_off, - b, keylen, datalen, want_extra, - magic, hashlow); - if (TDB_OFF_IS_ERR(off)) - return off; - if (off != 0) { - if (b == start_b) - tdb->stats.alloc_bucket_exact++; - if (b == TDB_FREE_BUCKETS - 1) - tdb->stats.alloc_bucket_max++; - /* Worked? Stay using this list. */ - tdb->tdb2.ftable_off = ftable_off; - tdb->tdb2.ftable = ftable; - return off; - } - /* Didn't work. Try next bucket. */ - } - - if (TDB_OFF_IS_ERR(b)) { - return b; - } - - /* Hmm, try next table. */ - ftable_off = next_ftable(tdb, ftable_off); - if (TDB_OFF_IS_ERR(ftable_off)) { - return ftable_off; - } - ftable++; - - if (ftable_off == 0) { - wrapped = true; - ftable_off = first_ftable(tdb); - if (TDB_OFF_IS_ERR(ftable_off)) { - return ftable_off; - } - ftable = 0; - } - } - - return 0; -} - -enum TDB_ERROR set_header(struct tdb_context *tdb, - struct tdb_used_record *rec, - unsigned magic, uint64_t keylen, uint64_t datalen, - uint64_t actuallen, unsigned hashlow) -{ - uint64_t keybits = (fls64(keylen) + 1) / 2; - - /* Use bottom bits of hash, so it's independent of hash table size. */ - rec->magic_and_meta = (hashlow & ((1 << 11)-1)) - | ((actuallen - (keylen + datalen)) << 11) - | (keybits << 43) - | ((uint64_t)magic << 48); - rec->key_and_data_len = (keylen | (datalen << (keybits*2))); - - /* Encoding can fail on big values. */ - if (rec_key_length(rec) != keylen - || rec_data_length(rec) != datalen - || rec_extra_padding(rec) != actuallen - (keylen + datalen)) { - return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "Could not encode k=%llu,d=%llu,a=%llu", - (long long)keylen, (long long)datalen, - (long long)actuallen); - } - return TDB_SUCCESS; -} - -/* You need 'size', this tells you how much you should expand by. */ -tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size) -{ - tdb_off_t new_size, top_size; - - /* limit size in order to avoid using up huge amounts of memory for - * in memory tdbs if an oddball huge record creeps in */ - if (size > 100 * 1024) { - top_size = map_size + size * 2; - } else { - top_size = map_size + size * 100; - } - - /* always make room for at least top_size more records, and at - least 25% more space. if the DB is smaller than 100MiB, - otherwise grow it by 10% only. */ - if (map_size > 100 * 1024 * 1024) { - new_size = map_size * 1.10; - } else { - new_size = map_size * 1.25; - } - - /* Round the database up to a multiple of the page size */ - if (new_size < top_size) - new_size = top_size; - return new_size - map_size; -} - -/* Expand the database. */ -static enum TDB_ERROR tdb_expand(struct tdb_context *tdb, tdb_len_t size) -{ - uint64_t old_size; - tdb_len_t wanted; - enum TDB_ERROR ecode; - - /* Need to hold a hash lock to expand DB: transactions rely on it. */ - if (!(tdb->flags & TDB_NOLOCK) - && !tdb->file->allrecord_lock.count && !tdb_has_hash_locks(tdb)) { - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_expand: must hold lock during expand"); - } - - /* Only one person can expand file at a time. */ - ecode = tdb_lock_expand(tdb, F_WRLCK); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - /* Someone else may have expanded the file, so retry. */ - old_size = tdb->file->map_size; - tdb->tdb2.io->oob(tdb, tdb->file->map_size, 1, true); - if (tdb->file->map_size != old_size) { - tdb_unlock_expand(tdb, F_WRLCK); - return TDB_SUCCESS; - } - - /* Overallocate. */ - wanted = tdb_expand_adjust(old_size, size); - /* We need room for the record header too. 
*/ - wanted = adjust_size(0, sizeof(struct tdb_used_record) + wanted); - - ecode = tdb->tdb2.io->expand_file(tdb, wanted); - if (ecode != TDB_SUCCESS) { - tdb_unlock_expand(tdb, F_WRLCK); - return ecode; - } - - /* We need to drop this lock before adding free record. */ - tdb_unlock_expand(tdb, F_WRLCK); - - tdb->stats.expands++; - return add_free_record(tdb, old_size, wanted, TDB_LOCK_WAIT, true); -} - -/* This won't fail: it will expand the database if it has to. */ -tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen, - uint64_t hash, unsigned magic, bool growing) -{ - tdb_off_t off; - - /* We can't hold pointers during this: we could unmap! */ - assert(!tdb->tdb2.direct_access); - - for (;;) { - enum TDB_ERROR ecode; - off = get_free(tdb, keylen, datalen, growing, magic, hash); - if (likely(off != 0)) - break; - - ecode = tdb_expand(tdb, adjust_size(keylen, datalen)); - if (ecode != TDB_SUCCESS) { - return TDB_ERR_TO_OFF(ecode); - } - } - - return off; -} diff --git a/ccan/tdb2/hash.c b/ccan/tdb2/hash.c deleted file mode 100644 index 619d56f8..00000000 --- a/ccan/tdb2/hash.c +++ /dev/null @@ -1,913 +0,0 @@ - /* - Trivial Database 2: hash handling - Copyright (C) Rusty Russell 2010 - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ -#include "private.h" -#include -#include - -/* Default hash function. */ -uint64_t tdb_jenkins_hash(const void *key, size_t length, uint64_t seed, - void *unused) -{ - uint64_t ret; - /* hash64_stable assumes lower bits are more important; they are a - * slightly better hash. We use the upper bits first, so swap them. */ - ret = hash64_stable((const unsigned char *)key, length, seed); - return (ret >> 32) | (ret << 32); -} - -uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len) -{ - return tdb->hash_fn(ptr, len, tdb->hash_seed, tdb->hash_data); -} - -uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off) -{ - const struct tdb_used_record *r; - const void *key; - uint64_t klen, hash; - - r = tdb_access_read(tdb, off, sizeof(*r), true); - if (TDB_PTR_IS_ERR(r)) { - /* FIXME */ - return 0; - } - - klen = rec_key_length(r); - tdb_access_release(tdb, r); - - key = tdb_access_read(tdb, off + sizeof(*r), klen, false); - if (TDB_PTR_IS_ERR(key)) { - return 0; - } - - hash = tdb_hash(tdb, key, klen); - tdb_access_release(tdb, key); - return hash; -} - -/* Get bits from a value. */ -static uint32_t bits_from(uint64_t val, unsigned start, unsigned num) -{ - assert(num <= 32); - return (val >> start) & ((1U << num) - 1); -} - -/* We take bits from the top: that way we can lock whole sections of the hash - * by using lock ranges. 
*/ -static uint32_t use_bits(struct hash_info *h, unsigned num) -{ - h->hash_used += num; - return bits_from(h->h, 64 - h->hash_used, num); -} - -static tdb_bool_err key_matches(struct tdb_context *tdb, - const struct tdb_used_record *rec, - tdb_off_t off, - const struct tdb_data *key) -{ - tdb_bool_err ret = false; - const char *rkey; - - if (rec_key_length(rec) != key->dsize) { - tdb->stats.compare_wrong_keylen++; - return ret; - } - - rkey = tdb_access_read(tdb, off + sizeof(*rec), key->dsize, false); - if (TDB_PTR_IS_ERR(rkey)) { - return (tdb_bool_err)TDB_PTR_ERR(rkey); - } - if (memcmp(rkey, key->dptr, key->dsize) == 0) - ret = true; - else - tdb->stats.compare_wrong_keycmp++; - tdb_access_release(tdb, rkey); - return ret; -} - -/* Does entry match? */ -static tdb_bool_err match(struct tdb_context *tdb, - struct hash_info *h, - const struct tdb_data *key, - tdb_off_t val, - struct tdb_used_record *rec) -{ - tdb_off_t off; - enum TDB_ERROR ecode; - - tdb->stats.compares++; - /* Desired bucket must match. */ - if (h->home_bucket != (val & TDB_OFF_HASH_GROUP_MASK)) { - tdb->stats.compare_wrong_bucket++; - return false; - } - - /* Top bits of offset == next bits of hash. */ - if (bits_from(val, TDB_OFF_HASH_EXTRA_BIT, TDB_OFF_UPPER_STEAL_EXTRA) - != bits_from(h->h, 64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA, - TDB_OFF_UPPER_STEAL_EXTRA)) { - tdb->stats.compare_wrong_offsetbits++; - return false; - } - - off = val & TDB_OFF_MASK; - ecode = tdb_read_convert(tdb, off, rec, sizeof(*rec)); - if (ecode != TDB_SUCCESS) { - return (tdb_bool_err)ecode; - } - - if ((h->h & ((1 << 11)-1)) != rec_hash(rec)) { - tdb->stats.compare_wrong_rechash++; - return false; - } - - return key_matches(tdb, rec, off, key); -} - -static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket) -{ - return group_start - + (bucket % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t); -} - -bool is_subhash(tdb_off_t val) -{ - return (val >> TDB_OFF_UPPER_STEAL_SUBHASH_BIT) & 1; -} - -/* FIXME: Guess the depth, don't over-lock! */ -static tdb_off_t hlock_range(tdb_off_t group, tdb_off_t *size) -{ - *size = 1ULL << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS)); - return group << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS)); -} - -static tdb_off_t COLD find_in_chain(struct tdb_context *tdb, - struct tdb_data key, - tdb_off_t chain, - struct hash_info *h, - struct tdb_used_record *rec, - struct traverse_info *tinfo) -{ - tdb_off_t off, next; - enum TDB_ERROR ecode; - - /* In case nothing is free, we set these to zero. */ - h->home_bucket = h->found_bucket = 0; - - for (off = chain; off; off = next) { - unsigned int i; - - h->group_start = off; - ecode = tdb_read_convert(tdb, off, h->group, sizeof(h->group)); - if (ecode != TDB_SUCCESS) { - return TDB_ERR_TO_OFF(ecode); - } - - for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) { - tdb_off_t recoff; - if (!h->group[i]) { - /* Remember this empty bucket. */ - h->home_bucket = h->found_bucket = i; - continue; - } - - /* We can insert extra bits via add_to_hash - * empty bucket logic. 
*/ - recoff = h->group[i] & TDB_OFF_MASK; - ecode = tdb_read_convert(tdb, recoff, rec, - sizeof(*rec)); - if (ecode != TDB_SUCCESS) { - return TDB_ERR_TO_OFF(ecode); - } - - ecode = TDB_OFF_TO_ERR(key_matches(tdb, rec, recoff, - &key)); - if (ecode < 0) { - return TDB_ERR_TO_OFF(ecode); - } - if (ecode == (enum TDB_ERROR)1) { - h->home_bucket = h->found_bucket = i; - - if (tinfo) { - tinfo->levels[tinfo->num_levels] - .hashtable = off; - tinfo->levels[tinfo->num_levels] - .total_buckets - = 1 << TDB_HASH_GROUP_BITS; - tinfo->levels[tinfo->num_levels].entry - = i; - tinfo->num_levels++; - } - return recoff; - } - } - next = tdb_read_off(tdb, off - + offsetof(struct tdb_chain, next)); - if (TDB_OFF_IS_ERR(next)) { - return next; - } - if (next) - next += sizeof(struct tdb_used_record); - } - return 0; -} - -/* This is the core routine which searches the hashtable for an entry. - * On error, no locks are held and -ve is returned. - * Otherwise, hinfo is filled in (and the optional tinfo). - * If not found, the return value is 0. - * If found, the return value is the offset, and *rec is the record. */ -tdb_off_t find_and_lock(struct tdb_context *tdb, - struct tdb_data key, - int ltype, - struct hash_info *h, - struct tdb_used_record *rec, - struct traverse_info *tinfo) -{ - uint32_t i, group; - tdb_off_t hashtable; - enum TDB_ERROR ecode; - - h->h = tdb_hash(tdb, key.dptr, key.dsize); - h->hash_used = 0; - group = use_bits(h, TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS); - h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS); - - h->hlock_start = hlock_range(group, &h->hlock_range); - ecode = tdb_lock_hashes(tdb, h->hlock_start, h->hlock_range, ltype, - TDB_LOCK_WAIT); - if (ecode != TDB_SUCCESS) { - return TDB_ERR_TO_OFF(ecode); - } - - hashtable = offsetof(struct tdb_header, hashtable); - if (tinfo) { - tinfo->toplevel_group = group; - tinfo->num_levels = 1; - tinfo->levels[0].entry = 0; - tinfo->levels[0].hashtable = hashtable - + (group << TDB_HASH_GROUP_BITS) * sizeof(tdb_off_t); - tinfo->levels[0].total_buckets = 1 << TDB_HASH_GROUP_BITS; - } - - while (h->hash_used <= 64) { - /* Read in the hash group. */ - h->group_start = hashtable - + group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS); - - ecode = tdb_read_convert(tdb, h->group_start, &h->group, - sizeof(h->group)); - if (ecode != TDB_SUCCESS) { - goto fail; - } - - /* Pointer to another hash table? Go down... 
*/ - if (is_subhash(h->group[h->home_bucket])) { - hashtable = (h->group[h->home_bucket] & TDB_OFF_MASK) - + sizeof(struct tdb_used_record); - if (tinfo) { - /* When we come back, use *next* bucket */ - tinfo->levels[tinfo->num_levels-1].entry - += h->home_bucket + 1; - } - group = use_bits(h, TDB_SUBLEVEL_HASH_BITS - - TDB_HASH_GROUP_BITS); - h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS); - if (tinfo) { - tinfo->levels[tinfo->num_levels].hashtable - = hashtable; - tinfo->levels[tinfo->num_levels].total_buckets - = 1 << TDB_SUBLEVEL_HASH_BITS; - tinfo->levels[tinfo->num_levels].entry - = group << TDB_HASH_GROUP_BITS; - tinfo->num_levels++; - } - continue; - } - - /* It's in this group: search (until 0 or all searched) */ - for (i = 0, h->found_bucket = h->home_bucket; - i < (1 << TDB_HASH_GROUP_BITS); - i++, h->found_bucket = ((h->found_bucket+1) - % (1 << TDB_HASH_GROUP_BITS))) { - tdb_bool_err berr; - if (is_subhash(h->group[h->found_bucket])) - continue; - - if (!h->group[h->found_bucket]) - break; - - berr = match(tdb, h, &key, h->group[h->found_bucket], - rec); - if (berr < 0) { - ecode = TDB_OFF_TO_ERR(berr); - goto fail; - } - if (berr) { - if (tinfo) { - tinfo->levels[tinfo->num_levels-1].entry - += h->found_bucket; - } - return h->group[h->found_bucket] & TDB_OFF_MASK; - } - } - /* Didn't find it: h indicates where it would go. */ - return 0; - } - - return find_in_chain(tdb, key, hashtable, h, rec, tinfo); - -fail: - tdb_unlock_hashes(tdb, h->hlock_start, h->hlock_range, ltype); - return TDB_ERR_TO_OFF(ecode); -} - -/* I wrote a simple test, expanding a hash to 2GB, for the following - * cases: - * 1) Expanding all the buckets at once, - * 2) Expanding the bucket we wanted to place the new entry into. - * 3) Expanding the most-populated bucket, - * - * I measured the worst/average/best density during this process. - * 1) 3%/16%/30% - * 2) 4%/20%/38% - * 3) 6%/22%/41% - * - * So we figure out the busiest bucket for the moment. - */ -static unsigned fullest_bucket(struct tdb_context *tdb, - const tdb_off_t *group, - unsigned new_bucket) -{ - unsigned counts[1 << TDB_HASH_GROUP_BITS] = { 0 }; - unsigned int i, best_bucket; - - /* Count the new entry. */ - counts[new_bucket]++; - best_bucket = new_bucket; - - for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) { - unsigned this_bucket; - - if (is_subhash(group[i])) - continue; - this_bucket = group[i] & TDB_OFF_HASH_GROUP_MASK; - if (++counts[this_bucket] > counts[best_bucket]) - best_bucket = this_bucket; - } - - return best_bucket; -} - -static bool put_into_group(tdb_off_t *group, - unsigned bucket, tdb_off_t encoded) -{ - unsigned int i; - - for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) { - unsigned b = (bucket + i) % (1 << TDB_HASH_GROUP_BITS); - - if (group[b] == 0) { - group[b] = encoded; - return true; - } - } - return false; -} - -static void force_into_group(tdb_off_t *group, - unsigned bucket, tdb_off_t encoded) -{ - if (!put_into_group(group, bucket, encoded)) - abort(); -} - -static tdb_off_t encode_offset(tdb_off_t new_off, struct hash_info *h) -{ - return h->home_bucket - | new_off - | ((uint64_t)bits_from(h->h, - 64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA, - TDB_OFF_UPPER_STEAL_EXTRA) - << TDB_OFF_HASH_EXTRA_BIT); -} - -/* Simply overwrite the hash entry we found before. 
*/ -enum TDB_ERROR replace_in_hash(struct tdb_context *tdb, - struct hash_info *h, - tdb_off_t new_off) -{ - return tdb_write_off(tdb, hbucket_off(h->group_start, h->found_bucket), - encode_offset(new_off, h)); -} - -/* We slot in anywhere that's empty in the chain. */ -static enum TDB_ERROR COLD add_to_chain(struct tdb_context *tdb, - tdb_off_t subhash, - tdb_off_t new_off) -{ - tdb_off_t entry; - enum TDB_ERROR ecode; - - entry = tdb_find_zero_off(tdb, subhash, 1< 64) - return add_to_chain(tdb, subhash, off); - - h.h = hash_record(tdb, off); - gnum = use_bits(&h, TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS); - h.group_start = subhash - + gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS); - h.home_bucket = use_bits(&h, TDB_HASH_GROUP_BITS); - - group = tdb_access_write(tdb, h.group_start, - sizeof(*group) << TDB_HASH_GROUP_BITS, true); - if (TDB_PTR_IS_ERR(group)) { - return TDB_PTR_ERR(group); - } - force_into_group(group, h.home_bucket, encode_offset(off, &h)); - return tdb_access_commit(tdb, group); -} - -static enum TDB_ERROR expand_group(struct tdb_context *tdb, struct hash_info *h) -{ - unsigned bucket, num_vals, i, magic; - size_t subsize; - tdb_off_t subhash; - tdb_off_t vals[1 << TDB_HASH_GROUP_BITS]; - enum TDB_ERROR ecode; - - /* Attach new empty subhash under fullest bucket. */ - bucket = fullest_bucket(tdb, h->group, h->home_bucket); - - if (h->hash_used == 64) { - tdb->stats.alloc_chain++; - subsize = sizeof(struct tdb_chain); - magic = TDB_CHAIN_MAGIC; - } else { - tdb->stats.alloc_subhash++; - subsize = (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS); - magic = TDB_HTABLE_MAGIC; - } - - subhash = alloc(tdb, 0, subsize, 0, magic, false); - if (TDB_OFF_IS_ERR(subhash)) { - return TDB_OFF_TO_ERR(subhash); - } - - ecode = zero_out(tdb, subhash + sizeof(struct tdb_used_record), - subsize); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - /* Remove any which are destined for bucket or are in wrong place. */ - num_vals = 0; - for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) { - unsigned home_bucket = h->group[i] & TDB_OFF_HASH_GROUP_MASK; - if (!h->group[i] || is_subhash(h->group[i])) - continue; - if (home_bucket == bucket || home_bucket != i) { - vals[num_vals++] = h->group[i]; - h->group[i] = 0; - } - } - /* FIXME: This assert is valid, but we do this during unit test :( */ - /* assert(num_vals); */ - - /* Overwrite expanded bucket with subhash pointer. */ - h->group[bucket] = subhash | (1ULL << TDB_OFF_UPPER_STEAL_SUBHASH_BIT); - - /* Point to actual contents of record. */ - subhash += sizeof(struct tdb_used_record); - - /* Put values back. */ - for (i = 0; i < num_vals; i++) { - unsigned this_bucket = vals[i] & TDB_OFF_HASH_GROUP_MASK; - - if (this_bucket == bucket) { - ecode = add_to_subhash(tdb, subhash, h->hash_used, - vals[i]); - if (ecode != TDB_SUCCESS) - return ecode; - } else { - /* There should be room to put this back. */ - force_into_group(h->group, this_bucket, vals[i]); - } - } - return TDB_SUCCESS; -} - -enum TDB_ERROR delete_from_hash(struct tdb_context *tdb, struct hash_info *h) -{ - unsigned int i, num_movers = 0; - tdb_off_t movers[1 << TDB_HASH_GROUP_BITS]; - - h->group[h->found_bucket] = 0; - for (i = 1; i < (1 << TDB_HASH_GROUP_BITS); i++) { - unsigned this_bucket; - - this_bucket = (h->found_bucket+i) % (1 << TDB_HASH_GROUP_BITS); - /* Empty bucket? We're done. */ - if (!h->group[this_bucket]) - break; - - /* Ignore subhashes. */ - if (is_subhash(h->group[this_bucket])) - continue; - - /* If this one is not happy where it is, we'll move it. 
*/ - if ((h->group[this_bucket] & TDB_OFF_HASH_GROUP_MASK) - != this_bucket) { - movers[num_movers++] = h->group[this_bucket]; - h->group[this_bucket] = 0; - } - } - - /* Put back the ones we erased. */ - for (i = 0; i < num_movers; i++) { - force_into_group(h->group, movers[i] & TDB_OFF_HASH_GROUP_MASK, - movers[i]); - } - - /* Now we write back the hash group */ - return tdb_write_convert(tdb, h->group_start, - h->group, sizeof(h->group)); -} - -enum TDB_ERROR add_to_hash(struct tdb_context *tdb, struct hash_info *h, - tdb_off_t new_off) -{ - enum TDB_ERROR ecode; - - /* We hit an empty bucket during search? That's where it goes. */ - if (!h->group[h->found_bucket]) { - h->group[h->found_bucket] = encode_offset(new_off, h); - /* Write back the modified group. */ - return tdb_write_convert(tdb, h->group_start, - h->group, sizeof(h->group)); - } - - if (h->hash_used > 64) - return add_to_chain(tdb, h->group_start, new_off); - - /* We're full. Expand. */ - ecode = expand_group(tdb, h); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - if (is_subhash(h->group[h->home_bucket])) { - /* We were expanded! */ - tdb_off_t hashtable; - unsigned int gnum; - - /* Write back the modified group. */ - ecode = tdb_write_convert(tdb, h->group_start, h->group, - sizeof(h->group)); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - /* Move hashinfo down a level. */ - hashtable = (h->group[h->home_bucket] & TDB_OFF_MASK) - + sizeof(struct tdb_used_record); - gnum = use_bits(h,TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS); - h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS); - h->group_start = hashtable - + gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS); - ecode = tdb_read_convert(tdb, h->group_start, &h->group, - sizeof(h->group)); - if (ecode != TDB_SUCCESS) { - return ecode; - } - } - - /* Expanding the group must have made room if it didn't choose this - * bucket. */ - if (put_into_group(h->group, h->home_bucket, encode_offset(new_off,h))){ - return tdb_write_convert(tdb, h->group_start, - h->group, sizeof(h->group)); - } - - /* This can happen if all hashes in group (and us) dropped into same - * group in subhash. */ - return add_to_hash(tdb, h, new_off); -} - -/* Traverse support: returns offset of record, or 0 or -ve error. */ -static tdb_off_t iterate_hash(struct tdb_context *tdb, - struct traverse_info *tinfo) -{ - tdb_off_t off, val, i; - struct traverse_level *tlevel; - - tlevel = &tinfo->levels[tinfo->num_levels-1]; - -again: - for (i = tdb_find_nonzero_off(tdb, tlevel->hashtable, - tlevel->entry, tlevel->total_buckets); - i != tlevel->total_buckets; - i = tdb_find_nonzero_off(tdb, tlevel->hashtable, - i+1, tlevel->total_buckets)) { - if (TDB_OFF_IS_ERR(i)) { - return i; - } - - val = tdb_read_off(tdb, tlevel->hashtable+sizeof(tdb_off_t)*i); - if (TDB_OFF_IS_ERR(val)) { - return val; - } - - off = val & TDB_OFF_MASK; - - /* This makes the delete-all-in-traverse case work - * (and simplifies our logic a little). */ - if (off == tinfo->prev) - continue; - - tlevel->entry = i; - - if (!is_subhash(val)) { - /* Found one. */ - tinfo->prev = off; - return off; - } - - /* When we come back, we want the next one */ - tlevel->entry++; - tinfo->num_levels++; - tlevel++; - tlevel->hashtable = off + sizeof(struct tdb_used_record); - tlevel->entry = 0; - /* Next level is a chain? */ - if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1)) - tlevel->total_buckets = (1 << TDB_HASH_GROUP_BITS); - else - tlevel->total_buckets = (1 << TDB_SUBLEVEL_HASH_BITS); - goto again; - } - - /* Nothing there? 
*/ - if (tinfo->num_levels == 1) - return 0; - - /* Handle chained entries. */ - if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1)) { - tlevel->hashtable = tdb_read_off(tdb, tlevel->hashtable - + offsetof(struct tdb_chain, - next)); - if (TDB_OFF_IS_ERR(tlevel->hashtable)) { - return tlevel->hashtable; - } - if (tlevel->hashtable) { - tlevel->hashtable += sizeof(struct tdb_used_record); - tlevel->entry = 0; - goto again; - } - } - - /* Go back up and keep searching. */ - tinfo->num_levels--; - tlevel--; - goto again; -} - -/* Return success if we find something, TDB_ERR_NOEXIST if none. */ -enum TDB_ERROR next_in_hash(struct tdb_context *tdb, - struct traverse_info *tinfo, - TDB_DATA *kbuf, size_t *dlen) -{ - const unsigned group_bits = TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS; - tdb_off_t hl_start, hl_range, off; - enum TDB_ERROR ecode; - - while (tinfo->toplevel_group < (1 << group_bits)) { - hl_start = (tdb_off_t)tinfo->toplevel_group - << (64 - group_bits); - hl_range = 1ULL << group_bits; - ecode = tdb_lock_hashes(tdb, hl_start, hl_range, F_RDLCK, - TDB_LOCK_WAIT); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - off = iterate_hash(tdb, tinfo); - if (off) { - struct tdb_used_record rec; - - if (TDB_OFF_IS_ERR(off)) { - ecode = TDB_OFF_TO_ERR(off); - goto fail; - } - - ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec)); - if (ecode != TDB_SUCCESS) { - goto fail; - } - if (rec_magic(&rec) != TDB_USED_MAGIC) { - ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "next_in_hash:" - " corrupt record at %llu", - (long long)off); - goto fail; - } - - kbuf->dsize = rec_key_length(&rec); - - /* They want data as well? */ - if (dlen) { - *dlen = rec_data_length(&rec); - kbuf->dptr = tdb_alloc_read(tdb, - off + sizeof(rec), - kbuf->dsize - + *dlen); - } else { - kbuf->dptr = tdb_alloc_read(tdb, - off + sizeof(rec), - kbuf->dsize); - } - tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK); - if (TDB_PTR_IS_ERR(kbuf->dptr)) { - return TDB_PTR_ERR(kbuf->dptr); - } - return TDB_SUCCESS; - } - - tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK); - - tinfo->toplevel_group++; - tinfo->levels[0].hashtable - += (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS); - tinfo->levels[0].entry = 0; - } - return TDB_ERR_NOEXIST; - -fail: - tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK); - return ecode; - -} - -enum TDB_ERROR first_in_hash(struct tdb_context *tdb, - struct traverse_info *tinfo, - TDB_DATA *kbuf, size_t *dlen) -{ - tinfo->prev = 0; - tinfo->toplevel_group = 0; - tinfo->num_levels = 1; - tinfo->levels[0].hashtable = offsetof(struct tdb_header, hashtable); - tinfo->levels[0].entry = 0; - tinfo->levels[0].total_buckets = (1 << TDB_HASH_GROUP_BITS); - - return next_in_hash(tdb, tinfo, kbuf, dlen); -} - -/* Even if the entry isn't in this hash bucket, you'd have to lock this - * bucket to find it. */ -static enum TDB_ERROR chainlock(struct tdb_context *tdb, const TDB_DATA *key, - int ltype, enum tdb_lock_flags waitflag, - const char *func) -{ - enum TDB_ERROR ecode; - uint64_t h = tdb_hash(tdb, key->dptr, key->dsize); - tdb_off_t lockstart, locksize; - unsigned int group, gbits; - - gbits = TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS; - group = bits_from(h, 64 - gbits, gbits); - - lockstart = hlock_range(group, &locksize); - - ecode = tdb_lock_hashes(tdb, lockstart, locksize, ltype, waitflag); - tdb_trace_1rec(tdb, func, *key); - return ecode; -} - -/* lock/unlock one hash chain. 
This is meant to be used to reduce - contention - it cannot guarantee how many records will be locked */ -enum TDB_ERROR tdb_chainlock(struct tdb_context *tdb, TDB_DATA key) -{ - if (tdb->flags & TDB_VERSION1) { - if (tdb1_chainlock(tdb, key) == -1) - return tdb->last_error; - return TDB_SUCCESS; - } - return tdb->last_error = chainlock(tdb, &key, F_WRLCK, TDB_LOCK_WAIT, - "tdb_chainlock"); -} - -void tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key) -{ - uint64_t h = tdb_hash(tdb, key.dptr, key.dsize); - tdb_off_t lockstart, locksize; - unsigned int group, gbits; - - if (tdb->flags & TDB_VERSION1) { - tdb1_chainunlock(tdb, key); - return; - } - - gbits = TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS; - group = bits_from(h, 64 - gbits, gbits); - - lockstart = hlock_range(group, &locksize); - - tdb_trace_1rec(tdb, "tdb_chainunlock", key); - tdb_unlock_hashes(tdb, lockstart, locksize, F_WRLCK); -} - -enum TDB_ERROR tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key) -{ - if (tdb->flags & TDB_VERSION1) { - if (tdb1_chainlock_read(tdb, key) == -1) - return tdb->last_error; - return TDB_SUCCESS; - } - return tdb->last_error = chainlock(tdb, &key, F_RDLCK, TDB_LOCK_WAIT, - "tdb_chainlock_read"); -} - -void tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key) -{ - uint64_t h = tdb_hash(tdb, key.dptr, key.dsize); - tdb_off_t lockstart, locksize; - unsigned int group, gbits; - - if (tdb->flags & TDB_VERSION1) { - tdb1_chainunlock_read(tdb, key); - return; - } - gbits = TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS; - group = bits_from(h, 64 - gbits, gbits); - - lockstart = hlock_range(group, &locksize); - - tdb_trace_1rec(tdb, "tdb_chainunlock_read", key); - tdb_unlock_hashes(tdb, lockstart, locksize, F_RDLCK); -} diff --git a/ccan/tdb2/io.c b/ccan/tdb2/io.c deleted file mode 100644 index b4a6f0be..00000000 --- a/ccan/tdb2/io.c +++ /dev/null @@ -1,640 +0,0 @@ - /* - Unix SMB/CIFS implementation. - - trivial database library - - Copyright (C) Andrew Tridgell 1999-2005 - Copyright (C) Paul `Rusty' Russell 2000 - Copyright (C) Jeremy Allison 2000-2003 - Copyright (C) Rusty Russell 2010 - - ** NOTE! The following LGPL license applies to the tdb - ** library. This does NOT imply that all of Samba is released - ** under the LGPL - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ -#include "private.h" -#include -#include - -void tdb_munmap(struct tdb_file *file) -{ - if (file->fd == -1) - return; - - if (file->map_ptr) { - munmap(file->map_ptr, file->map_size); - file->map_ptr = NULL; - } -} - -void tdb_mmap(struct tdb_context *tdb) -{ - int mmap_flags; - - if (tdb->flags & TDB_INTERNAL) - return; - - if (tdb->flags & TDB_NOMMAP) - return; - - if ((tdb->open_flags & O_ACCMODE) == O_RDONLY) - mmap_flags = PROT_READ; - else - mmap_flags = PROT_READ | PROT_WRITE; - - /* size_t can be smaller than off_t. 
*/ - if ((size_t)tdb->file->map_size == tdb->file->map_size) { - tdb->file->map_ptr = mmap(NULL, tdb->file->map_size, - mmap_flags, - MAP_SHARED, tdb->file->fd, 0); - } else - tdb->file->map_ptr = MAP_FAILED; - - /* - * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!! - */ - if (tdb->file->map_ptr == MAP_FAILED) { - tdb->file->map_ptr = NULL; - tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING, - "tdb_mmap failed for size %lld (%s)", - (long long)tdb->file->map_size, strerror(errno)); - } -} - -/* check for an out of bounds access - if it is out of bounds then - see if the database has been expanded by someone else and expand - if necessary - note that "len" is the minimum length needed for the db. - - If probe is true, len being too large isn't a failure. -*/ -static enum TDB_ERROR tdb_oob(struct tdb_context *tdb, - tdb_off_t off, tdb_len_t len, bool probe) -{ - struct stat st; - enum TDB_ERROR ecode; - - /* We can't hold pointers during this: we could unmap! */ - assert(!tdb->tdb2.direct_access - || (tdb->flags & TDB_NOLOCK) - || tdb_has_expansion_lock(tdb)); - - if (len + off < len) { - if (probe) - return TDB_SUCCESS; - - return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_oob off %llu len %llu wrap\n", - (long long)off, (long long)len); - } - - if (len + off <= tdb->file->map_size) - return TDB_SUCCESS; - if (tdb->flags & TDB_INTERNAL) { - if (probe) - return TDB_SUCCESS; - - tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_oob len %lld beyond internal" - " malloc size %lld", - (long long)(off + len), - (long long)tdb->file->map_size); - return TDB_ERR_IO; - } - - ecode = tdb_lock_expand(tdb, F_RDLCK); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - if (fstat(tdb->file->fd, &st) != 0) { - tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "Failed to fstat file: %s", strerror(errno)); - tdb_unlock_expand(tdb, F_RDLCK); - return TDB_ERR_IO; - } - - tdb_unlock_expand(tdb, F_RDLCK); - - if (st.st_size < off + len) { - if (probe) - return TDB_SUCCESS; - - tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_oob len %llu beyond eof at %zu", - (long long)(off + len), st.st_size); - return TDB_ERR_IO; - } - - /* Unmap, update size, remap */ - tdb_munmap(tdb->file); - - tdb->file->map_size = st.st_size; - tdb_mmap(tdb); - return TDB_SUCCESS; -} - -/* Endian conversion: we only ever deal with 8 byte quantities */ -void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size) -{ - assert(size % 8 == 0); - if (unlikely((tdb->flags & TDB_CONVERT)) && buf) { - uint64_t i, *p = (uint64_t *)buf; - for (i = 0; i < size / 8; i++) - p[i] = bswap_64(p[i]); - } - return buf; -} - -/* Return first non-zero offset in offset array, or end, or -ve error. */ -/* FIXME: Return the off? */ -uint64_t tdb_find_nonzero_off(struct tdb_context *tdb, - tdb_off_t base, uint64_t start, uint64_t end) -{ - uint64_t i; - const uint64_t *val; - - /* Zero vs non-zero is the same unconverted: minor optimization. */ - val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t), - (end - start) * sizeof(tdb_off_t), false); - if (TDB_PTR_IS_ERR(val)) { - return TDB_ERR_TO_OFF(TDB_PTR_ERR(val)); - } - - for (i = 0; i < (end - start); i++) { - if (val[i]) - break; - } - tdb_access_release(tdb, val); - return start + i; -} - -/* Return first zero offset in num offset array, or num, or -ve error. */ -uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off, - uint64_t num) -{ - uint64_t i; - const uint64_t *val; - - /* Zero vs non-zero is the same unconverted: minor optimization. 
*/ - val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false); - if (TDB_PTR_IS_ERR(val)) { - return TDB_ERR_TO_OFF(TDB_PTR_ERR(val)); - } - - for (i = 0; i < num; i++) { - if (!val[i]) - break; - } - tdb_access_release(tdb, val); - return i; -} - -enum TDB_ERROR zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len) -{ - char buf[8192] = { 0 }; - void *p = tdb->tdb2.io->direct(tdb, off, len, true); - enum TDB_ERROR ecode = TDB_SUCCESS; - - assert(!(tdb->flags & TDB_RDONLY)); - if (TDB_PTR_IS_ERR(p)) { - return TDB_PTR_ERR(p); - } - if (p) { - memset(p, 0, len); - return ecode; - } - while (len) { - unsigned todo = len < sizeof(buf) ? len : sizeof(buf); - ecode = tdb->tdb2.io->twrite(tdb, off, buf, todo); - if (ecode != TDB_SUCCESS) { - break; - } - len -= todo; - off += todo; - } - return ecode; -} - -tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off) -{ - tdb_off_t ret; - enum TDB_ERROR ecode; - - if (likely(!(tdb->flags & TDB_CONVERT))) { - tdb_off_t *p = tdb->tdb2.io->direct(tdb, off, sizeof(*p), - false); - if (TDB_PTR_IS_ERR(p)) { - return TDB_ERR_TO_OFF(TDB_PTR_ERR(p)); - } - if (p) - return *p; - } - - ecode = tdb_read_convert(tdb, off, &ret, sizeof(ret)); - if (ecode != TDB_SUCCESS) { - return TDB_ERR_TO_OFF(ecode); - } - return ret; -} - -/* write a lump of data at a specified offset */ -static enum TDB_ERROR tdb_write(struct tdb_context *tdb, tdb_off_t off, - const void *buf, tdb_len_t len) -{ - enum TDB_ERROR ecode; - - if (tdb->flags & TDB_RDONLY) { - return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR, - "Write to read-only database"); - } - - ecode = tdb->tdb2.io->oob(tdb, off, len, false); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - if (tdb->file->map_ptr) { - memcpy(off + (char *)tdb->file->map_ptr, buf, len); - } else { - ssize_t ret; - ret = pwrite(tdb->file->fd, buf, len, off); - if (ret != len) { - /* This shouldn't happen: we avoid sparse files. 
*/ - if (ret >= 0) - errno = ENOSPC; - - return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_write: %zi at %zu len=%zu (%s)", - ret, (size_t)off, (size_t)len, - strerror(errno)); - } - } - return TDB_SUCCESS; -} - -/* read a lump of data at a specified offset */ -static enum TDB_ERROR tdb_read(struct tdb_context *tdb, tdb_off_t off, - void *buf, tdb_len_t len) -{ - enum TDB_ERROR ecode; - - ecode = tdb->tdb2.io->oob(tdb, off, len, false); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - if (tdb->file->map_ptr) { - memcpy(buf, off + (char *)tdb->file->map_ptr, len); - } else { - ssize_t r = pread(tdb->file->fd, buf, len, off); - if (r != len) { - return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_read failed with %zi at %zu " - "len=%zu (%s) map_size=%zu", - r, (size_t)off, (size_t)len, - strerror(errno), - (size_t)tdb->file->map_size); - } - } - return TDB_SUCCESS; -} - -enum TDB_ERROR tdb_write_convert(struct tdb_context *tdb, tdb_off_t off, - const void *rec, size_t len) -{ - enum TDB_ERROR ecode; - - if (unlikely((tdb->flags & TDB_CONVERT))) { - void *conv = malloc(len); - if (!conv) { - return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "tdb_write: no memory converting" - " %zu bytes", len); - } - memcpy(conv, rec, len); - ecode = tdb->tdb2.io->twrite(tdb, off, - tdb_convert(tdb, conv, len), len); - free(conv); - } else { - ecode = tdb->tdb2.io->twrite(tdb, off, rec, len); - } - return ecode; -} - -enum TDB_ERROR tdb_read_convert(struct tdb_context *tdb, tdb_off_t off, - void *rec, size_t len) -{ - enum TDB_ERROR ecode = tdb->tdb2.io->tread(tdb, off, rec, len); - tdb_convert(tdb, rec, len); - return ecode; -} - -enum TDB_ERROR tdb_write_off(struct tdb_context *tdb, - tdb_off_t off, tdb_off_t val) -{ - if (tdb->flags & TDB_RDONLY) { - return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR, - "Write to read-only database"); - } - - if (likely(!(tdb->flags & TDB_CONVERT))) { - tdb_off_t *p = tdb->tdb2.io->direct(tdb, off, sizeof(*p), - true); - if (TDB_PTR_IS_ERR(p)) { - return TDB_PTR_ERR(p); - } - if (p) { - *p = val; - return TDB_SUCCESS; - } - } - return tdb_write_convert(tdb, off, &val, sizeof(val)); -} - -static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, - tdb_len_t len, unsigned int prefix) -{ - unsigned char *buf; - enum TDB_ERROR ecode; - - /* some systems don't like zero length malloc */ - buf = malloc(prefix + len ? prefix + len : 1); - if (!buf) { - tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_USE_ERROR, - "tdb_alloc_read malloc failed len=%zu", - (size_t)(prefix + len)); - return TDB_ERR_PTR(TDB_ERR_OOM); - } else { - ecode = tdb->tdb2.io->tread(tdb, offset, buf+prefix, len); - if (unlikely(ecode != TDB_SUCCESS)) { - free(buf); - return TDB_ERR_PTR(ecode); - } - } - return buf; -} - -/* read a lump of data, allocating the space for it */ -void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len) -{ - return _tdb_alloc_read(tdb, offset, len, 0); -} - -static enum TDB_ERROR fill(struct tdb_context *tdb, - const void *buf, size_t size, - tdb_off_t off, tdb_len_t len) -{ - while (len) { - size_t n = len > size ? size : len; - ssize_t ret = pwrite(tdb->file->fd, buf, n, off); - if (ret != n) { - if (ret >= 0) - errno = ENOSPC; - - return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "fill failed:" - " %zi at %zu len=%zu (%s)", - ret, (size_t)off, (size_t)len, - strerror(errno)); - } - len -= n; - off += n; - } - return TDB_SUCCESS; -} - -/* expand a file. 
we prefer to use ftruncate, as that is what posix - says to use for mmap expansion */ -static enum TDB_ERROR tdb_expand_file(struct tdb_context *tdb, - tdb_len_t addition) -{ - char buf[8192]; - enum TDB_ERROR ecode; - - if (tdb->flags & TDB_RDONLY) { - return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR, - "Expand on read-only database"); - } - - if (tdb->flags & TDB_INTERNAL) { - char *new = realloc(tdb->file->map_ptr, - tdb->file->map_size + addition); - if (!new) { - return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "No memory to expand database"); - } - tdb->file->map_ptr = new; - tdb->file->map_size += addition; - } else { - /* Unmap before trying to write; old TDB claimed OpenBSD had - * problem with this otherwise. */ - tdb_munmap(tdb->file); - - /* If this fails, we try to fill anyway. */ - if (ftruncate(tdb->file->fd, tdb->file->map_size + addition)) - ; - - /* now fill the file with something. This ensures that the - file isn't sparse, which would be very bad if we ran out of - disk. This must be done with write, not via mmap */ - memset(buf, 0x43, sizeof(buf)); - ecode = fill(tdb, buf, sizeof(buf), tdb->file->map_size, - addition); - if (ecode != TDB_SUCCESS) - return ecode; - tdb->file->map_size += addition; - tdb_mmap(tdb); - } - return TDB_SUCCESS; -} - -const void *tdb_access_read(struct tdb_context *tdb, - tdb_off_t off, tdb_len_t len, bool convert) -{ - void *ret = NULL; - - if (likely(!(tdb->flags & TDB_CONVERT))) { - ret = tdb->tdb2.io->direct(tdb, off, len, false); - - if (TDB_PTR_IS_ERR(ret)) { - return ret; - } - } - if (!ret) { - struct tdb_access_hdr *hdr; - hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr)); - if (TDB_PTR_IS_ERR(hdr)) { - return hdr; - } - hdr->next = tdb->tdb2.access; - tdb->tdb2.access = hdr; - ret = hdr + 1; - if (convert) { - tdb_convert(tdb, (void *)ret, len); - } - } else - tdb->tdb2.direct_access++; - - return ret; -} - -void *tdb_access_write(struct tdb_context *tdb, - tdb_off_t off, tdb_len_t len, bool convert) -{ - void *ret = NULL; - - if (tdb->flags & TDB_RDONLY) { - tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR, - "Write to read-only database"); - return TDB_ERR_PTR(TDB_ERR_RDONLY); - } - - if (likely(!(tdb->flags & TDB_CONVERT))) { - ret = tdb->tdb2.io->direct(tdb, off, len, true); - - if (TDB_PTR_IS_ERR(ret)) { - return ret; - } - } - - if (!ret) { - struct tdb_access_hdr *hdr; - hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr)); - if (TDB_PTR_IS_ERR(hdr)) { - return hdr; - } - hdr->next = tdb->tdb2.access; - tdb->tdb2.access = hdr; - hdr->off = off; - hdr->len = len; - hdr->convert = convert; - ret = hdr + 1; - if (convert) - tdb_convert(tdb, (void *)ret, len); - } else - tdb->tdb2.direct_access++; - - return ret; -} - -static struct tdb_access_hdr **find_hdr(struct tdb_context *tdb, const void *p) -{ - struct tdb_access_hdr **hp; - - for (hp = &tdb->tdb2.access; *hp; hp = &(*hp)->next) { - if (*hp + 1 == p) - return hp; - } - return NULL; -} - -void tdb_access_release(struct tdb_context *tdb, const void *p) -{ - struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p); - - if (hp) { - hdr = *hp; - *hp = hdr->next; - free(hdr); - } else - tdb->tdb2.direct_access--; -} - -enum TDB_ERROR tdb_access_commit(struct tdb_context *tdb, void *p) -{ - struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p); - enum TDB_ERROR ecode; - - if (hp) { - hdr = *hp; - if (hdr->convert) - ecode = tdb_write_convert(tdb, hdr->off, p, hdr->len); - else - ecode = tdb_write(tdb, hdr->off, p, hdr->len); - *hp = hdr->next; - free(hdr); - } else { - 
tdb->tdb2.direct_access--; - ecode = TDB_SUCCESS; - } - - return ecode; -} - -static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len, - bool write_mode) -{ - enum TDB_ERROR ecode; - - if (unlikely(!tdb->file->map_ptr)) - return NULL; - - ecode = tdb_oob(tdb, off, len, false); - if (unlikely(ecode != TDB_SUCCESS)) - return TDB_ERR_PTR(ecode); - return (char *)tdb->file->map_ptr + off; -} - -void tdb_inc_seqnum(struct tdb_context *tdb) -{ - tdb_off_t seq; - - if (tdb->flags & TDB_VERSION1) { - tdb1_increment_seqnum_nonblock(tdb); - return; - } - - if (likely(!(tdb->flags & TDB_CONVERT))) { - int64_t *direct; - - direct = tdb->tdb2.io->direct(tdb, - offsetof(struct tdb_header, - seqnum), - sizeof(*direct), true); - if (likely(direct)) { - /* Don't let it go negative, even briefly */ - if (unlikely((*direct) + 1) < 0) - *direct = 0; - (*direct)++; - return; - } - } - - seq = tdb_read_off(tdb, offsetof(struct tdb_header, seqnum)); - if (!TDB_OFF_IS_ERR(seq)) { - seq++; - if (unlikely((int64_t)seq < 0)) - seq = 0; - tdb_write_off(tdb, offsetof(struct tdb_header, seqnum), seq); - } -} - -static const struct tdb_methods io_methods = { - tdb_read, - tdb_write, - tdb_oob, - tdb_expand_file, - tdb_direct, -}; - -/* - initialise the default methods table -*/ -void tdb_io_init(struct tdb_context *tdb) -{ - tdb->tdb2.io = &io_methods; -} diff --git a/ccan/tdb2/lock.c b/ccan/tdb2/lock.c deleted file mode 100644 index a71c95f6..00000000 --- a/ccan/tdb2/lock.c +++ /dev/null @@ -1,895 +0,0 @@ - /* - Unix SMB/CIFS implementation. - - trivial database library - - Copyright (C) Andrew Tridgell 1999-2005 - Copyright (C) Paul `Rusty' Russell 2000 - Copyright (C) Jeremy Allison 2000-2003 - - ** NOTE! The following LGPL license applies to the tdb - ** library. This does NOT imply that all of Samba is released - ** under the LGPL - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ - -#include "private.h" -#include -#include - -/* If we were threaded, we could wait for unlock, but we're not, so fail. */ -enum TDB_ERROR owner_conflict(struct tdb_context *tdb, const char *call) -{ - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, - "%s: lock owned by another tdb in this process.", - call); -} - -/* If we fork, we no longer really own locks. */ -bool check_lock_pid(struct tdb_context *tdb, const char *call, bool log) -{ - /* No locks? No problem! */ - if (tdb->file->allrecord_lock.count == 0 - && tdb->file->num_lockrecs == 0) { - return true; - } - - /* No fork? No problem! */ - if (tdb->file->locker == getpid()) { - return true; - } - - if (log) { - tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, - "%s: fork() detected after lock acquisition!" 
- " (%u vs %u)", call, tdb->file->locker, getpid()); - } - return false; -} - -int tdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag, - void *unused) -{ - struct flock fl; - int ret; - - do { - fl.l_type = rw; - fl.l_whence = SEEK_SET; - fl.l_start = off; - fl.l_len = len; - - if (waitflag) - ret = fcntl(fd, F_SETLKW, &fl); - else - ret = fcntl(fd, F_SETLK, &fl); - } while (ret != 0 && errno == EINTR); - return ret; -} - -int tdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *unused) -{ - struct flock fl; - int ret; - - do { - fl.l_type = F_UNLCK; - fl.l_whence = SEEK_SET; - fl.l_start = off; - fl.l_len = len; - - ret = fcntl(fd, F_SETLKW, &fl); - } while (ret != 0 && errno == EINTR); - return ret; -} - -static int lock(struct tdb_context *tdb, - int rw, off_t off, off_t len, bool waitflag) -{ - int ret; - if (tdb->file->allrecord_lock.count == 0 - && tdb->file->num_lockrecs == 0) { - tdb->file->locker = getpid(); - } - - tdb->stats.lock_lowlevel++; - ret = tdb->lock_fn(tdb->file->fd, rw, off, len, waitflag, - tdb->lock_data); - if (!waitflag) { - tdb->stats.lock_nonblock++; - if (ret != 0) - tdb->stats.lock_nonblock_fail++; - } - return ret; -} - -static int unlock(struct tdb_context *tdb, int rw, off_t off, off_t len) -{ -#if 0 /* Check they matched up locks and unlocks correctly. */ - char line[80]; - FILE *locks; - bool found = false; - - locks = fopen("/proc/locks", "r"); - - while (fgets(line, 80, locks)) { - char *p; - int type, start, l; - - /* eg. 1: FLOCK ADVISORY WRITE 2440 08:01:2180826 0 EOF */ - p = strchr(line, ':') + 1; - if (strncmp(p, " POSIX ADVISORY ", strlen(" POSIX ADVISORY "))) - continue; - p += strlen(" FLOCK ADVISORY "); - if (strncmp(p, "READ ", strlen("READ ")) == 0) - type = F_RDLCK; - else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0) - type = F_WRLCK; - else - abort(); - p += 6; - if (atoi(p) != getpid()) - continue; - p = strchr(strchr(p, ' ') + 1, ' ') + 1; - start = atoi(p); - p = strchr(p, ' ') + 1; - if (strncmp(p, "EOF", 3) == 0) - l = 0; - else - l = atoi(p) - start + 1; - - if (off == start) { - if (len != l) { - fprintf(stderr, "Len %u should be %u: %s", - (int)len, l, line); - abort(); - } - if (type != rw) { - fprintf(stderr, "Type %s wrong: %s", - rw == F_RDLCK ? "READ" : "WRITE", line); - abort(); - } - found = true; - break; - } - } - - if (!found) { - fprintf(stderr, "Unlock on %u@%u not found!", - (int)off, (int)len); - abort(); - } - - fclose(locks); -#endif - - return tdb->unlock_fn(tdb->file->fd, rw, off, len, tdb->lock_data); -} - -/* a byte range locking function - return 0 on success - this functions locks len bytes at the specified offset. - - note that a len of zero means lock to end of file -*/ -enum TDB_ERROR tdb_brlock(struct tdb_context *tdb, - int rw_type, tdb_off_t offset, tdb_off_t len, - enum tdb_lock_flags flags) -{ - int ret; - - if (tdb->flags & TDB_NOLOCK) { - return TDB_SUCCESS; - } - - if (rw_type == F_WRLCK && (tdb->flags & TDB_RDONLY)) { - return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR, - "Write lock attempted on read-only database"); - } - - /* A 32 bit system cannot open a 64-bit file, but it could have - * expanded since then: check here. */ - if ((size_t)(offset + len) != offset + len) { - return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_brlock: lock on giant offset %llu", - (long long)(offset + len)); - } - - ret = lock(tdb, rw_type, offset, len, flags & TDB_LOCK_WAIT); - if (ret != 0) { - /* Generic lock error. errno set by fcntl. 
- * EAGAIN is an expected return from non-blocking - * locks. */ - if (!(flags & TDB_LOCK_PROBE) - && (errno != EAGAIN && errno != EINTR)) { - tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_brlock failed (fd=%d) at" - " offset %zu rw_type=%d flags=%d len=%zu:" - " %s", - tdb->file->fd, (size_t)offset, rw_type, - flags, (size_t)len, strerror(errno)); - } - return TDB_ERR_LOCK; - } - return TDB_SUCCESS; -} - -enum TDB_ERROR tdb_brunlock(struct tdb_context *tdb, - int rw_type, tdb_off_t offset, size_t len) -{ - if (tdb->flags & TDB_NOLOCK) { - return TDB_SUCCESS; - } - - if (!check_lock_pid(tdb, "tdb_brunlock", true)) - return TDB_ERR_LOCK; - - if (unlock(tdb, rw_type, offset, len) == -1) { - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_brunlock failed (fd=%d) at offset %zu" - " rw_type=%d len=%zu: %s", - tdb->file->fd, (size_t)offset, rw_type, - (size_t)len, strerror(errno)); - } - return TDB_SUCCESS; -} - -/* - upgrade a read lock to a write lock. This needs to be handled in a - special way as some OSes (such as solaris) have too conservative - deadlock detection and claim a deadlock when progress can be - made. For those OSes we may loop for a while. -*/ -enum TDB_ERROR tdb_allrecord_upgrade(struct tdb_context *tdb, off_t start) -{ - int count = 1000; - - if (!check_lock_pid(tdb, "tdb_transaction_prepare_commit", true)) - return TDB_ERR_LOCK; - - if (tdb->file->allrecord_lock.count != 1) { - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_allrecord_upgrade failed:" - " count %u too high", - tdb->file->allrecord_lock.count); - } - - if (tdb->file->allrecord_lock.off != 1) { - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_allrecord_upgrade failed:" - " already upgraded?"); - } - - if (tdb->file->allrecord_lock.owner != tdb) { - return owner_conflict(tdb, "tdb_allrecord_upgrade"); - } - - while (count--) { - struct timeval tv; - if (tdb_brlock(tdb, F_WRLCK, start, 0, - TDB_LOCK_WAIT|TDB_LOCK_PROBE) == TDB_SUCCESS) { - tdb->file->allrecord_lock.ltype = F_WRLCK; - tdb->file->allrecord_lock.off = 0; - return TDB_SUCCESS; - } - if (errno != EDEADLK) { - break; - } - /* sleep for as short a time as we can - more portable than usleep() */ - tv.tv_sec = 0; - tv.tv_usec = 1; - select(0, NULL, NULL, NULL, &tv); - } - - if (errno != EAGAIN && errno != EINTR) - tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_allrecord_upgrade failed"); - return TDB_ERR_LOCK; -} - -static struct tdb_lock *find_nestlock(struct tdb_context *tdb, tdb_off_t offset, - const struct tdb_context *owner) -{ - unsigned int i; - - for (i=0; ifile->num_lockrecs; i++) { - if (tdb->file->lockrecs[i].off == offset) { - if (owner && tdb->file->lockrecs[i].owner != owner) - return NULL; - return &tdb->file->lockrecs[i]; - } - } - return NULL; -} - -enum TDB_ERROR tdb_lock_and_recover(struct tdb_context *tdb) -{ - enum TDB_ERROR ecode; - - if (!check_lock_pid(tdb, "tdb_transaction_prepare_commit", true)) - return TDB_ERR_LOCK; - - ecode = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK, - false); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - ecode = tdb_lock_open(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK); - if (ecode != TDB_SUCCESS) { - tdb_allrecord_unlock(tdb, F_WRLCK); - return ecode; - } - ecode = tdb_transaction_recover(tdb); - tdb_unlock_open(tdb, F_WRLCK); - tdb_allrecord_unlock(tdb, F_WRLCK); - - return ecode; -} - -/* lock an offset in the database. 
*/ -enum TDB_ERROR tdb_nest_lock(struct tdb_context *tdb, - tdb_off_t offset, int ltype, - enum tdb_lock_flags flags) -{ - struct tdb_lock *new_lck; - enum TDB_ERROR ecode; - - if (!(tdb->flags & TDB_VERSION1) - && offset > (TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE - + tdb->file->map_size / 8)) { - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_nest_lock: invalid offset %zu ltype=%d", - (size_t)offset, ltype); - } - - if (tdb->flags & TDB_NOLOCK) - return TDB_SUCCESS; - - if (!check_lock_pid(tdb, "tdb_nest_lock", true)) { - return TDB_ERR_LOCK; - } - - tdb->stats.locks++; - - new_lck = find_nestlock(tdb, offset, NULL); - if (new_lck) { - if (new_lck->owner != tdb) { - return owner_conflict(tdb, "tdb_nest_lock"); - } - - if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) { - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_nest_lock:" - " offset %zu has read lock", - (size_t)offset); - } - /* Just increment the struct, posix locks don't stack. */ - new_lck->count++; - return TDB_SUCCESS; - } - -#if 0 - if (tdb->file->num_lockrecs - && offset >= TDB_HASH_LOCK_START - && offset < TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE) { - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_nest_lock: already have a hash lock?"); - } -#endif - - new_lck = (struct tdb_lock *)realloc( - tdb->file->lockrecs, - sizeof(*tdb->file->lockrecs) * (tdb->file->num_lockrecs+1)); - if (new_lck == NULL) { - return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "tdb_nest_lock:" - " unable to allocate %zu lock struct", - tdb->file->num_lockrecs + 1); - } - tdb->file->lockrecs = new_lck; - - /* Since fcntl locks don't nest, we do a lock for the first one, - and simply bump the count for future ones */ - ecode = tdb_brlock(tdb, ltype, offset, 1, flags); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - /* First time we grab a lock, perhaps someone died in commit? */ - if (!(flags & TDB_LOCK_NOCHECK) - && tdb->file->num_lockrecs == 0) { - tdb_bool_err berr = tdb_needs_recovery(tdb); - if (berr != false) { - tdb_brunlock(tdb, ltype, offset, 1); - - if (berr < 0) - return TDB_OFF_TO_ERR(berr); - ecode = tdb_lock_and_recover(tdb); - if (ecode == TDB_SUCCESS) { - ecode = tdb_brlock(tdb, ltype, offset, 1, - flags); - } - if (ecode != TDB_SUCCESS) { - return ecode; - } - } - } - - tdb->file->lockrecs[tdb->file->num_lockrecs].owner = tdb; - tdb->file->lockrecs[tdb->file->num_lockrecs].off = offset; - tdb->file->lockrecs[tdb->file->num_lockrecs].count = 1; - tdb->file->lockrecs[tdb->file->num_lockrecs].ltype = ltype; - tdb->file->num_lockrecs++; - - return TDB_SUCCESS; -} - -enum TDB_ERROR tdb_nest_unlock(struct tdb_context *tdb, - tdb_off_t off, int ltype) -{ - struct tdb_lock *lck; - enum TDB_ERROR ecode; - - if (tdb->flags & TDB_NOLOCK) - return TDB_SUCCESS; - - lck = find_nestlock(tdb, off, tdb); - if ((lck == NULL) || (lck->count == 0)) { - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_nest_unlock: no lock for %zu", - (size_t)off); - } - - if (lck->count > 1) { - lck->count--; - return TDB_SUCCESS; - } - - /* - * This lock has count==1 left, so we need to unlock it in the - * kernel. We don't bother with decrementing the in-memory array - * element, we're about to overwrite it with the last array element - * anyway. - */ - ecode = tdb_brunlock(tdb, ltype, off, 1); - - /* - * Shrink the array by overwriting the element just unlocked with the - * last array element. 
- */ - *lck = tdb->file->lockrecs[--tdb->file->num_lockrecs]; - - return ecode; -} - -/* - get the transaction lock - */ -enum TDB_ERROR tdb_transaction_lock(struct tdb_context *tdb, int ltype) -{ - return tdb_nest_lock(tdb, TDB_TRANSACTION_LOCK, ltype, TDB_LOCK_WAIT); -} - -/* - release the transaction lock - */ -void tdb_transaction_unlock(struct tdb_context *tdb, int ltype) -{ - tdb_nest_unlock(tdb, TDB_TRANSACTION_LOCK, ltype); -} - -/* We only need to lock individual bytes, but Linux merges consecutive locks - * so we lock in contiguous ranges. */ -enum TDB_ERROR tdb_lock_gradual(struct tdb_context *tdb, - int ltype, enum tdb_lock_flags flags, - tdb_off_t off, tdb_off_t len) -{ - enum TDB_ERROR ecode; - enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT); - - if (len <= 1) { - /* 0 would mean to end-of-file... */ - assert(len != 0); - /* Single hash. Just do blocking lock. */ - return tdb_brlock(tdb, ltype, off, len, flags); - } - - /* First we try non-blocking. */ - ecode = tdb_brlock(tdb, ltype, off, len, nb_flags); - if (ecode != TDB_ERR_LOCK) { - return ecode; - } - - /* Try locking first half, then second. */ - ecode = tdb_lock_gradual(tdb, ltype, flags, off, len / 2); - if (ecode != TDB_SUCCESS) - return ecode; - - ecode = tdb_lock_gradual(tdb, ltype, flags, - off + len / 2, len - len / 2); - if (ecode != TDB_SUCCESS) { - tdb_brunlock(tdb, ltype, off, len / 2); - } - return ecode; -} - -/* lock/unlock entire database. It can only be upgradable if you have some - * other way of guaranteeing exclusivity (ie. transaction write lock). */ -enum TDB_ERROR tdb_allrecord_lock(struct tdb_context *tdb, int ltype, - enum tdb_lock_flags flags, bool upgradable) -{ - enum TDB_ERROR ecode; - tdb_bool_err berr; - - if (tdb->flags & TDB_VERSION1) { - if (tdb1_allrecord_lock(tdb, ltype, flags, upgradable) == -1) - return tdb->last_error; - return TDB_SUCCESS; - } - - if (tdb->flags & TDB_NOLOCK) - return TDB_SUCCESS; - - if (!check_lock_pid(tdb, "tdb_allrecord_lock", true)) { - return TDB_ERR_LOCK; - } - - if (tdb->file->allrecord_lock.count) { - if (tdb->file->allrecord_lock.owner != tdb) { - return owner_conflict(tdb, "tdb_allrecord_lock"); - } - - if (ltype == F_RDLCK - || tdb->file->allrecord_lock.ltype == F_WRLCK) { - tdb->file->allrecord_lock.count++; - return TDB_SUCCESS; - } - - /* a global lock of a different type exists */ - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, - "tdb_allrecord_lock: already have %s lock", - tdb->file->allrecord_lock.ltype == F_RDLCK - ? "read" : "write"); - } - - if (tdb_has_hash_locks(tdb)) { - /* can't combine global and chain locks */ - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, - "tdb_allrecord_lock:" - " already have chain lock"); - } - - if (upgradable && ltype != F_RDLCK) { - /* tdb error: you can't upgrade a write lock! */ - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_allrecord_lock:" - " can't upgrade a write lock"); - } - - tdb->stats.locks++; -again: - /* Lock hashes, gradually. */ - ecode = tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START, - TDB_HASH_LOCK_RANGE); - if (ecode != TDB_SUCCESS) - return ecode; - - /* Lock free tables: there to end of file. 
*/ - ecode = tdb_brlock(tdb, ltype, - TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE, - 0, flags); - if (ecode != TDB_SUCCESS) { - tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, - TDB_HASH_LOCK_RANGE); - return ecode; - } - - tdb->file->allrecord_lock.owner = tdb; - tdb->file->allrecord_lock.count = 1; - /* If it's upgradable, it's actually exclusive so we can treat - * it as a write lock. */ - tdb->file->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype; - tdb->file->allrecord_lock.off = upgradable; - - /* Now check for needing recovery. */ - if (flags & TDB_LOCK_NOCHECK) - return TDB_SUCCESS; - - berr = tdb_needs_recovery(tdb); - if (likely(berr == false)) - return TDB_SUCCESS; - - tdb_allrecord_unlock(tdb, ltype); - if (berr < 0) - return TDB_OFF_TO_ERR(berr); - ecode = tdb_lock_and_recover(tdb); - if (ecode != TDB_SUCCESS) { - return ecode; - } - goto again; -} - -enum TDB_ERROR tdb_lock_open(struct tdb_context *tdb, - int ltype, enum tdb_lock_flags flags) -{ - return tdb_nest_lock(tdb, TDB_OPEN_LOCK, ltype, flags); -} - -void tdb_unlock_open(struct tdb_context *tdb, int ltype) -{ - tdb_nest_unlock(tdb, TDB_OPEN_LOCK, ltype); -} - -bool tdb_has_open_lock(struct tdb_context *tdb) -{ - return !(tdb->flags & TDB_NOLOCK) - && find_nestlock(tdb, TDB_OPEN_LOCK, tdb) != NULL; -} - -enum TDB_ERROR tdb_lock_expand(struct tdb_context *tdb, int ltype) -{ - /* Lock doesn't protect data, so don't check (we recurse if we do!) */ - return tdb_nest_lock(tdb, TDB_EXPANSION_LOCK, ltype, - TDB_LOCK_WAIT | TDB_LOCK_NOCHECK); -} - -void tdb_unlock_expand(struct tdb_context *tdb, int ltype) -{ - tdb_nest_unlock(tdb, TDB_EXPANSION_LOCK, ltype); -} - -/* unlock entire db */ -void tdb_allrecord_unlock(struct tdb_context *tdb, int ltype) -{ - if (tdb->flags & TDB_VERSION1) { - tdb1_allrecord_unlock(tdb, ltype); - return; - } - - if (tdb->flags & TDB_NOLOCK) - return; - - if (tdb->file->allrecord_lock.count == 0) { - tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, - "tdb_allrecord_unlock: not locked!"); - return; - } - - if (tdb->file->allrecord_lock.owner != tdb) { - tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, - "tdb_allrecord_unlock: not locked by us!"); - return; - } - - /* Upgradable locks are marked as write locks. */ - if (tdb->file->allrecord_lock.ltype != ltype - && (!tdb->file->allrecord_lock.off || ltype != F_RDLCK)) { - tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_allrecord_unlock: have %s lock", - tdb->file->allrecord_lock.ltype == F_RDLCK - ? 
"read" : "write"); - return; - } - - if (tdb->file->allrecord_lock.count > 1) { - tdb->file->allrecord_lock.count--; - return; - } - - tdb->file->allrecord_lock.count = 0; - tdb->file->allrecord_lock.ltype = 0; - - tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, 0); -} - -bool tdb_has_expansion_lock(struct tdb_context *tdb) -{ - return find_nestlock(tdb, TDB_EXPANSION_LOCK, tdb) != NULL; -} - -bool tdb_has_hash_locks(struct tdb_context *tdb) -{ - unsigned int i; - - for (i=0; ifile->num_lockrecs; i++) { - if (tdb->file->lockrecs[i].off >= TDB_HASH_LOCK_START - && tdb->file->lockrecs[i].off < (TDB_HASH_LOCK_START - + TDB_HASH_LOCK_RANGE)) - return true; - } - return false; -} - -static bool tdb_has_free_lock(struct tdb_context *tdb) -{ - unsigned int i; - - if (tdb->flags & TDB_NOLOCK) - return false; - - for (i=0; ifile->num_lockrecs; i++) { - if (tdb->file->lockrecs[i].off - > TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE) - return true; - } - return false; -} - -enum TDB_ERROR tdb_lock_hashes(struct tdb_context *tdb, - tdb_off_t hash_lock, - tdb_len_t hash_range, - int ltype, enum tdb_lock_flags waitflag) -{ - /* FIXME: Do this properly, using hlock_range */ - unsigned l = TDB_HASH_LOCK_START - + (hash_lock >> (64 - TDB_HASH_LOCK_RANGE_BITS)); - - /* a allrecord lock allows us to avoid per chain locks */ - if (tdb->file->allrecord_lock.count) { - if (!check_lock_pid(tdb, "tdb_lock_hashes", true)) - return TDB_ERR_LOCK; - - if (tdb->file->allrecord_lock.owner != tdb) - return owner_conflict(tdb, "tdb_lock_hashes"); - if (ltype == tdb->file->allrecord_lock.ltype - || ltype == F_RDLCK) { - return TDB_SUCCESS; - } - - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, - "tdb_lock_hashes:" - " already have %s allrecordlock", - tdb->file->allrecord_lock.ltype == F_RDLCK - ? "read" : "write"); - } - - if (tdb_has_free_lock(tdb)) { - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_lock_hashes: already have free lock"); - } - - if (tdb_has_expansion_lock(tdb)) { - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_lock_hashes:" - " already have expansion lock"); - } - - return tdb_nest_lock(tdb, l, ltype, waitflag); -} - -enum TDB_ERROR tdb_unlock_hashes(struct tdb_context *tdb, - tdb_off_t hash_lock, - tdb_len_t hash_range, int ltype) -{ - unsigned l = TDB_HASH_LOCK_START - + (hash_lock >> (64 - TDB_HASH_LOCK_RANGE_BITS)); - - if (tdb->flags & TDB_NOLOCK) - return 0; - - /* a allrecord lock allows us to avoid per chain locks */ - if (tdb->file->allrecord_lock.count) { - if (tdb->file->allrecord_lock.ltype == F_RDLCK - && ltype == F_WRLCK) { - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_unlock_hashes RO allrecord!"); - } - if (tdb->file->allrecord_lock.owner != tdb) { - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, - "tdb_unlock_hashes:" - " not locked by us!"); - } - return TDB_SUCCESS; - } - - return tdb_nest_unlock(tdb, l, ltype); -} - -/* Hash locks use TDB_HASH_LOCK_START + the next 30 bits. - * Then we begin; bucket offsets are sizeof(tdb_len_t) apart, so we divide. - * The result is that on 32 bit systems we don't use lock values > 2^31 on - * files that are less than 4GB. 
- */ -static tdb_off_t free_lock_off(tdb_off_t b_off) -{ - return TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE - + b_off / sizeof(tdb_off_t); -} - -enum TDB_ERROR tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off, - enum tdb_lock_flags waitflag) -{ - assert(b_off >= sizeof(struct tdb_header)); - - if (tdb->flags & TDB_NOLOCK) - return 0; - - /* a allrecord lock allows us to avoid per chain locks */ - if (tdb->file->allrecord_lock.count) { - if (!check_lock_pid(tdb, "tdb_lock_free_bucket", true)) - return TDB_ERR_LOCK; - - if (tdb->file->allrecord_lock.owner != tdb) { - return owner_conflict(tdb, "tdb_lock_free_bucket"); - } - - if (tdb->file->allrecord_lock.ltype == F_WRLCK) - return 0; - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_lock_free_bucket with" - " read-only allrecordlock!"); - } - -#if 0 /* FIXME */ - if (tdb_has_expansion_lock(tdb)) { - return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb_lock_free_bucket:" - " already have expansion lock"); - } -#endif - - return tdb_nest_lock(tdb, free_lock_off(b_off), F_WRLCK, waitflag); -} - -void tdb_unlock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off) -{ - if (tdb->file->allrecord_lock.count) - return; - - tdb_nest_unlock(tdb, free_lock_off(b_off), F_WRLCK); -} - -enum TDB_ERROR tdb_lockall(struct tdb_context *tdb) -{ - return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false); -} - -void tdb_unlockall(struct tdb_context *tdb) -{ - tdb_allrecord_unlock(tdb, F_WRLCK); -} - -enum TDB_ERROR tdb_lockall_read(struct tdb_context *tdb) -{ - return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false); -} - -void tdb_unlockall_read(struct tdb_context *tdb) -{ - tdb_allrecord_unlock(tdb, F_RDLCK); -} - -void tdb_lock_cleanup(struct tdb_context *tdb) -{ - unsigned int i; - - /* We don't want to warn: they're allowed to close tdb after fork. */ - if (!check_lock_pid(tdb, "tdb_close", false)) - return; - - while (tdb->file->allrecord_lock.count - && tdb->file->allrecord_lock.owner == tdb) { - tdb_allrecord_unlock(tdb, tdb->file->allrecord_lock.ltype); - } - - for (i=0; ifile->num_lockrecs; i++) { - if (tdb->file->lockrecs[i].owner == tdb) { - tdb_nest_unlock(tdb, - tdb->file->lockrecs[i].off, - tdb->file->lockrecs[i].ltype); - i--; - } - } -} diff --git a/ccan/tdb2/open.c b/ccan/tdb2/open.c deleted file mode 100644 index e238d992..00000000 --- a/ccan/tdb2/open.c +++ /dev/null @@ -1,884 +0,0 @@ - /* - Trivial Database 2: opening and closing TDBs - Copyright (C) Rusty Russell 2010 - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ -#include "private.h" -#include -#include - -/* all tdbs, to detect double-opens (fcntl file don't nest!) 
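   (fcntl locks belong to the process rather than to a file descriptor, so a
   second open of the same file would not nest with the first handle's locks
   and closing either fd would drop them; find_file() below therefore shares
   one refcounted struct tdb_file between handles.)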
*/ -static struct tdb_context *tdbs = NULL; - -static struct tdb_file *find_file(dev_t device, ino_t ino) -{ - struct tdb_context *i; - - for (i = tdbs; i; i = i->next) { - if (i->file->device == device && i->file->inode == ino) { - i->file->refcnt++; - return i->file; - } - } - return NULL; -} - -static bool read_all(int fd, void *buf, size_t len) -{ - while (len) { - ssize_t ret; - ret = read(fd, buf, len); - if (ret < 0) - return false; - if (ret == 0) { - /* ETOOSHORT? */ - errno = EWOULDBLOCK; - return false; - } - buf = (char *)buf + ret; - len -= ret; - } - return true; -} - -static uint64_t random_number(struct tdb_context *tdb) -{ - int fd; - uint64_t ret = 0; - struct timeval now; - - fd = open("/dev/urandom", O_RDONLY); - if (fd >= 0) { - if (read_all(fd, &ret, sizeof(ret))) { - close(fd); - return ret; - } - close(fd); - } - /* FIXME: Untested! Based on Wikipedia protocol description! */ - fd = open("/dev/egd-pool", O_RDWR); - if (fd >= 0) { - /* Command is 1, next byte is size we want to read. */ - char cmd[2] = { 1, sizeof(uint64_t) }; - if (write(fd, cmd, sizeof(cmd)) == sizeof(cmd)) { - char reply[1 + sizeof(uint64_t)]; - int r = read(fd, reply, sizeof(reply)); - if (r > 1) { - /* Copy at least some bytes. */ - memcpy(&ret, reply+1, r - 1); - if (reply[0] == sizeof(uint64_t) - && r == sizeof(reply)) { - close(fd); - return ret; - } - } - } - close(fd); - } - - /* Fallback: pid and time. */ - gettimeofday(&now, NULL); - ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec; - tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING, - "tdb_open: random from getpid and time"); - return ret; -} - -static void tdb2_context_init(struct tdb_context *tdb) -{ - /* Initialize the TDB2 fields here */ - tdb_io_init(tdb); - tdb->tdb2.direct_access = 0; - tdb->tdb2.transaction = NULL; - tdb->tdb2.access = NULL; -} - -struct new_database { - struct tdb_header hdr; - struct tdb_freetable ftable; -}; - -/* initialise a new database */ -static enum TDB_ERROR tdb_new_database(struct tdb_context *tdb, - struct tdb_attribute_seed *seed, - struct tdb_header *hdr) -{ - /* We make it up in memory, then write it out if not internal */ - struct new_database newdb; - unsigned int magic_len; - ssize_t rlen; - enum TDB_ERROR ecode; - - /* Fill in the header */ - newdb.hdr.version = TDB_VERSION; - if (seed) - newdb.hdr.hash_seed = seed->seed; - else - newdb.hdr.hash_seed = random_number(tdb); - newdb.hdr.hash_test = TDB_HASH_MAGIC; - newdb.hdr.hash_test = tdb->hash_fn(&newdb.hdr.hash_test, - sizeof(newdb.hdr.hash_test), - newdb.hdr.hash_seed, - tdb->hash_data); - newdb.hdr.recovery = 0; - newdb.hdr.features_used = newdb.hdr.features_offered = TDB_FEATURE_MASK; - newdb.hdr.seqnum = 0; - newdb.hdr.capabilities = 0; - memset(newdb.hdr.reserved, 0, sizeof(newdb.hdr.reserved)); - /* Initial hashes are empty. */ - memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable)); - - /* Free is empty. 
*/ - newdb.hdr.free_table = offsetof(struct new_database, ftable); - memset(&newdb.ftable, 0, sizeof(newdb.ftable)); - ecode = set_header(NULL, &newdb.ftable.hdr, TDB_FTABLE_MAGIC, 0, - sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr), - sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr), - 0); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - /* Magic food */ - memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food)); - strcpy(newdb.hdr.magic_food, TDB_MAGIC_FOOD); - - /* This creates an endian-converted database, as if read from disk */ - magic_len = sizeof(newdb.hdr.magic_food); - tdb_convert(tdb, - (char *)&newdb.hdr + magic_len, sizeof(newdb) - magic_len); - - *hdr = newdb.hdr; - - if (tdb->flags & TDB_INTERNAL) { - tdb->file->map_size = sizeof(newdb); - tdb->file->map_ptr = malloc(tdb->file->map_size); - if (!tdb->file->map_ptr) { - return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "tdb_new_database:" - " failed to allocate"); - } - memcpy(tdb->file->map_ptr, &newdb, tdb->file->map_size); - return TDB_SUCCESS; - } - if (lseek(tdb->file->fd, 0, SEEK_SET) == -1) { - return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_new_database:" - " failed to seek: %s", strerror(errno)); - } - - if (ftruncate(tdb->file->fd, 0) == -1) { - return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_new_database:" - " failed to truncate: %s", strerror(errno)); - } - - rlen = write(tdb->file->fd, &newdb, sizeof(newdb)); - if (rlen != sizeof(newdb)) { - if (rlen >= 0) - errno = ENOSPC; - return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_new_database: %zi writing header: %s", - rlen, strerror(errno)); - } - return TDB_SUCCESS; -} - -static enum TDB_ERROR tdb_new_file(struct tdb_context *tdb) -{ - tdb->file = malloc(sizeof(*tdb->file)); - if (!tdb->file) - return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "tdb_open: cannot alloc tdb_file structure"); - tdb->file->num_lockrecs = 0; - tdb->file->lockrecs = NULL; - tdb->file->allrecord_lock.count = 0; - tdb->file->refcnt = 1; - tdb->file->map_ptr = NULL; - return TDB_SUCCESS; -} - -enum TDB_ERROR tdb_set_attribute(struct tdb_context *tdb, - const union tdb_attribute *attr) -{ - switch (attr->base.attr) { - case TDB_ATTRIBUTE_LOG: - tdb->log_fn = attr->log.fn; - tdb->log_data = attr->log.data; - break; - case TDB_ATTRIBUTE_HASH: - case TDB_ATTRIBUTE_SEED: - case TDB_ATTRIBUTE_OPENHOOK: - case TDB_ATTRIBUTE_TDB1_HASHSIZE: - return tdb->last_error - = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_set_attribute:" - " cannot set %s after opening", - attr->base.attr == TDB_ATTRIBUTE_HASH - ? "TDB_ATTRIBUTE_HASH" - : attr->base.attr == TDB_ATTRIBUTE_SEED - ? "TDB_ATTRIBUTE_SEED" - : attr->base.attr == TDB_ATTRIBUTE_OPENHOOK - ? 
"TDB_ATTRIBUTE_OPENHOOK" - : "TDB_ATTRIBUTE_TDB1_HASHSIZE"); - case TDB_ATTRIBUTE_STATS: - return tdb->last_error - = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_set_attribute:" - " cannot set TDB_ATTRIBUTE_STATS"); - case TDB_ATTRIBUTE_FLOCK: - tdb->lock_fn = attr->flock.lock; - tdb->unlock_fn = attr->flock.unlock; - tdb->lock_data = attr->flock.data; - break; - default: - return tdb->last_error - = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_set_attribute:" - " unknown attribute type %u", - attr->base.attr); - } - return TDB_SUCCESS; -} - -enum TDB_ERROR tdb_get_attribute(struct tdb_context *tdb, - union tdb_attribute *attr) -{ - switch (attr->base.attr) { - case TDB_ATTRIBUTE_LOG: - if (!tdb->log_fn) - return tdb->last_error = TDB_ERR_NOEXIST; - attr->log.fn = tdb->log_fn; - attr->log.data = tdb->log_data; - break; - case TDB_ATTRIBUTE_HASH: - attr->hash.fn = tdb->hash_fn; - attr->hash.data = tdb->hash_data; - break; - case TDB_ATTRIBUTE_SEED: - if (tdb->flags & TDB_VERSION1) - return tdb->last_error - = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_get_attribute:" - " cannot get TDB_ATTRIBUTE_SEED" - " on TDB1 tdb."); - attr->seed.seed = tdb->hash_seed; - break; - case TDB_ATTRIBUTE_OPENHOOK: - if (!tdb->openhook) - return tdb->last_error = TDB_ERR_NOEXIST; - attr->openhook.fn = tdb->openhook; - attr->openhook.data = tdb->openhook_data; - break; - case TDB_ATTRIBUTE_STATS: { - size_t size = attr->stats.size; - if (size > tdb->stats.size) - size = tdb->stats.size; - memcpy(&attr->stats, &tdb->stats, size); - break; - } - case TDB_ATTRIBUTE_FLOCK: - attr->flock.lock = tdb->lock_fn; - attr->flock.unlock = tdb->unlock_fn; - attr->flock.data = tdb->lock_data; - break; - case TDB_ATTRIBUTE_TDB1_HASHSIZE: - if (!(tdb->flags & TDB_VERSION1)) - return tdb->last_error - = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_get_attribute:" - " cannot get TDB_ATTRIBUTE_TDB1_HASHSIZE" - " on TDB2 tdb."); - attr->tdb1_hashsize.hsize = tdb->tdb1.header.hash_size; - break; - default: - return tdb->last_error - = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_get_attribute:" - " unknown attribute type %u", - attr->base.attr); - } - attr->base.next = NULL; - return TDB_SUCCESS; -} - -void tdb_unset_attribute(struct tdb_context *tdb, - enum tdb_attribute_type type) -{ - switch (type) { - case TDB_ATTRIBUTE_LOG: - tdb->log_fn = NULL; - break; - case TDB_ATTRIBUTE_OPENHOOK: - tdb->openhook = NULL; - break; - case TDB_ATTRIBUTE_HASH: - case TDB_ATTRIBUTE_SEED: - case TDB_ATTRIBUTE_TDB1_HASHSIZE: - tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, - "tdb_unset_attribute: cannot unset %s after opening", - type == TDB_ATTRIBUTE_HASH - ? "TDB_ATTRIBUTE_HASH" - : type == TDB_ATTRIBUTE_SEED - ? "TDB_ATTRIBUTE_SEED" - : "TDB_ATTRIBUTE_TDB1_HASHSIZE"); - break; - case TDB_ATTRIBUTE_STATS: - tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_unset_attribute:" - "cannot unset TDB_ATTRIBUTE_STATS"); - break; - case TDB_ATTRIBUTE_FLOCK: - tdb->lock_fn = tdb_fcntl_lock; - tdb->unlock_fn = tdb_fcntl_unlock; - break; - default: - tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_unset_attribute: unknown attribute type %u", - type); - } -} - -static bool is_tdb1(struct tdb1_header *hdr, const void *buf, ssize_t rlen) -{ - /* This code assumes we've tried to read entire tdb1 header. 
*/ - BUILD_ASSERT(sizeof(*hdr) <= sizeof(struct tdb_header)); - - if (rlen < (ssize_t)sizeof(*hdr)) { - return false; - } - - memcpy(hdr, buf, sizeof(*hdr)); - if (strcmp(hdr->magic_food, TDB_MAGIC_FOOD) != 0) - return false; - - return hdr->version == TDB1_VERSION - || hdr->version == TDB1_BYTEREV(TDB1_VERSION); -} - -/* The top three bits of the capability tell us whether it matters. */ -enum TDB_ERROR unknown_capability(struct tdb_context *tdb, const char *caller, - tdb_off_t type) -{ - if (type & TDB_CAP_NOOPEN) { - return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "%s: file has unknown capability %llu", - caller, type & TDB_CAP_NOOPEN); - } - - if ((type & TDB_CAP_NOWRITE) && !(tdb->flags & TDB_RDONLY)) { - return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_ERROR, - "%s: file has unknown capability %llu" - " (cannot write to it)", - caller, type & TDB_CAP_NOOPEN); - } - - if (type & TDB_CAP_NOCHECK) { - tdb->flags |= TDB_CANT_CHECK; - } - return TDB_SUCCESS; -} - -static enum TDB_ERROR capabilities_ok(struct tdb_context *tdb, - tdb_off_t capabilities) -{ - tdb_off_t off, next; - enum TDB_ERROR ecode = TDB_SUCCESS; - const struct tdb_capability *cap; - - /* Check capability list. */ - for (off = capabilities; off && ecode == TDB_SUCCESS; off = next) { - cap = tdb_access_read(tdb, off, sizeof(*cap), true); - if (TDB_PTR_IS_ERR(cap)) { - return TDB_PTR_ERR(cap); - } - - switch (cap->type & TDB_CAP_TYPE_MASK) { - /* We don't understand any capabilities (yet). */ - default: - ecode = unknown_capability(tdb, "tdb_open", cap->type); - } - next = cap->next; - tdb_access_release(tdb, cap); - } - return ecode; -} - -struct tdb_context *tdb_open(const char *name, int tdb_flags, - int open_flags, mode_t mode, - union tdb_attribute *attr) -{ - struct tdb_context *tdb; - struct stat st; - int saved_errno = 0; - uint64_t hash_test; - unsigned v; - ssize_t rlen; - struct tdb_header hdr; - struct tdb_attribute_seed *seed = NULL; - struct tdb_attribute_tdb1_hashsize *hsize_attr = NULL; - struct tdb_attribute_tdb1_max_dead *maxsize_attr = NULL; - tdb_bool_err berr; - enum TDB_ERROR ecode; - int openlock; - - tdb = malloc(sizeof(*tdb) + (name ? strlen(name) + 1 : 0)); - if (!tdb) { - /* Can't log this */ - errno = ENOMEM; - return NULL; - } - /* Set name immediately for logging functions. */ - if (name) { - tdb->name = strcpy((char *)(tdb + 1), name); - } else { - tdb->name = NULL; - } - tdb->flags = tdb_flags; - tdb->log_fn = NULL; - tdb->open_flags = open_flags; - tdb->last_error = TDB_SUCCESS; - tdb->file = NULL; - tdb->openhook = NULL; - tdb->lock_fn = tdb_fcntl_lock; - tdb->unlock_fn = tdb_fcntl_unlock; - tdb->hash_fn = tdb_jenkins_hash; - memset(&tdb->stats, 0, sizeof(tdb->stats)); - tdb->stats.base.attr = TDB_ATTRIBUTE_STATS; - tdb->stats.size = sizeof(tdb->stats); - - while (attr) { - switch (attr->base.attr) { - case TDB_ATTRIBUTE_HASH: - tdb->hash_fn = attr->hash.fn; - tdb->hash_data = attr->hash.data; - break; - case TDB_ATTRIBUTE_SEED: - seed = &attr->seed; - break; - case TDB_ATTRIBUTE_OPENHOOK: - tdb->openhook = attr->openhook.fn; - tdb->openhook_data = attr->openhook.data; - break; - case TDB_ATTRIBUTE_TDB1_HASHSIZE: - hsize_attr = &attr->tdb1_hashsize; - break; - case TDB_ATTRIBUTE_TDB1_MAX_DEAD: - maxsize_attr = &attr->tdb1_max_dead; - break; - default: - /* These are set as normal. 
*/ - ecode = tdb_set_attribute(tdb, attr); - if (ecode != TDB_SUCCESS) - goto fail; - } - attr = attr->base.next; - } - - if (tdb_flags & ~(TDB_INTERNAL | TDB_NOLOCK | TDB_NOMMAP | TDB_CONVERT - | TDB_NOSYNC | TDB_SEQNUM | TDB_ALLOW_NESTING - | TDB_RDONLY | TDB_VERSION1)) { - ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, - "tdb_open: unknown flags %u", tdb_flags); - goto fail; - } - - if (hsize_attr) { - if (!(tdb_flags & TDB_VERSION1) || - (!(tdb_flags & TDB_INTERNAL) && !(open_flags & O_CREAT))) { - ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_open: can only use" - " TDB_ATTRIBUTE_TDB1_HASHSIZE when" - " creating a TDB_VERSION1 tdb"); - goto fail; - } - } - - if (seed) { - if (tdb_flags & TDB_VERSION1) { - ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_open:" - " cannot set TDB_ATTRIBUTE_SEED" - " on TDB1 tdb."); - goto fail; - } else if (!(tdb_flags & TDB_INTERNAL) - && !(open_flags & O_CREAT)) { - ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_open:" - " cannot set TDB_ATTRIBUTE_SEED" - " without O_CREAT."); - goto fail; - } - } - - if ((open_flags & O_ACCMODE) == O_WRONLY) { - ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, - "tdb_open: can't open tdb %s write-only", - name); - goto fail; - } - - if ((open_flags & O_ACCMODE) == O_RDONLY) { - openlock = F_RDLCK; - tdb->flags |= TDB_RDONLY; - } else { - if (tdb_flags & TDB_RDONLY) { - ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_open: can't use TDB_RDONLY" - " without O_RDONLY"); - goto fail; - } - openlock = F_WRLCK; - } - - /* internal databases don't need any of the rest. */ - if (tdb->flags & TDB_INTERNAL) { - tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP); - ecode = tdb_new_file(tdb); - if (ecode != TDB_SUCCESS) { - goto fail; - } - tdb->file->fd = -1; - if (tdb->flags & TDB_VERSION1) - ecode = tdb1_new_database(tdb, hsize_attr, maxsize_attr); - else { - ecode = tdb_new_database(tdb, seed, &hdr); - if (ecode == TDB_SUCCESS) { - tdb_convert(tdb, &hdr.hash_seed, - sizeof(hdr.hash_seed)); - tdb->hash_seed = hdr.hash_seed; - tdb2_context_init(tdb); - tdb_ftable_init(tdb); - } - } - if (ecode != TDB_SUCCESS) { - goto fail; - } - return tdb; - } - - if (stat(name, &st) != -1) - tdb->file = find_file(st.st_dev, st.st_ino); - - if (!tdb->file) { - int fd; - - if ((fd = open(name, open_flags, mode)) == -1) { - /* errno set by open(2) */ - saved_errno = errno; - tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_open: could not open file %s: %s", - name, strerror(errno)); - goto fail_errno; - } - - /* on exec, don't inherit the fd */ - v = fcntl(fd, F_GETFD, 0); - fcntl(fd, F_SETFD, v | FD_CLOEXEC); - - if (fstat(fd, &st) == -1) { - saved_errno = errno; - tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_open: could not stat open %s: %s", - name, strerror(errno)); - close(fd); - goto fail_errno; - } - - ecode = tdb_new_file(tdb); - if (ecode != TDB_SUCCESS) { - close(fd); - goto fail; - } - - tdb->file->fd = fd; - tdb->file->device = st.st_dev; - tdb->file->inode = st.st_ino; - tdb->file->map_ptr = NULL; - tdb->file->map_size = 0; - } - - /* ensure there is only one process initialising at once */ - ecode = tdb_lock_open(tdb, openlock, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK); - if (ecode != TDB_SUCCESS) { - saved_errno = errno; - goto fail_errno; - } - - /* call their open hook if they gave us one. 
*/ - if (tdb->openhook) { - ecode = tdb->openhook(tdb->file->fd, tdb->openhook_data); - if (ecode != TDB_SUCCESS) { - tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_open: open hook failed"); - goto fail; - } - open_flags |= O_CREAT; - } - - /* If they used O_TRUNC, read will return 0. */ - rlen = pread(tdb->file->fd, &hdr, sizeof(hdr), 0); - if (rlen == 0 && (open_flags & O_CREAT)) { - if (tdb->flags & TDB_VERSION1) { - ecode = tdb1_new_database(tdb, hsize_attr, maxsize_attr); - if (ecode != TDB_SUCCESS) - goto fail; - goto finished; - } - ecode = tdb_new_database(tdb, seed, &hdr); - if (ecode != TDB_SUCCESS) { - goto fail; - } - } else if (rlen < 0) { - ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_open: error %s reading %s", - strerror(errno), name); - goto fail; - } else if (rlen < sizeof(hdr) - || strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) { - if (is_tdb1(&tdb->tdb1.header, &hdr, rlen)) { - ecode = tdb1_open(tdb, maxsize_attr); - if (!ecode) - goto finished; - goto fail; - } - ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_open: %s is not a tdb file", name); - goto fail; - } - - if (hdr.version != TDB_VERSION) { - if (hdr.version == bswap_64(TDB_VERSION)) - tdb->flags |= TDB_CONVERT; - else { - if (is_tdb1(&tdb->tdb1.header, &hdr, rlen)) { - ecode = tdb1_open(tdb, maxsize_attr); - if (!ecode) - goto finished; - goto fail; - } - /* wrong version */ - ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_open:" - " %s is unknown version 0x%llx", - name, (long long)hdr.version); - goto fail; - } - } else if (tdb->flags & TDB_CONVERT) { - ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_open:" - " %s does not need TDB_CONVERT", - name); - goto fail; - } - - /* This is a version2 tdb. */ - if (tdb->flags & TDB_VERSION1) { - tdb->flags &= ~TDB_VERSION1; - } - - tdb2_context_init(tdb); - - tdb_convert(tdb, &hdr, sizeof(hdr)); - tdb->hash_seed = hdr.hash_seed; - hash_test = TDB_HASH_MAGIC; - hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test)); - if (hdr.hash_test != hash_test) { - /* wrong hash variant */ - ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_open:" - " %s uses a different hash function", - name); - goto fail; - } - - ecode = capabilities_ok(tdb, hdr.capabilities); - if (ecode != TDB_SUCCESS) { - goto fail; - } - - /* Clear any features we don't understand. */ - if ((open_flags & O_ACCMODE) != O_RDONLY) { - hdr.features_used &= TDB_FEATURE_MASK; - ecode = tdb_write_convert(tdb, offsetof(struct tdb_header, - features_used), - &hdr.features_used, - sizeof(hdr.features_used)); - if (ecode != TDB_SUCCESS) - goto fail; - } - -finished: - if (tdb->flags & TDB_VERSION1) { - /* if needed, run recovery */ - if (tdb1_transaction_recover(tdb) == -1) { - ecode = tdb->last_error; - goto fail; - } - } - - tdb_unlock_open(tdb, openlock); - - /* This makes sure we have current map_size and mmap. */ - if (tdb->flags & TDB_VERSION1) { - ecode = tdb1_probe_length(tdb); - } else { - ecode = tdb->tdb2.io->oob(tdb, tdb->file->map_size, 1, true); - } - if (unlikely(ecode != TDB_SUCCESS)) - goto fail; - - if (!(tdb->flags & TDB_VERSION1)) { - /* Now it's fully formed, recover if necessary. 
*/ - berr = tdb_needs_recovery(tdb); - if (unlikely(berr != false)) { - if (berr < 0) { - ecode = TDB_OFF_TO_ERR(berr); - goto fail; - } - ecode = tdb_lock_and_recover(tdb); - if (ecode != TDB_SUCCESS) { - goto fail; - } - } - - ecode = tdb_ftable_init(tdb); - if (ecode != TDB_SUCCESS) { - goto fail; - } - } - - tdb->next = tdbs; - tdbs = tdb; - return tdb; - - fail: - /* Map ecode to some logical errno. */ - switch (TDB_ERR_TO_OFF(ecode)) { - case TDB_ERR_TO_OFF(TDB_ERR_CORRUPT): - case TDB_ERR_TO_OFF(TDB_ERR_IO): - saved_errno = EIO; - break; - case TDB_ERR_TO_OFF(TDB_ERR_LOCK): - saved_errno = EWOULDBLOCK; - break; - case TDB_ERR_TO_OFF(TDB_ERR_OOM): - saved_errno = ENOMEM; - break; - case TDB_ERR_TO_OFF(TDB_ERR_EINVAL): - saved_errno = EINVAL; - break; - default: - saved_errno = EINVAL; - break; - } - -fail_errno: -#ifdef TDB_TRACE - close(tdb->tracefd); -#endif - if (tdb->file) { - tdb_lock_cleanup(tdb); - if (--tdb->file->refcnt == 0) { - assert(tdb->file->num_lockrecs == 0); - if (tdb->file->map_ptr) { - if (tdb->flags & TDB_INTERNAL) { - free(tdb->file->map_ptr); - } else - tdb_munmap(tdb->file); - } - if (close(tdb->file->fd) != 0) - tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_open: failed to close tdb fd" - " on error: %s", strerror(errno)); - free(tdb->file->lockrecs); - free(tdb->file); - } - } - - free(tdb); - errno = saved_errno; - return NULL; -} - -int tdb_close(struct tdb_context *tdb) -{ - int ret = 0; - struct tdb_context **i; - - tdb_trace(tdb, "tdb_close"); - - if (tdb->flags & TDB_VERSION1) { - if (tdb->tdb1.transaction) { - tdb1_transaction_cancel(tdb); - } - } else { - if (tdb->tdb2.transaction) { - tdb_transaction_cancel(tdb); - } - } - - if (tdb->file->map_ptr) { - if (tdb->flags & TDB_INTERNAL) - free(tdb->file->map_ptr); - else - tdb_munmap(tdb->file); - } - if (tdb->file) { - tdb_lock_cleanup(tdb); - if (--tdb->file->refcnt == 0) { - ret = close(tdb->file->fd); - free(tdb->file->lockrecs); - free(tdb->file); - } - } - - /* Remove from tdbs list */ - for (i = &tdbs; *i; i = &(*i)->next) { - if (*i == tdb) { - *i = tdb->next; - break; - } - } - -#ifdef TDB_TRACE - close(tdb->tracefd); -#endif - free(tdb); - - return ret; -} - -void tdb_foreach_(int (*fn)(struct tdb_context *, void *), void *p) -{ - struct tdb_context *i; - - for (i = tdbs; i; i = i->next) { - if (fn(i, p) != 0) - break; - } -} diff --git a/ccan/tdb2/private.h b/ccan/tdb2/private.h deleted file mode 100644 index ba7de3be..00000000 --- a/ccan/tdb2/private.h +++ /dev/null @@ -1,762 +0,0 @@ -#ifndef TDB_PRIVATE_H -#define TDB_PRIVATE_H - /* - Trivial Database 2: private types and prototypes - Copyright (C) Rusty Russell 2010 - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . 
-*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef TEST_IT -#define TEST_IT(cond) -#endif - -/* #define TDB_TRACE 1 */ - -#ifndef __STRING -#define __STRING(x) #x -#endif - -#ifndef __STRINGSTRING -#define __STRINGSTRING(x) __STRING(x) -#endif - -#ifndef __location__ -#define __location__ __FILE__ ":" __STRINGSTRING(__LINE__) -#endif - -typedef uint64_t tdb_len_t; -typedef uint64_t tdb_off_t; - -#define TDB_MAGIC_FOOD "TDB file\n" -#define TDB_VERSION ((uint64_t)(0x26011967 + 7)) -#define TDB1_VERSION (0x26011967 + 6) -#define TDB_USED_MAGIC ((uint64_t)0x1999) -#define TDB_HTABLE_MAGIC ((uint64_t)0x1888) -#define TDB_CHAIN_MAGIC ((uint64_t)0x1777) -#define TDB_FTABLE_MAGIC ((uint64_t)0x1666) -#define TDB_CAP_MAGIC ((uint64_t)0x1555) -#define TDB_FREE_MAGIC ((uint64_t)0xFE) -#define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL) -#define TDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL) -#define TDB_RECOVERY_INVALID_MAGIC (0x0ULL) - -/* Capability bits. */ -#define TDB_CAP_TYPE_MASK 0x1FFFFFFFFFFFFFFFULL -#define TDB_CAP_NOCHECK 0x8000000000000000ULL -#define TDB_CAP_NOWRITE 0x4000000000000000ULL -#define TDB_CAP_NOOPEN 0x2000000000000000ULL - -#define TDB_OFF_IS_ERR(off) unlikely(off >= (tdb_off_t)(long)TDB_ERR_LAST) -#define TDB_OFF_TO_ERR(off) ((enum TDB_ERROR)(long)(off)) -#define TDB_ERR_TO_OFF(ecode) ((tdb_off_t)(long)(ecode)) - -/* Packing errors into pointers and v.v. */ -#define TDB_PTR_IS_ERR(ptr) \ - unlikely((unsigned long)(ptr) >= (unsigned long)TDB_ERR_LAST) -#define TDB_PTR_ERR(p) ((enum TDB_ERROR)(long)(p)) -#define TDB_ERR_PTR(err) ((void *)(long)(err)) - -/* Common case of returning true, false or -ve error. */ -typedef int tdb_bool_err; - -/* Prevent others from opening the file. */ -#define TDB_OPEN_LOCK 0 -/* Expanding file. */ -#define TDB_EXPANSION_LOCK 2 -/* Doing a transaction. */ -#define TDB_TRANSACTION_LOCK 8 -/* Hash chain locks. */ -#define TDB_HASH_LOCK_START 64 - -/* Range for hash locks. */ -#define TDB_HASH_LOCK_RANGE_BITS 30 -#define TDB_HASH_LOCK_RANGE (1 << TDB_HASH_LOCK_RANGE_BITS) - -/* We have 1024 entries in the top level. */ -#define TDB_TOPLEVEL_HASH_BITS 10 -/* And 64 entries in each sub-level: thus 64 bits exactly after 9 levels. */ -#define TDB_SUBLEVEL_HASH_BITS 6 -/* And 8 entries in each group, ie 8 groups per sublevel. */ -#define TDB_HASH_GROUP_BITS 3 -/* This is currently 10: beyond this we chain. */ -#define TDB_MAX_LEVELS (1+(64-TDB_TOPLEVEL_HASH_BITS) / TDB_SUBLEVEL_HASH_BITS) - -/* Extend file by least 100 times larger than needed. */ -#define TDB_EXTENSION_FACTOR 100 - -/* We steal bits from the offsets to store hash info. */ -#define TDB_OFF_HASH_GROUP_MASK ((1ULL << TDB_HASH_GROUP_BITS) - 1) -/* We steal this many upper bits, giving a maximum offset of 64 exabytes. */ -#define TDB_OFF_UPPER_STEAL 8 -#define TDB_OFF_UPPER_STEAL_EXTRA 7 -/* The bit number where we store extra hash bits. */ -#define TDB_OFF_HASH_EXTRA_BIT 57 -#define TDB_OFF_UPPER_STEAL_SUBHASH_BIT 56 - -/* Additional features we understand. Currently: none. */ -#define TDB_FEATURE_MASK ((uint64_t)0) - -/* The bit number where we store the extra hash bits. */ -/* Convenience mask to get actual offset. */ -#define TDB_OFF_MASK \ - (((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1) - TDB_OFF_HASH_GROUP_MASK) - -/* How many buckets in a free list: see size_to_bucket(). */ -#define TDB_FREE_BUCKETS (64 - TDB_OFF_UPPER_STEAL) - -/* We have to be able to fit a free record here. 
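   (struct tdb_free_record below is three uint64_t fields and struct
   tdb_used_record is two, so TDB_MIN_DATA_LEN works out to 8 bytes, the same
   minimum noted on the free record's 'next' field.)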
*/ -#define TDB_MIN_DATA_LEN \ - (sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record)) - -/* Indicates this entry is not on an flist (can happen during coalescing) */ -#define TDB_FTABLE_NONE ((1ULL << TDB_OFF_UPPER_STEAL) - 1) - -struct tdb_used_record { - /* For on-disk compatibility, we avoid bitfields: - magic: 16, (highest) - key_len_bits: 5, - extra_padding: 32 - hash_bits: 11 - */ - uint64_t magic_and_meta; - /* The bottom key_len_bits*2 are key length, rest is data length. */ - uint64_t key_and_data_len; -}; - -static inline unsigned rec_key_bits(const struct tdb_used_record *r) -{ - return ((r->magic_and_meta >> 43) & ((1 << 5)-1)) * 2; -} - -static inline uint64_t rec_key_length(const struct tdb_used_record *r) -{ - return r->key_and_data_len & ((1ULL << rec_key_bits(r)) - 1); -} - -static inline uint64_t rec_data_length(const struct tdb_used_record *r) -{ - return r->key_and_data_len >> rec_key_bits(r); -} - -static inline uint64_t rec_extra_padding(const struct tdb_used_record *r) -{ - return (r->magic_and_meta >> 11) & 0xFFFFFFFF; -} - -static inline uint32_t rec_hash(const struct tdb_used_record *r) -{ - return r->magic_and_meta & ((1 << 11) - 1); -} - -static inline uint16_t rec_magic(const struct tdb_used_record *r) -{ - return (r->magic_and_meta >> 48); -} - -struct tdb_free_record { - uint64_t magic_and_prev; /* TDB_OFF_UPPER_STEAL bits magic, then prev */ - uint64_t ftable_and_len; /* Len not counting these two fields. */ - /* This is why the minimum record size is 8 bytes. */ - uint64_t next; -}; - -static inline uint64_t frec_prev(const struct tdb_free_record *f) -{ - return f->magic_and_prev & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1); -} - -static inline uint64_t frec_magic(const struct tdb_free_record *f) -{ - return f->magic_and_prev >> (64 - TDB_OFF_UPPER_STEAL); -} - -static inline uint64_t frec_len(const struct tdb_free_record *f) -{ - return f->ftable_and_len & ((1ULL << (64 - TDB_OFF_UPPER_STEAL))-1); -} - -static inline unsigned frec_ftable(const struct tdb_free_record *f) -{ - return f->ftable_and_len >> (64 - TDB_OFF_UPPER_STEAL); -} - -struct tdb_recovery_record { - uint64_t magic; - /* Length of record (add this header to get total length). */ - uint64_t max_len; - /* Length used. */ - uint64_t len; - /* Old length of file before transaction. */ - uint64_t eof; -}; - -/* If we bottom out of the subhashes, we chain. */ -struct tdb_chain { - tdb_off_t rec[1 << TDB_HASH_GROUP_BITS]; - tdb_off_t next; -}; - -/* this is stored at the front of every database */ -struct tdb_header { - char magic_food[64]; /* for /etc/magic */ - /* FIXME: Make me 32 bit? */ - uint64_t version; /* version of the code */ - uint64_t hash_test; /* result of hashing HASH_MAGIC. */ - uint64_t hash_seed; /* "random" seed written at creation time. */ - tdb_off_t free_table; /* (First) free table. */ - tdb_off_t recovery; /* Transaction recovery area. */ - - uint64_t features_used; /* Features all writers understand */ - uint64_t features_offered; /* Features offered */ - - uint64_t seqnum; /* Sequence number for TDB_SEQNUM */ - - tdb_off_t capabilities; /* Optional linked list of capabilities. */ - tdb_off_t reserved[22]; - - /* Top level hash table. */ - tdb_off_t hashtable[1ULL << TDB_TOPLEVEL_HASH_BITS]; -}; - -struct tdb_freetable { - struct tdb_used_record hdr; - tdb_off_t next; - tdb_off_t buckets[TDB_FREE_BUCKETS]; -}; - -struct tdb_capability { - struct tdb_used_record hdr; - tdb_off_t type; - tdb_off_t next; - /* ... 
*/ -}; - -/* Information about a particular (locked) hash entry. */ -struct hash_info { - /* Full hash value of entry. */ - uint64_t h; - /* Start and length of lock acquired. */ - tdb_off_t hlock_start; - tdb_len_t hlock_range; - /* Start of hash group. */ - tdb_off_t group_start; - /* Bucket we belong in. */ - unsigned int home_bucket; - /* Bucket we (or an empty space) were found in. */ - unsigned int found_bucket; - /* How many bits of the hash are already used. */ - unsigned int hash_used; - /* Current working group. */ - tdb_off_t group[1 << TDB_HASH_GROUP_BITS]; -}; - -struct traverse_info { - struct traverse_level { - tdb_off_t hashtable; - /* We ignore groups here, and treat it as a big array. */ - unsigned entry; - unsigned int total_buckets; - } levels[TDB_MAX_LEVELS + 1]; - unsigned int num_levels; - unsigned int toplevel_group; - /* This makes delete-everything-inside-traverse work as expected. */ - tdb_off_t prev; -}; - -typedef uint32_t tdb1_len_t; -typedef uint32_t tdb1_off_t; - -enum tdb_lock_flags { - /* WAIT == F_SETLKW, NOWAIT == F_SETLK */ - TDB_LOCK_NOWAIT = 0, - TDB_LOCK_WAIT = 1, - /* If set, don't log an error on failure. */ - TDB_LOCK_PROBE = 2, - /* If set, don't check for recovery (used by recovery code). */ - TDB_LOCK_NOCHECK = 4, -}; - -struct tdb_lock { - struct tdb_context *owner; - off_t off; - uint32_t count; - uint32_t ltype; -}; - -/* This is only needed for tdb_access_commit, but used everywhere to - * simplify. */ -struct tdb_access_hdr { - struct tdb_access_hdr *next; - tdb_off_t off; - tdb_len_t len; - bool convert; -}; - -struct tdb_file { - /* How many are sharing us? */ - unsigned int refcnt; - - /* Mmap (if any), or malloc (for TDB_INTERNAL). */ - void *map_ptr; - - /* How much space has been mapped (<= current file size) */ - tdb_len_t map_size; - - /* The file descriptor (-1 for TDB_INTERNAL). */ - int fd; - - /* Lock information */ - pid_t locker; - struct tdb_lock allrecord_lock; - size_t num_lockrecs; - struct tdb_lock *lockrecs; - - /* Identity of this file. */ - dev_t device; - ino_t inode; -}; - -struct tdb_methods { - enum TDB_ERROR (*tread)(struct tdb_context *, tdb_off_t, void *, - tdb_len_t); - enum TDB_ERROR (*twrite)(struct tdb_context *, tdb_off_t, const void *, - tdb_len_t); - enum TDB_ERROR (*oob)(struct tdb_context *, tdb_off_t, tdb_len_t, bool); - enum TDB_ERROR (*expand_file)(struct tdb_context *, tdb_len_t); - void *(*direct)(struct tdb_context *, tdb_off_t, size_t, bool); -}; - -/* - internal prototypes -*/ -/* hash.c: */ -uint64_t tdb_jenkins_hash(const void *key, size_t length, uint64_t seed, - void *unused); - -enum TDB_ERROR first_in_hash(struct tdb_context *tdb, - struct traverse_info *tinfo, - TDB_DATA *kbuf, size_t *dlen); - -enum TDB_ERROR next_in_hash(struct tdb_context *tdb, - struct traverse_info *tinfo, - TDB_DATA *kbuf, size_t *dlen); - -/* Hash random memory. */ -uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len); - -/* Hash on disk. */ -uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off); - -/* Find and lock a hash entry (or where it would be). 
*/ -tdb_off_t find_and_lock(struct tdb_context *tdb, - struct tdb_data key, - int ltype, - struct hash_info *h, - struct tdb_used_record *rec, - struct traverse_info *tinfo); - -enum TDB_ERROR replace_in_hash(struct tdb_context *tdb, - struct hash_info *h, - tdb_off_t new_off); - -enum TDB_ERROR add_to_hash(struct tdb_context *tdb, struct hash_info *h, - tdb_off_t new_off); - -enum TDB_ERROR delete_from_hash(struct tdb_context *tdb, struct hash_info *h); - -/* For tdb_check */ -bool is_subhash(tdb_off_t val); -enum TDB_ERROR unknown_capability(struct tdb_context *tdb, const char *caller, - tdb_off_t type); - -/* free.c: */ -enum TDB_ERROR tdb_ftable_init(struct tdb_context *tdb); - -/* check.c needs these to iterate through free lists. */ -tdb_off_t first_ftable(struct tdb_context *tdb); -tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable); - -/* This returns space or -ve error number. */ -tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen, - uint64_t hash, unsigned magic, bool growing); - -/* Put this record in a free list. */ -enum TDB_ERROR add_free_record(struct tdb_context *tdb, - tdb_off_t off, tdb_len_t len_with_header, - enum tdb_lock_flags waitflag, - bool coalesce_ok); - -/* Set up header for a used/ftable/htable/chain/capability record. */ -enum TDB_ERROR set_header(struct tdb_context *tdb, - struct tdb_used_record *rec, - unsigned magic, uint64_t keylen, uint64_t datalen, - uint64_t actuallen, unsigned hashlow); - -/* Used by tdb_check to verify. */ -unsigned int size_to_bucket(tdb_len_t data_len); -tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket); - -/* Used by tdb_summary */ -tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off); - -/* Adjust expansion, used by create_recovery_area */ -tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size); - -/* io.c: */ -/* Initialize tdb->methods. */ -void tdb_io_init(struct tdb_context *tdb); - -/* Convert endian of the buffer if required. */ -void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size); - -/* Unmap and try to map the tdb. */ -void tdb_munmap(struct tdb_file *file); -void tdb_mmap(struct tdb_context *tdb); - -/* Either alloc a copy, or give direct access. Release frees or noop. */ -const void *tdb_access_read(struct tdb_context *tdb, - tdb_off_t off, tdb_len_t len, bool convert); -void *tdb_access_write(struct tdb_context *tdb, - tdb_off_t off, tdb_len_t len, bool convert); - -/* Release result of tdb_access_read/write. */ -void tdb_access_release(struct tdb_context *tdb, const void *p); -/* Commit result of tdb_acces_write. */ -enum TDB_ERROR tdb_access_commit(struct tdb_context *tdb, void *p); - -/* Convenience routine to get an offset. */ -tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off); - -/* Write an offset at an offset. */ -enum TDB_ERROR tdb_write_off(struct tdb_context *tdb, tdb_off_t off, - tdb_off_t val); - -/* Clear an ondisk area. */ -enum TDB_ERROR zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len); - -/* Return a non-zero offset between >= start < end in this array (or end). */ -tdb_off_t tdb_find_nonzero_off(struct tdb_context *tdb, - tdb_off_t base, - uint64_t start, - uint64_t end); - -/* Return a zero offset in this array, or num. */ -tdb_off_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off, - uint64_t num); - -/* Allocate and make a copy of some offset. */ -void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len); - -/* Writes a converted copy of a record. 
*/ -enum TDB_ERROR tdb_write_convert(struct tdb_context *tdb, tdb_off_t off, - const void *rec, size_t len); - -/* Reads record and converts it */ -enum TDB_ERROR tdb_read_convert(struct tdb_context *tdb, tdb_off_t off, - void *rec, size_t len); - -/* Bump the seqnum (caller checks for tdb->flags & TDB_SEQNUM) */ -void tdb_inc_seqnum(struct tdb_context *tdb); - -/* lock.c: */ -/* Print message because another tdb owns a lock we want. */ -enum TDB_ERROR owner_conflict(struct tdb_context *tdb, const char *call); - -/* If we fork, we no longer really own locks. */ -bool check_lock_pid(struct tdb_context *tdb, const char *call, bool log); - -/* Lock/unlock a range of hashes. */ -enum TDB_ERROR tdb_lock_hashes(struct tdb_context *tdb, - tdb_off_t hash_lock, tdb_len_t hash_range, - int ltype, enum tdb_lock_flags waitflag); -enum TDB_ERROR tdb_unlock_hashes(struct tdb_context *tdb, - tdb_off_t hash_lock, - tdb_len_t hash_range, int ltype); - -/* For closing the file. */ -void tdb_lock_cleanup(struct tdb_context *tdb); - -/* Lock/unlock a particular free bucket. */ -enum TDB_ERROR tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off, - enum tdb_lock_flags waitflag); -void tdb_unlock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off); - -/* Serialize transaction start. */ -enum TDB_ERROR tdb_transaction_lock(struct tdb_context *tdb, int ltype); -void tdb_transaction_unlock(struct tdb_context *tdb, int ltype); - -/* Do we have any hash locks (ie. via tdb_chainlock) ? */ -bool tdb_has_hash_locks(struct tdb_context *tdb); - -/* Lock entire database. */ -enum TDB_ERROR tdb_allrecord_lock(struct tdb_context *tdb, int ltype, - enum tdb_lock_flags flags, bool upgradable); -void tdb_allrecord_unlock(struct tdb_context *tdb, int ltype); -enum TDB_ERROR tdb_allrecord_upgrade(struct tdb_context *tdb, off_t start); - -/* Serialize db open. */ -enum TDB_ERROR tdb_lock_open(struct tdb_context *tdb, - int ltype, enum tdb_lock_flags flags); -void tdb_unlock_open(struct tdb_context *tdb, int ltype); -bool tdb_has_open_lock(struct tdb_context *tdb); - -/* Serialize db expand. */ -enum TDB_ERROR tdb_lock_expand(struct tdb_context *tdb, int ltype); -void tdb_unlock_expand(struct tdb_context *tdb, int ltype); -bool tdb_has_expansion_lock(struct tdb_context *tdb); - -/* If it needs recovery, grab all the locks and do it. */ -enum TDB_ERROR tdb_lock_and_recover(struct tdb_context *tdb); - -/* Byte-range lock wrappers for TDB1 to access. */ -enum TDB_ERROR tdb_brlock(struct tdb_context *tdb, - int rw_type, tdb_off_t offset, tdb_off_t len, - enum tdb_lock_flags flags); - -enum TDB_ERROR tdb_brunlock(struct tdb_context *tdb, - int rw_type, tdb_off_t offset, size_t len); - -enum TDB_ERROR tdb_nest_lock(struct tdb_context *tdb, - tdb_off_t offset, int ltype, - enum tdb_lock_flags flags); - -enum TDB_ERROR tdb_nest_unlock(struct tdb_context *tdb, - tdb_off_t off, int ltype); - -enum TDB_ERROR tdb_lock_gradual(struct tdb_context *tdb, - int ltype, enum tdb_lock_flags flags, - tdb_off_t off, tdb_off_t len); - -/* Default lock and unlock functions. 
*/ -int tdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag, void *); -int tdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *); - -/* transaction.c: */ -enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb); -tdb_bool_err tdb_needs_recovery(struct tdb_context *tdb); - -/* this is stored at the front of every database */ -struct tdb1_header { - char magic_food[32]; /* for /etc/magic */ - uint32_t version; /* version of the code */ - uint32_t hash_size; /* number of hash entries */ - tdb1_off_t rwlocks; /* obsolete - kept to detect old formats */ - tdb1_off_t recovery_start; /* offset of transaction recovery region */ - tdb1_off_t sequence_number; /* used when TDB1_SEQNUM is set */ - uint32_t magic1_hash; /* hash of TDB_MAGIC_FOOD. */ - uint32_t magic2_hash; /* hash of TDB1_MAGIC. */ - tdb1_off_t reserved[27]; -}; - -struct tdb1_traverse_lock { - struct tdb1_traverse_lock *next; - uint32_t off; - uint32_t hash; - int lock_rw; -}; - -struct tdb_context { - /* Single list of all TDBs, to detect multiple opens. */ - struct tdb_context *next; - - /* Filename of the database. */ - const char *name; - - /* Logging function */ - void (*log_fn)(struct tdb_context *tdb, - enum tdb_log_level level, - enum TDB_ERROR ecode, - const char *message, - void *data); - void *log_data; - - /* Open flags passed to tdb_open. */ - int open_flags; - - /* low level (fnctl) lock functions. */ - int (*lock_fn)(int fd, int rw, off_t off, off_t len, bool w, void *); - int (*unlock_fn)(int fd, int rw, off_t off, off_t len, void *); - void *lock_data; - - /* the tdb flags passed to tdb_open. */ - uint32_t flags; - - /* Our statistics. */ - struct tdb_attribute_stats stats; - - /* The actual file information */ - struct tdb_file *file; - - /* Hash function. */ - uint64_t (*hash_fn)(const void *key, size_t len, uint64_t seed, void *); - void *hash_data; - uint64_t hash_seed; - - /* Our open hook, if any. */ - enum TDB_ERROR (*openhook)(int fd, void *data); - void *openhook_data; - - /* Last error we returned. */ - enum TDB_ERROR last_error; - - struct { - - /* Are we accessing directly? (debugging check). */ - int direct_access; - - /* Set if we are in a transaction. */ - struct tdb_transaction *transaction; - - /* What free table are we using? */ - tdb_off_t ftable_off; - unsigned int ftable; - - /* IO methods: changes for transactions. 
*/ - const struct tdb_methods *io; - - /* Direct access information */ - struct tdb_access_hdr *access; - } tdb2; - - struct { - int traverse_read; /* read-only traversal */ - int traverse_write; /* read-write traversal */ - - struct tdb1_header header; /* a cached copy of the header */ - struct tdb1_traverse_lock travlocks; /* current traversal locks */ - const struct tdb1_methods *io; - struct tdb1_transaction *transaction; - int page_size; - int max_dead_records; - } tdb1; -}; - -#define TDB1_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24)) - -/* tdb1_check.c: */ -int tdb1_check(struct tdb_context *tdb, - enum TDB_ERROR (*check)(TDB_DATA key, TDB_DATA data, void *), - void *private_data); - - -/* tdb1_open.c: */ -enum TDB_ERROR tdb1_new_database(struct tdb_context *tdb, - struct tdb_attribute_tdb1_hashsize *hashsize, - struct tdb_attribute_tdb1_max_dead *max_dead); -enum TDB_ERROR tdb1_open(struct tdb_context *tdb, - struct tdb_attribute_tdb1_max_dead *max_dead); - -/* tdb1_io.c: */ -enum TDB_ERROR tdb1_probe_length(struct tdb_context *tdb); - -/* tdb1_lock.c: */ -int tdb1_allrecord_lock(struct tdb_context *tdb, int ltype, - enum tdb_lock_flags flags, bool upgradable); -int tdb1_allrecord_unlock(struct tdb_context *tdb, int ltype); - -int tdb1_chainlock(struct tdb_context *tdb, TDB_DATA key); -int tdb1_chainunlock(struct tdb_context *tdb, TDB_DATA key); -int tdb1_chainlock_read(struct tdb_context *tdb, TDB_DATA key); -int tdb1_chainunlock_read(struct tdb_context *tdb, TDB_DATA key); - -/* tdb1_transaction.c: */ -int tdb1_transaction_recover(struct tdb_context *tdb); -int tdb1_transaction_cancel(struct tdb_context *tdb); - -/* tdb1_traverse.c: */ -int tdb1_traverse(struct tdb_context *tdb, - int (*)(struct tdb_context *, TDB_DATA, TDB_DATA, void *), - void *private_data); - -/* tdb1_summary.c: */ -char *tdb1_summary(struct tdb_context *tdb); - -/* tdb1_tdb.c: */ -int tdb1_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag); -enum TDB_ERROR tdb1_fetch(struct tdb_context *tdb, TDB_DATA key, - TDB_DATA *data); -int tdb1_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf); -int tdb1_delete(struct tdb_context *tdb, TDB_DATA key); -int tdb1_exists(struct tdb_context *tdb, TDB_DATA key); -enum TDB_ERROR tdb1_parse_record(struct tdb_context *tdb, TDB_DATA key, - enum TDB_ERROR (*parser)(TDB_DATA key, - TDB_DATA data, - void *private_data), - void *private_data); -void tdb1_increment_seqnum_nonblock(struct tdb_context *tdb); -int tdb1_get_seqnum(struct tdb_context *tdb); -int tdb1_wipe_all(struct tdb_context *tdb); - -/* tdb1_transaction.c: */ -int tdb1_transaction_start(struct tdb_context *tdb); -int tdb1_transaction_prepare_commit(struct tdb_context *tdb); -int tdb1_transaction_commit(struct tdb_context *tdb); - -/* tdb1_traverse.c: */ -TDB_DATA tdb1_firstkey(struct tdb_context *tdb); -TDB_DATA tdb1_nextkey(struct tdb_context *tdb, TDB_DATA key); - -/* tdb.c: */ -enum TDB_ERROR COLD tdb_logerr(struct tdb_context *tdb, - enum TDB_ERROR ecode, - enum tdb_log_level level, - const char *fmt, ...); - -#ifdef TDB_TRACE -void tdb_trace(struct tdb_context *tdb, const char *op); -void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op); -void tdb_trace_open(struct tdb_context *tdb, const char *op, - unsigned hash_size, unsigned tdb_flags, unsigned open_flags); -void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret); -void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret); -void 
tdb_trace_1rec(struct tdb_context *tdb, const char *op, - TDB_DATA rec); -void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op, - TDB_DATA rec, int ret); -void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op, - TDB_DATA rec, TDB_DATA ret); -void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op, - TDB_DATA rec1, TDB_DATA rec2, unsigned flag, - int ret); -void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op, - TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret); -#else -#define tdb_trace(tdb, op) -#define tdb_trace_seqnum(tdb, seqnum, op) -#define tdb_trace_open(tdb, op, hash_size, tdb_flags, open_flags) -#define tdb_trace_ret(tdb, op, ret) -#define tdb_trace_retrec(tdb, op, ret) -#define tdb_trace_1rec(tdb, op, rec) -#define tdb_trace_1rec_ret(tdb, op, rec, ret) -#define tdb_trace_1rec_retrec(tdb, op, rec, ret) -#define tdb_trace_2rec_flag_ret(tdb, op, rec1, rec2, flag, ret) -#define tdb_trace_2rec_retrec(tdb, op, rec1, rec2, ret) -#endif /* !TDB_TRACE */ - -#endif diff --git a/ccan/tdb2/summary.c b/ccan/tdb2/summary.c deleted file mode 100644 index f3a3a085..00000000 --- a/ccan/tdb2/summary.c +++ /dev/null @@ -1,356 +0,0 @@ - /* - Trivial Database 2: human-readable summary code - Copyright (C) Rusty Russell 2010 - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . 
-*/ -#include "private.h" -#include -#include - -#define SUMMARY_FORMAT \ - "Size of file/data: %zu/%zu\n" \ - "Number of records: %zu\n" \ - "Smallest/average/largest keys: %zu/%zu/%zu\n%s" \ - "Smallest/average/largest data: %zu/%zu/%zu\n%s" \ - "Smallest/average/largest padding: %zu/%zu/%zu\n%s" \ - "Number of free records: %zu\n" \ - "Smallest/average/largest free records: %zu/%zu/%zu\n%s" \ - "Number of uncoalesced records: %zu\n" \ - "Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \ - "Toplevel hash used: %u of %u\n" \ - "Number of chains: %zu\n" \ - "Number of subhashes: %zu\n" \ - "Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \ - "Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n" - -#define BUCKET_SUMMARY_FORMAT_A \ - "Free bucket %zu: total entries %zu.\n" \ - "Smallest/average/largest length: %zu/%zu/%zu\n%s" -#define BUCKET_SUMMARY_FORMAT_B \ - "Free bucket %zu-%zu: total entries %zu.\n" \ - "Smallest/average/largest length: %zu/%zu/%zu\n%s" -#define CAPABILITY_FORMAT \ - "Capability %llu%s\n" - -#define HISTO_WIDTH 70 -#define HISTO_HEIGHT 20 - -static tdb_off_t count_hash(struct tdb_context *tdb, - tdb_off_t hash_off, unsigned bits) -{ - const tdb_off_t *h; - tdb_off_t count = 0; - unsigned int i; - - h = tdb_access_read(tdb, hash_off, sizeof(*h) << bits, true); - if (TDB_PTR_IS_ERR(h)) { - return TDB_ERR_TO_OFF(TDB_PTR_ERR(h)); - } - for (i = 0; i < (1 << bits); i++) - count += (h[i] != 0); - - tdb_access_release(tdb, h); - return count; -} - -static enum TDB_ERROR summarize(struct tdb_context *tdb, - struct tally *hashes, - struct tally *ftables, - struct tally *fr, - struct tally *keys, - struct tally *data, - struct tally *extra, - struct tally *uncoal, - struct tally *chains) -{ - tdb_off_t off; - tdb_len_t len; - tdb_len_t unc = 0; - - for (off = sizeof(struct tdb_header); - off < tdb->file->map_size; - off += len) { - const union { - struct tdb_used_record u; - struct tdb_free_record f; - struct tdb_recovery_record r; - } *p; - /* We might not be able to get the whole thing. 
*/ - p = tdb_access_read(tdb, off, sizeof(p->f), true); - if (TDB_PTR_IS_ERR(p)) { - return TDB_PTR_ERR(p); - } - if (frec_magic(&p->f) != TDB_FREE_MAGIC) { - if (unc > 1) { - tally_add(uncoal, unc); - unc = 0; - } - } - - if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC - || p->r.magic == TDB_RECOVERY_MAGIC) { - len = sizeof(p->r) + p->r.max_len; - } else if (frec_magic(&p->f) == TDB_FREE_MAGIC) { - len = frec_len(&p->f); - tally_add(fr, len); - len += sizeof(p->u); - unc++; - } else if (rec_magic(&p->u) == TDB_USED_MAGIC) { - len = sizeof(p->u) - + rec_key_length(&p->u) - + rec_data_length(&p->u) - + rec_extra_padding(&p->u); - - tally_add(keys, rec_key_length(&p->u)); - tally_add(data, rec_data_length(&p->u)); - tally_add(extra, rec_extra_padding(&p->u)); - } else if (rec_magic(&p->u) == TDB_HTABLE_MAGIC) { - tdb_off_t count = count_hash(tdb, - off + sizeof(p->u), - TDB_SUBLEVEL_HASH_BITS); - if (TDB_OFF_IS_ERR(count)) { - return TDB_OFF_TO_ERR(count); - } - tally_add(hashes, count); - tally_add(extra, rec_extra_padding(&p->u)); - len = sizeof(p->u) - + rec_data_length(&p->u) - + rec_extra_padding(&p->u); - } else if (rec_magic(&p->u) == TDB_FTABLE_MAGIC) { - len = sizeof(p->u) - + rec_data_length(&p->u) - + rec_extra_padding(&p->u); - tally_add(ftables, rec_data_length(&p->u)); - tally_add(extra, rec_extra_padding(&p->u)); - } else if (rec_magic(&p->u) == TDB_CHAIN_MAGIC) { - len = sizeof(p->u) - + rec_data_length(&p->u) - + rec_extra_padding(&p->u); - tally_add(chains, 1); - tally_add(extra, rec_extra_padding(&p->u)); - } else { - len = dead_space(tdb, off); - if (TDB_OFF_IS_ERR(len)) { - return TDB_OFF_TO_ERR(len); - } - } - tdb_access_release(tdb, p); - } - if (unc) - tally_add(uncoal, unc); - return TDB_SUCCESS; -} - -static size_t num_capabilities(struct tdb_context *tdb) -{ - tdb_off_t off, next; - const struct tdb_capability *cap; - size_t count = 0; - - off = tdb_read_off(tdb, offsetof(struct tdb_header, capabilities)); - if (TDB_OFF_IS_ERR(off)) - return count; - - /* Count capability list. */ - for (; off; off = next) { - cap = tdb_access_read(tdb, off, sizeof(*cap), true); - if (TDB_PTR_IS_ERR(cap)) { - break; - } - count++; - next = cap->next; - tdb_access_release(tdb, cap); - } - return count; -} - -static void add_capabilities(struct tdb_context *tdb, size_t num, char *summary) -{ - tdb_off_t off, next; - const struct tdb_capability *cap; - size_t count = 0; - - /* Append to summary. */ - summary += strlen(summary); - - off = tdb_read_off(tdb, offsetof(struct tdb_header, capabilities)); - if (TDB_OFF_IS_ERR(off)) - return; - - /* Walk capability list. */ - for (; off; off = next) { - cap = tdb_access_read(tdb, off, sizeof(*cap), true); - if (TDB_PTR_IS_ERR(cap)) { - break; - } - count++; - sprintf(summary, CAPABILITY_FORMAT, - cap->type & TDB_CAP_TYPE_MASK, - /* Noopen? How did we get here? */ - (cap->type & TDB_CAP_NOOPEN) ? " (unopenable)" - : ((cap->type & TDB_CAP_NOWRITE) - && (cap->type & TDB_CAP_NOCHECK)) ? " (uncheckable,read-only)" - : (cap->type & TDB_CAP_NOWRITE) ? " (read-only)" - : (cap->type & TDB_CAP_NOCHECK) ? 
" (uncheckable)" - : ""); - summary += strlen(summary); - next = cap->next; - tdb_access_release(tdb, cap); - } -} - -enum TDB_ERROR tdb_summary(struct tdb_context *tdb, - enum tdb_summary_flags flags, - char **summary) -{ - tdb_len_t len; - size_t num_caps; - struct tally *ftables, *hashes, *freet, *keys, *data, *extra, *uncoal, - *chains; - char *hashesg, *freeg, *keysg, *datag, *extrag, *uncoalg; - enum TDB_ERROR ecode; - - if (tdb->flags & TDB_VERSION1) { - /* tdb1 doesn't do graphs. */ - *summary = tdb1_summary(tdb); - if (!*summary) - return tdb->last_error; - return TDB_SUCCESS; - } - - hashesg = freeg = keysg = datag = extrag = uncoalg = NULL; - - ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false); - if (ecode != TDB_SUCCESS) { - return tdb->last_error = ecode; - } - - ecode = tdb_lock_expand(tdb, F_RDLCK); - if (ecode != TDB_SUCCESS) { - tdb_allrecord_unlock(tdb, F_RDLCK); - return tdb->last_error = ecode; - } - - /* Start stats off empty. */ - ftables = tally_new(HISTO_HEIGHT); - hashes = tally_new(HISTO_HEIGHT); - freet = tally_new(HISTO_HEIGHT); - keys = tally_new(HISTO_HEIGHT); - data = tally_new(HISTO_HEIGHT); - extra = tally_new(HISTO_HEIGHT); - uncoal = tally_new(HISTO_HEIGHT); - chains = tally_new(HISTO_HEIGHT); - if (!ftables || !hashes || !freet || !keys || !data || !extra - || !uncoal || !chains) { - ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "tdb_summary: failed to allocate" - " tally structures"); - goto unlock; - } - - ecode = summarize(tdb, hashes, ftables, freet, keys, data, extra, - uncoal, chains); - if (ecode != TDB_SUCCESS) { - goto unlock; - } - - if (flags & TDB_SUMMARY_HISTOGRAMS) { - hashesg = tally_histogram(hashes, HISTO_WIDTH, HISTO_HEIGHT); - freeg = tally_histogram(freet, HISTO_WIDTH, HISTO_HEIGHT); - keysg = tally_histogram(keys, HISTO_WIDTH, HISTO_HEIGHT); - datag = tally_histogram(data, HISTO_WIDTH, HISTO_HEIGHT); - extrag = tally_histogram(extra, HISTO_WIDTH, HISTO_HEIGHT); - uncoalg = tally_histogram(uncoal, HISTO_WIDTH, HISTO_HEIGHT); - } - - num_caps = num_capabilities(tdb); - - /* 20 is max length of a %llu. */ - len = strlen(SUMMARY_FORMAT) + 33*20 + 1 - + (hashesg ? strlen(hashesg) : 0) - + (freeg ? strlen(freeg) : 0) - + (keysg ? strlen(keysg) : 0) - + (datag ? strlen(datag) : 0) - + (extrag ? strlen(extrag) : 0) - + (uncoalg ? strlen(uncoalg) : 0) - + num_caps * (strlen(CAPABILITY_FORMAT) + 20*4); - - *summary = malloc(len); - if (!*summary) { - ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "tdb_summary: failed to allocate string"); - goto unlock; - } - - sprintf(*summary, SUMMARY_FORMAT, - (size_t)tdb->file->map_size, - tally_total(keys, NULL) + tally_total(data, NULL), - tally_num(keys), - tally_min(keys), tally_mean(keys), tally_max(keys), - keysg ? keysg : "", - tally_min(data), tally_mean(data), tally_max(data), - datag ? datag : "", - tally_min(extra), tally_mean(extra), tally_max(extra), - extrag ? extrag : "", - tally_num(freet), - tally_min(freet), tally_mean(freet), tally_max(freet), - freeg ? freeg : "", - tally_total(uncoal, NULL), - tally_min(uncoal), tally_mean(uncoal), tally_max(uncoal), - uncoalg ? uncoalg : "", - (unsigned)count_hash(tdb, offsetof(struct tdb_header, - hashtable), - TDB_TOPLEVEL_HASH_BITS), - 1 << TDB_TOPLEVEL_HASH_BITS, - tally_num(chains), - tally_num(hashes), - tally_min(hashes), tally_mean(hashes), tally_max(hashes), - hashesg ? 
hashesg : "", - tally_total(keys, NULL) * 100.0 / tdb->file->map_size, - tally_total(data, NULL) * 100.0 / tdb->file->map_size, - tally_total(extra, NULL) * 100.0 / tdb->file->map_size, - tally_total(freet, NULL) * 100.0 / tdb->file->map_size, - (tally_num(keys) + tally_num(freet) + tally_num(hashes)) - * sizeof(struct tdb_used_record) * 100.0 / tdb->file->map_size, - tally_num(ftables) * sizeof(struct tdb_freetable) - * 100.0 / tdb->file->map_size, - (tally_num(hashes) - * (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) - + (sizeof(tdb_off_t) << TDB_TOPLEVEL_HASH_BITS) - + sizeof(struct tdb_chain) * tally_num(chains)) - * 100.0 / tdb->file->map_size); - - add_capabilities(tdb, num_caps, *summary); - -unlock: - free(hashesg); - free(freeg); - free(keysg); - free(datag); - free(extrag); - free(uncoalg); - free(hashes); - free(freet); - free(keys); - free(data); - free(extra); - free(uncoal); - free(ftables); - free(chains); - - tdb_allrecord_unlock(tdb, F_RDLCK); - tdb_unlock_expand(tdb, F_RDLCK); - return tdb->last_error = ecode; -} diff --git a/ccan/tdb2/tdb.c b/ccan/tdb2/tdb.c deleted file mode 100644 index 62607bf1..00000000 --- a/ccan/tdb2/tdb.c +++ /dev/null @@ -1,642 +0,0 @@ - /* - Trivial Database 2: fetch, store and misc routines. - Copyright (C) Rusty Russell 2010 - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ -#include "private.h" -#include -#include - -static enum TDB_ERROR update_rec_hdr(struct tdb_context *tdb, - tdb_off_t off, - tdb_len_t keylen, - tdb_len_t datalen, - struct tdb_used_record *rec, - uint64_t h) -{ - uint64_t dataroom = rec_data_length(rec) + rec_extra_padding(rec); - enum TDB_ERROR ecode; - - ecode = set_header(tdb, rec, TDB_USED_MAGIC, keylen, datalen, - keylen + dataroom, h); - if (ecode == TDB_SUCCESS) { - ecode = tdb_write_convert(tdb, off, rec, sizeof(*rec)); - } - return ecode; -} - -static enum TDB_ERROR replace_data(struct tdb_context *tdb, - struct hash_info *h, - struct tdb_data key, struct tdb_data dbuf, - tdb_off_t old_off, tdb_len_t old_room, - bool growing) -{ - tdb_off_t new_off; - enum TDB_ERROR ecode; - - /* Allocate a new record. */ - new_off = alloc(tdb, key.dsize, dbuf.dsize, h->h, TDB_USED_MAGIC, - growing); - if (TDB_OFF_IS_ERR(new_off)) { - return TDB_OFF_TO_ERR(new_off); - } - - /* We didn't like the existing one: remove it. 
*/ - if (old_off) { - tdb->stats.frees++; - ecode = add_free_record(tdb, old_off, - sizeof(struct tdb_used_record) - + key.dsize + old_room, - TDB_LOCK_WAIT, true); - if (ecode == TDB_SUCCESS) - ecode = replace_in_hash(tdb, h, new_off); - } else { - ecode = add_to_hash(tdb, h, new_off); - } - if (ecode != TDB_SUCCESS) { - return ecode; - } - - new_off += sizeof(struct tdb_used_record); - ecode = tdb->tdb2.io->twrite(tdb, new_off, key.dptr, key.dsize); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - new_off += key.dsize; - ecode = tdb->tdb2.io->twrite(tdb, new_off, dbuf.dptr, dbuf.dsize); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - if (tdb->flags & TDB_SEQNUM) - tdb_inc_seqnum(tdb); - - return TDB_SUCCESS; -} - -static enum TDB_ERROR update_data(struct tdb_context *tdb, - tdb_off_t off, - struct tdb_data dbuf, - tdb_len_t extra) -{ - enum TDB_ERROR ecode; - - ecode = tdb->tdb2.io->twrite(tdb, off, dbuf.dptr, dbuf.dsize); - if (ecode == TDB_SUCCESS && extra) { - /* Put a zero in; future versions may append other data. */ - ecode = tdb->tdb2.io->twrite(tdb, off + dbuf.dsize, "", 1); - } - if (tdb->flags & TDB_SEQNUM) - tdb_inc_seqnum(tdb); - - return ecode; -} - -enum TDB_ERROR tdb_store(struct tdb_context *tdb, - struct tdb_data key, struct tdb_data dbuf, int flag) -{ - struct hash_info h; - tdb_off_t off; - tdb_len_t old_room = 0; - struct tdb_used_record rec; - enum TDB_ERROR ecode; - - if (tdb->flags & TDB_VERSION1) { - if (tdb1_store(tdb, key, dbuf, flag) == -1) - return tdb->last_error; - return TDB_SUCCESS; - } - - off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL); - if (TDB_OFF_IS_ERR(off)) { - return tdb->last_error = TDB_OFF_TO_ERR(off); - } - - /* Now we have lock on this hash bucket. */ - if (flag == TDB_INSERT) { - if (off) { - ecode = TDB_ERR_EXISTS; - goto out; - } - } else { - if (off) { - old_room = rec_data_length(&rec) - + rec_extra_padding(&rec); - if (old_room >= dbuf.dsize) { - /* Can modify in-place. Easy! */ - ecode = update_rec_hdr(tdb, off, - key.dsize, dbuf.dsize, - &rec, h.h); - if (ecode != TDB_SUCCESS) { - goto out; - } - ecode = update_data(tdb, - off + sizeof(rec) - + key.dsize, dbuf, - old_room - dbuf.dsize); - if (ecode != TDB_SUCCESS) { - goto out; - } - tdb_unlock_hashes(tdb, h.hlock_start, - h.hlock_range, F_WRLCK); - return tdb->last_error = TDB_SUCCESS; - } - } else { - if (flag == TDB_MODIFY) { - /* if the record doesn't exist and we - are in TDB_MODIFY mode then we should fail - the store */ - ecode = TDB_ERR_NOEXIST; - goto out; - } - } - } - - /* If we didn't use the old record, this implies we're growing. */ - ecode = replace_data(tdb, &h, key, dbuf, off, old_room, off); -out: - tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK); - return tdb->last_error = ecode; -} - -enum TDB_ERROR tdb_append(struct tdb_context *tdb, - struct tdb_data key, struct tdb_data dbuf) -{ - struct hash_info h; - tdb_off_t off; - struct tdb_used_record rec; - tdb_len_t old_room = 0, old_dlen; - unsigned char *newdata; - struct tdb_data new_dbuf; - enum TDB_ERROR ecode; - - if (tdb->flags & TDB_VERSION1) { - if (tdb1_append(tdb, key, dbuf) == -1) - return tdb->last_error; - return TDB_SUCCESS; - } - - off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL); - if (TDB_OFF_IS_ERR(off)) { - return tdb->last_error = TDB_OFF_TO_ERR(off); - } - - if (off) { - old_dlen = rec_data_length(&rec); - old_room = old_dlen + rec_extra_padding(&rec); - - /* Fast path: can append in place. 
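The flag handling in tdb_store() a little earlier has a simple caller-visible contract: TDB_INSERT refuses to overwrite, TDB_MODIFY refuses to create, and any other flag value replaces (in place when the new value fits in the old record's data plus padding, otherwise via replace_data()). A sketch of that contract, assuming a freshly created, writable context; the function name and key/value strings are invented for illustration:

#include <ccan/tdb2/tdb2.h>
#include <assert.h>
#include <string.h>

static void store_flags_demo(struct tdb_context *tdb)
{
    TDB_DATA key = tdb_mkdata("key", strlen("key"));
    TDB_DATA v1  = tdb_mkdata("first", strlen("first"));
    TDB_DATA v2  = tdb_mkdata("second", strlen("second"));

    /* TDB_MODIFY on a missing key fails with TDB_ERR_NOEXIST. */
    assert(tdb_store(tdb, key, v1, TDB_MODIFY) == TDB_ERR_NOEXIST);

    /* TDB_INSERT succeeds once, then fails with TDB_ERR_EXISTS. */
    assert(tdb_store(tdb, key, v1, TDB_INSERT) == TDB_SUCCESS);
    assert(tdb_store(tdb, key, v1, TDB_INSERT) == TDB_ERR_EXISTS);

    /* Neither TDB_INSERT nor TDB_MODIFY: plain replace.  The store is
     * done in place if the old record has enough room, otherwise a new
     * record is allocated and the old one is freed. */
    assert(tdb_store(tdb, key, v2, 0) == TDB_SUCCESS);
}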
*/ - if (rec_extra_padding(&rec) >= dbuf.dsize) { - ecode = update_rec_hdr(tdb, off, key.dsize, - old_dlen + dbuf.dsize, &rec, - h.h); - if (ecode != TDB_SUCCESS) { - goto out; - } - - off += sizeof(rec) + key.dsize + old_dlen; - ecode = update_data(tdb, off, dbuf, - rec_extra_padding(&rec)); - goto out; - } - - /* Slow path. */ - newdata = malloc(key.dsize + old_dlen + dbuf.dsize); - if (!newdata) { - ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "tdb_append:" - " failed to allocate %zu bytes", - (size_t)(key.dsize + old_dlen - + dbuf.dsize)); - goto out; - } - ecode = tdb->tdb2.io->tread(tdb, off + sizeof(rec) + key.dsize, - newdata, old_dlen); - if (ecode != TDB_SUCCESS) { - goto out_free_newdata; - } - memcpy(newdata + old_dlen, dbuf.dptr, dbuf.dsize); - new_dbuf.dptr = newdata; - new_dbuf.dsize = old_dlen + dbuf.dsize; - } else { - newdata = NULL; - new_dbuf = dbuf; - } - - /* If they're using tdb_append(), it implies they're growing record. */ - ecode = replace_data(tdb, &h, key, new_dbuf, off, old_room, true); - -out_free_newdata: - free(newdata); -out: - tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK); - return tdb->last_error = ecode; -} - -enum TDB_ERROR tdb_fetch(struct tdb_context *tdb, struct tdb_data key, - struct tdb_data *data) -{ - tdb_off_t off; - struct tdb_used_record rec; - struct hash_info h; - enum TDB_ERROR ecode; - - if (tdb->flags & TDB_VERSION1) - return tdb1_fetch(tdb, key, data); - - off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL); - if (TDB_OFF_IS_ERR(off)) { - return tdb->last_error = TDB_OFF_TO_ERR(off); - } - - if (!off) { - ecode = TDB_ERR_NOEXIST; - } else { - data->dsize = rec_data_length(&rec); - data->dptr = tdb_alloc_read(tdb, off + sizeof(rec) + key.dsize, - data->dsize); - if (TDB_PTR_IS_ERR(data->dptr)) { - ecode = TDB_PTR_ERR(data->dptr); - } else - ecode = TDB_SUCCESS; - } - - tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK); - return tdb->last_error = ecode; -} - -bool tdb_exists(struct tdb_context *tdb, TDB_DATA key) -{ - tdb_off_t off; - struct tdb_used_record rec; - struct hash_info h; - - if (tdb->flags & TDB_VERSION1) { - return tdb1_exists(tdb, key); - } - - off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL); - if (TDB_OFF_IS_ERR(off)) { - tdb->last_error = TDB_OFF_TO_ERR(off); - return false; - } - tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK); - - tdb->last_error = TDB_SUCCESS; - return off ? true : false; -} - -enum TDB_ERROR tdb_delete(struct tdb_context *tdb, struct tdb_data key) -{ - tdb_off_t off; - struct tdb_used_record rec; - struct hash_info h; - enum TDB_ERROR ecode; - - if (tdb->flags & TDB_VERSION1) { - if (tdb1_delete(tdb, key) == -1) - return tdb->last_error; - return TDB_SUCCESS; - } - - off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL); - if (TDB_OFF_IS_ERR(off)) { - return tdb->last_error = TDB_OFF_TO_ERR(off); - } - - if (!off) { - ecode = TDB_ERR_NOEXIST; - goto unlock; - } - - ecode = delete_from_hash(tdb, &h); - if (ecode != TDB_SUCCESS) { - goto unlock; - } - - /* Free the deleted entry. 
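A caller-level view of the routines just above (tdb_append, tdb_fetch, tdb_exists, tdb_delete), assuming an already-opened writable tdb; the key and payload strings are invented and error handling is abbreviated:

#include <ccan/tdb2/tdb2.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

static void lifecycle_demo(struct tdb_context *tdb)
{
    TDB_DATA key = tdb_mkdata("log", strlen("log"));
    TDB_DATA val;

    /* tdb_append() creates the record if absent, else concatenates:
     * the fast path reuses padding, the slow path rebuilds in memory. */
    tdb_append(tdb, key, tdb_mkdata("hello ", 6));
    tdb_append(tdb, key, tdb_mkdata("world", 5));

    if (tdb_fetch(tdb, key, &val) == TDB_SUCCESS) {
        /* val.dptr is malloc'ed; it now holds the 11 bytes "hello world". */
        free(val.dptr);
    }

    if (tdb_exists(tdb, key))
        tdb_delete(tdb, key);   /* the record's space returns to the free list */
}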
*/ - tdb->stats.frees++; - ecode = add_free_record(tdb, off, - sizeof(struct tdb_used_record) - + rec_key_length(&rec) - + rec_data_length(&rec) - + rec_extra_padding(&rec), - TDB_LOCK_WAIT, true); - - if (tdb->flags & TDB_SEQNUM) - tdb_inc_seqnum(tdb); - -unlock: - tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK); - return tdb->last_error = ecode; -} - -unsigned int tdb_get_flags(struct tdb_context *tdb) -{ - return tdb->flags; -} - -static bool inside_transaction(const struct tdb_context *tdb) -{ - if (tdb->flags & TDB_VERSION1) - return tdb->tdb1.transaction != NULL; - else - return tdb->tdb2.transaction != NULL; -} - -static bool readonly_changable(struct tdb_context *tdb, const char *caller) -{ - if (inside_transaction(tdb)) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "%s: can't change" - " TDB_RDONLY inside transaction", - caller); - return false; - } - return true; -} - -void tdb_add_flag(struct tdb_context *tdb, unsigned flag) -{ - if (tdb->flags & TDB_INTERNAL) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_add_flag: internal db"); - return; - } - switch (flag) { - case TDB_NOLOCK: - tdb->flags |= TDB_NOLOCK; - break; - case TDB_NOMMAP: - tdb->flags |= TDB_NOMMAP; - tdb_munmap(tdb->file); - break; - case TDB_NOSYNC: - tdb->flags |= TDB_NOSYNC; - break; - case TDB_SEQNUM: - tdb->flags |= TDB_SEQNUM; - break; - case TDB_ALLOW_NESTING: - tdb->flags |= TDB_ALLOW_NESTING; - break; - case TDB_RDONLY: - if (readonly_changable(tdb, "tdb_add_flag")) - tdb->flags |= TDB_RDONLY; - break; - default: - tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_add_flag: Unknown flag %u", - flag); - } -} - -void tdb_remove_flag(struct tdb_context *tdb, unsigned flag) -{ - if (tdb->flags & TDB_INTERNAL) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_remove_flag: internal db"); - return; - } - switch (flag) { - case TDB_NOLOCK: - tdb->flags &= ~TDB_NOLOCK; - break; - case TDB_NOMMAP: - tdb->flags &= ~TDB_NOMMAP; - tdb_mmap(tdb); - break; - case TDB_NOSYNC: - tdb->flags &= ~TDB_NOSYNC; - break; - case TDB_SEQNUM: - tdb->flags &= ~TDB_SEQNUM; - break; - case TDB_ALLOW_NESTING: - tdb->flags &= ~TDB_ALLOW_NESTING; - break; - case TDB_RDONLY: - if ((tdb->open_flags & O_ACCMODE) == O_RDONLY) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_remove_flag: can't" - " remove TDB_RDONLY on tdb" - " opened with O_RDONLY"); - break; - } - if (readonly_changable(tdb, "tdb_remove_flag")) - tdb->flags &= ~TDB_RDONLY; - break; - default: - tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_remove_flag: Unknown flag %u", - flag); - } -} - -const char *tdb_errorstr(enum TDB_ERROR ecode) -{ - /* Gcc warns if you miss a case in the switch, so use that. 
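One way the runtime flag switching in tdb_add_flag()/tdb_remove_flag() above tends to be used (an assumption about usage, not a rule from the library): drop syncing for a bulk load, then restore it. Whether that is acceptable depends entirely on the application's crash-consistency needs.

#include <ccan/tdb2/tdb2.h>
#include <stddef.h>

static void bulk_load(struct tdb_context *tdb,
                      const TDB_DATA *keys, const TDB_DATA *vals, size_t n)
{
    size_t i;

    tdb_add_flag(tdb, TDB_NOSYNC);          /* skip fsync while loading */
    for (i = 0; i < n; i++)
        tdb_store(tdb, keys[i], vals[i], 0);
    tdb_remove_flag(tdb, TDB_NOSYNC);       /* restore normal durability */
}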
*/ - switch (TDB_ERR_TO_OFF(ecode)) { - case TDB_ERR_TO_OFF(TDB_SUCCESS): return "Success"; - case TDB_ERR_TO_OFF(TDB_ERR_CORRUPT): return "Corrupt database"; - case TDB_ERR_TO_OFF(TDB_ERR_IO): return "IO Error"; - case TDB_ERR_TO_OFF(TDB_ERR_LOCK): return "Locking error"; - case TDB_ERR_TO_OFF(TDB_ERR_OOM): return "Out of memory"; - case TDB_ERR_TO_OFF(TDB_ERR_EXISTS): return "Record exists"; - case TDB_ERR_TO_OFF(TDB_ERR_EINVAL): return "Invalid parameter"; - case TDB_ERR_TO_OFF(TDB_ERR_NOEXIST): return "Record does not exist"; - case TDB_ERR_TO_OFF(TDB_ERR_RDONLY): return "write not permitted"; - } - return "Invalid error code"; -} - -enum TDB_ERROR tdb_error(struct tdb_context *tdb) -{ - return tdb->last_error; -} - -enum TDB_ERROR COLD tdb_logerr(struct tdb_context *tdb, - enum TDB_ERROR ecode, - enum tdb_log_level level, - const char *fmt, ...) -{ - char *message; - va_list ap; - size_t len; - /* tdb_open paths care about errno, so save it. */ - int saved_errno = errno; - - if (!tdb->log_fn) - return ecode; - - va_start(ap, fmt); - len = vasprintf(&message, fmt, ap); - va_end(ap); - - if (len < 0) { - tdb->log_fn(tdb, TDB_LOG_ERROR, TDB_ERR_OOM, - "out of memory formatting message:", tdb->log_data); - tdb->log_fn(tdb, level, ecode, fmt, tdb->log_data); - } else { - tdb->log_fn(tdb, level, ecode, message, tdb->log_data); - free(message); - } - errno = saved_errno; - return ecode; -} - -enum TDB_ERROR tdb_parse_record_(struct tdb_context *tdb, - TDB_DATA key, - enum TDB_ERROR (*parse)(TDB_DATA k, - TDB_DATA d, - void *data), - void *data) -{ - tdb_off_t off; - struct tdb_used_record rec; - struct hash_info h; - enum TDB_ERROR ecode; - - if (tdb->flags & TDB_VERSION1) { - return tdb->last_error = tdb1_parse_record(tdb, key, parse, - data); - } - - off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL); - if (TDB_OFF_IS_ERR(off)) { - return tdb->last_error = TDB_OFF_TO_ERR(off); - } - - if (!off) { - ecode = TDB_ERR_NOEXIST; - } else { - const void *dptr; - dptr = tdb_access_read(tdb, off + sizeof(rec) + key.dsize, - rec_data_length(&rec), false); - if (TDB_PTR_IS_ERR(dptr)) { - ecode = TDB_PTR_ERR(dptr); - } else { - TDB_DATA d = tdb_mkdata(dptr, rec_data_length(&rec)); - - ecode = parse(key, d, data); - tdb_access_release(tdb, dptr); - } - } - - tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK); - return tdb->last_error = ecode; -} - -const char *tdb_name(const struct tdb_context *tdb) -{ - return tdb->name; -} - -int64_t tdb_get_seqnum(struct tdb_context *tdb) -{ - tdb_off_t off; - - if (tdb->flags & TDB_VERSION1) { - tdb1_off_t val; - tdb->last_error = TDB_SUCCESS; - val = tdb1_get_seqnum(tdb); - - if (tdb->last_error != TDB_SUCCESS) - return TDB_ERR_TO_OFF(tdb->last_error); - else - return val; - } - - off = tdb_read_off(tdb, offsetof(struct tdb_header, seqnum)); - if (TDB_OFF_IS_ERR(off)) - tdb->last_error = TDB_OFF_TO_ERR(off); - else - tdb->last_error = TDB_SUCCESS; - return off; -} - - -int tdb_fd(const struct tdb_context *tdb) -{ - return tdb->file->fd; -} - -struct traverse_state { - enum TDB_ERROR error; - struct tdb_context *dest_db; -}; - -/* - traverse function for repacking - */ -static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, - struct traverse_state *state) -{ - state->error = tdb_store(state->dest_db, key, data, TDB_INSERT); - if (state->error != TDB_SUCCESS) { - return -1; - } - return 0; -} - -enum TDB_ERROR tdb_repack(struct tdb_context *tdb) -{ - struct tdb_context *tmp_db; - struct traverse_state state; - - state.error = 
tdb_transaction_start(tdb); - if (state.error != TDB_SUCCESS) { - return state.error; - } - - tmp_db = tdb_open("tmpdb", TDB_INTERNAL, O_RDWR|O_CREAT, 0, NULL); - if (tmp_db == NULL) { - state.error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - __location__ - " Failed to create tmp_db"); - tdb_transaction_cancel(tdb); - return tdb->last_error = state.error; - } - - state.dest_db = tmp_db; - if (tdb_traverse(tdb, repack_traverse, &state) < 0) { - goto fail; - } - - state.error = tdb_wipe_all(tdb); - if (state.error != TDB_SUCCESS) { - goto fail; - } - - state.dest_db = tdb; - if (tdb_traverse(tmp_db, repack_traverse, &state) < 0) { - goto fail; - } - - tdb_close(tmp_db); - return tdb_transaction_commit(tdb); - -fail: - tdb_transaction_cancel(tdb); - tdb_close(tmp_db); - return state.error; -} diff --git a/ccan/tdb2/tdb1_check.c b/ccan/tdb2/tdb1_check.c deleted file mode 100644 index 07ee0755..00000000 --- a/ccan/tdb2/tdb1_check.c +++ /dev/null @@ -1,478 +0,0 @@ - /* - Unix SMB/CIFS implementation. - - trivial database library - - Copyright (C) Rusty Russell 2009 - - ** NOTE! The following LGPL license applies to the tdb - ** library. This does NOT imply that all of Samba is released - ** under the LGPL - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ -#include "tdb1_private.h" - -/* Since we opened it, these shouldn't fail unless it's recent corruption. */ -static bool tdb1_check_header(struct tdb_context *tdb, tdb1_off_t *recovery) -{ - struct tdb1_header hdr; - uint32_t h1, h2; - - if (tdb->tdb1.io->tdb1_read(tdb, 0, &hdr, sizeof(hdr), 0) == -1) - return false; - if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) - goto corrupt; - - TDB1_CONV(hdr); - if (hdr.version != TDB1_VERSION) - goto corrupt; - - if (hdr.rwlocks != 0 && hdr.rwlocks != TDB1_HASH_RWLOCK_MAGIC) - goto corrupt; - - tdb1_header_hash(tdb, &h1, &h2); - if (hdr.magic1_hash && hdr.magic2_hash && - (hdr.magic1_hash != h1 || hdr.magic2_hash != h2)) - goto corrupt; - - if (hdr.hash_size == 0) - goto corrupt; - - if (hdr.hash_size != tdb->tdb1.header.hash_size) - goto corrupt; - - if (hdr.recovery_start != 0 && - hdr.recovery_start < TDB1_DATA_START(tdb->tdb1.header.hash_size)) - goto corrupt; - - *recovery = hdr.recovery_start; - return true; - -corrupt: - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "Header is corrupt\n"); - return false; -} - -/* Generic record header check. */ -static bool tdb1_check_record(struct tdb_context *tdb, - tdb1_off_t off, - const struct tdb1_record *rec) -{ - tdb1_off_t tailer; - - /* Check rec->next: 0 or points to record offset, aligned. 
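tdb_repack() above copies every live record into an internal temporary database, wipes the original, and copies them back inside one transaction. A typical occasion to call it (this is an assumption about usage, not library guidance) is after deleting a large fraction of records, to coalesce the resulting free space:

#include <ccan/tdb2/tdb2.h>
#include <stdio.h>

static void prune_and_compact(struct tdb_context *tdb,
                              const TDB_DATA *old_keys, size_t n)
{
    size_t i;
    enum TDB_ERROR err;

    for (i = 0; i < n; i++)
        tdb_delete(tdb, old_keys[i]);

    err = tdb_repack(tdb);      /* copy live records out and back in */
    if (err != TDB_SUCCESS)
        fprintf(stderr, "repack: %s\n", tdb_errorstr(err));
}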
*/ - if (rec->next > 0 && rec->next < TDB1_DATA_START(tdb->tdb1.header.hash_size)){ - tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "Record offset %d too small next %d\n", - off, rec->next); - goto corrupt; - } - if (rec->next + sizeof(*rec) < rec->next) { - tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "Record offset %d too large next %d\n", - off, rec->next); - goto corrupt; - } - if ((rec->next % TDB1_ALIGNMENT) != 0) { - tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "Record offset %d misaligned next %d\n", - off, rec->next); - goto corrupt; - } - if (tdb->tdb1.io->tdb1_oob(tdb, rec->next, sizeof(*rec), 0)) - goto corrupt; - - /* Check rec_len: similar to rec->next, implies next record. */ - if ((rec->rec_len % TDB1_ALIGNMENT) != 0) { - tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "Record offset %d misaligned length %d\n", - off, rec->rec_len); - goto corrupt; - } - /* Must fit tailer. */ - if (rec->rec_len < sizeof(tailer)) { - tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "Record offset %d too short length %d\n", - off, rec->rec_len); - goto corrupt; - } - /* OOB allows "right at the end" access, so this works for last rec. */ - if (tdb->tdb1.io->tdb1_oob(tdb, off, sizeof(*rec)+rec->rec_len, 0)) - goto corrupt; - - /* Check tailer. */ - if (tdb1_ofs_read(tdb, off+sizeof(*rec)+rec->rec_len-sizeof(tailer), - &tailer) == -1) - goto corrupt; - if (tailer != sizeof(*rec) + rec->rec_len) { - tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "Record offset %d invalid tailer\n", off); - goto corrupt; - } - - return true; - -corrupt: - tdb->last_error = TDB_ERR_CORRUPT; - return false; -} - -/* Grab some bytes: may copy if can't use mmap. - Caller has already done bounds check. */ -static TDB_DATA get_bytes(struct tdb_context *tdb, - tdb1_off_t off, tdb1_len_t len) -{ - TDB_DATA d; - - d.dsize = len; - - if (tdb->tdb1.transaction == NULL && tdb->file->map_ptr != NULL) - d.dptr = (unsigned char *)tdb->file->map_ptr + off; - else - d.dptr = tdb1_alloc_read(tdb, off, d.dsize); - return d; -} - -/* Frees data if we're not able to simply use mmap. */ -static void put_bytes(struct tdb_context *tdb, TDB_DATA d) -{ - if (tdb->tdb1.transaction == NULL && tdb->file->map_ptr != NULL) - return; - free(d.dptr); -} - -/* We use the excellent Jenkins lookup3 hash; this is based on hash_word2. - * See: http://burtleburtle.net/bob/c/lookup3.c - */ -#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k)))) -static void jhash(uint32_t key, uint32_t *pc, uint32_t *pb) -{ - uint32_t a,b,c; - - /* Set up the internal state */ - a = b = c = 0xdeadbeef + *pc; - c += *pb; - a += key; - c ^= b; c -= rot(b,14); - a ^= c; a -= rot(c,11); - b ^= a; b -= rot(a,25); - c ^= b; c -= rot(b,16); - a ^= c; a -= rot(c,4); - b ^= a; b -= rot(a,14); - c ^= b; c -= rot(b,24); - *pc=c; *pb=b; -} - -/* - We want to check that all free records are in the free list - (only once), and all free list entries are free records. Similarly - for each hash chain of used records. - - Doing that naively (without walking hash chains, since we want to be - linear) means keeping a list of records which have been seen in each - hash chain, and another of records pointed to (ie. next pointers - from records and the initial hash chain heads). These two lists - should be equal. This will take 8 bytes per record, and require - sorting at the end. - - So instead, we record each offset in a bitmap such a way that - recording it twice will cancel out. Since each offset should appear - exactly twice, the bitmap should be zero at the end. 
- - The approach was inspired by Bloom Filters (see Wikipedia). For - each value, we flip K bits in a bitmap of size N. The number of - distinct arrangements is: - - N! / (K! * (N-K)!) - - Of course, not all arrangements are actually distinct, but testing - shows this formula to be close enough. - - So, if K == 8 and N == 256, the probability of two things flipping the same - bits is 1 in 409,663,695,276,000. - - Given that ldb uses a hash size of 10000, using 32 bytes per hash chain - (320k) seems reasonable. -*/ -#define NUM_HASHES 8 -#define BITMAP_BITS 256 - -static void bit_flip(unsigned char bits[], unsigned int idx) -{ - bits[idx / CHAR_BIT] ^= (1 << (idx % CHAR_BIT)); -} - -/* We record offsets in a bitmap for the particular chain it should be in. */ -static void record_offset(unsigned char bits[], tdb1_off_t off) -{ - uint32_t h1 = off, h2 = 0; - unsigned int i; - - /* We get two good hash values out of jhash2, so we use both. Then - * we keep going to produce further hash values. */ - for (i = 0; i < NUM_HASHES / 2; i++) { - jhash(off, &h1, &h2); - bit_flip(bits, h1 % BITMAP_BITS); - bit_flip(bits, h2 % BITMAP_BITS); - h2++; - } -} - -/* Check that an in-use record is valid. */ -static bool tdb1_check_used_record(struct tdb_context *tdb, - tdb1_off_t off, - const struct tdb1_record *rec, - unsigned char **hashes, - enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA, - void *), - void *private_data) -{ - TDB_DATA key, data; - - if (!tdb1_check_record(tdb, off, rec)) - return false; - - /* key + data + tailer must fit in record */ - if (rec->key_len + rec->data_len + sizeof(tdb1_off_t) > rec->rec_len) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "Record offset %d too short for contents\n", off); - return false; - } - - key = get_bytes(tdb, off + sizeof(*rec), rec->key_len); - if (!key.dptr) - return false; - - if ((uint32_t)tdb_hash(tdb, key.dptr, key.dsize) != rec->full_hash) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "Record offset %d has incorrect hash\n", off); - goto fail_put_key; - } - - /* Mark this offset as a known value for this hash bucket. */ - record_offset(hashes[TDB1_BUCKET(rec->full_hash)+1], off); - /* And similarly if the next pointer is valid. */ - if (rec->next) - record_offset(hashes[TDB1_BUCKET(rec->full_hash)+1], rec->next); - - /* If they supply a check function and this record isn't dead, - get data and feed it. */ - if (check && rec->magic != TDB1_DEAD_MAGIC) { - enum TDB_ERROR ecode; - - data = get_bytes(tdb, off + sizeof(*rec) + rec->key_len, - rec->data_len); - if (!data.dptr) - goto fail_put_key; - - ecode = check(key, data, private_data); - if (ecode != TDB_SUCCESS) { - tdb->last_error = ecode; - goto fail_put_data; - } - put_bytes(tdb, data); - } - - put_bytes(tdb, key); - return true; - -fail_put_data: - put_bytes(tdb, data); -fail_put_key: - put_bytes(tdb, key); - return false; -} - -/* Check that an unused record is valid. */ -static bool tdb1_check_free_record(struct tdb_context *tdb, - tdb1_off_t off, - const struct tdb1_record *rec, - unsigned char **hashes) -{ - if (!tdb1_check_record(tdb, off, rec)) - return false; - - /* Mark this offset as a known value for the free list. */ - record_offset(hashes[0], off); - /* And similarly if the next pointer is valid. */ - if (rec->next) - record_offset(hashes[0], rec->next); - return true; -} - -/* Slow, but should be very rare. 
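The cancellation idea described and implemented above is easy to demonstrate in isolation: if every offset is "recorded" exactly twice, the same bits get flipped twice, so a bitmap that started at zero must end at zero. A toy version follows; the mixing function is a stand-in invented for the demo, not the jhash() used by the real check.

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_BITS   256
#define DEMO_HASHES 8

static void demo_flip(unsigned char bits[], unsigned int idx)
{
    bits[idx / CHAR_BIT] ^= (1 << (idx % CHAR_BIT));
}

static void demo_record(unsigned char bits[], uint32_t off)
{
    unsigned int i;
    uint32_t h = off * 2654435761u;         /* stand-in mix, not jhash */

    for (i = 0; i < DEMO_HASHES; i++) {
        demo_flip(bits, h % DEMO_BITS);
        h = h * 2654435761u + i;
    }
}

int main(void)
{
    unsigned char bits[DEMO_BITS / CHAR_BIT] = { 0 };
    uint32_t offs[] = { 424, 1096, 2048 };
    unsigned int i;

    /* Each offset recorded exactly twice: once when its record is read,
     * once when something (a chain head or next pointer) refers to it. */
    for (i = 0; i < 3; i++) {
        demo_record(bits, offs[i]);
        demo_record(bits, offs[i]);
    }

    for (i = 0; i < sizeof(bits); i++) {
        if (bits[i]) {
            printf("bitmap not clear: something appeared an odd number of times\n");
            return 1;
        }
    }
    printf("bitmap is all zero, as expected\n");
    return 0;
}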
*/ -size_t tdb1_dead_space(struct tdb_context *tdb, tdb1_off_t off) -{ - size_t len; - - for (len = 0; off + len < tdb->file->map_size; len++) { - char c; - if (tdb->tdb1.io->tdb1_read(tdb, off, &c, 1, 0)) - return 0; - if (c != 0 && c != 0x42) - break; - } - return len; -} - -int tdb1_check(struct tdb_context *tdb, - enum TDB_ERROR (*check)(TDB_DATA key, TDB_DATA data, void *), - void *private_data) -{ - unsigned int h; - unsigned char **hashes; - tdb1_off_t off, recovery_start; - struct tdb1_record rec; - bool found_recovery = false; - tdb1_len_t dead; - bool locked; - size_t alloc_len; - - /* We may have a write lock already, so don't re-lock. */ - if (tdb->file->allrecord_lock.count != 0) { - locked = false; - } else { - if (tdb_lockall_read(tdb) != TDB_SUCCESS) - return -1; - locked = true; - } - - /* Make sure we know true size of the underlying file. */ - tdb->tdb1.io->tdb1_oob(tdb, tdb->file->map_size, 1, 1); - - /* Header must be OK: also gets us the recovery ptr, if any. */ - if (!tdb1_check_header(tdb, &recovery_start)) - goto unlock; - - /* We should have the whole header, too. */ - if (tdb->file->map_size < TDB1_DATA_START(tdb->tdb1.header.hash_size)) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "File too short for hashes\n"); - goto unlock; - } - - /* One big malloc: pointers then bit arrays. */ - alloc_len = sizeof(hashes[0]) * (1+tdb->tdb1.header.hash_size) - + BITMAP_BITS / CHAR_BIT * (1+tdb->tdb1.header.hash_size); - hashes = (unsigned char **)calloc(1, alloc_len); - if (!hashes) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "tdb_check: could not allocate %zu", - alloc_len); - goto unlock; - } - - /* Initialize pointers */ - hashes[0] = (unsigned char *)(&hashes[1+tdb->tdb1.header.hash_size]); - for (h = 1; h < 1+tdb->tdb1.header.hash_size; h++) - hashes[h] = hashes[h-1] + BITMAP_BITS / CHAR_BIT; - - /* Freelist and hash headers are all in a row: read them. */ - for (h = 0; h < 1+tdb->tdb1.header.hash_size; h++) { - if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP + h*sizeof(tdb1_off_t), - &off) == -1) - goto free; - if (off) - record_offset(hashes[h], off); - } - - /* For each record, read it in and check it's ok. */ - for (off = TDB1_DATA_START(tdb->tdb1.header.hash_size); - off < tdb->file->map_size; - off += sizeof(rec) + rec.rec_len) { - if (tdb->tdb1.io->tdb1_read(tdb, off, &rec, sizeof(rec), - TDB1_DOCONV()) == -1) - goto free; - switch (rec.magic) { - case TDB1_MAGIC: - case TDB1_DEAD_MAGIC: - if (!tdb1_check_used_record(tdb, off, &rec, hashes, - check, private_data)) - goto free; - break; - case TDB1_FREE_MAGIC: - if (!tdb1_check_free_record(tdb, off, &rec, hashes)) - goto free; - break; - /* If we crash after ftruncate, we can get zeroes or fill. 
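tdb1_check() above sizes one calloc to hold both the hashes[] pointer table and the bit arrays those pointers refer to, then fixes up the pointers by hand, so a single free() releases everything. A minimal sketch of that idiom with invented names and illustrative sizes:

#include <limits.h>
#include <stdlib.h>

static unsigned char **alloc_bitmaps(unsigned int nchains, size_t bitmap_bytes)
{
    unsigned char **ptrs;
    size_t alloc_len = sizeof(ptrs[0]) * nchains + bitmap_bytes * nchains;
    unsigned int i;

    ptrs = calloc(1, alloc_len);
    if (!ptrs)
        return NULL;

    /* The bit arrays live immediately after the pointer table. */
    ptrs[0] = (unsigned char *)&ptrs[nchains];
    for (i = 1; i < nchains; i++)
        ptrs[i] = ptrs[i - 1] + bitmap_bytes;

    return ptrs;        /* one free(ptrs) releases pointers and bitmaps */
}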
*/ - case TDB1_RECOVERY_INVALID_MAGIC: - case 0x42424242: - if (recovery_start == off) { - found_recovery = true; - break; - } - dead = tdb1_dead_space(tdb, off); - if (dead < sizeof(rec)) - goto corrupt; - - tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING, - "Dead space at %d-%d (of %u)\n", - off, off + dead, tdb->file->map_size); - rec.rec_len = dead - sizeof(rec); - break; - case TDB1_RECOVERY_MAGIC: - if (recovery_start != off) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "Unexpected recovery record at offset %d\n", - off); - goto free; - } - found_recovery = true; - break; - default: ; - corrupt: - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "Bad magic 0x%x at offset %d\n", - rec.magic, off); - goto free; - } - } - - /* Now, hashes should all be empty: each record exists and is referred - * to by one other. */ - for (h = 0; h < 1+tdb->tdb1.header.hash_size; h++) { - unsigned int i; - for (i = 0; i < BITMAP_BITS / CHAR_BIT; i++) { - if (hashes[h][i] != 0) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "Hashes do not match records\n"); - goto free; - } - } - } - - /* We must have found recovery area if there was one. */ - if (recovery_start != 0 && !found_recovery) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "Expected a recovery area at %u\n", - recovery_start); - goto free; - } - - free(hashes); - if (locked) { - tdb_unlockall_read(tdb); - } - return 0; - -free: - free(hashes); -unlock: - if (locked) { - tdb_unlockall_read(tdb); - } - return -1; -} diff --git a/ccan/tdb2/tdb1_freelist.c b/ccan/tdb2/tdb1_freelist.c deleted file mode 100644 index ea368ec4..00000000 --- a/ccan/tdb2/tdb1_freelist.c +++ /dev/null @@ -1,322 +0,0 @@ - /* - Unix SMB/CIFS implementation. - - trivial database library - - Copyright (C) Andrew Tridgell 1999-2005 - Copyright (C) Paul `Rusty' Russell 2000 - Copyright (C) Jeremy Allison 2000-2003 - - ** NOTE! The following LGPL license applies to the tdb - ** library. This does NOT imply that all of Samba is released - ** under the LGPL - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . 
-*/ - -#include "tdb1_private.h" - -/* read a freelist record and check for simple errors */ -int tdb1_rec_free_read(struct tdb_context *tdb, tdb1_off_t off, struct tdb1_record *rec) -{ - if (tdb->tdb1.io->tdb1_read(tdb, off, rec, sizeof(*rec),TDB1_DOCONV()) == -1) - return -1; - - if (rec->magic == TDB1_MAGIC) { - /* this happens when a app is showdown while deleting a record - we should - not completely fail when this happens */ - tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_WARNING, - "tdb1_rec_free_read non-free magic 0x%x at offset=%d - fixing\n", - rec->magic, off); - rec->magic = TDB1_FREE_MAGIC; - if (tdb->tdb1.io->tdb1_write(tdb, off, rec, sizeof(*rec)) == -1) - return -1; - } - - if (rec->magic != TDB1_FREE_MAGIC) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb1_rec_free_read bad magic 0x%x at offset=%d\n", - rec->magic, off); - return -1; - } - if (tdb->tdb1.io->tdb1_oob(tdb, rec->next, sizeof(*rec), 0) != 0) - return -1; - return 0; -} - - -/* update a record tailer (must hold allocation lock) */ -static int update_tailer(struct tdb_context *tdb, tdb1_off_t offset, - const struct tdb1_record *rec) -{ - tdb1_off_t totalsize; - - /* Offset of tailer from record header */ - totalsize = sizeof(*rec) + rec->rec_len; - return tdb1_ofs_write(tdb, offset + totalsize - sizeof(tdb1_off_t), - &totalsize); -} - -/* Add an element into the freelist. Merge adjacent records if - necessary. */ -int tdb1_free(struct tdb_context *tdb, tdb1_off_t offset, struct tdb1_record *rec) -{ - /* Allocation and tailer lock */ - if (tdb1_lock(tdb, -1, F_WRLCK) != 0) - return -1; - - /* set an initial tailer, so if we fail we don't leave a bogus record */ - if (update_tailer(tdb, offset, rec) != 0) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb_free: update_tailer failed!\n"); - goto fail; - } - - tdb->stats.alloc_coalesce_tried++; - /* Look left */ - if (offset - sizeof(tdb1_off_t) > TDB1_DATA_START(tdb->tdb1.header.hash_size)) { - tdb1_off_t left = offset - sizeof(tdb1_off_t); - struct tdb1_record l; - tdb1_off_t leftsize; - - /* Read in tailer and jump back to header */ - if (tdb1_ofs_read(tdb, left, &leftsize) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_free: left offset read failed at %u", left); - goto update; - } - - /* it could be uninitialised data */ - if (leftsize == 0 || leftsize == TDB1_PAD_U32) { - goto update; - } - - left = offset - leftsize; - - if (leftsize > offset || - left < TDB1_DATA_START(tdb->tdb1.header.hash_size)) { - goto update; - } - - /* Now read in the left record */ - if (tdb->tdb1.io->tdb1_read(tdb, left, &l, sizeof(l), TDB1_DOCONV()) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_free: left read failed at %u (%u)", left, leftsize); - goto update; - } - - /* If it's free, expand to include it. */ - if (l.magic == TDB1_FREE_MAGIC) { - /* we now merge the new record into the left record, rather than the other - way around. This makes the operation O(1) instead of O(n). 
This change - prevents traverse from being O(n^2) after a lot of deletes */ - l.rec_len += sizeof(*rec) + rec->rec_len; - if (tdb1_rec_write(tdb, left, &l) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_free: update_left failed at %u", left); - goto fail; - } - if (update_tailer(tdb, left, &l) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_free: update_tailer failed at %u", offset); - goto fail; - } - tdb->stats.alloc_coalesce_succeeded++; - tdb->stats.alloc_coalesce_num_merged++; - tdb->stats.frees++; - tdb1_unlock(tdb, -1, F_WRLCK); - return 0; - } - } - -update: - - /* Now, prepend to free list */ - rec->magic = TDB1_FREE_MAGIC; - - if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP, &rec->next) == -1 || - tdb1_rec_write(tdb, offset, rec) == -1 || - tdb1_ofs_write(tdb, TDB1_FREELIST_TOP, &offset) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_free record write failed at offset=%d", - offset); - goto fail; - } - - /* And we're done. */ - tdb->stats.frees++; - tdb1_unlock(tdb, -1, F_WRLCK); - return 0; - - fail: - tdb1_unlock(tdb, -1, F_WRLCK); - return -1; -} - - - -/* - the core of tdb1_allocate - called when we have decided which - free list entry to use - - Note that we try to allocate by grabbing data from the end of an existing record, - not the beginning. This is so the left merge in a free is more likely to be - able to free up the record without fragmentation - */ -static tdb1_off_t tdb1_allocate_ofs(struct tdb_context *tdb, - tdb1_len_t length, tdb1_off_t rec_ptr, - struct tdb1_record *rec, tdb1_off_t last_ptr) -{ -#define MIN_REC_SIZE (sizeof(struct tdb1_record) + sizeof(tdb1_off_t) + 8) - - if (rec->rec_len < length + MIN_REC_SIZE) { - /* we have to grab the whole record */ - - /* unlink it from the previous record */ - if (tdb1_ofs_write(tdb, last_ptr, &rec->next) == -1) { - return 0; - } - - /* mark it not free */ - rec->magic = TDB1_MAGIC; - if (tdb1_rec_write(tdb, rec_ptr, rec) == -1) { - return 0; - } - tdb->stats.allocs++; - return rec_ptr; - } - - /* we're going to just shorten the existing record */ - rec->rec_len -= (length + sizeof(*rec)); - if (tdb1_rec_write(tdb, rec_ptr, rec) == -1) { - return 0; - } - if (update_tailer(tdb, rec_ptr, rec) == -1) { - return 0; - } - - /* and setup the new record */ - rec_ptr += sizeof(*rec) + rec->rec_len; - - memset(rec, '\0', sizeof(*rec)); - rec->rec_len = length; - rec->magic = TDB1_MAGIC; - - if (tdb1_rec_write(tdb, rec_ptr, rec) == -1) { - return 0; - } - - if (update_tailer(tdb, rec_ptr, rec) == -1) { - return 0; - } - - tdb->stats.allocs++; - tdb->stats.alloc_leftover++; - return rec_ptr; -} - -/* allocate some space from the free list. 
The offset returned points - to a unconnected tdb1_record within the database with room for at - least length bytes of total data - - 0 is returned if the space could not be allocated - */ -tdb1_off_t tdb1_allocate(struct tdb_context *tdb, tdb1_len_t length, struct tdb1_record *rec) -{ - tdb1_off_t rec_ptr, last_ptr, newrec_ptr; - struct { - tdb1_off_t rec_ptr, last_ptr; - tdb1_len_t rec_len; - } bestfit; - float multiplier = 1.0; - - if (tdb1_lock(tdb, -1, F_WRLCK) == -1) - return 0; - - /* over-allocate to reduce fragmentation */ - length *= 1.25; - - /* Extra bytes required for tailer */ - length += sizeof(tdb1_off_t); - length = TDB1_ALIGN(length, TDB1_ALIGNMENT); - - again: - last_ptr = TDB1_FREELIST_TOP; - - /* read in the freelist top */ - if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP, &rec_ptr) == -1) - goto fail; - - bestfit.rec_ptr = 0; - bestfit.last_ptr = 0; - bestfit.rec_len = 0; - - /* - this is a best fit allocation strategy. Originally we used - a first fit strategy, but it suffered from massive fragmentation - issues when faced with a slowly increasing record size. - */ - while (rec_ptr) { - if (tdb1_rec_free_read(tdb, rec_ptr, rec) == -1) { - goto fail; - } - - if (rec->rec_len >= length) { - if (bestfit.rec_ptr == 0 || - rec->rec_len < bestfit.rec_len) { - bestfit.rec_len = rec->rec_len; - bestfit.rec_ptr = rec_ptr; - bestfit.last_ptr = last_ptr; - } - } - - /* move to the next record */ - last_ptr = rec_ptr; - rec_ptr = rec->next; - - /* if we've found a record that is big enough, then - stop searching if its also not too big. The - definition of 'too big' changes as we scan - through */ - if (bestfit.rec_len > 0 && - bestfit.rec_len < length * multiplier) { - break; - } - - /* this multiplier means we only extremely rarely - search more than 50 or so records. At 50 records we - accept records up to 11 times larger than what we - want */ - multiplier *= 1.05; - } - - if (bestfit.rec_ptr != 0) { - if (tdb1_rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) { - goto fail; - } - - newrec_ptr = tdb1_allocate_ofs(tdb, length, bestfit.rec_ptr, - rec, bestfit.last_ptr); - tdb1_unlock(tdb, -1, F_WRLCK); - return newrec_ptr; - } - - /* we didn't find enough space. See if we can expand the - database and if we can then try again */ - if (tdb1_expand(tdb, length + sizeof(*rec)) == 0) - goto again; - fail: - tdb1_unlock(tdb, -1, F_WRLCK); - return 0; -} diff --git a/ccan/tdb2/tdb1_hash.c b/ccan/tdb2/tdb1_hash.c deleted file mode 100644 index 2d5e4961..00000000 --- a/ccan/tdb2/tdb1_hash.c +++ /dev/null @@ -1,347 +0,0 @@ - /* - Unix SMB/CIFS implementation. - - trivial database library - - Copyright (C) Rusty Russell 2010 - - ** NOTE! The following LGPL license applies to the tdb - ** library. This does NOT imply that all of Samba is released - ** under the LGPL - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . 
-*/ -#include "tdb1_private.h" - -/* This is based on the hash algorithm from gdbm */ -uint64_t tdb1_old_hash(const void *key, size_t len, uint64_t seed, void *unused) -{ - uint32_t value; /* Used to compute the hash value. */ - uint32_t i; /* Used to cycle through random values. */ - const unsigned char *dptr = key; - - /* Set the initial value from the key size. */ - for (value = 0x238F13AF * len, i=0; i < len; i++) - value = (value + (dptr[i] << (i*5 % 24))); - - return (1103515243 * value + 12345); -} - -#ifndef WORDS_BIGENDIAN -# define HASH_LITTLE_ENDIAN 1 -# define HASH_BIG_ENDIAN 0 -#else -# define HASH_LITTLE_ENDIAN 0 -# define HASH_BIG_ENDIAN 1 -#endif - -/* -------------------------------------------------------------------------------- -lookup3.c, by Bob Jenkins, May 2006, Public Domain. - -These are functions for producing 32-bit hashes for hash table lookup. -hash_word(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() -are externally useful functions. Routines to test the hash are included -if SELF_TEST is defined. You can use this free for any purpose. It's in -the public domain. It has no warranty. - -You probably want to use hashlittle(). hashlittle() and hashbig() -hash byte arrays. hashlittle() is is faster than hashbig() on -little-endian machines. Intel and AMD are little-endian machines. -On second thought, you probably want hashlittle2(), which is identical to -hashlittle() except it returns two 32-bit hashes for the price of one. -You could implement hashbig2() if you wanted but I haven't bothered here. - -If you want to find a hash of, say, exactly 7 integers, do - a = i1; b = i2; c = i3; - mix(a,b,c); - a += i4; b += i5; c += i6; - mix(a,b,c); - a += i7; - final(a,b,c); -then use c as the hash value. If you have a variable length array of -4-byte integers to hash, use hash_word(). If you have a byte array (like -a character string), use hashlittle(). If you have several byte arrays, or -a mix of things, see the comments above hashlittle(). - -Why is this so big? I read 12 bytes at a time into 3 4-byte integers, -then mix those integers. This is fast (you can do a lot more thorough -mixing with 12*3 instructions on 3 integers than you can with 3 instructions -on 1 byte), but shoehorning those bytes into integers efficiently is messy. -*/ - -#define hashsize(n) ((uint32_t)1<<(n)) -#define hashmask(n) (hashsize(n)-1) -#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k)))) - -/* -------------------------------------------------------------------------------- -mix -- mix 3 32-bit values reversibly. - -This is reversible, so any information in (a,b,c) before mix() is -still in (a,b,c) after mix(). - -If four pairs of (a,b,c) inputs are run through mix(), or through -mix() in reverse, there are at least 32 bits of the output that -are sometimes the same for one pair and different for another pair. -This was tested for: -* pairs that differed by one bit, by two bits, in any combination - of top bits of (a,b,c), or in any combination of bottom bits of - (a,b,c). -* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed - the output delta to a Gray code (a^(a>>1)) so a string of 1's (as - is commonly produced by subtraction) look like a single 1-bit - difference. -* the base values were pseudorandom, all zero but one bit set, or - all zero plus a counter that starts at zero. 
- -Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that -satisfy this are - 4 6 8 16 19 4 - 9 15 3 18 27 15 - 14 9 3 7 17 3 -Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing -for "differ" defined as + with a one-bit base and a two-bit delta. I -used http://burtleburtle.net/bob/hash/avalanche.html to choose -the operations, constants, and arrangements of the variables. - -This does not achieve avalanche. There are input bits of (a,b,c) -that fail to affect some output bits of (a,b,c), especially of a. The -most thoroughly mixed value is c, but it doesn't really even achieve -avalanche in c. - -This allows some parallelism. Read-after-writes are good at doubling -the number of bits affected, so the goal of mixing pulls in the opposite -direction as the goal of parallelism. I did what I could. Rotates -seem to cost as much as shifts on every machine I could lay my hands -on, and rotates are much kinder to the top and bottom bits, so I used -rotates. -------------------------------------------------------------------------------- -*/ -#define mix(a,b,c) \ -{ \ - a -= c; a ^= rot(c, 4); c += b; \ - b -= a; b ^= rot(a, 6); a += c; \ - c -= b; c ^= rot(b, 8); b += a; \ - a -= c; a ^= rot(c,16); c += b; \ - b -= a; b ^= rot(a,19); a += c; \ - c -= b; c ^= rot(b, 4); b += a; \ -} - -/* -------------------------------------------------------------------------------- -final -- final mixing of 3 32-bit values (a,b,c) into c - -Pairs of (a,b,c) values differing in only a few bits will usually -produce values of c that look totally different. This was tested for -* pairs that differed by one bit, by two bits, in any combination - of top bits of (a,b,c), or in any combination of bottom bits of - (a,b,c). -* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed - the output delta to a Gray code (a^(a>>1)) so a string of 1's (as - is commonly produced by subtraction) look like a single 1-bit - difference. -* the base values were pseudorandom, all zero but one bit set, or - all zero plus a counter that starts at zero. - -These constants passed: - 14 11 25 16 4 14 24 - 12 14 25 16 4 14 24 -and these came close: - 4 8 15 26 3 22 24 - 10 8 15 26 3 22 24 - 11 8 15 26 3 22 24 -------------------------------------------------------------------------------- -*/ -#define final(a,b,c) \ -{ \ - c ^= b; c -= rot(b,14); \ - a ^= c; a -= rot(c,11); \ - b ^= a; b -= rot(a,25); \ - c ^= b; c -= rot(b,16); \ - a ^= c; a -= rot(c,4); \ - b ^= a; b -= rot(a,14); \ - c ^= b; c -= rot(b,24); \ -} - - -/* -------------------------------------------------------------------------------- -hashlittle() -- hash a variable-length key into a 32-bit value - k : the key (the unaligned variable-length array of bytes) - length : the length of the key, counting by bytes - val2 : IN: can be any 4-byte value OUT: second 32 bit hash. -Returns a 32-bit value. Every bit of the key affects every bit of -the return value. Two keys differing by one or two bits will have -totally different hash values. Note that the return value is better -mixed than val2, so use that first. - -The best hash table sizes are powers of 2. There is no need to do -mod a prime (mod is sooo slow!). If you need less than 32 bits, -use a bitmask. For example, if you need only 10 bits, do - h = (h & hashmask(10)); -In which case, the hash table should have hashsize(10) elements. 
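The bucket-selection advice in the comment just above (power-of-two table sizes, mask instead of modulo) is easy to check in isolation; a tiny sketch with demo copies of the hashsize/hashmask macros:

#include <stdint.h>
#include <assert.h>

#define demo_hashsize(n) ((uint32_t)1 << (n))
#define demo_hashmask(n) (demo_hashsize(n) - 1)

int main(void)
{
    uint32_t h = 0xdeadbeef;

    /* A 10-bit table has 1024 slots; masking keeps the low 10 bits,
     * which equals h % 1024 because 1024 is a power of two. */
    assert(demo_hashsize(10) == 1024);
    assert((h & demo_hashmask(10)) == (h % demo_hashsize(10)));
    return 0;
}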
- -If you are hashing n strings (uint8_t **)k, do it like this: - for (i=0, h=0; i 12) - { - a += k[0]; - b += k[1]; - c += k[2]; - mix(a,b,c); - length -= 12; - k += 3; - } - - /*----------------------------- handle the last (probably partial) block */ - k8 = (const uint8_t *)k; - switch(length) - { - case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; - case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ - case 10: c+=((uint32_t)k8[9])<<8; /* fall through */ - case 9 : c+=k8[8]; /* fall through */ - case 8 : b+=k[1]; a+=k[0]; break; - case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ - case 6 : b+=((uint32_t)k8[5])<<8; /* fall through */ - case 5 : b+=k8[4]; /* fall through */ - case 4 : a+=k[0]; break; - case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ - case 2 : a+=((uint32_t)k8[1])<<8; /* fall through */ - case 1 : a+=k8[0]; break; - case 0 : return c; - } - } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) { - const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */ - const uint8_t *k8; - - /*--------------- all but last block: aligned reads and different mixing */ - while (length > 12) - { - a += k[0] + (((uint32_t)k[1])<<16); - b += k[2] + (((uint32_t)k[3])<<16); - c += k[4] + (((uint32_t)k[5])<<16); - mix(a,b,c); - length -= 12; - k += 6; - } - - /*----------------------------- handle the last (probably partial) block */ - k8 = (const uint8_t *)k; - switch(length) - { - case 12: c+=k[4]+(((uint32_t)k[5])<<16); - b+=k[2]+(((uint32_t)k[3])<<16); - a+=k[0]+(((uint32_t)k[1])<<16); - break; - case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ - case 10: c+=k[4]; - b+=k[2]+(((uint32_t)k[3])<<16); - a+=k[0]+(((uint32_t)k[1])<<16); - break; - case 9 : c+=k8[8]; /* fall through */ - case 8 : b+=k[2]+(((uint32_t)k[3])<<16); - a+=k[0]+(((uint32_t)k[1])<<16); - break; - case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ - case 6 : b+=k[2]; - a+=k[0]+(((uint32_t)k[1])<<16); - break; - case 5 : b+=k8[4]; /* fall through */ - case 4 : a+=k[0]+(((uint32_t)k[1])<<16); - break; - case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ - case 2 : a+=k[0]; - break; - case 1 : a+=k8[0]; - break; - case 0 : return c; /* zero length requires no mixing */ - } - - } else { /* need to read the key one byte at a time */ - const uint8_t *k = (const uint8_t *)key; - - /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ - while (length > 12) - { - a += k[0]; - a += ((uint32_t)k[1])<<8; - a += ((uint32_t)k[2])<<16; - a += ((uint32_t)k[3])<<24; - b += k[4]; - b += ((uint32_t)k[5])<<8; - b += ((uint32_t)k[6])<<16; - b += ((uint32_t)k[7])<<24; - c += k[8]; - c += ((uint32_t)k[9])<<8; - c += ((uint32_t)k[10])<<16; - c += ((uint32_t)k[11])<<24; - mix(a,b,c); - length -= 12; - k += 12; - } - - /*-------------------------------- last block: affect all 32 bits of (c) */ - switch(length) /* all the case statements fall through */ - { - case 12: c+=((uint32_t)k[11])<<24; - case 11: c+=((uint32_t)k[10])<<16; - case 10: c+=((uint32_t)k[9])<<8; - case 9 : c+=k[8]; - case 8 : b+=((uint32_t)k[7])<<24; - case 7 : b+=((uint32_t)k[6])<<16; - case 6 : b+=((uint32_t)k[5])<<8; - case 5 : b+=k[4]; - case 4 : a+=((uint32_t)k[3])<<24; - case 3 : a+=((uint32_t)k[2])<<16; - case 2 : a+=((uint32_t)k[1])<<8; - case 1 : a+=k[0]; - break; - case 0 : return c; - } - } - - final(a,b,c); - return c; -} - -uint64_t tdb1_incompatible_hash(const void *key, size_t len, uint64_t seed, - void *unused) -{ - return hashlittle(key, len); -} diff --git a/ccan/tdb2/tdb1_io.c b/ccan/tdb2/tdb1_io.c deleted file 
mode 100644 index 488f3d84..00000000 --- a/ccan/tdb2/tdb1_io.c +++ /dev/null @@ -1,543 +0,0 @@ - /* - Unix SMB/CIFS implementation. - - trivial database library - - Copyright (C) Andrew Tridgell 1999-2005 - Copyright (C) Paul `Rusty' Russell 2000 - Copyright (C) Jeremy Allison 2000-2003 - - ** NOTE! The following LGPL license applies to the tdb - ** library. This does NOT imply that all of Samba is released - ** under the LGPL - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ - - -#include "tdb1_private.h" -#ifndef MAX -#define MAX(a,b) ((a) > (b) ? (a) : (b)) -#endif - -/* check for an out of bounds access - if it is out of bounds then - see if the database has been expanded by someone else and expand - if necessary - note that "len" is the minimum length needed for the db -*/ -static int tdb1_oob(struct tdb_context *tdb, tdb1_off_t off, tdb1_len_t len, - int probe) -{ - struct stat st; - if (len + off < len) { - if (!probe) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb1_oob off %d len %d wrap\n", - (int)off, (int)len); - } - return -1; - } - - if (off + len <= tdb->file->map_size) - return 0; - if (tdb->flags & TDB_INTERNAL) { - if (!probe) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb1_oob len %d beyond internal malloc size %u", - (int)(off + len), (int)tdb->file->map_size); - } - return -1; - } - - if (fstat(tdb->file->fd, &st) == -1) { - tdb->last_error = TDB_ERR_IO; - return -1; - } - - if (st.st_size < (size_t)off + len) { - if (!probe) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb1_oob len %u beyond eof at %u", - (int)(off + len), (int)st.st_size); - } - return -1; - } - - /* Beware >4G files! */ - if ((tdb1_off_t)st.st_size != st.st_size) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb1_oob len %llu too large!\n", - (long long)st.st_size); - return -1; - } - - /* Unmap, update size, remap */ - if (tdb1_munmap(tdb) == -1) { - tdb->last_error = TDB_ERR_IO; - return -1; - } - tdb->file->map_size = st.st_size; - tdb1_mmap(tdb); - return 0; -} - -/* write a lump of data at a specified offset */ -static int tdb1_write(struct tdb_context *tdb, tdb1_off_t off, - const void *buf, tdb1_len_t len) -{ - if (len == 0) { - return 0; - } - - if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) { - tdb->last_error = TDB_ERR_RDONLY; - return -1; - } - - if (tdb->tdb1.io->tdb1_oob(tdb, off, len, 0) != 0) - return -1; - - if (tdb->file->map_ptr) { - memcpy(off + (char *)tdb->file->map_ptr, buf, len); - } else { - ssize_t written = pwrite(tdb->file->fd, buf, len, off); - if ((written != (ssize_t)len) && (written != -1)) { - tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_WARNING, - "tdb1_write: wrote only " - "%d of %d bytes at %d, trying once more", - (int)written, len, off); - written = pwrite(tdb->file->fd, - (const char *)buf+written, - len-written, - off+written); - } - if (written == -1) { - /* Ensure ecode is set for log fn. 
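tdb1_write() above handles the case where pwrite() writes fewer bytes than asked by retrying once from where it stopped. The fully general form of that pattern is a write-all loop; this sketch is not the library's code, just the common idiom it approximates:

#include <unistd.h>
#include <errno.h>
#include <sys/types.h>

static int pwrite_all(int fd, const void *buf, size_t len, off_t off)
{
    const char *p = buf;

    while (len) {
        ssize_t w = pwrite(fd, p, len, off);
        if (w < 0) {
            if (errno == EINTR)
                continue;       /* interrupted: retry the same chunk */
            return -1;          /* real error: caller inspects errno */
        }
        if (w == 0)
            return -1;          /* no progress: avoid spinning forever */
        p += w;
        off += w;
        len -= w;
    }
    return 0;
}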
*/ - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb1_write failed at %d " - "len=%d (%s)", - off, len, strerror(errno)); - return -1; - } else if (written != (ssize_t)len) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb1_write: failed to " - "write %d bytes at %d in two attempts", - len, off); - return -1; - } - } - return 0; -} - -/* Endian conversion: we only ever deal with 4 byte quantities */ -void *tdb1_convert(void *buf, uint32_t size) -{ - uint32_t i, *p = (uint32_t *)buf; - for (i = 0; i < size / 4; i++) - p[i] = TDB1_BYTEREV(p[i]); - return buf; -} - - -/* read a lump of data at a specified offset, maybe convert */ -static int tdb1_read(struct tdb_context *tdb, tdb1_off_t off, void *buf, - tdb1_len_t len, int cv) -{ - if (tdb->tdb1.io->tdb1_oob(tdb, off, len, 0) != 0) { - return -1; - } - - if (tdb->file->map_ptr) { - memcpy(buf, off + (char *)tdb->file->map_ptr, len); - } else { - ssize_t ret = pread(tdb->file->fd, buf, len, off); - if (ret != (ssize_t)len) { - /* Ensure ecode is set for log fn. */ - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb1_read failed at %d " - "len=%d ret=%d (%s) map_size=%d", - (int)off, (int)len, (int)ret, - strerror(errno), - (int)tdb->file->map_size); - return -1; - } - } - if (cv) { - tdb1_convert(buf, len); - } - return 0; -} - - - -/* - do an unlocked scan of the hash table heads to find the next non-zero head. The value - will then be confirmed with the lock held -*/ -static void tdb1_next_hash_chain(struct tdb_context *tdb, uint32_t *chain) -{ - uint32_t h = *chain; - if (tdb->file->map_ptr) { - for (;h < tdb->tdb1.header.hash_size;h++) { - if (0 != *(uint32_t *)(TDB1_HASH_TOP(h) + (unsigned char *)tdb->file->map_ptr)) { - break; - } - } - } else { - uint32_t off=0; - for (;h < tdb->tdb1.header.hash_size;h++) { - if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(h), &off) != 0 || off != 0) { - break; - } - } - } - (*chain) = h; -} - - -int tdb1_munmap(struct tdb_context *tdb) -{ - if (tdb->flags & TDB_INTERNAL) - return 0; - -#if HAVE_MMAP - if (tdb->file->map_ptr) { - int ret; - - ret = munmap(tdb->file->map_ptr, tdb->file->map_size); - if (ret != 0) - return ret; - } -#endif - tdb->file->map_ptr = NULL; - return 0; -} - -void tdb1_mmap(struct tdb_context *tdb) -{ - if (tdb->flags & TDB_INTERNAL) - return; - -#if HAVE_MMAP - if (!(tdb->flags & TDB_NOMMAP)) { - int mmap_flags; - if ((tdb->open_flags & O_ACCMODE) == O_RDONLY) - mmap_flags = PROT_READ; - else - mmap_flags = PROT_READ | PROT_WRITE; - - tdb->file->map_ptr = mmap(NULL, tdb->file->map_size, - mmap_flags, - MAP_SHARED|MAP_FILE, tdb->file->fd, 0); - - /* - * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!! - */ - - if (tdb->file->map_ptr == MAP_FAILED) { - tdb->file->map_ptr = NULL; - tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_WARNING, - "tdb1_mmap failed for size %d (%s)", - tdb->file->map_size, strerror(errno)); - } - } else { - tdb->file->map_ptr = NULL; - } -#else - tdb->file->map_ptr = NULL; -#endif -} - -/* expand a file. 
we prefer to use ftruncate, as that is what posix - says to use for mmap expansion */ -static int tdb1_expand_file(struct tdb_context *tdb, tdb1_off_t size, tdb1_off_t addition) -{ - char buf[8192]; - - if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) { - tdb->last_error = TDB_ERR_RDONLY; - return -1; - } - - if (ftruncate(tdb->file->fd, size+addition) == -1) { - char b = 0; - ssize_t written = pwrite(tdb->file->fd, &b, 1, - (size+addition) - 1); - if (written == 0) { - /* try once more, potentially revealing errno */ - written = pwrite(tdb->file->fd, &b, 1, - (size+addition) - 1); - } - if (written == 0) { - /* again - give up, guessing errno */ - errno = ENOSPC; - } - if (written != 1) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "expand_file to %d failed (%s)", - size+addition, - strerror(errno)); - return -1; - } - } - - /* now fill the file with something. This ensures that the - file isn't sparse, which would be very bad if we ran out of - disk. This must be done with write, not via mmap */ - memset(buf, TDB1_PAD_BYTE, sizeof(buf)); - while (addition) { - size_t n = addition>sizeof(buf)?sizeof(buf):addition; - ssize_t written = pwrite(tdb->file->fd, buf, n, size); - if (written == 0) { - /* prevent infinite loops: try _once_ more */ - written = pwrite(tdb->file->fd, buf, n, size); - } - if (written == 0) { - /* give up, trying to provide a useful errno */ - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "expand_file write " - "returned 0 twice: giving up!"); - errno = ENOSPC; - return -1; - } else if (written == -1) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "expand_file write of " - "%d bytes failed (%s)", (int)n, - strerror(errno)); - return -1; - } else if (written != n) { - tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_WARNING, - "expand_file: wrote " - "only %d of %d bytes - retrying", - (int)written, (int)n); - } - addition -= written; - size += written; - } - tdb->stats.expands++; - return 0; -} - - -/* You need 'size', this tells you how much you should expand by. */ -tdb1_off_t tdb1_expand_adjust(tdb1_off_t map_size, tdb1_off_t size, int page_size) -{ - tdb1_off_t new_size, top_size; - - /* limit size in order to avoid using up huge amounts of memory for - * in memory tdbs if an oddball huge record creeps in */ - if (size > 100 * 1024) { - top_size = map_size + size * 2; - } else { - top_size = map_size + size * 100; - } - - /* always make room for at least top_size more records, and at - least 25% more space. if the DB is smaller than 100MiB, - otherwise grow it by 10% only. 
*/ - if (map_size > 100 * 1024 * 1024) { - new_size = map_size * 1.10; - } else { - new_size = map_size * 1.25; - } - - /* Round the database up to a multiple of the page size */ - new_size = MAX(top_size, new_size); - return TDB1_ALIGN(new_size, page_size) - map_size; -} - -/* expand the database at least size bytes by expanding the underlying - file and doing the mmap again if necessary */ -int tdb1_expand(struct tdb_context *tdb, tdb1_off_t size) -{ - struct tdb1_record rec; - tdb1_off_t offset; - - if (tdb1_lock(tdb, -1, F_WRLCK) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "lock failed in tdb1_expand"); - return -1; - } - - /* must know about any previous expansions by another process */ - tdb->tdb1.io->tdb1_oob(tdb, tdb->file->map_size, 1, 1); - - size = tdb1_expand_adjust(tdb->file->map_size, size, - tdb->tdb1.page_size); - - if (!(tdb->flags & TDB_INTERNAL)) - tdb1_munmap(tdb); - - /* - * We must ensure the file is unmapped before doing this - * to ensure consistency with systems like OpenBSD where - * writes and mmaps are not consistent. - */ - - /* expand the file itself */ - if (!(tdb->flags & TDB_INTERNAL)) { - if (tdb->tdb1.io->tdb1_expand_file(tdb, tdb->file->map_size, size) != 0) - goto fail; - } - - tdb->file->map_size += size; - - if (tdb->flags & TDB_INTERNAL) { - char *new_map_ptr = (char *)realloc(tdb->file->map_ptr, - tdb->file->map_size); - if (!new_map_ptr) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, - TDB_LOG_ERROR, - "tdb1_expand: no memory"); - tdb->file->map_size -= size; - goto fail; - } - tdb->file->map_ptr = new_map_ptr; - } else { - /* - * We must ensure the file is remapped before adding the space - * to ensure consistency with systems like OpenBSD where - * writes and mmaps are not consistent. - */ - - /* We're ok if the mmap fails as we'll fallback to read/write */ - tdb1_mmap(tdb); - } - - /* form a new freelist record */ - memset(&rec,'\0',sizeof(rec)); - rec.rec_len = size - sizeof(rec); - - /* link it into the free list */ - offset = tdb->file->map_size - size; - if (tdb1_free(tdb, offset, &rec) == -1) - goto fail; - - tdb1_unlock(tdb, -1, F_WRLCK); - return 0; - fail: - tdb1_unlock(tdb, -1, F_WRLCK); - return -1; -} - -/* read/write a tdb1_off_t */ -int tdb1_ofs_read(struct tdb_context *tdb, tdb1_off_t offset, tdb1_off_t *d) -{ - return tdb->tdb1.io->tdb1_read(tdb, offset, (char*)d, sizeof(*d), TDB1_DOCONV()); -} - -int tdb1_ofs_write(struct tdb_context *tdb, tdb1_off_t offset, tdb1_off_t *d) -{ - tdb1_off_t off = *d; - return tdb->tdb1.io->tdb1_write(tdb, offset, TDB1_CONV(off), sizeof(*d)); -} - - -/* read a lump of data, allocating the space for it */ -unsigned char *tdb1_alloc_read(struct tdb_context *tdb, tdb1_off_t offset, tdb1_len_t len) -{ - unsigned char *buf; - - /* some systems don't like zero length malloc */ - - if (!(buf = (unsigned char *)malloc(len ? 
len : 1))) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "tdb1_alloc_read malloc failed" - " len=%d (%s)", - len, strerror(errno)); - return NULL; - } - if (tdb->tdb1.io->tdb1_read(tdb, offset, buf, len, 0) == -1) { - SAFE_FREE(buf); - return NULL; - } - return buf; -} - -/* Give a piece of tdb data to a parser */ -enum TDB_ERROR tdb1_parse_data(struct tdb_context *tdb, TDB_DATA key, - tdb1_off_t offset, tdb1_len_t len, - enum TDB_ERROR (*parser)(TDB_DATA key, - TDB_DATA data, - void *private_data), - void *private_data) -{ - TDB_DATA data; - enum TDB_ERROR result; - - data.dsize = len; - - if ((tdb->tdb1.transaction == NULL) && (tdb->file->map_ptr != NULL)) { - /* - * Optimize by avoiding the malloc/memcpy/free, point the - * parser directly at the mmap area. - */ - if (tdb->tdb1.io->tdb1_oob(tdb, offset, len, 0) != 0) { - return tdb->last_error; - } - data.dptr = offset + (unsigned char *)tdb->file->map_ptr; - return parser(key, data, private_data); - } - - if (!(data.dptr = tdb1_alloc_read(tdb, offset, len))) { - return tdb->last_error; - } - - result = parser(key, data, private_data); - free(data.dptr); - return result; -} - -/* read/write a record */ -int tdb1_rec_read(struct tdb_context *tdb, tdb1_off_t offset, struct tdb1_record *rec) -{ - if (tdb->tdb1.io->tdb1_read(tdb, offset, rec, sizeof(*rec),TDB1_DOCONV()) == -1) - return -1; - if (TDB1_BAD_MAGIC(rec)) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb1_rec_read bad magic 0x%x at offset=%d", - rec->magic, offset); - return -1; - } - return tdb->tdb1.io->tdb1_oob(tdb, rec->next, sizeof(*rec), 0); -} - -int tdb1_rec_write(struct tdb_context *tdb, tdb1_off_t offset, struct tdb1_record *rec) -{ - struct tdb1_record r = *rec; - return tdb->tdb1.io->tdb1_write(tdb, offset, TDB1_CONV(r), sizeof(r)); -} - -static const struct tdb1_methods io1_methods = { - tdb1_read, - tdb1_write, - tdb1_next_hash_chain, - tdb1_oob, - tdb1_expand_file, -}; - -/* - initialise the default methods table -*/ -void tdb1_io_init(struct tdb_context *tdb) -{ - tdb->tdb1.io = &io1_methods; -} - -enum TDB_ERROR tdb1_probe_length(struct tdb_context *tdb) -{ - tdb->last_error = TDB_SUCCESS; - tdb->tdb1.io->tdb1_oob(tdb, tdb->file->map_size, 1, true); - return tdb->last_error; -} diff --git a/ccan/tdb2/tdb1_lock.c b/ccan/tdb2/tdb1_lock.c deleted file mode 100644 index 5cc0ad65..00000000 --- a/ccan/tdb2/tdb1_lock.c +++ /dev/null @@ -1,560 +0,0 @@ - /* - Unix SMB/CIFS implementation. - - trivial database library - - Copyright (C) Andrew Tridgell 1999-2005 - Copyright (C) Paul `Rusty' Russell 2000 - Copyright (C) Jeremy Allison 2000-2003 - - ** NOTE! The following LGPL license applies to the tdb - ** library. This does NOT imply that all of Samba is released - ** under the LGPL - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ - -#include "tdb1_private.h" - -/* list -1 is the alloc list, otherwise a hash chain. 
*/ -static tdb1_off_t lock_offset(int list) -{ - return TDB1_FREELIST_TOP + 4*list; -} - -/* a byte range locking function - return 0 on success - this functions locks/unlocks 1 byte at the specified offset. - - On error, errno is also set so that errors are passed back properly - through tdb1_open(). - - note that a len of zero means lock to end of file -*/ -int tdb1_brlock(struct tdb_context *tdb, - int rw_type, tdb1_off_t offset, size_t len, - enum tdb_lock_flags flags) -{ - enum TDB_ERROR ecode = tdb_brlock(tdb, rw_type, offset, len, flags - | TDB_LOCK_NOCHECK); - if (ecode == TDB_SUCCESS) - return 0; - tdb->last_error = ecode; - return -1; -} - -int tdb1_brunlock(struct tdb_context *tdb, - int rw_type, tdb1_off_t offset, size_t len) -{ - enum TDB_ERROR ecode = tdb_brunlock(tdb, rw_type, offset, len); - if (ecode == TDB_SUCCESS) - return 0; - tdb->last_error = ecode; - return -1; -} - -int tdb1_allrecord_upgrade(struct tdb_context *tdb) -{ - enum TDB_ERROR ecode = tdb_allrecord_upgrade(tdb, TDB1_FREELIST_TOP); - if (ecode == TDB_SUCCESS) - return 0; - tdb->last_error = ecode; - return -1; -} - -static struct tdb_lock *tdb1_find_nestlock(struct tdb_context *tdb, - tdb1_off_t offset) -{ - unsigned int i; - - for (i=0; ifile->num_lockrecs; i++) { - if (tdb->file->lockrecs[i].off == offset) { - return &tdb->file->lockrecs[i]; - } - } - return NULL; -} - -/* lock an offset in the database. */ -int tdb1_nest_lock(struct tdb_context *tdb, uint32_t offset, int ltype, - enum tdb_lock_flags flags) -{ - enum TDB_ERROR ecode; - - if (offset >= lock_offset(tdb->tdb1.header.hash_size)) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb1_lock: invalid offset %u for" - " ltype=%d", - offset, ltype); - return -1; - } - - ecode = tdb_nest_lock(tdb, offset, ltype, flags | TDB_LOCK_NOCHECK); - if (unlikely(ecode != TDB_SUCCESS)) { - tdb->last_error = ecode; - return -1; - } - return 0; -} - -static int tdb1_lock_and_recover(struct tdb_context *tdb) -{ - int ret; - - /* We need to match locking order in transaction commit. */ - if (tdb1_brlock(tdb, F_WRLCK, TDB1_FREELIST_TOP, 0, - TDB_LOCK_WAIT|TDB_LOCK_NOCHECK)) { - return -1; - } - - if (tdb1_brlock(tdb, F_WRLCK, TDB1_OPEN_LOCK, 1, - TDB_LOCK_WAIT|TDB_LOCK_NOCHECK)) { - tdb1_brunlock(tdb, F_WRLCK, TDB1_FREELIST_TOP, 0); - return -1; - } - - ret = tdb1_transaction_recover(tdb); - - tdb1_brunlock(tdb, F_WRLCK, TDB1_OPEN_LOCK, 1); - tdb1_brunlock(tdb, F_WRLCK, TDB1_FREELIST_TOP, 0); - - return ret; -} - -static bool have_data_locks(const struct tdb_context *tdb) -{ - unsigned int i; - - for (i = 0; i < tdb->file->num_lockrecs; i++) { - if (tdb->file->lockrecs[i].off >= lock_offset(-1)) - return true; - } - return false; -} - -static int tdb1_lock_list(struct tdb_context *tdb, int list, int ltype, - enum tdb_lock_flags waitflag) -{ - int ret; - bool check = false; - - /* a allrecord lock allows us to avoid per chain locks */ - if (tdb->file->allrecord_lock.count) { - if (!check_lock_pid(tdb, "tdb1_lock_list", true)) { - tdb->last_error = TDB_ERR_LOCK; - return -1; - } - if (tdb->file->allrecord_lock.owner != tdb) { - tdb->last_error = owner_conflict(tdb, "tdb1_lock_list"); - return -1; - } - if (ltype == tdb->file->allrecord_lock.ltype - || ltype == F_RDLCK) { - return 0; - } - tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, - TDB_LOG_USE_ERROR, - "tdb1_lock_list:" - " already have read lock"); - return -1; - } - - /* Only check when we grab first data lock. 
*/ - check = !have_data_locks(tdb); - ret = tdb1_nest_lock(tdb, lock_offset(list), ltype, waitflag); - - if (ret == 0 && check) { - tdb_bool_err berr = tdb1_needs_recovery(tdb); - - if (berr < 0) { - return -1; - } - if (berr == true) { - tdb1_nest_unlock(tdb, lock_offset(list), ltype); - - if (tdb1_lock_and_recover(tdb) == -1) { - return -1; - } - return tdb1_lock_list(tdb, list, ltype, waitflag); - } - } - return ret; -} - -/* lock a list in the database. list -1 is the alloc list */ -int tdb1_lock(struct tdb_context *tdb, int list, int ltype) -{ - int ret; - - ret = tdb1_lock_list(tdb, list, ltype, TDB_LOCK_WAIT); - /* Don't log for EAGAIN and EINTR: they could have overridden lock fns */ - if (ret && errno != EAGAIN && errno != EINTR) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_lock failed on list %d " - "ltype=%d (%s)", list, ltype, strerror(errno)); - } - return ret; -} - -int tdb1_nest_unlock(struct tdb_context *tdb, uint32_t offset, int ltype) -{ - enum TDB_ERROR ecode; - - /* Sanity checks */ - if (offset >= lock_offset(tdb->tdb1.header.hash_size)) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR, - "tdb1_unlock: offset %u invalid (%d)", - offset, tdb->tdb1.header.hash_size); - return -1; - } - - ecode = tdb_nest_unlock(tdb, offset, ltype); - if (unlikely(ecode != TDB_SUCCESS)) { - tdb->last_error = ecode; - return -1; - } - return 0; -} - -int tdb1_unlock(struct tdb_context *tdb, int list, int ltype) -{ - /* a global lock allows us to avoid per chain locks */ - if (tdb->file->allrecord_lock.count && - (ltype == tdb->file->allrecord_lock.ltype || ltype == F_RDLCK)) { - if (tdb->file->allrecord_lock.owner != tdb) { - tdb->last_error = owner_conflict(tdb, "tdb1_unlock"); - return -1; - } - return 0; - } - - if (tdb->file->allrecord_lock.count) { - tdb->last_error = TDB_ERR_LOCK; - return -1; - } - - return tdb1_nest_unlock(tdb, lock_offset(list), ltype); -} - -/* - get the transaction lock - */ -int tdb1_transaction_lock(struct tdb_context *tdb, int ltype, - enum tdb_lock_flags lockflags) -{ - return tdb1_nest_lock(tdb, TDB1_TRANSACTION_LOCK, ltype, lockflags); -} - -/* - release the transaction lock - */ -int tdb1_transaction_unlock(struct tdb_context *tdb, int ltype) -{ - return tdb1_nest_unlock(tdb, TDB1_TRANSACTION_LOCK, ltype); -} - -/* lock/unlock entire database. It can only be upgradable if you have some - * other way of guaranteeing exclusivity (ie. transaction write lock). - * We do the locking gradually to avoid being starved by smaller locks. */ -int tdb1_allrecord_lock(struct tdb_context *tdb, int ltype, - enum tdb_lock_flags flags, bool upgradable) -{ - enum TDB_ERROR ecode; - tdb_bool_err berr; - - /* tdb_lock_gradual() doesn't know about tdb->tdb1.traverse_read. */ - if (tdb->tdb1.traverse_read && !(tdb->flags & TDB_NOLOCK)) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, - TDB_LOG_USE_ERROR, - "tdb1_allrecord_lock during" - " tdb1_read_traverse"); - return -1; - } - - if (tdb->file->allrecord_lock.count - && tdb->file->allrecord_lock.ltype == ltype) { - tdb->file->allrecord_lock.count++; - return 0; - } - - if (tdb1_have_extra_locks(tdb)) { - /* can't combine global and chain locks */ - tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, - TDB_LOG_USE_ERROR, - "tdb1_allrecord_lock holding" - " other locks"); - return -1; - } - - if (upgradable && ltype != F_RDLCK) { - /* tdb error: you can't upgrade a write lock! 
*/ - tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, - TDB_LOG_ERROR, - "tdb1_allrecord_lock cannot" - " have upgradable write lock"); - return -1; - } - - /* We cover two kinds of locks: - * 1) Normal chain locks. Taken for almost all operations. - * 3) Individual records locks. Taken after normal or free - * chain locks. - * - * It is (1) which cause the starvation problem, so we're only - * gradual for that. */ - ecode = tdb_lock_gradual(tdb, ltype, flags | TDB_LOCK_NOCHECK, - TDB1_FREELIST_TOP, tdb->tdb1.header.hash_size * 4); - if (ecode != TDB_SUCCESS) { - tdb->last_error = ecode; - return -1; - } - - /* Grab individual record locks. */ - if (tdb1_brlock(tdb, ltype, lock_offset(tdb->tdb1.header.hash_size), 0, - flags) == -1) { - tdb1_brunlock(tdb, ltype, TDB1_FREELIST_TOP, - tdb->tdb1.header.hash_size * 4); - return -1; - } - - tdb->file->allrecord_lock.owner = tdb; - tdb->file->allrecord_lock.count = 1; - tdb->file->locker = getpid(); - /* If it's upgradable, it's actually exclusive so we can treat - * it as a write lock. */ - tdb->file->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype; - tdb->file->allrecord_lock.off = upgradable; - - berr = tdb1_needs_recovery(tdb); - if (berr < 0) { - return -1; - } - - if (berr == true) { - tdb1_allrecord_unlock(tdb, ltype); - if (tdb1_lock_and_recover(tdb) == -1) { - return -1; - } - return tdb1_allrecord_lock(tdb, ltype, flags, upgradable); - } - - return 0; -} - - - -/* unlock entire db */ -int tdb1_allrecord_unlock(struct tdb_context *tdb, int ltype) -{ - /* Don't try this during r/o traversal! */ - if (tdb->tdb1.traverse_read) { - tdb->last_error = TDB_ERR_LOCK; - return -1; - } - - if (tdb->file->allrecord_lock.count == 0) { - tdb->last_error = TDB_ERR_LOCK; - return -1; - } - - /* Upgradable locks are marked as write locks. */ - if (tdb->file->allrecord_lock.ltype != ltype - && (!tdb->file->allrecord_lock.off || ltype != F_RDLCK)) { - tdb->last_error = TDB_ERR_LOCK; - return -1; - } - - if (tdb->file->allrecord_lock.count > 1) { - if (tdb->file->allrecord_lock.owner != tdb) { - tdb->last_error - = owner_conflict(tdb, "tdb1_allrecord_unlock"); - return -1; - } - tdb->file->allrecord_lock.count--; - return 0; - } - - tdb->file->allrecord_lock.count = 0; - tdb->file->allrecord_lock.ltype = 0; - - if (tdb1_brunlock(tdb, ltype, TDB1_FREELIST_TOP, 0)) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_unlockall failed (%s)", strerror(errno)); - return -1; - } - - return 0; -} - -/* lock/unlock one hash chain. 
This is meant to be used to reduce - contention - it cannot guarantee how many records will be locked */ -int tdb1_chainlock(struct tdb_context *tdb, TDB_DATA key) -{ - int ret = tdb1_lock(tdb, - TDB1_BUCKET(tdb_hash(tdb, key.dptr, key.dsize)), - F_WRLCK); - return ret; -} - -int tdb1_chainunlock(struct tdb_context *tdb, TDB_DATA key) -{ - return tdb1_unlock(tdb, TDB1_BUCKET(tdb_hash(tdb, key.dptr, key.dsize)), - F_WRLCK); -} - -int tdb1_chainlock_read(struct tdb_context *tdb, TDB_DATA key) -{ - int ret; - ret = tdb1_lock(tdb, TDB1_BUCKET(tdb_hash(tdb, key.dptr, key.dsize)), - F_RDLCK); - return ret; -} - -int tdb1_chainunlock_read(struct tdb_context *tdb, TDB_DATA key) -{ - return tdb1_unlock(tdb, TDB1_BUCKET(tdb_hash(tdb, key.dptr, key.dsize)), - F_RDLCK); -} - -/* record lock stops delete underneath */ -int tdb1_lock_record(struct tdb_context *tdb, tdb1_off_t off) -{ - if (tdb->file->allrecord_lock.count) { - if (!check_lock_pid(tdb, "tdb1_lock_record", true)) { - tdb->last_error = TDB_ERR_LOCK; - return -1; - } - if (tdb->file->allrecord_lock.owner != tdb) { - tdb->last_error = owner_conflict(tdb, - "tdb1_lock_record"); - return -1; - } - return 0; - } - return off ? tdb1_brlock(tdb, F_RDLCK, off, 1, TDB_LOCK_WAIT) : 0; -} - -/* - Write locks override our own fcntl readlocks, so check it here. - Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not - an error to fail to get the lock here. -*/ -int tdb1_write_lock_record(struct tdb_context *tdb, tdb1_off_t off) -{ - struct tdb1_traverse_lock *i; - for (i = &tdb->tdb1.travlocks; i; i = i->next) - if (i->off == off) - return -1; - if (tdb->file->allrecord_lock.count) { - if (!check_lock_pid(tdb, "tdb1_write_lock_record", true)) { - tdb->last_error = TDB_ERR_LOCK; - return -1; - } - if (tdb->file->allrecord_lock.owner != tdb) { - tdb->last_error - = owner_conflict(tdb, "tdb1_write_lock_record"); - return -1; - } - if (tdb->file->allrecord_lock.ltype == F_WRLCK) { - return 0; - } - return -1; - } - return tdb1_brlock(tdb, F_WRLCK, off, 1, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE); -} - -int tdb1_write_unlock_record(struct tdb_context *tdb, tdb1_off_t off) -{ - if (tdb->file->allrecord_lock.count) { - if (tdb->file->allrecord_lock.owner != tdb) { - tdb->last_error - = owner_conflict(tdb, - "tdb1_write_unlock_record"); - return -1; - } - return 0; - } - return tdb1_brunlock(tdb, F_WRLCK, off, 1); -} - -/* fcntl locks don't stack: avoid unlocking someone else's */ -int tdb1_unlock_record(struct tdb_context *tdb, tdb1_off_t off) -{ - struct tdb1_traverse_lock *i; - uint32_t count = 0; - - if (tdb->file->allrecord_lock.count) { - if (tdb->file->allrecord_lock.owner != tdb) { - tdb->last_error = owner_conflict(tdb, - "tdb1_unlock_record"); - return -1; - } - return 0; - } - - if (off == 0) - return 0; - for (i = &tdb->tdb1.travlocks; i; i = i->next) - if (i->off == off) - count++; - return (count == 1 ? tdb1_brunlock(tdb, F_RDLCK, off, 1) : 0); -} - -bool tdb1_have_extra_locks(struct tdb_context *tdb) -{ - unsigned int extra = tdb->file->num_lockrecs; - - /* A transaction holds the lock for all records. */ - if (!tdb->tdb1.transaction && tdb->file->allrecord_lock.count) { - return true; - } - - /* We always hold the active lock if CLEAR_IF_FIRST. 
*/ - if (tdb1_find_nestlock(tdb, TDB1_ACTIVE_LOCK)) { - extra--; - } - - /* In a transaction, we expect to hold the transaction lock */ - if (tdb->tdb1.transaction - && tdb1_find_nestlock(tdb, TDB1_TRANSACTION_LOCK)) { - extra--; - } - - return extra; -} - -/* The transaction code uses this to remove all locks. */ -void tdb1_release_transaction_locks(struct tdb_context *tdb) -{ - unsigned int i, active = 0; - - if (tdb->file->allrecord_lock.count != 0) { - tdb1_brunlock(tdb, tdb->file->allrecord_lock.ltype, TDB1_FREELIST_TOP, 0); - tdb->file->allrecord_lock.count = 0; - } - - for (i=0;ifile->num_lockrecs;i++) { - struct tdb_lock *lck = &tdb->file->lockrecs[i]; - - /* Don't release the active lock! Copy it to first entry. */ - if (lck->off == TDB1_ACTIVE_LOCK) { - tdb->file->lockrecs[active++] = *lck; - } else { - tdb1_brunlock(tdb, lck->ltype, lck->off, 1); - } - } - tdb->file->num_lockrecs = active; - if (tdb->file->num_lockrecs == 0) { - SAFE_FREE(tdb->file->lockrecs); - } -} diff --git a/ccan/tdb2/tdb1_open.c b/ccan/tdb2/tdb1_open.c deleted file mode 100644 index e668616a..00000000 --- a/ccan/tdb2/tdb1_open.c +++ /dev/null @@ -1,234 +0,0 @@ - /* - Unix SMB/CIFS implementation. - - trivial database library - - Copyright (C) Andrew Tridgell 1999-2005 - Copyright (C) Paul `Rusty' Russell 2000 - Copyright (C) Jeremy Allison 2000-2003 - - ** NOTE! The following LGPL license applies to the tdb - ** library. This does NOT imply that all of Samba is released - ** under the LGPL - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ -#include -#include "tdb1_private.h" -#include - -/* We use two hashes to double-check they're using the right hash function. */ -void tdb1_header_hash(struct tdb_context *tdb, - uint32_t *magic1_hash, uint32_t *magic2_hash) -{ - uint32_t tdb1_magic = TDB1_MAGIC; - - *magic1_hash = tdb_hash(tdb, TDB_MAGIC_FOOD, sizeof(TDB_MAGIC_FOOD)); - *magic2_hash = tdb_hash(tdb, TDB1_CONV(tdb1_magic), sizeof(tdb1_magic)); - - /* Make sure at least one hash is non-zero! 
*/ - if (*magic1_hash == 0 && *magic2_hash == 0) - *magic1_hash = 1; -} - -static void tdb_context_init(struct tdb_context *tdb, - struct tdb_attribute_tdb1_max_dead *max_dead) -{ - assert(tdb->flags & TDB_VERSION1); - - tdb1_io_init(tdb); - - tdb->tdb1.traverse_read = tdb->tdb1.traverse_write = 0; - memset(&tdb->tdb1.travlocks, 0, sizeof(tdb->tdb1.travlocks)); - tdb->tdb1.transaction = NULL; - - /* cache the page size */ - tdb->tdb1.page_size = getpagesize(); - if (tdb->tdb1.page_size <= 0) { - tdb->tdb1.page_size = 0x2000; - } - - if (max_dead) { - tdb->tdb1.max_dead_records = max_dead->max_dead; - } else { - tdb->tdb1.max_dead_records = 0; - } -} - -/* initialise a new database */ -enum TDB_ERROR tdb1_new_database(struct tdb_context *tdb, - struct tdb_attribute_tdb1_hashsize *hashsize, - struct tdb_attribute_tdb1_max_dead *max_dead) -{ - struct tdb1_header *newdb; - size_t size; - int hash_size = TDB1_DEFAULT_HASH_SIZE; - enum TDB_ERROR ret; - - tdb_context_init(tdb, max_dead); - - /* Default TDB2 hash becomes default TDB1 hash. */ - if (tdb->hash_fn == tdb_jenkins_hash) - tdb->hash_fn = tdb1_old_hash; - - if (hashsize) - hash_size = hashsize->hsize; - - /* We make it up in memory, then write it out if not internal */ - size = sizeof(struct tdb1_header) + (hash_size+1)*sizeof(tdb1_off_t); - if (!(newdb = (struct tdb1_header *)calloc(size, 1))) { - return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "Could not allocate new database header"); - } - - /* Fill in the header */ - newdb->version = TDB1_VERSION; - newdb->hash_size = hash_size; - - tdb1_header_hash(tdb, &newdb->magic1_hash, &newdb->magic2_hash); - - /* Make sure older tdbs (which don't check the magic hash fields) - * will refuse to open this TDB. */ - if (tdb->hash_fn == tdb1_incompatible_hash) - newdb->rwlocks = TDB1_HASH_RWLOCK_MAGIC; - - memcpy(&tdb->tdb1.header, newdb, sizeof(tdb->tdb1.header)); - /* This creates an endian-converted db. */ - TDB1_CONV(*newdb); - /* Don't endian-convert the magic food! 
*/ - memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1); - - if (tdb->flags & TDB_INTERNAL) { - tdb->file->map_size = size; - tdb->file->map_ptr = (char *)newdb; - return TDB_SUCCESS; - } - if (lseek(tdb->file->fd, 0, SEEK_SET) == -1) { - ret = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb1_new_database: lseek failed"); - goto fail; - } - - if (ftruncate(tdb->file->fd, 0) == -1) { - ret = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb1_new_database: ftruncate failed"); - goto fail; - } - - if (!tdb1_write_all(tdb->file->fd, newdb, size)) { - ret = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb1_new_database: write failed"); - goto fail; - } - ret = TDB_SUCCESS; - - fail: - SAFE_FREE(newdb); - return ret; -} - -typedef void (*tdb1_log_func)(struct tdb_context *, enum tdb_log_level, enum TDB_ERROR, - const char *, void *); -typedef uint64_t (*tdb1_hash_func)(const void *key, size_t len, uint64_t seed, - void *data); - -struct tdb1_logging_context { - tdb1_log_func log_fn; - void *log_private; -}; - -static bool hash_correct(struct tdb_context *tdb, - uint32_t *m1, uint32_t *m2) -{ - /* older TDB without magic hash references */ - if (tdb->tdb1.header.magic1_hash == 0 - && tdb->tdb1.header.magic2_hash == 0) { - return true; - } - - tdb1_header_hash(tdb, m1, m2); - return (tdb->tdb1.header.magic1_hash == *m1 && - tdb->tdb1.header.magic2_hash == *m2); -} - -static bool check_header_hash(struct tdb_context *tdb, - uint32_t *m1, uint32_t *m2) -{ - if (hash_correct(tdb, m1, m2)) - return true; - - /* If they use one inbuilt, try the other inbuilt hash. */ - if (tdb->hash_fn == tdb1_old_hash) - tdb->hash_fn = tdb1_incompatible_hash; - else if (tdb->hash_fn == tdb1_incompatible_hash) - tdb->hash_fn = tdb1_old_hash; - else - return false; - return hash_correct(tdb, m1, m2); -} - -/* We are hold the TDB open lock on tdb->fd. */ -enum TDB_ERROR tdb1_open(struct tdb_context *tdb, - struct tdb_attribute_tdb1_max_dead *max_dead) -{ - const char *hash_alg; - uint32_t magic1, magic2; - - tdb->flags |= TDB_VERSION1; - - tdb_context_init(tdb, max_dead); - - /* Default TDB2 hash becomes default TDB1 hash. */ - if (tdb->hash_fn == tdb_jenkins_hash) { - tdb->hash_fn = tdb1_old_hash; - hash_alg = "default"; - } else if (tdb->hash_fn == tdb1_incompatible_hash) - hash_alg = "tdb1_incompatible_hash"; - else - hash_alg = "the user defined"; - - if (tdb->tdb1.header.version != TDB1_BYTEREV(TDB1_VERSION)) { - if (tdb->flags & TDB_CONVERT) { - return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb1_open:" - " %s does not need TDB_CONVERT", - tdb->name); - } - } else { - tdb->flags |= TDB_CONVERT; - tdb1_convert(&tdb->tdb1.header, sizeof(tdb->tdb1.header)); - } - - if (tdb->tdb1.header.rwlocks != 0 && - tdb->tdb1.header.rwlocks != TDB1_HASH_RWLOCK_MAGIC) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb1_open: spinlocks no longer supported"); - } - - if (!check_header_hash(tdb, &magic1, &magic2)) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_USE_ERROR, - "tdb1_open: " - "%s was not created with %s hash function we are using\n" - "magic1_hash[0x%08X %s 0x%08X] " - "magic2_hash[0x%08X %s 0x%08X]", - tdb->name, hash_alg, - tdb->tdb1.header.magic1_hash, - (tdb->tdb1.header.magic1_hash == magic1) ? "==" : "!=", - magic1, - tdb->tdb1.header.magic2_hash, - (tdb->tdb1.header.magic2_hash == magic2) ? 
"==" : "!=", - magic2); - } - return TDB_SUCCESS; -} diff --git a/ccan/tdb2/tdb1_private.h b/ccan/tdb2/tdb1_private.h deleted file mode 100644 index cb22b9f3..00000000 --- a/ccan/tdb2/tdb1_private.h +++ /dev/null @@ -1,179 +0,0 @@ -#ifndef CCAN_TDB2_TDB1_PRIVATE_H -#define CCAN_TDB2_TDB1_PRIVATE_H - /* - Unix SMB/CIFS implementation. - - trivial database library - private includes - - Copyright (C) Andrew Tridgell 2005 - - ** NOTE! The following LGPL license applies to the tdb - ** library. This does NOT imply that all of Samba is released - ** under the LGPL - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ - -#include "private.h" - -#include - -/* #define TDB_TRACE 1 */ -#ifndef HAVE_GETPAGESIZE -#define getpagesize() 0x2000 -#endif - -#ifndef __STRING -#define __STRING(x) #x -#endif - -#ifndef __STRINGSTRING -#define __STRINGSTRING(x) __STRING(x) -#endif - -#ifndef __location__ -#define __location__ __FILE__ ":" __STRINGSTRING(__LINE__) -#endif - -#ifndef offsetof -#define offsetof(t,f) ((unsigned int)&((t *)0)->f) -#endif - -#define TDB1_VERSION (0x26011967 + 6) -#define TDB1_MAGIC (0x26011999U) -#define TDB1_FREE_MAGIC (~TDB1_MAGIC) -#define TDB1_DEAD_MAGIC (0xFEE1DEAD) -#define TDB1_RECOVERY_MAGIC (0xf53bc0e7U) -#define TDB1_RECOVERY_INVALID_MAGIC (0x0) -#define TDB1_HASH_RWLOCK_MAGIC (0xbad1a51U) -#define TDB1_ALIGNMENT 4 -#define TDB1_DEFAULT_HASH_SIZE 131 -#define TDB1_FREELIST_TOP (sizeof(struct tdb1_header)) -#define TDB1_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1)) -#define TDB1_DEAD(r) ((r)->magic == TDB1_DEAD_MAGIC) -#define TDB1_BAD_MAGIC(r) ((r)->magic != TDB1_MAGIC && !TDB1_DEAD(r)) -#define TDB1_HASH_TOP(hash) (TDB1_FREELIST_TOP + (TDB1_BUCKET(hash)+1)*sizeof(tdb1_off_t)) -#define TDB1_HASHTABLE_SIZE(tdb) ((tdb->tdb1.header.hash_size+1)*sizeof(tdb1_off_t)) -#define TDB1_DATA_START(hash_size) (TDB1_HASH_TOP(hash_size-1) + sizeof(tdb1_off_t)) -#define TDB1_RECOVERY_HEAD offsetof(struct tdb1_header, recovery_start) -#define TDB1_SEQNUM_OFS offsetof(struct tdb1_header, sequence_number) -#define TDB1_PAD_BYTE 0x42 -#define TDB1_PAD_U32 0x42424242 - -/* lock offsets */ -#define TDB1_OPEN_LOCK 0 -#define TDB1_ACTIVE_LOCK 4 -#define TDB1_TRANSACTION_LOCK 8 - -/* free memory if the pointer is valid and zero the pointer */ -#ifndef SAFE_FREE -#define SAFE_FREE(x) do { if ((x) != NULL) {free((void *)x); (x)=NULL;} } while(0) -#endif - -#define TDB1_BUCKET(hash) ((hash) % tdb->tdb1.header.hash_size) - -#define TDB1_DOCONV() (tdb->flags & TDB_CONVERT) -#define TDB1_CONV(x) (TDB1_DOCONV() ? 
tdb1_convert(&x, sizeof(x)) : &x) - -/* the body of the database is made of one tdb1_record for the free space - plus a separate data list for each hash value */ -struct tdb1_record { - tdb1_off_t next; /* offset of the next record in the list */ - tdb1_len_t rec_len; /* total byte length of record */ - tdb1_len_t key_len; /* byte length of key */ - tdb1_len_t data_len; /* byte length of data */ - uint32_t full_hash; /* the full 32 bit hash of the key */ - uint32_t magic; /* try to catch errors */ - /* the following union is implied: - union { - char record[rec_len]; - struct { - char key[key_len]; - char data[data_len]; - } - uint32_t totalsize; (tailer) - } - */ -}; - - -struct tdb1_methods { - int (*tdb1_read)(struct tdb_context *, tdb1_off_t , void *, tdb1_len_t , int ); - int (*tdb1_write)(struct tdb_context *, tdb1_off_t, const void *, tdb1_len_t); - void (*next_hash_chain)(struct tdb_context *, uint32_t *); - int (*tdb1_oob)(struct tdb_context *, tdb1_off_t, tdb1_len_t, int ); - int (*tdb1_expand_file)(struct tdb_context *, tdb1_off_t , tdb1_off_t ); -}; - - -/* - internal prototypes -*/ -int tdb1_munmap(struct tdb_context *tdb); -void tdb1_mmap(struct tdb_context *tdb); -int tdb1_lock(struct tdb_context *tdb, int list, int ltype); -int tdb1_nest_lock(struct tdb_context *tdb, uint32_t offset, int ltype, - enum tdb_lock_flags flags); -int tdb1_nest_unlock(struct tdb_context *tdb, uint32_t offset, int ltype); -int tdb1_unlock(struct tdb_context *tdb, int list, int ltype); -int tdb1_brlock(struct tdb_context *tdb, - int rw_type, tdb1_off_t offset, size_t len, - enum tdb_lock_flags flags); -int tdb1_brunlock(struct tdb_context *tdb, - int rw_type, tdb1_off_t offset, size_t len); -bool tdb1_have_extra_locks(struct tdb_context *tdb); -void tdb1_release_transaction_locks(struct tdb_context *tdb); -int tdb1_transaction_lock(struct tdb_context *tdb, int ltype, - enum tdb_lock_flags lockflags); -int tdb1_transaction_unlock(struct tdb_context *tdb, int ltype); -int tdb1_recovery_area(struct tdb_context *tdb, - const struct tdb1_methods *methods, - tdb1_off_t *recovery_offset, - struct tdb1_record *rec); -int tdb1_allrecord_upgrade(struct tdb_context *tdb); -int tdb1_write_lock_record(struct tdb_context *tdb, tdb1_off_t off); -int tdb1_write_unlock_record(struct tdb_context *tdb, tdb1_off_t off); -int tdb1_ofs_read(struct tdb_context *tdb, tdb1_off_t offset, tdb1_off_t *d); -int tdb1_ofs_write(struct tdb_context *tdb, tdb1_off_t offset, tdb1_off_t *d); -void *tdb1_convert(void *buf, uint32_t size); -int tdb1_free(struct tdb_context *tdb, tdb1_off_t offset, struct tdb1_record *rec); -tdb1_off_t tdb1_allocate(struct tdb_context *tdb, tdb1_len_t length, struct tdb1_record *rec); -int tdb1_ofs_read(struct tdb_context *tdb, tdb1_off_t offset, tdb1_off_t *d); -int tdb1_ofs_write(struct tdb_context *tdb, tdb1_off_t offset, tdb1_off_t *d); -int tdb1_lock_record(struct tdb_context *tdb, tdb1_off_t off); -int tdb1_unlock_record(struct tdb_context *tdb, tdb1_off_t off); -tdb_bool_err tdb1_needs_recovery(struct tdb_context *tdb); -int tdb1_rec_read(struct tdb_context *tdb, tdb1_off_t offset, struct tdb1_record *rec); -int tdb1_rec_write(struct tdb_context *tdb, tdb1_off_t offset, struct tdb1_record *rec); -int tdb1_do_delete(struct tdb_context *tdb, tdb1_off_t rec_ptr, struct tdb1_record *rec); -unsigned char *tdb1_alloc_read(struct tdb_context *tdb, tdb1_off_t offset, tdb1_len_t len); -enum TDB_ERROR tdb1_parse_data(struct tdb_context *tdb, TDB_DATA key, - tdb1_off_t offset, tdb1_len_t len, - enum 
TDB_ERROR (*parser)(TDB_DATA key, - TDB_DATA data, - void *private_data), - void *private_data); -tdb1_off_t tdb1_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype, - struct tdb1_record *rec); -void tdb1_io_init(struct tdb_context *tdb); -int tdb1_expand(struct tdb_context *tdb, tdb1_off_t size); -tdb1_off_t tdb1_expand_adjust(tdb1_off_t map_size, tdb1_off_t size, int page_size); -int tdb1_rec_free_read(struct tdb_context *tdb, tdb1_off_t off, - struct tdb1_record *rec); -bool tdb1_write_all(int fd, const void *buf, size_t count); -void tdb1_header_hash(struct tdb_context *tdb, - uint32_t *magic1_hash, uint32_t *magic2_hash); -uint64_t tdb1_old_hash(const void *key, size_t len, uint64_t seed, void *); -size_t tdb1_dead_space(struct tdb_context *tdb, tdb1_off_t off); -#endif /* CCAN_TDB2_TDB1_PRIVATE_H */ diff --git a/ccan/tdb2/tdb1_summary.c b/ccan/tdb2/tdb1_summary.c deleted file mode 100644 index b74b8f44..00000000 --- a/ccan/tdb2/tdb1_summary.c +++ /dev/null @@ -1,202 +0,0 @@ - /* - Trivial Database: human-readable summary code - Copyright (C) Rusty Russell 2010 - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ -#include "tdb1_private.h" - -#define SUMMARY_FORMAT1 \ - "Size of file/data: %u/%zu\n" \ - "Number of records: %zu\n" \ - "Smallest/average/largest keys: %zu/%zu/%zu\n" \ - "Smallest/average/largest data: %zu/%zu/%zu\n" \ - "Smallest/average/largest padding: %zu/%zu/%zu\n" \ - "Number of dead records: %zu\n" \ - "Smallest/average/largest dead records: %zu/%zu/%zu\n" \ - "Number of free records: %zu\n" \ - "Smallest/average/largest free records: %zu/%zu/%zu\n" \ - "Number of hash chains: %zu\n" \ - "Smallest/average/largest hash chains: %zu/%zu/%zu\n" \ - "Number of uncoalesced records: %zu\n" \ - "Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n" \ - "Percentage keys/data/padding/free/dead/rechdrs&tailers/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n" - -/* We don't use tally module, to keep upstream happy. 
*/ -struct tally { - size_t min, max, total; - size_t num; -}; - -static void tally1_init(struct tally *tally) -{ - tally->total = 0; - tally->num = 0; - tally->min = tally->max = 0; -} - -static void tally1_add(struct tally *tally, size_t len) -{ - if (tally->num == 0) - tally->max = tally->min = len; - else if (len > tally->max) - tally->max = len; - else if (len < tally->min) - tally->min = len; - tally->num++; - tally->total += len; -} - -static size_t tally1_mean(const struct tally *tally) -{ - if (!tally->num) - return 0; - return tally->total / tally->num; -} - -static size_t get_hash_length(struct tdb_context *tdb, unsigned int i) -{ - tdb1_off_t rec_ptr; - size_t count = 0; - - if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(i), &rec_ptr) == -1) - return 0; - - /* keep looking until we find the right record */ - while (rec_ptr) { - struct tdb1_record r; - ++count; - if (tdb1_rec_read(tdb, rec_ptr, &r) == -1) - return 0; - rec_ptr = r.next; - } - return count; -} - -char *tdb1_summary(struct tdb_context *tdb) -{ - tdb1_off_t off, rec_off; - struct tally freet, keys, data, dead, extra, hash, uncoal; - struct tdb1_record rec; - char *ret = NULL; - bool locked; - size_t len, unc = 0; - struct tdb1_record recovery; - - /* We may have a write lock already, so don't lock. */ - if (tdb->file->allrecord_lock.count != 0) { - locked = false; - } else { - if (tdb_lockall_read(tdb) != TDB_SUCCESS) - return NULL; - locked = true; - } - - if (tdb1_recovery_area(tdb, tdb->tdb1.io, &rec_off, &recovery) != 0) { - goto unlock; - } - - tally1_init(&freet); - tally1_init(&keys); - tally1_init(&data); - tally1_init(&dead); - tally1_init(&extra); - tally1_init(&hash); - tally1_init(&uncoal); - - for (off = TDB1_DATA_START(tdb->tdb1.header.hash_size); - off < tdb->file->map_size - 1; - off += sizeof(rec) + rec.rec_len) { - if (tdb->tdb1.io->tdb1_read(tdb, off, &rec, sizeof(rec), - TDB1_DOCONV()) == -1) - goto unlock; - switch (rec.magic) { - case TDB1_MAGIC: - tally1_add(&keys, rec.key_len); - tally1_add(&data, rec.data_len); - tally1_add(&extra, rec.rec_len - (rec.key_len - + rec.data_len)); - if (unc > 1) - tally1_add(&uncoal, unc - 1); - unc = 0; - break; - case TDB1_FREE_MAGIC: - tally1_add(&freet, rec.rec_len); - unc++; - break; - /* If we crash after ftruncate, we can get zeroes or fill. */ - case TDB1_RECOVERY_INVALID_MAGIC: - case 0x42424242: - unc++; - /* If it's a valid recovery, we can trust rec_len. */ - if (off != rec_off) { - rec.rec_len = tdb1_dead_space(tdb, off) - - sizeof(rec); - } - /* Fall through */ - case TDB1_DEAD_MAGIC: - tally1_add(&dead, rec.rec_len); - break; - default: - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "Unexpected record magic 0x%x" - " at offset %d", - rec.magic, off); - goto unlock; - } - } - if (unc > 1) - tally1_add(&uncoal, unc - 1); - - for (off = 0; off < tdb->tdb1.header.hash_size; off++) - tally1_add(&hash, get_hash_length(tdb, off)); - - /* 20 is max length of a %zu. 
*/ - len = strlen(SUMMARY_FORMAT1) + 35*20 + 1; - ret = (char *)malloc(len); - if (!ret) - goto unlock; - - snprintf(ret, len, SUMMARY_FORMAT1, - (tdb1_len_t)tdb->file->map_size, keys.total+data.total, - keys.num, - keys.min, tally1_mean(&keys), keys.max, - data.min, tally1_mean(&data), data.max, - extra.min, tally1_mean(&extra), extra.max, - dead.num, - dead.min, tally1_mean(&dead), dead.max, - freet.num, - freet.min, tally1_mean(&freet), freet.max, - hash.num, - hash.min, tally1_mean(&hash), hash.max, - uncoal.total, - uncoal.min, tally1_mean(&uncoal), uncoal.max, - keys.total * 100.0 / tdb->file->map_size, - data.total * 100.0 / tdb->file->map_size, - extra.total * 100.0 / tdb->file->map_size, - freet.total * 100.0 / tdb->file->map_size, - dead.total * 100.0 / tdb->file->map_size, - (keys.num + freet.num + dead.num) - * (sizeof(struct tdb1_record) + sizeof(uint32_t)) - * 100.0 / tdb->file->map_size, - tdb->tdb1.header.hash_size * sizeof(tdb1_off_t) - * 100.0 / (tdb1_len_t)tdb->file->map_size); - -unlock: - if (locked) { - tdb_unlockall_read(tdb); - } - return ret; -} diff --git a/ccan/tdb2/tdb1_tdb.c b/ccan/tdb2/tdb1_tdb.c deleted file mode 100644 index a220f471..00000000 --- a/ccan/tdb2/tdb1_tdb.c +++ /dev/null @@ -1,829 +0,0 @@ - /* - Unix SMB/CIFS implementation. - - trivial database library - - Copyright (C) Andrew Tridgell 1999-2005 - Copyright (C) Paul `Rusty' Russell 2000 - Copyright (C) Jeremy Allison 2000-2003 - - ** NOTE! The following LGPL license applies to the tdb - ** library. This does NOT imply that all of Samba is released - ** under the LGPL - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ - -#include "tdb1_private.h" -#include - -/* - non-blocking increment of the tdb sequence number if the tdb has been opened using - the TDB_SEQNUM flag -*/ -void tdb1_increment_seqnum_nonblock(struct tdb_context *tdb) -{ - tdb1_off_t seqnum=0; - - if (!(tdb->flags & TDB_SEQNUM)) { - return; - } - - /* we ignore errors from this, as we have no sane way of - dealing with them. - */ - tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum); - seqnum++; - tdb1_ofs_write(tdb, TDB1_SEQNUM_OFS, &seqnum); -} - -/* - increment the tdb sequence number if the tdb has been opened using - the TDB_SEQNUM flag -*/ -static void tdb1_increment_seqnum(struct tdb_context *tdb) -{ - if (!(tdb->flags & TDB_SEQNUM)) { - return; - } - - if (tdb1_nest_lock(tdb, TDB1_SEQNUM_OFS, F_WRLCK, - TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) { - return; - } - - tdb1_increment_seqnum_nonblock(tdb); - - tdb1_nest_unlock(tdb, TDB1_SEQNUM_OFS, F_WRLCK); -} - -static enum TDB_ERROR tdb1_key_compare(TDB_DATA key, TDB_DATA data, - void *matches_) -{ - bool *matches = matches_; - *matches = (memcmp(data.dptr, key.dptr, data.dsize) == 0); - return TDB_SUCCESS; -} - -/* Returns 0 on fail; last_error will be TDB_ERR_NOEXIST if it simply - * wasn't there, otherwise a real error. 
- * On success, return offset of record, and fills in rec */ -static tdb1_off_t tdb1_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, - struct tdb1_record *r) -{ - tdb1_off_t rec_ptr; - - /* read in the hash top */ - if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1) - return 0; - - /* keep looking until we find the right record */ - while (rec_ptr) { - if (tdb1_rec_read(tdb, rec_ptr, r) == -1) - return 0; - - tdb->stats.compares++; - if (TDB1_DEAD(r)) { - tdb->stats.compare_wrong_bucket++; - } else if (key.dsize != r->key_len) { - tdb->stats.compare_wrong_keylen++; - } else if (hash != r->full_hash) { - tdb->stats.compare_wrong_rechash++; - } else { - enum TDB_ERROR ecode; - bool matches; - ecode = tdb1_parse_data(tdb, key, rec_ptr + sizeof(*r), - r->key_len, tdb1_key_compare, - &matches); - - if (ecode != TDB_SUCCESS) { - tdb->last_error = ecode; - return 0; - } - - if (!matches) { - tdb->stats.compare_wrong_keycmp++; - } else { - return rec_ptr; - } - } - /* detect tight infinite loop */ - if (rec_ptr == r->next) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "tdb1_find: loop detected."); - return 0; - } - rec_ptr = r->next; - } - tdb->last_error = TDB_ERR_NOEXIST; - return 0; -} - -/* As tdb1_find, but if you succeed, keep the lock */ -tdb1_off_t tdb1_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype, - struct tdb1_record *rec) -{ - uint32_t rec_ptr; - - if (tdb1_lock(tdb, TDB1_BUCKET(hash), locktype) == -1) - return 0; - if (!(rec_ptr = tdb1_find(tdb, key, hash, rec))) - tdb1_unlock(tdb, TDB1_BUCKET(hash), locktype); - return rec_ptr; -} - -static TDB_DATA _tdb1_fetch(struct tdb_context *tdb, TDB_DATA key); - -/* update an entry in place - this only works if the new data size - is <= the old data size and the key exists. - on failure return -1. -*/ -static int tdb1_update_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf) -{ - struct tdb1_record rec; - tdb1_off_t rec_ptr; - - /* find entry */ - if (!(rec_ptr = tdb1_find(tdb, key, hash, &rec))) - return -1; - - /* it could be an exact duplicate of what is there - this is - * surprisingly common (eg. with a ldb re-index). */ - if (rec.key_len == key.dsize && - rec.data_len == dbuf.dsize && - rec.full_hash == hash) { - TDB_DATA data = _tdb1_fetch(tdb, key); - if (data.dsize == dbuf.dsize && - memcmp(data.dptr, dbuf.dptr, data.dsize) == 0) { - if (data.dptr) { - free(data.dptr); - } - return 0; - } - if (data.dptr) { - free(data.dptr); - } - } - - /* must be long enough key, data and tailer */ - if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb1_off_t)) { - tdb->last_error = TDB_SUCCESS; /* Not really an error */ - return -1; - } - - if (tdb->tdb1.io->tdb1_write(tdb, rec_ptr + sizeof(rec) + rec.key_len, - dbuf.dptr, dbuf.dsize) == -1) - return -1; - - if (dbuf.dsize != rec.data_len) { - /* update size */ - rec.data_len = dbuf.dsize; - return tdb1_rec_write(tdb, rec_ptr, &rec); - } - - return 0; -} - -/* find an entry in the database given a key */ -/* If an entry doesn't exist tdb1_err will be set to - * TDB_ERR_NOEXIST. 
If a key has no data attached - * then the TDB_DATA will have zero length but - * a non-zero pointer - */ -static TDB_DATA _tdb1_fetch(struct tdb_context *tdb, TDB_DATA key) -{ - tdb1_off_t rec_ptr; - struct tdb1_record rec; - TDB_DATA ret; - uint32_t hash; - - /* find which hash bucket it is in */ - hash = tdb_hash(tdb, key.dptr, key.dsize); - if (!(rec_ptr = tdb1_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) { - ret.dptr = NULL; - ret.dsize = 0; - return ret; - } - - ret.dptr = tdb1_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, - rec.data_len); - ret.dsize = rec.data_len; - tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK); - return ret; -} - -enum TDB_ERROR tdb1_fetch(struct tdb_context *tdb, TDB_DATA key, TDB_DATA *data) -{ - *data = _tdb1_fetch(tdb, key); - if (data->dptr == NULL) - return tdb->last_error; - return TDB_SUCCESS; -} - -enum TDB_ERROR tdb1_parse_record(struct tdb_context *tdb, TDB_DATA key, - enum TDB_ERROR (*parser)(TDB_DATA key, - TDB_DATA data, - void *private_data), - void *private_data) -{ - tdb1_off_t rec_ptr; - struct tdb1_record rec; - enum TDB_ERROR ret; - uint32_t hash; - - /* find which hash bucket it is in */ - hash = tdb_hash(tdb, key.dptr, key.dsize); - - if (!(rec_ptr = tdb1_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) { - return tdb->last_error; - } - - ret = tdb1_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len, - rec.data_len, parser, private_data); - - tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK); - - return ret; -} - -/* check if an entry in the database exists - - note that 1 is returned if the key is found and 0 is returned if not found - this doesn't match the conventions in the rest of this module, but is - compatible with gdbm -*/ -static int tdb1_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash) -{ - struct tdb1_record rec; - - if (tdb1_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0) - return 0; - tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK); - return 1; -} - -int tdb1_exists(struct tdb_context *tdb, TDB_DATA key) -{ - uint32_t hash = tdb_hash(tdb, key.dptr, key.dsize); - int ret; - - assert(tdb->flags & TDB_VERSION1); - ret = tdb1_exists_hash(tdb, key, hash); - return ret; -} - -/* actually delete an entry in the database given the offset */ -int tdb1_do_delete(struct tdb_context *tdb, tdb1_off_t rec_ptr, struct tdb1_record *rec) -{ - tdb1_off_t last_ptr, i; - struct tdb1_record lastrec; - - if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) return -1; - - if (((tdb->tdb1.traverse_write != 0) && (!TDB1_DEAD(rec))) || - tdb1_write_lock_record(tdb, rec_ptr) == -1) { - /* Someone traversing here: mark it as dead */ - rec->magic = TDB1_DEAD_MAGIC; - return tdb1_rec_write(tdb, rec_ptr, rec); - } - if (tdb1_write_unlock_record(tdb, rec_ptr) != 0) - return -1; - - /* find previous record in hash chain */ - if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(rec->full_hash), &i) == -1) - return -1; - for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next) - if (tdb1_rec_read(tdb, i, &lastrec) == -1) - return -1; - - /* unlink it: next ptr is at start of record. 
*/ - if (last_ptr == 0) - last_ptr = TDB1_HASH_TOP(rec->full_hash); - if (tdb1_ofs_write(tdb, last_ptr, &rec->next) == -1) - return -1; - - /* recover the space */ - if (tdb1_free(tdb, rec_ptr, rec) == -1) - return -1; - return 0; -} - -static int tdb1_count_dead(struct tdb_context *tdb, uint32_t hash) -{ - int res = 0; - tdb1_off_t rec_ptr; - struct tdb1_record rec; - - /* read in the hash top */ - if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1) - return 0; - - while (rec_ptr) { - if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1) - return 0; - - if (rec.magic == TDB1_DEAD_MAGIC) { - res += 1; - } - rec_ptr = rec.next; - } - return res; -} - -/* - * Purge all DEAD records from a hash chain - */ -static int tdb1_purge_dead(struct tdb_context *tdb, uint32_t hash) -{ - int res = -1; - struct tdb1_record rec; - tdb1_off_t rec_ptr; - - if (tdb1_lock(tdb, -1, F_WRLCK) == -1) { - return -1; - } - - /* read in the hash top */ - if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1) - goto fail; - - while (rec_ptr) { - tdb1_off_t next; - - if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1) { - goto fail; - } - - next = rec.next; - - if (rec.magic == TDB1_DEAD_MAGIC - && tdb1_do_delete(tdb, rec_ptr, &rec) == -1) { - goto fail; - } - rec_ptr = next; - } - res = 0; - fail: - tdb1_unlock(tdb, -1, F_WRLCK); - return res; -} - -/* delete an entry in the database given a key */ -static int tdb1_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash) -{ - tdb1_off_t rec_ptr; - struct tdb1_record rec; - int ret; - - if (tdb->tdb1.max_dead_records != 0) { - - /* - * Allow for some dead records per hash chain, mainly for - * tdb's with a very high create/delete rate like locking.tdb. - */ - - if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1) - return -1; - - if (tdb1_count_dead(tdb, hash) >= tdb->tdb1.max_dead_records) { - /* - * Don't let the per-chain freelist grow too large, - * delete all existing dead records - */ - tdb1_purge_dead(tdb, hash); - } - - if (!(rec_ptr = tdb1_find(tdb, key, hash, &rec))) { - tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK); - return -1; - } - - /* - * Just mark the record as dead. 
- */ - rec.magic = TDB1_DEAD_MAGIC; - ret = tdb1_rec_write(tdb, rec_ptr, &rec); - } - else { - if (!(rec_ptr = tdb1_find_lock_hash(tdb, key, hash, F_WRLCK, - &rec))) - return -1; - - ret = tdb1_do_delete(tdb, rec_ptr, &rec); - } - - if (ret == 0) { - tdb1_increment_seqnum(tdb); - } - - if (tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_WRLCK) != 0) - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_delete: WARNING tdb1_unlock failed!"); - return ret; -} - -int tdb1_delete(struct tdb_context *tdb, TDB_DATA key) -{ - uint32_t hash = tdb_hash(tdb, key.dptr, key.dsize); - int ret; - - assert(tdb->flags & TDB_VERSION1); - ret = tdb1_delete_hash(tdb, key, hash); - return ret; -} - -/* - * See if we have a dead record around with enough space - */ -static tdb1_off_t tdb1_find_dead(struct tdb_context *tdb, uint32_t hash, - struct tdb1_record *r, tdb1_len_t length) -{ - tdb1_off_t rec_ptr; - - /* read in the hash top */ - if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1) - return 0; - - /* keep looking until we find the right record */ - while (rec_ptr) { - if (tdb1_rec_read(tdb, rec_ptr, r) == -1) - return 0; - - if (TDB1_DEAD(r) && r->rec_len >= length) { - /* - * First fit for simple coding, TODO: change to best - * fit - */ - return rec_ptr; - } - rec_ptr = r->next; - } - return 0; -} - -static int _tdb1_store(struct tdb_context *tdb, TDB_DATA key, - TDB_DATA dbuf, int flag, uint32_t hash) -{ - struct tdb1_record rec; - tdb1_off_t rec_ptr; - int ret = -1; - - /* check for it existing, on insert. */ - if (flag == TDB_INSERT) { - if (tdb1_exists_hash(tdb, key, hash)) { - tdb->last_error = TDB_ERR_EXISTS; - goto fail; - } - if (tdb->last_error != TDB_ERR_NOEXIST) { - goto fail; - } - } else { - /* first try in-place update, on modify or replace. */ - if (tdb1_update_hash(tdb, key, hash, dbuf) == 0) { - goto done; - } - if (tdb->last_error != TDB_SUCCESS) { - if (tdb->last_error != TDB_ERR_NOEXIST) { - goto fail; - } - if (flag == TDB_MODIFY) { - /* if the record doesn't exist and we are in TDB1_MODIFY mode then - we should fail the store */ - goto fail; - } - } - } - /* reset the error code potentially set by the tdb1_update() */ - tdb->last_error = TDB_SUCCESS; - - /* delete any existing record - if it doesn't exist we don't - care. Doing this first reduces fragmentation, and avoids - coalescing with `allocated' block before it's updated. */ - if (flag != TDB_INSERT) - tdb1_delete_hash(tdb, key, hash); - - if (tdb->tdb1.max_dead_records != 0) { - /* - * Allow for some dead records per hash chain, look if we can - * find one that can hold the new record. We need enough space - * for key, data and tailer. If we find one, we don't have to - * consult the central freelist. - */ - rec_ptr = tdb1_find_dead( - tdb, hash, &rec, - key.dsize + dbuf.dsize + sizeof(tdb1_off_t)); - - if (rec_ptr != 0) { - rec.key_len = key.dsize; - rec.data_len = dbuf.dsize; - rec.full_hash = hash; - rec.magic = TDB1_MAGIC; - if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1 - || tdb->tdb1.io->tdb1_write( - tdb, rec_ptr + sizeof(rec), - key.dptr, key.dsize) == -1 - || tdb->tdb1.io->tdb1_write( - tdb, rec_ptr + sizeof(rec) + key.dsize, - dbuf.dptr, dbuf.dsize) == -1) { - goto fail; - } - goto done; - } - } - - /* - * We have to allocate some space from the freelist, so this means we - * have to lock it. Use the chance to purge all the DEAD records from - * the hash chain under the freelist lock. 
- */ - - if (tdb1_lock(tdb, -1, F_WRLCK) == -1) { - goto fail; - } - - if ((tdb->tdb1.max_dead_records != 0) - && (tdb1_purge_dead(tdb, hash) == -1)) { - tdb1_unlock(tdb, -1, F_WRLCK); - goto fail; - } - - /* we have to allocate some space */ - rec_ptr = tdb1_allocate(tdb, key.dsize + dbuf.dsize, &rec); - - tdb1_unlock(tdb, -1, F_WRLCK); - - if (rec_ptr == 0) { - goto fail; - } - - /* Read hash top into next ptr */ - if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec.next) == -1) - goto fail; - - rec.key_len = key.dsize; - rec.data_len = dbuf.dsize; - rec.full_hash = hash; - rec.magic = TDB1_MAGIC; - - /* write out and point the top of the hash chain at it */ - if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1 - || tdb->tdb1.io->tdb1_write(tdb, rec_ptr + sizeof(rec), - key.dptr, key.dsize) == -1 - || tdb->tdb1.io->tdb1_write(tdb, rec_ptr + sizeof(rec) + key.dsize, - dbuf.dptr, dbuf.dsize) == -1 - || tdb1_ofs_write(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1) { - /* Need to tdb1_unallocate() here */ - goto fail; - } - - done: - ret = 0; - fail: - if (ret == 0) { - tdb1_increment_seqnum(tdb); - } - return ret; -} - -/* store an element in the database, replacing any existing element - with the same key - - return 0 on success, -1 on failure -*/ -int tdb1_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag) -{ - uint32_t hash; - int ret; - - assert(tdb->flags & TDB_VERSION1); - - if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_RDONLY, - TDB_LOG_USE_ERROR, - "tdb_store: read-only tdb"); - return -1; - } - - /* find which hash bucket it is in */ - hash = tdb_hash(tdb, key.dptr, key.dsize); - if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1) - return -1; - - ret = _tdb1_store(tdb, key, dbuf, flag, hash); - tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK); - return ret; -} - -/* Append to an entry. Create if not exist. */ -int tdb1_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf) -{ - uint32_t hash; - TDB_DATA dbuf; - int ret = -1; - - assert(tdb->flags & TDB_VERSION1); - - /* find which hash bucket it is in */ - hash = tdb_hash(tdb, key.dptr, key.dsize); - if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1) - return -1; - - dbuf = _tdb1_fetch(tdb, key); - - if (dbuf.dptr == NULL) { - dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize); - } else { - unsigned int new_len = dbuf.dsize + new_dbuf.dsize; - unsigned char *new_dptr; - - /* realloc '0' is special: don't do that. */ - if (new_len == 0) - new_len = 1; - new_dptr = (unsigned char *)realloc(dbuf.dptr, new_len); - if (new_dptr == NULL) { - free(dbuf.dptr); - } - dbuf.dptr = new_dptr; - } - - if (dbuf.dptr == NULL) { - tdb->last_error = TDB_ERR_OOM; - goto failed; - } - - memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize); - dbuf.dsize += new_dbuf.dsize; - - ret = _tdb1_store(tdb, key, dbuf, 0, hash); - -failed: - tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK); - SAFE_FREE(dbuf.dptr); - return ret; -} - - -/* - get the tdb sequence number. Only makes sense if the writers opened - with TDB1_SEQNUM set. Note that this sequence number will wrap quite - quickly, so it should only be used for a 'has something changed' - test, not for code that relies on the count of the number of changes - made. If you want a counter then use a tdb record. - - The aim of this sequence number is to allow for a very lightweight - test of a possible tdb change. 
-*/ -int tdb1_get_seqnum(struct tdb_context *tdb) -{ - tdb1_off_t seqnum=0; - - tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum); - return seqnum; -} - - -/* - add a region of the file to the freelist. Length is the size of the region in bytes, - which includes the free list header that needs to be added - */ -static int tdb1_free_region(struct tdb_context *tdb, tdb1_off_t offset, ssize_t length) -{ - struct tdb1_record rec; - if (length <= sizeof(rec)) { - /* the region is not worth adding */ - return 0; - } - if (length + offset > tdb->file->map_size) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb1_free_region: adding region beyond" - " end of file"); - return -1; - } - memset(&rec,'\0',sizeof(rec)); - rec.rec_len = length - sizeof(rec); - if (tdb1_free(tdb, offset, &rec) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_free_region: failed to add free record"); - return -1; - } - return 0; -} - -/* - wipe the entire database, deleting all records. This can be done - very fast by using a allrecord lock. The entire data portion of the - file becomes a single entry in the freelist. - - This code carefully steps around the recovery area, leaving it alone - */ -int tdb1_wipe_all(struct tdb_context *tdb) -{ - int i; - tdb1_off_t offset = 0; - ssize_t data_len; - tdb1_off_t recovery_head; - tdb1_len_t recovery_size = 0; - - if (tdb_lockall(tdb) != TDB_SUCCESS) { - return -1; - } - - - /* see if the tdb has a recovery area, and remember its size - if so. We don't want to lose this as otherwise each - tdb1_wipe_all() in a transaction will increase the size of - the tdb by the size of the recovery area */ - if (tdb1_ofs_read(tdb, TDB1_RECOVERY_HEAD, &recovery_head) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_wipe_all: failed to read recovery head"); - goto failed; - } - - if (recovery_head != 0) { - struct tdb1_record rec; - if (tdb->tdb1.io->tdb1_read(tdb, recovery_head, &rec, sizeof(rec), TDB1_DOCONV()) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_wipe_all: failed to read recovery record"); - return -1; - } - recovery_size = rec.rec_len + sizeof(rec); - } - - /* wipe the hashes */ - for (i=0;itdb1.header.hash_size;i++) { - if (tdb1_ofs_write(tdb, TDB1_HASH_TOP(i), &offset) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_wipe_all: failed to write hash %d", i); - goto failed; - } - } - - /* wipe the freelist */ - if (tdb1_ofs_write(tdb, TDB1_FREELIST_TOP, &offset) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_wipe_all: failed to write freelist"); - goto failed; - } - - /* add all the rest of the file to the freelist, possibly leaving a gap - for the recovery area */ - if (recovery_size == 0) { - /* the simple case - the whole file can be used as a freelist */ - data_len = (tdb->file->map_size - TDB1_DATA_START(tdb->tdb1.header.hash_size)); - if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) { - goto failed; - } - } else { - /* we need to add two freelist entries - one on either - side of the recovery area - - Note that we cannot shift the recovery area during - this operation. 
Only the transaction.c code may - move the recovery area or we risk subtle data - corruption - */ - data_len = (recovery_head - TDB1_DATA_START(tdb->tdb1.header.hash_size)); - if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) { - goto failed; - } - /* and the 2nd free list entry after the recovery area - if any */ - data_len = tdb->file->map_size - (recovery_head+recovery_size); - if (tdb1_free_region(tdb, recovery_head+recovery_size, data_len) != 0) { - goto failed; - } - } - - tdb1_increment_seqnum_nonblock(tdb); - tdb_unlockall(tdb); - return 0; - -failed: - tdb_unlockall(tdb); - return -1; -} - -/* Even on files, we can get partial writes due to signals. */ -bool tdb1_write_all(int fd, const void *buf, size_t count) -{ - while (count) { - ssize_t ret; - ret = write(fd, buf, count); - if (ret < 0) - return false; - buf = (const char *)buf + ret; - count -= ret; - } - return true; -} diff --git a/ccan/tdb2/tdb1_transaction.c b/ccan/tdb2/tdb1_transaction.c deleted file mode 100644 index 9cb95235..00000000 --- a/ccan/tdb2/tdb1_transaction.c +++ /dev/null @@ -1,1339 +0,0 @@ - /* - Unix SMB/CIFS implementation. - - trivial database library - - Copyright (C) Andrew Tridgell 2005 - - ** NOTE! The following LGPL license applies to the tdb - ** library. This does NOT imply that all of Samba is released - ** under the LGPL - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ - -#include "tdb1_private.h" - -/* - transaction design: - - - only allow a single transaction at a time per database. This makes - using the transaction API simpler, as otherwise the caller would - have to cope with temporary failures in transactions that conflict - with other current transactions - - - keep the transaction recovery information in the same file as the - database, using a special 'transaction recovery' record pointed at - by the header. This removes the need for extra journal files as - used by some other databases - - - dynamically allocated the transaction recover record, re-using it - for subsequent transactions. If a larger record is needed then - tdb1_free() the old record to place it on the normal tdb freelist - before allocating the new record - - - during transactions, keep a linked list of writes all that have - been performed by intercepting all tdb1_write() calls. The hooked - transaction versions of tdb1_read() and tdb1_write() check this - linked list and try to use the elements of the list in preference - to the real database. - - - don't allow any locks to be held when a transaction starts, - otherwise we can end up with deadlock (plus lack of lock nesting - in posix locks would mean the lock is lost) - - - if the caller gains a lock during the transaction but doesn't - release it then fail the commit - - - allow for nested calls to tdb1_transaction_start(), re-using the - existing transaction record. 
If the inner transaction is cancelled - then a subsequent commit will fail - - - keep a mirrored copy of the tdb hash chain heads to allow for the - fast hash heads scan on traverse, updating the mirrored copy in - the transaction version of tdb1_write - - - allow callers to mix transaction and non-transaction use of tdb, - although once a transaction is started then an exclusive lock is - gained until the transaction is committed or cancelled - - - the commit stategy involves first saving away all modified data - into a linearised buffer in the transaction recovery area, then - marking the transaction recovery area with a magic value to - indicate a valid recovery record. In total 4 fsync/msync calls are - needed per commit to prevent race conditions. It might be possible - to reduce this to 3 or even 2 with some more work. - - - check for a valid recovery record on open of the tdb, while the - open lock is held. Automatically recover from the transaction - recovery area if needed, then continue with the open as - usual. This allows for smooth crash recovery with no administrator - intervention. - - - if TDB_NOSYNC is passed to flags in tdb1_open then transactions are - still available, but no transaction recovery area is used and no - fsync/msync calls are made. - - - if TDB_ALLOW_NESTING is passed to flags in tdb open, or added using - tdb1_add_flags() transaction nesting is enabled. - The default is that transaction nesting is NOT allowed. - - Beware. when transactions are nested a transaction successfully - completed with tdb1_transaction_commit() can be silently unrolled later. -*/ - - -/* - hold the context of any current transaction -*/ -struct tdb1_transaction { - /* we keep a mirrored copy of the tdb hash heads here so - tdb1_next_hash_chain() can operate efficiently */ - uint32_t *hash_heads; - - /* the original io methods - used to do IOs to the real db */ - const struct tdb1_methods *io_methods; - - /* the list of transaction blocks. When a block is first - written to, it gets created in this list */ - uint8_t **blocks; - uint32_t num_blocks; - uint32_t block_size; /* bytes in each block */ - uint32_t last_block_size; /* number of valid bytes in the last block */ - - /* non-zero when an internal transaction error has - occurred. All write operations will then fail until the - transaction is ended */ - int transaction_error; - - /* when inside a transaction we need to keep track of any - nested tdb1_transaction_start() calls, as these are allowed, - but don't create a new transaction */ - int nesting; - - /* set when a prepare has already occurred */ - bool prepared; - tdb1_off_t magic_offset; - - /* old file size before transaction */ - tdb1_len_t old_map_size; - - /* did we expand in this transaction */ - bool expanded; -}; - - -/* - read while in a transaction. 
We need to check first if the data is in our list - of transaction elements, then if not do a real read -*/ -static int transaction1_read(struct tdb_context *tdb, tdb1_off_t off, void *buf, - tdb1_len_t len, int cv) -{ - uint32_t blk; - - /* break it down into block sized ops */ - while (len + (off % tdb->tdb1.transaction->block_size) > tdb->tdb1.transaction->block_size) { - tdb1_len_t len2 = tdb->tdb1.transaction->block_size - (off % tdb->tdb1.transaction->block_size); - if (transaction1_read(tdb, off, buf, len2, cv) != 0) { - return -1; - } - len -= len2; - off += len2; - buf = (void *)(len2 + (char *)buf); - } - - if (len == 0) { - return 0; - } - - blk = off / tdb->tdb1.transaction->block_size; - - /* see if we have it in the block list */ - if (tdb->tdb1.transaction->num_blocks <= blk || - tdb->tdb1.transaction->blocks[blk] == NULL) { - /* nope, do a real read */ - if (tdb->tdb1.transaction->io_methods->tdb1_read(tdb, off, buf, len, cv) != 0) { - goto fail; - } - return 0; - } - - /* it is in the block list. Now check for the last block */ - if (blk == tdb->tdb1.transaction->num_blocks-1) { - if (len > tdb->tdb1.transaction->last_block_size) { - goto fail; - } - } - - /* now copy it out of this block */ - memcpy(buf, tdb->tdb1.transaction->blocks[blk] + (off % tdb->tdb1.transaction->block_size), len); - if (cv) { - tdb1_convert(buf, len); - } - return 0; - -fail: - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "transaction_read: failed at off=%d len=%d", - off, len); - tdb->tdb1.transaction->transaction_error = 1; - return -1; -} - - -/* - write while in a transaction -*/ -static int transaction1_write(struct tdb_context *tdb, tdb1_off_t off, - const void *buf, tdb1_len_t len) -{ - uint32_t blk; - - /* Only a commit is allowed on a prepared transaction */ - if (tdb->tdb1.transaction->prepared) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, - "transaction_write: transaction already" - " prepared, write not allowed"); - tdb->tdb1.transaction->transaction_error = 1; - return -1; - } - - /* if the write is to a hash head, then update the transaction - hash heads */ - if (len == sizeof(tdb1_off_t) && off >= TDB1_FREELIST_TOP && - off < TDB1_FREELIST_TOP+TDB1_HASHTABLE_SIZE(tdb)) { - uint32_t chain = (off-TDB1_FREELIST_TOP) / sizeof(tdb1_off_t); - memcpy(&tdb->tdb1.transaction->hash_heads[chain], buf, len); - } - - /* break it up into block sized chunks */ - while (len + (off % tdb->tdb1.transaction->block_size) > tdb->tdb1.transaction->block_size) { - tdb1_len_t len2 = tdb->tdb1.transaction->block_size - (off % tdb->tdb1.transaction->block_size); - if (transaction1_write(tdb, off, buf, len2) != 0) { - return -1; - } - len -= len2; - off += len2; - if (buf != NULL) { - buf = (const void *)(len2 + (const char *)buf); - } - } - - if (len == 0) { - return 0; - } - - blk = off / tdb->tdb1.transaction->block_size; - off = off % tdb->tdb1.transaction->block_size; - - if (tdb->tdb1.transaction->num_blocks <= blk) { - uint8_t **new_blocks; - /* expand the blocks array */ - if (tdb->tdb1.transaction->blocks == NULL) { - new_blocks = (uint8_t **)malloc( - (blk+1)*sizeof(uint8_t *)); - } else { - new_blocks = (uint8_t **)realloc( - tdb->tdb1.transaction->blocks, - (blk+1)*sizeof(uint8_t *)); - } - if (new_blocks == NULL) { - tdb->last_error = TDB_ERR_OOM; - goto fail; - } - memset(&new_blocks[tdb->tdb1.transaction->num_blocks], 0, - (1+(blk - tdb->tdb1.transaction->num_blocks))*sizeof(uint8_t *)); - tdb->tdb1.transaction->blocks = new_blocks; - 
tdb->tdb1.transaction->num_blocks = blk+1; - tdb->tdb1.transaction->last_block_size = 0; - } - - /* allocate and fill a block? */ - if (tdb->tdb1.transaction->blocks[blk] == NULL) { - tdb->tdb1.transaction->blocks[blk] = (uint8_t *)calloc(tdb->tdb1.transaction->block_size, 1); - if (tdb->tdb1.transaction->blocks[blk] == NULL) { - tdb->last_error = TDB_ERR_OOM; - tdb->tdb1.transaction->transaction_error = 1; - return -1; - } - if (tdb->tdb1.transaction->old_map_size > blk * tdb->tdb1.transaction->block_size) { - tdb1_len_t len2 = tdb->tdb1.transaction->block_size; - if (len2 + (blk * tdb->tdb1.transaction->block_size) > tdb->tdb1.transaction->old_map_size) { - len2 = tdb->tdb1.transaction->old_map_size - (blk * tdb->tdb1.transaction->block_size); - } - if (tdb->tdb1.transaction->io_methods->tdb1_read(tdb, blk * tdb->tdb1.transaction->block_size, - tdb->tdb1.transaction->blocks[blk], - len2, 0) != 0) { - SAFE_FREE(tdb->tdb1.transaction->blocks[blk]); - tdb->last_error = TDB_ERR_IO; - goto fail; - } - if (blk == tdb->tdb1.transaction->num_blocks-1) { - tdb->tdb1.transaction->last_block_size = len2; - } - } - } - - /* overwrite part of an existing block */ - if (buf == NULL) { - memset(tdb->tdb1.transaction->blocks[blk] + off, 0, len); - } else { - memcpy(tdb->tdb1.transaction->blocks[blk] + off, buf, len); - } - if (blk == tdb->tdb1.transaction->num_blocks-1) { - if (len + off > tdb->tdb1.transaction->last_block_size) { - tdb->tdb1.transaction->last_block_size = len + off; - } - } - - return 0; - -fail: - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "transaction_write: failed at off=%d len=%d", - (blk*tdb->tdb1.transaction->block_size) + off, len); - tdb->tdb1.transaction->transaction_error = 1; - return -1; -} - - -/* - write while in a transaction - this varient never expands the transaction blocks, it only - updates existing blocks. 
This means it cannot change the recovery size -*/ -static int transaction1_write_existing(struct tdb_context *tdb, tdb1_off_t off, - const void *buf, tdb1_len_t len) -{ - uint32_t blk; - - /* break it up into block sized chunks */ - while (len + (off % tdb->tdb1.transaction->block_size) > tdb->tdb1.transaction->block_size) { - tdb1_len_t len2 = tdb->tdb1.transaction->block_size - (off % tdb->tdb1.transaction->block_size); - if (transaction1_write_existing(tdb, off, buf, len2) != 0) { - return -1; - } - len -= len2; - off += len2; - if (buf != NULL) { - buf = (const void *)(len2 + (const char *)buf); - } - } - - if (len == 0) { - return 0; - } - - blk = off / tdb->tdb1.transaction->block_size; - off = off % tdb->tdb1.transaction->block_size; - - if (tdb->tdb1.transaction->num_blocks <= blk || - tdb->tdb1.transaction->blocks[blk] == NULL) { - return 0; - } - - if (blk == tdb->tdb1.transaction->num_blocks-1 && - off + len > tdb->tdb1.transaction->last_block_size) { - if (off >= tdb->tdb1.transaction->last_block_size) { - return 0; - } - len = tdb->tdb1.transaction->last_block_size - off; - } - - /* overwrite part of an existing block */ - memcpy(tdb->tdb1.transaction->blocks[blk] + off, buf, len); - - return 0; -} - - -/* - accelerated hash chain head search, using the cached hash heads -*/ -static void transaction1_next_hash_chain(struct tdb_context *tdb, uint32_t *chain) -{ - uint32_t h = *chain; - for (;h < tdb->tdb1.header.hash_size;h++) { - /* the +1 takes account of the freelist */ - if (0 != tdb->tdb1.transaction->hash_heads[h+1]) { - break; - } - } - (*chain) = h; -} - -/* - out of bounds check during a transaction -*/ -static int transaction1_oob(struct tdb_context *tdb, tdb1_off_t off, tdb1_off_t len, int probe) -{ - if (off + len >= off && off + len <= tdb->file->map_size) { - return 0; - } - tdb->last_error = TDB_ERR_IO; - return -1; -} - -/* - transaction version of tdb1_expand(). -*/ -static int transaction1_expand_file(struct tdb_context *tdb, tdb1_off_t size, - tdb1_off_t addition) -{ - /* add a write to the transaction elements, so subsequent - reads see the zero data */ - if (transaction1_write(tdb, size, NULL, addition) != 0) { - return -1; - } - - tdb->tdb1.transaction->expanded = true; - - return 0; -} - -static const struct tdb1_methods transaction1_methods = { - transaction1_read, - transaction1_write, - transaction1_next_hash_chain, - transaction1_oob, - transaction1_expand_file, -}; - - -/* - start a tdb transaction. 
No token is returned, as only a single - transaction is allowed to be pending per tdb_context -*/ -static int _tdb1_transaction_start(struct tdb_context *tdb) -{ - /* some sanity checks */ - if (tdb->flags & TDB_INTERNAL) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb1_transaction_start:" - " cannot start a" - " transaction on an" - " internal tdb"); - return -1; - } - - if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_RDONLY, - TDB_LOG_USE_ERROR, - "tdb_transaction_start:" - " cannot start a" - " transaction on a " - " read-only tdb"); - return -1; - } - - /* cope with nested tdb1_transaction_start() calls */ - if (tdb->tdb1.transaction != NULL) { - if (!(tdb->flags & TDB_ALLOW_NESTING)) { - tdb->last_error - = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_transaction_start:" - " already inside transaction"); - return -1; - } - tdb->stats.transaction_nest++; - tdb->tdb1.transaction->nesting++; - return 0; - } - - if (tdb1_have_extra_locks(tdb)) { - /* the caller must not have any locks when starting a - transaction as otherwise we'll be screwed by lack - of nested locks in posix */ - tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, - "tdb1_transaction_start: cannot start a" - " transaction with locks held"); - return -1; - } - - if (tdb->tdb1.travlocks.next != NULL) { - /* you cannot use transactions inside a traverse (although you can use - traverse inside a transaction) as otherwise you can end up with - deadlock */ - tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, - "tdb1_transaction_start: cannot start a" - " transaction within a traverse"); - return -1; - } - - tdb->tdb1.transaction = (struct tdb1_transaction *) - calloc(sizeof(struct tdb1_transaction), 1); - if (tdb->tdb1.transaction == NULL) { - tdb->last_error = TDB_ERR_OOM; - return -1; - } - - /* a page at a time seems like a reasonable compromise between compactness and efficiency */ - tdb->tdb1.transaction->block_size = tdb->tdb1.page_size; - - /* get the transaction write lock. This is a blocking lock. As - discussed with Volker, there are a number of ways we could - make this async, which we will probably do in the future */ - if (tdb1_transaction_lock(tdb, F_WRLCK, TDB_LOCK_WAIT) == -1) { - SAFE_FREE(tdb->tdb1.transaction->blocks); - SAFE_FREE(tdb->tdb1.transaction); - return -1; - } - - /* get a read lock from the freelist to the end of file. 
This - is upgraded to a write lock during the commit */ - if (tdb1_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true) == -1) { - if (errno != EAGAIN && errno != EINTR) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_start:" - " failed to get hash locks"); - } - goto fail_allrecord_lock; - } - - /* setup a copy of the hash table heads so the hash scan in - traverse can be fast */ - tdb->tdb1.transaction->hash_heads = (uint32_t *) - calloc(tdb->tdb1.header.hash_size+1, sizeof(uint32_t)); - if (tdb->tdb1.transaction->hash_heads == NULL) { - tdb->last_error = TDB_ERR_OOM; - goto fail; - } - if (tdb->tdb1.io->tdb1_read(tdb, TDB1_FREELIST_TOP, tdb->tdb1.transaction->hash_heads, - TDB1_HASHTABLE_SIZE(tdb), 0) != 0) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_start: failed to read hash heads"); - goto fail; - } - - /* make sure we know about any file expansions already done by - anyone else */ - tdb->tdb1.io->tdb1_oob(tdb, tdb->file->map_size, 1, 1); - tdb->tdb1.transaction->old_map_size = tdb->file->map_size; - - /* finally hook the io methods, replacing them with - transaction specific methods */ - tdb->tdb1.transaction->io_methods = tdb->tdb1.io; - tdb->tdb1.io = &transaction1_methods; - - tdb->stats.transactions++; - return 0; - -fail: - tdb1_allrecord_unlock(tdb, F_RDLCK); -fail_allrecord_lock: - tdb1_transaction_unlock(tdb, F_WRLCK); - SAFE_FREE(tdb->tdb1.transaction->blocks); - SAFE_FREE(tdb->tdb1.transaction->hash_heads); - SAFE_FREE(tdb->tdb1.transaction); - return -1; -} - -int tdb1_transaction_start(struct tdb_context *tdb) -{ - return _tdb1_transaction_start(tdb); -} - -/* - sync to disk -*/ -static int transaction1_sync(struct tdb_context *tdb, tdb1_off_t offset, tdb1_len_t length) -{ - if (tdb->flags & TDB_NOSYNC) { - return 0; - } - -#if HAVE_FDATASYNC - if (fdatasync(tdb->file->fd) != 0) { -#else - if (fsync(tdb->file->fd) != 0) { -#endif - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb1_transaction: fsync failed"); - return -1; - } -#if HAVE_MMAP - if (tdb->file->map_ptr) { - tdb1_off_t moffset = offset & ~(tdb->tdb1.page_size-1); - if (msync(moffset + (char *)tdb->file->map_ptr, - length + (offset - moffset), MS_SYNC) != 0) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb1_transaction:" - " msync failed - %s", - strerror(errno)); - return -1; - } - } -#endif - return 0; -} - - -static int _tdb1_transaction_cancel(struct tdb_context *tdb) -{ - int i, ret = 0; - - if (tdb->tdb1.transaction == NULL) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, - "tdb1_transaction_cancel:" - " no transaction"); - return -1; - } - - if (tdb->tdb1.transaction->nesting != 0) { - tdb->tdb1.transaction->transaction_error = 1; - tdb->tdb1.transaction->nesting--; - return 0; - } - - tdb->file->map_size = tdb->tdb1.transaction->old_map_size; - - /* free all the transaction blocks */ - for (i=0;itdb1.transaction->num_blocks;i++) { - if (tdb->tdb1.transaction->blocks[i] != NULL) { - free(tdb->tdb1.transaction->blocks[i]); - } - } - SAFE_FREE(tdb->tdb1.transaction->blocks); - - if (tdb->tdb1.transaction->magic_offset) { - const struct tdb1_methods *methods = tdb->tdb1.transaction->io_methods; - const uint32_t invalid = TDB1_RECOVERY_INVALID_MAGIC; - - /* remove the recovery marker */ - if (methods->tdb1_write(tdb, tdb->tdb1.transaction->magic_offset, &invalid, 4) == -1 || - transaction1_sync(tdb, tdb->tdb1.transaction->magic_offset, 4) == -1) { - tdb_logerr(tdb, tdb->last_error, 
TDB_LOG_ERROR, - "tdb1_transaction_cancel: failed to" - " remove recovery magic"); - ret = -1; - } - } - - /* This also removes the OPEN_LOCK, if we have it. */ - tdb1_release_transaction_locks(tdb); - - /* restore the normal io methods */ - tdb->tdb1.io = tdb->tdb1.transaction->io_methods; - - SAFE_FREE(tdb->tdb1.transaction->hash_heads); - SAFE_FREE(tdb->tdb1.transaction); - - return ret; -} - -/* - cancel the current transaction -*/ -int tdb1_transaction_cancel(struct tdb_context *tdb) -{ - tdb->stats.transaction_cancel++; - return _tdb1_transaction_cancel(tdb); -} - -/* - work out how much space the linearised recovery data will consume -*/ -static tdb1_len_t tdb1_recovery_size(struct tdb_context *tdb) -{ - tdb1_len_t recovery_size = 0; - int i; - - recovery_size = sizeof(uint32_t); - for (i=0;itdb1.transaction->num_blocks;i++) { - if (i * tdb->tdb1.transaction->block_size >= tdb->tdb1.transaction->old_map_size) { - break; - } - if (tdb->tdb1.transaction->blocks[i] == NULL) { - continue; - } - recovery_size += 2*sizeof(tdb1_off_t); - if (i == tdb->tdb1.transaction->num_blocks-1) { - recovery_size += tdb->tdb1.transaction->last_block_size; - } else { - recovery_size += tdb->tdb1.transaction->block_size; - } - } - - return recovery_size; -} - -int tdb1_recovery_area(struct tdb_context *tdb, - const struct tdb1_methods *methods, - tdb1_off_t *recovery_offset, - struct tdb1_record *rec) -{ - if (tdb1_ofs_read(tdb, TDB1_RECOVERY_HEAD, recovery_offset) == -1) { - return -1; - } - - if (*recovery_offset == 0) { - rec->rec_len = 0; - return 0; - } - - if (methods->tdb1_read(tdb, *recovery_offset, rec, sizeof(*rec), - TDB1_DOCONV()) == -1) { - return -1; - } - - /* ignore invalid recovery regions: can happen in crash */ - if (rec->magic != TDB1_RECOVERY_MAGIC && - rec->magic != TDB1_RECOVERY_INVALID_MAGIC) { - *recovery_offset = 0; - rec->rec_len = 0; - } - return 0; -} - -/* - allocate the recovery area, or use an existing recovery area if it is - large enough -*/ -static int tdb1_recovery_allocate(struct tdb_context *tdb, - tdb1_len_t *recovery_size, - tdb1_off_t *recovery_offset, - tdb1_len_t *recovery_max_size) -{ - struct tdb1_record rec; - const struct tdb1_methods *methods = tdb->tdb1.transaction->io_methods; - tdb1_off_t recovery_head; - - if (tdb1_recovery_area(tdb, methods, &recovery_head, &rec) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_recovery_allocate:" - " failed to read recovery head"); - return -1; - } - - *recovery_size = tdb1_recovery_size(tdb); - - if (recovery_head != 0 && *recovery_size <= rec.rec_len) { - /* it fits in the existing area */ - *recovery_max_size = rec.rec_len; - *recovery_offset = recovery_head; - return 0; - } - - /* we need to free up the old recovery area, then allocate a - new one at the end of the file. 
Note that we cannot use - tdb1_allocate() to allocate the new one as that might return - us an area that is being currently used (as of the start of - the transaction) */ - if (recovery_head != 0) { - if (tdb1_free(tdb, recovery_head, &rec) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_recovery_allocate: failed to free" - " previous recovery area"); - return -1; - } - } - - /* the tdb1_free() call might have increased the recovery size */ - *recovery_size = tdb1_recovery_size(tdb); - - /* round up to a multiple of page size */ - *recovery_max_size = tdb1_expand_adjust(tdb->file->map_size, - *recovery_size, - tdb->tdb1.page_size) - - sizeof(rec); - - *recovery_offset = tdb->file->map_size; - recovery_head = *recovery_offset; - - if (methods->tdb1_expand_file(tdb, tdb->tdb1.transaction->old_map_size, - (tdb->file->map_size - tdb->tdb1.transaction->old_map_size) + - sizeof(rec) + *recovery_max_size) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_recovery_allocate:" - " failed to create recovery area"); - return -1; - } - tdb->stats.transaction_expand_file++; - - /* remap the file (if using mmap) */ - methods->tdb1_oob(tdb, tdb->file->map_size, 1, 1); - - /* we have to reset the old map size so that we don't try to expand the file - again in the transaction commit, which would destroy the recovery area */ - tdb->tdb1.transaction->old_map_size = tdb->file->map_size; - - /* write the recovery header offset and sync - we can sync without a race here - as the magic ptr in the recovery record has not been set */ - TDB1_CONV(recovery_head); - if (methods->tdb1_write(tdb, TDB1_RECOVERY_HEAD, - &recovery_head, sizeof(tdb1_off_t)) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_recovery_allocate:" - " failed to write recovery head"); - return -1; - } - if (transaction1_write_existing(tdb, TDB1_RECOVERY_HEAD, &recovery_head, sizeof(tdb1_off_t)) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_recovery_allocate:" - " failed to write recovery head"); - return -1; - } - - return 0; -} - - -/* - setup the recovery data that will be used on a crash during commit -*/ -static int transaction1_setup_recovery(struct tdb_context *tdb, - tdb1_off_t *magic_offset) -{ - tdb1_len_t recovery_size; - unsigned char *data, *p; - const struct tdb1_methods *methods = tdb->tdb1.transaction->io_methods; - struct tdb1_record *rec; - tdb1_off_t recovery_offset, recovery_max_size; - tdb1_off_t old_map_size = tdb->tdb1.transaction->old_map_size; - uint32_t magic, tailer; - int i; - - /* - check that the recovery area has enough space - */ - if (tdb1_recovery_allocate(tdb, &recovery_size, - &recovery_offset, &recovery_max_size) == -1) { - return -1; - } - - data = (unsigned char *)malloc(recovery_size + sizeof(*rec)); - if (data == NULL) { - tdb->last_error = TDB_ERR_OOM; - return -1; - } - - rec = (struct tdb1_record *)data; - memset(rec, 0, sizeof(*rec)); - - rec->magic = TDB1_RECOVERY_INVALID_MAGIC; - rec->data_len = recovery_size; - rec->rec_len = recovery_max_size; - rec->key_len = old_map_size; - TDB1_CONV(*rec); - - /* build the recovery data into a single blob to allow us to do a single - large write, which should be more efficient */ - p = data + sizeof(*rec); - for (i=0;itdb1.transaction->num_blocks;i++) { - tdb1_off_t offset; - tdb1_len_t length; - - if (tdb->tdb1.transaction->blocks[i] == NULL) { - continue; - } - - offset = i * tdb->tdb1.transaction->block_size; - length = tdb->tdb1.transaction->block_size; - if (i == 
tdb->tdb1.transaction->num_blocks-1) { - length = tdb->tdb1.transaction->last_block_size; - } - - if (offset >= old_map_size) { - continue; - } - if (offset + length > tdb->tdb1.transaction->old_map_size) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "tdb1_transaction_setup_recovery: transaction data over new region boundary"); - free(data); - return -1; - } - memcpy(p, &offset, 4); - memcpy(p+4, &length, 4); - if (TDB1_DOCONV()) { - tdb1_convert(p, 8); - } - /* the recovery area contains the old data, not the - new data, so we have to call the original tdb1_read - method to get it */ - if (methods->tdb1_read(tdb, offset, p + 8, length, 0) != 0) { - free(data); - tdb->last_error = TDB_ERR_IO; - return -1; - } - p += 8 + length; - } - - /* and the tailer */ - tailer = sizeof(*rec) + recovery_max_size; - memcpy(p, &tailer, 4); - if (TDB1_DOCONV()) { - tdb1_convert(p, 4); - } - - /* write the recovery data to the recovery area */ - if (methods->tdb1_write(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_setup_recovery:" - " failed to write recovery data"); - free(data); - return -1; - } - if (transaction1_write_existing(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_setup_recovery: failed to write" - " secondary recovery data"); - free(data); - return -1; - } - - /* as we don't have ordered writes, we have to sync the recovery - data before we update the magic to indicate that the recovery - data is present */ - if (transaction1_sync(tdb, recovery_offset, sizeof(*rec) + recovery_size) == -1) { - free(data); - return -1; - } - - free(data); - - magic = TDB1_RECOVERY_MAGIC; - TDB1_CONV(magic); - - *magic_offset = recovery_offset + offsetof(struct tdb1_record, magic); - - if (methods->tdb1_write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_setup_recovery:" - " failed to write recovery magic"); - return -1; - } - if (transaction1_write_existing(tdb, *magic_offset, &magic, sizeof(magic)) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_setup_recovery:" - " failed to write secondary recovery magic"); - return -1; - } - - /* ensure the recovery magic marker is on disk */ - if (transaction1_sync(tdb, *magic_offset, sizeof(magic)) == -1) { - return -1; - } - - return 0; -} - -static int _tdb1_transaction_prepare_commit(struct tdb_context *tdb) -{ - const struct tdb1_methods *methods; - - if (tdb->tdb1.transaction == NULL) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, - "tdb1_transaction_prepare_commit:" - " no transaction"); - return -1; - } - - if (tdb->tdb1.transaction->prepared) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, - "tdb1_transaction_prepare_commit:" - " transaction already prepared"); - _tdb1_transaction_cancel(tdb); - return -1; - } - - if (tdb->tdb1.transaction->transaction_error) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb1_transaction_prepare_commit:" - " transaction error pending"); - _tdb1_transaction_cancel(tdb); - return -1; - } - - - if (tdb->tdb1.transaction->nesting != 0) { - return 0; - } - - /* check for a null transaction */ - if (tdb->tdb1.transaction->blocks == NULL) { - return 0; - } - - methods = tdb->tdb1.transaction->io_methods; - - /* if there are any locks pending then the 
caller has not - nested their locks properly, so fail the transaction */ - if (tdb1_have_extra_locks(tdb)) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR, - "tdb1_transaction_prepare_commit:" - " locks pending on commit"); - _tdb1_transaction_cancel(tdb); - return -1; - } - - /* upgrade the main transaction lock region to a write lock */ - if (tdb1_allrecord_upgrade(tdb) == -1) { - if (errno != EAGAIN && errno != EINTR) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_prepare_commit:" - " failed to upgrade hash locks"); - } - return -1; - } - - /* get the open lock - this prevents new users attaching to the database - during the commit */ - if (tdb1_nest_lock(tdb, TDB1_OPEN_LOCK, F_WRLCK, TDB_LOCK_WAIT) == -1) { - if (errno != EAGAIN && errno != EINTR) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_prepare_commit:" - " failed to get open lock"); - } - return -1; - } - - if (!(tdb->flags & TDB_NOSYNC)) { - /* write the recovery data to the end of the file */ - if (transaction1_setup_recovery(tdb, &tdb->tdb1.transaction->magic_offset) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_prepare_commit:" - " failed to setup recovery data"); - return -1; - } - } - - tdb->tdb1.transaction->prepared = true; - - /* expand the file to the new size if needed */ - if (tdb->file->map_size != tdb->tdb1.transaction->old_map_size) { - if (methods->tdb1_expand_file(tdb, tdb->tdb1.transaction->old_map_size, - tdb->file->map_size - - tdb->tdb1.transaction->old_map_size) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_prepare_commit:" - " expansion failed"); - return -1; - } - tdb->stats.transaction_expand_file++; - tdb->file->map_size = tdb->tdb1.transaction->old_map_size; - methods->tdb1_oob(tdb, tdb->file->map_size, 1, 1); - } - - /* Keep the open lock until the actual commit */ - - return 0; -} - -/* - prepare to commit the current transaction -*/ -int tdb1_transaction_prepare_commit(struct tdb_context *tdb) -{ - return _tdb1_transaction_prepare_commit(tdb); -} - -/* A repack is worthwhile if the largest is less than half total free. 
*/ -static bool repack_worthwhile(struct tdb_context *tdb) -{ - tdb1_off_t ptr; - struct tdb1_record rec; - tdb1_len_t total = 0, largest = 0; - - if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP, &ptr) == -1) { - return false; - } - - while (ptr != 0 && tdb1_rec_free_read(tdb, ptr, &rec) == 0) { - total += rec.rec_len; - if (rec.rec_len > largest) { - largest = rec.rec_len; - } - ptr = rec.next; - } - - return total > largest * 2; -} - -/* - commit the current transaction -*/ -int tdb1_transaction_commit(struct tdb_context *tdb) -{ - const struct tdb1_methods *methods; - int i; - bool need_repack = false; - - if (tdb->tdb1.transaction == NULL) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, - "tdb1_transaction_commit:" - " no transaction"); - return -1; - } - - if (tdb->tdb1.transaction->transaction_error) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb1_transaction_commit:" - " transaction error pending"); - _tdb1_transaction_cancel(tdb); - return -1; - } - - - if (tdb->tdb1.transaction->nesting != 0) { - tdb->tdb1.transaction->nesting--; - return 0; - } - - /* check for a null transaction */ - if (tdb->tdb1.transaction->blocks == NULL) { - _tdb1_transaction_cancel(tdb); - return 0; - } - - if (!tdb->tdb1.transaction->prepared) { - int ret = _tdb1_transaction_prepare_commit(tdb); - if (ret) { - _tdb1_transaction_cancel(tdb); - return ret; - } - } - - methods = tdb->tdb1.transaction->io_methods; - - /* perform all the writes */ - for (i=0;itdb1.transaction->num_blocks;i++) { - tdb1_off_t offset; - tdb1_len_t length; - - if (tdb->tdb1.transaction->blocks[i] == NULL) { - continue; - } - - offset = i * tdb->tdb1.transaction->block_size; - length = tdb->tdb1.transaction->block_size; - if (i == tdb->tdb1.transaction->num_blocks-1) { - length = tdb->tdb1.transaction->last_block_size; - } - - if (methods->tdb1_write(tdb, offset, tdb->tdb1.transaction->blocks[i], length) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_commit:" - " write failed during commit"); - - /* we've overwritten part of the data and - possibly expanded the file, so we need to - run the crash recovery code */ - tdb->tdb1.io = methods; - tdb1_transaction_recover(tdb); - - _tdb1_transaction_cancel(tdb); - - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_commit: write failed"); - return -1; - } - SAFE_FREE(tdb->tdb1.transaction->blocks[i]); - } - - /* Do this before we drop lock or blocks. */ - if (tdb->tdb1.transaction->expanded) { - need_repack = repack_worthwhile(tdb); - } - - SAFE_FREE(tdb->tdb1.transaction->blocks); - tdb->tdb1.transaction->num_blocks = 0; - - /* ensure the new data is on disk */ - if (transaction1_sync(tdb, 0, tdb->file->map_size) == -1) { - return -1; - } - - /* - TODO: maybe write to some dummy hdr field, or write to magic - offset without mmap, before the last sync, instead of the - utime() call - */ - - /* on some systems (like Linux 2.6.x) changes via mmap/msync - don't change the mtime of the file, this means the file may - not be backed up (as tdb rounding to block sizes means that - file size changes are quite rare too). The following forces - mtime changes when a transaction completes */ -#if HAVE_UTIME - utime(tdb->name, NULL); -#endif - - /* use a transaction cancel to free memory and remove the - transaction locks */ - _tdb1_transaction_cancel(tdb); - - if (need_repack) { - if (tdb_repack(tdb) != 0) - return -1; - } - - return 0; -} - - -/* - recover from an aborted transaction. 
Must be called with exclusive - database write access already established (including the open - lock to prevent new processes attaching) -*/ -int tdb1_transaction_recover(struct tdb_context *tdb) -{ - tdb1_off_t recovery_head, recovery_eof; - unsigned char *data, *p; - uint32_t zero = 0; - struct tdb1_record rec; - - /* find the recovery area */ - if (tdb1_ofs_read(tdb, TDB1_RECOVERY_HEAD, &recovery_head) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_recover:" - " failed to read recovery head"); - return -1; - } - - if (recovery_head == 0) { - /* we have never allocated a recovery record */ - return 0; - } - - /* read the recovery record */ - if (tdb->tdb1.io->tdb1_read(tdb, recovery_head, &rec, - sizeof(rec), TDB1_DOCONV()) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_recover:" - " failed to read recovery record"); - return -1; - } - - if (rec.magic != TDB1_RECOVERY_MAGIC) { - /* there is no valid recovery data */ - return 0; - } - - if (tdb->flags & TDB_RDONLY) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb1_transaction_recover:" - " attempt to recover read only" - " database"); - return -1; - } - - recovery_eof = rec.key_len; - - data = (unsigned char *)malloc(rec.data_len); - if (data == NULL) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "tdb1_transaction_recover:" - " failed to allocate recovery data"); - return -1; - } - - /* read the full recovery data */ - if (tdb->tdb1.io->tdb1_read(tdb, recovery_head + sizeof(rec), data, - rec.data_len, 0) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_recover:" - " failed to read recovery data"); - return -1; - } - - /* recover the file data */ - p = data; - while (p+8 < data + rec.data_len) { - uint32_t ofs, len; - if (TDB1_DOCONV()) { - tdb1_convert(p, 8); - } - memcpy(&ofs, p, 4); - memcpy(&len, p+4, 4); - - if (tdb->tdb1.io->tdb1_write(tdb, ofs, p+8, len) == -1) { - free(data); - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_recover: failed to recover" - " %d bytes at offset %d", len, ofs); - return -1; - } - p += 8 + len; - } - - free(data); - - if (transaction1_sync(tdb, 0, tdb->file->map_size) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_recover: failed to sync recovery"); - return -1; - } - - /* if the recovery area is after the recovered eof then remove it */ - if (recovery_eof <= recovery_head) { - if (tdb1_ofs_write(tdb, TDB1_RECOVERY_HEAD, &zero) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_recover: failed to remove" - " recovery head"); - return -1; - } - } - - /* remove the recovery magic */ - if (tdb1_ofs_write(tdb, recovery_head + offsetof(struct tdb1_record, magic), - &zero) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_recover: failed to remove" - " recovery magic"); - return -1; - } - - if (transaction1_sync(tdb, 0, recovery_eof) == -1) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_transaction_recover:" - " failed to sync2 recovery"); - return -1; - } - - tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING, - "tdb1_transaction_recover: recovered %d byte database", - recovery_eof); - - /* all done */ - return 0; -} - -/* Any I/O failures we say "needs recovery". 
*/ -tdb_bool_err tdb1_needs_recovery(struct tdb_context *tdb) -{ - tdb1_off_t recovery_head; - struct tdb1_record rec; - - /* find the recovery area */ - if (tdb1_ofs_read(tdb, TDB1_RECOVERY_HEAD, &recovery_head) == -1) { - return TDB_ERR_TO_OFF(tdb->last_error); - } - - if (recovery_head == 0) { - /* we have never allocated a recovery record */ - return false; - } - - /* read the recovery record */ - if (tdb->tdb1.io->tdb1_read(tdb, recovery_head, &rec, - sizeof(rec), TDB1_DOCONV()) == -1) { - return TDB_ERR_TO_OFF(tdb->last_error); - } - - return (rec.magic == TDB1_RECOVERY_MAGIC); -} diff --git a/ccan/tdb2/tdb1_traverse.c b/ccan/tdb2/tdb1_traverse.c deleted file mode 100644 index d9d3649f..00000000 --- a/ccan/tdb2/tdb1_traverse.c +++ /dev/null @@ -1,373 +0,0 @@ - /* - Unix SMB/CIFS implementation. - - trivial database library - - Copyright (C) Andrew Tridgell 1999-2005 - Copyright (C) Paul `Rusty' Russell 2000 - Copyright (C) Jeremy Allison 2000-2003 - - ** NOTE! The following LGPL license applies to the tdb - ** library. This does NOT imply that all of Samba is released - ** under the LGPL - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ - -#include "tdb1_private.h" - -#define TDB1_NEXT_LOCK_ERR ((tdb1_off_t)-1) - -static TDB_DATA tdb1_null; - -/* Uses traverse lock: 0 = finish, TDB1_NEXT_LOCK_ERR = error, - other = record offset */ -static tdb1_off_t tdb1_next_lock(struct tdb_context *tdb, struct tdb1_traverse_lock *tlock, - struct tdb1_record *rec) -{ - int want_next = (tlock->off != 0); - - /* Lock each chain from the start one. */ - for (; tlock->hash < tdb->tdb1.header.hash_size; tlock->hash++) { - if (!tlock->off && tlock->hash != 0) { - /* this is an optimisation for the common case where - the hash chain is empty, which is particularly - common for the use of tdb with ldb, where large - hashes are used. In that case we spend most of our - time in tdb1_brlock(), locking empty hash chains. - - To avoid this, we do an unlocked pre-check to see - if the hash chain is empty before starting to look - inside it. If it is empty then we can avoid that - hash chain. If it isn't empty then we can't believe - the value we get back, as we read it without a - lock, so instead we get the lock and re-fetch the - value below. - - Notice that not doing this optimisation on the - first hash chain is critical. We must guarantee - that we have done at least one fcntl lock at the - start of a search to guarantee that memory is - coherent on SMP systems. If records are added by - others during the search then thats OK, and we - could possibly miss those with this trick, but we - could miss them anyway without this trick, so the - semantics don't change. - - With a non-indexed ldb search this trick gains us a - factor of around 80 in speed on a linux 2.6.x - system (testing using ldbtest). 
- */ - tdb->tdb1.io->next_hash_chain(tdb, &tlock->hash); - if (tlock->hash == tdb->tdb1.header.hash_size) { - continue; - } - } - - if (tdb1_lock(tdb, tlock->hash, tlock->lock_rw) == -1) - return TDB1_NEXT_LOCK_ERR; - - /* No previous record? Start at top of chain. */ - if (!tlock->off) { - if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(tlock->hash), - &tlock->off) == -1) - goto fail; - } else { - /* Otherwise unlock the previous record. */ - if (tdb1_unlock_record(tdb, tlock->off) != 0) - goto fail; - } - - if (want_next) { - /* We have offset of old record: grab next */ - if (tdb1_rec_read(tdb, tlock->off, rec) == -1) - goto fail; - tlock->off = rec->next; - } - - /* Iterate through chain */ - while( tlock->off) { - tdb1_off_t current; - if (tdb1_rec_read(tdb, tlock->off, rec) == -1) - goto fail; - - /* Detect infinite loops. From "Shlomi Yaakobovich" . */ - if (tlock->off == rec->next) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, - TDB_LOG_ERROR, - "tdb1_next_lock:" - " loop detected."); - goto fail; - } - - if (!TDB1_DEAD(rec)) { - /* Woohoo: we found one! */ - if (tdb1_lock_record(tdb, tlock->off) != 0) - goto fail; - return tlock->off; - } - - /* Try to clean dead ones from old traverses */ - current = tlock->off; - tlock->off = rec->next; - if (!((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) && - tdb1_do_delete(tdb, current, rec) != 0) - goto fail; - } - tdb1_unlock(tdb, tlock->hash, tlock->lock_rw); - want_next = 0; - } - /* We finished iteration without finding anything */ - tdb->last_error = TDB_SUCCESS; - return 0; - - fail: - tlock->off = 0; - if (tdb1_unlock(tdb, tlock->hash, tlock->lock_rw) != 0) - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_next_lock: On error unlock failed!"); - return TDB1_NEXT_LOCK_ERR; -} - -/* traverse the entire database - calling fn(tdb, key, data) on each element. - return -1 on error or the record count traversed - if fn is NULL then it is not called - a non-zero return value from fn() indicates that the traversal should stop - */ -static int tdb1_traverse_internal(struct tdb_context *tdb, - int (*fn)(struct tdb_context *, - TDB_DATA, TDB_DATA, void *), - void *private_data, - struct tdb1_traverse_lock *tl) -{ - TDB_DATA key, dbuf; - struct tdb1_record rec; - int ret = 0, count = 0; - tdb1_off_t off; - - /* This was in the initializaton, above, but the IRIX compiler - * did not like it. 
crh - */ - tl->next = tdb->tdb1.travlocks.next; - - /* fcntl locks don't stack: beware traverse inside traverse */ - tdb->tdb1.travlocks.next = tl; - - /* tdb1_next_lock places locks on the record returned, and its chain */ - while ((off = tdb1_next_lock(tdb, tl, &rec)) != 0) { - if (off == TDB1_NEXT_LOCK_ERR) { - ret = -1; - goto out; - } - count++; - /* now read the full record */ - key.dptr = tdb1_alloc_read(tdb, tl->off + sizeof(rec), - rec.key_len + rec.data_len); - if (!key.dptr) { - ret = -1; - if (tdb1_unlock(tdb, tl->hash, tl->lock_rw) != 0) - goto out; - if (tdb1_unlock_record(tdb, tl->off) != 0) - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_traverse: key.dptr == NULL and" - " unlock_record failed!"); - goto out; - } - key.dsize = rec.key_len; - dbuf.dptr = key.dptr + rec.key_len; - dbuf.dsize = rec.data_len; - - /* Drop chain lock, call out */ - if (tdb1_unlock(tdb, tl->hash, tl->lock_rw) != 0) { - ret = -1; - SAFE_FREE(key.dptr); - goto out; - } - if (fn && fn(tdb, key, dbuf, private_data)) { - /* They want us to terminate traversal */ - if (tdb1_unlock_record(tdb, tl->off) != 0) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_traverse:" - " unlock_record failed!"); - ret = -1; - } - SAFE_FREE(key.dptr); - goto out; - } - SAFE_FREE(key.dptr); - } -out: - tdb->tdb1.travlocks.next = tl->next; - if (ret < 0) - return -1; - else - return count; -} - - -/* - a read style traverse - only if db read only -*/ -static int tdb1_traverse_read(struct tdb_context *tdb, - int (*fn)(struct tdb_context *, - TDB_DATA, TDB_DATA, void *), - void *private_data) -{ - struct tdb1_traverse_lock tl = { NULL, 0, 0, F_RDLCK }; - int ret; - - /* we need to get a read lock on the transaction lock here to - cope with the lock ordering semantics of solaris10 */ - if (tdb1_transaction_lock(tdb, F_RDLCK, TDB_LOCK_WAIT)) { - return -1; - } - - tdb->tdb1.traverse_read++; - ret = tdb1_traverse_internal(tdb, fn, private_data, &tl); - tdb->tdb1.traverse_read--; - - tdb1_transaction_unlock(tdb, F_RDLCK); - - return ret; -} - -/* - a write style traverse - needs to get the transaction lock to - prevent deadlocks - - WARNING: The data buffer given to the callback fn does NOT meet the - alignment restrictions malloc gives you. -*/ -int tdb1_traverse(struct tdb_context *tdb, - int (*fn)(struct tdb_context *, TDB_DATA, TDB_DATA, void *), - void *private_data) -{ - struct tdb1_traverse_lock tl = { NULL, 0, 0, F_WRLCK }; - int ret; - - /* If we're read-only, we don't have to write-lock whole db. */ - if (tdb->flags & TDB_RDONLY) { - return tdb1_traverse_read(tdb, fn, private_data); - } - - if (tdb1_transaction_lock(tdb, F_WRLCK, TDB_LOCK_WAIT)) { - return -1; - } - - tdb->tdb1.traverse_write++; - ret = tdb1_traverse_internal(tdb, fn, private_data, &tl); - tdb->tdb1.traverse_write--; - - tdb1_transaction_unlock(tdb, F_WRLCK); - - return ret; -} - - -/* find the first entry in the database and return its key */ -TDB_DATA tdb1_firstkey(struct tdb_context *tdb) -{ - TDB_DATA key; - struct tdb1_record rec; - tdb1_off_t off; - - /* release any old lock */ - if (tdb1_unlock_record(tdb, tdb->tdb1.travlocks.off) != 0) - return tdb1_null; - tdb->tdb1.travlocks.off = tdb->tdb1.travlocks.hash = 0; - tdb->tdb1.travlocks.lock_rw = F_RDLCK; - - /* Grab first record: locks chain and returned record. 
*/ - off = tdb1_next_lock(tdb, &tdb->tdb1.travlocks, &rec); - if (off == 0 || off == TDB1_NEXT_LOCK_ERR) { - return tdb1_null; - } - /* now read the key */ - key.dsize = rec.key_len; - key.dptr =tdb1_alloc_read(tdb,tdb->tdb1.travlocks.off+sizeof(rec),key.dsize); - - /* Unlock the hash chain of the record we just read. */ - if (tdb1_unlock(tdb, tdb->tdb1.travlocks.hash, tdb->tdb1.travlocks.lock_rw) != 0) - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_firstkey:" - " error occurred while tdb1_unlocking!"); - return key; -} - -/* find the next entry in the database, returning its key */ -TDB_DATA tdb1_nextkey(struct tdb_context *tdb, TDB_DATA oldkey) -{ - uint32_t oldhash; - TDB_DATA key = tdb1_null; - struct tdb1_record rec; - unsigned char *k = NULL; - tdb1_off_t off; - - /* Is locked key the old key? If so, traverse will be reliable. */ - if (tdb->tdb1.travlocks.off) { - if (tdb1_lock(tdb,tdb->tdb1.travlocks.hash,tdb->tdb1.travlocks.lock_rw)) - return tdb1_null; - if (tdb1_rec_read(tdb, tdb->tdb1.travlocks.off, &rec) == -1 - || !(k = tdb1_alloc_read(tdb,tdb->tdb1.travlocks.off+sizeof(rec), - rec.key_len)) - || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) { - /* No, it wasn't: unlock it and start from scratch */ - if (tdb1_unlock_record(tdb, tdb->tdb1.travlocks.off) != 0) { - SAFE_FREE(k); - return tdb1_null; - } - if (tdb1_unlock(tdb, tdb->tdb1.travlocks.hash, tdb->tdb1.travlocks.lock_rw) != 0) { - SAFE_FREE(k); - return tdb1_null; - } - tdb->tdb1.travlocks.off = 0; - } - - SAFE_FREE(k); - } - - if (!tdb->tdb1.travlocks.off) { - /* No previous element: do normal find, and lock record */ - tdb->tdb1.travlocks.off = tdb1_find_lock_hash(tdb, oldkey, tdb_hash(tdb, oldkey.dptr, oldkey.dsize), tdb->tdb1.travlocks.lock_rw, &rec); - if (!tdb->tdb1.travlocks.off) { - return tdb1_null; - } - tdb->tdb1.travlocks.hash = TDB1_BUCKET(rec.full_hash); - if (tdb1_lock_record(tdb, tdb->tdb1.travlocks.off) != 0) { - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_nextkey: lock_record failed (%s)!", - strerror(errno)); - return tdb1_null; - } - } - oldhash = tdb->tdb1.travlocks.hash; - - /* Grab next record: locks chain and returned record, - unlocks old record */ - off = tdb1_next_lock(tdb, &tdb->tdb1.travlocks, &rec); - if (off != TDB1_NEXT_LOCK_ERR && off != 0) { - key.dsize = rec.key_len; - key.dptr = tdb1_alloc_read(tdb, tdb->tdb1.travlocks.off+sizeof(rec), - key.dsize); - /* Unlock the chain of this new record */ - if (tdb1_unlock(tdb, tdb->tdb1.travlocks.hash, tdb->tdb1.travlocks.lock_rw) != 0) - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_nextkey: WARNING tdb1_unlock failed!"); - } - /* Unlock the chain of old record */ - if (tdb1_unlock(tdb, TDB1_BUCKET(oldhash), tdb->tdb1.travlocks.lock_rw) != 0) - tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR, - "tdb1_nextkey: WARNING tdb1_unlock failed!"); - return key; -} diff --git a/ccan/tdb2/tdb2.h b/ccan/tdb2/tdb2.h deleted file mode 100644 index 3fa99b15..00000000 --- a/ccan/tdb2/tdb2.h +++ /dev/null @@ -1,924 +0,0 @@ -#ifndef CCAN_TDB2_H -#define CCAN_TDB2_H - -/* - TDB version 2: trivial database library - - Copyright (C) Andrew Tridgell 1999-2004 - Copyright (C) Rusty Russell 2010-2011 - - ** NOTE! The following LGPL license applies to the tdb - ** library. 
This does NOT imply that all of Samba is released - ** under the LGPL - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 3 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, see . -*/ - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef _SAMBA_BUILD_ -#include "config.h" -#if HAVE_FILE_OFFSET_BITS -#define _FILE_OFFSET_BITS 64 -#endif -/* For mode_t */ -#include -/* For O_* flags. */ -#include -/* For sig_atomic_t. */ -#include -/* For uint64_t */ -#include -/* For bool */ -#include -/* For memcmp */ -#include -#endif -#include -#include -#include - -union tdb_attribute; -struct tdb_context; - -/** - * tdb_open - open a database file - * @name: the file name (can be NULL if flags contains TDB_INTERNAL) - * @tdb_flags: options for this database - * @open_flags: flags argument for tdb's open() call. - * @mode: mode argument for tdb's open() call. - * @attributes: linked list of extra attributes for this tdb. - * - * This call opens (and potentially creates) a database file. - * Multiple processes can have the TDB file open at once. - * - * On failure it will return NULL, and set errno: it may also call - * any log attribute found in @attributes. - * - * See also: - * union tdb_attribute - */ -struct tdb_context *tdb_open(const char *name, int tdb_flags, - int open_flags, mode_t mode, - union tdb_attribute *attributes); - - -/* flags for tdb_open() */ -#define TDB_DEFAULT 0 /* just a readability place holder */ -#define TDB_INTERNAL 2 /* don't store on disk */ -#define TDB_NOLOCK 4 /* don't do any locking */ -#define TDB_NOMMAP 8 /* don't use mmap */ -#define TDB_CONVERT 16 /* convert endian */ -#define TDB_NOSYNC 64 /* don't use synchronous transactions */ -#define TDB_SEQNUM 128 /* maintain a sequence number */ -#define TDB_ALLOW_NESTING 256 /* fake nested transactions */ -#define TDB_RDONLY 512 /* implied by O_RDONLY */ -#define TDB_VERSION1 1024 /* create/open an old style TDB */ -#define TDB_CANT_CHECK 2048 /* has a feature which we don't understand */ - -/** - * tdb1_incompatible_hash - better (Jenkins) hash for tdb1 - * - * This is better than the default hash for tdb1; but older versions of the - * tdb library (prior to version 1.2.6) won't be able to open them. - * - * It only makes sense to specify this (using tdb_attribute_hash) when - * creating (with O_CREAT) an old tdb version using TDB_VERSION1. It's - * equivalent to the TDB_INCOMPATIBLE_HASH flag for tdb1. - */ -uint64_t tdb1_incompatible_hash(const void *, size_t, uint64_t, void *); - -/** - * tdb_close - close and free a tdb. - * @tdb: the tdb context returned from tdb_open() - * - * This always succeeds, in that @tdb is unusable after this call. But if - * some unexpected error occurred while closing, it will return non-zero - * (the only clue as to cause will be via the log attribute). - */ -int tdb_close(struct tdb_context *tdb); - -/** - * struct tdb_data - representation of keys or values. - * @dptr: the data pointer - * @dsize: the size of the data pointed to by dptr. 
- * - * This is the "blob" representation of keys and data used by TDB. - */ -typedef struct tdb_data { - unsigned char *dptr; - size_t dsize; -} TDB_DATA; - -/** - * enum TDB_ERROR - error returns for TDB - * - * See Also: - * tdb_errorstr() - */ -enum TDB_ERROR { - TDB_SUCCESS = 0, /* No error. */ - TDB_ERR_CORRUPT = -1, /* We read the db, and it was bogus. */ - TDB_ERR_IO = -2, /* We couldn't read/write the db. */ - TDB_ERR_LOCK = -3, /* Locking failed. */ - TDB_ERR_OOM = -4, /* Out of Memory. */ - TDB_ERR_EXISTS = -5, /* The key already exists. */ - TDB_ERR_NOEXIST = -6, /* The key does not exist. */ - TDB_ERR_EINVAL = -7, /* You're using it wrong. */ - TDB_ERR_RDONLY = -8, /* The database is read-only. */ - TDB_ERR_LAST = TDB_ERR_RDONLY -}; - -/** - * tdb_store - store a key/value pair in a tdb. - * @tdb: the tdb context returned from tdb_open() - * @key: the key - * @dbuf: the data to associate with the key. - * @flag: TDB_REPLACE, TDB_INSERT or TDB_MODIFY. - * - * This inserts (or overwrites) a key/value pair in the TDB. If flag - * is TDB_REPLACE, it doesn't matter whether the key exists or not; - * TDB_INSERT means it must not exist (returns TDB_ERR_EXISTS otherwise), - * and TDB_MODIFY means it must exist (returns TDB_ERR_NOEXIST otherwise). - * - * On success, this returns TDB_SUCCESS. - * - * See also: - * tdb_fetch, tdb_transaction_start, tdb_append, tdb_delete. - */ -enum TDB_ERROR tdb_store(struct tdb_context *tdb, - struct tdb_data key, - struct tdb_data dbuf, - int flag); - -/* flags to tdb_store() */ -#define TDB_REPLACE 1 /* A readability place holder */ -#define TDB_INSERT 2 /* Don't overwrite an existing entry */ -#define TDB_MODIFY 3 /* Don't create an existing entry */ - -/** - * tdb_fetch - fetch a value from a tdb. - * @tdb: the tdb context returned from tdb_open() - * @key: the key - * @data: pointer to data. - * - * This looks up a key in the database and sets it in @data. - * - * If it returns TDB_SUCCESS, the key was found: it is your - * responsibility to call free() on @data->dptr. - * - * Otherwise, it returns an error (usually, TDB_ERR_NOEXIST) and @data is - * undefined. - */ -enum TDB_ERROR tdb_fetch(struct tdb_context *tdb, struct tdb_data key, - struct tdb_data *data); - -/** - * tdb_errorstr - map the tdb error onto a constant readable string - * @ecode: the enum TDB_ERROR to map. - * - * This is useful for displaying errors to users. - */ -const char *tdb_errorstr(enum TDB_ERROR ecode); - -/** - * tdb_append - append a value to a key/value pair in a tdb. - * @tdb: the tdb context returned from tdb_open() - * @key: the key - * @dbuf: the data to append. - * - * This is equivalent to fetching a record, reallocating .dptr to add the - * data, and writing it back, only it's much more efficient. If the key - * doesn't exist, it's equivalent to tdb_store (with an additional hint that - * you expect to expand the record in future). - * - * See Also: - * tdb_fetch(), tdb_store() - */ -enum TDB_ERROR tdb_append(struct tdb_context *tdb, - struct tdb_data key, struct tdb_data dbuf); - -/** - * tdb_delete - delete a key from a tdb. - * @tdb: the tdb context returned from tdb_open() - * @key: the key to delete. - * - * Returns TDB_SUCCESS on success, or an error (usually TDB_ERR_NOEXIST). - * - * See Also: - * tdb_fetch(), tdb_store() - */ -enum TDB_ERROR tdb_delete(struct tdb_context *tdb, struct tdb_data key); - -/** - * tdb_exists - does a key exist in the database? - * @tdb: the tdb context returned from tdb_open() - * @key: the key to search for. 
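A minimal sketch of how tdb_append(), tdb_delete() and tdb_exists() combine, assuming an in-memory (TDB_INTERNAL) database so no file is touched; the key name and program structure are illustrative only and error handling is abbreviated:

	#include <ccan/tdb2/tdb2.h>
	#include <err.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		/* Illustrative only: TDB_INTERNAL keeps everything in memory. */
		struct tdb_context *tdb = tdb_open(NULL, TDB_INTERNAL,
						   O_RDWR, 0600, NULL);
		struct tdb_data key = tdb_mkdata("log", 3);
		struct tdb_data val;
		enum TDB_ERROR ecode;

		if (!tdb)
			err(1, "tdb_open");

		/* On a missing key the first append behaves like a plain store. */
		ecode = tdb_append(tdb, key, tdb_mkdata("hello ", 6));
		if (ecode == TDB_SUCCESS)
			ecode = tdb_append(tdb, key, tdb_mkdata("world", 5));
		if (ecode != TDB_SUCCESS)
			errx(1, "append: %s", tdb_errorstr(ecode));

		if (tdb_fetch(tdb, key, &val) == TDB_SUCCESS) {
			printf("%.*s\n", (int)val.dsize, (char *)val.dptr);
			free(val.dptr);	/* fetch hands us a malloc'ed copy */
		}

		/* tdb_exists() folds "not found" and errors into plain false. */
		tdb_delete(tdb, key);
		printf("still there? %s\n", tdb_exists(tdb, key) ? "yes" : "no");

		tdb_close(tdb);
		return 0;
	}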
- * - * Returns true if it exists, or false if it doesn't or any other error. - */ -bool tdb_exists(struct tdb_context *tdb, TDB_DATA key); - -/** - * tdb_deq - are struct tdb_data equal? - * @a: one struct tdb_data - * @b: another struct tdb_data - */ -static inline bool tdb_deq(struct tdb_data a, struct tdb_data b) -{ - return a.dsize == b.dsize && memcmp(a.dptr, b.dptr, a.dsize) == 0; -} - -/** - * tdb_mkdata - make a struct tdb_data from const data - * @p: the constant pointer - * @len: the length - * - * As the dptr member of struct tdb_data is not constant, you need to - * cast it. This function keeps thost casts in one place, as well as - * suppressing the warning some compilers give when casting away a - * qualifier (eg. gcc with -Wcast-qual) - */ -static inline struct tdb_data tdb_mkdata(const void *p, size_t len) -{ - struct tdb_data d; - d.dptr = cast_const(void *, p); - d.dsize = len; - return d; -} - -/** - * tdb_transaction_start - start a transaction - * @tdb: the tdb context returned from tdb_open() - * - * This begins a series of atomic operations. Other processes will be able - * to read the tdb, but not alter it (they will block), nor will they see - * any changes until tdb_transaction_commit() is called. - * - * Note that if the TDB_ALLOW_NESTING flag is set, a tdb_transaction_start() - * within a transaction will succeed, but it's not a real transaction: - * (1) An inner transaction which is committed is not actually committed until - * the outer transaction is; if the outer transaction is cancelled, the - * inner ones are discarded. - * (2) tdb_transaction_cancel() marks the outer transaction as having an error, - * so the final tdb_transaction_commit() will fail. - * (3) the outer transaction will see the results of the inner transaction. - * - * See Also: - * tdb_transaction_cancel, tdb_transaction_commit. - */ -enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb); - -/** - * tdb_transaction_cancel - abandon a transaction - * @tdb: the tdb context returned from tdb_open() - * - * This aborts a transaction, discarding any changes which were made. - * tdb_close() does this implicitly. - */ -void tdb_transaction_cancel(struct tdb_context *tdb); - -/** - * tdb_transaction_commit - commit a transaction - * @tdb: the tdb context returned from tdb_open() - * - * This completes a transaction, writing any changes which were made. - * - * fsync() is used to commit the transaction (unless TDB_NOSYNC is set), - * making it robust against machine crashes, but very slow compared to - * other TDB operations. - * - * A failure can only be caused by unexpected errors (eg. I/O or - * memory); this is no point looping on transaction failure. - * - * See Also: - * tdb_transaction_prepare_commit() - */ -enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb); - -/** - * tdb_transaction_prepare_commit - prepare to commit a transaction - * @tdb: the tdb context returned from tdb_open() - * - * This ensures we have the resources to commit a transaction (using - * tdb_transaction_commit): if this succeeds then a transaction will only - * fail if the write() or fsync() calls fail. - * - * If this fails you must still call tdb_transaction_cancel() to cancel - * the transaction. 
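A minimal sketch of the transaction calls wrapping two stores so that both land or neither does; the helper name and two-key scenario are this example's invention, not part of the API:

	#include <ccan/tdb2/tdb2.h>

	/* Hypothetical helper: store two related records so that either both
	 * updates become visible or neither does. */
	static enum TDB_ERROR store_pair(struct tdb_context *tdb,
					 TDB_DATA key1, TDB_DATA val1,
					 TDB_DATA key2, TDB_DATA val2)
	{
		enum TDB_ERROR ecode = tdb_transaction_start(tdb);

		if (ecode != TDB_SUCCESS)
			return ecode;

		ecode = tdb_store(tdb, key1, val1, TDB_REPLACE);
		if (ecode == TDB_SUCCESS)
			ecode = tdb_store(tdb, key2, val2, TDB_REPLACE);
		if (ecode != TDB_SUCCESS) {
			tdb_transaction_cancel(tdb);	/* discards both stores */
			return ecode;
		}

		/* Optional: reserve what the commit needs up front, so the
		 * commit itself can only fail in write() or fsync(). */
		ecode = tdb_transaction_prepare_commit(tdb);
		if (ecode != TDB_SUCCESS) {
			tdb_transaction_cancel(tdb);
			return ecode;
		}
		return tdb_transaction_commit(tdb);
	}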
- * - * See Also: - * tdb_transaction_commit() - */ -enum TDB_ERROR tdb_transaction_prepare_commit(struct tdb_context *tdb); - -/** - * tdb_traverse - traverse a TDB - * @tdb: the tdb context returned from tdb_open() - * @fn: the function to call for every key/value pair (or NULL) - * @p: the pointer to hand to @f - * - * This walks the TDB until all they keys have been traversed, or @fn - * returns non-zero. If the traverse function or other processes are - * changing data or adding or deleting keys, the traverse may be - * unreliable: keys may be skipped or (rarely) visited twice. - * - * There is one specific exception: the special case of deleting the - * current key does not undermine the reliability of the traversal. - * - * On success, returns the number of keys iterated. On error returns - * a negative enum TDB_ERROR value. - */ -#define tdb_traverse(tdb, fn, p) \ - tdb_traverse_(tdb, typesafe_cb_preargs(int, void *, (fn), (p), \ - struct tdb_context *, \ - TDB_DATA, TDB_DATA), (p)) - -int64_t tdb_traverse_(struct tdb_context *tdb, - int (*fn)(struct tdb_context *, - TDB_DATA, TDB_DATA, void *), void *p); - -/** - * tdb_parse_record - operate directly on data in the database. - * @tdb: the tdb context returned from tdb_open() - * @key: the key whose record we should hand to @parse - * @parse: the function to call for the data - * @data: the private pointer to hand to @parse (types must match). - * - * This avoids a copy for many cases, by handing you a pointer into - * the memory-mapped database. It also locks the record to prevent - * other accesses at the same time. - * - * Do not alter the data handed to parse()! - */ -#define tdb_parse_record(tdb, key, parse, data) \ - tdb_parse_record_((tdb), (key), \ - typesafe_cb_preargs(enum TDB_ERROR, void *, \ - (parse), (data), \ - TDB_DATA, TDB_DATA), (data)) - -enum TDB_ERROR tdb_parse_record_(struct tdb_context *tdb, - TDB_DATA key, - enum TDB_ERROR (*parse)(TDB_DATA k, - TDB_DATA d, - void *data), - void *data); - -/** - * tdb_get_seqnum - get a database sequence number - * @tdb: the tdb context returned from tdb_open() - * - * This returns a sequence number: any change to the database from a - * tdb context opened with the TDB_SEQNUM flag will cause that number - * to increment. Note that the incrementing is unreliable (it is done - * without locking), so this is only useful as an optimization. - * - * For example, you may have a regular database backup routine which - * does not operate if the sequence number is unchanged. In the - * unlikely event of a failed increment, it will be backed up next - * time any way. - * - * Returns an enum TDB_ERROR (ie. negative) on error. - */ -int64_t tdb_get_seqnum(struct tdb_context *tdb); - -/** - * tdb_firstkey - get the "first" key in a TDB - * @tdb: the tdb context returned from tdb_open() - * @key: pointer to key. - * - * This returns an arbitrary key in the database; with tdb_nextkey() it allows - * open-coded traversal of the database, though it is slightly less efficient - * than tdb_traverse. - * - * It is your responsibility to free @key->dptr on success. - * - * Returns TDB_ERR_NOEXIST if the database is empty. - */ -enum TDB_ERROR tdb_firstkey(struct tdb_context *tdb, struct tdb_data *key); - -/** - * tdb_nextkey - get the "next" key in a TDB - * @tdb: the tdb context returned from tdb_open() - * @key: a key returned by tdb_firstkey() or tdb_nextkey(). - * - * This returns another key in the database; it will free @key.dptr for - * your convenience. 
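A minimal sketch of open-coded traversal with tdb_firstkey()/tdb_nextkey(), as an alternative to tdb_traverse(); the helper is illustrative only:

	#include <ccan/tdb2/tdb2.h>
	#include <stdio.h>

	/* Hypothetical helper: print every key.  tdb_firstkey() hands us a
	 * key we own; each tdb_nextkey() call frees the key it is given and
	 * replaces it, so the loop body does no explicit free(). */
	static void dump_keys(struct tdb_context *tdb)
	{
		struct tdb_data key;
		enum TDB_ERROR ecode;

		for (ecode = tdb_firstkey(tdb, &key);
		     ecode == TDB_SUCCESS;
		     ecode = tdb_nextkey(tdb, &key))
			printf("%.*s\n", (int)key.dsize, (char *)key.dptr);

		/* A clean end of the walk shows up as TDB_ERR_NOEXIST. */
		if (ecode != TDB_ERR_NOEXIST)
			fprintf(stderr, "traverse failed: %s\n",
				tdb_errorstr(ecode));
	}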
- * - * Returns TDB_ERR_NOEXIST if there are no more keys. - */ -enum TDB_ERROR tdb_nextkey(struct tdb_context *tdb, struct tdb_data *key); - -/** - * tdb_chainlock - lock a record in the TDB - * @tdb: the tdb context returned from tdb_open() - * @key: the key to lock. - * - * This prevents any access occurring to a group of keys including @key, - * even if @key does not exist. This allows primitive atomic updates of - * records without using transactions. - * - * You cannot begin a transaction while holding a tdb_chainlock(), nor can - * you do any operations on any other keys in the database. This also means - * that you cannot hold more than one tdb_chainlock() at a time. - * - * See Also: - * tdb_chainunlock() - */ -enum TDB_ERROR tdb_chainlock(struct tdb_context *tdb, TDB_DATA key); - -/** - * tdb_chainunlock - unlock a record in the TDB - * @tdb: the tdb context returned from tdb_open() - * @key: the key to unlock. - * - * The key must have previously been locked by tdb_chainlock(). - */ -void tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key); - -/** - * tdb_chainlock_read - lock a record in the TDB, for reading - * @tdb: the tdb context returned from tdb_open() - * @key: the key to lock. - * - * This prevents any changes from occurring to a group of keys including @key, - * even if @key does not exist. This allows primitive atomic updates of - * records without using transactions. - * - * You cannot begin a transaction while holding a tdb_chainlock_read(), nor can - * you do any operations on any other keys in the database. This also means - * that you cannot hold more than one tdb_chainlock()/read() at a time. - * - * See Also: - * tdb_chainlock() - */ -enum TDB_ERROR tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key); - -/** - * tdb_chainunlock_read - unlock a record in the TDB for reading - * @tdb: the tdb context returned from tdb_open() - * @key: the key to unlock. - * - * The key must have previously been locked by tdb_chainlock_read(). - */ -void tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key); - -/** - * tdb_lockall - lock the entire TDB - * @tdb: the tdb context returned from tdb_open() - * - * You cannot hold a tdb_chainlock while calling this. It nests, so you - * must call tdb_unlockall as many times as you call tdb_lockall. - */ -enum TDB_ERROR tdb_lockall(struct tdb_context *tdb); - -/** - * tdb_unlockall - unlock the entire TDB - * @tdb: the tdb context returned from tdb_open() - */ -void tdb_unlockall(struct tdb_context *tdb); - -/** - * tdb_lockall_read - lock the entire TDB for reading - * @tdb: the tdb context returned from tdb_open() - * - * This prevents others writing to the database, eg. tdb_delete, tdb_store, - * tdb_append, but not tdb_fetch. - * - * You cannot hold a tdb_chainlock while calling this. It nests, so you - * must call tdb_unlockall_read as many times as you call tdb_lockall_read. - */ -enum TDB_ERROR tdb_lockall_read(struct tdb_context *tdb); - -/** - * tdb_unlockall_read - unlock the entire TDB for reading - * @tdb: the tdb context returned from tdb_open() - */ -void tdb_unlockall_read(struct tdb_context *tdb); - -/** - * tdb_wipe_all - wipe the database clean - * @tdb: the tdb context returned from tdb_open() - * - * Completely erase the database. This is faster than iterating through - * each key and doing tdb_delete. 
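A minimal sketch of a chainlock-protected read-modify-write; the uint64_t counter layout is this sketch's assumption, not something the API prescribes:

	#include <ccan/tdb2/tdb2.h>
	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>

	/* Hypothetical helper: an atomic counter bump without a transaction. */
	static enum TDB_ERROR bump_counter(struct tdb_context *tdb, TDB_DATA key)
	{
		struct tdb_data val;
		uint64_t count = 0;
		enum TDB_ERROR ecode;

		/* Serialise against other writers of this key's chain. */
		ecode = tdb_chainlock(tdb, key);
		if (ecode != TDB_SUCCESS)
			return ecode;

		ecode = tdb_fetch(tdb, key, &val);
		if (ecode == TDB_SUCCESS) {
			if (val.dsize == sizeof(count))
				memcpy(&count, val.dptr, sizeof(count));
			free(val.dptr);
		} else if (ecode != TDB_ERR_NOEXIST) {
			tdb_chainunlock(tdb, key);
			return ecode;
		}

		count++;
		ecode = tdb_store(tdb, key, tdb_mkdata(&count, sizeof(count)),
				  TDB_REPLACE);
		tdb_chainunlock(tdb, key);
		return ecode;
	}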
- */ -enum TDB_ERROR tdb_wipe_all(struct tdb_context *tdb); - -/** - * tdb_repack - repack the database - * @tdb: the tdb context returned from tdb_open() - * - * This repacks the database; if it is suffering from a great deal of - * fragmentation this might help. However, it can take twice the - * memory of the existing TDB. - */ -enum TDB_ERROR tdb_repack(struct tdb_context *tdb); - -/** - * tdb_check - check a TDB for consistency - * @tdb: the tdb context returned from tdb_open() - * @check: function to check each key/data pair (or NULL) - * @data: argument for @check, must match type. - * - * This performs a consistency check of the open database, optionally calling - * a check() function on each record so you can do your own data consistency - * checks as well. If check() returns an error, that is returned from - * tdb_check(). - * - * Note that the TDB uses a feature which we don't understand which - * indicates we can't run tdb_check(), this will log a warning to that - * effect and return TDB_SUCCESS. You can detect this condition by - * looking for TDB_CANT_CHECK in tdb_get_flags(). - * - * Returns TDB_SUCCESS or an error. - */ -#define tdb_check(tdb, check, data) \ - tdb_check_((tdb), typesafe_cb_preargs(enum TDB_ERROR, void *, \ - (check), (data), \ - struct tdb_data, \ - struct tdb_data), \ - (data)) - -enum TDB_ERROR tdb_check_(struct tdb_context *tdb, - enum TDB_ERROR (*check)(struct tdb_data k, - struct tdb_data d, - void *data), - void *data); - -/** - * tdb_error - get the last error (not threadsafe) - * @tdb: the tdb context returned from tdb_open() - * - * Returns the last error returned by a TDB function. - * - * This makes porting from TDB1 easier, but note that the last error is not - * reliable in threaded programs. - */ -enum TDB_ERROR tdb_error(struct tdb_context *tdb); - -/** - * enum tdb_summary_flags - flags for tdb_summary. - */ -enum tdb_summary_flags { - TDB_SUMMARY_HISTOGRAMS = 1 /* Draw graphs in the summary. */ -}; - -/** - * tdb_summary - return a string describing the TDB state - * @tdb: the tdb context returned from tdb_open() - * @flags: flags to control the summary output. - * @summary: pointer to string to allocate. - * - * This returns a developer-readable string describing the overall - * state of the tdb, such as the percentage used and sizes of records. - * It is designed to provide information about the tdb at a glance - * without displaying any keys or data in the database. - * - * On success, sets @summary to point to a malloc()'ed nul-terminated - * multi-line string. It is your responsibility to free() it. - */ -enum TDB_ERROR tdb_summary(struct tdb_context *tdb, - enum tdb_summary_flags flags, - char **summary); - - -/** - * tdb_get_flags - return the flags for a tdb - * @tdb: the tdb context returned from tdb_open() - * - * This returns the flags on the current tdb. Some of these are caused by - * the flags argument to tdb_open(), others (such as TDB_CONVERT) are - * intuited. - */ -unsigned int tdb_get_flags(struct tdb_context *tdb); - -/** - * tdb_add_flag - set a flag for a tdb - * @tdb: the tdb context returned from tdb_open() - * @flag: one of TDB_NOLOCK, TDB_NOMMAP, TDB_NOSYNC or TDB_ALLOW_NESTING. - * - * You can use this to set a flag on the TDB. You cannot set these flags - * on a TDB_INTERNAL tdb. 
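A minimal sketch combining tdb_check() with a per-record callback and tdb_summary(); the helper names and the "maximum value size" rule are an arbitrary example policy, not anything tdb requires:

	#include <ccan/tdb2/tdb2.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Example policy only: treat any value larger than *max_dsize as bad. */
	static enum TDB_ERROR check_rec(struct tdb_data key, struct tdb_data data,
					size_t *max_dsize)
	{
		if (data.dsize > *max_dsize)
			return TDB_ERR_CORRUPT;	/* tdb_check() returns this */
		return TDB_SUCCESS;
	}

	static void audit(struct tdb_context *tdb)
	{
		size_t max_dsize = 1024 * 1024;
		enum TDB_ERROR ecode = tdb_check(tdb, check_rec, &max_dsize);
		char *summary;

		if (ecode != TDB_SUCCESS)
			fprintf(stderr, "check failed: %s\n", tdb_errorstr(ecode));

		if (tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &summary)
		    == TDB_SUCCESS) {
			fputs(summary, stdout);
			free(summary);	/* we own the malloc'ed string */
		}
	}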
- */ -void tdb_add_flag(struct tdb_context *tdb, unsigned flag); - -/** - * tdb_remove_flag - unset a flag for a tdb - * @tdb: the tdb context returned from tdb_open() - * @flag: one of TDB_NOLOCK, TDB_NOMMAP, TDB_NOSYNC or TDB_ALLOW_NESTING. - * - * You can use this to clear a flag on the TDB. You cannot clear flags - * on a TDB_INTERNAL tdb. - */ -void tdb_remove_flag(struct tdb_context *tdb, unsigned flag); - -/** - * enum tdb_attribute_type - descriminator for union tdb_attribute. - */ -enum tdb_attribute_type { - TDB_ATTRIBUTE_LOG = 0, - TDB_ATTRIBUTE_HASH = 1, - TDB_ATTRIBUTE_SEED = 2, - TDB_ATTRIBUTE_STATS = 3, - TDB_ATTRIBUTE_OPENHOOK = 4, - TDB_ATTRIBUTE_FLOCK = 5, - TDB_ATTRIBUTE_TDB1_HASHSIZE = 128, - TDB_ATTRIBUTE_TDB1_MAX_DEAD = 129, -}; - -/** - * tdb_get_attribute - get an attribute for an existing tdb - * @tdb: the tdb context returned from tdb_open() - * @attr: the union tdb_attribute to set. - * - * This gets an attribute from a TDB which has previously been set (or - * may return the default values). Set @attr.base.attr to the - * attribute type you want get. - */ -enum TDB_ERROR tdb_get_attribute(struct tdb_context *tdb, - union tdb_attribute *attr); - -/** - * tdb_set_attribute - set an attribute for an existing tdb - * @tdb: the tdb context returned from tdb_open() - * @attr: the union tdb_attribute to set. - * - * This sets an attribute on a TDB, overriding any previous attribute - * of the same type. It returns TDB_ERR_EINVAL if the attribute is - * unknown or invalid. - * - * Note that TDB_ATTRIBUTE_HASH, TDB_ATTRIBUTE_SEED, - * TDB_ATTRIBUTE_OPENHOOK and TDB_ATTRIBUTE_TDB1_HASHSIZE cannot - * currently be set after tdb_open. - */ -enum TDB_ERROR tdb_set_attribute(struct tdb_context *tdb, - const union tdb_attribute *attr); - -/** - * tdb_unset_attribute - reset an attribute for an existing tdb - * @tdb: the tdb context returned from tdb_open() - * @type: the attribute type to unset. - * - * This unsets an attribute on a TDB, returning it to the defaults - * (where applicable). - * - * Note that it only makes sense for TDB_ATTRIBUTE_LOG and TDB_ATTRIBUTE_FLOCK - * to be unset. - */ -void tdb_unset_attribute(struct tdb_context *tdb, - enum tdb_attribute_type type); - -/** - * tdb_name - get the name of a tdb - * @tdb: the tdb context returned from tdb_open() - * - * This returns a copy of the name string, made at tdb_open() time. If that - * argument was NULL (possible for a TDB_INTERNAL db) this will return NULL. - * - * This is mostly useful for logging. - */ -const char *tdb_name(const struct tdb_context *tdb); - -/** - * tdb_fd - get the file descriptor of a tdb - * @tdb: the tdb context returned from tdb_open() - * - * This returns the file descriptor for the underlying database file, or -1 - * for TDB_INTERNAL. - */ -int tdb_fd(const struct tdb_context *tdb); - -/** - * tdb_foreach - iterate through every open TDB. - * @fn: the function to call for every TDB - * @p: the pointer to hand to @fn - * - * TDB internally keeps track of all open TDBs; this function allows you to - * iterate through them. If @fn returns non-zero, traversal stops. - */ -#define tdb_foreach(fn, p) \ - tdb_foreach_(typesafe_cb_preargs(int, void *, (fn), (p), \ - struct tdb_context *), (p)) - -void tdb_foreach_(int (*fn)(struct tdb_context *, void *), void *p); - -/** - * struct tdb_attribute_base - common fields for all tdb attributes. 
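A minimal sketch of tdb_foreach() visiting every tdb open in the process; the counting helper is illustrative only:

	#include <ccan/tdb2/tdb2.h>
	#include <stdio.h>

	/* Hypothetical debugging helper: list and count every open tdb. */
	static int show_one(struct tdb_context *tdb, unsigned int *count)
	{
		fprintf(stderr, "open tdb: %s (fd %i)\n",
			tdb_name(tdb) ? tdb_name(tdb) : "(unnamed)",
			tdb_fd(tdb));
		(*count)++;
		return 0;	/* returning non-zero would stop the iteration */
	}

	static unsigned int show_open_tdbs(void)
	{
		unsigned int count = 0;

		tdb_foreach(show_one, &count);
		return count;
	}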
- */ -struct tdb_attribute_base { - enum tdb_attribute_type attr; - union tdb_attribute *next; -}; - -/** - * enum tdb_log_level - log levels for tdb_attribute_log - * @TDB_LOG_ERROR: used to log unrecoverable errors such as I/O errors - * or internal consistency failures. - * @TDB_LOG_USE_ERROR: used to log usage errors such as invalid parameters - * or writing to a read-only database. - * @TDB_LOG_WARNING: used for informational messages on issues which - * are unusual but handled by TDB internally, such - * as a failure to mmap or failure to open /dev/urandom. - */ -enum tdb_log_level { - TDB_LOG_ERROR, - TDB_LOG_USE_ERROR, - TDB_LOG_WARNING -}; - -/** - * struct tdb_attribute_log - log function attribute - * - * This attribute provides a hook for you to log errors. - */ -struct tdb_attribute_log { - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */ - void (*fn)(struct tdb_context *tdb, - enum tdb_log_level level, - enum TDB_ERROR ecode, - const char *message, - void *data); - void *data; -}; - -/** - * struct tdb_attribute_hash - hash function attribute - * - * This attribute allows you to provide an alternative hash function. - * This hash function will be handed keys from the database; it will also - * be handed the 8-byte TDB_HASH_MAGIC value for checking the header (the - * tdb_open() will fail if the hash value doesn't match the header). - * - * Note that if your hash function gives different results on - * different machine endians, your tdb will no longer work across - * different architectures! - */ -struct tdb_attribute_hash { - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */ - uint64_t (*fn)(const void *key, size_t len, uint64_t seed, - void *data); - void *data; -}; - -/** - * struct tdb_attribute_seed - hash function seed attribute - * - * The hash function seed is normally taken from /dev/urandom (or equivalent) - * but can be set manually here. This is mainly for testing purposes. - */ -struct tdb_attribute_seed { - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_SEED */ - uint64_t seed; -}; - -/** - * struct tdb_attribute_stats - tdb operational statistics - * - * This attribute records statistics of various low-level TDB operations. - * This can be used to assist performance evaluation. This is only - * useful for tdb_get_attribute(). - * - * New fields will be added at the end, hence the "size" argument which - * indicates how large your structure is: it must be filled in before - * calling tdb_get_attribute(), which will overwrite it with the size - * tdb knows about. 
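A minimal sketch of chaining a log attribute and a seed attribute for tdb_open(); the helper name is illustrative, making the attributes static is this sketch's conservative lifetime choice, and a fixed seed is only sensible for tests (as the header notes):

	#include <ccan/tdb2/tdb2.h>
	#include <fcntl.h>
	#include <stdio.h>

	static void log_to_stderr(struct tdb_context *tdb,
				  enum tdb_log_level level,
				  enum TDB_ERROR ecode,
				  const char *message, void *data)
	{
		fprintf(stderr, "tdb %s: %s (%s)\n",
			tdb_name(tdb) ? tdb_name(tdb) : "(unnamed)",
			message, tdb_errorstr(ecode));
	}

	static struct tdb_context *open_with_logging(const char *filename)
	{
		/* Static so the chain safely outlives this call: a
		 * conservative assumption made by this sketch. */
		static union tdb_attribute log_attr, seed_attr;

		log_attr.base.attr = TDB_ATTRIBUTE_LOG;
		log_attr.base.next = &seed_attr; /* attributes form a list */
		log_attr.log.fn = log_to_stderr;
		log_attr.log.data = NULL;

		seed_attr.base.attr = TDB_ATTRIBUTE_SEED;
		seed_attr.base.next = NULL;
		seed_attr.seed = 42;	/* fixed seed: testing only */

		return tdb_open(filename, TDB_DEFAULT, O_RDWR|O_CREAT, 0600,
				&log_attr);
	}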
- */ -struct tdb_attribute_stats { - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_STATS */ - size_t size; /* = sizeof(struct tdb_attribute_stats) */ - uint64_t allocs; - uint64_t alloc_subhash; - uint64_t alloc_chain; - uint64_t alloc_bucket_exact; - uint64_t alloc_bucket_max; - uint64_t alloc_leftover; - uint64_t alloc_coalesce_tried; - uint64_t alloc_coalesce_iterate_clash; - uint64_t alloc_coalesce_lockfail; - uint64_t alloc_coalesce_race; - uint64_t alloc_coalesce_succeeded; - uint64_t alloc_coalesce_num_merged; - uint64_t compares; - uint64_t compare_wrong_bucket; - uint64_t compare_wrong_offsetbits; - uint64_t compare_wrong_keylen; - uint64_t compare_wrong_rechash; - uint64_t compare_wrong_keycmp; - uint64_t transactions; - uint64_t transaction_cancel; - uint64_t transaction_nest; - uint64_t transaction_expand_file; - uint64_t transaction_read_direct; - uint64_t transaction_read_direct_fail; - uint64_t transaction_write_direct; - uint64_t transaction_write_direct_fail; - uint64_t expands; - uint64_t frees; - uint64_t locks; - uint64_t lock_lowlevel; - uint64_t lock_nonblock; - uint64_t lock_nonblock_fail; -}; - -/** - * struct tdb_attribute_openhook - tdb special effects hook for open - * - * This attribute contains a function to call once we have the OPEN_LOCK - * for the tdb, but before we've examined its contents. If this succeeds, - * the tdb will be populated if it's then zero-length. - * - * This is a hack to allow support for TDB1-style TDB_CLEAR_IF_FIRST - * behaviour. - */ -struct tdb_attribute_openhook { - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_OPENHOOK */ - enum TDB_ERROR (*fn)(int fd, void *data); - void *data; -}; - -/** - * struct tdb_attribute_flock - tdb special effects hook for file locking - * - * This attribute contains function to call to place locks on a file; it can - * be used to support non-blocking operations or lock proxying. - * - * They should return 0 on success, -1 on failure and set errno. - * - * An error will be logged on error if errno is neither EAGAIN nor EINTR - * (normally it would only return EAGAIN if waitflag is false, and - * loop internally on EINTR). - */ -struct tdb_attribute_flock { - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_FLOCK */ - int (*lock)(int fd,int rw, off_t off, off_t len, bool waitflag, void *); - int (*unlock)(int fd, int rw, off_t off, off_t len, void *); - void *data; -}; - -/** - * struct tdb_attribute_tdb1_hashsize - tdb1 hashsize - * - * This attribute allows setting the TDB1 hashsize; it only makes sense with - * O_CREAT and TDB_VERSION1. - * - * Hashsize should generally be a prime, such as 10007. - */ -struct tdb_attribute_tdb1_hashsize { - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_TDB1_HASHSIZE */ - unsigned int hsize; -}; - -/** - * struct tdb_attribute_tdb1_max_dead - tdb1 number of maximum dead records. - * - * TDB1 has a method to speed up its slow free list: it lets a certain - * number of "dead" records build up before freeing them. This is - * particularly useful for volatile TDBs; setting it to 5 is - * equivalent to tdb1's TDB_VOLATILE flag. - */ -struct tdb_attribute_tdb1_max_dead { - struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_TDB1_MAX_DEAD */ - unsigned int max_dead; -}; - -/** - * union tdb_attribute - tdb attributes. - * - * This represents all the known attributes. 
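A minimal sketch of creating an old-format (TDB_VERSION1) database with an explicit tdb1 hash size via TDB_ATTRIBUTE_TDB1_HASHSIZE; the helper name and filename are illustrative, and 10007 just follows the header's "generally a prime" advice:

	#include <ccan/tdb2/tdb2.h>
	#include <fcntl.h>

	/* Hypothetical helper: create a version-1 tdb with a chosen hash size.
	 * This attribute only makes sense with O_CREAT and TDB_VERSION1. */
	static struct tdb_context *create_v1(const char *filename)
	{
		union tdb_attribute hsize;

		hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE;
		hsize.base.next = NULL;
		hsize.tdb1_hashsize.hsize = 10007;

		return tdb_open(filename, TDB_VERSION1, O_RDWR|O_CREAT, 0600,
				&hsize);
	}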
- * - * See also: - * struct tdb_attribute_log, struct tdb_attribute_hash, - * struct tdb_attribute_seed, struct tdb_attribute_stats, - * struct tdb_attribute_openhook, struct tdb_attribute_flock. - */ -union tdb_attribute { - struct tdb_attribute_base base; - struct tdb_attribute_log log; - struct tdb_attribute_hash hash; - struct tdb_attribute_seed seed; - struct tdb_attribute_stats stats; - struct tdb_attribute_openhook openhook; - struct tdb_attribute_flock flock; - struct tdb_attribute_tdb1_hashsize tdb1_hashsize; - struct tdb_attribute_tdb1_max_dead tdb1_max_dead; -}; - -#ifdef __cplusplus -} -#endif - -#endif /* tdb2.h */ diff --git a/ccan/tdb2/test/api-12-store.c b/ccan/tdb2/test/api-12-store.c deleted file mode 100644 index ccec53e0..00000000 --- a/ccan/tdb2/test/api-12-store.c +++ /dev/null @@ -1,56 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "logging.h" - -/* We use the same seed which we saw a failure on. */ -static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p) -{ - return hash64_stable((const unsigned char *)key, len, - *(uint64_t *)p); -} - -int main(int argc, char *argv[]) -{ - unsigned int i, j; - struct tdb_context *tdb; - uint64_t seed = 16014841315512641303ULL; - union tdb_attribute fixed_hattr - = { .hash = { .base = { TDB_ATTRIBUTE_HASH }, - .fn = fixedhash, - .data = &seed } }; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT }; - struct tdb_data key = { (unsigned char *)&j, sizeof(j) }; - struct tdb_data data = { (unsigned char *)&j, sizeof(j) }; - - fixed_hattr.base.next = &tap_log_attr; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 500 * 3) + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-12-store.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr); - ok1(tdb); - if (!tdb) - continue; - - /* We seemed to lose some keys. - * Insert and check they're in there! */ - for (j = 0; j < 500; j++) { - struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */ - ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0); - ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); - ok1(tdb_deq(d, data)); - free(d.dptr); - } - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/api-13-delete.c b/ccan/tdb2/test/api-13-delete.c deleted file mode 100644 index 0287a6ab..00000000 --- a/ccan/tdb2/test/api-13-delete.c +++ /dev/null @@ -1,210 +0,0 @@ -#include // For TDB_TOPLEVEL_HASH_BITS -#include -#include -#include -#include -#include -#include -#include "logging.h" - -/* We rig the hash so adjacent-numbered records always clash. */ -static uint64_t clash(const void *key, size_t len, uint64_t seed, void *priv) -{ - return ((uint64_t)*(const unsigned int *)key) - << (64 - TDB_TOPLEVEL_HASH_BITS - 1); -} - -/* We use the same seed which we saw a failure on. 
*/ -static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p) -{ - return hash64_stable((const unsigned char *)key, len, - *(uint64_t *)p); -} - -static bool store_records(struct tdb_context *tdb) -{ - int i; - struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; - struct tdb_data d, data = { (unsigned char *)&i, sizeof(i) }; - - for (i = 0; i < 1000; i++) { - if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) - return false; - tdb_fetch(tdb, key, &d); - if (!tdb_deq(d, data)) - return false; - free(d.dptr); - } - return true; -} - -static void test_val(struct tdb_context *tdb, uint64_t val) -{ - uint64_t v; - struct tdb_data key = { (unsigned char *)&v, sizeof(v) }; - struct tdb_data d, data = { (unsigned char *)&v, sizeof(v) }; - - /* Insert an entry, then delete it. */ - v = val; - /* Delete should fail. */ - ok1(tdb_delete(tdb, key) == TDB_ERR_NOEXIST); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Insert should succeed. */ - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Delete should succeed. */ - ok1(tdb_delete(tdb, key) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Re-add it, then add collision. */ - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - v = val + 1; - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Can find both? */ - ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); - ok1(d.dsize == data.dsize); - free(d.dptr); - v = val; - ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); - ok1(d.dsize == data.dsize); - free(d.dptr); - - /* Delete second one. */ - v = val + 1; - ok1(tdb_delete(tdb, key) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Re-add */ - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Now, try deleting first one. */ - v = val; - ok1(tdb_delete(tdb, key) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Can still find second? */ - v = val + 1; - ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); - ok1(d.dsize == data.dsize); - free(d.dptr); - - /* Now, this will be ideally placed. */ - v = val + 2; - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* This will collide with both. */ - v = val; - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - - /* We can still find them all, right? */ - ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); - ok1(d.dsize == data.dsize); - free(d.dptr); - v = val + 1; - ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); - ok1(d.dsize == data.dsize); - free(d.dptr); - v = val + 2; - ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); - ok1(d.dsize == data.dsize); - free(d.dptr); - - /* And if we delete val + 1, that val + 2 should not move! */ - v = val + 1; - ok1(tdb_delete(tdb, key) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - v = val; - ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); - ok1(d.dsize == data.dsize); - free(d.dptr); - v = val + 2; - ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); - ok1(d.dsize == data.dsize); - free(d.dptr); - - /* Delete those two, so we are empty. 
*/ - ok1(tdb_delete(tdb, key) == 0); - v = val; - ok1(tdb_delete(tdb, key) == 0); - - ok1(tdb_check(tdb, NULL, NULL) == 0); -} - -int main(int argc, char *argv[]) -{ - unsigned int i, j; - struct tdb_context *tdb; - uint64_t seed = 16014841315512641303ULL; - union tdb_attribute clash_hattr - = { .hash = { .base = { TDB_ATTRIBUTE_HASH }, - .fn = clash } }; - union tdb_attribute fixed_hattr - = { .hash = { .base = { TDB_ATTRIBUTE_HASH }, - .fn = fixedhash, - .data = &seed } }; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - /* These two values gave trouble before. */ - int vals[] = { 755, 837 }; - - clash_hattr.base.next = &tap_log_attr; - fixed_hattr.base.next = &tap_log_attr; - - plan_tests(sizeof(flags) / sizeof(flags[0]) - * (39 * 3 + 5 + sizeof(vals)/sizeof(vals[0])*2) + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-13-delete.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &clash_hattr); - ok1(tdb); - if (!tdb) - continue; - - /* Check start of hash table. */ - test_val(tdb, 0); - - /* Check end of hash table. */ - test_val(tdb, -1ULL); - - /* Check mixed bitpattern. */ - test_val(tdb, 0x123456789ABCDEF0ULL); - - ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0 - && tdb->file->num_lockrecs == 0)); - tdb_close(tdb); - - /* Deleting these entries in the db gave problems. */ - tdb = tdb_open("run-13-delete.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr); - ok1(tdb); - if (!tdb) - continue; - - ok1(store_records(tdb)); - ok1(tdb_check(tdb, NULL, NULL) == 0); - for (j = 0; j < sizeof(vals)/sizeof(vals[0]); j++) { - struct tdb_data key; - - key.dptr = (unsigned char *)&vals[j]; - key.dsize = sizeof(vals[j]); - ok1(tdb_delete(tdb, key) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - } - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/api-14-exists.c b/ccan/tdb2/test/api-14-exists.c deleted file mode 100644 index 698006fa..00000000 --- a/ccan/tdb2/test/api-14-exists.c +++ /dev/null @@ -1,58 +0,0 @@ -#include -#include -#include -#include -#include -#include "logging.h" - -static bool test_records(struct tdb_context *tdb) -{ - int i; - struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; - struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; - - for (i = 0; i < 1000; i++) { - if (tdb_exists(tdb, key)) - return false; - if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) - return false; - if (!tdb_exists(tdb, key)) - return false; - } - - for (i = 0; i < 1000; i++) { - if (!tdb_exists(tdb, key)) - return false; - if (tdb_delete(tdb, key) != 0) - return false; - if (tdb_exists(tdb, key)) - return false; - } - return true; -} - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-14-exists.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - if 
(ok1(tdb)) - ok1(test_records(tdb)); - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/api-16-wipe_all.c b/ccan/tdb2/test/api-16-wipe_all.c deleted file mode 100644 index d17eff8e..00000000 --- a/ccan/tdb2/test/api-16-wipe_all.c +++ /dev/null @@ -1,50 +0,0 @@ -#include -#include -#include -#include -#include -#include "logging.h" - -static bool add_records(struct tdb_context *tdb) -{ - int i; - struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; - struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; - - for (i = 0; i < 1000; i++) { - if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) - return false; - } - return true; -} - - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-16-wipe_all.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - if (ok1(tdb)) { - struct tdb_data key; - ok1(add_records(tdb)); - ok1(tdb_wipe_all(tdb) == TDB_SUCCESS); - ok1(tdb_firstkey(tdb, &key) == TDB_ERR_NOEXIST); - tdb_close(tdb); - } - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/api-21-parse_record.c b/ccan/tdb2/test/api-21-parse_record.c deleted file mode 100644 index def4f456..00000000 --- a/ccan/tdb2/test/api-21-parse_record.c +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include -#include -#include -#include -#include "logging.h" - -static enum TDB_ERROR parse(TDB_DATA key, TDB_DATA data, TDB_DATA *expected) -{ - if (!tdb_deq(data, *expected)) - return TDB_ERR_EINVAL; - return TDB_SUCCESS; -} - -static enum TDB_ERROR parse_err(TDB_DATA key, TDB_DATA data, void *unused) -{ - return 100; -} - -static bool test_records(struct tdb_context *tdb) -{ - int i; - struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; - struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; - - for (i = 0; i < 1000; i++) { - if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) - return false; - } - - for (i = 0; i < 1000; i++) { - if (tdb_parse_record(tdb, key, parse, &data) != TDB_SUCCESS) - return false; - } - - if (tdb_parse_record(tdb, key, parse, &data) != TDB_ERR_NOEXIST) - return false; - - /* Test error return from parse function. 
*/ - i = 0; - if (tdb_parse_record(tdb, key, parse_err, NULL) != 100) - return false; - - return true; -} - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("api-21-parse_record.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - if (ok1(tdb)) - ok1(test_records(tdb)); - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/api-55-transaction.c b/ccan/tdb2/test/api-55-transaction.c deleted file mode 100644 index 9c1044b4..00000000 --- a/ccan/tdb2/test/api-55-transaction.c +++ /dev/null @@ -1,76 +0,0 @@ -#include // struct tdb_context -#include -#include -#include -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - unsigned char *buffer; - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, - TDB_VERSION1, TDB_NOMMAP|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - struct tdb_data key = tdb_mkdata("key", 3); - struct tdb_data data; - - buffer = malloc(1000); - for (i = 0; i < 1000; i++) - buffer[i] = i; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 20 + 1); - - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-55-transaction.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (!tdb) - continue; - - ok1(tdb_transaction_start(tdb) == 0); - data.dptr = buffer; - data.dsize = 1000; - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); - ok1(data.dsize == 1000); - ok1(memcmp(data.dptr, buffer, data.dsize) == 0); - free(data.dptr); - - /* Cancelling a transaction means no store */ - tdb_transaction_cancel(tdb); - ok1(tdb->file->allrecord_lock.count == 0 - && tdb->file->num_lockrecs == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - ok1(tdb_fetch(tdb, key, &data) == TDB_ERR_NOEXIST); - - /* Commit the transaction. 
*/ - ok1(tdb_transaction_start(tdb) == 0); - data.dptr = buffer; - data.dsize = 1000; - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); - ok1(data.dsize == 1000); - ok1(memcmp(data.dptr, buffer, data.dsize) == 0); - free(data.dptr); - ok1(tdb_transaction_commit(tdb) == 0); - ok1(tdb->file->allrecord_lock.count == 0 - && tdb->file->num_lockrecs == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); - ok1(data.dsize == 1000); - ok1(memcmp(data.dptr, buffer, data.dsize) == 0); - free(data.dptr); - - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - free(buffer); - return exit_status(); -} diff --git a/ccan/tdb2/test/api-80-tdb_fd.c b/ccan/tdb2/test/api-80-tdb_fd.c deleted file mode 100644 index 0088f9b7..00000000 --- a/ccan/tdb2/test/api-80-tdb_fd.c +++ /dev/null @@ -1,36 +0,0 @@ -#include -#include -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 3); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("api-80-tdb_fd.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - if (!ok1(tdb)) - continue; - - if (flags[i] & TDB_INTERNAL) - ok1(tdb_fd(tdb) == -1); - else - ok1(tdb_fd(tdb) > 2); - tdb_close(tdb); - ok1(tap_log_messages == 0); - } - return exit_status(); -} diff --git a/ccan/tdb2/test/api-81-seqnum.c b/ccan/tdb2/test/api-81-seqnum.c deleted file mode 100644 index c1eb7517..00000000 --- a/ccan/tdb2/test/api-81-seqnum.c +++ /dev/null @@ -1,79 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - unsigned int i, seq; - struct tdb_context *tdb; - struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */ - struct tdb_data key = tdb_mkdata("key", 3); - struct tdb_data data = tdb_mkdata("data", 4); - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 15 + 8 * 13); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("api-81-seqnum.tdb", flags[i]|TDB_SEQNUM, - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - if (!ok1(tdb)) - continue; - - seq = 0; - ok1(tdb_get_seqnum(tdb) == seq); - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - ok1(tdb_get_seqnum(tdb) == ++seq); - /* Fetch doesn't change seqnum */ - if (ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS)) - free(d.dptr); - ok1(tdb_get_seqnum(tdb) == seq); - ok1(tdb_append(tdb, key, data) == TDB_SUCCESS); - /* Append in tdb1 (or store over value) bumps twice! 
*/ - if (flags[i] & TDB_VERSION1) - seq++; - ok1(tdb_get_seqnum(tdb) == ++seq); - - ok1(tdb_delete(tdb, key) == TDB_SUCCESS); - ok1(tdb_get_seqnum(tdb) == ++seq); - /* Empty append works */ - ok1(tdb_append(tdb, key, data) == TDB_SUCCESS); - ok1(tdb_get_seqnum(tdb) == ++seq); - - ok1(tdb_wipe_all(tdb) == TDB_SUCCESS); - ok1(tdb_get_seqnum(tdb) == ++seq); - - if (!(flags[i] & TDB_INTERNAL)) { - ok1(tdb_transaction_start(tdb) == TDB_SUCCESS); - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - ok1(tdb_get_seqnum(tdb) == ++seq); - /* Append in tdb1 (or store over value) bumps twice! */ - if (flags[i] & TDB_VERSION1) - seq++; - ok1(tdb_append(tdb, key, data) == TDB_SUCCESS); - ok1(tdb_get_seqnum(tdb) == ++seq); - ok1(tdb_delete(tdb, key) == TDB_SUCCESS); - ok1(tdb_get_seqnum(tdb) == ++seq); - ok1(tdb_transaction_commit(tdb) == TDB_SUCCESS); - ok1(tdb_get_seqnum(tdb) == seq); - - ok1(tdb_transaction_start(tdb) == TDB_SUCCESS); - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - ok1(tdb_get_seqnum(tdb) == seq + 1); - tdb_transaction_cancel(tdb); - ok1(tdb_get_seqnum(tdb) == seq); - } - tdb_close(tdb); - ok1(tap_log_messages == 0); - } - return exit_status(); -} diff --git a/ccan/tdb2/test/api-82-lockattr.c b/ccan/tdb2/test/api-82-lockattr.c deleted file mode 100644 index 048feacf..00000000 --- a/ccan/tdb2/test/api-82-lockattr.c +++ /dev/null @@ -1,248 +0,0 @@ -#include // for tdb_fcntl_unlock -#include -#include -#include -#include -#include -#include -#include "logging.h" - -static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag, - void *_err) -{ - int *lock_err = _err; - struct flock fl; - int ret; - - if (*lock_err) { - errno = *lock_err; - return -1; - } - - do { - fl.l_type = rw; - fl.l_whence = SEEK_SET; - fl.l_start = off; - fl.l_len = len; - - if (waitflag) - ret = fcntl(fd, F_SETLKW, &fl); - else - ret = fcntl(fd, F_SETLK, &fl); - } while (ret != 0 && errno == EINTR); - - return ret; -} - -static int trav_err; -static int trav(struct tdb_context *tdb, TDB_DATA k, TDB_DATA d, int *err) -{ - *err = trav_err; - return 0; -} - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, - TDB_VERSION1, TDB_NOMMAP|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - union tdb_attribute lock_attr; - struct tdb_data key = tdb_mkdata("key", 3); - struct tdb_data data = tdb_mkdata("data", 4); - int lock_err; - - lock_attr.base.attr = TDB_ATTRIBUTE_FLOCK; - lock_attr.base.next = &tap_log_attr; - lock_attr.flock.lock = mylock; - lock_attr.flock.unlock = tdb_fcntl_unlock; - lock_attr.flock.data = &lock_err; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 80); - - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - struct tdb_data d; - unsigned int num_oom_messages; - - /* TDB1 double logs here. */ - if (flags[i] & TDB_VERSION1) { - num_oom_messages = 2; - } else { - num_oom_messages = 1; - } - - /* Nonblocking open; expect no error message. */ - lock_err = EAGAIN; - tdb = tdb_open("run-82-lockattr.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr); - ok(errno == lock_err, "Errno is %u", errno); - ok1(!tdb); - ok1(tap_log_messages == 0); - - lock_err = EINTR; - tdb = tdb_open("run-82-lockattr.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr); - ok(errno == lock_err, "Errno is %u", errno); - ok1(!tdb); - ok1(tap_log_messages == 0); - - /* Forced fail open. 
*/ - lock_err = ENOMEM; - tdb = tdb_open("run-82-lockattr.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr); - ok1(errno == lock_err); - ok1(!tdb); - ok1(tap_log_messages == 1); - tap_log_messages = 0; - - lock_err = 0; - tdb = tdb_open("run-82-lockattr.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr); - if (!ok1(tdb)) - continue; - ok1(tap_log_messages == 0); - - /* Nonblocking store. */ - lock_err = EAGAIN; - ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = EINTR; - ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = ENOMEM; - ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK); - ok1(tap_log_messages == num_oom_messages); - tap_log_messages = 0; - - /* Nonblocking fetch. */ - lock_err = EAGAIN; - ok1(!tdb_exists(tdb, key)); - ok1(tap_log_messages == 0); - lock_err = EINTR; - ok1(!tdb_exists(tdb, key)); - ok1(tap_log_messages == 0); - lock_err = ENOMEM; - ok1(!tdb_exists(tdb, key)); - ok1(tap_log_messages == num_oom_messages); - tap_log_messages = 0; - - lock_err = EAGAIN; - ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = EINTR; - ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = ENOMEM; - ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK); - ok1(tap_log_messages == num_oom_messages); - tap_log_messages = 0; - - /* Nonblocking delete. */ - lock_err = EAGAIN; - ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = EINTR; - ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = ENOMEM; - ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK); - ok1(tap_log_messages == num_oom_messages); - tap_log_messages = 0; - - /* Nonblocking locks. */ - lock_err = EAGAIN; - ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = EINTR; - ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = ENOMEM; - ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK); - ok1(tap_log_messages == num_oom_messages); - tap_log_messages = 0; - - lock_err = EAGAIN; - ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = EINTR; - ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = ENOMEM; - ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK); - ok1(tap_log_messages == num_oom_messages); - tap_log_messages = 0; - - lock_err = EAGAIN; - ok1(tdb_lockall(tdb) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = EINTR; - ok1(tdb_lockall(tdb) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = ENOMEM; - ok1(tdb_lockall(tdb) == TDB_ERR_LOCK); - /* This actually does divide and conquer. */ - ok1(tap_log_messages > 0); - tap_log_messages = 0; - - lock_err = EAGAIN; - ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = EINTR; - ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = ENOMEM; - ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK); - ok1(tap_log_messages > 0); - tap_log_messages = 0; - - /* Nonblocking traverse; go nonblock partway through. 
*/ - lock_err = 0; - ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0); - trav_err = EAGAIN; - ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - trav_err = EINTR; - lock_err = 0; - ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - trav_err = ENOMEM; - lock_err = 0; - ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK); - ok1(tap_log_messages == num_oom_messages); - tap_log_messages = 0; - - /* Nonblocking transactions. */ - lock_err = EAGAIN; - ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = EINTR; - ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - lock_err = ENOMEM; - ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK); - ok1(tap_log_messages == 1); - tap_log_messages = 0; - - /* Nonblocking transaction prepare. */ - lock_err = 0; - ok1(tdb_transaction_start(tdb) == 0); - ok1(tdb_delete(tdb, key) == 0); - - lock_err = EAGAIN; - ok1(tdb_transaction_prepare_commit(tdb) == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - - lock_err = 0; - ok1(tdb_transaction_prepare_commit(tdb) == 0); - ok1(tdb_transaction_commit(tdb) == 0); - - /* And the transaction was committed, right? */ - ok1(!tdb_exists(tdb, key)); - tdb_close(tdb); - ok1(tap_log_messages == 0); - } - return exit_status(); -} diff --git a/ccan/tdb2/test/api-83-openhook.c b/ccan/tdb2/test/api-83-openhook.c deleted file mode 100644 index e7e94738..00000000 --- a/ccan/tdb2/test/api-83-openhook.c +++ /dev/null @@ -1,99 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "external-agent.h" -#include "logging.h" - -static enum TDB_ERROR clear_if_first(int fd, void *arg) -{ -/* We hold a lock offset 4 always, so we can tell if anyone is holding it. - * (This is compatible with tdb1's TDB_CLEAR_IF_FIRST flag). */ - struct flock fl; - - if (arg != clear_if_first) - return TDB_ERR_CORRUPT; - - fl.l_type = F_WRLCK; - fl.l_whence = SEEK_SET; - fl.l_start = 4; - fl.l_len = 1; - - if (fcntl(fd, F_SETLK, &fl) == 0) { - /* We must be first ones to open it! */ - diag("truncating file!"); - if (ftruncate(fd, 0) != 0) { - return TDB_ERR_IO; - } - } - fl.l_type = F_RDLCK; - if (fcntl(fd, F_SETLKW, &fl) != 0) { - return TDB_ERR_IO; - } - return TDB_SUCCESS; -} - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - struct agent *agent; - union tdb_attribute cif; - struct tdb_data key = tdb_mkdata("key", 3); - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, - TDB_VERSION1, TDB_NOMMAP|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - - cif.openhook.base.attr = TDB_ATTRIBUTE_OPENHOOK; - cif.openhook.base.next = &tap_log_attr; - cif.openhook.fn = clear_if_first; - cif.openhook.data = clear_if_first; - - agent = prepare_external_agent(); - plan_tests(sizeof(flags) / sizeof(flags[0]) * 13); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - /* Create it */ - tdb = tdb_open("run-83-openhook.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, NULL); - ok1(tdb); - ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0); - tdb_close(tdb); - - /* Now, open with CIF, should clear it. */ - tdb = tdb_open("run-83-openhook.tdb", flags[i], - O_RDWR, 0, &cif); - ok1(tdb); - ok1(!tdb_exists(tdb, key)); - ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0); - - /* Agent should not clear it, since it's still open. 
*/ - ok1(external_agent_operation(agent, OPEN_WITH_HOOK, - "run-83-openhook.tdb") == SUCCESS); - ok1(external_agent_operation(agent, FETCH, "key") == SUCCESS); - ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS); - - /* Still exists for us too. */ - ok1(tdb_exists(tdb, key)); - - /* Close it, now agent should clear it. */ - tdb_close(tdb); - - ok1(external_agent_operation(agent, OPEN_WITH_HOOK, - "run-83-openhook.tdb") == SUCCESS); - ok1(external_agent_operation(agent, FETCH, "key") == FAILED); - ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS); - - ok1(tap_log_messages == 0); - } - - free_external_agent(agent); - return exit_status(); -} diff --git a/ccan/tdb2/test/api-91-get-stats.c b/ccan/tdb2/test/api-91-get-stats.c deleted file mode 100644 index d9a22ca4..00000000 --- a/ccan/tdb2/test/api-91-get-stats.c +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, - TDB_VERSION1, TDB_NOMMAP|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 11); - - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - union tdb_attribute *attr; - struct tdb_data key = tdb_mkdata("key", 3); - - tdb = tdb_open("run-91-get-stats.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0); - - /* Use malloc so valgrind will catch overruns. */ - attr = malloc(sizeof *attr); - attr->stats.base.attr = TDB_ATTRIBUTE_STATS; - attr->stats.size = sizeof(*attr); - - ok1(tdb_get_attribute(tdb, attr) == 0); - ok1(attr->stats.size == sizeof(*attr)); - ok1(attr->stats.allocs > 0); - ok1(attr->stats.expands > 0); - ok1(attr->stats.locks > 0); - free(attr); - - /* Try short one. */ - attr = malloc(offsetof(struct tdb_attribute_stats, allocs) - + sizeof(attr->stats.allocs)); - attr->stats.base.attr = TDB_ATTRIBUTE_STATS; - attr->stats.size = offsetof(struct tdb_attribute_stats, allocs) - + sizeof(attr->stats.allocs); - ok1(tdb_get_attribute(tdb, attr) == 0); - ok1(attr->stats.size == sizeof(*attr)); - ok1(attr->stats.allocs > 0); - free(attr); - ok1(tap_log_messages == 0); - - tdb_close(tdb); - - } - return exit_status(); -} diff --git a/ccan/tdb2/test/api-92-get-set-readonly.c b/ccan/tdb2/test/api-92-get-set-readonly.c deleted file mode 100644 index 483b50d7..00000000 --- a/ccan/tdb2/test/api-92-get-set-readonly.c +++ /dev/null @@ -1,120 +0,0 @@ -#include -#include -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - unsigned int i, extra_msgs; - struct tdb_context *tdb; - struct tdb_data key = tdb_mkdata("key", 3); - struct tdb_data data = tdb_mkdata("data", 4); - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, - TDB_VERSION1, TDB_NOMMAP|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 48); - - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - /* RW -> R0 */ - tdb = tdb_open("run-92-get-set-readonly.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - ok1(!(tdb_get_flags(tdb) & TDB_RDONLY)); - - /* TDB1 complains multiple times. 
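   (For TDB_VERSION1 runs some of the failing calls below log one extra
   message; the test compensates by subtracting extra_msgs before comparing
   tap_log_messages.)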
*/ - if (flags[i] & TDB_VERSION1) { - extra_msgs = 1; - } else { - extra_msgs = 0; - } - - ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_SUCCESS); - - tdb_add_flag(tdb, TDB_RDONLY); - ok1(tdb_get_flags(tdb) & TDB_RDONLY); - - /* Can't store, append, delete. */ - ok1(tdb_store(tdb, key, data, TDB_MODIFY) == TDB_ERR_RDONLY); - ok1(tap_log_messages == 1); - ok1(tdb_append(tdb, key, data) == TDB_ERR_RDONLY); - tap_log_messages -= extra_msgs; - ok1(tap_log_messages == 2); - ok1(tdb_delete(tdb, key) == TDB_ERR_RDONLY); - tap_log_messages -= extra_msgs; - ok1(tap_log_messages == 3); - - /* Can't start a transaction, or any write lock. */ - ok1(tdb_transaction_start(tdb) == TDB_ERR_RDONLY); - ok1(tap_log_messages == 4); - ok1(tdb_chainlock(tdb, key) == TDB_ERR_RDONLY); - tap_log_messages -= extra_msgs; - ok1(tap_log_messages == 5); - ok1(tdb_lockall(tdb) == TDB_ERR_RDONLY); - ok1(tap_log_messages == 6); - ok1(tdb_wipe_all(tdb) == TDB_ERR_RDONLY); - ok1(tap_log_messages == 7); - - /* Back to RW. */ - tdb_remove_flag(tdb, TDB_RDONLY); - ok1(!(tdb_get_flags(tdb) & TDB_RDONLY)); - - ok1(tdb_store(tdb, key, data, TDB_MODIFY) == TDB_SUCCESS); - ok1(tdb_append(tdb, key, data) == TDB_SUCCESS); - ok1(tdb_delete(tdb, key) == TDB_SUCCESS); - - ok1(tdb_transaction_start(tdb) == TDB_SUCCESS); - ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_SUCCESS); - ok1(tdb_transaction_commit(tdb) == TDB_SUCCESS); - - ok1(tdb_chainlock(tdb, key) == TDB_SUCCESS); - tdb_chainunlock(tdb, key); - ok1(tdb_lockall(tdb) == TDB_SUCCESS); - tdb_unlockall(tdb); - ok1(tdb_wipe_all(tdb) == TDB_SUCCESS); - ok1(tap_log_messages == 7); - - tdb_close(tdb); - - /* R0 -> RW */ - tdb = tdb_open("run-92-get-set-readonly.tdb", flags[i], - O_RDONLY, 0600, &tap_log_attr); - ok1(tdb); - ok1(tdb_get_flags(tdb) & TDB_RDONLY); - - /* Can't store, append, delete. */ - ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_ERR_RDONLY); - ok1(tap_log_messages == 8); - ok1(tdb_append(tdb, key, data) == TDB_ERR_RDONLY); - tap_log_messages -= extra_msgs; - ok1(tap_log_messages == 9); - ok1(tdb_delete(tdb, key) == TDB_ERR_RDONLY); - tap_log_messages -= extra_msgs; - ok1(tap_log_messages == 10); - - /* Can't start a transaction, or any write lock. 
*/ - ok1(tdb_transaction_start(tdb) == TDB_ERR_RDONLY); - ok1(tap_log_messages == 11); - ok1(tdb_chainlock(tdb, key) == TDB_ERR_RDONLY); - tap_log_messages -= extra_msgs; - ok1(tap_log_messages == 12); - ok1(tdb_lockall(tdb) == TDB_ERR_RDONLY); - ok1(tap_log_messages == 13); - ok1(tdb_wipe_all(tdb) == TDB_ERR_RDONLY); - ok1(tap_log_messages == 14); - - /* Can't remove TDB_RDONLY since we opened with O_RDONLY */ - tdb_remove_flag(tdb, TDB_RDONLY); - ok1(tap_log_messages == 15); - ok1(tdb_get_flags(tdb) & TDB_RDONLY); - tdb_close(tdb); - - ok1(tap_log_messages == 15); - tap_log_messages = 0; - } - return exit_status(); -} diff --git a/ccan/tdb2/test/api-93-repack.c b/ccan/tdb2/test/api-93-repack.c deleted file mode 100644 index 74a8b5ed..00000000 --- a/ccan/tdb2/test/api-93-repack.c +++ /dev/null @@ -1,82 +0,0 @@ -#include -#include -#include -#include -#include -#include "logging.h" - -#define NUM_TESTS 1000 - -static bool store_all(struct tdb_context *tdb) -{ - unsigned int i; - struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; - struct tdb_data dbuf = { (unsigned char *)&i, sizeof(i) }; - - for (i = 0; i < NUM_TESTS; i++) { - if (tdb_store(tdb, key, dbuf, TDB_INSERT) != TDB_SUCCESS) - return false; - } - return true; -} - -static int mark_entry(struct tdb_context *tdb, - TDB_DATA key, TDB_DATA data, bool found[]) -{ - unsigned int num; - - if (key.dsize != sizeof(num)) - return -1; - memcpy(&num, key.dptr, key.dsize); - if (num >= NUM_TESTS) - return -1; - if (found[num]) - return -1; - found[num] = true; - return 0; -} - -static bool is_all_set(bool found[], unsigned int num) -{ - unsigned int i; - - for (i = 0; i < num; i++) - if (!found[i]) - return false; - return true; -} - -int main(int argc, char *argv[]) -{ - unsigned int i; - bool found[NUM_TESTS]; - struct tdb_context *tdb; - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, - TDB_VERSION1, TDB_VERSION1|TDB_NOMMAP, - TDB_VERSION1|TDB_CONVERT, - TDB_VERSION1|TDB_NOMMAP|TDB_CONVERT - }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 6 + 1); - - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-93-repack.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (!tdb) - break; - - ok1(store_all(tdb)); - - ok1(tdb_repack(tdb) == TDB_SUCCESS); - memset(found, 0, sizeof(found)); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - ok1(tdb_traverse(tdb, mark_entry, found) == NUM_TESTS); - ok1(is_all_set(found, NUM_TESTS)); - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/api-add-remove-flags.c b/ccan/tdb2/test/api-add-remove-flags.c deleted file mode 100644 index 231b9f6c..00000000 --- a/ccan/tdb2/test/api-add-remove-flags.c +++ /dev/null @@ -1,94 +0,0 @@ -#include // for tdb_context -#include -#include -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - - plan_tests(173); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-add-remove-flags.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (!tdb) - continue; - - ok1(tdb_get_flags(tdb) == 
tdb->flags); - tap_log_messages = 0; - tdb_add_flag(tdb, TDB_NOLOCK); - if (flags[i] & TDB_INTERNAL) - ok1(tap_log_messages == 1); - else { - ok1(tap_log_messages == 0); - ok1(tdb_get_flags(tdb) & TDB_NOLOCK); - } - - tap_log_messages = 0; - tdb_add_flag(tdb, TDB_NOMMAP); - if (flags[i] & TDB_INTERNAL) - ok1(tap_log_messages == 1); - else { - ok1(tap_log_messages == 0); - ok1(tdb_get_flags(tdb) & TDB_NOMMAP); - ok1(tdb->file->map_ptr == NULL); - } - - tap_log_messages = 0; - tdb_add_flag(tdb, TDB_NOSYNC); - if (flags[i] & TDB_INTERNAL) - ok1(tap_log_messages == 1); - else { - ok1(tap_log_messages == 0); - ok1(tdb_get_flags(tdb) & TDB_NOSYNC); - } - - ok1(tdb_get_flags(tdb) == tdb->flags); - - tap_log_messages = 0; - tdb_remove_flag(tdb, TDB_NOLOCK); - if (flags[i] & TDB_INTERNAL) - ok1(tap_log_messages == 1); - else { - ok1(tap_log_messages == 0); - ok1(!(tdb_get_flags(tdb) & TDB_NOLOCK)); - } - - tap_log_messages = 0; - tdb_remove_flag(tdb, TDB_NOMMAP); - if (flags[i] & TDB_INTERNAL) - ok1(tap_log_messages == 1); - else { - ok1(tap_log_messages == 0); - ok1(!(tdb_get_flags(tdb) & TDB_NOMMAP)); - ok1(tdb->file->map_ptr != NULL); - } - - tap_log_messages = 0; - tdb_remove_flag(tdb, TDB_NOSYNC); - if (flags[i] & TDB_INTERNAL) - ok1(tap_log_messages == 1); - else { - ok1(tap_log_messages == 0); - ok1(!(tdb_get_flags(tdb) & TDB_NOSYNC)); - } - - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/api-check-callback.c b/ccan/tdb2/test/api-check-callback.c deleted file mode 100644 index 1ea263d3..00000000 --- a/ccan/tdb2/test/api-check-callback.c +++ /dev/null @@ -1,90 +0,0 @@ -#include -#include -#include -#include -#include -#include "logging.h" - -#define NUM_RECORDS 1000 - -static bool store_records(struct tdb_context *tdb) -{ - int i; - struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; - struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; - - for (i = 0; i < NUM_RECORDS; i++) - if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) - return false; - return true; -} - -static enum TDB_ERROR check(struct tdb_data key, - struct tdb_data data, - bool *array) -{ - int val; - - if (key.dsize != sizeof(val)) { - diag("Wrong key size: %zu\n", key.dsize); - return TDB_ERR_CORRUPT; - } - - if (key.dsize != data.dsize - || memcmp(key.dptr, data.dptr, sizeof(val)) != 0) { - diag("Key and data differ\n"); - return TDB_ERR_CORRUPT; - } - - memcpy(&val, key.dptr, sizeof(val)); - if (val >= NUM_RECORDS || val < 0) { - diag("check value %i\n", val); - return TDB_ERR_CORRUPT; - } - - if (array[val]) { - diag("Value %i already seen\n", val); - return TDB_ERR_CORRUPT; - } - - array[val] = true; - return TDB_SUCCESS; -} - -int main(int argc, char *argv[]) -{ - unsigned int i, j; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - bool array[NUM_RECORDS]; - - tdb = tdb_open("run-check-callback.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (!tdb) - continue; - - ok1(store_records(tdb)); - for (j = 0; j < NUM_RECORDS; j++) - array[j] = false; - ok1(tdb_check(tdb, check, array) == TDB_SUCCESS); - for (j = 0; j < NUM_RECORDS; 
j++) - if (!array[j]) - break; - ok1(j == NUM_RECORDS); - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/api-firstkey-nextkey.c b/ccan/tdb2/test/api-firstkey-nextkey.c deleted file mode 100644 index e0374d86..00000000 --- a/ccan/tdb2/test/api-firstkey-nextkey.c +++ /dev/null @@ -1,163 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "logging.h" - -#define NUM_RECORDS 1000 - -static bool store_records(struct tdb_context *tdb) -{ - int i; - struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; - struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; - - for (i = 0; i < NUM_RECORDS; i++) - if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) - return false; - return true; -} - -struct trav_data { - unsigned int records[NUM_RECORDS]; - unsigned int calls; -}; - -static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p) -{ - struct trav_data *td = p; - int val; - - memcpy(&val, dbuf.dptr, dbuf.dsize); - td->records[td->calls++] = val; - return 0; -} - -/* Since tdb_nextkey frees dptr, we need to clone it. */ -static TDB_DATA dup_key(TDB_DATA key) -{ - void *p = malloc(key.dsize); - memcpy(p, key.dptr, key.dsize); - key.dptr = p; - return key; -} - -int main(int argc, char *argv[]) -{ - unsigned int i, j; - int num; - struct trav_data td; - TDB_DATA k; - struct tdb_context *tdb; - union tdb_attribute seed_attr; - enum TDB_ERROR ecode; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - - seed_attr.base.attr = TDB_ATTRIBUTE_SEED; - seed_attr.base.next = &tap_log_attr; - seed_attr.seed.seed = 6334326220117065685ULL; - - plan_tests(sizeof(flags) / sizeof(flags[0]) - * (NUM_RECORDS*6 + (NUM_RECORDS-1)*3 + 22) + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("api-firstkey-nextkey.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, - flags[i] & TDB_VERSION1 ? NULL : &seed_attr); - ok1(tdb); - if (!tdb) - continue; - - ok1(tdb_firstkey(tdb, &k) == TDB_ERR_NOEXIST); - - /* One entry... */ - k.dptr = (unsigned char *)# - k.dsize = sizeof(num); - num = 0; - ok1(tdb_store(tdb, k, k, TDB_INSERT) == 0); - ok1(tdb_firstkey(tdb, &k) == TDB_SUCCESS); - ok1(k.dsize == sizeof(num)); - ok1(memcmp(k.dptr, &num, sizeof(num)) == 0); - ok1(tdb_nextkey(tdb, &k) == TDB_ERR_NOEXIST); - - /* Two entries. */ - k.dptr = (unsigned char *)# - k.dsize = sizeof(num); - num = 1; - ok1(tdb_store(tdb, k, k, TDB_INSERT) == 0); - ok1(tdb_firstkey(tdb, &k) == TDB_SUCCESS); - ok1(k.dsize == sizeof(num)); - memcpy(&num, k.dptr, sizeof(num)); - ok1(num == 0 || num == 1); - ok1(tdb_nextkey(tdb, &k) == TDB_SUCCESS); - ok1(k.dsize == sizeof(j)); - memcpy(&j, k.dptr, sizeof(j)); - ok1(j == 0 || j == 1); - ok1(j != num); - ok1(tdb_nextkey(tdb, &k) == TDB_ERR_NOEXIST); - - /* Clean up. */ - k.dptr = (unsigned char *)# - k.dsize = sizeof(num); - num = 0; - ok1(tdb_delete(tdb, k) == 0); - num = 1; - ok1(tdb_delete(tdb, k) == 0); - - /* Now lots of records. 
*/ - ok1(store_records(tdb)); - td.calls = 0; - - num = tdb_traverse(tdb, trav, &td); - ok1(num == NUM_RECORDS); - ok1(td.calls == NUM_RECORDS); - - /* Simple loop should match tdb_traverse */ - for (j = 0, ecode = tdb_firstkey(tdb, &k); j < td.calls; j++) { - int val; - - ok1(ecode == TDB_SUCCESS); - ok1(k.dsize == sizeof(val)); - memcpy(&val, k.dptr, k.dsize); - ok1(td.records[j] == val); - ecode = tdb_nextkey(tdb, &k); - } - - /* But arbitrary orderings should work too. */ - for (j = td.calls-1; j > 0; j--) { - k.dptr = (unsigned char *)&td.records[j-1]; - k.dsize = sizeof(td.records[j-1]); - k = dup_key(k); - ok1(tdb_nextkey(tdb, &k) == TDB_SUCCESS); - ok1(k.dsize == sizeof(td.records[j])); - ok1(memcmp(k.dptr, &td.records[j], k.dsize) == 0); - free(k.dptr); - } - - /* Even delete should work. */ - for (j = 0, ecode = tdb_firstkey(tdb, &k); - ecode != TDB_ERR_NOEXIST; - j++) { - ok1(ecode == TDB_SUCCESS); - ok1(k.dsize == 4); - ok1(tdb_delete(tdb, k) == 0); - ecode = tdb_nextkey(tdb, &k); - } - - diag("delete using first/nextkey gave %u of %u records", - j, NUM_RECORDS); - ok1(j == NUM_RECORDS); - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/api-fork-test.c b/ccan/tdb2/test/api-fork-test.c deleted file mode 100644 index 6feb618c..00000000 --- a/ccan/tdb2/test/api-fork-test.c +++ /dev/null @@ -1,204 +0,0 @@ -/* Test forking while holding lock. - * - * There are only five ways to do this currently: - * (1) grab a tdb_chainlock, then fork. - * (2) grab a tdb_lockall, then fork. - * (3) grab a tdb_lockall_read, then fork. - * (4) start a transaction, then fork. - * (5) fork from inside a tdb_parse() callback. - * - * Note that we don't hold a lock across tdb_traverse callbacks, so - * that doesn't matter. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "logging.h" - -static enum TDB_ERROR fork_in_parse(TDB_DATA key, TDB_DATA data, - struct tdb_context *tdb) -{ - int status, extra_messages; - - if (tdb_get_flags(tdb) & TDB_VERSION1) { - extra_messages = 1; - } else { - extra_messages = 0; - } - - if (fork() == 0) { - /* We expect this to fail. */ - if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK) - exit(1); - tap_log_messages -= extra_messages; - - if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK) - exit(1); - - tap_log_messages -= extra_messages; - if (tap_log_messages != 2) - exit(2); - - tdb_close(tdb); - if (tap_log_messages != 2) - exit(3); - exit(0); - } - wait(&status); - ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0); - return TDB_SUCCESS; -} - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, - TDB_VERSION1, TDB_NOMMAP|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - struct tdb_data key = tdb_mkdata("key", 3); - struct tdb_data data = tdb_mkdata("data", 4); - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 14); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - int status, extra_messages; - - if (flags[i] & TDB_VERSION1) { - extra_messages = 1; - } else { - extra_messages = 0; - } - - tap_log_messages = 0; - - tdb = tdb_open("run-fork-test.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - if (!ok1(tdb)) - continue; - - /* Put a record in here. 
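   (fcntl locks are per-process and are not inherited across fork(), so each
   child below has a tdb handle that claims locks the child process does not
   actually own; tdb presumably detects this, and every operation attempted in
   the child is expected to fail with TDB_ERR_LOCK.)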
*/ - ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_SUCCESS); - - ok1(tdb_chainlock(tdb, key) == TDB_SUCCESS); - if (fork() == 0) { - /* We expect this to fail. */ - if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK) - return 1; - tap_log_messages -= extra_messages; - - if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK) - return 1; - tap_log_messages -= extra_messages; - - if (tap_log_messages != 2) - return 2; - - tdb_chainunlock(tdb, key); - if (tap_log_messages != 3) - return 3; - tdb_close(tdb); - if (tap_log_messages != 3) - return 4; - return 0; - } - wait(&status); - ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0); - tdb_chainunlock(tdb, key); - - ok1(tdb_lockall(tdb) == TDB_SUCCESS); - if (fork() == 0) { - /* We expect this to fail. */ - if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK) - return 1; - tap_log_messages -= extra_messages; - - if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK) - return 1; - tap_log_messages -= extra_messages; - - if (tap_log_messages != 2) - return 2; - - tdb_unlockall(tdb); - if (tap_log_messages != 2) - return 3; - tdb_close(tdb); - if (tap_log_messages != 2) - return 4; - return 0; - } - wait(&status); - ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0); - tdb_unlockall(tdb); - - ok1(tdb_lockall_read(tdb) == TDB_SUCCESS); - if (fork() == 0) { - /* We expect this to fail. */ - /* This would always fail anyway... */ - if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK) - return 1; - tap_log_messages -= extra_messages; - - if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK) - return 1; - tap_log_messages -= extra_messages; - - if (tap_log_messages != 2) - return 2; - - tdb_unlockall_read(tdb); - if (tap_log_messages != 2) - return 3; - tdb_close(tdb); - if (tap_log_messages != 2) - return 4; - return 0; - } - wait(&status); - ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0); - tdb_unlockall_read(tdb); - - ok1(tdb_transaction_start(tdb) == TDB_SUCCESS); - /* If transactions is empty, noop "commit" succeeds. */ - ok1(tdb_delete(tdb, key) == TDB_SUCCESS); - if (fork() == 0) { - /* We expect this to fail. */ - if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK) - return 1; - tap_log_messages -= extra_messages; - - if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK) - return 1; - tap_log_messages -= extra_messages; - - if (tap_log_messages != 2) - return 2; - - if (tdb_transaction_commit(tdb) != TDB_ERR_LOCK) - return 3; - tap_log_messages -= extra_messages; - - tdb_close(tdb); - if (tap_log_messages < 3) - return 4; - return 0; - } - wait(&status); - ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0); - tdb_transaction_cancel(tdb); - - ok1(tdb_parse_record(tdb, key, fork_in_parse, tdb) - == TDB_SUCCESS); - tdb_close(tdb); - ok1(tap_log_messages == 0); - } - return exit_status(); -} diff --git a/ccan/tdb2/test/api-locktimeout.c b/ccan/tdb2/test/api-locktimeout.c deleted file mode 100644 index 21a26c46..00000000 --- a/ccan/tdb2/test/api-locktimeout.c +++ /dev/null @@ -1,194 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include "logging.h" -#include "external-agent.h" - -#undef alarm -#define alarm fast_alarm - -/* Speed things up by doing things in milliseconds. 
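   (fast_alarm() below reinterprets the alarm() argument as milliseconds by
   programming an ITIMER_REAL timer, so the timeout of 20 used by these tests
   means 20ms rather than 20 seconds.)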
*/ -static unsigned int fast_alarm(unsigned int milli_seconds) -{ - struct itimerval it; - - it.it_interval.tv_sec = it.it_interval.tv_usec = 0; - it.it_value.tv_sec = milli_seconds / 1000; - it.it_value.tv_usec = milli_seconds * 1000; - setitimer(ITIMER_REAL, &it, NULL); - return 0; -} - -#define CatchSignal(sig, handler) signal((sig), (handler)) - -static void do_nothing(int signum) -{ -} - -/* This example code is taken from SAMBA, so try not to change it. */ -static struct flock flock_struct; - -/* Return a value which is none of v1, v2 or v3. */ -static inline short int invalid_value(short int v1, short int v2, short int v3) -{ - short int try = (v1+v2+v3)^((v1+v2+v3) << 16); - while (try == v1 || try == v2 || try == v3) - try++; - return try; -} - -/* We invalidate in as many ways as we can, so the OS rejects it */ -static void invalidate_flock_struct(int signum) -{ - flock_struct.l_type = invalid_value(F_RDLCK, F_WRLCK, F_UNLCK); - flock_struct.l_whence = invalid_value(SEEK_SET, SEEK_CUR, SEEK_END); - flock_struct.l_start = -1; - /* A large negative. */ - flock_struct.l_len = (((off_t)1 << (sizeof(off_t)*CHAR_BIT - 1)) + 1); -} - -static int timeout_lock(int fd, int rw, off_t off, off_t len, bool waitflag, - void *_timeout) -{ - int ret, saved_errno = errno; - unsigned int timeout = *(unsigned int *)_timeout; - - flock_struct.l_type = rw; - flock_struct.l_whence = SEEK_SET; - flock_struct.l_start = off; - flock_struct.l_len = len; - - CatchSignal(SIGALRM, invalidate_flock_struct); - alarm(timeout); - - for (;;) { - if (waitflag) - ret = fcntl(fd, F_SETLKW, &flock_struct); - else - ret = fcntl(fd, F_SETLK, &flock_struct); - - if (ret == 0) - break; - - /* Not signalled? Something else went wrong. */ - if (flock_struct.l_len == len) { - if (errno == EAGAIN || errno == EINTR) - continue; - saved_errno = errno; - break; - } else { - saved_errno = EINTR; - break; - } - } - - alarm(0); - errno = saved_errno; - return ret; -} - -static int tdb_chainlock_with_timeout_internal(struct tdb_context *tdb, - TDB_DATA key, - unsigned int timeout, - int rw_type) -{ - union tdb_attribute locking; - enum TDB_ERROR ecode; - - if (timeout) { - locking.base.attr = TDB_ATTRIBUTE_FLOCK; - ecode = tdb_get_attribute(tdb, &locking); - if (ecode != TDB_SUCCESS) - return ecode; - - /* Replace locking function with our own. */ - locking.flock.data = &timeout; - locking.flock.lock = timeout_lock; - - ecode = tdb_set_attribute(tdb, &locking); - if (ecode != TDB_SUCCESS) - return ecode; - } - if (rw_type == F_RDLCK) - ecode = tdb_chainlock_read(tdb, key); - else - ecode = tdb_chainlock(tdb, key); - - if (timeout) { - tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK); - } - return ecode; -} - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - TDB_DATA key = tdb_mkdata("hello", 5); - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, - TDB_VERSION1, TDB_NOMMAP|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - struct agent *agent; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 15); - - agent = prepare_external_agent(); - - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - enum TDB_ERROR ecode; - tdb = tdb_open("run-locktimeout.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - if (!ok1(tdb)) - break; - - /* Simple cases: should succeed. 
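   (When the alarm fires during the blocked cases further down,
   invalidate_flock_struct() scribbles on the shared flock_struct so the
   pending F_SETLKW returns; timeout_lock() notices that l_len no longer
   matches and reports EINTR, which the chainlock path turns into
   TDB_ERR_LOCK.)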
*/ - ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20, - F_RDLCK); - ok1(ecode == TDB_SUCCESS); - ok1(tap_log_messages == 0); - - tdb_chainunlock_read(tdb, key); - ok1(tap_log_messages == 0); - - ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20, - F_WRLCK); - ok1(ecode == TDB_SUCCESS); - ok1(tap_log_messages == 0); - - tdb_chainunlock(tdb, key); - ok1(tap_log_messages == 0); - - /* OK, get agent to start transaction, then we should time out. */ - ok1(external_agent_operation(agent, OPEN, "run-locktimeout.tdb") - == SUCCESS); - ok1(external_agent_operation(agent, TRANSACTION_START, "") - == SUCCESS); - ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20, - F_WRLCK); - ok1(ecode == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - - /* Even if we get a different signal, should be fine. */ - CatchSignal(SIGUSR1, do_nothing); - external_agent_operation(agent, SEND_SIGNAL, ""); - ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20, - F_WRLCK); - ok1(ecode == TDB_ERR_LOCK); - ok1(tap_log_messages == 0); - - ok1(external_agent_operation(agent, TRANSACTION_COMMIT, "") - == SUCCESS); - ok1(external_agent_operation(agent, CLOSE, "") - == SUCCESS); - tdb_close(tdb); - } - free_external_agent(agent); - return exit_status(); -} diff --git a/ccan/tdb2/test/api-missing-entries.c b/ccan/tdb2/test/api-missing-entries.c deleted file mode 100644 index 0b21e1ee..00000000 --- a/ccan/tdb2/test/api-missing-entries.c +++ /dev/null @@ -1,43 +0,0 @@ -/* Another test revealed that we lost an entry. This reproduces it. */ -#include -#include -#include -#include -#include -#include -#include "logging.h" - -#define NUM_RECORDS 1189 - -/* We use the same seed which we saw this failure on. */ -static uint64_t failhash(const void *key, size_t len, uint64_t seed, void *p) -{ - seed = 699537674708983027ULL; - return hash64_stable((const unsigned char *)key, len, seed); -} - -int main(int argc, char *argv[]) -{ - int i; - struct tdb_context *tdb; - struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; - struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; - union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH }, - .fn = failhash } }; - - hattr.base.next = &tap_log_attr; - plan_tests(1 + NUM_RECORDS + 2); - - tdb = tdb_open("run-missing-entries.tdb", TDB_INTERNAL, - O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr); - if (ok1(tdb)) { - for (i = 0; i < NUM_RECORDS; i++) { - ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0); - } - ok1(tdb_check(tdb, NULL, NULL) == 0); - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/api-open-multiple-times.c b/ccan/tdb2/test/api-open-multiple-times.c deleted file mode 100644 index 16562069..00000000 --- a/ccan/tdb2/test/api-open-multiple-times.c +++ /dev/null @@ -1,93 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - unsigned int i, extra_messages; - struct tdb_context *tdb, *tdb2; - struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; - struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; - struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */ - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, - TDB_VERSION1, TDB_NOMMAP|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 28); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-open-multiple-times.tdb", flags[i], - 
O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (!tdb) - continue; - - if (flags[i] & TDB_VERSION1) { - extra_messages = 1; - } else { - extra_messages = 0; - } - tdb2 = tdb_open("run-open-multiple-times.tdb", flags[i], - O_RDWR|O_CREAT, 0600, &tap_log_attr); - ok1(tdb_check(tdb, NULL, NULL) == 0); - ok1(tdb_check(tdb2, NULL, NULL) == 0); - - /* Store in one, fetch in the other. */ - ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0); - ok1(tdb_fetch(tdb2, key, &d) == TDB_SUCCESS); - ok1(tdb_deq(d, data)); - free(d.dptr); - - /* Vice versa, with delete. */ - ok1(tdb_delete(tdb2, key) == 0); - ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_NOEXIST); - - /* OK, now close first one, check second still good. */ - ok1(tdb_close(tdb) == 0); - - ok1(tdb_store(tdb2, key, data, TDB_REPLACE) == 0); - ok1(tdb_fetch(tdb2, key, &d) == TDB_SUCCESS); - ok1(tdb_deq(d, data)); - free(d.dptr); - - /* Reopen */ - tdb = tdb_open("run-open-multiple-times.tdb", flags[i], - O_RDWR|O_CREAT, 0600, &tap_log_attr); - ok1(tdb); - - ok1(tdb_transaction_start(tdb2) == 0); - - /* Anything in the other one should fail. */ - ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK); - tap_log_messages -= extra_messages; - ok1(tap_log_messages == 1); - ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK); - tap_log_messages -= extra_messages; - ok1(tap_log_messages == 2); - ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK); - ok1(tap_log_messages == 3); - ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK); - tap_log_messages -= extra_messages; - ok1(tap_log_messages == 4); - - /* Transaciton should work as normal. */ - ok1(tdb_store(tdb2, key, data, TDB_REPLACE) == TDB_SUCCESS); - - /* Now... try closing with locks held. */ - ok1(tdb_close(tdb2) == 0); - - ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); - ok1(tdb_deq(d, data)); - free(d.dptr); - ok1(tdb_close(tdb) == 0); - ok1(tap_log_messages == 4); - tap_log_messages = 0; - } - - return exit_status(); -} diff --git a/ccan/tdb2/test/api-record-expand.c b/ccan/tdb2/test/api-record-expand.c deleted file mode 100644 index 48ad1cdf..00000000 --- a/ccan/tdb2/test/api-record-expand.c +++ /dev/null @@ -1,55 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "logging.h" - -#define MAX_SIZE 10000 -#define SIZE_STEP 131 - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - struct tdb_data key = tdb_mkdata("key", 3); - struct tdb_data data; - - data.dptr = malloc(MAX_SIZE); - memset(data.dptr, 0x24, MAX_SIZE); - - plan_tests(sizeof(flags) / sizeof(flags[0]) - * (3 + (1 + (MAX_SIZE/SIZE_STEP)) * 2) + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-record-expand.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (!tdb) - continue; - - data.dsize = 0; - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - for (data.dsize = 0; - data.dsize < MAX_SIZE; - data.dsize += SIZE_STEP) { - memset(data.dptr, data.dsize, data.dsize); - ok1(tdb_store(tdb, key, data, TDB_MODIFY) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - } - tdb_close(tdb); - } - ok1(tap_log_messages == 0); - free(data.dptr); - - return exit_status(); -} diff --git 
a/ccan/tdb2/test/api-simple-delete.c b/ccan/tdb2/test/api-simple-delete.c deleted file mode 100644 index a5b65d60..00000000 --- a/ccan/tdb2/test/api-simple-delete.c +++ /dev/null @@ -1,43 +0,0 @@ -#include -#include -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - struct tdb_data key = tdb_mkdata("key", 3); - struct tdb_data data = tdb_mkdata("data", 4); - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-simple-delete.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (tdb) { - /* Delete should fail. */ - ok1(tdb_delete(tdb, key) == TDB_ERR_NOEXIST); - ok1(tdb_check(tdb, NULL, NULL) == 0); - /* Insert should succeed. */ - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - /* Delete should now work. */ - ok1(tdb_delete(tdb, key) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - tdb_close(tdb); - } - } - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/api-summary.c b/ccan/tdb2/test/api-summary.c deleted file mode 100644 index e0e292ea..00000000 --- a/ccan/tdb2/test/api-summary.c +++ /dev/null @@ -1,63 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - unsigned int i, j; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - struct tdb_data key = { (unsigned char *)&j, sizeof(j) }; - struct tdb_data data = { (unsigned char *)&j, sizeof(j) }; - char *summary; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 2 * 5) + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-summary.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (!tdb) - continue; - - /* Put some stuff in there. */ - for (j = 0; j < 500; j++) { - /* Make sure padding varies to we get some graphs! 
*/ - data.dsize = j % (sizeof(j) + 1); - if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) - fail("Storing in tdb"); - } - - for (j = 0; - j <= TDB_SUMMARY_HISTOGRAMS; - j += TDB_SUMMARY_HISTOGRAMS) { - ok1(tdb_summary(tdb, j, &summary) == TDB_SUCCESS); - ok1(strstr(summary, "Number of records: 500\n")); - ok1(strstr(summary, "Smallest/average/largest keys: 4/4/4\n")); - ok1(strstr(summary, "Smallest/average/largest data: 0/2/4\n")); - if (!(flags[i] & TDB_VERSION1) - && j == TDB_SUMMARY_HISTOGRAMS) { - ok1(strstr(summary, "|") - && strstr(summary, "*")); - } else { - ok1(!strstr(summary, "|") - && !strstr(summary, "*")); - } - free(summary); - } - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/api-tdb1-flag-removal.c b/ccan/tdb2/test/api-tdb1-flag-removal.c deleted file mode 100644 index 28f24e63..00000000 --- a/ccan/tdb2/test/api-tdb1-flag-removal.c +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "logging.h" - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 3 + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-12-store.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - if (!ok1(tdb)) - continue; - - tdb_close(tdb); - - tdb = tdb_open("run-12-store.tdb", flags[i] | TDB_VERSION1, - O_RDWR, 0600, &tap_log_attr); - if (!ok1(tdb)) - continue; - /* It's not a version1 */ - ok1(!(tdb_get_flags(tdb) & TDB_VERSION1)); - - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/external-agent.c b/ccan/tdb2/test/external-agent.c deleted file mode 100644 index 01c7106f..00000000 --- a/ccan/tdb2/test/external-agent.c +++ /dev/null @@ -1,256 +0,0 @@ -#include "external-agent.h" -#include "logging.h" -#include "lock-tracking.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct tdb_context *tdb; - -void (*external_agent_free)(void *) = free; - -static enum TDB_ERROR clear_if_first(int fd, void *arg) -{ -/* We hold a lock offset 4 always, so we can tell if anyone is holding it. - * (This is compatible with tdb1's TDB_CLEAR_IF_FIRST flag). */ - struct flock fl; - - fl.l_type = F_WRLCK; - fl.l_whence = SEEK_SET; - fl.l_start = 4; - fl.l_len = 1; - - if (fcntl(fd, F_SETLK, &fl) == 0) { - /* We must be first ones to open it! 
*/ - diag("agent truncating file!"); - if (ftruncate(fd, 0) != 0) { - return TDB_ERR_IO; - } - } - fl.l_type = F_RDLCK; - if (fcntl(fd, F_SETLKW, &fl) != 0) { - return TDB_ERR_IO; - } - return TDB_SUCCESS; -} - -static enum agent_return do_operation(enum operation op, const char *name) -{ - TDB_DATA k; - enum agent_return ret; - TDB_DATA data; - enum TDB_ERROR ecode; - union tdb_attribute cif; - - if (op != OPEN && op != OPEN_WITH_HOOK && !tdb) { - diag("external: No tdb open!"); - return OTHER_FAILURE; - } - - diag("external: %s", operation_name(op)); - - k = tdb_mkdata(name, strlen(name)); - - locking_would_block = 0; - switch (op) { - case OPEN: - if (tdb) { - diag("Already have tdb %s open", tdb->name); - return OTHER_FAILURE; - } - tdb = tdb_open(name, TDB_DEFAULT, O_RDWR, 0, &tap_log_attr); - if (!tdb) { - if (!locking_would_block) - diag("Opening tdb gave %s", strerror(errno)); - forget_locking(); - ret = OTHER_FAILURE; - } else - ret = SUCCESS; - break; - case OPEN_WITH_HOOK: - if (tdb) { - diag("Already have tdb %s open", tdb->name); - return OTHER_FAILURE; - } - cif.openhook.base.attr = TDB_ATTRIBUTE_OPENHOOK; - cif.openhook.base.next = &tap_log_attr; - cif.openhook.fn = clear_if_first; - tdb = tdb_open(name, TDB_DEFAULT, O_RDWR, 0, &cif); - if (!tdb) { - if (!locking_would_block) - diag("Opening tdb gave %s", strerror(errno)); - forget_locking(); - ret = OTHER_FAILURE; - } else - ret = SUCCESS; - break; - case FETCH: - ecode = tdb_fetch(tdb, k, &data); - if (ecode == TDB_ERR_NOEXIST) { - ret = FAILED; - } else if (ecode < 0) { - ret = OTHER_FAILURE; - } else if (!tdb_deq(data, k)) { - ret = OTHER_FAILURE; - external_agent_free(data.dptr); - } else { - ret = SUCCESS; - external_agent_free(data.dptr); - } - break; - case STORE: - ret = tdb_store(tdb, k, k, 0) == 0 ? SUCCESS : OTHER_FAILURE; - break; - case TRANSACTION_START: - ret = tdb_transaction_start(tdb) == 0 ? SUCCESS : OTHER_FAILURE; - break; - case TRANSACTION_COMMIT: - ret = tdb_transaction_commit(tdb)==0 ? SUCCESS : OTHER_FAILURE; - break; - case NEEDS_RECOVERY: - if (tdb->flags & TDB_VERSION1) - ret = tdb1_needs_recovery(tdb) ? SUCCESS : FAILED; - else - ret = tdb_needs_recovery(tdb) ? SUCCESS : FAILED; - break; - case CHECK: - ret = tdb_check(tdb, NULL, NULL) == 0 ? SUCCESS : OTHER_FAILURE; - break; - case CLOSE: - ret = tdb_close(tdb) == 0 ? SUCCESS : OTHER_FAILURE; - tdb = NULL; - break; - case SEND_SIGNAL: - /* We do this async */ - ret = SUCCESS; - break; - default: - ret = OTHER_FAILURE; - } - - if (locking_would_block) - ret = WOULD_HAVE_BLOCKED; - - return ret; -} - -struct agent { - int cmdfd, responsefd; -}; - -/* Do this before doing any tdb stuff. Return handle, or NULL. */ -struct agent *prepare_external_agent(void) -{ - int pid, ret; - int command[2], response[2]; - char name[1+PATH_MAX]; - - if (pipe(command) != 0 || pipe(response) != 0) - return NULL; - - pid = fork(); - if (pid < 0) - return NULL; - - if (pid != 0) { - struct agent *agent = malloc(sizeof(*agent)); - - close(command[0]); - close(response[1]); - agent->cmdfd = command[1]; - agent->responsefd = response[0]; - return agent; - } - - close(command[1]); - close(response[0]); - - /* We want to fail, not block. 
*/ - nonblocking_locks = true; - log_prefix = "external: "; - while ((ret = read(command[0], name, sizeof(name))) > 0) { - enum agent_return result; - - result = do_operation(name[0], name+1); - if (write(response[1], &result, sizeof(result)) - != sizeof(result)) - err(1, "Writing response"); - if (name[0] == SEND_SIGNAL) { - struct timeval ten_ms; - ten_ms.tv_sec = 0; - ten_ms.tv_usec = 10000; - select(0, NULL, NULL, NULL, &ten_ms); - kill(getppid(), SIGUSR1); - } - } - exit(0); -} - -/* Ask the external agent to try to do an operation. */ -enum agent_return external_agent_operation(struct agent *agent, - enum operation op, - const char *name) -{ - enum agent_return res; - unsigned int len; - char *string; - - if (!name) - name = ""; - len = 1 + strlen(name) + 1; - string = malloc(len); - - string[0] = op; - strcpy(string+1, name); - - if (write(agent->cmdfd, string, len) != len - || read(agent->responsefd, &res, sizeof(res)) != sizeof(res)) - res = AGENT_DIED; - - free(string); - return res; -} - -const char *agent_return_name(enum agent_return ret) -{ - return ret == SUCCESS ? "SUCCESS" - : ret == WOULD_HAVE_BLOCKED ? "WOULD_HAVE_BLOCKED" - : ret == AGENT_DIED ? "AGENT_DIED" - : ret == FAILED ? "FAILED" - : ret == OTHER_FAILURE ? "OTHER_FAILURE" - : "**INVALID**"; -} - -const char *operation_name(enum operation op) -{ - switch (op) { - case OPEN: return "OPEN"; - case OPEN_WITH_HOOK: return "OPEN_WITH_HOOK"; - case FETCH: return "FETCH"; - case STORE: return "STORE"; - case CHECK: return "CHECK"; - case TRANSACTION_START: return "TRANSACTION_START"; - case TRANSACTION_COMMIT: return "TRANSACTION_COMMIT"; - case NEEDS_RECOVERY: return "NEEDS_RECOVERY"; - case SEND_SIGNAL: return "SEND_SIGNAL"; - case CLOSE: return "CLOSE"; - } - return "**INVALID**"; -} - -void free_external_agent(struct agent *agent) -{ - close(agent->cmdfd); - close(agent->responsefd); - free(agent); -} diff --git a/ccan/tdb2/test/external-agent.h b/ccan/tdb2/test/external-agent.h deleted file mode 100644 index 9d25c582..00000000 --- a/ccan/tdb2/test/external-agent.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef TDB2_TEST_EXTERNAL_AGENT_H -#define TDB2_TEST_EXTERNAL_AGENT_H - -/* For locking tests, we need a different process to try things at - * various times. */ -enum operation { - OPEN, - OPEN_WITH_HOOK, - FETCH, - STORE, - TRANSACTION_START, - TRANSACTION_COMMIT, - NEEDS_RECOVERY, - CHECK, - SEND_SIGNAL, - CLOSE, -}; - -/* Do this before doing any tdb stuff. Return handle, or -1. */ -struct agent *prepare_external_agent(void); - -enum agent_return { - SUCCESS, - WOULD_HAVE_BLOCKED, - AGENT_DIED, - FAILED, /* For fetch, or NEEDS_RECOVERY */ - OTHER_FAILURE, -}; - -/* Ask the external agent to try to do an operation. - * name == tdb name for OPEN/OPEN_WITH_CLEAR_IF_FIRST, - * record name for FETCH/STORE (store stores name as data too) - */ -enum agent_return external_agent_operation(struct agent *handle, - enum operation op, - const char *name); - -/* Hook into free() on tdb_data in external agent. */ -void (*external_agent_free)(void *); - -/* Mapping enum -> string. 
*/ -const char *agent_return_name(enum agent_return ret); -const char *operation_name(enum operation op); - -void free_external_agent(struct agent *agent); -#endif /* TDB2_TEST_EXTERNAL_AGENT_H */ diff --git a/ccan/tdb2/test/failtest_helper.c b/ccan/tdb2/test/failtest_helper.c deleted file mode 100644 index ab79de19..00000000 --- a/ccan/tdb2/test/failtest_helper.c +++ /dev/null @@ -1,96 +0,0 @@ -#include "failtest_helper.h" -#include "logging.h" -#include -#include - -bool failtest_suppress = false; - -/* FIXME: From ccan/str */ -static inline bool strends(const char *str, const char *postfix) -{ - if (strlen(str) < strlen(postfix)) - return false; - - return !strcmp(str + strlen(str) - strlen(postfix), postfix); -} - -bool failmatch(const struct failtest_call *call, - const char *file, int line, enum failtest_call_type type) -{ - return call->type == type - && call->line == line - && ((strcmp(call->file, file) == 0) - || (strends(call->file, file) - && (call->file[strlen(call->file) - strlen(file) - 1] - == '/'))); -} - -static bool is_nonblocking_lock(const struct failtest_call *call) -{ - return call->type == FAILTEST_FCNTL && call->u.fcntl.cmd == F_SETLK; -} - -static bool is_unlock(const struct failtest_call *call) -{ - return call->type == FAILTEST_FCNTL - && call->u.fcntl.arg.fl.l_type == F_UNLCK; -} - -bool exit_check_log(struct tlist_calls *history) -{ - const struct failtest_call *i; - - tlist_for_each(history, i, list) { - if (!i->fail) - continue; - /* Failing the /dev/urandom open doesn't count: we fall back. */ - if (failmatch(i, URANDOM_OPEN)) - continue; - - /* Similarly with read fail. */ - if (failmatch(i, URANDOM_READ)) - continue; - - /* Initial allocation of tdb doesn't log. */ - if (failmatch(i, INITIAL_TDB_MALLOC)) - continue; - - /* We don't block "failures" on non-blocking locks. */ - if (is_nonblocking_lock(i)) - continue; - - if (!tap_log_messages) - diag("We didn't log for %s:%u", i->file, i->line); - return tap_log_messages != 0; - } - return true; -} - -/* Some places we soldier on despite errors: only fail them once. */ -enum failtest_result -block_repeat_failures(struct tlist_calls *history) -{ - const struct failtest_call *last; - - last = tlist_tail(history, list); - - if (failtest_suppress) - return FAIL_DONT_FAIL; - - if (failmatch(last, INITIAL_TDB_MALLOC) - || failmatch(last, URANDOM_OPEN) - || failmatch(last, URANDOM_READ)) { - return FAIL_PROBE; - } - - /* We handle mmap failing, by falling back to read/write, so - * don't try all possible paths. */ - if (last->type == FAILTEST_MMAP) - return FAIL_PROBE; - - /* Unlock or non-blocking lock is fail-once. */ - if (is_unlock(last) || is_nonblocking_lock(last)) - return FAIL_PROBE; - - return FAIL_OK; -} diff --git a/ccan/tdb2/test/failtest_helper.h b/ccan/tdb2/test/failtest_helper.h deleted file mode 100644 index 4130aff1..00000000 --- a/ccan/tdb2/test/failtest_helper.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef TDB2_TEST_FAILTEST_HELPER_H -#define TDB2_TEST_FAILTEST_HELPER_H -#include -#include - -/* FIXME: Check these! */ -#define INITIAL_TDB_MALLOC "open.c", 445, FAILTEST_MALLOC -#define URANDOM_OPEN "open.c", 62, FAILTEST_OPEN -#define URANDOM_READ "open.c", 42, FAILTEST_READ - -bool exit_check_log(struct tlist_calls *history); -bool failmatch(const struct failtest_call *call, - const char *file, int line, enum failtest_call_type type); -enum failtest_result block_repeat_failures(struct tlist_calls *history); - -/* Set this to suppress failure. 
*/ -extern bool failtest_suppress; - -#endif /* TDB2_TEST_LOGGING_H */ diff --git a/ccan/tdb2/test/jenkins-be-hash.tdb1 b/ccan/tdb2/test/jenkins-be-hash.tdb1 deleted file mode 100644 index b6528404..00000000 Binary files a/ccan/tdb2/test/jenkins-be-hash.tdb1 and /dev/null differ diff --git a/ccan/tdb2/test/jenkins-le-hash.tdb1 b/ccan/tdb2/test/jenkins-le-hash.tdb1 deleted file mode 100644 index 007e0a33..00000000 Binary files a/ccan/tdb2/test/jenkins-le-hash.tdb1 and /dev/null differ diff --git a/ccan/tdb2/test/layout.c b/ccan/tdb2/test/layout.c deleted file mode 100644 index ae37f565..00000000 --- a/ccan/tdb2/test/layout.c +++ /dev/null @@ -1,402 +0,0 @@ -/* TDB tools to create various canned database layouts. */ -#include "layout.h" -#include -#include -#include -#include -#include "logging.h" - -struct tdb_layout *new_tdb_layout(void) -{ - struct tdb_layout *layout = malloc(sizeof(*layout)); - layout->num_elems = 0; - layout->elem = NULL; - return layout; -} - -static void add(struct tdb_layout *layout, union tdb_layout_elem elem) -{ - layout->elem = realloc(layout->elem, - sizeof(layout->elem[0]) - * (layout->num_elems+1)); - layout->elem[layout->num_elems++] = elem; -} - -void tdb_layout_add_freetable(struct tdb_layout *layout) -{ - union tdb_layout_elem elem; - elem.base.type = FREETABLE; - add(layout, elem); -} - -void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len, - unsigned ftable) -{ - union tdb_layout_elem elem; - elem.base.type = FREE; - elem.free.len = len; - elem.free.ftable_num = ftable; - add(layout, elem); -} - -void tdb_layout_add_capability(struct tdb_layout *layout, - uint64_t type, - bool write_breaks, - bool check_breaks, - bool open_breaks, - tdb_len_t extra) -{ - union tdb_layout_elem elem; - elem.base.type = CAPABILITY; - elem.capability.type = type; - if (write_breaks) - elem.capability.type |= TDB_CAP_NOWRITE; - if (open_breaks) - elem.capability.type |= TDB_CAP_NOOPEN; - if (check_breaks) - elem.capability.type |= TDB_CAP_NOCHECK; - elem.capability.extra = extra; - add(layout, elem); -} - -static struct tdb_data dup_key(struct tdb_data key) -{ - struct tdb_data ret; - ret.dsize = key.dsize; - ret.dptr = malloc(ret.dsize); - memcpy(ret.dptr, key.dptr, ret.dsize); - return ret; -} - -void tdb_layout_add_used(struct tdb_layout *layout, - TDB_DATA key, TDB_DATA data, - tdb_len_t extra) -{ - union tdb_layout_elem elem; - elem.base.type = DATA; - elem.used.key = dup_key(key); - elem.used.data = dup_key(data); - elem.used.extra = extra; - add(layout, elem); -} - -static tdb_len_t free_record_len(tdb_len_t len) -{ - return sizeof(struct tdb_used_record) + len; -} - -static tdb_len_t data_record_len(struct tle_used *used) -{ - tdb_len_t len; - len = sizeof(struct tdb_used_record) - + used->key.dsize + used->data.dsize + used->extra; - assert(len >= sizeof(struct tdb_free_record)); - return len; -} - -static tdb_len_t hashtable_len(struct tle_hashtable *htable) -{ - return sizeof(struct tdb_used_record) - + (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) - + htable->extra; -} - -static tdb_len_t capability_len(struct tle_capability *cap) -{ - return sizeof(struct tdb_capability) + cap->extra; -} - -static tdb_len_t freetable_len(struct tle_freetable *ftable) -{ - return sizeof(struct tdb_freetable); -} - -static void set_free_record(void *mem, tdb_len_t len) -{ - /* We do all the work in add_to_freetable */ -} - -static void add_zero_pad(struct tdb_used_record *u, size_t len, size_t extra) -{ - if (extra) - ((char *)(u + 1))[len] = '\0'; -} - -static void 
set_data_record(void *mem, struct tdb_context *tdb, - struct tle_used *used) -{ - struct tdb_used_record *u = mem; - - set_header(tdb, u, TDB_USED_MAGIC, used->key.dsize, used->data.dsize, - used->key.dsize + used->data.dsize + used->extra, - tdb_hash(tdb, used->key.dptr, used->key.dsize)); - memcpy(u + 1, used->key.dptr, used->key.dsize); - memcpy((char *)(u + 1) + used->key.dsize, - used->data.dptr, used->data.dsize); - add_zero_pad(u, used->key.dsize + used->data.dsize, used->extra); -} - -static void set_hashtable(void *mem, struct tdb_context *tdb, - struct tle_hashtable *htable) -{ - struct tdb_used_record *u = mem; - tdb_len_t len = sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS; - - set_header(tdb, u, TDB_HTABLE_MAGIC, 0, len, len + htable->extra, 0); - memset(u + 1, 0, len); - add_zero_pad(u, len, htable->extra); -} - -static void set_capability(void *mem, struct tdb_context *tdb, - struct tle_capability *cap, struct tdb_header *hdr, - tdb_off_t last_cap) -{ - struct tdb_capability *c = mem; - tdb_len_t len = sizeof(*c) - sizeof(struct tdb_used_record) + cap->extra; - - c->type = cap->type; - c->next = 0; - set_header(tdb, &c->hdr, TDB_CAP_MAGIC, 0, len, len, 0); - - /* Append to capability list. */ - if (!last_cap) { - hdr->capabilities = cap->base.off; - } else { - c = (struct tdb_capability *)((char *)hdr + last_cap); - c->next = cap->base.off; - } -} - -static void set_freetable(void *mem, struct tdb_context *tdb, - struct tle_freetable *freetable, struct tdb_header *hdr, - tdb_off_t last_ftable) -{ - struct tdb_freetable *ftable = mem; - memset(ftable, 0, sizeof(*ftable)); - set_header(tdb, &ftable->hdr, TDB_FTABLE_MAGIC, 0, - sizeof(*ftable) - sizeof(ftable->hdr), - sizeof(*ftable) - sizeof(ftable->hdr), 0); - - if (last_ftable) { - ftable = (struct tdb_freetable *)((char *)hdr + last_ftable); - ftable->next = freetable->base.off; - } else { - hdr->free_table = freetable->base.off; - } -} - -static void add_to_freetable(struct tdb_context *tdb, - tdb_off_t eoff, - tdb_off_t elen, - unsigned ftable, - struct tle_freetable *freetable) -{ - tdb->tdb2.ftable_off = freetable->base.off; - tdb->tdb2.ftable = ftable; - add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen, - TDB_LOCK_WAIT, false); -} - -static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned ingroup) -{ - return group_start - + (ingroup % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t); -} - -/* Get bits from a value. */ -static uint32_t bits(uint64_t val, unsigned start, unsigned num) -{ - assert(num <= 32); - return (val >> start) & ((1U << num) - 1); -} - -/* We take bits from the top: that way we can lock whole sections of the hash - * by using lock ranges. */ -static uint32_t use_bits(uint64_t h, unsigned num, unsigned *used) -{ - *used += num; - return bits(h, 64 - *used, num); -} - -static tdb_off_t encode_offset(tdb_off_t new_off, unsigned bucket, - uint64_t h) -{ - return bucket - | new_off - | ((uint64_t)bits(h, 64 - TDB_OFF_UPPER_STEAL_EXTRA, - TDB_OFF_UPPER_STEAL_EXTRA) - << TDB_OFF_HASH_EXTRA_BIT); -} - -/* FIXME: Our hash table handling here is primitive: we don't expand! 
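   (add_to_hashtable() below consumes hash bits from the top via use_bits():
   the first TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS bits select the
   group, the next TDB_HASH_GROUP_BITS bits select the preferred bucket, and
   collisions are resolved by probing only within that one group, hence no
   expansion.)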
*/ -static void add_to_hashtable(struct tdb_context *tdb, - tdb_off_t eoff, - struct tdb_data key) -{ - uint64_t h = tdb_hash(tdb, key.dptr, key.dsize); - tdb_off_t b_off, group_start; - unsigned i, group, in_group; - unsigned used = 0; - - group = use_bits(h, TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS, &used); - in_group = use_bits(h, TDB_HASH_GROUP_BITS, &used); - - group_start = offsetof(struct tdb_header, hashtable) - + group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS); - - for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) { - unsigned bucket = (in_group + i) % (1 << TDB_HASH_GROUP_BITS); - - b_off = hbucket_off(group_start, bucket); - if (tdb_read_off(tdb, b_off) == 0) { - tdb_write_off(tdb, b_off, - encode_offset(eoff, in_group, h)); - return; - } - } - abort(); -} - -static struct tle_freetable *find_ftable(struct tdb_layout *layout, unsigned num) -{ - unsigned i; - - for (i = 0; i < layout->num_elems; i++) { - if (layout->elem[i].base.type != FREETABLE) - continue; - if (num == 0) - return &layout->elem[i].ftable; - num--; - } - abort(); -} - -/* FIXME: Support TDB_CONVERT */ -struct tdb_context *tdb_layout_get(struct tdb_layout *layout, - void (*freefn)(void *), - union tdb_attribute *attr) -{ - unsigned int i; - tdb_off_t off, len, last_ftable, last_cap; - char *mem; - struct tdb_context *tdb; - - off = sizeof(struct tdb_header); - - /* First pass of layout: calc lengths */ - for (i = 0; i < layout->num_elems; i++) { - union tdb_layout_elem *e = &layout->elem[i]; - e->base.off = off; - switch (e->base.type) { - case FREETABLE: - len = freetable_len(&e->ftable); - break; - case FREE: - len = free_record_len(e->free.len); - break; - case DATA: - len = data_record_len(&e->used); - break; - case HASHTABLE: - len = hashtable_len(&e->hashtable); - break; - case CAPABILITY: - len = capability_len(&e->capability); - break; - default: - abort(); - } - off += len; - } - - mem = malloc(off); - /* Fill with some weird pattern. */ - memset(mem, 0x99, off); - /* Now populate our header, cribbing from a real TDB header. */ - tdb = tdb_open(NULL, TDB_INTERNAL, O_RDWR, 0, attr); - memcpy(mem, tdb->file->map_ptr, sizeof(struct tdb_header)); - - /* Mug the tdb we have to make it use this. */ - freefn(tdb->file->map_ptr); - tdb->file->map_ptr = mem; - tdb->file->map_size = off; - - last_ftable = 0; - last_cap = 0; - for (i = 0; i < layout->num_elems; i++) { - union tdb_layout_elem *e = &layout->elem[i]; - switch (e->base.type) { - case FREETABLE: - set_freetable(mem + e->base.off, tdb, &e->ftable, - (struct tdb_header *)mem, last_ftable); - last_ftable = e->base.off; - break; - case FREE: - set_free_record(mem + e->base.off, e->free.len); - break; - case DATA: - set_data_record(mem + e->base.off, tdb, &e->used); - break; - case HASHTABLE: - set_hashtable(mem + e->base.off, tdb, &e->hashtable); - break; - case CAPABILITY: - set_capability(mem + e->base.off, tdb, &e->capability, - (struct tdb_header *)mem, last_cap); - last_cap = e->base.off; - break; - } - } - /* Must have a free table! */ - assert(last_ftable); - - /* Now fill the free and hash tables. 
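   (This final pass runs after the record headers were written above: free
   entries are threaded onto the chosen free table via add_to_freetable(), and
   each data record is pointed to from its hash bucket via add_to_hashtable().)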
*/ - for (i = 0; i < layout->num_elems; i++) { - union tdb_layout_elem *e = &layout->elem[i]; - switch (e->base.type) { - case FREE: - add_to_freetable(tdb, e->base.off, e->free.len, - e->free.ftable_num, - find_ftable(layout, e->free.ftable_num)); - break; - case DATA: - add_to_hashtable(tdb, e->base.off, e->used.key); - break; - default: - break; - } - } - - tdb->tdb2.ftable_off = find_ftable(layout, 0)->base.off; - return tdb; -} - -void tdb_layout_write(struct tdb_layout *layout, void (*freefn)(void *), - union tdb_attribute *attr, const char *filename) -{ - struct tdb_context *tdb = tdb_layout_get(layout, freefn, attr); - int fd; - - fd = open(filename, O_WRONLY|O_TRUNC|O_CREAT, 0600); - if (fd < 0) - err(1, "opening %s for writing", filename); - if (write(fd, tdb->file->map_ptr, tdb->file->map_size) - != tdb->file->map_size) - err(1, "writing %s", filename); - close(fd); - tdb_close(tdb); -} - -void tdb_layout_free(struct tdb_layout *layout) -{ - unsigned int i; - - for (i = 0; i < layout->num_elems; i++) { - if (layout->elem[i].base.type == DATA) { - free(layout->elem[i].used.key.dptr); - free(layout->elem[i].used.data.dptr); - } - } - free(layout->elem); - free(layout); -} diff --git a/ccan/tdb2/test/layout.h b/ccan/tdb2/test/layout.h deleted file mode 100644 index 9a714846..00000000 --- a/ccan/tdb2/test/layout.h +++ /dev/null @@ -1,87 +0,0 @@ -#ifndef TDB2_TEST_LAYOUT_H -#define TDB2_TEST_LAYOUT_H -#include - -struct tdb_layout *new_tdb_layout(void); -void tdb_layout_add_freetable(struct tdb_layout *layout); -void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len, - unsigned ftable); -void tdb_layout_add_used(struct tdb_layout *layout, - TDB_DATA key, TDB_DATA data, - tdb_len_t extra); -void tdb_layout_add_capability(struct tdb_layout *layout, - uint64_t type, - bool write_breaks, - bool check_breaks, - bool open_breaks, - tdb_len_t extra); - -#if 0 /* FIXME: Allow allocation of subtables */ -void tdb_layout_add_hashtable(struct tdb_layout *layout, - int htable_parent, /* -1 == toplevel */ - unsigned int bucket, - tdb_len_t extra); -#endif -/* freefn is needed if we're using failtest_free. */ -struct tdb_context *tdb_layout_get(struct tdb_layout *layout, - void (*freefn)(void *), - union tdb_attribute *attr); -void tdb_layout_write(struct tdb_layout *layout, void (*freefn)(void *), - union tdb_attribute *attr, const char *filename); - -void tdb_layout_free(struct tdb_layout *layout); - -enum layout_type { - FREETABLE, FREE, DATA, HASHTABLE, CAPABILITY -}; - -/* Shared by all union members. 
*/ -struct tle_base { - enum layout_type type; - tdb_off_t off; -}; - -struct tle_freetable { - struct tle_base base; -}; - -struct tle_free { - struct tle_base base; - tdb_len_t len; - unsigned ftable_num; -}; - -struct tle_used { - struct tle_base base; - TDB_DATA key; - TDB_DATA data; - tdb_len_t extra; -}; - -struct tle_hashtable { - struct tle_base base; - int parent; - unsigned int bucket; - tdb_len_t extra; -}; - -struct tle_capability { - struct tle_base base; - uint64_t type; - tdb_len_t extra; -}; - -union tdb_layout_elem { - struct tle_base base; - struct tle_freetable ftable; - struct tle_free free; - struct tle_used used; - struct tle_hashtable hashtable; - struct tle_capability capability; -}; - -struct tdb_layout { - unsigned int num_elems; - union tdb_layout_elem *elem; -}; -#endif /* TDB2_TEST_LAYOUT_H */ diff --git a/ccan/tdb2/test/lock-tracking.c b/ccan/tdb2/test/lock-tracking.c deleted file mode 100644 index e253db9f..00000000 --- a/ccan/tdb2/test/lock-tracking.c +++ /dev/null @@ -1,158 +0,0 @@ -/* We save the locks so we can reaquire them. */ -#include -#include -#include -#include -#include -#include -#include "lock-tracking.h" - -struct lock { - struct lock *next; - unsigned int off; - unsigned int len; - int type; -}; -static struct lock *locks; -int locking_errors = 0; -bool suppress_lockcheck = false; -bool nonblocking_locks; -int locking_would_block = 0; -void (*unlock_callback)(int fd); - -int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ ) -{ - va_list ap; - int ret, arg3; - struct flock *fl; - bool may_block = false; - - if (cmd != F_SETLK && cmd != F_SETLKW) { - /* This may be totally bogus, but we don't know in general. */ - va_start(ap, cmd); - arg3 = va_arg(ap, int); - va_end(ap); - - return fcntl(fd, cmd, arg3); - } - - va_start(ap, cmd); - fl = va_arg(ap, struct flock *); - va_end(ap); - - if (cmd == F_SETLKW && nonblocking_locks) { - cmd = F_SETLK; - may_block = true; - } - ret = fcntl(fd, cmd, fl); - - /* Detect when we failed, but might have been OK if we waited. */ - if (may_block && ret == -1 && (errno == EAGAIN || errno == EACCES)) { - locking_would_block++; - } - - if (fl->l_type == F_UNLCK) { - struct lock **l; - struct lock *old = NULL; - - for (l = &locks; *l; l = &(*l)->next) { - if ((*l)->off == fl->l_start - && (*l)->len == fl->l_len) { - if (ret == 0) { - old = *l; - *l = (*l)->next; - free(old); - } - break; - } - } - if (!old && !suppress_lockcheck) { - diag("Unknown unlock %u@%u - %i", - (int)fl->l_len, (int)fl->l_start, ret); - locking_errors++; - } - } else { - struct lock *new, *i; - unsigned int fl_end = fl->l_start + fl->l_len; - if (fl->l_len == 0) - fl_end = (unsigned int)-1; - - /* Check for overlaps: we shouldn't do this. */ - for (i = locks; i; i = i->next) { - unsigned int i_end = i->off + i->len; - if (i->len == 0) - i_end = (unsigned int)-1; - - if (fl->l_start >= i->off && fl->l_start < i_end) - break; - if (fl_end > i->off && fl_end < i_end) - break; - - /* tdb_allrecord_lock does this, handle adjacent: */ - if (fl->l_start > TDB_HASH_LOCK_START - && fl->l_start == i_end && fl->l_type == i->type) { - if (ret == 0) { - i->len = fl->l_len - ? i->len + fl->l_len - : 0; - } - goto done; - } - } - if (i) { - /* Special case: upgrade of allrecord lock. */ - if (i->type == F_RDLCK && fl->l_type == F_WRLCK - && i->off == TDB_HASH_LOCK_START - && fl->l_start == TDB_HASH_LOCK_START - && i->len == 0 - && fl->l_len == 0) { - if (ret == 0) - i->type = F_WRLCK; - goto done; - } - /* allrecord upgrade for tdb1. 
*/ - if (i->type == F_RDLCK && fl->l_type == F_WRLCK - && i->off == TDB1_FREELIST_TOP - && fl->l_start == TDB1_FREELIST_TOP - && i->len == 0 - && fl->l_len == 0) { - if (ret == 0) - i->type = F_WRLCK; - goto done; - } - - if (!suppress_lockcheck) { - diag("%s lock %u@%u overlaps %u@%u", - fl->l_type == F_WRLCK ? "write" : "read", - (int)fl->l_len, (int)fl->l_start, - i->len, (int)i->off); - locking_errors++; - } - } - - if (ret == 0) { - new = malloc(sizeof *new); - new->off = fl->l_start; - new->len = fl->l_len; - new->type = fl->l_type; - new->next = locks; - locks = new; - } - } -done: - if (ret == 0 && fl->l_type == F_UNLCK && unlock_callback) - unlock_callback(fd); - return ret; -} - -unsigned int forget_locking(void) -{ - unsigned int num = 0; - while (locks) { - struct lock *next = locks->next; - free(locks); - locks = next; - num++; - } - return num; -} diff --git a/ccan/tdb2/test/lock-tracking.h b/ccan/tdb2/test/lock-tracking.h deleted file mode 100644 index f2c9c446..00000000 --- a/ccan/tdb2/test/lock-tracking.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef LOCK_TRACKING_H -#define LOCK_TRACKING_H -#include - -/* Set this if you want a callback after fnctl unlock. */ -extern void (*unlock_callback)(int fd); - -/* Replacement fcntl. */ -int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ ); - -/* Discard locking info: returns number of locks outstanding. */ -unsigned int forget_locking(void); - -/* Number of errors in locking. */ -extern int locking_errors; - -/* Suppress lock checking. */ -extern bool suppress_lockcheck; - -/* Make all locks non-blocking. */ -extern bool nonblocking_locks; - -/* Number of times we failed a lock because we made it non-blocking. */ -extern int locking_would_block; -#endif /* LOCK_TRACKING_H */ diff --git a/ccan/tdb2/test/logging.c b/ccan/tdb2/test/logging.c deleted file mode 100644 index 0712cc00..00000000 --- a/ccan/tdb2/test/logging.c +++ /dev/null @@ -1,31 +0,0 @@ -#include -#include -#include -#include "logging.h" - -unsigned tap_log_messages; -const char *log_prefix = ""; -char *log_last = NULL; -bool suppress_logging; - -union tdb_attribute tap_log_attr = { - .log = { .base = { .attr = TDB_ATTRIBUTE_LOG }, - .fn = tap_log_fn } -}; - -void tap_log_fn(struct tdb_context *tdb, - enum tdb_log_level level, - enum TDB_ERROR ecode, - const char *message, void *priv) -{ - if (suppress_logging) - return; - - diag("tdb log level %u: %s: %s%s", - level, tdb_errorstr(ecode), log_prefix, message); - if (log_last) - free(log_last); - log_last = strdup(message); - tap_log_messages++; -} - diff --git a/ccan/tdb2/test/logging.h b/ccan/tdb2/test/logging.h deleted file mode 100644 index 2dfea145..00000000 --- a/ccan/tdb2/test/logging.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef TDB2_TEST_LOGGING_H -#define TDB2_TEST_LOGGING_H -#include -#include -#include - -extern bool suppress_logging; -extern const char *log_prefix; -extern unsigned tap_log_messages; -extern union tdb_attribute tap_log_attr; -extern char *log_last; - -void tap_log_fn(struct tdb_context *tdb, - enum tdb_log_level level, - enum TDB_ERROR ecode, - const char *message, void *priv); -#endif /* TDB2_TEST_LOGGING_H */ diff --git a/ccan/tdb2/test/old-nohash-be.tdb1 b/ccan/tdb2/test/old-nohash-be.tdb1 deleted file mode 100644 index 1c49116c..00000000 Binary files a/ccan/tdb2/test/old-nohash-be.tdb1 and /dev/null differ diff --git a/ccan/tdb2/test/old-nohash-le.tdb1 b/ccan/tdb2/test/old-nohash-le.tdb1 deleted file mode 100644 index 0655072d..00000000 Binary files a/ccan/tdb2/test/old-nohash-le.tdb1 and 
/dev/null differ diff --git a/ccan/tdb2/test/run-001-encode.c b/ccan/tdb2/test/run-001-encode.c deleted file mode 100644 index 67616fcd..00000000 --- a/ccan/tdb2/test/run-001-encode.c +++ /dev/null @@ -1,41 +0,0 @@ -#include "tdb2-source.h" -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_used_record rec; - struct tdb_context tdb = { .log_fn = tap_log_fn }; - - plan_tests(64 + 32 + 48*6 + 1); - - /* We should be able to encode any data value. */ - for (i = 0; i < 64; i++) - ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, 0, 1ULL << i, - 1ULL << i, 0) == 0); - - /* And any key and data with < 64 bits between them. */ - for (i = 0; i < 32; i++) { - tdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i; - ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen, - klen + dlen, 0) == 0); - } - - /* We should neatly encode all values. */ - for (i = 0; i < 48; i++) { - uint64_t h = 1ULL << (i < 5 ? i : 4); - uint64_t klen = 1ULL << (i < 16 ? i : 15); - uint64_t dlen = 1ULL << i; - uint64_t xlen = 1ULL << (i < 32 ? i : 31); - ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen, - klen+dlen+xlen, h) == 0); - ok1(rec_key_length(&rec) == klen); - ok1(rec_data_length(&rec) == dlen); - ok1(rec_extra_padding(&rec) == xlen); - ok1((uint64_t)rec_hash(&rec) == h); - ok1(rec_magic(&rec) == TDB_USED_MAGIC); - } - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/run-001-fls.c b/ccan/tdb2/test/run-001-fls.c deleted file mode 100644 index 4449f69d..00000000 --- a/ccan/tdb2/test/run-001-fls.c +++ /dev/null @@ -1,33 +0,0 @@ -#include "tdb2-source.h" -#include - -static unsigned int dumb_fls(uint64_t num) -{ - int i; - - for (i = 63; i >= 0; i--) { - if (num & (1ULL << i)) - break; - } - return i + 1; -} - -int main(int argc, char *argv[]) -{ - unsigned int i, j; - - plan_tests(64 * 64 + 2); - - ok1(fls64(0) == 0); - ok1(dumb_fls(0) == 0); - - for (i = 0; i < 64; i++) { - for (j = 0; j < 64; j++) { - uint64_t val = (1ULL << i) | (1ULL << j); - ok(fls64(val) == dumb_fls(val), - "%llu -> %u should be %u", (long long)val, - fls64(val), dumb_fls(val)); - } - } - return exit_status(); -} diff --git a/ccan/tdb2/test/run-01-new_database.c b/ccan/tdb2/test/run-01-new_database.c deleted file mode 100644 index a5f0dd37..00000000 --- a/ccan/tdb2/test/run-01-new_database.c +++ /dev/null @@ -1,39 +0,0 @@ -#include -#include "tdb2-source.h" -#include -#include -#include "logging.h" -#include "failtest_helper.h" - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - - failtest_init(argc, argv); - failtest_hook = block_repeat_failures; - failtest_exit_check = exit_check_log; - plan_tests(sizeof(flags) / sizeof(flags[0]) * 3); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-new_database.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - if (!ok1(tdb)) - failtest_exit(exit_status()); - - failtest_suppress = true; - ok1(tdb_check(tdb, NULL, NULL) == 0); - failtest_suppress = false; - tdb_close(tdb); - if (!ok1(tap_log_messages == 0)) - break; - } - failtest_exit(exit_status()); -} diff --git a/ccan/tdb2/test/run-02-expand.c b/ccan/tdb2/test/run-02-expand.c deleted file mode 
100644 index e3f5905a..00000000 --- a/ccan/tdb2/test/run-02-expand.c +++ /dev/null @@ -1,62 +0,0 @@ -#include -#include "tdb2-source.h" -#include -#include -#include "logging.h" -#include "failtest_helper.h" - -int main(int argc, char *argv[]) -{ - unsigned int i; - uint64_t val; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 11 + 1); - - failtest_init(argc, argv); - failtest_hook = block_repeat_failures; - failtest_exit_check = exit_check_log; - - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - failtest_suppress = true; - tdb = tdb_open("run-expand.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - if (!ok1(tdb)) - break; - - val = tdb->file->map_size; - /* Need some hash lock for expand. */ - ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0); - failtest_suppress = false; - if (!ok1(tdb_expand(tdb, 1) == 0)) { - failtest_suppress = true; - tdb_close(tdb); - break; - } - failtest_suppress = true; - - ok1(tdb->file->map_size >= val + 1 * TDB_EXTENSION_FACTOR); - ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - val = tdb->file->map_size; - ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0); - failtest_suppress = false; - if (!ok1(tdb_expand(tdb, 1024) == 0)) { - failtest_suppress = true; - tdb_close(tdb); - break; - } - failtest_suppress = true; - ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0); - ok1(tdb->file->map_size >= val + 1024 * TDB_EXTENSION_FACTOR); - ok1(tdb_check(tdb, NULL, NULL) == 0); - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - failtest_exit(exit_status()); -} diff --git a/ccan/tdb2/test/run-03-coalesce.c b/ccan/tdb2/test/run-03-coalesce.c deleted file mode 100644 index 99f94fe1..00000000 --- a/ccan/tdb2/test/run-03-coalesce.c +++ /dev/null @@ -1,178 +0,0 @@ -#include "tdb2-source.h" -#include -#include "logging.h" -#include "layout.h" - -static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off) -{ - struct tdb_free_record f; - enum TDB_ERROR ecode; - - ecode = tdb_read_convert(tdb, off, &f, sizeof(f)); - if (ecode != TDB_SUCCESS) - return ecode; - if (frec_magic(&f) != TDB_FREE_MAGIC) - return TDB_ERR_CORRUPT; - return frec_len(&f); -} - -int main(int argc, char *argv[]) -{ - tdb_off_t b_off, test; - struct tdb_context *tdb; - struct tdb_layout *layout; - struct tdb_data data, key; - tdb_len_t len; - - /* FIXME: Test TDB_CONVERT */ - /* FIXME: Test lock order fail. */ - - plan_tests(42); - data = tdb_mkdata("world", 5); - key = tdb_mkdata("hello", 5); - - /* No coalescing can be done due to EOF */ - layout = new_tdb_layout(); - tdb_layout_add_freetable(layout); - len = 1024; - tdb_layout_add_free(layout, len, 0); - tdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.tdb"); - /* NOMMAP is for lockcheck. */ - tdb = tdb_open("run-03-coalesce.tdb", TDB_NOMMAP, O_RDWR, 0, - &tap_log_attr); - ok1(tdb_check(tdb, NULL, NULL) == 0); - ok1(free_record_length(tdb, layout->elem[1].base.off) == len); - - /* Figure out which bucket free entry is. */ - b_off = bucket_off(tdb->tdb2.ftable_off, size_to_bucket(len)); - /* Lock and fail to coalesce. 
*/ - ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); - test = layout->elem[1].base.off; - ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len, &test) - == 0); - tdb_unlock_free_bucket(tdb, b_off); - ok1(free_record_length(tdb, layout->elem[1].base.off) == len); - ok1(test == layout->elem[1].base.off); - ok1(tdb_check(tdb, NULL, NULL) == 0); - tdb_close(tdb); - tdb_layout_free(layout); - - /* No coalescing can be done due to used record */ - layout = new_tdb_layout(); - tdb_layout_add_freetable(layout); - tdb_layout_add_free(layout, 1024, 0); - tdb_layout_add_used(layout, key, data, 6); - tdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.tdb"); - /* NOMMAP is for lockcheck. */ - tdb = tdb_open("run-03-coalesce.tdb", TDB_NOMMAP, O_RDWR, 0, - &tap_log_attr); - ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Figure out which bucket free entry is. */ - b_off = bucket_off(tdb->tdb2.ftable_off, size_to_bucket(1024)); - /* Lock and fail to coalesce. */ - ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); - test = layout->elem[1].base.off; - ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test) - == 0); - tdb_unlock_free_bucket(tdb, b_off); - ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024); - ok1(test == layout->elem[1].base.off); - ok1(tdb_check(tdb, NULL, NULL) == 0); - tdb_close(tdb); - tdb_layout_free(layout); - - /* Coalescing can be done due to two free records, then EOF */ - layout = new_tdb_layout(); - tdb_layout_add_freetable(layout); - tdb_layout_add_free(layout, 1024, 0); - tdb_layout_add_free(layout, 2048, 0); - tdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.tdb"); - /* NOMMAP is for lockcheck. */ - tdb = tdb_open("run-03-coalesce.tdb", TDB_NOMMAP, O_RDWR, 0, - &tap_log_attr); - ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024); - ok1(free_record_length(tdb, layout->elem[2].base.off) == 2048); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Figure out which bucket (first) free entry is. */ - b_off = bucket_off(tdb->tdb2.ftable_off, size_to_bucket(1024)); - /* Lock and coalesce. */ - ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); - test = layout->elem[2].base.off; - ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test) - == 1024 + sizeof(struct tdb_used_record) + 2048); - /* Should tell us it's erased this one... */ - ok1(test == TDB_ERR_NOEXIST); - ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0); - ok1(free_record_length(tdb, layout->elem[1].base.off) - == 1024 + sizeof(struct tdb_used_record) + 2048); - ok1(tdb_check(tdb, NULL, NULL) == 0); - tdb_close(tdb); - tdb_layout_free(layout); - - /* Coalescing can be done due to two free records, then data */ - layout = new_tdb_layout(); - tdb_layout_add_freetable(layout); - tdb_layout_add_free(layout, 1024, 0); - tdb_layout_add_free(layout, 512, 0); - tdb_layout_add_used(layout, key, data, 6); - tdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.tdb"); - /* NOMMAP is for lockcheck. */ - tdb = tdb_open("run-03-coalesce.tdb", TDB_NOMMAP, O_RDWR, 0, - &tap_log_attr); - ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024); - ok1(free_record_length(tdb, layout->elem[2].base.off) == 512); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Figure out which bucket free entry is. */ - b_off = bucket_off(tdb->tdb2.ftable_off, size_to_bucket(1024)); - /* Lock and coalesce. 
*/ - ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); - test = layout->elem[2].base.off; - ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test) - == 1024 + sizeof(struct tdb_used_record) + 512); - ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0); - ok1(free_record_length(tdb, layout->elem[1].base.off) - == 1024 + sizeof(struct tdb_used_record) + 512); - ok1(test == TDB_ERR_NOEXIST); - ok1(tdb_check(tdb, NULL, NULL) == 0); - tdb_close(tdb); - tdb_layout_free(layout); - - /* Coalescing can be done due to three free records, then EOF */ - layout = new_tdb_layout(); - tdb_layout_add_freetable(layout); - tdb_layout_add_free(layout, 1024, 0); - tdb_layout_add_free(layout, 512, 0); - tdb_layout_add_free(layout, 256, 0); - tdb_layout_write(layout, free, &tap_log_attr, "run-03-coalesce.tdb"); - /* NOMMAP is for lockcheck. */ - tdb = tdb_open("run-03-coalesce.tdb", TDB_NOMMAP, O_RDWR, 0, - &tap_log_attr); - ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024); - ok1(free_record_length(tdb, layout->elem[2].base.off) == 512); - ok1(free_record_length(tdb, layout->elem[3].base.off) == 256); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Figure out which bucket free entry is. */ - b_off = bucket_off(tdb->tdb2.ftable_off, size_to_bucket(1024)); - /* Lock and coalesce. */ - ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0); - test = layout->elem[2].base.off; - ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test) - == 1024 + sizeof(struct tdb_used_record) + 512 - + sizeof(struct tdb_used_record) + 256); - ok1(tdb->file->allrecord_lock.count == 0 - && tdb->file->num_lockrecs == 0); - ok1(free_record_length(tdb, layout->elem[1].base.off) - == 1024 + sizeof(struct tdb_used_record) + 512 - + sizeof(struct tdb_used_record) + 256); - ok1(tdb_check(tdb, NULL, NULL) == 0); - tdb_close(tdb); - tdb_layout_free(layout); - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/run-04-basichash.c b/ccan/tdb2/test/run-04-basichash.c deleted file mode 100644 index 48527444..00000000 --- a/ccan/tdb2/test/run-04-basichash.c +++ /dev/null @@ -1,260 +0,0 @@ -#include "tdb2-source.h" -#include -#include "logging.h" - -/* We rig the hash so adjacent-numbered records always clash. */ -static uint64_t clash(const void *key, size_t len, uint64_t seed, void *priv) -{ - return ((uint64_t)*(const unsigned int *)key) - << (64 - TDB_TOPLEVEL_HASH_BITS - 1); -} - -int main(int argc, char *argv[]) -{ - unsigned int i, j; - struct tdb_context *tdb; - unsigned int v; - struct tdb_used_record rec; - struct tdb_data key = { (unsigned char *)&v, sizeof(v) }; - struct tdb_data dbuf = { (unsigned char *)&v, sizeof(v) }; - union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH }, - .fn = clash } }; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - }; - - hattr.base.next = &tap_log_attr; - - plan_tests(sizeof(flags) / sizeof(flags[0]) - * (91 + (2 * ((1 << TDB_HASH_GROUP_BITS) - 1))) + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - struct hash_info h; - tdb_off_t new_off, off, subhash; - - tdb = tdb_open("run-04-basichash.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr); - ok1(tdb); - if (!tdb) - continue; - - v = 0; - /* Should not find it. */ - ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0); - /* Should have created correct hash. 
*/ - ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); - /* Should have located space in group 0, bucket 0. */ - ok1(h.group_start == offsetof(struct tdb_header, hashtable)); - ok1(h.home_bucket == 0); - ok1(h.found_bucket == 0); - ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS); - - /* Should have lock on bucket 0 */ - ok1(h.hlock_start == 0); - ok1(h.hlock_range == - 1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS))); - ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1); - ok1((tdb->flags & TDB_NOLOCK) - || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START); - /* FIXME: Check lock length */ - - /* Allocate a new record. */ - new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h, - TDB_USED_MAGIC, false); - ok1(!TDB_OFF_IS_ERR(new_off)); - - /* We should be able to add it now. */ - ok1(add_to_hash(tdb, &h, new_off) == 0); - - /* Make sure we fill it in for later finding. */ - off = new_off + sizeof(struct tdb_used_record); - ok1(!tdb->tdb2.io->twrite(tdb, off, key.dptr, key.dsize)); - off += key.dsize; - ok1(!tdb->tdb2.io->twrite(tdb, off, dbuf.dptr, dbuf.dsize)); - - /* We should be able to unlock that OK. */ - ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, - F_WRLCK) == 0); - - /* Database should be consistent. */ - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Now, this should give a successful lookup. */ - ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) - == new_off); - /* Should have created correct hash. */ - ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); - /* Should have located space in group 0, bucket 0. */ - ok1(h.group_start == offsetof(struct tdb_header, hashtable)); - ok1(h.home_bucket == 0); - ok1(h.found_bucket == 0); - ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS); - - /* Should have lock on bucket 0 */ - ok1(h.hlock_start == 0); - ok1(h.hlock_range == - 1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS))); - ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1); - ok1((tdb->flags & TDB_NOLOCK) - || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START); - /* FIXME: Check lock length */ - - ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, - F_WRLCK) == 0); - - /* Database should be consistent. */ - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Test expansion. */ - v = 1; - ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0); - /* Should have created correct hash. */ - ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); - /* Should have located space in group 0, bucket 1. */ - ok1(h.group_start == offsetof(struct tdb_header, hashtable)); - ok1(h.home_bucket == 0); - ok1(h.found_bucket == 1); - ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS); - - /* Should have lock on bucket 0 */ - ok1(h.hlock_start == 0); - ok1(h.hlock_range == - 1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS))); - ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1); - ok1((tdb->flags & TDB_NOLOCK) - || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START); - /* FIXME: Check lock length */ - - /* Make it expand 0'th bucket. */ - ok1(expand_group(tdb, &h) == 0); - /* First one should be subhash, next should be empty. */ - ok1(is_subhash(h.group[0])); - subhash = (h.group[0] & TDB_OFF_MASK); - for (j = 1; j < (1 << TDB_HASH_GROUP_BITS); j++) - ok1(h.group[j] == 0); - - ok1(tdb_write_convert(tdb, h.group_start, - h.group, sizeof(h.group)) == 0); - ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, - F_WRLCK) == 0); - - /* Should be happy with expansion. */ - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Should be able to find it. 
*/ - v = 0; - ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) - == new_off); - /* Should have created correct hash. */ - ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); - /* Should have located space in expanded group 0, bucket 0. */ - ok1(h.group_start == subhash + sizeof(struct tdb_used_record)); - ok1(h.home_bucket == 0); - ok1(h.found_bucket == 0); - ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS - + TDB_SUBLEVEL_HASH_BITS); - - /* Should have lock on bucket 0 */ - ok1(h.hlock_start == 0); - ok1(h.hlock_range == - 1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS))); - ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1); - ok1((tdb->flags & TDB_NOLOCK) - || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START); - /* FIXME: Check lock length */ - - /* Simple delete should work. */ - ok1(delete_from_hash(tdb, &h) == 0); - ok1(add_free_record(tdb, new_off, - sizeof(struct tdb_used_record) - + rec_key_length(&rec) - + rec_data_length(&rec) - + rec_extra_padding(&rec), - TDB_LOCK_NOWAIT, false) == 0); - ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, - F_WRLCK) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Test second-level expansion: should expand 0th bucket. */ - v = 0; - ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0); - /* Should have created correct hash. */ - ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); - /* Should have located space in group 0, bucket 0. */ - ok1(h.group_start == subhash + sizeof(struct tdb_used_record)); - ok1(h.home_bucket == 0); - ok1(h.found_bucket == 0); - ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS+TDB_SUBLEVEL_HASH_BITS); - - /* Should have lock on bucket 0 */ - ok1(h.hlock_start == 0); - ok1(h.hlock_range == - 1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS))); - ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1); - ok1((tdb->flags & TDB_NOLOCK) - || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START); - /* FIXME: Check lock length */ - - ok1(expand_group(tdb, &h) == 0); - /* First one should be subhash, next should be empty. */ - ok1(is_subhash(h.group[0])); - subhash = (h.group[0] & TDB_OFF_MASK); - for (j = 1; j < (1 << TDB_HASH_GROUP_BITS); j++) - ok1(h.group[j] == 0); - ok1(tdb_write_convert(tdb, h.group_start, - h.group, sizeof(h.group)) == 0); - ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, - F_WRLCK) == 0); - - /* Should be happy with expansion. */ - ok1(tdb_check(tdb, NULL, NULL) == 0); - - ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0); - /* Should have created correct hash. */ - ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); - /* Should have located space in group 0, bucket 0. */ - ok1(h.group_start == subhash + sizeof(struct tdb_used_record)); - ok1(h.home_bucket == 0); - ok1(h.found_bucket == 0); - ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS - + TDB_SUBLEVEL_HASH_BITS * 2); - - /* We should be able to add it now. */ - /* Allocate a new record. */ - new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h, - TDB_USED_MAGIC, false); - ok1(!TDB_OFF_IS_ERR(new_off)); - ok1(add_to_hash(tdb, &h, new_off) == 0); - - /* Make sure we fill it in for later finding. */ - off = new_off + sizeof(struct tdb_used_record); - ok1(!tdb->tdb2.io->twrite(tdb, off, key.dptr, key.dsize)); - off += key.dsize; - ok1(!tdb->tdb2.io->twrite(tdb, off, dbuf.dptr, dbuf.dsize)); - - /* We should be able to unlock that OK. */ - ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, - F_WRLCK) == 0); - - /* Database should be consistent. 
*/ - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Should be able to find it. */ - v = 0; - ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) - == new_off); - /* Should have created correct hash. */ - ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); - /* Should have located space in expanded group 0, bucket 0. */ - ok1(h.group_start == subhash + sizeof(struct tdb_used_record)); - ok1(h.home_bucket == 0); - ok1(h.found_bucket == 0); - ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS - + TDB_SUBLEVEL_HASH_BITS * 2); - - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/run-05-readonly-open.c b/ccan/tdb2/test/run-05-readonly-open.c deleted file mode 100644 index 80eb5675..00000000 --- a/ccan/tdb2/test/run-05-readonly-open.c +++ /dev/null @@ -1,75 +0,0 @@ -#include -#include "tdb2-source.h" -#include -#include -#include "logging.h" -#include "failtest_helper.h" - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, - TDB_VERSION1, TDB_NOMMAP|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - struct tdb_data key = tdb_mkdata("key", 3); - struct tdb_data data = tdb_mkdata("data", 4), d; - union tdb_attribute seed_attr; - unsigned int msgs = 0; - - failtest_init(argc, argv); - failtest_hook = block_repeat_failures; - failtest_exit_check = exit_check_log; - - seed_attr.base.attr = TDB_ATTRIBUTE_SEED; - seed_attr.base.next = &tap_log_attr; - seed_attr.seed.seed = 0; - - failtest_suppress = true; - plan_tests(sizeof(flags) / sizeof(flags[0]) * 11); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-05-readonly-open.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, - flags[i] & TDB_VERSION1 - ? &tap_log_attr : &seed_attr); - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - tdb_close(tdb); - - failtest_suppress = false; - tdb = tdb_open("run-05-readonly-open.tdb", flags[i], - O_RDONLY, 0600, &tap_log_attr); - if (!ok1(tdb)) - break; - ok1(tap_log_messages == msgs); - /* Fetch should succeed, stores should fail. */ - if (!ok1(tdb_fetch(tdb, key, &d) == 0)) - goto fail; - ok1(tdb_deq(d, data)); - free(d.dptr); - if (!ok1(tdb_store(tdb, key, data, TDB_MODIFY) - == TDB_ERR_RDONLY)) - goto fail; - ok1(tap_log_messages == ++msgs); - if (!ok1(tdb_store(tdb, key, data, TDB_INSERT) - == TDB_ERR_RDONLY)) - goto fail; - ok1(tap_log_messages == ++msgs); - failtest_suppress = true; - ok1(tdb_check(tdb, NULL, NULL) == 0); - tdb_close(tdb); - ok1(tap_log_messages == msgs); - /* SIGH: failtest bug, it doesn't save the tdb file because - * we have it read-only. If we go around again, it gets - * changed underneath us and things get screwy. 
*/ - if (failtest_has_failed()) - break; - } - failtest_exit(exit_status()); - -fail: - failtest_suppress = true; - tdb_close(tdb); - failtest_exit(exit_status()); -} diff --git a/ccan/tdb2/test/run-10-simple-store.c b/ccan/tdb2/test/run-10-simple-store.c deleted file mode 100644 index 10bbb498..00000000 --- a/ccan/tdb2/test/run-10-simple-store.c +++ /dev/null @@ -1,63 +0,0 @@ -#include -#include "tdb2-source.h" -#include -#include -#include "logging.h" -#include "failtest_helper.h" - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - struct tdb_data key = tdb_mkdata("key", 3); - struct tdb_data data = tdb_mkdata("data", 4); - - failtest_init(argc, argv); - failtest_hook = block_repeat_failures; - failtest_exit_check = exit_check_log; - - failtest_suppress = true; - plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-10-simple-store.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - if (!ok1(tdb)) - break; - /* Modify should fail. */ - failtest_suppress = false; - if (!ok1(tdb_store(tdb, key, data, TDB_MODIFY) - == TDB_ERR_NOEXIST)) - goto fail; - failtest_suppress = true; - ok1(tdb_check(tdb, NULL, NULL) == 0); - /* Insert should succeed. */ - failtest_suppress = false; - if (!ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0)) - goto fail; - failtest_suppress = true; - ok1(tdb_check(tdb, NULL, NULL) == 0); - /* Second insert should fail. */ - failtest_suppress = false; - if (!ok1(tdb_store(tdb, key, data, TDB_INSERT) - == TDB_ERR_EXISTS)) - goto fail; - failtest_suppress = true; - ok1(tdb_check(tdb, NULL, NULL) == 0); - tdb_close(tdb); - } - ok1(tap_log_messages == 0); - failtest_exit(exit_status()); - -fail: - failtest_suppress = true; - tdb_close(tdb); - failtest_exit(exit_status()); -} diff --git a/ccan/tdb2/test/run-11-simple-fetch.c b/ccan/tdb2/test/run-11-simple-fetch.c deleted file mode 100644 index ad97be30..00000000 --- a/ccan/tdb2/test/run-11-simple-fetch.c +++ /dev/null @@ -1,63 +0,0 @@ -#include -#include "tdb2-source.h" -#include -#include -#include "logging.h" -#include "failtest_helper.h" - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - struct tdb_data key = tdb_mkdata("key", 3); - struct tdb_data data = tdb_mkdata("data", 4); - - failtest_init(argc, argv); - failtest_hook = block_repeat_failures; - failtest_exit_check = exit_check_log; - - failtest_suppress = true; - plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-11-simple-fetch.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (tdb) { - struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */ - - /* fetch should fail. 
*/ - failtest_suppress = false; - if (!ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_NOEXIST)) - goto fail; - failtest_suppress = true; - ok1(tdb_check(tdb, NULL, NULL) == 0); - /* Insert should succeed. */ - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - /* Fetch should now work. */ - failtest_suppress = false; - if (!ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS)) - goto fail; - failtest_suppress = true; - ok1(tdb_deq(d, data)); - free(d.dptr); - ok1(tdb_check(tdb, NULL, NULL) == 0); - tdb_close(tdb); - } - } - ok1(tap_log_messages == 0); - failtest_exit(exit_status()); - -fail: - failtest_suppress = true; - tdb_close(tdb); - failtest_exit(exit_status()); -} diff --git a/ccan/tdb2/test/run-12-check.c b/ccan/tdb2/test/run-12-check.c deleted file mode 100644 index b55bfe7d..00000000 --- a/ccan/tdb2/test/run-12-check.c +++ /dev/null @@ -1,50 +0,0 @@ -#include -#include -#include "tdb2-source.h" -#include -#include -#include "logging.h" -#include "failtest_helper.h" - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, - TDB_INTERNAL|TDB_CONVERT, - TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, - TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1 }; - struct tdb_data key = tdb_mkdata("key", 3); - struct tdb_data data = tdb_mkdata("data", 4); - - failtest_init(argc, argv); - failtest_hook = block_repeat_failures; - failtest_exit_check = exit_check_log; - - failtest_suppress = true; - plan_tests(sizeof(flags) / sizeof(flags[0]) * 3 + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-12-check.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - - /* This is what we really want to test: tdb_check(). 
*/ - failtest_suppress = false; - if (!ok1(tdb_check(tdb, NULL, NULL) == 0)) - goto fail; - failtest_suppress = true; - - tdb_close(tdb); - } - ok1(tap_log_messages == 0); - failtest_exit(exit_status()); - -fail: - failtest_suppress = true; - tdb_close(tdb); - failtest_exit(exit_status()); -} diff --git a/ccan/tdb2/test/run-15-append.c b/ccan/tdb2/test/run-15-append.c deleted file mode 100644 index 39afaf7c..00000000 --- a/ccan/tdb2/test/run-15-append.c +++ /dev/null @@ -1,153 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include "logging.h" - -#define MAX_SIZE 13100 -#define SIZE_STEP 131 - -static tdb_off_t tdb_offset(struct tdb_context *tdb, struct tdb_data key) -{ - tdb_off_t off; - struct tdb_used_record rec; - struct hash_info h; - - if (tdb_get_flags(tdb) & TDB_VERSION1) { - struct tdb1_record rec; - return tdb1_find(tdb, key, tdb_hash(tdb, key.dptr, key.dsize), - &rec); - } - - off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL); - if (TDB_OFF_IS_ERR(off)) - return 0; - tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK); - return off; -} - -int main(int argc, char *argv[]) -{ - unsigned int i, j, moves; - struct tdb_context *tdb; - unsigned char *buffer; - tdb_off_t oldoff = 0, newoff; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - TDB_INTERNAL|TDB_VERSION1, TDB_VERSION1, - TDB_NOMMAP|TDB_VERSION1, - TDB_INTERNAL|TDB_CONVERT|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - struct tdb_data key = tdb_mkdata("key", 3); - struct tdb_data data; - - buffer = malloc(MAX_SIZE); - for (i = 0; i < MAX_SIZE; i++) - buffer[i] = i; - - plan_tests(sizeof(flags) / sizeof(flags[0]) - * ((3 + MAX_SIZE/SIZE_STEP * 5) * 2 + 7) - + 1); - - /* Using tdb_store. */ - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-append.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (!tdb) - continue; - - moves = 0; - for (j = 0; j < MAX_SIZE; j += SIZE_STEP) { - data.dptr = buffer; - data.dsize = j; - ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); - ok1(data.dsize == j); - ok1(memcmp(data.dptr, buffer, data.dsize) == 0); - free(data.dptr); - newoff = tdb_offset(tdb, key); - if (newoff != oldoff) - moves++; - oldoff = newoff; - } - ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0 - && tdb->file->num_lockrecs == 0)); - if (flags[i] & TDB_VERSION1) { - /* TDB1 simply over-size by 25%. */ - ok(moves <= ilog64(j / SIZE_STEP)*4, - "Moved %u times", moves); - } else { - /* We should increase by 50% each time... */ - ok(moves <= ilog64(j / SIZE_STEP)*2, - "Moved %u times", moves); - } - tdb_close(tdb); - } - - /* Using tdb_append. 
*/ - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - size_t prev_len = 0; - tdb = tdb_open("run-append.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (!tdb) - continue; - - moves = 0; - for (j = 0; j < MAX_SIZE; j += SIZE_STEP) { - data.dptr = buffer + prev_len; - data.dsize = j - prev_len; - ok1(tdb_append(tdb, key, data) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); - ok1(data.dsize == j); - ok1(memcmp(data.dptr, buffer, data.dsize) == 0); - free(data.dptr); - prev_len = data.dsize; - newoff = tdb_offset(tdb, key); - if (newoff != oldoff) - moves++; - oldoff = newoff; - } - ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0 - && tdb->file->num_lockrecs == 0)); - if (flags[i] & TDB_VERSION1) { - /* TDB1 simply over-size by 25%. */ - ok(moves <= ilog64(j / SIZE_STEP)*4, - "Moved %u times", moves); - } else { - /* We should increase by 50% each time... */ - ok(moves <= ilog64(j / SIZE_STEP)*2, - "Moved %u times", moves); - } - tdb_close(tdb); - } - - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-append.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (!tdb) - continue; - - /* Huge initial store. */ - data.dptr = buffer; - data.dsize = MAX_SIZE; - ok1(tdb_append(tdb, key, data) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); - ok1(data.dsize == MAX_SIZE); - ok1(memcmp(data.dptr, buffer, data.dsize) == 0); - free(data.dptr); - ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0 - && tdb->file->num_lockrecs == 0)); - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - free(buffer); - return exit_status(); -} diff --git a/ccan/tdb2/test/run-20-growhash.c b/ccan/tdb2/test/run-20-growhash.c deleted file mode 100644 index 65cead03..00000000 --- a/ccan/tdb2/test/run-20-growhash.c +++ /dev/null @@ -1,137 +0,0 @@ -#include "tdb2-source.h" -#include -#include "logging.h" - -static uint64_t myhash(const void *key, size_t len, uint64_t seed, void *priv) -{ - return *(const uint64_t *)key; -} - -static void add_bits(uint64_t *val, unsigned new, unsigned new_bits, - unsigned *done) -{ - *done += new_bits; - *val |= ((uint64_t)new << (64 - *done)); -} - -static uint64_t make_key(unsigned topgroup, unsigned topbucket, - unsigned subgroup1, unsigned subbucket1, - unsigned subgroup2, unsigned subbucket2) -{ - uint64_t key = 0; - unsigned done = 0; - - add_bits(&key, topgroup, TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS, - &done); - add_bits(&key, topbucket, TDB_HASH_GROUP_BITS, &done); - add_bits(&key, subgroup1, TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS, - &done); - add_bits(&key, subbucket1, TDB_HASH_GROUP_BITS, &done); - add_bits(&key, subgroup2, TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS, - &done); - add_bits(&key, subbucket2, TDB_HASH_GROUP_BITS, &done); - return key; -} - -int main(int argc, char *argv[]) -{ - unsigned int i, j; - struct tdb_context *tdb; - uint64_t kdata; - struct tdb_used_record rec; - struct tdb_data key = { (unsigned char *)&kdata, sizeof(kdata) }; - struct tdb_data dbuf = { (unsigned char *)&kdata, sizeof(kdata) }; - union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH }, - .fn = myhash } }; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - }; - - hattr.base.next = &tap_log_attr; - - plan_tests(sizeof(flags) / sizeof(flags[0]) - * (9 + (20 + 2 * ((1 << 
TDB_HASH_GROUP_BITS) - 2)) - * (1 << TDB_HASH_GROUP_BITS)) + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - struct hash_info h; - - tdb = tdb_open("run-20-growhash.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr); - ok1(tdb); - if (!tdb) - continue; - - /* Fill a group. */ - for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) { - kdata = make_key(0, j, 0, 0, 0, 0); - ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); - } - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Check first still exists. */ - kdata = make_key(0, 0, 0, 0, 0, 0); - ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL) != 0); - /* Should have created correct hash. */ - ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); - /* Should have located space in group 0, bucket 0. */ - ok1(h.group_start == offsetof(struct tdb_header, hashtable)); - ok1(h.home_bucket == 0); - ok1(h.found_bucket == 0); - ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS); - /* Entire group should be full! */ - for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) - ok1(h.group[j] != 0); - - ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, - F_RDLCK) == 0); - - /* Now, add one more to each should expand (that) bucket. */ - for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) { - unsigned int k; - kdata = make_key(0, j, 0, 1, 0, 0); - ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL)); - /* Should have created correct hash. */ - ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); - /* Should have moved to subhash */ - ok1(h.group_start >= sizeof(struct tdb_header)); - ok1(h.home_bucket == 1); - ok1(h.found_bucket == 1); - ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS - + TDB_SUBLEVEL_HASH_BITS); - ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, - F_RDLCK) == 0); - - /* Keep adding, make it expand again. */ - for (k = 2; k < (1 << TDB_HASH_GROUP_BITS); k++) { - kdata = make_key(0, j, 0, k, 0, 0); - ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - } - - /* This should tip it over to sub-sub-hash. */ - kdata = make_key(0, j, 0, 0, 0, 1); - ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL)); - /* Should have created correct hash. 
*/ - ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize)); - /* Should have moved to subhash */ - ok1(h.group_start >= sizeof(struct tdb_header)); - ok1(h.home_bucket == 1); - ok1(h.found_bucket == 1); - ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS - + TDB_SUBLEVEL_HASH_BITS + TDB_SUBLEVEL_HASH_BITS); - ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, - F_RDLCK) == 0); - } - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/run-25-hashoverload.c b/ccan/tdb2/test/run-25-hashoverload.c deleted file mode 100644 index 0e143026..00000000 --- a/ccan/tdb2/test/run-25-hashoverload.c +++ /dev/null @@ -1,113 +0,0 @@ -#include "tdb2-source.h" -#include -#include "logging.h" - -static uint64_t badhash(const void *key, size_t len, uint64_t seed, void *priv) -{ - return 0; -} - -static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p) -{ - if (p) - return tdb_delete(tdb, key); - return 0; -} - -int main(int argc, char *argv[]) -{ - unsigned int i, j; - struct tdb_context *tdb; - struct tdb_data key = { (unsigned char *)&j, sizeof(j) }; - struct tdb_data dbuf = { (unsigned char *)&j, sizeof(j) }; - union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH }, - .fn = badhash } }; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT, - }; - - hattr.base.next = &tap_log_attr; - - plan_tests(6883); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */ - - tdb = tdb_open("run-25-hashoverload.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr); - ok1(tdb); - if (!tdb) - continue; - - /* Fill a group. */ - for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) { - ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); - } - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Now store one last value: should form chain. */ - ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Check we can find them all. */ - for (j = 0; j < (1 << TDB_HASH_GROUP_BITS) + 1; j++) { - ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); - ok1(d.dsize == sizeof(j)); - ok1(d.dptr != NULL); - ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0); - free(d.dptr); - } - - /* Now add a *lot* more. */ - for (j = (1 << TDB_HASH_GROUP_BITS) + 1; - j < (16 << TDB_HASH_GROUP_BITS); - j++) { - ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); - ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); - ok1(d.dsize == sizeof(j)); - ok1(d.dptr != NULL); - ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0); - free(d.dptr); - } - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Traverse through them. */ - ok1(tdb_traverse(tdb, trav, NULL) == j); - - /* Empty the first chain-worth. */ - for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) - ok1(tdb_delete(tdb, key) == 0); - - ok1(tdb_check(tdb, NULL, NULL) == 0); - - for (j = (1 << TDB_HASH_GROUP_BITS); - j < (16 << TDB_HASH_GROUP_BITS); - j++) { - ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS); - ok1(d.dsize == sizeof(j)); - ok1(d.dptr != NULL); - ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0); - free(d.dptr); - } - - /* Traverse through them. */ - ok1(tdb_traverse(tdb, trav, NULL) - == (15 << TDB_HASH_GROUP_BITS)); - - /* Re-add */ - for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) { - ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0); - } - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Now try deleting as we go. 
*/ - ok1(tdb_traverse(tdb, trav, trav) - == (16 << TDB_HASH_GROUP_BITS)); - ok1(tdb_check(tdb, NULL, NULL) == 0); - ok1(tdb_traverse(tdb, trav, NULL) == 0); - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/run-30-exhaust-before-expand.c b/ccan/tdb2/test/run-30-exhaust-before-expand.c deleted file mode 100644 index db391e7e..00000000 --- a/ccan/tdb2/test/run-30-exhaust-before-expand.c +++ /dev/null @@ -1,72 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include "logging.h" - -static bool empty_freetable(struct tdb_context *tdb) -{ - struct tdb_freetable ftab; - unsigned int i; - - /* Now, free table should be completely exhausted in zone 0 */ - if (tdb_read_convert(tdb, tdb->tdb2.ftable_off, &ftab, sizeof(ftab)) != 0) - abort(); - - for (i = 0; i < sizeof(ftab.buckets)/sizeof(ftab.buckets[0]); i++) { - if (ftab.buckets[i]) - return false; - } - return true; -} - - -int main(int argc, char *argv[]) -{ - unsigned int i, j; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 9 + 1); - - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - TDB_DATA k; - uint64_t size; - bool was_empty = false; - - k.dptr = (void *)&j; - k.dsize = sizeof(j); - - tdb = tdb_open("run-30-exhaust-before-expand.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (!tdb) - continue; - - ok1(empty_freetable(tdb)); - /* Need some hash lock for expand. */ - ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0); - /* Create some free space. */ - ok1(tdb_expand(tdb, 1) == 0); - ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0); - ok1(tdb_check(tdb, NULL, NULL) == 0); - ok1(!empty_freetable(tdb)); - - size = tdb->file->map_size; - /* Insert minimal-length records until we expand. */ - for (j = 0; tdb->file->map_size == size; j++) { - was_empty = empty_freetable(tdb); - if (tdb_store(tdb, k, k, TDB_INSERT) != 0) - err(1, "Failed to store record %i", j); - } - - /* Would have been empty before expansion, but no longer. 
*/ - ok1(was_empty); - ok1(!empty_freetable(tdb)); - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/run-35-convert.c b/ccan/tdb2/test/run-35-convert.c deleted file mode 100644 index b7b1e6e5..00000000 --- a/ccan/tdb2/test/run-35-convert.c +++ /dev/null @@ -1,57 +0,0 @@ -#include -#include -#include "tdb2-source.h" -#include -#include -#include "logging.h" -#include "failtest_helper.h" - -int main(int argc, char *argv[]) -{ - unsigned int i, messages = 0; - struct tdb_context *tdb; - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, - TDB_VERSION1, TDB_NOMMAP|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - - failtest_init(argc, argv); - failtest_hook = block_repeat_failures; - failtest_exit_check = exit_check_log; - plan_tests(sizeof(flags) / sizeof(flags[0]) * 4); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-35-convert.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - if (!ok1(tdb)) - failtest_exit(exit_status()); - - tdb_close(tdb); - /* If we say TDB_CONVERT, it must be converted */ - tdb = tdb_open("run-35-convert.tdb", - flags[i]|TDB_CONVERT, - O_RDWR, 0600, &tap_log_attr); - if (flags[i] & TDB_CONVERT) { - if (!tdb) - failtest_exit(exit_status()); - ok1(tdb_get_flags(tdb) & TDB_CONVERT); - tdb_close(tdb); - } else { - if (!ok1(!tdb && errno == EIO)) - failtest_exit(exit_status()); - ok1(tap_log_messages == ++messages); - if (!ok1(log_last && strstr(log_last, "TDB_CONVERT"))) - failtest_exit(exit_status()); - } - - /* If don't say TDB_CONVERT, it *may* be converted */ - tdb = tdb_open("run-35-convert.tdb", - flags[i] & ~TDB_CONVERT, - O_RDWR, 0600, &tap_log_attr); - if (!tdb) - failtest_exit(exit_status()); - ok1(tdb_get_flags(tdb) == flags[i]); - tdb_close(tdb); - } - failtest_exit(exit_status()); -} diff --git a/ccan/tdb2/test/run-50-multiple-freelists.c b/ccan/tdb2/test/run-50-multiple-freelists.c deleted file mode 100644 index 44fee941..00000000 --- a/ccan/tdb2/test/run-50-multiple-freelists.c +++ /dev/null @@ -1,70 +0,0 @@ -#include "tdb2-source.h" -#include -#include "logging.h" -#include "layout.h" - -int main(int argc, char *argv[]) -{ - tdb_off_t off; - struct tdb_context *tdb; - struct tdb_layout *layout; - TDB_DATA key, data; - union tdb_attribute seed; - - /* This seed value previously tickled a layout.c bug. */ - seed.base.attr = TDB_ATTRIBUTE_SEED; - seed.seed.seed = 0xb1142bc054d035b4ULL; - seed.base.next = &tap_log_attr; - - plan_tests(11); - key = tdb_mkdata("Hello", 5); - data = tdb_mkdata("world", 5); - - /* Create a TDB with three free tables. */ - layout = new_tdb_layout(); - tdb_layout_add_freetable(layout); - tdb_layout_add_freetable(layout); - tdb_layout_add_freetable(layout); - tdb_layout_add_free(layout, 80, 0); - /* Used record prevent coalescing. 
*/ - tdb_layout_add_used(layout, key, data, 6); - tdb_layout_add_free(layout, 160, 1); - key.dsize--; - tdb_layout_add_used(layout, key, data, 7); - tdb_layout_add_free(layout, 320, 2); - key.dsize--; - tdb_layout_add_used(layout, key, data, 8); - tdb_layout_add_free(layout, 40, 0); - tdb = tdb_layout_get(layout, free, &seed); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - off = get_free(tdb, 0, 80 - sizeof(struct tdb_used_record), 0, - TDB_USED_MAGIC, 0); - ok1(off == layout->elem[3].base.off); - ok1(tdb->tdb2.ftable_off == layout->elem[0].base.off); - - off = get_free(tdb, 0, 160 - sizeof(struct tdb_used_record), 0, - TDB_USED_MAGIC, 0); - ok1(off == layout->elem[5].base.off); - ok1(tdb->tdb2.ftable_off == layout->elem[1].base.off); - - off = get_free(tdb, 0, 320 - sizeof(struct tdb_used_record), 0, - TDB_USED_MAGIC, 0); - ok1(off == layout->elem[7].base.off); - ok1(tdb->tdb2.ftable_off == layout->elem[2].base.off); - - off = get_free(tdb, 0, 40 - sizeof(struct tdb_used_record), 0, - TDB_USED_MAGIC, 0); - ok1(off == layout->elem[9].base.off); - ok1(tdb->tdb2.ftable_off == layout->elem[0].base.off); - - /* Now we fail. */ - off = get_free(tdb, 0, 0, 1, TDB_USED_MAGIC, 0); - ok1(off == 0); - - tdb_close(tdb); - tdb_layout_free(layout); - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/run-56-open-during-transaction.c b/ccan/tdb2/test/run-56-open-during-transaction.c deleted file mode 100644 index 9262c052..00000000 --- a/ccan/tdb2/test/run-56-open-during-transaction.c +++ /dev/null @@ -1,169 +0,0 @@ -#include -#include -#include "lock-tracking.h" - -static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset); -static ssize_t write_check(int fd, const void *buf, size_t count); -static int ftruncate_check(int fd, off_t length); - -#define pwrite pwrite_check -#define write write_check -#define fcntl fcntl_with_lockcheck -#define ftruncate ftruncate_check - -#include "tdb2-source.h" -#include -#include -#include -#include -#include -#include "external-agent.h" -#include "logging.h" - -static struct agent *agent; -static bool opened; -static int errors = 0; -#define TEST_DBNAME "run-56-open-during-transaction.tdb" - -#undef write -#undef pwrite -#undef fcntl -#undef ftruncate - -static bool is_same(const char *snapshot, const char *latest, off_t len) -{ - unsigned i; - - for (i = 0; i < len; i++) { - if (snapshot[i] != latest[i]) - return false; - } - return true; -} - -static bool compare_file(int fd, const char *snapshot, off_t snapshot_len) -{ - char *contents; - bool same; - - /* over-length read serves as length check. */ - contents = malloc(snapshot_len+1); - same = pread(fd, contents, snapshot_len+1, 0) == snapshot_len - && is_same(snapshot, contents, snapshot_len); - free(contents); - return same; -} - -static void check_file_intact(int fd) -{ - enum agent_return ret; - struct stat st; - char *contents; - - fstat(fd, &st); - contents = malloc(st.st_size); - if (pread(fd, contents, st.st_size, 0) != st.st_size) { - diag("Read fail"); - errors++; - return; - } - - /* Ask agent to open file. */ - ret = external_agent_operation(agent, OPEN, TEST_DBNAME); - - /* It's OK to open it, but it must not have changed! 
*/ - if (!compare_file(fd, contents, st.st_size)) { - diag("Agent changed file after opening %s", - agent_return_name(ret)); - errors++; - } - - if (ret == SUCCESS) { - ret = external_agent_operation(agent, CLOSE, NULL); - if (ret != SUCCESS) { - diag("Agent failed to close tdb: %s", - agent_return_name(ret)); - errors++; - } - } else if (ret != WOULD_HAVE_BLOCKED) { - diag("Agent opening file gave %s", - agent_return_name(ret)); - errors++; - } - - free(contents); -} - -static void after_unlock(int fd) -{ - if (opened) - check_file_intact(fd); -} - -static ssize_t pwrite_check(int fd, - const void *buf, size_t count, off_t offset) -{ - if (opened) - check_file_intact(fd); - - return pwrite(fd, buf, count, offset); -} - -static ssize_t write_check(int fd, const void *buf, size_t count) -{ - if (opened) - check_file_intact(fd); - - return write(fd, buf, count); -} - -static int ftruncate_check(int fd, off_t length) -{ - if (opened) - check_file_intact(fd); - - return ftruncate(fd, length); - -} - -int main(int argc, char *argv[]) -{ - const int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, - TDB_VERSION1, TDB_NOMMAP|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - int i; - struct tdb_context *tdb; - TDB_DATA key, data; - - plan_tests(sizeof(flags)/sizeof(flags[0]) * 5); - agent = prepare_external_agent(); - if (!agent) - err(1, "preparing agent"); - - unlock_callback = after_unlock; - for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) { - diag("Test with %s and %s\n", - (flags[i] & TDB_CONVERT) ? "CONVERT" : "DEFAULT", - (flags[i] & TDB_NOMMAP) ? "no mmap" : "mmap"); - unlink(TEST_DBNAME); - tdb = tdb_open(TEST_DBNAME, flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - - opened = true; - ok1(tdb_transaction_start(tdb) == 0); - key = tdb_mkdata("hi", strlen("hi")); - data = tdb_mkdata("world", strlen("world")); - - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - ok1(tdb_transaction_commit(tdb) == 0); - ok(!errors, "We had %u open errors", errors); - - opened = false; - tdb_close(tdb); - } - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-57-die-during-transaction.c b/ccan/tdb2/test/run-57-die-during-transaction.c deleted file mode 100644 index 42102ae7..00000000 --- a/ccan/tdb2/test/run-57-die-during-transaction.c +++ /dev/null @@ -1,296 +0,0 @@ -#include -#include -#include "lock-tracking.h" -#include -#include -#include -static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset); -static ssize_t write_check(int fd, const void *buf, size_t count); -static int ftruncate_check(int fd, off_t length); - -#define pwrite pwrite_check -#define write write_check -#define fcntl fcntl_with_lockcheck -#define ftruncate ftruncate_check - -/* There's a malloc inside transaction_setup_recovery, and valgrind complains - * when we longjmp and leak it. 
*/ -#define MAX_ALLOCATIONS 10 -static void *allocated[MAX_ALLOCATIONS]; -static unsigned max_alloc = 0; - -static void *malloc_noleak(size_t len) -{ - unsigned int i; - - for (i = 0; i < MAX_ALLOCATIONS; i++) - if (!allocated[i]) { - allocated[i] = malloc(len); - if (i > max_alloc) { - max_alloc = i; - diag("max_alloc: %i", max_alloc); - } - return allocated[i]; - } - diag("Too many allocations!"); - abort(); -} - -static void *realloc_noleak(void *p, size_t size) -{ - unsigned int i; - - for (i = 0; i < MAX_ALLOCATIONS; i++) { - if (allocated[i] == p) { - if (i > max_alloc) { - max_alloc = i; - diag("max_alloc: %i", max_alloc); - } - return allocated[i] = realloc(p, size); - } - } - diag("Untracked realloc!"); - abort(); -} - -static void free_noleak(void *p) -{ - unsigned int i; - - /* We don't catch asprintf, so don't complain if we miss one. */ - for (i = 0; i < MAX_ALLOCATIONS; i++) { - if (allocated[i] == p) { - allocated[i] = NULL; - break; - } - } - free(p); -} - -static void free_all(void) -{ - unsigned int i; - - for (i = 0; i < MAX_ALLOCATIONS; i++) { - free(allocated[i]); - allocated[i] = NULL; - } -} - -#define malloc malloc_noleak -#define free free_noleak -#define realloc realloc_noleak - -#include "tdb2-source.h" - -#undef malloc -#undef free -#undef realloc -#undef write -#undef pwrite -#undef fcntl -#undef ftruncate - -#include -#include -#include -#include -#include "external-agent.h" -#include "logging.h" - -static bool in_transaction; -static int target, current; -static jmp_buf jmpbuf; -#define TEST_DBNAME "run-57-die-during-transaction.tdb" -#define KEY_STRING "helloworld" - -static void maybe_die(int fd) -{ - if (in_transaction && current++ == target) { - longjmp(jmpbuf, 1); - } -} - -static ssize_t pwrite_check(int fd, - const void *buf, size_t count, off_t offset) -{ - ssize_t ret; - - maybe_die(fd); - - ret = pwrite(fd, buf, count, offset); - if (ret != count) - return ret; - - maybe_die(fd); - return ret; -} - -static ssize_t write_check(int fd, const void *buf, size_t count) -{ - ssize_t ret; - - maybe_die(fd); - - ret = write(fd, buf, count); - if (ret != count) - return ret; - - maybe_die(fd); - return ret; -} - -static int ftruncate_check(int fd, off_t length) -{ - int ret; - - maybe_die(fd); - - ret = ftruncate(fd, length); - - maybe_die(fd); - return ret; -} - -static bool test_death(enum operation op, struct agent *agent, int flags) -{ - struct tdb_context *tdb = NULL; - TDB_DATA key; - enum agent_return ret; - int needed_recovery = 0; - - current = target = 0; -reset: - unlink(TEST_DBNAME); - tdb = tdb_open(TEST_DBNAME, flags|TDB_NOMMAP, - O_CREAT|O_TRUNC|O_RDWR, 0600, &tap_log_attr); - if (!tdb) { - diag("Failed opening TDB: %s", strerror(errno)); - return false; - } - - if (setjmp(jmpbuf) != 0) { - /* We're partway through. Simulate our death. 
*/ - close(tdb->file->fd); - forget_locking(); - in_transaction = false; - - ret = external_agent_operation(agent, NEEDS_RECOVERY, ""); - if (ret == SUCCESS) - needed_recovery++; - else if (ret != FAILED) { - diag("Step %u agent NEEDS_RECOVERY = %s", current, - agent_return_name(ret)); - return false; - } - - ret = external_agent_operation(agent, op, KEY_STRING); - if (ret != SUCCESS) { - diag("Step %u op %s failed = %s", current, - operation_name(op), - agent_return_name(ret)); - return false; - } - - ret = external_agent_operation(agent, NEEDS_RECOVERY, ""); - if (ret != FAILED) { - diag("Still needs recovery after step %u = %s", - current, agent_return_name(ret)); - return false; - } - - ret = external_agent_operation(agent, CHECK, ""); - if (ret != SUCCESS) { - diag("Step %u check failed = %s", current, - agent_return_name(ret)); - return false; - } - - ret = external_agent_operation(agent, CLOSE, ""); - if (ret != SUCCESS) { - diag("Step %u close failed = %s", current, - agent_return_name(ret)); - return false; - } - - /* Suppress logging as this tries to use closed fd. */ - suppress_logging = true; - suppress_lockcheck = true; - tdb_close(tdb); - suppress_logging = false; - suppress_lockcheck = false; - target++; - current = 0; - free_all(); - goto reset; - } - - /* Put key for agent to fetch. */ - key = tdb_mkdata(KEY_STRING, strlen(KEY_STRING)); - if (tdb_store(tdb, key, key, TDB_INSERT) != 0) - return false; - - /* This is the key we insert in transaction. */ - key.dsize--; - - ret = external_agent_operation(agent, OPEN, TEST_DBNAME); - if (ret != SUCCESS) - errx(1, "Agent failed to open: %s", agent_return_name(ret)); - - ret = external_agent_operation(agent, FETCH, KEY_STRING); - if (ret != SUCCESS) - errx(1, "Agent failed find key: %s", agent_return_name(ret)); - - in_transaction = true; - if (tdb_transaction_start(tdb) != 0) - return false; - - if (tdb_store(tdb, key, key, TDB_INSERT) != 0) - return false; - - if (tdb_transaction_commit(tdb) != 0) - return false; - - in_transaction = false; - - /* We made it! 
*/ - diag("Completed %u runs", current); - tdb_close(tdb); - ret = external_agent_operation(agent, CLOSE, ""); - if (ret != SUCCESS) { - diag("Step %u close failed = %s", current, - agent_return_name(ret)); - return false; - } - - ok1(needed_recovery); - ok1(locking_errors == 0); - ok1(forget_locking() == 0); - locking_errors = 0; - return true; -} - -int main(int argc, char *argv[]) -{ - enum operation ops[] = { FETCH, STORE, TRANSACTION_START }; - struct agent *agent; - int i, flags; - - plan_tests(24); - unlock_callback = maybe_die; - - external_agent_free = free_noleak; - agent = prepare_external_agent(); - if (!agent) - err(1, "preparing agent"); - - for (flags = TDB_DEFAULT; flags <= TDB_VERSION1; flags += TDB_VERSION1) { - for (i = 0; i < sizeof(ops)/sizeof(ops[0]); i++) { - diag("Testing %s after death", operation_name(ops[i])); - ok1(test_death(ops[i], agent, flags)); - } - } - - free_external_agent(agent); - return exit_status(); -} diff --git a/ccan/tdb2/test/run-64-bit-tdb.c b/ccan/tdb2/test/run-64-bit-tdb.c deleted file mode 100644 index 566f5465..00000000 --- a/ccan/tdb2/test/run-64-bit-tdb.c +++ /dev/null @@ -1,72 +0,0 @@ -#include "tdb2-source.h" -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT }; - - if (sizeof(off_t) <= 4) { - plan_tests(1); - pass("No 64 bit off_t"); - return exit_status(); - } - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 14); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - off_t old_size; - TDB_DATA k, d; - struct hash_info h; - struct tdb_used_record rec; - tdb_off_t off; - - tdb = tdb_open("run-64-bit-tdb.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (!tdb) - continue; - - old_size = tdb->file->map_size; - - /* This makes a sparse file */ - ok1(ftruncate(tdb->file->fd, 0xFFFFFFF0) == 0); - ok1(add_free_record(tdb, old_size, 0xFFFFFFF0 - old_size, - TDB_LOCK_WAIT, false) == TDB_SUCCESS); - - /* Now add a little record past the 4G barrier. */ - ok1(tdb_expand_file(tdb, 100) == TDB_SUCCESS); - ok1(add_free_record(tdb, 0xFFFFFFF0, 100, TDB_LOCK_WAIT, false) - == TDB_SUCCESS); - - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - - /* Test allocation path. */ - k = tdb_mkdata("key", 4); - d = tdb_mkdata("data", 5); - ok1(tdb_store(tdb, k, d, TDB_INSERT) == 0); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - - /* Make sure it put it at end as we expected. 
*/ - off = find_and_lock(tdb, k, F_RDLCK, &h, &rec, NULL); - ok1(off >= 0xFFFFFFF0); - tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK); - - ok1(tdb_fetch(tdb, k, &d) == 0); - ok1(d.dsize == 5); - ok1(strcmp((char *)d.dptr, "data") == 0); - free(d.dptr); - - ok1(tdb_delete(tdb, k) == 0); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - - tdb_close(tdb); - } - - /* We might get messages about mmap failing, so don't test - * tap_log_messages */ - return exit_status(); -} diff --git a/ccan/tdb2/test/run-90-get-set-attributes.c b/ccan/tdb2/test/run-90-get-set-attributes.c deleted file mode 100644 index 4cbbda03..00000000 --- a/ccan/tdb2/test/run-90-get-set-attributes.c +++ /dev/null @@ -1,186 +0,0 @@ -#include "tdb2-source.h" -#include -#include "logging.h" - -static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag, - void *unused) -{ - return 0; -} - -static int myunlock(int fd, int rw, off_t off, off_t len, void *unused) -{ - return 0; -} - -static uint64_t hash_fn(const void *key, size_t len, uint64_t seed, - void *priv) -{ - return 0; -} - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, - TDB_VERSION1, TDB_NOMMAP|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - union tdb_attribute seed_attr; - union tdb_attribute hash_attr; - union tdb_attribute lock_attr; - - seed_attr.base.attr = TDB_ATTRIBUTE_SEED; - seed_attr.base.next = &hash_attr; - seed_attr.seed.seed = 100; - - hash_attr.base.attr = TDB_ATTRIBUTE_HASH; - hash_attr.base.next = &lock_attr; - hash_attr.hash.fn = hash_fn; - hash_attr.hash.data = &hash_attr; - - lock_attr.base.attr = TDB_ATTRIBUTE_FLOCK; - lock_attr.base.next = &tap_log_attr; - lock_attr.flock.lock = mylock; - lock_attr.flock.unlock = myunlock; - lock_attr.flock.data = &lock_attr; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 49 - + sizeof(flags) / sizeof(flags[0]) / 2); - - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - union tdb_attribute attr; - - /* First open with no attributes. */ - tdb = tdb_open("run-90-get-set-attributes.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, NULL); - ok1(tdb); - - /* Get log on no attributes will fail */ - attr.base.attr = TDB_ATTRIBUTE_LOG; - ok1(tdb_get_attribute(tdb, &attr) == TDB_ERR_NOEXIST); - /* These always work. */ - attr.base.attr = TDB_ATTRIBUTE_HASH; - ok1(tdb_get_attribute(tdb, &attr) == 0); - ok1(attr.base.attr == TDB_ATTRIBUTE_HASH); - if (flags[i] & TDB_VERSION1) { - ok1(attr.hash.fn == tdb1_old_hash); - } else { - ok1(attr.hash.fn == tdb_jenkins_hash); - } - attr.base.attr = TDB_ATTRIBUTE_FLOCK; - ok1(tdb_get_attribute(tdb, &attr) == 0); - ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK); - ok1(attr.flock.lock == tdb_fcntl_lock); - ok1(attr.flock.unlock == tdb_fcntl_unlock); - attr.base.attr = TDB_ATTRIBUTE_SEED; - if (flags[i] & TDB_VERSION1) { - ok1(tdb_get_attribute(tdb, &attr) == TDB_ERR_EINVAL); - tap_log_messages = 0; - } else { - ok1(tdb_get_attribute(tdb, &attr) == 0); - ok1(attr.base.attr == TDB_ATTRIBUTE_SEED); - /* This is possible, just astronomically unlikely. */ - ok1(attr.seed.seed != 0); - } - - /* Unset attributes. */ - tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG); - tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK); - - /* Set them. */ - ok1(tdb_set_attribute(tdb, &tap_log_attr) == 0); - ok1(tdb_set_attribute(tdb, &lock_attr) == 0); - /* These should fail. 
*/ - ok1(tdb_set_attribute(tdb, &seed_attr) == TDB_ERR_EINVAL); - ok1(tap_log_messages == 1); - ok1(tdb_set_attribute(tdb, &hash_attr) == TDB_ERR_EINVAL); - ok1(tap_log_messages == 2); - tap_log_messages = 0; - - /* Getting them should work as expected. */ - attr.base.attr = TDB_ATTRIBUTE_LOG; - ok1(tdb_get_attribute(tdb, &attr) == 0); - ok1(attr.base.attr == TDB_ATTRIBUTE_LOG); - ok1(attr.log.fn == tap_log_attr.log.fn); - ok1(attr.log.data == tap_log_attr.log.data); - - attr.base.attr = TDB_ATTRIBUTE_FLOCK; - ok1(tdb_get_attribute(tdb, &attr) == 0); - ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK); - ok1(attr.flock.lock == mylock); - ok1(attr.flock.unlock == myunlock); - ok1(attr.flock.data == &lock_attr); - - /* Unset them again. */ - tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK); - ok1(tap_log_messages == 0); - tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG); - ok1(tap_log_messages == 0); - - tdb_close(tdb); - ok1(tap_log_messages == 0); - - /* Now open with all attributes. */ - tdb = tdb_open("run-90-get-set-attributes.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, - &seed_attr); - - if (flags[i] & TDB_VERSION1) { - ok1(!tdb); - ok1(tap_log_messages == 1); - tap_log_messages = 0; - tdb = tdb_open("run-90-get-set-attributes.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, - &hash_attr); - } - ok1(tdb); - - /* Get will succeed */ - attr.base.attr = TDB_ATTRIBUTE_LOG; - ok1(tdb_get_attribute(tdb, &attr) == 0); - ok1(attr.base.attr == TDB_ATTRIBUTE_LOG); - ok1(attr.log.fn == tap_log_attr.log.fn); - ok1(attr.log.data == tap_log_attr.log.data); - - attr.base.attr = TDB_ATTRIBUTE_HASH; - ok1(tdb_get_attribute(tdb, &attr) == 0); - ok1(attr.base.attr == TDB_ATTRIBUTE_HASH); - ok1(attr.hash.fn == hash_fn); - ok1(attr.hash.data == &hash_attr); - - attr.base.attr = TDB_ATTRIBUTE_FLOCK; - ok1(tdb_get_attribute(tdb, &attr) == 0); - ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK); - ok1(attr.flock.lock == mylock); - ok1(attr.flock.unlock == myunlock); - ok1(attr.flock.data == &lock_attr); - - attr.base.attr = TDB_ATTRIBUTE_SEED; - if (flags[i] & TDB_VERSION1) { - ok1(tdb_get_attribute(tdb, &attr) == TDB_ERR_EINVAL); - ok1(tap_log_messages == 1); - tap_log_messages = 0; - } else { - ok1(tdb_get_attribute(tdb, &attr) == 0); - ok1(attr.base.attr == TDB_ATTRIBUTE_SEED); - ok1(attr.seed.seed == seed_attr.seed.seed); - } - - /* Unset attributes. */ - tdb_unset_attribute(tdb, TDB_ATTRIBUTE_HASH); - ok1(tap_log_messages == 1); - tdb_unset_attribute(tdb, TDB_ATTRIBUTE_SEED); - ok1(tap_log_messages == 2); - tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK); - tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG); - ok1(tap_log_messages == 2); - tap_log_messages = 0; - - tdb_close(tdb); - - } - return exit_status(); -} diff --git a/ccan/tdb2/test/run-capabilities.c b/ccan/tdb2/test/run-capabilities.c deleted file mode 100644 index 4b25f9c5..00000000 --- a/ccan/tdb2/test/run-capabilities.c +++ /dev/null @@ -1,272 +0,0 @@ -#include -#include "tdb2-source.h" -#include -#include "logging.h" -#include "layout.h" -#include "failtest_helper.h" -#include -#include - -static size_t len_of(bool breaks_check, bool breaks_write, bool breaks_open) -{ - size_t len = 0; - if (breaks_check) - len += 8; - if (breaks_write) - len += 16; - if (breaks_open) - len += 32; - return len; -} - -/* Creates a TDB with various capabilities. */ -static void create_tdb(const char *name, - unsigned int cap, - bool breaks_check, - bool breaks_write, - bool breaks_open, ...) 
-{ - TDB_DATA key, data; - va_list ap; - struct tdb_layout *layout; - struct tdb_context *tdb; - int fd; - - key = tdb_mkdata("Hello", 5); - data = tdb_mkdata("world", 5); - - /* Create a TDB with some data, and some capabilities */ - layout = new_tdb_layout(); - tdb_layout_add_freetable(layout); - tdb_layout_add_used(layout, key, data, 6); - tdb_layout_add_free(layout, 80, 0); - tdb_layout_add_capability(layout, cap, - breaks_write, breaks_check, breaks_open, - len_of(breaks_check, breaks_write, breaks_open)); - - va_start(ap, breaks_open); - while ((cap = va_arg(ap, int)) != 0) { - breaks_check = va_arg(ap, int); - breaks_write = va_arg(ap, int); - breaks_open = va_arg(ap, int); - - key.dsize--; - tdb_layout_add_used(layout, key, data, 11 - key.dsize); - tdb_layout_add_free(layout, 80, 0); - tdb_layout_add_capability(layout, cap, - breaks_write, breaks_check, - breaks_open, - len_of(breaks_check, breaks_write, - breaks_open)); - } - va_end(ap); - - /* We open-code this, because we need to use the failtest write. */ - tdb = tdb_layout_get(layout, failtest_free, &tap_log_attr); - - fd = open(name, O_RDWR|O_TRUNC|O_CREAT, 0600); - if (fd < 0) - err(1, "opening %s for writing", name); - if (write(fd, tdb->file->map_ptr, tdb->file->map_size) - != tdb->file->map_size) - err(1, "writing %s", name); - close(fd); - tdb_close(tdb); - tdb_layout_free(layout); -} - -/* Note all the "goto out" early exits: they're to shorten failtest time. */ -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - char *summary; - - failtest_init(argc, argv); - failtest_hook = block_repeat_failures; - failtest_exit_check = exit_check_log; - plan_tests(60); - - failtest_suppress = true; - /* Capability says you can ignore it? */ - create_tdb("run-capabilities.tdb", 1, false, false, false, 0); - - failtest_suppress = false; - tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0, - &tap_log_attr); - failtest_suppress = true; - if (!ok1(tdb)) - goto out; - ok1(tap_log_messages == 0); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - ok1(tap_log_messages == 0); - tdb_close(tdb); - - /* Two capabilitues say you can ignore them? */ - create_tdb("run-capabilities.tdb", - 1, false, false, false, - 2, false, false, false, 0); - - failtest_suppress = false; - tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0, - &tap_log_attr); - failtest_suppress = true; - if (!ok1(tdb)) - goto out; - ok1(tap_log_messages == 0); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - ok1(tap_log_messages == 0); - ok1(tdb_summary(tdb, 0, &summary) == TDB_SUCCESS); - ok1(strstr(summary, "Capability 1\n")); - free(summary); - tdb_close(tdb); - - /* Capability says you can't check. */ - create_tdb("run-capabilities.tdb", - 1, false, false, false, - 2, true, false, false, 0); - - failtest_suppress = false; - tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0, - &tap_log_attr); - failtest_suppress = true; - if (!ok1(tdb)) - goto out; - ok1(tap_log_messages == 0); - ok1(tdb_get_flags(tdb) & TDB_CANT_CHECK); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - /* We expect a warning! */ - ok1(tap_log_messages == 1); - ok1(strstr(log_last, "capabilit")); - ok1(tdb_summary(tdb, 0, &summary) == TDB_SUCCESS); - ok1(strstr(summary, "Capability 1\n")); - ok1(strstr(summary, "Capability 2 (uncheckable)\n")); - free(summary); - tdb_close(tdb); - - /* Capability says you can't write. 
*/ - create_tdb("run-capabilities.tdb", - 1, false, false, false, - 2, false, true, false, 0); - - failtest_suppress = false; - tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0, - &tap_log_attr); - failtest_suppress = true; - /* We expect a message. */ - ok1(!tdb); - if (!ok1(tap_log_messages == 2)) - goto out; - if (!ok1(strstr(log_last, "unknown"))) - goto out; - ok1(strstr(log_last, "write")); - - /* We can open it read-only though! */ - failtest_suppress = false; - tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDONLY, 0, - &tap_log_attr); - failtest_suppress = true; - if (!ok1(tdb)) - goto out; - ok1(tap_log_messages == 2); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - ok1(tap_log_messages == 2); - ok1(tdb_summary(tdb, 0, &summary) == TDB_SUCCESS); - ok1(strstr(summary, "Capability 1\n")); - ok1(strstr(summary, "Capability 2 (read-only)\n")); - free(summary); - tdb_close(tdb); - - /* Capability says you can't open. */ - create_tdb("run-capabilities.tdb", - 1, false, false, false, - 2, false, false, true, 0); - - failtest_suppress = false; - tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0, - &tap_log_attr); - failtest_suppress = true; - /* We expect a message. */ - ok1(!tdb); - if (!ok1(tap_log_messages == 3)) - goto out; - if (!ok1(strstr(log_last, "unknown"))) - goto out; - - /* Combine capabilities correctly. */ - create_tdb("run-capabilities.tdb", - 1, false, false, false, - 2, true, false, false, - 3, false, true, false, 0); - - failtest_suppress = false; - tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0, - &tap_log_attr); - failtest_suppress = true; - /* We expect a message. */ - ok1(!tdb); - if (!ok1(tap_log_messages == 4)) - goto out; - if (!ok1(strstr(log_last, "unknown"))) - goto out; - ok1(strstr(log_last, "write")); - - /* We can open it read-only though! */ - failtest_suppress = false; - tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDONLY, 0, - &tap_log_attr); - failtest_suppress = true; - if (!ok1(tdb)) - goto out; - ok1(tap_log_messages == 4); - ok1(tdb_get_flags(tdb) & TDB_CANT_CHECK); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - /* We expect a warning! */ - ok1(tap_log_messages == 5); - ok1(strstr(log_last, "unknown")); - ok1(tdb_summary(tdb, 0, &summary) == TDB_SUCCESS); - ok1(strstr(summary, "Capability 1\n")); - ok1(strstr(summary, "Capability 2 (uncheckable)\n")); - ok1(strstr(summary, "Capability 3 (read-only)\n")); - free(summary); - tdb_close(tdb); - - /* Two capability flags in one. */ - create_tdb("run-capabilities.tdb", - 1, false, false, false, - 2, true, true, false, - 0); - - failtest_suppress = false; - tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDWR, 0, - &tap_log_attr); - failtest_suppress = true; - /* We expect a message. */ - ok1(!tdb); - if (!ok1(tap_log_messages == 6)) - goto out; - if (!ok1(strstr(log_last, "unknown"))) - goto out; - ok1(strstr(log_last, "write")); - - /* We can open it read-only though! */ - failtest_suppress = false; - tdb = tdb_open("run-capabilities.tdb", TDB_DEFAULT, O_RDONLY, 0, - &tap_log_attr); - failtest_suppress = true; - if (!ok1(tdb)) - goto out; - ok1(tap_log_messages == 6); - ok1(tdb_get_flags(tdb) & TDB_CANT_CHECK); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - /* We expect a warning! 
*/ - ok1(tap_log_messages == 7); - ok1(strstr(log_last, "unknown")); - ok1(tdb_summary(tdb, 0, &summary) == TDB_SUCCESS); - ok1(strstr(summary, "Capability 1\n")); - ok1(strstr(summary, "Capability 2 (uncheckable,read-only)\n")); - free(summary); - tdb_close(tdb); - -out: - failtest_exit(exit_status()); -} diff --git a/ccan/tdb2/test/run-expand-in-transaction.c b/ccan/tdb2/test/run-expand-in-transaction.c deleted file mode 100644 index 0fa2a57f..00000000 --- a/ccan/tdb2/test/run-expand-in-transaction.c +++ /dev/null @@ -1,39 +0,0 @@ -#include "tdb2-source.h" -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, - TDB_VERSION1, TDB_NOMMAP|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - struct tdb_data key = tdb_mkdata("key", 3); - struct tdb_data data = tdb_mkdata("data", 4); - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1); - - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - size_t size; - tdb = tdb_open("run-expand-in-transaction.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (!tdb) - continue; - - size = tdb->file->map_size; - ok1(tdb_transaction_start(tdb) == 0); - ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0); - ok1(tdb->file->map_size > size); - ok1(tdb_transaction_commit(tdb) == 0); - ok1(tdb->file->map_size > size); - ok1(tdb_check(tdb, NULL, NULL) == 0); - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/run-features.c b/ccan/tdb2/test/run-features.c deleted file mode 100644 index b086869d..00000000 --- a/ccan/tdb2/test/run-features.c +++ /dev/null @@ -1,64 +0,0 @@ -#include "tdb2-source.h" -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - unsigned int i, j; - struct tdb_context *tdb; - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT }; - struct tdb_data key = { (unsigned char *)&j, sizeof(j) }; - struct tdb_data data = { (unsigned char *)&j, sizeof(j) }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - uint64_t features; - tdb = tdb_open("run-features.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (!tdb) - continue; - - /* Put some stuff in there. */ - for (j = 0; j < 100; j++) { - if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) - fail("Storing in tdb"); - } - - /* Mess with features fields in hdr. */ - features = (~TDB_FEATURE_MASK ^ 1); - ok1(tdb_write_convert(tdb, offsetof(struct tdb_header, - features_used), - &features, sizeof(features)) == 0); - ok1(tdb_write_convert(tdb, offsetof(struct tdb_header, - features_offered), - &features, sizeof(features)) == 0); - tdb_close(tdb); - - tdb = tdb_open("run-features.tdb", flags[i], O_RDWR, 0, - &tap_log_attr); - ok1(tdb); - if (!tdb) - continue; - - /* Should not have changed features offered. */ - ok1(tdb_read_convert(tdb, offsetof(struct tdb_header, - features_offered), - &features, sizeof(features)) == 0); - ok1(features == (~TDB_FEATURE_MASK ^ 1)); - - /* Should have cleared unknown bits in features_used. 
*/ - ok1(tdb_read_convert(tdb, offsetof(struct tdb_header, - features_used), - &features, sizeof(features)) == 0); - ok1(features == (1 & TDB_FEATURE_MASK)); - - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} - - diff --git a/ccan/tdb2/test/run-lockall.c b/ccan/tdb2/test/run-lockall.c deleted file mode 100644 index 7cd9b849..00000000 --- a/ccan/tdb2/test/run-lockall.c +++ /dev/null @@ -1,74 +0,0 @@ -#include -#include -#include "lock-tracking.h" - -#define fcntl fcntl_with_lockcheck -#include "tdb2-source.h" - -#include -#include -#include -#include -#include -#include "external-agent.h" -#include "logging.h" - -#define TEST_DBNAME "run-lockall.tdb" - -#undef fcntl - -int main(int argc, char *argv[]) -{ - struct agent *agent; - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT, - TDB_VERSION1, TDB_NOMMAP|TDB_VERSION1, - TDB_CONVERT|TDB_VERSION1, - TDB_NOMMAP|TDB_CONVERT|TDB_VERSION1 }; - int i; - - plan_tests(13 * sizeof(flags)/sizeof(flags[0]) + 1); - agent = prepare_external_agent(); - if (!agent) - err(1, "preparing agent"); - - for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) { - enum agent_return ret; - struct tdb_context *tdb; - - tdb = tdb_open(TEST_DBNAME, flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - - ret = external_agent_operation(agent, OPEN, TEST_DBNAME); - ok1(ret == SUCCESS); - - ok1(tdb_lockall(tdb) == TDB_SUCCESS); - ok1(external_agent_operation(agent, STORE, "key") - == WOULD_HAVE_BLOCKED); - ok1(external_agent_operation(agent, FETCH, "key") - == WOULD_HAVE_BLOCKED); - /* Test nesting. */ - ok1(tdb_lockall(tdb) == TDB_SUCCESS); - tdb_unlockall(tdb); - tdb_unlockall(tdb); - - ok1(external_agent_operation(agent, STORE, "key") == SUCCESS); - - ok1(tdb_lockall_read(tdb) == TDB_SUCCESS); - ok1(external_agent_operation(agent, STORE, "key") - == WOULD_HAVE_BLOCKED); - ok1(external_agent_operation(agent, FETCH, "key") == SUCCESS); - ok1(tdb_lockall_read(tdb) == TDB_SUCCESS); - tdb_unlockall_read(tdb); - tdb_unlockall_read(tdb); - - ok1(external_agent_operation(agent, STORE, "key") == SUCCESS); - ok1(external_agent_operation(agent, CLOSE, NULL) == SUCCESS); - tdb_close(tdb); - } - - free_external_agent(agent); - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/run-remap-in-read_traverse.c b/ccan/tdb2/test/run-remap-in-read_traverse.c deleted file mode 100644 index b70a841a..00000000 --- a/ccan/tdb2/test/run-remap-in-read_traverse.c +++ /dev/null @@ -1,57 +0,0 @@ -#include "tdb2-source.h" -/* We had a bug where we marked the tdb read-only for a tdb_traverse_read. - * If we then expanded the tdb, we would remap read-only, and later SEGV. 
*/ -#include -#include "external-agent.h" -#include "logging.h" - -static bool file_larger(int fd, tdb_len_t size) -{ - struct stat st; - - fstat(fd, &st); - return st.st_size != size; -} - -static unsigned add_records_to_grow(struct agent *agent, int fd, tdb_len_t size) -{ - unsigned int i; - - for (i = 0; !file_larger(fd, size); i++) { - char data[20]; - sprintf(data, "%i", i); - if (external_agent_operation(agent, STORE, data) != SUCCESS) - return 0; - } - diag("Added %u records to grow file", i); - return i; -} - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct agent *agent; - struct tdb_context *tdb; - struct tdb_data d = tdb_mkdata("hello", 5); - const char filename[] = "run-remap-in-read_traverse.tdb"; - - plan_tests(4); - - agent = prepare_external_agent(); - - tdb = tdb_open(filename, TDB_DEFAULT, - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - - ok1(external_agent_operation(agent, OPEN, filename) == SUCCESS); - i = add_records_to_grow(agent, tdb->file->fd, tdb->file->map_size); - - /* Do a traverse. */ - ok1(tdb_traverse(tdb, NULL, NULL) == i); - - /* Now store something! */ - ok1(tdb_store(tdb, d, d, TDB_INSERT) == 0); - ok1(tap_log_messages == 0); - tdb_close(tdb); - free_external_agent(agent); - return exit_status(); -} diff --git a/ccan/tdb2/test/run-seed.c b/ccan/tdb2/test/run-seed.c deleted file mode 100644 index 09193964..00000000 --- a/ccan/tdb2/test/run-seed.c +++ /dev/null @@ -1,61 +0,0 @@ -#include "tdb2-source.h" -#include -#include "logging.h" - -static int log_count = 0; - -/* Normally we get a log when setting random seed. */ -static void my_log_fn(struct tdb_context *tdb, - enum tdb_log_level level, - enum TDB_ERROR ecode, - const char *message, void *priv) -{ - log_count++; -} - -static union tdb_attribute log_attr = { - .log = { .base = { .attr = TDB_ATTRIBUTE_LOG }, - .fn = my_log_fn } -}; - -int main(int argc, char *argv[]) -{ - unsigned int i; - struct tdb_context *tdb; - union tdb_attribute attr; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT }; - - attr.seed.base.attr = TDB_ATTRIBUTE_SEED; - attr.seed.base.next = &log_attr; - attr.seed.seed = 42; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 4 * 3); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - struct tdb_header hdr; - int fd; - tdb = tdb_open("run-seed.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &attr); - ok1(tdb); - if (!tdb) - continue; - ok1(tdb_check(tdb, NULL, NULL) == 0); - ok1(tdb->hash_seed == 42); - ok1(log_count == 0); - tdb_close(tdb); - - if (flags[i] & TDB_INTERNAL) - continue; - - fd = open("run-seed.tdb", O_RDONLY); - ok1(fd >= 0); - ok1(read(fd, &hdr, sizeof(hdr)) == sizeof(hdr)); - if (flags[i] & TDB_CONVERT) - ok1(bswap_64(hdr.hash_seed) == 42); - else - ok1(hdr.hash_seed == 42); - close(fd); - } - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-3G-file.c b/ccan/tdb2/test/run-tdb1-3G-file.c deleted file mode 100644 index f3403dd1..00000000 --- a/ccan/tdb2/test/run-tdb1-3G-file.c +++ /dev/null @@ -1,125 +0,0 @@ -/* We need this otherwise fcntl locking fails. 
*/ -#define _FILE_OFFSET_BITS 64 -#include "tdb2-source.h" -#include -#include -#include -#include "logging.h" - -static int tdb1_expand_file_sparse(struct tdb_context *tdb, - tdb1_off_t size, - tdb1_off_t addition) -{ - if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) { - tdb->last_error = TDB_ERR_RDONLY; - return -1; - } - - if (ftruncate(tdb->file->fd, size+addition) == -1) { - char b = 0; - ssize_t written = pwrite(tdb->file->fd, &b, 1, (size+addition) - 1); - if (written == 0) { - /* try once more, potentially revealing errno */ - written = pwrite(tdb->file->fd, &b, 1, (size+addition) - 1); - } - if (written == 0) { - /* again - give up, guessing errno */ - errno = ENOSPC; - } - if (written != 1) { - tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "expand_file to %d failed (%s)", - size+addition, - strerror(errno)); - return -1; - } - } - - return 0; -} - -static const struct tdb1_methods large_io_methods = { - tdb1_read, - tdb1_write, - tdb1_next_hash_chain, - tdb1_oob, - tdb1_expand_file_sparse -}; - -static int test_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, - void *_data) -{ - TDB_DATA *expect = _data; - ok1(key.dsize == strlen("hi")); - ok1(memcmp(key.dptr, "hi", strlen("hi")) == 0); - ok1(data.dsize == expect->dsize); - ok1(memcmp(data.dptr, expect->dptr, data.dsize) == 0); - return 0; -} - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - TDB_DATA key, orig_data, data; - uint32_t hash; - tdb1_off_t rec_ptr; - struct tdb1_record rec; - union tdb_attribute hsize; - - hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - hsize.base.next = &tap_log_attr; - hsize.tdb1_hashsize.hsize = 1024; - - plan_tests(26); - tdb = tdb_open("run-36-file.tdb1", TDB_VERSION1, - O_CREAT|O_TRUNC|O_RDWR, 0600, &hsize); - - ok1(tdb); - tdb->tdb1.io = &large_io_methods; - - /* Enlarge the file (internally multiplies by 2). */ - ok1(tdb1_expand(tdb, 1500000000) == 0); - - /* Put an entry in, and check it. */ - key.dsize = strlen("hi"); - key.dptr = (void *)"hi"; - orig_data.dsize = strlen("world"); - orig_data.dptr = (void *)"world"; - - ok1(tdb_store(tdb, key, orig_data, TDB_INSERT) == TDB_SUCCESS); - - ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); - ok1(data.dsize == strlen("world")); - ok1(memcmp(data.dptr, "world", strlen("world")) == 0); - free(data.dptr); - - /* That currently fills at the end, make sure that's true. */ - hash = tdb_hash(tdb, key.dptr, key.dsize); - rec_ptr = tdb1_find_lock_hash(tdb, key, hash, F_RDLCK, &rec); - ok1(rec_ptr); - ok1(rec_ptr > 2U*1024*1024*1024); - tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK); - - /* Traverse must work. */ - ok1(tdb_traverse(tdb, test_traverse, &orig_data) == 1); - - /* Delete should work. */ - ok1(tdb_delete(tdb, key) == TDB_SUCCESS); - - ok1(tdb_traverse(tdb, test_traverse, NULL) == 0); - - /* Transactions should work. 
*/ - ok1(tdb_transaction_start(tdb) == TDB_SUCCESS); - ok1(tdb_store(tdb, key, orig_data, TDB_INSERT) == TDB_SUCCESS); - - ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); - ok1(data.dsize == strlen("world")); - ok1(memcmp(data.dptr, "world", strlen("world")) == 0); - free(data.dptr); - ok1(tdb_transaction_commit(tdb) == TDB_SUCCESS); - - ok1(tdb_traverse(tdb, test_traverse, &orig_data) == 1); - tdb_close(tdb); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-bad-tdb-header.c b/ccan/tdb2/test/run-tdb1-bad-tdb-header.c deleted file mode 100644 index c1d76740..00000000 --- a/ccan/tdb2/test/run-tdb1-bad-tdb-header.c +++ /dev/null @@ -1,52 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - struct tdb1_header hdr; - int fd; - union tdb_attribute hsize; - - hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - hsize.base.next = &tap_log_attr; - hsize.tdb1_hashsize.hsize = 1024; - - plan_tests(11); - /* Cannot open fine if complete crap, even with O_CREAT. */ - fd = open("run-bad-tdb-header.tdb1", O_RDWR|O_CREAT|O_TRUNC, 0600); - ok1(fd >= 0); - ok1(write(fd, "hello world", 11) == 11); - close(fd); - tdb = tdb_open("run-bad-tdb-header.tdb1", 0, O_RDWR, 0, &tap_log_attr); - ok1(!tdb); - tdb = tdb_open("run-bad-tdb-header.tdb1", 0, O_CREAT|O_RDWR, - 0600, &hsize); - ok1(!tdb); - - /* With truncate, will be fine. */ - tdb = tdb_open("run-bad-tdb-header.tdb1", TDB_VERSION1, - O_RDWR|O_CREAT|O_TRUNC, 0600, &hsize); - ok1(tdb); - tdb_close(tdb); - - /* Now, with wrong version it should *not* overwrite. */ - fd = open("run-bad-tdb-header.tdb1", O_RDWR); - ok1(fd >= 0); - ok1(read(fd, &hdr, sizeof(hdr)) == sizeof(hdr)); - ok1(hdr.version == TDB1_VERSION); - hdr.version++; - lseek(fd, 0, SEEK_SET); - ok1(write(fd, &hdr, sizeof(hdr)) == sizeof(hdr)); - close(fd); - - tdb = tdb_open("run-bad-tdb-header.tdb1", TDB_VERSION1, O_RDWR|O_CREAT, - 0600, &hsize); - ok1(errno == EIO); - ok1(!tdb); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-check.c b/ccan/tdb2/test/run-tdb1-check.c deleted file mode 100644 index e939d040..00000000 --- a/ccan/tdb2/test/run-tdb1-check.c +++ /dev/null @@ -1,59 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - TDB_DATA key, data; - union tdb_attribute hsize; - - hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - hsize.base.next = &tap_log_attr; - hsize.tdb1_hashsize.hsize = 1; - - plan_tests(13); - tdb = tdb_open("run-check.tdb1", TDB_VERSION1, - O_CREAT|O_TRUNC|O_RDWR, 0600, &hsize); - - ok1(tdb); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - - key.dsize = strlen("hi"); - key.dptr = (void *)"hi"; - data.dsize = strlen("world"); - data.dptr = (void *)"world"; - - ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_SUCCESS); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - tdb = tdb_open("run-check.tdb1", TDB_VERSION1, O_RDWR, 0, &tap_log_attr); - ok1(tdb); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - tdb = tdb_open("test/tdb1.corrupt", TDB_VERSION1, O_RDWR, 0, - &tap_log_attr); - ok1(tdb); - ok1(tdb_check(tdb, NULL, NULL) == TDB_ERR_CORRUPT); - ok1(tdb_error(tdb) == TDB_ERR_CORRUPT); - tdb_close(tdb); - - /* Big and little endian should work! 
*/ - tdb = tdb_open("test/old-nohash-le.tdb1", TDB_VERSION1, O_RDWR, 0, - &tap_log_attr); - ok1(tdb); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - tdb = tdb_open("test/old-nohash-be.tdb1", TDB_VERSION1, O_RDWR, 0, - &tap_log_attr); - ok1(tdb); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-corrupt.c b/ccan/tdb2/test/run-tdb1-corrupt.c deleted file mode 100644 index 35bc4c3f..00000000 --- a/ccan/tdb2/test/run-tdb1-corrupt.c +++ /dev/null @@ -1,123 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include -#include "logging.h" - -static int check(TDB_DATA key, TDB_DATA data, void *private) -{ - unsigned int *sizes = private; - - if (key.dsize > strlen("hello")) - return -1; - if (memcmp(key.dptr, "hello", key.dsize) != 0) - return -1; - - if (data.dsize != strlen("world")) - return -1; - if (memcmp(data.dptr, "world", data.dsize) != 0) - return -1; - - sizes[0] += key.dsize; - sizes[1] += data.dsize; - return 0; -} - -static void tdb1_flip_bit(struct tdb_context *tdb, unsigned int bit) -{ - unsigned int off = bit / CHAR_BIT; - unsigned char mask = (1 << (bit % CHAR_BIT)); - - if (tdb->file->map_ptr) - ((unsigned char *)tdb->file->map_ptr)[off] ^= mask; - else { - unsigned char c; - if (pread(tdb->file->fd, &c, 1, off) != 1) - err(1, "pread"); - c ^= mask; - if (pwrite(tdb->file->fd, &c, 1, off) != 1) - err(1, "pwrite"); - } -} - -static void check_test(struct tdb_context *tdb) -{ - TDB_DATA key, data; - unsigned int i, verifiable, corrupt, sizes[2], dsize, ksize; - - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - - key.dptr = (void *)"hello"; - data.dsize = strlen("world"); - data.dptr = (void *)"world"; - - /* Key and data size respectively. */ - dsize = ksize = 0; - - /* 5 keys in hash size 2 means we'll have multichains. */ - for (key.dsize = 1; key.dsize <= 5; key.dsize++) { - ksize += key.dsize; - dsize += data.dsize; - if (tdb_store(tdb, key, data, TDB_INSERT) != TDB_SUCCESS) - abort(); - } - - /* This is how many bytes we expect to be verifiable. */ - /* From the file header. */ - verifiable = strlen(TDB_MAGIC_FOOD) + 1 - + 2 * sizeof(uint32_t) + 2 * sizeof(tdb1_off_t) - + 2 * sizeof(uint32_t); - /* From the free list chain and hash chains. */ - verifiable += 3 * sizeof(tdb1_off_t); - /* From the record headers & tailer */ - verifiable += 5 * (sizeof(struct tdb1_record) + sizeof(uint32_t)); - /* The free block: we ignore datalen, keylen, full_hash. */ - verifiable += sizeof(struct tdb1_record) - 3*sizeof(uint32_t) + - sizeof(uint32_t); - /* Our check function verifies the key and data. */ - verifiable += ksize + dsize; - - /* Flip one bit at a time, make sure it detects verifiable bytes. */ - for (i = 0, corrupt = 0; i < tdb->file->map_size * CHAR_BIT; i++) { - tdb1_flip_bit(tdb, i); - memset(sizes, 0, sizeof(sizes)); - if (tdb_check(tdb, check, sizes) == TDB_ERR_CORRUPT) - corrupt++; - else if (sizes[0] != ksize || sizes[1] != dsize) - corrupt++; - tdb1_flip_bit(tdb, i); - } - ok(corrupt == verifiable * CHAR_BIT, "corrupt %u should be %u", - corrupt, verifiable * CHAR_BIT); -} - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - union tdb_attribute hsize; - - hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - hsize.base.next = &tap_log_attr; - hsize.tdb1_hashsize.hsize = 2; - - plan_tests(4); - /* This should use mmap. 
*/ - tdb = tdb_open("run-corrupt.tdb1", TDB_VERSION1, - O_CREAT|O_TRUNC|O_RDWR, 0600, &hsize); - - if (!tdb) - abort(); - check_test(tdb); - tdb_close(tdb); - - /* This should not. */ - tdb = tdb_open("run-corrupt.tdb1", TDB_VERSION1|TDB_NOMMAP, - O_CREAT|O_TRUNC|O_RDWR, 0600, &hsize); - - if (!tdb) - abort(); - check_test(tdb); - tdb_close(tdb); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-endian.c b/ccan/tdb2/test/run-tdb1-endian.c deleted file mode 100644 index 3b91d45b..00000000 --- a/ccan/tdb2/test/run-tdb1-endian.c +++ /dev/null @@ -1,56 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - TDB_DATA key, data; - union tdb_attribute hsize; - - hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - hsize.base.next = &tap_log_attr; - hsize.tdb1_hashsize.hsize = 1024; - - plan_tests(14); - tdb = tdb_open("run-endian.tdb1", - TDB_VERSION1|TDB_CONVERT, - O_CREAT|O_TRUNC|O_RDWR, 0600, &hsize); - - ok1(tdb); - key.dsize = strlen("hi"); - key.dptr = (void *)"hi"; - data.dsize = strlen("world"); - data.dptr = (void *)"world"; - - ok1(tdb_store(tdb, key, data, TDB_MODIFY) == TDB_ERR_NOEXIST); - ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_SUCCESS); - ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_ERR_EXISTS); - ok1(tdb_store(tdb, key, data, TDB_MODIFY) == TDB_SUCCESS); - - ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); - ok1(data.dsize == strlen("world")); - ok1(memcmp(data.dptr, "world", strlen("world")) == 0); - free(data.dptr); - - key.dsize++; - ok1(tdb_fetch(tdb, key, &data) == TDB_ERR_NOEXIST); - ok1(data.dptr == NULL); - tdb_close(tdb); - - /* Reopen: should read it */ - tdb = tdb_open("run-endian.tdb1", 0, O_RDWR, 0, NULL); - ok1(tdb); - - key.dsize = strlen("hi"); - key.dptr = (void *)"hi"; - ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); - ok1(data.dsize == strlen("world")); - ok1(memcmp(data.dptr, "world", strlen("world")) == 0); - free(data.dptr); - tdb_close(tdb); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-hashsize.c b/ccan/tdb2/test/run-tdb1-hashsize.c deleted file mode 100644 index 8a78196c..00000000 --- a/ccan/tdb2/test/run-tdb1-hashsize.c +++ /dev/null @@ -1,61 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - union tdb_attribute hsize, h2; - - hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - hsize.base.next = &tap_log_attr; - hsize.tdb1_hashsize.hsize = 1024; - - plan_tests(14); - tdb = tdb_open("run-tdb1-hashsize.tdb1", TDB_VERSION1, - O_CREAT|O_TRUNC|O_RDWR, 0600, &hsize); - ok1(tdb); - h2.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - ok1(tdb_get_attribute(tdb, &h2) == TDB_SUCCESS); - ok1(h2.tdb1_hashsize.hsize == hsize.tdb1_hashsize.hsize); - tdb_close(tdb); - - /* Can't specify TDB_ATTRIBUTE_TDB1_HASHSIZE without O_CREAT */ - tdb = tdb_open("run-tdb1-hashsize.tdb1", TDB_VERSION1, - O_RDWR, 0600, &hsize); - ok1(!tdb); - ok1(tap_log_messages == 1); - - /* Can't specify TDB_ATTRIBUTE_TDB1_HASHSIZE for version2. */ - tdb = tdb_open("run-tdb1-hashsize.tdb", TDB_DEFAULT, - O_CREAT|O_TRUNC|O_RDWR, 0600, &hsize); - ok1(!tdb); - ok1(tap_log_messages == 2); - - /* We can get attribute even if we didn't set it though. 
*/ - tdb = tdb_open("run-tdb1-hashsize.tdb1", TDB_DEFAULT, - O_RDWR, 0600, &tap_log_attr); - - ok1(tdb); - memset(&h2, 0, sizeof(h2)); - h2.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - ok1(tdb_get_attribute(tdb, &h2) == TDB_SUCCESS); - ok1(h2.tdb1_hashsize.hsize == hsize.tdb1_hashsize.hsize); - tdb_close(tdb); - - /* Check for default hash size. */ - tdb = tdb_open("run-tdb1-hashsize.tdb1", TDB_VERSION1, - O_CREAT|O_TRUNC|O_RDWR, 0600, &tap_log_attr); - - ok1(tdb); - memset(&h2, 0, sizeof(h2)); - h2.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - ok1(tdb_get_attribute(tdb, &h2) == TDB_SUCCESS); - ok1(h2.tdb1_hashsize.hsize == TDB1_DEFAULT_HASH_SIZE); - tdb_close(tdb); - ok1(tap_log_messages == 2); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-incompatible.c b/ccan/tdb2/test/run-tdb1-incompatible.c deleted file mode 100644 index 46ab5669..00000000 --- a/ccan/tdb2/test/run-tdb1-incompatible.c +++ /dev/null @@ -1,213 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include - -static uint64_t tdb1_dumb_hash(const void *key, size_t len, uint64_t seed, - void *unused) -{ - return len; -} - -static void log_fn(struct tdb_context *tdb, enum tdb_log_level level, - enum TDB_ERROR ecode, const char *message, void *priv) -{ - unsigned int *count = priv; - if (strstr(message, "hash")) - (*count)++; -} - -static unsigned int hdr_rwlocks(const char *fname) -{ - struct tdb1_header hdr; - - int fd = open(fname, O_RDONLY); - if (fd == -1) - return -1; - - if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) - return -1; - - close(fd); - return hdr.rwlocks; -} - -static uint64_t jenkins_hashfn(const void *key, size_t len, uint64_t seed, - void *unused) -{ - return hashlittle(key, len); -} - -static uint64_t old_hash(const void *key, size_t len, uint64_t seed, - void *unused) -{ - return tdb1_old_hash(key, len, seed, unused); -} - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - unsigned int log_count, flags; - TDB_DATA d; - union tdb_attribute log_attr, jhash_attr, ohash_attr, - incompat_hash_attr, dumbhash_attr; - - log_attr.base.attr = TDB_ATTRIBUTE_LOG; - log_attr.base.next = NULL; - log_attr.log.fn = log_fn; - log_attr.log.data = &log_count; - - jhash_attr.base.attr = TDB_ATTRIBUTE_HASH; - jhash_attr.base.next = &log_attr; - jhash_attr.hash.fn = jenkins_hashfn; - - ohash_attr.base.attr = TDB_ATTRIBUTE_HASH; - ohash_attr.base.next = &log_attr; - ohash_attr.hash.fn = old_hash; - - incompat_hash_attr.base.attr = TDB_ATTRIBUTE_HASH; - incompat_hash_attr.base.next = &log_attr; - incompat_hash_attr.hash.fn = tdb1_incompatible_hash; - - dumbhash_attr.base.attr = TDB_ATTRIBUTE_HASH; - dumbhash_attr.base.next = &log_attr; - dumbhash_attr.hash.fn = tdb1_dumb_hash; - - plan_tests(42 * 2); - - for (flags = 0; flags <= TDB_CONVERT; flags += TDB_CONVERT) { - unsigned int rwmagic = TDB1_HASH_RWLOCK_MAGIC; - - if (flags & TDB_CONVERT) - tdb1_convert(&rwmagic, sizeof(rwmagic)); - - /* Create an old-style hash. */ - log_count = 0; - tdb = tdb_open("run-incompatible.tdb1", flags|TDB_VERSION1, - O_CREAT|O_RDWR|O_TRUNC, 0600, &log_attr); - ok1(tdb); - ok1(log_count == 0); - d.dptr = (void *)"Hello"; - d.dsize = 5; - ok1(tdb_store(tdb, d, d, TDB_INSERT) == TDB_SUCCESS); - tdb_close(tdb); - - /* Should not have marked rwlocks field. */ - ok1(hdr_rwlocks("run-incompatible.tdb1") == 0); - - /* We can still open any old-style with incompat hash. 
*/ - log_count = 0; - tdb = tdb_open("run-incompatible.tdb1", - TDB_VERSION1, - O_RDWR, 0600, &incompat_hash_attr); - ok1(tdb); - ok1(log_count == 0); - ok1(tdb_fetch(tdb, d, &d) == TDB_SUCCESS); - ok1(d.dsize == 5); - free(d.dptr); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - log_count = 0; - tdb = tdb_open("test/jenkins-le-hash.tdb1", - TDB_VERSION1, O_RDONLY, 0, &jhash_attr); - ok1(tdb); - ok1(log_count == 0); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - log_count = 0; - tdb = tdb_open("test/jenkins-be-hash.tdb1", - TDB_VERSION1, O_RDONLY, 0, &jhash_attr); - ok1(tdb); - ok1(log_count == 0); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - /* OK, now create with incompatible hash. */ - log_count = 0; - tdb = tdb_open("run-incompatible.tdb1", - flags|TDB_VERSION1, - O_CREAT|O_RDWR|O_TRUNC, 0600, - &incompat_hash_attr); - ok1(tdb); - ok1(log_count == 0); - d.dptr = (void *)"Hello"; - d.dsize = 5; - ok1(tdb_store(tdb, d, d, TDB_INSERT) == TDB_SUCCESS); - tdb_close(tdb); - - /* Should have marked rwlocks field. */ - ok1(hdr_rwlocks("run-incompatible.tdb1") == rwmagic); - - /* Cannot open with old hash. */ - log_count = 0; - tdb = tdb_open("run-incompatible.tdb1", TDB_VERSION1, - O_RDWR, 0600, &ohash_attr); - ok1(!tdb); - ok1(log_count == 1); - - /* Can open with jenkins hash. */ - log_count = 0; - tdb = tdb_open("run-incompatible.tdb1", TDB_VERSION1, - O_RDWR, 0600, &jhash_attr); - ok1(tdb); - ok1(log_count == 0); - ok1(tdb_fetch(tdb, d, &d) == TDB_SUCCESS); - ok1(d.dsize == 5); - free(d.dptr); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - /* Can open by letting it figure it out itself. */ - log_count = 0; - tdb = tdb_open("run-incompatible.tdb1", TDB_VERSION1, - O_RDWR, 0600, &log_attr); - ok1(tdb); - ok1(log_count == 0); - d.dptr = (void *)"Hello"; - d.dsize = 5; - ok1(tdb_fetch(tdb, d, &d) == TDB_SUCCESS); - ok1(d.dsize == 5); - free(d.dptr); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - /* FIXME: Not possible with TDB2 :( */ - /* We can also use incompatible hash with other hashes. */ - log_count = 0; - tdb = tdb_open("run-incompatible.tdb1", - flags|TDB_VERSION1, - O_CREAT|O_RDWR|O_TRUNC, 0600, &dumbhash_attr); - ok1(tdb); - ok1(log_count == 0); - d.dptr = (void *)"Hello"; - d.dsize = 5; - ok1(tdb_store(tdb, d, d, TDB_INSERT) == TDB_SUCCESS); - tdb_close(tdb); - - /* FIXME: Should have marked rwlocks field. */ - ok1(hdr_rwlocks("run-incompatible.tdb1") != rwmagic); - - /* It should not open if we don't specify. */ - log_count = 0; - tdb = tdb_open("run-incompatible.tdb1", TDB_VERSION1, O_RDWR, 0, - &log_attr); - ok1(!tdb); - ok1(log_count == 1); - - /* Should reopen with correct hash. 
*/ - log_count = 0; - tdb = tdb_open("run-incompatible.tdb1", TDB_VERSION1, O_RDWR, 0, - &dumbhash_attr); - ok1(tdb); - ok1(log_count == 0); - ok1(tdb_fetch(tdb, d, &d) == TDB_SUCCESS); - ok1(d.dsize == 5); - free(d.dptr); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - } - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-nested-transactions.c b/ccan/tdb2/test/run-tdb1-nested-transactions.c deleted file mode 100644 index 149e5318..00000000 --- a/ccan/tdb2/test/run-tdb1-nested-transactions.c +++ /dev/null @@ -1,73 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - TDB_DATA key, data; - union tdb_attribute hsize; - - hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - hsize.base.next = &tap_log_attr; - hsize.tdb1_hashsize.hsize = 1024; - - plan_tests(30); - key.dsize = strlen("hi"); - key.dptr = (void *)"hi"; - - tdb = tdb_open("run-nested-transactions.tdb1", - TDB_VERSION1, O_CREAT|O_TRUNC|O_RDWR, 0600, &hsize); - ok1(tdb); - - /* No nesting by default. */ - ok1(tdb_transaction_start(tdb) == TDB_SUCCESS); - data.dptr = (void *)"world"; - data.dsize = strlen("world"); - ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_SUCCESS); - ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); - ok1(data.dsize == strlen("world")); - ok1(memcmp(data.dptr, "world", strlen("world")) == 0); - free(data.dptr); - ok1(tdb_transaction_start(tdb) == TDB_ERR_EINVAL); - ok1(tap_log_messages == 1); - - ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); - ok1(data.dsize == strlen("world")); - ok1(memcmp(data.dptr, "world", strlen("world")) == 0); - free(data.dptr); - ok1(tdb_transaction_commit(tdb) == TDB_SUCCESS); - ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); - ok1(data.dsize == strlen("world")); - ok1(memcmp(data.dptr, "world", strlen("world")) == 0); - free(data.dptr); - tdb_close(tdb); - - tdb = tdb_open("run-nested-transactions.tdb1", - TDB_ALLOW_NESTING, O_RDWR, 0, &tap_log_attr); - ok1(tdb); - - ok1(tdb_transaction_start(tdb) == TDB_SUCCESS); - ok1(tdb_transaction_start(tdb) == TDB_SUCCESS); - ok1(tdb_delete(tdb, key) == TDB_SUCCESS); - ok1(tdb_transaction_commit(tdb) == TDB_SUCCESS); - ok1(!tdb_exists(tdb, key)); - tdb_transaction_cancel(tdb); - ok1(tap_log_messages == 1); - /* Surprise! Kills inner "committed" transaction. 
*/ - ok1(tdb_exists(tdb, key)); - - ok1(tdb_transaction_start(tdb) == TDB_SUCCESS); - ok1(tdb_transaction_start(tdb) == TDB_SUCCESS); - ok1(tdb_delete(tdb, key) == TDB_SUCCESS); - ok1(tdb_transaction_commit(tdb) == TDB_SUCCESS); - ok1(!tdb_exists(tdb, key)); - ok1(tdb_transaction_commit(tdb) == TDB_SUCCESS); - ok1(!tdb_exists(tdb, key)); - tdb_close(tdb); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-nested-traverse.c b/ccan/tdb2/test/run-tdb1-nested-traverse.c deleted file mode 100644 index cf5aa4a2..00000000 --- a/ccan/tdb2/test/run-tdb1-nested-traverse.c +++ /dev/null @@ -1,87 +0,0 @@ -#include "tdb1-lock-tracking.h" -#define fcntl fcntl_with_lockcheck1 -#include "tdb2-source.h" -#include -#undef fcntl -#include -#include -#include -#include "tdb1-external-agent.h" -#include "logging.h" - -static struct agent *agent; - -static bool correct_key(TDB_DATA key) -{ - return key.dsize == strlen("hi") - && memcmp(key.dptr, "hi", key.dsize) == 0; -} - -static bool correct_data(TDB_DATA data) -{ - return data.dsize == strlen("world") - && memcmp(data.dptr, "world", data.dsize) == 0; -} - -static int traverse2(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, - void *p) -{ - ok1(correct_key(key)); - ok1(correct_data(data)); - return 0; -} - -static int traverse1(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, - void *p) -{ - ok1(correct_key(key)); - ok1(correct_data(data)); - ok1(external_agent_operation1(agent, TRANSACTION_START, tdb->name) - == WOULD_HAVE_BLOCKED); - tdb_traverse(tdb, traverse2, NULL); - - /* That should *not* release the transaction lock! */ - ok1(external_agent_operation1(agent, TRANSACTION_START, tdb->name) - == WOULD_HAVE_BLOCKED); - return 0; -} - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - TDB_DATA key, data; - union tdb_attribute hsize; - - hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - hsize.base.next = &tap_log_attr; - hsize.tdb1_hashsize.hsize = 1024; - - plan_tests(17); - agent = prepare_external_agent1(); - if (!agent) - err(1, "preparing agent"); - - tdb = tdb_open("run-nested-traverse.tdb1", TDB_VERSION1, - O_CREAT|O_TRUNC|O_RDWR, 0600, &hsize); - ok1(tdb); - - ok1(external_agent_operation1(agent, OPEN, tdb->name) == SUCCESS); - ok1(external_agent_operation1(agent, TRANSACTION_START, tdb->name) - == SUCCESS); - ok1(external_agent_operation1(agent, TRANSACTION_COMMIT, tdb->name) - == SUCCESS); - - key.dsize = strlen("hi"); - key.dptr = (void *)"hi"; - data.dptr = (void *)"world"; - data.dsize = strlen("world"); - - ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_SUCCESS); - tdb_traverse(tdb, traverse1, NULL); - tdb_add_flag(tdb, TDB_RDONLY); - tdb_traverse(tdb, traverse1, NULL); - tdb_remove_flag(tdb, TDB_RDONLY); - tdb_close(tdb); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-no-lock-during-traverse.c b/ccan/tdb2/test/run-tdb1-no-lock-during-traverse.c deleted file mode 100644 index b2b7a781..00000000 --- a/ccan/tdb2/test/run-tdb1-no-lock-during-traverse.c +++ /dev/null @@ -1,111 +0,0 @@ -#include -#include -#include "tdb1-lock-tracking.h" - -#define fcntl fcntl_with_lockcheck1 - -#include "tdb2-source.h" -#include -#include -#include -#include "logging.h" - -#undef fcntl - -#define NUM_ENTRIES 10 - -static bool prepare_entries(struct tdb_context *tdb) -{ - unsigned int i; - TDB_DATA key, data; - - for (i = 0; i < NUM_ENTRIES; i++) { - key.dsize = sizeof(i); - key.dptr = (void *)&i; - data.dsize = strlen("world"); - data.dptr = (void *)"world"; - - if (tdb_store(tdb, key, data, 0) != 
TDB_SUCCESS) - return false; - } - return true; -} - -static void delete_entries(struct tdb_context *tdb) -{ - unsigned int i; - TDB_DATA key; - - for (i = 0; i < NUM_ENTRIES; i++) { - key.dsize = sizeof(i); - key.dptr = (void *)&i; - - ok1(tdb_delete(tdb, key) == TDB_SUCCESS); - } -} - -/* We don't know how many times this will run. */ -static int delete_other(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, - void *private_data) -{ - unsigned int i; - memcpy(&i, key.dptr, 4); - i = (i + 1) % NUM_ENTRIES; - key.dptr = (void *)&i; - if (tdb_delete(tdb, key) != TDB_SUCCESS) - (*(int *)private_data)++; - return 0; -} - -static int delete_self(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, - void *private_data) -{ - ok1(tdb_delete(tdb, key) == TDB_SUCCESS); - return 0; -} - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - int errors = 0; - union tdb_attribute hsize; - - hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - hsize.base.next = &tap_log_attr; - hsize.tdb1_hashsize.hsize = 1024; - - plan_tests(40); - tdb = tdb_open("run-no-lock-during-traverse.tdb1", - TDB_VERSION1, O_CREAT|O_TRUNC|O_RDWR, - 0600, &hsize); - - ok1(tdb); - ok1(prepare_entries(tdb)); - ok1(locking_errors1 == 0); - ok1(tdb_lockall(tdb) == 0); - ok1(locking_errors1 == 0); - ok1(tdb_traverse(tdb, delete_other, &errors) >= 0); - ok1(errors == 0); - ok1(locking_errors1 == 0); - tdb_unlockall(tdb); - - ok1(prepare_entries(tdb)); - ok1(locking_errors1 == 0); - ok1(tdb_lockall(tdb) == 0); - ok1(locking_errors1 == 0); - ok1(tdb_traverse(tdb, delete_self, NULL) == NUM_ENTRIES); - ok1(locking_errors1 == 0); - tdb_unlockall(tdb); - - ok1(prepare_entries(tdb)); - ok1(locking_errors1 == 0); - ok1(tdb_lockall(tdb) == 0); - ok1(locking_errors1 == 0); - delete_entries(tdb); - ok1(locking_errors1 == 0); - tdb_unlockall(tdb); - - ok1(tdb_close(tdb) == 0); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-oldhash.c b/ccan/tdb2/test/run-tdb1-oldhash.c deleted file mode 100644 index f9cffa25..00000000 --- a/ccan/tdb2/test/run-tdb1-oldhash.c +++ /dev/null @@ -1,45 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - union tdb_attribute incompat_hash_attr; - - incompat_hash_attr.base.attr = TDB_ATTRIBUTE_HASH; - incompat_hash_attr.base.next = &tap_log_attr; - incompat_hash_attr.hash.fn = tdb1_incompatible_hash; - - plan_tests(8); - - /* Old format (with zeroes in the hash magic fields) should - * open with any hash (since we don't know what hash they used). 
*/ - tdb = tdb_open("test/old-nohash-le.tdb1", TDB_VERSION1, O_RDWR, 0, - &tap_log_attr); - ok1(tdb); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - tdb = tdb_open("test/old-nohash-be.tdb1", TDB_VERSION1, O_RDWR, 0, - &tap_log_attr); - ok1(tdb); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - tdb = tdb_open("test/old-nohash-le.tdb1", TDB_VERSION1, O_RDWR, 0, - &incompat_hash_attr); - ok1(tdb); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - tdb = tdb_open("test/old-nohash-be.tdb1", TDB_VERSION1, O_RDWR, 0, - &incompat_hash_attr); - ok1(tdb); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-readonly-check.c b/ccan/tdb2/test/run-tdb1-readonly-check.c deleted file mode 100644 index f42a8f5e..00000000 --- a/ccan/tdb2/test/run-tdb1-readonly-check.c +++ /dev/null @@ -1,47 +0,0 @@ -/* We should be able to tdb_check a O_RDONLY tdb, and we were previously allowed - * to tdb_check() inside a transaction (though that's paranoia!). */ -#include "tdb2-source.h" -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - TDB_DATA key, data; - union tdb_attribute hsize; - - hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - hsize.base.next = &tap_log_attr; - hsize.tdb1_hashsize.hsize = 1024; - - plan_tests(10); - tdb = tdb_open("run-readonly-check.tdb1", - TDB_VERSION1, - O_CREAT|O_TRUNC|O_RDWR, 0600, &hsize); - - ok1(tdb); - key.dsize = strlen("hi"); - key.dptr = (void *)"hi"; - data.dsize = strlen("world"); - data.dptr = (void *)"world"; - - ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_SUCCESS); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - - /* We are also allowed to do a check inside a transaction. */ - ok1(tdb_transaction_start(tdb) == TDB_SUCCESS); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - ok1(tdb_close(tdb) == 0); - - tdb = tdb_open("run-readonly-check.tdb1", - TDB_DEFAULT, O_RDONLY, 0, &tap_log_attr); - - ok1(tdb); - ok1(tdb_store(tdb, key, data, TDB_MODIFY) == TDB_ERR_RDONLY); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - ok1(tdb_close(tdb) == 0); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-rwlock-check.c b/ccan/tdb2/test/run-tdb1-rwlock-check.c deleted file mode 100644 index 44a2eeb8..00000000 --- a/ccan/tdb2/test/run-tdb1-rwlock-check.c +++ /dev/null @@ -1,42 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include - -static void log_fn(struct tdb_context *tdb, enum tdb_log_level level, - enum TDB_ERROR ecode, const char *message, void *priv) -{ - unsigned int *count = priv; - if (strstr(message, "spinlocks")) - (*count)++; -} - -/* The code should barf on TDBs created with rwlocks. */ -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - unsigned int log_count; - union tdb_attribute log_attr; - - log_attr.base.attr = TDB_ATTRIBUTE_LOG; - log_attr.base.next = NULL; - log_attr.log.fn = log_fn; - log_attr.log.data = &log_count; - - plan_tests(4); - - /* We should fail to open rwlock-using tdbs of either endian. 
*/ - log_count = 0; - tdb = tdb_open("test/rwlock-le.tdb1", TDB_VERSION1, O_RDWR, 0, - &log_attr); - ok1(!tdb); - ok1(log_count == 1); - - log_count = 0; - tdb = tdb_open("test/rwlock-be.tdb1", TDB_VERSION1, O_RDWR, 0, - &log_attr); - ok1(!tdb); - ok1(log_count == 1); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-seqnum-wrap.c b/ccan/tdb2/test/run-tdb1-seqnum-wrap.c deleted file mode 100644 index c3eb278e..00000000 --- a/ccan/tdb2/test/run-tdb1-seqnum-wrap.c +++ /dev/null @@ -1,39 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - unsigned int i; - struct tdb1_header hdr; - struct tdb_data key = { (unsigned char *)&hdr, sizeof(hdr) }; - struct tdb_data data = { (unsigned char *)&hdr, sizeof(hdr) }; - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 7); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-tdb1-seqnum-wrap.tdb1", - flags[i]|TDB_VERSION1|TDB_SEQNUM, - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - ok1(tdb); - if (!tdb) - break; - ok1(pread(tdb->file->fd, &hdr, sizeof(hdr), 0) == sizeof(hdr)); - hdr.sequence_number = 0xFFFFFFFF; - ok1(pwrite(tdb->file->fd, &hdr, sizeof(hdr), 0) == sizeof(hdr)); - - /* Must not be negative: that would mean an error! */ - ok1(tdb_get_seqnum(tdb) == 0xFFFFFFFF); - - ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_SUCCESS); - ok1(tdb_get_seqnum(tdb) == 0); - tdb_close(tdb); - ok1(tap_log_messages == 0); - } - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-summary.c b/ccan/tdb2/test/run-tdb1-summary.c deleted file mode 100644 index 5107b8e4..00000000 --- a/ccan/tdb2/test/run-tdb1-summary.c +++ /dev/null @@ -1,56 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include - -int main(int argc, char *argv[]) -{ - unsigned int i, j; - struct tdb_context *tdb; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT }; - TDB_DATA key = { (unsigned char *)&j, sizeof(j) }; - TDB_DATA data = { (unsigned char *)&j, sizeof(j) }; - char *summary; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 14); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-summary.tdb1", flags[i]|TDB_VERSION1, - O_RDWR|O_CREAT|O_TRUNC, 0600, NULL); - ok1(tdb); - if (!tdb) - continue; - - /* Put some stuff in there. */ - for (j = 0; j < 500; j++) { - /* Make sure padding varies to we get some graphs! 
*/ - data.dsize = j % (sizeof(j) + 1); - if (tdb_store(tdb, key, data, TDB_REPLACE) - != TDB_SUCCESS) { - fail("Storing in tdb"); - } - } - - summary = tdb1_summary(tdb); - diag("%s", summary); - ok1(strstr(summary, "Size of file/data: ")); - ok1(strstr(summary, "Number of records: 500\n")); - ok1(strstr(summary, "Smallest/average/largest keys: 4/4/4\n")); - ok1(strstr(summary, "Smallest/average/largest data: 0/2/4\n")); - ok1(strstr(summary, "Smallest/average/largest padding: ")); - ok1(strstr(summary, "Number of dead records: 0\n")); - ok1(strstr(summary, "Number of free records: 1\n")); - ok1(strstr(summary, "Smallest/average/largest free records: ")); - ok1(strstr(summary, "Number of hash chains: 131\n")); - ok1(strstr(summary, "Smallest/average/largest hash chains: ")); - ok1(strstr(summary, "Number of uncoalesced records: 0\n")); - ok1(strstr(summary, "Smallest/average/largest uncoalesced runs: 0/0/0\n")); - ok1(strstr(summary, "Percentage keys/data/padding/free/dead/rechdrs&tailers/hashes: ")); - - free(summary); - tdb_close(tdb); - } - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-traverse-in-transaction.c b/ccan/tdb2/test/run-tdb1-traverse-in-transaction.c deleted file mode 100644 index 691aaf94..00000000 --- a/ccan/tdb2/test/run-tdb1-traverse-in-transaction.c +++ /dev/null @@ -1,85 +0,0 @@ -#include "config.h" -#include "tdb1-lock-tracking.h" -#define fcntl fcntl_with_lockcheck1 -#include "tdb2-source.h" -#include -#undef fcntl_with_lockcheck -#include -#include -#include -#include "tdb1-external-agent.h" -#include "logging.h" - -static struct agent *agent; - -static bool correct_key(TDB_DATA key) -{ - return key.dsize == strlen("hi") - && memcmp(key.dptr, "hi", key.dsize) == 0; -} - -static bool correct_data(TDB_DATA data) -{ - return data.dsize == strlen("world") - && memcmp(data.dptr, "world", data.dsize) == 0; -} - -static int traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, - void *p) -{ - ok1(correct_key(key)); - ok1(correct_data(data)); - return 0; -} - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - TDB_DATA key, data; - union tdb_attribute hsize; - - hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - hsize.base.next = &tap_log_attr; - hsize.tdb1_hashsize.hsize = 1024; - - plan_tests(13); - agent = prepare_external_agent1(); - if (!agent) - err(1, "preparing agent"); - - tdb = tdb_open("run-traverse-in-transaction.tdb1", - TDB_VERSION1, O_CREAT|O_TRUNC|O_RDWR, - 0600, &hsize); - ok1(tdb); - - key.dsize = strlen("hi"); - key.dptr = (void *)"hi"; - data.dptr = (void *)"world"; - data.dsize = strlen("world"); - - ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_SUCCESS); - - ok1(external_agent_operation1(agent, OPEN, tdb->name) == SUCCESS); - - ok1(tdb_transaction_start(tdb) == TDB_SUCCESS); - ok1(external_agent_operation1(agent, TRANSACTION_START, tdb->name) - == WOULD_HAVE_BLOCKED); - tdb_traverse(tdb, traverse, NULL); - - /* That should *not* release the transaction lock! */ - ok1(external_agent_operation1(agent, TRANSACTION_START, tdb->name) - == WOULD_HAVE_BLOCKED); - tdb_traverse(tdb, traverse, NULL); - - /* That should *not* release the transaction lock! */ - ok1(external_agent_operation1(agent, TRANSACTION_START, tdb->name) - == WOULD_HAVE_BLOCKED); - ok1(tdb_transaction_commit(tdb) == TDB_SUCCESS); - /* Now we should be fine. 
*/ - ok1(external_agent_operation1(agent, TRANSACTION_START, tdb->name) - == SUCCESS); - - tdb_close(tdb); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-wronghash-fail.c b/ccan/tdb2/test/run-tdb1-wronghash-fail.c deleted file mode 100644 index 63c1bdf1..00000000 --- a/ccan/tdb2/test/run-tdb1-wronghash-fail.c +++ /dev/null @@ -1,143 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include - -static void log_fn(struct tdb_context *tdb, enum tdb_log_level level, - enum TDB_ERROR ecode, const char *message, void *priv) -{ - unsigned int *count = priv; - if (strstr(message, "hash")) - (*count)++; -} - -static uint64_t jenkins_hashfn(const void *key, size_t len, uint64_t seed, - void *unused) -{ - return hashlittle(key, len); -} - -/* the tdb1_old_hash function is "magic" as it automatically makes us test the - * tdb1_incompatible_hash as well, so use this wrapper. */ -static uint64_t old_hash(const void *key, size_t len, uint64_t seed, - void *unused) -{ - return tdb1_old_hash(key, len, seed, unused); -} - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - unsigned int log_count; - TDB_DATA d; - union tdb_attribute log_attr, jhash_attr, ohash_attr, - incompat_hash_attr; - - log_attr.base.attr = TDB_ATTRIBUTE_LOG; - log_attr.base.next = NULL; - log_attr.log.fn = log_fn; - log_attr.log.data = &log_count; - - jhash_attr.base.attr = TDB_ATTRIBUTE_HASH; - jhash_attr.base.next = &log_attr; - jhash_attr.hash.fn = jenkins_hashfn; - - ohash_attr.base.attr = TDB_ATTRIBUTE_HASH; - ohash_attr.base.next = &log_attr; - ohash_attr.hash.fn = old_hash; - - incompat_hash_attr.base.attr = TDB_ATTRIBUTE_HASH; - incompat_hash_attr.base.next = &log_attr; - incompat_hash_attr.hash.fn = tdb1_incompatible_hash; - - plan_tests(28); - - /* Create with default hash. */ - log_count = 0; - tdb = tdb_open("run-wronghash-fail.tdb1", TDB_VERSION1, - O_CREAT|O_RDWR|O_TRUNC, 0600, &log_attr); - ok1(tdb); - ok1(log_count == 0); - d.dptr = (void *)"Hello"; - d.dsize = 5; - ok1(tdb_store(tdb, d, d, TDB_INSERT) == TDB_SUCCESS); - tdb_close(tdb); - - /* Fail to open with different hash. */ - tdb = tdb_open("run-wronghash-fail.tdb1", TDB_VERSION1, O_RDWR, 0, - &jhash_attr); - ok1(!tdb); - ok1(log_count == 1); - - /* Create with different hash. */ - log_count = 0; - tdb = tdb_open("run-wronghash-fail.tdb1", TDB_VERSION1, - O_CREAT|O_RDWR|O_TRUNC, 0600, &jhash_attr); - ok1(tdb); - ok1(log_count == 0); - tdb_close(tdb); - - /* Endian should be no problem. */ - log_count = 0; - tdb = tdb_open("test/jenkins-le-hash.tdb1", TDB_VERSION1, O_RDWR, 0, - &ohash_attr); - ok1(!tdb); - ok1(log_count == 1); - - log_count = 0; - tdb = tdb_open("test/jenkins-be-hash.tdb1", TDB_VERSION1, O_RDWR, 0, - &ohash_attr); - ok1(!tdb); - ok1(log_count == 1); - - log_count = 0; - /* Fail to open with old default hash. */ - tdb = tdb_open("run-wronghash-fail.tdb1", TDB_VERSION1, O_RDWR, 0, - &ohash_attr); - ok1(!tdb); - ok1(log_count == 1); - - log_count = 0; - tdb = tdb_open("test/jenkins-le-hash.tdb1", TDB_VERSION1, O_RDONLY, - 0, &incompat_hash_attr); - ok1(tdb); - ok1(log_count == 0); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - log_count = 0; - tdb = tdb_open("test/jenkins-be-hash.tdb1", TDB_VERSION1, O_RDONLY, - 0, &incompat_hash_attr); - ok1(tdb); - ok1(log_count == 0); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - /* It should open with jenkins hash if we don't specify. 
*/ - log_count = 0; - tdb = tdb_open("test/jenkins-le-hash.tdb1", TDB_VERSION1, O_RDWR, 0, - &log_attr); - ok1(tdb); - ok1(log_count == 0); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - log_count = 0; - tdb = tdb_open("test/jenkins-be-hash.tdb1", TDB_VERSION1, O_RDWR, 0, - &log_attr); - ok1(tdb); - ok1(log_count == 0); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - log_count = 0; - tdb = tdb_open("run-wronghash-fail.tdb1", TDB_VERSION1, O_RDONLY, - 0, &log_attr); - ok1(tdb); - ok1(log_count == 0); - ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS); - tdb_close(tdb); - - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1-zero-append.c b/ccan/tdb2/test/run-tdb1-zero-append.c deleted file mode 100644 index fdc9cdce..00000000 --- a/ccan/tdb2/test/run-tdb1-zero-append.c +++ /dev/null @@ -1,36 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - TDB_DATA key, data; - union tdb_attribute hsize; - - hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - hsize.base.next = &tap_log_attr; - hsize.tdb1_hashsize.hsize = 1024; - - plan_tests(5); - tdb = tdb_open(NULL, TDB_INTERNAL|TDB_VERSION1, O_CREAT|O_TRUNC|O_RDWR, - 0600, &hsize); - ok1(tdb); - - /* Tickle bug on appending zero length buffer to zero length buffer. */ - key.dsize = strlen("hi"); - key.dptr = (void *)"hi"; - data.dptr = (void *)"world"; - data.dsize = 0; - - ok1(tdb_append(tdb, key, data) == TDB_SUCCESS); - ok1(tdb_append(tdb, key, data) == TDB_SUCCESS); - ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); - ok1(data.dsize == 0); - free(data.dptr); - tdb_close(tdb); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb1.c b/ccan/tdb2/test/run-tdb1.c deleted file mode 100644 index dca6473b..00000000 --- a/ccan/tdb2/test/run-tdb1.c +++ /dev/null @@ -1,42 +0,0 @@ -#include "tdb2-source.h" -#include -#include -#include -#include "logging.h" - -int main(int argc, char *argv[]) -{ - struct tdb_context *tdb; - TDB_DATA key, data; - union tdb_attribute hsize; - - hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE; - hsize.base.next = &tap_log_attr; - hsize.tdb1_hashsize.hsize = 1024; - - plan_tests(9); - tdb = tdb_open("run.tdb1", TDB_VERSION1, - O_CREAT|O_TRUNC|O_RDWR, 0600, &hsize); - - ok1(tdb); - key.dsize = strlen("hi"); - key.dptr = (void *)"hi"; - data.dsize = strlen("world"); - data.dptr = (void *)"world"; - - ok1(tdb_store(tdb, key, data, TDB_MODIFY) == TDB_ERR_NOEXIST); - ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_SUCCESS); - ok1(tdb_store(tdb, key, data, TDB_INSERT) == TDB_ERR_EXISTS); - ok1(tdb_store(tdb, key, data, TDB_MODIFY) == TDB_SUCCESS); - - ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS); - ok1(data.dsize == strlen("world")); - ok1(memcmp(data.dptr, "world", strlen("world")) == 0); - free(data.dptr); - - key.dsize++; - ok1(tdb_fetch(tdb, key, &data) == TDB_ERR_NOEXIST); - tdb_close(tdb); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb_errorstr.c b/ccan/tdb2/test/run-tdb_errorstr.c deleted file mode 100644 index 742fb891..00000000 --- a/ccan/tdb2/test/run-tdb_errorstr.c +++ /dev/null @@ -1,52 +0,0 @@ -#include "tdb2-source.h" -#include - -int main(int argc, char *argv[]) -{ - enum TDB_ERROR err; - plan_tests(TDB_ERR_RDONLY*-1 + 2); - - for (err = TDB_SUCCESS; err >= TDB_ERR_RDONLY; err--) { - switch (err) { - case TDB_SUCCESS: - ok1(!strcmp(tdb_errorstr(err), - "Success")); - break; - case TDB_ERR_IO: - ok1(!strcmp(tdb_errorstr(err), - "IO 
Error")); - break; - case TDB_ERR_LOCK: - ok1(!strcmp(tdb_errorstr(err), - "Locking error")); - break; - case TDB_ERR_OOM: - ok1(!strcmp(tdb_errorstr(err), - "Out of memory")); - break; - case TDB_ERR_EXISTS: - ok1(!strcmp(tdb_errorstr(err), - "Record exists")); - break; - case TDB_ERR_EINVAL: - ok1(!strcmp(tdb_errorstr(err), - "Invalid parameter")); - break; - case TDB_ERR_NOEXIST: - ok1(!strcmp(tdb_errorstr(err), - "Record does not exist")); - break; - case TDB_ERR_RDONLY: - ok1(!strcmp(tdb_errorstr(err), - "write not permitted")); - break; - case TDB_ERR_CORRUPT: - ok1(!strcmp(tdb_errorstr(err), - "Corrupt database")); - break; - } - } - ok1(!strcmp(tdb_errorstr(err), "Invalid error code")); - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-tdb_foreach.c b/ccan/tdb2/test/run-tdb_foreach.c deleted file mode 100644 index b17f0780..00000000 --- a/ccan/tdb2/test/run-tdb_foreach.c +++ /dev/null @@ -1,86 +0,0 @@ -#include "tdb2-source.h" -#include -#include "logging.h" - -static int drop_count(struct tdb_context *tdb, unsigned int *count) -{ - if (--(*count) == 0) - return 1; - return 0; -} - -static int set_found(struct tdb_context *tdb, bool found[3]) -{ - unsigned int idx; - - if (strcmp(tdb_name(tdb), "run-tdb_foreach0.tdb") == 0) - idx = 0; - else if (strcmp(tdb_name(tdb), "run-tdb_foreach1.tdb") == 0) - idx = 1; - else if (strcmp(tdb_name(tdb), "run-tdb_foreach2.tdb") == 0) - idx = 2; - else - abort(); - - if (found[idx]) - abort(); - found[idx] = true; - return 0; -} - -int main(int argc, char *argv[]) -{ - unsigned int i, count; - bool found[3]; - struct tdb_context *tdb0, *tdb1, *tdb2; - int flags[] = { TDB_DEFAULT, TDB_NOMMAP, - TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT }; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 8); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb0 = tdb_open("run-tdb_foreach0.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - tdb1 = tdb_open("run-tdb_foreach1.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - tdb2 = tdb_open("run-tdb_foreach2.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); - - memset(found, 0, sizeof(found)); - tdb_foreach(set_found, found); - ok1(found[0] && found[1] && found[2]); - - /* Test premature iteration termination */ - count = 1; - tdb_foreach(drop_count, &count); - ok1(count == 0); - - tdb_close(tdb1); - memset(found, 0, sizeof(found)); - tdb_foreach(set_found, found); - ok1(found[0] && !found[1] && found[2]); - - tdb_close(tdb2); - memset(found, 0, sizeof(found)); - tdb_foreach(set_found, found); - ok1(found[0] && !found[1] && !found[2]); - - tdb1 = tdb_open("run-tdb_foreach1.tdb", flags[i], - O_RDWR, 0600, &tap_log_attr); - memset(found, 0, sizeof(found)); - tdb_foreach(set_found, found); - ok1(found[0] && found[1] && !found[2]); - - tdb_close(tdb0); - memset(found, 0, sizeof(found)); - tdb_foreach(set_found, found); - ok1(!found[0] && found[1] && !found[2]); - - tdb_close(tdb1); - memset(found, 0, sizeof(found)); - tdb_foreach(set_found, found); - ok1(!found[0] && !found[1] && !found[2]); - ok1(tap_log_messages == 0); - } - - return exit_status(); -} diff --git a/ccan/tdb2/test/run-traverse.c b/ccan/tdb2/test/run-traverse.c deleted file mode 100644 index 4de0ebde..00000000 --- a/ccan/tdb2/test/run-traverse.c +++ /dev/null @@ -1,203 +0,0 @@ -#include "tdb2-source.h" -#include -#include "logging.h" - -#define NUM_RECORDS 1000 - -/* We use the same seed which we saw a failure on. 
*/ -static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p) -{ - return hash64_stable((const unsigned char *)key, len, - *(uint64_t *)p); -} - -static bool store_records(struct tdb_context *tdb) -{ - int i; - struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; - struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; - - for (i = 0; i < NUM_RECORDS; i++) - if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) - return false; - return true; -} - -struct trav_data { - unsigned int calls, call_limit; - int low, high; - bool mismatch; - bool delete; - enum TDB_ERROR delete_error; -}; - -static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, - struct trav_data *td) -{ - int val; - - td->calls++; - if (key.dsize != sizeof(val) || dbuf.dsize != sizeof(val) - || memcmp(key.dptr, dbuf.dptr, key.dsize) != 0) { - td->mismatch = true; - return -1; - } - memcpy(&val, dbuf.dptr, dbuf.dsize); - if (val < td->low) - td->low = val; - if (val > td->high) - td->high = val; - - if (td->delete) { - td->delete_error = tdb_delete(tdb, key); - if (td->delete_error != TDB_SUCCESS) { - return -1; - } - } - - if (td->calls == td->call_limit) - return 1; - return 0; -} - -struct trav_grow_data { - unsigned int calls; - unsigned int num_large; - bool mismatch; - enum TDB_ERROR error; -}; - -static int trav_grow(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, - struct trav_grow_data *tgd) -{ - int val; - unsigned char buffer[128] = { 0 }; - - tgd->calls++; - if (key.dsize != sizeof(val) || dbuf.dsize < sizeof(val) - || memcmp(key.dptr, dbuf.dptr, key.dsize) != 0) { - tgd->mismatch = true; - return -1; - } - - if (dbuf.dsize > sizeof(val)) - /* We must have seen this before! */ - tgd->num_large++; - - /* Make a big difference to the database. */ - dbuf.dptr = buffer; - dbuf.dsize = sizeof(buffer); - tgd->error = tdb_append(tdb, key, dbuf); - if (tgd->error != TDB_SUCCESS) { - return -1; - } - return 0; -} - -int main(int argc, char *argv[]) -{ - unsigned int i; - int num; - struct trav_data td; - struct trav_grow_data tgd; - struct tdb_context *tdb; - uint64_t seed = 16014841315512641303ULL; - int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP, - TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT, - TDB_NOMMAP|TDB_CONVERT }; - union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH }, - .fn = fixedhash, - .data = &seed } }; - - hattr.base.next = &tap_log_attr; - - plan_tests(sizeof(flags) / sizeof(flags[0]) * 32 + 1); - for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) { - tdb = tdb_open("run-traverse.tdb", flags[i], - O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr); - ok1(tdb); - if (!tdb) - continue; - - ok1(tdb_traverse(tdb, NULL, NULL) == 0); - - ok1(store_records(tdb)); - num = tdb_traverse(tdb, NULL, NULL); - ok1(num == NUM_RECORDS); - - /* Full traverse. */ - td.calls = 0; - td.call_limit = UINT_MAX; - td.low = INT_MAX; - td.high = INT_MIN; - td.mismatch = false; - td.delete = false; - - num = tdb_traverse(tdb, trav, &td); - ok1(num == NUM_RECORDS); - ok1(!td.mismatch); - ok1(td.calls == NUM_RECORDS); - ok1(td.low == 0); - ok1(td.high == NUM_RECORDS-1); - - /* Short traverse. 
*/ - td.calls = 0; - td.call_limit = NUM_RECORDS / 2; - td.low = INT_MAX; - td.high = INT_MIN; - td.mismatch = false; - td.delete = false; - - num = tdb_traverse(tdb, trav, &td); - ok1(num == NUM_RECORDS / 2); - ok1(!td.mismatch); - ok1(td.calls == NUM_RECORDS / 2); - ok1(td.low <= NUM_RECORDS / 2); - ok1(td.high > NUM_RECORDS / 2); - ok1(tdb_check(tdb, NULL, NULL) == 0); - ok1(tap_log_messages == 0); - - /* Deleting traverse (delete everything). */ - td.calls = 0; - td.call_limit = UINT_MAX; - td.low = INT_MAX; - td.high = INT_MIN; - td.mismatch = false; - td.delete = true; - td.delete_error = TDB_SUCCESS; - num = tdb_traverse(tdb, trav, &td); - ok1(num == NUM_RECORDS); - ok1(td.delete_error == TDB_SUCCESS); - ok1(!td.mismatch); - ok1(td.calls == NUM_RECORDS); - ok1(td.low == 0); - ok1(td.high == NUM_RECORDS - 1); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Now it's empty! */ - ok1(tdb_traverse(tdb, NULL, NULL) == 0); - - /* Re-add. */ - ok1(store_records(tdb)); - ok1(tdb_traverse(tdb, NULL, NULL) == NUM_RECORDS); - ok1(tdb_check(tdb, NULL, NULL) == 0); - - /* Grow. This will cause us to be reshuffled. */ - tgd.calls = 0; - tgd.num_large = 0; - tgd.mismatch = false; - tgd.error = TDB_SUCCESS; - ok1(tdb_traverse(tdb, trav_grow, &tgd) > 1); - ok1(tgd.error == 0); - ok1(!tgd.mismatch); - ok1(tdb_check(tdb, NULL, NULL) == 0); - ok1(tgd.num_large < tgd.calls); - diag("growing db: %u calls, %u repeats", - tgd.calls, tgd.num_large); - - tdb_close(tdb); - } - - ok1(tap_log_messages == 0); - return exit_status(); -} diff --git a/ccan/tdb2/test/rwlock-be.tdb1 b/ccan/tdb2/test/rwlock-be.tdb1 deleted file mode 100644 index 45b5f09a..00000000 Binary files a/ccan/tdb2/test/rwlock-be.tdb1 and /dev/null differ diff --git a/ccan/tdb2/test/rwlock-le.tdb1 b/ccan/tdb2/test/rwlock-le.tdb1 deleted file mode 100644 index 45b5f09a..00000000 Binary files a/ccan/tdb2/test/rwlock-le.tdb1 and /dev/null differ diff --git a/ccan/tdb2/test/tdb1-external-agent.c b/ccan/tdb2/test/tdb1-external-agent.c deleted file mode 100644 index ffde0770..00000000 --- a/ccan/tdb2/test/tdb1-external-agent.c +++ /dev/null @@ -1,188 +0,0 @@ -#include "tdb1-external-agent.h" -#include "tdb1-lock-tracking.h" -#include "logging.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct tdb_context *tdb; - -static enum agent_return do_operation(enum operation op, const char *name) -{ - TDB_DATA k; - enum agent_return ret; - TDB_DATA data; - - if (op != OPEN && !tdb) { - diag("external: No tdb open!"); - return OTHER_FAILURE; - } - - k.dptr = (void *)name; - k.dsize = strlen(name); - - locking_would_block1 = 0; - switch (op) { - case OPEN: - if (tdb) { - diag("Already have tdb %s open", tdb->name); - return OTHER_FAILURE; - } - tdb = tdb_open(name, TDB_VERSION1, O_RDWR, 0, &tap_log_attr); - if (!tdb) { - if (!locking_would_block1) - diag("Opening tdb gave %s", strerror(errno)); - ret = OTHER_FAILURE; - } else - ret = SUCCESS; - break; - case TRANSACTION_START: - ret = tdb_transaction_start(tdb) == TDB_SUCCESS ? 
SUCCESS : OTHER_FAILURE; - break; - case FETCH: - if (tdb_fetch(tdb, k, &data) != TDB_SUCCESS) { - if (tdb->last_error == TDB_ERR_NOEXIST) - ret = FAILED; - else - ret = OTHER_FAILURE; - } else if (data.dsize != k.dsize - || memcmp(data.dptr, k.dptr, k.dsize) != 0) { - ret = OTHER_FAILURE; - } else { - ret = SUCCESS; - } - free(data.dptr); - break; - case STORE: - if (tdb_store(tdb, k, k, 0) == TDB_SUCCESS) - ret = SUCCESS; - else - ret = OTHER_FAILURE; - break; - case TRANSACTION_COMMIT: - ret = tdb_transaction_commit(tdb) == TDB_SUCCESS ? SUCCESS : OTHER_FAILURE; - break; - case CHECK: - ret = tdb_check(tdb, NULL, NULL) == TDB_SUCCESS ? SUCCESS : OTHER_FAILURE; - break; - case NEEDS_RECOVERY: - ret = tdb1_needs_recovery(tdb) ? SUCCESS : FAILED; - break; - case CLOSE: - ret = tdb_close(tdb) == 0 ? SUCCESS : OTHER_FAILURE; - tdb = NULL; - break; - default: - ret = OTHER_FAILURE; - } - - if (locking_would_block1) - ret = WOULD_HAVE_BLOCKED; - - return ret; -} - -struct agent { - int cmdfd, responsefd; -}; - -/* Do this before doing any tdb stuff. Return handle, or NULL. */ -struct agent *prepare_external_agent1(void) -{ - int pid, ret; - int command[2], response[2]; - char name[1+PATH_MAX]; - - if (pipe(command) != 0 || pipe(response) != 0) - return NULL; - - pid = fork(); - if (pid < 0) - return NULL; - - if (pid != 0) { - struct agent *agent = malloc(sizeof(*agent)); - - close(command[0]); - close(response[1]); - agent->cmdfd = command[1]; - agent->responsefd = response[0]; - return agent; - } - - close(command[1]); - close(response[0]); - - /* We want to fail, not block. */ - nonblocking_locks1 = true; - log_prefix = "external: "; - while ((ret = read(command[0], name, sizeof(name))) > 0) { - enum agent_return result; - - result = do_operation(name[0], name+1); - if (write(response[1], &result, sizeof(result)) - != sizeof(result)) - err(1, "Writing response"); - } - exit(0); -} - -/* Ask the external agent to try to do an operation. */ -enum agent_return external_agent_operation1(struct agent *agent, - enum operation op, - const char *name) -{ - enum agent_return res; - unsigned int len; - char *string; - - if (!name) - name = ""; - len = 1 + strlen(name) + 1; - string = malloc(len); - - string[0] = op; - strcpy(string+1, name); - - if (write(agent->cmdfd, string, len) != len - || read(agent->responsefd, &res, sizeof(res)) != sizeof(res)) - res = AGENT_DIED; - - free(string); - return res; -} - -const char *agent_return_name1(enum agent_return ret) -{ - return ret == SUCCESS ? "SUCCESS" - : ret == WOULD_HAVE_BLOCKED ? "WOULD_HAVE_BLOCKED" - : ret == AGENT_DIED ? "AGENT_DIED" - : ret == FAILED ? "FAILED" - : ret == OTHER_FAILURE ? "OTHER_FAILURE" - : "**INVALID**"; -} - -const char *operation_name1(enum operation op) -{ - switch (op) { - case OPEN: return "OPEN"; - case TRANSACTION_START: return "TRANSACTION_START"; - case FETCH: return "FETCH"; - case STORE: return "STORE"; - case TRANSACTION_COMMIT: return "TRANSACTION_COMMIT"; - case CHECK: return "CHECK"; - case NEEDS_RECOVERY: return "NEEDS_RECOVERY"; - case CLOSE: return "CLOSE"; - } - return "**INVALID**"; -} diff --git a/ccan/tdb2/test/tdb1-external-agent.h b/ccan/tdb2/test/tdb1-external-agent.h deleted file mode 100644 index ee903b65..00000000 --- a/ccan/tdb2/test/tdb1-external-agent.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef TDB_TEST_EXTERNAL_AGENT_H -#define TDB_TEST_EXTERNAL_AGENT_H - -/* For locking tests, we need a different process to try things at - * various times. 
*/ -enum operation { - OPEN, - TRANSACTION_START, - FETCH, - STORE, - TRANSACTION_COMMIT, - CHECK, - NEEDS_RECOVERY, - CLOSE, -}; - -/* Do this before doing any tdb stuff. Return handle, or -1. */ -struct agent *prepare_external_agent1(void); - -enum agent_return { - SUCCESS, - WOULD_HAVE_BLOCKED, - AGENT_DIED, - FAILED, /* For fetch, or NEEDS_RECOVERY */ - OTHER_FAILURE, -}; - -/* Ask the external agent to try to do an operation. - * name == tdb name for OPEN/OPEN_WITH_CLEAR_IF_FIRST, - * record name for FETCH/STORE (store stores name as data too) - */ -enum agent_return external_agent_operation1(struct agent *handle, - enum operation op, - const char *name); - -/* Mapping enum -> string. */ -const char *agent_return_name1(enum agent_return ret); -const char *operation_name1(enum operation op); - -#endif /* TDB_TEST_EXTERNAL_AGENT_H */ diff --git a/ccan/tdb2/test/tdb1-lock-tracking.c b/ccan/tdb2/test/tdb1-lock-tracking.c deleted file mode 100644 index 197b1f07..00000000 --- a/ccan/tdb2/test/tdb1-lock-tracking.c +++ /dev/null @@ -1,146 +0,0 @@ -/* We save the locks so we can reaquire them. */ -#include -#include -#include -#include -#include -#include -#include "tdb1-lock-tracking.h" - -struct lock { - struct lock *next; - unsigned int off; - unsigned int len; - int type; -}; -static struct lock *locks; -int locking_errors1 = 0; -bool suppress_lockcheck1 = false; -bool nonblocking_locks1; -int locking_would_block1 = 0; -void (*unlock_callback1)(int fd); - -int fcntl_with_lockcheck1(int fd, int cmd, ... /* arg */ ) -{ - va_list ap; - int ret, arg3; - struct flock *fl; - bool may_block = false; - - if (cmd != F_SETLK && cmd != F_SETLKW) { - /* This may be totally bogus, but we don't know in general. */ - va_start(ap, cmd); - arg3 = va_arg(ap, int); - va_end(ap); - - return fcntl(fd, cmd, arg3); - } - - va_start(ap, cmd); - fl = va_arg(ap, struct flock *); - va_end(ap); - - if (cmd == F_SETLKW && nonblocking_locks1) { - cmd = F_SETLK; - may_block = true; - } - ret = fcntl(fd, cmd, fl); - - /* Detect when we failed, but might have been OK if we waited. */ - if (may_block && ret == -1 && (errno == EAGAIN || errno == EACCES)) { - locking_would_block1++; - } - - if (fl->l_type == F_UNLCK) { - struct lock **l; - struct lock *old = NULL; - - for (l = &locks; *l; l = &(*l)->next) { - if ((*l)->off == fl->l_start - && (*l)->len == fl->l_len) { - if (ret == 0) { - old = *l; - *l = (*l)->next; - free(old); - } - break; - } - } - if (!old && !suppress_lockcheck1) { - diag("Unknown unlock %u@%u - %i", - (int)fl->l_len, (int)fl->l_start, ret); - locking_errors1++; - } - } else { - struct lock *new, *i; - unsigned int fl_end = fl->l_start + fl->l_len; - if (fl->l_len == 0) - fl_end = (unsigned int)-1; - - /* Check for overlaps: we shouldn't do this. */ - for (i = locks; i; i = i->next) { - unsigned int i_end = i->off + i->len; - if (i->len == 0) - i_end = (unsigned int)-1; - - if (fl->l_start >= i->off && fl->l_start < i_end) - break; - if (fl_end >= i->off && fl_end < i_end) - break; - - /* tdb_allrecord_lock does this, handle adjacent: */ - if (fl->l_start == i_end && fl->l_type == i->type) { - if (ret == 0) { - i->len = fl->l_len - ? i->len + fl->l_len - : 0; - } - goto done; - } - } - if (i) { - /* Special case: upgrade of allrecord lock. 
*/ - if (i->type == F_RDLCK && fl->l_type == F_WRLCK - && i->off == TDB1_FREELIST_TOP - && fl->l_start == TDB1_FREELIST_TOP - && i->len == 0 - && fl->l_len == 0) { - if (ret == 0) - i->type = F_WRLCK; - goto done; - } - if (!suppress_lockcheck1) { - diag("%s lock %u@%u overlaps %u@%u", - fl->l_type == F_WRLCK ? "write" : "read", - (int)fl->l_len, (int)fl->l_start, - i->len, (int)i->off); - locking_errors1++; - } - } - - if (ret == 0) { - new = malloc(sizeof *new); - new->off = fl->l_start; - new->len = fl->l_len; - new->type = fl->l_type; - new->next = locks; - locks = new; - } - } -done: - if (ret == 0 && fl->l_type == F_UNLCK && unlock_callback1) - unlock_callback1(fd); - return ret; -} - -unsigned int forget_locking1(void) -{ - unsigned int num = 0; - while (locks) { - struct lock *next = locks->next; - free(locks); - locks = next; - num++; - } - return num; -} diff --git a/ccan/tdb2/test/tdb1-lock-tracking.h b/ccan/tdb2/test/tdb1-lock-tracking.h deleted file mode 100644 index cb8c2f12..00000000 --- a/ccan/tdb2/test/tdb1-lock-tracking.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef TDB1_LOCK_TRACKING_H -#define TDB1_LOCK_TRACKING_H -#include -#include - -/* Set this if you want a callback after fnctl unlock. */ -extern void (*unlock_callback1)(int fd); - -/* Replacement fcntl. */ -int fcntl_with_lockcheck1(int fd, int cmd, ... /* arg */ ); - -/* Discard locking info: returns number of locks outstanding. */ -unsigned int forget_locking1(void); - -/* Number of errors in locking. */ -extern int locking_errors1; - -/* Suppress lock checking. */ -extern bool suppress_lockcheck1; - -/* Make all locks non-blocking. */ -extern bool nonblocking_locks1; - -/* Number of times we failed a lock because we made it non-blocking. */ -extern int locking_would_block1; -#endif /* LOCK_TRACKING_H */ diff --git a/ccan/tdb2/test/tdb1.corrupt b/ccan/tdb2/test/tdb1.corrupt deleted file mode 100644 index 83d66774..00000000 Binary files a/ccan/tdb2/test/tdb1.corrupt and /dev/null differ diff --git a/ccan/tdb2/test/tdb2-source.h b/ccan/tdb2/test/tdb2-source.h deleted file mode 100644 index 28ab3513..00000000 --- a/ccan/tdb2/test/tdb2-source.h +++ /dev/null @@ -1,21 +0,0 @@ -#include "config.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include diff --git a/ccan/tdb2/tools/Makefile b/ccan/tdb2/tools/Makefile deleted file mode 100644 index 11188c3b..00000000 --- a/ccan/tdb2/tools/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -OBJS:=../../tdb2.o ../../hash.o ../../tally.o -CFLAGS:=-I../../.. -I.. -Wall -g -O3 #-g -pg -LDFLAGS:=-L../../.. 
- -default: tdb2torture tdb2tool tdb2dump tdb2restore mktdb2 speed growtdb-bench - -tdb2dump: tdb2dump.c $(OBJS) -tdb2restore: tdb2restore.c $(OBJS) -tdb2torture: tdb2torture.c $(OBJS) -tdb2tool: tdb2tool.c $(OBJS) -mktdb2: mktdb2.c $(OBJS) -speed: speed.c $(OBJS) -growtdb-bench: growtdb-bench.c $(OBJS) - -clean: - rm -f tdb2torture tdb2dump tdb2restore tdb2tool mktdb2 speed growtdb-bench diff --git a/ccan/tdb2/tools/growtdb-bench.c b/ccan/tdb2/tools/growtdb-bench.c deleted file mode 100644 index 205ff86e..00000000 --- a/ccan/tdb2/tools/growtdb-bench.c +++ /dev/null @@ -1,114 +0,0 @@ -#include "tdb2.h" -#include -#include -#include -#include -#include -#include -#include -#include - -static void logfn(struct tdb_context *tdb, - enum tdb_log_level level, - enum TDB_ERROR ecode, - const char *message, - void *data) -{ - fprintf(stderr, "tdb:%s:%s:%s\n", - tdb_name(tdb), tdb_errorstr(ecode), message); -} - -int main(int argc, char *argv[]) -{ - unsigned int i, j, users, groups; - TDB_DATA idxkey, idxdata; - TDB_DATA k, d, gk; - char cmd[100]; - struct tdb_context *tdb; - enum TDB_ERROR ecode; - union tdb_attribute log; - - if (argc != 3) { - printf("Usage: growtdb-bench \n"); - exit(1); - } - users = atoi(argv[1]); - groups = atoi(argv[2]); - - sprintf(cmd, "cat /proc/%i/statm", getpid()); - - log.base.attr = TDB_ATTRIBUTE_LOG; - log.base.next = NULL; - log.log.fn = logfn; - - tdb = tdb_open("/tmp/growtdb.tdb", TDB_DEFAULT, - O_RDWR|O_CREAT|O_TRUNC, 0600, &log); - - idxkey.dptr = (unsigned char *)"User index"; - idxkey.dsize = strlen("User index"); - idxdata.dsize = 51; - idxdata.dptr = calloc(idxdata.dsize, 1); - - /* Create users. */ - k.dsize = 48; - k.dptr = calloc(k.dsize, 1); - d.dsize = 64; - d.dptr = calloc(d.dsize, 1); - - tdb_transaction_start(tdb); - for (i = 0; i < users; i++) { - memcpy(k.dptr, &i, sizeof(i)); - ecode = tdb_store(tdb, k, d, TDB_INSERT); - if (ecode != TDB_SUCCESS) - errx(1, "tdb insert failed: %s", tdb_errorstr(ecode)); - - /* This simulates a growing index record. */ - ecode = tdb_append(tdb, idxkey, idxdata); - if (ecode != TDB_SUCCESS) - errx(1, "tdb append failed: %s", tdb_errorstr(ecode)); - } - if ((ecode = tdb_transaction_commit(tdb)) != 0) - errx(1, "tdb commit1 failed: %s", tdb_errorstr(ecode)); - - if ((ecode = tdb_check(tdb, NULL, NULL)) != 0) - errx(1, "tdb_check failed after initial insert!"); - - system(cmd); - - /* Now put them all in groups: add 32 bytes to each record for - * a group. */ - gk.dsize = 48; - gk.dptr = calloc(k.dsize, 1); - gk.dptr[gk.dsize-1] = 1; - - d.dsize = 32; - for (i = 0; i < groups; i++) { - tdb_transaction_start(tdb); - /* Create the "group". */ - memcpy(gk.dptr, &i, sizeof(i)); - ecode = tdb_store(tdb, gk, d, TDB_INSERT); - if (ecode != TDB_SUCCESS) - errx(1, "tdb insert failed: %s", tdb_errorstr(ecode)); - - /* Now populate it. */ - for (j = 0; j < users; j++) { - /* Append to the user. */ - memcpy(k.dptr, &j, sizeof(j)); - if ((ecode = tdb_append(tdb, k, d)) != 0) - errx(1, "tdb append failed: %s", - tdb_errorstr(ecode)); - - /* Append to the group. 
*/ - if ((ecode = tdb_append(tdb, gk, d)) != 0) - errx(1, "tdb append failed: %s", - tdb_errorstr(ecode)); - } - if ((ecode = tdb_transaction_commit(tdb)) != 0) - errx(1, "tdb commit2 failed: %s", tdb_errorstr(ecode)); - if ((ecode = tdb_check(tdb, NULL, NULL)) != 0) - errx(1, "tdb_check failed after iteration %i!", i); - system(cmd); - } - - return 0; -} diff --git a/ccan/tdb2/tools/mktdb2.c b/ccan/tdb2/tools/mktdb2.c deleted file mode 100644 index c8c28034..00000000 --- a/ccan/tdb2/tools/mktdb2.c +++ /dev/null @@ -1,29 +0,0 @@ -#include "tdb2.h" -#include -#include -#include -#include - -int main(int argc, char *argv[]) -{ - unsigned int i, num_recs; - struct tdb_context *tdb; - - if (argc != 3 || (num_recs = atoi(argv[2])) == 0) - errx(1, "Usage: mktdb "); - - tdb = tdb_open(argv[1], TDB_DEFAULT, O_CREAT|O_TRUNC|O_RDWR, 0600,NULL); - if (!tdb) - err(1, "Opening %s", argv[1]); - - for (i = 0; i < num_recs; i++) { - TDB_DATA d; - - d.dptr = (void *)&i; - d.dsize = sizeof(i); - if (tdb_store(tdb, d, d, TDB_INSERT) != 0) - err(1, "Failed to store record %i", i); - } - printf("Done\n"); - return 0; -} diff --git a/ccan/tdb2/tools/speed.c b/ccan/tdb2/tools/speed.c deleted file mode 100644 index ccb5ae34..00000000 --- a/ccan/tdb2/tools/speed.c +++ /dev/null @@ -1,443 +0,0 @@ -/* Simple speed test for TDB */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "tdb2.h" - -/* Nanoseconds per operation */ -static size_t normalize(const struct timeval *start, - const struct timeval *stop, - unsigned int num) -{ - struct timeval diff; - - timersub(stop, start, &diff); - - /* Floating point is more accurate here. */ - return (double)(diff.tv_sec * 1000000 + diff.tv_usec) - / num * 1000; -} - -static size_t file_size(void) -{ - struct stat st; - - if (stat("/tmp/speed.tdb", &st) != 0) - return -1; - return st.st_size; -} - -static int count_record(struct tdb_context *tdb, - TDB_DATA key, TDB_DATA data, void *p) -{ - int *total = p; - *total += *(int *)data.dptr; - return 0; -} - -static void dump_and_clear_stats(struct tdb_context **tdb, - int flags, - union tdb_attribute *attr) -{ - union tdb_attribute stats; - enum TDB_ERROR ecode; - - stats.base.attr = TDB_ATTRIBUTE_STATS; - stats.stats.size = sizeof(stats.stats); - ecode = tdb_get_attribute(*tdb, &stats); - if (ecode != TDB_SUCCESS) - errx(1, "Getting stats: %s", tdb_errorstr(ecode)); - - printf("allocs = %llu\n", - (unsigned long long)stats.stats.allocs); - printf(" alloc_subhash = %llu\n", - (unsigned long long)stats.stats.alloc_subhash); - printf(" alloc_chain = %llu\n", - (unsigned long long)stats.stats.alloc_chain); - printf(" alloc_bucket_exact = %llu\n", - (unsigned long long)stats.stats.alloc_bucket_exact); - printf(" alloc_bucket_max = %llu\n", - (unsigned long long)stats.stats.alloc_bucket_max); - printf(" alloc_leftover = %llu\n", - (unsigned long long)stats.stats.alloc_leftover); - printf(" alloc_coalesce_tried = %llu\n", - (unsigned long long)stats.stats.alloc_coalesce_tried); - printf(" alloc_coalesce_iterate_clash = %llu\n", - (unsigned long long)stats.stats.alloc_coalesce_iterate_clash); - printf(" alloc_coalesce_lockfail = %llu\n", - (unsigned long long)stats.stats.alloc_coalesce_lockfail); - printf(" alloc_coalesce_race = %llu\n", - (unsigned long long)stats.stats.alloc_coalesce_race); - printf(" alloc_coalesce_succeeded = %llu\n", - (unsigned long long)stats.stats.alloc_coalesce_succeeded); - printf(" alloc_coalesce_num_merged = %llu\n", - (unsigned long 
long)stats.stats.alloc_coalesce_num_merged); - printf("compares = %llu\n", - (unsigned long long)stats.stats.compares); - printf(" compare_wrong_bucket = %llu\n", - (unsigned long long)stats.stats.compare_wrong_bucket); - printf(" compare_wrong_offsetbits = %llu\n", - (unsigned long long)stats.stats.compare_wrong_offsetbits); - printf(" compare_wrong_keylen = %llu\n", - (unsigned long long)stats.stats.compare_wrong_keylen); - printf(" compare_wrong_rechash = %llu\n", - (unsigned long long)stats.stats.compare_wrong_rechash); - printf(" compare_wrong_keycmp = %llu\n", - (unsigned long long)stats.stats.compare_wrong_keycmp); - printf("transactions = %llu\n", - (unsigned long long)stats.stats.transactions); - printf(" transaction_cancel = %llu\n", - (unsigned long long)stats.stats.transaction_cancel); - printf(" transaction_nest = %llu\n", - (unsigned long long)stats.stats.transaction_nest); - printf(" transaction_expand_file = %llu\n", - (unsigned long long)stats.stats.transaction_expand_file); - printf(" transaction_read_direct = %llu\n", - (unsigned long long)stats.stats.transaction_read_direct); - printf(" transaction_read_direct_fail = %llu\n", - (unsigned long long)stats.stats.transaction_read_direct_fail); - printf(" transaction_write_direct = %llu\n", - (unsigned long long)stats.stats.transaction_write_direct); - printf(" transaction_write_direct_fail = %llu\n", - (unsigned long long)stats.stats.transaction_write_direct_fail); - printf("expands = %llu\n", - (unsigned long long)stats.stats.expands); - printf("frees = %llu\n", - (unsigned long long)stats.stats.frees); - printf("locks = %llu\n", - (unsigned long long)stats.stats.locks); - printf(" lock_lowlevel = %llu\n", - (unsigned long long)stats.stats.lock_lowlevel); - printf(" lock_nonblock = %llu\n", - (unsigned long long)stats.stats.lock_nonblock); - printf(" lock_nonblock_fail = %llu\n", - (unsigned long long)stats.stats.lock_nonblock_fail); - - /* Now clear. */ - tdb_close(*tdb); - *tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR, 0, attr); -} - -static void tdb_log(struct tdb_context *tdb, - enum tdb_log_level level, - enum TDB_ERROR ecode, - const char *message, - void *data) -{ - fprintf(stderr, "tdb:%s:%s:%s\n", - tdb_name(tdb), tdb_errorstr(ecode), message); -} - -int main(int argc, char *argv[]) -{ - unsigned int i, j, num = 1000, stage = 0, stopat = -1; - int flags = TDB_DEFAULT; - bool transaction = false, summary = false; - TDB_DATA key, data; - struct tdb_context *tdb; - struct timeval start, stop; - union tdb_attribute seed, log; - bool do_stats = false; - enum TDB_ERROR ecode; - - /* Try to keep benchmarks even. 
*/ - seed.base.attr = TDB_ATTRIBUTE_SEED; - seed.base.next = NULL; - seed.seed.seed = 0; - - log.base.attr = TDB_ATTRIBUTE_LOG; - log.base.next = &seed; - log.log.fn = tdb_log; - - if (argv[1] && strcmp(argv[1], "--internal") == 0) { - flags = TDB_INTERNAL; - argc--; - argv++; - } - if (argv[1] && strcmp(argv[1], "--transaction") == 0) { - transaction = true; - argc--; - argv++; - } - if (argv[1] && strcmp(argv[1], "--no-sync") == 0) { - flags |= TDB_NOSYNC; - argc--; - argv++; - } - if (argv[1] && strcmp(argv[1], "--summary") == 0) { - summary = true; - argc--; - argv++; - } - if (argv[1] && strcmp(argv[1], "--stats") == 0) { - do_stats = true; - argc--; - argv++; - } - - tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR|O_CREAT|O_TRUNC, - 0600, &log); - if (!tdb) - err(1, "Opening /tmp/speed.tdb"); - - key.dptr = (void *)&i; - key.dsize = sizeof(i); - data = key; - - if (argv[1]) { - num = atoi(argv[1]); - argv++; - argc--; - } - - if (argv[1]) { - stopat = atoi(argv[1]); - argv++; - argc--; - } - - /* Add 1000 records. */ - printf("Adding %u records: ", num); fflush(stdout); - if (transaction && (ecode = tdb_transaction_start(tdb))) - errx(1, "starting transaction: %s", tdb_errorstr(ecode)); - gettimeofday(&start, NULL); - for (i = 0; i < num; i++) - if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0) - errx(1, "Inserting key %u in tdb: %s", - i, tdb_errorstr(ecode)); - gettimeofday(&stop, NULL); - if (transaction && (ecode = tdb_transaction_commit(tdb))) - errx(1, "committing transaction: %s", tdb_errorstr(ecode)); - printf(" %zu ns (%zu bytes)\n", - normalize(&start, &stop, num), file_size()); - - if (tdb_check(tdb, NULL, NULL)) - errx(1, "tdb_check failed!"); - if (summary) { - char *sumstr = NULL; - tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); - printf("%s\n", sumstr); - free(sumstr); - } - if (do_stats) - dump_and_clear_stats(&tdb, flags, &log); - - if (++stage == stopat) - exit(0); - - /* Finding 1000 records. */ - printf("Finding %u records: ", num); fflush(stdout); - if (transaction && (ecode = tdb_transaction_start(tdb))) - errx(1, "starting transaction: %s", tdb_errorstr(ecode)); - gettimeofday(&start, NULL); - for (i = 0; i < num; i++) { - struct tdb_data dbuf; - if ((ecode = tdb_fetch(tdb, key, &dbuf)) != TDB_SUCCESS - || *(int *)dbuf.dptr != i) { - errx(1, "Fetching key %u in tdb gave %u", - i, ecode ? ecode : *(int *)dbuf.dptr); - } - } - gettimeofday(&stop, NULL); - if (transaction && (ecode = tdb_transaction_commit(tdb))) - errx(1, "committing transaction: %s", tdb_errorstr(ecode)); - printf(" %zu ns (%zu bytes)\n", - normalize(&start, &stop, num), file_size()); - if (tdb_check(tdb, NULL, NULL)) - errx(1, "tdb_check failed!"); - if (summary) { - char *sumstr = NULL; - tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); - printf("%s\n", sumstr); - free(sumstr); - } - if (do_stats) - dump_and_clear_stats(&tdb, flags, &log); - if (++stage == stopat) - exit(0); - - /* Missing 1000 records. 
*/ - printf("Missing %u records: ", num); fflush(stdout); - if (transaction && (ecode = tdb_transaction_start(tdb))) - errx(1, "starting transaction: %s", tdb_errorstr(ecode)); - gettimeofday(&start, NULL); - for (i = num; i < num*2; i++) { - struct tdb_data dbuf; - ecode = tdb_fetch(tdb, key, &dbuf); - if (ecode != TDB_ERR_NOEXIST) - errx(1, "Fetching key %u in tdb gave %s", - i, tdb_errorstr(ecode)); - } - gettimeofday(&stop, NULL); - if (transaction && (ecode = tdb_transaction_commit(tdb))) - errx(1, "committing transaction: %s", tdb_errorstr(ecode)); - printf(" %zu ns (%zu bytes)\n", - normalize(&start, &stop, num), file_size()); - if (tdb_check(tdb, NULL, NULL)) - errx(1, "tdb_check failed!"); - if (summary) { - char *sumstr = NULL; - tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); - printf("%s\n", sumstr); - free(sumstr); - } - if (do_stats) - dump_and_clear_stats(&tdb, flags, &log); - if (++stage == stopat) - exit(0); - - /* Traverse 1000 records. */ - printf("Traversing %u records: ", num); fflush(stdout); - if (transaction && (ecode = tdb_transaction_start(tdb))) - errx(1, "starting transaction: %s", tdb_errorstr(ecode)); - i = 0; - gettimeofday(&start, NULL); - if (tdb_traverse(tdb, count_record, &i) != num) - errx(1, "Traverse returned wrong number of records"); - if (i != (num - 1) * (num / 2)) - errx(1, "Traverse tallied to %u", i); - gettimeofday(&stop, NULL); - if (transaction && (ecode = tdb_transaction_commit(tdb))) - errx(1, "committing transaction: %s", tdb_errorstr(ecode)); - printf(" %zu ns (%zu bytes)\n", - normalize(&start, &stop, num), file_size()); - if (tdb_check(tdb, NULL, NULL)) - errx(1, "tdb_check failed!"); - if (summary) { - char *sumstr = NULL; - tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); - printf("%s\n", sumstr); - free(sumstr); - } - if (do_stats) - dump_and_clear_stats(&tdb, flags, &log); - if (++stage == stopat) - exit(0); - - /* Delete 1000 records (not in order). */ - printf("Deleting %u records: ", num); fflush(stdout); - if (transaction && (ecode = tdb_transaction_start(tdb))) - errx(1, "starting transaction: %s", tdb_errorstr(ecode)); - gettimeofday(&start, NULL); - for (j = 0; j < num; j++) { - i = (j + 100003) % num; - if ((ecode = tdb_delete(tdb, key)) != TDB_SUCCESS) - errx(1, "Deleting key %u in tdb: %s", - i, tdb_errorstr(ecode)); - } - gettimeofday(&stop, NULL); - if (transaction && (ecode = tdb_transaction_commit(tdb))) - errx(1, "committing transaction: %s", tdb_errorstr(ecode)); - printf(" %zu ns (%zu bytes)\n", - normalize(&start, &stop, num), file_size()); - if (tdb_check(tdb, NULL, NULL)) - errx(1, "tdb_check failed!"); - if (summary) { - char *sumstr = NULL; - tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); - printf("%s\n", sumstr); - free(sumstr); - } - if (do_stats) - dump_and_clear_stats(&tdb, flags, &log); - if (++stage == stopat) - exit(0); - - /* Re-add 1000 records (not in order). 
*/ - printf("Re-adding %u records: ", num); fflush(stdout); - if (transaction && (ecode = tdb_transaction_start(tdb))) - errx(1, "starting transaction: %s", tdb_errorstr(ecode)); - gettimeofday(&start, NULL); - for (j = 0; j < num; j++) { - i = (j + 100003) % num; - if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0) - errx(1, "Inserting key %u in tdb: %s", - i, tdb_errorstr(ecode)); - } - gettimeofday(&stop, NULL); - if (transaction && (ecode = tdb_transaction_commit(tdb))) - errx(1, "committing transaction: %s", tdb_errorstr(ecode)); - printf(" %zu ns (%zu bytes)\n", - normalize(&start, &stop, num), file_size()); - if (tdb_check(tdb, NULL, NULL)) - errx(1, "tdb_check failed!"); - if (summary) { - char *sumstr = NULL; - tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); - printf("%s\n", sumstr); - free(sumstr); - } - if (do_stats) - dump_and_clear_stats(&tdb, flags, &log); - if (++stage == stopat) - exit(0); - - /* Append 1000 records. */ - if (transaction && (ecode = tdb_transaction_start(tdb))) - errx(1, "starting transaction: %s", tdb_errorstr(ecode)); - printf("Appending %u records: ", num); fflush(stdout); - gettimeofday(&start, NULL); - for (i = 0; i < num; i++) - if ((ecode = tdb_append(tdb, key, data)) != TDB_SUCCESS) - errx(1, "Appending key %u in tdb: %s", - i, tdb_errorstr(ecode)); - gettimeofday(&stop, NULL); - if (transaction && (ecode = tdb_transaction_commit(tdb))) - errx(1, "committing transaction: %s", tdb_errorstr(ecode)); - printf(" %zu ns (%zu bytes)\n", - normalize(&start, &stop, num), file_size()); - if (tdb_check(tdb, NULL, NULL)) - errx(1, "tdb_check failed!"); - if (summary) { - char *sumstr = NULL; - tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); - printf("%s\n", sumstr); - free(sumstr); - } - if (++stage == stopat) - exit(0); - - /* Churn 1000 records: not in order! */ - if (transaction && (ecode = tdb_transaction_start(tdb))) - errx(1, "starting transaction: %s", tdb_errorstr(ecode)); - printf("Churning %u records: ", num); fflush(stdout); - gettimeofday(&start, NULL); - for (j = 0; j < num; j++) { - i = (j + 1000019) % num; - if ((ecode = tdb_delete(tdb, key)) != TDB_SUCCESS) - errx(1, "Deleting key %u in tdb: %s", - i, tdb_errorstr(ecode)); - i += num; - if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0) - errx(1, "Inserting key %u in tdb: %s", - i, tdb_errorstr(ecode)); - } - gettimeofday(&stop, NULL); - if (transaction && (ecode = tdb_transaction_commit(tdb))) - errx(1, "committing transaction: %s", tdb_errorstr(ecode)); - printf(" %zu ns (%zu bytes)\n", - normalize(&start, &stop, num), file_size()); - - if (tdb_check(tdb, NULL, NULL)) - errx(1, "tdb_check failed!"); - if (summary) { - char *sumstr = NULL; - tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr); - printf("%s\n", sumstr); - free(sumstr); - } - if (do_stats) - dump_and_clear_stats(&tdb, flags, &log); - if (++stage == stopat) - exit(0); - - return 0; -} diff --git a/ccan/tdb2/tools/tdb2dump.c b/ccan/tdb2/tools/tdb2dump.c deleted file mode 100644 index bf9216f7..00000000 --- a/ccan/tdb2/tools/tdb2dump.c +++ /dev/null @@ -1,115 +0,0 @@ -/* - simple tdb2 dump util - Copyright (C) Andrew Tridgell 2001 - Copyright (C) Rusty Russell 2011 - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ -#include "tdb2.h" -#include -#include -#include -#include -#include -#include -#include - -static void print_data(TDB_DATA d) -{ - unsigned char *p = (unsigned char *)d.dptr; - int len = d.dsize; - while (len--) { - if (isprint(*p) && !strchr("\"\\", *p)) { - fputc(*p, stdout); - } else { - printf("\\%02X", *p); - } - p++; - } -} - -static int traverse_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *state) -{ - printf("{\n"); - printf("key(%d) = \"", (int)key.dsize); - print_data(key); - printf("\"\n"); - printf("data(%d) = \"", (int)dbuf.dsize); - print_data(dbuf); - printf("\"\n"); - printf("}\n"); - return 0; -} - -static int dump_tdb(const char *fname, const char *keyname) -{ - struct tdb_context *tdb; - TDB_DATA key, value; - - tdb = tdb_open(fname, 0, O_RDONLY, 0, NULL); - if (!tdb) { - printf("Failed to open %s\n", fname); - return 1; - } - - if (!keyname) { - tdb_traverse(tdb, traverse_fn, NULL); - } else { - key = tdb_mkdata(keyname, strlen(keyname)); - if (tdb_fetch(tdb, key, &value) != 0) { - return 1; - } else { - print_data(value); - free(value.dptr); - } - } - - return 0; -} - -static void usage( void) -{ - printf( "Usage: tdb2dump [options] \n\n"); - printf( " -h this help message\n"); - printf( " -k keyname dumps value of keyname\n"); -} - - int main(int argc, char *argv[]) -{ - char *fname, *keyname=NULL; - int c; - - if (argc < 2) { - printf("Usage: tdb2dump \n"); - exit(1); - } - - while ((c = getopt( argc, argv, "hk:")) != -1) { - switch (c) { - case 'h': - usage(); - exit( 0); - case 'k': - keyname = optarg; - break; - default: - usage(); - exit( 1); - } - } - - fname = argv[optind]; - - return dump_tdb(fname, keyname); -} diff --git a/ccan/tdb2/tools/tdb2restore.c b/ccan/tdb2/tools/tdb2restore.c deleted file mode 100644 index 658215a1..00000000 --- a/ccan/tdb2/tools/tdb2restore.c +++ /dev/null @@ -1,227 +0,0 @@ -/* - tdb2restore -- construct a tdb from tdbdump output. - Copyright (C) Volker Lendecke 2010 - Copyright (C) Simon McVittie 2005 - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include "tdb2.h" -#include -#include -#include -#include -#include -#include - -#define debug_fprintf(file, fmt, ...) 
do {/*nothing*/} while (0) - -static int read_linehead(FILE *f) -{ - int i, c; - int num_bytes; - char prefix[128]; - - while (1) { - c = getc(f); - if (c == EOF) { - return -1; - } - if (c == '(') { - break; - } - } - for (i=0; idptr = (unsigned char *)malloc(size); - if (d->dptr == NULL) { - return -1; - } - d->dsize = size; - - for (i=0; idptr[i] = (low|high); - } else { - d->dptr[i] = c; - } - } - return 0; -} - -static int swallow(FILE *f, const char *s, int *eof) -{ - char line[128]; - - if (fgets(line, sizeof(line), f) == NULL) { - if (eof != NULL) { - *eof = 1; - } - return -1; - } - if (strcmp(line, s) != 0) { - return -1; - } - return 0; -} - -static bool read_rec(FILE *f, struct tdb_context *tdb, int *eof) -{ - int length; - struct tdb_data key, data; - bool ret = false; - enum TDB_ERROR e; - - key.dptr = NULL; - data.dptr = NULL; - - if (swallow(f, "{\n", eof) == -1) { - goto fail; - } - length = read_linehead(f); - if (length == -1) { - goto fail; - } - if (read_data(f, &key, length) == -1) { - goto fail; - } - if (swallow(f, "\"\n", NULL) == -1) { - goto fail; - } - length = read_linehead(f); - if (length == -1) { - goto fail; - } - if (read_data(f, &data, length) == -1) { - goto fail; - } - if ((swallow(f, "\"\n", NULL) == -1) - || (swallow(f, "}\n", NULL) == -1)) { - goto fail; - } - e = tdb_store(tdb, key, data, TDB_INSERT); - if (e != TDB_SUCCESS) { - fprintf(stderr, "TDB error: %s\n", tdb_errorstr(e)); - goto fail; - } - - ret = true; -fail: - free(key.dptr); - free(data.dptr); - return ret; -} - -static int restore_tdb(const char *fname) -{ - struct tdb_context *tdb; - - tdb = tdb_open(fname, 0, O_RDWR|O_CREAT|O_EXCL, 0666, NULL); - if (!tdb) { - perror("tdb_open"); - fprintf(stderr, "Failed to open %s\n", fname); - return 1; - } - - while (1) { - int eof = 0; - if (!read_rec(stdin, tdb, &eof)) { - if (eof) { - break; - } - return 1; - } - } - if (tdb_close(tdb)) { - fprintf(stderr, "Error closing tdb\n"); - return 1; - } - fprintf(stderr, "EOF\n"); - return 0; -} - -int main(int argc, char *argv[]) -{ - char *fname; - - if (argc < 2) { - printf("Usage: %s dbname < tdbdump_output\n", argv[0]); - exit(1); - } - - fname = argv[1]; - - return restore_tdb(fname); -} diff --git a/ccan/tdb2/tools/tdb2tool.c b/ccan/tdb2/tools/tdb2tool.c deleted file mode 100644 index 8073561b..00000000 --- a/ccan/tdb2/tools/tdb2tool.c +++ /dev/null @@ -1,802 +0,0 @@ -/* - Unix SMB/CIFS implementation. - Samba database functions - Copyright (C) Andrew Tridgell 1999-2000 - Copyright (C) Paul `Rusty' Russell 2000 - Copyright (C) Jeremy Allison 2000 - Copyright (C) Andrew Esh 2001 - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#include "tdb2.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int do_command(void); -const char *cmdname; -char *arg1, *arg2; -size_t arg1len, arg2len; -int bIterate = 0; -char *line; -TDB_DATA iterate_kbuf; -char cmdline[1024]; -static int disable_mmap; - -enum commands { - CMD_CREATE_TDB, - CMD_OPEN_TDB, - CMD_TRANSACTION_START, - CMD_TRANSACTION_COMMIT, - CMD_TRANSACTION_CANCEL, - CMD_ERASE, - CMD_DUMP, - CMD_INSERT, - CMD_MOVE, - CMD_STORE, - CMD_SHOW, - CMD_KEYS, - CMD_HEXKEYS, - CMD_DELETE, -#if 0 - CMD_LIST_HASH_FREE, - CMD_LIST_FREE, -#endif - CMD_INFO, - CMD_MMAP, - CMD_SPEED, - CMD_FIRST, - CMD_NEXT, - CMD_SYSTEM, - CMD_CHECK, - CMD_QUIT, - CMD_HELP -}; - -typedef struct { - const char *name; - enum commands cmd; -} COMMAND_TABLE; - -COMMAND_TABLE cmd_table[] = { - {"create", CMD_CREATE_TDB}, - {"open", CMD_OPEN_TDB}, -#if 0 - {"transaction_start", CMD_TRANSACTION_START}, - {"transaction_commit", CMD_TRANSACTION_COMMIT}, - {"transaction_cancel", CMD_TRANSACTION_CANCEL}, -#endif - {"erase", CMD_ERASE}, - {"dump", CMD_DUMP}, - {"insert", CMD_INSERT}, - {"move", CMD_MOVE}, - {"store", CMD_STORE}, - {"show", CMD_SHOW}, - {"keys", CMD_KEYS}, - {"hexkeys", CMD_HEXKEYS}, - {"delete", CMD_DELETE}, -#if 0 - {"list", CMD_LIST_HASH_FREE}, - {"free", CMD_LIST_FREE}, -#endif - {"info", CMD_INFO}, - {"speed", CMD_SPEED}, - {"mmap", CMD_MMAP}, - {"first", CMD_FIRST}, - {"1", CMD_FIRST}, - {"next", CMD_NEXT}, - {"n", CMD_NEXT}, - {"check", CMD_CHECK}, - {"quit", CMD_QUIT}, - {"q", CMD_QUIT}, - {"!", CMD_SYSTEM}, - {NULL, CMD_HELP} -}; - -struct timeval tp1,tp2; - -static void _start_timer(void) -{ - gettimeofday(&tp1,NULL); -} - -static double _end_timer(void) -{ - gettimeofday(&tp2,NULL); - return((tp2.tv_sec - tp1.tv_sec) + - (tp2.tv_usec - tp1.tv_usec)*1.0e-6); -} - -static void tdb_log(struct tdb_context *tdb, - enum tdb_log_level level, - enum TDB_ERROR ecode, - const char *message, - void *data) -{ - fprintf(stderr, "tdb:%s:%s:%s\n", - tdb_name(tdb), tdb_errorstr(ecode), message); -} - -/* a tdb tool for manipulating a tdb database */ - -static struct tdb_context *tdb; - -static int print_rec(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state); -static int print_key(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state); -static int print_hexkey(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state); - -static void print_asc(const char *buf,int len) -{ - int i; - - /* We're probably printing ASCII strings so don't try to display - the trailing NULL character. 
*/ - - if (buf[len - 1] == 0) - len--; - - for (i=0;i8) printf(" "); - while (n--) printf(" "); - - n = i%16; - if (n > 8) n = 8; - print_asc(&buf[i-(i%16)],n); printf(" "); - n = (i%16) - n; - if (n>0) print_asc(&buf[i-n],n); - printf("\n"); - } -} - -static void help(void) -{ - printf("\n" -"tdbtool: \n" -" create dbname : create a database\n" -" open dbname : open an existing database\n" -" openjh dbname : open an existing database (jenkins hash)\n" -" transaction_start : start a transaction\n" -" transaction_commit : commit a transaction\n" -" transaction_cancel : cancel a transaction\n" -" erase : erase the database\n" -" dump : dump the database as strings\n" -" keys : dump the database keys as strings\n" -" hexkeys : dump the database keys as hex values\n" -" info : print summary info about the database\n" -" insert key data : insert a record\n" -" move key file : move a record to a destination tdb\n" -" store key data : store a record (replace)\n" -" show key : show a record by key\n" -" delete key : delete a record by key\n" -#if 0 -" list : print the database hash table and freelist\n" -" free : print the database freelist\n" -#endif -" check : check the integrity of an opened database\n" -" speed : perform speed tests on the database\n" -" ! command : execute system command\n" -" 1 | first : print the first record\n" -" n | next : print the next record\n" -" q | quit : terminate\n" -" \\n : repeat 'next' command\n" -"\n"); -} - -static void terror(enum TDB_ERROR err, const char *why) -{ - if (err != TDB_SUCCESS) - printf("%s:%s\n", tdb_errorstr(err), why); - else - printf("%s\n", why); -} - -static void create_tdb(const char *tdbname) -{ - union tdb_attribute log_attr; - log_attr.base.attr = TDB_ATTRIBUTE_LOG; - log_attr.base.next = NULL; - log_attr.log.fn = tdb_log; - - if (tdb) tdb_close(tdb); - tdb = tdb_open(tdbname, (disable_mmap?TDB_NOMMAP:0), - O_RDWR | O_CREAT | O_TRUNC, 0600, &log_attr); - if (!tdb) { - printf("Could not create %s: %s\n", tdbname, strerror(errno)); - } -} - -static void open_tdb(const char *tdbname) -{ - union tdb_attribute log_attr; - log_attr.base.attr = TDB_ATTRIBUTE_LOG; - log_attr.base.next = NULL; - log_attr.log.fn = tdb_log; - - if (tdb) tdb_close(tdb); - tdb = tdb_open(tdbname, disable_mmap?TDB_NOMMAP:0, O_RDWR, 0600, - &log_attr); - if (!tdb) { - printf("Could not open %s: %s\n", tdbname, strerror(errno)); - } -} - -static void insert_tdb(char *keyname, size_t keylen, char* data, size_t datalen) -{ - TDB_DATA key, dbuf; - enum TDB_ERROR ecode; - - if ((keyname == NULL) || (keylen == 0)) { - terror(TDB_SUCCESS, "need key"); - return; - } - - key.dptr = (unsigned char *)keyname; - key.dsize = keylen; - dbuf.dptr = (unsigned char *)data; - dbuf.dsize = datalen; - - ecode = tdb_store(tdb, key, dbuf, TDB_INSERT); - if (ecode) { - terror(ecode, "insert failed"); - } -} - -static void store_tdb(char *keyname, size_t keylen, char* data, size_t datalen) -{ - TDB_DATA key, dbuf; - enum TDB_ERROR ecode; - - if ((keyname == NULL) || (keylen == 0)) { - terror(TDB_SUCCESS, "need key"); - return; - } - - if ((data == NULL) || (datalen == 0)) { - terror(TDB_SUCCESS, "need data"); - return; - } - - key.dptr = (unsigned char *)keyname; - key.dsize = keylen; - dbuf.dptr = (unsigned char *)data; - dbuf.dsize = datalen; - - printf("Storing key:\n"); - print_rec(tdb, key, dbuf, NULL); - - ecode = tdb_store(tdb, key, dbuf, TDB_REPLACE); - if (ecode) { - terror(ecode, "store failed"); - } -} - -static void show_tdb(char *keyname, size_t keylen) -{ - TDB_DATA key, 
dbuf; - enum TDB_ERROR ecode; - - if ((keyname == NULL) || (keylen == 0)) { - terror(TDB_SUCCESS, "need key"); - return; - } - - key.dptr = (unsigned char *)keyname; - key.dsize = keylen; - - ecode = tdb_fetch(tdb, key, &dbuf); - if (ecode) { - terror(ecode, "fetch failed"); - return; - } - - print_rec(tdb, key, dbuf, NULL); - - free( dbuf.dptr ); -} - -static void delete_tdb(char *keyname, size_t keylen) -{ - TDB_DATA key; - enum TDB_ERROR ecode; - - if ((keyname == NULL) || (keylen == 0)) { - terror(TDB_SUCCESS, "need key"); - return; - } - - key.dptr = (unsigned char *)keyname; - key.dsize = keylen; - - ecode = tdb_delete(tdb, key); - if (ecode) { - terror(ecode, "delete failed"); - } -} - -static void move_rec(char *keyname, size_t keylen, char* tdbname) -{ - TDB_DATA key, dbuf; - struct tdb_context *dst_tdb; - enum TDB_ERROR ecode; - - if ((keyname == NULL) || (keylen == 0)) { - terror(TDB_SUCCESS, "need key"); - return; - } - - if ( !tdbname ) { - terror(TDB_SUCCESS, "need destination tdb name"); - return; - } - - key.dptr = (unsigned char *)keyname; - key.dsize = keylen; - - ecode = tdb_fetch(tdb, key, &dbuf); - if (ecode) { - terror(ecode, "fetch failed"); - return; - } - - print_rec(tdb, key, dbuf, NULL); - - dst_tdb = tdb_open(tdbname, 0, O_RDWR, 0600, NULL); - if ( !dst_tdb ) { - terror(TDB_SUCCESS, "unable to open destination tdb"); - return; - } - - ecode = tdb_store( dst_tdb, key, dbuf, TDB_REPLACE); - if (ecode) - terror(ecode, "failed to move record"); - else - printf("record moved\n"); - - tdb_close( dst_tdb ); -} - -static int print_rec(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state) -{ - printf("\nkey %d bytes\n", (int)key.dsize); - print_asc((const char *)key.dptr, key.dsize); - printf("\ndata %d bytes\n", (int)dbuf.dsize); - print_data((const char *)dbuf.dptr, dbuf.dsize); - return 0; -} - -static int print_key(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state) -{ - printf("key %d bytes: ", (int)key.dsize); - print_asc((const char *)key.dptr, key.dsize); - printf("\n"); - return 0; -} - -static int print_hexkey(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state) -{ - printf("key %d bytes\n", (int)key.dsize); - print_data((const char *)key.dptr, key.dsize); - printf("\n"); - return 0; -} - -static int total_bytes; - -static int traverse_fn(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state) -{ - total_bytes += dbuf.dsize; - return 0; -} - -static void info_tdb(void) -{ - enum TDB_ERROR ecode; - char *summary; - - ecode = tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &summary); - - if (ecode) { - terror(ecode, "Getting summary"); - } else { - printf("%s", summary); - free(summary); - } -} - -static void speed_tdb(const char *tlimit) -{ - unsigned timelimit = tlimit?atoi(tlimit):0; - double t; - int ops; - if (timelimit == 0) timelimit = 5; - - ops = 0; - printf("Testing store speed for %u seconds\n", timelimit); - _start_timer(); - do { - long int r = random(); - TDB_DATA key, dbuf; - key = tdb_mkdata("store test", strlen("store test")); - dbuf.dptr = (unsigned char *)&r; - dbuf.dsize = sizeof(r); - tdb_store(tdb, key, dbuf, TDB_REPLACE); - t = _end_timer(); - ops++; - } while (t < timelimit); - printf("%10.3f ops/sec\n", ops/t); - - ops = 0; - printf("Testing fetch speed for %u seconds\n", timelimit); - _start_timer(); - do { - long int r = random(); - TDB_DATA key, dbuf; - key = tdb_mkdata("store test", strlen("store test")); - dbuf.dptr = (unsigned char *)&r; - dbuf.dsize = sizeof(r); - 
tdb_fetch(tdb, key, &dbuf); - t = _end_timer(); - ops++; - } while (t < timelimit); - printf("%10.3f ops/sec\n", ops/t); - - ops = 0; - printf("Testing transaction speed for %u seconds\n", timelimit); - _start_timer(); - do { - long int r = random(); - TDB_DATA key, dbuf; - key = tdb_mkdata("transaction test", strlen("transaction test")); - dbuf.dptr = (unsigned char *)&r; - dbuf.dsize = sizeof(r); - tdb_transaction_start(tdb); - tdb_store(tdb, key, dbuf, TDB_REPLACE); - tdb_transaction_commit(tdb); - t = _end_timer(); - ops++; - } while (t < timelimit); - printf("%10.3f ops/sec\n", ops/t); - - ops = 0; - printf("Testing traverse speed for %u seconds\n", timelimit); - _start_timer(); - do { - tdb_traverse(tdb, traverse_fn, NULL); - t = _end_timer(); - ops++; - } while (t < timelimit); - printf("%10.3f ops/sec\n", ops/t); -} - -static void toggle_mmap(void) -{ - disable_mmap = !disable_mmap; - if (disable_mmap) { - printf("mmap is disabled\n"); - } else { - printf("mmap is enabled\n"); - } -} - -static char *tdb_getline(const char *prompt) -{ - static char thisline[1024]; - char *p; - fputs(prompt, stdout); - thisline[0] = 0; - p = fgets(thisline, sizeof(thisline)-1, stdin); - if (p) p = strchr(p, '\n'); - if (p) *p = 0; - return p?thisline:NULL; -} - -static int do_delete_fn(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, - void *state) -{ - return tdb_delete(the_tdb, key); -} - -static void first_record(struct tdb_context *the_tdb, TDB_DATA *pkey) -{ - TDB_DATA dbuf; - enum TDB_ERROR ecode; - ecode = tdb_firstkey(the_tdb, pkey); - if (!ecode) - ecode = tdb_fetch(the_tdb, *pkey, &dbuf); - if (ecode) terror(ecode, "fetch failed"); - else { - print_rec(the_tdb, *pkey, dbuf, NULL); - } -} - -static void next_record(struct tdb_context *the_tdb, TDB_DATA *pkey) -{ - TDB_DATA dbuf; - enum TDB_ERROR ecode; - ecode = tdb_nextkey(the_tdb, pkey); - - if (!ecode) - ecode = tdb_fetch(the_tdb, *pkey, &dbuf); - if (ecode) - terror(ecode, "fetch failed"); - else - print_rec(the_tdb, *pkey, dbuf, NULL); -} - -static void check_db(struct tdb_context *the_tdb) -{ - if (!the_tdb) { - printf("Error: No database opened!\n"); - } else { - if (tdb_check(the_tdb, NULL, NULL) != 0) - printf("Integrity check for the opened database failed.\n"); - else - printf("Database integrity is OK.\n"); - } -} - -static int do_command(void) -{ - COMMAND_TABLE *ctp = cmd_table; - enum commands mycmd = CMD_HELP; - int cmd_len; - - if (cmdname && strlen(cmdname) == 0) { - mycmd = CMD_NEXT; - } else { - while (ctp->name) { - cmd_len = strlen(ctp->name); - if (strncmp(ctp->name,cmdname,cmd_len) == 0) { - mycmd = ctp->cmd; - break; - } - ctp++; - } - } - - switch (mycmd) { - case CMD_CREATE_TDB: - bIterate = 0; - create_tdb(arg1); - return 0; - case CMD_OPEN_TDB: - bIterate = 0; - open_tdb(arg1); - return 0; - case CMD_SYSTEM: - /* Shell command */ - if (system(arg1) == -1) { - terror(TDB_SUCCESS, "system() call failed\n"); - } - return 0; - case CMD_QUIT: - return 1; - default: - /* all the rest require a open database */ - if (!tdb) { - bIterate = 0; - terror(TDB_SUCCESS, "database not open"); - help(); - return 0; - } - switch (mycmd) { - case CMD_TRANSACTION_START: - bIterate = 0; - tdb_transaction_start(tdb); - return 0; - case CMD_TRANSACTION_COMMIT: - bIterate = 0; - tdb_transaction_commit(tdb); - return 0; - case CMD_TRANSACTION_CANCEL: - bIterate = 0; - tdb_transaction_cancel(tdb); - return 0; - case CMD_ERASE: - bIterate = 0; - tdb_traverse(tdb, do_delete_fn, NULL); - return 0; - case CMD_DUMP: - bIterate = 0; - 
tdb_traverse(tdb, print_rec, NULL); - return 0; - case CMD_INSERT: - bIterate = 0; - insert_tdb(arg1, arg1len,arg2,arg2len); - return 0; - case CMD_MOVE: - bIterate = 0; - move_rec(arg1,arg1len,arg2); - return 0; - case CMD_STORE: - bIterate = 0; - store_tdb(arg1,arg1len,arg2,arg2len); - return 0; - case CMD_SHOW: - bIterate = 0; - show_tdb(arg1, arg1len); - return 0; - case CMD_KEYS: - tdb_traverse(tdb, print_key, NULL); - return 0; - case CMD_HEXKEYS: - tdb_traverse(tdb, print_hexkey, NULL); - return 0; - case CMD_DELETE: - bIterate = 0; - delete_tdb(arg1,arg1len); - return 0; -#if 0 - case CMD_LIST_HASH_FREE: - tdb_dump_all(tdb); - return 0; - case CMD_LIST_FREE: - tdb_printfreelist(tdb); - return 0; -#endif - case CMD_INFO: - info_tdb(); - return 0; - case CMD_SPEED: - speed_tdb(arg1); - return 0; - case CMD_MMAP: - toggle_mmap(); - return 0; - case CMD_FIRST: - bIterate = 1; - first_record(tdb, &iterate_kbuf); - return 0; - case CMD_NEXT: - if (bIterate) - next_record(tdb, &iterate_kbuf); - return 0; - case CMD_CHECK: - check_db(tdb); - return 0; - case CMD_HELP: - help(); - return 0; - case CMD_CREATE_TDB: - case CMD_OPEN_TDB: - case CMD_SYSTEM: - case CMD_QUIT: - /* - * unhandled commands. cases included here to avoid compiler - * warnings. - */ - return 0; - } - } - - return 0; -} - -static char *convert_string(char *instring, size_t *sizep) -{ - size_t length = 0; - char *outp, *inp; - char temp[3]; - - outp = inp = instring; - - while (*inp) { - if (*inp == '\\') { - inp++; - if (*inp && strchr("0123456789abcdefABCDEF",(int)*inp)) { - temp[0] = *inp++; - temp[1] = '\0'; - if (*inp && strchr("0123456789abcdefABCDEF",(int)*inp)) { - temp[1] = *inp++; - temp[2] = '\0'; - } - *outp++ = (char)strtol((const char *)temp,NULL,16); - } else { - *outp++ = *inp++; - } - } else { - *outp++ = *inp++; - } - length++; - } - *sizep = length; - return instring; -} - -int main(int argc, char *argv[]) -{ - cmdname = ""; - arg1 = NULL; - arg1len = 0; - arg2 = NULL; - arg2len = 0; - - if (argv[1]) { - cmdname = "open"; - arg1 = argv[1]; - do_command(); - cmdname = ""; - arg1 = NULL; - } - - switch (argc) { - case 1: - case 2: - /* Interactive mode */ - while ((cmdname = tdb_getline("tdb> "))) { - arg2 = arg1 = NULL; - if ((arg1 = strchr((const char *)cmdname,' ')) != NULL) { - arg1++; - arg2 = arg1; - while (*arg2) { - if (*arg2 == ' ') { - *arg2++ = '\0'; - break; - } - if ((*arg2++ == '\\') && (*arg2 == ' ')) { - arg2++; - } - } - } - if (arg1) arg1 = convert_string(arg1,&arg1len); - if (arg2) arg2 = convert_string(arg2,&arg2len); - if (do_command()) break; - } - break; - case 5: - arg2 = convert_string(argv[4],&arg2len); - case 4: - arg1 = convert_string(argv[3],&arg1len); - case 3: - cmdname = argv[2]; - default: - do_command(); - break; - } - - if (tdb) tdb_close(tdb); - - return 0; -} diff --git a/ccan/tdb2/tools/tdb2torture.c b/ccan/tdb2/tools/tdb2torture.c deleted file mode 100644 index 29ecb6af..00000000 --- a/ccan/tdb2/tools/tdb2torture.c +++ /dev/null @@ -1,498 +0,0 @@ -/* this tests tdb by doing lots of ops from several simultaneous - writers - that stresses the locking code. 
-*/ - -#include "tdb2.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -//#define REOPEN_PROB 30 -#define DELETE_PROB 8 -#define STORE_PROB 4 -#define APPEND_PROB 6 -#define TRANSACTION_PROB 10 -#define TRANSACTION_PREPARE_PROB 2 -#define LOCKSTORE_PROB 5 -#define TRAVERSE_PROB 20 -#define TRAVERSE_MOD_PROB 100 -#define TRAVERSE_ABORT_PROB 500 -#define CULL_PROB 100 -#define KEYLEN 3 -#define DATALEN 100 - -static struct tdb_context *db; -static int in_transaction; -static int in_traverse; -static int error_count; -#if TRANSACTION_PROB -static int always_transaction = 0; -#endif -static int loopnum; -static int count_pipe; -static union tdb_attribute log_attr; -static union tdb_attribute seed_attr; - -static void tdb_log(struct tdb_context *tdb, - enum tdb_log_level level, - enum TDB_ERROR ecode, - const char *message, - void *data) -{ - printf("tdb:%s:%s:%s\n", - tdb_name(tdb), tdb_errorstr(ecode), message); - fflush(stdout); -#if 0 - { - char str[200]; - signal(SIGUSR1, SIG_IGN); - sprintf(str,"xterm -e gdb /proc/%d/exe %d", getpid(), getpid()); - system(str); - } -#endif -} - -#include "../private.h" - -static void segv_handler(int sig, siginfo_t *info, void *p) -{ - char string[100]; - - sprintf(string, "%u: death at %p (map_ptr %p, map_size %zu)\n", - getpid(), info->si_addr, db->file->map_ptr, - (size_t)db->file->map_size); - if (write(2, string, strlen(string)) > 0) - sleep(60); - _exit(11); -} - -static void fatal(struct tdb_context *tdb, const char *why) -{ - fprintf(stderr, "%u:%s:%s\n", getpid(), why, - tdb ? tdb_errorstr(tdb_error(tdb)) : "(no tdb)"); - error_count++; -} - -static char *randbuf(int len) -{ - char *buf; - int i; - buf = (char *)malloc(len+1); - - for (i=0;i. -*/ - -#include "private.h" -#define SAFE_FREE(x) do { if ((x) != NULL) {free((void *)x); (x)=NULL;} } while(0) - -/* - transaction design: - - - only allow a single transaction at a time per database. This makes - using the transaction API simpler, as otherwise the caller would - have to cope with temporary failures in transactions that conflict - with other current transactions - - - keep the transaction recovery information in the same file as the - database, using a special 'transaction recovery' record pointed at - by the header. This removes the need for extra journal files as - used by some other databases - - - dynamically allocated the transaction recover record, re-using it - for subsequent transactions. If a larger record is needed then - tdb_free() the old record to place it on the normal tdb freelist - before allocating the new record - - - during transactions, keep a linked list of writes all that have - been performed by intercepting all tdb_write() calls. The hooked - transaction versions of tdb_read() and tdb_write() check this - linked list and try to use the elements of the list in preference - to the real database. - - - don't allow any locks to be held when a transaction starts, - otherwise we can end up with deadlock (plus lack of lock nesting - in POSIX locks would mean the lock is lost) - - - if the caller gains a lock during the transaction but doesn't - release it then fail the commit - - - allow for nested calls to tdb_transaction_start(), re-using the - existing transaction record. 
If the inner transaction is canceled - then a subsequent commit will fail - - - keep a mirrored copy of the tdb hash chain heads to allow for the - fast hash heads scan on traverse, updating the mirrored copy in - the transaction version of tdb_write - - - allow callers to mix transaction and non-transaction use of tdb, - although once a transaction is started then an exclusive lock is - gained until the transaction is committed or canceled - - - the commit stategy involves first saving away all modified data - into a linearised buffer in the transaction recovery area, then - marking the transaction recovery area with a magic value to - indicate a valid recovery record. In total 4 fsync/msync calls are - needed per commit to prevent race conditions. It might be possible - to reduce this to 3 or even 2 with some more work. - - - check for a valid recovery record on open of the tdb, while the - open lock is held. Automatically recover from the transaction - recovery area if needed, then continue with the open as - usual. This allows for smooth crash recovery with no administrator - intervention. - - - if TDB_NOSYNC is passed to flags in tdb_open then transactions are - still available, but no transaction recovery area is used and no - fsync/msync calls are made. -*/ - -/* - hold the context of any current transaction -*/ -struct tdb_transaction { - /* the original io methods - used to do IOs to the real db */ - const struct tdb_methods *io_methods; - - /* the list of transaction blocks. When a block is first - written to, it gets created in this list */ - uint8_t **blocks; - size_t num_blocks; - size_t last_block_size; /* number of valid bytes in the last block */ - - /* non-zero when an internal transaction error has - occurred. All write operations will then fail until the - transaction is ended */ - int transaction_error; - - /* when inside a transaction we need to keep track of any - nested tdb_transaction_start() calls, as these are allowed, - but don't create a new transaction */ - unsigned int nesting; - - /* set when a prepare has already occurred */ - bool prepared; - tdb_off_t magic_offset; - - /* old file size before transaction */ - tdb_len_t old_map_size; -}; - -/* This doesn't really need to be pagesize, but we use it for similar reasons. */ -#define PAGESIZE 65536 - -/* - read while in a transaction. We need to check first if the data is in our list - of transaction elements, then if not do a real read -*/ -static enum TDB_ERROR transaction_read(struct tdb_context *tdb, tdb_off_t off, - void *buf, tdb_len_t len) -{ - size_t blk; - enum TDB_ERROR ecode; - - /* break it down into block sized ops */ - while (len + (off % PAGESIZE) > PAGESIZE) { - tdb_len_t len2 = PAGESIZE - (off % PAGESIZE); - ecode = transaction_read(tdb, off, buf, len2); - if (ecode != TDB_SUCCESS) { - return ecode; - } - len -= len2; - off += len2; - buf = (void *)(len2 + (char *)buf); - } - - if (len == 0) { - return TDB_SUCCESS; - } - - blk = off / PAGESIZE; - - /* see if we have it in the block list */ - if (tdb->tdb2.transaction->num_blocks <= blk || - tdb->tdb2.transaction->blocks[blk] == NULL) { - /* nope, do a real read */ - ecode = tdb->tdb2.transaction->io_methods->tread(tdb, off, buf, len); - if (ecode != TDB_SUCCESS) { - goto fail; - } - return 0; - } - - /* it is in the block list. 
Now check for the last block */ - if (blk == tdb->tdb2.transaction->num_blocks-1) { - if (len > tdb->tdb2.transaction->last_block_size) { - ecode = TDB_ERR_IO; - goto fail; - } - } - - /* now copy it out of this block */ - memcpy(buf, tdb->tdb2.transaction->blocks[blk] + (off % PAGESIZE), len); - return TDB_SUCCESS; - -fail: - tdb->tdb2.transaction->transaction_error = 1; - return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "transaction_read: failed at off=%zu len=%zu", - (size_t)off, (size_t)len); -} - - -/* - write while in a transaction -*/ -static enum TDB_ERROR transaction_write(struct tdb_context *tdb, tdb_off_t off, - const void *buf, tdb_len_t len) -{ - size_t blk; - enum TDB_ERROR ecode; - - /* Only a commit is allowed on a prepared transaction */ - if (tdb->tdb2.transaction->prepared) { - ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR, - "transaction_write: transaction already" - " prepared, write not allowed"); - goto fail; - } - - /* break it up into block sized chunks */ - while (len + (off % PAGESIZE) > PAGESIZE) { - tdb_len_t len2 = PAGESIZE - (off % PAGESIZE); - ecode = transaction_write(tdb, off, buf, len2); - if (ecode != TDB_SUCCESS) { - return ecode; - } - len -= len2; - off += len2; - if (buf != NULL) { - buf = (const void *)(len2 + (const char *)buf); - } - } - - if (len == 0) { - return TDB_SUCCESS; - } - - blk = off / PAGESIZE; - off = off % PAGESIZE; - - if (tdb->tdb2.transaction->num_blocks <= blk) { - uint8_t **new_blocks; - /* expand the blocks array */ - if (tdb->tdb2.transaction->blocks == NULL) { - new_blocks = (uint8_t **)malloc( - (blk+1)*sizeof(uint8_t *)); - } else { - new_blocks = (uint8_t **)realloc( - tdb->tdb2.transaction->blocks, - (blk+1)*sizeof(uint8_t *)); - } - if (new_blocks == NULL) { - ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "transaction_write:" - " failed to allocate"); - goto fail; - } - memset(&new_blocks[tdb->tdb2.transaction->num_blocks], 0, - (1+(blk - tdb->tdb2.transaction->num_blocks))*sizeof(uint8_t *)); - tdb->tdb2.transaction->blocks = new_blocks; - tdb->tdb2.transaction->num_blocks = blk+1; - tdb->tdb2.transaction->last_block_size = 0; - } - - /* allocate and fill a block? 
*/ - if (tdb->tdb2.transaction->blocks[blk] == NULL) { - tdb->tdb2.transaction->blocks[blk] = (uint8_t *)calloc(PAGESIZE, 1); - if (tdb->tdb2.transaction->blocks[blk] == NULL) { - ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "transaction_write:" - " failed to allocate"); - goto fail; - } - if (tdb->tdb2.transaction->old_map_size > blk * PAGESIZE) { - tdb_len_t len2 = PAGESIZE; - if (len2 + (blk * PAGESIZE) > tdb->tdb2.transaction->old_map_size) { - len2 = tdb->tdb2.transaction->old_map_size - (blk * PAGESIZE); - } - ecode = tdb->tdb2.transaction->io_methods->tread(tdb, - blk * PAGESIZE, - tdb->tdb2.transaction->blocks[blk], - len2); - if (ecode != TDB_SUCCESS) { - ecode = tdb_logerr(tdb, ecode, - TDB_LOG_ERROR, - "transaction_write:" - " failed to" - " read old block: %s", - strerror(errno)); - SAFE_FREE(tdb->tdb2.transaction->blocks[blk]); - goto fail; - } - if (blk == tdb->tdb2.transaction->num_blocks-1) { - tdb->tdb2.transaction->last_block_size = len2; - } - } - } - - /* overwrite part of an existing block */ - if (buf == NULL) { - memset(tdb->tdb2.transaction->blocks[blk] + off, 0, len); - } else { - memcpy(tdb->tdb2.transaction->blocks[blk] + off, buf, len); - } - if (blk == tdb->tdb2.transaction->num_blocks-1) { - if (len + off > tdb->tdb2.transaction->last_block_size) { - tdb->tdb2.transaction->last_block_size = len + off; - } - } - - return TDB_SUCCESS; - -fail: - tdb->tdb2.transaction->transaction_error = 1; - return ecode; -} - - -/* - write while in a transaction - this variant never expands the transaction blocks, it only - updates existing blocks. This means it cannot change the recovery size -*/ -static void transaction_write_existing(struct tdb_context *tdb, tdb_off_t off, - const void *buf, tdb_len_t len) -{ - size_t blk; - - /* break it up into block sized chunks */ - while (len + (off % PAGESIZE) > PAGESIZE) { - tdb_len_t len2 = PAGESIZE - (off % PAGESIZE); - transaction_write_existing(tdb, off, buf, len2); - len -= len2; - off += len2; - if (buf != NULL) { - buf = (const void *)(len2 + (const char *)buf); - } - } - - if (len == 0) { - return; - } - - blk = off / PAGESIZE; - off = off % PAGESIZE; - - if (tdb->tdb2.transaction->num_blocks <= blk || - tdb->tdb2.transaction->blocks[blk] == NULL) { - return; - } - - if (blk == tdb->tdb2.transaction->num_blocks-1 && - off + len > tdb->tdb2.transaction->last_block_size) { - if (off >= tdb->tdb2.transaction->last_block_size) { - return; - } - len = tdb->tdb2.transaction->last_block_size - off; - } - - /* overwrite part of an existing block */ - memcpy(tdb->tdb2.transaction->blocks[blk] + off, buf, len); -} - - -/* - out of bounds check during a transaction -*/ -static enum TDB_ERROR transaction_oob(struct tdb_context *tdb, - tdb_off_t off, tdb_len_t len, bool probe) -{ - if ((off + len >= off && off + len <= tdb->file->map_size) || probe) { - return TDB_SUCCESS; - } - - tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_oob len %lld beyond transaction size %lld", - (long long)(off + len), - (long long)tdb->file->map_size); - return TDB_ERR_IO; -} - -/* - transaction version of tdb_expand(). 
-*/ -static enum TDB_ERROR transaction_expand_file(struct tdb_context *tdb, - tdb_off_t addition) -{ - enum TDB_ERROR ecode; - - /* add a write to the transaction elements, so subsequent - reads see the zero data */ - ecode = transaction_write(tdb, tdb->file->map_size, NULL, addition); - if (ecode == TDB_SUCCESS) { - tdb->file->map_size += addition; - } - return ecode; -} - -static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off, - size_t len, bool write_mode) -{ - size_t blk = off / PAGESIZE, end_blk; - - /* This is wrong for zero-length blocks, but will fail gracefully */ - end_blk = (off + len - 1) / PAGESIZE; - - /* Can only do direct if in single block and we've already copied. */ - if (write_mode) { - tdb->stats.transaction_write_direct++; - if (blk != end_blk - || blk >= tdb->tdb2.transaction->num_blocks - || tdb->tdb2.transaction->blocks[blk] == NULL) { - tdb->stats.transaction_write_direct_fail++; - return NULL; - } - return tdb->tdb2.transaction->blocks[blk] + off % PAGESIZE; - } - - tdb->stats.transaction_read_direct++; - /* Single which we have copied? */ - if (blk == end_blk - && blk < tdb->tdb2.transaction->num_blocks - && tdb->tdb2.transaction->blocks[blk]) - return tdb->tdb2.transaction->blocks[blk] + off % PAGESIZE; - - /* Otherwise must be all not copied. */ - while (blk <= end_blk) { - if (blk >= tdb->tdb2.transaction->num_blocks) - break; - if (tdb->tdb2.transaction->blocks[blk]) { - tdb->stats.transaction_read_direct_fail++; - return NULL; - } - blk++; - } - return tdb->tdb2.transaction->io_methods->direct(tdb, off, len, false); -} - -static const struct tdb_methods transaction_methods = { - transaction_read, - transaction_write, - transaction_oob, - transaction_expand_file, - transaction_direct, -}; - -/* - sync to disk -*/ -static enum TDB_ERROR transaction_sync(struct tdb_context *tdb, - tdb_off_t offset, tdb_len_t length) -{ - if (tdb->flags & TDB_NOSYNC) { - return TDB_SUCCESS; - } - - if (fsync(tdb->file->fd) != 0) { - return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_transaction: fsync failed: %s", - strerror(errno)); - } -#ifdef MS_SYNC - if (tdb->file->map_ptr) { - tdb_off_t moffset = offset & ~(getpagesize()-1); - if (msync(moffset + (char *)tdb->file->map_ptr, - length + (offset - moffset), MS_SYNC) != 0) { - return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR, - "tdb_transaction: msync failed: %s", - strerror(errno)); - } - } -#endif - return TDB_SUCCESS; -} - - -static void _tdb_transaction_cancel(struct tdb_context *tdb) -{ - int i; - enum TDB_ERROR ecode; - - if (tdb->tdb2.transaction == NULL) { - tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, - "tdb_transaction_cancel: no transaction"); - return; - } - - if (tdb->tdb2.transaction->nesting != 0) { - tdb->tdb2.transaction->transaction_error = 1; - tdb->tdb2.transaction->nesting--; - return; - } - - tdb->file->map_size = tdb->tdb2.transaction->old_map_size; - - /* free all the transaction blocks */ - for (i=0;itdb2.transaction->num_blocks;i++) { - if (tdb->tdb2.transaction->blocks[i] != NULL) { - free(tdb->tdb2.transaction->blocks[i]); - } - } - SAFE_FREE(tdb->tdb2.transaction->blocks); - - if (tdb->tdb2.transaction->magic_offset) { - const struct tdb_methods *methods = tdb->tdb2.transaction->io_methods; - uint64_t invalid = TDB_RECOVERY_INVALID_MAGIC; - - /* remove the recovery marker */ - ecode = methods->twrite(tdb, tdb->tdb2.transaction->magic_offset, - &invalid, sizeof(invalid)); - if (ecode == TDB_SUCCESS) - ecode = transaction_sync(tdb, - tdb->tdb2.transaction->magic_offset, 
- sizeof(invalid)); - if (ecode != TDB_SUCCESS) { - tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_transaction_cancel: failed to remove" - " recovery magic"); - } - } - - if (tdb->file->allrecord_lock.count) - tdb_allrecord_unlock(tdb, tdb->file->allrecord_lock.ltype); - - /* restore the normal io methods */ - tdb->tdb2.io = tdb->tdb2.transaction->io_methods; - - tdb_transaction_unlock(tdb, F_WRLCK); - - if (tdb_has_open_lock(tdb)) - tdb_unlock_open(tdb, F_WRLCK); - - SAFE_FREE(tdb->tdb2.transaction); -} - -/* - start a tdb transaction. No token is returned, as only a single - transaction is allowed to be pending per tdb_context -*/ -enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb) -{ - enum TDB_ERROR ecode; - - if (tdb->flags & TDB_VERSION1) { - if (tdb1_transaction_start(tdb) == -1) - return tdb->last_error; - return TDB_SUCCESS; - } - - tdb->stats.transactions++; - /* some sanity checks */ - if (tdb->flags & TDB_INTERNAL) { - return tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_transaction_start:" - " cannot start a" - " transaction on an" - " internal tdb"); - } - - if (tdb->flags & TDB_RDONLY) { - return tdb->last_error = tdb_logerr(tdb, TDB_ERR_RDONLY, - TDB_LOG_USE_ERROR, - "tdb_transaction_start:" - " cannot start a" - " transaction on a " - " read-only tdb"); - } - - /* cope with nested tdb_transaction_start() calls */ - if (tdb->tdb2.transaction != NULL) { - if (!(tdb->flags & TDB_ALLOW_NESTING)) { - return tdb->last_error - = tdb_logerr(tdb, TDB_ERR_IO, - TDB_LOG_USE_ERROR, - "tdb_transaction_start:" - " already inside transaction"); - } - tdb->tdb2.transaction->nesting++; - tdb->stats.transaction_nest++; - return 0; - } - - if (tdb_has_hash_locks(tdb)) { - /* the caller must not have any locks when starting a - transaction as otherwise we'll be screwed by lack - of nested locks in POSIX */ - return tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, - TDB_LOG_USE_ERROR, - "tdb_transaction_start:" - " cannot start a" - " transaction with locks" - " held"); - } - - tdb->tdb2.transaction = (struct tdb_transaction *) - calloc(sizeof(struct tdb_transaction), 1); - if (tdb->tdb2.transaction == NULL) { - return tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, - TDB_LOG_ERROR, - "tdb_transaction_start:" - " cannot allocate"); - } - - /* get the transaction write lock. This is a blocking lock. As - discussed with Volker, there are a number of ways we could - make this async, which we will probably do in the future */ - ecode = tdb_transaction_lock(tdb, F_WRLCK); - if (ecode != TDB_SUCCESS) { - SAFE_FREE(tdb->tdb2.transaction->blocks); - SAFE_FREE(tdb->tdb2.transaction); - return tdb->last_error = ecode; - } - - /* get a read lock over entire file. 
This is upgraded to a write - lock during the commit */ - ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true); - if (ecode != TDB_SUCCESS) { - goto fail_allrecord_lock; - } - - /* make sure we know about any file expansions already done by - anyone else */ - tdb->tdb2.io->oob(tdb, tdb->file->map_size, 1, true); - tdb->tdb2.transaction->old_map_size = tdb->file->map_size; - - /* finally hook the io methods, replacing them with - transaction specific methods */ - tdb->tdb2.transaction->io_methods = tdb->tdb2.io; - tdb->tdb2.io = &transaction_methods; - return tdb->last_error = TDB_SUCCESS; - -fail_allrecord_lock: - tdb_transaction_unlock(tdb, F_WRLCK); - SAFE_FREE(tdb->tdb2.transaction->blocks); - SAFE_FREE(tdb->tdb2.transaction); - return tdb->last_error = ecode; -} - - -/* - cancel the current transaction -*/ -void tdb_transaction_cancel(struct tdb_context *tdb) -{ - if (tdb->flags & TDB_VERSION1) { - tdb1_transaction_cancel(tdb); - return; - } - tdb->stats.transaction_cancel++; - _tdb_transaction_cancel(tdb); -} - -/* - work out how much space the linearised recovery data will consume (worst case) -*/ -static tdb_len_t tdb_recovery_size(struct tdb_context *tdb) -{ - tdb_len_t recovery_size = 0; - int i; - - recovery_size = 0; - for (i=0;itdb2.transaction->num_blocks;i++) { - if (i * PAGESIZE >= tdb->tdb2.transaction->old_map_size) { - break; - } - if (tdb->tdb2.transaction->blocks[i] == NULL) { - continue; - } - recovery_size += 2*sizeof(tdb_off_t); - if (i == tdb->tdb2.transaction->num_blocks-1) { - recovery_size += tdb->tdb2.transaction->last_block_size; - } else { - recovery_size += PAGESIZE; - } - } - - return recovery_size; -} - -static enum TDB_ERROR tdb_recovery_area(struct tdb_context *tdb, - const struct tdb_methods *methods, - tdb_off_t *recovery_offset, - struct tdb_recovery_record *rec) -{ - enum TDB_ERROR ecode; - - *recovery_offset = tdb_read_off(tdb, - offsetof(struct tdb_header, recovery)); - if (TDB_OFF_IS_ERR(*recovery_offset)) { - return TDB_OFF_TO_ERR(*recovery_offset); - } - - if (*recovery_offset == 0) { - rec->max_len = 0; - return TDB_SUCCESS; - } - - ecode = methods->tread(tdb, *recovery_offset, rec, sizeof(*rec)); - if (ecode != TDB_SUCCESS) - return ecode; - - tdb_convert(tdb, rec, sizeof(*rec)); - /* ignore invalid recovery regions: can happen in crash */ - if (rec->magic != TDB_RECOVERY_MAGIC && - rec->magic != TDB_RECOVERY_INVALID_MAGIC) { - *recovery_offset = 0; - rec->max_len = 0; - } - return TDB_SUCCESS; -} - -static unsigned int same(const unsigned char *new, - const unsigned char *old, - unsigned int length) -{ - unsigned int i; - - for (i = 0; i < length; i++) { - if (new[i] != old[i]) - break; - } - return i; -} - -static unsigned int different(const unsigned char *new, - const unsigned char *old, - unsigned int length, - unsigned int min_same, - unsigned int *samelen) -{ - unsigned int i; - - *samelen = 0; - for (i = 0; i < length; i++) { - if (new[i] == old[i]) { - (*samelen)++; - } else { - if (*samelen >= min_same) { - return i - *samelen; - } - *samelen = 0; - } - } - - if (*samelen < min_same) - *samelen = 0; - return length - *samelen; -} - -/* Allocates recovery blob, without tdb_recovery_record at head set up. 
*/ -static struct tdb_recovery_record *alloc_recovery(struct tdb_context *tdb, - tdb_len_t *len) -{ - struct tdb_recovery_record *rec; - size_t i; - enum TDB_ERROR ecode; - unsigned char *p; - const struct tdb_methods *old_methods = tdb->tdb2.io; - - rec = malloc(sizeof(*rec) + tdb_recovery_size(tdb)); - if (!rec) { - tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "transaction_setup_recovery:" - " cannot allocate"); - return TDB_ERR_PTR(TDB_ERR_OOM); - } - - /* We temporarily revert to the old I/O methods, so we can use - * tdb_access_read */ - tdb->tdb2.io = tdb->tdb2.transaction->io_methods; - - /* build the recovery data into a single blob to allow us to do a single - large write, which should be more efficient */ - p = (unsigned char *)(rec + 1); - for (i=0;itdb2.transaction->num_blocks;i++) { - tdb_off_t offset; - tdb_len_t length; - unsigned int off; - const unsigned char *buffer; - - if (tdb->tdb2.transaction->blocks[i] == NULL) { - continue; - } - - offset = i * PAGESIZE; - length = PAGESIZE; - if (i == tdb->tdb2.transaction->num_blocks-1) { - length = tdb->tdb2.transaction->last_block_size; - } - - if (offset >= tdb->tdb2.transaction->old_map_size) { - continue; - } - - if (offset + length > tdb->file->map_size) { - ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_transaction_setup_recovery:" - " transaction data over new region" - " boundary"); - goto fail; - } - if (offset + length > tdb->tdb2.transaction->old_map_size) { - /* Short read at EOF. */ - length = tdb->tdb2.transaction->old_map_size - offset; - } - buffer = tdb_access_read(tdb, offset, length, false); - if (TDB_PTR_IS_ERR(buffer)) { - ecode = TDB_PTR_ERR(buffer); - goto fail; - } - - /* Skip over anything the same at the start. */ - off = same(tdb->tdb2.transaction->blocks[i], buffer, length); - offset += off; - - while (off < length) { - tdb_len_t len; - unsigned int samelen; - - len = different(tdb->tdb2.transaction->blocks[i] + off, - buffer + off, length - off, - sizeof(offset) + sizeof(len) + 1, - &samelen); - - memcpy(p, &offset, sizeof(offset)); - memcpy(p + sizeof(offset), &len, sizeof(len)); - tdb_convert(tdb, p, sizeof(offset) + sizeof(len)); - p += sizeof(offset) + sizeof(len); - memcpy(p, buffer + off, len); - p += len; - off += len + samelen; - offset += len + samelen; - } - tdb_access_release(tdb, buffer); - } - - *len = p - (unsigned char *)(rec + 1); - tdb->tdb2.io = old_methods; - return rec; - -fail: - free(rec); - tdb->tdb2.io = old_methods; - return TDB_ERR_PTR(ecode); -} - -static tdb_off_t create_recovery_area(struct tdb_context *tdb, - tdb_len_t rec_length, - struct tdb_recovery_record *rec) -{ - tdb_off_t off, recovery_off; - tdb_len_t addition; - enum TDB_ERROR ecode; - const struct tdb_methods *methods = tdb->tdb2.transaction->io_methods; - - /* round up to a multiple of page size. Overallocate, since each - * such allocation forces us to expand the file. */ - rec->max_len = tdb_expand_adjust(tdb->file->map_size, rec_length); - - /* Round up to a page. */ - rec->max_len = ((sizeof(*rec) + rec->max_len + PAGESIZE-1) - & ~(PAGESIZE-1)) - - sizeof(*rec); - - off = tdb->file->map_size; - - /* Restore ->map_size before calling underlying expand_file. 
- Also so that we don't try to expand the file again in the - transaction commit, which would destroy the recovery - area */ - addition = (tdb->file->map_size - tdb->tdb2.transaction->old_map_size) + - sizeof(*rec) + rec->max_len; - tdb->file->map_size = tdb->tdb2.transaction->old_map_size; - tdb->stats.transaction_expand_file++; - ecode = methods->expand_file(tdb, addition); - if (ecode != TDB_SUCCESS) { - tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_recovery_allocate:" - " failed to create recovery area"); - return TDB_ERR_TO_OFF(ecode); - } - - /* we have to reset the old map size so that we don't try to - expand the file again in the transaction commit, which - would destroy the recovery area */ - tdb->tdb2.transaction->old_map_size = tdb->file->map_size; - - /* write the recovery header offset and sync - we can sync without a race here - as the magic ptr in the recovery record has not been set */ - recovery_off = off; - tdb_convert(tdb, &recovery_off, sizeof(recovery_off)); - ecode = methods->twrite(tdb, offsetof(struct tdb_header, recovery), - &recovery_off, sizeof(tdb_off_t)); - if (ecode != TDB_SUCCESS) { - tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_recovery_allocate:" - " failed to write recovery head"); - return TDB_ERR_TO_OFF(ecode); - } - transaction_write_existing(tdb, offsetof(struct tdb_header, recovery), - &recovery_off, - sizeof(tdb_off_t)); - return off; -} - -/* - setup the recovery data that will be used on a crash during commit -*/ -static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb) -{ - tdb_len_t recovery_size = 0; - tdb_off_t recovery_off = 0; - tdb_off_t old_map_size = tdb->tdb2.transaction->old_map_size; - struct tdb_recovery_record *recovery; - const struct tdb_methods *methods = tdb->tdb2.transaction->io_methods; - uint64_t magic; - enum TDB_ERROR ecode; - - recovery = alloc_recovery(tdb, &recovery_size); - if (TDB_PTR_IS_ERR(recovery)) - return TDB_PTR_ERR(recovery); - - ecode = tdb_recovery_area(tdb, methods, &recovery_off, recovery); - if (ecode) { - free(recovery); - return ecode; - } - - if (recovery->max_len < recovery_size) { - /* Not large enough. Free up old recovery area. */ - if (recovery_off) { - tdb->stats.frees++; - ecode = add_free_record(tdb, recovery_off, - sizeof(*recovery) - + recovery->max_len, - TDB_LOCK_WAIT, true); - free(recovery); - if (ecode != TDB_SUCCESS) { - return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_recovery_allocate:" - " failed to free previous" - " recovery area"); - } - - /* Refresh recovery after add_free_record above. */ - recovery = alloc_recovery(tdb, &recovery_size); - if (TDB_PTR_IS_ERR(recovery)) - return TDB_PTR_ERR(recovery); - } - - recovery_off = create_recovery_area(tdb, recovery_size, - recovery); - if (TDB_OFF_IS_ERR(recovery_off)) { - free(recovery); - return TDB_OFF_TO_ERR(recovery_off); - } - } - - /* Now we know size, convert rec header. 
*/ - recovery->magic = TDB_RECOVERY_INVALID_MAGIC; - recovery->len = recovery_size; - recovery->eof = old_map_size; - tdb_convert(tdb, recovery, sizeof(*recovery)); - - /* write the recovery data to the recovery area */ - ecode = methods->twrite(tdb, recovery_off, recovery, recovery_size); - if (ecode != TDB_SUCCESS) { - free(recovery); - return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_transaction_setup_recovery:" - " failed to write recovery data"); - } - transaction_write_existing(tdb, recovery_off, recovery, recovery_size); - - free(recovery); - - /* as we don't have ordered writes, we have to sync the recovery - data before we update the magic to indicate that the recovery - data is present */ - ecode = transaction_sync(tdb, recovery_off, recovery_size); - if (ecode != TDB_SUCCESS) - return ecode; - - magic = TDB_RECOVERY_MAGIC; - tdb_convert(tdb, &magic, sizeof(magic)); - - tdb->tdb2.transaction->magic_offset - = recovery_off + offsetof(struct tdb_recovery_record, magic); - - ecode = methods->twrite(tdb, tdb->tdb2.transaction->magic_offset, - &magic, sizeof(magic)); - if (ecode != TDB_SUCCESS) { - return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_transaction_setup_recovery:" - " failed to write recovery magic"); - } - transaction_write_existing(tdb, tdb->tdb2.transaction->magic_offset, - &magic, sizeof(magic)); - - /* ensure the recovery magic marker is on disk */ - return transaction_sync(tdb, tdb->tdb2.transaction->magic_offset, - sizeof(magic)); -} - -static enum TDB_ERROR _tdb_transaction_prepare_commit(struct tdb_context *tdb) -{ - const struct tdb_methods *methods; - enum TDB_ERROR ecode; - - if (tdb->tdb2.transaction == NULL) { - return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, - "tdb_transaction_prepare_commit:" - " no transaction"); - } - - if (tdb->tdb2.transaction->prepared) { - _tdb_transaction_cancel(tdb); - return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR, - "tdb_transaction_prepare_commit:" - " transaction already prepared"); - } - - if (tdb->tdb2.transaction->transaction_error) { - _tdb_transaction_cancel(tdb); - return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR, - "tdb_transaction_prepare_commit:" - " transaction error pending"); - } - - - if (tdb->tdb2.transaction->nesting != 0) { - return TDB_SUCCESS; - } - - /* check for a null transaction */ - if (tdb->tdb2.transaction->blocks == NULL) { - return TDB_SUCCESS; - } - - methods = tdb->tdb2.transaction->io_methods; - - /* upgrade the main transaction lock region to a write lock */ - ecode = tdb_allrecord_upgrade(tdb, TDB_HASH_LOCK_START); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - /* get the open lock - this prevents new users attaching to the database - during the commit */ - ecode = tdb_lock_open(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK); - if (ecode != TDB_SUCCESS) { - return ecode; - } - - /* Since we have whole db locked, we don't need the expansion lock. */ - if (!(tdb->flags & TDB_NOSYNC)) { - /* Sets up tdb->tdb2.transaction->recovery and - * tdb->tdb2.transaction->magic_offset. 
*/ - ecode = transaction_setup_recovery(tdb); - if (ecode != TDB_SUCCESS) { - return ecode; - } - } - - tdb->tdb2.transaction->prepared = true; - - /* expand the file to the new size if needed */ - if (tdb->file->map_size != tdb->tdb2.transaction->old_map_size) { - tdb_len_t add; - - add = tdb->file->map_size - tdb->tdb2.transaction->old_map_size; - /* Restore original map size for tdb_expand_file */ - tdb->file->map_size = tdb->tdb2.transaction->old_map_size; - ecode = methods->expand_file(tdb, add); - if (ecode != TDB_SUCCESS) { - return ecode; - } - } - - /* Keep the open lock until the actual commit */ - return TDB_SUCCESS; -} - -/* - prepare to commit the current transaction -*/ -enum TDB_ERROR tdb_transaction_prepare_commit(struct tdb_context *tdb) -{ - if (tdb->flags & TDB_VERSION1) { - if (tdb1_transaction_prepare_commit(tdb) == -1) - return tdb->last_error; - return TDB_SUCCESS; - } - return tdb->last_error = _tdb_transaction_prepare_commit(tdb); -} - -/* - commit the current transaction -*/ -enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb) -{ - const struct tdb_methods *methods; - int i; - enum TDB_ERROR ecode; - - if (tdb->flags & TDB_VERSION1) { - if (tdb1_transaction_commit(tdb) == -1) - return tdb->last_error; - return TDB_SUCCESS; - } - - if (tdb->tdb2.transaction == NULL) { - return tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, - TDB_LOG_USE_ERROR, - "tdb_transaction_commit:" - " no transaction"); - } - - tdb_trace(tdb, "tdb_transaction_commit"); - - if (tdb->tdb2.transaction->nesting != 0) { - tdb->tdb2.transaction->nesting--; - return tdb->last_error = TDB_SUCCESS; - } - - /* check for a null transaction */ - if (tdb->tdb2.transaction->blocks == NULL) { - _tdb_transaction_cancel(tdb); - return tdb->last_error = TDB_SUCCESS; - } - - if (!tdb->tdb2.transaction->prepared) { - ecode = _tdb_transaction_prepare_commit(tdb); - if (ecode != TDB_SUCCESS) { - _tdb_transaction_cancel(tdb); - return tdb->last_error = ecode; - } - } - - methods = tdb->tdb2.transaction->io_methods; - - /* perform all the writes */ - for (i=0;itdb2.transaction->num_blocks;i++) { - tdb_off_t offset; - tdb_len_t length; - - if (tdb->tdb2.transaction->blocks[i] == NULL) { - continue; - } - - offset = i * PAGESIZE; - length = PAGESIZE; - if (i == tdb->tdb2.transaction->num_blocks-1) { - length = tdb->tdb2.transaction->last_block_size; - } - - ecode = methods->twrite(tdb, offset, - tdb->tdb2.transaction->blocks[i], length); - if (ecode != TDB_SUCCESS) { - /* we've overwritten part of the data and - possibly expanded the file, so we need to - run the crash recovery code */ - tdb->tdb2.io = methods; - tdb_transaction_recover(tdb); - - _tdb_transaction_cancel(tdb); - - return tdb->last_error = ecode; - } - SAFE_FREE(tdb->tdb2.transaction->blocks[i]); - } - - SAFE_FREE(tdb->tdb2.transaction->blocks); - tdb->tdb2.transaction->num_blocks = 0; - - /* ensure the new data is on disk */ - ecode = transaction_sync(tdb, 0, tdb->file->map_size); - if (ecode != TDB_SUCCESS) { - return tdb->last_error = ecode; - } - - /* - TODO: maybe write to some dummy hdr field, or write to magic - offset without mmap, before the last sync, instead of the - utime() call - */ - - /* on some systems (like Linux 2.6.x) changes via mmap/msync - don't change the mtime of the file, this means the file may - not be backed up (as tdb rounding to block sizes means that - file size changes are quite rare too). 
The following forces - mtime changes when a transaction completes */ -#if HAVE_UTIME - utime(tdb->name, NULL); -#endif - - /* use a transaction cancel to free memory and remove the - transaction locks: it "restores" map_size, too. */ - tdb->tdb2.transaction->old_map_size = tdb->file->map_size; - _tdb_transaction_cancel(tdb); - - return tdb->last_error = TDB_SUCCESS; -} - - -/* - recover from an aborted transaction. Must be called with exclusive - database write access already established (including the open - lock to prevent new processes attaching) -*/ -enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb) -{ - tdb_off_t recovery_head, recovery_eof; - unsigned char *data, *p; - struct tdb_recovery_record rec; - enum TDB_ERROR ecode; - - /* find the recovery area */ - recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery)); - if (TDB_OFF_IS_ERR(recovery_head)) { - ecode = TDB_OFF_TO_ERR(recovery_head); - return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_transaction_recover:" - " failed to read recovery head"); - } - - if (recovery_head == 0) { - /* we have never allocated a recovery record */ - return TDB_SUCCESS; - } - - /* read the recovery record */ - ecode = tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec)); - if (ecode != TDB_SUCCESS) { - return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_transaction_recover:" - " failed to read recovery record"); - } - - if (rec.magic != TDB_RECOVERY_MAGIC) { - /* there is no valid recovery data */ - return TDB_SUCCESS; - } - - if (tdb->flags & TDB_RDONLY) { - return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR, - "tdb_transaction_recover:" - " attempt to recover read only database"); - } - - recovery_eof = rec.eof; - - data = (unsigned char *)malloc(rec.len); - if (data == NULL) { - return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR, - "tdb_transaction_recover:" - " failed to allocate recovery data"); - } - - /* read the full recovery data */ - ecode = tdb->tdb2.io->tread(tdb, recovery_head + sizeof(rec), data, - rec.len); - if (ecode != TDB_SUCCESS) { - return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_transaction_recover:" - " failed to read recovery data"); - } - - /* recover the file data */ - p = data; - while (p+sizeof(tdb_off_t)+sizeof(tdb_len_t) < data + rec.len) { - tdb_off_t ofs; - tdb_len_t len; - tdb_convert(tdb, p, sizeof(ofs) + sizeof(len)); - memcpy(&ofs, p, sizeof(ofs)); - memcpy(&len, p + sizeof(ofs), sizeof(len)); - p += sizeof(ofs) + sizeof(len); - - ecode = tdb->tdb2.io->twrite(tdb, ofs, p, len); - if (ecode != TDB_SUCCESS) { - free(data); - return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_transaction_recover:" - " failed to recover %zu bytes" - " at offset %zu", - (size_t)len, (size_t)ofs); - } - p += len; - } - - free(data); - - ecode = transaction_sync(tdb, 0, tdb->file->map_size); - if (ecode != TDB_SUCCESS) { - return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_transaction_recover:" - " failed to sync recovery"); - } - - /* if the recovery area is after the recovered eof then remove it */ - if (recovery_eof <= recovery_head) { - ecode = tdb_write_off(tdb, offsetof(struct tdb_header, - recovery), - 0); - if (ecode != TDB_SUCCESS) { - return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, - "tdb_transaction_recover:" - " failed to remove recovery head"); - } - } - - /* remove the recovery magic */ - ecode = tdb_write_off(tdb, - recovery_head - + offsetof(struct tdb_recovery_record, magic), - TDB_RECOVERY_INVALID_MAGIC); - if (ecode != TDB_SUCCESS) { - return tdb_logerr(tdb, ecode, TDB_LOG_ERROR, 
-                                  "tdb_transaction_recover:"
-                                  " failed to remove recovery magic");
-        }
-
-        ecode = transaction_sync(tdb, 0, recovery_eof);
-        if (ecode != TDB_SUCCESS) {
-                return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
-                                  "tdb_transaction_recover:"
-                                  " failed to sync2 recovery");
-        }
-
-        tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
-                   "tdb_transaction_recover: recovered %zu byte database",
-                   (size_t)recovery_eof);
-
-        /* all done */
-        return TDB_SUCCESS;
-}
-
-tdb_bool_err tdb_needs_recovery(struct tdb_context *tdb)
-{
-        tdb_off_t recovery_head;
-        struct tdb_recovery_record rec;
-        enum TDB_ERROR ecode;
-
-        /* find the recovery area */
-        recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
-        if (TDB_OFF_IS_ERR(recovery_head)) {
-                return recovery_head;
-        }
-
-        if (recovery_head == 0) {
-                /* we have never allocated a recovery record */
-                return false;
-        }
-
-        /* read the recovery record */
-        ecode = tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec));
-        if (ecode != TDB_SUCCESS) {
-                return TDB_ERR_TO_OFF(ecode);
-        }
-
-        return (rec.magic == TDB_RECOVERY_MAGIC);
-}
diff --git a/ccan/tdb2/traverse.c b/ccan/tdb2/traverse.c
deleted file mode 100644
index 0bf41899..00000000
--- a/ccan/tdb2/traverse.c
+++ /dev/null
@@ -1,134 +0,0 @@
- /*
-   Trivial Database 2: traverse function.
-   Copyright (C) Rusty Russell 2010
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 3 of the License, or (at your option) any later version.
-
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, see <http://www.gnu.org/licenses/>.
-*/
-#include "private.h"
-#include
-
-int64_t tdb_traverse_(struct tdb_context *tdb,
-                      int (*fn)(struct tdb_context *,
-                                TDB_DATA, TDB_DATA, void *),
-                      void *p)
-{
-        enum TDB_ERROR ecode;
-        struct traverse_info tinfo;
-        struct tdb_data k, d;
-        int64_t count = 0;
-
-        if (tdb->flags & TDB_VERSION1) {
-                count = tdb1_traverse(tdb, fn, p);
-                if (count == -1)
-                        return TDB_ERR_TO_OFF(tdb->last_error);
-                return count;
-        }
-
-        k.dptr = NULL;
-        for (ecode = first_in_hash(tdb, &tinfo, &k, &d.dsize);
-             ecode == TDB_SUCCESS;
-             ecode = next_in_hash(tdb, &tinfo, &k, &d.dsize)) {
-                d.dptr = k.dptr + k.dsize;
-
-                count++;
-                if (fn && fn(tdb, k, d, p)) {
-                        free(k.dptr);
-                        tdb->last_error = TDB_SUCCESS;
-                        return count;
-                }
-                free(k.dptr);
-        }
-
-        if (ecode != TDB_ERR_NOEXIST) {
-                return TDB_ERR_TO_OFF(tdb->last_error = ecode);
-        }
-        tdb->last_error = TDB_SUCCESS;
-        return count;
-}
-
-enum TDB_ERROR tdb_firstkey(struct tdb_context *tdb, struct tdb_data *key)
-{
-        struct traverse_info tinfo;
-
-        if (tdb->flags & TDB_VERSION1) {
-                tdb->last_error = TDB_SUCCESS;
-                *key = tdb1_firstkey(tdb);
-                /* TDB1 didn't set error for last key. */
-                if (!key->dptr && tdb->last_error == TDB_SUCCESS) {
-                        tdb->last_error = TDB_ERR_NOEXIST;
-                }
-                return tdb->last_error;
-        }
-
-        return tdb->last_error = first_in_hash(tdb, &tinfo, key, NULL);
-}
-
-/* We lock twice, not very efficient. We could keep last key & tinfo cached. */
-enum TDB_ERROR tdb_nextkey(struct tdb_context *tdb, struct tdb_data *key)
-{
-        struct traverse_info tinfo;
-        struct hash_info h;
-        struct tdb_used_record rec;
-
-        if (tdb->flags & TDB_VERSION1) {
-                struct tdb_data last_key = *key;
-                tdb->last_error = TDB_SUCCESS;
-                *key = tdb1_nextkey(tdb, last_key);
-                free(last_key.dptr);
-                /* TDB1 didn't set error for last key. */
-                if (!key->dptr && tdb->last_error == TDB_SUCCESS) {
-                        tdb->last_error = TDB_ERR_NOEXIST;
-                }
-                return tdb->last_error;
-        }
-
-        tinfo.prev = find_and_lock(tdb, *key, F_RDLCK, &h, &rec, &tinfo);
-        free(key->dptr);
-        if (TDB_OFF_IS_ERR(tinfo.prev)) {
-                return tdb->last_error = TDB_OFF_TO_ERR(tinfo.prev);
-        }
-        tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
-
-        return tdb->last_error = next_in_hash(tdb, &tinfo, key, NULL);
-}
-
-static int wipe_one(struct tdb_context *tdb,
-                    TDB_DATA key, TDB_DATA data, enum TDB_ERROR *ecode)
-{
-        *ecode = tdb_delete(tdb, key);
-        return (*ecode != TDB_SUCCESS);
-}
-
-enum TDB_ERROR tdb_wipe_all(struct tdb_context *tdb)
-{
-        enum TDB_ERROR ecode;
-        int64_t count;
-
-        if (tdb->flags & TDB_VERSION1) {
-                if (tdb1_wipe_all(tdb) == -1)
-                        return tdb->last_error;
-                return TDB_SUCCESS;
-        }
-
-        ecode = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
-        if (ecode != TDB_SUCCESS)
-                return tdb->last_error = ecode;
-
-        /* FIXME: Be smarter. */
-        count = tdb_traverse(tdb, wipe_one, &ecode);
-        if (count < 0)
-                ecode = TDB_OFF_TO_ERR(count);
-        tdb_allrecord_unlock(tdb, F_WRLCK);
-        return tdb->last_error = ecode;
-}
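
For reference, here is a minimal sketch of how the tdb_firstkey()/tdb_nextkey() pair removed above was typically driven from caller code. It only uses calls visible in this tree (tdb_open, tdb_firstkey, tdb_nextkey, tdb_errorstr, tdb_close); the database name, open flags and error handling are illustrative assumptions, not taken from the deleted files.

#include <stdio.h>
#include <fcntl.h>
#include <err.h>
#include <ccan/tdb2/tdb2.h>   /* assumed include path; the code now lives in SAMBA */

int main(void)
{
        struct tdb_context *tdb;
        struct tdb_data key;
        enum TDB_ERROR ecode;

        /* "example.tdb" is a placeholder database name. */
        tdb = tdb_open("example.tdb", TDB_DEFAULT, O_RDWR, 0600, NULL);
        if (!tdb)
                err(1, "tdb_open");

        /* Walk every key: tdb_firstkey() hands back the first key and
         * tdb_nextkey() frees the previous key's dptr before returning the
         * next one.  A complete walk ends with TDB_ERR_NOEXIST. */
        for (ecode = tdb_firstkey(tdb, &key);
             ecode == TDB_SUCCESS;
             ecode = tdb_nextkey(tdb, &key)) {
                printf("%.*s\n", (int)key.dsize, (char *)key.dptr);
        }

        if (ecode != TDB_ERR_NOEXIST)
                warnx("traverse: %s", tdb_errorstr(ecode));

        tdb_close(tdb);
        return 0;
}

tdb_traverse(), also deleted above, performs the same walk internally (first_in_hash/next_in_hash) but hands each key/value pair to a callback instead of returning keys one at a time.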