2 Trivial Database 2: free list/block handling
3 Copyright (C) Rusty Russell 2010
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 3 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
25 tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
33 static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
36 struct tdb_header hdr;
38 if (tdb_read_convert(tdb, 0, &hdr, sizeof(hdr)) == -1)
40 /* magic food should not be converted, so convert back. */
41 tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
43 hash_test = TDB_HASH_MAGIC;
44 hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
45 if (hdr.hash_test != hash_test) {
46 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
47 "check: hash test %llu should be %llu\n",
48 (long long)hdr.hash_test,
49 (long long)hash_test);
53 if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
54 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
55 "check: bad magic '%.*s'\n",
56 (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
60 *recovery = hdr.recovery;
62 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
63 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
64 "tdb_check: invalid recovery offset %zu\n",
70 /* Don't check reserved: they *can* be used later. */
74 static bool check_hash_tree(struct tdb_context *tdb,
75 tdb_off_t off, unsigned int group_bits,
77 unsigned hprefix_bits,
82 static bool check_hash_record(struct tdb_context *tdb,
85 unsigned hprefix_bits,
90 struct tdb_used_record rec;
92 if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
95 if (rec_data_length(&rec)
96 != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
97 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
98 "tdb_check: Bad hash table length %llu vs %llu\n",
99 (long long)rec_data_length(&rec),
100 (long long)sizeof(tdb_off_t)<<TDB_SUBLEVEL_HASH_BITS);
103 if (rec_key_length(&rec) != 0) {
104 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
105 "tdb_check: Bad hash table key length %llu\n",
106 (long long)rec_key_length(&rec));
109 if (rec_hash(&rec) != 0) {
110 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
111 "tdb_check: Bad hash table hash value %llu\n",
112 (long long)rec_hash(&rec));
117 return check_hash_tree(tdb, off,
118 TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
119 hprefix, hprefix_bits,
120 used, num_used, num_found);
123 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
125 /* Can overflow an int. */
131 static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
135 return (h >> (64 - *used)) & ((1U << num) - 1);
138 static bool check_hash_tree(struct tdb_context *tdb,
139 tdb_off_t off, unsigned int group_bits,
141 unsigned hprefix_bits,
147 const tdb_off_t *hash;
148 struct tdb_used_record rec;
150 hash = tdb_access_read(tdb, off,
152 << (group_bits + TDB_HASH_GROUP_BITS),
157 for (g = 0; g < (1 << group_bits); g++) {
158 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
159 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
160 unsigned int bucket, i, used_bits;
166 off = group[b] & TDB_OFF_MASK;
167 p = asearch(&off, used, num_used, off_cmp);
169 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
170 "tdb_check: Invalid offset %llu "
175 /* Mark it invalid. */
179 if (is_subhash(group[b])) {
182 << (group_bits + TDB_HASH_GROUP_BITS))
183 + g * (1 << TDB_HASH_GROUP_BITS) + b;
185 if (!check_hash_record(tdb,
186 group[b] & TDB_OFF_MASK,
190 + TDB_HASH_GROUP_BITS,
191 used, num_used, num_found))
197 /* Does it belong here at all? */
198 h = hash_record(tdb, off);
200 if (get_bits(h, hprefix_bits, &used_bits) != hprefix
202 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
203 "check: bad hash placement"
204 " 0x%llx vs 0x%llx\n",
205 (long long)h, (long long)hprefix);
209 /* Does it belong in this group? */
210 if (get_bits(h, group_bits, &used_bits) != g) {
211 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
212 "check: bad group %llu vs %u\n",
217 /* Are bucket bits correct? */
218 bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
219 if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
221 used_bits -= TDB_HASH_GROUP_BITS;
222 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
223 "check: bad bucket %u vs %u\n",
224 (unsigned)get_bits(h,
231 /* There must not be any zero entries between
232 * the bucket it belongs in and this one! */
235 i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
237 tdb->log(tdb, TDB_DEBUG_ERROR,
239 "check: bad group placement"
246 if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
249 /* Bottom bits must match header. */
250 if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
251 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
252 "tdb_check: Bad hash magic at"
253 " offset %llu (0x%llx vs 0x%llx)\n",
256 (long long)rec_hash(&rec));
261 tdb_access_release(tdb, hash);
265 tdb_access_release(tdb, hash);
269 static bool check_hash(struct tdb_context *tdb,
271 size_t num_used, size_t num_flists)
273 /* Free lists also show up as used. */
274 size_t num_found = num_flists;
276 if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
277 TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
278 0, 0, used, num_used, &num_found))
281 if (num_found != num_used) {
282 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
283 "tdb_check: Not all entries are in hash\n");
289 static bool check_free(struct tdb_context *tdb,
291 const struct tdb_free_record *frec,
292 tdb_off_t prev, tdb_off_t flist_off, unsigned int bucket)
294 if (frec_magic(frec) != TDB_FREE_MAGIC) {
295 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
296 "tdb_check: offset %llu bad magic 0x%llx\n",
297 (long long)off, (long long)frec->magic_and_meta);
300 if (frec_flist(frec) != flist_off) {
301 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
302 "tdb_check: offset %llu bad freelist 0x%llx\n",
303 (long long)off, (long long)frec_flist(frec));
307 if (tdb->methods->oob(tdb, off
308 + frec->data_len+sizeof(struct tdb_used_record),
311 if (size_to_bucket(frec->data_len) != bucket) {
312 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
313 "tdb_check: offset %llu in wrong bucket %u vs %u\n",
315 bucket, size_to_bucket(frec->data_len));
318 if (prev != frec->prev) {
319 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
320 "tdb_check: offset %llu bad prev %llu vs %llu\n",
322 (long long)prev, (long long)frec->prev);
328 static bool check_free_list(struct tdb_context *tdb,
334 struct tdb_freelist flist;
338 if (tdb_read_convert(tdb, flist_off, &flist, sizeof(flist)) == -1)
341 if (rec_magic(&flist.hdr) != TDB_MAGIC
342 || rec_key_length(&flist.hdr) != 0
343 || rec_data_length(&flist.hdr) != sizeof(flist) - sizeof(flist.hdr)
344 || rec_hash(&flist.hdr) != 1) {
345 tdb->log(tdb, TDB_DEBUG_ERROR,
347 "tdb_check: Invalid header on free list\n");
351 for (i = 0; i < TDB_FREE_BUCKETS; i++) {
352 tdb_off_t off, prev = 0, *p;
353 struct tdb_free_record f;
355 h = bucket_off(flist_off, i);
356 for (off = tdb_read_off(tdb, h); off; off = f.next) {
357 if (off == TDB_OFF_ERR)
359 if (tdb_read_convert(tdb, off, &f, sizeof(f)))
361 if (!check_free(tdb, off, &f, prev, flist_off, i))
364 /* FIXME: Check hash bits */
365 p = asearch(&off, free, num_free, off_cmp);
367 tdb->log(tdb, TDB_DEBUG_ERROR,
369 "tdb_check: Invalid offset"
370 " %llu in free table\n",
374 /* Mark it invalid. */
383 /* Slow, but should be very rare. */
384 size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
388 for (len = 0; off + len < tdb->map_size; len++) {
390 if (tdb->methods->read(tdb, off, &c, 1))
392 if (c != 0 && c != 0x43)
398 static bool check_linear(struct tdb_context *tdb,
399 tdb_off_t **used, size_t *num_used,
400 tdb_off_t **free, size_t *num_free,
405 bool found_recovery = false;
407 for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
409 struct tdb_used_record u;
410 struct tdb_free_record f;
411 struct tdb_recovery_record r;
413 p = tdb_get(tdb, off, &pad, sizeof(pad));
417 /* If we crash after ftruncate, we can get zeroes or fill. */
418 if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
419 || p->r.magic == 0x4343434343434343ULL) {
420 if (recovery == off) {
421 found_recovery = true;
422 len = sizeof(p->r) + p->r.max_len;
424 len = dead_space(tdb, off);
425 if (len < sizeof(p->r)) {
426 tdb->log(tdb, TDB_DEBUG_ERROR,
428 "tdb_check: invalid dead space"
429 " at %zu\n", (size_t)off);
433 tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
434 "Dead space at %zu-%zu (of %zu)\n",
435 (size_t)off, (size_t)(off + len),
436 (size_t)tdb->map_size);
438 } else if (p->r.magic == TDB_RECOVERY_MAGIC) {
439 if (recovery != off) {
440 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
441 "tdb_check: unexpected recovery"
442 " record at offset %zu\n",
446 found_recovery = true;
447 len = sizeof(p->r) + p->r.max_len;
448 } else if (frec_magic(&p->f) == TDB_FREE_MAGIC
449 || frec_magic(&p->f) == TDB_COALESCING_MAGIC) {
450 len = sizeof(p->u) + p->f.data_len;
451 if (off + len > tdb->map_size) {
452 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
453 "tdb_check: free overlength %llu"
455 (long long)len, (long long)off);
458 /* This record is free! */
459 if (frec_magic(&p->f) == TDB_FREE_MAGIC
460 && !append(free, num_free, off))
463 uint64_t klen, dlen, extra;
465 /* This record is used! */
466 if (rec_magic(&p->u) != TDB_MAGIC) {
467 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
468 "tdb_check: Bad magic 0x%llx"
470 (long long)rec_magic(&p->u),
475 if (!append(used, num_used, off))
478 klen = rec_key_length(&p->u);
479 dlen = rec_data_length(&p->u);
480 extra = rec_extra_padding(&p->u);
482 len = sizeof(p->u) + klen + dlen + extra;
483 if (off + len > tdb->map_size) {
484 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
485 "tdb_check: used overlength %llu"
487 (long long)len, (long long)off);
491 if (len < sizeof(p->f)) {
492 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
493 "tdb_check: too short record %llu at"
495 (long long)len, (long long)off);
501 /* We must have found recovery area if there was one. */
502 if (recovery != 0 && !found_recovery) {
503 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
504 "tdb_check: expected a recovery area at %zu\n",
512 /* FIXME: call check() function. */
513 int tdb_check(struct tdb_context *tdb,
514 int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
517 tdb_off_t *free = NULL, *used = NULL, flist, recovery;
518 size_t num_free = 0, num_used = 0, num_found = 0, num_flists = 0;
520 if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
523 if (tdb_lock_expand(tdb, F_RDLCK) != 0) {
524 tdb_allrecord_unlock(tdb, F_RDLCK);
528 if (!check_header(tdb, &recovery))
531 /* First we do a linear scan, checking all records. */
532 if (!check_linear(tdb, &used, &num_used, &free, &num_free, recovery))
535 for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) {
536 if (flist == TDB_OFF_ERR)
538 if (!check_free_list(tdb, flist, free, num_free, &num_found))
543 /* FIXME: Check key uniqueness? */
544 if (!check_hash(tdb, used, num_used, num_flists))
547 if (num_found != num_free) {
548 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
549 "tdb_check: Not all entries are in free table\n");
553 tdb_allrecord_unlock(tdb, F_RDLCK);
554 tdb_unlock_expand(tdb, F_RDLCK);
558 tdb_allrecord_unlock(tdb, F_RDLCK);
559 tdb_unlock_expand(tdb, F_RDLCK);