2 Trivial Database 2: free list/block handling
3 Copyright (C) Rusty Russell 2010
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 3 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
25 tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
33 static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
36 struct tdb_header hdr;
38 if (tdb_read_convert(tdb, 0, &hdr, sizeof(hdr)) == -1)
40 /* magic food should not be converted, so convert back. */
41 tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
43 hash_test = TDB_HASH_MAGIC;
44 hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
45 if (hdr.hash_test != hash_test) {
46 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
47 "check: hash test %llu should be %llu",
48 (long long)hdr.hash_test,
49 (long long)hash_test);
53 if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
54 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
55 "check: bad magic '%.*s'",
56 (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
60 *recovery = hdr.recovery;
62 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
63 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
64 "tdb_check: invalid recovery offset %zu",
70 /* Don't check reserved: they *can* be used later. */
74 static bool check_hash_tree(struct tdb_context *tdb,
75 tdb_off_t off, unsigned int group_bits,
77 unsigned hprefix_bits,
81 int (*check)(TDB_DATA, TDB_DATA, void *),
84 static bool check_hash_chain(struct tdb_context *tdb,
90 int (*check)(TDB_DATA, TDB_DATA, void *),
93 struct tdb_used_record rec;
95 if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
98 if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
99 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
100 "tdb_check: Bad hash chain magic %llu",
101 (long long)rec_magic(&rec));
105 if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
106 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
107 "tdb_check: Bad hash chain length %llu vs %zu",
108 (long long)rec_data_length(&rec),
109 sizeof(struct tdb_chain));
112 if (rec_key_length(&rec) != 0) {
113 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
114 "tdb_check: Bad hash chain key length %llu",
115 (long long)rec_key_length(&rec));
118 if (rec_hash(&rec) != 0) {
119 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
120 "tdb_check: Bad hash chain hash value %llu",
121 (long long)rec_hash(&rec));
126 if (!check_hash_tree(tdb, off, 0, hash, 64,
127 used, num_used, num_found, check, private_data))
130 off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
131 if (off == TDB_OFF_ERR)
136 return check_hash_chain(tdb, off, hash, used, num_used, num_found,
137 check, private_data);
140 static bool check_hash_record(struct tdb_context *tdb,
143 unsigned hprefix_bits,
147 int (*check)(TDB_DATA, TDB_DATA, void *),
150 struct tdb_used_record rec;
152 if (hprefix_bits >= 64)
153 return check_hash_chain(tdb, off, hprefix, used, num_used,
154 num_found, check, private_data);
156 if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
159 if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
160 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
161 "tdb_check: Bad hash table magic %llu",
162 (long long)rec_magic(&rec));
165 if (rec_data_length(&rec)
166 != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
167 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
168 "tdb_check: Bad hash table length %llu vs %llu",
169 (long long)rec_data_length(&rec),
170 (long long)sizeof(tdb_off_t)
171 << TDB_SUBLEVEL_HASH_BITS);
174 if (rec_key_length(&rec) != 0) {
175 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
176 "tdb_check: Bad hash table key length %llu",
177 (long long)rec_key_length(&rec));
180 if (rec_hash(&rec) != 0) {
181 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
182 "tdb_check: Bad hash table hash value %llu",
183 (long long)rec_hash(&rec));
188 return check_hash_tree(tdb, off,
189 TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
190 hprefix, hprefix_bits,
191 used, num_used, num_found, check, private_data);
194 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
196 /* Can overflow an int. */
202 static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
206 return (h >> (64 - *used)) & ((1U << num) - 1);
209 static bool check_hash_tree(struct tdb_context *tdb,
210 tdb_off_t off, unsigned int group_bits,
212 unsigned hprefix_bits,
216 int (*check)(TDB_DATA, TDB_DATA, void *),
220 const tdb_off_t *hash;
221 struct tdb_used_record rec;
223 hash = tdb_access_read(tdb, off,
225 << (group_bits + TDB_HASH_GROUP_BITS),
230 for (g = 0; g < (1 << group_bits); g++) {
231 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
232 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
233 unsigned int bucket, i, used_bits;
239 off = group[b] & TDB_OFF_MASK;
240 p = asearch(&off, used, num_used, off_cmp);
242 tdb_logerr(tdb, TDB_ERR_CORRUPT,
244 "tdb_check: Invalid offset %llu "
245 "in hash", (long long)off);
248 /* Mark it invalid. */
252 if (hprefix_bits == 64) {
253 /* Chained entries are unordered. */
254 if (is_subhash(group[b])) {
255 tdb_logerr(tdb, TDB_ERR_CORRUPT,
257 "tdb_check: Invalid chain"
261 h = hash_record(tdb, off);
263 tdb_logerr(tdb, TDB_ERR_CORRUPT,
265 "check: bad hash chain"
272 if (tdb_read_convert(tdb, off, &rec,
278 if (is_subhash(group[b])) {
281 << (group_bits + TDB_HASH_GROUP_BITS))
282 + g * (1 << TDB_HASH_GROUP_BITS) + b;
284 if (!check_hash_record(tdb,
285 group[b] & TDB_OFF_MASK,
289 + TDB_HASH_GROUP_BITS,
290 used, num_used, num_found,
291 check, private_data))
297 /* Does it belong here at all? */
298 h = hash_record(tdb, off);
300 if (get_bits(h, hprefix_bits, &used_bits) != hprefix
302 tdb_logerr(tdb, TDB_ERR_CORRUPT,
304 "check: bad hash placement"
306 (long long)h, (long long)hprefix);
310 /* Does it belong in this group? */
311 if (get_bits(h, group_bits, &used_bits) != g) {
312 tdb_logerr(tdb, TDB_ERR_CORRUPT,
314 "check: bad group %llu vs %u",
319 /* Are bucket bits correct? */
320 bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
321 if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
323 used_bits -= TDB_HASH_GROUP_BITS;
324 tdb_logerr(tdb, TDB_ERR_CORRUPT,
326 "check: bad bucket %u vs %u",
327 (unsigned)get_bits(h,
334 /* There must not be any zero entries between
335 * the bucket it belongs in and this one! */
338 i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
340 tdb_logerr(tdb, TDB_ERR_CORRUPT,
342 "check: bad group placement"
349 if (tdb_read_convert(tdb, off, &rec, sizeof(rec)))
352 /* Bottom bits must match header. */
353 if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
354 tdb_logerr(tdb, TDB_ERR_CORRUPT,
356 "tdb_check: Bad hash magic at"
357 " offset %llu (0x%llx vs 0x%llx)",
360 (long long)rec_hash(&rec));
367 key.dsize = rec_key_length(&rec);
368 data.dsize = rec_data_length(&rec);
369 key.dptr = (void *)tdb_access_read(tdb,
371 key.dsize + data.dsize,
375 data.dptr = key.dptr + key.dsize;
376 if (check(key, data, private_data) != 0)
378 tdb_access_release(tdb, key.dptr);
382 tdb_access_release(tdb, hash);
386 tdb_access_release(tdb, hash);
390 static bool check_hash(struct tdb_context *tdb,
392 size_t num_used, size_t num_ftables,
393 int (*check)(TDB_DATA, TDB_DATA, void *),
396 /* Free tables also show up as used. */
397 size_t num_found = num_ftables;
399 if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
400 TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
401 0, 0, used, num_used, &num_found,
402 check, private_data))
405 if (num_found != num_used) {
406 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
407 "tdb_check: Not all entries are in hash");
413 static bool check_free(struct tdb_context *tdb,
415 const struct tdb_free_record *frec,
416 tdb_off_t prev, unsigned int ftable,
419 if (frec_magic(frec) != TDB_FREE_MAGIC) {
420 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
421 "tdb_check: offset %llu bad magic 0x%llx",
422 (long long)off, (long long)frec->magic_and_prev);
425 if (frec_ftable(frec) != ftable) {
426 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
427 "tdb_check: offset %llu bad freetable %u",
428 (long long)off, frec_ftable(frec));
432 if (tdb->methods->oob(tdb, off
433 + frec_len(frec) + sizeof(struct tdb_used_record),
436 if (size_to_bucket(frec_len(frec)) != bucket) {
437 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
438 "tdb_check: offset %llu in wrong bucket %u vs %u",
440 bucket, size_to_bucket(frec_len(frec)));
443 if (prev != frec_prev(frec)) {
444 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
445 "tdb_check: offset %llu bad prev %llu vs %llu",
447 (long long)prev, (long long)frec_len(frec));
453 static bool check_free_table(struct tdb_context *tdb,
454 tdb_off_t ftable_off,
460 struct tdb_freetable ft;
464 if (tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft)) == -1)
467 if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
468 || rec_key_length(&ft.hdr) != 0
469 || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
470 || rec_hash(&ft.hdr) != 0) {
471 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
472 "tdb_check: Invalid header on free table");
476 for (i = 0; i < TDB_FREE_BUCKETS; i++) {
477 tdb_off_t off, prev = 0, *p;
478 struct tdb_free_record f;
480 h = bucket_off(ftable_off, i);
481 for (off = tdb_read_off(tdb, h); off; off = f.next) {
482 if (off == TDB_OFF_ERR)
484 if (tdb_read_convert(tdb, off, &f, sizeof(f)))
486 if (!check_free(tdb, off, &f, prev, ftable_num, i))
489 /* FIXME: Check hash bits */
490 p = asearch(&off, free, num_free, off_cmp);
492 tdb_logerr(tdb, TDB_ERR_CORRUPT,
494 "tdb_check: Invalid offset"
495 " %llu in free table",
499 /* Mark it invalid. */
508 /* Slow, but should be very rare. */
509 size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
513 for (len = 0; off + len < tdb->map_size; len++) {
515 if (tdb->methods->read(tdb, off, &c, 1))
517 if (c != 0 && c != 0x43)
523 static bool check_linear(struct tdb_context *tdb,
524 tdb_off_t **used, size_t *num_used,
525 tdb_off_t **free, size_t *num_free,
530 bool found_recovery = false;
532 for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
534 struct tdb_used_record u;
535 struct tdb_free_record f;
536 struct tdb_recovery_record r;
538 /* r is larger: only get that if we need to. */
539 if (tdb_read_convert(tdb, off, &rec, sizeof(rec.f)) == -1)
542 /* If we crash after ftruncate, we can get zeroes or fill. */
543 if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
544 || rec.r.magic == 0x4343434343434343ULL) {
545 if (tdb_read_convert(tdb, off, &rec, sizeof(rec.r)))
548 if (recovery == off) {
549 found_recovery = true;
550 len = sizeof(rec.r) + rec.r.max_len;
552 len = dead_space(tdb, off);
553 if (len < sizeof(rec.r)) {
554 tdb_logerr(tdb, TDB_ERR_CORRUPT,
556 "tdb_check: invalid dead"
562 tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_WARNING,
563 "Dead space at %zu-%zu (of %zu)",
564 (size_t)off, (size_t)(off + len),
565 (size_t)tdb->map_size);
567 } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
568 if (tdb_read_convert(tdb, off, &rec, sizeof(rec.r)))
570 if (recovery != off) {
571 tdb_logerr(tdb, TDB_ERR_CORRUPT,
573 "tdb_check: unexpected recovery"
574 " record at offset %zu",
578 if (rec.r.len > rec.r.max_len) {
579 tdb_logerr(tdb, TDB_ERR_CORRUPT,
581 "tdb_check: invalid recovery length"
582 " %zu", (size_t)rec.r.len);
585 if (rec.r.eof > tdb->map_size) {
586 tdb_logerr(tdb, TDB_ERR_CORRUPT,
588 "tdb_check: invalid old EOF"
589 " %zu", (size_t)rec.r.eof);
592 found_recovery = true;
593 len = sizeof(rec.r) + rec.r.max_len;
594 } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
595 len = sizeof(rec.u) + frec_len(&rec.f);
596 if (off + len > tdb->map_size) {
597 tdb_logerr(tdb, TDB_ERR_CORRUPT,
599 "tdb_check: free overlength %llu"
601 (long long)len, (long long)off);
604 /* This record should be in free lists. */
605 if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
606 && !append(free, num_free, off))
608 } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
609 || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
610 || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
611 || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
612 uint64_t klen, dlen, extra;
614 /* This record is used! */
615 if (!append(used, num_used, off))
618 klen = rec_key_length(&rec.u);
619 dlen = rec_data_length(&rec.u);
620 extra = rec_extra_padding(&rec.u);
622 len = sizeof(rec.u) + klen + dlen + extra;
623 if (off + len > tdb->map_size) {
624 tdb_logerr(tdb, TDB_ERR_CORRUPT,
626 "tdb_check: used overlength %llu"
628 (long long)len, (long long)off);
632 if (len < sizeof(rec.f)) {
633 tdb_logerr(tdb, TDB_ERR_CORRUPT,
635 "tdb_check: too short record %llu"
637 (long long)len, (long long)off);
641 tdb_logerr(tdb, TDB_ERR_CORRUPT,
643 "tdb_check: Bad magic 0x%llx at offset %zu",
644 (long long)rec_magic(&rec.u), (size_t)off);
649 /* We must have found recovery area if there was one. */
650 if (recovery != 0 && !found_recovery) {
651 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
652 "tdb_check: expected a recovery area at %zu",
660 int tdb_check(struct tdb_context *tdb,
661 int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
664 tdb_off_t *free = NULL, *used = NULL, ft, recovery;
665 size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
667 if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
670 if (tdb_lock_expand(tdb, F_RDLCK) != 0) {
671 tdb_allrecord_unlock(tdb, F_RDLCK);
675 if (!check_header(tdb, &recovery))
678 /* First we do a linear scan, checking all records. */
679 if (!check_linear(tdb, &used, &num_used, &free, &num_free, recovery))
682 for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
683 if (ft == TDB_OFF_ERR)
685 if (!check_free_table(tdb, ft, num_ftables, free, num_free,
691 /* FIXME: Check key uniqueness? */
692 if (!check_hash(tdb, used, num_used, num_ftables, check, private_data))
695 if (num_found != num_free) {
696 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
697 "tdb_check: Not all entries are in free table");
701 tdb_allrecord_unlock(tdb, F_RDLCK);
702 tdb_unlock_expand(tdb, F_RDLCK);
706 tdb_allrecord_unlock(tdb, F_RDLCK);
707 tdb_unlock_expand(tdb, F_RDLCK);