2 Trivial Database 2: free list/block handling
3 Copyright (C) Rusty Russell 2010
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 3 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
25 tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
33 static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
36 struct tdb_header hdr;
39 ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr));
40 if (ecode != TDB_SUCCESS) {
44 /* magic food should not be converted, so convert back. */
45 tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
47 hash_test = TDB_HASH_MAGIC;
48 hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
49 if (hdr.hash_test != hash_test) {
50 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
51 "check: hash test %llu should be %llu",
52 (long long)hdr.hash_test,
53 (long long)hash_test);
57 if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
58 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
59 "check: bad magic '%.*s'",
60 (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
64 *recovery = hdr.recovery;
66 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
67 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
68 "tdb_check: invalid recovery offset %zu",
74 /* Don't check reserved: they *can* be used later. */
78 static bool check_hash_tree(struct tdb_context *tdb,
79 tdb_off_t off, unsigned int group_bits,
81 unsigned hprefix_bits,
85 int (*check)(TDB_DATA, TDB_DATA, void *),
88 static bool check_hash_chain(struct tdb_context *tdb,
94 int (*check)(TDB_DATA, TDB_DATA, void *),
97 struct tdb_used_record rec;
100 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
101 if (ecode != TDB_SUCCESS) {
106 if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
107 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
108 "tdb_check: Bad hash chain magic %llu",
109 (long long)rec_magic(&rec));
113 if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
114 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
115 "tdb_check: Bad hash chain length %llu vs %zu",
116 (long long)rec_data_length(&rec),
117 sizeof(struct tdb_chain));
120 if (rec_key_length(&rec) != 0) {
121 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
122 "tdb_check: Bad hash chain key length %llu",
123 (long long)rec_key_length(&rec));
126 if (rec_hash(&rec) != 0) {
127 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
128 "tdb_check: Bad hash chain hash value %llu",
129 (long long)rec_hash(&rec));
134 if (!check_hash_tree(tdb, off, 0, hash, 64,
135 used, num_used, num_found, check, private_data))
138 off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
139 if (off == TDB_OFF_ERR)
144 return check_hash_chain(tdb, off, hash, used, num_used, num_found,
145 check, private_data);
148 static bool check_hash_record(struct tdb_context *tdb,
151 unsigned hprefix_bits,
155 int (*check)(TDB_DATA, TDB_DATA, void *),
158 struct tdb_used_record rec;
159 enum TDB_ERROR ecode;
161 if (hprefix_bits >= 64)
162 return check_hash_chain(tdb, off, hprefix, used, num_used,
163 num_found, check, private_data);
165 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
166 if (ecode != TDB_SUCCESS) {
171 if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
172 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
173 "tdb_check: Bad hash table magic %llu",
174 (long long)rec_magic(&rec));
177 if (rec_data_length(&rec)
178 != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
179 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
180 "tdb_check: Bad hash table length %llu vs %llu",
181 (long long)rec_data_length(&rec),
182 (long long)sizeof(tdb_off_t)
183 << TDB_SUBLEVEL_HASH_BITS);
186 if (rec_key_length(&rec) != 0) {
187 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
188 "tdb_check: Bad hash table key length %llu",
189 (long long)rec_key_length(&rec));
192 if (rec_hash(&rec) != 0) {
193 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
194 "tdb_check: Bad hash table hash value %llu",
195 (long long)rec_hash(&rec));
200 return check_hash_tree(tdb, off,
201 TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
202 hprefix, hprefix_bits,
203 used, num_used, num_found, check, private_data);
206 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
208 /* Can overflow an int. */
214 static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
218 return (h >> (64 - *used)) & ((1U << num) - 1);
221 static bool check_hash_tree(struct tdb_context *tdb,
222 tdb_off_t off, unsigned int group_bits,
224 unsigned hprefix_bits,
228 int (*check)(TDB_DATA, TDB_DATA, void *),
232 const tdb_off_t *hash;
233 struct tdb_used_record rec;
234 enum TDB_ERROR ecode;
236 hash = tdb_access_read(tdb, off,
238 << (group_bits + TDB_HASH_GROUP_BITS),
243 for (g = 0; g < (1 << group_bits); g++) {
244 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
245 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
246 unsigned int bucket, i, used_bits;
252 off = group[b] & TDB_OFF_MASK;
253 p = asearch(&off, used, num_used, off_cmp);
255 tdb_logerr(tdb, TDB_ERR_CORRUPT,
257 "tdb_check: Invalid offset %llu "
258 "in hash", (long long)off);
261 /* Mark it invalid. */
265 if (hprefix_bits == 64) {
266 /* Chained entries are unordered. */
267 if (is_subhash(group[b])) {
268 tdb_logerr(tdb, TDB_ERR_CORRUPT,
270 "tdb_check: Invalid chain"
274 h = hash_record(tdb, off);
276 tdb_logerr(tdb, TDB_ERR_CORRUPT,
278 "check: bad hash chain"
285 ecode = tdb_read_convert(tdb, off, &rec,
287 if (ecode != TDB_SUCCESS) {
294 if (is_subhash(group[b])) {
297 << (group_bits + TDB_HASH_GROUP_BITS))
298 + g * (1 << TDB_HASH_GROUP_BITS) + b;
300 if (!check_hash_record(tdb,
301 group[b] & TDB_OFF_MASK,
305 + TDB_HASH_GROUP_BITS,
306 used, num_used, num_found,
307 check, private_data))
313 /* Does it belong here at all? */
314 h = hash_record(tdb, off);
316 if (get_bits(h, hprefix_bits, &used_bits) != hprefix
318 tdb_logerr(tdb, TDB_ERR_CORRUPT,
320 "check: bad hash placement"
322 (long long)h, (long long)hprefix);
326 /* Does it belong in this group? */
327 if (get_bits(h, group_bits, &used_bits) != g) {
328 tdb_logerr(tdb, TDB_ERR_CORRUPT,
330 "check: bad group %llu vs %u",
335 /* Are bucket bits correct? */
336 bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
337 if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
339 used_bits -= TDB_HASH_GROUP_BITS;
340 tdb_logerr(tdb, TDB_ERR_CORRUPT,
342 "check: bad bucket %u vs %u",
343 (unsigned)get_bits(h,
350 /* There must not be any zero entries between
351 * the bucket it belongs in and this one! */
354 i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
356 tdb_logerr(tdb, TDB_ERR_CORRUPT,
358 "check: bad group placement"
365 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
366 if (ecode != TDB_SUCCESS) {
371 /* Bottom bits must match header. */
372 if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
373 tdb_logerr(tdb, TDB_ERR_CORRUPT,
375 "tdb_check: Bad hash magic at"
376 " offset %llu (0x%llx vs 0x%llx)",
379 (long long)rec_hash(&rec));
386 key.dsize = rec_key_length(&rec);
387 data.dsize = rec_data_length(&rec);
388 key.dptr = (void *)tdb_access_read(tdb,
390 key.dsize + data.dsize,
394 data.dptr = key.dptr + key.dsize;
395 if (check(key, data, private_data) != 0)
397 tdb_access_release(tdb, key.dptr);
401 tdb_access_release(tdb, hash);
405 tdb_access_release(tdb, hash);
409 static bool check_hash(struct tdb_context *tdb,
411 size_t num_used, size_t num_ftables,
412 int (*check)(TDB_DATA, TDB_DATA, void *),
415 /* Free tables also show up as used. */
416 size_t num_found = num_ftables;
418 if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
419 TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
420 0, 0, used, num_used, &num_found,
421 check, private_data))
424 if (num_found != num_used) {
425 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
426 "tdb_check: Not all entries are in hash");
432 static bool check_free(struct tdb_context *tdb,
434 const struct tdb_free_record *frec,
435 tdb_off_t prev, unsigned int ftable,
438 enum TDB_ERROR ecode;
440 if (frec_magic(frec) != TDB_FREE_MAGIC) {
441 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
442 "tdb_check: offset %llu bad magic 0x%llx",
443 (long long)off, (long long)frec->magic_and_prev);
446 if (frec_ftable(frec) != ftable) {
447 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
448 "tdb_check: offset %llu bad freetable %u",
449 (long long)off, frec_ftable(frec));
453 ecode = tdb->methods->oob(tdb, off
455 + sizeof(struct tdb_used_record),
457 if (ecode != TDB_SUCCESS) {
461 if (size_to_bucket(frec_len(frec)) != bucket) {
462 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
463 "tdb_check: offset %llu in wrong bucket %u vs %u",
465 bucket, size_to_bucket(frec_len(frec)));
468 if (prev != frec_prev(frec)) {
469 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
470 "tdb_check: offset %llu bad prev %llu vs %llu",
472 (long long)prev, (long long)frec_len(frec));
478 static bool check_free_table(struct tdb_context *tdb,
479 tdb_off_t ftable_off,
485 struct tdb_freetable ft;
488 enum TDB_ERROR ecode;
490 ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft));
491 if (ecode != TDB_SUCCESS) {
496 if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
497 || rec_key_length(&ft.hdr) != 0
498 || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
499 || rec_hash(&ft.hdr) != 0) {
500 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
501 "tdb_check: Invalid header on free table");
505 for (i = 0; i < TDB_FREE_BUCKETS; i++) {
506 tdb_off_t off, prev = 0, *p;
507 struct tdb_free_record f;
509 h = bucket_off(ftable_off, i);
510 for (off = tdb_read_off(tdb, h); off; off = f.next) {
511 if (off == TDB_OFF_ERR)
513 ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
514 if (ecode != TDB_SUCCESS) {
518 if (!check_free(tdb, off, &f, prev, ftable_num, i))
521 /* FIXME: Check hash bits */
522 p = asearch(&off, fr, num_free, off_cmp);
524 tdb_logerr(tdb, TDB_ERR_CORRUPT,
526 "tdb_check: Invalid offset"
527 " %llu in free table",
531 /* Mark it invalid. */
540 /* Slow, but should be very rare. */
541 size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
544 enum TDB_ERROR ecode;
546 for (len = 0; off + len < tdb->map_size; len++) {
548 ecode = tdb->methods->tread(tdb, off, &c, 1);
549 if (ecode != TDB_SUCCESS) {
553 if (c != 0 && c != 0x43)
559 static bool check_linear(struct tdb_context *tdb,
560 tdb_off_t **used, size_t *num_used,
561 tdb_off_t **fr, size_t *num_free,
566 enum TDB_ERROR ecode;
567 bool found_recovery = false;
569 for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
571 struct tdb_used_record u;
572 struct tdb_free_record f;
573 struct tdb_recovery_record r;
575 /* r is larger: only get that if we need to. */
576 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f));
577 if (ecode != TDB_SUCCESS) {
582 /* If we crash after ftruncate, we can get zeroes or fill. */
583 if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
584 || rec.r.magic == 0x4343434343434343ULL) {
585 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
586 if (ecode != TDB_SUCCESS) {
590 if (recovery == off) {
591 found_recovery = true;
592 len = sizeof(rec.r) + rec.r.max_len;
594 len = dead_space(tdb, off);
595 if (len < sizeof(rec.r)) {
596 tdb_logerr(tdb, TDB_ERR_CORRUPT,
598 "tdb_check: invalid dead"
604 tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
605 "Dead space at %zu-%zu (of %zu)",
606 (size_t)off, (size_t)(off + len),
607 (size_t)tdb->map_size);
609 } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
610 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
611 if (ecode != TDB_SUCCESS) {
615 if (recovery != off) {
616 tdb_logerr(tdb, TDB_ERR_CORRUPT,
618 "tdb_check: unexpected recovery"
619 " record at offset %zu",
623 if (rec.r.len > rec.r.max_len) {
624 tdb_logerr(tdb, TDB_ERR_CORRUPT,
626 "tdb_check: invalid recovery length"
627 " %zu", (size_t)rec.r.len);
630 if (rec.r.eof > tdb->map_size) {
631 tdb_logerr(tdb, TDB_ERR_CORRUPT,
633 "tdb_check: invalid old EOF"
634 " %zu", (size_t)rec.r.eof);
637 found_recovery = true;
638 len = sizeof(rec.r) + rec.r.max_len;
639 } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
640 len = sizeof(rec.u) + frec_len(&rec.f);
641 if (off + len > tdb->map_size) {
642 tdb_logerr(tdb, TDB_ERR_CORRUPT,
644 "tdb_check: free overlength %llu"
646 (long long)len, (long long)off);
649 /* This record should be in free lists. */
650 if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
651 && !append(fr, num_free, off)) {
652 tdb_logerr(tdb, TDB_ERR_OOM,
654 "tdb_check: tracking %zu'th"
655 " free record.", *num_free);
658 } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
659 || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
660 || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
661 || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
662 uint64_t klen, dlen, extra;
664 /* This record is used! */
665 if (!append(used, num_used, off)) {
666 tdb_logerr(tdb, TDB_ERR_OOM,
668 "tdb_check: tracking %zu'th"
669 " used record.", *num_used);
673 klen = rec_key_length(&rec.u);
674 dlen = rec_data_length(&rec.u);
675 extra = rec_extra_padding(&rec.u);
677 len = sizeof(rec.u) + klen + dlen + extra;
678 if (off + len > tdb->map_size) {
679 tdb_logerr(tdb, TDB_ERR_CORRUPT,
681 "tdb_check: used overlength %llu"
683 (long long)len, (long long)off);
687 if (len < sizeof(rec.f)) {
688 tdb_logerr(tdb, TDB_ERR_CORRUPT,
690 "tdb_check: too short record %llu"
692 (long long)len, (long long)off);
696 tdb_logerr(tdb, TDB_ERR_CORRUPT,
698 "tdb_check: Bad magic 0x%llx at offset %zu",
699 (long long)rec_magic(&rec.u), (size_t)off);
704 /* We must have found recovery area if there was one. */
705 if (recovery != 0 && !found_recovery) {
706 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
707 "tdb_check: expected a recovery area at %zu",
715 int tdb_check(struct tdb_context *tdb,
716 int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
719 tdb_off_t *fr = NULL, *used = NULL, ft, recovery;
720 size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
721 enum TDB_ERROR ecode;
723 ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
724 if (ecpde != TDB_SUCCESS) {
729 ecode = tdb_lock_expand(tdb, F_RDLCK);
730 if (ecode != TDB_SUCCESS) {
732 tdb_allrecord_unlock(tdb, F_RDLCK);
736 if (!check_header(tdb, &recovery))
739 /* First we do a linear scan, checking all records. */
740 if (!check_linear(tdb, &used, &num_used, &fr, &num_free, recovery))
743 for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
744 if (ft == TDB_OFF_ERR)
746 if (!check_free_table(tdb, ft, num_ftables, fr, num_free,
752 /* FIXME: Check key uniqueness? */
753 if (!check_hash(tdb, used, num_used, num_ftables, check, private_data))
756 if (num_found != num_free) {
757 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
758 "tdb_check: Not all entries are in free table");
762 tdb_allrecord_unlock(tdb, F_RDLCK);
763 tdb_unlock_expand(tdb, F_RDLCK);
771 tdb_allrecord_unlock(tdb, F_RDLCK);
772 tdb_unlock_expand(tdb, F_RDLCK);