2 Trivial Database 2: free list/block handling
3 Copyright (C) Rusty Russell 2010
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 3 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
25 tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
33 static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
36 struct tdb_header hdr;
39 ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr));
40 if (ecode != TDB_SUCCESS) {
44 /* magic food should not be converted, so convert back. */
45 tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
47 hash_test = TDB_HASH_MAGIC;
48 hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
49 if (hdr.hash_test != hash_test) {
50 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
51 "check: hash test %llu should be %llu",
52 (long long)hdr.hash_test,
53 (long long)hash_test);
57 if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
58 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
59 "check: bad magic '%.*s'",
60 (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
64 *recovery = hdr.recovery;
66 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
67 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
68 "tdb_check: invalid recovery offset %zu",
74 /* Don't check reserved: they *can* be used later. */
78 static bool check_hash_tree(struct tdb_context *tdb,
79 tdb_off_t off, unsigned int group_bits,
81 unsigned hprefix_bits,
85 int (*check)(TDB_DATA, TDB_DATA, void *),
88 static bool check_hash_chain(struct tdb_context *tdb,
94 int (*check)(TDB_DATA, TDB_DATA, void *),
97 struct tdb_used_record rec;
100 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
101 if (ecode != TDB_SUCCESS) {
106 if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
107 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
108 "tdb_check: Bad hash chain magic %llu",
109 (long long)rec_magic(&rec));
113 if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
114 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
115 "tdb_check: Bad hash chain length %llu vs %zu",
116 (long long)rec_data_length(&rec),
117 sizeof(struct tdb_chain));
120 if (rec_key_length(&rec) != 0) {
121 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
122 "tdb_check: Bad hash chain key length %llu",
123 (long long)rec_key_length(&rec));
126 if (rec_hash(&rec) != 0) {
127 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
128 "tdb_check: Bad hash chain hash value %llu",
129 (long long)rec_hash(&rec));
134 if (!check_hash_tree(tdb, off, 0, hash, 64,
135 used, num_used, num_found, check, private_data))
138 off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
139 if (TDB_OFF_IS_ERR(off)) {
146 return check_hash_chain(tdb, off, hash, used, num_used, num_found,
147 check, private_data);
150 static bool check_hash_record(struct tdb_context *tdb,
153 unsigned hprefix_bits,
157 int (*check)(TDB_DATA, TDB_DATA, void *),
160 struct tdb_used_record rec;
161 enum TDB_ERROR ecode;
163 if (hprefix_bits >= 64)
164 return check_hash_chain(tdb, off, hprefix, used, num_used,
165 num_found, check, private_data);
167 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
168 if (ecode != TDB_SUCCESS) {
173 if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
174 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
175 "tdb_check: Bad hash table magic %llu",
176 (long long)rec_magic(&rec));
179 if (rec_data_length(&rec)
180 != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
181 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
182 "tdb_check: Bad hash table length %llu vs %llu",
183 (long long)rec_data_length(&rec),
184 (long long)sizeof(tdb_off_t)
185 << TDB_SUBLEVEL_HASH_BITS);
188 if (rec_key_length(&rec) != 0) {
189 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
190 "tdb_check: Bad hash table key length %llu",
191 (long long)rec_key_length(&rec));
194 if (rec_hash(&rec) != 0) {
195 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
196 "tdb_check: Bad hash table hash value %llu",
197 (long long)rec_hash(&rec));
202 return check_hash_tree(tdb, off,
203 TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
204 hprefix, hprefix_bits,
205 used, num_used, num_found, check, private_data);
208 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
210 /* Can overflow an int. */
216 static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
220 return (h >> (64 - *used)) & ((1U << num) - 1);
223 static bool check_hash_tree(struct tdb_context *tdb,
224 tdb_off_t off, unsigned int group_bits,
226 unsigned hprefix_bits,
230 int (*check)(TDB_DATA, TDB_DATA, void *),
234 const tdb_off_t *hash;
235 struct tdb_used_record rec;
236 enum TDB_ERROR ecode;
238 hash = tdb_access_read(tdb, off,
240 << (group_bits + TDB_HASH_GROUP_BITS),
242 if (TDB_PTR_IS_ERR(hash)) {
243 tdb->ecode = TDB_PTR_ERR(hash);
247 for (g = 0; g < (1 << group_bits); g++) {
248 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
249 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
250 unsigned int bucket, i, used_bits;
256 off = group[b] & TDB_OFF_MASK;
257 p = asearch(&off, used, num_used, off_cmp);
259 tdb_logerr(tdb, TDB_ERR_CORRUPT,
261 "tdb_check: Invalid offset %llu "
262 "in hash", (long long)off);
265 /* Mark it invalid. */
269 if (hprefix_bits == 64) {
270 /* Chained entries are unordered. */
271 if (is_subhash(group[b])) {
272 tdb_logerr(tdb, TDB_ERR_CORRUPT,
274 "tdb_check: Invalid chain"
278 h = hash_record(tdb, off);
280 tdb_logerr(tdb, TDB_ERR_CORRUPT,
282 "check: bad hash chain"
289 ecode = tdb_read_convert(tdb, off, &rec,
291 if (ecode != TDB_SUCCESS) {
298 if (is_subhash(group[b])) {
301 << (group_bits + TDB_HASH_GROUP_BITS))
302 + g * (1 << TDB_HASH_GROUP_BITS) + b;
304 if (!check_hash_record(tdb,
305 group[b] & TDB_OFF_MASK,
309 + TDB_HASH_GROUP_BITS,
310 used, num_used, num_found,
311 check, private_data))
317 /* Does it belong here at all? */
318 h = hash_record(tdb, off);
320 if (get_bits(h, hprefix_bits, &used_bits) != hprefix
322 tdb_logerr(tdb, TDB_ERR_CORRUPT,
324 "check: bad hash placement"
326 (long long)h, (long long)hprefix);
330 /* Does it belong in this group? */
331 if (get_bits(h, group_bits, &used_bits) != g) {
332 tdb_logerr(tdb, TDB_ERR_CORRUPT,
334 "check: bad group %llu vs %u",
339 /* Are bucket bits correct? */
340 bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
341 if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
343 used_bits -= TDB_HASH_GROUP_BITS;
344 tdb_logerr(tdb, TDB_ERR_CORRUPT,
346 "check: bad bucket %u vs %u",
347 (unsigned)get_bits(h,
354 /* There must not be any zero entries between
355 * the bucket it belongs in and this one! */
358 i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
360 tdb_logerr(tdb, TDB_ERR_CORRUPT,
362 "check: bad group placement"
369 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
370 if (ecode != TDB_SUCCESS) {
375 /* Bottom bits must match header. */
376 if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
377 tdb_logerr(tdb, TDB_ERR_CORRUPT,
379 "tdb_check: Bad hash magic at"
380 " offset %llu (0x%llx vs 0x%llx)",
383 (long long)rec_hash(&rec));
390 key.dsize = rec_key_length(&rec);
391 data.dsize = rec_data_length(&rec);
392 key.dptr = (void *)tdb_access_read(tdb,
394 key.dsize + data.dsize,
396 if (TDB_PTR_IS_ERR(key.dptr)) {
397 tdb->ecode = TDB_PTR_ERR(key.dptr);
400 data.dptr = key.dptr + key.dsize;
401 if (check(key, data, private_data) != 0)
403 tdb_access_release(tdb, key.dptr);
407 tdb_access_release(tdb, hash);
411 tdb_access_release(tdb, hash);
415 static bool check_hash(struct tdb_context *tdb,
417 size_t num_used, size_t num_ftables,
418 int (*check)(TDB_DATA, TDB_DATA, void *),
421 /* Free tables also show up as used. */
422 size_t num_found = num_ftables;
424 if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
425 TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
426 0, 0, used, num_used, &num_found,
427 check, private_data))
430 if (num_found != num_used) {
431 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
432 "tdb_check: Not all entries are in hash");
438 static bool check_free(struct tdb_context *tdb,
440 const struct tdb_free_record *frec,
441 tdb_off_t prev, unsigned int ftable,
444 enum TDB_ERROR ecode;
446 if (frec_magic(frec) != TDB_FREE_MAGIC) {
447 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
448 "tdb_check: offset %llu bad magic 0x%llx",
449 (long long)off, (long long)frec->magic_and_prev);
452 if (frec_ftable(frec) != ftable) {
453 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
454 "tdb_check: offset %llu bad freetable %u",
455 (long long)off, frec_ftable(frec));
459 ecode = tdb->methods->oob(tdb, off
461 + sizeof(struct tdb_used_record),
463 if (ecode != TDB_SUCCESS) {
467 if (size_to_bucket(frec_len(frec)) != bucket) {
468 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
469 "tdb_check: offset %llu in wrong bucket %u vs %u",
471 bucket, size_to_bucket(frec_len(frec)));
474 if (prev != frec_prev(frec)) {
475 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
476 "tdb_check: offset %llu bad prev %llu vs %llu",
478 (long long)prev, (long long)frec_len(frec));
484 static bool check_free_table(struct tdb_context *tdb,
485 tdb_off_t ftable_off,
491 struct tdb_freetable ft;
494 enum TDB_ERROR ecode;
496 ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft));
497 if (ecode != TDB_SUCCESS) {
502 if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
503 || rec_key_length(&ft.hdr) != 0
504 || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
505 || rec_hash(&ft.hdr) != 0) {
506 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
507 "tdb_check: Invalid header on free table");
511 for (i = 0; i < TDB_FREE_BUCKETS; i++) {
512 tdb_off_t off, prev = 0, *p;
513 struct tdb_free_record f;
515 h = bucket_off(ftable_off, i);
516 for (off = tdb_read_off(tdb, h); off; off = f.next) {
517 if (TDB_OFF_IS_ERR(off)) {
521 ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
522 if (ecode != TDB_SUCCESS) {
526 if (!check_free(tdb, off, &f, prev, ftable_num, i))
529 /* FIXME: Check hash bits */
530 p = asearch(&off, fr, num_free, off_cmp);
532 tdb_logerr(tdb, TDB_ERR_CORRUPT,
534 "tdb_check: Invalid offset"
535 " %llu in free table",
539 /* Mark it invalid. */
548 /* Slow, but should be very rare. */
549 size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
552 enum TDB_ERROR ecode;
554 for (len = 0; off + len < tdb->map_size; len++) {
556 ecode = tdb->methods->tread(tdb, off, &c, 1);
557 if (ecode != TDB_SUCCESS) {
561 if (c != 0 && c != 0x43)
567 static bool check_linear(struct tdb_context *tdb,
568 tdb_off_t **used, size_t *num_used,
569 tdb_off_t **fr, size_t *num_free,
574 enum TDB_ERROR ecode;
575 bool found_recovery = false;
577 for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
579 struct tdb_used_record u;
580 struct tdb_free_record f;
581 struct tdb_recovery_record r;
583 /* r is larger: only get that if we need to. */
584 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f));
585 if (ecode != TDB_SUCCESS) {
590 /* If we crash after ftruncate, we can get zeroes or fill. */
591 if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
592 || rec.r.magic == 0x4343434343434343ULL) {
593 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
594 if (ecode != TDB_SUCCESS) {
598 if (recovery == off) {
599 found_recovery = true;
600 len = sizeof(rec.r) + rec.r.max_len;
602 len = dead_space(tdb, off);
603 if (len < sizeof(rec.r)) {
604 tdb_logerr(tdb, TDB_ERR_CORRUPT,
606 "tdb_check: invalid dead"
612 tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
613 "Dead space at %zu-%zu (of %zu)",
614 (size_t)off, (size_t)(off + len),
615 (size_t)tdb->map_size);
617 } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
618 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
619 if (ecode != TDB_SUCCESS) {
623 if (recovery != off) {
624 tdb_logerr(tdb, TDB_ERR_CORRUPT,
626 "tdb_check: unexpected recovery"
627 " record at offset %zu",
631 if (rec.r.len > rec.r.max_len) {
632 tdb_logerr(tdb, TDB_ERR_CORRUPT,
634 "tdb_check: invalid recovery length"
635 " %zu", (size_t)rec.r.len);
638 if (rec.r.eof > tdb->map_size) {
639 tdb_logerr(tdb, TDB_ERR_CORRUPT,
641 "tdb_check: invalid old EOF"
642 " %zu", (size_t)rec.r.eof);
645 found_recovery = true;
646 len = sizeof(rec.r) + rec.r.max_len;
647 } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
648 len = sizeof(rec.u) + frec_len(&rec.f);
649 if (off + len > tdb->map_size) {
650 tdb_logerr(tdb, TDB_ERR_CORRUPT,
652 "tdb_check: free overlength %llu"
654 (long long)len, (long long)off);
657 /* This record should be in free lists. */
658 if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
659 && !append(fr, num_free, off)) {
660 tdb_logerr(tdb, TDB_ERR_OOM,
662 "tdb_check: tracking %zu'th"
663 " free record.", *num_free);
666 } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
667 || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
668 || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
669 || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
670 uint64_t klen, dlen, extra;
672 /* This record is used! */
673 if (!append(used, num_used, off)) {
674 tdb_logerr(tdb, TDB_ERR_OOM,
676 "tdb_check: tracking %zu'th"
677 " used record.", *num_used);
681 klen = rec_key_length(&rec.u);
682 dlen = rec_data_length(&rec.u);
683 extra = rec_extra_padding(&rec.u);
685 len = sizeof(rec.u) + klen + dlen + extra;
686 if (off + len > tdb->map_size) {
687 tdb_logerr(tdb, TDB_ERR_CORRUPT,
689 "tdb_check: used overlength %llu"
691 (long long)len, (long long)off);
695 if (len < sizeof(rec.f)) {
696 tdb_logerr(tdb, TDB_ERR_CORRUPT,
698 "tdb_check: too short record %llu"
700 (long long)len, (long long)off);
704 tdb_logerr(tdb, TDB_ERR_CORRUPT,
706 "tdb_check: Bad magic 0x%llx at offset %zu",
707 (long long)rec_magic(&rec.u), (size_t)off);
712 /* We must have found recovery area if there was one. */
713 if (recovery != 0 && !found_recovery) {
714 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
715 "tdb_check: expected a recovery area at %zu",
723 int tdb_check(struct tdb_context *tdb,
724 int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
727 tdb_off_t *fr = NULL, *used = NULL, ft, recovery;
728 size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
729 enum TDB_ERROR ecode;
731 ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
732 if (ecode != TDB_SUCCESS) {
737 ecode = tdb_lock_expand(tdb, F_RDLCK);
738 if (ecode != TDB_SUCCESS) {
740 tdb_allrecord_unlock(tdb, F_RDLCK);
744 if (!check_header(tdb, &recovery))
747 /* First we do a linear scan, checking all records. */
748 if (!check_linear(tdb, &used, &num_used, &fr, &num_free, recovery))
751 for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
752 if (TDB_OFF_IS_ERR(ft)) {
756 if (!check_free_table(tdb, ft, num_ftables, fr, num_free,
762 /* FIXME: Check key uniqueness? */
763 if (!check_hash(tdb, used, num_used, num_ftables, check, private_data))
766 if (num_found != num_free) {
767 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
768 "tdb_check: Not all entries are in free table");
772 tdb_allrecord_unlock(tdb, F_RDLCK);
773 tdb_unlock_expand(tdb, F_RDLCK);
781 tdb_allrecord_unlock(tdb, F_RDLCK);
782 tdb_unlock_expand(tdb, F_RDLCK);