2 Trivial Database 2: free list/block handling
3 Copyright (C) Rusty Russell 2010
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 3 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
25 tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
33 static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
36 struct tdb_header hdr;
39 ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr));
40 if (ecode != TDB_SUCCESS) {
44 /* magic food should not be converted, so convert back. */
45 tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
47 hash_test = TDB_HASH_MAGIC;
48 hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
49 if (hdr.hash_test != hash_test) {
50 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
51 "check: hash test %llu should be %llu",
52 (long long)hdr.hash_test,
53 (long long)hash_test);
57 if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
58 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
59 "check: bad magic '%.*s'",
60 (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
64 *recovery = hdr.recovery;
66 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
67 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
68 "tdb_check: invalid recovery offset %zu",
74 /* Don't check reserved: they *can* be used later. */
78 static bool check_hash_tree(struct tdb_context *tdb,
79 tdb_off_t off, unsigned int group_bits,
81 unsigned hprefix_bits,
85 int (*check)(TDB_DATA, TDB_DATA, void *),
88 static bool check_hash_chain(struct tdb_context *tdb,
94 int (*check)(TDB_DATA, TDB_DATA, void *),
97 struct tdb_used_record rec;
100 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
101 if (ecode != TDB_SUCCESS) {
106 if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
107 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
108 "tdb_check: Bad hash chain magic %llu",
109 (long long)rec_magic(&rec));
113 if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
114 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
115 "tdb_check: Bad hash chain length %llu vs %zu",
116 (long long)rec_data_length(&rec),
117 sizeof(struct tdb_chain));
120 if (rec_key_length(&rec) != 0) {
121 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
122 "tdb_check: Bad hash chain key length %llu",
123 (long long)rec_key_length(&rec));
126 if (rec_hash(&rec) != 0) {
127 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
128 "tdb_check: Bad hash chain hash value %llu",
129 (long long)rec_hash(&rec));
134 if (!check_hash_tree(tdb, off, 0, hash, 64,
135 used, num_used, num_found, check, private_data))
138 off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
139 if (off == TDB_OFF_ERR)
144 return check_hash_chain(tdb, off, hash, used, num_used, num_found,
145 check, private_data);
148 static bool check_hash_record(struct tdb_context *tdb,
151 unsigned hprefix_bits,
155 int (*check)(TDB_DATA, TDB_DATA, void *),
158 struct tdb_used_record rec;
159 enum TDB_ERROR ecode;
161 if (hprefix_bits >= 64)
162 return check_hash_chain(tdb, off, hprefix, used, num_used,
163 num_found, check, private_data);
165 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
166 if (ecode != TDB_SUCCESS) {
171 if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
172 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
173 "tdb_check: Bad hash table magic %llu",
174 (long long)rec_magic(&rec));
177 if (rec_data_length(&rec)
178 != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
179 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
180 "tdb_check: Bad hash table length %llu vs %llu",
181 (long long)rec_data_length(&rec),
182 (long long)sizeof(tdb_off_t)
183 << TDB_SUBLEVEL_HASH_BITS);
186 if (rec_key_length(&rec) != 0) {
187 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
188 "tdb_check: Bad hash table key length %llu",
189 (long long)rec_key_length(&rec));
192 if (rec_hash(&rec) != 0) {
193 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
194 "tdb_check: Bad hash table hash value %llu",
195 (long long)rec_hash(&rec));
200 return check_hash_tree(tdb, off,
201 TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
202 hprefix, hprefix_bits,
203 used, num_used, num_found, check, private_data);
206 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
208 /* Can overflow an int. */
214 static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
218 return (h >> (64 - *used)) & ((1U << num) - 1);
221 static bool check_hash_tree(struct tdb_context *tdb,
222 tdb_off_t off, unsigned int group_bits,
224 unsigned hprefix_bits,
228 int (*check)(TDB_DATA, TDB_DATA, void *),
232 const tdb_off_t *hash;
233 struct tdb_used_record rec;
234 enum TDB_ERROR ecode;
236 hash = tdb_access_read(tdb, off,
238 << (group_bits + TDB_HASH_GROUP_BITS),
240 if (TDB_PTR_IS_ERR(hash)) {
241 tdb->ecode = TDB_PTR_ERR(hash);
245 for (g = 0; g < (1 << group_bits); g++) {
246 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
247 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
248 unsigned int bucket, i, used_bits;
254 off = group[b] & TDB_OFF_MASK;
255 p = asearch(&off, used, num_used, off_cmp);
257 tdb_logerr(tdb, TDB_ERR_CORRUPT,
259 "tdb_check: Invalid offset %llu "
260 "in hash", (long long)off);
263 /* Mark it invalid. */
267 if (hprefix_bits == 64) {
268 /* Chained entries are unordered. */
269 if (is_subhash(group[b])) {
270 tdb_logerr(tdb, TDB_ERR_CORRUPT,
272 "tdb_check: Invalid chain"
276 h = hash_record(tdb, off);
278 tdb_logerr(tdb, TDB_ERR_CORRUPT,
280 "check: bad hash chain"
287 ecode = tdb_read_convert(tdb, off, &rec,
289 if (ecode != TDB_SUCCESS) {
296 if (is_subhash(group[b])) {
299 << (group_bits + TDB_HASH_GROUP_BITS))
300 + g * (1 << TDB_HASH_GROUP_BITS) + b;
302 if (!check_hash_record(tdb,
303 group[b] & TDB_OFF_MASK,
307 + TDB_HASH_GROUP_BITS,
308 used, num_used, num_found,
309 check, private_data))
315 /* Does it belong here at all? */
316 h = hash_record(tdb, off);
318 if (get_bits(h, hprefix_bits, &used_bits) != hprefix
320 tdb_logerr(tdb, TDB_ERR_CORRUPT,
322 "check: bad hash placement"
324 (long long)h, (long long)hprefix);
328 /* Does it belong in this group? */
329 if (get_bits(h, group_bits, &used_bits) != g) {
330 tdb_logerr(tdb, TDB_ERR_CORRUPT,
332 "check: bad group %llu vs %u",
337 /* Are bucket bits correct? */
338 bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
339 if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
341 used_bits -= TDB_HASH_GROUP_BITS;
342 tdb_logerr(tdb, TDB_ERR_CORRUPT,
344 "check: bad bucket %u vs %u",
345 (unsigned)get_bits(h,
352 /* There must not be any zero entries between
353 * the bucket it belongs in and this one! */
356 i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
358 tdb_logerr(tdb, TDB_ERR_CORRUPT,
360 "check: bad group placement"
367 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
368 if (ecode != TDB_SUCCESS) {
373 /* Bottom bits must match header. */
374 if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
375 tdb_logerr(tdb, TDB_ERR_CORRUPT,
377 "tdb_check: Bad hash magic at"
378 " offset %llu (0x%llx vs 0x%llx)",
381 (long long)rec_hash(&rec));
388 key.dsize = rec_key_length(&rec);
389 data.dsize = rec_data_length(&rec);
390 key.dptr = (void *)tdb_access_read(tdb,
392 key.dsize + data.dsize,
394 if (TDB_PTR_IS_ERR(key.dptr)) {
395 tdb->ecode = TDB_PTR_ERR(key.dptr);
398 data.dptr = key.dptr + key.dsize;
399 if (check(key, data, private_data) != 0)
401 tdb_access_release(tdb, key.dptr);
405 tdb_access_release(tdb, hash);
409 tdb_access_release(tdb, hash);
413 static bool check_hash(struct tdb_context *tdb,
415 size_t num_used, size_t num_ftables,
416 int (*check)(TDB_DATA, TDB_DATA, void *),
419 /* Free tables also show up as used. */
420 size_t num_found = num_ftables;
422 if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
423 TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
424 0, 0, used, num_used, &num_found,
425 check, private_data))
428 if (num_found != num_used) {
429 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
430 "tdb_check: Not all entries are in hash");
436 static bool check_free(struct tdb_context *tdb,
438 const struct tdb_free_record *frec,
439 tdb_off_t prev, unsigned int ftable,
442 enum TDB_ERROR ecode;
444 if (frec_magic(frec) != TDB_FREE_MAGIC) {
445 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
446 "tdb_check: offset %llu bad magic 0x%llx",
447 (long long)off, (long long)frec->magic_and_prev);
450 if (frec_ftable(frec) != ftable) {
451 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
452 "tdb_check: offset %llu bad freetable %u",
453 (long long)off, frec_ftable(frec));
457 ecode = tdb->methods->oob(tdb, off
459 + sizeof(struct tdb_used_record),
461 if (ecode != TDB_SUCCESS) {
465 if (size_to_bucket(frec_len(frec)) != bucket) {
466 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
467 "tdb_check: offset %llu in wrong bucket %u vs %u",
469 bucket, size_to_bucket(frec_len(frec)));
472 if (prev != frec_prev(frec)) {
473 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
474 "tdb_check: offset %llu bad prev %llu vs %llu",
476 (long long)prev, (long long)frec_len(frec));
482 static bool check_free_table(struct tdb_context *tdb,
483 tdb_off_t ftable_off,
489 struct tdb_freetable ft;
492 enum TDB_ERROR ecode;
494 ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft));
495 if (ecode != TDB_SUCCESS) {
500 if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
501 || rec_key_length(&ft.hdr) != 0
502 || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
503 || rec_hash(&ft.hdr) != 0) {
504 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
505 "tdb_check: Invalid header on free table");
509 for (i = 0; i < TDB_FREE_BUCKETS; i++) {
510 tdb_off_t off, prev = 0, *p;
511 struct tdb_free_record f;
513 h = bucket_off(ftable_off, i);
514 for (off = tdb_read_off(tdb, h); off; off = f.next) {
515 if (off == TDB_OFF_ERR)
517 ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
518 if (ecode != TDB_SUCCESS) {
522 if (!check_free(tdb, off, &f, prev, ftable_num, i))
525 /* FIXME: Check hash bits */
526 p = asearch(&off, fr, num_free, off_cmp);
528 tdb_logerr(tdb, TDB_ERR_CORRUPT,
530 "tdb_check: Invalid offset"
531 " %llu in free table",
535 /* Mark it invalid. */
544 /* Slow, but should be very rare. */
545 size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
548 enum TDB_ERROR ecode;
550 for (len = 0; off + len < tdb->map_size; len++) {
552 ecode = tdb->methods->tread(tdb, off, &c, 1);
553 if (ecode != TDB_SUCCESS) {
557 if (c != 0 && c != 0x43)
563 static bool check_linear(struct tdb_context *tdb,
564 tdb_off_t **used, size_t *num_used,
565 tdb_off_t **fr, size_t *num_free,
570 enum TDB_ERROR ecode;
571 bool found_recovery = false;
573 for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
575 struct tdb_used_record u;
576 struct tdb_free_record f;
577 struct tdb_recovery_record r;
579 /* r is larger: only get that if we need to. */
580 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f));
581 if (ecode != TDB_SUCCESS) {
586 /* If we crash after ftruncate, we can get zeroes or fill. */
587 if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
588 || rec.r.magic == 0x4343434343434343ULL) {
589 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
590 if (ecode != TDB_SUCCESS) {
594 if (recovery == off) {
595 found_recovery = true;
596 len = sizeof(rec.r) + rec.r.max_len;
598 len = dead_space(tdb, off);
599 if (len < sizeof(rec.r)) {
600 tdb_logerr(tdb, TDB_ERR_CORRUPT,
602 "tdb_check: invalid dead"
608 tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
609 "Dead space at %zu-%zu (of %zu)",
610 (size_t)off, (size_t)(off + len),
611 (size_t)tdb->map_size);
613 } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
614 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
615 if (ecode != TDB_SUCCESS) {
619 if (recovery != off) {
620 tdb_logerr(tdb, TDB_ERR_CORRUPT,
622 "tdb_check: unexpected recovery"
623 " record at offset %zu",
627 if (rec.r.len > rec.r.max_len) {
628 tdb_logerr(tdb, TDB_ERR_CORRUPT,
630 "tdb_check: invalid recovery length"
631 " %zu", (size_t)rec.r.len);
634 if (rec.r.eof > tdb->map_size) {
635 tdb_logerr(tdb, TDB_ERR_CORRUPT,
637 "tdb_check: invalid old EOF"
638 " %zu", (size_t)rec.r.eof);
641 found_recovery = true;
642 len = sizeof(rec.r) + rec.r.max_len;
643 } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
644 len = sizeof(rec.u) + frec_len(&rec.f);
645 if (off + len > tdb->map_size) {
646 tdb_logerr(tdb, TDB_ERR_CORRUPT,
648 "tdb_check: free overlength %llu"
650 (long long)len, (long long)off);
653 /* This record should be in free lists. */
654 if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
655 && !append(fr, num_free, off)) {
656 tdb_logerr(tdb, TDB_ERR_OOM,
658 "tdb_check: tracking %zu'th"
659 " free record.", *num_free);
662 } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
663 || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
664 || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
665 || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
666 uint64_t klen, dlen, extra;
668 /* This record is used! */
669 if (!append(used, num_used, off)) {
670 tdb_logerr(tdb, TDB_ERR_OOM,
672 "tdb_check: tracking %zu'th"
673 " used record.", *num_used);
677 klen = rec_key_length(&rec.u);
678 dlen = rec_data_length(&rec.u);
679 extra = rec_extra_padding(&rec.u);
681 len = sizeof(rec.u) + klen + dlen + extra;
682 if (off + len > tdb->map_size) {
683 tdb_logerr(tdb, TDB_ERR_CORRUPT,
685 "tdb_check: used overlength %llu"
687 (long long)len, (long long)off);
691 if (len < sizeof(rec.f)) {
692 tdb_logerr(tdb, TDB_ERR_CORRUPT,
694 "tdb_check: too short record %llu"
696 (long long)len, (long long)off);
700 tdb_logerr(tdb, TDB_ERR_CORRUPT,
702 "tdb_check: Bad magic 0x%llx at offset %zu",
703 (long long)rec_magic(&rec.u), (size_t)off);
708 /* We must have found recovery area if there was one. */
709 if (recovery != 0 && !found_recovery) {
710 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
711 "tdb_check: expected a recovery area at %zu",
719 int tdb_check(struct tdb_context *tdb,
720 int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
723 tdb_off_t *fr = NULL, *used = NULL, ft, recovery;
724 size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
725 enum TDB_ERROR ecode;
727 ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
728 if (ecpde != TDB_SUCCESS) {
733 ecode = tdb_lock_expand(tdb, F_RDLCK);
734 if (ecode != TDB_SUCCESS) {
736 tdb_allrecord_unlock(tdb, F_RDLCK);
740 if (!check_header(tdb, &recovery))
743 /* First we do a linear scan, checking all records. */
744 if (!check_linear(tdb, &used, &num_used, &fr, &num_free, recovery))
747 for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
748 if (ft == TDB_OFF_ERR)
750 if (!check_free_table(tdb, ft, num_ftables, fr, num_free,
756 /* FIXME: Check key uniqueness? */
757 if (!check_hash(tdb, used, num_used, num_ftables, check, private_data))
760 if (num_found != num_free) {
761 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
762 "tdb_check: Not all entries are in free table");
766 tdb_allrecord_unlock(tdb, F_RDLCK);
767 tdb_unlock_expand(tdb, F_RDLCK);
775 tdb_allrecord_unlock(tdb, F_RDLCK);
776 tdb_unlock_expand(tdb, F_RDLCK);