2 Trivial Database 2: free list/block handling
3 Copyright (C) Rusty Russell 2010
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 3 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
25 tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
33 static enum TDB_ERROR check_header(struct tdb_context *tdb, tdb_off_t *recovery)
36 struct tdb_header hdr;
39 ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr));
40 if (ecode != TDB_SUCCESS) {
43 /* magic food should not be converted, so convert back. */
44 tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
46 hash_test = TDB_HASH_MAGIC;
47 hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
48 if (hdr.hash_test != hash_test) {
49 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
50 "check: hash test %llu should be %llu",
51 (long long)hdr.hash_test,
52 (long long)hash_test);
55 if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
56 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
57 "check: bad magic '%.*s'",
58 (unsigned)sizeof(hdr.magic_food),
62 *recovery = hdr.recovery;
64 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
65 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
67 " invalid recovery offset %zu",
72 /* Don't check reserved: they *can* be used later. */
76 static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
77 tdb_off_t off, unsigned int group_bits,
79 unsigned hprefix_bits,
83 int (*check)(TDB_DATA, TDB_DATA, void *),
86 static enum TDB_ERROR check_hash_chain(struct tdb_context *tdb,
92 int (*check)(TDB_DATA, TDB_DATA, void *),
95 struct tdb_used_record rec;
98 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
99 if (ecode != TDB_SUCCESS) {
103 if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
104 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
105 "tdb_check: Bad hash chain magic %llu",
106 (long long)rec_magic(&rec));
109 if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
110 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
112 " Bad hash chain length %llu vs %zu",
113 (long long)rec_data_length(&rec),
114 sizeof(struct tdb_chain));
116 if (rec_key_length(&rec) != 0) {
117 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
118 "tdb_check: Bad hash chain key length %llu",
119 (long long)rec_key_length(&rec));
121 if (rec_hash(&rec) != 0) {
122 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
123 "tdb_check: Bad hash chain hash value %llu",
124 (long long)rec_hash(&rec));
128 ecode = check_hash_tree(tdb, off, 0, hash, 64,
129 used, num_used, num_found, check, private_data);
130 if (ecode != TDB_SUCCESS) {
134 off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
135 if (TDB_OFF_IS_ERR(off)) {
141 return check_hash_chain(tdb, off, hash, used, num_used, num_found,
142 check, private_data);
145 static enum TDB_ERROR check_hash_record(struct tdb_context *tdb,
148 unsigned hprefix_bits,
152 int (*check)(TDB_DATA, TDB_DATA, void*),
155 struct tdb_used_record rec;
156 enum TDB_ERROR ecode;
158 if (hprefix_bits >= 64)
159 return check_hash_chain(tdb, off, hprefix, used, num_used,
160 num_found, check, private_data);
162 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
163 if (ecode != TDB_SUCCESS) {
167 if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
168 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
169 "tdb_check: Bad hash table magic %llu",
170 (long long)rec_magic(&rec));
172 if (rec_data_length(&rec)
173 != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
174 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
176 " Bad hash table length %llu vs %llu",
177 (long long)rec_data_length(&rec),
178 (long long)sizeof(tdb_off_t)
179 << TDB_SUBLEVEL_HASH_BITS);
181 if (rec_key_length(&rec) != 0) {
182 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
183 "tdb_check: Bad hash table key length %llu",
184 (long long)rec_key_length(&rec));
186 if (rec_hash(&rec) != 0) {
187 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
188 "tdb_check: Bad hash table hash value %llu",
189 (long long)rec_hash(&rec));
193 return check_hash_tree(tdb, off,
194 TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
195 hprefix, hprefix_bits,
196 used, num_used, num_found, check, private_data);
199 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
201 /* Can overflow an int. */
207 static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
211 return (h >> (64 - *used)) & ((1U << num) - 1);
214 static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
215 tdb_off_t off, unsigned int group_bits,
217 unsigned hprefix_bits,
221 int (*check)(TDB_DATA, TDB_DATA, void *),
225 const tdb_off_t *hash;
226 struct tdb_used_record rec;
227 enum TDB_ERROR ecode;
229 hash = tdb_access_read(tdb, off,
231 << (group_bits + TDB_HASH_GROUP_BITS),
233 if (TDB_PTR_IS_ERR(hash)) {
234 return TDB_PTR_ERR(hash);
237 for (g = 0; g < (1 << group_bits); g++) {
238 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
239 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
240 unsigned int bucket, i, used_bits;
246 off = group[b] & TDB_OFF_MASK;
247 p = asearch(&off, used, num_used, off_cmp);
249 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
251 "tdb_check: Invalid offset"
256 /* Mark it invalid. */
260 if (hprefix_bits == 64) {
261 /* Chained entries are unordered. */
262 if (is_subhash(group[b])) {
263 ecode = TDB_ERR_CORRUPT;
264 tdb_logerr(tdb, ecode,
266 "tdb_check: Invalid chain"
270 h = hash_record(tdb, off);
272 ecode = TDB_ERR_CORRUPT;
273 tdb_logerr(tdb, ecode,
275 "check: bad hash chain"
282 ecode = tdb_read_convert(tdb, off, &rec,
284 if (ecode != TDB_SUCCESS) {
290 if (is_subhash(group[b])) {
293 << (group_bits + TDB_HASH_GROUP_BITS))
294 + g * (1 << TDB_HASH_GROUP_BITS) + b;
296 ecode = check_hash_record(tdb,
297 group[b] & TDB_OFF_MASK,
301 + TDB_HASH_GROUP_BITS,
302 used, num_used, num_found,
303 check, private_data);
304 if (ecode != TDB_SUCCESS) {
311 /* Does it belong here at all? */
312 h = hash_record(tdb, off);
314 if (get_bits(h, hprefix_bits, &used_bits) != hprefix
316 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
318 "check: bad hash placement"
325 /* Does it belong in this group? */
326 if (get_bits(h, group_bits, &used_bits) != g) {
327 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
329 "check: bad group %llu"
335 /* Are bucket bits correct? */
336 bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
337 if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
339 used_bits -= TDB_HASH_GROUP_BITS;
340 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
342 "check: bad bucket %u vs %u",
343 (unsigned)get_bits(h,
350 /* There must not be any zero entries between
351 * the bucket it belongs in and this one! */
354 i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
356 ecode = TDB_ERR_CORRUPT;
357 tdb_logerr(tdb, ecode,
359 "check: bad group placement"
366 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
367 if (ecode != TDB_SUCCESS) {
371 /* Bottom bits must match header. */
372 if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
373 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
375 "tdb_check: Bad hash magic"
377 " (0x%llx vs 0x%llx)",
380 (long long)rec_hash(&rec));
387 key.dsize = rec_key_length(&rec);
388 data.dsize = rec_data_length(&rec);
389 key.dptr = (void *)tdb_access_read(tdb,
391 key.dsize + data.dsize,
393 if (TDB_PTR_IS_ERR(key.dptr)) {
394 ecode = TDB_PTR_ERR(key.dptr);
397 data.dptr = key.dptr + key.dsize;
398 if (check(key, data, private_data) != 0) {
399 ecode = TDB_ERR_CORRUPT;
402 tdb_access_release(tdb, key.dptr);
406 tdb_access_release(tdb, hash);
410 tdb_access_release(tdb, hash);
414 static enum TDB_ERROR check_hash(struct tdb_context *tdb,
416 size_t num_used, size_t num_ftables,
417 int (*check)(TDB_DATA, TDB_DATA, void *),
420 /* Free tables also show up as used. */
421 size_t num_found = num_ftables;
422 enum TDB_ERROR ecode;
424 ecode = check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
425 TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
426 0, 0, used, num_used, &num_found,
427 check, private_data);
428 if (ecode == TDB_SUCCESS) {
429 if (num_found != num_used) {
430 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
431 "tdb_check: Not all entries"
438 static enum TDB_ERROR check_free(struct tdb_context *tdb,
440 const struct tdb_free_record *frec,
441 tdb_off_t prev, unsigned int ftable,
444 enum TDB_ERROR ecode;
446 if (frec_magic(frec) != TDB_FREE_MAGIC) {
447 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
448 "tdb_check: offset %llu bad magic 0x%llx",
450 (long long)frec->magic_and_prev);
452 if (frec_ftable(frec) != ftable) {
453 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
454 "tdb_check: offset %llu bad freetable %u",
455 (long long)off, frec_ftable(frec));
459 ecode = tdb->methods->oob(tdb, off
461 + sizeof(struct tdb_used_record),
463 if (ecode != TDB_SUCCESS) {
466 if (size_to_bucket(frec_len(frec)) != bucket) {
467 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
468 "tdb_check: offset %llu in wrong bucket"
471 bucket, size_to_bucket(frec_len(frec)));
473 if (prev != frec_prev(frec)) {
474 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
475 "tdb_check: offset %llu bad prev"
478 (long long)prev, (long long)frec_len(frec));
483 static enum TDB_ERROR check_free_table(struct tdb_context *tdb,
484 tdb_off_t ftable_off,
490 struct tdb_freetable ft;
493 enum TDB_ERROR ecode;
495 ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft));
496 if (ecode != TDB_SUCCESS) {
500 if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
501 || rec_key_length(&ft.hdr) != 0
502 || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
503 || rec_hash(&ft.hdr) != 0) {
504 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
505 "tdb_check: Invalid header on free table");
508 for (i = 0; i < TDB_FREE_BUCKETS; i++) {
509 tdb_off_t off, prev = 0, *p;
510 struct tdb_free_record f;
512 h = bucket_off(ftable_off, i);
513 for (off = tdb_read_off(tdb, h); off; off = f.next) {
514 if (TDB_OFF_IS_ERR(off)) {
517 ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
518 if (ecode != TDB_SUCCESS) {
521 ecode = check_free(tdb, off, &f, prev, ftable_num, i);
522 if (ecode != TDB_SUCCESS) {
526 /* FIXME: Check hash bits */
527 p = asearch(&off, fr, num_free, off_cmp);
529 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
531 "tdb_check: Invalid offset"
532 " %llu in free table",
535 /* Mark it invalid. */
544 /* Slow, but should be very rare. */
545 tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off)
548 enum TDB_ERROR ecode;
550 for (len = 0; off + len < tdb->map_size; len++) {
552 ecode = tdb->methods->tread(tdb, off, &c, 1);
553 if (ecode != TDB_SUCCESS) {
556 if (c != 0 && c != 0x43)
562 static enum TDB_ERROR check_linear(struct tdb_context *tdb,
563 tdb_off_t **used, size_t *num_used,
564 tdb_off_t **fr, size_t *num_free,
569 enum TDB_ERROR ecode;
570 bool found_recovery = false;
572 for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
574 struct tdb_used_record u;
575 struct tdb_free_record f;
576 struct tdb_recovery_record r;
578 /* r is larger: only get that if we need to. */
579 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f));
580 if (ecode != TDB_SUCCESS) {
584 /* If we crash after ftruncate, we can get zeroes or fill. */
585 if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
586 || rec.r.magic == 0x4343434343434343ULL) {
587 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
588 if (ecode != TDB_SUCCESS) {
591 if (recovery == off) {
592 found_recovery = true;
593 len = sizeof(rec.r) + rec.r.max_len;
595 len = dead_space(tdb, off);
596 if (TDB_OFF_IS_ERR(len)) {
599 if (len < sizeof(rec.r)) {
600 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
603 " dead space at %zu",
607 tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
608 "Dead space at %zu-%zu (of %zu)",
609 (size_t)off, (size_t)(off + len),
610 (size_t)tdb->map_size);
612 } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
613 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
614 if (ecode != TDB_SUCCESS) {
617 if (recovery != off) {
618 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
620 "tdb_check: unexpected"
621 " recovery record at offset"
625 if (rec.r.len > rec.r.max_len) {
626 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
628 "tdb_check: invalid recovery"
632 if (rec.r.eof > tdb->map_size) {
633 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
635 "tdb_check: invalid old EOF"
636 " %zu", (size_t)rec.r.eof);
638 found_recovery = true;
639 len = sizeof(rec.r) + rec.r.max_len;
640 } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
641 len = sizeof(rec.u) + frec_len(&rec.f);
642 if (off + len > tdb->map_size) {
643 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
645 "tdb_check: free overlength"
646 " %llu at offset %llu",
650 /* This record should be in free lists. */
651 if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
652 && !append(fr, num_free, off)) {
653 return tdb_logerr(tdb, TDB_ERR_OOM,
655 "tdb_check: tracking %zu'th"
656 " free record.", *num_free);
658 } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
659 || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
660 || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
661 || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
662 uint64_t klen, dlen, extra;
664 /* This record is used! */
665 if (!append(used, num_used, off)) {
666 return tdb_logerr(tdb, TDB_ERR_OOM,
668 "tdb_check: tracking %zu'th"
669 " used record.", *num_used);
672 klen = rec_key_length(&rec.u);
673 dlen = rec_data_length(&rec.u);
674 extra = rec_extra_padding(&rec.u);
676 len = sizeof(rec.u) + klen + dlen + extra;
677 if (off + len > tdb->map_size) {
678 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
680 "tdb_check: used overlength"
681 " %llu at offset %llu",
686 if (len < sizeof(rec.f)) {
687 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
689 "tdb_check: too short record"
695 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
697 "tdb_check: Bad magic 0x%llx"
699 (long long)rec_magic(&rec.u),
704 /* We must have found recovery area if there was one. */
705 if (recovery != 0 && !found_recovery) {
706 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
707 "tdb_check: expected a recovery area at %zu",
714 int tdb_check(struct tdb_context *tdb,
715 int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
718 tdb_off_t *fr = NULL, *used = NULL, ft, recovery;
719 size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
720 enum TDB_ERROR ecode;
722 ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
723 if (ecode != TDB_SUCCESS) {
728 ecode = tdb_lock_expand(tdb, F_RDLCK);
729 if (ecode != TDB_SUCCESS) {
731 tdb_allrecord_unlock(tdb, F_RDLCK);
735 ecode = check_header(tdb, &recovery);
736 if (ecode != TDB_SUCCESS)
739 /* First we do a linear scan, checking all records. */
740 ecode = check_linear(tdb, &used, &num_used, &fr, &num_free, recovery);
741 if (ecode != TDB_SUCCESS)
744 for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
745 if (TDB_OFF_IS_ERR(ft)) {
749 ecode = check_free_table(tdb, ft, num_ftables, fr, num_free,
751 if (ecode != TDB_SUCCESS)
756 /* FIXME: Check key uniqueness? */
757 ecode = check_hash(tdb, used, num_used, num_ftables, check,
759 if (ecode != TDB_SUCCESS)
762 if (num_found != num_free) {
763 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
764 "tdb_check: Not all entries are in"
769 tdb_allrecord_unlock(tdb, F_RDLCK);
770 tdb_unlock_expand(tdb, F_RDLCK);
773 if (ecode != TDB_SUCCESS) {