2 Trivial Database 2: free list/block handling
3 Copyright (C) Rusty Russell 2010
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 3 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
25 tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
33 static enum TDB_ERROR check_header(struct tdb_context *tdb, tdb_off_t *recovery,
37 struct tdb_header hdr;
40 ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr));
41 if (ecode != TDB_SUCCESS) {
44 /* magic food should not be converted, so convert back. */
45 tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
47 hash_test = TDB_HASH_MAGIC;
48 hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
49 if (hdr.hash_test != hash_test) {
50 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
51 "check: hash test %llu should be %llu",
52 (long long)hdr.hash_test,
53 (long long)hash_test);
56 if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
57 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
58 "check: bad magic '%.*s'",
59 (unsigned)sizeof(hdr.magic_food),
63 /* Features which are used must be a subset of features offered. */
64 if (hdr.features_used & ~hdr.features_offered) {
65 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
66 "check: features used (0x%llx) which"
67 " are not offered (0x%llx)",
68 (long long)hdr.features_used,
69 (long long)hdr.features_offered);
72 *features = hdr.features_offered;
73 *recovery = hdr.recovery;
75 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
76 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
78 " invalid recovery offset %zu",
83 /* Don't check reserved: they *can* be used later. */
87 static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
88 tdb_off_t off, unsigned int group_bits,
90 unsigned hprefix_bits,
94 enum TDB_ERROR (*check)(TDB_DATA,
98 static enum TDB_ERROR check_hash_chain(struct tdb_context *tdb,
104 enum TDB_ERROR (*check)(TDB_DATA,
109 struct tdb_used_record rec;
110 enum TDB_ERROR ecode;
112 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
113 if (ecode != TDB_SUCCESS) {
117 if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
118 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
119 "tdb_check: Bad hash chain magic %llu",
120 (long long)rec_magic(&rec));
123 if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
124 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
126 " Bad hash chain length %llu vs %zu",
127 (long long)rec_data_length(&rec),
128 sizeof(struct tdb_chain));
130 if (rec_key_length(&rec) != 0) {
131 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
132 "tdb_check: Bad hash chain key length %llu",
133 (long long)rec_key_length(&rec));
135 if (rec_hash(&rec) != 0) {
136 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
137 "tdb_check: Bad hash chain hash value %llu",
138 (long long)rec_hash(&rec));
142 ecode = check_hash_tree(tdb, off, 0, hash, 64,
143 used, num_used, num_found, check, private_data);
144 if (ecode != TDB_SUCCESS) {
148 off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
149 if (TDB_OFF_IS_ERR(off)) {
155 return check_hash_chain(tdb, off, hash, used, num_used, num_found,
156 check, private_data);
159 static enum TDB_ERROR check_hash_record(struct tdb_context *tdb,
162 unsigned hprefix_bits,
166 enum TDB_ERROR (*check)(TDB_DATA,
171 struct tdb_used_record rec;
172 enum TDB_ERROR ecode;
174 if (hprefix_bits >= 64)
175 return check_hash_chain(tdb, off, hprefix, used, num_used,
176 num_found, check, private_data);
178 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
179 if (ecode != TDB_SUCCESS) {
183 if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
184 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
185 "tdb_check: Bad hash table magic %llu",
186 (long long)rec_magic(&rec));
188 if (rec_data_length(&rec)
189 != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
190 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
192 " Bad hash table length %llu vs %llu",
193 (long long)rec_data_length(&rec),
194 (long long)sizeof(tdb_off_t)
195 << TDB_SUBLEVEL_HASH_BITS);
197 if (rec_key_length(&rec) != 0) {
198 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
199 "tdb_check: Bad hash table key length %llu",
200 (long long)rec_key_length(&rec));
202 if (rec_hash(&rec) != 0) {
203 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
204 "tdb_check: Bad hash table hash value %llu",
205 (long long)rec_hash(&rec));
209 return check_hash_tree(tdb, off,
210 TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
211 hprefix, hprefix_bits,
212 used, num_used, num_found, check, private_data);
215 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
217 /* Can overflow an int. */
223 static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
227 return (h >> (64 - *used)) & ((1U << num) - 1);
230 static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
231 tdb_off_t off, unsigned int group_bits,
233 unsigned hprefix_bits,
237 enum TDB_ERROR (*check)(TDB_DATA,
242 const tdb_off_t *hash;
243 struct tdb_used_record rec;
244 enum TDB_ERROR ecode;
246 hash = tdb_access_read(tdb, off,
248 << (group_bits + TDB_HASH_GROUP_BITS),
250 if (TDB_PTR_IS_ERR(hash)) {
251 return TDB_PTR_ERR(hash);
254 for (g = 0; g < (1 << group_bits); g++) {
255 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
256 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
257 unsigned int bucket, i, used_bits;
263 off = group[b] & TDB_OFF_MASK;
264 p = asearch(&off, used, num_used, off_cmp);
266 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
268 "tdb_check: Invalid offset"
273 /* Mark it invalid. */
277 if (hprefix_bits == 64) {
278 /* Chained entries are unordered. */
279 if (is_subhash(group[b])) {
280 ecode = TDB_ERR_CORRUPT;
281 tdb_logerr(tdb, ecode,
283 "tdb_check: Invalid chain"
287 h = hash_record(tdb, off);
289 ecode = TDB_ERR_CORRUPT;
290 tdb_logerr(tdb, ecode,
292 "check: bad hash chain"
299 ecode = tdb_read_convert(tdb, off, &rec,
301 if (ecode != TDB_SUCCESS) {
307 if (is_subhash(group[b])) {
310 << (group_bits + TDB_HASH_GROUP_BITS))
311 + g * (1 << TDB_HASH_GROUP_BITS) + b;
313 ecode = check_hash_record(tdb,
314 group[b] & TDB_OFF_MASK,
318 + TDB_HASH_GROUP_BITS,
319 used, num_used, num_found,
320 check, private_data);
321 if (ecode != TDB_SUCCESS) {
328 /* Does it belong here at all? */
329 h = hash_record(tdb, off);
331 if (get_bits(h, hprefix_bits, &used_bits) != hprefix
333 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
335 "check: bad hash placement"
342 /* Does it belong in this group? */
343 if (get_bits(h, group_bits, &used_bits) != g) {
344 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
346 "check: bad group %llu"
352 /* Are bucket bits correct? */
353 bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
354 if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
356 used_bits -= TDB_HASH_GROUP_BITS;
357 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
359 "check: bad bucket %u vs %u",
360 (unsigned)get_bits(h,
367 /* There must not be any zero entries between
368 * the bucket it belongs in and this one! */
371 i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
373 ecode = TDB_ERR_CORRUPT;
374 tdb_logerr(tdb, ecode,
376 "check: bad group placement"
383 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
384 if (ecode != TDB_SUCCESS) {
388 /* Bottom bits must match header. */
389 if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
390 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
392 "tdb_check: Bad hash magic"
394 " (0x%llx vs 0x%llx)",
397 (long long)rec_hash(&rec));
404 key.dsize = rec_key_length(&rec);
405 data.dsize = rec_data_length(&rec);
406 key.dptr = (void *)tdb_access_read(tdb,
408 key.dsize + data.dsize,
410 if (TDB_PTR_IS_ERR(key.dptr)) {
411 ecode = TDB_PTR_ERR(key.dptr);
414 data.dptr = key.dptr + key.dsize;
415 ecode = check(key, data, private_data);
416 if (ecode != TDB_SUCCESS) {
419 tdb_access_release(tdb, key.dptr);
423 tdb_access_release(tdb, hash);
427 tdb_access_release(tdb, hash);
431 static enum TDB_ERROR check_hash(struct tdb_context *tdb,
433 size_t num_used, size_t num_ftables,
434 int (*check)(TDB_DATA, TDB_DATA, void *),
437 /* Free tables also show up as used. */
438 size_t num_found = num_ftables;
439 enum TDB_ERROR ecode;
441 ecode = check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
442 TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
443 0, 0, used, num_used, &num_found,
444 check, private_data);
445 if (ecode == TDB_SUCCESS) {
446 if (num_found != num_used) {
447 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
448 "tdb_check: Not all entries"
455 static enum TDB_ERROR check_free(struct tdb_context *tdb,
457 const struct tdb_free_record *frec,
458 tdb_off_t prev, unsigned int ftable,
461 enum TDB_ERROR ecode;
463 if (frec_magic(frec) != TDB_FREE_MAGIC) {
464 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
465 "tdb_check: offset %llu bad magic 0x%llx",
467 (long long)frec->magic_and_prev);
469 if (frec_ftable(frec) != ftable) {
470 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
471 "tdb_check: offset %llu bad freetable %u",
472 (long long)off, frec_ftable(frec));
476 ecode = tdb->methods->oob(tdb, off
478 + sizeof(struct tdb_used_record),
480 if (ecode != TDB_SUCCESS) {
483 if (size_to_bucket(frec_len(frec)) != bucket) {
484 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
485 "tdb_check: offset %llu in wrong bucket"
488 bucket, size_to_bucket(frec_len(frec)));
490 if (prev != frec_prev(frec)) {
491 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
492 "tdb_check: offset %llu bad prev"
495 (long long)prev, (long long)frec_len(frec));
500 static enum TDB_ERROR check_free_table(struct tdb_context *tdb,
501 tdb_off_t ftable_off,
507 struct tdb_freetable ft;
510 enum TDB_ERROR ecode;
512 ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft));
513 if (ecode != TDB_SUCCESS) {
517 if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
518 || rec_key_length(&ft.hdr) != 0
519 || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
520 || rec_hash(&ft.hdr) != 0) {
521 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
522 "tdb_check: Invalid header on free table");
525 for (i = 0; i < TDB_FREE_BUCKETS; i++) {
526 tdb_off_t off, prev = 0, *p;
527 struct tdb_free_record f;
529 h = bucket_off(ftable_off, i);
530 for (off = tdb_read_off(tdb, h); off; off = f.next) {
531 if (TDB_OFF_IS_ERR(off)) {
534 ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
535 if (ecode != TDB_SUCCESS) {
538 ecode = check_free(tdb, off, &f, prev, ftable_num, i);
539 if (ecode != TDB_SUCCESS) {
543 /* FIXME: Check hash bits */
544 p = asearch(&off, fr, num_free, off_cmp);
546 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
548 "tdb_check: Invalid offset"
549 " %llu in free table",
552 /* Mark it invalid. */
561 /* Slow, but should be very rare. */
562 tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off)
565 enum TDB_ERROR ecode;
567 for (len = 0; off + len < tdb->map_size; len++) {
569 ecode = tdb->methods->tread(tdb, off, &c, 1);
570 if (ecode != TDB_SUCCESS) {
573 if (c != 0 && c != 0x43)
579 static enum TDB_ERROR check_linear(struct tdb_context *tdb,
580 tdb_off_t **used, size_t *num_used,
581 tdb_off_t **fr, size_t *num_free,
582 uint64_t features, tdb_off_t recovery)
586 enum TDB_ERROR ecode;
587 bool found_recovery = false;
589 for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
591 struct tdb_used_record u;
592 struct tdb_free_record f;
593 struct tdb_recovery_record r;
595 /* r is larger: only get that if we need to. */
596 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f));
597 if (ecode != TDB_SUCCESS) {
601 /* If we crash after ftruncate, we can get zeroes or fill. */
602 if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
603 || rec.r.magic == 0x4343434343434343ULL) {
604 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
605 if (ecode != TDB_SUCCESS) {
608 if (recovery == off) {
609 found_recovery = true;
610 len = sizeof(rec.r) + rec.r.max_len;
612 len = dead_space(tdb, off);
613 if (TDB_OFF_IS_ERR(len)) {
616 if (len < sizeof(rec.r)) {
617 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
620 " dead space at %zu",
624 tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
625 "Dead space at %zu-%zu (of %zu)",
626 (size_t)off, (size_t)(off + len),
627 (size_t)tdb->map_size);
629 } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
630 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
631 if (ecode != TDB_SUCCESS) {
634 if (recovery != off) {
635 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
637 "tdb_check: unexpected"
638 " recovery record at offset"
642 if (rec.r.len > rec.r.max_len) {
643 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
645 "tdb_check: invalid recovery"
649 if (rec.r.eof > tdb->map_size) {
650 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
652 "tdb_check: invalid old EOF"
653 " %zu", (size_t)rec.r.eof);
655 found_recovery = true;
656 len = sizeof(rec.r) + rec.r.max_len;
657 } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
658 len = sizeof(rec.u) + frec_len(&rec.f);
659 if (off + len > tdb->map_size) {
660 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
662 "tdb_check: free overlength"
663 " %llu at offset %llu",
667 /* This record should be in free lists. */
668 if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
669 && !append(fr, num_free, off)) {
670 return tdb_logerr(tdb, TDB_ERR_OOM,
672 "tdb_check: tracking %zu'th"
673 " free record.", *num_free);
675 } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
676 || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
677 || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
678 || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
679 uint64_t klen, dlen, extra;
681 /* This record is used! */
682 if (!append(used, num_used, off)) {
683 return tdb_logerr(tdb, TDB_ERR_OOM,
685 "tdb_check: tracking %zu'th"
686 " used record.", *num_used);
689 klen = rec_key_length(&rec.u);
690 dlen = rec_data_length(&rec.u);
691 extra = rec_extra_padding(&rec.u);
693 len = sizeof(rec.u) + klen + dlen + extra;
694 if (off + len > tdb->map_size) {
695 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
697 "tdb_check: used overlength"
698 " %llu at offset %llu",
703 if (len < sizeof(rec.f)) {
704 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
706 "tdb_check: too short record"
712 /* Check that records have correct 0 at end (but may
714 if (extra && !features) {
717 p = tdb_access_read(tdb, off + sizeof(rec.u)
718 + klen + dlen, 1, false);
719 if (TDB_PTR_IS_ERR(p))
720 return TDB_PTR_ERR(p);
722 tdb_access_release(tdb, p);
725 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
734 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
736 "tdb_check: Bad magic 0x%llx"
738 (long long)rec_magic(&rec.u),
743 /* We must have found recovery area if there was one. */
744 if (recovery != 0 && !found_recovery) {
745 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
746 "tdb_check: expected a recovery area at %zu",
753 enum TDB_ERROR tdb_check_(struct tdb_context *tdb,
754 enum TDB_ERROR (*check)(TDB_DATA key, TDB_DATA data,
758 tdb_off_t *fr = NULL, *used = NULL, ft, recovery;
759 size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
761 enum TDB_ERROR ecode;
763 ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
764 if (ecode != TDB_SUCCESS) {
768 ecode = tdb_lock_expand(tdb, F_RDLCK);
769 if (ecode != TDB_SUCCESS) {
770 tdb_allrecord_unlock(tdb, F_RDLCK);
774 ecode = check_header(tdb, &recovery, &features);
775 if (ecode != TDB_SUCCESS)
778 /* First we do a linear scan, checking all records. */
779 ecode = check_linear(tdb, &used, &num_used, &fr, &num_free, features,
781 if (ecode != TDB_SUCCESS)
784 for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
785 if (TDB_OFF_IS_ERR(ft)) {
789 ecode = check_free_table(tdb, ft, num_ftables, fr, num_free,
791 if (ecode != TDB_SUCCESS)
796 /* FIXME: Check key uniqueness? */
797 ecode = check_hash(tdb, used, num_used, num_ftables, check, private);
798 if (ecode != TDB_SUCCESS)
801 if (num_found != num_free) {
802 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
803 "tdb_check: Not all entries are in"
808 tdb_allrecord_unlock(tdb, F_RDLCK);
809 tdb_unlock_expand(tdb, F_RDLCK);