]> git.ozlabs.org Git - ccan/blob - ccan/tdb2/check.c
tdb2: rework free.c functions to return enum TDB_ERROR.
[ccan] / ccan / tdb2 / check.c
1  /*
2    Trivial Database 2: database consistency checking (tdb_check)
3    Copyright (C) Rusty Russell 2010
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
21
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
24 {
25         tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
26         if (!new)
27                 return false;
28         new[(*num)++] = off;
29         *arr = new;
30         return true;
31 }
32
33 static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
34 {
35         uint64_t hash_test;
36         struct tdb_header hdr;
37         enum TDB_ERROR ecode;
38
39         ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr));
40         if (ecode != TDB_SUCCESS) {
41                 tdb->ecode = ecode;
42                 return false;
43         }
44         /* magic food should not be converted, so convert back. */
45         tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
46
47         hash_test = TDB_HASH_MAGIC;
48         hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
49         if (hdr.hash_test != hash_test) {
50                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
51                            "check: hash test %llu should be %llu",
52                            (long long)hdr.hash_test,
53                            (long long)hash_test);
54                 return false;
55         }
56
57         if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
58                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
59                            "check: bad magic '%.*s'",
60                            (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
61                 return false;
62         }
63
64         *recovery = hdr.recovery;
65         if (*recovery) {
66                 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
67                         tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
68                                  "tdb_check: invalid recovery offset %zu",
69                                  (size_t)*recovery);
70                         return false;
71                 }
72         }
73
74         /* Don't check reserved: they *can* be used later. */
75         return true;
76 }
77
78 static bool check_hash_tree(struct tdb_context *tdb,
79                             tdb_off_t off, unsigned int group_bits,
80                             uint64_t hprefix,
81                             unsigned hprefix_bits,
82                             tdb_off_t used[],
83                             size_t num_used,
84                             size_t *num_found,
85                             int (*check)(TDB_DATA, TDB_DATA, void *),
86                             void *private_data);
87
88 static bool check_hash_chain(struct tdb_context *tdb,
89                              tdb_off_t off,
90                              uint64_t hash,
91                              tdb_off_t used[],
92                              size_t num_used,
93                              size_t *num_found,
94                              int (*check)(TDB_DATA, TDB_DATA, void *),
95                              void *private_data)
96 {
97         struct tdb_used_record rec;
98         enum TDB_ERROR ecode;
99
100         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
101         if (ecode != TDB_SUCCESS) {
102                 tdb->ecode = ecode;
103                 return false;
104         }
105
106         if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
107                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
108                            "tdb_check: Bad hash chain magic %llu",
109                            (long long)rec_magic(&rec));
110                 return false;
111         }
112
113         if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
114                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
115                            "tdb_check: Bad hash chain length %llu vs %zu",
116                            (long long)rec_data_length(&rec),
117                            sizeof(struct tdb_chain));
118                 return false;
119         }
120         if (rec_key_length(&rec) != 0) {
121                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
122                          "tdb_check: Bad hash chain key length %llu",
123                          (long long)rec_key_length(&rec));
124                 return false;
125         }
126         if (rec_hash(&rec) != 0) {
127                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
128                          "tdb_check: Bad hash chain hash value %llu",
129                          (long long)rec_hash(&rec));
130                 return false;
131         }
132
133         off += sizeof(rec);
134         if (!check_hash_tree(tdb, off, 0, hash, 64,
135                              used, num_used, num_found, check, private_data))
136                 return false;
137
138         off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
139         if (TDB_OFF_IS_ERR(off)) {
140                 tdb->ecode = off;
141                 return false;
142         }
143         if (off == 0)
144                 return true;
145         (*num_found)++;
146         return check_hash_chain(tdb, off, hash, used, num_used, num_found,
147                                 check, private_data);
148 }
149
150 static bool check_hash_record(struct tdb_context *tdb,
151                               tdb_off_t off,
152                               uint64_t hprefix,
153                               unsigned hprefix_bits,
154                               tdb_off_t used[],
155                               size_t num_used,
156                               size_t *num_found,
157                               int (*check)(TDB_DATA, TDB_DATA, void *),
158                               void *private_data)
159 {
160         struct tdb_used_record rec;
161         enum TDB_ERROR ecode;
162
163         if (hprefix_bits >= 64)
164                 return check_hash_chain(tdb, off, hprefix, used, num_used,
165                                         num_found, check, private_data);
166
167         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
168         if (ecode != TDB_SUCCESS) {
169                 tdb->ecode = ecode;
170                 return false;
171         }
172
173         if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
174                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
175                            "tdb_check: Bad hash table magic %llu",
176                            (long long)rec_magic(&rec));
177                 return false;
178         }
179         if (rec_data_length(&rec)
180             != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
181                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
182                            "tdb_check: Bad hash table length %llu vs %llu",
183                            (long long)rec_data_length(&rec),
184                            (long long)sizeof(tdb_off_t)
185                            << TDB_SUBLEVEL_HASH_BITS);
186                 return false;
187         }
188         if (rec_key_length(&rec) != 0) {
189                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
190                          "tdb_check: Bad hash table key length %llu",
191                          (long long)rec_key_length(&rec));
192                 return false;
193         }
194         if (rec_hash(&rec) != 0) {
195                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
196                          "tdb_check: Bad hash table hash value %llu",
197                          (long long)rec_hash(&rec));
198                 return false;
199         }
200
201         off += sizeof(rec);
202         return check_hash_tree(tdb, off,
203                                TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
204                                hprefix, hprefix_bits,
205                                used, num_used, num_found, check, private_data);
206 }
207
208 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
209 {
210         /* Can overflow an int. */
211         return *a > *b ? 1
212                 : *a < *b ? -1
213                 : 0;
214 }
215
/*
 * Extract the next @num bits of @h, counting from the most significant end.
 *
 * @h:    the 64-bit hash value.
 * @num:  how many bits to take (0..64).
 * @used: running count of bits already consumed; advanced by @num.
 *
 * Returns the extracted bits in the low @num bits of the result.  A request
 * for zero bits returns 0 (and avoids an undefined 64-bit shift when nothing
 * has been consumed yet).  The mask is built in 64 bits so that widths of 32
 * or more work; the caller must keep *used + num <= 64.
 */
static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
{
	uint64_t mask;

	*used += num;
	if (num == 0)
		return 0;

	mask = num >= 64 ? ~(uint64_t)0 : (((uint64_t)1 << num) - 1);
	return (h >> (64 - *used)) & mask;
}
222
223 static bool check_hash_tree(struct tdb_context *tdb,
224                             tdb_off_t off, unsigned int group_bits,
225                             uint64_t hprefix,
226                             unsigned hprefix_bits,
227                             tdb_off_t used[],
228                             size_t num_used,
229                             size_t *num_found,
230                             int (*check)(TDB_DATA, TDB_DATA, void *),
231                             void *private_data)
232 {
233         unsigned int g, b;
234         const tdb_off_t *hash;
235         struct tdb_used_record rec;
236         enum TDB_ERROR ecode;
237
238         hash = tdb_access_read(tdb, off,
239                                sizeof(tdb_off_t)
240                                << (group_bits + TDB_HASH_GROUP_BITS),
241                                true);
242         if (TDB_PTR_IS_ERR(hash)) {
243                 tdb->ecode = TDB_PTR_ERR(hash);
244                 return false;
245         }
246
247         for (g = 0; g < (1 << group_bits); g++) {
248                 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
249                 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
250                         unsigned int bucket, i, used_bits;
251                         uint64_t h;
252                         tdb_off_t *p;
253                         if (group[b] == 0)
254                                 continue;
255
256                         off = group[b] & TDB_OFF_MASK;
257                         p = asearch(&off, used, num_used, off_cmp);
258                         if (!p) {
259                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
260                                            TDB_LOG_ERROR,
261                                            "tdb_check: Invalid offset %llu "
262                                            "in hash", (long long)off);
263                                 goto fail;
264                         }
265                         /* Mark it invalid. */
266                         *p ^= 1;
267                         (*num_found)++;
268
269                         if (hprefix_bits == 64) {
270                                 /* Chained entries are unordered. */
271                                 if (is_subhash(group[b])) {
272                                         tdb_logerr(tdb, TDB_ERR_CORRUPT,
273                                                    TDB_LOG_ERROR,
274                                                    "tdb_check: Invalid chain"
275                                                    " entry subhash");
276                                         goto fail;
277                                 }
278                                 h = hash_record(tdb, off);
279                                 if (h != hprefix) {
280                                         tdb_logerr(tdb, TDB_ERR_CORRUPT,
281                                                    TDB_LOG_ERROR,
282                                                    "check: bad hash chain"
283                                                    " placement"
284                                                    " 0x%llx vs 0x%llx",
285                                                    (long long)h,
286                                                    (long long)hprefix);
287                                         goto fail;
288                                 }
289                                 ecode = tdb_read_convert(tdb, off, &rec,
290                                                          sizeof(rec));
291                                 if (ecode != TDB_SUCCESS) {
292                                         tdb->ecode = ecode;
293                                         goto fail;
294                                 }
295                                 goto check;
296                         }
297
298                         if (is_subhash(group[b])) {
299                                 uint64_t subprefix;
300                                 subprefix = (hprefix
301                                      << (group_bits + TDB_HASH_GROUP_BITS))
302                                         + g * (1 << TDB_HASH_GROUP_BITS) + b;
303
304                                 if (!check_hash_record(tdb,
305                                                group[b] & TDB_OFF_MASK,
306                                                subprefix,
307                                                hprefix_bits
308                                                        + group_bits
309                                                        + TDB_HASH_GROUP_BITS,
310                                                used, num_used, num_found,
311                                                check, private_data))
312                                         goto fail;
313                                 continue;
314                         }
315                         /* A normal entry */
316
317                         /* Does it belong here at all? */
318                         h = hash_record(tdb, off);
319                         used_bits = 0;
320                         if (get_bits(h, hprefix_bits, &used_bits) != hprefix
321                             && hprefix_bits) {
322                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
323                                            TDB_LOG_ERROR,
324                                            "check: bad hash placement"
325                                            " 0x%llx vs 0x%llx",
326                                          (long long)h, (long long)hprefix);
327                                 goto fail;
328                         }
329
330                         /* Does it belong in this group? */
331                         if (get_bits(h, group_bits, &used_bits) != g) {
332                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
333                                            TDB_LOG_ERROR,
334                                            "check: bad group %llu vs %u",
335                                            (long long)h, g);
336                                 goto fail;
337                         }
338
339                         /* Are bucket bits correct? */
340                         bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
341                         if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
342                             != bucket) {
343                                 used_bits -= TDB_HASH_GROUP_BITS;
344                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
345                                            TDB_LOG_ERROR,
346                                          "check: bad bucket %u vs %u",
347                                          (unsigned)get_bits(h,
348                                                         TDB_HASH_GROUP_BITS,
349                                                         &used_bits),
350                                          bucket);
351                                 goto fail;
352                         }
353
354                         /* There must not be any zero entries between
355                          * the bucket it belongs in and this one! */
356                         for (i = bucket;
357                              i != b;
358                              i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
359                                 if (group[i] == 0) {
360                                         tdb_logerr(tdb, TDB_ERR_CORRUPT,
361                                                    TDB_LOG_ERROR,
362                                                    "check: bad group placement"
363                                                    " %u vs %u",
364                                                    b, bucket);
365                                         goto fail;
366                                 }
367                         }
368
369                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
370                         if (ecode != TDB_SUCCESS) {
371                                 tdb->ecode = ecode;
372                                 goto fail;
373                         }
374
375                         /* Bottom bits must match header. */
376                         if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
377                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
378                                            TDB_LOG_ERROR,
379                                            "tdb_check: Bad hash magic at"
380                                            " offset %llu (0x%llx vs 0x%llx)",
381                                            (long long)off,
382                                            (long long)h,
383                                            (long long)rec_hash(&rec));
384                                 goto fail;
385                         }
386
387                 check:
388                         if (check) {
389                                 TDB_DATA key, data;
390                                 key.dsize = rec_key_length(&rec);
391                                 data.dsize = rec_data_length(&rec);
392                                 key.dptr = (void *)tdb_access_read(tdb,
393                                                    off + sizeof(rec),
394                                                    key.dsize + data.dsize,
395                                                    false);
396                                 if (TDB_PTR_IS_ERR(key.dptr)) {
397                                         tdb->ecode = TDB_PTR_ERR(key.dptr);
398                                         goto fail;
399                                 }
400                                 data.dptr = key.dptr + key.dsize;
401                                 if (check(key, data, private_data) != 0)
402                                         goto fail;
403                                 tdb_access_release(tdb, key.dptr);
404                         }
405                 }
406         }
407         tdb_access_release(tdb, hash);
408         return true;
409
410 fail:
411         tdb_access_release(tdb, hash);
412         return false;
413 }
414
415 static bool check_hash(struct tdb_context *tdb,
416                        tdb_off_t used[],
417                        size_t num_used, size_t num_ftables,
418                        int (*check)(TDB_DATA, TDB_DATA, void *),
419                        void *private_data)
420 {
421         /* Free tables also show up as used. */
422         size_t num_found = num_ftables;
423
424         if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
425                              TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
426                              0, 0, used, num_used, &num_found,
427                              check, private_data))
428                 return false;
429
430         if (num_found != num_used) {
431                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
432                            "tdb_check: Not all entries are in hash");
433                 return false;
434         }
435         return true;
436 }
437
438 static bool check_free(struct tdb_context *tdb,
439                        tdb_off_t off,
440                        const struct tdb_free_record *frec,
441                        tdb_off_t prev, unsigned int ftable,
442                        unsigned int bucket)
443 {
444         enum TDB_ERROR ecode;
445
446         if (frec_magic(frec) != TDB_FREE_MAGIC) {
447                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
448                            "tdb_check: offset %llu bad magic 0x%llx",
449                            (long long)off, (long long)frec->magic_and_prev);
450                 return false;
451         }
452         if (frec_ftable(frec) != ftable) {
453                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
454                            "tdb_check: offset %llu bad freetable %u",
455                            (long long)off, frec_ftable(frec));
456                 return false;
457         }
458
459         ecode = tdb->methods->oob(tdb, off
460                                   + frec_len(frec)
461                                   + sizeof(struct tdb_used_record),
462                                   false);
463         if (ecode != TDB_SUCCESS) {
464                 tdb->ecode = ecode;
465                 return false;
466         }
467         if (size_to_bucket(frec_len(frec)) != bucket) {
468                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
469                            "tdb_check: offset %llu in wrong bucket %u vs %u",
470                            (long long)off,
471                            bucket, size_to_bucket(frec_len(frec)));
472                 return false;
473         }
474         if (prev != frec_prev(frec)) {
475                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
476                            "tdb_check: offset %llu bad prev %llu vs %llu",
477                            (long long)off,
478                            (long long)prev, (long long)frec_len(frec));
479                 return false;
480         }
481         return true;
482 }
483
484 static bool check_free_table(struct tdb_context *tdb,
485                              tdb_off_t ftable_off,
486                              unsigned ftable_num,
487                              tdb_off_t fr[],
488                              size_t num_free,
489                              size_t *num_found)
490 {
491         struct tdb_freetable ft;
492         tdb_off_t h;
493         unsigned int i;
494         enum TDB_ERROR ecode;
495
496         ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft));
497         if (ecode != TDB_SUCCESS) {
498                 tdb->ecode = ecode;
499                 return false;
500         }
501
502         if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
503             || rec_key_length(&ft.hdr) != 0
504             || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
505             || rec_hash(&ft.hdr) != 0) {
506                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
507                            "tdb_check: Invalid header on free table");
508                 return false;
509         }
510
511         for (i = 0; i < TDB_FREE_BUCKETS; i++) {
512                 tdb_off_t off, prev = 0, *p;
513                 struct tdb_free_record f;
514
515                 h = bucket_off(ftable_off, i);
516                 for (off = tdb_read_off(tdb, h); off; off = f.next) {
517                         if (TDB_OFF_IS_ERR(off)) {
518                                 tdb->ecode = off;
519                                 return false;
520                         }
521                         ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
522                         if (ecode != TDB_SUCCESS) {
523                                 tdb->ecode = ecode;
524                                 return false;
525                         }
526                         if (!check_free(tdb, off, &f, prev, ftable_num, i))
527                                 return false;
528
529                         /* FIXME: Check hash bits */
530                         p = asearch(&off, fr, num_free, off_cmp);
531                         if (!p) {
532                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
533                                            TDB_LOG_ERROR,
534                                            "tdb_check: Invalid offset"
535                                            " %llu in free table",
536                                            (long long)off);
537                                 return false;
538                         }
539                         /* Mark it invalid. */
540                         *p ^= 1;
541                         (*num_found)++;
542                         prev = off;
543                 }
544         }
545         return true;
546 }
547
548 /* Slow, but should be very rare. */
549 size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
550 {
551         size_t len;
552         enum TDB_ERROR ecode;
553
554         for (len = 0; off + len < tdb->map_size; len++) {
555                 char c;
556                 ecode = tdb->methods->tread(tdb, off, &c, 1);
557                 if (ecode != TDB_SUCCESS) {
558                         tdb->ecode = ecode;
559                         return 0;
560                 }
561                 if (c != 0 && c != 0x43)
562                         break;
563         }
564         return len;
565 }
566
567 static bool check_linear(struct tdb_context *tdb,
568                          tdb_off_t **used, size_t *num_used,
569                          tdb_off_t **fr, size_t *num_free,
570                          tdb_off_t recovery)
571 {
572         tdb_off_t off;
573         tdb_len_t len;
574         enum TDB_ERROR ecode;
575         bool found_recovery = false;
576
577         for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
578                 union {
579                         struct tdb_used_record u;
580                         struct tdb_free_record f;
581                         struct tdb_recovery_record r;
582                 } rec;
583                 /* r is larger: only get that if we need to. */
584                 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f));
585                 if (ecode != TDB_SUCCESS) {
586                         tdb->ecode = ecode;
587                         return false;
588                 }
589
590                 /* If we crash after ftruncate, we can get zeroes or fill. */
591                 if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
592                     || rec.r.magic ==  0x4343434343434343ULL) {
593                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
594                         if (ecode != TDB_SUCCESS) {
595                                 tdb->ecode = ecode;
596                                 return false;
597                         }
598                         if (recovery == off) {
599                                 found_recovery = true;
600                                 len = sizeof(rec.r) + rec.r.max_len;
601                         } else {
602                                 len = dead_space(tdb, off);
603                                 if (len < sizeof(rec.r)) {
604                                         tdb_logerr(tdb, TDB_ERR_CORRUPT,
605                                                    TDB_LOG_ERROR,
606                                                    "tdb_check: invalid dead"
607                                                    " space at %zu",
608                                                    (size_t)off);
609                                         return false;
610                                 }
611
612                                 tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
613                                            "Dead space at %zu-%zu (of %zu)",
614                                            (size_t)off, (size_t)(off + len),
615                                            (size_t)tdb->map_size);
616                         }
617                 } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
618                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
619                         if (ecode != TDB_SUCCESS) {
620                                 tdb->ecode = ecode;
621                                 return false;
622                         }
623                         if (recovery != off) {
624                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
625                                            TDB_LOG_ERROR,
626                                            "tdb_check: unexpected recovery"
627                                            " record at offset %zu",
628                                            (size_t)off);
629                                 return false;
630                         }
631                         if (rec.r.len > rec.r.max_len) {
632                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
633                                            TDB_LOG_ERROR,
634                                            "tdb_check: invalid recovery length"
635                                            " %zu", (size_t)rec.r.len);
636                                 return false;
637                         }
638                         if (rec.r.eof > tdb->map_size) {
639                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
640                                            TDB_LOG_ERROR,
641                                            "tdb_check: invalid old EOF"
642                                            " %zu", (size_t)rec.r.eof);
643                                 return false;
644                         }
645                         found_recovery = true;
646                         len = sizeof(rec.r) + rec.r.max_len;
647                 } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
648                         len = sizeof(rec.u) + frec_len(&rec.f);
649                         if (off + len > tdb->map_size) {
650                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
651                                            TDB_LOG_ERROR,
652                                            "tdb_check: free overlength %llu"
653                                            " at offset %llu",
654                                            (long long)len, (long long)off);
655                                 return false;
656                         }
657                         /* This record should be in free lists. */
658                         if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
659                             && !append(fr, num_free, off)) {
660                                 tdb_logerr(tdb, TDB_ERR_OOM,
661                                            TDB_LOG_ERROR,
662                                            "tdb_check: tracking %zu'th"
663                                            " free record.", *num_free);
664                                 return false;
665                         }
666                 } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
667                            || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
668                            || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
669                            || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
670                         uint64_t klen, dlen, extra;
671
672                         /* This record is used! */
673                         if (!append(used, num_used, off)) {
674                                 tdb_logerr(tdb, TDB_ERR_OOM,
675                                            TDB_LOG_ERROR,
676                                            "tdb_check: tracking %zu'th"
677                                            " used record.", *num_used);
678                                 return false;
679                         }
680
681                         klen = rec_key_length(&rec.u);
682                         dlen = rec_data_length(&rec.u);
683                         extra = rec_extra_padding(&rec.u);
684
685                         len = sizeof(rec.u) + klen + dlen + extra;
686                         if (off + len > tdb->map_size) {
687                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
688                                            TDB_LOG_ERROR,
689                                            "tdb_check: used overlength %llu"
690                                            " at offset %llu",
691                                            (long long)len, (long long)off);
692                                 return false;
693                         }
694
695                         if (len < sizeof(rec.f)) {
696                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
697                                            TDB_LOG_ERROR,
698                                            "tdb_check: too short record %llu"
699                                            " at %llu",
700                                            (long long)len, (long long)off);
701                                 return false;
702                         }
703                 } else {
704                         tdb_logerr(tdb, TDB_ERR_CORRUPT,
705                                    TDB_LOG_ERROR,
706                                    "tdb_check: Bad magic 0x%llx at offset %zu",
707                                    (long long)rec_magic(&rec.u), (size_t)off);
708                         return false;
709                 }
710         }
711
712         /* We must have found recovery area if there was one. */
713         if (recovery != 0 && !found_recovery) {
714                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
715                            "tdb_check: expected a recovery area at %zu",
716                            (size_t)recovery);
717                 return false;
718         }
719
720         return true;
721 }
722
723 int tdb_check(struct tdb_context *tdb,
724               int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
725               void *private_data)
726 {
727         tdb_off_t *fr = NULL, *used = NULL, ft, recovery;
728         size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
729         enum TDB_ERROR ecode;
730
731         ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
732         if (ecode != TDB_SUCCESS) {
733                 tdb->ecode = ecode;
734                 return -1;
735         }
736
737         ecode = tdb_lock_expand(tdb, F_RDLCK);
738         if (ecode != TDB_SUCCESS) {
739                 tdb->ecode = ecode;
740                 tdb_allrecord_unlock(tdb, F_RDLCK);
741                 return -1;
742         }
743
744         if (!check_header(tdb, &recovery))
745                 goto fail;
746
747         /* First we do a linear scan, checking all records. */
748         if (!check_linear(tdb, &used, &num_used, &fr, &num_free, recovery))
749                 goto fail;
750
751         for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
752                 if (TDB_OFF_IS_ERR(ft)) {
753                         tdb->ecode = ft;
754                         goto fail;
755                 }
756                 if (!check_free_table(tdb, ft, num_ftables, fr, num_free,
757                                       &num_found))
758                         goto fail;
759                 num_ftables++;
760         }
761
762         /* FIXME: Check key uniqueness? */
763         if (!check_hash(tdb, used, num_used, num_ftables, check, private_data))
764                 goto fail;
765
766         if (num_found != num_free) {
767                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
768                            "tdb_check: Not all entries are in free table");
769                 return -1;
770         }
771
772         tdb_allrecord_unlock(tdb, F_RDLCK);
773         tdb_unlock_expand(tdb, F_RDLCK);
774         free(fr);
775         free(used);
776         return 0;
777
778 fail:
779         free(fr);
780         free(used);
781         tdb_allrecord_unlock(tdb, F_RDLCK);
782         tdb_unlock_expand(tdb, F_RDLCK);
783         return -1;
784 }