tdb2: change API to return the error value.
[ccan] / ccan / tdb2 / check.c
1  /*
2    Trivial Database 2: free list/block handling
3    Copyright (C) Rusty Russell 2010
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
21
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
24 {
25         tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
26         if (!new)
27                 return false;
28         new[(*num)++] = off;
29         *arr = new;
30         return true;
31 }
32
33 static enum TDB_ERROR check_header(struct tdb_context *tdb, tdb_off_t *recovery)
34 {
35         uint64_t hash_test;
36         struct tdb_header hdr;
37         enum TDB_ERROR ecode;
38
39         ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr));
40         if (ecode != TDB_SUCCESS) {
41                 return ecode;
42         }
43         /* magic food should not be converted, so convert back. */
44         tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
45
46         hash_test = TDB_HASH_MAGIC;
47         hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
48         if (hdr.hash_test != hash_test) {
49                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
50                                   "check: hash test %llu should be %llu",
51                                   (long long)hdr.hash_test,
52                                   (long long)hash_test);
53         }
54
55         if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
56                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
57                                   "check: bad magic '%.*s'",
58                                   (unsigned)sizeof(hdr.magic_food),
59                                   hdr.magic_food);
60         }
61
62         *recovery = hdr.recovery;
63         if (*recovery) {
64                 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
65                         return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
66                                           "tdb_check:"
67                                           " invalid recovery offset %zu",
68                                           (size_t)*recovery);
69                 }
70         }
71
72         /* Don't check reserved: they *can* be used later. */
73         return TDB_SUCCESS;
74 }
75
/* Forward declaration: check_hash_chain() and check_hash_record() below
 * call check_hash_tree(), which in turn calls check_hash_record(), so
 * the prototype has to be visible before its definition. */
static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
                                      tdb_off_t off, unsigned int group_bits,
                                      uint64_t hprefix,
                                      unsigned hprefix_bits,
                                      tdb_off_t used[],
                                      size_t num_used,
                                      size_t *num_found,
                                      enum TDB_ERROR (*check)(TDB_DATA,
                                                              TDB_DATA, void *),
                                      void *private_data);
86
87 static enum TDB_ERROR check_hash_chain(struct tdb_context *tdb,
88                                        tdb_off_t off,
89                                        uint64_t hash,
90                                        tdb_off_t used[],
91                                        size_t num_used,
92                                        size_t *num_found,
93                                        enum TDB_ERROR (*check)(TDB_DATA,
94                                                                TDB_DATA,
95                                                                void *),
96                                        void *private_data)
97 {
98         struct tdb_used_record rec;
99         enum TDB_ERROR ecode;
100
101         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
102         if (ecode != TDB_SUCCESS) {
103                 return ecode;
104         }
105
106         if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
107                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
108                                   "tdb_check: Bad hash chain magic %llu",
109                                   (long long)rec_magic(&rec));
110         }
111
112         if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
113                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
114                                   "tdb_check:"
115                                   " Bad hash chain length %llu vs %zu",
116                                   (long long)rec_data_length(&rec),
117                                   sizeof(struct tdb_chain));
118         }
119         if (rec_key_length(&rec) != 0) {
120                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
121                                   "tdb_check: Bad hash chain key length %llu",
122                                   (long long)rec_key_length(&rec));
123         }
124         if (rec_hash(&rec) != 0) {
125                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
126                                   "tdb_check: Bad hash chain hash value %llu",
127                                   (long long)rec_hash(&rec));
128         }
129
130         off += sizeof(rec);
131         ecode = check_hash_tree(tdb, off, 0, hash, 64,
132                                 used, num_used, num_found, check, private_data);
133         if (ecode != TDB_SUCCESS) {
134                 return false;
135         }
136
137         off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
138         if (TDB_OFF_IS_ERR(off)) {
139                 return off;
140         }
141         if (off == 0)
142                 return TDB_SUCCESS;
143         (*num_found)++;
144         return check_hash_chain(tdb, off, hash, used, num_used, num_found,
145                                 check, private_data);
146 }
147
148 static enum TDB_ERROR check_hash_record(struct tdb_context *tdb,
149                                         tdb_off_t off,
150                                         uint64_t hprefix,
151                                         unsigned hprefix_bits,
152                                         tdb_off_t used[],
153                                         size_t num_used,
154                                         size_t *num_found,
155                                         enum TDB_ERROR (*check)(TDB_DATA,
156                                                                 TDB_DATA,
157                                                                 void *),
158                                         void *private_data)
159 {
160         struct tdb_used_record rec;
161         enum TDB_ERROR ecode;
162
163         if (hprefix_bits >= 64)
164                 return check_hash_chain(tdb, off, hprefix, used, num_used,
165                                         num_found, check, private_data);
166
167         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
168         if (ecode != TDB_SUCCESS) {
169                 return ecode;
170         }
171
172         if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
173                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
174                                   "tdb_check: Bad hash table magic %llu",
175                                   (long long)rec_magic(&rec));
176         }
177         if (rec_data_length(&rec)
178             != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
179                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
180                                   "tdb_check:"
181                                   " Bad hash table length %llu vs %llu",
182                                   (long long)rec_data_length(&rec),
183                                   (long long)sizeof(tdb_off_t)
184                                   << TDB_SUBLEVEL_HASH_BITS);
185         }
186         if (rec_key_length(&rec) != 0) {
187                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
188                                   "tdb_check: Bad hash table key length %llu",
189                                   (long long)rec_key_length(&rec));
190         }
191         if (rec_hash(&rec) != 0) {
192                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
193                                   "tdb_check: Bad hash table hash value %llu",
194                                   (long long)rec_hash(&rec));
195         }
196
197         off += sizeof(rec);
198         return check_hash_tree(tdb, off,
199                                TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
200                                hprefix, hprefix_bits,
201                                used, num_used, num_found, check, private_data);
202 }
203
204 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
205 {
206         /* Can overflow an int. */
207         return *a > *b ? 1
208                 : *a < *b ? -1
209                 : 0;
210 }
211
/* Extract the next "num" bits of the 64-bit hash h, counting from the
 * most significant end.
 *
 * *used is the number of bits already consumed; it is advanced by num,
 * and the bits returned are those ending at the new *used position.
 * Callers must keep *used <= 64.
 *
 * The mask is built in 64 bits so that fields wider than 31 bits work,
 * and a request for zero bits returns 0 without shifting (a shift by 64
 * would be undefined when nothing has been consumed yet). */
static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
{
        uint64_t mask;

        *used += num;
        if (num == 0)
                return 0;

        mask = num >= 64 ? ~(uint64_t)0 : (((uint64_t)1 << num) - 1);
        return (h >> (64 - *used)) & mask;
}
218
219 static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
220                                       tdb_off_t off, unsigned int group_bits,
221                                       uint64_t hprefix,
222                                       unsigned hprefix_bits,
223                                       tdb_off_t used[],
224                                       size_t num_used,
225                                       size_t *num_found,
226                                       enum TDB_ERROR (*check)(TDB_DATA,
227                                                               TDB_DATA, void *),
228                                       void *private_data)
229 {
230         unsigned int g, b;
231         const tdb_off_t *hash;
232         struct tdb_used_record rec;
233         enum TDB_ERROR ecode;
234
235         hash = tdb_access_read(tdb, off,
236                                sizeof(tdb_off_t)
237                                << (group_bits + TDB_HASH_GROUP_BITS),
238                                true);
239         if (TDB_PTR_IS_ERR(hash)) {
240                 return TDB_PTR_ERR(hash);
241         }
242
243         for (g = 0; g < (1 << group_bits); g++) {
244                 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
245                 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
246                         unsigned int bucket, i, used_bits;
247                         uint64_t h;
248                         tdb_off_t *p;
249                         if (group[b] == 0)
250                                 continue;
251
252                         off = group[b] & TDB_OFF_MASK;
253                         p = asearch(&off, used, num_used, off_cmp);
254                         if (!p) {
255                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
256                                                    TDB_LOG_ERROR,
257                                                    "tdb_check: Invalid offset"
258                                                    " %llu in hash",
259                                                    (long long)off);
260                                 goto fail;
261                         }
262                         /* Mark it invalid. */
263                         *p ^= 1;
264                         (*num_found)++;
265
266                         if (hprefix_bits == 64) {
267                                 /* Chained entries are unordered. */
268                                 if (is_subhash(group[b])) {
269                                         ecode = TDB_ERR_CORRUPT;
270                                         tdb_logerr(tdb, ecode,
271                                                    TDB_LOG_ERROR,
272                                                    "tdb_check: Invalid chain"
273                                                    " entry subhash");
274                                         goto fail;
275                                 }
276                                 h = hash_record(tdb, off);
277                                 if (h != hprefix) {
278                                         ecode = TDB_ERR_CORRUPT;
279                                         tdb_logerr(tdb, ecode,
280                                                    TDB_LOG_ERROR,
281                                                    "check: bad hash chain"
282                                                    " placement"
283                                                    " 0x%llx vs 0x%llx",
284                                                    (long long)h,
285                                                    (long long)hprefix);
286                                         goto fail;
287                                 }
288                                 ecode = tdb_read_convert(tdb, off, &rec,
289                                                          sizeof(rec));
290                                 if (ecode != TDB_SUCCESS) {
291                                         goto fail;
292                                 }
293                                 goto check;
294                         }
295
296                         if (is_subhash(group[b])) {
297                                 uint64_t subprefix;
298                                 subprefix = (hprefix
299                                      << (group_bits + TDB_HASH_GROUP_BITS))
300                                         + g * (1 << TDB_HASH_GROUP_BITS) + b;
301
302                                 ecode = check_hash_record(tdb,
303                                                group[b] & TDB_OFF_MASK,
304                                                subprefix,
305                                                hprefix_bits
306                                                        + group_bits
307                                                        + TDB_HASH_GROUP_BITS,
308                                                used, num_used, num_found,
309                                                check, private_data);
310                                 if (ecode != TDB_SUCCESS) {
311                                         goto fail;
312                                 }
313                                 continue;
314                         }
315                         /* A normal entry */
316
317                         /* Does it belong here at all? */
318                         h = hash_record(tdb, off);
319                         used_bits = 0;
320                         if (get_bits(h, hprefix_bits, &used_bits) != hprefix
321                             && hprefix_bits) {
322                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
323                                                    TDB_LOG_ERROR,
324                                                    "check: bad hash placement"
325                                                    " 0x%llx vs 0x%llx",
326                                                    (long long)h,
327                                                    (long long)hprefix);
328                                 goto fail;
329                         }
330
331                         /* Does it belong in this group? */
332                         if (get_bits(h, group_bits, &used_bits) != g) {
333                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
334                                                    TDB_LOG_ERROR,
335                                                    "check: bad group %llu"
336                                                    " vs %u",
337                                                    (long long)h, g);
338                                 goto fail;
339                         }
340
341                         /* Are bucket bits correct? */
342                         bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
343                         if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
344                             != bucket) {
345                                 used_bits -= TDB_HASH_GROUP_BITS;
346                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
347                                                    TDB_LOG_ERROR,
348                                                    "check: bad bucket %u vs %u",
349                                                    (unsigned)get_bits(h,
350                                                         TDB_HASH_GROUP_BITS,
351                                                         &used_bits),
352                                                    bucket);
353                                 goto fail;
354                         }
355
356                         /* There must not be any zero entries between
357                          * the bucket it belongs in and this one! */
358                         for (i = bucket;
359                              i != b;
360                              i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
361                                 if (group[i] == 0) {
362                                         ecode = TDB_ERR_CORRUPT;
363                                         tdb_logerr(tdb, ecode,
364                                                    TDB_LOG_ERROR,
365                                                    "check: bad group placement"
366                                                    " %u vs %u",
367                                                    b, bucket);
368                                         goto fail;
369                                 }
370                         }
371
372                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
373                         if (ecode != TDB_SUCCESS) {
374                                 goto fail;
375                         }
376
377                         /* Bottom bits must match header. */
378                         if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
379                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
380                                                    TDB_LOG_ERROR,
381                                                    "tdb_check: Bad hash magic"
382                                                    " at offset %llu"
383                                                    " (0x%llx vs 0x%llx)",
384                                                    (long long)off,
385                                                    (long long)h,
386                                                    (long long)rec_hash(&rec));
387                                 goto fail;
388                         }
389
390                 check:
391                         if (check) {
392                                 TDB_DATA key, data;
393                                 key.dsize = rec_key_length(&rec);
394                                 data.dsize = rec_data_length(&rec);
395                                 key.dptr = (void *)tdb_access_read(tdb,
396                                                    off + sizeof(rec),
397                                                    key.dsize + data.dsize,
398                                                    false);
399                                 if (TDB_PTR_IS_ERR(key.dptr)) {
400                                         ecode = TDB_PTR_ERR(key.dptr);
401                                         goto fail;
402                                 }
403                                 data.dptr = key.dptr + key.dsize;
404                                 ecode = check(key, data, private_data);
405                                 if (ecode != TDB_SUCCESS) {
406                                         goto fail;
407                                 }
408                                 tdb_access_release(tdb, key.dptr);
409                         }
410                 }
411         }
412         tdb_access_release(tdb, hash);
413         return TDB_SUCCESS;
414
415 fail:
416         tdb_access_release(tdb, hash);
417         return ecode;
418 }
419
420 static enum TDB_ERROR check_hash(struct tdb_context *tdb,
421                                  tdb_off_t used[],
422                                  size_t num_used, size_t num_ftables,
423                                  int (*check)(TDB_DATA, TDB_DATA, void *),
424                                  void *private_data)
425 {
426         /* Free tables also show up as used. */
427         size_t num_found = num_ftables;
428         enum TDB_ERROR ecode;
429
430         ecode = check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
431                                 TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
432                                 0, 0, used, num_used, &num_found,
433                                 check, private_data);
434         if (ecode == TDB_SUCCESS) {
435                 if (num_found != num_used) {
436                         ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
437                                            "tdb_check: Not all entries"
438                                            " are in hash");
439                 }
440         }
441         return ecode;
442 }
443
444 static enum TDB_ERROR check_free(struct tdb_context *tdb,
445                                  tdb_off_t off,
446                                  const struct tdb_free_record *frec,
447                                  tdb_off_t prev, unsigned int ftable,
448                                  unsigned int bucket)
449 {
450         enum TDB_ERROR ecode;
451
452         if (frec_magic(frec) != TDB_FREE_MAGIC) {
453                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
454                                   "tdb_check: offset %llu bad magic 0x%llx",
455                                   (long long)off,
456                                   (long long)frec->magic_and_prev);
457         }
458         if (frec_ftable(frec) != ftable) {
459                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
460                                   "tdb_check: offset %llu bad freetable %u",
461                                   (long long)off, frec_ftable(frec));
462
463         }
464
465         ecode = tdb->methods->oob(tdb, off
466                                   + frec_len(frec)
467                                   + sizeof(struct tdb_used_record),
468                                   false);
469         if (ecode != TDB_SUCCESS) {
470                 return ecode;
471         }
472         if (size_to_bucket(frec_len(frec)) != bucket) {
473                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
474                                   "tdb_check: offset %llu in wrong bucket"
475                                   " (%u vs %u)",
476                                   (long long)off,
477                                   bucket, size_to_bucket(frec_len(frec)));
478         }
479         if (prev != frec_prev(frec)) {
480                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
481                                   "tdb_check: offset %llu bad prev"
482                                   " (%llu vs %llu)",
483                                   (long long)off,
484                                   (long long)prev, (long long)frec_len(frec));
485         }
486         return TDB_SUCCESS;
487 }
488
489 static enum TDB_ERROR check_free_table(struct tdb_context *tdb,
490                                        tdb_off_t ftable_off,
491                                        unsigned ftable_num,
492                                        tdb_off_t fr[],
493                                        size_t num_free,
494                                        size_t *num_found)
495 {
496         struct tdb_freetable ft;
497         tdb_off_t h;
498         unsigned int i;
499         enum TDB_ERROR ecode;
500
501         ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft));
502         if (ecode != TDB_SUCCESS) {
503                 return ecode;
504         }
505
506         if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
507             || rec_key_length(&ft.hdr) != 0
508             || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
509             || rec_hash(&ft.hdr) != 0) {
510                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
511                                   "tdb_check: Invalid header on free table");
512         }
513
514         for (i = 0; i < TDB_FREE_BUCKETS; i++) {
515                 tdb_off_t off, prev = 0, *p;
516                 struct tdb_free_record f;
517
518                 h = bucket_off(ftable_off, i);
519                 for (off = tdb_read_off(tdb, h); off; off = f.next) {
520                         if (TDB_OFF_IS_ERR(off)) {
521                                 return off;
522                         }
523                         ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
524                         if (ecode != TDB_SUCCESS) {
525                                 return ecode;
526                         }
527                         ecode = check_free(tdb, off, &f, prev, ftable_num, i);
528                         if (ecode != TDB_SUCCESS) {
529                                 return false;
530                         }
531
532                         /* FIXME: Check hash bits */
533                         p = asearch(&off, fr, num_free, off_cmp);
534                         if (!p) {
535                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
536                                                   TDB_LOG_ERROR,
537                                                   "tdb_check: Invalid offset"
538                                                   " %llu in free table",
539                                                   (long long)off);
540                         }
541                         /* Mark it invalid. */
542                         *p ^= 1;
543                         (*num_found)++;
544                         prev = off;
545                 }
546         }
547         return TDB_SUCCESS;
548 }
549
550 /* Slow, but should be very rare. */
551 tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off)
552 {
553         size_t len;
554         enum TDB_ERROR ecode;
555
556         for (len = 0; off + len < tdb->map_size; len++) {
557                 char c;
558                 ecode = tdb->methods->tread(tdb, off, &c, 1);
559                 if (ecode != TDB_SUCCESS) {
560                         return ecode;
561                 }
562                 if (c != 0 && c != 0x43)
563                         break;
564         }
565         return len;
566 }
567
568 static enum TDB_ERROR check_linear(struct tdb_context *tdb,
569                                    tdb_off_t **used, size_t *num_used,
570                                    tdb_off_t **fr, size_t *num_free,
571                                    tdb_off_t recovery)
572 {
573         tdb_off_t off;
574         tdb_len_t len;
575         enum TDB_ERROR ecode;
576         bool found_recovery = false;
577
578         for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
579                 union {
580                         struct tdb_used_record u;
581                         struct tdb_free_record f;
582                         struct tdb_recovery_record r;
583                 } rec;
584                 /* r is larger: only get that if we need to. */
585                 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f));
586                 if (ecode != TDB_SUCCESS) {
587                         return ecode;
588                 }
589
590                 /* If we crash after ftruncate, we can get zeroes or fill. */
591                 if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
592                     || rec.r.magic ==  0x4343434343434343ULL) {
593                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
594                         if (ecode != TDB_SUCCESS) {
595                                 return ecode;
596                         }
597                         if (recovery == off) {
598                                 found_recovery = true;
599                                 len = sizeof(rec.r) + rec.r.max_len;
600                         } else {
601                                 len = dead_space(tdb, off);
602                                 if (TDB_OFF_IS_ERR(len)) {
603                                         return len;
604                                 }
605                                 if (len < sizeof(rec.r)) {
606                                         return tdb_logerr(tdb, TDB_ERR_CORRUPT,
607                                                           TDB_LOG_ERROR,
608                                                           "tdb_check: invalid"
609                                                           " dead space at %zu",
610                                                           (size_t)off);
611                                 }
612
613                                 tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
614                                            "Dead space at %zu-%zu (of %zu)",
615                                            (size_t)off, (size_t)(off + len),
616                                            (size_t)tdb->map_size);
617                         }
618                 } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
619                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
620                         if (ecode != TDB_SUCCESS) {
621                                 return ecode;
622                         }
623                         if (recovery != off) {
624                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
625                                                   TDB_LOG_ERROR,
626                                                   "tdb_check: unexpected"
627                                                   " recovery record at offset"
628                                                   " %zu",
629                                                   (size_t)off);
630                         }
631                         if (rec.r.len > rec.r.max_len) {
632                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
633                                                   TDB_LOG_ERROR,
634                                                   "tdb_check: invalid recovery"
635                                                   " length %zu",
636                                                   (size_t)rec.r.len);
637                         }
638                         if (rec.r.eof > tdb->map_size) {
639                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
640                                                   TDB_LOG_ERROR,
641                                                   "tdb_check: invalid old EOF"
642                                                   " %zu", (size_t)rec.r.eof);
643                         }
644                         found_recovery = true;
645                         len = sizeof(rec.r) + rec.r.max_len;
646                 } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
647                         len = sizeof(rec.u) + frec_len(&rec.f);
648                         if (off + len > tdb->map_size) {
649                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
650                                                   TDB_LOG_ERROR,
651                                                   "tdb_check: free overlength"
652                                                   " %llu at offset %llu",
653                                                   (long long)len,
654                                                   (long long)off);
655                         }
656                         /* This record should be in free lists. */
657                         if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
658                             && !append(fr, num_free, off)) {
659                                 return tdb_logerr(tdb, TDB_ERR_OOM,
660                                                   TDB_LOG_ERROR,
661                                                   "tdb_check: tracking %zu'th"
662                                                   " free record.", *num_free);
663                         }
664                 } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
665                            || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
666                            || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
667                            || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
668                         uint64_t klen, dlen, extra;
669
670                         /* This record is used! */
671                         if (!append(used, num_used, off)) {
672                                 return tdb_logerr(tdb, TDB_ERR_OOM,
673                                                   TDB_LOG_ERROR,
674                                                   "tdb_check: tracking %zu'th"
675                                                   " used record.", *num_used);
676                         }
677
678                         klen = rec_key_length(&rec.u);
679                         dlen = rec_data_length(&rec.u);
680                         extra = rec_extra_padding(&rec.u);
681
682                         len = sizeof(rec.u) + klen + dlen + extra;
683                         if (off + len > tdb->map_size) {
684                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
685                                                   TDB_LOG_ERROR,
686                                                   "tdb_check: used overlength"
687                                                   " %llu at offset %llu",
688                                                   (long long)len,
689                                                   (long long)off);
690                         }
691
692                         if (len < sizeof(rec.f)) {
693                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
694                                                   TDB_LOG_ERROR,
695                                                   "tdb_check: too short record"
696                                                   " %llu at %llu",
697                                                   (long long)len,
698                                                   (long long)off);
699                         }
700                 } else {
701                         return tdb_logerr(tdb, TDB_ERR_CORRUPT,
702                                           TDB_LOG_ERROR,
703                                           "tdb_check: Bad magic 0x%llx"
704                                           " at offset %zu",
705                                           (long long)rec_magic(&rec.u),
706                                           (size_t)off);
707                 }
708         }
709
710         /* We must have found recovery area if there was one. */
711         if (recovery != 0 && !found_recovery) {
712                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
713                                   "tdb_check: expected a recovery area at %zu",
714                                   (size_t)recovery);
715         }
716
717         return TDB_SUCCESS;
718 }
719
720 enum TDB_ERROR tdb_check(struct tdb_context *tdb,
721                          enum TDB_ERROR (*check)(TDB_DATA key, TDB_DATA data,
722                                                  void *private_data),
723                          void *private_data)
724 {
725         tdb_off_t *fr = NULL, *used = NULL, ft, recovery;
726         size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
727         enum TDB_ERROR ecode;
728
729         ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
730         if (ecode != TDB_SUCCESS) {
731                 return ecode;
732         }
733
734         ecode = tdb_lock_expand(tdb, F_RDLCK);
735         if (ecode != TDB_SUCCESS) {
736                 tdb_allrecord_unlock(tdb, F_RDLCK);
737                 return ecode;
738         }
739
740         ecode = check_header(tdb, &recovery);
741         if (ecode != TDB_SUCCESS)
742                 goto out;
743
744         /* First we do a linear scan, checking all records. */
745         ecode = check_linear(tdb, &used, &num_used, &fr, &num_free, recovery);
746         if (ecode != TDB_SUCCESS)
747                 goto out;
748
749         for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
750                 if (TDB_OFF_IS_ERR(ft)) {
751                         ecode = ft;
752                         goto out;
753                 }
754                 ecode = check_free_table(tdb, ft, num_ftables, fr, num_free,
755                                          &num_found);
756                 if (ecode != TDB_SUCCESS)
757                         goto out;
758                 num_ftables++;
759         }
760
761         /* FIXME: Check key uniqueness? */
762         ecode = check_hash(tdb, used, num_used, num_ftables, check,
763                            private_data);
764         if (ecode != TDB_SUCCESS)
765                 goto out;
766
767         if (num_found != num_free) {
768                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
769                                    "tdb_check: Not all entries are in"
770                                    " free table");
771         }
772
773 out:
774         tdb_allrecord_unlock(tdb, F_RDLCK);
775         tdb_unlock_expand(tdb, F_RDLCK);
776         free(fr);
777         free(used);
778         return ecode;
779 }