]> git.ozlabs.org Git - ccan/blob - ccan/tdb2/check.c
tdb2: rework check.c internal functions to return enum TDB_ERROR.
[ccan] / ccan / tdb2 / check.c
1  /*
2    Trivial Database 2: free list/block handling
3    Copyright (C) Rusty Russell 2010
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
21
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
24 {
25         tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
26         if (!new)
27                 return false;
28         new[(*num)++] = off;
29         *arr = new;
30         return true;
31 }
32
33 static enum TDB_ERROR check_header(struct tdb_context *tdb, tdb_off_t *recovery)
34 {
35         uint64_t hash_test;
36         struct tdb_header hdr;
37         enum TDB_ERROR ecode;
38
39         ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr));
40         if (ecode != TDB_SUCCESS) {
41                 return ecode;
42         }
43         /* magic food should not be converted, so convert back. */
44         tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
45
46         hash_test = TDB_HASH_MAGIC;
47         hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
48         if (hdr.hash_test != hash_test) {
49                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
50                                   "check: hash test %llu should be %llu",
51                                   (long long)hdr.hash_test,
52                                   (long long)hash_test);
53         }
54
55         if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
56                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
57                                   "check: bad magic '%.*s'",
58                                   (unsigned)sizeof(hdr.magic_food),
59                                   hdr.magic_food);
60         }
61
62         *recovery = hdr.recovery;
63         if (*recovery) {
64                 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
65                         return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
66                                           "tdb_check:"
67                                           " invalid recovery offset %zu",
68                                           (size_t)*recovery);
69                 }
70         }
71
72         /* Don't check reserved: they *can* be used later. */
73         return TDB_SUCCESS;
74 }
75
76 static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
77                                       tdb_off_t off, unsigned int group_bits,
78                                       uint64_t hprefix,
79                                       unsigned hprefix_bits,
80                                       tdb_off_t used[],
81                                       size_t num_used,
82                                       size_t *num_found,
83                                       int (*check)(TDB_DATA, TDB_DATA, void *),
84                                       void *private_data);
85
86 static enum TDB_ERROR check_hash_chain(struct tdb_context *tdb,
87                                        tdb_off_t off,
88                                        uint64_t hash,
89                                        tdb_off_t used[],
90                                        size_t num_used,
91                                        size_t *num_found,
92                                        int (*check)(TDB_DATA, TDB_DATA, void *),
93                                        void *private_data)
94 {
95         struct tdb_used_record rec;
96         enum TDB_ERROR ecode;
97
98         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
99         if (ecode != TDB_SUCCESS) {
100                 return ecode;
101         }
102
103         if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
104                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
105                                   "tdb_check: Bad hash chain magic %llu",
106                                   (long long)rec_magic(&rec));
107         }
108
109         if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
110                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
111                                   "tdb_check:"
112                                   " Bad hash chain length %llu vs %zu",
113                                   (long long)rec_data_length(&rec),
114                                   sizeof(struct tdb_chain));
115         }
116         if (rec_key_length(&rec) != 0) {
117                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
118                                   "tdb_check: Bad hash chain key length %llu",
119                                   (long long)rec_key_length(&rec));
120         }
121         if (rec_hash(&rec) != 0) {
122                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
123                                   "tdb_check: Bad hash chain hash value %llu",
124                                   (long long)rec_hash(&rec));
125         }
126
127         off += sizeof(rec);
128         ecode = check_hash_tree(tdb, off, 0, hash, 64,
129                                 used, num_used, num_found, check, private_data);
130         if (ecode != TDB_SUCCESS) {
131                 return false;
132         }
133
134         off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
135         if (TDB_OFF_IS_ERR(off)) {
136                 return off;
137         }
138         if (off == 0)
139                 return TDB_SUCCESS;
140         (*num_found)++;
141         return check_hash_chain(tdb, off, hash, used, num_used, num_found,
142                                 check, private_data);
143 }
144
145 static enum TDB_ERROR check_hash_record(struct tdb_context *tdb,
146                                         tdb_off_t off,
147                                         uint64_t hprefix,
148                                         unsigned hprefix_bits,
149                                         tdb_off_t used[],
150                                         size_t num_used,
151                                         size_t *num_found,
152                                         int (*check)(TDB_DATA, TDB_DATA, void*),
153                                         void *private_data)
154 {
155         struct tdb_used_record rec;
156         enum TDB_ERROR ecode;
157
158         if (hprefix_bits >= 64)
159                 return check_hash_chain(tdb, off, hprefix, used, num_used,
160                                         num_found, check, private_data);
161
162         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
163         if (ecode != TDB_SUCCESS) {
164                 return ecode;
165         }
166
167         if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
168                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
169                                   "tdb_check: Bad hash table magic %llu",
170                                   (long long)rec_magic(&rec));
171         }
172         if (rec_data_length(&rec)
173             != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
174                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
175                                   "tdb_check:"
176                                   " Bad hash table length %llu vs %llu",
177                                   (long long)rec_data_length(&rec),
178                                   (long long)sizeof(tdb_off_t)
179                                   << TDB_SUBLEVEL_HASH_BITS);
180         }
181         if (rec_key_length(&rec) != 0) {
182                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
183                                   "tdb_check: Bad hash table key length %llu",
184                                   (long long)rec_key_length(&rec));
185         }
186         if (rec_hash(&rec) != 0) {
187                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
188                                   "tdb_check: Bad hash table hash value %llu",
189                                   (long long)rec_hash(&rec));
190         }
191
192         off += sizeof(rec);
193         return check_hash_tree(tdb, off,
194                                TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
195                                hprefix, hprefix_bits,
196                                used, num_used, num_found, check, private_data);
197 }
198
199 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
200 {
201         /* Can overflow an int. */
202         return *a > *b ? 1
203                 : *a < *b ? -1
204                 : 0;
205 }
206
207 static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
208 {
209         *used += num;
210
211         return (h >> (64 - *used)) & ((1U << num) - 1);
212 }
213
214 static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
215                                       tdb_off_t off, unsigned int group_bits,
216                                       uint64_t hprefix,
217                                       unsigned hprefix_bits,
218                                       tdb_off_t used[],
219                                       size_t num_used,
220                                       size_t *num_found,
221                                       int (*check)(TDB_DATA, TDB_DATA, void *),
222                                       void *private_data)
223 {
224         unsigned int g, b;
225         const tdb_off_t *hash;
226         struct tdb_used_record rec;
227         enum TDB_ERROR ecode;
228
229         hash = tdb_access_read(tdb, off,
230                                sizeof(tdb_off_t)
231                                << (group_bits + TDB_HASH_GROUP_BITS),
232                                true);
233         if (TDB_PTR_IS_ERR(hash)) {
234                 return TDB_PTR_ERR(hash);
235         }
236
237         for (g = 0; g < (1 << group_bits); g++) {
238                 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
239                 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
240                         unsigned int bucket, i, used_bits;
241                         uint64_t h;
242                         tdb_off_t *p;
243                         if (group[b] == 0)
244                                 continue;
245
246                         off = group[b] & TDB_OFF_MASK;
247                         p = asearch(&off, used, num_used, off_cmp);
248                         if (!p) {
249                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
250                                                    TDB_LOG_ERROR,
251                                                    "tdb_check: Invalid offset"
252                                                    " %llu in hash",
253                                                    (long long)off);
254                                 goto fail;
255                         }
256                         /* Mark it invalid. */
257                         *p ^= 1;
258                         (*num_found)++;
259
260                         if (hprefix_bits == 64) {
261                                 /* Chained entries are unordered. */
262                                 if (is_subhash(group[b])) {
263                                         ecode = TDB_ERR_CORRUPT;
264                                         tdb_logerr(tdb, ecode,
265                                                    TDB_LOG_ERROR,
266                                                    "tdb_check: Invalid chain"
267                                                    " entry subhash");
268                                         goto fail;
269                                 }
270                                 h = hash_record(tdb, off);
271                                 if (h != hprefix) {
272                                         ecode = TDB_ERR_CORRUPT;
273                                         tdb_logerr(tdb, ecode,
274                                                    TDB_LOG_ERROR,
275                                                    "check: bad hash chain"
276                                                    " placement"
277                                                    " 0x%llx vs 0x%llx",
278                                                    (long long)h,
279                                                    (long long)hprefix);
280                                         goto fail;
281                                 }
282                                 ecode = tdb_read_convert(tdb, off, &rec,
283                                                          sizeof(rec));
284                                 if (ecode != TDB_SUCCESS) {
285                                         goto fail;
286                                 }
287                                 goto check;
288                         }
289
290                         if (is_subhash(group[b])) {
291                                 uint64_t subprefix;
292                                 subprefix = (hprefix
293                                      << (group_bits + TDB_HASH_GROUP_BITS))
294                                         + g * (1 << TDB_HASH_GROUP_BITS) + b;
295
296                                 ecode = check_hash_record(tdb,
297                                                group[b] & TDB_OFF_MASK,
298                                                subprefix,
299                                                hprefix_bits
300                                                        + group_bits
301                                                        + TDB_HASH_GROUP_BITS,
302                                                used, num_used, num_found,
303                                                check, private_data);
304                                 if (ecode != TDB_SUCCESS) {
305                                         goto fail;
306                                 }
307                                 continue;
308                         }
309                         /* A normal entry */
310
311                         /* Does it belong here at all? */
312                         h = hash_record(tdb, off);
313                         used_bits = 0;
314                         if (get_bits(h, hprefix_bits, &used_bits) != hprefix
315                             && hprefix_bits) {
316                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
317                                                    TDB_LOG_ERROR,
318                                                    "check: bad hash placement"
319                                                    " 0x%llx vs 0x%llx",
320                                                    (long long)h,
321                                                    (long long)hprefix);
322                                 goto fail;
323                         }
324
325                         /* Does it belong in this group? */
326                         if (get_bits(h, group_bits, &used_bits) != g) {
327                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
328                                                    TDB_LOG_ERROR,
329                                                    "check: bad group %llu"
330                                                    " vs %u",
331                                                    (long long)h, g);
332                                 goto fail;
333                         }
334
335                         /* Are bucket bits correct? */
336                         bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
337                         if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
338                             != bucket) {
339                                 used_bits -= TDB_HASH_GROUP_BITS;
340                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
341                                                    TDB_LOG_ERROR,
342                                                    "check: bad bucket %u vs %u",
343                                                    (unsigned)get_bits(h,
344                                                         TDB_HASH_GROUP_BITS,
345                                                         &used_bits),
346                                                    bucket);
347                                 goto fail;
348                         }
349
350                         /* There must not be any zero entries between
351                          * the bucket it belongs in and this one! */
352                         for (i = bucket;
353                              i != b;
354                              i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
355                                 if (group[i] == 0) {
356                                         ecode = TDB_ERR_CORRUPT;
357                                         tdb_logerr(tdb, ecode,
358                                                    TDB_LOG_ERROR,
359                                                    "check: bad group placement"
360                                                    " %u vs %u",
361                                                    b, bucket);
362                                         goto fail;
363                                 }
364                         }
365
366                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
367                         if (ecode != TDB_SUCCESS) {
368                                 goto fail;
369                         }
370
371                         /* Bottom bits must match header. */
372                         if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
373                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
374                                                    TDB_LOG_ERROR,
375                                                    "tdb_check: Bad hash magic"
376                                                    " at offset %llu"
377                                                    " (0x%llx vs 0x%llx)",
378                                                    (long long)off,
379                                                    (long long)h,
380                                                    (long long)rec_hash(&rec));
381                                 goto fail;
382                         }
383
384                 check:
385                         if (check) {
386                                 TDB_DATA key, data;
387                                 key.dsize = rec_key_length(&rec);
388                                 data.dsize = rec_data_length(&rec);
389                                 key.dptr = (void *)tdb_access_read(tdb,
390                                                    off + sizeof(rec),
391                                                    key.dsize + data.dsize,
392                                                    false);
393                                 if (TDB_PTR_IS_ERR(key.dptr)) {
394                                         ecode = TDB_PTR_ERR(key.dptr);
395                                         goto fail;
396                                 }
397                                 data.dptr = key.dptr + key.dsize;
398                                 if (check(key, data, private_data) != 0) {
399                                         ecode = TDB_ERR_CORRUPT;
400                                         goto fail;
401                                 }
402                                 tdb_access_release(tdb, key.dptr);
403                         }
404                 }
405         }
406         tdb_access_release(tdb, hash);
407         return TDB_SUCCESS;
408
409 fail:
410         tdb_access_release(tdb, hash);
411         return ecode;
412 }
413
414 static enum TDB_ERROR check_hash(struct tdb_context *tdb,
415                                  tdb_off_t used[],
416                                  size_t num_used, size_t num_ftables,
417                                  int (*check)(TDB_DATA, TDB_DATA, void *),
418                                  void *private_data)
419 {
420         /* Free tables also show up as used. */
421         size_t num_found = num_ftables;
422         enum TDB_ERROR ecode;
423
424         ecode = check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
425                                 TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
426                                 0, 0, used, num_used, &num_found,
427                                 check, private_data);
428         if (ecode == TDB_SUCCESS) {
429                 if (num_found != num_used) {
430                         ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
431                                            "tdb_check: Not all entries"
432                                            " are in hash");
433                 }
434         }
435         return ecode;
436 }
437
438 static enum TDB_ERROR check_free(struct tdb_context *tdb,
439                                  tdb_off_t off,
440                                  const struct tdb_free_record *frec,
441                                  tdb_off_t prev, unsigned int ftable,
442                                  unsigned int bucket)
443 {
444         enum TDB_ERROR ecode;
445
446         if (frec_magic(frec) != TDB_FREE_MAGIC) {
447                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
448                                   "tdb_check: offset %llu bad magic 0x%llx",
449                                   (long long)off,
450                                   (long long)frec->magic_and_prev);
451         }
452         if (frec_ftable(frec) != ftable) {
453                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
454                                   "tdb_check: offset %llu bad freetable %u",
455                                   (long long)off, frec_ftable(frec));
456
457         }
458
459         ecode = tdb->methods->oob(tdb, off
460                                   + frec_len(frec)
461                                   + sizeof(struct tdb_used_record),
462                                   false);
463         if (ecode != TDB_SUCCESS) {
464                 return ecode;
465         }
466         if (size_to_bucket(frec_len(frec)) != bucket) {
467                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
468                                   "tdb_check: offset %llu in wrong bucket"
469                                   " (%u vs %u)",
470                                   (long long)off,
471                                   bucket, size_to_bucket(frec_len(frec)));
472         }
473         if (prev != frec_prev(frec)) {
474                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
475                                   "tdb_check: offset %llu bad prev"
476                                   " (%llu vs %llu)",
477                                   (long long)off,
478                                   (long long)prev, (long long)frec_len(frec));
479         }
480         return TDB_SUCCESS;
481 }
482
483 static enum TDB_ERROR check_free_table(struct tdb_context *tdb,
484                                        tdb_off_t ftable_off,
485                                        unsigned ftable_num,
486                                        tdb_off_t fr[],
487                                        size_t num_free,
488                                        size_t *num_found)
489 {
490         struct tdb_freetable ft;
491         tdb_off_t h;
492         unsigned int i;
493         enum TDB_ERROR ecode;
494
495         ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft));
496         if (ecode != TDB_SUCCESS) {
497                 return ecode;
498         }
499
500         if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
501             || rec_key_length(&ft.hdr) != 0
502             || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
503             || rec_hash(&ft.hdr) != 0) {
504                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
505                                   "tdb_check: Invalid header on free table");
506         }
507
508         for (i = 0; i < TDB_FREE_BUCKETS; i++) {
509                 tdb_off_t off, prev = 0, *p;
510                 struct tdb_free_record f;
511
512                 h = bucket_off(ftable_off, i);
513                 for (off = tdb_read_off(tdb, h); off; off = f.next) {
514                         if (TDB_OFF_IS_ERR(off)) {
515                                 return off;
516                         }
517                         ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
518                         if (ecode != TDB_SUCCESS) {
519                                 return ecode;
520                         }
521                         ecode = check_free(tdb, off, &f, prev, ftable_num, i);
522                         if (ecode != TDB_SUCCESS) {
523                                 return false;
524                         }
525
526                         /* FIXME: Check hash bits */
527                         p = asearch(&off, fr, num_free, off_cmp);
528                         if (!p) {
529                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
530                                                   TDB_LOG_ERROR,
531                                                   "tdb_check: Invalid offset"
532                                                   " %llu in free table",
533                                                   (long long)off);
534                         }
535                         /* Mark it invalid. */
536                         *p ^= 1;
537                         (*num_found)++;
538                         prev = off;
539                 }
540         }
541         return TDB_SUCCESS;
542 }
543
544 /* Slow, but should be very rare. */
545 tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off)
546 {
547         size_t len;
548         enum TDB_ERROR ecode;
549
550         for (len = 0; off + len < tdb->map_size; len++) {
551                 char c;
552                 ecode = tdb->methods->tread(tdb, off, &c, 1);
553                 if (ecode != TDB_SUCCESS) {
554                         return ecode;
555                 }
556                 if (c != 0 && c != 0x43)
557                         break;
558         }
559         return len;
560 }
561
562 static enum TDB_ERROR check_linear(struct tdb_context *tdb,
563                                    tdb_off_t **used, size_t *num_used,
564                                    tdb_off_t **fr, size_t *num_free,
565                                    tdb_off_t recovery)
566 {
567         tdb_off_t off;
568         tdb_len_t len;
569         enum TDB_ERROR ecode;
570         bool found_recovery = false;
571
572         for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
573                 union {
574                         struct tdb_used_record u;
575                         struct tdb_free_record f;
576                         struct tdb_recovery_record r;
577                 } rec;
578                 /* r is larger: only get that if we need to. */
579                 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f));
580                 if (ecode != TDB_SUCCESS) {
581                         return ecode;
582                 }
583
584                 /* If we crash after ftruncate, we can get zeroes or fill. */
585                 if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
586                     || rec.r.magic ==  0x4343434343434343ULL) {
587                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
588                         if (ecode != TDB_SUCCESS) {
589                                 return ecode;
590                         }
591                         if (recovery == off) {
592                                 found_recovery = true;
593                                 len = sizeof(rec.r) + rec.r.max_len;
594                         } else {
595                                 len = dead_space(tdb, off);
596                                 if (TDB_OFF_IS_ERR(len)) {
597                                         return len;
598                                 }
599                                 if (len < sizeof(rec.r)) {
600                                         return tdb_logerr(tdb, TDB_ERR_CORRUPT,
601                                                           TDB_LOG_ERROR,
602                                                           "tdb_check: invalid"
603                                                           " dead space at %zu",
604                                                           (size_t)off);
605                                 }
606
607                                 tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
608                                            "Dead space at %zu-%zu (of %zu)",
609                                            (size_t)off, (size_t)(off + len),
610                                            (size_t)tdb->map_size);
611                         }
612                 } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
613                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
614                         if (ecode != TDB_SUCCESS) {
615                                 return ecode;
616                         }
617                         if (recovery != off) {
618                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
619                                                   TDB_LOG_ERROR,
620                                                   "tdb_check: unexpected"
621                                                   " recovery record at offset"
622                                                   " %zu",
623                                                   (size_t)off);
624                         }
625                         if (rec.r.len > rec.r.max_len) {
626                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
627                                                   TDB_LOG_ERROR,
628                                                   "tdb_check: invalid recovery"
629                                                   " length %zu",
630                                                   (size_t)rec.r.len);
631                         }
632                         if (rec.r.eof > tdb->map_size) {
633                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
634                                                   TDB_LOG_ERROR,
635                                                   "tdb_check: invalid old EOF"
636                                                   " %zu", (size_t)rec.r.eof);
637                         }
638                         found_recovery = true;
639                         len = sizeof(rec.r) + rec.r.max_len;
640                 } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
641                         len = sizeof(rec.u) + frec_len(&rec.f);
642                         if (off + len > tdb->map_size) {
643                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
644                                                   TDB_LOG_ERROR,
645                                                   "tdb_check: free overlength"
646                                                   " %llu at offset %llu",
647                                                   (long long)len,
648                                                   (long long)off);
649                         }
650                         /* This record should be in free lists. */
651                         if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
652                             && !append(fr, num_free, off)) {
653                                 return tdb_logerr(tdb, TDB_ERR_OOM,
654                                                   TDB_LOG_ERROR,
655                                                   "tdb_check: tracking %zu'th"
656                                                   " free record.", *num_free);
657                         }
658                 } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
659                            || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
660                            || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
661                            || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
662                         uint64_t klen, dlen, extra;
663
664                         /* This record is used! */
665                         if (!append(used, num_used, off)) {
666                                 return tdb_logerr(tdb, TDB_ERR_OOM,
667                                                   TDB_LOG_ERROR,
668                                                   "tdb_check: tracking %zu'th"
669                                                   " used record.", *num_used);
670                         }
671
672                         klen = rec_key_length(&rec.u);
673                         dlen = rec_data_length(&rec.u);
674                         extra = rec_extra_padding(&rec.u);
675
676                         len = sizeof(rec.u) + klen + dlen + extra;
677                         if (off + len > tdb->map_size) {
678                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
679                                                   TDB_LOG_ERROR,
680                                                   "tdb_check: used overlength"
681                                                   " %llu at offset %llu",
682                                                   (long long)len,
683                                                   (long long)off);
684                         }
685
686                         if (len < sizeof(rec.f)) {
687                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
688                                                   TDB_LOG_ERROR,
689                                                   "tdb_check: too short record"
690                                                   " %llu at %llu",
691                                                   (long long)len,
692                                                   (long long)off);
693                         }
694                 } else {
695                         return tdb_logerr(tdb, TDB_ERR_CORRUPT,
696                                           TDB_LOG_ERROR,
697                                           "tdb_check: Bad magic 0x%llx"
698                                           " at offset %zu",
699                                           (long long)rec_magic(&rec.u),
700                                           (size_t)off);
701                 }
702         }
703
704         /* We must have found recovery area if there was one. */
705         if (recovery != 0 && !found_recovery) {
706                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
707                                   "tdb_check: expected a recovery area at %zu",
708                                   (size_t)recovery);
709         }
710
711         return TDB_SUCCESS;
712 }
713
714 int tdb_check(struct tdb_context *tdb,
715               int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
716               void *private_data)
717 {
718         tdb_off_t *fr = NULL, *used = NULL, ft, recovery;
719         size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
720         enum TDB_ERROR ecode;
721
722         ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
723         if (ecode != TDB_SUCCESS) {
724                 tdb->ecode = ecode;
725                 return -1;
726         }
727
728         ecode = tdb_lock_expand(tdb, F_RDLCK);
729         if (ecode != TDB_SUCCESS) {
730                 tdb->ecode = ecode;
731                 tdb_allrecord_unlock(tdb, F_RDLCK);
732                 return -1;
733         }
734
735         ecode = check_header(tdb, &recovery);
736         if (ecode != TDB_SUCCESS)
737                 goto out;
738
739         /* First we do a linear scan, checking all records. */
740         ecode = check_linear(tdb, &used, &num_used, &fr, &num_free, recovery);
741         if (ecode != TDB_SUCCESS)
742                 goto out;
743
744         for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
745                 if (TDB_OFF_IS_ERR(ft)) {
746                         tdb->ecode = ft;
747                         goto out;
748                 }
749                 ecode = check_free_table(tdb, ft, num_ftables, fr, num_free,
750                                          &num_found);
751                 if (ecode != TDB_SUCCESS)
752                         goto out;
753                 num_ftables++;
754         }
755
756         /* FIXME: Check key uniqueness? */
757         ecode = check_hash(tdb, used, num_used, num_ftables, check,
758                            private_data);
759         if (ecode != TDB_SUCCESS)
760                 goto out;
761
762         if (num_found != num_free) {
763                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
764                                    "tdb_check: Not all entries are in"
765                                    " free table");
766         }
767
768 out:
769         tdb_allrecord_unlock(tdb, F_RDLCK);
770         tdb_unlock_expand(tdb, F_RDLCK);
771         free(fr);
772         free(used);
773         if (ecode != TDB_SUCCESS) {
774                 tdb->ecode = ecode;
775                 return -1;
776         }
777         return 0;
778 }