git.ozlabs.org Git - ccan/blob - ccan/tdb2/check.c
ttxml: removed cruft from tests
[ccan] / ccan / tdb2 / check.c
1  /*
2    Trivial Database 2: database consistency checking
3    Copyright (C) Rusty Russell 2010
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
21
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
24 {
25         tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
26         if (!new)
27                 return false;
28         new[(*num)++] = off;
29         *arr = new;
30         return true;
31 }
32
33 static enum TDB_ERROR check_header(struct tdb_context *tdb, tdb_off_t *recovery,
34                                    uint64_t *features, size_t *num_capabilities)
35 {
36         uint64_t hash_test;
37         struct tdb_header hdr;
38         enum TDB_ERROR ecode;
39         tdb_off_t off, next;
40
41         ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr));
42         if (ecode != TDB_SUCCESS) {
43                 return ecode;
44         }
45         /* magic food should not be converted, so convert back. */
46         tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
47
48         hash_test = TDB_HASH_MAGIC;
49         hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
50         if (hdr.hash_test != hash_test) {
51                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
52                                   "check: hash test %llu should be %llu",
53                                   (long long)hdr.hash_test,
54                                   (long long)hash_test);
55         }
56
57         if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
58                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
59                                   "check: bad magic '%.*s'",
60                                   (unsigned)sizeof(hdr.magic_food),
61                                   hdr.magic_food);
62         }
63
64         /* Features which are used must be a subset of features offered. */
65         if (hdr.features_used & ~hdr.features_offered) {
66                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
67                                   "check: features used (0x%llx) which"
68                                   " are not offered (0x%llx)",
69                                   (long long)hdr.features_used,
70                                   (long long)hdr.features_offered);
71         }
72
73         *features = hdr.features_offered;
74         *recovery = hdr.recovery;
75         if (*recovery) {
76                 if (*recovery < sizeof(hdr)
77                     || *recovery > tdb->file->map_size) {
78                         return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
79                                           "tdb_check:"
80                                           " invalid recovery offset %zu",
81                                           (size_t)*recovery);
82                 }
83         }
84
85         for (off = hdr.capabilities; off && ecode == TDB_SUCCESS; off = next) {
86                 const struct tdb_capability *cap;
87                 enum TDB_ERROR err;
88
89                 cap = tdb_access_read(tdb, off, sizeof(*cap), true);
90                 if (TDB_PTR_IS_ERR(cap)) {
91                         return TDB_PTR_ERR(cap);
92                 }
93
94                 /* All capabilities are unknown. */
95                 err = unknown_capability(tdb, "tdb_check", cap->type);
96                 next = cap->next;
97                 tdb_access_release(tdb, cap);
98                 if (err)
99                         return err;
100                 (*num_capabilities)++;
101         }
102
103         /* Don't check reserved: they *can* be used later. */
104         return TDB_SUCCESS;
105 }
106
107 static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
108                                       tdb_off_t off, unsigned int group_bits,
109                                       uint64_t hprefix,
110                                       unsigned hprefix_bits,
111                                       tdb_off_t used[],
112                                       size_t num_used,
113                                       size_t *num_found,
114                                       enum TDB_ERROR (*check)(TDB_DATA,
115                                                               TDB_DATA, void *),
116                                       void *data);
117
118 static enum TDB_ERROR check_hash_chain(struct tdb_context *tdb,
119                                        tdb_off_t off,
120                                        uint64_t hash,
121                                        tdb_off_t used[],
122                                        size_t num_used,
123                                        size_t *num_found,
124                                        enum TDB_ERROR (*check)(TDB_DATA,
125                                                                TDB_DATA,
126                                                                void *),
127                                        void *data)
128 {
129         struct tdb_used_record rec;
130         enum TDB_ERROR ecode;
131
132         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
133         if (ecode != TDB_SUCCESS) {
134                 return ecode;
135         }
136
137         if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
138                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
139                                   "tdb_check: Bad hash chain magic %llu",
140                                   (long long)rec_magic(&rec));
141         }
142
143         if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
144                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
145                                   "tdb_check:"
146                                   " Bad hash chain length %llu vs %zu",
147                                   (long long)rec_data_length(&rec),
148                                   sizeof(struct tdb_chain));
149         }
150         if (rec_key_length(&rec) != 0) {
151                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
152                                   "tdb_check: Bad hash chain key length %llu",
153                                   (long long)rec_key_length(&rec));
154         }
155         if (rec_hash(&rec) != 0) {
156                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
157                                   "tdb_check: Bad hash chain hash value %llu",
158                                   (long long)rec_hash(&rec));
159         }
160
161         off += sizeof(rec);
162         ecode = check_hash_tree(tdb, off, 0, hash, 64,
163                                 used, num_used, num_found, check, data);
164         if (ecode != TDB_SUCCESS) {
165                 return ecode;
166         }
167
168         off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
169         if (TDB_OFF_IS_ERR(off)) {
170                 return TDB_OFF_TO_ERR(off);
171         }
172         if (off == 0)
173                 return TDB_SUCCESS;
174         (*num_found)++;
175         return check_hash_chain(tdb, off, hash, used, num_used, num_found,
176                                 check, data);
177 }
178
179 static enum TDB_ERROR check_hash_record(struct tdb_context *tdb,
180                                         tdb_off_t off,
181                                         uint64_t hprefix,
182                                         unsigned hprefix_bits,
183                                         tdb_off_t used[],
184                                         size_t num_used,
185                                         size_t *num_found,
186                                         enum TDB_ERROR (*check)(TDB_DATA,
187                                                                 TDB_DATA,
188                                                                 void *),
189                                         void *data)
190 {
191         struct tdb_used_record rec;
192         enum TDB_ERROR ecode;
193
194         if (hprefix_bits >= 64)
195                 return check_hash_chain(tdb, off, hprefix, used, num_used,
196                                         num_found, check, data);
197
198         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
199         if (ecode != TDB_SUCCESS) {
200                 return ecode;
201         }
202
203         if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
204                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
205                                   "tdb_check: Bad hash table magic %llu",
206                                   (long long)rec_magic(&rec));
207         }
208         if (rec_data_length(&rec)
209             != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
210                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
211                                   "tdb_check:"
212                                   " Bad hash table length %llu vs %llu",
213                                   (long long)rec_data_length(&rec),
214                                   (long long)sizeof(tdb_off_t)
215                                   << TDB_SUBLEVEL_HASH_BITS);
216         }
217         if (rec_key_length(&rec) != 0) {
218                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
219                                   "tdb_check: Bad hash table key length %llu",
220                                   (long long)rec_key_length(&rec));
221         }
222         if (rec_hash(&rec) != 0) {
223                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
224                                   "tdb_check: Bad hash table hash value %llu",
225                                   (long long)rec_hash(&rec));
226         }
227
228         off += sizeof(rec);
229         return check_hash_tree(tdb, off,
230                                TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
231                                hprefix, hprefix_bits,
232                                used, num_used, num_found, check, data);
233 }
234
235 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
236 {
237         /* Can overflow an int. */
238         return *a > *b ? 1
239                 : *a < *b ? -1
240                 : 0;
241 }
242
/*
 * Extract the next @num bits of @h, working from the most significant end.
 *
 * @h:    the 64-bit hash value.
 * @num:  how many bits to take (0..64).
 * @used: in/out count of bits already consumed from the top of @h; it is
 *        advanced by @num.
 *
 * Returns the extracted bits right-aligned.  The caller must ensure that
 * *used + num does not exceed 64.
 */
static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
{
	uint64_t mask;

	*used += num;

	/* Nothing requested.  Returning early also avoids a shift by 64
	 * (undefined behaviour) when no bits have been consumed yet. */
	if (num == 0)
		return 0;

	/* Build the mask in 64 bits: 1U << num is undefined for num >= 32,
	 * and hash prefixes can be longer than that. */
	mask = num >= 64 ? ~(uint64_t)0 : ((uint64_t)1 << num) - 1;

	return (h >> (64 - *used)) & mask;
}
249
250 static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
251                                       tdb_off_t off, unsigned int group_bits,
252                                       uint64_t hprefix,
253                                       unsigned hprefix_bits,
254                                       tdb_off_t used[],
255                                       size_t num_used,
256                                       size_t *num_found,
257                                       enum TDB_ERROR (*check)(TDB_DATA,
258                                                               TDB_DATA, void *),
259                                       void *data)
260 {
261         unsigned int g, b;
262         const tdb_off_t *hash;
263         struct tdb_used_record rec;
264         enum TDB_ERROR ecode;
265
266         hash = tdb_access_read(tdb, off,
267                                sizeof(tdb_off_t)
268                                << (group_bits + TDB_HASH_GROUP_BITS),
269                                true);
270         if (TDB_PTR_IS_ERR(hash)) {
271                 return TDB_PTR_ERR(hash);
272         }
273
274         for (g = 0; g < (1 << group_bits); g++) {
275                 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
276                 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
277                         unsigned int bucket, i, used_bits;
278                         uint64_t h;
279                         tdb_off_t *p;
280                         if (group[b] == 0)
281                                 continue;
282
283                         off = group[b] & TDB_OFF_MASK;
284                         p = asearch(&off, used, num_used, off_cmp);
285                         if (!p) {
286                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
287                                                    TDB_LOG_ERROR,
288                                                    "tdb_check: Invalid offset"
289                                                    " %llu in hash",
290                                                    (long long)off);
291                                 goto fail;
292                         }
293                         /* Mark it invalid. */
294                         *p ^= 1;
295                         (*num_found)++;
296
297                         if (hprefix_bits == 64) {
298                                 /* Chained entries are unordered. */
299                                 if (is_subhash(group[b])) {
300                                         ecode = TDB_ERR_CORRUPT;
301                                         tdb_logerr(tdb, ecode,
302                                                    TDB_LOG_ERROR,
303                                                    "tdb_check: Invalid chain"
304                                                    " entry subhash");
305                                         goto fail;
306                                 }
307                                 h = hash_record(tdb, off);
308                                 if (h != hprefix) {
309                                         ecode = TDB_ERR_CORRUPT;
310                                         tdb_logerr(tdb, ecode,
311                                                    TDB_LOG_ERROR,
312                                                    "check: bad hash chain"
313                                                    " placement"
314                                                    " 0x%llx vs 0x%llx",
315                                                    (long long)h,
316                                                    (long long)hprefix);
317                                         goto fail;
318                                 }
319                                 ecode = tdb_read_convert(tdb, off, &rec,
320                                                          sizeof(rec));
321                                 if (ecode != TDB_SUCCESS) {
322                                         goto fail;
323                                 }
324                                 goto check;
325                         }
326
327                         if (is_subhash(group[b])) {
328                                 uint64_t subprefix;
329                                 subprefix = (hprefix
330                                      << (group_bits + TDB_HASH_GROUP_BITS))
331                                         + g * (1 << TDB_HASH_GROUP_BITS) + b;
332
333                                 ecode = check_hash_record(tdb,
334                                                group[b] & TDB_OFF_MASK,
335                                                subprefix,
336                                                hprefix_bits
337                                                        + group_bits
338                                                        + TDB_HASH_GROUP_BITS,
339                                                used, num_used, num_found,
340                                                check, data);
341                                 if (ecode != TDB_SUCCESS) {
342                                         goto fail;
343                                 }
344                                 continue;
345                         }
346                         /* A normal entry */
347
348                         /* Does it belong here at all? */
349                         h = hash_record(tdb, off);
350                         used_bits = 0;
351                         if (get_bits(h, hprefix_bits, &used_bits) != hprefix
352                             && hprefix_bits) {
353                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
354                                                    TDB_LOG_ERROR,
355                                                    "check: bad hash placement"
356                                                    " 0x%llx vs 0x%llx",
357                                                    (long long)h,
358                                                    (long long)hprefix);
359                                 goto fail;
360                         }
361
362                         /* Does it belong in this group? */
363                         if (get_bits(h, group_bits, &used_bits) != g) {
364                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
365                                                    TDB_LOG_ERROR,
366                                                    "check: bad group %llu"
367                                                    " vs %u",
368                                                    (long long)h, g);
369                                 goto fail;
370                         }
371
372                         /* Are bucket bits correct? */
373                         bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
374                         if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
375                             != bucket) {
376                                 used_bits -= TDB_HASH_GROUP_BITS;
377                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
378                                                    TDB_LOG_ERROR,
379                                                    "check: bad bucket %u vs %u",
380                                                    (unsigned)get_bits(h,
381                                                         TDB_HASH_GROUP_BITS,
382                                                         &used_bits),
383                                                    bucket);
384                                 goto fail;
385                         }
386
387                         /* There must not be any zero entries between
388                          * the bucket it belongs in and this one! */
389                         for (i = bucket;
390                              i != b;
391                              i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
392                                 if (group[i] == 0) {
393                                         ecode = TDB_ERR_CORRUPT;
394                                         tdb_logerr(tdb, ecode,
395                                                    TDB_LOG_ERROR,
396                                                    "check: bad group placement"
397                                                    " %u vs %u",
398                                                    b, bucket);
399                                         goto fail;
400                                 }
401                         }
402
403                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
404                         if (ecode != TDB_SUCCESS) {
405                                 goto fail;
406                         }
407
408                         /* Bottom bits must match header. */
409                         if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
410                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
411                                                    TDB_LOG_ERROR,
412                                                    "tdb_check: Bad hash magic"
413                                                    " at offset %llu"
414                                                    " (0x%llx vs 0x%llx)",
415                                                    (long long)off,
416                                                    (long long)h,
417                                                    (long long)rec_hash(&rec));
418                                 goto fail;
419                         }
420
421                 check:
422                         if (check) {
423                                 TDB_DATA k, d;
424                                 const unsigned char *kptr;
425
426                                 kptr = tdb_access_read(tdb,
427                                                        off + sizeof(rec),
428                                                        rec_key_length(&rec)
429                                                        + rec_data_length(&rec),
430                                                        false);
431                                 if (TDB_PTR_IS_ERR(kptr)) {
432                                         ecode = TDB_PTR_ERR(kptr);
433                                         goto fail;
434                                 }
435
436                                 k = tdb_mkdata(kptr, rec_key_length(&rec));
437                                 d = tdb_mkdata(kptr + k.dsize,
438                                                rec_data_length(&rec));
439                                 ecode = check(k, d, data);
440                                 tdb_access_release(tdb, kptr);
441                                 if (ecode != TDB_SUCCESS) {
442                                         goto fail;
443                                 }
444                         }
445                 }
446         }
447         tdb_access_release(tdb, hash);
448         return TDB_SUCCESS;
449
450 fail:
451         tdb_access_release(tdb, hash);
452         return ecode;
453 }
454
455 static enum TDB_ERROR check_hash(struct tdb_context *tdb,
456                                  tdb_off_t used[],
457                                  size_t num_used, size_t num_other_used,
458                                  enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA, void *),
459                                  void *data)
460 {
461         /* Free tables and capabilities also show up as used. */
462         size_t num_found = num_other_used;
463         enum TDB_ERROR ecode;
464
465         ecode = check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
466                                 TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
467                                 0, 0, used, num_used, &num_found,
468                                 check, data);
469         if (ecode == TDB_SUCCESS) {
470                 if (num_found != num_used) {
471                         ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
472                                            "tdb_check: Not all entries"
473                                            " are in hash");
474                 }
475         }
476         return ecode;
477 }
478
479 static enum TDB_ERROR check_free(struct tdb_context *tdb,
480                                  tdb_off_t off,
481                                  const struct tdb_free_record *frec,
482                                  tdb_off_t prev, unsigned int ftable,
483                                  unsigned int bucket)
484 {
485         enum TDB_ERROR ecode;
486
487         if (frec_magic(frec) != TDB_FREE_MAGIC) {
488                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
489                                   "tdb_check: offset %llu bad magic 0x%llx",
490                                   (long long)off,
491                                   (long long)frec->magic_and_prev);
492         }
493         if (frec_ftable(frec) != ftable) {
494                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
495                                   "tdb_check: offset %llu bad freetable %u",
496                                   (long long)off, frec_ftable(frec));
497
498         }
499
500         ecode = tdb->tdb2.io->oob(tdb, off,
501                                   frec_len(frec)
502                                   + sizeof(struct tdb_used_record),
503                                   false);
504         if (ecode != TDB_SUCCESS) {
505                 return ecode;
506         }
507         if (size_to_bucket(frec_len(frec)) != bucket) {
508                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
509                                   "tdb_check: offset %llu in wrong bucket"
510                                   " (%u vs %u)",
511                                   (long long)off,
512                                   bucket, size_to_bucket(frec_len(frec)));
513         }
514         if (prev && prev != frec_prev(frec)) {
515                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
516                                   "tdb_check: offset %llu bad prev"
517                                   " (%llu vs %llu)",
518                                   (long long)off,
519                                   (long long)prev, (long long)frec_len(frec));
520         }
521         return TDB_SUCCESS;
522 }
523
524 static enum TDB_ERROR check_free_table(struct tdb_context *tdb,
525                                        tdb_off_t ftable_off,
526                                        unsigned ftable_num,
527                                        tdb_off_t fr[],
528                                        size_t num_free,
529                                        size_t *num_found)
530 {
531         struct tdb_freetable ft;
532         tdb_off_t h;
533         unsigned int i;
534         enum TDB_ERROR ecode;
535
536         ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft));
537         if (ecode != TDB_SUCCESS) {
538                 return ecode;
539         }
540
541         if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
542             || rec_key_length(&ft.hdr) != 0
543             || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
544             || rec_hash(&ft.hdr) != 0) {
545                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
546                                   "tdb_check: Invalid header on free table");
547         }
548
549         for (i = 0; i < TDB_FREE_BUCKETS; i++) {
550                 tdb_off_t off, prev = 0, *p, first = 0;
551                 struct tdb_free_record f;
552
553                 h = bucket_off(ftable_off, i);
554                 for (off = tdb_read_off(tdb, h); off; off = f.next) {
555                         if (TDB_OFF_IS_ERR(off)) {
556                                 return TDB_OFF_TO_ERR(off);
557                         }
558                         if (!first) {
559                                 off &= TDB_OFF_MASK;
560                                 first = off;
561                         }
562                         ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
563                         if (ecode != TDB_SUCCESS) {
564                                 return ecode;
565                         }
566                         ecode = check_free(tdb, off, &f, prev, ftable_num, i);
567                         if (ecode != TDB_SUCCESS) {
568                                 return ecode;
569                         }
570
571                         /* FIXME: Check hash bits */
572                         p = asearch(&off, fr, num_free, off_cmp);
573                         if (!p) {
574                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
575                                                   TDB_LOG_ERROR,
576                                                   "tdb_check: Invalid offset"
577                                                   " %llu in free table",
578                                                   (long long)off);
579                         }
580                         /* Mark it invalid. */
581                         *p ^= 1;
582                         (*num_found)++;
583                         prev = off;
584                 }
585
586                 if (first) {
587                         /* Now we can check first back pointer. */
588                         ecode = tdb_read_convert(tdb, first, &f, sizeof(f));
589                         if (ecode != TDB_SUCCESS) {
590                                 return ecode;
591                         }
592                         ecode = check_free(tdb, first, &f, prev, ftable_num, i);
593                         if (ecode != TDB_SUCCESS) {
594                                 return ecode;
595                         }
596                 }
597         }
598         return TDB_SUCCESS;
599 }
600
601 /* Slow, but should be very rare. */
602 tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off)
603 {
604         size_t len;
605         enum TDB_ERROR ecode;
606
607         for (len = 0; off + len < tdb->file->map_size; len++) {
608                 char c;
609                 ecode = tdb->tdb2.io->tread(tdb, off, &c, 1);
610                 if (ecode != TDB_SUCCESS) {
611                         return TDB_ERR_TO_OFF(ecode);
612                 }
613                 if (c != 0 && c != 0x43)
614                         break;
615         }
616         return len;
617 }
618
619 static enum TDB_ERROR check_linear(struct tdb_context *tdb,
620                                    tdb_off_t **used, size_t *num_used,
621                                    tdb_off_t **fr, size_t *num_free,
622                                    uint64_t features, tdb_off_t recovery)
623 {
624         tdb_off_t off;
625         tdb_len_t len;
626         enum TDB_ERROR ecode;
627         bool found_recovery = false;
628
629         for (off = sizeof(struct tdb_header);
630              off < tdb->file->map_size;
631              off += len) {
632                 union {
633                         struct tdb_used_record u;
634                         struct tdb_free_record f;
635                         struct tdb_recovery_record r;
636                 } rec;
637                 /* r is larger: only get that if we need to. */
638                 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f));
639                 if (ecode != TDB_SUCCESS) {
640                         return ecode;
641                 }
642
643                 /* If we crash after ftruncate, we can get zeroes or fill. */
644                 if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
645                     || rec.r.magic ==  0x4343434343434343ULL) {
646                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
647                         if (ecode != TDB_SUCCESS) {
648                                 return ecode;
649                         }
650                         if (recovery == off) {
651                                 found_recovery = true;
652                                 len = sizeof(rec.r) + rec.r.max_len;
653                         } else {
654                                 len = dead_space(tdb, off);
655                                 if (TDB_OFF_IS_ERR(len)) {
656                                         return TDB_OFF_TO_ERR(len);
657                                 }
658                                 if (len < sizeof(rec.r)) {
659                                         return tdb_logerr(tdb, TDB_ERR_CORRUPT,
660                                                           TDB_LOG_ERROR,
661                                                           "tdb_check: invalid"
662                                                           " dead space at %zu",
663                                                           (size_t)off);
664                                 }
665
666                                 tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
667                                            "Dead space at %zu-%zu (of %zu)",
668                                            (size_t)off, (size_t)(off + len),
669                                            (size_t)tdb->file->map_size);
670                         }
671                 } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
672                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
673                         if (ecode != TDB_SUCCESS) {
674                                 return ecode;
675                         }
676                         if (recovery != off) {
677                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
678                                                   TDB_LOG_ERROR,
679                                                   "tdb_check: unexpected"
680                                                   " recovery record at offset"
681                                                   " %zu",
682                                                   (size_t)off);
683                         }
684                         if (rec.r.len > rec.r.max_len) {
685                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
686                                                   TDB_LOG_ERROR,
687                                                   "tdb_check: invalid recovery"
688                                                   " length %zu",
689                                                   (size_t)rec.r.len);
690                         }
691                         if (rec.r.eof > tdb->file->map_size) {
692                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
693                                                   TDB_LOG_ERROR,
694                                                   "tdb_check: invalid old EOF"
695                                                   " %zu", (size_t)rec.r.eof);
696                         }
697                         found_recovery = true;
698                         len = sizeof(rec.r) + rec.r.max_len;
699                 } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
700                         len = sizeof(rec.u) + frec_len(&rec.f);
701                         if (off + len > tdb->file->map_size) {
702                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
703                                                   TDB_LOG_ERROR,
704                                                   "tdb_check: free overlength"
705                                                   " %llu at offset %llu",
706                                                   (long long)len,
707                                                   (long long)off);
708                         }
709                         /* This record should be in free lists. */
710                         if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
711                             && !append(fr, num_free, off)) {
712                                 return tdb_logerr(tdb, TDB_ERR_OOM,
713                                                   TDB_LOG_ERROR,
714                                                   "tdb_check: tracking %zu'th"
715                                                   " free record.", *num_free);
716                         }
717                 } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
718                            || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
719                            || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
720                            || rec_magic(&rec.u) == TDB_FTABLE_MAGIC
721                            || rec_magic(&rec.u) == TDB_CAP_MAGIC) {
722                         uint64_t klen, dlen, extra;
723
724                         /* This record is used! */
725                         if (!append(used, num_used, off)) {
726                                 return tdb_logerr(tdb, TDB_ERR_OOM,
727                                                   TDB_LOG_ERROR,
728                                                   "tdb_check: tracking %zu'th"
729                                                   " used record.", *num_used);
730                         }
731
732                         klen = rec_key_length(&rec.u);
733                         dlen = rec_data_length(&rec.u);
734                         extra = rec_extra_padding(&rec.u);
735
736                         len = sizeof(rec.u) + klen + dlen + extra;
737                         if (off + len > tdb->file->map_size) {
738                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
739                                                   TDB_LOG_ERROR,
740                                                   "tdb_check: used overlength"
741                                                   " %llu at offset %llu",
742                                                   (long long)len,
743                                                   (long long)off);
744                         }
745
746                         if (len < sizeof(rec.f)) {
747                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
748                                                   TDB_LOG_ERROR,
749                                                   "tdb_check: too short record"
750                                                   " %llu at %llu",
751                                                   (long long)len,
752                                                   (long long)off);
753                         }
754
755                         /* Check that records have correct 0 at end (but may
756                          * not in future). */
757                         if (extra && !features
758                             && rec_magic(&rec.u) != TDB_CAP_MAGIC) {
759                                 const char *p;
760                                 char c;
761                                 p = tdb_access_read(tdb, off + sizeof(rec.u)
762                                                     + klen + dlen, 1, false);
763                                 if (TDB_PTR_IS_ERR(p))
764                                         return TDB_PTR_ERR(p);
765                                 c = *p;
766                                 tdb_access_release(tdb, p);
767
768                                 if (c != '\0') {
769                                         return tdb_logerr(tdb, TDB_ERR_CORRUPT,
770                                                           TDB_LOG_ERROR,
771                                                           "tdb_check:"
772                                                           " non-zero extra"
773                                                           " at %llu",
774                                                           (long long)off);
775                                 }
776                         }
777                 } else {
778                         return tdb_logerr(tdb, TDB_ERR_CORRUPT,
779                                           TDB_LOG_ERROR,
780                                           "tdb_check: Bad magic 0x%llx"
781                                           " at offset %zu",
782                                           (long long)rec_magic(&rec.u),
783                                           (size_t)off);
784                 }
785         }
786
787         /* We must have found recovery area if there was one. */
788         if (recovery != 0 && !found_recovery) {
789                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
790                                   "tdb_check: expected a recovery area at %zu",
791                                   (size_t)recovery);
792         }
793
794         return TDB_SUCCESS;
795 }
796
797 enum TDB_ERROR tdb_check_(struct tdb_context *tdb,
798                           enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA, void *),
799                           void *data)
800 {
801         tdb_off_t *fr = NULL, *used = NULL, ft, recovery;
802         size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0,
803                 num_capabilities = 0;
804         uint64_t features;
805         enum TDB_ERROR ecode;
806
807         if (tdb->flags & TDB_CANT_CHECK) {
808                 return tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
809                                   "tdb_check: database has unknown capability,"
810                                   " cannot check.");
811         }
812
813         if (tdb->flags & TDB_VERSION1) {
814                 if (tdb1_check(tdb, check, data) == -1)
815                         return tdb->last_error;
816                 return TDB_SUCCESS;
817         }
818
819         ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
820         if (ecode != TDB_SUCCESS) {
821                 return tdb->last_error = ecode;
822         }
823
824         ecode = tdb_lock_expand(tdb, F_RDLCK);
825         if (ecode != TDB_SUCCESS) {
826                 tdb_allrecord_unlock(tdb, F_RDLCK);
827                 return tdb->last_error = ecode;
828         }
829
830         ecode = check_header(tdb, &recovery, &features, &num_capabilities);
831         if (ecode != TDB_SUCCESS)
832                 goto out;
833
834         /* First we do a linear scan, checking all records. */
835         ecode = check_linear(tdb, &used, &num_used, &fr, &num_free, features,
836                              recovery);
837         if (ecode != TDB_SUCCESS)
838                 goto out;
839
840         for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
841                 if (TDB_OFF_IS_ERR(ft)) {
842                         ecode = TDB_OFF_TO_ERR(ft);
843                         goto out;
844                 }
845                 ecode = check_free_table(tdb, ft, num_ftables, fr, num_free,
846                                          &num_found);
847                 if (ecode != TDB_SUCCESS)
848                         goto out;
849                 num_ftables++;
850         }
851
852         /* FIXME: Check key uniqueness? */
853         ecode = check_hash(tdb, used, num_used, num_ftables + num_capabilities,
854                            check, data);
855         if (ecode != TDB_SUCCESS)
856                 goto out;
857
858         if (num_found != num_free) {
859                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
860                                    "tdb_check: Not all entries are in"
861                                    " free table");
862         }
863
864 out:
865         tdb_allrecord_unlock(tdb, F_RDLCK);
866         tdb_unlock_expand(tdb, F_RDLCK);
867         free(fr);
868         free(used);
869         return tdb->last_error = ecode;
870 }