tdb2: make tests work in parallel.
[ccan] / ccan / tdb2 / check.c
1  /*
2    Trivial Database 2: free list/block handling
3    Copyright (C) Rusty Russell 2010
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
21
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
24 {
25         tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
26         if (!new)
27                 return false;
28         new[(*num)++] = off;
29         *arr = new;
30         return true;
31 }
32
33 static enum TDB_ERROR check_header(struct tdb_context *tdb, tdb_off_t *recovery,
34                                    uint64_t *features)
35 {
36         uint64_t hash_test;
37         struct tdb_header hdr;
38         enum TDB_ERROR ecode;
39
40         ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr));
41         if (ecode != TDB_SUCCESS) {
42                 return ecode;
43         }
44         /* magic food should not be converted, so convert back. */
45         tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
46
47         hash_test = TDB_HASH_MAGIC;
48         hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
49         if (hdr.hash_test != hash_test) {
50                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
51                                   "check: hash test %llu should be %llu",
52                                   (long long)hdr.hash_test,
53                                   (long long)hash_test);
54         }
55
56         if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
57                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
58                                   "check: bad magic '%.*s'",
59                                   (unsigned)sizeof(hdr.magic_food),
60                                   hdr.magic_food);
61         }
62
63         /* Features which are used must be a subset of features offered. */
64         if (hdr.features_used & ~hdr.features_offered) {
65                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
66                                   "check: features used (0x%llx) which"
67                                   " are not offered (0x%llx)",
68                                   (long long)hdr.features_used,
69                                   (long long)hdr.features_offered);
70         }
71
72         *features = hdr.features_offered;
73         *recovery = hdr.recovery;
74         if (*recovery) {
75                 if (*recovery < sizeof(hdr)
76                     || *recovery > tdb->file->map_size) {
77                         return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
78                                           "tdb_check:"
79                                           " invalid recovery offset %zu",
80                                           (size_t)*recovery);
81                 }
82         }
83
84         /* Don't check reserved: they *can* be used later. */
85         return TDB_SUCCESS;
86 }
87
88 static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
89                                       tdb_off_t off, unsigned int group_bits,
90                                       uint64_t hprefix,
91                                       unsigned hprefix_bits,
92                                       tdb_off_t used[],
93                                       size_t num_used,
94                                       size_t *num_found,
95                                       enum TDB_ERROR (*check)(TDB_DATA,
96                                                               TDB_DATA, void *),
97                                       void *data);
98
99 static enum TDB_ERROR check_hash_chain(struct tdb_context *tdb,
100                                        tdb_off_t off,
101                                        uint64_t hash,
102                                        tdb_off_t used[],
103                                        size_t num_used,
104                                        size_t *num_found,
105                                        enum TDB_ERROR (*check)(TDB_DATA,
106                                                                TDB_DATA,
107                                                                void *),
108                                        void *data)
109 {
110         struct tdb_used_record rec;
111         enum TDB_ERROR ecode;
112
113         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
114         if (ecode != TDB_SUCCESS) {
115                 return ecode;
116         }
117
118         if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
119                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
120                                   "tdb_check: Bad hash chain magic %llu",
121                                   (long long)rec_magic(&rec));
122         }
123
124         if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
125                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
126                                   "tdb_check:"
127                                   " Bad hash chain length %llu vs %zu",
128                                   (long long)rec_data_length(&rec),
129                                   sizeof(struct tdb_chain));
130         }
131         if (rec_key_length(&rec) != 0) {
132                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
133                                   "tdb_check: Bad hash chain key length %llu",
134                                   (long long)rec_key_length(&rec));
135         }
136         if (rec_hash(&rec) != 0) {
137                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
138                                   "tdb_check: Bad hash chain hash value %llu",
139                                   (long long)rec_hash(&rec));
140         }
141
142         off += sizeof(rec);
143         ecode = check_hash_tree(tdb, off, 0, hash, 64,
144                                 used, num_used, num_found, check, data);
145         if (ecode != TDB_SUCCESS) {
146                 return ecode;
147         }
148
149         off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
150         if (TDB_OFF_IS_ERR(off)) {
151                 return off;
152         }
153         if (off == 0)
154                 return TDB_SUCCESS;
155         (*num_found)++;
156         return check_hash_chain(tdb, off, hash, used, num_used, num_found,
157                                 check, data);
158 }
159
160 static enum TDB_ERROR check_hash_record(struct tdb_context *tdb,
161                                         tdb_off_t off,
162                                         uint64_t hprefix,
163                                         unsigned hprefix_bits,
164                                         tdb_off_t used[],
165                                         size_t num_used,
166                                         size_t *num_found,
167                                         enum TDB_ERROR (*check)(TDB_DATA,
168                                                                 TDB_DATA,
169                                                                 void *),
170                                         void *data)
171 {
172         struct tdb_used_record rec;
173         enum TDB_ERROR ecode;
174
175         if (hprefix_bits >= 64)
176                 return check_hash_chain(tdb, off, hprefix, used, num_used,
177                                         num_found, check, data);
178
179         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
180         if (ecode != TDB_SUCCESS) {
181                 return ecode;
182         }
183
184         if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
185                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
186                                   "tdb_check: Bad hash table magic %llu",
187                                   (long long)rec_magic(&rec));
188         }
189         if (rec_data_length(&rec)
190             != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
191                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
192                                   "tdb_check:"
193                                   " Bad hash table length %llu vs %llu",
194                                   (long long)rec_data_length(&rec),
195                                   (long long)sizeof(tdb_off_t)
196                                   << TDB_SUBLEVEL_HASH_BITS);
197         }
198         if (rec_key_length(&rec) != 0) {
199                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
200                                   "tdb_check: Bad hash table key length %llu",
201                                   (long long)rec_key_length(&rec));
202         }
203         if (rec_hash(&rec) != 0) {
204                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
205                                   "tdb_check: Bad hash table hash value %llu",
206                                   (long long)rec_hash(&rec));
207         }
208
209         off += sizeof(rec);
210         return check_hash_tree(tdb, off,
211                                TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
212                                hprefix, hprefix_bits,
213                                used, num_used, num_found, check, data);
214 }
215
216 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
217 {
218         /* Can overflow an int. */
219         return *a > *b ? 1
220                 : *a < *b ? -1
221                 : 0;
222 }
223
/*
 * Extract the next num bits of h, counting from the most significant end.
 *
 * *used is the number of top bits already consumed; it is advanced by num
 * before the bits are extracted, so successive calls walk down the hash.
 * The caller must keep *used at or below 64.
 *
 * Returns the extracted bits right-aligned, or 0 when num is 0.
 */
static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
{
        uint64_t mask;

        *used += num;

        /* Nothing requested.  Returning here also avoids shifting h by 64
         * (undefined) when no bits have been consumed yet. */
        if (num == 0)
                return 0;

        /* Build the mask in 64 bits: 1U << num is undefined for num >= 32
         * and would silently truncate long prefixes. */
        mask = num >= 64 ? ~(uint64_t)0 : (((uint64_t)1 << num) - 1);

        return (h >> (64 - *used)) & mask;
}
230
231 static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
232                                       tdb_off_t off, unsigned int group_bits,
233                                       uint64_t hprefix,
234                                       unsigned hprefix_bits,
235                                       tdb_off_t used[],
236                                       size_t num_used,
237                                       size_t *num_found,
238                                       enum TDB_ERROR (*check)(TDB_DATA,
239                                                               TDB_DATA, void *),
240                                       void *data)
241 {
242         unsigned int g, b;
243         const tdb_off_t *hash;
244         struct tdb_used_record rec;
245         enum TDB_ERROR ecode;
246
247         hash = tdb_access_read(tdb, off,
248                                sizeof(tdb_off_t)
249                                << (group_bits + TDB_HASH_GROUP_BITS),
250                                true);
251         if (TDB_PTR_IS_ERR(hash)) {
252                 return TDB_PTR_ERR(hash);
253         }
254
255         for (g = 0; g < (1 << group_bits); g++) {
256                 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
257                 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
258                         unsigned int bucket, i, used_bits;
259                         uint64_t h;
260                         tdb_off_t *p;
261                         if (group[b] == 0)
262                                 continue;
263
264                         off = group[b] & TDB_OFF_MASK;
265                         p = asearch(&off, used, num_used, off_cmp);
266                         if (!p) {
267                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
268                                                    TDB_LOG_ERROR,
269                                                    "tdb_check: Invalid offset"
270                                                    " %llu in hash",
271                                                    (long long)off);
272                                 goto fail;
273                         }
274                         /* Mark it invalid. */
275                         *p ^= 1;
276                         (*num_found)++;
277
278                         if (hprefix_bits == 64) {
279                                 /* Chained entries are unordered. */
280                                 if (is_subhash(group[b])) {
281                                         ecode = TDB_ERR_CORRUPT;
282                                         tdb_logerr(tdb, ecode,
283                                                    TDB_LOG_ERROR,
284                                                    "tdb_check: Invalid chain"
285                                                    " entry subhash");
286                                         goto fail;
287                                 }
288                                 h = hash_record(tdb, off);
289                                 if (h != hprefix) {
290                                         ecode = TDB_ERR_CORRUPT;
291                                         tdb_logerr(tdb, ecode,
292                                                    TDB_LOG_ERROR,
293                                                    "check: bad hash chain"
294                                                    " placement"
295                                                    " 0x%llx vs 0x%llx",
296                                                    (long long)h,
297                                                    (long long)hprefix);
298                                         goto fail;
299                                 }
300                                 ecode = tdb_read_convert(tdb, off, &rec,
301                                                          sizeof(rec));
302                                 if (ecode != TDB_SUCCESS) {
303                                         goto fail;
304                                 }
305                                 goto check;
306                         }
307
308                         if (is_subhash(group[b])) {
309                                 uint64_t subprefix;
310                                 subprefix = (hprefix
311                                      << (group_bits + TDB_HASH_GROUP_BITS))
312                                         + g * (1 << TDB_HASH_GROUP_BITS) + b;
313
314                                 ecode = check_hash_record(tdb,
315                                                group[b] & TDB_OFF_MASK,
316                                                subprefix,
317                                                hprefix_bits
318                                                        + group_bits
319                                                        + TDB_HASH_GROUP_BITS,
320                                                used, num_used, num_found,
321                                                check, data);
322                                 if (ecode != TDB_SUCCESS) {
323                                         goto fail;
324                                 }
325                                 continue;
326                         }
327                         /* A normal entry */
328
329                         /* Does it belong here at all? */
330                         h = hash_record(tdb, off);
331                         used_bits = 0;
332                         if (get_bits(h, hprefix_bits, &used_bits) != hprefix
333                             && hprefix_bits) {
334                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
335                                                    TDB_LOG_ERROR,
336                                                    "check: bad hash placement"
337                                                    " 0x%llx vs 0x%llx",
338                                                    (long long)h,
339                                                    (long long)hprefix);
340                                 goto fail;
341                         }
342
343                         /* Does it belong in this group? */
344                         if (get_bits(h, group_bits, &used_bits) != g) {
345                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
346                                                    TDB_LOG_ERROR,
347                                                    "check: bad group %llu"
348                                                    " vs %u",
349                                                    (long long)h, g);
350                                 goto fail;
351                         }
352
353                         /* Are bucket bits correct? */
354                         bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
355                         if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
356                             != bucket) {
357                                 used_bits -= TDB_HASH_GROUP_BITS;
358                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
359                                                    TDB_LOG_ERROR,
360                                                    "check: bad bucket %u vs %u",
361                                                    (unsigned)get_bits(h,
362                                                         TDB_HASH_GROUP_BITS,
363                                                         &used_bits),
364                                                    bucket);
365                                 goto fail;
366                         }
367
368                         /* There must not be any zero entries between
369                          * the bucket it belongs in and this one! */
370                         for (i = bucket;
371                              i != b;
372                              i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
373                                 if (group[i] == 0) {
374                                         ecode = TDB_ERR_CORRUPT;
375                                         tdb_logerr(tdb, ecode,
376                                                    TDB_LOG_ERROR,
377                                                    "check: bad group placement"
378                                                    " %u vs %u",
379                                                    b, bucket);
380                                         goto fail;
381                                 }
382                         }
383
384                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
385                         if (ecode != TDB_SUCCESS) {
386                                 goto fail;
387                         }
388
389                         /* Bottom bits must match header. */
390                         if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
391                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
392                                                    TDB_LOG_ERROR,
393                                                    "tdb_check: Bad hash magic"
394                                                    " at offset %llu"
395                                                    " (0x%llx vs 0x%llx)",
396                                                    (long long)off,
397                                                    (long long)h,
398                                                    (long long)rec_hash(&rec));
399                                 goto fail;
400                         }
401
402                 check:
403                         if (check) {
404                                 TDB_DATA k, d;
405                                 const unsigned char *kptr;
406
407                                 kptr = tdb_access_read(tdb,
408                                                        off + sizeof(rec),
409                                                        rec_key_length(&rec)
410                                                        + rec_data_length(&rec),
411                                                        false);
412                                 if (TDB_PTR_IS_ERR(kptr)) {
413                                         ecode = TDB_PTR_ERR(kptr);
414                                         goto fail;
415                                 }
416
417                                 k = tdb_mkdata(kptr, rec_key_length(&rec));
418                                 d = tdb_mkdata(kptr + k.dsize,
419                                                rec_data_length(&rec));
420                                 ecode = check(k, d, data);
421                                 tdb_access_release(tdb, kptr);
422                                 if (ecode != TDB_SUCCESS) {
423                                         goto fail;
424                                 }
425                         }
426                 }
427         }
428         tdb_access_release(tdb, hash);
429         return TDB_SUCCESS;
430
431 fail:
432         tdb_access_release(tdb, hash);
433         return ecode;
434 }
435
436 static enum TDB_ERROR check_hash(struct tdb_context *tdb,
437                                  tdb_off_t used[],
438                                  size_t num_used, size_t num_ftables,
439                                  enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA, void *),
440                                  void *data)
441 {
442         /* Free tables also show up as used. */
443         size_t num_found = num_ftables;
444         enum TDB_ERROR ecode;
445
446         ecode = check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
447                                 TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
448                                 0, 0, used, num_used, &num_found,
449                                 check, data);
450         if (ecode == TDB_SUCCESS) {
451                 if (num_found != num_used) {
452                         ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
453                                            "tdb_check: Not all entries"
454                                            " are in hash");
455                 }
456         }
457         return ecode;
458 }
459
460 static enum TDB_ERROR check_free(struct tdb_context *tdb,
461                                  tdb_off_t off,
462                                  const struct tdb_free_record *frec,
463                                  tdb_off_t prev, unsigned int ftable,
464                                  unsigned int bucket)
465 {
466         enum TDB_ERROR ecode;
467
468         if (frec_magic(frec) != TDB_FREE_MAGIC) {
469                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
470                                   "tdb_check: offset %llu bad magic 0x%llx",
471                                   (long long)off,
472                                   (long long)frec->magic_and_prev);
473         }
474         if (frec_ftable(frec) != ftable) {
475                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
476                                   "tdb_check: offset %llu bad freetable %u",
477                                   (long long)off, frec_ftable(frec));
478
479         }
480
481         ecode = tdb->tdb2.io->oob(tdb, off
482                                   + frec_len(frec)
483                                   + sizeof(struct tdb_used_record),
484                                   false);
485         if (ecode != TDB_SUCCESS) {
486                 return ecode;
487         }
488         if (size_to_bucket(frec_len(frec)) != bucket) {
489                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
490                                   "tdb_check: offset %llu in wrong bucket"
491                                   " (%u vs %u)",
492                                   (long long)off,
493                                   bucket, size_to_bucket(frec_len(frec)));
494         }
495         if (prev && prev != frec_prev(frec)) {
496                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
497                                   "tdb_check: offset %llu bad prev"
498                                   " (%llu vs %llu)",
499                                   (long long)off,
500                                   (long long)prev, (long long)frec_len(frec));
501         }
502         return TDB_SUCCESS;
503 }
504
505 static enum TDB_ERROR check_free_table(struct tdb_context *tdb,
506                                        tdb_off_t ftable_off,
507                                        unsigned ftable_num,
508                                        tdb_off_t fr[],
509                                        size_t num_free,
510                                        size_t *num_found)
511 {
512         struct tdb_freetable ft;
513         tdb_off_t h;
514         unsigned int i;
515         enum TDB_ERROR ecode;
516
517         ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft));
518         if (ecode != TDB_SUCCESS) {
519                 return ecode;
520         }
521
522         if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
523             || rec_key_length(&ft.hdr) != 0
524             || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
525             || rec_hash(&ft.hdr) != 0) {
526                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
527                                   "tdb_check: Invalid header on free table");
528         }
529
530         for (i = 0; i < TDB_FREE_BUCKETS; i++) {
531                 tdb_off_t off, prev = 0, *p, first = 0;
532                 struct tdb_free_record f;
533
534                 h = bucket_off(ftable_off, i);
535                 for (off = tdb_read_off(tdb, h); off; off = f.next) {
536                         if (TDB_OFF_IS_ERR(off)) {
537                                 return off;
538                         }
539                         if (!first) {
540                                 off &= TDB_OFF_MASK;
541                                 first = off;
542                         }
543                         ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
544                         if (ecode != TDB_SUCCESS) {
545                                 return ecode;
546                         }
547                         ecode = check_free(tdb, off, &f, prev, ftable_num, i);
548                         if (ecode != TDB_SUCCESS) {
549                                 return ecode;
550                         }
551
552                         /* FIXME: Check hash bits */
553                         p = asearch(&off, fr, num_free, off_cmp);
554                         if (!p) {
555                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
556                                                   TDB_LOG_ERROR,
557                                                   "tdb_check: Invalid offset"
558                                                   " %llu in free table",
559                                                   (long long)off);
560                         }
561                         /* Mark it invalid. */
562                         *p ^= 1;
563                         (*num_found)++;
564                         prev = off;
565                 }
566
567                 if (first) {
568                         /* Now we can check first back pointer. */
569                         ecode = tdb_read_convert(tdb, first, &f, sizeof(f));
570                         if (ecode != TDB_SUCCESS) {
571                                 return ecode;
572                         }
573                         ecode = check_free(tdb, first, &f, prev, ftable_num, i);
574                         if (ecode != TDB_SUCCESS) {
575                                 return ecode;
576                         }
577                 }
578         }
579         return TDB_SUCCESS;
580 }
581
582 /* Slow, but should be very rare. */
583 tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off)
584 {
585         size_t len;
586         enum TDB_ERROR ecode;
587
588         for (len = 0; off + len < tdb->file->map_size; len++) {
589                 char c;
590                 ecode = tdb->tdb2.io->tread(tdb, off, &c, 1);
591                 if (ecode != TDB_SUCCESS) {
592                         return ecode;
593                 }
594                 if (c != 0 && c != 0x43)
595                         break;
596         }
597         return len;
598 }
599
600 static enum TDB_ERROR check_linear(struct tdb_context *tdb,
601                                    tdb_off_t **used, size_t *num_used,
602                                    tdb_off_t **fr, size_t *num_free,
603                                    uint64_t features, tdb_off_t recovery)
604 {
605         tdb_off_t off;
606         tdb_len_t len;
607         enum TDB_ERROR ecode;
608         bool found_recovery = false;
609
610         for (off = sizeof(struct tdb_header);
611              off < tdb->file->map_size;
612              off += len) {
613                 union {
614                         struct tdb_used_record u;
615                         struct tdb_free_record f;
616                         struct tdb_recovery_record r;
617                 } rec;
618                 /* r is larger: only get that if we need to. */
619                 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f));
620                 if (ecode != TDB_SUCCESS) {
621                         return ecode;
622                 }
623
624                 /* If we crash after ftruncate, we can get zeroes or fill. */
625                 if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
626                     || rec.r.magic ==  0x4343434343434343ULL) {
627                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
628                         if (ecode != TDB_SUCCESS) {
629                                 return ecode;
630                         }
631                         if (recovery == off) {
632                                 found_recovery = true;
633                                 len = sizeof(rec.r) + rec.r.max_len;
634                         } else {
635                                 len = dead_space(tdb, off);
636                                 if (TDB_OFF_IS_ERR(len)) {
637                                         return len;
638                                 }
639                                 if (len < sizeof(rec.r)) {
640                                         return tdb_logerr(tdb, TDB_ERR_CORRUPT,
641                                                           TDB_LOG_ERROR,
642                                                           "tdb_check: invalid"
643                                                           " dead space at %zu",
644                                                           (size_t)off);
645                                 }
646
647                                 tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
648                                            "Dead space at %zu-%zu (of %zu)",
649                                            (size_t)off, (size_t)(off + len),
650                                            (size_t)tdb->file->map_size);
651                         }
652                 } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
653                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
654                         if (ecode != TDB_SUCCESS) {
655                                 return ecode;
656                         }
657                         if (recovery != off) {
658                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
659                                                   TDB_LOG_ERROR,
660                                                   "tdb_check: unexpected"
661                                                   " recovery record at offset"
662                                                   " %zu",
663                                                   (size_t)off);
664                         }
665                         if (rec.r.len > rec.r.max_len) {
666                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
667                                                   TDB_LOG_ERROR,
668                                                   "tdb_check: invalid recovery"
669                                                   " length %zu",
670                                                   (size_t)rec.r.len);
671                         }
672                         if (rec.r.eof > tdb->file->map_size) {
673                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
674                                                   TDB_LOG_ERROR,
675                                                   "tdb_check: invalid old EOF"
676                                                   " %zu", (size_t)rec.r.eof);
677                         }
678                         found_recovery = true;
679                         len = sizeof(rec.r) + rec.r.max_len;
680                 } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
681                         len = sizeof(rec.u) + frec_len(&rec.f);
682                         if (off + len > tdb->file->map_size) {
683                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
684                                                   TDB_LOG_ERROR,
685                                                   "tdb_check: free overlength"
686                                                   " %llu at offset %llu",
687                                                   (long long)len,
688                                                   (long long)off);
689                         }
690                         /* This record should be in free lists. */
691                         if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
692                             && !append(fr, num_free, off)) {
693                                 return tdb_logerr(tdb, TDB_ERR_OOM,
694                                                   TDB_LOG_ERROR,
695                                                   "tdb_check: tracking %zu'th"
696                                                   " free record.", *num_free);
697                         }
698                 } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
699                            || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
700                            || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
701                            || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
702                         uint64_t klen, dlen, extra;
703
704                         /* This record is used! */
705                         if (!append(used, num_used, off)) {
706                                 return tdb_logerr(tdb, TDB_ERR_OOM,
707                                                   TDB_LOG_ERROR,
708                                                   "tdb_check: tracking %zu'th"
709                                                   " used record.", *num_used);
710                         }
711
712                         klen = rec_key_length(&rec.u);
713                         dlen = rec_data_length(&rec.u);
714                         extra = rec_extra_padding(&rec.u);
715
716                         len = sizeof(rec.u) + klen + dlen + extra;
717                         if (off + len > tdb->file->map_size) {
718                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
719                                                   TDB_LOG_ERROR,
720                                                   "tdb_check: used overlength"
721                                                   " %llu at offset %llu",
722                                                   (long long)len,
723                                                   (long long)off);
724                         }
725
726                         if (len < sizeof(rec.f)) {
727                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
728                                                   TDB_LOG_ERROR,
729                                                   "tdb_check: too short record"
730                                                   " %llu at %llu",
731                                                   (long long)len,
732                                                   (long long)off);
733                         }
734
735                         /* Check that records have correct 0 at end (but may
736                          * not in future). */
737                         if (extra && !features) {
738                                 const char *p;
739                                 char c;
740                                 p = tdb_access_read(tdb, off + sizeof(rec.u)
741                                                     + klen + dlen, 1, false);
742                                 if (TDB_PTR_IS_ERR(p))
743                                         return TDB_PTR_ERR(p);
744                                 c = *p;
745                                 tdb_access_release(tdb, p);
746
747                                 if (c != '\0') {
748                                         return tdb_logerr(tdb, TDB_ERR_CORRUPT,
749                                                           TDB_LOG_ERROR,
750                                                           "tdb_check:"
751                                                           " non-zero extra"
752                                                           " at %llu",
753                                                           (long long)off);
754                                 }
755                         }
756                 } else {
757                         return tdb_logerr(tdb, TDB_ERR_CORRUPT,
758                                           TDB_LOG_ERROR,
759                                           "tdb_check: Bad magic 0x%llx"
760                                           " at offset %zu",
761                                           (long long)rec_magic(&rec.u),
762                                           (size_t)off);
763                 }
764         }
765
766         /* We must have found recovery area if there was one. */
767         if (recovery != 0 && !found_recovery) {
768                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
769                                   "tdb_check: expected a recovery area at %zu",
770                                   (size_t)recovery);
771         }
772
773         return TDB_SUCCESS;
774 }
775
776 enum TDB_ERROR tdb_check_(struct tdb_context *tdb,
777                           enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA, void *),
778                           void *data)
779 {
780         tdb_off_t *fr = NULL, *used = NULL, ft, recovery;
781         size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
782         uint64_t features;
783         enum TDB_ERROR ecode;
784
785         if (tdb->flags & TDB_VERSION1) {
786                 if (tdb1_check(tdb, check, data) == -1)
787                         return tdb->last_error;
788                 return TDB_SUCCESS;
789         }
790
791         ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
792         if (ecode != TDB_SUCCESS) {
793                 return tdb->last_error = ecode;
794         }
795
796         ecode = tdb_lock_expand(tdb, F_RDLCK);
797         if (ecode != TDB_SUCCESS) {
798                 tdb_allrecord_unlock(tdb, F_RDLCK);
799                 return tdb->last_error = ecode;
800         }
801
802         ecode = check_header(tdb, &recovery, &features);
803         if (ecode != TDB_SUCCESS)
804                 goto out;
805
806         /* First we do a linear scan, checking all records. */
807         ecode = check_linear(tdb, &used, &num_used, &fr, &num_free, features,
808                              recovery);
809         if (ecode != TDB_SUCCESS)
810                 goto out;
811
812         for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
813                 if (TDB_OFF_IS_ERR(ft)) {
814                         ecode = ft;
815                         goto out;
816                 }
817                 ecode = check_free_table(tdb, ft, num_ftables, fr, num_free,
818                                          &num_found);
819                 if (ecode != TDB_SUCCESS)
820                         goto out;
821                 num_ftables++;
822         }
823
824         /* FIXME: Check key uniqueness? */
825         ecode = check_hash(tdb, used, num_used, num_ftables, check, data);
826         if (ecode != TDB_SUCCESS)
827                 goto out;
828
829         if (num_found != num_free) {
830                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
831                                    "tdb_check: Not all entries are in"
832                                    " free table");
833         }
834
835 out:
836         tdb_allrecord_unlock(tdb, F_RDLCK);
837         tdb_unlock_expand(tdb, F_RDLCK);
838         free(fr);
839         free(used);
840         return tdb->last_error = ecode;
841 }