]> git.ozlabs.org Git - ccan/blob - ccan/tdb2/check.c
b602d80d64777030aeb08fab853b1df9561561d2
[ccan] / ccan / tdb2 / check.c
1  /*
2    Trivial Database 2: free list/block handling
3    Copyright (C) Rusty Russell 2010
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
21
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
24 {
25         tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
26         if (!new)
27                 return false;
28         new[(*num)++] = off;
29         *arr = new;
30         return true;
31 }
32
33 static enum TDB_ERROR check_header(struct tdb_context *tdb, tdb_off_t *recovery,
34                                    uint64_t *features)
35 {
36         uint64_t hash_test;
37         struct tdb_header hdr;
38         enum TDB_ERROR ecode;
39
40         ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr));
41         if (ecode != TDB_SUCCESS) {
42                 return ecode;
43         }
44         /* magic food should not be converted, so convert back. */
45         tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
46
47         hash_test = TDB_HASH_MAGIC;
48         hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
49         if (hdr.hash_test != hash_test) {
50                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
51                                   "check: hash test %llu should be %llu",
52                                   (long long)hdr.hash_test,
53                                   (long long)hash_test);
54         }
55
56         if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
57                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
58                                   "check: bad magic '%.*s'",
59                                   (unsigned)sizeof(hdr.magic_food),
60                                   hdr.magic_food);
61         }
62
63         /* Features which are used must be a subset of features offered. */
64         if (hdr.features_used & ~hdr.features_offered) {
65                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
66                                   "check: features used (0x%llx) which"
67                                   " are not offered (0x%llx)",
68                                   (long long)hdr.features_used,
69                                   (long long)hdr.features_offered);
70         }
71
72         *features = hdr.features_offered;
73         *recovery = hdr.recovery;
74         if (*recovery) {
75                 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
76                         return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
77                                           "tdb_check:"
78                                           " invalid recovery offset %zu",
79                                           (size_t)*recovery);
80                 }
81         }
82
83         /* Don't check reserved: they *can* be used later. */
84         return TDB_SUCCESS;
85 }
86
87 static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
88                                       tdb_off_t off, unsigned int group_bits,
89                                       uint64_t hprefix,
90                                       unsigned hprefix_bits,
91                                       tdb_off_t used[],
92                                       size_t num_used,
93                                       size_t *num_found,
94                                       enum TDB_ERROR (*check)(TDB_DATA,
95                                                               TDB_DATA, void *),
96                                       void *private_data);
97
98 static enum TDB_ERROR check_hash_chain(struct tdb_context *tdb,
99                                        tdb_off_t off,
100                                        uint64_t hash,
101                                        tdb_off_t used[],
102                                        size_t num_used,
103                                        size_t *num_found,
104                                        enum TDB_ERROR (*check)(TDB_DATA,
105                                                                TDB_DATA,
106                                                                void *),
107                                        void *private_data)
108 {
109         struct tdb_used_record rec;
110         enum TDB_ERROR ecode;
111
112         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
113         if (ecode != TDB_SUCCESS) {
114                 return ecode;
115         }
116
117         if (rec_magic(&rec) != TDB_CHAIN_MAGIC) {
118                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
119                                   "tdb_check: Bad hash chain magic %llu",
120                                   (long long)rec_magic(&rec));
121         }
122
123         if (rec_data_length(&rec) != sizeof(struct tdb_chain)) {
124                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
125                                   "tdb_check:"
126                                   " Bad hash chain length %llu vs %zu",
127                                   (long long)rec_data_length(&rec),
128                                   sizeof(struct tdb_chain));
129         }
130         if (rec_key_length(&rec) != 0) {
131                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
132                                   "tdb_check: Bad hash chain key length %llu",
133                                   (long long)rec_key_length(&rec));
134         }
135         if (rec_hash(&rec) != 0) {
136                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
137                                   "tdb_check: Bad hash chain hash value %llu",
138                                   (long long)rec_hash(&rec));
139         }
140
141         off += sizeof(rec);
142         ecode = check_hash_tree(tdb, off, 0, hash, 64,
143                                 used, num_used, num_found, check, private_data);
144         if (ecode != TDB_SUCCESS) {
145                 return ecode;
146         }
147
148         off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
149         if (TDB_OFF_IS_ERR(off)) {
150                 return off;
151         }
152         if (off == 0)
153                 return TDB_SUCCESS;
154         (*num_found)++;
155         return check_hash_chain(tdb, off, hash, used, num_used, num_found,
156                                 check, private_data);
157 }
158
159 static enum TDB_ERROR check_hash_record(struct tdb_context *tdb,
160                                         tdb_off_t off,
161                                         uint64_t hprefix,
162                                         unsigned hprefix_bits,
163                                         tdb_off_t used[],
164                                         size_t num_used,
165                                         size_t *num_found,
166                                         enum TDB_ERROR (*check)(TDB_DATA,
167                                                                 TDB_DATA,
168                                                                 void *),
169                                         void *private_data)
170 {
171         struct tdb_used_record rec;
172         enum TDB_ERROR ecode;
173
174         if (hprefix_bits >= 64)
175                 return check_hash_chain(tdb, off, hprefix, used, num_used,
176                                         num_found, check, private_data);
177
178         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
179         if (ecode != TDB_SUCCESS) {
180                 return ecode;
181         }
182
183         if (rec_magic(&rec) != TDB_HTABLE_MAGIC) {
184                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
185                                   "tdb_check: Bad hash table magic %llu",
186                                   (long long)rec_magic(&rec));
187         }
188         if (rec_data_length(&rec)
189             != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
190                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
191                                   "tdb_check:"
192                                   " Bad hash table length %llu vs %llu",
193                                   (long long)rec_data_length(&rec),
194                                   (long long)sizeof(tdb_off_t)
195                                   << TDB_SUBLEVEL_HASH_BITS);
196         }
197         if (rec_key_length(&rec) != 0) {
198                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
199                                   "tdb_check: Bad hash table key length %llu",
200                                   (long long)rec_key_length(&rec));
201         }
202         if (rec_hash(&rec) != 0) {
203                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
204                                   "tdb_check: Bad hash table hash value %llu",
205                                   (long long)rec_hash(&rec));
206         }
207
208         off += sizeof(rec);
209         return check_hash_tree(tdb, off,
210                                TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
211                                hprefix, hprefix_bits,
212                                used, num_used, num_found, check, private_data);
213 }
214
215 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
216 {
217         /* Can overflow an int. */
218         return *a > *b ? 1
219                 : *a < *b ? -1
220                 : 0;
221 }
222
/*
 * Extract the next @num bits of the 64-bit hash @h, most significant first.
 *
 * @h:    the hash value.
 * @num:  how many bits to take (0..64).
 * @used: in/out count of bits already consumed from the top of @h; it is
 *        advanced by @num.
 *
 * Returns the extracted bits, right-aligned.  Taking zero bits yields 0.
 * The caller must not consume more than 64 bits in total.
 *
 * The mask is built in 64-bit arithmetic: a 32-bit `1U << num` is undefined
 * for num >= 32, and a shift by the full width (64) is undefined as well,
 * so both ends of the range are handled explicitly.
 */
static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
{
	uint64_t mask;

	*used += num;

	if (num == 0) {
		return 0;
	}

	mask = (num >= 64) ? UINT64_MAX : (((uint64_t)1 << num) - 1);
	return (h >> (64 - *used)) & mask;
}
229
230 static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
231                                       tdb_off_t off, unsigned int group_bits,
232                                       uint64_t hprefix,
233                                       unsigned hprefix_bits,
234                                       tdb_off_t used[],
235                                       size_t num_used,
236                                       size_t *num_found,
237                                       enum TDB_ERROR (*check)(TDB_DATA,
238                                                               TDB_DATA, void *),
239                                       void *private_data)
240 {
241         unsigned int g, b;
242         const tdb_off_t *hash;
243         struct tdb_used_record rec;
244         enum TDB_ERROR ecode;
245
246         hash = tdb_access_read(tdb, off,
247                                sizeof(tdb_off_t)
248                                << (group_bits + TDB_HASH_GROUP_BITS),
249                                true);
250         if (TDB_PTR_IS_ERR(hash)) {
251                 return TDB_PTR_ERR(hash);
252         }
253
254         for (g = 0; g < (1 << group_bits); g++) {
255                 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
256                 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
257                         unsigned int bucket, i, used_bits;
258                         uint64_t h;
259                         tdb_off_t *p;
260                         if (group[b] == 0)
261                                 continue;
262
263                         off = group[b] & TDB_OFF_MASK;
264                         p = asearch(&off, used, num_used, off_cmp);
265                         if (!p) {
266                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
267                                                    TDB_LOG_ERROR,
268                                                    "tdb_check: Invalid offset"
269                                                    " %llu in hash",
270                                                    (long long)off);
271                                 goto fail;
272                         }
273                         /* Mark it invalid. */
274                         *p ^= 1;
275                         (*num_found)++;
276
277                         if (hprefix_bits == 64) {
278                                 /* Chained entries are unordered. */
279                                 if (is_subhash(group[b])) {
280                                         ecode = TDB_ERR_CORRUPT;
281                                         tdb_logerr(tdb, ecode,
282                                                    TDB_LOG_ERROR,
283                                                    "tdb_check: Invalid chain"
284                                                    " entry subhash");
285                                         goto fail;
286                                 }
287                                 h = hash_record(tdb, off);
288                                 if (h != hprefix) {
289                                         ecode = TDB_ERR_CORRUPT;
290                                         tdb_logerr(tdb, ecode,
291                                                    TDB_LOG_ERROR,
292                                                    "check: bad hash chain"
293                                                    " placement"
294                                                    " 0x%llx vs 0x%llx",
295                                                    (long long)h,
296                                                    (long long)hprefix);
297                                         goto fail;
298                                 }
299                                 ecode = tdb_read_convert(tdb, off, &rec,
300                                                          sizeof(rec));
301                                 if (ecode != TDB_SUCCESS) {
302                                         goto fail;
303                                 }
304                                 goto check;
305                         }
306
307                         if (is_subhash(group[b])) {
308                                 uint64_t subprefix;
309                                 subprefix = (hprefix
310                                      << (group_bits + TDB_HASH_GROUP_BITS))
311                                         + g * (1 << TDB_HASH_GROUP_BITS) + b;
312
313                                 ecode = check_hash_record(tdb,
314                                                group[b] & TDB_OFF_MASK,
315                                                subprefix,
316                                                hprefix_bits
317                                                        + group_bits
318                                                        + TDB_HASH_GROUP_BITS,
319                                                used, num_used, num_found,
320                                                check, private_data);
321                                 if (ecode != TDB_SUCCESS) {
322                                         goto fail;
323                                 }
324                                 continue;
325                         }
326                         /* A normal entry */
327
328                         /* Does it belong here at all? */
329                         h = hash_record(tdb, off);
330                         used_bits = 0;
331                         if (get_bits(h, hprefix_bits, &used_bits) != hprefix
332                             && hprefix_bits) {
333                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
334                                                    TDB_LOG_ERROR,
335                                                    "check: bad hash placement"
336                                                    " 0x%llx vs 0x%llx",
337                                                    (long long)h,
338                                                    (long long)hprefix);
339                                 goto fail;
340                         }
341
342                         /* Does it belong in this group? */
343                         if (get_bits(h, group_bits, &used_bits) != g) {
344                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
345                                                    TDB_LOG_ERROR,
346                                                    "check: bad group %llu"
347                                                    " vs %u",
348                                                    (long long)h, g);
349                                 goto fail;
350                         }
351
352                         /* Are bucket bits correct? */
353                         bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
354                         if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
355                             != bucket) {
356                                 used_bits -= TDB_HASH_GROUP_BITS;
357                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
358                                                    TDB_LOG_ERROR,
359                                                    "check: bad bucket %u vs %u",
360                                                    (unsigned)get_bits(h,
361                                                         TDB_HASH_GROUP_BITS,
362                                                         &used_bits),
363                                                    bucket);
364                                 goto fail;
365                         }
366
367                         /* There must not be any zero entries between
368                          * the bucket it belongs in and this one! */
369                         for (i = bucket;
370                              i != b;
371                              i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
372                                 if (group[i] == 0) {
373                                         ecode = TDB_ERR_CORRUPT;
374                                         tdb_logerr(tdb, ecode,
375                                                    TDB_LOG_ERROR,
376                                                    "check: bad group placement"
377                                                    " %u vs %u",
378                                                    b, bucket);
379                                         goto fail;
380                                 }
381                         }
382
383                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
384                         if (ecode != TDB_SUCCESS) {
385                                 goto fail;
386                         }
387
388                         /* Bottom bits must match header. */
389                         if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
390                                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
391                                                    TDB_LOG_ERROR,
392                                                    "tdb_check: Bad hash magic"
393                                                    " at offset %llu"
394                                                    " (0x%llx vs 0x%llx)",
395                                                    (long long)off,
396                                                    (long long)h,
397                                                    (long long)rec_hash(&rec));
398                                 goto fail;
399                         }
400
401                 check:
402                         if (check) {
403                                 TDB_DATA key, data;
404                                 key.dsize = rec_key_length(&rec);
405                                 data.dsize = rec_data_length(&rec);
406                                 key.dptr = (void *)tdb_access_read(tdb,
407                                                    off + sizeof(rec),
408                                                    key.dsize + data.dsize,
409                                                    false);
410                                 if (TDB_PTR_IS_ERR(key.dptr)) {
411                                         ecode = TDB_PTR_ERR(key.dptr);
412                                         goto fail;
413                                 }
414                                 data.dptr = key.dptr + key.dsize;
415                                 ecode = check(key, data, private_data);
416                                 if (ecode != TDB_SUCCESS) {
417                                         goto fail;
418                                 }
419                                 tdb_access_release(tdb, key.dptr);
420                         }
421                 }
422         }
423         tdb_access_release(tdb, hash);
424         return TDB_SUCCESS;
425
426 fail:
427         tdb_access_release(tdb, hash);
428         return ecode;
429 }
430
431 static enum TDB_ERROR check_hash(struct tdb_context *tdb,
432                                  tdb_off_t used[],
433                                  size_t num_used, size_t num_ftables,
434                                  int (*check)(TDB_DATA, TDB_DATA, void *),
435                                  void *private_data)
436 {
437         /* Free tables also show up as used. */
438         size_t num_found = num_ftables;
439         enum TDB_ERROR ecode;
440
441         ecode = check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
442                                 TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
443                                 0, 0, used, num_used, &num_found,
444                                 check, private_data);
445         if (ecode == TDB_SUCCESS) {
446                 if (num_found != num_used) {
447                         ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
448                                            "tdb_check: Not all entries"
449                                            " are in hash");
450                 }
451         }
452         return ecode;
453 }
454
455 static enum TDB_ERROR check_free(struct tdb_context *tdb,
456                                  tdb_off_t off,
457                                  const struct tdb_free_record *frec,
458                                  tdb_off_t prev, unsigned int ftable,
459                                  unsigned int bucket)
460 {
461         enum TDB_ERROR ecode;
462
463         if (frec_magic(frec) != TDB_FREE_MAGIC) {
464                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
465                                   "tdb_check: offset %llu bad magic 0x%llx",
466                                   (long long)off,
467                                   (long long)frec->magic_and_prev);
468         }
469         if (frec_ftable(frec) != ftable) {
470                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
471                                   "tdb_check: offset %llu bad freetable %u",
472                                   (long long)off, frec_ftable(frec));
473
474         }
475
476         ecode = tdb->methods->oob(tdb, off
477                                   + frec_len(frec)
478                                   + sizeof(struct tdb_used_record),
479                                   false);
480         if (ecode != TDB_SUCCESS) {
481                 return ecode;
482         }
483         if (size_to_bucket(frec_len(frec)) != bucket) {
484                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
485                                   "tdb_check: offset %llu in wrong bucket"
486                                   " (%u vs %u)",
487                                   (long long)off,
488                                   bucket, size_to_bucket(frec_len(frec)));
489         }
490         if (prev != frec_prev(frec)) {
491                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
492                                   "tdb_check: offset %llu bad prev"
493                                   " (%llu vs %llu)",
494                                   (long long)off,
495                                   (long long)prev, (long long)frec_len(frec));
496         }
497         return TDB_SUCCESS;
498 }
499
500 static enum TDB_ERROR check_free_table(struct tdb_context *tdb,
501                                        tdb_off_t ftable_off,
502                                        unsigned ftable_num,
503                                        tdb_off_t fr[],
504                                        size_t num_free,
505                                        size_t *num_found)
506 {
507         struct tdb_freetable ft;
508         tdb_off_t h;
509         unsigned int i;
510         enum TDB_ERROR ecode;
511
512         ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft));
513         if (ecode != TDB_SUCCESS) {
514                 return ecode;
515         }
516
517         if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
518             || rec_key_length(&ft.hdr) != 0
519             || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
520             || rec_hash(&ft.hdr) != 0) {
521                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
522                                   "tdb_check: Invalid header on free table");
523         }
524
525         for (i = 0; i < TDB_FREE_BUCKETS; i++) {
526                 tdb_off_t off, prev = 0, *p;
527                 struct tdb_free_record f;
528
529                 h = bucket_off(ftable_off, i);
530                 for (off = tdb_read_off(tdb, h); off; off = f.next) {
531                         if (TDB_OFF_IS_ERR(off)) {
532                                 return off;
533                         }
534                         ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
535                         if (ecode != TDB_SUCCESS) {
536                                 return ecode;
537                         }
538                         ecode = check_free(tdb, off, &f, prev, ftable_num, i);
539                         if (ecode != TDB_SUCCESS) {
540                                 return ecode;
541                         }
542
543                         /* FIXME: Check hash bits */
544                         p = asearch(&off, fr, num_free, off_cmp);
545                         if (!p) {
546                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
547                                                   TDB_LOG_ERROR,
548                                                   "tdb_check: Invalid offset"
549                                                   " %llu in free table",
550                                                   (long long)off);
551                         }
552                         /* Mark it invalid. */
553                         *p ^= 1;
554                         (*num_found)++;
555                         prev = off;
556                 }
557         }
558         return TDB_SUCCESS;
559 }
560
561 /* Slow, but should be very rare. */
562 tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off)
563 {
564         size_t len;
565         enum TDB_ERROR ecode;
566
567         for (len = 0; off + len < tdb->map_size; len++) {
568                 char c;
569                 ecode = tdb->methods->tread(tdb, off, &c, 1);
570                 if (ecode != TDB_SUCCESS) {
571                         return ecode;
572                 }
573                 if (c != 0 && c != 0x43)
574                         break;
575         }
576         return len;
577 }
578
579 static enum TDB_ERROR check_linear(struct tdb_context *tdb,
580                                    tdb_off_t **used, size_t *num_used,
581                                    tdb_off_t **fr, size_t *num_free,
582                                    uint64_t features, tdb_off_t recovery)
583 {
584         tdb_off_t off;
585         tdb_len_t len;
586         enum TDB_ERROR ecode;
587         bool found_recovery = false;
588
589         for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
590                 union {
591                         struct tdb_used_record u;
592                         struct tdb_free_record f;
593                         struct tdb_recovery_record r;
594                 } rec;
595                 /* r is larger: only get that if we need to. */
596                 ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f));
597                 if (ecode != TDB_SUCCESS) {
598                         return ecode;
599                 }
600
601                 /* If we crash after ftruncate, we can get zeroes or fill. */
602                 if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
603                     || rec.r.magic ==  0x4343434343434343ULL) {
604                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
605                         if (ecode != TDB_SUCCESS) {
606                                 return ecode;
607                         }
608                         if (recovery == off) {
609                                 found_recovery = true;
610                                 len = sizeof(rec.r) + rec.r.max_len;
611                         } else {
612                                 len = dead_space(tdb, off);
613                                 if (TDB_OFF_IS_ERR(len)) {
614                                         return len;
615                                 }
616                                 if (len < sizeof(rec.r)) {
617                                         return tdb_logerr(tdb, TDB_ERR_CORRUPT,
618                                                           TDB_LOG_ERROR,
619                                                           "tdb_check: invalid"
620                                                           " dead space at %zu",
621                                                           (size_t)off);
622                                 }
623
624                                 tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
625                                            "Dead space at %zu-%zu (of %zu)",
626                                            (size_t)off, (size_t)(off + len),
627                                            (size_t)tdb->map_size);
628                         }
629                 } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
630                         ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
631                         if (ecode != TDB_SUCCESS) {
632                                 return ecode;
633                         }
634                         if (recovery != off) {
635                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
636                                                   TDB_LOG_ERROR,
637                                                   "tdb_check: unexpected"
638                                                   " recovery record at offset"
639                                                   " %zu",
640                                                   (size_t)off);
641                         }
642                         if (rec.r.len > rec.r.max_len) {
643                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
644                                                   TDB_LOG_ERROR,
645                                                   "tdb_check: invalid recovery"
646                                                   " length %zu",
647                                                   (size_t)rec.r.len);
648                         }
649                         if (rec.r.eof > tdb->map_size) {
650                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
651                                                   TDB_LOG_ERROR,
652                                                   "tdb_check: invalid old EOF"
653                                                   " %zu", (size_t)rec.r.eof);
654                         }
655                         found_recovery = true;
656                         len = sizeof(rec.r) + rec.r.max_len;
657                 } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
658                         len = sizeof(rec.u) + frec_len(&rec.f);
659                         if (off + len > tdb->map_size) {
660                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
661                                                   TDB_LOG_ERROR,
662                                                   "tdb_check: free overlength"
663                                                   " %llu at offset %llu",
664                                                   (long long)len,
665                                                   (long long)off);
666                         }
667                         /* This record should be in free lists. */
668                         if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
669                             && !append(fr, num_free, off)) {
670                                 return tdb_logerr(tdb, TDB_ERR_OOM,
671                                                   TDB_LOG_ERROR,
672                                                   "tdb_check: tracking %zu'th"
673                                                   " free record.", *num_free);
674                         }
675                 } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
676                            || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
677                            || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
678                            || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
679                         uint64_t klen, dlen, extra;
680
681                         /* This record is used! */
682                         if (!append(used, num_used, off)) {
683                                 return tdb_logerr(tdb, TDB_ERR_OOM,
684                                                   TDB_LOG_ERROR,
685                                                   "tdb_check: tracking %zu'th"
686                                                   " used record.", *num_used);
687                         }
688
689                         klen = rec_key_length(&rec.u);
690                         dlen = rec_data_length(&rec.u);
691                         extra = rec_extra_padding(&rec.u);
692
693                         len = sizeof(rec.u) + klen + dlen + extra;
694                         if (off + len > tdb->map_size) {
695                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
696                                                   TDB_LOG_ERROR,
697                                                   "tdb_check: used overlength"
698                                                   " %llu at offset %llu",
699                                                   (long long)len,
700                                                   (long long)off);
701                         }
702
703                         if (len < sizeof(rec.f)) {
704                                 return tdb_logerr(tdb, TDB_ERR_CORRUPT,
705                                                   TDB_LOG_ERROR,
706                                                   "tdb_check: too short record"
707                                                   " %llu at %llu",
708                                                   (long long)len,
709                                                   (long long)off);
710                         }
711
712                         /* Check that records have correct 0 at end (but may
713                          * not in future). */
714                         if (extra && !features) {
715                                 const char *p;
716                                 char c;
717                                 p = tdb_access_read(tdb, off + sizeof(rec.u)
718                                                     + klen + dlen, 1, false);
719                                 if (TDB_PTR_IS_ERR(p))
720                                         return TDB_PTR_ERR(p);
721                                 c = *p;
722                                 tdb_access_release(tdb, p);
723
724                                 if (c != '\0') {
725                                         return tdb_logerr(tdb, TDB_ERR_CORRUPT,
726                                                           TDB_LOG_ERROR,
727                                                           "tdb_check:"
728                                                           " non-zero extra"
729                                                           " at %llu",
730                                                           (long long)off);
731                                 }
732                         }
733                 } else {
734                         return tdb_logerr(tdb, TDB_ERR_CORRUPT,
735                                           TDB_LOG_ERROR,
736                                           "tdb_check: Bad magic 0x%llx"
737                                           " at offset %zu",
738                                           (long long)rec_magic(&rec.u),
739                                           (size_t)off);
740                 }
741         }
742
743         /* We must have found recovery area if there was one. */
744         if (recovery != 0 && !found_recovery) {
745                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
746                                   "tdb_check: expected a recovery area at %zu",
747                                   (size_t)recovery);
748         }
749
750         return TDB_SUCCESS;
751 }
752
753 enum TDB_ERROR tdb_check_(struct tdb_context *tdb,
754                           enum TDB_ERROR (*check)(TDB_DATA key, TDB_DATA data,
755                                                   void *private),
756                           void *private)
757 {
758         tdb_off_t *fr = NULL, *used = NULL, ft, recovery;
759         size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0;
760         uint64_t features;
761         enum TDB_ERROR ecode;
762
763         ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
764         if (ecode != TDB_SUCCESS) {
765                 return ecode;
766         }
767
768         ecode = tdb_lock_expand(tdb, F_RDLCK);
769         if (ecode != TDB_SUCCESS) {
770                 tdb_allrecord_unlock(tdb, F_RDLCK);
771                 return ecode;
772         }
773
774         ecode = check_header(tdb, &recovery, &features);
775         if (ecode != TDB_SUCCESS)
776                 goto out;
777
778         /* First we do a linear scan, checking all records. */
779         ecode = check_linear(tdb, &used, &num_used, &fr, &num_free, features,
780                              recovery);
781         if (ecode != TDB_SUCCESS)
782                 goto out;
783
784         for (ft = first_ftable(tdb); ft; ft = next_ftable(tdb, ft)) {
785                 if (TDB_OFF_IS_ERR(ft)) {
786                         ecode = ft;
787                         goto out;
788                 }
789                 ecode = check_free_table(tdb, ft, num_ftables, fr, num_free,
790                                          &num_found);
791                 if (ecode != TDB_SUCCESS)
792                         goto out;
793                 num_ftables++;
794         }
795
796         /* FIXME: Check key uniqueness? */
797         ecode = check_hash(tdb, used, num_used, num_ftables, check, private);
798         if (ecode != TDB_SUCCESS)
799                 goto out;
800
801         if (num_found != num_free) {
802                 ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
803                                    "tdb_check: Not all entries are in"
804                                    " free table");
805         }
806
807 out:
808         tdb_allrecord_unlock(tdb, F_RDLCK);
809         tdb_unlock_expand(tdb, F_RDLCK);
810         free(fr);
811         free(used);
812         return ecode;
813 }