]> git.ozlabs.org Git - ccan/blob - ccan/tdb2/check.c
tdb2: compare the extra 11 hash bits in the header.
[ccan] / ccan / tdb2 / check.c
1  /* 
2    Trivial Database 2: free list/block handling
3    Copyright (C) Rusty Russell 2010
4    
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
21
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
24 {
25         tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
26         if (!new)
27                 return false;
28         new[(*num)++] = off;
29         *arr = new;
30         return true;
31 }
32
33 static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
34 {
35         uint64_t hash_test;
36         struct tdb_header hdr;
37
38         if (tdb_read_convert(tdb, 0, &hdr, sizeof(hdr)) == -1)
39                 return false;
40         /* magic food should not be converted, so convert back. */
41         tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
42
43         hash_test = TDB_HASH_MAGIC;
44         hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
45         if (hdr.hash_test != hash_test) {
46                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
47                            "check: hash test %llu should be %llu",
48                            (long long)hdr.hash_test,
49                            (long long)hash_test);
50                 return false;
51         }
52
53         if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
54                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
55                            "check: bad magic '%.*s'",
56                            (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
57                 return false;
58         }
59
60         *recovery = hdr.recovery;
61         if (*recovery) {
62                 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
63                         tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
64                                  "tdb_check: invalid recovery offset %zu",
65                                  (size_t)*recovery);
66                         return false;
67                 }
68         }
69
70         /* Don't check reserved: they *can* be used later. */
71         return true;
72 }
73
74 static bool check_hash_tree(struct tdb_context *tdb,
75                             tdb_off_t off, unsigned int group_bits,
76                             uint64_t hprefix,
77                             unsigned hprefix_bits,
78                             tdb_off_t used[],
79                             size_t num_used,
80                             size_t *num_found,
81                             int (*check)(TDB_DATA, TDB_DATA, void *),
82                             void *private_data);
83
84 static bool check_hash_record(struct tdb_context *tdb,
85                               tdb_off_t off,
86                               uint64_t hprefix,
87                               unsigned hprefix_bits,
88                               tdb_off_t used[],
89                               size_t num_used,
90                               size_t *num_found,
91                               int (*check)(TDB_DATA, TDB_DATA, void *),
92                               void *private_data)
93 {
94         struct tdb_used_record rec;
95
96         if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
97                 return false;
98
99         if (rec_data_length(&rec)
100             != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
101                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
102                            "tdb_check: Bad hash table length %llu vs %llu",
103                            (long long)rec_data_length(&rec),
104                            (long long)sizeof(tdb_off_t)
105                            << TDB_SUBLEVEL_HASH_BITS);
106                 return false;
107         }
108         if (rec_key_length(&rec) != 0) {
109                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
110                          "tdb_check: Bad hash table key length %llu",
111                          (long long)rec_key_length(&rec));
112                 return false;
113         }
114         if (rec_hash(&rec) != 0) {
115                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
116                          "tdb_check: Bad hash table hash value %llu",
117                          (long long)rec_hash(&rec));
118                 return false;
119         }
120
121         off += sizeof(rec);
122         return check_hash_tree(tdb, off,
123                                TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
124                                hprefix, hprefix_bits,
125                                used, num_used, num_found, check, private_data);
126 }
127
128 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
129 {
130         /* Can overflow an int. */
131         return *a > *b ? 1
132                 : *a < *b ? -1
133                 : 0;
134 }
135
136 static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
137 {
138         *used += num;
139
140         return (h >> (64 - *used)) & ((1U << num) - 1);
141 }
142
143 static bool check_hash_tree(struct tdb_context *tdb,
144                             tdb_off_t off, unsigned int group_bits,
145                             uint64_t hprefix,
146                             unsigned hprefix_bits,
147                             tdb_off_t used[],
148                             size_t num_used,
149                             size_t *num_found,
150                             int (*check)(TDB_DATA, TDB_DATA, void *),
151                             void *private_data)
152 {
153         unsigned int g, b;
154         const tdb_off_t *hash;
155         struct tdb_used_record rec;
156
157         hash = tdb_access_read(tdb, off,
158                                sizeof(tdb_off_t)
159                                << (group_bits + TDB_HASH_GROUP_BITS),
160                                true);
161         if (!hash)
162                 return false;
163
164         for (g = 0; g < (1 << group_bits); g++) {
165                 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
166                 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
167                         unsigned int bucket, i, used_bits;
168                         uint64_t h;
169                         tdb_off_t *p;
170                         if (group[b] == 0)
171                                 continue;
172
173                         off = group[b] & TDB_OFF_MASK;
174                         p = asearch(&off, used, num_used, off_cmp);
175                         if (!p) {
176                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
177                                            TDB_DEBUG_ERROR,
178                                            "tdb_check: Invalid offset %llu "
179                                            "in hash", (long long)off);
180                                 goto fail;
181                         }
182                         /* Mark it invalid. */
183                         *p ^= 1;
184                         (*num_found)++;
185
186                         if (is_subhash(group[b])) {
187                                 uint64_t subprefix;
188                                 subprefix = (hprefix 
189                                      << (group_bits + TDB_HASH_GROUP_BITS))
190                                         + g * (1 << TDB_HASH_GROUP_BITS) + b;
191
192                                 if (!check_hash_record(tdb,
193                                                group[b] & TDB_OFF_MASK,
194                                                subprefix,
195                                                hprefix_bits
196                                                        + group_bits
197                                                        + TDB_HASH_GROUP_BITS,
198                                                used, num_used, num_found,
199                                                check, private_data))
200                                         goto fail;
201                                 continue;
202                         }
203                         /* A normal entry */
204
205                         /* Does it belong here at all? */
206                         h = hash_record(tdb, off);
207                         used_bits = 0;
208                         if (get_bits(h, hprefix_bits, &used_bits) != hprefix
209                             && hprefix_bits) {
210                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
211                                            TDB_DEBUG_ERROR,
212                                            "check: bad hash placement"
213                                            " 0x%llx vs 0x%llx",
214                                          (long long)h, (long long)hprefix);
215                                 goto fail;
216                         }
217
218                         /* Does it belong in this group? */
219                         if (get_bits(h, group_bits, &used_bits) != g) {
220                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
221                                            TDB_DEBUG_ERROR,
222                                            "check: bad group %llu vs %u",
223                                            (long long)h, g);
224                                 goto fail;
225                         }
226
227                         /* Are bucket bits correct? */
228                         bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
229                         if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
230                             != bucket) {
231                                 used_bits -= TDB_HASH_GROUP_BITS;
232                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
233                                            TDB_DEBUG_ERROR,
234                                          "check: bad bucket %u vs %u",
235                                          (unsigned)get_bits(h,
236                                                         TDB_HASH_GROUP_BITS,
237                                                         &used_bits),
238                                          bucket);
239                                 goto fail;
240                         }
241
242                         /* There must not be any zero entries between
243                          * the bucket it belongs in and this one! */
244                         for (i = bucket;
245                              i != b;
246                              i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
247                                 if (group[i] == 0) {
248                                         tdb_logerr(tdb, TDB_ERR_CORRUPT,
249                                                    TDB_DEBUG_ERROR,
250                                                    "check: bad group placement"
251                                                    " %u vs %u",
252                                                    b, bucket);
253                                         goto fail;
254                                 }
255                         }
256
257                         if (tdb_read_convert(tdb, off, &rec, sizeof(rec)))
258                                 goto fail;
259
260                         /* Bottom bits must match header. */
261                         if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
262                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
263                                            TDB_DEBUG_ERROR,
264                                            "tdb_check: Bad hash magic at"
265                                            " offset %llu (0x%llx vs 0x%llx)",
266                                            (long long)off,
267                                            (long long)h,
268                                            (long long)rec_hash(&rec));
269                                 goto fail;
270                         }
271
272                         if (check) {
273                                 TDB_DATA key, data;
274                                 key.dsize = rec_key_length(&rec);
275                                 data.dsize = rec_data_length(&rec);
276                                 key.dptr = (void *)tdb_access_read(tdb,
277                                                    off + sizeof(rec),
278                                                    key.dsize + data.dsize,
279                                                    false);
280                                 if (!key.dptr)
281                                         goto fail;
282                                 data.dptr = key.dptr + key.dsize;
283                                 if (check(key, data, private_data) != 0)
284                                         goto fail;
285                                 tdb_access_release(tdb, key.dptr);
286                         }
287                 }
288         }
289         tdb_access_release(tdb, hash);
290         return true;
291
292 fail:
293         tdb_access_release(tdb, hash);
294         return false;
295 }
296
297 static bool check_hash(struct tdb_context *tdb,
298                        tdb_off_t used[],
299                        size_t num_used, size_t num_flists,
300                        int (*check)(TDB_DATA, TDB_DATA, void *),
301                        void *private_data)
302 {
303         /* Free lists also show up as used. */
304         size_t num_found = num_flists;
305
306         if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
307                              TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
308                              0, 0, used, num_used, &num_found,
309                              check, private_data))
310                 return false;
311
312         if (num_found != num_used) {
313                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
314                            "tdb_check: Not all entries are in hash");
315                 return false;
316         }
317         return true;
318 }
319
320 static bool check_free(struct tdb_context *tdb,
321                        tdb_off_t off,
322                        const struct tdb_free_record *frec,
323                        tdb_off_t prev, unsigned int flist, unsigned int bucket)
324 {
325         if (frec_magic(frec) != TDB_FREE_MAGIC) {
326                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
327                            "tdb_check: offset %llu bad magic 0x%llx",
328                            (long long)off, (long long)frec->magic_and_prev);
329                 return false;
330         }
331         if (frec_flist(frec) != flist) {
332                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
333                            "tdb_check: offset %llu bad freelist %u",
334                            (long long)off, frec_flist(frec));
335                 return false;
336         }
337
338         if (tdb->methods->oob(tdb, off
339                               + frec_len(frec) + sizeof(struct tdb_used_record),
340                               false))
341                 return false;
342         if (size_to_bucket(frec_len(frec)) != bucket) {
343                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
344                            "tdb_check: offset %llu in wrong bucket %u vs %u",
345                            (long long)off,
346                            bucket, size_to_bucket(frec_len(frec)));
347                 return false;
348         }
349         if (prev != frec_prev(frec)) {
350                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
351                            "tdb_check: offset %llu bad prev %llu vs %llu",
352                            (long long)off,
353                            (long long)prev, (long long)frec_len(frec));
354                 return false;
355         }
356         return true;
357 }
358                        
359 static bool check_free_list(struct tdb_context *tdb,
360                             tdb_off_t flist_off,
361                             unsigned flist_num,
362                             tdb_off_t free[],
363                             size_t num_free,
364                             size_t *num_found)
365 {
366         struct tdb_freelist flist;
367         tdb_off_t h;
368         unsigned int i;
369
370         if (tdb_read_convert(tdb, flist_off, &flist, sizeof(flist)) == -1)
371                 return false;
372
373         if (rec_magic(&flist.hdr) != TDB_MAGIC
374             || rec_key_length(&flist.hdr) != 0
375             || rec_data_length(&flist.hdr) != sizeof(flist) - sizeof(flist.hdr)
376             || rec_hash(&flist.hdr) != 1) {
377                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
378                            "tdb_check: Invalid header on free list");
379                 return false;
380         }
381
382         for (i = 0; i < TDB_FREE_BUCKETS; i++) {
383                 tdb_off_t off, prev = 0, *p;
384                 struct tdb_free_record f;
385
386                 h = bucket_off(flist_off, i);
387                 for (off = tdb_read_off(tdb, h); off; off = f.next) {
388                         if (off == TDB_OFF_ERR)
389                                 return false;
390                         if (tdb_read_convert(tdb, off, &f, sizeof(f)))
391                                 return false;
392                         if (!check_free(tdb, off, &f, prev, flist_num, i))
393                                 return false;
394
395                         /* FIXME: Check hash bits */
396                         p = asearch(&off, free, num_free, off_cmp);
397                         if (!p) {
398                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
399                                            TDB_DEBUG_ERROR,
400                                            "tdb_check: Invalid offset"
401                                            " %llu in free table",
402                                            (long long)off);
403                                 return false;
404                         }
405                         /* Mark it invalid. */
406                         *p ^= 1;
407                         (*num_found)++;
408                         prev = off;
409                 }
410         }
411         return true;
412 }
413
414 /* Slow, but should be very rare. */
415 size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
416 {
417         size_t len;
418
419         for (len = 0; off + len < tdb->map_size; len++) {
420                 char c;
421                 if (tdb->methods->read(tdb, off, &c, 1))
422                         return 0;
423                 if (c != 0 && c != 0x43)
424                         break;
425         }
426         return len;
427 }
428
429 static bool check_linear(struct tdb_context *tdb,
430                          tdb_off_t **used, size_t *num_used,
431                          tdb_off_t **free, size_t *num_free,
432                          tdb_off_t recovery)
433 {
434         tdb_off_t off;
435         tdb_len_t len;
436         bool found_recovery = false;
437
438         for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
439                 union {
440                         struct tdb_used_record u;
441                         struct tdb_free_record f;
442                         struct tdb_recovery_record r;
443                 } rec;
444                 /* r is larger: only get that if we need to. */
445                 if (tdb_read_convert(tdb, off, &rec, sizeof(rec.f)) == -1)
446                         return false;
447
448                 /* If we crash after ftruncate, we can get zeroes or fill. */
449                 if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
450                     || rec.r.magic ==  0x4343434343434343ULL) {
451                         if (tdb_read_convert(tdb, off, &rec, sizeof(rec.r)))
452                                 return false;
453
454                         if (recovery == off) {
455                                 found_recovery = true;
456                                 len = sizeof(rec.r) + rec.r.max_len;
457                         } else {
458                                 len = dead_space(tdb, off);
459                                 if (len < sizeof(rec.r)) {
460                                         tdb_logerr(tdb, TDB_ERR_CORRUPT,
461                                                    TDB_DEBUG_ERROR,
462                                                    "tdb_check: invalid dead"
463                                                    " space at %zu",
464                                                    (size_t)off);
465                                         return false;
466                                 }
467
468                                 tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_WARNING,
469                                            "Dead space at %zu-%zu (of %zu)",
470                                            (size_t)off, (size_t)(off + len),
471                                            (size_t)tdb->map_size);
472                         }
473                 } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
474                         if (tdb_read_convert(tdb, off, &rec, sizeof(rec.r)))
475                                 return false;
476                         if (recovery != off) {
477                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
478                                            TDB_DEBUG_ERROR,
479                                            "tdb_check: unexpected recovery"
480                                            " record at offset %zu",
481                                            (size_t)off);
482                                 return false;
483                         }
484                         if (rec.r.len > rec.r.max_len) {
485                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
486                                            TDB_DEBUG_ERROR,
487                                            "tdb_check: invalid recovery length"
488                                            " %zu", (size_t)rec.r.len);
489                                 return false;
490                         }
491                         if (rec.r.eof > tdb->map_size) {
492                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
493                                            TDB_DEBUG_ERROR,
494                                            "tdb_check: invalid old EOF"
495                                            " %zu", (size_t)rec.r.eof);
496                                 return false;
497                         }
498                         found_recovery = true;
499                         len = sizeof(rec.r) + rec.r.max_len;
500                 } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC
501                            || frec_magic(&rec.f) == TDB_COALESCING_MAGIC) {
502                         len = sizeof(rec.u) + frec_len(&rec.f);
503                         if (off + len > tdb->map_size) {
504                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
505                                            TDB_DEBUG_ERROR,
506                                            "tdb_check: free overlength %llu"
507                                            " at offset %llu",
508                                            (long long)len, (long long)off);
509                                 return false;
510                         }
511                         /* This record is free! */
512                         if (frec_magic(&rec.f) == TDB_FREE_MAGIC
513                             && !append(free, num_free, off))
514                                 return false;
515                 } else {
516                         uint64_t klen, dlen, extra;
517
518                         /* This record is used! */
519                         if (rec_magic(&rec.u) != TDB_MAGIC) {
520                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
521                                            TDB_DEBUG_ERROR,
522                                            "tdb_check: Bad magic 0x%llx"
523                                            " at offset %zu",
524                                            (long long)rec_magic(&rec.u),
525                                            (size_t)off);
526                                 return false;
527                         }
528
529                         if (!append(used, num_used, off))
530                                 return false;
531
532                         klen = rec_key_length(&rec.u);
533                         dlen = rec_data_length(&rec.u);
534                         extra = rec_extra_padding(&rec.u);
535
536                         len = sizeof(rec.u) + klen + dlen + extra;
537                         if (off + len > tdb->map_size) {
538                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
539                                            TDB_DEBUG_ERROR,
540                                            "tdb_check: used overlength %llu"
541                                            " at offset %llu",
542                                            (long long)len, (long long)off);
543                                 return false;
544                         }
545
546                         if (len < sizeof(rec.f)) {
547                                 tdb_logerr(tdb, TDB_ERR_CORRUPT,
548                                            TDB_DEBUG_ERROR,
549                                            "tdb_check: too short record %llu"
550                                            " at %llu",
551                                            (long long)len, (long long)off);
552                                 return false;
553                         }
554                 }
555         }
556
557         /* We must have found recovery area if there was one. */
558         if (recovery != 0 && !found_recovery) {
559                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
560                            "tdb_check: expected a recovery area at %zu",
561                            (size_t)recovery);
562                 return false;
563         }
564
565         return true;
566 }
567
568 int tdb_check(struct tdb_context *tdb,
569               int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
570               void *private_data)
571 {
572         tdb_off_t *free = NULL, *used = NULL, flist, recovery;
573         size_t num_free = 0, num_used = 0, num_found = 0, num_flists = 0;
574
575         if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
576                 return -1;
577
578         if (tdb_lock_expand(tdb, F_RDLCK) != 0) {
579                 tdb_allrecord_unlock(tdb, F_RDLCK);
580                 return -1;
581         }
582
583         if (!check_header(tdb, &recovery))
584                 goto fail;
585
586         /* First we do a linear scan, checking all records. */
587         if (!check_linear(tdb, &used, &num_used, &free, &num_free, recovery))
588                 goto fail;
589
590         for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) {
591                 if (flist == TDB_OFF_ERR)
592                         goto fail;
593                 if (!check_free_list(tdb, flist, num_flists, free, num_free,
594                                      &num_found))
595                         goto fail;
596                 num_flists++;
597         }
598
599         /* FIXME: Check key uniqueness? */
600         if (!check_hash(tdb, used, num_used, num_flists, check, private_data))
601                 goto fail;
602
603         if (num_found != num_free) {
604                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_DEBUG_ERROR,
605                            "tdb_check: Not all entries are in free table");
606                 return -1;
607         }
608
609         tdb_allrecord_unlock(tdb, F_RDLCK);
610         tdb_unlock_expand(tdb, F_RDLCK);
611         return 0;
612
613 fail:
614         tdb_allrecord_unlock(tdb, F_RDLCK);
615         tdb_unlock_expand(tdb, F_RDLCK);
616         return -1;
617 }