]> git.ozlabs.org Git - ccan/blob - ccan/tdb2/check.c
c5450cccb0cd17d9e8ba248db18fb272b152023c
[ccan] / ccan / tdb2 / check.c
1  /* 
2    Trivial Database 2: free list/block handling
3    Copyright (C) Rusty Russell 2010
4    
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
21
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
24 {
25         tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
26         if (!new)
27                 return false;
28         new[(*num)++] = off;
29         *arr = new;
30         return true;
31 }
32
33 static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
34 {
35         uint64_t hash_test;
36         struct tdb_header hdr;
37
38         if (tdb_read_convert(tdb, 0, &hdr, sizeof(hdr)) == -1)
39                 return false;
40         /* magic food should not be converted, so convert back. */
41         tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
42
43         hash_test = TDB_HASH_MAGIC;
44         hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
45         if (hdr.hash_test != hash_test) {
46                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
47                          "check: hash test %llu should be %llu\n",
48                          (long long)hdr.hash_test,
49                          (long long)hash_test);
50                 return false;
51         }
52
53         if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
54                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
55                          "check: bad magic '%.*s'\n",
56                          (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
57                 return false;
58         }
59
60         *recovery = hdr.recovery;
61         if (*recovery) {
62                 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
63                         tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
64                                  "tdb_check: invalid recovery offset %zu\n",
65                                  (size_t)*recovery);
66                         return false;
67                 }
68         }
69
70         /* Don't check reserved: they *can* be used later. */
71         return true;
72 }
73
74 static bool check_hash_tree(struct tdb_context *tdb,
75                             tdb_off_t off, unsigned int group_bits,
76                             uint64_t hprefix,
77                             unsigned hprefix_bits,
78                             tdb_off_t used[],
79                             size_t num_used,
80                             size_t *num_found,
81                             int (*check)(TDB_DATA, TDB_DATA, void *),
82                             void *private_data);
83
84 static bool check_hash_record(struct tdb_context *tdb,
85                               tdb_off_t off,
86                               uint64_t hprefix,
87                               unsigned hprefix_bits,
88                               tdb_off_t used[],
89                               size_t num_used,
90                               size_t *num_found,
91                               int (*check)(TDB_DATA, TDB_DATA, void *),
92                               void *private_data)
93 {
94         struct tdb_used_record rec;
95
96         if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
97                 return false;
98
99         if (rec_data_length(&rec)
100             != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
101                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
102                          "tdb_check: Bad hash table length %llu vs %llu\n",
103                          (long long)rec_data_length(&rec),
104                          (long long)sizeof(tdb_off_t)<<TDB_SUBLEVEL_HASH_BITS);
105                 return false;
106         }
107         if (rec_key_length(&rec) != 0) {
108                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
109                          "tdb_check: Bad hash table key length %llu\n",
110                          (long long)rec_key_length(&rec));
111                 return false;
112         }
113         if (rec_hash(&rec) != 0) {
114                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
115                          "tdb_check: Bad hash table hash value %llu\n",
116                          (long long)rec_hash(&rec));
117                 return false;
118         }
119
120         off += sizeof(rec);
121         return check_hash_tree(tdb, off,
122                                TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
123                                hprefix, hprefix_bits,
124                                used, num_used, num_found, check, private_data);
125 }
126
127 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
128 {
129         /* Can overflow an int. */
130         return *a > *b ? 1
131                 : *a < *b ? -1
132                 : 0;
133 }
134
135 static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
136 {
137         *used += num;
138
139         return (h >> (64 - *used)) & ((1U << num) - 1);
140 }
141
142 static bool check_hash_tree(struct tdb_context *tdb,
143                             tdb_off_t off, unsigned int group_bits,
144                             uint64_t hprefix,
145                             unsigned hprefix_bits,
146                             tdb_off_t used[],
147                             size_t num_used,
148                             size_t *num_found,
149                             int (*check)(TDB_DATA, TDB_DATA, void *),
150                             void *private_data)
151 {
152         unsigned int g, b;
153         const tdb_off_t *hash;
154         struct tdb_used_record rec;
155
156         hash = tdb_access_read(tdb, off,
157                                sizeof(tdb_off_t)
158                                << (group_bits + TDB_HASH_GROUP_BITS),
159                                true);
160         if (!hash)
161                 return false;
162
163         for (g = 0; g < (1 << group_bits); g++) {
164                 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
165                 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
166                         unsigned int bucket, i, used_bits;
167                         uint64_t h;
168                         tdb_off_t *p;
169                         if (group[b] == 0)
170                                 continue;
171
172                         off = group[b] & TDB_OFF_MASK;
173                         p = asearch(&off, used, num_used, off_cmp);
174                         if (!p) {
175                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
176                                          "tdb_check: Invalid offset %llu "
177                                          "in hash\n",
178                                          (long long)off);
179                                 goto fail;
180                         }
181                         /* Mark it invalid. */
182                         *p ^= 1;
183                         (*num_found)++;
184
185                         if (is_subhash(group[b])) {
186                                 uint64_t subprefix;
187                                 subprefix = (hprefix 
188                                      << (group_bits + TDB_HASH_GROUP_BITS))
189                                         + g * (1 << TDB_HASH_GROUP_BITS) + b;
190
191                                 if (!check_hash_record(tdb,
192                                                group[b] & TDB_OFF_MASK,
193                                                subprefix,
194                                                hprefix_bits
195                                                        + group_bits
196                                                        + TDB_HASH_GROUP_BITS,
197                                                used, num_used, num_found,
198                                                check, private_data))
199                                         goto fail;
200                                 continue;
201                         }
202                         /* A normal entry */
203
204                         /* Does it belong here at all? */
205                         h = hash_record(tdb, off);
206                         used_bits = 0;
207                         if (get_bits(h, hprefix_bits, &used_bits) != hprefix
208                             && hprefix_bits) {
209                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
210                                          "check: bad hash placement"
211                                          " 0x%llx vs 0x%llx\n",
212                                          (long long)h, (long long)hprefix);
213                                 goto fail;
214                         }
215
216                         /* Does it belong in this group? */
217                         if (get_bits(h, group_bits, &used_bits) != g) {
218                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
219                                          "check: bad group %llu vs %u\n",
220                                          (long long)h, g);
221                                 goto fail;
222                         }
223
224                         /* Are bucket bits correct? */
225                         bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
226                         if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
227                             != bucket) {
228                                 used_bits -= TDB_HASH_GROUP_BITS;
229                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
230                                          "check: bad bucket %u vs %u\n",
231                                          (unsigned)get_bits(h,
232                                                             TDB_HASH_GROUP_BITS,
233                                                             &used_bits),
234                                          bucket);
235                                 goto fail;
236                         }
237
238                         /* There must not be any zero entries between
239                          * the bucket it belongs in and this one! */
240                         for (i = bucket;
241                              i != b;
242                              i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
243                                 if (group[i] == 0) {
244                                         tdb->log(tdb, TDB_DEBUG_ERROR,
245                                                  tdb->log_priv,
246                                                  "check: bad group placement"
247                                                  " %u vs %u\n",
248                                                  b, bucket);
249                                         goto fail;
250                                 }
251                         }
252
253                         if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
254                                 goto fail;
255
256                         /* Bottom bits must match header. */
257                         if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
258                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
259                                          "tdb_check: Bad hash magic at"
260                                          " offset %llu (0x%llx vs 0x%llx)\n",
261                                          (long long)off,
262                                          (long long)h,
263                                          (long long)rec_hash(&rec));
264                                 goto fail;
265                         }
266
267                         if (check) {
268                                 TDB_DATA key, data;
269                                 key.dsize = rec_key_length(&rec);
270                                 data.dsize = rec_data_length(&rec);
271                                 key.dptr = (void *)tdb_access_read(tdb,
272                                                    off + sizeof(rec),
273                                                    key.dsize + data.dsize,
274                                                    false);
275                                 if (!key.dptr)
276                                         goto fail;
277                                 data.dptr = key.dptr + key.dsize;
278                                 if (check(key, data, private_data) != 0)
279                                         goto fail;
280                                 tdb_access_release(tdb, key.dptr);
281                         }
282                 }
283         }
284         tdb_access_release(tdb, hash);
285         return true;
286
287 fail:
288         tdb_access_release(tdb, hash);
289         return false;
290 }
291
292 static bool check_hash(struct tdb_context *tdb,
293                        tdb_off_t used[],
294                        size_t num_used, size_t num_flists,
295                        int (*check)(TDB_DATA, TDB_DATA, void *),
296                        void *private_data)
297 {
298         /* Free lists also show up as used. */
299         size_t num_found = num_flists;
300
301         if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
302                              TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
303                              0, 0, used, num_used, &num_found,
304                              check, private_data))
305                 return false;
306
307         if (num_found != num_used) {
308                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
309                          "tdb_check: Not all entries are in hash\n");
310                 return false;
311         }
312         return true;
313 }
314
315 static bool check_free(struct tdb_context *tdb,
316                        tdb_off_t off,
317                        const struct tdb_free_record *frec,
318                        tdb_off_t prev, tdb_off_t flist_off, unsigned int bucket)
319 {
320         if (frec_magic(frec) != TDB_FREE_MAGIC) {
321                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
322                          "tdb_check: offset %llu bad magic 0x%llx\n",
323                          (long long)off, (long long)frec->magic_and_meta);
324                 return false;
325         }
326         if (frec_flist(frec) != flist_off) {
327                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
328                          "tdb_check: offset %llu bad freelist 0x%llx\n",
329                          (long long)off, (long long)frec_flist(frec));
330                 return false;
331         }
332
333         if (tdb->methods->oob(tdb, off
334                               + frec->data_len+sizeof(struct tdb_used_record),
335                               false))
336                 return false;
337         if (size_to_bucket(frec->data_len) != bucket) {
338                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
339                          "tdb_check: offset %llu in wrong bucket %u vs %u\n",
340                          (long long)off,
341                          bucket, size_to_bucket(frec->data_len));
342                 return false;
343         }
344         if (prev != frec->prev) {
345                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
346                          "tdb_check: offset %llu bad prev %llu vs %llu\n",
347                          (long long)off,
348                          (long long)prev, (long long)frec->prev);
349                 return false;
350         }
351         return true;
352 }
353                        
354 static bool check_free_list(struct tdb_context *tdb,
355                             tdb_off_t flist_off,
356                             tdb_off_t free[],
357                             size_t num_free,
358                             size_t *num_found)
359 {
360         struct tdb_freelist flist;
361         tdb_off_t h;
362         unsigned int i;
363
364         if (tdb_read_convert(tdb, flist_off, &flist, sizeof(flist)) == -1)
365                 return false;
366
367         if (rec_magic(&flist.hdr) != TDB_MAGIC
368             || rec_key_length(&flist.hdr) != 0
369             || rec_data_length(&flist.hdr) != sizeof(flist) - sizeof(flist.hdr)
370             || rec_hash(&flist.hdr) != 1) {
371                 tdb->log(tdb, TDB_DEBUG_ERROR,
372                          tdb->log_priv,
373                          "tdb_check: Invalid header on free list\n");
374                 return false;
375         }
376
377         for (i = 0; i < TDB_FREE_BUCKETS; i++) {
378                 tdb_off_t off, prev = 0, *p;
379                 struct tdb_free_record f;
380
381                 h = bucket_off(flist_off, i);
382                 for (off = tdb_read_off(tdb, h); off; off = f.next) {
383                         if (off == TDB_OFF_ERR)
384                                 return false;
385                         if (tdb_read_convert(tdb, off, &f, sizeof(f)))
386                                 return false;
387                         if (!check_free(tdb, off, &f, prev, flist_off, i))
388                                 return false;
389
390                         /* FIXME: Check hash bits */
391                         p = asearch(&off, free, num_free, off_cmp);
392                         if (!p) {
393                                 tdb->log(tdb, TDB_DEBUG_ERROR,
394                                          tdb->log_priv,
395                                          "tdb_check: Invalid offset"
396                                          " %llu in free table\n",
397                                          (long long)off);
398                                 return false;
399                         }
400                         /* Mark it invalid. */
401                         *p ^= 1;
402                         (*num_found)++;
403                         prev = off;
404                 }
405         }
406         return true;
407 }
408
409 /* Slow, but should be very rare. */
410 size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
411 {
412         size_t len;
413
414         for (len = 0; off + len < tdb->map_size; len++) {
415                 char c;
416                 if (tdb->methods->read(tdb, off, &c, 1))
417                         return 0;
418                 if (c != 0 && c != 0x43)
419                         break;
420         }
421         return len;
422 }
423
424 static bool check_linear(struct tdb_context *tdb,
425                          tdb_off_t **used, size_t *num_used,
426                          tdb_off_t **free, size_t *num_free,
427                          tdb_off_t recovery)
428 {
429         tdb_off_t off;
430         tdb_len_t len;
431         bool found_recovery = false;
432
433         for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
434                 union {
435                         struct tdb_used_record u;
436                         struct tdb_free_record f;
437                         struct tdb_recovery_record r;
438                 } pad, *p;
439                 p = tdb_get(tdb, off, &pad, sizeof(pad));
440                 if (!p)
441                         return false;
442
443                 /* If we crash after ftruncate, we can get zeroes or fill. */
444                 if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
445                     || p->r.magic ==  0x4343434343434343ULL) {
446                         if (recovery == off) {
447                                 found_recovery = true;
448                                 len = sizeof(p->r) + p->r.max_len;
449                         } else {
450                                 len = dead_space(tdb, off);
451                                 if (len < sizeof(p->r)) {
452                                         tdb->log(tdb, TDB_DEBUG_ERROR,
453                                                  tdb->log_priv,
454                                                  "tdb_check: invalid dead space"
455                                                  " at %zu\n", (size_t)off);
456                                         return false;
457                                 }
458
459                                 tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
460                                          "Dead space at %zu-%zu (of %zu)\n",
461                                          (size_t)off, (size_t)(off + len),
462                                          (size_t)tdb->map_size);
463                         }
464                 } else if (p->r.magic == TDB_RECOVERY_MAGIC) {
465                         if (recovery != off) {
466                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
467                                          "tdb_check: unexpected recovery"
468                                          " record at offset %zu\n",
469                                          (size_t)off);
470                                 return false;
471                         }
472                         found_recovery = true;
473                         len = sizeof(p->r) + p->r.max_len;
474                 } else if (frec_magic(&p->f) == TDB_FREE_MAGIC
475                            || frec_magic(&p->f) == TDB_COALESCING_MAGIC) {
476                         len = sizeof(p->u) + p->f.data_len;
477                         if (off + len > tdb->map_size) {
478                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
479                                          "tdb_check: free overlength %llu"
480                                          " at offset %llu\n",
481                                          (long long)len, (long long)off);
482                                 return false;
483                         }
484                         /* This record is free! */
485                         if (frec_magic(&p->f) == TDB_FREE_MAGIC
486                             && !append(free, num_free, off))
487                                 return false;
488                 } else {
489                         uint64_t klen, dlen, extra;
490
491                         /* This record is used! */
492                         if (rec_magic(&p->u) != TDB_MAGIC) {
493                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
494                                          "tdb_check: Bad magic 0x%llx"
495                                          " at offset %llu\n",
496                                          (long long)rec_magic(&p->u),
497                                          (long long)off);
498                                 return false;
499                         }
500
501                         if (!append(used, num_used, off))
502                                 return false;
503
504                         klen = rec_key_length(&p->u);
505                         dlen = rec_data_length(&p->u);
506                         extra = rec_extra_padding(&p->u);
507
508                         len = sizeof(p->u) + klen + dlen + extra;
509                         if (off + len > tdb->map_size) {
510                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
511                                          "tdb_check: used overlength %llu"
512                                          " at offset %llu\n",
513                                          (long long)len, (long long)off);
514                                 return false;
515                         }
516
517                         if (len < sizeof(p->f)) {
518                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
519                                          "tdb_check: too short record %llu at"
520                                          " %llu\n",
521                                          (long long)len, (long long)off);
522                                 return false;
523                         }
524                 }
525         }
526
527         /* We must have found recovery area if there was one. */
528         if (recovery != 0 && !found_recovery) {
529                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
530                          "tdb_check: expected a recovery area at %zu\n",
531                          (size_t)recovery);
532                 return false;
533         }
534
535         return true;
536 }
537
538 int tdb_check(struct tdb_context *tdb,
539               int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
540               void *private_data)
541 {
542         tdb_off_t *free = NULL, *used = NULL, flist, recovery;
543         size_t num_free = 0, num_used = 0, num_found = 0, num_flists = 0;
544
545         if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
546                 return -1;
547
548         if (tdb_lock_expand(tdb, F_RDLCK) != 0) {
549                 tdb_allrecord_unlock(tdb, F_RDLCK);
550                 return -1;
551         }
552
553         if (!check_header(tdb, &recovery))
554                 goto fail;
555
556         /* First we do a linear scan, checking all records. */
557         if (!check_linear(tdb, &used, &num_used, &free, &num_free, recovery))
558                 goto fail;
559
560         for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) {
561                 if (flist == TDB_OFF_ERR)
562                         goto fail;
563                 if (!check_free_list(tdb, flist, free, num_free, &num_found))
564                         goto fail;
565                 num_flists++;
566         }
567
568         /* FIXME: Check key uniqueness? */
569         if (!check_hash(tdb, used, num_used, num_flists, check, private_data))
570                 goto fail;
571
572         if (num_found != num_free) {
573                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
574                          "tdb_check: Not all entries are in free table\n");
575                 return -1;
576         }
577
578         tdb_allrecord_unlock(tdb, F_RDLCK);
579         tdb_unlock_expand(tdb, F_RDLCK);
580         return 0;
581
582 fail:
583         tdb_allrecord_unlock(tdb, F_RDLCK);
584         tdb_unlock_expand(tdb, F_RDLCK);
585         return -1;
586 }