1ce56c2c474c892653deff8e79c0f6ca9088daa1
[ccan] / ccan / tdb2 / check.c
1  /* 
2    Trivial Database 2: free list/block handling
3    Copyright (C) Rusty Russell 2010
4    
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
21
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
24 {
25         tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
26         if (!new)
27                 return false;
28         new[(*num)++] = off;
29         *arr = new;
30         return true;
31 }
32
33 static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
34 {
35         uint64_t hash_test;
36         struct tdb_header hdr;
37
38         if (tdb_read_convert(tdb, 0, &hdr, sizeof(hdr)) == -1)
39                 return false;
40         /* magic food should not be converted, so convert back. */
41         tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
42
43         hash_test = TDB_HASH_MAGIC;
44         hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
45         if (hdr.hash_test != hash_test) {
46                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
47                          "check: hash test %llu should be %llu\n",
48                          (long long)hdr.hash_test,
49                          (long long)hash_test);
50                 return false;
51         }
52
53         if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
54                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
55                          "check: bad magic '%.*s'\n",
56                          (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
57                 return false;
58         }
59
60         *recovery = hdr.recovery;
61         if (*recovery) {
62                 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
63                         tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
64                                  "tdb_check: invalid recovery offset %zu\n",
65                                  (size_t)*recovery);
66                         return false;
67                 }
68         }
69
70         /* Don't check reserved: they *can* be used later. */
71         return true;
72 }
73
74 static bool check_hash_tree(struct tdb_context *tdb,
75                             tdb_off_t off, unsigned int group_bits,
76                             uint64_t hprefix,
77                             unsigned hprefix_bits,
78                             tdb_off_t used[],
79                             size_t num_used,
80                             size_t *num_found,
81                             int (*check)(TDB_DATA, TDB_DATA, void *),
82                             void *private_data);
83
84 static bool check_hash_record(struct tdb_context *tdb,
85                               tdb_off_t off,
86                               uint64_t hprefix,
87                               unsigned hprefix_bits,
88                               tdb_off_t used[],
89                               size_t num_used,
90                               size_t *num_found,
91                               int (*check)(TDB_DATA, TDB_DATA, void *),
92                               void *private_data)
93 {
94         struct tdb_used_record rec;
95
96         if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
97                 return false;
98
99         if (rec_data_length(&rec)
100             != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
101                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
102                          "tdb_check: Bad hash table length %llu vs %llu\n",
103                          (long long)rec_data_length(&rec),
104                          (long long)sizeof(tdb_off_t)<<TDB_SUBLEVEL_HASH_BITS);
105                 return false;
106         }
107         if (rec_key_length(&rec) != 0) {
108                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
109                          "tdb_check: Bad hash table key length %llu\n",
110                          (long long)rec_key_length(&rec));
111                 return false;
112         }
113         if (rec_hash(&rec) != 0) {
114                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
115                          "tdb_check: Bad hash table hash value %llu\n",
116                          (long long)rec_hash(&rec));
117                 return false;
118         }
119
120         off += sizeof(rec);
121         return check_hash_tree(tdb, off,
122                                TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
123                                hprefix, hprefix_bits,
124                                used, num_used, num_found, check, private_data);
125 }
126
127 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
128 {
129         /* Can overflow an int. */
130         return *a > *b ? 1
131                 : *a < *b ? -1
132                 : 0;
133 }
134
135 static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
136 {
137         *used += num;
138
139         return (h >> (64 - *used)) & ((1U << num) - 1);
140 }
141
142 static bool check_hash_tree(struct tdb_context *tdb,
143                             tdb_off_t off, unsigned int group_bits,
144                             uint64_t hprefix,
145                             unsigned hprefix_bits,
146                             tdb_off_t used[],
147                             size_t num_used,
148                             size_t *num_found,
149                             int (*check)(TDB_DATA, TDB_DATA, void *),
150                             void *private_data)
151 {
152         unsigned int g, b;
153         const tdb_off_t *hash;
154         struct tdb_used_record rec;
155
156         hash = tdb_access_read(tdb, off,
157                                sizeof(tdb_off_t)
158                                << (group_bits + TDB_HASH_GROUP_BITS),
159                                true);
160         if (!hash)
161                 return false;
162
163         for (g = 0; g < (1 << group_bits); g++) {
164                 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
165                 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
166                         unsigned int bucket, i, used_bits;
167                         uint64_t h;
168                         tdb_off_t *p;
169                         if (group[b] == 0)
170                                 continue;
171
172                         off = group[b] & TDB_OFF_MASK;
173                         p = asearch(&off, used, num_used, off_cmp);
174                         if (!p) {
175                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
176                                          "tdb_check: Invalid offset %llu "
177                                          "in hash\n",
178                                          (long long)off);
179                                 goto fail;
180                         }
181                         /* Mark it invalid. */
182                         *p ^= 1;
183                         (*num_found)++;
184
185                         if (is_subhash(group[b])) {
186                                 uint64_t subprefix;
187                                 subprefix = (hprefix 
188                                      << (group_bits + TDB_HASH_GROUP_BITS))
189                                         + g * (1 << TDB_HASH_GROUP_BITS) + b;
190
191                                 if (!check_hash_record(tdb,
192                                                group[b] & TDB_OFF_MASK,
193                                                subprefix,
194                                                hprefix_bits
195                                                        + group_bits
196                                                        + TDB_HASH_GROUP_BITS,
197                                                used, num_used, num_found,
198                                                check, private_data))
199                                         goto fail;
200                                 continue;
201                         }
202                         /* A normal entry */
203
204                         /* Does it belong here at all? */
205                         h = hash_record(tdb, off);
206                         used_bits = 0;
207                         if (get_bits(h, hprefix_bits, &used_bits) != hprefix
208                             && hprefix_bits) {
209                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
210                                          "check: bad hash placement"
211                                          " 0x%llx vs 0x%llx\n",
212                                          (long long)h, (long long)hprefix);
213                                 goto fail;
214                         }
215
216                         /* Does it belong in this group? */
217                         if (get_bits(h, group_bits, &used_bits) != g) {
218                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
219                                          "check: bad group %llu vs %u\n",
220                                          (long long)h, g);
221                                 goto fail;
222                         }
223
224                         /* Are bucket bits correct? */
225                         bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
226                         if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
227                             != bucket) {
228                                 used_bits -= TDB_HASH_GROUP_BITS;
229                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
230                                          "check: bad bucket %u vs %u\n",
231                                          (unsigned)get_bits(h,
232                                                             TDB_HASH_GROUP_BITS,
233                                                             &used_bits),
234                                          bucket);
235                                 goto fail;
236                         }
237
238                         /* There must not be any zero entries between
239                          * the bucket it belongs in and this one! */
240                         for (i = bucket;
241                              i != b;
242                              i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
243                                 if (group[i] == 0) {
244                                         tdb->log(tdb, TDB_DEBUG_ERROR,
245                                                  tdb->log_priv,
246                                                  "check: bad group placement"
247                                                  " %u vs %u\n",
248                                                  b, bucket);
249                                         goto fail;
250                                 }
251                         }
252
253                         if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
254                                 goto fail;
255
256                         /* Bottom bits must match header. */
257                         if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
258                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
259                                          "tdb_check: Bad hash magic at"
260                                          " offset %llu (0x%llx vs 0x%llx)\n",
261                                          (long long)off,
262                                          (long long)h,
263                                          (long long)rec_hash(&rec));
264                                 goto fail;
265                         }
266
267                         if (check) {
268                                 TDB_DATA key, data;
269                                 key.dsize = rec_key_length(&rec);
270                                 data.dsize = rec_data_length(&rec);
271                                 key.dptr = (void *)tdb_access_read(tdb,
272                                                    off + sizeof(rec),
273                                                    key.dsize + data.dsize,
274                                                    false);
275                                 if (!key.dptr)
276                                         goto fail;
277                                 data.dptr = key.dptr + key.dsize;
278                                 if (check(key, data, private_data) != 0)
279                                         goto fail;
280                                 tdb_access_release(tdb, key.dptr);
281                         }
282                 }
283         }
284         tdb_access_release(tdb, hash);
285         return true;
286
287 fail:
288         tdb_access_release(tdb, hash);
289         return false;
290 }
291
292 static bool check_hash(struct tdb_context *tdb,
293                        tdb_off_t used[],
294                        size_t num_used, size_t num_flists,
295                        int (*check)(TDB_DATA, TDB_DATA, void *),
296                        void *private_data)
297 {
298         /* Free lists also show up as used. */
299         size_t num_found = num_flists;
300
301         if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
302                              TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
303                              0, 0, used, num_used, &num_found,
304                              check, private_data))
305                 return false;
306
307         if (num_found != num_used) {
308                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
309                          "tdb_check: Not all entries are in hash\n");
310                 return false;
311         }
312         return true;
313 }
314
315 static bool check_free(struct tdb_context *tdb,
316                        tdb_off_t off,
317                        const struct tdb_free_record *frec,
318                        tdb_off_t prev, unsigned int flist, unsigned int bucket)
319 {
320         if (frec_magic(frec) != TDB_FREE_MAGIC) {
321                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
322                          "tdb_check: offset %llu bad magic 0x%llx\n",
323                          (long long)off, (long long)frec->magic_and_prev);
324                 return false;
325         }
326         if (frec_flist(frec) != flist) {
327                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
328                          "tdb_check: offset %llu bad freelist %u\n",
329                          (long long)off, frec_flist(frec));
330                 return false;
331         }
332
333         if (tdb->methods->oob(tdb, off
334                               + frec_len(frec) + sizeof(struct tdb_used_record),
335                               false))
336                 return false;
337         if (size_to_bucket(frec_len(frec)) != bucket) {
338                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
339                          "tdb_check: offset %llu in wrong bucket %u vs %u\n",
340                          (long long)off,
341                          bucket, size_to_bucket(frec_len(frec)));
342                 return false;
343         }
344         if (prev != frec_prev(frec)) {
345                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
346                          "tdb_check: offset %llu bad prev %llu vs %llu\n",
347                          (long long)off,
348                          (long long)prev, (long long)frec_len(frec));
349                 return false;
350         }
351         return true;
352 }
353                        
354 static bool check_free_list(struct tdb_context *tdb,
355                             tdb_off_t flist_off,
356                             unsigned flist_num,
357                             tdb_off_t free[],
358                             size_t num_free,
359                             size_t *num_found)
360 {
361         struct tdb_freelist flist;
362         tdb_off_t h;
363         unsigned int i;
364
365         if (tdb_read_convert(tdb, flist_off, &flist, sizeof(flist)) == -1)
366                 return false;
367
368         if (rec_magic(&flist.hdr) != TDB_MAGIC
369             || rec_key_length(&flist.hdr) != 0
370             || rec_data_length(&flist.hdr) != sizeof(flist) - sizeof(flist.hdr)
371             || rec_hash(&flist.hdr) != 1) {
372                 tdb->log(tdb, TDB_DEBUG_ERROR,
373                          tdb->log_priv,
374                          "tdb_check: Invalid header on free list\n");
375                 return false;
376         }
377
378         for (i = 0; i < TDB_FREE_BUCKETS; i++) {
379                 tdb_off_t off, prev = 0, *p;
380                 struct tdb_free_record f;
381
382                 h = bucket_off(flist_off, i);
383                 for (off = tdb_read_off(tdb, h); off; off = f.next) {
384                         if (off == TDB_OFF_ERR)
385                                 return false;
386                         if (tdb_read_convert(tdb, off, &f, sizeof(f)))
387                                 return false;
388                         if (!check_free(tdb, off, &f, prev, flist_num, i))
389                                 return false;
390
391                         /* FIXME: Check hash bits */
392                         p = asearch(&off, free, num_free, off_cmp);
393                         if (!p) {
394                                 tdb->log(tdb, TDB_DEBUG_ERROR,
395                                          tdb->log_priv,
396                                          "tdb_check: Invalid offset"
397                                          " %llu in free table\n",
398                                          (long long)off);
399                                 return false;
400                         }
401                         /* Mark it invalid. */
402                         *p ^= 1;
403                         (*num_found)++;
404                         prev = off;
405                 }
406         }
407         return true;
408 }
409
410 /* Slow, but should be very rare. */
411 size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
412 {
413         size_t len;
414
415         for (len = 0; off + len < tdb->map_size; len++) {
416                 char c;
417                 if (tdb->methods->read(tdb, off, &c, 1))
418                         return 0;
419                 if (c != 0 && c != 0x43)
420                         break;
421         }
422         return len;
423 }
424
425 static bool check_linear(struct tdb_context *tdb,
426                          tdb_off_t **used, size_t *num_used,
427                          tdb_off_t **free, size_t *num_free,
428                          tdb_off_t recovery)
429 {
430         tdb_off_t off;
431         tdb_len_t len;
432         bool found_recovery = false;
433
434         for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
435                 union {
436                         struct tdb_used_record u;
437                         struct tdb_free_record f;
438                         struct tdb_recovery_record r;
439                 } rec;
440                 /* r is larger: only get that if we need to. */
441                 if (tdb_read_convert(tdb, off, &rec, sizeof(rec.f)) == -1)
442                         return false;
443
444                 /* If we crash after ftruncate, we can get zeroes or fill. */
445                 if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
446                     || rec.r.magic ==  0x4343434343434343ULL) {
447                         if (tdb_read_convert(tdb, off, &rec, sizeof(rec.r)))
448                                 return false;
449
450                         if (recovery == off) {
451                                 found_recovery = true;
452                                 len = sizeof(rec.r) + rec.r.max_len;
453                         } else {
454                                 len = dead_space(tdb, off);
455                                 if (len < sizeof(rec.r)) {
456                                         tdb->log(tdb, TDB_DEBUG_ERROR,
457                                                  tdb->log_priv,
458                                                  "tdb_check: invalid dead space"
459                                                  " at %zu\n", (size_t)off);
460                                         return false;
461                                 }
462
463                                 tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
464                                          "Dead space at %zu-%zu (of %zu)\n",
465                                          (size_t)off, (size_t)(off + len),
466                                          (size_t)tdb->map_size);
467                         }
468                 } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
469                         if (tdb_read_convert(tdb, off, &rec, sizeof(rec.r)))
470                                 return false;
471                         if (recovery != off) {
472                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
473                                          "tdb_check: unexpected recovery"
474                                          " record at offset %zu\n",
475                                          (size_t)off);
476                                 return false;
477                         }
478                         if (rec.r.len > rec.r.max_len) {
479                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
480                                          "tdb_check: invalid recovery length"
481                                          " %zu\n", (size_t)rec.r.len);
482                                 return false;
483                         }
484                         if (rec.r.eof > tdb->map_size) {
485                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
486                                          "tdb_check: invalid old EOF"
487                                          " %zu\n", (size_t)rec.r.eof);
488                                 return false;
489                         }
490                         found_recovery = true;
491                         len = sizeof(rec.r) + rec.r.max_len;
492                 } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC
493                            || frec_magic(&rec.f) == TDB_COALESCING_MAGIC) {
494                         len = sizeof(rec.u) + frec_len(&rec.f);
495                         if (off + len > tdb->map_size) {
496                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
497                                          "tdb_check: free overlength %llu"
498                                          " at offset %llu\n",
499                                          (long long)len, (long long)off);
500                                 return false;
501                         }
502                         /* This record is free! */
503                         if (frec_magic(&rec.f) == TDB_FREE_MAGIC
504                             && !append(free, num_free, off))
505                                 return false;
506                 } else {
507                         uint64_t klen, dlen, extra;
508
509                         /* This record is used! */
510                         if (rec_magic(&rec.u) != TDB_MAGIC) {
511                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
512                                          "tdb_check: Bad magic 0x%llx"
513                                          " at offset %llu\n",
514                                          (long long)rec_magic(&rec.u),
515                                          (long long)off);
516                                 return false;
517                         }
518
519                         if (!append(used, num_used, off))
520                                 return false;
521
522                         klen = rec_key_length(&rec.u);
523                         dlen = rec_data_length(&rec.u);
524                         extra = rec_extra_padding(&rec.u);
525
526                         len = sizeof(rec.u) + klen + dlen + extra;
527                         if (off + len > tdb->map_size) {
528                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
529                                          "tdb_check: used overlength %llu"
530                                          " at offset %llu\n",
531                                          (long long)len, (long long)off);
532                                 return false;
533                         }
534
535                         if (len < sizeof(rec.f)) {
536                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
537                                          "tdb_check: too short record %llu at"
538                                          " %llu\n",
539                                          (long long)len, (long long)off);
540                                 return false;
541                         }
542                 }
543         }
544
545         /* We must have found recovery area if there was one. */
546         if (recovery != 0 && !found_recovery) {
547                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
548                          "tdb_check: expected a recovery area at %zu\n",
549                          (size_t)recovery);
550                 return false;
551         }
552
553         return true;
554 }
555
556 int tdb_check(struct tdb_context *tdb,
557               int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
558               void *private_data)
559 {
560         tdb_off_t *free = NULL, *used = NULL, flist, recovery;
561         size_t num_free = 0, num_used = 0, num_found = 0, num_flists = 0;
562
563         if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
564                 return -1;
565
566         if (tdb_lock_expand(tdb, F_RDLCK) != 0) {
567                 tdb_allrecord_unlock(tdb, F_RDLCK);
568                 return -1;
569         }
570
571         if (!check_header(tdb, &recovery))
572                 goto fail;
573
574         /* First we do a linear scan, checking all records. */
575         if (!check_linear(tdb, &used, &num_used, &free, &num_free, recovery))
576                 goto fail;
577
578         for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) {
579                 if (flist == TDB_OFF_ERR)
580                         goto fail;
581                 if (!check_free_list(tdb, flist, num_flists, free, num_free,
582                                      &num_found))
583                         goto fail;
584                 num_flists++;
585         }
586
587         /* FIXME: Check key uniqueness? */
588         if (!check_hash(tdb, used, num_used, num_flists, check, private_data))
589                 goto fail;
590
591         if (num_found != num_free) {
592                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
593                          "tdb_check: Not all entries are in free table\n");
594                 return -1;
595         }
596
597         tdb_allrecord_unlock(tdb, F_RDLCK);
598         tdb_unlock_expand(tdb, F_RDLCK);
599         return 0;
600
601 fail:
602         tdb_allrecord_unlock(tdb, F_RDLCK);
603         tdb_unlock_expand(tdb, F_RDLCK);
604         return -1;
605 }