]> git.ozlabs.org Git - ccan/blob - ccan/tdb2/check.c
tdb2: shrink free header from 32 to 24 bytes.
[ccan] / ccan / tdb2 / check.c
1  /* 
2    Trivial Database 2: free list/block handling
3    Copyright (C) Rusty Russell 2010
4    
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
21
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
24 {
25         tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
26         if (!new)
27                 return false;
28         new[(*num)++] = off;
29         *arr = new;
30         return true;
31 }
32
33 static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
34 {
35         uint64_t hash_test;
36         struct tdb_header hdr;
37
38         if (tdb_read_convert(tdb, 0, &hdr, sizeof(hdr)) == -1)
39                 return false;
40         /* magic food should not be converted, so convert back. */
41         tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
42
43         hash_test = TDB_HASH_MAGIC;
44         hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
45         if (hdr.hash_test != hash_test) {
46                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
47                          "check: hash test %llu should be %llu\n",
48                          (long long)hdr.hash_test,
49                          (long long)hash_test);
50                 return false;
51         }
52
53         if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
54                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
55                          "check: bad magic '%.*s'\n",
56                          (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
57                 return false;
58         }
59
60         *recovery = hdr.recovery;
61         if (*recovery) {
62                 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
63                         tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
64                                  "tdb_check: invalid recovery offset %zu\n",
65                                  (size_t)*recovery);
66                         return false;
67                 }
68         }
69
70         /* Don't check reserved: they *can* be used later. */
71         return true;
72 }
73
74 static bool check_hash_tree(struct tdb_context *tdb,
75                             tdb_off_t off, unsigned int group_bits,
76                             uint64_t hprefix,
77                             unsigned hprefix_bits,
78                             tdb_off_t used[],
79                             size_t num_used,
80                             size_t *num_found,
81                             int (*check)(TDB_DATA, TDB_DATA, void *),
82                             void *private_data);
83
84 static bool check_hash_record(struct tdb_context *tdb,
85                               tdb_off_t off,
86                               uint64_t hprefix,
87                               unsigned hprefix_bits,
88                               tdb_off_t used[],
89                               size_t num_used,
90                               size_t *num_found,
91                               int (*check)(TDB_DATA, TDB_DATA, void *),
92                               void *private_data)
93 {
94         struct tdb_used_record rec;
95
96         if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
97                 return false;
98
99         if (rec_data_length(&rec)
100             != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
101                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
102                          "tdb_check: Bad hash table length %llu vs %llu\n",
103                          (long long)rec_data_length(&rec),
104                          (long long)sizeof(tdb_off_t)<<TDB_SUBLEVEL_HASH_BITS);
105                 return false;
106         }
107         if (rec_key_length(&rec) != 0) {
108                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
109                          "tdb_check: Bad hash table key length %llu\n",
110                          (long long)rec_key_length(&rec));
111                 return false;
112         }
113         if (rec_hash(&rec) != 0) {
114                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
115                          "tdb_check: Bad hash table hash value %llu\n",
116                          (long long)rec_hash(&rec));
117                 return false;
118         }
119
120         off += sizeof(rec);
121         return check_hash_tree(tdb, off,
122                                TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
123                                hprefix, hprefix_bits,
124                                used, num_used, num_found, check, private_data);
125 }
126
127 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
128 {
129         /* Can overflow an int. */
130         return *a > *b ? 1
131                 : *a < *b ? -1
132                 : 0;
133 }
134
135 static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
136 {
137         *used += num;
138
139         return (h >> (64 - *used)) & ((1U << num) - 1);
140 }
141
142 static bool check_hash_tree(struct tdb_context *tdb,
143                             tdb_off_t off, unsigned int group_bits,
144                             uint64_t hprefix,
145                             unsigned hprefix_bits,
146                             tdb_off_t used[],
147                             size_t num_used,
148                             size_t *num_found,
149                             int (*check)(TDB_DATA, TDB_DATA, void *),
150                             void *private_data)
151 {
152         unsigned int g, b;
153         const tdb_off_t *hash;
154         struct tdb_used_record rec;
155
156         hash = tdb_access_read(tdb, off,
157                                sizeof(tdb_off_t)
158                                << (group_bits + TDB_HASH_GROUP_BITS),
159                                true);
160         if (!hash)
161                 return false;
162
163         for (g = 0; g < (1 << group_bits); g++) {
164                 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
165                 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
166                         unsigned int bucket, i, used_bits;
167                         uint64_t h;
168                         tdb_off_t *p;
169                         if (group[b] == 0)
170                                 continue;
171
172                         off = group[b] & TDB_OFF_MASK;
173                         p = asearch(&off, used, num_used, off_cmp);
174                         if (!p) {
175                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
176                                          "tdb_check: Invalid offset %llu "
177                                          "in hash\n",
178                                          (long long)off);
179                                 goto fail;
180                         }
181                         /* Mark it invalid. */
182                         *p ^= 1;
183                         (*num_found)++;
184
185                         if (is_subhash(group[b])) {
186                                 uint64_t subprefix;
187                                 subprefix = (hprefix 
188                                      << (group_bits + TDB_HASH_GROUP_BITS))
189                                         + g * (1 << TDB_HASH_GROUP_BITS) + b;
190
191                                 if (!check_hash_record(tdb,
192                                                group[b] & TDB_OFF_MASK,
193                                                subprefix,
194                                                hprefix_bits
195                                                        + group_bits
196                                                        + TDB_HASH_GROUP_BITS,
197                                                used, num_used, num_found,
198                                                check, private_data))
199                                         goto fail;
200                                 continue;
201                         }
202                         /* A normal entry */
203
204                         /* Does it belong here at all? */
205                         h = hash_record(tdb, off);
206                         used_bits = 0;
207                         if (get_bits(h, hprefix_bits, &used_bits) != hprefix
208                             && hprefix_bits) {
209                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
210                                          "check: bad hash placement"
211                                          " 0x%llx vs 0x%llx\n",
212                                          (long long)h, (long long)hprefix);
213                                 goto fail;
214                         }
215
216                         /* Does it belong in this group? */
217                         if (get_bits(h, group_bits, &used_bits) != g) {
218                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
219                                          "check: bad group %llu vs %u\n",
220                                          (long long)h, g);
221                                 goto fail;
222                         }
223
224                         /* Are bucket bits correct? */
225                         bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
226                         if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
227                             != bucket) {
228                                 used_bits -= TDB_HASH_GROUP_BITS;
229                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
230                                          "check: bad bucket %u vs %u\n",
231                                          (unsigned)get_bits(h,
232                                                             TDB_HASH_GROUP_BITS,
233                                                             &used_bits),
234                                          bucket);
235                                 goto fail;
236                         }
237
238                         /* There must not be any zero entries between
239                          * the bucket it belongs in and this one! */
240                         for (i = bucket;
241                              i != b;
242                              i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
243                                 if (group[i] == 0) {
244                                         tdb->log(tdb, TDB_DEBUG_ERROR,
245                                                  tdb->log_priv,
246                                                  "check: bad group placement"
247                                                  " %u vs %u\n",
248                                                  b, bucket);
249                                         goto fail;
250                                 }
251                         }
252
253                         if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
254                                 goto fail;
255
256                         /* Bottom bits must match header. */
257                         if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
258                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
259                                          "tdb_check: Bad hash magic at"
260                                          " offset %llu (0x%llx vs 0x%llx)\n",
261                                          (long long)off,
262                                          (long long)h,
263                                          (long long)rec_hash(&rec));
264                                 goto fail;
265                         }
266
267                         if (check) {
268                                 TDB_DATA key, data;
269                                 key.dsize = rec_key_length(&rec);
270                                 data.dsize = rec_data_length(&rec);
271                                 key.dptr = (void *)tdb_access_read(tdb,
272                                                    off + sizeof(rec),
273                                                    key.dsize + data.dsize,
274                                                    false);
275                                 if (!key.dptr)
276                                         goto fail;
277                                 data.dptr = key.dptr + key.dsize;
278                                 if (check(key, data, private_data) != 0)
279                                         goto fail;
280                                 tdb_access_release(tdb, key.dptr);
281                         }
282                 }
283         }
284         tdb_access_release(tdb, hash);
285         return true;
286
287 fail:
288         tdb_access_release(tdb, hash);
289         return false;
290 }
291
292 static bool check_hash(struct tdb_context *tdb,
293                        tdb_off_t used[],
294                        size_t num_used, size_t num_flists,
295                        int (*check)(TDB_DATA, TDB_DATA, void *),
296                        void *private_data)
297 {
298         /* Free lists also show up as used. */
299         size_t num_found = num_flists;
300
301         if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
302                              TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
303                              0, 0, used, num_used, &num_found,
304                              check, private_data))
305                 return false;
306
307         if (num_found != num_used) {
308                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
309                          "tdb_check: Not all entries are in hash\n");
310                 return false;
311         }
312         return true;
313 }
314
315 static bool check_free(struct tdb_context *tdb,
316                        tdb_off_t off,
317                        const struct tdb_free_record *frec,
318                        tdb_off_t prev, unsigned int flist, unsigned int bucket)
319 {
320         if (frec_magic(frec) != TDB_FREE_MAGIC) {
321                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
322                          "tdb_check: offset %llu bad magic 0x%llx\n",
323                          (long long)off, (long long)frec->magic_and_prev);
324                 return false;
325         }
326         if (frec_flist(frec) != flist) {
327                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
328                          "tdb_check: offset %llu bad freelist %u\n",
329                          (long long)off, frec_flist(frec));
330                 return false;
331         }
332
333         if (tdb->methods->oob(tdb, off
334                               + frec_len(frec) + sizeof(struct tdb_used_record),
335                               false))
336                 return false;
337         if (size_to_bucket(frec_len(frec)) != bucket) {
338                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
339                          "tdb_check: offset %llu in wrong bucket %u vs %u\n",
340                          (long long)off,
341                          bucket, size_to_bucket(frec_len(frec)));
342                 return false;
343         }
344         if (prev != frec_prev(frec)) {
345                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
346                          "tdb_check: offset %llu bad prev %llu vs %llu\n",
347                          (long long)off,
348                          (long long)prev, (long long)frec_len(frec));
349                 return false;
350         }
351         return true;
352 }
353                        
354 static bool check_free_list(struct tdb_context *tdb,
355                             tdb_off_t flist_off,
356                             unsigned flist_num,
357                             tdb_off_t free[],
358                             size_t num_free,
359                             size_t *num_found)
360 {
361         struct tdb_freelist flist;
362         tdb_off_t h;
363         unsigned int i;
364
365         if (tdb_read_convert(tdb, flist_off, &flist, sizeof(flist)) == -1)
366                 return false;
367
368         if (rec_magic(&flist.hdr) != TDB_MAGIC
369             || rec_key_length(&flist.hdr) != 0
370             || rec_data_length(&flist.hdr) != sizeof(flist) - sizeof(flist.hdr)
371             || rec_hash(&flist.hdr) != 1) {
372                 tdb->log(tdb, TDB_DEBUG_ERROR,
373                          tdb->log_priv,
374                          "tdb_check: Invalid header on free list\n");
375                 return false;
376         }
377
378         for (i = 0; i < TDB_FREE_BUCKETS; i++) {
379                 tdb_off_t off, prev = 0, *p;
380                 struct tdb_free_record f;
381
382                 h = bucket_off(flist_off, i);
383                 for (off = tdb_read_off(tdb, h); off; off = f.next) {
384                         if (off == TDB_OFF_ERR)
385                                 return false;
386                         if (tdb_read_convert(tdb, off, &f, sizeof(f)))
387                                 return false;
388                         if (!check_free(tdb, off, &f, prev, flist_num, i))
389                                 return false;
390
391                         /* FIXME: Check hash bits */
392                         p = asearch(&off, free, num_free, off_cmp);
393                         if (!p) {
394                                 tdb->log(tdb, TDB_DEBUG_ERROR,
395                                          tdb->log_priv,
396                                          "tdb_check: Invalid offset"
397                                          " %llu in free table\n",
398                                          (long long)off);
399                                 return false;
400                         }
401                         /* Mark it invalid. */
402                         *p ^= 1;
403                         (*num_found)++;
404                         prev = off;
405                 }
406         }
407         return true;
408 }
409
410 /* Slow, but should be very rare. */
411 size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
412 {
413         size_t len;
414
415         for (len = 0; off + len < tdb->map_size; len++) {
416                 char c;
417                 if (tdb->methods->read(tdb, off, &c, 1))
418                         return 0;
419                 if (c != 0 && c != 0x43)
420                         break;
421         }
422         return len;
423 }
424
425 static bool check_linear(struct tdb_context *tdb,
426                          tdb_off_t **used, size_t *num_used,
427                          tdb_off_t **free, size_t *num_free,
428                          tdb_off_t recovery)
429 {
430         tdb_off_t off;
431         tdb_len_t len;
432         bool found_recovery = false;
433
434         for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
435                 union {
436                         struct tdb_used_record u;
437                         struct tdb_free_record f;
438                         struct tdb_recovery_record r;
439                 } pad, *p;
440                 /* r is larger: only get that if we need to. */
441                 p = tdb_get(tdb, off, &pad, sizeof(pad.f));
442                 if (!p)
443                         return false;
444
445                 /* If we crash after ftruncate, we can get zeroes or fill. */
446                 if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
447                     || p->r.magic ==  0x4343434343434343ULL) {
448                         p = tdb_get(tdb, off, &pad, sizeof(pad.r));
449                         if (!p)
450                                 return false;
451                         if (recovery == off) {
452                                 found_recovery = true;
453                                 len = sizeof(p->r) + p->r.max_len;
454                         } else {
455                                 len = dead_space(tdb, off);
456                                 if (len < sizeof(p->r)) {
457                                         tdb->log(tdb, TDB_DEBUG_ERROR,
458                                                  tdb->log_priv,
459                                                  "tdb_check: invalid dead space"
460                                                  " at %zu\n", (size_t)off);
461                                         return false;
462                                 }
463
464                                 tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
465                                          "Dead space at %zu-%zu (of %zu)\n",
466                                          (size_t)off, (size_t)(off + len),
467                                          (size_t)tdb->map_size);
468                         }
469                 } else if (p->r.magic == TDB_RECOVERY_MAGIC) {
470                         p = tdb_get(tdb, off, &pad, sizeof(pad.r));
471                         if (!p)
472                                 return false;
473                         if (recovery != off) {
474                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
475                                          "tdb_check: unexpected recovery"
476                                          " record at offset %zu\n",
477                                          (size_t)off);
478                                 return false;
479                         }
480                         if (p->r.len > p->r.max_len) {
481                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
482                                          "tdb_check: invalid recovery length"
483                                          " %zu\n", (size_t)p->r.len);
484                                 return false;
485                         }
486                         if (p->r.eof > tdb->map_size) {
487                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
488                                          "tdb_check: invalid old EOF"
489                                          " %zu\n", (size_t)p->r.eof);
490                                 return false;
491                         }
492                         found_recovery = true;
493                         len = sizeof(p->r) + p->r.max_len;
494                 } else if (frec_magic(&p->f) == TDB_FREE_MAGIC
495                            || frec_magic(&p->f) == TDB_COALESCING_MAGIC) {
496                         len = sizeof(p->u) + frec_len(&p->f);
497                         if (off + len > tdb->map_size) {
498                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
499                                          "tdb_check: free overlength %llu"
500                                          " at offset %llu\n",
501                                          (long long)len, (long long)off);
502                                 return false;
503                         }
504                         /* This record is free! */
505                         if (frec_magic(&p->f) == TDB_FREE_MAGIC
506                             && !append(free, num_free, off))
507                                 return false;
508                 } else {
509                         uint64_t klen, dlen, extra;
510
511                         /* This record is used! */
512                         if (rec_magic(&p->u) != TDB_MAGIC) {
513                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
514                                          "tdb_check: Bad magic 0x%llx"
515                                          " at offset %llu\n",
516                                          (long long)rec_magic(&p->u),
517                                          (long long)off);
518                                 return false;
519                         }
520
521                         if (!append(used, num_used, off))
522                                 return false;
523
524                         klen = rec_key_length(&p->u);
525                         dlen = rec_data_length(&p->u);
526                         extra = rec_extra_padding(&p->u);
527
528                         len = sizeof(p->u) + klen + dlen + extra;
529                         if (off + len > tdb->map_size) {
530                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
531                                          "tdb_check: used overlength %llu"
532                                          " at offset %llu\n",
533                                          (long long)len, (long long)off);
534                                 return false;
535                         }
536
537                         if (len < sizeof(p->f)) {
538                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
539                                          "tdb_check: too short record %llu at"
540                                          " %llu\n",
541                                          (long long)len, (long long)off);
542                                 return false;
543                         }
544                 }
545         }
546
547         /* We must have found recovery area if there was one. */
548         if (recovery != 0 && !found_recovery) {
549                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
550                          "tdb_check: expected a recovery area at %zu\n",
551                          (size_t)recovery);
552                 return false;
553         }
554
555         return true;
556 }
557
558 int tdb_check(struct tdb_context *tdb,
559               int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
560               void *private_data)
561 {
562         tdb_off_t *free = NULL, *used = NULL, flist, recovery;
563         size_t num_free = 0, num_used = 0, num_found = 0, num_flists = 0;
564
565         if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
566                 return -1;
567
568         if (tdb_lock_expand(tdb, F_RDLCK) != 0) {
569                 tdb_allrecord_unlock(tdb, F_RDLCK);
570                 return -1;
571         }
572
573         if (!check_header(tdb, &recovery))
574                 goto fail;
575
576         /* First we do a linear scan, checking all records. */
577         if (!check_linear(tdb, &used, &num_used, &free, &num_free, recovery))
578                 goto fail;
579
580         for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) {
581                 if (flist == TDB_OFF_ERR)
582                         goto fail;
583                 if (!check_free_list(tdb, flist, num_flists, free, num_free,
584                                      &num_found))
585                         goto fail;
586                 num_flists++;
587         }
588
589         /* FIXME: Check key uniqueness? */
590         if (!check_hash(tdb, used, num_used, num_flists, check, private_data))
591                 goto fail;
592
593         if (num_found != num_free) {
594                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
595                          "tdb_check: Not all entries are in free table\n");
596                 return -1;
597         }
598
599         tdb_allrecord_unlock(tdb, F_RDLCK);
600         tdb_unlock_expand(tdb, F_RDLCK);
601         return 0;
602
603 fail:
604         tdb_allrecord_unlock(tdb, F_RDLCK);
605         tdb_unlock_expand(tdb, F_RDLCK);
606         return -1;
607 }