tdb2: cancel transactions on tdb_close
[ccan] / ccan / tdb2 / check.c
1  /* 
2    Trivial Database 2: free list/block handling
3    Copyright (C) Rusty Russell 2010
4    
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/likely/likely.h>
20 #include <ccan/asearch/asearch.h>
21
22 /* We keep an ordered array of offsets. */
23 static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
24 {
25         tdb_off_t *new = realloc(*arr, (*num + 1) * sizeof(tdb_off_t));
26         if (!new)
27                 return false;
28         new[(*num)++] = off;
29         *arr = new;
30         return true;
31 }
32
33 static bool check_header(struct tdb_context *tdb, tdb_off_t *recovery)
34 {
35         uint64_t hash_test;
36         struct tdb_header hdr;
37
38         if (tdb_read_convert(tdb, 0, &hdr, sizeof(hdr)) == -1)
39                 return false;
40         /* magic food should not be converted, so convert back. */
41         tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
42
43         hash_test = TDB_HASH_MAGIC;
44         hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
45         if (hdr.hash_test != hash_test) {
46                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
47                          "check: hash test %llu should be %llu\n",
48                          (long long)hdr.hash_test,
49                          (long long)hash_test);
50                 return false;
51         }
52
53         if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
54                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
55                          "check: bad magic '%.*s'\n",
56                          (unsigned)sizeof(hdr.magic_food), hdr.magic_food);
57                 return false;
58         }
59
60         *recovery = hdr.recovery;
61         if (*recovery) {
62                 if (*recovery < sizeof(hdr) || *recovery > tdb->map_size) {
63                         tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
64                                  "tdb_check: invalid recovery offset %zu\n",
65                                  (size_t)*recovery);
66                         return false;
67                 }
68         }
69
70         /* Don't check reserved: they *can* be used later. */
71         return true;
72 }
73
74 static bool check_hash_tree(struct tdb_context *tdb,
75                             tdb_off_t off, unsigned int group_bits,
76                             uint64_t hprefix,
77                             unsigned hprefix_bits,
78                             tdb_off_t used[],
79                             size_t num_used,
80                             size_t *num_found);
81
82 static bool check_hash_record(struct tdb_context *tdb,
83                               tdb_off_t off,
84                               uint64_t hprefix,
85                               unsigned hprefix_bits,
86                               tdb_off_t used[],
87                               size_t num_used,
88                               size_t *num_found)
89 {
90         struct tdb_used_record rec;
91
92         if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
93                 return false;
94
95         if (rec_data_length(&rec)
96             != sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS) {
97                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
98                          "tdb_check: Bad hash table length %llu vs %llu\n",
99                          (long long)rec_data_length(&rec),
100                          (long long)sizeof(tdb_off_t)<<TDB_SUBLEVEL_HASH_BITS);
101                 return false;
102         }
103         if (rec_key_length(&rec) != 0) {
104                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
105                          "tdb_check: Bad hash table key length %llu\n",
106                          (long long)rec_key_length(&rec));
107                 return false;
108         }
109         if (rec_hash(&rec) != 0) {
110                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
111                          "tdb_check: Bad hash table hash value %llu\n",
112                          (long long)rec_hash(&rec));
113                 return false;
114         }
115
116         off += sizeof(rec);
117         return check_hash_tree(tdb, off,
118                                TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
119                                hprefix, hprefix_bits,
120                                used, num_used, num_found);
121 }
122
123 static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
124 {
125         /* Can overflow an int. */
126         return *a > *b ? 1
127                 : *a < *b ? -1
128                 : 0;
129 }
130
131 static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
132 {
133         *used += num;
134
135         return (h >> (64 - *used)) & ((1U << num) - 1);
136 }
137
138 static bool check_hash_tree(struct tdb_context *tdb,
139                             tdb_off_t off, unsigned int group_bits,
140                             uint64_t hprefix,
141                             unsigned hprefix_bits,
142                             tdb_off_t used[],
143                             size_t num_used,
144                             size_t *num_found)
145 {
146         unsigned int g, b;
147         const tdb_off_t *hash;
148         struct tdb_used_record rec;
149
150         hash = tdb_access_read(tdb, off,
151                                sizeof(tdb_off_t)
152                                << (group_bits + TDB_HASH_GROUP_BITS),
153                                true);
154         if (!hash)
155                 return false;
156
157         for (g = 0; g < (1 << group_bits); g++) {
158                 const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
159                 for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
160                         unsigned int bucket, i, used_bits;
161                         uint64_t h;
162                         tdb_off_t *p;
163                         if (group[b] == 0)
164                                 continue;
165
166                         off = group[b] & TDB_OFF_MASK;
167                         p = asearch(&off, used, num_used, off_cmp);
168                         if (!p) {
169                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
170                                          "tdb_check: Invalid offset %llu "
171                                          "in hash\n",
172                                          (long long)off);
173                                 goto fail;
174                         }
175                         /* Mark it invalid. */
176                         *p ^= 1;
177                         (*num_found)++;
178
179                         if (is_subhash(group[b])) {
180                                 uint64_t subprefix;
181                                 subprefix = (hprefix 
182                                      << (group_bits + TDB_HASH_GROUP_BITS))
183                                         + g * (1 << TDB_HASH_GROUP_BITS) + b;
184
185                                 if (!check_hash_record(tdb,
186                                                group[b] & TDB_OFF_MASK,
187                                                subprefix,
188                                                hprefix_bits
189                                                        + group_bits
190                                                        + TDB_HASH_GROUP_BITS,
191                                                used, num_used, num_found))
192                                         goto fail;
193                                 continue;
194                         }
195                         /* A normal entry */
196
197                         /* Does it belong here at all? */
198                         h = hash_record(tdb, off);
199                         used_bits = 0;
200                         if (get_bits(h, hprefix_bits, &used_bits) != hprefix
201                             && hprefix_bits) {
202                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
203                                          "check: bad hash placement"
204                                          " 0x%llx vs 0x%llx\n",
205                                          (long long)h, (long long)hprefix);
206                                 goto fail;
207                         }
208
209                         /* Does it belong in this group? */
210                         if (get_bits(h, group_bits, &used_bits) != g) {
211                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
212                                          "check: bad group %llu vs %u\n",
213                                          (long long)h, g);
214                                 goto fail;
215                         }
216
217                         /* Are bucket bits correct? */
218                         bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
219                         if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
220                             != bucket) {
221                                 used_bits -= TDB_HASH_GROUP_BITS;
222                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
223                                          "check: bad bucket %u vs %u\n",
224                                          (unsigned)get_bits(h,
225                                                             TDB_HASH_GROUP_BITS,
226                                                             &used_bits),
227                                          bucket);
228                                 goto fail;
229                         }
230
231                         /* There must not be any zero entries between
232                          * the bucket it belongs in and this one! */
233                         for (i = bucket;
234                              i != b;
235                              i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
236                                 if (group[i] == 0) {
237                                         tdb->log(tdb, TDB_DEBUG_ERROR,
238                                                  tdb->log_priv,
239                                                  "check: bad group placement"
240                                                  " %u vs %u\n",
241                                                  b, bucket);
242                                         goto fail;
243                                 }
244                         }
245
246                         if (tdb_read_convert(tdb, off, &rec, sizeof(rec)) == -1)
247                                 goto fail;
248
249                         /* Bottom bits must match header. */
250                         if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
251                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
252                                          "tdb_check: Bad hash magic at"
253                                          " offset %llu (0x%llx vs 0x%llx)\n",
254                                          (long long)off,
255                                          (long long)h,
256                                          (long long)rec_hash(&rec));
257                                 goto fail;
258                         }
259                 }
260         }
261         tdb_access_release(tdb, hash);
262         return true;
263
264 fail:
265         tdb_access_release(tdb, hash);
266         return false;
267 }
268
269 static bool check_hash(struct tdb_context *tdb,
270                        tdb_off_t used[],
271                        size_t num_used, size_t num_flists)
272 {
273         /* Free lists also show up as used. */
274         size_t num_found = num_flists;
275
276         if (!check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
277                              TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
278                              0, 0, used, num_used, &num_found))
279                 return false;
280
281         if (num_found != num_used) {
282                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
283                          "tdb_check: Not all entries are in hash\n");
284                 return false;
285         }
286         return true;
287 }
288
289 static bool check_free(struct tdb_context *tdb,
290                        tdb_off_t off,
291                        const struct tdb_free_record *frec,
292                        tdb_off_t prev, tdb_off_t flist_off, unsigned int bucket)
293 {
294         if (frec_magic(frec) != TDB_FREE_MAGIC) {
295                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
296                          "tdb_check: offset %llu bad magic 0x%llx\n",
297                          (long long)off, (long long)frec->magic_and_meta);
298                 return false;
299         }
300         if (frec_flist(frec) != flist_off) {
301                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
302                          "tdb_check: offset %llu bad freelist 0x%llx\n",
303                          (long long)off, (long long)frec_flist(frec));
304                 return false;
305         }
306
307         if (tdb->methods->oob(tdb, off
308                               + frec->data_len+sizeof(struct tdb_used_record),
309                               false))
310                 return false;
311         if (size_to_bucket(frec->data_len) != bucket) {
312                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
313                          "tdb_check: offset %llu in wrong bucket %u vs %u\n",
314                          (long long)off,
315                          bucket, size_to_bucket(frec->data_len));
316                 return false;
317         }
318         if (prev != frec->prev) {
319                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
320                          "tdb_check: offset %llu bad prev %llu vs %llu\n",
321                          (long long)off,
322                          (long long)prev, (long long)frec->prev);
323                 return false;
324         }
325         return true;
326 }
327                        
328 static bool check_free_list(struct tdb_context *tdb,
329                             tdb_off_t flist_off,
330                             tdb_off_t free[],
331                             size_t num_free,
332                             size_t *num_found)
333 {
334         struct tdb_freelist flist;
335         tdb_off_t h;
336         unsigned int i;
337
338         if (tdb_read_convert(tdb, flist_off, &flist, sizeof(flist)) == -1)
339                 return false;
340
341         if (rec_magic(&flist.hdr) != TDB_MAGIC
342             || rec_key_length(&flist.hdr) != 0
343             || rec_data_length(&flist.hdr) != sizeof(flist) - sizeof(flist.hdr)
344             || rec_hash(&flist.hdr) != 1) {
345                 tdb->log(tdb, TDB_DEBUG_ERROR,
346                          tdb->log_priv,
347                          "tdb_check: Invalid header on free list\n");
348                 return false;
349         }
350
351         for (i = 0; i < TDB_FREE_BUCKETS; i++) {
352                 tdb_off_t off, prev = 0, *p;
353                 struct tdb_free_record f;
354
355                 h = bucket_off(flist_off, i);
356                 for (off = tdb_read_off(tdb, h); off; off = f.next) {
357                         if (off == TDB_OFF_ERR)
358                                 return false;
359                         if (tdb_read_convert(tdb, off, &f, sizeof(f)))
360                                 return false;
361                         if (!check_free(tdb, off, &f, prev, flist_off, i))
362                                 return false;
363
364                         /* FIXME: Check hash bits */
365                         p = asearch(&off, free, num_free, off_cmp);
366                         if (!p) {
367                                 tdb->log(tdb, TDB_DEBUG_ERROR,
368                                          tdb->log_priv,
369                                          "tdb_check: Invalid offset"
370                                          " %llu in free table\n",
371                                          (long long)off);
372                                 return false;
373                         }
374                         /* Mark it invalid. */
375                         *p ^= 1;
376                         (*num_found)++;
377                         prev = off;
378                 }
379         }
380         return true;
381 }
382
383 /* Slow, but should be very rare. */
384 static size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
385 {
386         size_t len;
387
388         for (len = 0; off + len < tdb->map_size; len++) {
389                 char c;
390                 if (tdb->methods->read(tdb, off, &c, 1))
391                         return 0;
392                 if (c != 0 && c != 0x43)
393                         break;
394         }
395         return len;
396 }
397
398 static bool check_linear(struct tdb_context *tdb,
399                          tdb_off_t **used, size_t *num_used,
400                          tdb_off_t **free, size_t *num_free,
401                          tdb_off_t recovery)
402 {
403         tdb_off_t off;
404         tdb_len_t len;
405         bool found_recovery = false;
406
407         for (off = sizeof(struct tdb_header); off < tdb->map_size; off += len) {
408                 union {
409                         struct tdb_used_record u;
410                         struct tdb_free_record f;
411                         struct tdb_recovery_record r;
412                 } pad, *p;
413                 p = tdb_get(tdb, off, &pad, sizeof(pad));
414                 if (!p)
415                         return false;
416
417                 /* If we crash after ftruncate, we can get zeroes or fill. */
418                 if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
419                     || p->r.magic ==  0x4343434343434343ULL) {
420                         if (recovery == off) {
421                                 found_recovery = true;
422                                 len = sizeof(p->r) + p->r.max_len;
423                         } else {
424                                 len = dead_space(tdb, off);
425                                 if (len < sizeof(p->r)) {
426                                         tdb->log(tdb, TDB_DEBUG_ERROR,
427                                                  tdb->log_priv,
428                                                  "tdb_check: invalid dead space"
429                                                  " at %zu\n", (size_t)off);
430                                         return false;
431                                 }
432
433                                 tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
434                                          "Dead space at %zu-%zu (of %zu)\n",
435                                          (size_t)off, (size_t)(off + len),
436                                          (size_t)tdb->map_size);
437                         }
438                 } else if (p->r.magic == TDB_RECOVERY_MAGIC) {
439                         if (recovery != off) {
440                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
441                                          "tdb_check: unexpected recovery"
442                                          " record at offset %zu\n",
443                                          (size_t)off);
444                                 return false;
445                         }
446                         found_recovery = true;
447                         len = sizeof(p->r) + p->r.max_len;
448                 } else if (frec_magic(&p->f) == TDB_FREE_MAGIC
449                            || frec_magic(&p->f) == TDB_COALESCING_MAGIC) {
450                         len = sizeof(p->u) + p->f.data_len;
451                         if (off + len > tdb->map_size) {
452                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
453                                          "tdb_check: free overlength %llu"
454                                          " at offset %llu\n",
455                                          (long long)len, (long long)off);
456                                 return false;
457                         }
458                         /* This record is free! */
459                         if (frec_magic(&p->f) == TDB_FREE_MAGIC
460                             && !append(free, num_free, off))
461                                 return false;
462                 } else {
463                         uint64_t klen, dlen, extra;
464
465                         /* This record is used! */
466                         if (rec_magic(&p->u) != TDB_MAGIC) {
467                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
468                                          "tdb_check: Bad magic 0x%llx"
469                                          " at offset %llu\n",
470                                          (long long)rec_magic(&p->u),
471                                          (long long)off);
472                                 return false;
473                         }
474
475                         if (!append(used, num_used, off))
476                                 return false;
477
478                         klen = rec_key_length(&p->u);
479                         dlen = rec_data_length(&p->u);
480                         extra = rec_extra_padding(&p->u);
481
482                         len = sizeof(p->u) + klen + dlen + extra;
483                         if (off + len > tdb->map_size) {
484                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
485                                          "tdb_check: used overlength %llu"
486                                          " at offset %llu\n",
487                                          (long long)len, (long long)off);
488                                 return false;
489                         }
490
491                         if (len < sizeof(p->f)) {
492                                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
493                                          "tdb_check: too short record %llu at"
494                                          " %llu\n",
495                                          (long long)len, (long long)off);
496                                 return false;
497                         }
498                 }
499         }
500
501         /* We must have found recovery area if there was one. */
502         if (recovery != 0 && !found_recovery) {
503                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
504                          "tdb_check: expected a recovery area at %zu\n",
505                          (size_t)recovery);
506                 return false;
507         }
508
509         return true;
510 }
511
512 /* FIXME: call check() function. */
513 int tdb_check(struct tdb_context *tdb,
514               int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
515               void *private_data)
516 {
517         tdb_off_t *free = NULL, *used = NULL, flist, recovery;
518         size_t num_free = 0, num_used = 0, num_found = 0, num_flists = 0;
519
520         if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false) != 0)
521                 return -1;
522
523         if (tdb_lock_expand(tdb, F_RDLCK) != 0) {
524                 tdb_allrecord_unlock(tdb, F_RDLCK);
525                 return -1;
526         }
527
528         if (!check_header(tdb, &recovery))
529                 goto fail;
530
531         /* First we do a linear scan, checking all records. */
532         if (!check_linear(tdb, &used, &num_used, &free, &num_free, recovery))
533                 goto fail;
534
535         for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) {
536                 if (flist == TDB_OFF_ERR)
537                         goto fail;
538                 if (!check_free_list(tdb, flist, free, num_free, &num_found))
539                         goto fail;
540                 num_flists++;
541         }
542
543         /* FIXME: Check key uniqueness? */
544         if (!check_hash(tdb, used, num_used, num_flists))
545                 goto fail;
546
547         if (num_found != num_free) {
548                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
549                          "tdb_check: Not all entries are in free table\n");
550                 return false;
551         }
552
553         tdb_allrecord_unlock(tdb, F_RDLCK);
554         tdb_unlock_expand(tdb, F_RDLCK);
555         return 0;
556
557 fail:
558         tdb_allrecord_unlock(tdb, F_RDLCK);
559         tdb_unlock_expand(tdb, F_RDLCK);
560         return -1;
561 }