git.ozlabs.org Git - ccan/blob - ccan/tdb2/tdb1_check.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Rusty Russell             2009
   7
   8      ** NOTE! The following LGPL license applies to the tdb
   9      ** library. This does NOT imply that all of Samba is released
  10      ** under the LGPL
  11
  12    This library is free software; you can redistribute it and/or
  13    modify it under the terms of the GNU Lesser General Public
  14    License as published by the Free Software Foundation; either
  15    version 3 of the License, or (at your option) any later version.
  16
  17    This library is distributed in the hope that it will be useful,
  18    but WITHOUT ANY WARRANTY; without even the implied warranty of
  19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20    Lesser General Public License for more details.
  21
  22    You should have received a copy of the GNU Lesser General Public
  23    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  24 */
  25 #include "tdb1_private.h"
  26
  27 /* Since we opened it, these shouldn't fail unless it's recent corruption. */
  28 static bool tdb1_check_header(struct tdb1_context *tdb, tdb1_off_t *recovery)
  29 {
  30         struct tdb1_header hdr;
  31         uint32_t h1, h2;
  32
  33         if (tdb->methods->tdb1_read(tdb, 0, &hdr, sizeof(hdr), 0) == -1)
  34                 return false;
  35         if (strcmp(hdr.magic_food, TDB1_MAGIC_FOOD) != 0)
  36                 goto corrupt;
  37
  38         TDB1_CONV(hdr);
  39         if (hdr.version != TDB1_VERSION)
  40                 goto corrupt;
  41
  42         if (hdr.rwlocks != 0 && hdr.rwlocks != TDB1_HASH_RWLOCK_MAGIC)
  43                 goto corrupt;
  44
  45         tdb1_header_hash(tdb, &h1, &h2);
  46         if (hdr.magic1_hash && hdr.magic2_hash &&
  47             (hdr.magic1_hash != h1 || hdr.magic2_hash != h2))
  48                 goto corrupt;
  49
  50         if (hdr.hash_size == 0)
  51                 goto corrupt;
  52
  53         if (hdr.hash_size != tdb->header.hash_size)
  54                 goto corrupt;
  55
  56         if (hdr.recovery_start != 0 &&
  57             hdr.recovery_start < TDB1_DATA_START(tdb->header.hash_size))
  58                 goto corrupt;
  59
  60         *recovery = hdr.recovery_start;
  61         return true;
  62
  63 corrupt:
  64         tdb->ecode = TDB1_ERR_CORRUPT;
  65         TDB1_LOG((tdb, TDB1_DEBUG_ERROR, "Header is corrupt\n"));
  66         return false;
  67 }
  68
  69 /* Generic record header check. */
  70 static bool tdb1_check_record(struct tdb1_context *tdb,
  71                              tdb1_off_t off,
  72                              const struct tdb1_record *rec)
  73 {
  74         tdb1_off_t tailer;
  75
  76         /* Check rec->next: 0 or points to record offset, aligned. */
  77         if (rec->next > 0 && rec->next < TDB1_DATA_START(tdb->header.hash_size)){
  78                 TDB1_LOG((tdb, TDB1_DEBUG_ERROR,
  79                          "Record offset %d too small next %d\n",
  80                          off, rec->next));
  81                 goto corrupt;
  82         }
  83         if (rec->next + sizeof(*rec) < rec->next) {
  84                 TDB1_LOG((tdb, TDB1_DEBUG_ERROR,
  85                          "Record offset %d too large next %d\n",
  86                          off, rec->next));
  87                 goto corrupt;
  88         }
  89         if ((rec->next % TDB1_ALIGNMENT) != 0) {
  90                 TDB1_LOG((tdb, TDB1_DEBUG_ERROR,
  91                          "Record offset %d misaligned next %d\n",
  92                          off, rec->next));
  93                 goto corrupt;
  94         }
  95         if (tdb->methods->tdb1_oob(tdb, rec->next+sizeof(*rec), 0))
  96                 goto corrupt;
  97
  98         /* Check rec_len: similar to rec->next, implies next record. */
  99         if ((rec->rec_len % TDB1_ALIGNMENT) != 0) {
 100                 TDB1_LOG((tdb, TDB1_DEBUG_ERROR,
 101                          "Record offset %d misaligned length %d\n",
 102                          off, rec->rec_len));
 103                 goto corrupt;
 104         }
 105         /* Must fit tailer. */
 106         if (rec->rec_len < sizeof(tailer)) {
 107                 TDB1_LOG((tdb, TDB1_DEBUG_ERROR,
 108                          "Record offset %d too short length %d\n",
 109                          off, rec->rec_len));
 110                 goto corrupt;
 111         }
 112         /* OOB allows "right at the end" access, so this works for last rec. */
 113         if (tdb->methods->tdb1_oob(tdb, off+sizeof(*rec)+rec->rec_len, 0))
 114                 goto corrupt;
 115
 116         /* Check tailer. */
 117         if (tdb1_ofs_read(tdb, off+sizeof(*rec)+rec->rec_len-sizeof(tailer),
 118                          &tailer) == -1)
 119                 goto corrupt;
 120         if (tailer != sizeof(*rec) + rec->rec_len) {
 121                 TDB1_LOG((tdb, TDB1_DEBUG_ERROR,
 122                          "Record offset %d invalid tailer\n", off));
 123                 goto corrupt;
 124         }
 125
 126         return true;
 127
 128 corrupt:
 129         tdb->ecode = TDB1_ERR_CORRUPT;
 130         return false;
 131 }
 132
 133 /* Grab some bytes: may copy if can't use mmap.
 134    Caller has already done bounds check. */
 135 static TDB1_DATA get_bytes(struct tdb1_context *tdb,
 136                           tdb1_off_t off, tdb1_len_t len)
 137 {
 138         TDB1_DATA d;
 139
 140         d.dsize = len;
 141
 142         if (tdb->transaction == NULL && tdb->map_ptr != NULL)
 143                 d.dptr = (unsigned char *)tdb->map_ptr + off;
 144         else
 145                 d.dptr = tdb1_alloc_read(tdb, off, d.dsize);
 146         return d;
 147 }
 148
 149 /* Frees data if we're not able to simply use mmap. */
 150 static void put_bytes(struct tdb1_context *tdb, TDB1_DATA d)
 151 {
 152         if (tdb->transaction == NULL && tdb->map_ptr != NULL)
 153                 return;
 154         free(d.dptr);
 155 }
 156
 157 /* We use the excellent Jenkins lookup3 hash; this is based on hash_word2.
 158  * See: http://burtleburtle.net/bob/c/lookup3.c
 159  */
 160 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
 161 static void jhash(uint32_t key, uint32_t *pc, uint32_t *pb)
 162 {
 163         uint32_t a,b,c;
 164
 165         /* Set up the internal state */
 166         a = b = c = 0xdeadbeef + *pc;
 167         c += *pb;
 168         a += key;
 169         c ^= b; c -= rot(b,14);
 170         a ^= c; a -= rot(c,11);
 171         b ^= a; b -= rot(a,25);
 172         c ^= b; c -= rot(b,16);
 173         a ^= c; a -= rot(c,4);
 174         b ^= a; b -= rot(a,14);
 175         c ^= b; c -= rot(b,24);
 176         *pc=c; *pb=b;
 177 }
 178
 179 /*
 180   We want to check that all free records are in the free list
 181   (only once), and all free list entries are free records.  Similarly
 182   for each hash chain of used records.
 183
 184   Doing that naively (without walking hash chains, since we want to be
 185   linear) means keeping a list of records which have been seen in each
 186   hash chain, and another of records pointed to (ie. next pointers
 187   from records and the initial hash chain heads).  These two lists
 188   should be equal.  This will take 8 bytes per record, and require
 189   sorting at the end.
 190
 191   So instead, we record each offset in a bitmap such a way that
 192   recording it twice will cancel out.  Since each offset should appear
 193   exactly twice, the bitmap should be zero at the end.
 194
 195   The approach was inspired by Bloom Filters (see Wikipedia).  For
 196   each value, we flip K bits in a bitmap of size N.  The number of
 197   distinct arrangements is:
 198
 199         N! / (K! * (N-K)!)
 200
 201   Of course, not all arrangements are actually distinct, but testing
 202   shows this formula to be close enough.
 203
 204   So, if K == 8 and N == 256, the probability of two things flipping the same
 205   bits is 1 in 409,663,695,276,000.
 206
 207   Given that ldb uses a hash size of 10000, using 32 bytes per hash chain
 208   (320k) seems reasonable.
 209 */
 210 #define NUM_HASHES 8
 211 #define BITMAP_BITS 256
 212
 213 static void bit_flip(unsigned char bits[], unsigned int idx)
 214 {
 215         bits[idx / CHAR_BIT] ^= (1 << (idx % CHAR_BIT));
 216 }
 217
 218 /* We record offsets in a bitmap for the particular chain it should be in.  */
 219 static void record_offset(unsigned char bits[], tdb1_off_t off)
 220 {
 221         uint32_t h1 = off, h2 = 0;
 222         unsigned int i;
 223
 224         /* We get two good hash values out of jhash2, so we use both.  Then
 225          * we keep going to produce further hash values. */
 226         for (i = 0; i < NUM_HASHES / 2; i++) {
 227                 jhash(off, &h1, &h2);
 228                 bit_flip(bits, h1 % BITMAP_BITS);
 229                 bit_flip(bits, h2 % BITMAP_BITS);
 230                 h2++;
 231         }
 232 }
 233
 234 /* Check that an in-use record is valid. */
 235 static bool tdb1_check_used_record(struct tdb1_context *tdb,
 236                                   tdb1_off_t off,
 237                                   const struct tdb1_record *rec,
 238                                   unsigned char **hashes,
 239                                   int (*check)(TDB1_DATA, TDB1_DATA, void *),
 240                                   void *private_data)
 241 {
 242         TDB1_DATA key, data;
 243
 244         if (!tdb1_check_record(tdb, off, rec))
 245                 return false;
 246
 247         /* key + data + tailer must fit in record */
 248         if (rec->key_len + rec->data_len + sizeof(tdb1_off_t) > rec->rec_len) {
 249                 TDB1_LOG((tdb, TDB1_DEBUG_ERROR,
 250                          "Record offset %d too short for contents\n", off));
 251                 return false;
 252         }
 253
 254         key = get_bytes(tdb, off + sizeof(*rec), rec->key_len);
 255         if (!key.dptr)
 256                 return false;
 257
 258         if (tdb->hash_fn(&key) != rec->full_hash) {
 259                 TDB1_LOG((tdb, TDB1_DEBUG_ERROR,
 260                          "Record offset %d has incorrect hash\n", off));
 261                 goto fail_put_key;
 262         }
 263
 264         /* Mark this offset as a known value for this hash bucket. */
 265         record_offset(hashes[TDB1_BUCKET(rec->full_hash)+1], off);
 266         /* And similarly if the next pointer is valid. */
 267         if (rec->next)
 268                 record_offset(hashes[TDB1_BUCKET(rec->full_hash)+1], rec->next);
 269
 270         /* If they supply a check function and this record isn't dead,
 271            get data and feed it. */
 272         if (check && rec->magic != TDB1_DEAD_MAGIC) {
 273                 data = get_bytes(tdb, off + sizeof(*rec) + rec->key_len,
 274                                  rec->data_len);
 275                 if (!data.dptr)
 276                         goto fail_put_key;
 277
 278                 if (check(key, data, private_data) == -1)
 279                         goto fail_put_data;
 280                 put_bytes(tdb, data);
 281         }
 282
 283         put_bytes(tdb, key);
 284         return true;
 285
 286 fail_put_data:
 287         put_bytes(tdb, data);
 288 fail_put_key:
 289         put_bytes(tdb, key);
 290         return false;
 291 }
 292
 293 /* Check that an unused record is valid. */
 294 static bool tdb1_check_free_record(struct tdb1_context *tdb,
 295                                   tdb1_off_t off,
 296                                   const struct tdb1_record *rec,
 297                                   unsigned char **hashes)
 298 {
 299         if (!tdb1_check_record(tdb, off, rec))
 300                 return false;
 301
 302         /* Mark this offset as a known value for the free list. */
 303         record_offset(hashes[0], off);
 304         /* And similarly if the next pointer is valid. */
 305         if (rec->next)
 306                 record_offset(hashes[0], rec->next);
 307         return true;
 308 }
 309
 310 /* Slow, but should be very rare. */
 311 size_t tdb1_dead_space(struct tdb1_context *tdb, tdb1_off_t off)
 312 {
 313         size_t len;
 314
 315         for (len = 0; off + len < tdb->map_size; len++) {
 316                 char c;
 317                 if (tdb->methods->tdb1_read(tdb, off, &c, 1, 0))
 318                         return 0;
 319                 if (c != 0 && c != 0x42)
 320                         break;
 321         }
 322         return len;
 323 }
 324
 325 int tdb1_check(struct tdb1_context *tdb,
 326               int (*check)(TDB1_DATA key, TDB1_DATA data, void *private_data),
 327               void *private_data)
 328 {
 329         unsigned int h;
 330         unsigned char **hashes;
 331         tdb1_off_t off, recovery_start;
 332         struct tdb1_record rec;
 333         bool found_recovery = false;
 334         tdb1_len_t dead;
 335         bool locked;
 336
 337         /* Read-only databases use no locking at all: it's best-effort.
 338          * We may have a write lock already, so skip that case too. */
 339         if (tdb->read_only || tdb->allrecord_lock.count != 0) {
 340                 locked = false;
 341         } else {
 342                 if (tdb1_lockall_read(tdb) == -1)
 343                         return -1;
 344                 locked = true;
 345         }
 346
 347         /* Make sure we know true size of the underlying file. */
 348         tdb->methods->tdb1_oob(tdb, tdb->map_size + 1, 1);
 349
 350         /* Header must be OK: also gets us the recovery ptr, if any. */
 351         if (!tdb1_check_header(tdb, &recovery_start))
 352                 goto unlock;
 353
 354         /* We should have the whole header, too. */
 355         if (tdb->map_size < TDB1_DATA_START(tdb->header.hash_size)) {
 356                 tdb->ecode = TDB1_ERR_CORRUPT;
 357                 TDB1_LOG((tdb, TDB1_DEBUG_ERROR, "File too short for hashes\n"));
 358                 goto unlock;
 359         }
 360
 361         /* One big malloc: pointers then bit arrays. */
 362         hashes = (unsigned char **)calloc(
 363                         1, sizeof(hashes[0]) * (1+tdb->header.hash_size)
 364                         + BITMAP_BITS / CHAR_BIT * (1+tdb->header.hash_size));
 365         if (!hashes) {
 366                 tdb->ecode = TDB1_ERR_OOM;
 367                 goto unlock;
 368         }
 369
 370         /* Initialize pointers */
 371         hashes[0] = (unsigned char *)(&hashes[1+tdb->header.hash_size]);
 372         for (h = 1; h < 1+tdb->header.hash_size; h++)
 373                 hashes[h] = hashes[h-1] + BITMAP_BITS / CHAR_BIT;
 374
 375         /* Freelist and hash headers are all in a row: read them. */
 376         for (h = 0; h < 1+tdb->header.hash_size; h++) {
 377                 if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP + h*sizeof(tdb1_off_t),
 378                                  &off) == -1)
 379                         goto free;
 380                 if (off)
 381                         record_offset(hashes[h], off);
 382         }
 383
 384         /* For each record, read it in and check it's ok. */
 385         for (off = TDB1_DATA_START(tdb->header.hash_size);
 386              off < tdb->map_size;
 387              off += sizeof(rec) + rec.rec_len) {
 388                 if (tdb->methods->tdb1_read(tdb, off, &rec, sizeof(rec),
 389                                            TDB1_DOCONV()) == -1)
 390                         goto free;
 391                 switch (rec.magic) {
 392                 case TDB1_MAGIC:
 393                 case TDB1_DEAD_MAGIC:
 394                         if (!tdb1_check_used_record(tdb, off, &rec, hashes,
 395                                                    check, private_data))
 396                                 goto free;
 397                         break;
 398                 case TDB1_FREE_MAGIC:
 399                         if (!tdb1_check_free_record(tdb, off, &rec, hashes))
 400                                 goto free;
 401                         break;
 402                 /* If we crash after ftruncate, we can get zeroes or fill. */
 403                 case TDB1_RECOVERY_INVALID_MAGIC:
 404                 case 0x42424242:
 405                         if (recovery_start == off) {
 406                                 found_recovery = true;
 407                                 break;
 408                         }
 409                         dead = tdb1_dead_space(tdb, off);
 410                         if (dead < sizeof(rec))
 411                                 goto corrupt;
 412
 413                         TDB1_LOG((tdb, TDB1_DEBUG_ERROR,
 414                                  "Dead space at %d-%d (of %u)\n",
 415                                  off, off + dead, tdb->map_size));
 416                         rec.rec_len = dead - sizeof(rec);
 417                         break;
 418                 case TDB1_RECOVERY_MAGIC:
 419                         if (recovery_start != off) {
 420                                 TDB1_LOG((tdb, TDB1_DEBUG_ERROR,
 421                                          "Unexpected recovery record at offset %d\n",
 422                                          off));
 423                                 goto free;
 424                         }
 425                         found_recovery = true;
 426                         break;
 427                 default: ;
 428                 corrupt:
 429                         tdb->ecode = TDB1_ERR_CORRUPT;
 430                         TDB1_LOG((tdb, TDB1_DEBUG_ERROR,
 431                                  "Bad magic 0x%x at offset %d\n",
 432                                  rec.magic, off));
 433                         goto free;
 434                 }
 435         }
 436
 437         /* Now, hashes should all be empty: each record exists and is referred
 438          * to by one other. */
 439         for (h = 0; h < 1+tdb->header.hash_size; h++) {
 440                 unsigned int i;
 441                 for (i = 0; i < BITMAP_BITS / CHAR_BIT; i++) {
 442                         if (hashes[h][i] != 0) {
 443                                 tdb->ecode = TDB1_ERR_CORRUPT;
 444                                 TDB1_LOG((tdb, TDB1_DEBUG_ERROR,
 445                                          "Hashes do not match records\n"));
 446                                 goto free;
 447                         }
 448                 }
 449         }
 450
 451         /* We must have found recovery area if there was one. */
 452         if (recovery_start != 0 && !found_recovery) {
 453                 TDB1_LOG((tdb, TDB1_DEBUG_ERROR,
 454                          "Expected a recovery area at %u\n",
 455                          recovery_start));
 456                 goto free;
 457         }
 458
 459         free(hashes);
 460         if (locked) {
 461                 tdb1_unlockall_read(tdb);
 462         }
 463         return 0;
 464
 465 free:
 466         free(hashes);
 467 unlock:
 468         if (locked) {
 469                 tdb1_unlockall_read(tdb);
 470         }
 471         return -1;
 472 }