git.ozlabs.org Git - ccan/blob - ccan/tdb2/tdb1_check.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Rusty Russell             2009
   7
   8      ** NOTE! The following LGPL license applies to the tdb
   9      ** library. This does NOT imply that all of Samba is released
  10      ** under the LGPL
  11
  12    This library is free software; you can redistribute it and/or
  13    modify it under the terms of the GNU Lesser General Public
  14    License as published by the Free Software Foundation; either
  15    version 3 of the License, or (at your option) any later version.
  16
  17    This library is distributed in the hope that it will be useful,
  18    but WITHOUT ANY WARRANTY; without even the implied warranty of
  19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20    Lesser General Public License for more details.
  21
  22    You should have received a copy of the GNU Lesser General Public
  23    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  24 */
  25 #include "tdb1_private.h"
  26
  27 /* Since we opened it, these shouldn't fail unless it's recent corruption. */
  28 static bool tdb1_check_header(struct tdb_context *tdb, tdb1_off_t *recovery)
  29 {
  30         struct tdb1_header hdr;
  31         uint32_t h1, h2;
  32
  33         if (tdb->tdb1.io->tdb1_read(tdb, 0, &hdr, sizeof(hdr), 0) == -1)
  34                 return false;
  35         if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0)
  36                 goto corrupt;
  37
  38         TDB1_CONV(hdr);
  39         if (hdr.version != TDB1_VERSION)
  40                 goto corrupt;
  41
  42         if (hdr.rwlocks != 0 && hdr.rwlocks != TDB1_HASH_RWLOCK_MAGIC)
  43                 goto corrupt;
  44
  45         tdb1_header_hash(tdb, &h1, &h2);
  46         if (hdr.magic1_hash && hdr.magic2_hash &&
  47             (hdr.magic1_hash != h1 || hdr.magic2_hash != h2))
  48                 goto corrupt;
  49
  50         if (hdr.hash_size == 0)
  51                 goto corrupt;
  52
  53         if (hdr.hash_size != tdb->tdb1.header.hash_size)
  54                 goto corrupt;
  55
  56         if (hdr.recovery_start != 0 &&
  57             hdr.recovery_start < TDB1_DATA_START(tdb->tdb1.header.hash_size))
  58                 goto corrupt;
  59
  60         *recovery = hdr.recovery_start;
  61         return true;
  62
  63 corrupt:
  64         tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
  65                                 "Header is corrupt\n");
  66         return false;
  67 }
  68
  69 /* Generic record header check. */
  70 static bool tdb1_check_record(struct tdb_context *tdb,
  71                              tdb1_off_t off,
  72                              const struct tdb1_record *rec)
  73 {
  74         tdb1_off_t tailer;
  75
  76         /* Check rec->next: 0 or points to record offset, aligned. */
  77         if (rec->next > 0 && rec->next < TDB1_DATA_START(tdb->tdb1.header.hash_size)){
  78                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
  79                            "Record offset %d too small next %d\n",
  80                            off, rec->next);
  81                 goto corrupt;
  82         }
  83         if (rec->next + sizeof(*rec) < rec->next) {
  84                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
  85                            "Record offset %d too large next %d\n",
  86                            off, rec->next);
  87                 goto corrupt;
  88         }
  89         if ((rec->next % TDB1_ALIGNMENT) != 0) {
  90                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
  91                            "Record offset %d misaligned next %d\n",
  92                            off, rec->next);
  93                 goto corrupt;
  94         }
  95         if (tdb->tdb1.io->tdb1_oob(tdb, rec->next, sizeof(*rec), 0))
  96                 goto corrupt;
  97
  98         /* Check rec_len: similar to rec->next, implies next record. */
  99         if ((rec->rec_len % TDB1_ALIGNMENT) != 0) {
 100                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
 101                            "Record offset %d misaligned length %d\n",
 102                            off, rec->rec_len);
 103                 goto corrupt;
 104         }
 105         /* Must fit tailer. */
 106         if (rec->rec_len < sizeof(tailer)) {
 107                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
 108                            "Record offset %d too short length %d\n",
 109                            off, rec->rec_len);
 110                 goto corrupt;
 111         }
 112         /* OOB allows "right at the end" access, so this works for last rec. */
 113         if (tdb->tdb1.io->tdb1_oob(tdb, off, sizeof(*rec)+rec->rec_len, 0))
 114                 goto corrupt;
 115
 116         /* Check tailer. */
 117         if (tdb1_ofs_read(tdb, off+sizeof(*rec)+rec->rec_len-sizeof(tailer),
 118                          &tailer) == -1)
 119                 goto corrupt;
 120         if (tailer != sizeof(*rec) + rec->rec_len) {
 121                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
 122                            "Record offset %d invalid tailer\n", off);
 123                 goto corrupt;
 124         }
 125
 126         return true;
 127
 128 corrupt:
 129         tdb->last_error = TDB_ERR_CORRUPT;
 130         return false;
 131 }
 132
 133 /* Grab some bytes: may copy if can't use mmap.
 134    Caller has already done bounds check. */
 135 static TDB_DATA get_bytes(struct tdb_context *tdb,
 136                           tdb1_off_t off, tdb1_len_t len)
 137 {
 138         TDB_DATA d;
 139
 140         d.dsize = len;
 141
 142         if (tdb->tdb1.transaction == NULL && tdb->file->map_ptr != NULL)
 143                 d.dptr = (unsigned char *)tdb->file->map_ptr + off;
 144         else
 145                 d.dptr = tdb1_alloc_read(tdb, off, d.dsize);
 146         return d;
 147 }
 148
 149 /* Frees data if we're not able to simply use mmap. */
 150 static void put_bytes(struct tdb_context *tdb, TDB_DATA d)
 151 {
 152         if (tdb->tdb1.transaction == NULL && tdb->file->map_ptr != NULL)
 153                 return;
 154         free(d.dptr);
 155 }
 156
 157 /* We use the excellent Jenkins lookup3 hash; this is based on hash_word2.
 158  * See: http://burtleburtle.net/bob/c/lookup3.c
 159  */
 160 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
 161 static void jhash(uint32_t key, uint32_t *pc, uint32_t *pb)
 162 {
 163         uint32_t a,b,c;
 164
 165         /* Set up the internal state */
 166         a = b = c = 0xdeadbeef + *pc;
 167         c += *pb;
 168         a += key;
 169         c ^= b; c -= rot(b,14);
 170         a ^= c; a -= rot(c,11);
 171         b ^= a; b -= rot(a,25);
 172         c ^= b; c -= rot(b,16);
 173         a ^= c; a -= rot(c,4);
 174         b ^= a; b -= rot(a,14);
 175         c ^= b; c -= rot(b,24);
 176         *pc=c; *pb=b;
 177 }
 178
 179 /*
 180   We want to check that all free records are in the free list
 181   (only once), and all free list entries are free records.  Similarly
 182   for each hash chain of used records.
 183
 184   Doing that naively (without walking hash chains, since we want to be
 185   linear) means keeping a list of records which have been seen in each
 186   hash chain, and another of records pointed to (ie. next pointers
 187   from records and the initial hash chain heads).  These two lists
 188   should be equal.  This will take 8 bytes per record, and require
 189   sorting at the end.
 190
 191   So instead, we record each offset in a bitmap such a way that
 192   recording it twice will cancel out.  Since each offset should appear
 193   exactly twice, the bitmap should be zero at the end.
 194
 195   The approach was inspired by Bloom Filters (see Wikipedia).  For
 196   each value, we flip K bits in a bitmap of size N.  The number of
 197   distinct arrangements is:
 198
 199         N! / (K! * (N-K)!)
 200
 201   Of course, not all arrangements are actually distinct, but testing
 202   shows this formula to be close enough.
 203
 204   So, if K == 8 and N == 256, the probability of two things flipping the same
 205   bits is 1 in 409,663,695,276,000.
 206
 207   Given that ldb uses a hash size of 10000, using 32 bytes per hash chain
 208   (320k) seems reasonable.
 209 */
 210 #define NUM_HASHES 8
 211 #define BITMAP_BITS 256
 212
 213 static void bit_flip(unsigned char bits[], unsigned int idx)
 214 {
 215         bits[idx / CHAR_BIT] ^= (1 << (idx % CHAR_BIT));
 216 }
 217
 218 /* We record offsets in a bitmap for the particular chain it should be in.  */
 219 static void record_offset(unsigned char bits[], tdb1_off_t off)
 220 {
 221         uint32_t h1 = off, h2 = 0;
 222         unsigned int i;
 223
 224         /* We get two good hash values out of jhash2, so we use both.  Then
 225          * we keep going to produce further hash values. */
 226         for (i = 0; i < NUM_HASHES / 2; i++) {
 227                 jhash(off, &h1, &h2);
 228                 bit_flip(bits, h1 % BITMAP_BITS);
 229                 bit_flip(bits, h2 % BITMAP_BITS);
 230                 h2++;
 231         }
 232 }
 233
 234 /* Check that an in-use record is valid. */
 235 static bool tdb1_check_used_record(struct tdb_context *tdb,
 236                                   tdb1_off_t off,
 237                                   const struct tdb1_record *rec,
 238                                   unsigned char **hashes,
 239                                   enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA,
 240                                                           void *),
 241                                   void *private_data)
 242 {
 243         TDB_DATA key, data;
 244
 245         if (!tdb1_check_record(tdb, off, rec))
 246                 return false;
 247
 248         /* key + data + tailer must fit in record */
 249         if (rec->key_len + rec->data_len + sizeof(tdb1_off_t) > rec->rec_len) {
 250                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
 251                                         "Record offset %d too short for contents\n", off);
 252                 return false;
 253         }
 254
 255         key = get_bytes(tdb, off + sizeof(*rec), rec->key_len);
 256         if (!key.dptr)
 257                 return false;
 258
 259         if ((uint32_t)tdb_hash(tdb, key.dptr, key.dsize) != rec->full_hash) {
 260                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
 261                                         "Record offset %d has incorrect hash\n", off);
 262                 goto fail_put_key;
 263         }
 264
 265         /* Mark this offset as a known value for this hash bucket. */
 266         record_offset(hashes[TDB1_BUCKET(rec->full_hash)+1], off);
 267         /* And similarly if the next pointer is valid. */
 268         if (rec->next)
 269                 record_offset(hashes[TDB1_BUCKET(rec->full_hash)+1], rec->next);
 270
 271         /* If they supply a check function and this record isn't dead,
 272            get data and feed it. */
 273         if (check && rec->magic != TDB1_DEAD_MAGIC) {
 274                 enum TDB_ERROR ecode;
 275
 276                 data = get_bytes(tdb, off + sizeof(*rec) + rec->key_len,
 277                                  rec->data_len);
 278                 if (!data.dptr)
 279                         goto fail_put_key;
 280
 281                 ecode = check(key, data, private_data);
 282                 if (ecode != TDB_SUCCESS) {
 283                         tdb->last_error = ecode;
 284                         goto fail_put_data;
 285                 }
 286                 put_bytes(tdb, data);
 287         }
 288
 289         put_bytes(tdb, key);
 290         return true;
 291
 292 fail_put_data:
 293         put_bytes(tdb, data);
 294 fail_put_key:
 295         put_bytes(tdb, key);
 296         return false;
 297 }
 298
 299 /* Check that an unused record is valid. */
 300 static bool tdb1_check_free_record(struct tdb_context *tdb,
 301                                   tdb1_off_t off,
 302                                   const struct tdb1_record *rec,
 303                                   unsigned char **hashes)
 304 {
 305         if (!tdb1_check_record(tdb, off, rec))
 306                 return false;
 307
 308         /* Mark this offset as a known value for the free list. */
 309         record_offset(hashes[0], off);
 310         /* And similarly if the next pointer is valid. */
 311         if (rec->next)
 312                 record_offset(hashes[0], rec->next);
 313         return true;
 314 }
 315
 316 /* Slow, but should be very rare. */
 317 size_t tdb1_dead_space(struct tdb_context *tdb, tdb1_off_t off)
 318 {
 319         size_t len;
 320
 321         for (len = 0; off + len < tdb->file->map_size; len++) {
 322                 char c;
 323                 if (tdb->tdb1.io->tdb1_read(tdb, off, &c, 1, 0))
 324                         return 0;
 325                 if (c != 0 && c != 0x42)
 326                         break;
 327         }
 328         return len;
 329 }
 330
 331 int tdb1_check(struct tdb_context *tdb,
 332                enum TDB_ERROR (*check)(TDB_DATA key, TDB_DATA data, void *),
 333                void *private_data)
 334 {
 335         unsigned int h;
 336         unsigned char **hashes;
 337         tdb1_off_t off, recovery_start;
 338         struct tdb1_record rec;
 339         bool found_recovery = false;
 340         tdb1_len_t dead;
 341         bool locked;
 342         size_t alloc_len;
 343
 344         /* We may have a write lock already, so don't re-lock. */
 345         if (tdb->file->allrecord_lock.count != 0) {
 346                 locked = false;
 347         } else {
 348                 if (tdb_lockall_read(tdb) != TDB_SUCCESS)
 349                         return -1;
 350                 locked = true;
 351         }
 352
 353         /* Make sure we know true size of the underlying file. */
 354         tdb->tdb1.io->tdb1_oob(tdb, tdb->file->map_size, 1, 1);
 355
 356         /* Header must be OK: also gets us the recovery ptr, if any. */
 357         if (!tdb1_check_header(tdb, &recovery_start))
 358                 goto unlock;
 359
 360         /* We should have the whole header, too. */
 361         if (tdb->file->map_size < TDB1_DATA_START(tdb->tdb1.header.hash_size)) {
 362                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
 363                                         "File too short for hashes\n");
 364                 goto unlock;
 365         }
 366
 367         /* One big malloc: pointers then bit arrays. */
 368         alloc_len = sizeof(hashes[0]) * (1+tdb->tdb1.header.hash_size)
 369                 + BITMAP_BITS / CHAR_BIT * (1+tdb->tdb1.header.hash_size);
 370         hashes = (unsigned char **)calloc(1, alloc_len);
 371         if (!hashes) {
 372                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
 373                                              "tdb_check: could not allocate %zu",
 374                                              alloc_len);
 375                 goto unlock;
 376         }
 377
 378         /* Initialize pointers */
 379         hashes[0] = (unsigned char *)(&hashes[1+tdb->tdb1.header.hash_size]);
 380         for (h = 1; h < 1+tdb->tdb1.header.hash_size; h++)
 381                 hashes[h] = hashes[h-1] + BITMAP_BITS / CHAR_BIT;
 382
 383         /* Freelist and hash headers are all in a row: read them. */
 384         for (h = 0; h < 1+tdb->tdb1.header.hash_size; h++) {
 385                 if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP + h*sizeof(tdb1_off_t),
 386                                  &off) == -1)
 387                         goto free;
 388                 if (off)
 389                         record_offset(hashes[h], off);
 390         }
 391
 392         /* For each record, read it in and check it's ok. */
 393         for (off = TDB1_DATA_START(tdb->tdb1.header.hash_size);
 394              off < tdb->file->map_size;
 395              off += sizeof(rec) + rec.rec_len) {
 396                 if (tdb->tdb1.io->tdb1_read(tdb, off, &rec, sizeof(rec),
 397                                            TDB1_DOCONV()) == -1)
 398                         goto free;
 399                 switch (rec.magic) {
 400                 case TDB1_MAGIC:
 401                 case TDB1_DEAD_MAGIC:
 402                         if (!tdb1_check_used_record(tdb, off, &rec, hashes,
 403                                                    check, private_data))
 404                                 goto free;
 405                         break;
 406                 case TDB1_FREE_MAGIC:
 407                         if (!tdb1_check_free_record(tdb, off, &rec, hashes))
 408                                 goto free;
 409                         break;
 410                 /* If we crash after ftruncate, we can get zeroes or fill. */
 411                 case TDB1_RECOVERY_INVALID_MAGIC:
 412                 case 0x42424242:
 413                         if (recovery_start == off) {
 414                                 found_recovery = true;
 415                                 break;
 416                         }
 417                         dead = tdb1_dead_space(tdb, off);
 418                         if (dead < sizeof(rec))
 419                                 goto corrupt;
 420
 421                         tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
 422                                    "Dead space at %d-%d (of %u)\n",
 423                                    off, off + dead, tdb->file->map_size);
 424                         rec.rec_len = dead - sizeof(rec);
 425                         break;
 426                 case TDB1_RECOVERY_MAGIC:
 427                         if (recovery_start != off) {
 428                                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
 429                                                         "Unexpected recovery record at offset %d\n",
 430                                                         off);
 431                                 goto free;
 432                         }
 433                         found_recovery = true;
 434                         break;
 435                 default: ;
 436                 corrupt:
 437                         tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
 438                                                 "Bad magic 0x%x at offset %d\n",
 439                                                 rec.magic, off);
 440                         goto free;
 441                 }
 442         }
 443
 444         /* Now, hashes should all be empty: each record exists and is referred
 445          * to by one other. */
 446         for (h = 0; h < 1+tdb->tdb1.header.hash_size; h++) {
 447                 unsigned int i;
 448                 for (i = 0; i < BITMAP_BITS / CHAR_BIT; i++) {
 449                         if (hashes[h][i] != 0) {
 450                                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
 451                                                         "Hashes do not match records\n");
 452                                 goto free;
 453                         }
 454                 }
 455         }
 456
 457         /* We must have found recovery area if there was one. */
 458         if (recovery_start != 0 && !found_recovery) {
 459                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
 460                                         "Expected a recovery area at %u\n",
 461                                         recovery_start);
 462                 goto free;
 463         }
 464
 465         free(hashes);
 466         if (locked) {
 467                 tdb_unlockall_read(tdb);
 468         }
 469         return 0;
 470
 471 free:
 472         free(hashes);
 473 unlock:
 474         if (locked) {
 475                 tdb_unlockall_read(tdb);
 476         }
 477         return -1;
 478 }