git.ozlabs.org Git - ccan/blob - ccan/tdb2/tdb1_tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
#include "tdb1_private.h"
#include <errno.h>
  29
/* Empty datum: a file-scope object with no initializer, hence
   zero-initialised. _tdb1_fetch() returns it when the lookup fails. */
TDB_DATA tdb1_null;
  31
  32 /*
  33   non-blocking increment of the tdb sequence number if the tdb has been opened using
  34   the TDB_SEQNUM flag
  35 */
  36 void tdb1_increment_seqnum_nonblock(struct tdb_context *tdb)
  37 {
  38         tdb1_off_t seqnum=0;
  39
  40         if (!(tdb->flags & TDB_SEQNUM)) {
  41                 return;
  42         }
  43
  44         /* we ignore errors from this, as we have no sane way of
  45            dealing with them.
  46         */
  47         tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum);
  48         seqnum++;
  49         tdb1_ofs_write(tdb, TDB1_SEQNUM_OFS, &seqnum);
  50 }
  51
  52 /*
  53   increment the tdb sequence number if the tdb has been opened using
  54   the TDB_SEQNUM flag
  55 */
  56 static void tdb1_increment_seqnum(struct tdb_context *tdb)
  57 {
  58         if (!(tdb->flags & TDB_SEQNUM)) {
  59                 return;
  60         }
  61
  62         if (tdb1_nest_lock(tdb, TDB1_SEQNUM_OFS, F_WRLCK,
  63                            TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) {
  64                 return;
  65         }
  66
  67         tdb1_increment_seqnum_nonblock(tdb);
  68
  69         tdb1_nest_unlock(tdb, TDB1_SEQNUM_OFS, F_WRLCK);
  70 }
  71
  72 static int tdb1_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
  73 {
  74         return memcmp(data.dptr, key.dptr, data.dsize);
  75 }
  76
  77 /* Returns 0 on fail.  On success, return offset of record, and fills
  78    in rec */
  79 static tdb1_off_t tdb1_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
  80                         struct tdb1_record *r)
  81 {
  82         tdb1_off_t rec_ptr;
  83
  84         /* read in the hash top */
  85         if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
  86                 return 0;
  87
  88         /* keep looking until we find the right record */
  89         while (rec_ptr) {
  90                 if (tdb1_rec_read(tdb, rec_ptr, r) == -1)
  91                         return 0;
  92
  93                 if (!TDB1_DEAD(r) && hash==r->full_hash
  94                     && key.dsize==r->key_len
  95                     && tdb1_parse_data(tdb, key, rec_ptr + sizeof(*r),
  96                                       r->key_len, tdb1_key_compare,
  97                                       NULL) == 0) {
  98                         return rec_ptr;
  99                 }
 100                 /* detect tight infinite loop */
 101                 if (rec_ptr == r->next) {
 102                         tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT,
 103                                                 TDB_LOG_ERROR,
 104                                                 "tdb1_find: loop detected.");
 105                         return 0;
 106                 }
 107                 rec_ptr = r->next;
 108         }
 109         tdb->last_error = TDB_ERR_NOEXIST;
 110         return 0;
 111 }
 112
 113 /* As tdb1_find, but if you succeed, keep the lock */
 114 tdb1_off_t tdb1_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
 115                            struct tdb1_record *rec)
 116 {
 117         uint32_t rec_ptr;
 118
 119         if (tdb1_lock(tdb, TDB1_BUCKET(hash), locktype) == -1)
 120                 return 0;
 121         if (!(rec_ptr = tdb1_find(tdb, key, hash, rec)))
 122                 tdb1_unlock(tdb, TDB1_BUCKET(hash), locktype);
 123         return rec_ptr;
 124 }
 125
 126 static TDB_DATA _tdb1_fetch(struct tdb_context *tdb, TDB_DATA key);
 127
 128 /* update an entry in place - this only works if the new data size
 129    is <= the old data size and the key exists.
 130    on failure return -1.
 131 */
 132 static int tdb1_update_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
 133 {
 134         struct tdb1_record rec;
 135         tdb1_off_t rec_ptr;
 136
 137         /* find entry */
 138         if (!(rec_ptr = tdb1_find(tdb, key, hash, &rec)))
 139                 return -1;
 140
 141         /* it could be an exact duplicate of what is there - this is
 142          * surprisingly common (eg. with a ldb re-index). */
 143         if (rec.key_len == key.dsize &&
 144             rec.data_len == dbuf.dsize &&
 145             rec.full_hash == hash) {
 146                 TDB_DATA data = _tdb1_fetch(tdb, key);
 147                 if (data.dsize == dbuf.dsize &&
 148                     memcmp(data.dptr, dbuf.dptr, data.dsize) == 0) {
 149                         if (data.dptr) {
 150                                 free(data.dptr);
 151                         }
 152                         return 0;
 153                 }
 154                 if (data.dptr) {
 155                         free(data.dptr);
 156                 }
 157         }
 158
 159         /* must be long enough key, data and tailer */
 160         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb1_off_t)) {
 161                 tdb->last_error = TDB_SUCCESS; /* Not really an error */
 162                 return -1;
 163         }
 164
 165         if (tdb->tdb1.io->tdb1_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 166                       dbuf.dptr, dbuf.dsize) == -1)
 167                 return -1;
 168
 169         if (dbuf.dsize != rec.data_len) {
 170                 /* update size */
 171                 rec.data_len = dbuf.dsize;
 172                 return tdb1_rec_write(tdb, rec_ptr, &rec);
 173         }
 174
 175         return 0;
 176 }
 177
 178 /* find an entry in the database given a key */
 179 /* If an entry doesn't exist tdb1_err will be set to
 180  * TDB_ERR_NOEXIST. If a key has no data attached
 181  * then the TDB_DATA will have zero length but
 182  * a non-zero pointer
 183  */
 184 static TDB_DATA _tdb1_fetch(struct tdb_context *tdb, TDB_DATA key)
 185 {
 186         tdb1_off_t rec_ptr;
 187         struct tdb1_record rec;
 188         TDB_DATA ret;
 189         uint32_t hash;
 190
 191         /* find which hash bucket it is in */
 192         hash = tdb_hash(tdb, key.dptr, key.dsize);
 193         if (!(rec_ptr = tdb1_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
 194                 return tdb1_null;
 195
 196         ret.dptr = tdb1_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 197                                   rec.data_len);
 198         ret.dsize = rec.data_len;
 199         tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
 200         return ret;
 201 }
 202
 203 TDB_DATA tdb1_fetch(struct tdb_context *tdb, TDB_DATA key)
 204 {
 205         TDB_DATA ret = _tdb1_fetch(tdb, key);
 206
 207         return ret;
 208 }
 209
 210 /*
 211  * Find an entry in the database and hand the record's data to a parsing
 212  * function. The parsing function is executed under the chain read lock, so it
 213  * should be fast and should not block on other syscalls.
 214  *
 215  * DON'T CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
 216  *
 217  * For mmapped tdb's that do not have a transaction open it points the parsing
 218  * function directly at the mmap area, it avoids the malloc/memcpy in this
 219  * case. If a transaction is open or no mmap is available, it has to do
 220  * malloc/read/parse/free.
 221  *
 222  * This is interesting for all readers of potentially large data structures in
 223  * the tdb records, ldb indexes being one example.
 224  *
 225  * Return -1 if the record was not found.
 226  */
 227
 228 int tdb1_parse_record(struct tdb_context *tdb, TDB_DATA key,
 229                      int (*parser)(TDB_DATA key, TDB_DATA data,
 230                                    void *private_data),
 231                      void *private_data)
 232 {
 233         tdb1_off_t rec_ptr;
 234         struct tdb1_record rec;
 235         int ret;
 236         uint32_t hash;
 237
 238         /* find which hash bucket it is in */
 239         hash = tdb_hash(tdb, key.dptr, key.dsize);
 240
 241         if (!(rec_ptr = tdb1_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
 242                 /* record not found */
 243                 tdb->last_error = TDB_ERR_NOEXIST;
 244                 return -1;
 245         }
 246
 247         ret = tdb1_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
 248                              rec.data_len, parser, private_data);
 249
 250         tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
 251
 252         return ret;
 253 }
 254
 255 /* check if an entry in the database exists
 256
 257    note that 1 is returned if the key is found and 0 is returned if not found
 258    this doesn't match the conventions in the rest of this module, but is
 259    compatible with gdbm
 260 */
 261 static int tdb1_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 262 {
 263         struct tdb1_record rec;
 264
 265         if (tdb1_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
 266                 return 0;
 267         tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
 268         return 1;
 269 }
 270
 271 int tdb1_exists(struct tdb_context *tdb, TDB_DATA key)
 272 {
 273         uint32_t hash = tdb_hash(tdb, key.dptr, key.dsize);
 274         int ret;
 275
 276         ret = tdb1_exists_hash(tdb, key, hash);
 277         return ret;
 278 }
 279
 280 /* actually delete an entry in the database given the offset */
 281 int tdb1_do_delete(struct tdb_context *tdb, tdb1_off_t rec_ptr, struct tdb1_record *rec)
 282 {
 283         tdb1_off_t last_ptr, i;
 284         struct tdb1_record lastrec;
 285
 286         if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) return -1;
 287
 288         if (((tdb->tdb1.traverse_write != 0) && (!TDB1_DEAD(rec))) ||
 289             tdb1_write_lock_record(tdb, rec_ptr) == -1) {
 290                 /* Someone traversing here: mark it as dead */
 291                 rec->magic = TDB1_DEAD_MAGIC;
 292                 return tdb1_rec_write(tdb, rec_ptr, rec);
 293         }
 294         if (tdb1_write_unlock_record(tdb, rec_ptr) != 0)
 295                 return -1;
 296
 297         /* find previous record in hash chain */
 298         if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(rec->full_hash), &i) == -1)
 299                 return -1;
 300         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
 301                 if (tdb1_rec_read(tdb, i, &lastrec) == -1)
 302                         return -1;
 303
 304         /* unlink it: next ptr is at start of record. */
 305         if (last_ptr == 0)
 306                 last_ptr = TDB1_HASH_TOP(rec->full_hash);
 307         if (tdb1_ofs_write(tdb, last_ptr, &rec->next) == -1)
 308                 return -1;
 309
 310         /* recover the space */
 311         if (tdb1_free(tdb, rec_ptr, rec) == -1)
 312                 return -1;
 313         return 0;
 314 }
 315
 316 static int tdb1_count_dead(struct tdb_context *tdb, uint32_t hash)
 317 {
 318         int res = 0;
 319         tdb1_off_t rec_ptr;
 320         struct tdb1_record rec;
 321
 322         /* read in the hash top */
 323         if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
 324                 return 0;
 325
 326         while (rec_ptr) {
 327                 if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1)
 328                         return 0;
 329
 330                 if (rec.magic == TDB1_DEAD_MAGIC) {
 331                         res += 1;
 332                 }
 333                 rec_ptr = rec.next;
 334         }
 335         return res;
 336 }
 337
 338 /*
 339  * Purge all DEAD records from a hash chain
 340  */
 341 static int tdb1_purge_dead(struct tdb_context *tdb, uint32_t hash)
 342 {
 343         int res = -1;
 344         struct tdb1_record rec;
 345         tdb1_off_t rec_ptr;
 346
 347         if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
 348                 return -1;
 349         }
 350
 351         /* read in the hash top */
 352         if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
 353                 goto fail;
 354
 355         while (rec_ptr) {
 356                 tdb1_off_t next;
 357
 358                 if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1) {
 359                         goto fail;
 360                 }
 361
 362                 next = rec.next;
 363
 364                 if (rec.magic == TDB1_DEAD_MAGIC
 365                     && tdb1_do_delete(tdb, rec_ptr, &rec) == -1) {
 366                         goto fail;
 367                 }
 368                 rec_ptr = next;
 369         }
 370         res = 0;
 371  fail:
 372         tdb1_unlock(tdb, -1, F_WRLCK);
 373         return res;
 374 }
 375
 376 /* delete an entry in the database given a key */
 377 static int tdb1_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 378 {
 379         tdb1_off_t rec_ptr;
 380         struct tdb1_record rec;
 381         int ret;
 382
 383         if (tdb->tdb1.max_dead_records != 0) {
 384
 385                 /*
 386                  * Allow for some dead records per hash chain, mainly for
 387                  * tdb's with a very high create/delete rate like locking.tdb.
 388                  */
 389
 390                 if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
 391                         return -1;
 392
 393                 if (tdb1_count_dead(tdb, hash) >= tdb->tdb1.max_dead_records) {
 394                         /*
 395                          * Don't let the per-chain freelist grow too large,
 396                          * delete all existing dead records
 397                          */
 398                         tdb1_purge_dead(tdb, hash);
 399                 }
 400
 401                 if (!(rec_ptr = tdb1_find(tdb, key, hash, &rec))) {
 402                         tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
 403                         return -1;
 404                 }
 405
 406                 /*
 407                  * Just mark the record as dead.
 408                  */
 409                 rec.magic = TDB1_DEAD_MAGIC;
 410                 ret = tdb1_rec_write(tdb, rec_ptr, &rec);
 411         }
 412         else {
 413                 if (!(rec_ptr = tdb1_find_lock_hash(tdb, key, hash, F_WRLCK,
 414                                                    &rec)))
 415                         return -1;
 416
 417                 ret = tdb1_do_delete(tdb, rec_ptr, &rec);
 418         }
 419
 420         if (ret == 0) {
 421                 tdb1_increment_seqnum(tdb);
 422         }
 423
 424         if (tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_WRLCK) != 0)
 425                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 426                            "tdb1_delete: WARNING tdb1_unlock failed!");
 427         return ret;
 428 }
 429
 430 int tdb1_delete(struct tdb_context *tdb, TDB_DATA key)
 431 {
 432         uint32_t hash = tdb_hash(tdb, key.dptr, key.dsize);
 433         int ret;
 434
 435         ret = tdb1_delete_hash(tdb, key, hash);
 436         return ret;
 437 }
 438
 439 /*
 440  * See if we have a dead record around with enough space
 441  */
 442 static tdb1_off_t tdb1_find_dead(struct tdb_context *tdb, uint32_t hash,
 443                                struct tdb1_record *r, tdb1_len_t length)
 444 {
 445         tdb1_off_t rec_ptr;
 446
 447         /* read in the hash top */
 448         if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
 449                 return 0;
 450
 451         /* keep looking until we find the right record */
 452         while (rec_ptr) {
 453                 if (tdb1_rec_read(tdb, rec_ptr, r) == -1)
 454                         return 0;
 455
 456                 if (TDB1_DEAD(r) && r->rec_len >= length) {
 457                         /*
 458                          * First fit for simple coding, TODO: change to best
 459                          * fit
 460                          */
 461                         return rec_ptr;
 462                 }
 463                 rec_ptr = r->next;
 464         }
 465         return 0;
 466 }
 467
 468 static int _tdb1_store(struct tdb_context *tdb, TDB_DATA key,
 469                        TDB_DATA dbuf, int flag, uint32_t hash)
 470 {
 471         struct tdb1_record rec;
 472         tdb1_off_t rec_ptr;
 473         char *p = NULL;
 474         int ret = -1;
 475
 476         /* check for it existing, on insert. */
 477         if (flag == TDB_INSERT) {
 478                 if (tdb1_exists_hash(tdb, key, hash)) {
 479                         tdb->last_error = TDB_ERR_EXISTS;
 480                         goto fail;
 481                 }
 482         } else {
 483                 /* first try in-place update, on modify or replace. */
 484                 if (tdb1_update_hash(tdb, key, hash, dbuf) == 0) {
 485                         goto done;
 486                 }
 487                 if (tdb->last_error == TDB_ERR_NOEXIST &&
 488                     flag == TDB_MODIFY) {
 489                         /* if the record doesn't exist and we are in TDB1_MODIFY mode then
 490                          we should fail the store */
 491                         goto fail;
 492                 }
 493         }
 494         /* reset the error code potentially set by the tdb1_update() */
 495         tdb->last_error = TDB_SUCCESS;
 496
 497         /* delete any existing record - if it doesn't exist we don't
 498            care.  Doing this first reduces fragmentation, and avoids
 499            coalescing with `allocated' block before it's updated. */
 500         if (flag != TDB_INSERT)
 501                 tdb1_delete_hash(tdb, key, hash);
 502
 503         /* Copy key+value *before* allocating free space in case malloc
 504            fails and we are left with a dead spot in the tdb. */
 505
 506         if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
 507                 tdb->last_error = TDB_ERR_OOM;
 508                 goto fail;
 509         }
 510
 511         memcpy(p, key.dptr, key.dsize);
 512         if (dbuf.dsize)
 513                 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
 514
 515         if (tdb->tdb1.max_dead_records != 0) {
 516                 /*
 517                  * Allow for some dead records per hash chain, look if we can
 518                  * find one that can hold the new record. We need enough space
 519                  * for key, data and tailer. If we find one, we don't have to
 520                  * consult the central freelist.
 521                  */
 522                 rec_ptr = tdb1_find_dead(
 523                         tdb, hash, &rec,
 524                         key.dsize + dbuf.dsize + sizeof(tdb1_off_t));
 525
 526                 if (rec_ptr != 0) {
 527                         rec.key_len = key.dsize;
 528                         rec.data_len = dbuf.dsize;
 529                         rec.full_hash = hash;
 530                         rec.magic = TDB1_MAGIC;
 531                         if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
 532                             || tdb->tdb1.io->tdb1_write(
 533                                     tdb, rec_ptr + sizeof(rec),
 534                                     p, key.dsize + dbuf.dsize) == -1) {
 535                                 goto fail;
 536                         }
 537                         goto done;
 538                 }
 539         }
 540
 541         /*
 542          * We have to allocate some space from the freelist, so this means we
 543          * have to lock it. Use the chance to purge all the DEAD records from
 544          * the hash chain under the freelist lock.
 545          */
 546
 547         if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
 548                 goto fail;
 549         }
 550
 551         if ((tdb->tdb1.max_dead_records != 0)
 552             && (tdb1_purge_dead(tdb, hash) == -1)) {
 553                 tdb1_unlock(tdb, -1, F_WRLCK);
 554                 goto fail;
 555         }
 556
 557         /* we have to allocate some space */
 558         rec_ptr = tdb1_allocate(tdb, key.dsize + dbuf.dsize, &rec);
 559
 560         tdb1_unlock(tdb, -1, F_WRLCK);
 561
 562         if (rec_ptr == 0) {
 563                 goto fail;
 564         }
 565
 566         /* Read hash top into next ptr */
 567         if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec.next) == -1)
 568                 goto fail;
 569
 570         rec.key_len = key.dsize;
 571         rec.data_len = dbuf.dsize;
 572         rec.full_hash = hash;
 573         rec.magic = TDB1_MAGIC;
 574
 575         /* write out and point the top of the hash chain at it */
 576         if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
 577             || tdb->tdb1.io->tdb1_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
 578             || tdb1_ofs_write(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1) {
 579                 /* Need to tdb1_unallocate() here */
 580                 goto fail;
 581         }
 582
 583  done:
 584         ret = 0;
 585  fail:
 586         if (ret == 0) {
 587                 tdb1_increment_seqnum(tdb);
 588         }
 589
 590         SAFE_FREE(p);
 591         return ret;
 592 }
 593
 594 /* store an element in the database, replacing any existing element
 595    with the same key
 596
 597    return 0 on success, -1 on failure
 598 */
 599 int tdb1_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
 600 {
 601         uint32_t hash;
 602         int ret;
 603
 604         if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) {
 605                 tdb->last_error = TDB_ERR_RDONLY;
 606                 return -1;
 607         }
 608
 609         /* find which hash bucket it is in */
 610         hash = tdb_hash(tdb, key.dptr, key.dsize);
 611         if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
 612                 return -1;
 613
 614         ret = _tdb1_store(tdb, key, dbuf, flag, hash);
 615         tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
 616         return ret;
 617 }
 618
 619 /* Append to an entry. Create if not exist. */
 620 int tdb1_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
 621 {
 622         uint32_t hash;
 623         TDB_DATA dbuf;
 624         int ret = -1;
 625
 626         /* find which hash bucket it is in */
 627         hash = tdb_hash(tdb, key.dptr, key.dsize);
 628         if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
 629                 return -1;
 630
 631         dbuf = _tdb1_fetch(tdb, key);
 632
 633         if (dbuf.dptr == NULL) {
 634                 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
 635         } else {
 636                 unsigned int new_len = dbuf.dsize + new_dbuf.dsize;
 637                 unsigned char *new_dptr;
 638
 639                 /* realloc '0' is special: don't do that. */
 640                 if (new_len == 0)
 641                         new_len = 1;
 642                 new_dptr = (unsigned char *)realloc(dbuf.dptr, new_len);
 643                 if (new_dptr == NULL) {
 644                         free(dbuf.dptr);
 645                 }
 646                 dbuf.dptr = new_dptr;
 647         }
 648
 649         if (dbuf.dptr == NULL) {
 650                 tdb->last_error = TDB_ERR_OOM;
 651                 goto failed;
 652         }
 653
 654         memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
 655         dbuf.dsize += new_dbuf.dsize;
 656
 657         ret = _tdb1_store(tdb, key, dbuf, 0, hash);
 658
 659 failed:
 660         tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
 661         SAFE_FREE(dbuf.dptr);
 662         return ret;
 663 }
 664
 665
 666 /*
 667   get the tdb sequence number. Only makes sense if the writers opened
 668   with TDB1_SEQNUM set. Note that this sequence number will wrap quite
 669   quickly, so it should only be used for a 'has something changed'
 670   test, not for code that relies on the count of the number of changes
 671   made. If you want a counter then use a tdb record.
 672
 673   The aim of this sequence number is to allow for a very lightweight
 674   test of a possible tdb change.
 675 */
 676 int tdb1_get_seqnum(struct tdb_context *tdb)
 677 {
 678         tdb1_off_t seqnum=0;
 679
 680         tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum);
 681         return seqnum;
 682 }
 683
 684
 685 /*
 686   add a region of the file to the freelist. Length is the size of the region in bytes,
 687   which includes the free list header that needs to be added
 688  */
 689 static int tdb1_free_region(struct tdb_context *tdb, tdb1_off_t offset, ssize_t length)
 690 {
 691         struct tdb1_record rec;
 692         if (length <= sizeof(rec)) {
 693                 /* the region is not worth adding */
 694                 return 0;
 695         }
 696         if (length + offset > tdb->file->map_size) {
 697                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
 698                                         "tdb1_free_region: adding region beyond"
 699                                         " end of file");
 700                 return -1;
 701         }
 702         memset(&rec,'\0',sizeof(rec));
 703         rec.rec_len = length - sizeof(rec);
 704         if (tdb1_free(tdb, offset, &rec) == -1) {
 705                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 706                            "tdb1_free_region: failed to add free record");
 707                 return -1;
 708         }
 709         return 0;
 710 }
 711
 712 /*
 713   wipe the entire database, deleting all records. This can be done
 714   very fast by using a allrecord lock. The entire data portion of the
 715   file becomes a single entry in the freelist.
 716
 717   This code carefully steps around the recovery area, leaving it alone
 718  */
 719 int tdb1_wipe_all(struct tdb_context *tdb)
 720 {
 721         int i;
 722         tdb1_off_t offset = 0;
 723         ssize_t data_len;
 724         tdb1_off_t recovery_head;
 725         tdb1_len_t recovery_size = 0;
 726
 727         if (tdb1_lockall(tdb) != 0) {
 728                 return -1;
 729         }
 730
 731
 732         /* see if the tdb has a recovery area, and remember its size
 733            if so. We don't want to lose this as otherwise each
 734            tdb1_wipe_all() in a transaction will increase the size of
 735            the tdb by the size of the recovery area */
 736         if (tdb1_ofs_read(tdb, TDB1_RECOVERY_HEAD, &recovery_head) == -1) {
 737                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 738                            "tdb1_wipe_all: failed to read recovery head");
 739                 goto failed;
 740         }
 741
 742         if (recovery_head != 0) {
 743                 struct tdb1_record rec;
 744                 if (tdb->tdb1.io->tdb1_read(tdb, recovery_head, &rec, sizeof(rec), TDB1_DOCONV()) == -1) {
 745                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 746                                    "tdb1_wipe_all: failed to read recovery record");
 747                         return -1;
 748                 }
 749                 recovery_size = rec.rec_len + sizeof(rec);
 750         }
 751
 752         /* wipe the hashes */
 753         for (i=0;i<tdb->tdb1.header.hash_size;i++) {
 754                 if (tdb1_ofs_write(tdb, TDB1_HASH_TOP(i), &offset) == -1) {
 755                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 756                                    "tdb1_wipe_all: failed to write hash %d", i);
 757                         goto failed;
 758                 }
 759         }
 760
 761         /* wipe the freelist */
 762         if (tdb1_ofs_write(tdb, TDB1_FREELIST_TOP, &offset) == -1) {
 763                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 764                            "tdb1_wipe_all: failed to write freelist");
 765                 goto failed;
 766         }
 767
 768         /* add all the rest of the file to the freelist, possibly leaving a gap
 769            for the recovery area */
 770         if (recovery_size == 0) {
 771                 /* the simple case - the whole file can be used as a freelist */
 772                 data_len = (tdb->file->map_size - TDB1_DATA_START(tdb->tdb1.header.hash_size));
 773                 if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) {
 774                         goto failed;
 775                 }
 776         } else {
 777                 /* we need to add two freelist entries - one on either
 778                    side of the recovery area
 779
 780                    Note that we cannot shift the recovery area during
 781                    this operation. Only the transaction.c code may
 782                    move the recovery area or we risk subtle data
 783                    corruption
 784                 */
 785                 data_len = (recovery_head - TDB1_DATA_START(tdb->tdb1.header.hash_size));
 786                 if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) {
 787                         goto failed;
 788                 }
 789                 /* and the 2nd free list entry after the recovery area - if any */
 790                 data_len = tdb->file->map_size - (recovery_head+recovery_size);
 791                 if (tdb1_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
 792                         goto failed;
 793                 }
 794         }
 795
 796         if (tdb1_unlockall(tdb) != 0) {
 797                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 798                            "tdb1_wipe_all: failed to unlock");
 799                 goto failed;
 800         }
 801
 802         return 0;
 803
 804 failed:
 805         tdb1_unlockall(tdb);
 806         return -1;
 807 }
 808
/* State handed to repack_traverse() through its private_data pointer. */
struct traverse_state {
        /* last_error of dest_db, recorded when a store into it fails */
        enum TDB_ERROR error;
        /* database that the traversed records are stored into */
        struct tdb_context *dest_db;
};
 813
 814 /*
 815   traverse function for repacking
 816  */
 817 static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private_data)
 818 {
 819         struct traverse_state *state = (struct traverse_state *)private_data;
 820         if (tdb1_store(state->dest_db, key, data, TDB_INSERT) != 0) {
 821                 state->error = state->dest_db->last_error;
 822                 return -1;
 823         }
 824         return 0;
 825 }
 826
 827 /*
 828   repack a tdb
 829  */
 830 int tdb1_repack(struct tdb_context *tdb)
 831 {
 832         struct tdb_context *tmp_db;
 833         struct traverse_state state;
 834         union tdb_attribute hsize;
 835
 836         hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE;
 837         hsize.base.next = NULL;
 838         hsize.tdb1_hashsize.hsize = tdb->tdb1.header.hash_size;
 839
 840         if (tdb1_transaction_start(tdb) != 0) {
 841                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 842                            __location__ " Failed to start transaction");
 843                 return -1;
 844         }
 845
 846         tmp_db = tdb_open("tmpdb", TDB_INTERNAL, O_RDWR|O_CREAT, 0, &hsize);
 847         if (tmp_db == NULL) {
 848                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
 849                                         __location__ " Failed to create tmp_db");
 850                 tdb1_transaction_cancel(tdb);
 851                 return -1;
 852         }
 853
 854         state.error = TDB_SUCCESS;
 855         state.dest_db = tmp_db;
 856
 857         if (tdb1_traverse_read(tdb, repack_traverse, &state) == -1) {
 858                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 859                            __location__ " Failed to traverse copying out");
 860                 tdb1_transaction_cancel(tdb);
 861                 tdb_close(tmp_db);
 862                 return -1;
 863         }
 864
 865         if (state.error != TDB_SUCCESS) {
 866                 tdb->last_error = tdb_logerr(tdb, state.error, TDB_LOG_ERROR,
 867                                         __location__ " Error during traversal");
 868                 tdb1_transaction_cancel(tdb);
 869                 tdb_close(tmp_db);
 870                 return -1;
 871         }
 872
 873         if (tdb1_wipe_all(tdb) != 0) {
 874                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 875                            __location__ " Failed to wipe database\n");
 876                 tdb1_transaction_cancel(tdb);
 877                 tdb_close(tmp_db);
 878                 return -1;
 879         }
 880
 881         state.error = TDB_SUCCESS;
 882         state.dest_db = tdb;
 883
 884         if (tdb1_traverse_read(tmp_db, repack_traverse, &state) == -1) {
 885                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 886                            __location__ " Failed to traverse copying back");
 887                 tdb1_transaction_cancel(tdb);
 888                 tdb_close(tmp_db);
 889                 return -1;
 890         }
 891
 892         if (state.error) {
 893                 tdb->last_error = tdb_logerr(tdb, state.error, TDB_LOG_ERROR,
 894                                         __location__ " Error during second traversal");
 895                 tdb1_transaction_cancel(tdb);
 896                 tdb_close(tmp_db);
 897                 return -1;
 898         }
 899
 900         tdb_close(tmp_db);
 901
 902         if (tdb1_transaction_commit(tdb) != 0) {
 903                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 904                            __location__ " Failed to commit");
 905                 return -1;
 906         }
 907
 908         return 0;
 909 }
 910
 911 /* Even on files, we can get partial writes due to signals. */
 912 bool tdb1_write_all(int fd, const void *buf, size_t count)
 913 {
 914         while (count) {
 915                 ssize_t ret;
 916                 ret = write(fd, buf, count);
 917                 if (ret < 0)
 918                         return false;
 919                 buf = (const char *)buf + ret;
 920                 count -= ret;
 921         }
 922         return true;
 923 }