git.ozlabs.org Git - ccan/blob - ccan/tdb2/tdb1_tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
#include "tdb1_private.h"
#include <assert.h>
#include <errno.h>
  30
/* File-scope TDB_DATA with no initializer, so it starts out zeroed;
   _tdb1_fetch() returns it when the key lookup fails. */
TDB_DATA tdb1_null;
  32
  33 /*
  34   non-blocking increment of the tdb sequence number if the tdb has been opened using
  35   the TDB_SEQNUM flag
  36 */
  37 void tdb1_increment_seqnum_nonblock(struct tdb_context *tdb)
  38 {
  39         tdb1_off_t seqnum=0;
  40
  41         if (!(tdb->flags & TDB_SEQNUM)) {
  42                 return;
  43         }
  44
  45         /* we ignore errors from this, as we have no sane way of
  46            dealing with them.
  47         */
  48         tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum);
  49         seqnum++;
  50         tdb1_ofs_write(tdb, TDB1_SEQNUM_OFS, &seqnum);
  51 }
  52
  53 /*
  54   increment the tdb sequence number if the tdb has been opened using
  55   the TDB_SEQNUM flag
  56 */
  57 static void tdb1_increment_seqnum(struct tdb_context *tdb)
  58 {
  59         if (!(tdb->flags & TDB_SEQNUM)) {
  60                 return;
  61         }
  62
  63         if (tdb1_nest_lock(tdb, TDB1_SEQNUM_OFS, F_WRLCK,
  64                            TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) {
  65                 return;
  66         }
  67
  68         tdb1_increment_seqnum_nonblock(tdb);
  69
  70         tdb1_nest_unlock(tdb, TDB1_SEQNUM_OFS, F_WRLCK);
  71 }
  72
  73 static int tdb1_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
  74 {
  75         return memcmp(data.dptr, key.dptr, data.dsize);
  76 }
  77
  78 /* Returns 0 on fail.  On success, return offset of record, and fills
  79    in rec */
  80 static tdb1_off_t tdb1_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
  81                         struct tdb1_record *r)
  82 {
  83         tdb1_off_t rec_ptr;
  84
  85         /* read in the hash top */
  86         if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
  87                 return 0;
  88
  89         /* keep looking until we find the right record */
  90         while (rec_ptr) {
  91                 if (tdb1_rec_read(tdb, rec_ptr, r) == -1)
  92                         return 0;
  93
  94                 if (!TDB1_DEAD(r) && hash==r->full_hash
  95                     && key.dsize==r->key_len
  96                     && tdb1_parse_data(tdb, key, rec_ptr + sizeof(*r),
  97                                       r->key_len, tdb1_key_compare,
  98                                       NULL) == 0) {
  99                         return rec_ptr;
 100                 }
 101                 /* detect tight infinite loop */
 102                 if (rec_ptr == r->next) {
 103                         tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT,
 104                                                 TDB_LOG_ERROR,
 105                                                 "tdb1_find: loop detected.");
 106                         return 0;
 107                 }
 108                 rec_ptr = r->next;
 109         }
 110         tdb->last_error = TDB_ERR_NOEXIST;
 111         return 0;
 112 }
 113
 114 /* As tdb1_find, but if you succeed, keep the lock */
 115 tdb1_off_t tdb1_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
 116                            struct tdb1_record *rec)
 117 {
 118         uint32_t rec_ptr;
 119
 120         if (tdb1_lock(tdb, TDB1_BUCKET(hash), locktype) == -1)
 121                 return 0;
 122         if (!(rec_ptr = tdb1_find(tdb, key, hash, rec)))
 123                 tdb1_unlock(tdb, TDB1_BUCKET(hash), locktype);
 124         return rec_ptr;
 125 }
 126
 127 static TDB_DATA _tdb1_fetch(struct tdb_context *tdb, TDB_DATA key);
 128
 129 /* update an entry in place - this only works if the new data size
 130    is <= the old data size and the key exists.
 131    on failure return -1.
 132 */
 133 static int tdb1_update_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
 134 {
 135         struct tdb1_record rec;
 136         tdb1_off_t rec_ptr;
 137
 138         /* find entry */
 139         if (!(rec_ptr = tdb1_find(tdb, key, hash, &rec)))
 140                 return -1;
 141
 142         /* it could be an exact duplicate of what is there - this is
 143          * surprisingly common (eg. with a ldb re-index). */
 144         if (rec.key_len == key.dsize &&
 145             rec.data_len == dbuf.dsize &&
 146             rec.full_hash == hash) {
 147                 TDB_DATA data = _tdb1_fetch(tdb, key);
 148                 if (data.dsize == dbuf.dsize &&
 149                     memcmp(data.dptr, dbuf.dptr, data.dsize) == 0) {
 150                         if (data.dptr) {
 151                                 free(data.dptr);
 152                         }
 153                         return 0;
 154                 }
 155                 if (data.dptr) {
 156                         free(data.dptr);
 157                 }
 158         }
 159
 160         /* must be long enough key, data and tailer */
 161         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb1_off_t)) {
 162                 tdb->last_error = TDB_SUCCESS; /* Not really an error */
 163                 return -1;
 164         }
 165
 166         if (tdb->tdb1.io->tdb1_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 167                       dbuf.dptr, dbuf.dsize) == -1)
 168                 return -1;
 169
 170         if (dbuf.dsize != rec.data_len) {
 171                 /* update size */
 172                 rec.data_len = dbuf.dsize;
 173                 return tdb1_rec_write(tdb, rec_ptr, &rec);
 174         }
 175
 176         return 0;
 177 }
 178
 179 /* find an entry in the database given a key */
 180 /* If an entry doesn't exist tdb1_err will be set to
 181  * TDB_ERR_NOEXIST. If a key has no data attached
 182  * then the TDB_DATA will have zero length but
 183  * a non-zero pointer
 184  */
 185 static TDB_DATA _tdb1_fetch(struct tdb_context *tdb, TDB_DATA key)
 186 {
 187         tdb1_off_t rec_ptr;
 188         struct tdb1_record rec;
 189         TDB_DATA ret;
 190         uint32_t hash;
 191
 192         /* find which hash bucket it is in */
 193         hash = tdb_hash(tdb, key.dptr, key.dsize);
 194         if (!(rec_ptr = tdb1_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
 195                 return tdb1_null;
 196
 197         ret.dptr = tdb1_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 198                                   rec.data_len);
 199         ret.dsize = rec.data_len;
 200         tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
 201         return ret;
 202 }
 203
 204 enum TDB_ERROR tdb1_fetch(struct tdb_context *tdb, TDB_DATA key, TDB_DATA *data)
 205 {
 206         *data = _tdb1_fetch(tdb, key);
 207         if (data->dptr == NULL)
 208                 return tdb->last_error;
 209         return TDB_SUCCESS;
 210 }
 211
 212 enum TDB_ERROR tdb1_parse_record(struct tdb_context *tdb, TDB_DATA key,
 213                                  enum TDB_ERROR (*parser)(TDB_DATA key,
 214                                                           TDB_DATA data,
 215                                                           void *private_data),
 216                                  void *private_data)
 217 {
 218         tdb1_off_t rec_ptr;
 219         struct tdb1_record rec;
 220         enum TDB_ERROR ret;
 221         uint32_t hash;
 222
 223         /* find which hash bucket it is in */
 224         hash = tdb_hash(tdb, key.dptr, key.dsize);
 225
 226         if (!(rec_ptr = tdb1_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
 227                 /* record not found */
 228                 return TDB_ERR_NOEXIST;
 229         }
 230
 231         ret = tdb1_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
 232                              rec.data_len, parser, private_data);
 233
 234         tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
 235
 236         return ret;
 237 }
 238
 239 /* check if an entry in the database exists
 240
 241    note that 1 is returned if the key is found and 0 is returned if not found
 242    this doesn't match the conventions in the rest of this module, but is
 243    compatible with gdbm
 244 */
 245 static int tdb1_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 246 {
 247         struct tdb1_record rec;
 248
 249         if (tdb1_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
 250                 return 0;
 251         tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
 252         return 1;
 253 }
 254
 255 int tdb1_exists(struct tdb_context *tdb, TDB_DATA key)
 256 {
 257         uint32_t hash = tdb_hash(tdb, key.dptr, key.dsize);
 258         int ret;
 259
 260         assert(tdb->flags & TDB_VERSION1);
 261         ret = tdb1_exists_hash(tdb, key, hash);
 262         return ret;
 263 }
 264
 265 /* actually delete an entry in the database given the offset */
 266 int tdb1_do_delete(struct tdb_context *tdb, tdb1_off_t rec_ptr, struct tdb1_record *rec)
 267 {
 268         tdb1_off_t last_ptr, i;
 269         struct tdb1_record lastrec;
 270
 271         if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) return -1;
 272
 273         if (((tdb->tdb1.traverse_write != 0) && (!TDB1_DEAD(rec))) ||
 274             tdb1_write_lock_record(tdb, rec_ptr) == -1) {
 275                 /* Someone traversing here: mark it as dead */
 276                 rec->magic = TDB1_DEAD_MAGIC;
 277                 return tdb1_rec_write(tdb, rec_ptr, rec);
 278         }
 279         if (tdb1_write_unlock_record(tdb, rec_ptr) != 0)
 280                 return -1;
 281
 282         /* find previous record in hash chain */
 283         if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(rec->full_hash), &i) == -1)
 284                 return -1;
 285         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
 286                 if (tdb1_rec_read(tdb, i, &lastrec) == -1)
 287                         return -1;
 288
 289         /* unlink it: next ptr is at start of record. */
 290         if (last_ptr == 0)
 291                 last_ptr = TDB1_HASH_TOP(rec->full_hash);
 292         if (tdb1_ofs_write(tdb, last_ptr, &rec->next) == -1)
 293                 return -1;
 294
 295         /* recover the space */
 296         if (tdb1_free(tdb, rec_ptr, rec) == -1)
 297                 return -1;
 298         return 0;
 299 }
 300
 301 static int tdb1_count_dead(struct tdb_context *tdb, uint32_t hash)
 302 {
 303         int res = 0;
 304         tdb1_off_t rec_ptr;
 305         struct tdb1_record rec;
 306
 307         /* read in the hash top */
 308         if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
 309                 return 0;
 310
 311         while (rec_ptr) {
 312                 if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1)
 313                         return 0;
 314
 315                 if (rec.magic == TDB1_DEAD_MAGIC) {
 316                         res += 1;
 317                 }
 318                 rec_ptr = rec.next;
 319         }
 320         return res;
 321 }
 322
 323 /*
 324  * Purge all DEAD records from a hash chain
 325  */
 326 static int tdb1_purge_dead(struct tdb_context *tdb, uint32_t hash)
 327 {
 328         int res = -1;
 329         struct tdb1_record rec;
 330         tdb1_off_t rec_ptr;
 331
 332         if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
 333                 return -1;
 334         }
 335
 336         /* read in the hash top */
 337         if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
 338                 goto fail;
 339
 340         while (rec_ptr) {
 341                 tdb1_off_t next;
 342
 343                 if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1) {
 344                         goto fail;
 345                 }
 346
 347                 next = rec.next;
 348
 349                 if (rec.magic == TDB1_DEAD_MAGIC
 350                     && tdb1_do_delete(tdb, rec_ptr, &rec) == -1) {
 351                         goto fail;
 352                 }
 353                 rec_ptr = next;
 354         }
 355         res = 0;
 356  fail:
 357         tdb1_unlock(tdb, -1, F_WRLCK);
 358         return res;
 359 }
 360
 361 /* delete an entry in the database given a key */
 362 static int tdb1_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 363 {
 364         tdb1_off_t rec_ptr;
 365         struct tdb1_record rec;
 366         int ret;
 367
 368         if (tdb->tdb1.max_dead_records != 0) {
 369
 370                 /*
 371                  * Allow for some dead records per hash chain, mainly for
 372                  * tdb's with a very high create/delete rate like locking.tdb.
 373                  */
 374
 375                 if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
 376                         return -1;
 377
 378                 if (tdb1_count_dead(tdb, hash) >= tdb->tdb1.max_dead_records) {
 379                         /*
 380                          * Don't let the per-chain freelist grow too large,
 381                          * delete all existing dead records
 382                          */
 383                         tdb1_purge_dead(tdb, hash);
 384                 }
 385
 386                 if (!(rec_ptr = tdb1_find(tdb, key, hash, &rec))) {
 387                         tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
 388                         return -1;
 389                 }
 390
 391                 /*
 392                  * Just mark the record as dead.
 393                  */
 394                 rec.magic = TDB1_DEAD_MAGIC;
 395                 ret = tdb1_rec_write(tdb, rec_ptr, &rec);
 396         }
 397         else {
 398                 if (!(rec_ptr = tdb1_find_lock_hash(tdb, key, hash, F_WRLCK,
 399                                                    &rec)))
 400                         return -1;
 401
 402                 ret = tdb1_do_delete(tdb, rec_ptr, &rec);
 403         }
 404
 405         if (ret == 0) {
 406                 tdb1_increment_seqnum(tdb);
 407         }
 408
 409         if (tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_WRLCK) != 0)
 410                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 411                            "tdb1_delete: WARNING tdb1_unlock failed!");
 412         return ret;
 413 }
 414
 415 int tdb1_delete(struct tdb_context *tdb, TDB_DATA key)
 416 {
 417         uint32_t hash = tdb_hash(tdb, key.dptr, key.dsize);
 418         int ret;
 419
 420         assert(tdb->flags & TDB_VERSION1);
 421         ret = tdb1_delete_hash(tdb, key, hash);
 422         return ret;
 423 }
 424
 425 /*
 426  * See if we have a dead record around with enough space
 427  */
 428 static tdb1_off_t tdb1_find_dead(struct tdb_context *tdb, uint32_t hash,
 429                                struct tdb1_record *r, tdb1_len_t length)
 430 {
 431         tdb1_off_t rec_ptr;
 432
 433         /* read in the hash top */
 434         if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
 435                 return 0;
 436
 437         /* keep looking until we find the right record */
 438         while (rec_ptr) {
 439                 if (tdb1_rec_read(tdb, rec_ptr, r) == -1)
 440                         return 0;
 441
 442                 if (TDB1_DEAD(r) && r->rec_len >= length) {
 443                         /*
 444                          * First fit for simple coding, TODO: change to best
 445                          * fit
 446                          */
 447                         return rec_ptr;
 448                 }
 449                 rec_ptr = r->next;
 450         }
 451         return 0;
 452 }
 453
 454 static int _tdb1_store(struct tdb_context *tdb, TDB_DATA key,
 455                        TDB_DATA dbuf, int flag, uint32_t hash)
 456 {
 457         struct tdb1_record rec;
 458         tdb1_off_t rec_ptr;
 459         char *p = NULL;
 460         int ret = -1;
 461
 462         /* check for it existing, on insert. */
 463         if (flag == TDB_INSERT) {
 464                 if (tdb1_exists_hash(tdb, key, hash)) {
 465                         tdb->last_error = TDB_ERR_EXISTS;
 466                         goto fail;
 467                 }
 468         } else {
 469                 /* first try in-place update, on modify or replace. */
 470                 if (tdb1_update_hash(tdb, key, hash, dbuf) == 0) {
 471                         goto done;
 472                 }
 473                 if (tdb->last_error == TDB_ERR_NOEXIST &&
 474                     flag == TDB_MODIFY) {
 475                         /* if the record doesn't exist and we are in TDB1_MODIFY mode then
 476                          we should fail the store */
 477                         goto fail;
 478                 }
 479         }
 480         /* reset the error code potentially set by the tdb1_update() */
 481         tdb->last_error = TDB_SUCCESS;
 482
 483         /* delete any existing record - if it doesn't exist we don't
 484            care.  Doing this first reduces fragmentation, and avoids
 485            coalescing with `allocated' block before it's updated. */
 486         if (flag != TDB_INSERT)
 487                 tdb1_delete_hash(tdb, key, hash);
 488
 489         /* Copy key+value *before* allocating free space in case malloc
 490            fails and we are left with a dead spot in the tdb. */
 491
 492         if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
 493                 tdb->last_error = TDB_ERR_OOM;
 494                 goto fail;
 495         }
 496
 497         memcpy(p, key.dptr, key.dsize);
 498         if (dbuf.dsize)
 499                 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
 500
 501         if (tdb->tdb1.max_dead_records != 0) {
 502                 /*
 503                  * Allow for some dead records per hash chain, look if we can
 504                  * find one that can hold the new record. We need enough space
 505                  * for key, data and tailer. If we find one, we don't have to
 506                  * consult the central freelist.
 507                  */
 508                 rec_ptr = tdb1_find_dead(
 509                         tdb, hash, &rec,
 510                         key.dsize + dbuf.dsize + sizeof(tdb1_off_t));
 511
 512                 if (rec_ptr != 0) {
 513                         rec.key_len = key.dsize;
 514                         rec.data_len = dbuf.dsize;
 515                         rec.full_hash = hash;
 516                         rec.magic = TDB1_MAGIC;
 517                         if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
 518                             || tdb->tdb1.io->tdb1_write(
 519                                     tdb, rec_ptr + sizeof(rec),
 520                                     p, key.dsize + dbuf.dsize) == -1) {
 521                                 goto fail;
 522                         }
 523                         goto done;
 524                 }
 525         }
 526
 527         /*
 528          * We have to allocate some space from the freelist, so this means we
 529          * have to lock it. Use the chance to purge all the DEAD records from
 530          * the hash chain under the freelist lock.
 531          */
 532
 533         if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
 534                 goto fail;
 535         }
 536
 537         if ((tdb->tdb1.max_dead_records != 0)
 538             && (tdb1_purge_dead(tdb, hash) == -1)) {
 539                 tdb1_unlock(tdb, -1, F_WRLCK);
 540                 goto fail;
 541         }
 542
 543         /* we have to allocate some space */
 544         rec_ptr = tdb1_allocate(tdb, key.dsize + dbuf.dsize, &rec);
 545
 546         tdb1_unlock(tdb, -1, F_WRLCK);
 547
 548         if (rec_ptr == 0) {
 549                 goto fail;
 550         }
 551
 552         /* Read hash top into next ptr */
 553         if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec.next) == -1)
 554                 goto fail;
 555
 556         rec.key_len = key.dsize;
 557         rec.data_len = dbuf.dsize;
 558         rec.full_hash = hash;
 559         rec.magic = TDB1_MAGIC;
 560
 561         /* write out and point the top of the hash chain at it */
 562         if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
 563             || tdb->tdb1.io->tdb1_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
 564             || tdb1_ofs_write(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1) {
 565                 /* Need to tdb1_unallocate() here */
 566                 goto fail;
 567         }
 568
 569  done:
 570         ret = 0;
 571  fail:
 572         if (ret == 0) {
 573                 tdb1_increment_seqnum(tdb);
 574         }
 575
 576         SAFE_FREE(p);
 577         return ret;
 578 }
 579
 580 /* store an element in the database, replacing any existing element
 581    with the same key
 582
 583    return 0 on success, -1 on failure
 584 */
 585 int tdb1_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
 586 {
 587         uint32_t hash;
 588         int ret;
 589
 590         assert(tdb->flags & TDB_VERSION1);
 591
 592         if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) {
 593                 tdb->last_error = TDB_ERR_RDONLY;
 594                 return -1;
 595         }
 596
 597         /* find which hash bucket it is in */
 598         hash = tdb_hash(tdb, key.dptr, key.dsize);
 599         if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
 600                 return -1;
 601
 602         ret = _tdb1_store(tdb, key, dbuf, flag, hash);
 603         tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
 604         return ret;
 605 }
 606
 607 /* Append to an entry. Create if not exist. */
 608 int tdb1_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
 609 {
 610         uint32_t hash;
 611         TDB_DATA dbuf;
 612         int ret = -1;
 613
 614         assert(tdb->flags & TDB_VERSION1);
 615
 616         /* find which hash bucket it is in */
 617         hash = tdb_hash(tdb, key.dptr, key.dsize);
 618         if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
 619                 return -1;
 620
 621         dbuf = _tdb1_fetch(tdb, key);
 622
 623         if (dbuf.dptr == NULL) {
 624                 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
 625         } else {
 626                 unsigned int new_len = dbuf.dsize + new_dbuf.dsize;
 627                 unsigned char *new_dptr;
 628
 629                 /* realloc '0' is special: don't do that. */
 630                 if (new_len == 0)
 631                         new_len = 1;
 632                 new_dptr = (unsigned char *)realloc(dbuf.dptr, new_len);
 633                 if (new_dptr == NULL) {
 634                         free(dbuf.dptr);
 635                 }
 636                 dbuf.dptr = new_dptr;
 637         }
 638
 639         if (dbuf.dptr == NULL) {
 640                 tdb->last_error = TDB_ERR_OOM;
 641                 goto failed;
 642         }
 643
 644         memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
 645         dbuf.dsize += new_dbuf.dsize;
 646
 647         ret = _tdb1_store(tdb, key, dbuf, 0, hash);
 648
 649 failed:
 650         tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
 651         SAFE_FREE(dbuf.dptr);
 652         return ret;
 653 }
 654
 655
 656 /*
 657   get the tdb sequence number. Only makes sense if the writers opened
 658   with TDB1_SEQNUM set. Note that this sequence number will wrap quite
 659   quickly, so it should only be used for a 'has something changed'
 660   test, not for code that relies on the count of the number of changes
 661   made. If you want a counter then use a tdb record.
 662
 663   The aim of this sequence number is to allow for a very lightweight
 664   test of a possible tdb change.
 665 */
 666 int tdb1_get_seqnum(struct tdb_context *tdb)
 667 {
 668         tdb1_off_t seqnum=0;
 669
 670         tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum);
 671         return seqnum;
 672 }
 673
 674
 675 /*
 676   add a region of the file to the freelist. Length is the size of the region in bytes,
 677   which includes the free list header that needs to be added
 678  */
 679 static int tdb1_free_region(struct tdb_context *tdb, tdb1_off_t offset, ssize_t length)
 680 {
 681         struct tdb1_record rec;
 682         if (length <= sizeof(rec)) {
 683                 /* the region is not worth adding */
 684                 return 0;
 685         }
 686         if (length + offset > tdb->file->map_size) {
 687                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
 688                                         "tdb1_free_region: adding region beyond"
 689                                         " end of file");
 690                 return -1;
 691         }
 692         memset(&rec,'\0',sizeof(rec));
 693         rec.rec_len = length - sizeof(rec);
 694         if (tdb1_free(tdb, offset, &rec) == -1) {
 695                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 696                            "tdb1_free_region: failed to add free record");
 697                 return -1;
 698         }
 699         return 0;
 700 }
 701
 702 /*
 703   wipe the entire database, deleting all records. This can be done
 704   very fast by using a allrecord lock. The entire data portion of the
 705   file becomes a single entry in the freelist.
 706
 707   This code carefully steps around the recovery area, leaving it alone
 708  */
 709 int tdb1_wipe_all(struct tdb_context *tdb)
 710 {
 711         int i;
 712         tdb1_off_t offset = 0;
 713         ssize_t data_len;
 714         tdb1_off_t recovery_head;
 715         tdb1_len_t recovery_size = 0;
 716
 717         if (tdb_lockall(tdb) != TDB_SUCCESS) {
 718                 return -1;
 719         }
 720
 721
 722         /* see if the tdb has a recovery area, and remember its size
 723            if so. We don't want to lose this as otherwise each
 724            tdb1_wipe_all() in a transaction will increase the size of
 725            the tdb by the size of the recovery area */
 726         if (tdb1_ofs_read(tdb, TDB1_RECOVERY_HEAD, &recovery_head) == -1) {
 727                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 728                            "tdb1_wipe_all: failed to read recovery head");
 729                 goto failed;
 730         }
 731
 732         if (recovery_head != 0) {
 733                 struct tdb1_record rec;
 734                 if (tdb->tdb1.io->tdb1_read(tdb, recovery_head, &rec, sizeof(rec), TDB1_DOCONV()) == -1) {
 735                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 736                                    "tdb1_wipe_all: failed to read recovery record");
 737                         return -1;
 738                 }
 739                 recovery_size = rec.rec_len + sizeof(rec);
 740         }
 741
 742         /* wipe the hashes */
 743         for (i=0;i<tdb->tdb1.header.hash_size;i++) {
 744                 if (tdb1_ofs_write(tdb, TDB1_HASH_TOP(i), &offset) == -1) {
 745                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 746                                    "tdb1_wipe_all: failed to write hash %d", i);
 747                         goto failed;
 748                 }
 749         }
 750
 751         /* wipe the freelist */
 752         if (tdb1_ofs_write(tdb, TDB1_FREELIST_TOP, &offset) == -1) {
 753                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 754                            "tdb1_wipe_all: failed to write freelist");
 755                 goto failed;
 756         }
 757
 758         /* add all the rest of the file to the freelist, possibly leaving a gap
 759            for the recovery area */
 760         if (recovery_size == 0) {
 761                 /* the simple case - the whole file can be used as a freelist */
 762                 data_len = (tdb->file->map_size - TDB1_DATA_START(tdb->tdb1.header.hash_size));
 763                 if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) {
 764                         goto failed;
 765                 }
 766         } else {
 767                 /* we need to add two freelist entries - one on either
 768                    side of the recovery area
 769
 770                    Note that we cannot shift the recovery area during
 771                    this operation. Only the transaction.c code may
 772                    move the recovery area or we risk subtle data
 773                    corruption
 774                 */
 775                 data_len = (recovery_head - TDB1_DATA_START(tdb->tdb1.header.hash_size));
 776                 if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) {
 777                         goto failed;
 778                 }
 779                 /* and the 2nd free list entry after the recovery area - if any */
 780                 data_len = tdb->file->map_size - (recovery_head+recovery_size);
 781                 if (tdb1_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
 782                         goto failed;
 783                 }
 784         }
 785
 786         tdb_unlockall(tdb);
 787         return 0;
 788
 789 failed:
 790         tdb_unlockall(tdb);
 791         return -1;
 792 }
 793
/* State handed to repack_traverse() through its private_data pointer. */
struct traverse_state {
        /* last_error of dest_db, recorded by repack_traverse() when a store fails */
        enum TDB_ERROR error;
        /* database that repack_traverse() stores each record into */
        struct tdb_context *dest_db;
};
 798
 799 /*
 800   traverse function for repacking
 801  */
 802 static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private_data)
 803 {
 804         struct traverse_state *state = (struct traverse_state *)private_data;
 805         if (tdb1_store(state->dest_db, key, data, TDB_INSERT) != 0) {
 806                 state->error = state->dest_db->last_error;
 807                 return -1;
 808         }
 809         return 0;
 810 }
 811
 812 /*
 813   repack a tdb
 814  */
 815 int tdb1_repack(struct tdb_context *tdb)
 816 {
 817         struct tdb_context *tmp_db;
 818         struct traverse_state state;
 819         union tdb_attribute hsize;
 820
 821         hsize.base.attr = TDB_ATTRIBUTE_TDB1_HASHSIZE;
 822         hsize.base.next = NULL;
 823         hsize.tdb1_hashsize.hsize = tdb->tdb1.header.hash_size;
 824
 825         if (tdb1_transaction_start(tdb) != 0) {
 826                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 827                            __location__ " Failed to start transaction");
 828                 return -1;
 829         }
 830
 831         tmp_db = tdb_open("tmpdb", TDB_INTERNAL, O_RDWR|O_CREAT, 0, &hsize);
 832         if (tmp_db == NULL) {
 833                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
 834                                         __location__ " Failed to create tmp_db");
 835                 tdb1_transaction_cancel(tdb);
 836                 return -1;
 837         }
 838
 839         state.error = TDB_SUCCESS;
 840         state.dest_db = tmp_db;
 841
 842         if (tdb1_traverse(tdb, repack_traverse, &state) == -1) {
 843                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 844                            __location__ " Failed to traverse copying out");
 845                 tdb1_transaction_cancel(tdb);
 846                 tdb_close(tmp_db);
 847                 return -1;
 848         }
 849
 850         if (state.error != TDB_SUCCESS) {
 851                 tdb->last_error = tdb_logerr(tdb, state.error, TDB_LOG_ERROR,
 852                                         __location__ " Error during traversal");
 853                 tdb1_transaction_cancel(tdb);
 854                 tdb_close(tmp_db);
 855                 return -1;
 856         }
 857
 858         if (tdb1_wipe_all(tdb) != 0) {
 859                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 860                            __location__ " Failed to wipe database\n");
 861                 tdb1_transaction_cancel(tdb);
 862                 tdb_close(tmp_db);
 863                 return -1;
 864         }
 865
 866         state.error = TDB_SUCCESS;
 867         state.dest_db = tdb;
 868
 869         if (tdb1_traverse(tmp_db, repack_traverse, &state) == -1) {
 870                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 871                            __location__ " Failed to traverse copying back");
 872                 tdb1_transaction_cancel(tdb);
 873                 tdb_close(tmp_db);
 874                 return -1;
 875         }
 876
 877         if (state.error) {
 878                 tdb->last_error = tdb_logerr(tdb, state.error, TDB_LOG_ERROR,
 879                                         __location__ " Error during second traversal");
 880                 tdb1_transaction_cancel(tdb);
 881                 tdb_close(tmp_db);
 882                 return -1;
 883         }
 884
 885         tdb_close(tmp_db);
 886
 887         if (tdb1_transaction_commit(tdb) != 0) {
 888                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 889                            __location__ " Failed to commit");
 890                 return -1;
 891         }
 892
 893         return 0;
 894 }
 895
 896 /* Even on files, we can get partial writes due to signals. */
 897 bool tdb1_write_all(int fd, const void *buf, size_t count)
 898 {
 899         while (count) {
 900                 ssize_t ret;
 901                 ret = write(fd, buf, count);
 902                 if (ret < 0)
 903                         return false;
 904                 buf = (const char *)buf + ret;
 905                 count -= ret;
 906         }
 907         return true;
 908 }