git.ozlabs.org Git - ccan/blob - ccan/tdb2/transaction.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              2005
   7    Copyright (C) Rusty Russell                2010
   8
   9      ** NOTE! The following LGPL license applies to the tdb
  10      ** library. This does NOT imply that all of Samba is released
  11      ** under the LGPL
  12
  13    This library is free software; you can redistribute it and/or
  14    modify it under the terms of the GNU Lesser General Public
  15    License as published by the Free Software Foundation; either
  16    version 3 of the License, or (at your option) any later version.
  17
  18    This library is distributed in the hope that it will be useful,
  19    but WITHOUT ANY WARRANTY; without even the implied warranty of
  20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21    Lesser General Public License for more details.
  22
  23    You should have received a copy of the GNU Lesser General Public
  24    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  25 */
  26
  27 #include "private.h"
  28 #define SAFE_FREE(x) do { if ((x) != NULL) {free(x); (x)=NULL;} } while(0)
  29
  30 /*
  31   transaction design:
  32
  33   - only allow a single transaction at a time per database. This makes
  34     using the transaction API simpler, as otherwise the caller would
  35     have to cope with temporary failures in transactions that conflict
  36     with other current transactions
  37
  38   - keep the transaction recovery information in the same file as the
  39     database, using a special 'transaction recovery' record pointed at
  40     by the header. This removes the need for extra journal files as
  41     used by some other databases
  42
  43   - dynamically allocate the transaction recovery record, re-using it
  44     for subsequent transactions. If a larger record is needed then
  45     tdb_free() the old record to place it on the normal tdb freelist
  46     before allocating the new record
  47
  48   - during transactions, keep a linked list of all writes that have
  49     been performed by intercepting all tdb_write() calls. The hooked
  50     transaction versions of tdb_read() and tdb_write() check this
  51     linked list and try to use the elements of the list in preference
  52     to the real database.
  53
  54   - don't allow any locks to be held when a transaction starts,
  55     otherwise we can end up with deadlock (plus lack of lock nesting
  56     in posix locks would mean the lock is lost)
  57
  58   - if the caller gains a lock during the transaction but doesn't
  59     release it then fail the commit
  60
  61   - allow for nested calls to tdb_transaction_start(), re-using the
  62     existing transaction record. If the inner transaction is cancelled
  63     then a subsequent commit will fail
  64
  65   - keep a mirrored copy of the tdb hash chain heads to allow for the
  66     fast hash heads scan on traverse, updating the mirrored copy in
  67     the transaction version of tdb_write
  68
  69   - allow callers to mix transaction and non-transaction use of tdb,
  70     although once a transaction is started then an exclusive lock is
  71     gained until the transaction is committed or cancelled
  72
  73   - the commit strategy involves first saving away all modified data
  74     into a linearised buffer in the transaction recovery area, then
  75     marking the transaction recovery area with a magic value to
  76     indicate a valid recovery record. In total 4 fsync/msync calls are
  77     needed per commit to prevent race conditions. It might be possible
  78     to reduce this to 3 or even 2 with some more work.
  79
  80   - check for a valid recovery record on open of the tdb, while the
  81     open lock is held. Automatically recover from the transaction
  82     recovery area if needed, then continue with the open as
  83     usual. This allows for smooth crash recovery with no administrator
  84     intervention.
  85
  86   - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
  87     still available, but no transaction recovery area is used and no
  88     fsync/msync calls are made.
  89 */
  90
  91
  92 /*
  93   hold the context of any current transaction
  94 */
  95 struct tdb_transaction {
  96         /* the original io methods - used to do IOs to the real db */
  97         const struct tdb_methods *io_methods;
  98
  99         /* the list of transaction blocks. When a block is first
 100            written to, it gets created in this list */
 101         uint8_t **blocks;
 102         size_t num_blocks;
 103         size_t last_block_size; /* number of valid bytes in the last block */
 104
 105         /* non-zero when an internal transaction error has
 106            occurred. All write operations will then fail until the
 107            transaction is ended */
 108         int transaction_error;
 109
 110         /* when inside a transaction we need to keep track of any
 111            nested tdb_transaction_start() calls, as these are allowed,
 112            but don't create a new transaction */
 113         int nesting;
 114
 115         /* set when a prepare has already occurred */
 116         bool prepared;
 117         tdb_off_t magic_offset;
 118
 119         /* old file size before transaction */
 120         tdb_len_t old_map_size;
 121 };
 122
 123
 124 /*
 125   read while in a transaction. We need to check first if the data is in our list
 126   of transaction elements, then if not do a real read
 127 */
 128 static enum TDB_ERROR transaction_read(struct tdb_context *tdb, tdb_off_t off,
 129                                        void *buf, tdb_len_t len)
 130 {
 131         size_t blk;
 132         enum TDB_ERROR ecode;
 133
 134         /* break it down into block sized ops */
 135         while (len + (off % getpagesize()) > getpagesize()) {
 136                 tdb_len_t len2 = getpagesize() - (off % getpagesize());
 137                 ecode = transaction_read(tdb, off, buf, len2);
 138                 if (ecode != TDB_SUCCESS) {
 139                         return ecode;
 140                 }
 141                 len -= len2;
 142                 off += len2;
 143                 buf = (void *)(len2 + (char *)buf);
 144         }
 145
 146         if (len == 0) {
 147                 return TDB_SUCCESS;
 148         }
 149
 150         blk = off / getpagesize();
 151
 152         /* see if we have it in the block list */
 153         if (tdb->transaction->num_blocks <= blk ||
 154             tdb->transaction->blocks[blk] == NULL) {
 155                 /* nope, do a real read */
 156                 ecode = tdb->transaction->io_methods->tread(tdb, off, buf, len);
 157                 if (ecode != TDB_SUCCESS) {
 158                         goto fail;
 159                 }
 160                 return 0;
 161         }
 162
 163         /* it is in the block list. Now check for the last block */
 164         if (blk == tdb->transaction->num_blocks-1) {
 165                 if (len > tdb->transaction->last_block_size) {
 166                         ecode = TDB_ERR_IO;
 167                         goto fail;
 168                 }
 169         }
 170
 171         /* now copy it out of this block */
 172         memcpy(buf, tdb->transaction->blocks[blk] + (off % getpagesize()), len);
 173         return TDB_SUCCESS;
 174
 175 fail:
 176         tdb->transaction->transaction_error = 1;
 177         return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
 178                           "transaction_read: failed at off=%zu len=%zu",
 179                           (size_t)off, (size_t)len);
 180 }
 181
 182
 183 /*
 184   write while in a transaction
 185 */
 186 static enum TDB_ERROR transaction_write(struct tdb_context *tdb, tdb_off_t off,
 187                                         const void *buf, tdb_len_t len)
 188 {
 189         size_t blk;
 190         enum TDB_ERROR ecode;
 191
 192         /* Only a commit is allowed on a prepared transaction */
 193         if (tdb->transaction->prepared) {
 194                 ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR,
 195                                    "transaction_write: transaction already"
 196                                    " prepared, write not allowed");
 197                 goto fail;
 198         }
 199
 200         /* break it up into block sized chunks */
 201         while (len + (off % getpagesize()) > getpagesize()) {
 202                 tdb_len_t len2 = getpagesize() - (off % getpagesize());
 203                 ecode = transaction_write(tdb, off, buf, len2);
 204                 if (ecode != TDB_SUCCESS) {
 205                         return -1;
 206                 }
 207                 len -= len2;
 208                 off += len2;
 209                 if (buf != NULL) {
 210                         buf = (const void *)(len2 + (const char *)buf);
 211                 }
 212         }
 213
 214         if (len == 0) {
 215                 return TDB_SUCCESS;
 216         }
 217
 218         blk = off / getpagesize();
 219         off = off % getpagesize();
 220
 221         if (tdb->transaction->num_blocks <= blk) {
 222                 uint8_t **new_blocks;
 223                 /* expand the blocks array */
 224                 if (tdb->transaction->blocks == NULL) {
 225                         new_blocks = (uint8_t **)malloc(
 226                                 (blk+1)*sizeof(uint8_t *));
 227                 } else {
 228                         new_blocks = (uint8_t **)realloc(
 229                                 tdb->transaction->blocks,
 230                                 (blk+1)*sizeof(uint8_t *));
 231                 }
 232                 if (new_blocks == NULL) {
 233                         ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
 234                                            "transaction_write:"
 235                                            " failed to allocate");
 236                         goto fail;
 237                 }
 238                 memset(&new_blocks[tdb->transaction->num_blocks], 0,
 239                        (1+(blk - tdb->transaction->num_blocks))*sizeof(uint8_t *));
 240                 tdb->transaction->blocks = new_blocks;
 241                 tdb->transaction->num_blocks = blk+1;
 242                 tdb->transaction->last_block_size = 0;
 243         }
 244
 245         /* allocate and fill a block? */
 246         if (tdb->transaction->blocks[blk] == NULL) {
 247                 tdb->transaction->blocks[blk] = (uint8_t *)calloc(getpagesize(), 1);
 248                 if (tdb->transaction->blocks[blk] == NULL) {
 249                         ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
 250                                            "transaction_write:"
 251                                            " failed to allocate");
 252                         goto fail;
 253                 }
 254                 if (tdb->transaction->old_map_size > blk * getpagesize()) {
 255                         tdb_len_t len2 = getpagesize();
 256                         if (len2 + (blk * getpagesize()) > tdb->transaction->old_map_size) {
 257                                 len2 = tdb->transaction->old_map_size - (blk * getpagesize());
 258                         }
 259                         ecode = tdb->transaction->io_methods->tread(tdb,
 260                                         blk * getpagesize(),
 261                                         tdb->transaction->blocks[blk],
 262                                         len2);
 263                         if (ecode != TDB_SUCCESS) {
 264                                 ecode = tdb_logerr(tdb, ecode,
 265                                                    TDB_LOG_ERROR,
 266                                                    "transaction_write:"
 267                                                    " failed to"
 268                                                    " read old block: %s",
 269                                                    strerror(errno));
 270                                 SAFE_FREE(tdb->transaction->blocks[blk]);
 271                                 goto fail;
 272                         }
 273                         if (blk == tdb->transaction->num_blocks-1) {
 274                                 tdb->transaction->last_block_size = len2;
 275                         }
 276                 }
 277         }
 278
 279         /* overwrite part of an existing block */
 280         if (buf == NULL) {
 281                 memset(tdb->transaction->blocks[blk] + off, 0, len);
 282         } else {
 283                 memcpy(tdb->transaction->blocks[blk] + off, buf, len);
 284         }
 285         if (blk == tdb->transaction->num_blocks-1) {
 286                 if (len + off > tdb->transaction->last_block_size) {
 287                         tdb->transaction->last_block_size = len + off;
 288                 }
 289         }
 290
 291         return TDB_SUCCESS;
 292
 293 fail:
 294         tdb->transaction->transaction_error = 1;
 295         return ecode;
 296 }
 297
 298
 299 /*
 300   write while in a transaction - this varient never expands the transaction blocks, it only
 301   updates existing blocks. This means it cannot change the recovery size
 302 */
 303 static void transaction_write_existing(struct tdb_context *tdb, tdb_off_t off,
 304                                        const void *buf, tdb_len_t len)
 305 {
 306         size_t blk;
 307
 308         /* break it up into block sized chunks */
 309         while (len + (off % getpagesize()) > getpagesize()) {
 310                 tdb_len_t len2 = getpagesize() - (off % getpagesize());
 311                 transaction_write_existing(tdb, off, buf, len2);
 312                 len -= len2;
 313                 off += len2;
 314                 if (buf != NULL) {
 315                         buf = (const void *)(len2 + (const char *)buf);
 316                 }
 317         }
 318
 319         if (len == 0) {
 320                 return;
 321         }
 322
 323         blk = off / getpagesize();
 324         off = off % getpagesize();
 325
 326         if (tdb->transaction->num_blocks <= blk ||
 327             tdb->transaction->blocks[blk] == NULL) {
 328                 return;
 329         }
 330
 331         if (blk == tdb->transaction->num_blocks-1 &&
 332             off + len > tdb->transaction->last_block_size) {
 333                 if (off >= tdb->transaction->last_block_size) {
 334                         return;
 335                 }
 336                 len = tdb->transaction->last_block_size - off;
 337         }
 338
 339         /* overwrite part of an existing block */
 340         memcpy(tdb->transaction->blocks[blk] + off, buf, len);
 341 }
 342
 343
 344 /*
 345   out of bounds check during a transaction
 346 */
 347 static enum TDB_ERROR transaction_oob(struct tdb_context *tdb, tdb_off_t len,
 348                                       bool probe)
 349 {
 350         if (len <= tdb->map_size) {
 351                 return TDB_SUCCESS;
 352         }
 353         if (!probe) {
 354                 tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
 355                            "tdb_oob len %lld beyond transaction size %lld",
 356                            (long long)len,
 357                            (long long)tdb->map_size);
 358         }
 359         return TDB_ERR_IO;
 360 }
 361
 362 /*
 363   transaction version of tdb_expand().
 364 */
 365 static enum TDB_ERROR transaction_expand_file(struct tdb_context *tdb,
 366                                               tdb_off_t addition)
 367 {
 368         enum TDB_ERROR ecode;
 369
 370         /* add a write to the transaction elements, so subsequent
 371            reads see the zero data */
 372         ecode = transaction_write(tdb, tdb->map_size, NULL, addition);
 373         if (ecode == TDB_SUCCESS) {
 374                 tdb->map_size += addition;
 375         }
 376         return ecode;
 377 }
 378
 379 static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off,
 380                                 size_t len, bool write_mode)
 381 {
 382         size_t blk = off / getpagesize(), end_blk;
 383
 384         /* This is wrong for zero-length blocks, but will fail gracefully */
 385         end_blk = (off + len - 1) / getpagesize();
 386
 387         /* Can only do direct if in single block and we've already copied. */
 388         if (write_mode) {
 389                 if (blk != end_blk)
 390                         return NULL;
 391                 if (blk >= tdb->transaction->num_blocks)
 392                         return NULL;
 393                 if (tdb->transaction->blocks[blk] == NULL)
 394                         return NULL;
 395                 return tdb->transaction->blocks[blk] + off % getpagesize();
 396         }
 397
 398         /* Single which we have copied? */
 399         if (blk == end_blk
 400             && blk < tdb->transaction->num_blocks
 401             && tdb->transaction->blocks[blk])
 402                 return tdb->transaction->blocks[blk] + off % getpagesize();
 403
 404         /* Otherwise must be all not copied. */
 405         while (blk < end_blk) {
 406                 if (blk >= tdb->transaction->num_blocks)
 407                         break;
 408                 if (tdb->transaction->blocks[blk])
 409                         return NULL;
 410                 blk++;
 411         }
 412         return tdb->transaction->io_methods->direct(tdb, off, len, false);
 413 }
 414
 415 static const struct tdb_methods transaction_methods = {
 416         transaction_read,
 417         transaction_write,
 418         transaction_oob,
 419         transaction_expand_file,
 420         transaction_direct,
 421 };
 422
 423 /*
 424   sync to disk
 425 */
 426 static enum TDB_ERROR transaction_sync(struct tdb_context *tdb,
 427                                        tdb_off_t offset, tdb_len_t length)
 428 {
 429         if (tdb->flags & TDB_NOSYNC) {
 430                 return TDB_SUCCESS;
 431         }
 432
 433         if (fsync(tdb->fd) != 0) {
 434                 return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
 435                                   "tdb_transaction: fsync failed: %s",
 436                                   strerror(errno));
 437         }
 438 #ifdef MS_SYNC
 439         if (tdb->map_ptr) {
 440                 tdb_off_t moffset = offset & ~(getpagesize()-1);
 441                 if (msync(moffset + (char *)tdb->map_ptr,
 442                           length + (offset - moffset), MS_SYNC) != 0) {
 443                         return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
 444                                           "tdb_transaction: msync failed: %s",
 445                                           strerror(errno));
 446                 }
 447         }
 448 #endif
 449         return TDB_SUCCESS;
 450 }
 451
 452
 453 static void _tdb_transaction_cancel(struct tdb_context *tdb)
 454 {
 455         int i;
 456         enum TDB_ERROR ecode;
 457
 458         if (tdb->transaction == NULL) {
 459                 tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
 460                            "tdb_transaction_cancel: no transaction");
 461                 return;
 462         }
 463
 464         if (tdb->transaction->nesting != 0) {
 465                 tdb->transaction->transaction_error = 1;
 466                 tdb->transaction->nesting--;
 467                 return;
 468         }
 469
 470         tdb->map_size = tdb->transaction->old_map_size;
 471
 472         /* free all the transaction blocks */
 473         for (i=0;i<tdb->transaction->num_blocks;i++) {
 474                 if (tdb->transaction->blocks[i] != NULL) {
 475                         free(tdb->transaction->blocks[i]);
 476                 }
 477         }
 478         SAFE_FREE(tdb->transaction->blocks);
 479
 480         if (tdb->transaction->magic_offset) {
 481                 const struct tdb_methods *methods = tdb->transaction->io_methods;
 482                 uint64_t invalid = TDB_RECOVERY_INVALID_MAGIC;
 483
 484                 /* remove the recovery marker */
 485                 ecode = methods->twrite(tdb, tdb->transaction->magic_offset,
 486                                         &invalid, sizeof(invalid));
 487                 if (ecode == TDB_SUCCESS)
 488                         ecode = transaction_sync(tdb,
 489                                                  tdb->transaction->magic_offset,
 490                                                  sizeof(invalid));
 491                 if (ecode != TDB_SUCCESS) {
 492                         tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
 493                                    "tdb_transaction_cancel: failed to remove"
 494                                    " recovery magic");
 495                 }
 496         }
 497
 498         if (tdb->allrecord_lock.count)
 499                 tdb_allrecord_unlock(tdb, tdb->allrecord_lock.ltype);
 500
 501         /* restore the normal io methods */
 502         tdb->methods = tdb->transaction->io_methods;
 503
 504         tdb_transaction_unlock(tdb, F_WRLCK);
 505
 506         if (tdb_has_open_lock(tdb))
 507                 tdb_unlock_open(tdb);
 508
 509         SAFE_FREE(tdb->transaction);
 510 }
 511
 512 /*
 513   start a tdb transaction. No token is returned, as only a single
 514   transaction is allowed to be pending per tdb_context
 515 */
 516 int tdb_transaction_start(struct tdb_context *tdb)
 517 {
 518         enum TDB_ERROR ecode;
 519
 520         /* some sanity checks */
 521         if (tdb->read_only || (tdb->flags & TDB_INTERNAL)) {
 522                 tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
 523                            "tdb_transaction_start: cannot start a transaction"
 524                            " on a read-only or internal db");
 525                 return -1;
 526         }
 527
 528         /* cope with nested tdb_transaction_start() calls */
 529         if (tdb->transaction != NULL) {
 530                 tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_USE_ERROR,
 531                            "tdb_transaction_start:"
 532                            " already inside transaction");
 533                 return -1;
 534         }
 535
 536         if (tdb_has_hash_locks(tdb)) {
 537                 /* the caller must not have any locks when starting a
 538                    transaction as otherwise we'll be screwed by lack
 539                    of nested locks in posix */
 540                 tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
 541                            "tdb_transaction_start: cannot start a transaction"
 542                            " with locks held");
 543                 return -1;
 544         }
 545
 546         tdb->transaction = (struct tdb_transaction *)
 547                 calloc(sizeof(struct tdb_transaction), 1);
 548         if (tdb->transaction == NULL) {
 549                 tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
 550                            "tdb_transaction_start: cannot allocate");
 551                 return -1;
 552         }
 553
 554         /* get the transaction write lock. This is a blocking lock. As
 555            discussed with Volker, there are a number of ways we could
 556            make this async, which we will probably do in the future */
 557         ecode = tdb_transaction_lock(tdb, F_WRLCK);
 558         if (ecode != TDB_SUCCESS) {
 559                 tdb->ecode = ecode;
 560                 SAFE_FREE(tdb->transaction->blocks);
 561                 SAFE_FREE(tdb->transaction);
 562                 return -1;
 563         }
 564
 565         /* get a read lock over entire file. This is upgraded to a write
 566            lock during the commit */
 567         ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true);
 568         if (ecode != TDB_SUCCESS) {
 569                 tdb->ecode = ecode;
 570                 goto fail_allrecord_lock;
 571         }
 572
 573         /* make sure we know about any file expansions already done by
 574            anyone else */
 575         tdb->methods->oob(tdb, tdb->map_size + 1, true);
 576         tdb->transaction->old_map_size = tdb->map_size;
 577
 578         /* finally hook the io methods, replacing them with
 579            transaction specific methods */
 580         tdb->transaction->io_methods = tdb->methods;
 581         tdb->methods = &transaction_methods;
 582         return 0;
 583
 584 fail_allrecord_lock:
 585         tdb_transaction_unlock(tdb, F_WRLCK);
 586         SAFE_FREE(tdb->transaction->blocks);
 587         SAFE_FREE(tdb->transaction);
 588         return -1;
 589 }
 590
 591
 592 /*
 593   cancel the current transaction
 594 */
 595 void tdb_transaction_cancel(struct tdb_context *tdb)
 596 {
 597         _tdb_transaction_cancel(tdb);
 598 }
 599
 600 /*
 601   work out how much space the linearised recovery data will consume
 602 */
 603 static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
 604 {
 605         tdb_len_t recovery_size = 0;
 606         int i;
 607
 608         recovery_size = sizeof(tdb_len_t);
 609         for (i=0;i<tdb->transaction->num_blocks;i++) {
 610                 if (i * getpagesize() >= tdb->transaction->old_map_size) {
 611                         break;
 612                 }
 613                 if (tdb->transaction->blocks[i] == NULL) {
 614                         continue;
 615                 }
 616                 recovery_size += 2*sizeof(tdb_off_t);
 617                 if (i == tdb->transaction->num_blocks-1) {
 618                         recovery_size += tdb->transaction->last_block_size;
 619                 } else {
 620                         recovery_size += getpagesize();
 621                 }
 622         }
 623
 624         return recovery_size;
 625 }
 626
 627 /*
 628   allocate the recovery area, or use an existing recovery area if it is
 629   large enough
 630 */
 631 static enum TDB_ERROR tdb_recovery_allocate(struct tdb_context *tdb,
 632                                             tdb_len_t *recovery_size,
 633                                             tdb_off_t *recovery_offset,
 634                                             tdb_len_t *recovery_max_size)
 635 {
 636         struct tdb_recovery_record rec;
 637         const struct tdb_methods *methods = tdb->transaction->io_methods;
 638         tdb_off_t recovery_head;
 639         size_t addition;
 640         enum TDB_ERROR ecode;
 641
 642         recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
 643         if (TDB_OFF_IS_ERR(recovery_head)) {
 644                 return tdb_logerr(tdb, recovery_head, TDB_LOG_ERROR,
 645                                   "tdb_recovery_allocate:"
 646                                   " failed to read recovery head");
 647         }
 648
 649         if (recovery_head != 0) {
 650                 ecode = methods->tread(tdb, recovery_head, &rec, sizeof(rec));
 651                 if (ecode != TDB_SUCCESS) {
 652                         return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
 653                                           "tdb_recovery_allocate:"
 654                                           " failed to read recovery record");
 655                 }
 656                 tdb_convert(tdb, &rec, sizeof(rec));
 657                 /* ignore invalid recovery regions: can happen in crash */
 658                 if (rec.magic != TDB_RECOVERY_MAGIC &&
 659                     rec.magic != TDB_RECOVERY_INVALID_MAGIC) {
 660                         recovery_head = 0;
 661                 }
 662         }
 663
 664         *recovery_size = tdb_recovery_size(tdb);
 665
 666         if (recovery_head != 0 && *recovery_size <= rec.max_len) {
 667                 /* it fits in the existing area */
 668                 *recovery_max_size = rec.max_len;
 669                 *recovery_offset = recovery_head;
 670                 return TDB_SUCCESS;
 671         }
 672
 673         /* we need to free up the old recovery area, then allocate a
 674            new one at the end of the file. Note that we cannot use
 675            normal allocation to allocate the new one as that might return
 676            us an area that is being currently used (as of the start of
 677            the transaction) */
 678         if (recovery_head != 0) {
 679                 add_stat(tdb, frees, 1);
 680                 ecode = add_free_record(tdb, recovery_head,
 681                                         sizeof(rec) + rec.max_len);
 682                 if (ecode != TDB_SUCCESS) {
 683                         return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
 684                                           "tdb_recovery_allocate:"
 685                                           " failed to free previous"
 686                                           " recovery area");
 687                 }
 688         }
 689
 690         /* the tdb_free() call might have increased the recovery size */
 691         *recovery_size = tdb_recovery_size(tdb);
 692
 693         /* round up to a multiple of page size */
 694         *recovery_max_size
 695                 = (((sizeof(rec) + *recovery_size) + getpagesize()-1)
 696                    & ~(getpagesize()-1))
 697                 - sizeof(rec);
 698         *recovery_offset = tdb->map_size;
 699         recovery_head = *recovery_offset;
 700
 701         /* Restore ->map_size before calling underlying expand_file.
 702            Also so that we don't try to expand the file again in the
 703            transaction commit, which would destroy the recovery
 704            area */
 705         addition = (tdb->map_size - tdb->transaction->old_map_size) +
 706                 sizeof(rec) + *recovery_max_size;
 707         tdb->map_size = tdb->transaction->old_map_size;
 708         ecode = methods->expand_file(tdb, addition);
 709         if (ecode != TDB_SUCCESS) {
 710                 return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
 711                                   "tdb_recovery_allocate:"
 712                                   " failed to create recovery area");
 713         }
 714
 715         /* we have to reset the old map size so that we don't try to
 716            expand the file again in the transaction commit, which
 717            would destroy the recovery area */
 718         tdb->transaction->old_map_size = tdb->map_size;
 719
 720         /* write the recovery header offset and sync - we can sync without a race here
 721            as the magic ptr in the recovery record has not been set */
 722         tdb_convert(tdb, &recovery_head, sizeof(recovery_head));
 723         ecode = methods->twrite(tdb, offsetof(struct tdb_header, recovery),
 724                                 &recovery_head, sizeof(tdb_off_t));
 725         if (ecode != TDB_SUCCESS) {
 726                 return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
 727                                   "tdb_recovery_allocate:"
 728                                   " failed to write recovery head");
 729         }
 730         transaction_write_existing(tdb, offsetof(struct tdb_header, recovery),
 731                                    &recovery_head,
 732                                    sizeof(tdb_off_t));
 733         return TDB_SUCCESS;
 734 }
 735
 736 /* Set up header for the recovery record. */
 737 static void set_recovery_header(struct tdb_recovery_record *rec,
 738                                 uint64_t magic,
 739                                 uint64_t datalen, uint64_t actuallen,
 740                                 uint64_t oldsize)
 741 {
 742         rec->magic = magic;
 743         rec->max_len = actuallen;
 744         rec->len = datalen;
 745         rec->eof = oldsize;
 746 }
 747
 748 /*
 749   setup the recovery data that will be used on a crash during commit
 750 */
 751 static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb,
 752                                                  tdb_off_t *magic_offset)
 753 {
 754         tdb_len_t recovery_size;
 755         unsigned char *data, *p;
 756         const struct tdb_methods *methods = tdb->transaction->io_methods;
 757         struct tdb_recovery_record *rec;
 758         tdb_off_t recovery_offset, recovery_max_size;
 759         tdb_off_t old_map_size = tdb->transaction->old_map_size;
 760         uint64_t magic, tailer;
 761         int i;
 762         enum TDB_ERROR ecode;
 763
 764         /*
 765           check that the recovery area has enough space
 766         */
 767         ecode = tdb_recovery_allocate(tdb, &recovery_size,
 768                                       &recovery_offset, &recovery_max_size);
 769         if (ecode != TDB_SUCCESS) {
 770                 return ecode;
 771         }
 772
 773         data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
 774         if (data == NULL) {
 775                 return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
 776                                   "transaction_setup_recovery:"
 777                                   " cannot allocate");
 778         }
 779
 780         rec = (struct tdb_recovery_record *)data;
 781         set_recovery_header(rec, TDB_RECOVERY_INVALID_MAGIC,
 782                             recovery_size, recovery_max_size, old_map_size);
 783         tdb_convert(tdb, rec, sizeof(*rec));
 784
 785         /* build the recovery data into a single blob to allow us to do a single
 786            large write, which should be more efficient */
 787         p = data + sizeof(*rec);
 788         for (i=0;i<tdb->transaction->num_blocks;i++) {
 789                 tdb_off_t offset;
 790                 tdb_len_t length;
 791
 792                 if (tdb->transaction->blocks[i] == NULL) {
 793                         continue;
 794                 }
 795
 796                 offset = i * getpagesize();
 797                 length = getpagesize();
 798                 if (i == tdb->transaction->num_blocks-1) {
 799                         length = tdb->transaction->last_block_size;
 800                 }
 801
 802                 if (offset >= old_map_size) {
 803                         continue;
 804                 }
 805                 if (offset + length > tdb->map_size) {
 806                         free(data);
 807                         return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
 808                                           "tdb_transaction_setup_recovery:"
 809                                           " transaction data over new region"
 810                                           " boundary");
 811                 }
 812                 memcpy(p, &offset, sizeof(offset));
 813                 memcpy(p + sizeof(offset), &length, sizeof(length));
 814                 tdb_convert(tdb, p, sizeof(offset) + sizeof(length));
 815
 816                 /* the recovery area contains the old data, not the
 817                    new data, so we have to call the original tdb_read
 818                    method to get it */
 819                 ecode = methods->tread(tdb, offset,
 820                                        p + sizeof(offset) + sizeof(length),
 821                                        length);
 822                 if (ecode != TDB_SUCCESS) {
 823                         free(data);
 824                         return ecode;
 825                 }
 826                 p += sizeof(offset) + sizeof(length) + length;
 827         }
 828
 829         /* and the tailer */
 830         tailer = sizeof(*rec) + recovery_max_size;
 831         memcpy(p, &tailer, sizeof(tailer));
 832         tdb_convert(tdb, p, sizeof(tailer));
 833
 834         /* write the recovery data to the recovery area */
 835         ecode = methods->twrite(tdb, recovery_offset, data,
 836                                 sizeof(*rec) + recovery_size);
 837         if (ecode != TDB_SUCCESS) {
 838                 free(data);
 839                 return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
 840                                   "tdb_transaction_setup_recovery:"
 841                                   " failed to write recovery data");
 842         }
 843         transaction_write_existing(tdb, recovery_offset, data,
 844                                    sizeof(*rec) + recovery_size);
 845
 846         /* as we don't have ordered writes, we have to sync the recovery
 847            data before we update the magic to indicate that the recovery
 848            data is present */
 849         ecode = transaction_sync(tdb, recovery_offset,
 850                                  sizeof(*rec) + recovery_size);
 851         if (ecode != TDB_SUCCESS) {
 852                 free(data);
 853                 return ecode;
 854         }
 855
 856         free(data);
 857
 858         magic = TDB_RECOVERY_MAGIC;
 859         tdb_convert(tdb, &magic, sizeof(magic));
 860
 861         *magic_offset = recovery_offset + offsetof(struct tdb_recovery_record,
 862                                                    magic);
 863
 864         ecode = methods->twrite(tdb, *magic_offset, &magic, sizeof(magic));
 865         if (ecode != TDB_SUCCESS) {
 866                 return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
 867                                   "tdb_transaction_setup_recovery:"
 868                                   " failed to write recovery magic");
 869         }
 870         transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic));
 871
 872         /* ensure the recovery magic marker is on disk */
 873         return transaction_sync(tdb, *magic_offset, sizeof(magic));
 874 }
 875
 876 static enum TDB_ERROR _tdb_transaction_prepare_commit(struct tdb_context *tdb)
 877 {
 878         const struct tdb_methods *methods;
 879         enum TDB_ERROR ecode;
 880
 881         if (tdb->transaction == NULL) {
 882                 return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
 883                                   "tdb_transaction_prepare_commit:"
 884                                   " no transaction");
 885         }
 886
 887         if (tdb->transaction->prepared) {
 888                 _tdb_transaction_cancel(tdb);
 889                 return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
 890                                   "tdb_transaction_prepare_commit:"
 891                                   " transaction already prepared");
 892         }
 893
 894         if (tdb->transaction->transaction_error) {
 895                 _tdb_transaction_cancel(tdb);
 896                 return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR,
 897                                   "tdb_transaction_prepare_commit:"
 898                                   " transaction error pending");
 899         }
 900
 901
 902         if (tdb->transaction->nesting != 0) {
 903                 tdb->transaction->nesting--;
 904                 return TDB_SUCCESS;
 905         }
 906
 907         /* check for a null transaction */
 908         if (tdb->transaction->blocks == NULL) {
 909                 return TDB_SUCCESS;
 910         }
 911
 912         methods = tdb->transaction->io_methods;
 913
 914         /* upgrade the main transaction lock region to a write lock */
 915         ecode = tdb_allrecord_upgrade(tdb);
 916         if (ecode != TDB_SUCCESS) {
 917                 tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
 918                          "tdb_transaction_prepare_commit:"
 919                          " failed to upgrade hash locks");
 920                 _tdb_transaction_cancel(tdb);
 921                 return ecode;
 922         }
 923
 924         /* get the open lock - this prevents new users attaching to the database
 925            during the commit */
 926         ecode = tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK);
 927         if (ecode != TDB_SUCCESS) {
 928                 tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
 929                            "tdb_transaction_prepare_commit:"
 930                            " failed to get open lock");
 931                 _tdb_transaction_cancel(tdb);
 932                 return ecode;
 933         }
 934
 935         /* Since we have whole db locked, we don't need the expansion lock. */
 936         if (!(tdb->flags & TDB_NOSYNC)) {
 937                 /* write the recovery data to the end of the file */
 938                 ecode = transaction_setup_recovery(tdb,
 939                                                    &tdb->transaction
 940                                                    ->magic_offset);
 941                 if (ecode != TDB_SUCCESS) {
 942                         tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
 943                                  "tdb_transaction_prepare_commit:"
 944                                  " failed to setup recovery data");
 945                         _tdb_transaction_cancel(tdb);
 946                         return ecode;
 947                 }
 948         }
 949
 950         tdb->transaction->prepared = true;
 951
 952         /* expand the file to the new size if needed */
 953         if (tdb->map_size != tdb->transaction->old_map_size) {
 954                 tdb_len_t add = tdb->map_size - tdb->transaction->old_map_size;
 955                 /* Restore original map size for tdb_expand_file */
 956                 tdb->map_size = tdb->transaction->old_map_size;
 957                 ecode = methods->expand_file(tdb, add);
 958                 if (ecode != TDB_SUCCESS) {
 959                         tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
 960                                  "tdb_transaction_prepare_commit:"
 961                                  " expansion failed");
 962                         _tdb_transaction_cancel(tdb);
 963                         return ecode;
 964                 }
 965         }
 966
 967         /* Keep the open lock until the actual commit */
 968         return TDB_SUCCESS;
 969 }
 970
 971 /*
 972    prepare to commit the current transaction
 973 */
 974 int tdb_transaction_prepare_commit(struct tdb_context *tdb)
 975 {
 976         tdb->ecode = _tdb_transaction_prepare_commit(tdb);
 977         if (tdb->ecode != TDB_SUCCESS)
 978                 return -1;
 979         return 0;
 980 }
 981
 982 /*
 983   commit the current transaction
 984 */
 985 int tdb_transaction_commit(struct tdb_context *tdb)
 986 {
 987         const struct tdb_methods *methods;
 988         int i;
 989         enum TDB_ERROR ecode;
 990
 991         if (tdb->transaction == NULL) {
 992                 tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
 993                          "tdb_transaction_commit: no transaction");
 994                 return -1;
 995         }
 996
 997         tdb_trace(tdb, "tdb_transaction_commit");
 998
 999         if (tdb->transaction->nesting != 0) {
1000                 tdb->transaction->nesting--;
1001                 return 0;
1002         }
1003
1004         /* check for a null transaction */
1005         if (tdb->transaction->blocks == NULL) {
1006                 _tdb_transaction_cancel(tdb);
1007                 return 0;
1008         }
1009
1010         if (!tdb->transaction->prepared) {
1011                 tdb->ecode = _tdb_transaction_prepare_commit(tdb);
1012                 if (tdb->ecode != TDB_SUCCESS)
1013                         return -1;
1014         }
1015
1016         methods = tdb->transaction->io_methods;
1017
1018         /* perform all the writes */
1019         for (i=0;i<tdb->transaction->num_blocks;i++) {
1020                 tdb_off_t offset;
1021                 tdb_len_t length;
1022
1023                 if (tdb->transaction->blocks[i] == NULL) {
1024                         continue;
1025                 }
1026
1027                 offset = i * getpagesize();
1028                 length = getpagesize();
1029                 if (i == tdb->transaction->num_blocks-1) {
1030                         length = tdb->transaction->last_block_size;
1031                 }
1032
1033                 ecode = methods->twrite(tdb, offset,
1034                                         tdb->transaction->blocks[i], length);
1035                 if (ecode != TDB_SUCCESS) {
1036                         tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1037                                    "tdb_transaction_commit:"
1038                                    " write failed during commit");
1039
1040                         /* we've overwritten part of the data and
1041                            possibly expanded the file, so we need to
1042                            run the crash recovery code */
1043                         tdb->methods = methods;
1044                         tdb_transaction_recover(tdb);
1045
1046                         _tdb_transaction_cancel(tdb);
1047
1048                         return -1;
1049                 }
1050                 SAFE_FREE(tdb->transaction->blocks[i]);
1051         }
1052
1053         SAFE_FREE(tdb->transaction->blocks);
1054         tdb->transaction->num_blocks = 0;
1055
1056         /* ensure the new data is on disk */
1057         ecode = transaction_sync(tdb, 0, tdb->map_size);
1058         if (ecode != TDB_SUCCESS) {
1059                 tdb->ecode = ecode;
1060                 return -1;
1061         }
1062
1063         /*
1064           TODO: maybe write to some dummy hdr field, or write to magic
1065           offset without mmap, before the last sync, instead of the
1066           utime() call
1067         */
1068
1069         /* on some systems (like Linux 2.6.x) changes via mmap/msync
1070            don't change the mtime of the file, this means the file may
1071            not be backed up (as tdb rounding to block sizes means that
1072            file size changes are quite rare too). The following forces
1073            mtime changes when a transaction completes */
1074 #if HAVE_UTIME
1075         utime(tdb->name, NULL);
1076 #endif
1077
1078         /* use a transaction cancel to free memory and remove the
1079            transaction locks */
1080         _tdb_transaction_cancel(tdb);
1081
1082         return 0;
1083 }
1084
1085
1086 /*
1087   recover from an aborted transaction. Must be called with exclusive
1088   database write access already established (including the open
1089   lock to prevent new processes attaching)
1090 */
1091 enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb)
1092 {
1093         tdb_off_t recovery_head, recovery_eof;
1094         unsigned char *data, *p;
1095         struct tdb_recovery_record rec;
1096         enum TDB_ERROR ecode;
1097
1098         /* find the recovery area */
1099         recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
1100         if (TDB_OFF_IS_ERR(recovery_head)) {
1101                 return tdb_logerr(tdb, recovery_head, TDB_LOG_ERROR,
1102                                   "tdb_transaction_recover:"
1103                                   " failed to read recovery head");
1104         }
1105
1106         if (recovery_head == 0) {
1107                 /* we have never allocated a recovery record */
1108                 return TDB_SUCCESS;
1109         }
1110
1111         /* read the recovery record */
1112         ecode = tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec));
1113         if (ecode != TDB_SUCCESS) {
1114                 return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1115                                   "tdb_transaction_recover:"
1116                                   " failed to read recovery record");
1117         }
1118
1119         if (rec.magic != TDB_RECOVERY_MAGIC) {
1120                 /* there is no valid recovery data */
1121                 return TDB_SUCCESS;
1122         }
1123
1124         if (tdb->read_only) {
1125                 return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
1126                                   "tdb_transaction_recover:"
1127                                   " attempt to recover read only database");
1128         }
1129
1130         recovery_eof = rec.eof;
1131
1132         data = (unsigned char *)malloc(rec.len);
1133         if (data == NULL) {
1134                 return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
1135                                   "tdb_transaction_recover:"
1136                                   " failed to allocate recovery data");
1137         }
1138
1139         /* read the full recovery data */
1140         ecode = tdb->methods->tread(tdb, recovery_head + sizeof(rec), data,
1141                                     rec.len);
1142         if (ecode != TDB_SUCCESS) {
1143                 return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1144                                   "tdb_transaction_recover:"
1145                                   " failed to read recovery data");
1146         }
1147
1148         /* recover the file data */
1149         p = data;
1150         while (p+sizeof(tdb_off_t)+sizeof(tdb_len_t) < data + rec.len) {
1151                 tdb_off_t ofs;
1152                 tdb_len_t len;
1153                 tdb_convert(tdb, p, sizeof(ofs) + sizeof(len));
1154                 memcpy(&ofs, p, sizeof(ofs));
1155                 memcpy(&len, p + sizeof(ofs), sizeof(len));
1156                 p += sizeof(ofs) + sizeof(len);
1157
1158                 ecode = tdb->methods->twrite(tdb, ofs, p, len);
1159                 if (ecode != TDB_SUCCESS) {
1160                         free(data);
1161                         return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1162                                           "tdb_transaction_recover:"
1163                                           " failed to recover %zu bytes"
1164                                           " at offset %zu",
1165                                           (size_t)len, (size_t)ofs);
1166                 }
1167                 p += len;
1168         }
1169
1170         free(data);
1171
1172         ecode = transaction_sync(tdb, 0, tdb->map_size);
1173         if (ecode != TDB_SUCCESS) {
1174                 return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1175                                   "tdb_transaction_recover:"
1176                                   " failed to sync recovery");
1177         }
1178
1179         /* if the recovery area is after the recovered eof then remove it */
1180         if (recovery_eof <= recovery_head) {
1181                 ecode = tdb_write_off(tdb, offsetof(struct tdb_header,
1182                                                     recovery),
1183                                       0);
1184                 if (ecode != TDB_SUCCESS) {
1185                         return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1186                                           "tdb_transaction_recover:"
1187                                           " failed to remove recovery head");
1188                 }
1189         }
1190
1191         /* remove the recovery magic */
1192         ecode = tdb_write_off(tdb,
1193                               recovery_head
1194                               + offsetof(struct tdb_recovery_record, magic),
1195                               TDB_RECOVERY_INVALID_MAGIC);
1196         if (ecode != TDB_SUCCESS) {
1197                 return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1198                                   "tdb_transaction_recover:"
1199                                   " failed to remove recovery magic");
1200         }
1201
1202         ecode = transaction_sync(tdb, 0, recovery_eof);
1203         if (ecode != TDB_SUCCESS) {
1204                 return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1205                                   "tdb_transaction_recover:"
1206                                   " failed to sync2 recovery");
1207         }
1208
1209         tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
1210                    "tdb_transaction_recover: recovered %zu byte database",
1211                    (size_t)recovery_eof);
1212
1213         /* all done */
1214         return TDB_SUCCESS;
1215 }
1216
1217 tdb_bool_err tdb_needs_recovery(struct tdb_context *tdb)
1218 {
1219         tdb_off_t recovery_head;
1220         struct tdb_recovery_record rec;
1221         enum TDB_ERROR ecode;
1222
1223         /* find the recovery area */
1224         recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
1225         if (TDB_OFF_IS_ERR(recovery_head)) {
1226                 return recovery_head;
1227         }
1228
1229         if (recovery_head == 0) {
1230                 /* we have never allocated a recovery record */
1231                 return false;
1232         }
1233
1234         /* read the recovery record */
1235         ecode = tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec));
1236         if (ecode != TDB_SUCCESS) {
1237                 return ecode;
1238         }
1239
1240         return (rec.magic == TDB_RECOVERY_MAGIC);
1241 }