git.ozlabs.org Git - ccan/blob - ccan/tdb2/lock.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
  28 #include "private.h"
  29 #include <assert.h>
  30 #include <ccan/build_assert/build_assert.h>
  31
  32 /* If we were threaded, we could wait for unlock, but we're not, so fail. */
  33 static enum TDB_ERROR owner_conflict(struct tdb_context *tdb, const char *call)
  34 {
  35         return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
  36                           "%s: lock owned by another tdb in this process.",
  37                           call);
  38 }
  39
  40 /* If we fork, we no longer really own locks. */
  41 static bool check_lock_pid(struct tdb_context *tdb,
  42                            const char *call, bool log)
  43 {
  44         /* No locks?  No problem! */
  45         if (tdb->file->allrecord_lock.count == 0
  46             && tdb->file->num_lockrecs == 0) {
  47                 return true;
  48         }
  49
  50         /* No fork?  No problem! */
  51         if (tdb->file->locker == getpid()) {
  52                 return true;
  53         }
  54
  55         if (log) {
  56                 tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
  57                            "%s: fork() detected after lock acquisition!"
  58                            " (%u vs %u)", call, tdb->file->locker, getpid());
  59         }
  60         return false;
  61 }
  62
  63 static int fcntl_lock(struct tdb_context *tdb,
  64                       int rw, off_t off, off_t len, bool waitflag)
  65 {
  66         struct flock fl;
  67
  68         fl.l_type = rw;
  69         fl.l_whence = SEEK_SET;
  70         fl.l_start = off;
  71         fl.l_len = len;
  72         fl.l_pid = 0;
  73
  74         if (tdb->file->allrecord_lock.count == 0
  75             && tdb->file->num_lockrecs == 0) {
  76                 tdb->file->locker = getpid();
  77         }
  78
  79         add_stat(tdb, lock_lowlevel, 1);
  80         if (waitflag)
  81                 return fcntl(tdb->file->fd, F_SETLKW, &fl);
  82         else {
  83                 add_stat(tdb, lock_nonblock, 1);
  84                 return fcntl(tdb->file->fd, F_SETLK, &fl);
  85         }
  86 }
  87
  88 static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
  89 {
  90         struct flock fl;
  91 #if 0 /* Check they matched up locks and unlocks correctly. */
  92         char line[80];
  93         FILE *locks;
  94         bool found = false;
  95
  96         locks = fopen("/proc/locks", "r");
  97
  98         while (fgets(line, 80, locks)) {
  99                 char *p;
 100                 int type, start, l;
 101
 102                 /* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
 103                 p = strchr(line, ':') + 1;
 104                 if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
 105                         continue;
 106                 p += strlen(" FLOCK  ADVISORY  ");
 107                 if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
 108                         type = F_RDLCK;
 109                 else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
 110                         type = F_WRLCK;
 111                 else
 112                         abort();
 113                 p += 6;
 114                 if (atoi(p) != getpid())
 115                         continue;
 116                 p = strchr(strchr(p, ' ') + 1, ' ') + 1;
 117                 start = atoi(p);
 118                 p = strchr(p, ' ') + 1;
 119                 if (strncmp(p, "EOF", 3) == 0)
 120                         l = 0;
 121                 else
 122                         l = atoi(p) - start + 1;
 123
 124                 if (off == start) {
 125                         if (len != l) {
 126                                 fprintf(stderr, "Len %u should be %u: %s",
 127                                         (int)len, l, line);
 128                                 abort();
 129                         }
 130                         if (type != rw) {
 131                                 fprintf(stderr, "Type %s wrong: %s",
 132                                         rw == F_RDLCK ? "READ" : "WRITE", line);
 133                                 abort();
 134                         }
 135                         found = true;
 136                         break;
 137                 }
 138         }
 139
 140         if (!found) {
 141                 fprintf(stderr, "Unlock on %u@%u not found!",
 142                         (int)off, (int)len);
 143                 abort();
 144         }
 145
 146         fclose(locks);
 147 #endif
 148
 149         fl.l_type = F_UNLCK;
 150         fl.l_whence = SEEK_SET;
 151         fl.l_start = off;
 152         fl.l_len = len;
 153         fl.l_pid = 0;
 154
 155         return fcntl(tdb->file->fd, F_SETLKW, &fl);
 156 }
 157
 158 /* a byte range locking function - return 0 on success
 159    this functions locks len bytes at the specified offset.
 160
 161    note that a len of zero means lock to end of file
 162 */
 163 static enum TDB_ERROR tdb_brlock(struct tdb_context *tdb,
 164                                  int rw_type, tdb_off_t offset, tdb_off_t len,
 165                                  enum tdb_lock_flags flags)
 166 {
 167         int ret;
 168
 169         if (tdb->flags & TDB_NOLOCK) {
 170                 return TDB_SUCCESS;
 171         }
 172
 173         if (rw_type == F_WRLCK && tdb->read_only) {
 174                 return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
 175                                   "Write lock attempted on read-only database");
 176         }
 177
 178         /* A 32 bit system cannot open a 64-bit file, but it could have
 179          * expanded since then: check here. */
 180         if ((size_t)(offset + len) != offset + len) {
 181                 return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
 182                                   "tdb_brlock: lock on giant offset %llu",
 183                                   (long long)(offset + len));
 184         }
 185
 186         do {
 187                 ret = fcntl_lock(tdb, rw_type, offset, len,
 188                                  flags & TDB_LOCK_WAIT);
 189         } while (ret == -1 && errno == EINTR);
 190
 191         if (ret == -1) {
 192                 /* Generic lock error. errno set by fcntl.
 193                  * EAGAIN is an expected return from non-blocking
 194                  * locks. */
 195                 if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
 196                         tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
 197                                    "tdb_brlock failed (fd=%d) at"
 198                                    " offset %zu rw_type=%d flags=%d len=%zu:"
 199                                    " %s",
 200                                    tdb->file->fd, (size_t)offset, rw_type,
 201                                    flags, (size_t)len, strerror(errno));
 202                 }
 203                 return TDB_ERR_LOCK;
 204         }
 205         return TDB_SUCCESS;
 206 }
 207
 208 static enum TDB_ERROR tdb_brunlock(struct tdb_context *tdb,
 209                                    int rw_type, tdb_off_t offset, size_t len)
 210 {
 211         int ret;
 212
 213         if (tdb->flags & TDB_NOLOCK) {
 214                 return TDB_SUCCESS;
 215         }
 216
 217         do {
 218                 ret = fcntl_unlock(tdb, rw_type, offset, len);
 219         } while (ret == -1 && errno == EINTR);
 220
 221         /* If we fail, *then* we verify that we owned the lock.  If not, ok. */
 222         if (ret == -1 && check_lock_pid(tdb, "tdb_brunlock", false)) {
 223                 return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
 224                                   "tdb_brunlock failed (fd=%d) at offset %zu"
 225                                   " rw_type=%d len=%zu",
 226                                   tdb->file->fd, (size_t)offset, rw_type,
 227                                   (size_t)len);
 228         }
 229         return TDB_SUCCESS;
 230 }
 231
 232 /*
 233   upgrade a read lock to a write lock. This needs to be handled in a
 234   special way as some OSes (such as solaris) have too conservative
 235   deadlock detection and claim a deadlock when progress can be
 236   made. For those OSes we may loop for a while.
 237 */
 238 enum TDB_ERROR tdb_allrecord_upgrade(struct tdb_context *tdb)
 239 {
 240         int count = 1000;
 241
 242         if (!check_lock_pid(tdb, "tdb_transaction_prepare_commit", true))
 243                 return TDB_ERR_LOCK;
 244
 245         if (tdb->file->allrecord_lock.count != 1) {
 246                 return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
 247                                   "tdb_allrecord_upgrade failed:"
 248                                   " count %u too high",
 249                                   tdb->file->allrecord_lock.count);
 250         }
 251
 252         if (tdb->file->allrecord_lock.off != 1) {
 253                 return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
 254                                   "tdb_allrecord_upgrade failed:"
 255                                   " already upgraded?");
 256         }
 257
 258         if (tdb->file->allrecord_lock.owner != tdb) {
 259                 return owner_conflict(tdb, "tdb_allrecord_upgrade");
 260         }
 261
 262         while (count--) {
 263                 struct timeval tv;
 264                 if (tdb_brlock(tdb, F_WRLCK,
 265                                TDB_HASH_LOCK_START, 0,
 266                                TDB_LOCK_WAIT|TDB_LOCK_PROBE) == TDB_SUCCESS) {
 267                         tdb->file->allrecord_lock.ltype = F_WRLCK;
 268                         tdb->file->allrecord_lock.off = 0;
 269                         return TDB_SUCCESS;
 270                 }
 271                 if (errno != EDEADLK) {
 272                         break;
 273                 }
 274                 /* sleep for as short a time as we can - more portable than usleep() */
 275                 tv.tv_sec = 0;
 276                 tv.tv_usec = 1;
 277                 select(0, NULL, NULL, NULL, &tv);
 278         }
 279         return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
 280                           "tdb_allrecord_upgrade failed");
 281 }
 282
 283 static struct tdb_lock *find_nestlock(struct tdb_context *tdb, tdb_off_t offset,
 284                                       const struct tdb_context *owner)
 285 {
 286         unsigned int i;
 287
 288         for (i=0; i<tdb->file->num_lockrecs; i++) {
 289                 if (tdb->file->lockrecs[i].off == offset) {
 290                         if (owner && tdb->file->lockrecs[i].owner != owner)
 291                                 return NULL;
 292                         return &tdb->file->lockrecs[i];
 293                 }
 294         }
 295         return NULL;
 296 }
 297
 298 enum TDB_ERROR tdb_lock_and_recover(struct tdb_context *tdb)
 299 {
 300         enum TDB_ERROR ecode;
 301
 302         if (!check_lock_pid(tdb, "tdb_transaction_prepare_commit", true))
 303                 return TDB_ERR_LOCK;
 304
 305         ecode = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK,
 306                                    false);
 307         if (ecode != TDB_SUCCESS) {
 308                 return ecode;
 309         }
 310
 311         ecode = tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK);
 312         if (ecode != TDB_SUCCESS) {
 313                 tdb_allrecord_unlock(tdb, F_WRLCK);
 314                 return ecode;
 315         }
 316         ecode = tdb_transaction_recover(tdb);
 317         tdb_unlock_open(tdb);
 318         tdb_allrecord_unlock(tdb, F_WRLCK);
 319
 320         return ecode;
 321 }
 322
/*
 * Lock an offset in the database, with nesting.
 *
 * Held locks are recorded in tdb->file->lockrecs.  If a record for 'offset'
 * already exists and belongs to this context, only its count is bumped
 * (asking for F_WRLCK on top of a recorded F_RDLCK is refused).  Otherwise a
 * one-byte fcntl lock is taken through tdb_brlock() and a new record with
 * count 1 is appended.
 *
 * Unless 'flags' contains TDB_LOCK_NOCHECK, taking the first lock (no
 * existing lockrecs) also consults tdb_needs_recovery(); when recovery is
 * needed the byte is unlocked, tdb_lock_and_recover() is run, and the byte
 * is locked again.
 *
 * Returns TDB_SUCCESS (also when TDB_NOLOCK is set), TDB_ERR_LOCK if the
 * fork check fails, or an error from tdb_logerr(), owner_conflict(),
 * tdb_brlock(), tdb_needs_recovery() or tdb_lock_and_recover().
 */
static enum TDB_ERROR tdb_nest_lock(struct tdb_context *tdb,
				    tdb_off_t offset, int ltype,
				    enum tdb_lock_flags flags)
{
	struct tdb_lock *new_lck;
	enum TDB_ERROR ecode;

	/* Reject offsets beyond the hash lock range plus one lock byte per
	 * 8 bytes of mapped file. */
	if (offset > (TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE
		      + tdb->file->map_size / 8)) {
		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
				  "tdb_nest_lock: invalid offset %zu ltype=%d",
				  (size_t)offset, ltype);
	}

	if (tdb->flags & TDB_NOLOCK)
		return TDB_SUCCESS;

	if (!check_lock_pid(tdb, "tdb_nest_lock", true)) {
		return TDB_ERR_LOCK;
	}

	add_stat(tdb, locks, 1);

	/* Look for an existing record at this offset, whoever owns it. */
	new_lck = find_nestlock(tdb, offset, NULL);
	if (new_lck) {
		if (new_lck->owner != tdb) {
			return owner_conflict(tdb, "tdb_nest_lock");
		}

		if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
			return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
					  "tdb_nest_lock:"
					  " offset %zu has read lock",
					  (size_t)offset);
		}
		/* Just increment the struct, posix locks don't stack. */
		new_lck->count++;
		return TDB_SUCCESS;
	}

	/* A new lock inside the hash range is refused while any other lock
	 * record exists. */
	if (tdb->file->num_lockrecs
	    && offset >= TDB_HASH_LOCK_START
	    && offset < TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE) {
		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
				  "tdb_nest_lock: already have a hash lock?");
	}

	/* Grow the record array before touching the kernel lock, so no
	 * allocation can fail once the lock is held. */
	new_lck = (struct tdb_lock *)realloc(
		tdb->file->lockrecs,
		sizeof(*tdb->file->lockrecs) * (tdb->file->num_lockrecs+1));
	if (new_lck == NULL) {
		return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
				  "tdb_nest_lock:"
				  " unable to allocate %zu lock struct",
				  tdb->file->num_lockrecs + 1);
	}
	tdb->file->lockrecs = new_lck;

	/* Since fcntl locks don't nest, we do a lock for the first one,
	   and simply bump the count for future ones */
	ecode = tdb_brlock(tdb, ltype, offset, 1, flags);
	if (ecode != TDB_SUCCESS) {
		return ecode;
	}

	/* First time we grab a lock, perhaps someone died in commit? */
	if (!(flags & TDB_LOCK_NOCHECK)
	    && tdb->file->num_lockrecs == 0) {
		tdb_bool_err berr = tdb_needs_recovery(tdb);
		if (berr != false) {
			/* Either recovery is needed or the check itself
			 * failed: drop the byte lock in both cases. */
			tdb_brunlock(tdb, ltype, offset, 1);

			/* A negative value is an error code: pass it on. */
			if (berr < 0)
				return berr;
			ecode = tdb_lock_and_recover(tdb);
			if (ecode == TDB_SUCCESS) {
				/* Recovery done: take the lock again. */
				ecode = tdb_brlock(tdb, ltype, offset, 1,
						   flags);
			}
			if (ecode != TDB_SUCCESS) {
				return ecode;
			}
		}
	}

	/* Record the new lock in the slot reserved by the realloc above. */
	tdb->file->lockrecs[tdb->file->num_lockrecs].owner = tdb;
	tdb->file->lockrecs[tdb->file->num_lockrecs].off = offset;
	tdb->file->lockrecs[tdb->file->num_lockrecs].count = 1;
	tdb->file->lockrecs[tdb->file->num_lockrecs].ltype = ltype;
	tdb->file->num_lockrecs++;

	return TDB_SUCCESS;
}
 417
 418 static enum TDB_ERROR tdb_nest_unlock(struct tdb_context *tdb,
 419                                       tdb_off_t off, int ltype)
 420 {
 421         struct tdb_lock *lck;
 422         enum TDB_ERROR ecode;
 423
 424         if (tdb->flags & TDB_NOLOCK)
 425                 return TDB_SUCCESS;
 426
 427         lck = find_nestlock(tdb, off, tdb);
 428         if ((lck == NULL) || (lck->count == 0)) {
 429                 return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
 430                                   "tdb_nest_unlock: no lock for %zu",
 431                                   (size_t)off);
 432         }
 433
 434         if (lck->count > 1) {
 435                 lck->count--;
 436                 return TDB_SUCCESS;
 437         }
 438
 439         /*
 440          * This lock has count==1 left, so we need to unlock it in the
 441          * kernel. We don't bother with decrementing the in-memory array
 442          * element, we're about to overwrite it with the last array element
 443          * anyway.
 444          */
 445         ecode = tdb_brunlock(tdb, ltype, off, 1);
 446
 447         /*
 448          * Shrink the array by overwriting the element just unlocked with the
 449          * last array element.
 450          */
 451         *lck = tdb->file->lockrecs[--tdb->file->num_lockrecs];
 452
 453         return ecode;
 454 }
 455
 456 /*
 457   get the transaction lock
 458  */
 459 enum TDB_ERROR tdb_transaction_lock(struct tdb_context *tdb, int ltype)
 460 {
 461         return tdb_nest_lock(tdb, TDB_TRANSACTION_LOCK, ltype, TDB_LOCK_WAIT);
 462 }
 463
 464 /*
 465   release the transaction lock
 466  */
 467 void tdb_transaction_unlock(struct tdb_context *tdb, int ltype)
 468 {
 469         tdb_nest_unlock(tdb, TDB_TRANSACTION_LOCK, ltype);
 470 }
 471
 472 /* We only need to lock individual bytes, but Linux merges consecutive locks
 473  * so we lock in contiguous ranges. */
 474 static enum TDB_ERROR tdb_lock_gradual(struct tdb_context *tdb,
 475                                        int ltype, enum tdb_lock_flags flags,
 476                                        tdb_off_t off, tdb_off_t len)
 477 {
 478         enum TDB_ERROR ecode;
 479         enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT);
 480
 481         if (len <= 1) {
 482                 /* 0 would mean to end-of-file... */
 483                 assert(len != 0);
 484                 /* Single hash.  Just do blocking lock. */
 485                 return tdb_brlock(tdb, ltype, off, len, flags);
 486         }
 487
 488         /* First we try non-blocking. */
 489         if (tdb_brlock(tdb, ltype, off, len, nb_flags) == TDB_SUCCESS) {
 490                 return TDB_SUCCESS;
 491         }
 492
 493         /* Try locking first half, then second. */
 494         ecode = tdb_lock_gradual(tdb, ltype, flags, off, len / 2);
 495         if (ecode != TDB_SUCCESS)
 496                 return ecode;
 497
 498         ecode = tdb_lock_gradual(tdb, ltype, flags,
 499                                  off + len / 2, len - len / 2);
 500         if (ecode != TDB_SUCCESS) {
 501                 tdb_brunlock(tdb, ltype, off, len / 2);
 502         }
 503         return ecode;
 504 }
 505
/*
 * Lock the entire database.  It can only be upgradable if you have some
 * other way of guaranteeing exclusivity (ie. transaction write lock).
 *
 * ltype:      F_RDLCK or F_WRLCK; must be F_RDLCK when 'upgradable' is set.
 * flags:      passed to the low-level lock calls; TDB_LOCK_PROBE suppresses
 *             the failure log messages here, TDB_LOCK_NOCHECK skips the
 *             recovery check after locking.
 * upgradable: record the lock as upgradable (stored in allrecord_lock.off).
 *
 * If this context already holds the all-record lock, the count is bumped
 * when the request is compatible (a read request, or a write lock already
 * held).  Otherwise the hash range is locked gradually, then everything
 * after it to end of file.
 *
 * Returns TDB_SUCCESS (also when TDB_NOLOCK is set), TDB_ERR_LOCK if the
 * fork check fails, or an error from tdb_logerr(), owner_conflict(), the
 * lock calls, tdb_needs_recovery() or tdb_lock_and_recover().
 */
enum TDB_ERROR tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
				  enum tdb_lock_flags flags, bool upgradable)
{
	enum TDB_ERROR ecode;
	tdb_bool_err berr;

	if (tdb->flags & TDB_NOLOCK)
		return TDB_SUCCESS;

	if (!check_lock_pid(tdb, "tdb_allrecord_lock", true)) {
		return TDB_ERR_LOCK;
	}

	/* Already locked: either nest or refuse. */
	if (tdb->file->allrecord_lock.count) {
		if (tdb->file->allrecord_lock.owner != tdb) {
			return owner_conflict(tdb, "tdb_allrecord_lock");
		}

		if (ltype == F_RDLCK
		    || tdb->file->allrecord_lock.ltype == F_WRLCK) {
			tdb->file->allrecord_lock.count++;
			return TDB_SUCCESS;
		}

		/* a global lock of a different type exists */
		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
				  "tdb_allrecord_lock: already have %s lock",
				  tdb->file->allrecord_lock.ltype == F_RDLCK
				  ? "read" : "write");
	}

	if (tdb_has_hash_locks(tdb)) {
		/* can't combine global and chain locks */
		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
				  "tdb_allrecord_lock:"
				  " already have chain lock");
	}

	if (upgradable && ltype != F_RDLCK) {
		/* tdb error: you can't upgrade a write lock! */
		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
				  "tdb_allrecord_lock:"
				  " can't upgrade a write lock");
	}

	add_stat(tdb, locks, 1);
	/* We come back here after a successful recovery (see the end of the
	 * function) to take the locks afresh. */
again:
	/* Lock hashes, gradually. */
	ecode = tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START,
				 TDB_HASH_LOCK_RANGE);
	if (ecode != TDB_SUCCESS) {
		if (!(flags & TDB_LOCK_PROBE)) {
			tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
				   "tdb_allrecord_lock hashes failed");
		}
		return ecode;
	}

	/* Lock free tables: there to end of file. */
	ecode = tdb_brlock(tdb, ltype,
			   TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE,
			   0, flags);
	if (ecode != TDB_SUCCESS) {
		if (!(flags & TDB_LOCK_PROBE)) {
			tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
				 "tdb_allrecord_lock freetables failed");
		}
		/* Undo the hash range lock taken above. */
		tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START,
			     TDB_HASH_LOCK_RANGE);
		return ecode;
	}

	tdb->file->allrecord_lock.owner = tdb;
	tdb->file->allrecord_lock.count = 1;
	/* If it's upgradable, it's actually exclusive so we can treat
	 * it as a write lock. */
	tdb->file->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
	tdb->file->allrecord_lock.off = upgradable;

	/* Now check for needing recovery. */
	if (flags & TDB_LOCK_NOCHECK)
		return TDB_SUCCESS;

	berr = tdb_needs_recovery(tdb);
	if (likely(berr == false))
		return TDB_SUCCESS;

	/* Recovery is needed, or the check itself failed (negative value):
	 * give the lock back in both cases. */
	tdb_allrecord_unlock(tdb, ltype);
	if (berr < 0)
		return berr;
	ecode = tdb_lock_and_recover(tdb);
	if (ecode != TDB_SUCCESS) {
		return ecode;
	}
	goto again;
}
 604
 605 enum TDB_ERROR tdb_lock_open(struct tdb_context *tdb, enum tdb_lock_flags flags)
 606 {
 607         return tdb_nest_lock(tdb, TDB_OPEN_LOCK, F_WRLCK, flags);
 608 }
 609
 610 void tdb_unlock_open(struct tdb_context *tdb)
 611 {
 612         tdb_nest_unlock(tdb, TDB_OPEN_LOCK, F_WRLCK);
 613 }
 614
 615 bool tdb_has_open_lock(struct tdb_context *tdb)
 616 {
 617         return !(tdb->flags & TDB_NOLOCK)
 618                 && find_nestlock(tdb, TDB_OPEN_LOCK, tdb) != NULL;
 619 }
 620
 621 enum TDB_ERROR tdb_lock_expand(struct tdb_context *tdb, int ltype)
 622 {
 623         /* Lock doesn't protect data, so don't check (we recurse if we do!) */
 624         return tdb_nest_lock(tdb, TDB_EXPANSION_LOCK, ltype,
 625                              TDB_LOCK_WAIT | TDB_LOCK_NOCHECK);
 626 }
 627
 628 void tdb_unlock_expand(struct tdb_context *tdb, int ltype)
 629 {
 630         tdb_nest_unlock(tdb, TDB_EXPANSION_LOCK, ltype);
 631 }
 632
 633 /* unlock entire db */
 634 void tdb_allrecord_unlock(struct tdb_context *tdb, int ltype)
 635 {
 636         if (tdb->flags & TDB_NOLOCK)
 637                 return;
 638
 639         if (tdb->file->allrecord_lock.count == 0) {
 640                 tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
 641                            "tdb_allrecord_unlock: not locked!");
 642                 return;
 643         }
 644
 645         if (tdb->file->allrecord_lock.owner != tdb) {
 646                 tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
 647                            "tdb_allrecord_unlock: not locked by us!");
 648                 return;
 649         }
 650
 651         /* Upgradable locks are marked as write locks. */
 652         if (tdb->file->allrecord_lock.ltype != ltype
 653             && (!tdb->file->allrecord_lock.off || ltype != F_RDLCK)) {
 654                 tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
 655                            "tdb_allrecord_unlock: have %s lock",
 656                            tdb->file->allrecord_lock.ltype == F_RDLCK
 657                            ? "read" : "write");
 658                 return;
 659         }
 660
 661         if (tdb->file->allrecord_lock.count > 1) {
 662                 tdb->file->allrecord_lock.count--;
 663                 return;
 664         }
 665
 666         tdb->file->allrecord_lock.count = 0;
 667         tdb->file->allrecord_lock.ltype = 0;
 668
 669         tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, 0);
 670 }
 671
 672 bool tdb_has_expansion_lock(struct tdb_context *tdb)
 673 {
 674         return find_nestlock(tdb, TDB_EXPANSION_LOCK, tdb) != NULL;
 675 }
 676
 677 bool tdb_has_hash_locks(struct tdb_context *tdb)
 678 {
 679         unsigned int i;
 680
 681         for (i=0; i<tdb->file->num_lockrecs; i++) {
 682                 if (tdb->file->lockrecs[i].off >= TDB_HASH_LOCK_START
 683                     && tdb->file->lockrecs[i].off < (TDB_HASH_LOCK_START
 684                                                      + TDB_HASH_LOCK_RANGE))
 685                         return true;
 686         }
 687         return false;
 688 }
 689
 690 static bool tdb_has_free_lock(struct tdb_context *tdb)
 691 {
 692         unsigned int i;
 693
 694         if (tdb->flags & TDB_NOLOCK)
 695                 return false;
 696
 697         for (i=0; i<tdb->file->num_lockrecs; i++) {
 698                 if (tdb->file->lockrecs[i].off
 699                     > TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE)
 700                         return true;
 701         }
 702         return false;
 703 }
 704
 705 enum TDB_ERROR tdb_lock_hashes(struct tdb_context *tdb,
 706                                tdb_off_t hash_lock,
 707                                tdb_len_t hash_range,
 708                                int ltype, enum tdb_lock_flags waitflag)
 709 {
 710         /* FIXME: Do this properly, using hlock_range */
 711         unsigned lock = TDB_HASH_LOCK_START
 712                 + (hash_lock >> (64 - TDB_HASH_LOCK_RANGE_BITS));
 713
 714         /* a allrecord lock allows us to avoid per chain locks */
 715         if (tdb->file->allrecord_lock.count) {
 716                 if (!check_lock_pid(tdb, "tdb_lock_hashes", true))
 717                         return TDB_ERR_LOCK;
 718
 719                 if (tdb->file->allrecord_lock.owner != tdb)
 720                         return owner_conflict(tdb, "tdb_lock_hashes");
 721                 if (ltype == tdb->file->allrecord_lock.ltype
 722                     || ltype == F_RDLCK) {
 723                         return TDB_SUCCESS;
 724                 }
 725
 726                 return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
 727                                   "tdb_lock_hashes:"
 728                                   " already have %s allrecordlock",
 729                                   tdb->file->allrecord_lock.ltype == F_RDLCK
 730                                   ? "read" : "write");
 731         }
 732
 733         if (tdb_has_free_lock(tdb)) {
 734                 return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
 735                                   "tdb_lock_hashes: already have free lock");
 736         }
 737
 738         if (tdb_has_expansion_lock(tdb)) {
 739                 return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
 740                                   "tdb_lock_hashes:"
 741                                   " already have expansion lock");
 742         }
 743
 744         return tdb_nest_lock(tdb, lock, ltype, waitflag);
 745 }
 746
 747 enum TDB_ERROR tdb_unlock_hashes(struct tdb_context *tdb,
 748                                  tdb_off_t hash_lock,
 749                                  tdb_len_t hash_range, int ltype)
 750 {
 751         unsigned lock = TDB_HASH_LOCK_START
 752                 + (hash_lock >> (64 - TDB_HASH_LOCK_RANGE_BITS));
 753
 754         if (tdb->flags & TDB_NOLOCK)
 755                 return 0;
 756
 757         /* a allrecord lock allows us to avoid per chain locks */
 758         if (tdb->file->allrecord_lock.count) {
 759                 if (tdb->file->allrecord_lock.ltype == F_RDLCK
 760                     && ltype == F_WRLCK) {
 761                         return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
 762                                           "tdb_unlock_hashes RO allrecord!");
 763                 }
 764                 return TDB_SUCCESS;
 765         }
 766
 767         return tdb_nest_unlock(tdb, lock, ltype);
 768 }
 769
 770 /* Hash locks use TDB_HASH_LOCK_START + the next 30 bits.
 771  * Then we begin; bucket offsets are sizeof(tdb_len_t) apart, so we divide.
 772  * The result is that on 32 bit systems we don't use lock values > 2^31 on
 773  * files that are less than 4GB.
 774  */
 775 static tdb_off_t free_lock_off(tdb_off_t b_off)
 776 {
 777         return TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE
 778                 + b_off / sizeof(tdb_off_t);
 779 }
 780
 781 enum TDB_ERROR tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
 782                                     enum tdb_lock_flags waitflag)
 783 {
 784         assert(b_off >= sizeof(struct tdb_header));
 785
 786         if (tdb->flags & TDB_NOLOCK)
 787                 return 0;
 788
 789         /* a allrecord lock allows us to avoid per chain locks */
 790         if (tdb->file->allrecord_lock.count) {
 791                 if (!check_lock_pid(tdb, "tdb_lock_free_bucket", true))
 792                         return TDB_ERR_LOCK;
 793
 794                 if (tdb->file->allrecord_lock.ltype == F_WRLCK)
 795                         return 0;
 796                 return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
 797                                   "tdb_lock_free_bucket with"
 798                                   " read-only allrecordlock!");
 799         }
 800
 801 #if 0 /* FIXME */
 802         if (tdb_has_expansion_lock(tdb)) {
 803                 return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
 804                                   "tdb_lock_free_bucket:"
 805                                   " already have expansion lock");
 806         }
 807 #endif
 808
 809         return tdb_nest_lock(tdb, free_lock_off(b_off), F_WRLCK, waitflag);
 810 }
 811
 812 void tdb_unlock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off)
 813 {
 814         if (tdb->file->allrecord_lock.count)
 815                 return;
 816
 817         tdb_nest_unlock(tdb, free_lock_off(b_off), F_WRLCK);
 818 }
 819
 820 enum TDB_ERROR tdb_lockall(struct tdb_context *tdb)
 821 {
 822         return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
 823 }
 824
 825 void tdb_unlockall(struct tdb_context *tdb)
 826 {
 827         tdb_allrecord_unlock(tdb, F_WRLCK);
 828 }
 829
 830 enum TDB_ERROR tdb_lockall_read(struct tdb_context *tdb)
 831 {
 832         return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
 833 }
 834
 835 void tdb_unlockall_read(struct tdb_context *tdb)
 836 {
 837         tdb_allrecord_unlock(tdb, F_RDLCK);
 838 }
 839
 840 void tdb_lock_cleanup(struct tdb_context *tdb)
 841 {
 842         unsigned int i;
 843
 844         while (tdb->file->allrecord_lock.count
 845                && tdb->file->allrecord_lock.owner == tdb) {
 846                 tdb_allrecord_unlock(tdb, tdb->file->allrecord_lock.ltype);
 847         }
 848
 849         for (i=0; i<tdb->file->num_lockrecs; i++) {
 850                 if (tdb->file->lockrecs[i].owner == tdb) {
 851                         tdb_nest_unlock(tdb,
 852                                         tdb->file->lockrecs[i].off,
 853                                         tdb->file->lockrecs[i].ltype);
 854                         i--;
 855                 }
 856         }
 857 }