2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 3 of the License, or (at your option) any later version.
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, see <http://www.gnu.org/licenses/>.
28 #include "tdb1_private.h"
/*
 * Apply an fcntl() byte-range lock to tdb->fd.
 *
 * The range is interpreted relative to the start of the file
 * (l_whence = SEEK_SET).  Two request forms are visible: F_SETLKW and
 * F_SETLK; presumably waitflag selects the waiting form (F_SETLKW) --
 * TODO confirm, the selecting condition is not shown here.
 *
 * Returns the result of fcntl().
 */
30 static int fcntl_lock(struct tdb1_context *tdb,
31 int rw, off_t off, off_t len, bool waitflag)
36 fl.l_whence = SEEK_SET;
42 return fcntl(tdb->fd, F_SETLKW, &fl);
44 return fcntl(tdb->fd, F_SETLK, &fl);
/*
 * Issue the fcntl() request that backs an unlock on tdb->fd.
 *
 * The section introduced by "#if 0" is disabled debugging code.  It reads
 * /proc/locks line by line, keeps only POSIX advisory entries whose pid
 * field equals getpid(), and reports on stderr when a matching entry has
 * an unexpected length or type, or when no entry is found at all.
 *
 * The live path sets l_whence = SEEK_SET and returns the result of
 * fcntl(F_SETLKW).  The l_type assignment is not visible here
 * (presumably F_UNLCK -- TODO confirm).
 */
47 static int fcntl_unlock(struct tdb1_context *tdb, int rw, off_t off, off_t len)
50 #if 0 /* Check they matched up locks and unlocks correctly. */
55 locks = fopen("/proc/locks", "r");
57 while (fgets(line, 80, locks)) {
61 /* eg. 1: FLOCK ADVISORY WRITE 2440 08:01:2180826 0 EOF */
62 p = strchr(line, ':') + 1;
63 if (strncmp(p, " POSIX ADVISORY ", strlen(" POSIX ADVISORY ")))
65 p += strlen(" FLOCK ADVISORY ");
66 if (strncmp(p, "READ ", strlen("READ ")) == 0)
68 else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
/* Only entries owned by this process are of interest. */
73 if (atoi(p) != getpid())
75 p = strchr(strchr(p, ' ') + 1, ' ') + 1;
77 p = strchr(p, ' ') + 1;
/* An end field of "EOF" is handled separately from a numeric end. */
78 if (strncmp(p, "EOF", 3) == 0)
81 l = atoi(p) - start + 1;
85 fprintf(stderr, "Len %u should be %u: %s",
90 fprintf(stderr, "Type %s wrong: %s",
91 rw == F_RDLCK ? "READ" : "WRITE", line);
100 fprintf(stderr, "Unlock on %u@%u not found!\n",
/* Live (non-debug) path. */
109 fl.l_whence = SEEK_SET;
114 return fcntl(tdb->fd, F_SETLKW, &fl);
/*
 * Translate a list number into the file offset used to lock that list:
 * TDB1_FREELIST_TOP plus 4 bytes per list, so list -1 yields
 * TDB1_FREELIST_TOP - 4 and list 0 yields TDB1_FREELIST_TOP itself.
 */
117 /* list -1 is the alloc list, otherwise a hash chain. */
118 static tdb1_off_t lock_offset(int list)
120 return TDB1_FREELIST_TOP + 4*list;
123 /* a byte range locking function - return 0 on success
124 this function locks/unlocks len bytes starting at the specified offset.
126 On error, errno is also set so that errors are passed back properly
129 note that a len of zero means lock to end of file
/*
 * Acquire a byte-range lock of type rw_type starting at offset and
 * covering len bytes.
 *
 * - With TDB1_NOLOCK set in tdb->flags an early exit is taken before any
 *   fcntl call (presumably returning success -- TODO confirm).
 * - An F_WRLCK request on a read_only or traverse_read database sets
 *   ecode to TDB1_ERR_RDONLY.
 * - fcntl_lock() is retried for as long as it returns -1 with errno set
 *   to EINTR; it is asked to wait only when flags has TDB1_LOCK_WAIT.
 * - After the loop (presumably only when ret is -1) ecode becomes
 *   TDB1_ERR_LOCK, and a trace message is logged unless TDB1_LOCK_PROBE
 *   was passed or errno is EAGAIN.
 */
131 int tdb1_brlock(struct tdb1_context *tdb,
132 int rw_type, tdb1_off_t offset, size_t len,
133 enum tdb1_lock_flags flags)
137 if (tdb->flags & TDB1_NOLOCK) {
141 if ((rw_type == F_WRLCK) && (tdb->read_only || tdb->traverse_read)) {
142 tdb->ecode = TDB1_ERR_RDONLY;
/* Restart the request if a signal interrupted it. */
147 ret = fcntl_lock(tdb, rw_type, offset, len,
148 flags & TDB1_LOCK_WAIT);
149 } while (ret == -1 && errno == EINTR);
152 tdb->ecode = TDB1_ERR_LOCK;
153 /* Generic lock error. errno set by fcntl.
154 * EAGAIN is an expected return from non-blocking
156 if (!(flags & TDB1_LOCK_PROBE) && errno != EAGAIN) {
157 TDB1_LOG((tdb, TDB1_DEBUG_TRACE,"tdb1_brlock failed (fd=%d) at offset %d rw_type=%d flags=%d len=%d\n",
158 tdb->fd, offset, rw_type, flags, (int)len));
/*
 * Release the byte-range lock covering len bytes at offset.
 *
 * An early exit is taken when TDB1_NOLOCK is set in tdb->flags (the value
 * returned there is not visible here).  fcntl_unlock() is retried while
 * it returns -1 with errno EINTR.  The trace message reporting fd, offset,
 * rw_type and len is presumably emitted only when the unlock failed --
 * TODO confirm.
 */
165 int tdb1_brunlock(struct tdb1_context *tdb,
166 int rw_type, tdb1_off_t offset, size_t len)
170 if (tdb->flags & TDB1_NOLOCK) {
175 ret = fcntl_unlock(tdb, rw_type, offset, len);
176 } while (ret == -1 && errno == EINTR);
179 TDB1_LOG((tdb, TDB1_DEBUG_TRACE,"tdb1_brunlock failed (fd=%d) at offset %d rw_type=%d len=%d\n",
180 tdb->fd, offset, rw_type, (int)len));
/*
 * tdb1_allrecord_upgrade: turn the held allrecord lock into a write lock.
 *
 * Preconditions checked, each logging an error when violated:
 *   - allrecord_lock.count must be exactly 1;
 *   - allrecord_lock.off must be 1 (the log text asks "already upgraded?").
 *
 * A write lock from TDB1_FREELIST_TOP to end of file (len 0) is requested
 * with TDB1_LOCK_WAIT|TDB1_LOCK_PROBE.  On success allrecord_lock.ltype
 * becomes F_WRLCK and allrecord_lock.off is set to 0.  Otherwise errno is
 * compared with EDEADLK and a short select() sleep follows; presumably
 * only EDEADLK leads to a retry, inside a loop whose bounds are not
 * visible here -- TODO confirm.  The closing trace message reports that
 * the upgrade failed.
 */
186 upgrade a read lock to a write lock. This needs to be handled in a
187 special way as some OSes (such as solaris) have too conservative
188 deadlock detection and claim a deadlock when progress can be
189 made. For those OSes we may loop for a while.
191 int tdb1_allrecord_upgrade(struct tdb1_context *tdb)
195 if (tdb->allrecord_lock.count != 1) {
196 TDB1_LOG((tdb, TDB1_DEBUG_ERROR,
197 "tdb1_allrecord_upgrade failed: count %u too high\n",
198 tdb->allrecord_lock.count));
202 if (tdb->allrecord_lock.off != 1) {
203 TDB1_LOG((tdb, TDB1_DEBUG_ERROR,
204 "tdb1_allrecord_upgrade failed: already upgraded?\n"));
210 if (tdb1_brlock(tdb, F_WRLCK, TDB1_FREELIST_TOP, 0,
211 TDB1_LOCK_WAIT|TDB1_LOCK_PROBE) == 0) {
212 tdb->allrecord_lock.ltype = F_WRLCK;
213 tdb->allrecord_lock.off = 0;
216 if (errno != EDEADLK) {
219 /* sleep for as short a time as we can - more portable than usleep() */
222 select(0, NULL, NULL, NULL, &tv);
224 TDB1_LOG((tdb, TDB1_DEBUG_TRACE,"tdb1_allrecord_upgrade failed\n"));
/*
 * Look up the in-memory lock record whose off field equals the requested
 * offset.  Scans tdb->lockrecs from index 0 up to num_lockrecs and returns
 * a pointer to the first match.  The fall-through return is not visible
 * here (presumably NULL -- callers below test the result against NULL).
 */
228 static struct tdb1_lock_type *tdb1_find_nestlock(struct tdb1_context *tdb,
233 for (i=0; i<tdb->num_lockrecs; i++) {
234 if (tdb->lockrecs[i].off == offset) {
235 return &tdb->lockrecs[i];
/*
 * Take a nestable lock on the single byte at offset.
 *
 * - Offsets at or beyond lock_offset(tdb->header.hash_size) are rejected:
 *   ecode is set to TDB1_ERR_LOCK and an error is logged.
 * - A database with TDB1_NOLOCK set takes an early exit.
 * - tdb1_find_nestlock() is consulted first; per the existing comment an
 *   already-known offset only has its in-memory record bumped.
 * - Otherwise tdb->lockrecs is grown to num_lockrecs+1 entries with
 *   realloc() (the old pointer is kept until the new one is known to be
 *   non-NULL), the kernel lock is taken through tdb1_brlock() with len 1,
 *   and the new slot records off = offset, count = 1 and the ltype.
 *   The increment of num_lockrecs is not visible here.
 */
241 /* lock an offset in the database. */
242 int tdb1_nest_lock(struct tdb1_context *tdb, uint32_t offset, int ltype,
243 enum tdb1_lock_flags flags)
245 struct tdb1_lock_type *new_lck;
247 if (offset >= lock_offset(tdb->header.hash_size)) {
248 tdb->ecode = TDB1_ERR_LOCK;
249 TDB1_LOG((tdb, TDB1_DEBUG_ERROR,"tdb1_lock: invalid offset %u for ltype=%d\n",
253 if (tdb->flags & TDB1_NOLOCK)
256 new_lck = tdb1_find_nestlock(tdb, offset);
259 * Just increment the in-memory struct, posix locks
266 new_lck = (struct tdb1_lock_type *)realloc(
268 sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
269 if (new_lck == NULL) {
273 tdb->lockrecs = new_lck;
275 /* Since fcntl locks don't nest, we do a lock for the first one,
276 and simply bump the count for future ones */
277 if (tdb1_brlock(tdb, ltype, offset, 1, flags)) {
281 tdb->lockrecs[tdb->num_lockrecs].off = offset;
282 tdb->lockrecs[tdb->num_lockrecs].count = 1;
283 tdb->lockrecs[tdb->num_lockrecs].ltype = ltype;
/*
 * Run transaction recovery while holding the locks taken below.
 *
 * Blocking write locks are acquired first on TDB1_FREELIST_TOP to end of
 * file (len 0) and then on the single byte at TDB1_OPEN_LOCK; if the
 * second request fails the first range is unlocked again.
 * tdb1_transaction_recover() is then called and both locks are released,
 * open lock first.  Presumably the recovery result held in ret is what
 * the function returns -- TODO confirm.
 */
289 static int tdb1_lock_and_recover(struct tdb1_context *tdb)
293 /* We need to match locking order in transaction commit. */
294 if (tdb1_brlock(tdb, F_WRLCK, TDB1_FREELIST_TOP, 0, TDB1_LOCK_WAIT)) {
298 if (tdb1_brlock(tdb, F_WRLCK, TDB1_OPEN_LOCK, 1, TDB1_LOCK_WAIT)) {
299 tdb1_brunlock(tdb, F_WRLCK, TDB1_FREELIST_TOP, 0);
303 ret = tdb1_transaction_recover(tdb);
305 tdb1_brunlock(tdb, F_WRLCK, TDB1_OPEN_LOCK, 1);
306 tdb1_brunlock(tdb, F_WRLCK, TDB1_FREELIST_TOP, 0);
/*
 * Check tdb->lockrecs for any record whose offset is at or above
 * lock_offset(-1), the lock offset of the alloc list.  The values
 * returned are not visible here (presumably true on the first such
 * record and false otherwise -- TODO confirm).
 */
311 static bool have_data_locks(const struct tdb1_context *tdb)
315 for (i = 0; i < tdb->num_lockrecs; i++) {
316 if (tdb->lockrecs[i].off >= lock_offset(-1))
/*
 * Lock one list, addressed through lock_offset(list).
 *
 * - While an allrecord lock is held, a request of the same type as that
 *   lock, or any F_RDLCK request, is covered by it and needs no
 *   per-chain lock.
 * - Any other request made while an allrecord lock is held sets ecode to
 *   TDB1_ERR_LOCK.
 * - Otherwise the lock is taken through tdb1_nest_lock().  When this is
 *   the first data lock (have_data_locks() was false beforehand), the
 *   lock succeeded and tdb1_needs_recovery() reports true, the lock is
 *   dropped again, tdb1_lock_and_recover() is run and, unless that
 *   returned -1, the function calls itself to retake the lock.
 */
322 static int tdb1_lock_list(struct tdb1_context *tdb, int list, int ltype,
323 enum tdb1_lock_flags waitflag)
328 /* a allrecord lock allows us to avoid per chain locks */
329 if (tdb->allrecord_lock.count &&
330 (ltype == tdb->allrecord_lock.ltype || ltype == F_RDLCK)) {
334 if (tdb->allrecord_lock.count) {
335 tdb->ecode = TDB1_ERR_LOCK;
338 /* Only check when we grab first data lock. */
339 check = !have_data_locks(tdb);
340 ret = tdb1_nest_lock(tdb, lock_offset(list), ltype, waitflag);
342 if (ret == 0 && check && tdb1_needs_recovery(tdb)) {
343 tdb1_nest_unlock(tdb, lock_offset(list), ltype);
345 if (tdb1_lock_and_recover(tdb) == -1) {
348 return tdb1_lock_list(tdb, list, ltype, waitflag);
/*
 * Waiting front end to tdb1_lock_list(): the request is always made with
 * TDB1_LOCK_WAIT.  An error message carrying the list, ltype and
 * strerror(errno) is logged, presumably only when ret signals failure
 * (the test on ret is not visible here).
 */
354 /* lock a list in the database. list -1 is the alloc list */
355 int tdb1_lock(struct tdb1_context *tdb, int list, int ltype)
359 ret = tdb1_lock_list(tdb, list, ltype, TDB1_LOCK_WAIT);
361 TDB1_LOG((tdb, TDB1_DEBUG_ERROR, "tdb1_lock failed on list %d "
362 "ltype=%d (%s)\n", list, ltype, strerror(errno)));
/*
 * Drop one reference to the nested lock at offset.
 *
 * - A database with TDB1_NOLOCK set takes an early exit.
 * - Offsets at or beyond lock_offset(tdb->header.hash_size) are logged as
 *   invalid.
 * - A missing record, or a record whose count is 0, is logged with the
 *   text "count is 0".
 * - A record with count above 1 is handled without a kernel call (the
 *   decrement itself is not visible here -- TODO confirm).
 * - Otherwise tdb1_brunlock() releases the byte in the kernel, the record
 *   is overwritten with the last array element while num_lockrecs is
 *   decremented, and the array is released with SAFE_FREE once
 *   num_lockrecs reaches 0.
 * - The closing error message is presumably conditional on ret.
 */
367 int tdb1_nest_unlock(struct tdb1_context *tdb, uint32_t offset, int ltype)
370 struct tdb1_lock_type *lck;
372 if (tdb->flags & TDB1_NOLOCK)
376 if (offset >= lock_offset(tdb->header.hash_size)) {
377 TDB1_LOG((tdb, TDB1_DEBUG_ERROR, "tdb1_unlock: offset %u invalid (%d)\n", offset, tdb->header.hash_size));
381 lck = tdb1_find_nestlock(tdb, offset);
382 if ((lck == NULL) || (lck->count == 0)) {
383 TDB1_LOG((tdb, TDB1_DEBUG_ERROR, "tdb1_unlock: count is 0\n"));
387 if (lck->count > 1) {
393 * This lock has count==1 left, so we need to unlock it in the
394 * kernel. We don't bother with decrementing the in-memory array
395 * element, we're about to overwrite it with the last array element
399 ret = tdb1_brunlock(tdb, ltype, offset, 1);
402 * Shrink the array by overwriting the element just unlocked with the
403 * last array element.
405 *lck = tdb->lockrecs[--tdb->num_lockrecs];
408 * We don't bother with realloc when the array shrinks, but if we have
409 * a completely idle tdb we should get rid of the locked array.
412 if (tdb->num_lockrecs == 0) {
413 SAFE_FREE(tdb->lockrecs);
417 TDB1_LOG((tdb, TDB1_DEBUG_ERROR, "tdb1_unlock: An error occurred unlocking!\n"));
/*
 * Unlock one list, addressed through lock_offset(list).
 *
 * While an allrecord lock is held, a request of the same type as that
 * lock, or any F_RDLCK request, is covered by it and no per-chain unlock
 * is issued.  Any other request made while an allrecord lock is held sets
 * ecode to TDB1_ERR_LOCK.  In the remaining case the work is delegated to
 * tdb1_nest_unlock() and its result is returned.
 */
421 int tdb1_unlock(struct tdb1_context *tdb, int list, int ltype)
423 /* a global lock allows us to avoid per chain locks */
424 if (tdb->allrecord_lock.count &&
425 (ltype == tdb->allrecord_lock.ltype || ltype == F_RDLCK)) {
429 if (tdb->allrecord_lock.count) {
430 tdb->ecode = TDB1_ERR_LOCK;
434 return tdb1_nest_unlock(tdb, lock_offset(list), ltype);
/*
 * Thin wrapper: takes a nested lock of type ltype at the fixed offset
 * TDB1_TRANSACTION_LOCK, forwarding lockflags unchanged, and returns the
 * result of tdb1_nest_lock().
 */
438 get the transaction lock
440 int tdb1_transaction_lock(struct tdb1_context *tdb, int ltype,
441 enum tdb1_lock_flags lockflags)
443 return tdb1_nest_lock(tdb, TDB1_TRANSACTION_LOCK, ltype, lockflags);
/*
 * Thin wrapper: drops the nested lock at the fixed offset
 * TDB1_TRANSACTION_LOCK and returns the result of tdb1_nest_unlock().
 */
447 release the transaction lock
449 int tdb1_transaction_unlock(struct tdb1_context *tdb, int ltype)
451 return tdb1_nest_unlock(tdb, TDB1_TRANSACTION_LOCK, ltype);
/*
 * Validate a request for the allrecord lock before any kernel lock is
 * attempted.  The checks run in this order:
 *
 *   1. read_only or traverse_read database: ecode = TDB1_ERR_LOCK.
 *   2. An allrecord lock of the same ltype is already held: its count is
 *      incremented (nested acquisition).
 *   3. An allrecord lock of a different type is held: TDB1_ERR_LOCK.
 *   4. tdb1_have_extra_locks() reports other locks: TDB1_ERR_LOCK.
 *   5. upgradable was requested with an ltype other than F_RDLCK:
 *      TDB1_ERR_LOCK.
 *
 * The return statements are not visible here; see the existing comment
 * for the return convention.  The flags parameter is not referenced in
 * any of the visible lines.
 */
454 /* Returns 0 if all done, -1 if error, 1 if ok. */
455 static int tdb1_allrecord_check(struct tdb1_context *tdb, int ltype,
456 enum tdb1_lock_flags flags, bool upgradable)
458 /* There are no locks on read-only dbs */
459 if (tdb->read_only || tdb->traverse_read) {
460 tdb->ecode = TDB1_ERR_LOCK;
464 if (tdb->allrecord_lock.count && tdb->allrecord_lock.ltype == ltype) {
465 tdb->allrecord_lock.count++;
469 if (tdb->allrecord_lock.count) {
470 /* a global lock of a different type exists */
471 tdb->ecode = TDB1_ERR_LOCK;
475 if (tdb1_have_extra_locks(tdb)) {
476 /* can't combine global and chain locks */
477 tdb->ecode = TDB1_ERR_LOCK;
481 if (upgradable && ltype != F_RDLCK) {
482 /* tdb error: you can't upgrade a write lock! */
483 tdb->ecode = TDB1_ERR_LOCK;
/*
 * Lock len bytes starting at off, splitting the range when it cannot be
 * taken in one go.
 *
 * - In the single-record case (its size test is not visible here) the
 *   range is locked directly with the caller's flags.
 * - Otherwise the whole range is first tried with TDB1_LOCK_WAIT masked
 *   out of flags.
 * - The range is then split: the function recurses on the first len/2
 *   bytes and afterwards on the remaining len - len/2 bytes.  The first
 *   half is unlocked again, presumably only when the second half failed
 *   -- TODO confirm.
 */
489 /* We only need to lock individual bytes, but Linux merges consecutive locks
490 * so we lock in contiguous ranges. */
491 static int tdb1_chainlock_gradual(struct tdb1_context *tdb,
492 int ltype, enum tdb1_lock_flags flags,
493 size_t off, size_t len)
496 enum tdb1_lock_flags nb_flags = (flags & ~TDB1_LOCK_WAIT);
499 /* Single record. Just do blocking lock. */
500 return tdb1_brlock(tdb, ltype, off, len, flags);
503 /* First we try non-blocking. */
504 ret = tdb1_brlock(tdb, ltype, off, len, nb_flags);
509 /* Try locking first half, then second. */
510 ret = tdb1_chainlock_gradual(tdb, ltype, flags, off, len / 2);
514 ret = tdb1_chainlock_gradual(tdb, ltype, flags,
515 off + len / 2, len - len / 2);
517 tdb1_brunlock(tdb, ltype, off, len / 2);
/*
 * Acquire the allrecord lock.
 *
 * Steps visible below:
 *   1. tdb1_allrecord_check() decides whether to proceed (the switch
 *      cases themselves are not visible here).
 *   2. The hash chain range, hash_size * 4 bytes starting at
 *      TDB1_FREELIST_TOP, is locked with tdb1_chainlock_gradual().
 *   3. Everything from lock_offset(hash_size) to end of file (len 0) is
 *      locked with tdb1_brlock(); the chain range is unlocked again on
 *      what is presumably the failure path of that call.
 *   4. allrecord_lock is recorded with count = 1, ltype = F_WRLCK when
 *      upgradable (otherwise the requested ltype) and off = upgradable.
 *   5. If tdb1_needs_recovery() reports true the lock is released,
 *      tdb1_lock_and_recover() is run and, unless that returned -1, the
 *      function calls itself to take the lock again.
 */
523 /* lock/unlock entire database. It can only be upgradable if you have some
524 * other way of guaranteeing exclusivity (ie. transaction write lock).
525 * We do the locking gradually to avoid being starved by smaller locks. */
526 int tdb1_allrecord_lock(struct tdb1_context *tdb, int ltype,
527 enum tdb1_lock_flags flags, bool upgradable)
529 switch (tdb1_allrecord_check(tdb, ltype, flags, upgradable)) {
536 /* We cover two kinds of locks:
537 * 1) Normal chain locks. Taken for almost all operations.
538 * 3) Individual records locks. Taken after normal or free
541 * It is (1) which cause the starvation problem, so we're only
542 * gradual for that. */
543 if (tdb1_chainlock_gradual(tdb, ltype, flags, TDB1_FREELIST_TOP,
544 tdb->header.hash_size * 4) == -1) {
548 /* Grab individual record locks. */
549 if (tdb1_brlock(tdb, ltype, lock_offset(tdb->header.hash_size), 0,
551 tdb1_brunlock(tdb, ltype, TDB1_FREELIST_TOP,
552 tdb->header.hash_size * 4);
556 tdb->allrecord_lock.count = 1;
557 /* If it's upgradable, it's actually exclusive so we can treat
558 * it as a write lock. */
559 tdb->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
560 tdb->allrecord_lock.off = upgradable;
562 if (tdb1_needs_recovery(tdb)) {
563 tdb1_allrecord_unlock(tdb, ltype);
564 if (tdb1_lock_and_recover(tdb) == -1) {
567 return tdb1_allrecord_lock(tdb, ltype, flags, upgradable);
/*
 * Release one reference to the allrecord lock.
 *
 * ecode is set to TDB1_ERR_LOCK when:
 *   - the database is read_only or traverse_read;
 *   - no allrecord lock is held (count == 0);
 *   - ltype differs from the recorded ltype, except that F_RDLCK is
 *     accepted when allrecord_lock.off is non-zero (an upgradable lock,
 *     which is recorded as a write lock).
 *
 * With count above 1 only the count is decremented.  Otherwise the range
 * from TDB1_FREELIST_TOP to end of file (len 0) is unlocked, an error is
 * logged if that unlock fails, and count and ltype are reset to 0.
 */
575 /* unlock entire db */
576 int tdb1_allrecord_unlock(struct tdb1_context *tdb, int ltype)
578 /* There are no locks on read-only dbs */
579 if (tdb->read_only || tdb->traverse_read) {
580 tdb->ecode = TDB1_ERR_LOCK;
584 if (tdb->allrecord_lock.count == 0) {
585 tdb->ecode = TDB1_ERR_LOCK;
589 /* Upgradable locks are marked as write locks. */
590 if (tdb->allrecord_lock.ltype != ltype
591 && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
592 tdb->ecode = TDB1_ERR_LOCK;
596 if (tdb->allrecord_lock.count > 1) {
597 tdb->allrecord_lock.count--;
601 if (tdb1_brunlock(tdb, ltype, TDB1_FREELIST_TOP, 0)) {
602 TDB1_LOG((tdb, TDB1_DEBUG_ERROR, "tdb1_unlockall failed (%s)\n", strerror(errno)));
606 tdb->allrecord_lock.count = 0;
607 tdb->allrecord_lock.ltype = 0;
/*
 * Convenience wrapper around tdb1_allrecord_lock(): requests F_WRLCK with
 * TDB1_LOCK_WAIT and upgradable = false, and returns its result.
 */
612 /* lock entire database with write lock */
613 int tdb1_lockall(struct tdb1_context *tdb)
615 return tdb1_allrecord_lock(tdb, F_WRLCK, TDB1_LOCK_WAIT, false);
/*
 * Convenience wrapper around tdb1_allrecord_unlock() for ltype F_WRLCK;
 * returns its result.
 */
618 /* unlock entire database with write lock */
619 int tdb1_unlockall(struct tdb1_context *tdb)
621 return tdb1_allrecord_unlock(tdb, F_WRLCK);
/*
 * Convenience wrapper around tdb1_allrecord_lock(): requests F_RDLCK with
 * TDB1_LOCK_WAIT and upgradable = false, and returns its result.
 */
624 /* lock entire database with read lock */
625 int tdb1_lockall_read(struct tdb1_context *tdb)
627 return tdb1_allrecord_lock(tdb, F_RDLCK, TDB1_LOCK_WAIT, false);
/*
 * Convenience wrapper around tdb1_allrecord_unlock() for ltype F_RDLCK;
 * returns its result.
 */
630 /* unlock entire database with read lock */
631 int tdb1_unlockall_read(struct tdb1_context *tdb)
633 return tdb1_allrecord_unlock(tdb, F_RDLCK);
/*
 * Write-lock the hash chain that key maps to: the key is hashed with
 * tdb->hash_fn, reduced to a list number by TDB1_BUCKET and passed to
 * tdb1_lock() with F_WRLCK.  The result is kept in ret (presumably
 * returned -- the return statement is not visible here).
 */
636 /* lock/unlock one hash chain. This is meant to be used to reduce
637 contention - it cannot guarantee how many records will be locked */
638 int tdb1_chainlock(struct tdb1_context *tdb, TDB1_DATA key)
640 int ret = tdb1_lock(tdb, TDB1_BUCKET(tdb->hash_fn(&key)), F_WRLCK);
/*
 * Release the F_WRLCK chain lock for key: the list number is derived as
 * TDB1_BUCKET(tdb->hash_fn(&key)) and handed to tdb1_unlock(), whose
 * result is returned.
 */
644 int tdb1_chainunlock(struct tdb1_context *tdb, TDB1_DATA key)
646 return tdb1_unlock(tdb, TDB1_BUCKET(tdb->hash_fn(&key)), F_WRLCK);
/*
 * Read-lock the hash chain that key maps to: the list number is derived
 * as TDB1_BUCKET(tdb->hash_fn(&key)) and passed to tdb1_lock() with
 * F_RDLCK.  The result is stored in ret (presumably returned -- the
 * return statement is not visible here).
 */
649 int tdb1_chainlock_read(struct tdb1_context *tdb, TDB1_DATA key)
652 ret = tdb1_lock(tdb, TDB1_BUCKET(tdb->hash_fn(&key)), F_RDLCK);
/*
 * Release the F_RDLCK chain lock for key: the list number is derived as
 * TDB1_BUCKET(tdb->hash_fn(&key)) and handed to tdb1_unlock(), whose
 * result is returned.
 */
656 int tdb1_chainunlock_read(struct tdb1_context *tdb, TDB1_DATA key)
658 return tdb1_unlock(tdb, TDB1_BUCKET(tdb->hash_fn(&key)), F_RDLCK);
/*
 * Take a waiting read lock (F_RDLCK, TDB1_LOCK_WAIT) on the single byte
 * at off.  An off of 0 is not locked and yields 0.  A held allrecord lock
 * is handled by a separate branch whose body is not visible here
 * (presumably an early return without a kernel lock -- TODO confirm).
 */
661 /* record lock stops delete underneath */
662 int tdb1_lock_record(struct tdb1_context *tdb, tdb1_off_t off)
664 if (tdb->allrecord_lock.count) {
667 return off ? tdb1_brlock(tdb, F_RDLCK, off, 1, TDB1_LOCK_WAIT) : 0;
/*
 * Probe for a write lock on the single byte at off.
 *
 * The traverse-lock list starting at tdb->travlocks is walked first (the
 * loop body is not visible here).  When an allrecord lock is held its
 * ltype is compared with F_WRLCK (the outcomes are not visible here).
 * Otherwise an F_WRLCK is requested with TDB1_LOCK_NOWAIT|TDB1_LOCK_PROBE
 * and the result of tdb1_brlock() is returned.
 */
671 Write locks override our own fcntl readlocks, so check it here.
672 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
673 an error to fail to get the lock here.
675 int tdb1_write_lock_record(struct tdb1_context *tdb, tdb1_off_t off)
677 struct tdb1_traverse_lock *i;
678 for (i = &tdb->travlocks; i; i = i->next)
681 if (tdb->allrecord_lock.count) {
682 if (tdb->allrecord_lock.ltype == F_WRLCK) {
687 return tdb1_brlock(tdb, F_WRLCK, off, 1, TDB1_LOCK_NOWAIT|TDB1_LOCK_PROBE);
/*
 * Release the F_WRLCK on the single byte at off via tdb1_brunlock() and
 * return its result.  A held allrecord lock is handled by a separate
 * branch whose body is not visible here (presumably an early return --
 * TODO confirm).
 */
690 int tdb1_write_unlock_record(struct tdb1_context *tdb, tdb1_off_t off)
692 if (tdb->allrecord_lock.count) {
695 return tdb1_brunlock(tdb, F_WRLCK, off, 1);
/*
 * Release the read lock on the single byte at off, but only when count
 * equals 1 after walking the traverse-lock list that starts at
 * tdb->travlocks; in every other case 0 is returned without a kernel
 * call.  How count is computed is not visible here (presumably the number
 * of traverse locks referring to off -- TODO confirm).  A held allrecord
 * lock is handled by a separate branch whose body is not visible.
 */
698 /* fcntl locks don't stack: avoid unlocking someone else's */
699 int tdb1_unlock_record(struct tdb1_context *tdb, tdb1_off_t off)
701 struct tdb1_traverse_lock *i;
704 if (tdb->allrecord_lock.count) {
710 for (i = &tdb->travlocks; i; i = i->next)
713 return (count == 1 ? tdb1_brunlock(tdb, F_RDLCK, off, 1) : 0);
/*
 * Decide whether the context holds locks beyond those that are expected.
 *
 * The tally starts from tdb->num_lockrecs.  Three situations are then
 * examined: an allrecord lock held while no transaction is active, the
 * presence of a TDB1_ACTIVE_LOCK record, and the presence of a
 * TDB1_TRANSACTION_LOCK record.  How each one changes the result, and the
 * final return expression, are not visible here -- TODO confirm against
 * the full body.
 */
716 bool tdb1_have_extra_locks(struct tdb1_context *tdb)
718 unsigned int extra = tdb->num_lockrecs;
720 /* A transaction holds the lock for all records. */
721 if (!tdb->transaction && tdb->allrecord_lock.count) {
725 /* We always hold the active lock if CLEAR_IF_FIRST. */
726 if (tdb1_find_nestlock(tdb, TDB1_ACTIVE_LOCK)) {
730 /* In a transaction, we expect to hold the transaction lock */
732 && tdb1_find_nestlock(tdb, TDB1_TRANSACTION_LOCK)) {
/*
 * Drop every lock held by the context except the active lock.
 *
 * - A held allrecord lock is released by unlocking TDB1_FREELIST_TOP to
 *   end of file (len 0) with the recorded ltype, and its count is reset
 *   to 0.
 * - Each entry of tdb->lockrecs is visited: an entry at TDB1_ACTIVE_LOCK
 *   is copied towards the front of the array (index active, which is then
 *   incremented); for the others the single byte at lck->off is unlocked
 *   with tdb1_brunlock().
 * - num_lockrecs is set to the number of entries kept, and the array is
 *   released with SAFE_FREE when none remain.
 */
739 /* The transaction code uses this to remove all locks. */
740 void tdb1_release_transaction_locks(struct tdb1_context *tdb)
742 unsigned int i, active = 0;
744 if (tdb->allrecord_lock.count != 0) {
745 tdb1_brunlock(tdb, tdb->allrecord_lock.ltype, TDB1_FREELIST_TOP, 0);
746 tdb->allrecord_lock.count = 0;
749 for (i=0;i<tdb->num_lockrecs;i++) {
750 struct tdb1_lock_type *lck = &tdb->lockrecs[i];
752 /* Don't release the active lock! Copy it to first entry. */
753 if (lck->off == TDB1_ACTIVE_LOCK) {
754 tdb->lockrecs[active++] = *lck;
756 tdb1_brunlock(tdb, lck->ltype, lck->off, 1);
759 tdb->num_lockrecs = active;
760 if (tdb->num_lockrecs == 0) {
761 SAFE_FREE(tdb->lockrecs);