2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 3 of the License, or (at your option) any later version.
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, see <http://www.gnu.org/licenses/>.
28 #include "tdb1_private.h"
// Issue an fcntl() record-lock request on tdb->fd and return the fcntl()
// result unchanged.
// NOTE(review): waitflag presumably selects between the blocking F_SETLKW
// call and the non-blocking F_SETLK call below - confirm against the full body.
30 static int fcntl_lock(struct tdb1_context *tdb,
31 int rw, off_t off, off_t len, bool waitflag)
// The lock range is measured from the start of the file.
36 fl.l_whence = SEEK_SET;
// Blocking request: waits until the lock can be granted.
42 return fcntl(tdb->fd, F_SETLKW, &fl);
// Non-blocking request: fails straight away if the range is contended.
44 return fcntl(tdb->fd, F_SETLK, &fl);
// Release an fcntl() record lock on tdb->fd and return the fcntl() result.
// Most of the visible body is a disabled (#if 0) debugging aid that scans
// /proc/locks to verify that the unlock matches a lock held by this process.
47 static int fcntl_unlock(struct tdb1_context *tdb, int rw, off_t off, off_t len)
50 #if 0 /* Check they matched up locks and unlocks correctly. */
55 locks = fopen("/proc/locks", "r");
// Examine the lock table one line at a time (at most 80 bytes per read).
57 while (fgets(line, 80, locks)) {
61 /* eg. 1: FLOCK ADVISORY WRITE 2440 08:01:2180826 0 EOF */
// Step past the leading "N:" index column.
62 p = strchr(line, ':') + 1;
// Only POSIX advisory locks are of interest.
63 if (strncmp(p, " POSIX ADVISORY ", strlen(" POSIX ADVISORY ")))
// NOTE(review): skips using the FLOCK literal although the POSIX literal was
// matched above; both strings have the same length so the advance is the same.
65 p += strlen(" FLOCK ADVISORY ");
// Decode the lock type column.
66 if (strncmp(p, "READ ", strlen("READ ")) == 0)
68 else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
// The next column is the owning pid; compare it with our own.
73 if (atoi(p) != getpid())
// Skip two space-separated columns (pid and device:inode in the example line).
75 p = strchr(strchr(p, ' ') + 1, ' ') + 1;
// Skip one more column to reach the end-offset column.
77 p = strchr(p, ' ') + 1;
// "EOF" in the end column marks a lock that extends to end of file.
78 if (strncmp(p, "EOF", 3) == 0)
// Otherwise the end offset is inclusive, hence the +1 when deriving a length.
81 l = atoi(p) - start + 1;
// Diagnostics for a length mismatch, a type mismatch, or no matching lock.
85 fprintf(stderr, "Len %u should be %u: %s",
90 fprintf(stderr, "Type %s wrong: %s",
91 rw == F_RDLCK ? "READ" : "WRITE", line);
100 fprintf(stderr, "Unlock on %u@%u not found!\n",
// The range is measured from the start of the file.
109 fl.l_whence = SEEK_SET;
// The request is issued with the F_SETLKW command.
// NOTE(review): presumably fl.l_type is F_UNLCK here - confirm.
114 return fcntl(tdb->fd, F_SETLKW, &fl);
117 /* list -1 is the alloc list, otherwise a hash chain. */
// Translate a list index into the file offset of the byte used to lock it.
// Lock bytes are spaced 4 apart starting at TDB1_FREELIST_TOP, so list -1
// maps to TDB1_FREELIST_TOP - 4.
118 static tdb1_off_t lock_offset(int list)
120 return TDB1_FREELIST_TOP + 4*list;
123 /* a byte range locking function - return 0 on success
124 this function locks/unlocks len bytes starting at the specified offset.
126 On error, errno is also set so that errors are passed back properly
129 note that a len of zero means lock to end of file
// Take a byte-range lock of type rw_type covering len bytes at offset.
// flags: TDB_LOCK_WAIT requests a blocking lock; TDB_LOCK_PROBE suppresses
// the error log when the lock cannot be obtained.
131 int tdb1_brlock(struct tdb1_context *tdb,
132 int rw_type, tdb1_off_t offset, size_t len,
133 enum tdb_lock_flags flags)
// Special case for contexts opened with locking disabled (TDB1_NOLOCK).
137 if (tdb->flags & TDB1_NOLOCK) {
// A write lock is refused on a read-only database or during a read traverse.
141 if ((rw_type == F_WRLCK) && (tdb->read_only || tdb->traverse_read)) {
142 tdb->last_error = TDB_ERR_RDONLY;
// Retry the fcntl call for as long as it is interrupted by a signal (EINTR).
147 ret = fcntl_lock(tdb, rw_type, offset, len,
148 flags & TDB_LOCK_WAIT);
149 } while (ret == -1 && errno == EINTR);
// Failure path: record a lock error, then log it unless the caller is only
// probing or the failure is the expected EAGAIN from a non-blocking attempt.
// NOTE(review): offset is a tdb1_off_t but is formatted with %d below -
// confirm the type is int-compatible.
152 tdb->last_error = TDB_ERR_LOCK;
153 /* Generic lock error. errno set by fcntl.
154 * EAGAIN is an expected return from non-blocking
156 if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
157 tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
158 "tdb1_brlock failed (fd=%d) at offset %d rw_type=%d flags=%d len=%d",
159 tdb->fd, offset, rw_type, flags, (int)len);
// Release a byte-range lock covering len bytes at offset.
166 int tdb1_brunlock(struct tdb1_context *tdb,
167 int rw_type, tdb1_off_t offset, size_t len)
// Special case for contexts opened with locking disabled (TDB1_NOLOCK).
171 if (tdb->flags & TDB1_NOLOCK) {
// Retry the unlock for as long as it is interrupted by a signal (EINTR).
176 ret = fcntl_unlock(tdb, rw_type, offset, len);
177 } while (ret == -1 && errno == EINTR);
// Failure path: log the error and store the value returned by tdb_logerr
// in last_error.
180 tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
181 "tdb1_brunlock failed (fd=%d) at offset"
182 " %d rw_type=%d len=%d",
183 tdb->fd, offset, rw_type, (int)len);
189 upgrade a read lock to a write lock. This needs to be handled in a
190 special way as some OSes (such as solaris) have too conservative
191 deadlock detection and claim a deadlock when progress can be
192 made. For those OSes we may loop for a while.
// Upgrade the held all-record lock to a write lock.
194 int tdb1_allrecord_upgrade(struct tdb1_context *tdb)
// Precondition: the all-record lock must be held exactly once.
198 if (tdb->allrecord_lock.count != 1) {
199 tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
200 "tdb1_allrecord_upgrade failed: "
202 tdb->allrecord_lock.count);
// Precondition: allrecord_lock.off must still be 1; tdb1_allrecord_lock
// stores its upgradable flag in that field.
206 if (tdb->allrecord_lock.off != 1) {
207 tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
208 "tdb1_allrecord_upgrade failed:"
209 " already upgraded?");
// Attempt a blocking write lock from TDB1_FREELIST_TOP with len 0 (to end of
// file). TDB_LOCK_PROBE keeps tdb1_brlock from logging a failed attempt.
215 if (tdb1_brlock(tdb, F_WRLCK, TDB1_FREELIST_TOP, 0,
216 TDB_LOCK_WAIT|TDB_LOCK_PROBE) == 0) {
// Success: the lock is now a write lock and no longer marked upgradable.
217 tdb->allrecord_lock.ltype = F_WRLCK;
218 tdb->allrecord_lock.off = 0;
// Errors other than EDEADLK are handled separately from the deadlock case.
// NOTE(review): presumably this ends the retrying - confirm.
221 if (errno != EDEADLK) {
224 /* sleep for as short a time as we can - more portable than usleep() */
227 select(0, NULL, NULL, NULL, &tv);
// The upgrade did not succeed: log and record the error.
229 tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
230 "tdb1_allrecord_upgrade failed");
// Linear search of tdb->lockrecs for the entry whose off equals offset.
// Returns a pointer into the array when a match is found.
// NOTE(review): callers test the result against NULL, so presumably NULL is
// returned when nothing matches - confirm.
234 static struct tdb1_lock_type *tdb1_find_nestlock(struct tdb1_context *tdb,
239 for (i=0; i<tdb->num_lockrecs; i++) {
240 if (tdb->lockrecs[i].off == offset) {
241 return &tdb->lockrecs[i];
247 /* lock an offset in the database. */
// Take a nestable one-byte lock at offset. Nesting is tracked in the
// tdb->lockrecs array; the kernel lock is only requested when a new array
// entry is created.
248 int tdb1_nest_lock(struct tdb1_context *tdb, uint32_t offset, int ltype,
249 enum tdb_lock_flags flags)
251 struct tdb1_lock_type *new_lck;
// Offsets at or beyond the lock byte of list hash_size are rejected.
253 if (offset >= lock_offset(tdb->header.hash_size)) {
254 tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
255 "tdb1_lock: invalid offset %u for"
// Special case for contexts opened with locking disabled (TDB1_NOLOCK).
260 if (tdb->flags & TDB1_NOLOCK)
// Look for an existing record for this offset.
263 new_lck = tdb1_find_nestlock(tdb, offset);
266 * Just increment the in-memory struct, posix locks
// No existing record: grow the array by one slot. The realloc result goes
// into a temporary so tdb->lockrecs is only replaced on success.
273 new_lck = (struct tdb1_lock_type *)realloc(
275 sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
276 if (new_lck == NULL) {
280 tdb->lockrecs = new_lck;
282 /* Since fcntl locks don't nest, we do a lock for the first one,
283 and simply bump the count for future ones */
284 if (tdb1_brlock(tdb, ltype, offset, 1, flags)) {
// Fill in the new slot at index num_lockrecs.
// NOTE(review): presumably num_lockrecs is incremented afterwards - confirm.
288 tdb->lockrecs[tdb->num_lockrecs].off = offset;
289 tdb->lockrecs[tdb->num_lockrecs].count = 1;
290 tdb->lockrecs[tdb->num_lockrecs].ltype = ltype;
// Run tdb1_transaction_recover() while holding blocking write locks on the
// range starting at TDB1_FREELIST_TOP (len 0) and on the TDB1_OPEN_LOCK byte.
296 static int tdb1_lock_and_recover(struct tdb1_context *tdb)
300 /* We need to match locking order in transaction commit. */
// First lock: from TDB1_FREELIST_TOP to end of file.
301 if (tdb1_brlock(tdb, F_WRLCK, TDB1_FREELIST_TOP, 0, TDB_LOCK_WAIT)) {
// Second lock: the open lock byte. If it fails, the first lock is released.
305 if (tdb1_brlock(tdb, F_WRLCK, TDB1_OPEN_LOCK, 1, TDB_LOCK_WAIT)) {
306 tdb1_brunlock(tdb, F_WRLCK, TDB1_FREELIST_TOP, 0);
310 ret = tdb1_transaction_recover(tdb);
// Release in the reverse order of acquisition.
312 tdb1_brunlock(tdb, F_WRLCK, TDB1_OPEN_LOCK, 1);
313 tdb1_brunlock(tdb, F_WRLCK, TDB1_FREELIST_TOP, 0);
// Scan the held lock records for any whose offset is at or above
// lock_offset(-1), the lock byte of the alloc list.
318 static bool have_data_locks(const struct tdb1_context *tdb)
322 for (i = 0; i < tdb->num_lockrecs; i++) {
323 if (tdb->lockrecs[i].off >= lock_offset(-1))
// Lock one list (hash chain, or the alloc list for -1) with type ltype,
// running transaction recovery first when the database needs it.
329 static int tdb1_lock_list(struct tdb1_context *tdb, int list, int ltype,
330 enum tdb_lock_flags waitflag)
335 /* an allrecord lock allows us to avoid per chain locks */
// Case 1: an all-record lock is held and either has the requested type or
// the request is only for a read lock.
336 if (tdb->allrecord_lock.count &&
337 (ltype == tdb->allrecord_lock.ltype || ltype == F_RDLCK)) {
// Case 2: an all-record lock is held but does not satisfy the request.
341 if (tdb->allrecord_lock.count) {
342 tdb->last_error = TDB_ERR_LOCK;
345 /* Only check when we grab first data lock. */
346 check = !have_data_locks(tdb);
347 ret = tdb1_nest_lock(tdb, lock_offset(list), ltype, waitflag);
// Got the lock, it is the first data lock, and recovery is pending:
// drop the lock, recover under the recovery locks, then start over.
349 if (ret == 0 && check && tdb1_needs_recovery(tdb)) {
350 tdb1_nest_unlock(tdb, lock_offset(list), ltype);
352 if (tdb1_lock_and_recover(tdb) == -1) {
355 return tdb1_lock_list(tdb, list, ltype, waitflag);
361 /* lock a list in the database. list -1 is the alloc list */
// Blocking variant of tdb1_lock_list (always passes TDB_LOCK_WAIT).
// The log call below reports a failure together with strerror(errno).
362 int tdb1_lock(struct tdb1_context *tdb, int list, int ltype)
366 ret = tdb1_lock_list(tdb, list, ltype, TDB_LOCK_WAIT);
368 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
369 "tdb1_lock failed on list %d "
370 "ltype=%d (%s)", list, ltype, strerror(errno));
// Drop one level of a nested lock at offset. The kernel lock is released
// only when the last level goes away, at which point the record is removed
// from tdb->lockrecs.
375 int tdb1_nest_unlock(struct tdb1_context *tdb, uint32_t offset, int ltype)
378 struct tdb1_lock_type *lck;
// Special case for contexts opened with locking disabled (TDB1_NOLOCK).
380 if (tdb->flags & TDB1_NOLOCK)
// Same range check as on the locking side.
384 if (offset >= lock_offset(tdb->header.hash_size)) {
385 tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
386 "tdb1_unlock: offset %u invalid (%d)",
387 offset, tdb->header.hash_size);
// Unlocking something that is not recorded as locked is an error.
391 lck = tdb1_find_nestlock(tdb, offset);
392 if ((lck == NULL) || (lck->count == 0)) {
393 tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
394 "tdb1_unlock: count is 0");
// Still nested: handled without touching the kernel lock.
398 if (lck->count > 1) {
404 * This lock has count==1 left, so we need to unlock it in the
405 * kernel. We don't bother with decrementing the in-memory array
406 * element, we're about to overwrite it with the last array element
410 ret = tdb1_brunlock(tdb, ltype, offset, 1);
413 * Shrink the array by overwriting the element just unlocked with the
414 * last array element.
416 *lck = tdb->lockrecs[--tdb->num_lockrecs];
419 * We don't bother with realloc when the array shrinks, but if we have
420 * a completely idle tdb we should get rid of the locked array.
423 if (tdb->num_lockrecs == 0) {
424 SAFE_FREE(tdb->lockrecs);
// Unlock one list (hash chain, or the alloc list for -1).
430 int tdb1_unlock(struct tdb1_context *tdb, int list, int ltype)
432 /* a global lock allows us to avoid per chain locks */
// Case 1: an all-record lock is held and either has the given type or the
// caller is releasing a read lock.
433 if (tdb->allrecord_lock.count &&
434 (ltype == tdb->allrecord_lock.ltype || ltype == F_RDLCK)) {
// Case 2: an all-record lock is held but does not match the request.
438 if (tdb->allrecord_lock.count) {
439 tdb->last_error = TDB_ERR_LOCK;
// No all-record lock: release the nested per-list lock.
443 return tdb1_nest_unlock(tdb, lock_offset(list), ltype);
447 get the transaction lock
// Take the nested lock on the TDB1_TRANSACTION_LOCK byte with the given
// type and flags; returns the tdb1_nest_lock result.
449 int tdb1_transaction_lock(struct tdb1_context *tdb, int ltype,
450 enum tdb_lock_flags lockflags)
452 return tdb1_nest_lock(tdb, TDB1_TRANSACTION_LOCK, ltype, lockflags);
456 release the transaction lock
// Release one level of the nested lock on the TDB1_TRANSACTION_LOCK byte;
// returns the tdb1_nest_unlock result.
458 int tdb1_transaction_unlock(struct tdb1_context *tdb, int ltype)
460 return tdb1_nest_unlock(tdb, TDB1_TRANSACTION_LOCK, ltype);
463 /* Returns 0 if all done, -1 if error, 1 if ok. */
// Validate a request for an all-record lock before any kernel lock is
// taken. Each failing check stores TDB_ERR_LOCK in last_error.
464 static int tdb1_allrecord_check(struct tdb1_context *tdb, int ltype,
465 enum tdb_lock_flags flags, bool upgradable)
467 /* There are no locks on read-only dbs */
468 if (tdb->read_only || tdb->traverse_read) {
469 tdb->last_error = TDB_ERR_LOCK;
// Already held with the same type: just bump the nesting count.
473 if (tdb->allrecord_lock.count && tdb->allrecord_lock.ltype == ltype) {
474 tdb->allrecord_lock.count++;
478 if (tdb->allrecord_lock.count) {
479 /* a global lock of a different type exists */
480 tdb->last_error = TDB_ERR_LOCK;
484 if (tdb1_have_extra_locks(tdb)) {
485 /* can't combine global and chain locks */
486 tdb->last_error = TDB_ERR_LOCK;
// Only a read lock may be requested as upgradable.
490 if (upgradable && ltype != F_RDLCK) {
491 /* tdb error: you can't upgrade a write lock! */
492 tdb->last_error = TDB_ERR_LOCK;
498 /* We only need to lock individual bytes, but Linux merges consecutive locks
499 * so we lock in contiguous ranges. */
// Lock the byte range [off, off + len) by recursive halving: try the whole
// range without blocking, and split it in two when that does not succeed.
500 static int tdb1_chainlock_gradual(struct tdb1_context *tdb,
501 int ltype, enum tdb_lock_flags flags,
502 size_t off, size_t len)
// Same flags with the wait bit cleared, for the opportunistic attempt.
505 enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT);
508 /* Single record. Just do blocking lock. */
509 return tdb1_brlock(tdb, ltype, off, len, flags);
512 /* First we try non-blocking. */
513 ret = tdb1_brlock(tdb, ltype, off, len, nb_flags);
518 /* Try locking first half, then second. */
519 ret = tdb1_chainlock_gradual(tdb, ltype, flags, off, len / 2);
// The second half takes the remainder, so an odd len is fully covered.
523 ret = tdb1_chainlock_gradual(tdb, ltype, flags,
524 off + len / 2, len - len / 2);
// Undo the first half.
// NOTE(review): presumably only when the second half failed - confirm.
526 tdb1_brunlock(tdb, ltype, off, len / 2);
532 /* lock/unlock entire database. It can only be upgradable if you have some
533 * other way of guaranteeing exclusivity (ie. transaction write lock).
534 * We do the locking gradually to avoid being starved by smaller locks. */
// Lock the whole database with type ltype, optionally as an upgradable
// lock, running transaction recovery first when the database needs it.
535 int tdb1_allrecord_lock(struct tdb1_context *tdb, int ltype,
536 enum tdb_lock_flags flags, bool upgradable)
// Dispatch on the result of the pre-flight check.
538 switch (tdb1_allrecord_check(tdb, ltype, flags, upgradable)) {
545 /* We cover two kinds of locks:
546 * 1) Normal chain locks. Taken for almost all operations.
547 * 2) Individual records locks. Taken after normal or free
550 * It is (1) which causes the starvation problem, so we're only
551 * gradual for that. */
552 if (tdb1_chainlock_gradual(tdb, ltype, flags, TDB1_FREELIST_TOP,
553 tdb->header.hash_size * 4) == -1) {
557 /* Grab individual record locks. */
// Range from the end of the chain lock bytes to end of file (len 0).
// On failure the chain range taken above is released again.
558 if (tdb1_brlock(tdb, ltype, lock_offset(tdb->header.hash_size), 0,
560 tdb1_brunlock(tdb, ltype, TDB1_FREELIST_TOP,
561 tdb->header.hash_size * 4);
565 tdb->allrecord_lock.count = 1;
566 /* If it's upgradable, it's actually exclusive so we can treat
567 * it as a write lock. */
568 tdb->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
// The off field doubles as the upgradable flag.
569 tdb->allrecord_lock.off = upgradable;
// Recovery pending: release everything, recover, then start over.
571 if (tdb1_needs_recovery(tdb)) {
572 tdb1_allrecord_unlock(tdb, ltype);
573 if (tdb1_lock_and_recover(tdb) == -1) {
576 return tdb1_allrecord_lock(tdb, ltype, flags, upgradable);
584 /* unlock entire db */
// Drop one level of the all-record lock. The kernel lock is released when
// the nesting count reaches its last level. Each failing check stores
// TDB_ERR_LOCK in last_error.
585 int tdb1_allrecord_unlock(struct tdb1_context *tdb, int ltype)
587 /* There are no locks on read-only dbs */
588 if (tdb->read_only || tdb->traverse_read) {
589 tdb->last_error = TDB_ERR_LOCK;
// Nothing is held.
593 if (tdb->allrecord_lock.count == 0) {
594 tdb->last_error = TDB_ERR_LOCK;
598 /* Upgradable locks are marked as write locks. */
// A type mismatch is tolerated only for an upgradable lock (off set) that is
// being released as a read lock.
599 if (tdb->allrecord_lock.ltype != ltype
600 && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
601 tdb->last_error = TDB_ERR_LOCK;
// Still nested: only the count changes.
605 if (tdb->allrecord_lock.count > 1) {
606 tdb->allrecord_lock.count--;
// Last level: release the range from TDB1_FREELIST_TOP to end of file.
610 if (tdb1_brunlock(tdb, ltype, TDB1_FREELIST_TOP, 0)) {
611 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
612 "tdb1_unlockall failed (%s)", strerror(errno));
// Reset the bookkeeping.
616 tdb->allrecord_lock.count = 0;
617 tdb->allrecord_lock.ltype = 0;
622 /* lock entire database with write lock */
// Blocking, non-upgradable write lock on the whole database; returns the
// tdb1_allrecord_lock result.
623 int tdb1_lockall(struct tdb1_context *tdb)
625 return tdb1_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
628 /* unlock entire database with write lock */
// Release the whole-database write lock; returns the tdb1_allrecord_unlock
// result.
629 int tdb1_unlockall(struct tdb1_context *tdb)
631 return tdb1_allrecord_unlock(tdb, F_WRLCK);
634 /* lock entire database with read lock */
// Blocking, non-upgradable read lock on the whole database; returns the
// tdb1_allrecord_lock result.
635 int tdb1_lockall_read(struct tdb1_context *tdb)
637 return tdb1_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
640 /* unlock entire database with read lock */
// Release the whole-database read lock; returns the tdb1_allrecord_unlock
// result.
641 int tdb1_unlockall_read(struct tdb1_context *tdb)
643 return tdb1_allrecord_unlock(tdb, F_RDLCK);
646 /* lock/unlock one hash chain. This is meant to be used to reduce
647 contention - it cannot guarantee how many records will be locked */
// Write-lock the hash chain that key falls into. The chain index is
// TDB1_BUCKET applied to the hash function result for the key.
648 int tdb1_chainlock(struct tdb1_context *tdb, TDB_DATA key)
650 int ret = tdb1_lock(tdb, TDB1_BUCKET(tdb->hash_fn(&key)), F_WRLCK);
// Release the write lock on the hash chain that key falls into; returns the
// tdb1_unlock result.
654 int tdb1_chainunlock(struct tdb1_context *tdb, TDB_DATA key)
656 return tdb1_unlock(tdb, TDB1_BUCKET(tdb->hash_fn(&key)), F_WRLCK);
// Read-lock the hash chain that key falls into. The chain index is
// TDB1_BUCKET applied to the hash function result for the key.
659 int tdb1_chainlock_read(struct tdb1_context *tdb, TDB_DATA key)
662 ret = tdb1_lock(tdb, TDB1_BUCKET(tdb->hash_fn(&key)), F_RDLCK);
// Release the read lock on the hash chain that key falls into; returns the
// tdb1_unlock result.
666 int tdb1_chainunlock_read(struct tdb1_context *tdb, TDB_DATA key)
668 return tdb1_unlock(tdb, TDB1_BUCKET(tdb->hash_fn(&key)), F_RDLCK);
671 /* record lock stops delete underneath */
// Take a blocking one-byte read lock on the record at off.
672 int tdb1_lock_record(struct tdb1_context *tdb, tdb1_off_t off)
// Special case while an all-record lock is held.
674 if (tdb->allrecord_lock.count) {
// An off of 0 is treated as nothing to lock and reports success.
677 return off ? tdb1_brlock(tdb, F_RDLCK, off, 1, TDB_LOCK_WAIT) : 0;
681 Write locks override our own fcntl readlocks, so check it here.
682 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
683 an error to fail to get the lock here.
// Try to take a one-byte write lock on the record at off without blocking.
685 int tdb1_write_lock_record(struct tdb1_context *tdb, tdb1_off_t off)
687 struct tdb1_traverse_lock *i;
// Walk the list of active traversal locks, starting at tdb->travlocks.
688 for (i = &tdb->travlocks; i; i = i->next)
// Special handling while an all-record lock is held, depending on whether
// it is a write lock.
691 if (tdb->allrecord_lock.count) {
692 if (tdb->allrecord_lock.ltype == F_WRLCK) {
// Non-blocking attempt; TDB_LOCK_PROBE keeps a failure out of the log.
697 return tdb1_brlock(tdb, F_WRLCK, off, 1, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
// Release the one-byte write lock on the record at off.
700 int tdb1_write_unlock_record(struct tdb1_context *tdb, tdb1_off_t off)
// Special case while an all-record lock is held.
702 if (tdb->allrecord_lock.count) {
705 return tdb1_brunlock(tdb, F_WRLCK, off, 1);
708 /* fcntl locks don't stack: avoid unlocking someone else's */
// Release the one-byte read lock on the record at off, but only when the
// count gathered from the traversal list is exactly 1.
709 int tdb1_unlock_record(struct tdb1_context *tdb, tdb1_off_t off)
711 struct tdb1_traverse_lock *i;
// Special case while an all-record lock is held.
714 if (tdb->allrecord_lock.count) {
// Walk the list of active traversal locks, starting at tdb->travlocks.
// NOTE(review): presumably count tallies the entries referring to off - confirm.
720 for (i = &tdb->travlocks; i; i = i->next)
// Any other count leaves the kernel lock in place and reports success.
723 return (count == 1 ? tdb1_brunlock(tdb, F_RDLCK, off, 1) : 0);
// Report whether locks are held beyond those expected in the current state.
// The tally starts from the number of nested lock records.
// NOTE(review): the adjustments made inside each branch below should be
// confirmed against the full body.
726 bool tdb1_have_extra_locks(struct tdb1_context *tdb)
728 unsigned int extra = tdb->num_lockrecs;
730 /* A transaction holds the lock for all records. */
731 if (!tdb->transaction && tdb->allrecord_lock.count) {
735 /* We always hold the active lock if CLEAR_IF_FIRST. */
736 if (tdb1_find_nestlock(tdb, TDB1_ACTIVE_LOCK)) {
740 /* In a transaction, we expect to hold the transaction lock */
742 && tdb1_find_nestlock(tdb, TDB1_TRANSACTION_LOCK)) {
749 /* The transaction code uses this to remove all locks. */
// Release the all-record lock and every nested lock except the one on
// TDB1_ACTIVE_LOCK, compacting tdb->lockrecs as it goes.
750 void tdb1_release_transaction_locks(struct tdb1_context *tdb)
// active counts the records kept at the front of the array.
752 unsigned int i, active = 0;
// Release the all-record range (TDB1_FREELIST_TOP to end of file) and clear
// its count. The brunlock result is not checked here.
754 if (tdb->allrecord_lock.count != 0) {
755 tdb1_brunlock(tdb, tdb->allrecord_lock.ltype, TDB1_FREELIST_TOP, 0);
756 tdb->allrecord_lock.count = 0;
759 for (i=0;i<tdb->num_lockrecs;i++) {
760 struct tdb1_lock_type *lck = &tdb->lockrecs[i];
762 /* Don't release the active lock! Copy it to first entry. */
763 if (lck->off == TDB1_ACTIVE_LOCK) {
764 tdb->lockrecs[active++] = *lck;
// Every other record has its one-byte kernel lock released.
766 tdb1_brunlock(tdb, lck->ltype, lck->off, 1);
// Only the kept records remain; free the array when none are left.
769 tdb->num_lockrecs = active;
770 if (tdb->num_lockrecs == 0) {
771 SAFE_FREE(tdb->lockrecs);