2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 3 of the License, or (at your option) any later version.
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, see <http://www.gnu.org/licenses/>.
28 #include "tdb1_private.h"
32 non-blocking increment of the tdb sequence number if the tdb has been opened using
35 void tdb1_increment_seqnum_nonblock(struct tdb_context *tdb)
39 if (!(tdb->flags & TDB_SEQNUM)) {
43 /* we ignore errors from this, as we have no sane way of
46 tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum);
48 tdb1_ofs_write(tdb, TDB1_SEQNUM_OFS, &seqnum);
52 increment the tdb sequence number if the tdb has been opened using
55 static void tdb1_increment_seqnum(struct tdb_context *tdb)
57 if (!(tdb->flags & TDB_SEQNUM)) {
61 if (tdb1_nest_lock(tdb, TDB1_SEQNUM_OFS, F_WRLCK,
62 TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) {
66 tdb1_increment_seqnum_nonblock(tdb);
68 tdb1_nest_unlock(tdb, TDB1_SEQNUM_OFS, F_WRLCK);
71 static enum TDB_ERROR tdb1_key_compare(TDB_DATA key, TDB_DATA data,
74 bool *matches = matches_;
75 *matches = (memcmp(data.dptr, key.dptr, data.dsize) == 0);
79 /* Returns 0 on fail; last_error will be TDB_ERR_NOEXIST if it simply
80 * wasn't there, otherwise a real error.
81 * On success, return offset of record, and fills in rec */
82 static tdb1_off_t tdb1_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
83 struct tdb1_record *r)
87 /* read in the hash top */
88 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
91 /* keep looking until we find the right record */
93 if (tdb1_rec_read(tdb, rec_ptr, r) == -1)
96 tdb->stats.compares++;
98 tdb->stats.compare_wrong_bucket++;
99 } else if (key.dsize != r->key_len) {
100 tdb->stats.compare_wrong_keylen++;
101 } else if (hash != r->full_hash) {
102 tdb->stats.compare_wrong_rechash++;
104 enum TDB_ERROR ecode;
106 ecode = tdb1_parse_data(tdb, key, rec_ptr + sizeof(*r),
107 r->key_len, tdb1_key_compare,
110 if (ecode != TDB_SUCCESS) {
111 tdb->last_error = ecode;
116 tdb->stats.compare_wrong_keycmp++;
121 /* detect tight infinite loop */
122 if (rec_ptr == r->next) {
123 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT,
125 "tdb1_find: loop detected.");
130 tdb->last_error = TDB_ERR_NOEXIST;
134 /* As tdb1_find, but if you succeed, keep the lock */
135 tdb1_off_t tdb1_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
136 struct tdb1_record *rec)
140 if (tdb1_lock(tdb, TDB1_BUCKET(hash), locktype) == -1)
142 if (!(rec_ptr = tdb1_find(tdb, key, hash, rec)))
143 tdb1_unlock(tdb, TDB1_BUCKET(hash), locktype);
147 static TDB_DATA _tdb1_fetch(struct tdb_context *tdb, TDB_DATA key);
149 /* update an entry in place - this only works if the new data size
150 is <= the old data size and the key exists.
151 on failure return -1.
153 static int tdb1_update_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
155 struct tdb1_record rec;
159 if (!(rec_ptr = tdb1_find(tdb, key, hash, &rec)))
162 /* it could be an exact duplicate of what is there - this is
163 * surprisingly common (eg. with a ldb re-index). */
164 if (rec.key_len == key.dsize &&
165 rec.data_len == dbuf.dsize &&
166 rec.full_hash == hash) {
167 TDB_DATA data = _tdb1_fetch(tdb, key);
168 if (data.dsize == dbuf.dsize &&
169 memcmp(data.dptr, dbuf.dptr, data.dsize) == 0) {
180 /* must be long enough key, data and tailer */
181 if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb1_off_t)) {
182 tdb->last_error = TDB_SUCCESS; /* Not really an error */
186 if (tdb->tdb1.io->tdb1_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
187 dbuf.dptr, dbuf.dsize) == -1)
190 if (dbuf.dsize != rec.data_len) {
192 rec.data_len = dbuf.dsize;
193 return tdb1_rec_write(tdb, rec_ptr, &rec);
199 /* find an entry in the database given a key */
200 /* If an entry doesn't exist tdb1_err will be set to
201 * TDB_ERR_NOEXIST. If a key has no data attached
202 * then the TDB_DATA will have zero length but
205 static TDB_DATA _tdb1_fetch(struct tdb_context *tdb, TDB_DATA key)
208 struct tdb1_record rec;
212 /* find which hash bucket it is in */
213 hash = tdb_hash(tdb, key.dptr, key.dsize);
214 if (!(rec_ptr = tdb1_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
220 ret.dptr = tdb1_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
222 ret.dsize = rec.data_len;
223 tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
227 enum TDB_ERROR tdb1_fetch(struct tdb_context *tdb, TDB_DATA key, TDB_DATA *data)
229 *data = _tdb1_fetch(tdb, key);
230 if (data->dptr == NULL)
231 return tdb->last_error;
235 enum TDB_ERROR tdb1_parse_record(struct tdb_context *tdb, TDB_DATA key,
236 enum TDB_ERROR (*parser)(TDB_DATA key,
242 struct tdb1_record rec;
246 /* find which hash bucket it is in */
247 hash = tdb_hash(tdb, key.dptr, key.dsize);
249 if (!(rec_ptr = tdb1_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
250 return tdb->last_error;
253 ret = tdb1_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
254 rec.data_len, parser, private_data);
256 tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
261 /* check if an entry in the database exists
263 note that 1 is returned if the key is found and 0 is returned if not found
264 this doesn't match the conventions in the rest of this module, but is
267 static int tdb1_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
269 struct tdb1_record rec;
271 if (tdb1_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
273 tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
277 int tdb1_exists(struct tdb_context *tdb, TDB_DATA key)
279 uint32_t hash = tdb_hash(tdb, key.dptr, key.dsize);
282 assert(tdb->flags & TDB_VERSION1);
283 ret = tdb1_exists_hash(tdb, key, hash);
287 /* actually delete an entry in the database given the offset */
288 int tdb1_do_delete(struct tdb_context *tdb, tdb1_off_t rec_ptr, struct tdb1_record *rec)
290 tdb1_off_t last_ptr, i;
291 struct tdb1_record lastrec;
293 if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) return -1;
295 if (((tdb->tdb1.traverse_write != 0) && (!TDB1_DEAD(rec))) ||
296 tdb1_write_lock_record(tdb, rec_ptr) == -1) {
297 /* Someone traversing here: mark it as dead */
298 rec->magic = TDB1_DEAD_MAGIC;
299 return tdb1_rec_write(tdb, rec_ptr, rec);
301 if (tdb1_write_unlock_record(tdb, rec_ptr) != 0)
304 /* find previous record in hash chain */
305 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(rec->full_hash), &i) == -1)
307 for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
308 if (tdb1_rec_read(tdb, i, &lastrec) == -1)
311 /* unlink it: next ptr is at start of record. */
313 last_ptr = TDB1_HASH_TOP(rec->full_hash);
314 if (tdb1_ofs_write(tdb, last_ptr, &rec->next) == -1)
317 /* recover the space */
318 if (tdb1_free(tdb, rec_ptr, rec) == -1)
323 static int tdb1_count_dead(struct tdb_context *tdb, uint32_t hash)
327 struct tdb1_record rec;
329 /* read in the hash top */
330 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
334 if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1)
337 if (rec.magic == TDB1_DEAD_MAGIC) {
346 * Purge all DEAD records from a hash chain
348 static int tdb1_purge_dead(struct tdb_context *tdb, uint32_t hash)
351 struct tdb1_record rec;
354 if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
358 /* read in the hash top */
359 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
365 if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1) {
371 if (rec.magic == TDB1_DEAD_MAGIC
372 && tdb1_do_delete(tdb, rec_ptr, &rec) == -1) {
379 tdb1_unlock(tdb, -1, F_WRLCK);
383 /* delete an entry in the database given a key */
384 static int tdb1_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
387 struct tdb1_record rec;
390 if (tdb->tdb1.max_dead_records != 0) {
393 * Allow for some dead records per hash chain, mainly for
394 * tdb's with a very high create/delete rate like locking.tdb.
397 if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
400 if (tdb1_count_dead(tdb, hash) >= tdb->tdb1.max_dead_records) {
402 * Don't let the per-chain freelist grow too large,
403 * delete all existing dead records
405 tdb1_purge_dead(tdb, hash);
408 if (!(rec_ptr = tdb1_find(tdb, key, hash, &rec))) {
409 tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
414 * Just mark the record as dead.
416 rec.magic = TDB1_DEAD_MAGIC;
417 ret = tdb1_rec_write(tdb, rec_ptr, &rec);
420 if (!(rec_ptr = tdb1_find_lock_hash(tdb, key, hash, F_WRLCK,
424 ret = tdb1_do_delete(tdb, rec_ptr, &rec);
428 tdb1_increment_seqnum(tdb);
431 if (tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_WRLCK) != 0)
432 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
433 "tdb1_delete: WARNING tdb1_unlock failed!");
437 int tdb1_delete(struct tdb_context *tdb, TDB_DATA key)
439 uint32_t hash = tdb_hash(tdb, key.dptr, key.dsize);
442 assert(tdb->flags & TDB_VERSION1);
443 ret = tdb1_delete_hash(tdb, key, hash);
448 * See if we have a dead record around with enough space
450 static tdb1_off_t tdb1_find_dead(struct tdb_context *tdb, uint32_t hash,
451 struct tdb1_record *r, tdb1_len_t length)
455 /* read in the hash top */
456 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
459 /* keep looking until we find the right record */
461 if (tdb1_rec_read(tdb, rec_ptr, r) == -1)
464 if (TDB1_DEAD(r) && r->rec_len >= length) {
466 * First fit for simple coding, TODO: change to best
476 static int _tdb1_store(struct tdb_context *tdb, TDB_DATA key,
477 TDB_DATA dbuf, int flag, uint32_t hash)
479 struct tdb1_record rec;
483 /* check for it existing, on insert. */
484 if (flag == TDB_INSERT) {
485 if (tdb1_exists_hash(tdb, key, hash)) {
486 tdb->last_error = TDB_ERR_EXISTS;
489 if (tdb->last_error != TDB_ERR_NOEXIST) {
493 /* first try in-place update, on modify or replace. */
494 if (tdb1_update_hash(tdb, key, hash, dbuf) == 0) {
497 if (tdb->last_error != TDB_SUCCESS) {
498 if (tdb->last_error != TDB_ERR_NOEXIST) {
501 if (flag == TDB_MODIFY) {
502 /* if the record doesn't exist and we are in TDB1_MODIFY mode then
503 we should fail the store */
508 /* reset the error code potentially set by the tdb1_update() */
509 tdb->last_error = TDB_SUCCESS;
511 /* delete any existing record - if it doesn't exist we don't
512 care. Doing this first reduces fragmentation, and avoids
513 coalescing with `allocated' block before it's updated. */
514 if (flag != TDB_INSERT)
515 tdb1_delete_hash(tdb, key, hash);
517 if (tdb->tdb1.max_dead_records != 0) {
519 * Allow for some dead records per hash chain, look if we can
520 * find one that can hold the new record. We need enough space
521 * for key, data and tailer. If we find one, we don't have to
522 * consult the central freelist.
524 rec_ptr = tdb1_find_dead(
526 key.dsize + dbuf.dsize + sizeof(tdb1_off_t));
529 rec.key_len = key.dsize;
530 rec.data_len = dbuf.dsize;
531 rec.full_hash = hash;
532 rec.magic = TDB1_MAGIC;
533 if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
534 || tdb->tdb1.io->tdb1_write(
535 tdb, rec_ptr + sizeof(rec),
536 key.dptr, key.dsize) == -1
537 || tdb->tdb1.io->tdb1_write(
538 tdb, rec_ptr + sizeof(rec) + key.dsize,
539 dbuf.dptr, dbuf.dsize) == -1) {
547 * We have to allocate some space from the freelist, so this means we
548 * have to lock it. Use the chance to purge all the DEAD records from
549 * the hash chain under the freelist lock.
552 if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
556 if ((tdb->tdb1.max_dead_records != 0)
557 && (tdb1_purge_dead(tdb, hash) == -1)) {
558 tdb1_unlock(tdb, -1, F_WRLCK);
562 /* we have to allocate some space */
563 rec_ptr = tdb1_allocate(tdb, key.dsize + dbuf.dsize, &rec);
565 tdb1_unlock(tdb, -1, F_WRLCK);
571 /* Read hash top into next ptr */
572 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec.next) == -1)
575 rec.key_len = key.dsize;
576 rec.data_len = dbuf.dsize;
577 rec.full_hash = hash;
578 rec.magic = TDB1_MAGIC;
580 /* write out and point the top of the hash chain at it */
581 if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
582 || tdb->tdb1.io->tdb1_write(tdb, rec_ptr + sizeof(rec),
583 key.dptr, key.dsize) == -1
584 || tdb->tdb1.io->tdb1_write(tdb, rec_ptr + sizeof(rec) + key.dsize,
585 dbuf.dptr, dbuf.dsize) == -1
586 || tdb1_ofs_write(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1) {
587 /* Need to tdb1_unallocate() here */
595 tdb1_increment_seqnum(tdb);
600 /* store an element in the database, replacing any existing element
603 return 0 on success, -1 on failure
605 int tdb1_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
610 assert(tdb->flags & TDB_VERSION1);
612 if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) {
613 tdb->last_error = tdb_logerr(tdb, TDB_ERR_RDONLY,
615 "tdb_store: read-only tdb");
619 /* find which hash bucket it is in */
620 hash = tdb_hash(tdb, key.dptr, key.dsize);
621 if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
624 ret = _tdb1_store(tdb, key, dbuf, flag, hash);
625 tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
629 /* Append to an entry. Create if not exist. */
630 int tdb1_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
636 assert(tdb->flags & TDB_VERSION1);
638 /* find which hash bucket it is in */
639 hash = tdb_hash(tdb, key.dptr, key.dsize);
640 if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
643 dbuf = _tdb1_fetch(tdb, key);
645 if (dbuf.dptr == NULL) {
646 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
648 unsigned int new_len = dbuf.dsize + new_dbuf.dsize;
649 unsigned char *new_dptr;
651 /* realloc '0' is special: don't do that. */
654 new_dptr = (unsigned char *)realloc(dbuf.dptr, new_len);
655 if (new_dptr == NULL) {
658 dbuf.dptr = new_dptr;
661 if (dbuf.dptr == NULL) {
662 tdb->last_error = TDB_ERR_OOM;
666 memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
667 dbuf.dsize += new_dbuf.dsize;
669 ret = _tdb1_store(tdb, key, dbuf, 0, hash);
672 tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
673 SAFE_FREE(dbuf.dptr);
679 get the tdb sequence number. Only makes sense if the writers opened
680 with TDB1_SEQNUM set. Note that this sequence number will wrap quite
681 quickly, so it should only be used for a 'has something changed'
682 test, not for code that relies on the count of the number of changes
683 made. If you want a counter then use a tdb record.
685 The aim of this sequence number is to allow for a very lightweight
686 test of a possible tdb change.
688 int tdb1_get_seqnum(struct tdb_context *tdb)
692 tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum);
698 add a region of the file to the freelist. Length is the size of the region in bytes,
699 which includes the free list header that needs to be added
701 static int tdb1_free_region(struct tdb_context *tdb, tdb1_off_t offset, ssize_t length)
703 struct tdb1_record rec;
704 if (length <= sizeof(rec)) {
705 /* the region is not worth adding */
708 if (length + offset > tdb->file->map_size) {
709 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
710 "tdb1_free_region: adding region beyond"
714 memset(&rec,'\0',sizeof(rec));
715 rec.rec_len = length - sizeof(rec);
716 if (tdb1_free(tdb, offset, &rec) == -1) {
717 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
718 "tdb1_free_region: failed to add free record");
725 wipe the entire database, deleting all records. This can be done
726 very fast by using a allrecord lock. The entire data portion of the
727 file becomes a single entry in the freelist.
729 This code carefully steps around the recovery area, leaving it alone
731 int tdb1_wipe_all(struct tdb_context *tdb)
734 tdb1_off_t offset = 0;
736 tdb1_off_t recovery_head;
737 tdb1_len_t recovery_size = 0;
739 if (tdb_lockall(tdb) != TDB_SUCCESS) {
744 /* see if the tdb has a recovery area, and remember its size
745 if so. We don't want to lose this as otherwise each
746 tdb1_wipe_all() in a transaction will increase the size of
747 the tdb by the size of the recovery area */
748 if (tdb1_ofs_read(tdb, TDB1_RECOVERY_HEAD, &recovery_head) == -1) {
749 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
750 "tdb1_wipe_all: failed to read recovery head");
754 if (recovery_head != 0) {
755 struct tdb1_record rec;
756 if (tdb->tdb1.io->tdb1_read(tdb, recovery_head, &rec, sizeof(rec), TDB1_DOCONV()) == -1) {
757 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
758 "tdb1_wipe_all: failed to read recovery record");
761 recovery_size = rec.rec_len + sizeof(rec);
764 /* wipe the hashes */
765 for (i=0;i<tdb->tdb1.header.hash_size;i++) {
766 if (tdb1_ofs_write(tdb, TDB1_HASH_TOP(i), &offset) == -1) {
767 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
768 "tdb1_wipe_all: failed to write hash %d", i);
773 /* wipe the freelist */
774 if (tdb1_ofs_write(tdb, TDB1_FREELIST_TOP, &offset) == -1) {
775 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
776 "tdb1_wipe_all: failed to write freelist");
780 /* add all the rest of the file to the freelist, possibly leaving a gap
781 for the recovery area */
782 if (recovery_size == 0) {
783 /* the simple case - the whole file can be used as a freelist */
784 data_len = (tdb->file->map_size - TDB1_DATA_START(tdb->tdb1.header.hash_size));
785 if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) {
789 /* we need to add two freelist entries - one on either
790 side of the recovery area
792 Note that we cannot shift the recovery area during
793 this operation. Only the transaction.c code may
794 move the recovery area or we risk subtle data
797 data_len = (recovery_head - TDB1_DATA_START(tdb->tdb1.header.hash_size));
798 if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) {
801 /* and the 2nd free list entry after the recovery area - if any */
802 data_len = tdb->file->map_size - (recovery_head+recovery_size);
803 if (tdb1_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
808 tdb1_increment_seqnum_nonblock(tdb);
817 /* Even on files, we can get partial writes due to signals. */
818 bool tdb1_write_all(int fd, const void *buf, size_t count)
822 ret = write(fd, buf, count);
825 buf = (const char *)buf + ret;