2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 3 of the License, or (at your option) any later version.
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, see <http://www.gnu.org/licenses/>.
28 #include "tdb1_private.h"
32 non-blocking increment of the tdb sequence number if the tdb has been opened using
35 void tdb1_increment_seqnum_nonblock(struct tdb_context *tdb)
39 if (!(tdb->flags & TDB_SEQNUM)) {
43 /* we ignore errors from this, as we have no sane way of
46 tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum);
48 tdb1_ofs_write(tdb, TDB1_SEQNUM_OFS, &seqnum);
52 increment the tdb sequence number if the tdb has been opened using
55 static void tdb1_increment_seqnum(struct tdb_context *tdb)
57 if (!(tdb->flags & TDB_SEQNUM)) {
61 if (tdb1_nest_lock(tdb, TDB1_SEQNUM_OFS, F_WRLCK,
62 TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) {
66 tdb1_increment_seqnum_nonblock(tdb);
68 tdb1_nest_unlock(tdb, TDB1_SEQNUM_OFS, F_WRLCK);
71 static int tdb1_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
73 return memcmp(data.dptr, key.dptr, data.dsize);
76 /* Returns 0 on fail. On success, return offset of record, and fills
78 static tdb1_off_t tdb1_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
79 struct tdb1_record *r)
83 /* read in the hash top */
84 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
87 /* keep looking until we find the right record */
89 if (tdb1_rec_read(tdb, rec_ptr, r) == -1)
92 tdb->stats.compares++;
94 tdb->stats.compare_wrong_bucket++;
95 } else if (key.dsize != r->key_len) {
96 tdb->stats.compare_wrong_keylen++;
97 } else if (hash != r->full_hash) {
98 tdb->stats.compare_wrong_rechash++;
99 } else if (tdb1_parse_data(tdb, key, rec_ptr + sizeof(*r),
100 r->key_len, tdb1_key_compare,
102 tdb->stats.compare_wrong_keycmp++;
106 /* detect tight infinite loop */
107 if (rec_ptr == r->next) {
108 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT,
110 "tdb1_find: loop detected.");
115 tdb->last_error = TDB_ERR_NOEXIST;
119 /* As tdb1_find, but if you succeed, keep the lock */
120 tdb1_off_t tdb1_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
121 struct tdb1_record *rec)
125 if (tdb1_lock(tdb, TDB1_BUCKET(hash), locktype) == -1)
127 if (!(rec_ptr = tdb1_find(tdb, key, hash, rec)))
128 tdb1_unlock(tdb, TDB1_BUCKET(hash), locktype);
132 static TDB_DATA _tdb1_fetch(struct tdb_context *tdb, TDB_DATA key);
134 /* update an entry in place - this only works if the new data size
135 is <= the old data size and the key exists.
136 on failure return -1.
138 static int tdb1_update_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
140 struct tdb1_record rec;
144 if (!(rec_ptr = tdb1_find(tdb, key, hash, &rec)))
147 /* it could be an exact duplicate of what is there - this is
148 * surprisingly common (eg. with a ldb re-index). */
149 if (rec.key_len == key.dsize &&
150 rec.data_len == dbuf.dsize &&
151 rec.full_hash == hash) {
152 TDB_DATA data = _tdb1_fetch(tdb, key);
153 if (data.dsize == dbuf.dsize &&
154 memcmp(data.dptr, dbuf.dptr, data.dsize) == 0) {
165 /* must be long enough key, data and tailer */
166 if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb1_off_t)) {
167 tdb->last_error = TDB_SUCCESS; /* Not really an error */
171 if (tdb->tdb1.io->tdb1_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
172 dbuf.dptr, dbuf.dsize) == -1)
175 if (dbuf.dsize != rec.data_len) {
177 rec.data_len = dbuf.dsize;
178 return tdb1_rec_write(tdb, rec_ptr, &rec);
184 /* find an entry in the database given a key */
185 /* If an entry doesn't exist tdb1_err will be set to
186 * TDB_ERR_NOEXIST. If a key has no data attached
187 * then the TDB_DATA will have zero length but
190 static TDB_DATA _tdb1_fetch(struct tdb_context *tdb, TDB_DATA key)
193 struct tdb1_record rec;
197 /* find which hash bucket it is in */
198 hash = tdb_hash(tdb, key.dptr, key.dsize);
199 if (!(rec_ptr = tdb1_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
205 ret.dptr = tdb1_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
207 ret.dsize = rec.data_len;
208 tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
212 enum TDB_ERROR tdb1_fetch(struct tdb_context *tdb, TDB_DATA key, TDB_DATA *data)
214 *data = _tdb1_fetch(tdb, key);
215 if (data->dptr == NULL)
216 return tdb->last_error;
220 enum TDB_ERROR tdb1_parse_record(struct tdb_context *tdb, TDB_DATA key,
221 enum TDB_ERROR (*parser)(TDB_DATA key,
227 struct tdb1_record rec;
231 /* find which hash bucket it is in */
232 hash = tdb_hash(tdb, key.dptr, key.dsize);
234 if (!(rec_ptr = tdb1_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
235 /* record not found */
236 return TDB_ERR_NOEXIST;
239 ret = tdb1_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
240 rec.data_len, parser, private_data);
242 tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
247 /* check if an entry in the database exists
249 note that 1 is returned if the key is found and 0 is returned if not found
250 this doesn't match the conventions in the rest of this module, but is
253 static int tdb1_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
255 struct tdb1_record rec;
257 if (tdb1_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
259 tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
263 int tdb1_exists(struct tdb_context *tdb, TDB_DATA key)
265 uint32_t hash = tdb_hash(tdb, key.dptr, key.dsize);
268 assert(tdb->flags & TDB_VERSION1);
269 ret = tdb1_exists_hash(tdb, key, hash);
273 /* actually delete an entry in the database given the offset */
274 int tdb1_do_delete(struct tdb_context *tdb, tdb1_off_t rec_ptr, struct tdb1_record *rec)
276 tdb1_off_t last_ptr, i;
277 struct tdb1_record lastrec;
279 if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) return -1;
281 if (((tdb->tdb1.traverse_write != 0) && (!TDB1_DEAD(rec))) ||
282 tdb1_write_lock_record(tdb, rec_ptr) == -1) {
283 /* Someone traversing here: mark it as dead */
284 rec->magic = TDB1_DEAD_MAGIC;
285 return tdb1_rec_write(tdb, rec_ptr, rec);
287 if (tdb1_write_unlock_record(tdb, rec_ptr) != 0)
290 /* find previous record in hash chain */
291 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(rec->full_hash), &i) == -1)
293 for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
294 if (tdb1_rec_read(tdb, i, &lastrec) == -1)
297 /* unlink it: next ptr is at start of record. */
299 last_ptr = TDB1_HASH_TOP(rec->full_hash);
300 if (tdb1_ofs_write(tdb, last_ptr, &rec->next) == -1)
303 /* recover the space */
304 if (tdb1_free(tdb, rec_ptr, rec) == -1)
309 static int tdb1_count_dead(struct tdb_context *tdb, uint32_t hash)
313 struct tdb1_record rec;
315 /* read in the hash top */
316 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
320 if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1)
323 if (rec.magic == TDB1_DEAD_MAGIC) {
332 * Purge all DEAD records from a hash chain
334 static int tdb1_purge_dead(struct tdb_context *tdb, uint32_t hash)
337 struct tdb1_record rec;
340 if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
344 /* read in the hash top */
345 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
351 if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1) {
357 if (rec.magic == TDB1_DEAD_MAGIC
358 && tdb1_do_delete(tdb, rec_ptr, &rec) == -1) {
365 tdb1_unlock(tdb, -1, F_WRLCK);
369 /* delete an entry in the database given a key */
370 static int tdb1_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
373 struct tdb1_record rec;
376 if (tdb->tdb1.max_dead_records != 0) {
379 * Allow for some dead records per hash chain, mainly for
380 * tdb's with a very high create/delete rate like locking.tdb.
383 if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
386 if (tdb1_count_dead(tdb, hash) >= tdb->tdb1.max_dead_records) {
388 * Don't let the per-chain freelist grow too large,
389 * delete all existing dead records
391 tdb1_purge_dead(tdb, hash);
394 if (!(rec_ptr = tdb1_find(tdb, key, hash, &rec))) {
395 tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
400 * Just mark the record as dead.
402 rec.magic = TDB1_DEAD_MAGIC;
403 ret = tdb1_rec_write(tdb, rec_ptr, &rec);
406 if (!(rec_ptr = tdb1_find_lock_hash(tdb, key, hash, F_WRLCK,
410 ret = tdb1_do_delete(tdb, rec_ptr, &rec);
414 tdb1_increment_seqnum(tdb);
417 if (tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_WRLCK) != 0)
418 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
419 "tdb1_delete: WARNING tdb1_unlock failed!");
423 int tdb1_delete(struct tdb_context *tdb, TDB_DATA key)
425 uint32_t hash = tdb_hash(tdb, key.dptr, key.dsize);
428 assert(tdb->flags & TDB_VERSION1);
429 ret = tdb1_delete_hash(tdb, key, hash);
434 * See if we have a dead record around with enough space
436 static tdb1_off_t tdb1_find_dead(struct tdb_context *tdb, uint32_t hash,
437 struct tdb1_record *r, tdb1_len_t length)
441 /* read in the hash top */
442 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
445 /* keep looking until we find the right record */
447 if (tdb1_rec_read(tdb, rec_ptr, r) == -1)
450 if (TDB1_DEAD(r) && r->rec_len >= length) {
452 * First fit for simple coding, TODO: change to best
462 static int _tdb1_store(struct tdb_context *tdb, TDB_DATA key,
463 TDB_DATA dbuf, int flag, uint32_t hash)
465 struct tdb1_record rec;
470 /* check for it existing, on insert. */
471 if (flag == TDB_INSERT) {
472 if (tdb1_exists_hash(tdb, key, hash)) {
473 tdb->last_error = TDB_ERR_EXISTS;
477 /* first try in-place update, on modify or replace. */
478 if (tdb1_update_hash(tdb, key, hash, dbuf) == 0) {
481 if (tdb->last_error == TDB_ERR_NOEXIST &&
482 flag == TDB_MODIFY) {
483 /* if the record doesn't exist and we are in TDB1_MODIFY mode then
484 we should fail the store */
488 /* reset the error code potentially set by the tdb1_update() */
489 tdb->last_error = TDB_SUCCESS;
491 /* delete any existing record - if it doesn't exist we don't
492 care. Doing this first reduces fragmentation, and avoids
493 coalescing with `allocated' block before it's updated. */
494 if (flag != TDB_INSERT)
495 tdb1_delete_hash(tdb, key, hash);
497 /* Copy key+value *before* allocating free space in case malloc
498 fails and we are left with a dead spot in the tdb. */
500 if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
501 tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
502 "tdb1_store: out of memory"
507 memcpy(p, key.dptr, key.dsize);
509 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
511 if (tdb->tdb1.max_dead_records != 0) {
513 * Allow for some dead records per hash chain, look if we can
514 * find one that can hold the new record. We need enough space
515 * for key, data and tailer. If we find one, we don't have to
516 * consult the central freelist.
518 rec_ptr = tdb1_find_dead(
520 key.dsize + dbuf.dsize + sizeof(tdb1_off_t));
523 rec.key_len = key.dsize;
524 rec.data_len = dbuf.dsize;
525 rec.full_hash = hash;
526 rec.magic = TDB1_MAGIC;
527 if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
528 || tdb->tdb1.io->tdb1_write(
529 tdb, rec_ptr + sizeof(rec),
530 p, key.dsize + dbuf.dsize) == -1) {
538 * We have to allocate some space from the freelist, so this means we
539 * have to lock it. Use the chance to purge all the DEAD records from
540 * the hash chain under the freelist lock.
543 if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
547 if ((tdb->tdb1.max_dead_records != 0)
548 && (tdb1_purge_dead(tdb, hash) == -1)) {
549 tdb1_unlock(tdb, -1, F_WRLCK);
553 /* we have to allocate some space */
554 rec_ptr = tdb1_allocate(tdb, key.dsize + dbuf.dsize, &rec);
556 tdb1_unlock(tdb, -1, F_WRLCK);
562 /* Read hash top into next ptr */
563 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec.next) == -1)
566 rec.key_len = key.dsize;
567 rec.data_len = dbuf.dsize;
568 rec.full_hash = hash;
569 rec.magic = TDB1_MAGIC;
571 /* write out and point the top of the hash chain at it */
572 if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
573 || tdb->tdb1.io->tdb1_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
574 || tdb1_ofs_write(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1) {
575 /* Need to tdb1_unallocate() here */
583 tdb1_increment_seqnum(tdb);
590 /* store an element in the database, replacing any existing element
593 return 0 on success, -1 on failure
595 int tdb1_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
600 assert(tdb->flags & TDB_VERSION1);
602 if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) {
603 tdb->last_error = tdb_logerr(tdb, TDB_ERR_RDONLY,
605 "tdb_store: read-only tdb");
609 /* find which hash bucket it is in */
610 hash = tdb_hash(tdb, key.dptr, key.dsize);
611 if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
614 ret = _tdb1_store(tdb, key, dbuf, flag, hash);
615 tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
619 /* Append to an entry. Create if not exist. */
620 int tdb1_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
626 assert(tdb->flags & TDB_VERSION1);
628 /* find which hash bucket it is in */
629 hash = tdb_hash(tdb, key.dptr, key.dsize);
630 if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
633 dbuf = _tdb1_fetch(tdb, key);
635 if (dbuf.dptr == NULL) {
636 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
638 unsigned int new_len = dbuf.dsize + new_dbuf.dsize;
639 unsigned char *new_dptr;
641 /* realloc '0' is special: don't do that. */
644 new_dptr = (unsigned char *)realloc(dbuf.dptr, new_len);
645 if (new_dptr == NULL) {
648 dbuf.dptr = new_dptr;
651 if (dbuf.dptr == NULL) {
652 tdb->last_error = TDB_ERR_OOM;
656 memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
657 dbuf.dsize += new_dbuf.dsize;
659 ret = _tdb1_store(tdb, key, dbuf, 0, hash);
662 tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
663 SAFE_FREE(dbuf.dptr);
669 get the tdb sequence number. Only makes sense if the writers opened
670 with TDB1_SEQNUM set. Note that this sequence number will wrap quite
671 quickly, so it should only be used for a 'has something changed'
672 test, not for code that relies on the count of the number of changes
673 made. If you want a counter then use a tdb record.
675 The aim of this sequence number is to allow for a very lightweight
676 test of a possible tdb change.
678 int tdb1_get_seqnum(struct tdb_context *tdb)
682 tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum);
688 add a region of the file to the freelist. Length is the size of the region in bytes,
689 which includes the free list header that needs to be added
691 static int tdb1_free_region(struct tdb_context *tdb, tdb1_off_t offset, ssize_t length)
693 struct tdb1_record rec;
694 if (length <= sizeof(rec)) {
695 /* the region is not worth adding */
698 if (length + offset > tdb->file->map_size) {
699 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
700 "tdb1_free_region: adding region beyond"
704 memset(&rec,'\0',sizeof(rec));
705 rec.rec_len = length - sizeof(rec);
706 if (tdb1_free(tdb, offset, &rec) == -1) {
707 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
708 "tdb1_free_region: failed to add free record");
715 wipe the entire database, deleting all records. This can be done
716 very fast by using a allrecord lock. The entire data portion of the
717 file becomes a single entry in the freelist.
719 This code carefully steps around the recovery area, leaving it alone
721 int tdb1_wipe_all(struct tdb_context *tdb)
724 tdb1_off_t offset = 0;
726 tdb1_off_t recovery_head;
727 tdb1_len_t recovery_size = 0;
729 if (tdb_lockall(tdb) != TDB_SUCCESS) {
734 /* see if the tdb has a recovery area, and remember its size
735 if so. We don't want to lose this as otherwise each
736 tdb1_wipe_all() in a transaction will increase the size of
737 the tdb by the size of the recovery area */
738 if (tdb1_ofs_read(tdb, TDB1_RECOVERY_HEAD, &recovery_head) == -1) {
739 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
740 "tdb1_wipe_all: failed to read recovery head");
744 if (recovery_head != 0) {
745 struct tdb1_record rec;
746 if (tdb->tdb1.io->tdb1_read(tdb, recovery_head, &rec, sizeof(rec), TDB1_DOCONV()) == -1) {
747 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
748 "tdb1_wipe_all: failed to read recovery record");
751 recovery_size = rec.rec_len + sizeof(rec);
754 /* wipe the hashes */
755 for (i=0;i<tdb->tdb1.header.hash_size;i++) {
756 if (tdb1_ofs_write(tdb, TDB1_HASH_TOP(i), &offset) == -1) {
757 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
758 "tdb1_wipe_all: failed to write hash %d", i);
763 /* wipe the freelist */
764 if (tdb1_ofs_write(tdb, TDB1_FREELIST_TOP, &offset) == -1) {
765 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
766 "tdb1_wipe_all: failed to write freelist");
770 /* add all the rest of the file to the freelist, possibly leaving a gap
771 for the recovery area */
772 if (recovery_size == 0) {
773 /* the simple case - the whole file can be used as a freelist */
774 data_len = (tdb->file->map_size - TDB1_DATA_START(tdb->tdb1.header.hash_size));
775 if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) {
779 /* we need to add two freelist entries - one on either
780 side of the recovery area
782 Note that we cannot shift the recovery area during
783 this operation. Only the transaction.c code may
784 move the recovery area or we risk subtle data
787 data_len = (recovery_head - TDB1_DATA_START(tdb->tdb1.header.hash_size));
788 if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) {
791 /* and the 2nd free list entry after the recovery area - if any */
792 data_len = tdb->file->map_size - (recovery_head+recovery_size);
793 if (tdb1_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
798 tdb1_increment_seqnum_nonblock(tdb);
807 /* Even on files, we can get partial writes due to signals. */
808 bool tdb1_write_all(int fd, const void *buf, size_t count)
812 ret = write(fd, buf, count);
815 buf = (const char *)buf + ret;