2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 3 of the License, or (at your option) any later version.
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, see <http://www.gnu.org/licenses/>.
28 #include "tdb1_private.h"
32 non-blocking increment of the tdb sequence number if the tdb has been opened using
35 void tdb1_increment_seqnum_nonblock(struct tdb_context *tdb)
39 if (!(tdb->flags & TDB_SEQNUM)) {
43 /* we ignore errors from this, as we have no sane way of
46 tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum);
48 tdb1_ofs_write(tdb, TDB1_SEQNUM_OFS, &seqnum);
52 increment the tdb sequence number if the tdb has been opened using
55 static void tdb1_increment_seqnum(struct tdb_context *tdb)
57 if (!(tdb->flags & TDB_SEQNUM)) {
61 if (tdb1_nest_lock(tdb, TDB1_SEQNUM_OFS, F_WRLCK,
62 TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) {
66 tdb1_increment_seqnum_nonblock(tdb);
68 tdb1_nest_unlock(tdb, TDB1_SEQNUM_OFS, F_WRLCK);
71 static int tdb1_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
73 return memcmp(data.dptr, key.dptr, data.dsize);
76 /* Returns 0 on fail. On success, return offset of record, and fills
78 static tdb1_off_t tdb1_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
79 struct tdb1_record *r)
83 /* read in the hash top */
84 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
87 /* keep looking until we find the right record */
89 if (tdb1_rec_read(tdb, rec_ptr, r) == -1)
92 tdb->stats.compares++;
94 tdb->stats.compare_wrong_bucket++;
95 } else if (key.dsize != r->key_len) {
96 tdb->stats.compare_wrong_keylen++;
97 } else if (hash != r->full_hash) {
98 tdb->stats.compare_wrong_rechash++;
99 } else if (tdb1_parse_data(tdb, key, rec_ptr + sizeof(*r),
100 r->key_len, tdb1_key_compare,
102 tdb->stats.compare_wrong_keycmp++;
106 /* detect tight infinite loop */
107 if (rec_ptr == r->next) {
108 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT,
110 "tdb1_find: loop detected.");
115 tdb->last_error = TDB_ERR_NOEXIST;
119 /* As tdb1_find, but if you succeed, keep the lock */
120 tdb1_off_t tdb1_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
121 struct tdb1_record *rec)
125 if (tdb1_lock(tdb, TDB1_BUCKET(hash), locktype) == -1)
127 if (!(rec_ptr = tdb1_find(tdb, key, hash, rec)))
128 tdb1_unlock(tdb, TDB1_BUCKET(hash), locktype);
132 static TDB_DATA _tdb1_fetch(struct tdb_context *tdb, TDB_DATA key);
134 /* update an entry in place - this only works if the new data size
135 is <= the old data size and the key exists.
136 on failure return -1.
138 static int tdb1_update_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
140 struct tdb1_record rec;
144 if (!(rec_ptr = tdb1_find(tdb, key, hash, &rec)))
147 /* it could be an exact duplicate of what is there - this is
148 * surprisingly common (eg. with a ldb re-index). */
149 if (rec.key_len == key.dsize &&
150 rec.data_len == dbuf.dsize &&
151 rec.full_hash == hash) {
152 TDB_DATA data = _tdb1_fetch(tdb, key);
153 if (data.dsize == dbuf.dsize &&
154 memcmp(data.dptr, dbuf.dptr, data.dsize) == 0) {
165 /* must be long enough key, data and tailer */
166 if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb1_off_t)) {
167 tdb->last_error = TDB_SUCCESS; /* Not really an error */
171 if (tdb->tdb1.io->tdb1_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
172 dbuf.dptr, dbuf.dsize) == -1)
175 if (dbuf.dsize != rec.data_len) {
177 rec.data_len = dbuf.dsize;
178 return tdb1_rec_write(tdb, rec_ptr, &rec);
184 /* find an entry in the database given a key */
185 /* If an entry doesn't exist tdb1_err will be set to
186 * TDB_ERR_NOEXIST. If a key has no data attached
187 * then the TDB_DATA will have zero length but
190 static TDB_DATA _tdb1_fetch(struct tdb_context *tdb, TDB_DATA key)
193 struct tdb1_record rec;
197 /* find which hash bucket it is in */
198 hash = tdb_hash(tdb, key.dptr, key.dsize);
199 if (!(rec_ptr = tdb1_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
205 ret.dptr = tdb1_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
207 ret.dsize = rec.data_len;
208 tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
212 enum TDB_ERROR tdb1_fetch(struct tdb_context *tdb, TDB_DATA key, TDB_DATA *data)
214 *data = _tdb1_fetch(tdb, key);
215 if (data->dptr == NULL)
216 return tdb->last_error;
220 enum TDB_ERROR tdb1_parse_record(struct tdb_context *tdb, TDB_DATA key,
221 enum TDB_ERROR (*parser)(TDB_DATA key,
227 struct tdb1_record rec;
231 /* find which hash bucket it is in */
232 hash = tdb_hash(tdb, key.dptr, key.dsize);
234 if (!(rec_ptr = tdb1_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
235 /* record not found */
236 return TDB_ERR_NOEXIST;
239 ret = tdb1_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
240 rec.data_len, parser, private_data);
242 tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
247 /* check if an entry in the database exists
249 note that 1 is returned if the key is found and 0 is returned if not found
250 this doesn't match the conventions in the rest of this module, but is
253 static int tdb1_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
255 struct tdb1_record rec;
257 if (tdb1_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
259 tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
263 int tdb1_exists(struct tdb_context *tdb, TDB_DATA key)
265 uint32_t hash = tdb_hash(tdb, key.dptr, key.dsize);
268 assert(tdb->flags & TDB_VERSION1);
269 ret = tdb1_exists_hash(tdb, key, hash);
273 /* actually delete an entry in the database given the offset */
274 int tdb1_do_delete(struct tdb_context *tdb, tdb1_off_t rec_ptr, struct tdb1_record *rec)
276 tdb1_off_t last_ptr, i;
277 struct tdb1_record lastrec;
279 if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) return -1;
281 if (((tdb->tdb1.traverse_write != 0) && (!TDB1_DEAD(rec))) ||
282 tdb1_write_lock_record(tdb, rec_ptr) == -1) {
283 /* Someone traversing here: mark it as dead */
284 rec->magic = TDB1_DEAD_MAGIC;
285 return tdb1_rec_write(tdb, rec_ptr, rec);
287 if (tdb1_write_unlock_record(tdb, rec_ptr) != 0)
290 /* find previous record in hash chain */
291 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(rec->full_hash), &i) == -1)
293 for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
294 if (tdb1_rec_read(tdb, i, &lastrec) == -1)
297 /* unlink it: next ptr is at start of record. */
299 last_ptr = TDB1_HASH_TOP(rec->full_hash);
300 if (tdb1_ofs_write(tdb, last_ptr, &rec->next) == -1)
303 /* recover the space */
304 if (tdb1_free(tdb, rec_ptr, rec) == -1)
309 static int tdb1_count_dead(struct tdb_context *tdb, uint32_t hash)
313 struct tdb1_record rec;
315 /* read in the hash top */
316 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
320 if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1)
323 if (rec.magic == TDB1_DEAD_MAGIC) {
332 * Purge all DEAD records from a hash chain
334 static int tdb1_purge_dead(struct tdb_context *tdb, uint32_t hash)
337 struct tdb1_record rec;
340 if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
344 /* read in the hash top */
345 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
351 if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1) {
357 if (rec.magic == TDB1_DEAD_MAGIC
358 && tdb1_do_delete(tdb, rec_ptr, &rec) == -1) {
365 tdb1_unlock(tdb, -1, F_WRLCK);
369 /* delete an entry in the database given a key */
370 static int tdb1_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
373 struct tdb1_record rec;
376 if (tdb->tdb1.max_dead_records != 0) {
379 * Allow for some dead records per hash chain, mainly for
380 * tdb's with a very high create/delete rate like locking.tdb.
383 if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
386 if (tdb1_count_dead(tdb, hash) >= tdb->tdb1.max_dead_records) {
388 * Don't let the per-chain freelist grow too large,
389 * delete all existing dead records
391 tdb1_purge_dead(tdb, hash);
394 if (!(rec_ptr = tdb1_find(tdb, key, hash, &rec))) {
395 tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
400 * Just mark the record as dead.
402 rec.magic = TDB1_DEAD_MAGIC;
403 ret = tdb1_rec_write(tdb, rec_ptr, &rec);
406 if (!(rec_ptr = tdb1_find_lock_hash(tdb, key, hash, F_WRLCK,
410 ret = tdb1_do_delete(tdb, rec_ptr, &rec);
414 tdb1_increment_seqnum(tdb);
417 if (tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_WRLCK) != 0)
418 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
419 "tdb1_delete: WARNING tdb1_unlock failed!");
423 int tdb1_delete(struct tdb_context *tdb, TDB_DATA key)
425 uint32_t hash = tdb_hash(tdb, key.dptr, key.dsize);
428 assert(tdb->flags & TDB_VERSION1);
429 ret = tdb1_delete_hash(tdb, key, hash);
434 * See if we have a dead record around with enough space
436 static tdb1_off_t tdb1_find_dead(struct tdb_context *tdb, uint32_t hash,
437 struct tdb1_record *r, tdb1_len_t length)
441 /* read in the hash top */
442 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
445 /* keep looking until we find the right record */
447 if (tdb1_rec_read(tdb, rec_ptr, r) == -1)
450 if (TDB1_DEAD(r) && r->rec_len >= length) {
452 * First fit for simple coding, TODO: change to best
462 static int _tdb1_store(struct tdb_context *tdb, TDB_DATA key,
463 TDB_DATA dbuf, int flag, uint32_t hash)
465 struct tdb1_record rec;
470 /* check for it existing, on insert. */
471 if (flag == TDB_INSERT) {
472 if (tdb1_exists_hash(tdb, key, hash)) {
473 tdb->last_error = TDB_ERR_EXISTS;
477 /* first try in-place update, on modify or replace. */
478 if (tdb1_update_hash(tdb, key, hash, dbuf) == 0) {
481 if (tdb->last_error == TDB_ERR_NOEXIST &&
482 flag == TDB_MODIFY) {
483 /* if the record doesn't exist and we are in TDB1_MODIFY mode then
484 we should fail the store */
488 /* reset the error code potentially set by the tdb1_update() */
489 tdb->last_error = TDB_SUCCESS;
491 /* delete any existing record - if it doesn't exist we don't
492 care. Doing this first reduces fragmentation, and avoids
493 coalescing with `allocated' block before it's updated. */
494 if (flag != TDB_INSERT)
495 tdb1_delete_hash(tdb, key, hash);
497 /* Copy key+value *before* allocating free space in case malloc
498 fails and we are left with a dead spot in the tdb. */
500 if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
501 tdb->last_error = TDB_ERR_OOM;
505 memcpy(p, key.dptr, key.dsize);
507 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
509 if (tdb->tdb1.max_dead_records != 0) {
511 * Allow for some dead records per hash chain, look if we can
512 * find one that can hold the new record. We need enough space
513 * for key, data and tailer. If we find one, we don't have to
514 * consult the central freelist.
516 rec_ptr = tdb1_find_dead(
518 key.dsize + dbuf.dsize + sizeof(tdb1_off_t));
521 rec.key_len = key.dsize;
522 rec.data_len = dbuf.dsize;
523 rec.full_hash = hash;
524 rec.magic = TDB1_MAGIC;
525 if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
526 || tdb->tdb1.io->tdb1_write(
527 tdb, rec_ptr + sizeof(rec),
528 p, key.dsize + dbuf.dsize) == -1) {
536 * We have to allocate some space from the freelist, so this means we
537 * have to lock it. Use the chance to purge all the DEAD records from
538 * the hash chain under the freelist lock.
541 if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
545 if ((tdb->tdb1.max_dead_records != 0)
546 && (tdb1_purge_dead(tdb, hash) == -1)) {
547 tdb1_unlock(tdb, -1, F_WRLCK);
551 /* we have to allocate some space */
552 rec_ptr = tdb1_allocate(tdb, key.dsize + dbuf.dsize, &rec);
554 tdb1_unlock(tdb, -1, F_WRLCK);
560 /* Read hash top into next ptr */
561 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec.next) == -1)
564 rec.key_len = key.dsize;
565 rec.data_len = dbuf.dsize;
566 rec.full_hash = hash;
567 rec.magic = TDB1_MAGIC;
569 /* write out and point the top of the hash chain at it */
570 if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
571 || tdb->tdb1.io->tdb1_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
572 || tdb1_ofs_write(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1) {
573 /* Need to tdb1_unallocate() here */
581 tdb1_increment_seqnum(tdb);
588 /* store an element in the database, replacing any existing element
591 return 0 on success, -1 on failure
593 int tdb1_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
598 assert(tdb->flags & TDB_VERSION1);
600 if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) {
601 tdb->last_error = tdb_logerr(tdb, TDB_ERR_RDONLY,
603 "tdb_store: read-only tdb");
607 /* find which hash bucket it is in */
608 hash = tdb_hash(tdb, key.dptr, key.dsize);
609 if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
612 ret = _tdb1_store(tdb, key, dbuf, flag, hash);
613 tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
617 /* Append to an entry. Create if not exist. */
618 int tdb1_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
624 assert(tdb->flags & TDB_VERSION1);
626 /* find which hash bucket it is in */
627 hash = tdb_hash(tdb, key.dptr, key.dsize);
628 if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
631 dbuf = _tdb1_fetch(tdb, key);
633 if (dbuf.dptr == NULL) {
634 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
636 unsigned int new_len = dbuf.dsize + new_dbuf.dsize;
637 unsigned char *new_dptr;
639 /* realloc '0' is special: don't do that. */
642 new_dptr = (unsigned char *)realloc(dbuf.dptr, new_len);
643 if (new_dptr == NULL) {
646 dbuf.dptr = new_dptr;
649 if (dbuf.dptr == NULL) {
650 tdb->last_error = TDB_ERR_OOM;
654 memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
655 dbuf.dsize += new_dbuf.dsize;
657 ret = _tdb1_store(tdb, key, dbuf, 0, hash);
660 tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
661 SAFE_FREE(dbuf.dptr);
667 get the tdb sequence number. Only makes sense if the writers opened
668 with TDB1_SEQNUM set. Note that this sequence number will wrap quite
669 quickly, so it should only be used for a 'has something changed'
670 test, not for code that relies on the count of the number of changes
671 made. If you want a counter then use a tdb record.
673 The aim of this sequence number is to allow for a very lightweight
674 test of a possible tdb change.
676 int tdb1_get_seqnum(struct tdb_context *tdb)
680 tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum);
686 add a region of the file to the freelist. Length is the size of the region in bytes,
687 which includes the free list header that needs to be added
689 static int tdb1_free_region(struct tdb_context *tdb, tdb1_off_t offset, ssize_t length)
691 struct tdb1_record rec;
692 if (length <= sizeof(rec)) {
693 /* the region is not worth adding */
696 if (length + offset > tdb->file->map_size) {
697 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
698 "tdb1_free_region: adding region beyond"
702 memset(&rec,'\0',sizeof(rec));
703 rec.rec_len = length - sizeof(rec);
704 if (tdb1_free(tdb, offset, &rec) == -1) {
705 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
706 "tdb1_free_region: failed to add free record");
713 wipe the entire database, deleting all records. This can be done
714 very fast by using a allrecord lock. The entire data portion of the
715 file becomes a single entry in the freelist.
717 This code carefully steps around the recovery area, leaving it alone
719 int tdb1_wipe_all(struct tdb_context *tdb)
722 tdb1_off_t offset = 0;
724 tdb1_off_t recovery_head;
725 tdb1_len_t recovery_size = 0;
727 if (tdb_lockall(tdb) != TDB_SUCCESS) {
732 /* see if the tdb has a recovery area, and remember its size
733 if so. We don't want to lose this as otherwise each
734 tdb1_wipe_all() in a transaction will increase the size of
735 the tdb by the size of the recovery area */
736 if (tdb1_ofs_read(tdb, TDB1_RECOVERY_HEAD, &recovery_head) == -1) {
737 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
738 "tdb1_wipe_all: failed to read recovery head");
742 if (recovery_head != 0) {
743 struct tdb1_record rec;
744 if (tdb->tdb1.io->tdb1_read(tdb, recovery_head, &rec, sizeof(rec), TDB1_DOCONV()) == -1) {
745 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
746 "tdb1_wipe_all: failed to read recovery record");
749 recovery_size = rec.rec_len + sizeof(rec);
752 /* wipe the hashes */
753 for (i=0;i<tdb->tdb1.header.hash_size;i++) {
754 if (tdb1_ofs_write(tdb, TDB1_HASH_TOP(i), &offset) == -1) {
755 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
756 "tdb1_wipe_all: failed to write hash %d", i);
761 /* wipe the freelist */
762 if (tdb1_ofs_write(tdb, TDB1_FREELIST_TOP, &offset) == -1) {
763 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
764 "tdb1_wipe_all: failed to write freelist");
768 /* add all the rest of the file to the freelist, possibly leaving a gap
769 for the recovery area */
770 if (recovery_size == 0) {
771 /* the simple case - the whole file can be used as a freelist */
772 data_len = (tdb->file->map_size - TDB1_DATA_START(tdb->tdb1.header.hash_size));
773 if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) {
777 /* we need to add two freelist entries - one on either
778 side of the recovery area
780 Note that we cannot shift the recovery area during
781 this operation. Only the transaction.c code may
782 move the recovery area or we risk subtle data
785 data_len = (recovery_head - TDB1_DATA_START(tdb->tdb1.header.hash_size));
786 if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) {
789 /* and the 2nd free list entry after the recovery area - if any */
790 data_len = tdb->file->map_size - (recovery_head+recovery_size);
791 if (tdb1_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
796 tdb1_increment_seqnum_nonblock(tdb);
805 /* Even on files, we can get partial writes due to signals. */
806 bool tdb1_write_all(int fd, const void *buf, size_t count)
810 ret = write(fd, buf, count);
813 buf = (const char *)buf + ret;