2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 3 of the License, or (at your option) any later version.
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, see <http://www.gnu.org/licenses/>.
28 #include "tdb1_private.h"
32 non-blocking increment of the tdb sequence number if the tdb has been opened using
35 void tdb1_increment_seqnum_nonblock(struct tdb_context *tdb)
39 if (!(tdb->flags & TDB_SEQNUM)) {
43 /* we ignore errors from this, as we have no sane way of
46 tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum);
48 tdb1_ofs_write(tdb, TDB1_SEQNUM_OFS, &seqnum);
52 increment the tdb sequence number if the tdb has been opened using
55 static void tdb1_increment_seqnum(struct tdb_context *tdb)
57 if (!(tdb->flags & TDB_SEQNUM)) {
61 if (tdb1_nest_lock(tdb, TDB1_SEQNUM_OFS, F_WRLCK,
62 TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) {
66 tdb1_increment_seqnum_nonblock(tdb);
68 tdb1_nest_unlock(tdb, TDB1_SEQNUM_OFS, F_WRLCK);
71 static int tdb1_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
73 return memcmp(data.dptr, key.dptr, data.dsize);
76 /* Returns 0 on fail. On success, return offset of record, and fills
78 static tdb1_off_t tdb1_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
79 struct tdb1_record *r)
83 /* read in the hash top */
84 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
87 /* keep looking until we find the right record */
89 if (tdb1_rec_read(tdb, rec_ptr, r) == -1)
92 if (!TDB1_DEAD(r) && hash==r->full_hash
93 && key.dsize==r->key_len
94 && tdb1_parse_data(tdb, key, rec_ptr + sizeof(*r),
95 r->key_len, tdb1_key_compare,
99 /* detect tight infinite loop */
100 if (rec_ptr == r->next) {
101 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT,
103 "tdb1_find: loop detected.");
108 tdb->last_error = TDB_ERR_NOEXIST;
112 /* As tdb1_find, but if you succeed, keep the lock */
113 tdb1_off_t tdb1_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
114 struct tdb1_record *rec)
118 if (tdb1_lock(tdb, TDB1_BUCKET(hash), locktype) == -1)
120 if (!(rec_ptr = tdb1_find(tdb, key, hash, rec)))
121 tdb1_unlock(tdb, TDB1_BUCKET(hash), locktype);
125 static TDB_DATA _tdb1_fetch(struct tdb_context *tdb, TDB_DATA key);
127 /* update an entry in place - this only works if the new data size
128 is <= the old data size and the key exists.
129 on failure return -1.
131 static int tdb1_update_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
133 struct tdb1_record rec;
137 if (!(rec_ptr = tdb1_find(tdb, key, hash, &rec)))
140 /* it could be an exact duplicate of what is there - this is
141 * surprisingly common (eg. with a ldb re-index). */
142 if (rec.key_len == key.dsize &&
143 rec.data_len == dbuf.dsize &&
144 rec.full_hash == hash) {
145 TDB_DATA data = _tdb1_fetch(tdb, key);
146 if (data.dsize == dbuf.dsize &&
147 memcmp(data.dptr, dbuf.dptr, data.dsize) == 0) {
158 /* must be long enough key, data and tailer */
159 if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb1_off_t)) {
160 tdb->last_error = TDB_SUCCESS; /* Not really an error */
164 if (tdb->tdb1.io->tdb1_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
165 dbuf.dptr, dbuf.dsize) == -1)
168 if (dbuf.dsize != rec.data_len) {
170 rec.data_len = dbuf.dsize;
171 return tdb1_rec_write(tdb, rec_ptr, &rec);
177 /* find an entry in the database given a key */
178 /* If an entry doesn't exist tdb1_err will be set to
179 * TDB_ERR_NOEXIST. If a key has no data attached
180 * then the TDB_DATA will have zero length but
183 static TDB_DATA _tdb1_fetch(struct tdb_context *tdb, TDB_DATA key)
186 struct tdb1_record rec;
190 /* find which hash bucket it is in */
191 hash = tdb_hash(tdb, key.dptr, key.dsize);
192 if (!(rec_ptr = tdb1_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
198 ret.dptr = tdb1_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
200 ret.dsize = rec.data_len;
201 tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
205 enum TDB_ERROR tdb1_fetch(struct tdb_context *tdb, TDB_DATA key, TDB_DATA *data)
207 *data = _tdb1_fetch(tdb, key);
208 if (data->dptr == NULL)
209 return tdb->last_error;
213 enum TDB_ERROR tdb1_parse_record(struct tdb_context *tdb, TDB_DATA key,
214 enum TDB_ERROR (*parser)(TDB_DATA key,
220 struct tdb1_record rec;
224 /* find which hash bucket it is in */
225 hash = tdb_hash(tdb, key.dptr, key.dsize);
227 if (!(rec_ptr = tdb1_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
228 /* record not found */
229 return TDB_ERR_NOEXIST;
232 ret = tdb1_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
233 rec.data_len, parser, private_data);
235 tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
240 /* check if an entry in the database exists
242 note that 1 is returned if the key is found and 0 is returned if not found
243 this doesn't match the conventions in the rest of this module, but is
246 static int tdb1_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
248 struct tdb1_record rec;
250 if (tdb1_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
252 tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_RDLCK);
256 int tdb1_exists(struct tdb_context *tdb, TDB_DATA key)
258 uint32_t hash = tdb_hash(tdb, key.dptr, key.dsize);
261 assert(tdb->flags & TDB_VERSION1);
262 ret = tdb1_exists_hash(tdb, key, hash);
266 /* actually delete an entry in the database given the offset */
267 int tdb1_do_delete(struct tdb_context *tdb, tdb1_off_t rec_ptr, struct tdb1_record *rec)
269 tdb1_off_t last_ptr, i;
270 struct tdb1_record lastrec;
272 if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) return -1;
274 if (((tdb->tdb1.traverse_write != 0) && (!TDB1_DEAD(rec))) ||
275 tdb1_write_lock_record(tdb, rec_ptr) == -1) {
276 /* Someone traversing here: mark it as dead */
277 rec->magic = TDB1_DEAD_MAGIC;
278 return tdb1_rec_write(tdb, rec_ptr, rec);
280 if (tdb1_write_unlock_record(tdb, rec_ptr) != 0)
283 /* find previous record in hash chain */
284 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(rec->full_hash), &i) == -1)
286 for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
287 if (tdb1_rec_read(tdb, i, &lastrec) == -1)
290 /* unlink it: next ptr is at start of record. */
292 last_ptr = TDB1_HASH_TOP(rec->full_hash);
293 if (tdb1_ofs_write(tdb, last_ptr, &rec->next) == -1)
296 /* recover the space */
297 if (tdb1_free(tdb, rec_ptr, rec) == -1)
302 static int tdb1_count_dead(struct tdb_context *tdb, uint32_t hash)
306 struct tdb1_record rec;
308 /* read in the hash top */
309 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
313 if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1)
316 if (rec.magic == TDB1_DEAD_MAGIC) {
325 * Purge all DEAD records from a hash chain
327 static int tdb1_purge_dead(struct tdb_context *tdb, uint32_t hash)
330 struct tdb1_record rec;
333 if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
337 /* read in the hash top */
338 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
344 if (tdb1_rec_read(tdb, rec_ptr, &rec) == -1) {
350 if (rec.magic == TDB1_DEAD_MAGIC
351 && tdb1_do_delete(tdb, rec_ptr, &rec) == -1) {
358 tdb1_unlock(tdb, -1, F_WRLCK);
362 /* delete an entry in the database given a key */
363 static int tdb1_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
366 struct tdb1_record rec;
369 if (tdb->tdb1.max_dead_records != 0) {
372 * Allow for some dead records per hash chain, mainly for
373 * tdb's with a very high create/delete rate like locking.tdb.
376 if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
379 if (tdb1_count_dead(tdb, hash) >= tdb->tdb1.max_dead_records) {
381 * Don't let the per-chain freelist grow too large,
382 * delete all existing dead records
384 tdb1_purge_dead(tdb, hash);
387 if (!(rec_ptr = tdb1_find(tdb, key, hash, &rec))) {
388 tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
393 * Just mark the record as dead.
395 rec.magic = TDB1_DEAD_MAGIC;
396 ret = tdb1_rec_write(tdb, rec_ptr, &rec);
399 if (!(rec_ptr = tdb1_find_lock_hash(tdb, key, hash, F_WRLCK,
403 ret = tdb1_do_delete(tdb, rec_ptr, &rec);
407 tdb1_increment_seqnum(tdb);
410 if (tdb1_unlock(tdb, TDB1_BUCKET(rec.full_hash), F_WRLCK) != 0)
411 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
412 "tdb1_delete: WARNING tdb1_unlock failed!");
416 int tdb1_delete(struct tdb_context *tdb, TDB_DATA key)
418 uint32_t hash = tdb_hash(tdb, key.dptr, key.dsize);
421 assert(tdb->flags & TDB_VERSION1);
422 ret = tdb1_delete_hash(tdb, key, hash);
427 * See if we have a dead record around with enough space
429 static tdb1_off_t tdb1_find_dead(struct tdb_context *tdb, uint32_t hash,
430 struct tdb1_record *r, tdb1_len_t length)
434 /* read in the hash top */
435 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1)
438 /* keep looking until we find the right record */
440 if (tdb1_rec_read(tdb, rec_ptr, r) == -1)
443 if (TDB1_DEAD(r) && r->rec_len >= length) {
445 * First fit for simple coding, TODO: change to best
455 static int _tdb1_store(struct tdb_context *tdb, TDB_DATA key,
456 TDB_DATA dbuf, int flag, uint32_t hash)
458 struct tdb1_record rec;
463 /* check for it existing, on insert. */
464 if (flag == TDB_INSERT) {
465 if (tdb1_exists_hash(tdb, key, hash)) {
466 tdb->last_error = TDB_ERR_EXISTS;
470 /* first try in-place update, on modify or replace. */
471 if (tdb1_update_hash(tdb, key, hash, dbuf) == 0) {
474 if (tdb->last_error == TDB_ERR_NOEXIST &&
475 flag == TDB_MODIFY) {
476 /* if the record doesn't exist and we are in TDB1_MODIFY mode then
477 we should fail the store */
481 /* reset the error code potentially set by the tdb1_update() */
482 tdb->last_error = TDB_SUCCESS;
484 /* delete any existing record - if it doesn't exist we don't
485 care. Doing this first reduces fragmentation, and avoids
486 coalescing with `allocated' block before it's updated. */
487 if (flag != TDB_INSERT)
488 tdb1_delete_hash(tdb, key, hash);
490 /* Copy key+value *before* allocating free space in case malloc
491 fails and we are left with a dead spot in the tdb. */
493 if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
494 tdb->last_error = TDB_ERR_OOM;
498 memcpy(p, key.dptr, key.dsize);
500 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
502 if (tdb->tdb1.max_dead_records != 0) {
504 * Allow for some dead records per hash chain, look if we can
505 * find one that can hold the new record. We need enough space
506 * for key, data and tailer. If we find one, we don't have to
507 * consult the central freelist.
509 rec_ptr = tdb1_find_dead(
511 key.dsize + dbuf.dsize + sizeof(tdb1_off_t));
514 rec.key_len = key.dsize;
515 rec.data_len = dbuf.dsize;
516 rec.full_hash = hash;
517 rec.magic = TDB1_MAGIC;
518 if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
519 || tdb->tdb1.io->tdb1_write(
520 tdb, rec_ptr + sizeof(rec),
521 p, key.dsize + dbuf.dsize) == -1) {
529 * We have to allocate some space from the freelist, so this means we
530 * have to lock it. Use the chance to purge all the DEAD records from
531 * the hash chain under the freelist lock.
534 if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
538 if ((tdb->tdb1.max_dead_records != 0)
539 && (tdb1_purge_dead(tdb, hash) == -1)) {
540 tdb1_unlock(tdb, -1, F_WRLCK);
544 /* we have to allocate some space */
545 rec_ptr = tdb1_allocate(tdb, key.dsize + dbuf.dsize, &rec);
547 tdb1_unlock(tdb, -1, F_WRLCK);
553 /* Read hash top into next ptr */
554 if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(hash), &rec.next) == -1)
557 rec.key_len = key.dsize;
558 rec.data_len = dbuf.dsize;
559 rec.full_hash = hash;
560 rec.magic = TDB1_MAGIC;
562 /* write out and point the top of the hash chain at it */
563 if (tdb1_rec_write(tdb, rec_ptr, &rec) == -1
564 || tdb->tdb1.io->tdb1_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
565 || tdb1_ofs_write(tdb, TDB1_HASH_TOP(hash), &rec_ptr) == -1) {
566 /* Need to tdb1_unallocate() here */
574 tdb1_increment_seqnum(tdb);
581 /* store an element in the database, replacing any existing element
584 return 0 on success, -1 on failure
586 int tdb1_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
591 assert(tdb->flags & TDB_VERSION1);
593 if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) {
594 tdb->last_error = TDB_ERR_RDONLY;
598 /* find which hash bucket it is in */
599 hash = tdb_hash(tdb, key.dptr, key.dsize);
600 if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
603 ret = _tdb1_store(tdb, key, dbuf, flag, hash);
604 tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
608 /* Append to an entry. Create if not exist. */
609 int tdb1_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
615 assert(tdb->flags & TDB_VERSION1);
617 /* find which hash bucket it is in */
618 hash = tdb_hash(tdb, key.dptr, key.dsize);
619 if (tdb1_lock(tdb, TDB1_BUCKET(hash), F_WRLCK) == -1)
622 dbuf = _tdb1_fetch(tdb, key);
624 if (dbuf.dptr == NULL) {
625 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
627 unsigned int new_len = dbuf.dsize + new_dbuf.dsize;
628 unsigned char *new_dptr;
630 /* realloc '0' is special: don't do that. */
633 new_dptr = (unsigned char *)realloc(dbuf.dptr, new_len);
634 if (new_dptr == NULL) {
637 dbuf.dptr = new_dptr;
640 if (dbuf.dptr == NULL) {
641 tdb->last_error = TDB_ERR_OOM;
645 memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
646 dbuf.dsize += new_dbuf.dsize;
648 ret = _tdb1_store(tdb, key, dbuf, 0, hash);
651 tdb1_unlock(tdb, TDB1_BUCKET(hash), F_WRLCK);
652 SAFE_FREE(dbuf.dptr);
658 get the tdb sequence number. Only makes sense if the writers opened
659 with TDB1_SEQNUM set. Note that this sequence number will wrap quite
660 quickly, so it should only be used for a 'has something changed'
661 test, not for code that relies on the count of the number of changes
662 made. If you want a counter then use a tdb record.
664 The aim of this sequence number is to allow for a very lightweight
665 test of a possible tdb change.
667 int tdb1_get_seqnum(struct tdb_context *tdb)
671 tdb1_ofs_read(tdb, TDB1_SEQNUM_OFS, &seqnum);
677 add a region of the file to the freelist. Length is the size of the region in bytes,
678 which includes the free list header that needs to be added
680 static int tdb1_free_region(struct tdb_context *tdb, tdb1_off_t offset, ssize_t length)
682 struct tdb1_record rec;
683 if (length <= sizeof(rec)) {
684 /* the region is not worth adding */
687 if (length + offset > tdb->file->map_size) {
688 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
689 "tdb1_free_region: adding region beyond"
693 memset(&rec,'\0',sizeof(rec));
694 rec.rec_len = length - sizeof(rec);
695 if (tdb1_free(tdb, offset, &rec) == -1) {
696 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
697 "tdb1_free_region: failed to add free record");
704 wipe the entire database, deleting all records. This can be done
705 very fast by using a allrecord lock. The entire data portion of the
706 file becomes a single entry in the freelist.
708 This code carefully steps around the recovery area, leaving it alone
710 int tdb1_wipe_all(struct tdb_context *tdb)
713 tdb1_off_t offset = 0;
715 tdb1_off_t recovery_head;
716 tdb1_len_t recovery_size = 0;
718 if (tdb_lockall(tdb) != TDB_SUCCESS) {
723 /* see if the tdb has a recovery area, and remember its size
724 if so. We don't want to lose this as otherwise each
725 tdb1_wipe_all() in a transaction will increase the size of
726 the tdb by the size of the recovery area */
727 if (tdb1_ofs_read(tdb, TDB1_RECOVERY_HEAD, &recovery_head) == -1) {
728 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
729 "tdb1_wipe_all: failed to read recovery head");
733 if (recovery_head != 0) {
734 struct tdb1_record rec;
735 if (tdb->tdb1.io->tdb1_read(tdb, recovery_head, &rec, sizeof(rec), TDB1_DOCONV()) == -1) {
736 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
737 "tdb1_wipe_all: failed to read recovery record");
740 recovery_size = rec.rec_len + sizeof(rec);
743 /* wipe the hashes */
744 for (i=0;i<tdb->tdb1.header.hash_size;i++) {
745 if (tdb1_ofs_write(tdb, TDB1_HASH_TOP(i), &offset) == -1) {
746 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
747 "tdb1_wipe_all: failed to write hash %d", i);
752 /* wipe the freelist */
753 if (tdb1_ofs_write(tdb, TDB1_FREELIST_TOP, &offset) == -1) {
754 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
755 "tdb1_wipe_all: failed to write freelist");
759 /* add all the rest of the file to the freelist, possibly leaving a gap
760 for the recovery area */
761 if (recovery_size == 0) {
762 /* the simple case - the whole file can be used as a freelist */
763 data_len = (tdb->file->map_size - TDB1_DATA_START(tdb->tdb1.header.hash_size));
764 if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) {
768 /* we need to add two freelist entries - one on either
769 side of the recovery area
771 Note that we cannot shift the recovery area during
772 this operation. Only the transaction.c code may
773 move the recovery area or we risk subtle data
776 data_len = (recovery_head - TDB1_DATA_START(tdb->tdb1.header.hash_size));
777 if (tdb1_free_region(tdb, TDB1_DATA_START(tdb->tdb1.header.hash_size), data_len) != 0) {
780 /* and the 2nd free list entry after the recovery area - if any */
781 data_len = tdb->file->map_size - (recovery_head+recovery_size);
782 if (tdb1_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
795 /* Even on files, we can get partial writes due to signals. */
796 bool tdb1_write_all(int fd, const void *buf, size_t count)
800 ret = write(fd, buf, count);
803 buf = (const char *)buf + ret;