2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2004
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 2 of the License, or (at your option) any later version.
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, write to the Free Software
26 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30 /* NOTE: If you use tdbs under valgrind, and in particular if you run
31 * tdbtorture, you may get spurious "uninitialized value" warnings. I
32 * think this is because valgrind doesn't understand that the mmap'd
33 * area may be written to by other processes. Memory can, from the
34 * point of view of the grinded process, spontaneously become
37 * I can think of a few solutions. [mbp 20030311]
39 * 1 - Write suppressions for Valgrind so that it doesn't complain
40 * about this. Probably the most reasonable but people need to
41 * remember to use them.
43 * 2 - Use IO not mmap when running under valgrind. Not so nice.
45 * 3 - Use the special valgrind macros to mark memory as valid at the
46 * right time. Probably too hard -- the process just doesn't know.
64 #include "pppd-private.h"
67 #include "pathnames.h"
69 #define TDB_MAGIC_FOOD "TDB file\n"
70 #define TDB_VERSION (0x26011967 + 6)
71 #define TDB_MAGIC (0x26011999U)
72 #define TDB_FREE_MAGIC (~TDB_MAGIC)
73 #define TDB_DEAD_MAGIC (0xFEE1DEAD)
74 #define TDB_ALIGNMENT 4
75 #define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
76 #define DEFAULT_HASH_SIZE 131
77 #define TDB_PAGE_SIZE 0x2000
78 #define FREELIST_TOP (sizeof(struct tdb_header))
79 #define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
80 #define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
81 #define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
82 #define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
83 #define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
84 #define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1) + TDB_SPINLOCK_SIZE(hash_size))
87 /* NB assumes there is a local variable called "tdb" that is the
88 * current context, also takes doubly-parenthesized print-style
90 #define TDB_LOG(x) (tdb->log_fn?((tdb->log_fn x),0) : 0)
101 #define MAP_FAILED ((void *)-1)
104 /* free memory if the pointer is valid and zero the pointer */
106 #define SAFE_FREE(x) do { if ((x) != NULL) {free((x)); (x)=NULL;} } while(0)
109 #define BUCKET(hash) ((hash) % tdb->header.hash_size)
112 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
113 static TDB_CONTEXT *tdbs = NULL;
115 static int tdb_munmap(TDB_CONTEXT *tdb)
117 if (tdb->flags & TDB_INTERNAL)
122 int ret = munmap(tdb->map_ptr, tdb->map_size);
131 static void tdb_mmap(TDB_CONTEXT *tdb)
133 if (tdb->flags & TDB_INTERNAL)
137 if (!(tdb->flags & TDB_NOMMAP)) {
138 tdb->map_ptr = mmap(NULL, tdb->map_size,
139 PROT_READ|(tdb->read_only? 0:PROT_WRITE),
140 MAP_SHARED|MAP_FILE, tdb->fd, 0);
143 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
146 if (tdb->map_ptr == MAP_FAILED) {
148 TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
149 tdb->map_size, strerror(errno)));
159 /* Endian conversion: we only ever deal with 4 byte quantities */
160 static void *convert(void *buf, u32 size)
163 for (i = 0; i < size / 4; i++)
164 p[i] = TDB_BYTEREV(p[i]);
167 #define DOCONV() (tdb->flags & TDB_CONVERT)
168 #define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
170 /* the body of the database is made of one list_struct for the free space
171 plus a separate data list for each hash value */
173 tdb_off next; /* offset of the next record in the list */
174 tdb_len rec_len; /* total byte length of record */
175 tdb_len key_len; /* byte length of key */
176 tdb_len data_len; /* byte length of data */
177 u32 full_hash; /* the full 32 bit hash of the key */
178 u32 magic; /* try to catch errors */
179 /* the following union is implied:
181 char record[rec_len];
186 u32 totalsize; (tailer)
191 /* a byte range locking function - return 0 on success
192 this functions locks/unlocks 1 byte at the specified offset.
194 On error, errno is also set so that errors are passed back properly
195 through tdb_open(). */
196 static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
197 int rw_type, int lck_type, int probe)
202 if (tdb->flags & TDB_NOLOCK)
204 if ((rw_type == F_WRLCK) && (tdb->read_only)) {
210 fl.l_whence = SEEK_SET;
216 ret = fcntl(tdb->fd,lck_type,&fl);
217 } while (ret == -1 && errno == EINTR);
220 if (!probe && lck_type != F_SETLK) {
221 /* Ensure error code is set for log fun to examine. */
222 tdb->ecode = TDB_ERR_LOCK;
223 TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
224 tdb->fd, offset, rw_type, lck_type));
226 /* Otherwise - generic lock error. errno set by fcntl.
227 * EAGAIN is an expected return from non-blocking
229 if (errno != EAGAIN) {
230 TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n",
231 tdb->fd, offset, rw_type, lck_type,
234 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
239 /* lock a list in the database. list -1 is the alloc list */
240 static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
242 if (list < -1 || list >= (int)tdb->header.hash_size) {
243 TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
247 if (tdb->flags & TDB_NOLOCK)
250 /* Since fcntl locks don't nest, we do a lock for the first one,
251 and simply bump the count for future ones */
252 if (tdb->locked[list+1].count == 0) {
253 if (!tdb->read_only && tdb->header.rwlocks) {
254 if (tdb_spinlock(tdb, list, ltype)) {
255 TDB_LOG((tdb, 0, "tdb_lock spinlock failed on list %d ltype=%d\n",
259 } else if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
260 TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
261 list, ltype, strerror(errno)));
264 tdb->locked[list+1].ltype = ltype;
266 tdb->locked[list+1].count++;
270 /* unlock the database: returns void because it's too late for errors. */
271 /* changed to return int it may be interesting to know there
272 has been an error --simo */
273 static int tdb_unlock(TDB_CONTEXT *tdb, int list, int ltype)
277 if (tdb->flags & TDB_NOLOCK)
281 if (list < -1 || list >= (int)tdb->header.hash_size) {
282 TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
286 if (tdb->locked[list+1].count==0) {
287 TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
291 if (tdb->locked[list+1].count == 1) {
292 /* Down to last nested lock: unlock underneath */
293 if (!tdb->read_only && tdb->header.rwlocks) {
294 ret = tdb_spinunlock(tdb, list, ltype);
296 ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
301 tdb->locked[list+1].count--;
304 TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
308 /* check for an out of bounds access - if it is out of bounds then
309 see if the database has been expanded by someone else and expand
311 note that "len" is the minimum length needed for the db
313 static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
316 if (len <= tdb->map_size)
318 if (tdb->flags & TDB_INTERNAL) {
320 /* Ensure ecode is set for log fn. */
321 tdb->ecode = TDB_ERR_IO;
322 TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
323 (int)len, (int)tdb->map_size));
325 return TDB_ERRCODE(TDB_ERR_IO, -1);
328 if (fstat(tdb->fd, &st) == -1)
329 return TDB_ERRCODE(TDB_ERR_IO, -1);
331 if (st.st_size < (size_t)len) {
333 /* Ensure ecode is set for log fn. */
334 tdb->ecode = TDB_ERR_IO;
335 TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
336 (int)len, (int)st.st_size));
338 return TDB_ERRCODE(TDB_ERR_IO, -1);
341 /* Unmap, update size, remap */
342 if (tdb_munmap(tdb) == -1)
343 return TDB_ERRCODE(TDB_ERR_IO, -1);
344 tdb->map_size = st.st_size;
349 /* write a lump of data at a specified offset */
350 static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
352 if (tdb_oob(tdb, off + len, 0) != 0)
356 memcpy(off + (char *)tdb->map_ptr, buf, len);
358 else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
360 else if (lseek(tdb->fd, off, SEEK_SET) != off
361 || write(tdb->fd, buf, len) != (ssize_t)len) {
363 /* Ensure ecode is set for log fn. */
364 tdb->ecode = TDB_ERR_IO;
365 TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
366 off, len, strerror(errno)));
367 return TDB_ERRCODE(TDB_ERR_IO, -1);
372 /* read a lump of data at a specified offset, maybe convert */
373 static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
375 if (tdb_oob(tdb, off + len, 0) != 0)
379 memcpy(buf, off + (char *)tdb->map_ptr, len);
381 else if (pread(tdb->fd, buf, len, off) != (ssize_t)len) {
383 else if (lseek(tdb->fd, off, SEEK_SET) != off
384 || read(tdb->fd, buf, len) != (ssize_t)len) {
386 /* Ensure ecode is set for log fn. */
387 tdb->ecode = TDB_ERR_IO;
388 TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
389 off, len, strerror(errno)));
390 return TDB_ERRCODE(TDB_ERR_IO, -1);
397 /* read a lump of data, allocating the space for it */
398 static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
402 if (!(buf = malloc(len))) {
403 /* Ensure ecode is set for log fn. */
404 tdb->ecode = TDB_ERR_OOM;
405 TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
406 len, strerror(errno)));
407 return TDB_ERRCODE(TDB_ERR_OOM, buf);
409 if (tdb_read(tdb, offset, buf, len, 0) == -1) {
416 /* read/write a tdb_off */
417 static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
419 return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
421 static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
424 return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
427 /* read/write a record */
428 static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
430 if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
432 if (TDB_BAD_MAGIC(rec)) {
433 /* Ensure ecode is set for log fn. */
434 tdb->ecode = TDB_ERR_CORRUPT;
435 TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
436 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
438 return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
440 static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
442 struct list_struct r = *rec;
443 return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
446 /* read a freelist record and check for simple errors */
447 static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
449 if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
452 if (rec->magic == TDB_MAGIC) {
453 /* this happens when a app is showdown while deleting a record - we should
454 not completely fail when this happens */
455 TDB_LOG((tdb, 0,"rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
457 rec->magic = TDB_FREE_MAGIC;
458 if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
462 if (rec->magic != TDB_FREE_MAGIC) {
463 /* Ensure ecode is set for log fn. */
464 tdb->ecode = TDB_ERR_CORRUPT;
465 TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
467 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
469 if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
474 /* update a record tailer (must hold allocation lock) */
475 static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
476 const struct list_struct *rec)
480 /* Offset of tailer from record header */
481 totalsize = sizeof(*rec) + rec->rec_len;
482 return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
486 /* Remove an element from the freelist. Must have alloc lock. */
487 static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
491 /* read in the freelist top */
492 last_ptr = FREELIST_TOP;
493 while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
495 /* We've found it! */
496 return ofs_write(tdb, last_ptr, &next);
498 /* Follow chain (next offset is at start of record) */
501 TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
502 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
505 /* Add an element into the freelist. Merge adjacent records if
507 static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
511 /* Allocation and tailer lock */
512 if (tdb_lock(tdb, -1, F_WRLCK) != 0)
515 /* set an initial tailer, so if we fail we don't leave a bogus record */
516 if (update_tailer(tdb, offset, rec) != 0) {
517 TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
521 /* Look right first (I'm an Australian, dammit) */
522 right = offset + sizeof(*rec) + rec->rec_len;
523 if (right + sizeof(*rec) <= tdb->map_size) {
524 struct list_struct r;
526 if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
527 TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
531 /* If it's free, expand to include it. */
532 if (r.magic == TDB_FREE_MAGIC) {
533 if (remove_from_freelist(tdb, right, r.next) == -1) {
534 TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
537 rec->rec_len += sizeof(r) + r.rec_len;
543 left = offset - sizeof(tdb_off);
544 if (left > TDB_DATA_START(tdb->header.hash_size)) {
545 struct list_struct l;
548 /* Read in tailer and jump back to header */
549 if (ofs_read(tdb, left, &leftsize) == -1) {
550 TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
553 left = offset - leftsize;
555 /* Now read in record */
556 if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
557 TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
561 /* If it's free, expand to include it. */
562 if (l.magic == TDB_FREE_MAGIC) {
563 if (remove_from_freelist(tdb, left, l.next) == -1) {
564 TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
568 rec->rec_len += leftsize;
574 if (update_tailer(tdb, offset, rec) == -1) {
575 TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
579 /* Now, prepend to free list */
580 rec->magic = TDB_FREE_MAGIC;
582 if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
583 rec_write(tdb, offset, rec) == -1 ||
584 ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
585 TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
589 /* And we're done. */
590 tdb_unlock(tdb, -1, F_WRLCK);
594 tdb_unlock(tdb, -1, F_WRLCK);
599 /* expand a file. we prefer to use ftruncate, as that is what posix
600 says to use for mmap expansion */
601 static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
604 #if HAVE_FTRUNCATE_EXTEND
605 if (ftruncate(tdb->fd, size+addition) != 0) {
606 TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
607 size+addition, strerror(errno)));
614 if (pwrite(tdb->fd, &b, 1, (size+addition) - 1) != 1) {
616 if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (size+addition) - 1 ||
617 write(tdb->fd, &b, 1) != 1) {
619 TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
620 size+addition, strerror(errno)));
625 /* now fill the file with something. This ensures that the file isn't sparse, which would be
626 very bad if we ran out of disk. This must be done with write, not via mmap */
627 memset(buf, 0x42, sizeof(buf));
629 int n = addition>sizeof(buf)?sizeof(buf):addition;
631 int ret = pwrite(tdb->fd, buf, n, size);
634 if (lseek(tdb->fd, size, SEEK_SET) != size)
636 ret = write(tdb->fd, buf, n);
639 TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
640 n, strerror(errno)));
650 /* expand the database at least size bytes by expanding the underlying
651 file and doing the mmap again if necessary */
652 static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
654 struct list_struct rec;
657 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
658 TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
662 /* must know about any previous expansions by another process */
663 tdb_oob(tdb, tdb->map_size + 1, 1);
665 /* always make room for at least 10 more records, and round
666 the database up to a multiple of TDB_PAGE_SIZE */
667 size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
669 if (!(tdb->flags & TDB_INTERNAL))
673 * We must ensure the file is unmapped before doing this
674 * to ensure consistency with systems like OpenBSD where
675 * writes and mmaps are not consistent.
678 /* expand the file itself */
679 if (!(tdb->flags & TDB_INTERNAL)) {
680 if (expand_file(tdb, tdb->map_size, size) != 0)
684 tdb->map_size += size;
686 if (tdb->flags & TDB_INTERNAL)
687 tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
690 * We must ensure the file is remapped before adding the space
691 * to ensure consistency with systems like OpenBSD where
692 * writes and mmaps are not consistent.
695 /* We're ok if the mmap fails as we'll fallback to read/write */
699 /* form a new freelist record */
700 memset(&rec,'\0',sizeof(rec));
701 rec.rec_len = size - sizeof(rec);
703 /* link it into the free list */
704 offset = tdb->map_size - size;
705 if (tdb_free(tdb, offset, &rec) == -1)
708 tdb_unlock(tdb, -1, F_WRLCK);
711 tdb_unlock(tdb, -1, F_WRLCK);
715 /* allocate some space from the free list. The offset returned points
716 to a unconnected list_struct within the database with room for at
717 least length bytes of total data
719 0 is returned if the space could not be allocated
721 static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
722 struct list_struct *rec)
724 tdb_off rec_ptr, last_ptr, newrec_ptr;
725 struct list_struct newrec;
727 memset(&newrec, '\0', sizeof(newrec));
729 if (tdb_lock(tdb, -1, F_WRLCK) == -1)
732 /* Extra bytes required for tailer */
733 length += sizeof(tdb_off);
736 last_ptr = FREELIST_TOP;
738 /* read in the freelist top */
739 if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
742 /* keep looking until we find a freelist record big enough */
744 if (rec_free_read(tdb, rec_ptr, rec) == -1)
747 if (rec->rec_len >= length) {
748 /* found it - now possibly split it up */
749 if (rec->rec_len > length + MIN_REC_SIZE) {
750 /* Length of left piece */
751 length = TDB_ALIGN(length, TDB_ALIGNMENT);
753 /* Right piece to go on free list */
754 newrec.rec_len = rec->rec_len
755 - (sizeof(*rec) + length);
756 newrec_ptr = rec_ptr + sizeof(*rec) + length;
758 /* And left record is shortened */
759 rec->rec_len = length;
763 /* Remove allocated record from the free list */
764 if (ofs_write(tdb, last_ptr, &rec->next) == -1)
767 /* Update header: do this before we drop alloc
768 lock, otherwise tdb_free() might try to
769 merge with us, thinking we're free.
770 (Thanks Jeremy Allison). */
771 rec->magic = TDB_MAGIC;
772 if (rec_write(tdb, rec_ptr, rec) == -1)
775 /* Did we create new block? */
777 /* Update allocated record tailer (we
779 if (update_tailer(tdb, rec_ptr, rec) == -1)
782 /* Free new record */
783 if (tdb_free(tdb, newrec_ptr, &newrec) == -1)
787 /* all done - return the new record offset */
788 tdb_unlock(tdb, -1, F_WRLCK);
791 /* move to the next record */
795 /* we didn't find enough space. See if we can expand the
796 database and if we can then try again */
797 if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
800 tdb_unlock(tdb, -1, F_WRLCK);
804 /* initialise a new database with a specified hash size */
805 static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
807 struct tdb_header *newdb;
810 /* We make it up in memory, then write it out if not internal */
811 size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
812 if (!(newdb = calloc(1, size)))
813 return TDB_ERRCODE(TDB_ERR_OOM, -1);
815 /* Fill in the header */
816 newdb->version = TDB_VERSION;
817 newdb->hash_size = hash_size;
818 if (tdb->flags & TDB_INTERNAL) {
819 tdb->map_size = size;
820 tdb->map_ptr = (char *)newdb;
821 memcpy(&tdb->header, newdb, sizeof(tdb->header));
822 /* Convert the `ondisk' version if asked. */
826 if (lseek(tdb->fd, 0, SEEK_SET) == -1)
829 if (ftruncate(tdb->fd, 0) == -1)
832 /* This creates an endian-converted header, as if read from disk */
834 memcpy(&tdb->header, newdb, sizeof(tdb->header));
835 /* Don't endian-convert the magic food! */
836 memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
837 if (write(tdb->fd, newdb, size) != size)
840 ret = tdb_create_rwlocks(tdb->fd, hash_size);
847 /* Returns 0 on fail. On success, return offset of record, and fills
849 static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
850 struct list_struct *r)
854 /* read in the hash top */
855 if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
858 /* keep looking until we find the right record */
860 if (rec_read(tdb, rec_ptr, r) == -1)
863 if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
865 /* a very likely hit - read the key */
866 k = tdb_alloc_read(tdb, rec_ptr + sizeof(*r),
871 if (memcmp(key.dptr, k, key.dsize) == 0) {
879 return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
882 /* As tdb_find, but if you succeed, keep the lock */
883 static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, int locktype,
884 struct list_struct *rec)
888 if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
890 if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
891 tdb_unlock(tdb, BUCKET(hash), locktype);
895 enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
900 static struct tdb_errname {
901 enum TDB_ERROR ecode; const char *estring;
902 } emap[] = { {TDB_SUCCESS, "Success"},
903 {TDB_ERR_CORRUPT, "Corrupt database"},
904 {TDB_ERR_IO, "IO Error"},
905 {TDB_ERR_LOCK, "Locking error"},
906 {TDB_ERR_OOM, "Out of memory"},
907 {TDB_ERR_EXISTS, "Record exists"},
908 {TDB_ERR_NOLOCK, "Lock exists on other keys"},
909 {TDB_ERR_NOEXIST, "Record does not exist"} };
911 /* Error string for the last tdb error */
912 const char *tdb_errorstr(TDB_CONTEXT *tdb)
915 for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
916 if (tdb->ecode == emap[i].ecode)
917 return emap[i].estring;
918 return "Invalid error code";
921 /* update an entry in place - this only works if the new data size
922 is <= the old data size and the key exists.
923 on failure return -1.
926 static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
928 struct list_struct rec;
932 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
935 /* must be long enough key, data and tailer */
936 if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
937 tdb->ecode = TDB_SUCCESS; /* Not really an error */
941 if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
942 dbuf.dptr, dbuf.dsize) == -1)
945 if (dbuf.dsize != rec.data_len) {
947 rec.data_len = dbuf.dsize;
948 return rec_write(tdb, rec_ptr, &rec);
954 /* find an entry in the database given a key */
955 /* If an entry doesn't exist tdb_err will be set to
956 * TDB_ERR_NOEXIST. If a key has no data attached
957 * tdb_err will not be set. Both will return a
958 * zero pptr and zero dsize.
961 TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
964 struct list_struct rec;
968 /* find which hash bucket it is in */
969 hash = tdb->hash_fn(&key);
970 if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
974 ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
978 ret.dsize = rec.data_len;
979 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
983 /* check if an entry in the database exists
985 note that 1 is returned if the key is found and 0 is returned if not found
986 this doesn't match the conventions in the rest of this module, but is
989 static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
991 struct list_struct rec;
993 if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
995 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
999 /* record lock stops delete underneath */
1000 static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1002 return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1005 Write locks override our own fcntl readlocks, so check it here.
1006 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1007 an error to fail to get the lock here.
1010 static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1012 struct tdb_traverse_lock *i;
1013 for (i = &tdb->travlocks; i; i = i->next)
1016 return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1020 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1021 an error to fail to get the lock here.
1024 static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1026 return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1028 /* fcntl locks don't stack: avoid unlocking someone else's */
1029 static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1031 struct tdb_traverse_lock *i;
1036 for (i = &tdb->travlocks; i; i = i->next)
1039 return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1042 /* actually delete an entry in the database given the offset */
1043 static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1045 tdb_off last_ptr, i;
1046 struct list_struct lastrec;
1048 if (tdb->read_only) return -1;
1050 if (write_lock_record(tdb, rec_ptr) == -1) {
1051 /* Someone traversing here: mark it as dead */
1052 rec->magic = TDB_DEAD_MAGIC;
1053 return rec_write(tdb, rec_ptr, rec);
1055 if (write_unlock_record(tdb, rec_ptr) != 0)
1058 /* find previous record in hash chain */
1059 if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1061 for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1062 if (rec_read(tdb, i, &lastrec) == -1)
1065 /* unlink it: next ptr is at start of record. */
1067 last_ptr = TDB_HASH_TOP(rec->full_hash);
1068 if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1071 /* recover the space */
1072 if (tdb_free(tdb, rec_ptr, rec) == -1)
1077 /* delete an entry in the database given a key */
1078 static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1081 struct list_struct rec;
1084 if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
1086 ret = do_delete(tdb, rec_ptr, &rec);
1087 if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
1088 TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
1092 int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1094 u32 hash = tdb->hash_fn(&key);
1095 return tdb_delete_hash(tdb, key, hash);
1098 /* store an element in the database, replacing any existing element
1101 return 0 on success, -1 on failure
1103 int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1105 struct list_struct rec;
1111 /* find which hash bucket it is in */
1112 hash = tdb->hash_fn(&key);
1113 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1116 /* check for it existing, on insert. */
1117 if (flag == TDB_INSERT) {
1118 if (tdb_exists_hash(tdb, key, hash)) {
1119 tdb->ecode = TDB_ERR_EXISTS;
1123 /* first try in-place update, on modify or replace. */
1124 if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
1126 if (tdb->ecode == TDB_ERR_NOEXIST &&
1127 flag == TDB_MODIFY) {
1128 /* if the record doesn't exist and we are in TDB_MODIFY mode then
1129 we should fail the store */
1133 /* reset the error code potentially set by the tdb_update() */
1134 tdb->ecode = TDB_SUCCESS;
1136 /* delete any existing record - if it doesn't exist we don't
1137 care. Doing this first reduces fragmentation, and avoids
1138 coalescing with `allocated' block before it's updated. */
1139 if (flag != TDB_INSERT)
1140 tdb_delete_hash(tdb, key, hash);
1142 /* Copy key+value *before* allocating free space in case malloc
1143 fails and we are left with a dead spot in the tdb. */
1145 if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
1146 tdb->ecode = TDB_ERR_OOM;
1150 memcpy(p, key.dptr, key.dsize);
1152 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
1154 /* we have to allocate some space */
1155 if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
1158 /* Read hash top into next ptr */
1159 if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1162 rec.key_len = key.dsize;
1163 rec.data_len = dbuf.dsize;
1164 rec.full_hash = hash;
1165 rec.magic = TDB_MAGIC;
1167 /* write out and point the top of the hash chain at it */
1168 if (rec_write(tdb, rec_ptr, &rec) == -1
1169 || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
1170 || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1171 /* Need to tdb_unallocate() here */
1176 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1183 static int tdb_already_open(dev_t device,
1188 for (i = tdbs; i; i = i->next) {
1189 if (i->device == device && i->inode == ino) {
1197 /* This is based on the hash algorithm from gdbm */
1198 static u32 default_tdb_hash(TDB_DATA *key)
1200 u32 value; /* Used to compute the hash value. */
1201 u32 i; /* Used to cycle through random values. */
1203 /* Set the initial value from the key size. */
1204 for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
1205 value = (value + (key->dptr[i] << (i*5 % 24)));
1207 return (1103515243 * value + 12345);
1210 /* open the database, creating it if necessary
1212 The open_flags and mode are passed straight to the open call on the
1213 database file. A flags value of O_WRONLY is invalid. The hash size
1214 is advisory, use zero for a default value.
1216 Return is NULL on error, in which case errno is also set. Don't
1217 try to call tdb_error or tdb_errname, just do strerror(errno).
1219 @param name may be NULL for internal databases. */
1220 TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
1221 int open_flags, mode_t mode)
1223 return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL, NULL);
1227 TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
1228 int open_flags, mode_t mode,
1229 tdb_log_func log_fn,
1230 tdb_hash_func hash_fn)
1234 int rev = 0, locked = 0;
1238 if (!(tdb = calloc(1, sizeof *tdb))) {
1239 /* Can't log this */
1245 tdb->map_ptr = NULL;
1246 tdb->flags = tdb_flags;
1247 tdb->open_flags = open_flags;
1248 tdb->log_fn = log_fn;
1249 tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
1251 if ((open_flags & O_ACCMODE) == O_WRONLY) {
1252 TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
1259 hash_size = DEFAULT_HASH_SIZE;
1260 if ((open_flags & O_ACCMODE) == O_RDONLY) {
1262 /* read only databases don't do locking or clear if first */
1263 tdb->flags |= TDB_NOLOCK;
1264 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1267 /* internal databases don't mmap or lock, and start off cleared */
1268 if (tdb->flags & TDB_INTERNAL) {
1269 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
1270 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1271 if (tdb_new_database(tdb, hash_size) != 0) {
1272 TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
1279 if ((tdb->fd = open(name, open_flags, mode)) == -1) {
1280 if ((open_flags & O_CREAT) && errno == ENOENT &&
1281 mkdir_recursive(PPP_PATH_VARRUN) == 0)
1284 TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
1285 name, strerror(errno)));
1286 goto fail; /* errno set by open(2) */
1289 /* ensure there is only one process initialising at once */
1290 if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
1291 TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
1292 name, strerror(errno)));
1293 goto fail; /* errno set by tdb_brlock */
1296 /* we need to zero database if we are the only one with it open */
1297 if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
1298 (locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
1299 open_flags |= O_CREAT;
1300 if (ftruncate(tdb->fd, 0) == -1) {
1301 TDB_LOG((tdb, 0, "tdb_open_ex: "
1302 "failed to truncate %s: %s\n",
1303 name, strerror(errno)));
1304 goto fail; /* errno set by ftruncate */
1308 if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
1309 || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
1310 || (tdb->header.version != TDB_VERSION
1311 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
1312 /* its not a valid database - possibly initialise it */
1313 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
1314 errno = EIO; /* ie bad format or something */
1317 rev = (tdb->flags & TDB_CONVERT);
1319 vp = (unsigned char *)&tdb->header.version;
1320 vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
1321 (((u32)vp[2]) << 8) | (u32)vp[3];
1322 tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
1324 tdb->flags &= ~TDB_CONVERT;
1326 tdb->flags |= TDB_CONVERT;
1327 convert(&tdb->header, sizeof(tdb->header));
1329 if (fstat(tdb->fd, &st) == -1)
1332 /* Is it already in the open list? If so, fail. */
1333 if (tdb_already_open(st.st_dev, st.st_ino)) {
1334 TDB_LOG((tdb, 2, "tdb_open_ex: "
1335 "%s (%d,%d) is already open in this process\n",
1336 name, (int)st.st_dev, (int)st.st_ino));
1341 if (!(tdb->name = (char *)strdup(name))) {
1346 tdb->map_size = st.st_size;
1347 tdb->device = st.st_dev;
1348 tdb->inode = st.st_ino;
1349 tdb->locked = calloc(tdb->header.hash_size+1, sizeof(tdb->locked[0]));
1351 TDB_LOG((tdb, 2, "tdb_open_ex: "
1352 "failed to allocate lock structure for %s\n",
1359 if (!tdb->read_only)
1360 if (tdb_clear_spinlocks(tdb) != 0) {
1361 TDB_LOG((tdb, 0, "tdb_open_ex: "
1362 "failed to clear spinlock\n"));
1365 if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
1366 TDB_LOG((tdb, 0, "tdb_open_ex: "
1367 "failed to take ACTIVE_LOCK on %s: %s\n",
1368 name, strerror(errno)));
1374 /* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
1375 we didn't get the initial exclusive lock as we need to let all other
1376 users know we're using it. */
1378 if (tdb_flags & TDB_CLEAR_IF_FIRST) {
1379 /* leave this lock in place to indicate it's in use */
1380 if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
1386 /* Internal (memory-only) databases skip all the code above to
1387 * do with disk files, and resume here by releasing their
1388 * global lock and hooking into the active list. */
1389 if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
1396 { int save_errno = errno;
1402 if (tdb->flags & TDB_INTERNAL)
1403 SAFE_FREE(tdb->map_ptr);
1407 SAFE_FREE(tdb->name);
1409 if (close(tdb->fd) != 0)
1410 TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
1411 SAFE_FREE(tdb->locked);
1421 * @returns -1 for error; 0 for success.
1423 int tdb_close(TDB_CONTEXT *tdb)
1429 if (tdb->flags & TDB_INTERNAL)
1430 SAFE_FREE(tdb->map_ptr);
1434 SAFE_FREE(tdb->name);
1436 ret = close(tdb->fd);
1437 SAFE_FREE(tdb->locked);
1439 /* Remove from contexts list */
1440 for (i = &tdbs; *i; i = &(*i)->next) {
1447 memset(tdb, 0, sizeof(*tdb));
1453 /* lock/unlock one hash chain. This is meant to be used to reduce
1454 contention - it cannot guarantee how many records will be locked */
1455 int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
1457 return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
1460 int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
1462 return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);