2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
9 Copyright (C) Rusty Russell 2010
11 ** NOTE! The following LGPL license applies to the tdb
12 ** library. This does NOT imply that all of Samba is released
15 This library is free software; you can redistribute it and/or
16 modify it under the terms of the GNU Lesser General Public
17 License as published by the Free Software Foundation; either
18 version 3 of the License, or (at your option) any later version.
20 This library is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 Lesser General Public License for more details.
25 You should have received a copy of the GNU Lesser General Public
26 License along with this library; if not, see <http://www.gnu.org/licenses/>.
29 #include <ccan/likely/likely.h>
/*
 * Release the memory mapping recorded in the context.
 *
 * tdb: database context; its flags, map_ptr and map_size fields are read.
 *
 * NOTE(review): this listing omits several lines of the body (the statement
 * guarded by the TDB_INTERNAL test is not shown), so only the visible steps
 * are described here.
 */
31 void tdb_munmap(struct tdb_context *tdb)
/* The TDB_INTERNAL flag is checked before any unmapping is attempted. */
33 if (tdb->flags & TDB_INTERNAL)
/* Unmap the map_size bytes that start at map_ptr. */
37 munmap(tdb->map_ptr, tdb->map_size);
/*
 * Map the database file into memory and store the address in tdb->map_ptr.
 *
 * tdb: database context; flags, read_only, fd and map_size are read and
 *      map_ptr is written.
 *
 * The mapping is MAP_SHARED over tdb->fd, starting at file offset 0 and
 * covering map_size bytes.  A failed mmap is reported through tdb->log at
 * TDB_DEBUG_WARNING level.
 * NOTE(review): the statements guarded by the TDB_INTERNAL and TDB_NOMMAP
 * tests are not shown in this listing; presumably both skip the mapping.
 */
42 void tdb_mmap(struct tdb_context *tdb)
44 if (tdb->flags & TDB_INTERNAL)
47 if (tdb->flags & TDB_NOMMAP)
/* PROT_WRITE is requested only when the context is not read-only. */
50 tdb->map_ptr = mmap(NULL, tdb->map_size,
51 PROT_READ|(tdb->read_only? 0:PROT_WRITE),
52 MAP_SHARED, tdb->fd, 0);
55 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
57 if (tdb->map_ptr == MAP_FAILED) {
/* Log the size that could not be mapped together with the errno text. */
59 tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
60 "tdb_mmap failed for size %lld (%s)\n",
61 (long long)tdb->map_size, strerror(errno));
65 /* check for an out of bounds access - if it is out of bounds then
66 see if the database has been expanded by someone else and expand
68 note that "len" is the minimum length needed for the db
70 static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
73 if (len <= tdb->map_size)
75 if (tdb->flags & TDB_INTERNAL) {
77 /* Ensure ecode is set for log fn. */
/*
 * A request longer than map_size on a TDB_INTERNAL database is recorded
 * as TDB_ERR_IO; the log text refers to the internal malloc size.
 * NOTE(review): the probe parameter is not referenced on any line shown
 * in this listing; presumably it controls whether the failure is logged.
 */
78 tdb->ecode = TDB_ERR_IO;
79 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
80 "tdb_oob len %lld beyond internal"
81 " malloc size %lld\n",
83 (long long)tdb->map_size);
/* For a database backed by tdb->fd, ask the file system for its size. */
88 if (fstat(tdb->fd, &st) == -1) {
89 tdb->ecode = TDB_ERR_IO;
/*
 * The file is shorter than the requested length: also TDB_ERR_IO.
 * NOTE(review): st.st_size is compared against a size_t cast of len,
 * which mixes a signed file size with an unsigned value.
 */
93 if (st.st_size < (size_t)len) {
95 /* Ensure ecode is set for log fn. */
96 tdb->ecode = TDB_ERR_IO;
97 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
98 "tdb_oob len %lld beyond eof at %lld\n",
99 (long long)len, (long long)st.st_size);
104 /* Unmap, update size, remap */
/* The size reported by fstat becomes the new map_size. */
106 tdb->map_size = st.st_size;
/*
 * Return a pointer straight into the mapped file for the byte range that
 * starts at off, so the caller can avoid copying.
 *
 * tdb: database context.
 * off: byte offset into the database.
 * len: number of bytes the caller intends to access from off.
 *
 * Three conditions are tested before the pointer is handed out: no mapping
 * (map_ptr is NULL), a transaction in progress, and the range failing
 * tdb_oob() in probe mode.
 * NOTE(review): the statements guarded by those tests are not shown in this
 * listing; callers in this file fall back to tdb_alloc_read() when the
 * result is NULL, so presumably each of them returns NULL.
 * NOTE(review): off + len is computed without an overflow check.
 */
111 static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
113 if (unlikely(!tdb->map_ptr))
116 /* FIXME: We can do a subset of this! */
117 if (tdb->transaction)
120 if (unlikely(tdb_oob(tdb, off + len, true) == -1))
/* Success: the address is simply the mapping base plus the offset. */
122 return (char *)tdb->map_ptr + off;
125 /* Either make a copy into pad and return that, or return ptr into mmap. */
126 /* Note: pad has to be a real object, so we can't get here if len
127 * overflows size_t */
/*
 * Fetch len bytes at off, either as a pointer into the mapping or as a
 * copy placed in the caller-supplied pad buffer.
 *
 * tdb: database context.
 * off: byte offset into the database.
 * pad: caller storage that receives the copy when the direct pointer is
 *      not used.
 * len: number of bytes wanted.
 *
 * NOTE(review): the failure returns and the handling of the tdb_direct()
 * result sit on lines not shown in this listing.
 */
128 void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len)
/* Without TDB_CONVERT, a direct pointer into the mapping is tried first. */
130 if (likely(!(tdb->flags & TDB_CONVERT))) {
131 void *ret = tdb_direct(tdb, off, len);
/* Copy path: bounds-check (probe is false), read into pad, then convert. */
136 if (unlikely(tdb_oob(tdb, off + len, false) == -1))
139 if (tdb->methods->read(tdb, off, pad, len) == -1)
141 return tdb_convert(tdb, pad, len);
144 /* Endian conversion: we only ever deal with 8 byte quantities */
/*
 * Byte-swap a buffer of 64-bit words in place when TDB_CONVERT is set.
 *
 * tdb:  database context; only its flags are read.
 * buf:  data to convert, accessed as an array of uint64_t.
 * size: length of buf in bytes; size / 8 whole words are swapped, so any
 *       trailing bytes beyond a multiple of 8 are left untouched.
 *
 * NOTE(review): the return statement is not shown in this listing;
 * tdb_write_convert() passes the result on as the buffer to write, so
 * presumably buf itself is returned.
 */
145 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
147 if (unlikely((tdb->flags & TDB_CONVERT))) {
148 uint64_t i, *p = (uint64_t *)buf;
/* Swap each complete 8-byte word. */
149 for (i = 0; i < size / 8; i++)
150 p[i] = bswap_64(p[i]);
155 /* Return first non-zero offset in num offset array, or num. */
156 /* FIXME: Return the off? */
/*
 * Scan an array of num tdb_off_t values stored at off.
 *
 * The array is accessed directly through the mapping when tdb_direct()
 * yields a pointer; otherwise it is copied with tdb_alloc_read().
 *
 * NOTE(review): the rest of the parameter list, the loop body and the
 * return statements are not shown in this listing.
 * NOTE(review): `!unlikely(val)` tests for a NULL pointer, but the branch
 * hint is applied to val rather than to !val; the unlikely(!ptr) form used
 * by tdb_parse_data() in this file looks like what was intended.
 */
157 uint64_t tdb_find_nonzero_off(struct tdb_context *tdb, tdb_off_t off,
163 val = tdb_direct(tdb, off, num * sizeof(tdb_off_t));
164 if (!unlikely(val)) {
165 val = tdb_alloc_read(tdb, off, num * sizeof(tdb_off_t));
/* Walk the entries in index order. */
171 for (i = 0; i < num; i++) {
180 /* Return first zero offset in num offset array, or num. */
/*
 * Scan an array of num tdb_off_t values stored at off.
 *
 * The array is accessed directly through the mapping when tdb_direct()
 * yields a pointer; otherwise it is copied with tdb_alloc_read().
 *
 * NOTE(review): the rest of the parameter list, the loop body and the
 * return statements are not shown in this listing.
 * NOTE(review): `!unlikely(val)` tests for a NULL pointer, but the branch
 * hint is applied to val rather than to !val; the unlikely(!ptr) form used
 * by tdb_parse_data() in this file looks like what was intended.
 */
181 uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
187 val = tdb_direct(tdb, off, num * sizeof(tdb_off_t));
188 if (!unlikely(val)) {
189 val = tdb_alloc_read(tdb, off, num * sizeof(tdb_off_t));
/* Walk the entries in index order. */
195 for (i = 0; i < num; i++) {
/*
 * Write data from buf to the file at offset off.
 *
 * tdb:  database context; fd is written to, ecode and log are used on error.
 * buf:  source bytes.
 * size: number of bytes available in buf.
 * off:  file offset to write at.
 * len:  total number of bytes requested.
 *
 * Each write is limited to the smaller of len and size.  A failed write
 * sets TDB_ERR_IO and logs at TDB_DEBUG_FATAL level.
 * NOTE(review): the loop construct and the return statements are not shown
 * in this listing; presumably the write repeats until len bytes are done.
 */
204 static int fill(struct tdb_context *tdb,
205 const void *buf, size_t size,
206 tdb_off_t off, tdb_len_t len)
/* Never write more than the buffer holds in one call. */
209 size_t n = len > size ? size : len;
211 if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
212 tdb->ecode = TDB_ERR_IO;
213 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
214 "fill write failed: giving up!\n");
/*
 * Zero a region of the database, using a zero-filled buffer and fill()
 * on the path shown here.
 *
 * tdb: database context.
 * off: byte offset of the region.
 * len: length of the region in bytes.
 *
 * NOTE(review): the handling of a non-NULL tdb_direct() result is on lines
 * not shown in this listing.
 */
223 int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
225 void *p = tdb_direct(tdb, off, len);
/* 8192 bytes of zeros used as the source for the write-based path. */
230 char buf[8192] = { 0 };
/*
 * NOTE(review): fill() declares its last two parameters as (off, len) but
 * this call passes (len, off); the arguments look swapped, which would
 * write off bytes at offset len.  Confirm and correct.
 */
231 return fill(tdb, buf, sizeof(buf), len, off);
/*
 * Read a single tdb_off_t stored at off.
 *
 * tdb: database context.
 * off: byte offset of the value.
 *
 * NOTE(review): the declarations of ret and pad, the error check and the
 * return statement are not shown in this listing.
 */
235 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
/* pad is the local storage tdb_get() may copy into. */
239 ret = tdb_get(tdb, off, &pad, sizeof(pad));
246 /* Even on files, we can get partial writes due to signals. */
/*
 * Write len bytes from buf to fd at file offset off using pwrite().
 *
 * fd:  file descriptor to write to.
 * buf: source bytes.
 * len: number of bytes to write.
 * off: file offset of the first byte.
 *
 * NOTE(review): the loop, the error test on ret and the return statements
 * are not shown in this listing; presumably len and off are adjusted along
 * with buf so that a short write is continued.
 */
247 bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off)
251 ret = pwrite(fd, buf, len, off);
/* Step the source pointer past the bytes just written. */
258 buf = (char *)buf + ret;
265 /* Even on files, we can get partial reads due to signals. */
/*
 * Read len bytes into buf from fd at file offset off using pread().
 *
 * fd:  file descriptor to read from.
 * buf: destination buffer.
 * len: number of bytes to read.
 * off: file offset of the first byte.
 *
 * NOTE(review): the loop, the tests on ret and the return statements are
 * not shown in this listing; presumably len and off are adjusted along
 * with buf so that a short read is continued.
 */
266 bool tdb_pread_all(int fd, void *buf, size_t len, tdb_off_t off)
270 ret = pread(fd, buf, len, off);
/* Step the destination pointer past the bytes just read. */
278 buf = (char *)buf + ret;
/*
 * Read len bytes into buf from fd using read(), i.e. from the descriptor's
 * current file position rather than an explicit offset.
 *
 * fd:  file descriptor to read from.
 * buf: destination buffer.
 * len: number of bytes to read.
 *
 * NOTE(review): the loop, the tests on ret and the return statements are
 * not shown in this listing.
 */
285 bool tdb_read_all(int fd, void *buf, size_t len)
289 ret = read(fd, buf, len);
/* Step the destination pointer past the bytes just read. */
297 buf = (char *)buf + ret;
303 /* write a lump of data at a specified offset */
/*
 * Write len bytes from buf into the database at offset off.
 *
 * tdb: database context.
 * off: byte offset to write at.
 * buf: source bytes.
 * len: number of bytes to write.
 *
 * A read-only context is rejected with TDB_ERR_RDONLY.  The range is then
 * checked through methods->oob.  Two write paths are visible: a memcpy
 * into the mapping and a tdb_pwrite_all() on the file descriptor.
 * NOTE(review): the condition that selects between the two paths and the
 * return statements are on lines not shown in this listing.
 */
304 static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
305 const void *buf, tdb_len_t len)
311 if (tdb->read_only) {
312 tdb->ecode = TDB_ERR_RDONLY;
316 if (tdb->methods->oob(tdb, off + len, 0) != 0)
/* Mapped path: copy straight into the mapping at the given offset. */
320 memcpy(off + (char *)tdb->map_ptr, buf, len);
/* File path: a failed write is recorded as TDB_ERR_IO and logged. */
322 if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
323 tdb->ecode = TDB_ERR_IO;
/*
 * NOTE(review): off and len are passed to %llu without a cast, unlike
 * tdb_read() in this file which casts to long long; confirm the argument
 * types match the format on every platform.
 */
324 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
325 "tdb_write failed at %llu len=%llu (%s)\n",
326 off, len, strerror(errno));
333 /* read a lump of data at a specified offset */
/*
 * Read len bytes from the database at offset off into buf.
 *
 * tdb: database context.
 * off: byte offset to read from.
 * buf: destination buffer.
 *
 * The range is checked through methods->oob.  Two read paths are visible:
 * a memcpy out of the mapping and a tdb_pread_all() on the file descriptor.
 * NOTE(review): the end of the parameter list, the condition that selects
 * between the two paths and the return statements are on lines not shown
 * in this listing.
 */
334 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
337 if (tdb->methods->oob(tdb, off + len, 0) != 0) {
/* Mapped path: copy straight out of the mapping. */
342 memcpy(buf, off + (char *)tdb->map_ptr, len);
/* File path: a failed read is recorded as TDB_ERR_IO and logged. */
344 if (!tdb_pread_all(tdb->fd, buf, len, off)) {
345 /* Ensure ecode is set for log fn. */
346 tdb->ecode = TDB_ERR_IO;
347 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
348 "tdb_read failed at %lld "
349 "len=%lld (%s) map_size=%lld\n",
350 (long long)off, (long long)len,
352 (long long)tdb->map_size);
/*
 * Pass rec through tdb_convert() and write the result at off via
 * methods->write.
 *
 * tdb: database context.
 * off: byte offset to write at.
 * rec: data to write; tdb_convert() swaps it in place when TDB_CONVERT is
 *      set, so the caller's buffer may be modified.
 * len: number of bytes to convert and write.
 *
 * Returns whatever methods->write returns.
 */
359 int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
360 void *rec, size_t len)
362 return tdb->methods->write(tdb, off, tdb_convert(tdb, rec, len), len);
/*
 * Read len bytes at off into rec via methods->read, then pass rec through
 * tdb_convert().
 *
 * tdb: database context.
 * off: byte offset to read from.
 * rec: destination buffer, converted in place.
 * len: number of bytes to read and convert.
 *
 * NOTE(review): the return statement is not shown in this listing;
 * presumably ret is returned.  On the lines shown the conversion is not
 * conditional on the read having succeeded.
 */
365 int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
366 void *rec, size_t len)
368 int ret = tdb->methods->read(tdb, off, rec, len);
369 tdb_convert(tdb, rec, len);
/*
 * Write the single offset value val at off.
 *
 * tdb: database context.
 * off: byte offset to write at.
 * val: value to store; it is a by-value parameter, so any in-place
 *      conversion done by tdb_write_convert() affects only this copy.
 *
 * Returns whatever tdb_write_convert() returns.
 */
373 int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
375 return tdb_write_convert(tdb, off, &val, sizeof(val));
378 /* read a lump of data, allocating the space for it */
/*
 * Allocate a buffer with malloc() and fill it with len bytes read from
 * offset via methods->read.
 *
 * tdb:    database context.
 * offset: byte offset to read from.
 * len:    number of bytes to read; a zero length still allocates one byte.
 *
 * An allocation failure sets TDB_ERR_OOM and logs at TDB_DEBUG_ERROR level.
 * NOTE(review): the body of the read-failure branch and the return
 * statement are not shown in this listing; presumably the buffer is freed
 * and NULL returned on failure, and the caller owns the buffer otherwise.
 */
379 void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
383 /* some systems don't like zero length malloc */
384 buf = malloc(len ? len : 1);
385 if (unlikely(!buf)) {
386 tdb->ecode = TDB_ERR_OOM;
387 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
388 "tdb_alloc_read malloc failed len=%lld\n",
/* Any non-zero result from the read method takes the failure branch. */
390 } else if (unlikely(tdb->methods->read(tdb, offset, buf, len))) {
/*
 * Compute the hash of the key belonging to the used record at off.
 *
 * tdb: database context.
 * off: byte offset of the record header.
 *
 * The header is fetched with tdb_get(), the key length comes from
 * rec_key_length(), and the key bytes are taken from immediately after the
 * header (off + sizeof(pad)).  The key is hashed in place when a direct
 * pointer is available and from a tdb_alloc_read() copy otherwise.
 * NOTE(review): the error checks, the test on the direct pointer, the
 * release of the copy and the final return are on lines not shown in this
 * listing.
 */
397 uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off)
399 struct tdb_used_record pad, *r;
403 r = tdb_get(tdb, off, &pad, sizeof(pad));
408 klen = rec_key_length(r);
/* Fast path: hash the key where it lies in the mapping. */
409 key = tdb_direct(tdb, off + sizeof(pad), klen);
411 return tdb_hash(tdb, key, klen);
/* Slow path: hash a heap copy of the key. */
413 key = tdb_alloc_read(tdb, off + sizeof(pad), klen);
416 hash = tdb_hash(tdb, key, klen);
421 /* Give a piece of tdb data to a parser */
/*
 * Hand len bytes of database data at offset to a caller-supplied parser.
 *
 * tdb:    database context.
 * key:    passed through unchanged as the parser's first argument.
 * offset: byte offset of the data.
 * len:    length of the data in bytes.
 * parser: callback invoked as parser(key, data, private_data); its result
 *         is stored in result.
 *
 * The data pointer comes from tdb_direct() when possible and from
 * tdb_alloc_read() otherwise; the allocated flag tracks which.
 * NOTE(review): the rest of the parameter list, the failure return, the
 * statement guarded by the final allocated test and the return of result
 * are on lines not shown in this listing.
 */
422 int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
423 tdb_off_t offset, tdb_len_t len,
424 int (*parser)(TDB_DATA key, TDB_DATA data,
430 bool allocated = false;
433 data.dptr = tdb_direct(tdb, offset, len);
/* No direct pointer available: fall back to a heap copy. */
434 if (unlikely(!data.dptr)) {
435 if (!(data.dptr = tdb_alloc_read(tdb, offset, len))) {
440 result = parser(key, data, private_data);
441 if (unlikely(allocated))
446 /* expand a file. we prefer to use ftruncate, as that is what posix
447 says to use for mmap expansion */
/*
 * Grow the database file and fill the new space with non-zero bytes.
 *
 * tdb:      database context.
 * size:     first term of the new file length passed to ftruncate().
 * addition: second term of that length.
 *
 * A read-only context is rejected with TDB_ERR_RDONLY.  The file is
 * truncated to size + addition, and the region is then written with a
 * buffer of 0x43 bytes through fill().
 * NOTE(review): the declaration of buf, the statement guarded by the
 * ftruncate() test and the read-only return are on lines not shown in this
 * listing.
 */
448 static int tdb_expand_file(struct tdb_context *tdb,
449 tdb_len_t size, tdb_len_t addition)
453 if (tdb->read_only) {
454 tdb->ecode = TDB_ERR_RDONLY;
458 /* If this fails, we try to fill anyway. */
459 if (ftruncate(tdb->fd, size+addition))
462 /* now fill the file with something. This ensures that the
463 file isn't sparse, which would be very bad if we ran out of
464 disk. This must be done with write, not via mmap */
465 memset(buf, 0x43, sizeof(buf));
/*
 * NOTE(review): fill() declares its last two parameters as (off, len) but
 * this call passes (addition, size).  If size is the current length and
 * addition the growth, as the ftruncate() call suggests, the arguments
 * look swapped.  Confirm and correct.
 */
466 return fill(tdb, buf, sizeof(buf), addition, size);
/*
 * Obtain read access to len bytes at off, preferring a pointer into the
 * mapping and otherwise a copy made by tdb_alloc_read().
 *
 * tdb: database context.
 * off: byte offset of the data.
 * len: number of bytes wanted.
 *
 * NOTE(review): the test between the two calls and the return statement
 * are not shown in this listing; presumably the copy is made only when
 * tdb_direct() yields NULL, and the result is later handed to
 * tdb_access_release().
 */
469 const void *tdb_access_read(struct tdb_context *tdb,
470 tdb_off_t off, tdb_len_t len)
472 const void *ret = tdb_direct(tdb, off, len);
475 ret = tdb_alloc_read(tdb, off, len);
/*
 * Release a pointer previously obtained for read access.
 *
 * tdb: database context.
 * p:   pointer to release.
 *
 * The visible part of the condition tests whether p lies outside the range
 * [map_ptr, map_ptr + map_size), i.e. whether it does not point into the
 * mapping.
 * NOTE(review): the start of the condition and the guarded statement are
 * not shown in this listing; presumably a pointer outside the mapping is a
 * heap copy and is freed.
 */
479 void tdb_access_release(struct tdb_context *tdb, const void *p)
482 || (char *)p < (char *)tdb->map_ptr
483 || (char *)p >= (char *)tdb->map_ptr + tdb->map_size)
488 /* write a lump of data at a specified offset */
/*
 * Write len bytes from buf into the database at offset off, retrying a
 * short pwrite() once.
 *
 * tdb: database context.
 * off: byte offset to write at.
 * buf: source bytes.
 * len: number of bytes to write.
 *
 * A context that is read-only or in a read traverse is rejected with
 * TDB_ERR_RDONLY.  The range is checked through methods->tdb_oob.  Two
 * write paths are visible: a memcpy into the mapping and pwrite() on the
 * file descriptor.
 * NOTE(review): this is the second function named tdb_write in this file
 * and it uses different method names (tdb_oob) and the TDB_LOG macro;
 * presumably it is older code excluded from compilation by preprocessor
 * lines not shown in this listing - confirm.
 * NOTE(review): the path-selection condition, the remaining pwrite()
 * arguments and the return statements are on lines not shown.
 */
489 static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
490 const void *buf, tdb_len_t len)
496 if (tdb->read_only || tdb->traverse_read) {
497 tdb->ecode = TDB_ERR_RDONLY;
501 if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0)
505 memcpy(off + (char *)tdb->map_ptr, buf, len);
507 ssize_t written = pwrite(tdb->fd, buf, len, off);
/* Short write (neither complete nor an error): log it and try again. */
508 if ((written != (ssize_t)len) && (written != -1)) {
510 tdb->ecode = TDB_ERR_IO;
511 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only "
512 "%d of %d bytes at %d, trying once more\n",
513 (int)written, len, off));
514 written = pwrite(tdb->fd, (const char *)buf+written,
519 /* Ensure ecode is set for log fn. */
520 tdb->ecode = TDB_ERR_IO;
521 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d "
522 "len=%d (%s)\n", off, len, strerror(errno)));
/* Still incomplete after the retry: give up with TDB_ERR_IO. */
524 } else if (written != (ssize_t)len) {
525 tdb->ecode = TDB_ERR_IO;
526 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to "
527 "write %d bytes at %d in two attempts\n",
538 do an unlocked scan of the hash table heads to find the next non-zero head. The value
539 will then be confirmed with the lock held
/*
 * Advance a hash chain index to the next chain whose head is non-zero.
 *
 * tdb:   database context; header.hash_size bounds the scan.
 * chain: in/out chain index (its use is on lines not shown here).
 *
 * Two scanning loops are visible: one reads each hash head as a uint32_t
 * directly from the mapping, the other reads it with tdb_ofs_read() and
 * also stops when that read fails.
 * NOTE(review): the condition that selects between the loops, the loop
 * exits and the update of *chain are on lines not shown in this listing;
 * presumably the direct loop is used only when map_ptr is set.
 */
541 static void tdb_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
545 for (;h < tdb->header.hash_size;h++) {
546 if (0 != *(uint32_t *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) {
552 for (;h < tdb->header.hash_size;h++) {
553 if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) {
562 /* expand the database by expanding the underlying file and doing the
563 mmap again if necessary */
/*
 * Grow the database and add the new space to the free list.
 *
 * tdb: database context.
 *
 * Visible sequence: take the write lock with tdb_lock(tdb, -1, F_WRLCK),
 * call tdb_oob() to pick up size changes, compute the growth, expand the
 * file through methods->tdb_expand_file (skipped for TDB_INTERNAL), bump
 * map_size, realloc the buffer for TDB_INTERNAL, build a zeroed free
 * record covering the new space, link it in with tdb_free(), and unlock.
 * NOTE(review): the declaration of size, the unmap and remap calls, the
 * error labels and all return statements are on lines not shown in this
 * listing.
 * NOTE(review): the growth computation multiplies map_size by 1.25, which
 * is evaluated in floating point before being stored in new_size.
 */
564 int tdb_expand(struct tdb_context *tdb)
566 struct tdb_record rec;
567 tdb_off_t offset, new_size;
569 /* We have to lock every hash bucket and every free list. */
573 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
574 TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
578 /* must know about any previous expansions by another process */
/* The result of this call is not examined on the line shown. */
579 tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
581 /* always make room for at least 100 more records, and at
582 least 25% more space. Round the database up to a multiple
584 new_size = MAX(tdb->map_size + size*100, tdb->map_size * 1.25);
585 size = TDB_ALIGN(new_size, tdb->page_size) - tdb->map_size;
587 if (!(tdb->flags & TDB_INTERNAL))
591 * We must ensure the file is unmapped before doing this
592 * to ensure consistency with systems like OpenBSD where
593 * writes and mmaps are not consistent.
596 /* expand the file itself */
597 if (!(tdb->flags & TDB_INTERNAL)) {
598 if (tdb->methods->tdb_expand_file(tdb, tdb->map_size, size) != 0)
602 tdb->map_size += size;
/* TDB_INTERNAL: grow the buffer with realloc via a temporary pointer. */
604 if (tdb->flags & TDB_INTERNAL) {
605 char *new_map_ptr = (char *)realloc(tdb->map_ptr,
/* Undo the size bump; presumably this is the realloc-failure path. */
608 tdb->map_size -= size;
611 tdb->map_ptr = new_map_ptr;
614 * We must ensure the file is remapped before adding the space
615 * to ensure consistency with systems like OpenBSD where
616 * writes and mmaps are not consistent.
619 /* We're ok if the mmap fails as we'll fallback to read/write */
623 /* form a new freelist record */
624 memset(&rec,'\0',sizeof(rec));
/* The record's payload is the new space minus its own header. */
625 rec.rec_len = size - sizeof(rec);
627 /* link it into the free list */
/* The new record starts where the old end of the database was. */
628 offset = tdb->map_size - size;
629 if (tdb_free(tdb, offset, &rec) == -1)
/* Two unlock calls are visible; presumably success and failure exits. */
632 tdb_unlock(tdb, -1, F_WRLCK);
635 tdb_unlock(tdb, -1, F_WRLCK);
639 /* read/write a tdb_off_t */
/*
 * Read one tdb_off_t from offset into *d through methods->tdb_read.
 *
 * tdb:    database context.
 * offset: byte offset to read from.
 * d:      destination for the value.
 *
 * The last argument is the DOCONV() macro, defined outside this listing;
 * presumably it requests byte-order conversion when needed.
 * Returns whatever methods->tdb_read returns.
 */
640 int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
642 return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
/*
 * Write one tdb_off_t at offset through methods->tdb_write.
 *
 * tdb:    database context.
 * offset: byte offset to write at.
 * d:      pointer to the value; only sizeof(*d) is taken from it on the
 *         line shown.
 *
 * NOTE(review): the variable off passed to CONVERT() is declared on a line
 * not shown in this listing; presumably it is a local copy of *d so that
 * the caller's value is not modified.
 * Returns whatever methods->tdb_write returns.
 */
645 int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
648 return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
652 /* read/write a record */
/*
 * Read a record header at offset into *rec and validate it.
 *
 * tdb:    database context.
 * offset: byte offset of the record.
 * rec:    destination for the header.
 *
 * The header is read through methods->tdb_read.  A header for which
 * TDB_BAD_MAGIC() is true sets TDB_ERR_CORRUPT and is logged.  Otherwise
 * the result is that of a tdb_oob() check on rec->next + sizeof(*rec),
 * i.e. the record's next pointer is verified to lie inside the database.
 * NOTE(review): the failure return statements are on lines not shown in
 * this listing.
 */
653 int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
655 if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
657 if (TDB_BAD_MAGIC(rec)) {
658 /* Ensure ecode is set for log fn. */
659 tdb->ecode = TDB_ERR_CORRUPT;
660 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
663 return tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0);
/*
 * Write a record header at offset through methods->tdb_write.
 *
 * tdb:    database context.
 * offset: byte offset to write at.
 * rec:    header to write; it is copied into a local first, so CONVERT()
 *         operates on the copy rather than on the caller's structure.
 *
 * Returns whatever methods->tdb_write returns.
 */
666 int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
668 struct tdb_record r = *rec;
669 return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
/*
 * Default method table installed by tdb_io_init().
 * NOTE(review): the initialisers are on lines not shown in this listing;
 * presumably they name the static I/O functions defined in this file.
 */
673 static const struct tdb_methods io_methods = {
681 initialise the default methods table
/*
 * Point the context's method table at the file-scope io_methods table.
 *
 * tdb: database context whose methods pointer is set.
 */
683 void tdb_io_init(struct tdb_context *tdb)
685 tdb->methods = &io_methods;