2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
9 Copyright (C) Rusty Russell 2010
11 ** NOTE! The following LGPL license applies to the tdb
12 ** library. This does NOT imply that all of Samba is released
15 This library is free software; you can redistribute it and/or
16 modify it under the terms of the GNU Lesser General Public
17 License as published by the Free Software Foundation; either
18 version 3 of the License, or (at your option) any later version.
20 This library is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 Lesser General Public License for more details.
25 You should have received a copy of the GNU Lesser General Public
26 License along with this library; if not, see <http://www.gnu.org/licenses/>.
29 #include <ccan/likely/likely.h>
/*
 * tdb_munmap: drop the memory mapping held by the context.
 *
 * The lines visible here test the TDB_INTERNAL flag and call munmap() on
 * map_ptr for map_size bytes.  The result of munmap() is not used on the
 * visible line.
 *
 * NOTE(review): several lines of this body are missing from this listing
 * (the statement guarded by the TDB_INTERNAL test, and anything that follows
 * the munmap() call), so the full behaviour cannot be confirmed from here.
 */
31 void tdb_munmap(struct tdb_context *tdb)
33 if (tdb->flags & TDB_INTERNAL)
37 munmap(tdb->map_ptr, tdb->map_size);
/*
 * tdb_mmap: map the database file into memory.
 *
 * The file descriptor is mapped MAP_SHARED from offset 0 for map_size bytes.
 * PROT_READ is always requested; PROT_WRITE is added only when the context
 * is not read_only.  mmap() signals failure with MAP_FAILED (not NULL), and
 * that case is reported through tdb->log at TDB_DEBUG_WARNING level together
 * with the size and strerror(errno).
 *
 * NOTE(review): TDB_INTERNAL and TDB_NOMMAP are both tested before the
 * mmap() call, but the statements they guard are missing from this listing
 * (presumably early returns -- confirm).  What map_ptr is set to after a
 * failed mmap is likewise not visible.
 */
42 void tdb_mmap(struct tdb_context *tdb)
44 if (tdb->flags & TDB_INTERNAL)
47 if (tdb->flags & TDB_NOMMAP)
50 tdb->map_ptr = mmap(NULL, tdb->map_size,
51 PROT_READ|(tdb->read_only? 0:PROT_WRITE),
52 MAP_SHARED, tdb->fd, 0);
55 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
57 if (tdb->map_ptr == MAP_FAILED) {
59 tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
60 "tdb_mmap failed for size %lld (%s)\n",
61 (long long)tdb->map_size, strerror(errno));
65 /* check for an out of bounds access - if it is out of bounds then
66 see if the database has been expanded by someone else and, if so, expand
68 our mapping to match. Note that "len" is the minimum length needed for the db */
/*
 * tdb_oob: check that the database is at least `len` bytes long.
 *
 * What the visible lines do, in order:
 *  - compare `len` with the current map_size;
 *  - for a TDB_INTERNAL database, set ecode to TDB_ERR_IO and log at
 *    TDB_DEBUG_FATAL that `len` is beyond the internal malloc size;
 *  - otherwise fstat() the file; an fstat() failure sets TDB_ERR_IO;
 *  - if the file is shorter than `len`, set TDB_ERR_IO and log the
 *    "beyond eof" message;
 *  - otherwise record the file size from fstat() as the new map_size.
 *
 * Callers in this file compare the result with -1 or with 0.
 *
 * NOTE(review): the return statements, every use of `probe`, and the
 * unmap/remap calls announced by the "Unmap, update size, remap" comment are
 * missing from this listing.
 * NOTE(review): `st.st_size < (size_t)len` narrows `len` to size_t before
 * comparing; if tdb_off_t is wider than size_t on some platform the cast
 * would truncate -- confirm.
 */
70 static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
73 if (len <= tdb->map_size)
75 if (tdb->flags & TDB_INTERNAL) {
77 /* Ensure ecode is set for log fn. */
78 tdb->ecode = TDB_ERR_IO;
79 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
80 "tdb_oob len %lld beyond internal"
81 " malloc size %lld\n",
83 (long long)tdb->map_size);
88 if (fstat(tdb->fd, &st) == -1) {
89 tdb->ecode = TDB_ERR_IO;
93 if (st.st_size < (size_t)len) {
95 /* Ensure ecode is set for log fn. */
96 tdb->ecode = TDB_ERR_IO;
97 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
98 "tdb_oob len %lld beyond eof at %lld\n",
99 (long long)len, (long long)st.st_size);
104 /* Unmap, update size, remap */
106 tdb->map_size = st.st_size;
/*
 * tdb_direct: return a pointer straight into the memory mapping for the
 * range starting at `off`, when that is possible.
 *
 * Three conditions are tested before the pointer is produced: map_ptr is
 * NULL, a transaction is active, and tdb_oob() reports -1 for `off + len`
 * (called with probe = true).  On the final visible line the result is
 * map_ptr advanced by `off` bytes.
 *
 * NOTE(review): the statements guarded by the three tests are missing from
 * this listing; callers in this file treat a NULL result as "not directly
 * accessible", so they are presumably `return NULL` -- confirm.
 * NOTE(review): `off + len` is computed without an overflow check; whether
 * wrap-around is possible depends on callers not shown here.
 */
111 static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
113 if (unlikely(!tdb->map_ptr))
116 /* FIXME: We can do a subset of this! */
117 if (tdb->transaction)
120 if (unlikely(tdb_oob(tdb, off + len, true) == -1))
122 return (char *)tdb->map_ptr + off;
125 /* Either make a copy into pad and return that, or return ptr into mmap. */
126 /* Note: pad has to be a real object, so we can't get here if len
127 * overflows size_t */
128 /* FIXME: Transaction */
/*
 * tdb_get: obtain `len` bytes located at `off`.
 *
 * When TDB_CONVERT is not set, tdb_direct() is tried first.  The remaining
 * visible path bounds-checks `off + len` with tdb_oob() (probe = false),
 * pread()s the bytes into the caller's `pad`, and returns the result of
 * tdb_convert() on `pad`.  A pread() that does not return exactly `len` sets
 * ecode to TDB_ERR_IO and is logged at TDB_DEBUG_FATAL.
 *
 * NOTE(review): how the tdb_direct() result is used, and what is returned on
 * the error paths, is missing from this listing.
 * NOTE(review): the log format starts with "%llu" but the matching argument
 * is cast to (long long); the message also says "tdb_read" although this is
 * tdb_get.
 * NOTE(review): strerror(errno) is also printed for a short (non-negative)
 * read, in which case pread() has not set errno.
 */
129 void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len)
133 if (likely(!(tdb->flags & TDB_CONVERT))) {
134 void *ret = tdb_direct(tdb, off, len);
139 if (unlikely(tdb_oob(tdb, off + len, false) == -1))
142 r = pread(tdb->fd, pad, len, off);
143 if (r != (ssize_t)len) {
144 /* Ensure ecode is set for log fn. */
145 tdb->ecode = TDB_ERR_IO;
146 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
147 "tdb_read failed at %llu "
148 "len=%lld ret=%lld (%s) map_size=%lld\n",
149 (long long)off, (long long)len,
150 (long long)r, strerror(errno),
151 (long long)tdb->map_size);
154 return tdb_convert(tdb, pad, len);
157 /* Endian conversion: we only ever deal with 8 byte quantities */
/*
 * tdb_convert: byte-swap a buffer in place when the database needs endian
 * conversion.
 *
 * If TDB_CONVERT is set in tdb->flags, `buf` is treated as an array of
 * uint64_t and each of the first size / 8 words is replaced by its
 * bswap_64() value.  Any trailing size % 8 bytes are not touched by the loop.
 * Without TDB_CONVERT the loop is skipped.
 *
 * NOTE(review): the return statement is missing from this listing; callers
 * in this file use the result as the buffer pointer, so it presumably
 * returns `buf` -- confirm.
 * NOTE(review): `buf` is accessed through a uint64_t pointer, which assumes
 * suitable alignment of the caller's buffer -- confirm.
 */
158 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
160 if (unlikely((tdb->flags & TDB_CONVERT))) {
161 uint64_t i, *p = (uint64_t *)buf;
162 for (i = 0; i < size / 8; i++)
163 p[i] = bswap_64(p[i]);
168 /* Return first non-zero offset in num offset array, or num. */
169 /* FIXME: Return the off? */
/*
 * tdb_find_nonzero_off: scan an array of offsets stored at `off`.
 *
 * The array of num * sizeof(tdb_off_t) bytes is first requested through
 * tdb_direct(); if that yields nothing it is read with tdb_alloc_read().
 * A loop then walks indices 0 .. num-1.
 *
 * NOTE(review): the rest of the signature, the loop body, the release of an
 * allocated copy and the return statement are missing from this listing.
 * NOTE(review): `!unlikely(val)` applies the branch hint to `val` being
 * true; assuming unlikely() is the usual __builtin_expect wrapper, the
 * intended spelling is probably `unlikely(!val)`.  The truth value is the
 * same either way.
 */
170 uint64_t tdb_find_nonzero_off(struct tdb_context *tdb, tdb_off_t off,
176 val = tdb_direct(tdb, off, num * sizeof(tdb_off_t));
177 if (!unlikely(val)) {
178 val = tdb_alloc_read(tdb, off, num * sizeof(tdb_off_t));
184 for (i = 0; i < num; i++) {
193 /* Return first zero offset in num offset array, or num. */
/*
 * tdb_find_zero_off: scan an array of offsets stored at `off`.
 *
 * The array of num * sizeof(tdb_off_t) bytes is first requested through
 * tdb_direct(); if that yields nothing it is read with tdb_alloc_read().
 * A loop then walks indices 0 .. num-1.
 *
 * NOTE(review): the rest of the signature, the loop body, the release of an
 * allocated copy and the return statement are missing from this listing.
 * NOTE(review): `!unlikely(val)` applies the branch hint to `val` being
 * true; assuming unlikely() is the usual __builtin_expect wrapper, the
 * intended spelling is probably `unlikely(!val)`.  The truth value is the
 * same either way.
 */
194 uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
200 val = tdb_direct(tdb, off, num * sizeof(tdb_off_t));
201 if (!unlikely(val)) {
202 val = tdb_alloc_read(tdb, off, num * sizeof(tdb_off_t));
208 for (i = 0; i < num; i++) {
/*
 * fill: write the contents of `buf` (of `size` bytes) to the file.
 *
 * Parameters, in declaration order: the context, the source buffer, the
 * buffer size, the file offset `off`, and the number of bytes `len`.
 * Each write is limited to min(len, size) bytes and goes through
 * tdb_pwrite_all() at `off`.  A failed write sets ecode to TDB_ERR_IO and
 * logs "fill write failed" at TDB_DEBUG_FATAL.
 *
 * NOTE(review): the surrounding loop, the updates of `off`/`len` and the
 * return statements are missing from this listing.
 */
217 static int fill(struct tdb_context *tdb,
218 const void *buf, size_t size,
219 tdb_off_t off, tdb_len_t len)
222 size_t n = len > size ? size : len;
224 if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
225 tdb->ecode = TDB_ERR_IO;
226 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
227 "fill write failed: giving up!\n");
/*
 * zero_out: clear `len` bytes of the database starting at `off`.
 *
 * A direct pointer is requested with tdb_direct(); the visible fallback
 * builds an 8192-byte zero buffer on the stack and hands it to fill().
 *
 * NOTE(review): fill() is declared as (tdb, buf, size, off, len), but the
 * call here passes `len` in the `off` position and `off` in the `len`
 * position.  That looks like swapped arguments -- confirm and correct.
 * NOTE(review): the branch that uses the direct pointer `p` is missing from
 * this listing.
 */
236 int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
238 void *p = tdb_direct(tdb, off, len);
243 char buf[8192] = { 0 };
244 return fill(tdb, buf, sizeof(buf), len, off);
/*
 * tdb_read_off: fetch the offset value stored at `off`, using tdb_get() with
 * the local `pad` as the bounce buffer.
 *
 * NOTE(review): the declarations of `pad` and `ret`, the error check and the
 * return statement are missing from this listing.
 * NOTE(review): `ret` receives the void * result of tdb_get(), so it is
 * presumably a pointer; if so, sizeof(ret) is the size of a pointer rather
 * than of a tdb_off_t.  Confirm whether sizeof(*ret) or sizeof(pad) was
 * intended.
 */
248 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
252 ret = tdb_get(tdb, off, &pad, sizeof(ret));
259 /* Even on files, we can get partial writes due to signals. */
/*
 * tdb_pwrite_all: write `len` bytes from `buf` to `fd` at offset `off` using
 * pwrite().  Callers in this file treat a false result as a write failure.
 *
 * NOTE(review): only the pwrite() call is visible; the handling of partial
 * writes mentioned by the preceding comment and the return statements are
 * missing from this listing.
 */
260 bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off)
264 ret = pwrite(fd, buf, len, off);
278 /* write a lump of data at a specified offset */
/*
 * tdb_write: store `len` bytes from `buf` at database offset `off`.
 *
 * Visible steps: a read_only context sets ecode to TDB_ERR_RDONLY; the range
 * end `off + len` is checked through the methods->oob hook; the data is
 * either memcpy()ed into the mapping at map_ptr + off or written with
 * tdb_pwrite_all(); a failed tdb_pwrite_all() sets TDB_ERR_IO and is logged
 * at TDB_DEBUG_FATAL.
 *
 * NOTE(review): the condition selecting memcpy() versus tdb_pwrite_all() and
 * all return statements are missing from this listing.
 * NOTE(review): the log call passes `off` and `len` uncast for "%llu",
 * whereas the other log calls in this file cast to (long long).  If
 * tdb_off_t / tdb_len_t are not unsigned long long on every platform, the
 * format and arguments do not match -- confirm and add casts.
 */
279 static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
280 const void *buf, tdb_len_t len)
286 if (tdb->read_only) {
287 tdb->ecode = TDB_ERR_RDONLY;
291 if (tdb->methods->oob(tdb, off + len, 0) != 0)
295 memcpy(off + (char *)tdb->map_ptr, buf, len);
297 if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
298 tdb->ecode = TDB_ERR_IO;
299 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
300 "tdb_write failed at %llu len=%llu (%s)\n",
301 off, len, strerror(errno));
308 /* read a lump of data at a specified offset */
/*
 * tdb_read: copy `len` bytes at database offset `off` into `buf`.
 *
 * Visible steps: the range end `off + len` is checked through the
 * methods->oob hook; the data is either memcpy()ed out of the mapping at
 * map_ptr + off or fetched with pread(); a pread() that does not return
 * exactly `len` sets ecode to TDB_ERR_IO and is logged at TDB_DEBUG_FATAL.
 *
 * NOTE(review): the rest of the signature, the condition selecting memcpy()
 * versus pread(), and all return statements are missing from this listing.
 * NOTE(review): strerror(errno) is also printed for a short (non-negative)
 * read, in which case pread() has not set errno.
 */
309 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
312 if (tdb->methods->oob(tdb, off + len, 0) != 0) {
317 memcpy(buf, off + (char *)tdb->map_ptr, len);
319 ssize_t ret = pread(tdb->fd, buf, len, off);
320 if (ret != (ssize_t)len) {
321 /* Ensure ecode is set for log fn. */
322 tdb->ecode = TDB_ERR_IO;
323 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
324 "tdb_read failed at %lld "
325 "len=%lld ret=%lld (%s) map_size=%lld\n",
326 (long long)off, (long long)len,
327 (long long)ret, strerror(errno),
328 (long long)tdb->map_size);
335 int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
336 void *rec, size_t len)
338 return tdb->methods->write(tdb, off, tdb_convert(tdb, rec, len), len);
/*
 * tdb_read_convert: read `len` bytes at `off` into `rec` through the
 * context's read method, then pass the buffer to tdb_convert().
 *
 * NOTE(review): tdb_convert() is called on the line directly after the read,
 * with no test of `ret` in between, so the buffer is converted even when the
 * read reported failure.
 * NOTE(review): the return statement is missing from this listing
 * (presumably `return ret;` -- confirm).
 */
341 int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
342 void *rec, size_t len)
344 int ret = tdb->methods->read(tdb, off, rec, len);
345 tdb_convert(tdb, rec, len);
349 int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
351 return tdb_write_convert(tdb, off, &val, sizeof(val));
354 /* read a lump of data, allocating the space for it */
/*
 * tdb_alloc_read: allocate a buffer and read `len` bytes at `offset` into it.
 *
 * The buffer is malloc()ed with at least one byte, so a zero `len` still
 * yields a real allocation.  An allocation failure sets ecode to TDB_ERR_OOM
 * and logs at TDB_DEBUG_ERROR.  Otherwise the data is read through the
 * context's read method, and a non-zero result from it enters the error
 * branch.
 *
 * NOTE(review): the body of that error branch (presumably freeing the buffer
 * and returning NULL), the final log argument and the return statement are
 * missing from this listing.  Callers in this file test the result against
 * NULL.
 */
355 void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
359 /* some systems don't like zero length malloc */
360 buf = malloc(len ? len : 1);
361 if (unlikely(!buf)) {
362 tdb->ecode = TDB_ERR_OOM;
363 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
364 "tdb_alloc_read malloc failed len=%lld\n",
366 } else if (unlikely(tdb->methods->read(tdb, offset, buf, len))) {
/*
 * hash_record: compute the hash of the key of the used record at `off`.
 *
 * The record header is fetched with tdb_get() (using `pad` as the bounce
 * buffer) and its key length is taken from rec_key_length().  The key bytes
 * start immediately after the header, at off + sizeof(*r).  They are
 * accessed directly via tdb_direct() where possible, in which case
 * tdb_hash() is returned straight away; otherwise they are read with
 * tdb_alloc_read() and then hashed.
 *
 * NOTE(review): the error checks after tdb_get() / tdb_alloc_read(), the
 * release of the allocated key and the final return are missing from this
 * listing.
 */
373 uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off)
375 struct tdb_used_record pad, *r;
379 r = tdb_get(tdb, off, &pad, sizeof(*r));
384 klen = rec_key_length(r);
385 key = tdb_direct(tdb, off + sizeof(*r), klen);
387 return tdb_hash(tdb, key, klen);
389 key = tdb_alloc_read(tdb, off + sizeof(*r), klen);
392 hash = tdb_hash(tdb, key, klen);
397 /* Give a piece of tdb data to a parser */
/*
 * tdb_parse_data: hand `len` bytes of record data at `offset` to the
 * caller-supplied `parser` callback, together with `key`.
 *
 * The data pointer comes from tdb_direct() when available; otherwise the
 * bytes are read with tdb_alloc_read().  The callback is invoked as
 * parser(key, data, private_data) and its result is stored in `result`.
 * The `allocated` flag starts out false and is tested after the callback.
 *
 * NOTE(review): the end of the signature, the assignment of data.dsize, the
 * point where `allocated` is set, the statement guarded by the final test
 * (presumably freeing the copy) and the return are missing from this
 * listing.
 */
398 int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
399 tdb_off_t offset, tdb_len_t len,
400 int (*parser)(TDB_DATA key, TDB_DATA data,
406 bool allocated = false;
409 data.dptr = tdb_direct(tdb, offset, len);
410 if (unlikely(!data.dptr)) {
411 if (!(data.dptr = tdb_alloc_read(tdb, offset, len))) {
416 result = parser(key, data, private_data);
417 if (unlikely(allocated))
422 /* expand a file. we prefer to use ftruncate, as that is what posix
423 says to use for mmap expansion */
/*
 * tdb_expand_file: grow the database file from `size` bytes by `addition`
 * bytes.
 *
 * A read_only context sets ecode to TDB_ERR_RDONLY.  The file is extended
 * with ftruncate() to size + addition; per the comment in the body a failure
 * there is tolerated.  A local buffer is then filled with the byte 0x43 and
 * written out through fill(), so that the new region is actually backed by
 * disk blocks rather than left sparse.
 *
 * NOTE(review): fill() is declared as (tdb, buf, size, off, len), but the
 * call here passes `addition` in the `off` position and `size` in the `len`
 * position.  That looks like swapped arguments -- confirm and correct.
 * NOTE(review): the declaration of `buf`, the statement guarded by the
 * ftruncate() test and the read_only return are missing from this listing.
 */
424 static int tdb_expand_file(struct tdb_context *tdb,
425 tdb_len_t size, tdb_len_t addition)
429 if (tdb->read_only) {
430 tdb->ecode = TDB_ERR_RDONLY;
434 /* If this fails, we try to fill anyway. */
435 if (ftruncate(tdb->fd, size+addition))
438 /* now fill the file with something. This ensures that the
439 file isn't sparse, which would be very bad if we ran out of
440 disk. This must be done with write, not via mmap */
441 memset(buf, 0x43, sizeof(buf));
442 return fill(tdb, buf, sizeof(buf), addition, size);
/*
 * tdb_access_read: get read-only access to `len` bytes at `off`.
 *
 * tdb_direct() is tried first; the visible fallback obtains a copy with
 * tdb_alloc_read().
 *
 * NOTE(review): the condition between the two calls and the return statement
 * are missing from this listing.  The pointer is presumably meant to be
 * handed back through tdb_access_release() -- confirm.
 */
445 const void *tdb_access_read(struct tdb_context *tdb,
446 tdb_off_t off, tdb_len_t len)
448 const void *ret = tdb_direct(tdb, off, len);
451 ret = tdb_alloc_read(tdb, off, len);
/*
 * tdb_access_release: release a pointer `p` obtained for access to the
 * database.
 *
 * The visible part of the condition checks whether `p` lies below map_ptr or
 * at/after map_ptr + map_size, i.e. outside the current mapping.
 *
 * NOTE(review): the first clause of the condition and the statement it
 * guards are missing from this listing (presumably the pointer is freed when
 * it does not point into the mapping -- confirm).
 */
455 void tdb_access_release(struct tdb_context *tdb, const void *p)
458 || (char *)p < (char *)tdb->map_ptr
459 || (char *)p >= (char *)tdb->map_ptr + tdb->map_size)
464 /* write a lump of data at a specified offset */
/*
 * tdb_write: store `len` bytes from `buf` at database offset `off`.
 *
 * Visible steps: a context that is read_only or in traverse_read sets ecode
 * to TDB_ERR_RDONLY; the range end is checked through the methods->tdb_oob
 * hook; the data is either memcpy()ed into the mapping or written with
 * pwrite().  A partial write (neither `len` nor -1) sets TDB_ERR_IO, is
 * logged, and is followed by a second pwrite() starting at buf + written.
 * The two later branches log a plain failure and a failure "in two attempts"
 * respectively, each setting TDB_ERR_IO.
 *
 * NOTE(review): this is the second `static int tdb_write` in this listing;
 * it uses a different method-table layout (tdb_oob) and the TDB_LOG macro,
 * so the listing looks like two versions of the file joined together --
 * confirm which one is current.
 * NOTE(review): `len` and `off` are printed with "%d" without casts; whether
 * that matches tdb_len_t / tdb_off_t cannot be told from here.
 * NOTE(review): the remaining pwrite() arguments, the branch conditions
 * around the error logs and all return statements are missing from this
 * listing.
 */
465 static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
466 const void *buf, tdb_len_t len)
472 if (tdb->read_only || tdb->traverse_read) {
473 tdb->ecode = TDB_ERR_RDONLY;
477 if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0)
481 memcpy(off + (char *)tdb->map_ptr, buf, len);
483 ssize_t written = pwrite(tdb->fd, buf, len, off);
484 if ((written != (ssize_t)len) && (written != -1)) {
486 tdb->ecode = TDB_ERR_IO;
487 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only "
488 "%d of %d bytes at %d, trying once more\n",
489 (int)written, len, off));
490 written = pwrite(tdb->fd, (const char *)buf+written,
495 /* Ensure ecode is set for log fn. */
496 tdb->ecode = TDB_ERR_IO;
497 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d "
498 "len=%d (%s)\n", off, len, strerror(errno)));
500 } else if (written != (ssize_t)len) {
501 tdb->ecode = TDB_ERR_IO;
502 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to "
503 "write %d bytes at %d in two attempts\n",
514 do an unlocked scan of the hash table heads to find the next non-zero head. The value
515 will then be confirmed with the lock held
/*
 * tdb_next_hash_chain: advance to the next hash chain whose head is
 * non-zero.
 *
 * Two loops over h < header.hash_size are visible.  The first reads each
 * head as a uint32_t straight out of the mapping at TDB_HASH_TOP(h).  The
 * second reads it with tdb_ofs_read() and also stops when that read fails.
 *
 * NOTE(review): the initialisation of `h`, the condition that selects
 * between the two loops (presumably whether map_ptr is set), the loop exits
 * and the write-back to *chain are missing from this listing.
 */
517 static void tdb_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
521 for (;h < tdb->header.hash_size;h++) {
522 if (0 != *(uint32_t *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) {
528 for (;h < tdb->header.hash_size;h++) {
529 if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) {
538 /* expand the database by expanding the underlying file and doing the
539 mmap again if necessary */
/*
 * tdb_expand: grow the database and add the new space to the free list.
 *
 * Visible sequence:
 *  1. take the write lock with tdb_lock(tdb, -1, F_WRLCK); a failure is
 *     logged;
 *  2. call the tdb_oob hook for map_size + 1 so that an expansion done by
 *     another process is noticed (its result is not used here);
 *  3. choose the new size as the larger of "room for 100 more records" and
 *     "25% more", rounded with TDB_ALIGN to page_size, and turn it back into
 *     the number of bytes to add;
 *  4. for file-backed databases, extend the file via the tdb_expand_file
 *     hook; for TDB_INTERNAL databases, realloc() the in-memory image
 *     (map_size is decreased again next to the realloc() call, presumably on
 *     failure);
 *  5. build a zeroed free-list record covering the added space minus its own
 *     header, and link it in with tdb_free() at offset map_size - size;
 *  6. release the lock with tdb_unlock() (two call sites, presumably the
 *     success and failure exits).
 *
 * NOTE(review): `size` is used but is neither a parameter on the visible
 * signature line nor declared on a visible line.
 * NOTE(review): `tdb->map_size * 1.25` makes MAX() compare in floating point
 * and converts the result back to tdb_off_t; an integer form such as
 * map_size + map_size / 4 would avoid that -- confirm intent.
 * NOTE(review): the unmap/remap calls, the realloc() size argument, the goto
 * or return statements and the labels are missing from this listing.
 */
540 int tdb_expand(struct tdb_context *tdb)
542 struct tdb_record rec;
543 tdb_off_t offset, new_size;
545 /* We have to lock every hash bucket and every free list. */
549 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
550 TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
554 /* must know about any previous expansions by another process */
555 tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
557 /* always make room for at least 100 more records, and at
558 least 25% more space. Round the database up to a multiple
560 new_size = MAX(tdb->map_size + size*100, tdb->map_size * 1.25);
561 size = TDB_ALIGN(new_size, tdb->page_size) - tdb->map_size;
563 if (!(tdb->flags & TDB_INTERNAL))
567 * We must ensure the file is unmapped before doing this
568 * to ensure consistency with systems like OpenBSD where
569 * writes and mmaps are not consistent.
572 /* expand the file itself */
573 if (!(tdb->flags & TDB_INTERNAL)) {
574 if (tdb->methods->tdb_expand_file(tdb, tdb->map_size, size) != 0)
578 tdb->map_size += size;
580 if (tdb->flags & TDB_INTERNAL) {
581 char *new_map_ptr = (char *)realloc(tdb->map_ptr,
584 tdb->map_size -= size;
587 tdb->map_ptr = new_map_ptr;
590 * We must ensure the file is remapped before adding the space
591 * to ensure consistency with systems like OpenBSD where
592 * writes and mmaps are not consistent.
595 /* We're ok if the mmap fails as we'll fallback to read/write */
599 /* form a new freelist record */
600 memset(&rec,'\0',sizeof(rec));
601 rec.rec_len = size - sizeof(rec);
603 /* link it into the free list */
604 offset = tdb->map_size - size;
605 if (tdb_free(tdb, offset, &rec) == -1)
608 tdb_unlock(tdb, -1, F_WRLCK);
611 tdb_unlock(tdb, -1, F_WRLCK);
615 /* read/write a tdb_off_t */
616 int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
618 return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
/*
 * tdb_ofs_write: write one tdb_off_t at `offset` through the context's
 * tdb_write method, passing CONVERT(off) as the data and sizeof(*d) as the
 * length.
 *
 * NOTE(review): `off` is not declared on a visible line; it is presumably a
 * local copy of *d so that the conversion does not modify the caller's
 * value -- confirm.
 */
621 int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
624 return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
628 /* read/write a record */
/*
 * tdb_rec_read: read the record header at `offset` into *rec and validate
 * it.
 *
 * The header is read through the context's tdb_read method (with DOCONV()
 * as its last argument); a result of -1 is treated as failure.  If
 * TDB_BAD_MAGIC(rec) holds, ecode is set to TDB_ERR_CORRUPT and the bad
 * magic value is logged at TDB_DEBUG_FATAL.  Otherwise the function returns
 * the result of the tdb_oob hook for rec->next + sizeof(*rec), i.e. it also
 * checks that the `next` link lies inside the database.
 *
 * NOTE(review): the return statements of the two error branches are missing
 * from this listing.
 * NOTE(review): `offset` is printed with "%d" without a cast; whether that
 * matches tdb_off_t cannot be told from here.
 */
629 int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
631 if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
633 if (TDB_BAD_MAGIC(rec)) {
634 /* Ensure ecode is set for log fn. */
635 tdb->ecode = TDB_ERR_CORRUPT;
636 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
639 return tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0);
642 int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
644 struct tdb_record r = *rec;
645 return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
/*
 * io_methods: the file-local, constant method table that tdb_io_init()
 * installs into a context.
 *
 * NOTE(review): the initialiser entries are missing from this listing.
 */
649 static const struct tdb_methods io_methods = {
657 initialise the default methods table
659 void tdb_io_init(struct tdb_context *tdb)
661 tdb->methods = &io_methods;