2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
9 Copyright (C) Rusty Russell 2010
11 ** NOTE! The following LGPL license applies to the tdb
12 ** library. This does NOT imply that all of Samba is released
15 This library is free software; you can redistribute it and/or
16 modify it under the terms of the GNU Lesser General Public
17 License as published by the Free Software Foundation; either
18 version 3 of the License, or (at your option) any later version.
20 This library is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 Lesser General Public License for more details.
25 You should have received a copy of the GNU Lesser General Public
26 License along with this library; if not, see <http://www.gnu.org/licenses/>.
30 #include <ccan/likely/likely.h>
32 void tdb_munmap(struct tdb_context *tdb)
34 if (tdb->flags & TDB_INTERNAL)
38 munmap(tdb->map_ptr, tdb->map_size);
43 void tdb_mmap(struct tdb_context *tdb)
45 if (tdb->flags & TDB_INTERNAL)
48 if (tdb->flags & TDB_NOMMAP)
51 tdb->map_ptr = mmap(NULL, tdb->map_size, tdb->mmap_flags,
52 MAP_SHARED, tdb->fd, 0);
55 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
57 if (tdb->map_ptr == MAP_FAILED) {
59 tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
60 "tdb_mmap failed for size %lld (%s)\n",
61 (long long)tdb->map_size, strerror(errno));
65 /* check for an out of bounds access - if it is out of bounds then
66 see if the database has been expanded by someone else and expand
68 note that "len" is the minimum length needed for the db
70 static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
75 /* We can't hold pointers during this: we could unmap! */
76 assert(!tdb->direct_access || tdb_has_expansion_lock(tdb));
78 if (len <= tdb->map_size)
80 if (tdb->flags & TDB_INTERNAL) {
82 /* Ensure ecode is set for log fn. */
83 tdb->ecode = TDB_ERR_IO;
84 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
85 "tdb_oob len %lld beyond internal"
86 " malloc size %lld\n",
88 (long long)tdb->map_size);
93 if (tdb_lock_expand(tdb, F_RDLCK) != 0)
96 ret = fstat(tdb->fd, &st);
98 tdb_unlock_expand(tdb, F_RDLCK);
101 tdb->ecode = TDB_ERR_IO;
105 if (st.st_size < (size_t)len) {
107 /* Ensure ecode is set for log fn. */
108 tdb->ecode = TDB_ERR_IO;
109 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
110 "tdb_oob len %lld beyond eof at %lld\n",
111 (long long)len, (long long)st.st_size);
116 /* Unmap, update size, remap */
119 tdb->map_size = st.st_size;
124 static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
126 if (unlikely(!tdb->map_ptr))
129 /* FIXME: We can do a subset of this! */
130 if (tdb->transaction)
133 if (unlikely(tdb_oob(tdb, off + len, true) == -1))
135 return (char *)tdb->map_ptr + off;
138 /* Either make a copy into pad and return that, or return ptr into mmap. */
139 /* Note: pad has to be a real object, so we can't get here if len
140 * overflows size_t */
141 void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len)
143 if (likely(!(tdb->flags & TDB_CONVERT))) {
144 void *ret = tdb_direct(tdb, off, len);
148 return tdb_read_convert(tdb, off, pad, len) == -1 ? NULL : pad;
151 /* Endian conversion: we only ever deal with 8 byte quantities */
152 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
154 if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
155 uint64_t i, *p = (uint64_t *)buf;
156 for (i = 0; i < size / 8; i++)
157 p[i] = bswap_64(p[i]);
162 /* FIXME: Return the off? */
163 uint64_t tdb_find_nonzero_off(struct tdb_context *tdb,
164 tdb_off_t base, uint64_t start, uint64_t end)
169 /* Zero vs non-zero is the same unconverted: minor optimization. */
170 val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t),
171 (end - start) * sizeof(tdb_off_t), false);
175 for (i = 0; i < (end - start); i++) {
179 tdb_access_release(tdb, val);
183 /* Return first zero offset in num offset array, or num. */
184 uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
190 /* Zero vs non-zero is the same unconverted: minor optimization. */
191 val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false);
195 for (i = 0; i < num; i++) {
199 tdb_access_release(tdb, val);
203 int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
205 char buf[8192] = { 0 };
206 void *p = tdb_direct(tdb, off, len);
212 unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
213 if (tdb->methods->write(tdb, off, buf, todo) == -1)
221 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
225 ret = tdb_get(tdb, off, &pad, sizeof(pad));
232 /* Even on files, we can get partial writes due to signals. */
233 bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off)
237 ret = pwrite(fd, buf, len, off);
244 buf = (char *)buf + ret;
251 /* Even on files, we can get partial reads due to signals. */
252 bool tdb_pread_all(int fd, void *buf, size_t len, tdb_off_t off)
256 ret = pread(fd, buf, len, off);
264 buf = (char *)buf + ret;
271 bool tdb_read_all(int fd, void *buf, size_t len)
275 ret = read(fd, buf, len);
283 buf = (char *)buf + ret;
289 /* write a lump of data at a specified offset */
290 static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
291 const void *buf, tdb_len_t len)
297 if (tdb->read_only) {
298 tdb->ecode = TDB_ERR_RDONLY;
302 if (tdb->methods->oob(tdb, off + len, 0) != 0)
306 memcpy(off + (char *)tdb->map_ptr, buf, len);
308 if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
309 tdb->ecode = TDB_ERR_IO;
310 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
311 "tdb_write failed at %llu len=%llu (%s)\n",
312 off, len, strerror(errno));
319 /* read a lump of data at a specified offset */
320 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
323 if (tdb->methods->oob(tdb, off + len, 0) != 0) {
328 memcpy(buf, off + (char *)tdb->map_ptr, len);
330 if (!tdb_pread_all(tdb->fd, buf, len, off)) {
331 /* Ensure ecode is set for log fn. */
332 tdb->ecode = TDB_ERR_IO;
333 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
334 "tdb_read failed at %lld "
335 "len=%lld (%s) map_size=%lld\n",
336 (long long)off, (long long)len,
338 (long long)tdb->map_size);
345 int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
346 const void *rec, size_t len)
349 if (unlikely((tdb->flags & TDB_CONVERT))) {
350 void *conv = malloc(len);
352 tdb->ecode = TDB_ERR_OOM;
353 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
354 "tdb_write: no memory converting %zu bytes\n",
358 memcpy(conv, rec, len);
359 ret = tdb->methods->write(tdb, off,
360 tdb_convert(tdb, conv, len), len);
363 ret = tdb->methods->write(tdb, off, rec, len);
368 int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
369 void *rec, size_t len)
371 int ret = tdb->methods->read(tdb, off, rec, len);
372 tdb_convert(tdb, rec, len);
376 int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
378 return tdb_write_convert(tdb, off, &val, sizeof(val));
381 static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
382 tdb_len_t len, unsigned int prefix)
386 /* some systems don't like zero length malloc */
387 buf = malloc(prefix + len ? prefix + len : 1);
388 if (unlikely(!buf)) {
389 tdb->ecode = TDB_ERR_OOM;
390 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
391 "tdb_alloc_read malloc failed len=%lld\n",
392 (long long)prefix + len);
393 } else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix, len))) {
400 /* read a lump of data, allocating the space for it */
401 void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
403 return _tdb_alloc_read(tdb, offset, len, 0);
406 static int fill(struct tdb_context *tdb,
407 const void *buf, size_t size,
408 tdb_off_t off, tdb_len_t len)
411 size_t n = len > size ? size : len;
413 if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
414 tdb->ecode = TDB_ERR_IO;
415 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
416 "fill write failed: giving up!\n");
425 /* Expand a file. We prefer to use ftruncate, as that is what POSIX
426 says to use for mmap expansion */
427 static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
431 if (tdb->read_only) {
432 tdb->ecode = TDB_ERR_RDONLY;
436 if (tdb->flags & TDB_INTERNAL) {
437 char *new = realloc(tdb->map_ptr, tdb->map_size + addition);
439 tdb->ecode = TDB_ERR_OOM;
443 tdb->map_size += addition;
445 /* Unmap before trying to write; old TDB claimed OpenBSD had
446 * problem with this otherwise. */
449 /* If this fails, we try to fill anyway. */
450 if (ftruncate(tdb->fd, tdb->map_size + addition))
453 /* now fill the file with something. This ensures that the
454 file isn't sparse, which would be very bad if we ran out of
455 disk. This must be done with write, not via mmap */
456 memset(buf, 0x43, sizeof(buf));
457 if (fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1)
459 tdb->map_size += addition;
465 /* This is only needed for tdb_access_commit, but used everywhere to simplify. */
466 struct tdb_access_hdr {
472 const void *tdb_access_read(struct tdb_context *tdb,
473 tdb_off_t off, tdb_len_t len, bool convert)
475 const void *ret = NULL;
477 if (likely(!(tdb->flags & TDB_CONVERT)))
478 ret = tdb_direct(tdb, off, len);
481 struct tdb_access_hdr *hdr;
482 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
486 tdb_convert(tdb, (void *)ret, len);
489 tdb->direct_access++;
494 void *tdb_access_write(struct tdb_context *tdb,
495 tdb_off_t off, tdb_len_t len, bool convert)
499 if (likely(!(tdb->flags & TDB_CONVERT)))
500 ret = tdb_direct(tdb, off, len);
503 struct tdb_access_hdr *hdr;
504 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
508 hdr->convert = convert;
511 tdb_convert(tdb, (void *)ret, len);
514 tdb->direct_access++;
519 void tdb_access_release(struct tdb_context *tdb, const void *p)
522 || (char *)p < (char *)tdb->map_ptr
523 || (char *)p >= (char *)tdb->map_ptr + tdb->map_size)
524 free((struct tdb_access_hdr *)p - 1);
526 tdb->direct_access--;
529 int tdb_access_commit(struct tdb_context *tdb, void *p)
534 || (char *)p < (char *)tdb->map_ptr
535 || (char *)p >= (char *)tdb->map_ptr + tdb->map_size) {
536 struct tdb_access_hdr *hdr;
538 hdr = (struct tdb_access_hdr *)p - 1;
540 ret = tdb_write_convert(tdb, hdr->off, p, hdr->len);
542 ret = tdb_write(tdb, hdr->off, p, hdr->len);
545 tdb->direct_access--;
551 /* write a lump of data at a specified offset */
552 static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
553 const void *buf, tdb_len_t len)
559 if (tdb->read_only || tdb->traverse_read) {
560 tdb->ecode = TDB_ERR_RDONLY;
564 if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0)
568 memcpy(off + (char *)tdb->map_ptr, buf, len);
570 ssize_t written = pwrite(tdb->fd, buf, len, off);
571 if ((written != (ssize_t)len) && (written != -1)) {
573 tdb->ecode = TDB_ERR_IO;
574 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only "
575 "%d of %d bytes at %d, trying once more\n",
576 (int)written, len, off));
577 written = pwrite(tdb->fd, (const char *)buf+written,
582 /* Ensure ecode is set for log fn. */
583 tdb->ecode = TDB_ERR_IO;
584 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d "
585 "len=%d (%s)\n", off, len, strerror(errno)));
587 } else if (written != (ssize_t)len) {
588 tdb->ecode = TDB_ERR_IO;
589 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to "
590 "write %d bytes at %d in two attempts\n",
601 do an unlocked scan of the hash table heads to find the next non-zero head. The value
602 will then be confirmed with the lock held
604 static void tdb_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
608 for (;h < tdb->header.hash_size;h++) {
609 if (0 != *(uint32_t *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) {
615 for (;h < tdb->header.hash_size;h++) {
616 if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) {
624 /* read/write a tdb_off_t */
625 int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
627 return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
630 int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
633 return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
637 /* read/write a record */
638 int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
640 if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
642 if (TDB_BAD_MAGIC(rec)) {
643 /* Ensure ecode is set for log fn. */
644 tdb->ecode = TDB_ERR_CORRUPT;
645 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
648 return tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0);
651 int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
653 struct tdb_record r = *rec;
654 return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
658 static const struct tdb_methods io_methods = {
666 initialise the default methods table
668 void tdb_io_init(struct tdb_context *tdb)
670 tdb->methods = &io_methods;