tdb2: add missing prototype, move tdb_firstkey/tdb_nextkey to traverse.c
[ccan] / ccan / tdb2 / io.c
1  /* 
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              1999-2005
7    Copyright (C) Paul `Rusty' Russell              2000
8    Copyright (C) Jeremy Allison                    2000-2003
9    Copyright (C) Rusty Russell                     2010
10
11      ** NOTE! The following LGPL license applies to the tdb
12      ** library. This does NOT imply that all of Samba is released
13      ** under the LGPL
14
15    This library is free software; you can redistribute it and/or
16    modify it under the terms of the GNU Lesser General Public
17    License as published by the Free Software Foundation; either
18    version 3 of the License, or (at your option) any later version.
19
20    This library is distributed in the hope that it will be useful,
21    but WITHOUT ANY WARRANTY; without even the implied warranty of
22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23    Lesser General Public License for more details.
24
25    You should have received a copy of the GNU Lesser General Public
26    License along with this library; if not, see <http://www.gnu.org/licenses/>.
27 */
28 #include "private.h"
29 #include <assert.h>
30 #include <ccan/likely/likely.h>
31
32 void tdb_munmap(struct tdb_context *tdb)
33 {
34         if (tdb->flags & TDB_INTERNAL)
35                 return;
36
37         if (tdb->map_ptr) {
38                 munmap(tdb->map_ptr, tdb->map_size);
39                 tdb->map_ptr = NULL;
40         }
41 }
42
43 void tdb_mmap(struct tdb_context *tdb)
44 {
45         if (tdb->flags & TDB_INTERNAL)
46                 return;
47
48         if (tdb->flags & TDB_NOMMAP)
49                 return;
50
51         tdb->map_ptr = mmap(NULL, tdb->map_size, tdb->mmap_flags,
52                             MAP_SHARED, tdb->fd, 0);
53
54         /*
55          * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
56          */
57         if (tdb->map_ptr == MAP_FAILED) {
58                 tdb->map_ptr = NULL;
59                 tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
60                          "tdb_mmap failed for size %lld (%s)\n", 
61                          (long long)tdb->map_size, strerror(errno));
62         }
63 }
64
65 /* check for an out of bounds access - if it is out of bounds then
66    see if the database has been expanded by someone else and expand
67    if necessary 
68    note that "len" is the minimum length needed for the db
69 */
70 static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
71 {
72         struct stat st;
73         int ret;
74
75         /* We can't hold pointers during this: we could unmap! */
76         assert(!tdb->direct_access || tdb_has_expansion_lock(tdb));
77
78         if (len <= tdb->map_size)
79                 return 0;
80         if (tdb->flags & TDB_INTERNAL) {
81                 if (!probe) {
82                         /* Ensure ecode is set for log fn. */
83                         tdb->ecode = TDB_ERR_IO;
84                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
85                                  "tdb_oob len %lld beyond internal"
86                                  " malloc size %lld\n",
87                                  (long long)len,
88                                  (long long)tdb->map_size);
89                 }
90                 return -1;
91         }
92
93         if (tdb_lock_expand(tdb, F_RDLCK) != 0)
94                 return -1;
95
96         ret = fstat(tdb->fd, &st);
97
98         tdb_unlock_expand(tdb, F_RDLCK);
99
100         if (ret == -1) {
101                 tdb->ecode = TDB_ERR_IO;
102                 return -1;
103         }
104
105         if (st.st_size < (size_t)len) {
106                 if (!probe) {
107                         /* Ensure ecode is set for log fn. */
108                         tdb->ecode = TDB_ERR_IO;
109                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
110                                  "tdb_oob len %lld beyond eof at %lld\n",
111                                  (long long)len, (long long)st.st_size);
112                 }
113                 return -1;
114         }
115
116         /* Unmap, update size, remap */
117         tdb_munmap(tdb);
118
119         tdb->map_size = st.st_size;
120         tdb_mmap(tdb);
121         return 0;
122 }
123
124 static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
125 {
126         if (unlikely(!tdb->map_ptr))
127                 return NULL;
128
129         /* FIXME: We can do a subset of this! */
130         if (tdb->transaction)
131                 return NULL;
132
133         if (unlikely(tdb_oob(tdb, off + len, true) == -1))
134                 return NULL;
135         return (char *)tdb->map_ptr + off;
136 }
137
138 /* Either make a copy into pad and return that, or return ptr into mmap. */
139 /* Note: pad has to be a real object, so we can't get here if len
140  * overflows size_t */
141 void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len)
142 {
143         if (likely(!(tdb->flags & TDB_CONVERT))) {
144                 void *ret = tdb_direct(tdb, off, len);
145                 if (ret)
146                         return ret;
147         }
148         return tdb_read_convert(tdb, off, pad, len) == -1 ? NULL : pad;
149 }
150
151 /* Endian conversion: we only ever deal with 8 byte quantities */
152 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
153 {
154         if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
155                 uint64_t i, *p = (uint64_t *)buf;
156                 for (i = 0; i < size / 8; i++)
157                         p[i] = bswap_64(p[i]);
158         }
159         return buf;
160 }
161
162 /* FIXME: Return the off? */
163 uint64_t tdb_find_nonzero_off(struct tdb_context *tdb,
164                               tdb_off_t base, uint64_t start, uint64_t end)
165 {
166         uint64_t i;
167         const uint64_t *val;
168
169         /* Zero vs non-zero is the same unconverted: minor optimization. */
170         val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t),
171                               (end - start) * sizeof(tdb_off_t), false);
172         if (!val)
173                 return end;
174
175         for (i = 0; i < (end - start); i++) {
176                 if (val[i])
177                         break;
178         }
179         tdb_access_release(tdb, val);
180         return start + i;
181 }
182
183 /* Return first zero offset in num offset array, or num. */
184 uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
185                            uint64_t num)
186 {
187         uint64_t i;
188         const uint64_t *val;
189
190         /* Zero vs non-zero is the same unconverted: minor optimization. */
191         val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false);
192         if (!val)
193                 return num;
194
195         for (i = 0; i < num; i++) {
196                 if (!val[i])
197                         break;
198         }
199         tdb_access_release(tdb, val);
200         return i;
201 }
202
203 int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
204 {
205         char buf[8192] = { 0 };
206         void *p = tdb_direct(tdb, off, len);
207         if (p) {
208                 memset(p, 0, len);
209                 return 0;
210         }
211         while (len) {
212                 unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
213                 if (tdb->methods->write(tdb, off, buf, todo) == -1)
214                         return -1;
215                 len -= todo;
216                 off += todo;
217         }
218         return 0;
219 }
220
221 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
222 {
223         tdb_off_t pad, *ret;
224
225         ret = tdb_get(tdb, off, &pad, sizeof(pad));
226         if (!ret) {
227                 return TDB_OFF_ERR;
228         }
229         return *ret;
230 }
231
232 /* Even on files, we can get partial writes due to signals. */
233 bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off)
234 {
235         while (len) {
236                 ssize_t ret;
237                 ret = pwrite(fd, buf, len, off);
238                 if (ret < 0)
239                         return false;
240                 if (ret == 0) {
241                         errno = ENOSPC;
242                         return false;
243                 }
244                 buf = (char *)buf + ret;
245                 off += ret;
246                 len -= ret;
247         }
248         return true;
249 }
250
251 /* Even on files, we can get partial reads due to signals. */
252 bool tdb_pread_all(int fd, void *buf, size_t len, tdb_off_t off)
253 {
254         while (len) {
255                 ssize_t ret;
256                 ret = pread(fd, buf, len, off);
257                 if (ret < 0)
258                         return false;
259                 if (ret == 0) {
260                         /* ETOOSHORT? */
261                         errno = EWOULDBLOCK;
262                         return false;
263                 }
264                 buf = (char *)buf + ret;
265                 off += ret;
266                 len -= ret;
267         }
268         return true;
269 }
270
/* Read exactly len bytes from fd's current position into buf, continuing
 * after short reads and retrying when a signal interrupts the call before
 * anything was read (EINTR).  Returns true once everything is read; on
 * failure returns false with errno set (EWOULDBLOCK on premature EOF). */
bool tdb_read_all(int fd, void *buf, size_t len)
{
	while (len) {
		ssize_t ret;
		ret = read(fd, buf, len);
		if (ret < 0) {
			/* Interrupted before any data moved: just retry. */
			if (errno == EINTR)
				continue;
			return false;
		}
		if (ret == 0) {
			/* ETOOSHORT? */
			errno = EWOULDBLOCK;
			return false;
		}
		buf = (char *)buf + ret;
		len -= ret;
	}
	return true;
}
288
289 /* write a lump of data at a specified offset */
290 static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
291                      const void *buf, tdb_len_t len)
292 {
293         if (len == 0) {
294                 return 0;
295         }
296
297         if (tdb->read_only) {
298                 tdb->ecode = TDB_ERR_RDONLY;
299                 return -1;
300         }
301
302         if (tdb->methods->oob(tdb, off + len, 0) != 0)
303                 return -1;
304
305         if (tdb->map_ptr) {
306                 memcpy(off + (char *)tdb->map_ptr, buf, len);
307         } else {
308                 if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
309                         tdb->ecode = TDB_ERR_IO;
310                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
311                                  "tdb_write failed at %llu len=%llu (%s)\n",
312                                  off, len, strerror(errno));
313                         return -1;
314                 }
315         }
316         return 0;
317 }
318
319 /* read a lump of data at a specified offset */
320 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
321                     tdb_len_t len)
322 {
323         if (tdb->methods->oob(tdb, off + len, 0) != 0) {
324                 return -1;
325         }
326
327         if (tdb->map_ptr) {
328                 memcpy(buf, off + (char *)tdb->map_ptr, len);
329         } else {
330                 if (!tdb_pread_all(tdb->fd, buf, len, off)) {
331                         /* Ensure ecode is set for log fn. */
332                         tdb->ecode = TDB_ERR_IO;
333                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
334                                  "tdb_read failed at %lld "
335                                  "len=%lld (%s) map_size=%lld\n",
336                                  (long long)off, (long long)len,
337                                  strerror(errno),
338                                  (long long)tdb->map_size);
339                         return -1;
340                 }
341         }
342         return 0;
343 }
344
345 int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
346                       const void *rec, size_t len)
347 {
348         int ret;
349         if (unlikely((tdb->flags & TDB_CONVERT))) {
350                 void *conv = malloc(len);
351                 if (!conv) {
352                         tdb->ecode = TDB_ERR_OOM;
353                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
354                                  "tdb_write: no memory converting %zu bytes\n",
355                                  len);
356                         return -1;
357                 }
358                 memcpy(conv, rec, len);
359                 ret = tdb->methods->write(tdb, off,
360                                           tdb_convert(tdb, conv, len), len);
361                 free(conv);
362         } else
363                 ret = tdb->methods->write(tdb, off, rec, len);
364
365         return ret;
366 }
367
368 int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
369                       void *rec, size_t len)
370 {
371         int ret = tdb->methods->read(tdb, off, rec, len);
372         tdb_convert(tdb, rec, len);
373         return ret;
374 }
375
376 int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
377 {
378         return tdb_write_convert(tdb, off, &val, sizeof(val));
379 }
380
381 static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
382                              tdb_len_t len, unsigned int prefix)
383 {
384         void *buf;
385
386         /* some systems don't like zero length malloc */
387         buf = malloc(prefix + len ? prefix + len : 1);
388         if (unlikely(!buf)) {
389                 tdb->ecode = TDB_ERR_OOM;
390                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
391                          "tdb_alloc_read malloc failed len=%lld\n",
392                          (long long)prefix + len);
393         } else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix, len))) {
394                 free(buf);
395                 buf = NULL;
396         }
397         return buf;
398 }
399
400 /* read a lump of data, allocating the space for it */
401 void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
402 {
403         return _tdb_alloc_read(tdb, offset, len, 0);
404 }
405
406 static int fill(struct tdb_context *tdb,
407                 const void *buf, size_t size,
408                 tdb_off_t off, tdb_len_t len)
409 {
410         while (len) {
411                 size_t n = len > size ? size : len;
412
413                 if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
414                         tdb->ecode = TDB_ERR_IO;
415                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
416                                  "fill write failed: giving up!\n");
417                         return -1;
418                 }
419                 len -= n;
420                 off += n;
421         }
422         return 0;
423 }
424
425 /* expand a file.  we prefer to use ftruncate, as that is what posix
426   says to use for mmap expansion */
427 static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
428 {
429         char buf[8192];
430
431         if (tdb->read_only) {
432                 tdb->ecode = TDB_ERR_RDONLY;
433                 return -1;
434         }
435
436         if (tdb->flags & TDB_INTERNAL) {
437                 char *new = realloc(tdb->map_ptr, tdb->map_size + addition);
438                 if (!new) {
439                         tdb->ecode = TDB_ERR_OOM;
440                         return -1;
441                 }
442                 tdb->map_ptr = new;
443                 tdb->map_size += addition;
444         } else {
445                 /* Unmap before trying to write; old TDB claimed OpenBSD had
446                  * problem with this otherwise. */
447                 tdb_munmap(tdb);
448
449                 /* If this fails, we try to fill anyway. */
450                 if (ftruncate(tdb->fd, tdb->map_size + addition))
451                         ;
452
453                 /* now fill the file with something. This ensures that the
454                    file isn't sparse, which would be very bad if we ran out of
455                    disk. This must be done with write, not via mmap */
456                 memset(buf, 0x43, sizeof(buf));
457                 if (fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1)
458                         return -1;
459                 tdb->map_size += addition;
460                 tdb_mmap(tdb);
461         }
462         return 0;
463 }
464
465 /* This is only neded for tdb_access_commit, but used everywhere to simplify. */
466 struct tdb_access_hdr {
467         tdb_off_t off;
468         tdb_len_t len;
469         bool convert;
470 };
471
472 const void *tdb_access_read(struct tdb_context *tdb,
473                             tdb_off_t off, tdb_len_t len, bool convert)
474 {
475         const void *ret = NULL; 
476
477         if (likely(!(tdb->flags & TDB_CONVERT)))
478                 ret = tdb_direct(tdb, off, len);
479
480         if (!ret) {
481                 struct tdb_access_hdr *hdr;
482                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
483                 if (hdr) {
484                         ret = hdr + 1;
485                         if (convert)
486                                 tdb_convert(tdb, (void *)ret, len);
487                 }
488         } else
489                 tdb->direct_access++;
490
491         return ret;
492 }
493
494 void *tdb_access_write(struct tdb_context *tdb,
495                        tdb_off_t off, tdb_len_t len, bool convert)
496 {
497         void *ret = NULL;
498
499         if (likely(!(tdb->flags & TDB_CONVERT)))
500                 ret = tdb_direct(tdb, off, len);
501
502         if (!ret) {
503                 struct tdb_access_hdr *hdr;
504                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
505                 if (hdr) {
506                         hdr->off = off;
507                         hdr->len = len;
508                         hdr->convert = convert;
509                         ret = hdr + 1;
510                         if (convert)
511                                 tdb_convert(tdb, (void *)ret, len);
512                 }
513         } else
514                 tdb->direct_access++;
515
516         return ret;
517 }
518
519 void tdb_access_release(struct tdb_context *tdb, const void *p)
520 {
521         if (!tdb->map_ptr
522             || (char *)p < (char *)tdb->map_ptr
523             || (char *)p >= (char *)tdb->map_ptr + tdb->map_size)
524                 free((struct tdb_access_hdr *)p - 1);
525         else
526                 tdb->direct_access--;
527 }
528
529 int tdb_access_commit(struct tdb_context *tdb, void *p)
530 {
531         int ret = 0;
532
533         if (!tdb->map_ptr
534             || (char *)p < (char *)tdb->map_ptr
535             || (char *)p >= (char *)tdb->map_ptr + tdb->map_size) {
536                 struct tdb_access_hdr *hdr;
537
538                 hdr = (struct tdb_access_hdr *)p - 1;
539                 if (hdr->convert)
540                         ret = tdb_write_convert(tdb, hdr->off, p, hdr->len);
541                 else
542                         ret = tdb_write(tdb, hdr->off, p, hdr->len);
543                 free(hdr);
544         } else
545                 tdb->direct_access--;
546
547         return ret;
548 }
549
/*
 * The "#if 0" region that used to sit here has been removed.  It held a
 * second tdb_write plus tdb_next_hash_chain, tdb_ofs_read/tdb_ofs_write
 * and tdb_rec_read/tdb_rec_write, none of which was ever compiled.  Those
 * helpers called tdb_read/tdb_write/tdb_oob method members and a TDB_LOG
 * macro, whereas the live code in this file uses read/write/oob and
 * tdb->log.
 */
657
658 static const struct tdb_methods io_methods = {
659         tdb_read,
660         tdb_write,
661         tdb_oob,
662         tdb_expand_file,
663 };
664
665 /*
666   initialise the default methods table
667 */
668 void tdb_io_init(struct tdb_context *tdb)
669 {
670         tdb->methods = &io_methods;
671 }