tdb2: clarify locking hierarchy; hashes, freelist, then expand lock.
[ccan] / ccan / tdb2 / io.c
1  /* 
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              1999-2005
7    Copyright (C) Paul `Rusty' Russell              2000
8    Copyright (C) Jeremy Allison                    2000-2003
9    Copyright (C) Rusty Russell                     2010
10
11      ** NOTE! The following LGPL license applies to the tdb
12      ** library. This does NOT imply that all of Samba is released
13      ** under the LGPL
14
15    This library is free software; you can redistribute it and/or
16    modify it under the terms of the GNU Lesser General Public
17    License as published by the Free Software Foundation; either
18    version 3 of the License, or (at your option) any later version.
19
20    This library is distributed in the hope that it will be useful,
21    but WITHOUT ANY WARRANTY; without even the implied warranty of
22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23    Lesser General Public License for more details.
24
25    You should have received a copy of the GNU Lesser General Public
26    License along with this library; if not, see <http://www.gnu.org/licenses/>.
27 */
28 #include "private.h"
29 #include <assert.h>
30 #include <ccan/likely/likely.h>
31
32 void tdb_munmap(struct tdb_context *tdb)
33 {
34         if (tdb->flags & TDB_INTERNAL)
35                 return;
36
37         if (tdb->map_ptr) {
38                 munmap(tdb->map_ptr, tdb->map_size);
39                 tdb->map_ptr = NULL;
40         }
41 }
42
43 void tdb_mmap(struct tdb_context *tdb)
44 {
45         if (tdb->flags & TDB_INTERNAL)
46                 return;
47
48         if (tdb->flags & TDB_NOMMAP)
49                 return;
50
51         tdb->map_ptr = mmap(NULL, tdb->map_size, tdb->mmap_flags,
52                             MAP_SHARED, tdb->fd, 0);
53
54         /*
55          * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
56          */
57         if (tdb->map_ptr == MAP_FAILED) {
58                 tdb->map_ptr = NULL;
59                 tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
60                          "tdb_mmap failed for size %lld (%s)\n", 
61                          (long long)tdb->map_size, strerror(errno));
62         }
63 }
64
65 /* check for an out of bounds access - if it is out of bounds then
66    see if the database has been expanded by someone else and expand
67    if necessary 
68    note that "len" is the minimum length needed for the db
69 */
70 static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
71 {
72         struct stat st;
73         int ret;
74
75         /* We can't hold pointers during this: we could unmap! */
76         assert(!tdb->direct_access
77                || (tdb->flags & TDB_NOLOCK)
78                || tdb_has_expansion_lock(tdb));
79
80         if (len <= tdb->map_size)
81                 return 0;
82         if (tdb->flags & TDB_INTERNAL) {
83                 if (!probe) {
84                         /* Ensure ecode is set for log fn. */
85                         tdb->ecode = TDB_ERR_IO;
86                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
87                                  "tdb_oob len %lld beyond internal"
88                                  " malloc size %lld\n",
89                                  (long long)len,
90                                  (long long)tdb->map_size);
91                 }
92                 return -1;
93         }
94
95         if (tdb_lock_expand(tdb, F_RDLCK) != 0)
96                 return -1;
97
98         ret = fstat(tdb->fd, &st);
99
100         tdb_unlock_expand(tdb, F_RDLCK);
101
102         if (ret == -1) {
103                 tdb->ecode = TDB_ERR_IO;
104                 return -1;
105         }
106
107         if (st.st_size < (size_t)len) {
108                 if (!probe) {
109                         /* Ensure ecode is set for log fn. */
110                         tdb->ecode = TDB_ERR_IO;
111                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
112                                  "tdb_oob len %lld beyond eof at %lld\n",
113                                  (long long)len, (long long)st.st_size);
114                 }
115                 return -1;
116         }
117
118         /* Unmap, update size, remap */
119         tdb_munmap(tdb);
120
121         tdb->map_size = st.st_size;
122         tdb_mmap(tdb);
123         return 0;
124 }
125
126 static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
127 {
128         if (unlikely(!tdb->map_ptr))
129                 return NULL;
130
131         /* FIXME: We can do a subset of this! */
132         if (tdb->transaction)
133                 return NULL;
134
135         if (unlikely(tdb_oob(tdb, off + len, true) == -1))
136                 return NULL;
137         return (char *)tdb->map_ptr + off;
138 }
139
140 /* Either make a copy into pad and return that, or return ptr into mmap. */
141 /* Note: pad has to be a real object, so we can't get here if len
142  * overflows size_t */
143 void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len)
144 {
145         if (likely(!(tdb->flags & TDB_CONVERT))) {
146                 void *ret = tdb_direct(tdb, off, len);
147                 if (ret)
148                         return ret;
149         }
150         return tdb_read_convert(tdb, off, pad, len) == -1 ? NULL : pad;
151 }
152
153 /* Endian conversion: we only ever deal with 8 byte quantities */
154 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
155 {
156         if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
157                 uint64_t i, *p = (uint64_t *)buf;
158                 for (i = 0; i < size / 8; i++)
159                         p[i] = bswap_64(p[i]);
160         }
161         return buf;
162 }
163
164 /* FIXME: Return the off? */
165 uint64_t tdb_find_nonzero_off(struct tdb_context *tdb,
166                               tdb_off_t base, uint64_t start, uint64_t end)
167 {
168         uint64_t i;
169         const uint64_t *val;
170
171         /* Zero vs non-zero is the same unconverted: minor optimization. */
172         val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t),
173                               (end - start) * sizeof(tdb_off_t), false);
174         if (!val)
175                 return end;
176
177         for (i = 0; i < (end - start); i++) {
178                 if (val[i])
179                         break;
180         }
181         tdb_access_release(tdb, val);
182         return start + i;
183 }
184
185 /* Return first zero offset in num offset array, or num. */
186 uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
187                            uint64_t num)
188 {
189         uint64_t i;
190         const uint64_t *val;
191
192         /* Zero vs non-zero is the same unconverted: minor optimization. */
193         val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false);
194         if (!val)
195                 return num;
196
197         for (i = 0; i < num; i++) {
198                 if (!val[i])
199                         break;
200         }
201         tdb_access_release(tdb, val);
202         return i;
203 }
204
205 int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
206 {
207         char buf[8192] = { 0 };
208         void *p = tdb_direct(tdb, off, len);
209         if (p) {
210                 memset(p, 0, len);
211                 return 0;
212         }
213         while (len) {
214                 unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
215                 if (tdb->methods->write(tdb, off, buf, todo) == -1)
216                         return -1;
217                 len -= todo;
218                 off += todo;
219         }
220         return 0;
221 }
222
223 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
224 {
225         tdb_off_t pad, *ret;
226
227         ret = tdb_get(tdb, off, &pad, sizeof(pad));
228         if (!ret) {
229                 return TDB_OFF_ERR;
230         }
231         return *ret;
232 }
233
234 /* Even on files, we can get partial writes due to signals. */
235 bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off)
236 {
237         while (len) {
238                 ssize_t ret;
239                 ret = pwrite(fd, buf, len, off);
240                 if (ret < 0)
241                         return false;
242                 if (ret == 0) {
243                         errno = ENOSPC;
244                         return false;
245                 }
246                 buf = (char *)buf + ret;
247                 off += ret;
248                 len -= ret;
249         }
250         return true;
251 }
252
253 /* Even on files, we can get partial reads due to signals. */
254 bool tdb_pread_all(int fd, void *buf, size_t len, tdb_off_t off)
255 {
256         while (len) {
257                 ssize_t ret;
258                 ret = pread(fd, buf, len, off);
259                 if (ret < 0)
260                         return false;
261                 if (ret == 0) {
262                         /* ETOOSHORT? */
263                         errno = EWOULDBLOCK;
264                         return false;
265                 }
266                 buf = (char *)buf + ret;
267                 off += ret;
268                 len -= ret;
269         }
270         return true;
271 }
272
/*
 * Read exactly "len" bytes from the current position of "fd" into "buf",
 * continuing after short reads.  Returns false if read fails; hitting end
 * of file early is reported as EWOULDBLOCK.
 */
bool tdb_read_all(int fd, void *buf, size_t len)
{
	char *dst = buf;
	size_t remaining = len;

	while (remaining != 0) {
		ssize_t got = read(fd, dst, remaining);

		if (got < 0)
			return false;
		if (got == 0) {
			/* ETOOSHORT? */
			errno = EWOULDBLOCK;
			return false;
		}
		dst += got;
		remaining -= got;
	}
	return true;
}
290
291 /* write a lump of data at a specified offset */
292 static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
293                      const void *buf, tdb_len_t len)
294 {
295         if (len == 0) {
296                 return 0;
297         }
298
299         if (tdb->read_only) {
300                 tdb->ecode = TDB_ERR_RDONLY;
301                 return -1;
302         }
303
304         if (tdb->methods->oob(tdb, off + len, 0) != 0)
305                 return -1;
306
307         if (tdb->map_ptr) {
308                 memcpy(off + (char *)tdb->map_ptr, buf, len);
309         } else {
310                 if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
311                         tdb->ecode = TDB_ERR_IO;
312                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
313                                  "tdb_write failed at %llu len=%llu (%s)\n",
314                                  off, len, strerror(errno));
315                         return -1;
316                 }
317         }
318         return 0;
319 }
320
321 /* read a lump of data at a specified offset */
322 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
323                     tdb_len_t len)
324 {
325         if (tdb->methods->oob(tdb, off + len, 0) != 0) {
326                 return -1;
327         }
328
329         if (tdb->map_ptr) {
330                 memcpy(buf, off + (char *)tdb->map_ptr, len);
331         } else {
332                 if (!tdb_pread_all(tdb->fd, buf, len, off)) {
333                         /* Ensure ecode is set for log fn. */
334                         tdb->ecode = TDB_ERR_IO;
335                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
336                                  "tdb_read failed at %lld "
337                                  "len=%lld (%s) map_size=%lld\n",
338                                  (long long)off, (long long)len,
339                                  strerror(errno),
340                                  (long long)tdb->map_size);
341                         return -1;
342                 }
343         }
344         return 0;
345 }
346
347 int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
348                       const void *rec, size_t len)
349 {
350         int ret;
351         if (unlikely((tdb->flags & TDB_CONVERT))) {
352                 void *conv = malloc(len);
353                 if (!conv) {
354                         tdb->ecode = TDB_ERR_OOM;
355                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
356                                  "tdb_write: no memory converting %zu bytes\n",
357                                  len);
358                         return -1;
359                 }
360                 memcpy(conv, rec, len);
361                 ret = tdb->methods->write(tdb, off,
362                                           tdb_convert(tdb, conv, len), len);
363                 free(conv);
364         } else
365                 ret = tdb->methods->write(tdb, off, rec, len);
366
367         return ret;
368 }
369
370 int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
371                       void *rec, size_t len)
372 {
373         int ret = tdb->methods->read(tdb, off, rec, len);
374         tdb_convert(tdb, rec, len);
375         return ret;
376 }
377
378 int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
379 {
380         return tdb_write_convert(tdb, off, &val, sizeof(val));
381 }
382
383 static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
384                              tdb_len_t len, unsigned int prefix)
385 {
386         void *buf;
387
388         /* some systems don't like zero length malloc */
389         buf = malloc(prefix + len ? prefix + len : 1);
390         if (unlikely(!buf)) {
391                 tdb->ecode = TDB_ERR_OOM;
392                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
393                          "tdb_alloc_read malloc failed len=%lld\n",
394                          (long long)prefix + len);
395         } else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix, len))) {
396                 free(buf);
397                 buf = NULL;
398         }
399         return buf;
400 }
401
402 /* read a lump of data, allocating the space for it */
403 void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
404 {
405         return _tdb_alloc_read(tdb, offset, len, 0);
406 }
407
408 static int fill(struct tdb_context *tdb,
409                 const void *buf, size_t size,
410                 tdb_off_t off, tdb_len_t len)
411 {
412         while (len) {
413                 size_t n = len > size ? size : len;
414
415                 if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
416                         tdb->ecode = TDB_ERR_IO;
417                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
418                                  "fill write failed: giving up!\n");
419                         return -1;
420                 }
421                 len -= n;
422                 off += n;
423         }
424         return 0;
425 }
426
427 /* expand a file.  we prefer to use ftruncate, as that is what posix
428   says to use for mmap expansion */
429 static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
430 {
431         char buf[8192];
432
433         if (tdb->read_only) {
434                 tdb->ecode = TDB_ERR_RDONLY;
435                 return -1;
436         }
437
438         if (tdb->flags & TDB_INTERNAL) {
439                 char *new = realloc(tdb->map_ptr, tdb->map_size + addition);
440                 if (!new) {
441                         tdb->ecode = TDB_ERR_OOM;
442                         return -1;
443                 }
444                 tdb->map_ptr = new;
445                 tdb->map_size += addition;
446         } else {
447                 /* Unmap before trying to write; old TDB claimed OpenBSD had
448                  * problem with this otherwise. */
449                 tdb_munmap(tdb);
450
451                 /* If this fails, we try to fill anyway. */
452                 if (ftruncate(tdb->fd, tdb->map_size + addition))
453                         ;
454
455                 /* now fill the file with something. This ensures that the
456                    file isn't sparse, which would be very bad if we ran out of
457                    disk. This must be done with write, not via mmap */
458                 memset(buf, 0x43, sizeof(buf));
459                 if (fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1)
460                         return -1;
461                 tdb->map_size += addition;
462                 tdb_mmap(tdb);
463         }
464         return 0;
465 }
466
467 /* This is only neded for tdb_access_commit, but used everywhere to simplify. */
468 struct tdb_access_hdr {
469         tdb_off_t off;
470         tdb_len_t len;
471         bool convert;
472 };
473
474 const void *tdb_access_read(struct tdb_context *tdb,
475                             tdb_off_t off, tdb_len_t len, bool convert)
476 {
477         const void *ret = NULL; 
478
479         if (likely(!(tdb->flags & TDB_CONVERT)))
480                 ret = tdb_direct(tdb, off, len);
481
482         if (!ret) {
483                 struct tdb_access_hdr *hdr;
484                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
485                 if (hdr) {
486                         ret = hdr + 1;
487                         if (convert)
488                                 tdb_convert(tdb, (void *)ret, len);
489                 }
490         } else
491                 tdb->direct_access++;
492
493         return ret;
494 }
495
496 void *tdb_access_write(struct tdb_context *tdb,
497                        tdb_off_t off, tdb_len_t len, bool convert)
498 {
499         void *ret = NULL;
500
501         if (likely(!(tdb->flags & TDB_CONVERT)))
502                 ret = tdb_direct(tdb, off, len);
503
504         if (!ret) {
505                 struct tdb_access_hdr *hdr;
506                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
507                 if (hdr) {
508                         hdr->off = off;
509                         hdr->len = len;
510                         hdr->convert = convert;
511                         ret = hdr + 1;
512                         if (convert)
513                                 tdb_convert(tdb, (void *)ret, len);
514                 }
515         } else
516                 tdb->direct_access++;
517
518         return ret;
519 }
520
521 void tdb_access_release(struct tdb_context *tdb, const void *p)
522 {
523         if (!tdb->map_ptr
524             || (char *)p < (char *)tdb->map_ptr
525             || (char *)p >= (char *)tdb->map_ptr + tdb->map_size)
526                 free((struct tdb_access_hdr *)p - 1);
527         else
528                 tdb->direct_access--;
529 }
530
531 int tdb_access_commit(struct tdb_context *tdb, void *p)
532 {
533         int ret = 0;
534
535         if (!tdb->map_ptr
536             || (char *)p < (char *)tdb->map_ptr
537             || (char *)p >= (char *)tdb->map_ptr + tdb->map_size) {
538                 struct tdb_access_hdr *hdr;
539
540                 hdr = (struct tdb_access_hdr *)p - 1;
541                 if (hdr->convert)
542                         ret = tdb_write_convert(tdb, hdr->off, p, hdr->len);
543                 else
544                         ret = tdb_write(tdb, hdr->off, p, hdr->len);
545                 free(hdr);
546         } else
547                 tdb->direct_access--;
548
549         return ret;
550 }
551
/*
 * The tdb1-era helpers (an older tdb_write, tdb_next_hash_chain,
 * tdb_ofs_read/tdb_ofs_write, tdb_rec_read/tdb_rec_write) are not carried
 * in this file.  They were compiled out, clashed with the static
 * tdb_write defined above, and relied on interfaces this file does not
 * use (methods->tdb_read/tdb_write/tdb_oob, TDB_LOG, DOCONV, CONVERT).
 */
659
660 static const struct tdb_methods io_methods = {
661         tdb_read,
662         tdb_write,
663         tdb_oob,
664         tdb_expand_file,
665 };
666
667 /*
668   initialise the default methods table
669 */
670 void tdb_io_init(struct tdb_context *tdb)
671 {
672         tdb->methods = &io_methods;
673 }