tdb2: don't hold access to tdb mmap during traverse.
[ccan] / ccan / tdb2 / io.c
1  /* 
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              1999-2005
7    Copyright (C) Paul `Rusty' Russell              2000
8    Copyright (C) Jeremy Allison                    2000-2003
9    Copyright (C) Rusty Russell                     2010
10
11      ** NOTE! The following LGPL license applies to the tdb
12      ** library. This does NOT imply that all of Samba is released
13      ** under the LGPL
14
15    This library is free software; you can redistribute it and/or
16    modify it under the terms of the GNU Lesser General Public
17    License as published by the Free Software Foundation; either
18    version 3 of the License, or (at your option) any later version.
19
20    This library is distributed in the hope that it will be useful,
21    but WITHOUT ANY WARRANTY; without even the implied warranty of
22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23    Lesser General Public License for more details.
24
25    You should have received a copy of the GNU Lesser General Public
26    License along with this library; if not, see <http://www.gnu.org/licenses/>.
27 */
28 #include "private.h"
29 #include <assert.h>
30 #include <ccan/likely/likely.h>
31
32 void tdb_munmap(struct tdb_context *tdb)
33 {
34         if (tdb->flags & TDB_INTERNAL)
35                 return;
36
37         if (tdb->map_ptr) {
38                 munmap(tdb->map_ptr, tdb->map_size);
39                 tdb->map_ptr = NULL;
40         }
41 }
42
43 void tdb_mmap(struct tdb_context *tdb)
44 {
45         if (tdb->flags & TDB_INTERNAL)
46                 return;
47
48         if (tdb->flags & TDB_NOMMAP)
49                 return;
50
51         tdb->map_ptr = mmap(NULL, tdb->map_size, 
52                             PROT_READ|(tdb->read_only? 0:PROT_WRITE), 
53                             MAP_SHARED, tdb->fd, 0);
54
55         /*
56          * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
57          */
58         if (tdb->map_ptr == MAP_FAILED) {
59                 tdb->map_ptr = NULL;
60                 tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
61                          "tdb_mmap failed for size %lld (%s)\n", 
62                          (long long)tdb->map_size, strerror(errno));
63         }
64 }
65
66 /* check for an out of bounds access - if it is out of bounds then
67    see if the database has been expanded by someone else and expand
68    if necessary 
69    note that "len" is the minimum length needed for the db
70 */
71 static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
72 {
73         struct stat st;
74         int ret;
75
76         /* We can't hold pointers during this: we could unmap! */
77         assert(!tdb->direct_access || tdb_has_expansion_lock(tdb));
78
79         if (len <= tdb->map_size)
80                 return 0;
81         if (tdb->flags & TDB_INTERNAL) {
82                 if (!probe) {
83                         /* Ensure ecode is set for log fn. */
84                         tdb->ecode = TDB_ERR_IO;
85                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
86                                  "tdb_oob len %lld beyond internal"
87                                  " malloc size %lld\n",
88                                  (long long)len,
89                                  (long long)tdb->map_size);
90                 }
91                 return -1;
92         }
93
94         if (tdb_lock_expand(tdb, F_RDLCK) != 0)
95                 return -1;
96
97         ret = fstat(tdb->fd, &st);
98
99         tdb_unlock_expand(tdb, F_RDLCK);
100
101         if (ret == -1) {
102                 tdb->ecode = TDB_ERR_IO;
103                 return -1;
104         }
105
106         if (st.st_size < (size_t)len) {
107                 if (!probe) {
108                         /* Ensure ecode is set for log fn. */
109                         tdb->ecode = TDB_ERR_IO;
110                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
111                                  "tdb_oob len %lld beyond eof at %lld\n",
112                                  (long long)len, (long long)st.st_size);
113                 }
114                 return -1;
115         }
116
117         /* Unmap, update size, remap */
118         tdb_munmap(tdb);
119
120         tdb->map_size = st.st_size;
121         tdb_mmap(tdb);
122         return 0;
123 }
124
125 static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
126 {
127         if (unlikely(!tdb->map_ptr))
128                 return NULL;
129
130         /* FIXME: We can do a subset of this! */
131         if (tdb->transaction)
132                 return NULL;
133
134         if (unlikely(tdb_oob(tdb, off + len, true) == -1))
135                 return NULL;
136         return (char *)tdb->map_ptr + off;
137 }
138
139 /* Either make a copy into pad and return that, or return ptr into mmap. */
140 /* Note: pad has to be a real object, so we can't get here if len
141  * overflows size_t */
142 void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len)
143 {
144         if (likely(!(tdb->flags & TDB_CONVERT))) {
145                 void *ret = tdb_direct(tdb, off, len);
146                 if (ret)
147                         return ret;
148         }
149         return tdb_read_convert(tdb, off, pad, len) == -1 ? NULL : pad;
150 }
151
152 /* Endian conversion: we only ever deal with 8 byte quantities */
153 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
154 {
155         if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
156                 uint64_t i, *p = (uint64_t *)buf;
157                 for (i = 0; i < size / 8; i++)
158                         p[i] = bswap_64(p[i]);
159         }
160         return buf;
161 }
162
163 /* FIXME: Return the off? */
164 uint64_t tdb_find_nonzero_off(struct tdb_context *tdb,
165                               tdb_off_t base, uint64_t start, uint64_t end)
166 {
167         uint64_t i;
168         const uint64_t *val;
169
170         /* Zero vs non-zero is the same unconverted: minor optimization. */
171         val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t),
172                               (end - start) * sizeof(tdb_off_t), false);
173         if (!val)
174                 return end;
175
176         for (i = 0; i < (end - start); i++) {
177                 if (val[i])
178                         break;
179         }
180         tdb_access_release(tdb, val);
181         return start + i;
182 }
183
184 /* Return first zero offset in num offset array, or num. */
185 uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
186                            uint64_t num)
187 {
188         uint64_t i;
189         const uint64_t *val;
190
191         /* Zero vs non-zero is the same unconverted: minor optimization. */
192         val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false);
193         if (!val)
194                 return num;
195
196         for (i = 0; i < num; i++) {
197                 if (!val[i])
198                         break;
199         }
200         tdb_access_release(tdb, val);
201         return i;
202 }
203
204 int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
205 {
206         char buf[8192] = { 0 };
207         void *p = tdb_direct(tdb, off, len);
208         if (p) {
209                 memset(p, 0, len);
210                 return 0;
211         }
212         while (len) {
213                 unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
214                 if (tdb->methods->write(tdb, off, buf, todo) == -1)
215                         return -1;
216                 len -= todo;
217                 off += todo;
218         }
219         return 0;
220 }
221
222 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
223 {
224         tdb_off_t pad, *ret;
225
226         ret = tdb_get(tdb, off, &pad, sizeof(pad));
227         if (!ret) {
228                 return TDB_OFF_ERR;
229         }
230         return *ret;
231 }
232
233 /* Even on files, we can get partial writes due to signals. */
234 bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off)
235 {
236         while (len) {
237                 ssize_t ret;
238                 ret = pwrite(fd, buf, len, off);
239                 if (ret < 0)
240                         return false;
241                 if (ret == 0) {
242                         errno = ENOSPC;
243                         return false;
244                 }
245                 buf = (char *)buf + ret;
246                 off += ret;
247                 len -= ret;
248         }
249         return true;
250 }
251
252 /* Even on files, we can get partial reads due to signals. */
253 bool tdb_pread_all(int fd, void *buf, size_t len, tdb_off_t off)
254 {
255         while (len) {
256                 ssize_t ret;
257                 ret = pread(fd, buf, len, off);
258                 if (ret < 0)
259                         return false;
260                 if (ret == 0) {
261                         /* ETOOSHORT? */
262                         errno = EWOULDBLOCK;
263                         return false;
264                 }
265                 buf = (char *)buf + ret;
266                 off += ret;
267                 len -= ret;
268         }
269         return true;
270 }
271
/* Read exactly len bytes from fd's current position into buf,
 * continuing after short reads and retrying calls interrupted by a
 * signal (EINTR).  Returns true once buf is full.  Returns false with
 * errno set on any other read() error, or with errno = EWOULDBLOCK if
 * end of file is hit first. */
bool tdb_read_all(int fd, void *buf, size_t len)
{
        while (len) {
                ssize_t ret;
                ret = read(fd, buf, len);
                if (ret < 0) {
                        /* Interrupted before any data moved: retry. */
                        if (errno == EINTR)
                                continue;
                        return false;
                }
                if (ret == 0) {
                        /* ETOOSHORT? */
                        errno = EWOULDBLOCK;
                        return false;
                }
                buf = (char *)buf + ret;
                len -= ret;
        }
        return true;
}
289
290 /* write a lump of data at a specified offset */
291 static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
292                      const void *buf, tdb_len_t len)
293 {
294         if (len == 0) {
295                 return 0;
296         }
297
298         if (tdb->read_only) {
299                 tdb->ecode = TDB_ERR_RDONLY;
300                 return -1;
301         }
302
303         if (tdb->methods->oob(tdb, off + len, 0) != 0)
304                 return -1;
305
306         if (tdb->map_ptr) {
307                 memcpy(off + (char *)tdb->map_ptr, buf, len);
308         } else {
309                 if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
310                         tdb->ecode = TDB_ERR_IO;
311                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
312                                  "tdb_write failed at %llu len=%llu (%s)\n",
313                                  off, len, strerror(errno));
314                         return -1;
315                 }
316         }
317         return 0;
318 }
319
320 /* read a lump of data at a specified offset */
321 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
322                     tdb_len_t len)
323 {
324         if (tdb->methods->oob(tdb, off + len, 0) != 0) {
325                 return -1;
326         }
327
328         if (tdb->map_ptr) {
329                 memcpy(buf, off + (char *)tdb->map_ptr, len);
330         } else {
331                 if (!tdb_pread_all(tdb->fd, buf, len, off)) {
332                         /* Ensure ecode is set for log fn. */
333                         tdb->ecode = TDB_ERR_IO;
334                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
335                                  "tdb_read failed at %lld "
336                                  "len=%lld (%s) map_size=%lld\n",
337                                  (long long)off, (long long)len,
338                                  strerror(errno),
339                                  (long long)tdb->map_size);
340                         return -1;
341                 }
342         }
343         return 0;
344 }
345
346 int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
347                       const void *rec, size_t len)
348 {
349         int ret;
350         if (unlikely((tdb->flags & TDB_CONVERT))) {
351                 void *conv = malloc(len);
352                 if (!conv) {
353                         tdb->ecode = TDB_ERR_OOM;
354                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
355                                  "tdb_write: no memory converting %zu bytes\n",
356                                  len);
357                         return -1;
358                 }
359                 memcpy(conv, rec, len);
360                 ret = tdb->methods->write(tdb, off,
361                                           tdb_convert(tdb, conv, len), len);
362                 free(conv);
363         } else
364                 ret = tdb->methods->write(tdb, off, rec, len);
365
366         return ret;
367 }
368
369 int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
370                       void *rec, size_t len)
371 {
372         int ret = tdb->methods->read(tdb, off, rec, len);
373         tdb_convert(tdb, rec, len);
374         return ret;
375 }
376
377 int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
378 {
379         return tdb_write_convert(tdb, off, &val, sizeof(val));
380 }
381
382 static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
383                              tdb_len_t len, unsigned int prefix)
384 {
385         void *buf;
386
387         /* some systems don't like zero length malloc */
388         buf = malloc(prefix + len ? prefix + len : 1);
389         if (unlikely(!buf)) {
390                 tdb->ecode = TDB_ERR_OOM;
391                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
392                          "tdb_alloc_read malloc failed len=%lld\n",
393                          (long long)prefix + len);
394         } else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix, len))) {
395                 free(buf);
396                 buf = NULL;
397         }
398         return buf;
399 }
400
401 /* read a lump of data, allocating the space for it */
402 void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
403 {
404         return _tdb_alloc_read(tdb, offset, len, 0);
405 }
406
407 static int fill(struct tdb_context *tdb,
408                 const void *buf, size_t size,
409                 tdb_off_t off, tdb_len_t len)
410 {
411         while (len) {
412                 size_t n = len > size ? size : len;
413
414                 if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
415                         tdb->ecode = TDB_ERR_IO;
416                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
417                                  "fill write failed: giving up!\n");
418                         return -1;
419                 }
420                 len -= n;
421                 off += n;
422         }
423         return 0;
424 }
425
426 /* expand a file.  we prefer to use ftruncate, as that is what posix
427   says to use for mmap expansion */
428 static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
429 {
430         char buf[8192];
431
432         if (tdb->read_only) {
433                 tdb->ecode = TDB_ERR_RDONLY;
434                 return -1;
435         }
436
437         if (tdb->flags & TDB_INTERNAL) {
438                 char *new = realloc(tdb->map_ptr, tdb->map_size + addition);
439                 if (!new) {
440                         tdb->ecode = TDB_ERR_OOM;
441                         return -1;
442                 }
443                 tdb->map_ptr = new;
444                 tdb->map_size += addition;
445         } else {
446                 /* Unmap before trying to write; old TDB claimed OpenBSD had
447                  * problem with this otherwise. */
448                 tdb_munmap(tdb);
449
450                 /* If this fails, we try to fill anyway. */
451                 if (ftruncate(tdb->fd, tdb->map_size + addition))
452                         ;
453
454                 /* now fill the file with something. This ensures that the
455                    file isn't sparse, which would be very bad if we ran out of
456                    disk. This must be done with write, not via mmap */
457                 memset(buf, 0x43, sizeof(buf));
458                 if (fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1)
459                         return -1;
460                 tdb->map_size += addition;
461                 tdb_mmap(tdb);
462         }
463         return 0;
464 }
465
/* Header placed immediately in front of every heap copy handed out by
 * tdb_access_read() / tdb_access_write().
 * This is only needed for tdb_access_commit, but used everywhere to
 * simplify. */
struct tdb_access_hdr {
        /* Offset the copy was read from; tdb_access_commit writes back here. */
        tdb_off_t off;
        /* Length of the copy in bytes. */
        tdb_len_t len;
        /* True if tdb_access_commit must write back via tdb_write_convert. */
        bool convert;
};
472
473 const void *tdb_access_read(struct tdb_context *tdb,
474                             tdb_off_t off, tdb_len_t len, bool convert)
475 {
476         const void *ret = NULL; 
477
478         if (likely(!(tdb->flags & TDB_CONVERT)))
479                 ret = tdb_direct(tdb, off, len);
480
481         if (!ret) {
482                 struct tdb_access_hdr *hdr;
483                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
484                 if (hdr) {
485                         ret = hdr + 1;
486                         if (convert)
487                                 tdb_convert(tdb, (void *)ret, len);
488                 }
489         } else
490                 tdb->direct_access++;
491
492         return ret;
493 }
494
495 void *tdb_access_write(struct tdb_context *tdb,
496                        tdb_off_t off, tdb_len_t len, bool convert)
497 {
498         void *ret = NULL;
499
500         if (likely(!(tdb->flags & TDB_CONVERT)))
501                 ret = tdb_direct(tdb, off, len);
502
503         if (!ret) {
504                 struct tdb_access_hdr *hdr;
505                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
506                 if (hdr) {
507                         hdr->off = off;
508                         hdr->len = len;
509                         hdr->convert = convert;
510                         ret = hdr + 1;
511                         if (convert)
512                                 tdb_convert(tdb, (void *)ret, len);
513                 }
514         } else
515                 tdb->direct_access++;
516
517         return ret;
518 }
519
520 void tdb_access_release(struct tdb_context *tdb, const void *p)
521 {
522         if (!tdb->map_ptr
523             || (char *)p < (char *)tdb->map_ptr
524             || (char *)p >= (char *)tdb->map_ptr + tdb->map_size)
525                 free((struct tdb_access_hdr *)p - 1);
526         else
527                 tdb->direct_access--;
528 }
529
530 int tdb_access_commit(struct tdb_context *tdb, void *p)
531 {
532         int ret = 0;
533
534         if (!tdb->map_ptr
535             || (char *)p < (char *)tdb->map_ptr
536             || (char *)p >= (char *)tdb->map_ptr + tdb->map_size) {
537                 struct tdb_access_hdr *hdr;
538
539                 hdr = (struct tdb_access_hdr *)p - 1;
540                 if (hdr->convert)
541                         ret = tdb_write_convert(tdb, hdr->off, p, hdr->len);
542                 else
543                         ret = tdb_write(tdb, hdr->off, p, hdr->len);
544                 free(hdr);
545         } else
546                 tdb->direct_access--;
547
548         return ret;
549 }
550
#if 0
/*
 * DISABLED CODE: everything up to the matching #endif is removed by the
 * preprocessor and is not built.
 *
 * It refers to names the live code in this file does not use (TDB_LOG,
 * DOCONV, CONVERT, tdb->header, tdb->traverse_read, methods->tdb_read /
 * tdb_write / tdb_oob) and redefines tdb_write, so it cannot simply be
 * re-enabled.
 * NOTE(review): this looks like code carried over from an earlier tdb
 * implementation and kept for reference - confirm before deleting.
 */
/* write a lump of data at a specified offset */
static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
                     const void *buf, tdb_len_t len)
{
        if (len == 0) {
                return 0;
        }

        if (tdb->read_only || tdb->traverse_read) {
                tdb->ecode = TDB_ERR_RDONLY;
                return -1;
        }

        if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0)
                return -1;

        if (tdb->map_ptr) {
                memcpy(off + (char *)tdb->map_ptr, buf, len);
        } else {
                ssize_t written = pwrite(tdb->fd, buf, len, off);
                if ((written != (ssize_t)len) && (written != -1)) {
                        /* try once more */
                        tdb->ecode = TDB_ERR_IO;
                        TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only "
                                 "%d of %d bytes at %d, trying once more\n",
                                 (int)written, len, off));
                        written = pwrite(tdb->fd, (const char *)buf+written,
                                         len-written,
                                         off+written);
                }
                if (written == -1) {
                        /* Ensure ecode is set for log fn. */
                        tdb->ecode = TDB_ERR_IO;
                        TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d "
                                 "len=%d (%s)\n", off, len, strerror(errno)));
                        return -1;
                } else if (written != (ssize_t)len) {
                        tdb->ecode = TDB_ERR_IO;
                        TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to "
                                 "write %d bytes at %d in two attempts\n",
                                 len, off));
                        return -1;
                }
        }
        return 0;
}



/*
  do an unlocked scan of the hash table heads to find the next non-zero head. The value
  will then be confirmed with the lock held
*/              
static void tdb_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
{
        uint32_t h = *chain;
        if (tdb->map_ptr) {
                for (;h < tdb->header.hash_size;h++) {
                        if (0 != *(uint32_t *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) {
                                break;
                        }
                }
        } else {
                uint32_t off=0;
                for (;h < tdb->header.hash_size;h++) {
                        if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) {
                                break;
                        }
                }
        }
        (*chain) = h;
}

/* read/write a tdb_off_t */
int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
{
        return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
}

int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
{
        tdb_off_t off = *d;
        return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
}


/* read/write a record */
int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
{
        if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
                return -1;
        if (TDB_BAD_MAGIC(rec)) {
                /* Ensure ecode is set for log fn. */
                tdb->ecode = TDB_ERR_CORRUPT;
                TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
                return -1;
        }
        return tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0);
}

int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
{
        struct tdb_record r = *rec;
        return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
}
#endif
658
/* Default method table, installed by tdb_io_init().  The initializer is
 * positional, so the order must match the member order of
 * struct tdb_methods (declared outside this file). */
static const struct tdb_methods io_methods = {
        tdb_read,               /* called in this file as methods->read */
        tdb_write,              /* called in this file as methods->write */
        tdb_oob,                /* called in this file as methods->oob */
        tdb_expand_file,        /* NOTE(review): member name not visible here - presumably the expand hook */
};
665
666 /*
667   initialise the default methods table
668 */
669 void tdb_io_init(struct tdb_context *tdb)
670 {
671         tdb->methods = &io_methods;
672 }