tdb2: add write flag to tdb_direct
[ccan] / ccan / tdb2 / io.c
1  /* 
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              1999-2005
7    Copyright (C) Paul `Rusty' Russell              2000
8    Copyright (C) Jeremy Allison                    2000-2003
9    Copyright (C) Rusty Russell                     2010
10
11      ** NOTE! The following LGPL license applies to the tdb
12      ** library. This does NOT imply that all of Samba is released
13      ** under the LGPL
14
15    This library is free software; you can redistribute it and/or
16    modify it under the terms of the GNU Lesser General Public
17    License as published by the Free Software Foundation; either
18    version 3 of the License, or (at your option) any later version.
19
20    This library is distributed in the hope that it will be useful,
21    but WITHOUT ANY WARRANTY; without even the implied warranty of
22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23    Lesser General Public License for more details.
24
25    You should have received a copy of the GNU Lesser General Public
26    License along with this library; if not, see <http://www.gnu.org/licenses/>.
27 */
28 #include "private.h"
29 #include <assert.h>
30 #include <ccan/likely/likely.h>
31
32 void tdb_munmap(struct tdb_context *tdb)
33 {
34         if (tdb->flags & TDB_INTERNAL)
35                 return;
36
37         if (tdb->map_ptr) {
38                 munmap(tdb->map_ptr, tdb->map_size);
39                 tdb->map_ptr = NULL;
40         }
41 }
42
43 void tdb_mmap(struct tdb_context *tdb)
44 {
45         if (tdb->flags & TDB_INTERNAL)
46                 return;
47
48         if (tdb->flags & TDB_NOMMAP)
49                 return;
50
51         tdb->map_ptr = mmap(NULL, tdb->map_size, tdb->mmap_flags,
52                             MAP_SHARED, tdb->fd, 0);
53
54         /*
55          * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
56          */
57         if (tdb->map_ptr == MAP_FAILED) {
58                 tdb->map_ptr = NULL;
59                 tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_WARNING,
60                            "tdb_mmap failed for size %lld (%s)",
61                            (long long)tdb->map_size, strerror(errno));
62         }
63 }
64
65 /* check for an out of bounds access - if it is out of bounds then
66    see if the database has been expanded by someone else and expand
67    if necessary 
68    note that "len" is the minimum length needed for the db
69 */
70 static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
71 {
72         struct stat st;
73
74         /* We can't hold pointers during this: we could unmap! */
75         assert(!tdb->direct_access
76                || (tdb->flags & TDB_NOLOCK)
77                || tdb_has_expansion_lock(tdb));
78
79         if (len <= tdb->map_size)
80                 return 0;
81         if (tdb->flags & TDB_INTERNAL) {
82                 if (!probe) {
83                         tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
84                                  "tdb_oob len %lld beyond internal"
85                                  " malloc size %lld",
86                                  (long long)len,
87                                  (long long)tdb->map_size);
88                 }
89                 return -1;
90         }
91
92         if (tdb_lock_expand(tdb, F_RDLCK) != 0)
93                 return -1;
94
95         if (fstat(tdb->fd, &st) != 0) {
96                 tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
97                            "Failed to fstat file: %s", strerror(errno));
98                 tdb_unlock_expand(tdb, F_RDLCK);
99                 return -1;
100         }
101
102         tdb_unlock_expand(tdb, F_RDLCK);
103
104         if (st.st_size < (size_t)len) {
105                 if (!probe) {
106                         tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
107                                    "tdb_oob len %zu beyond eof at %zu",
108                                    (size_t)len, st.st_size);
109                 }
110                 return -1;
111         }
112
113         /* Unmap, update size, remap */
114         tdb_munmap(tdb);
115
116         tdb->map_size = st.st_size;
117         tdb_mmap(tdb);
118         return 0;
119 }
120
121 /* Endian conversion: we only ever deal with 8 byte quantities */
122 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
123 {
124         if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
125                 uint64_t i, *p = (uint64_t *)buf;
126                 for (i = 0; i < size / 8; i++)
127                         p[i] = bswap_64(p[i]);
128         }
129         return buf;
130 }
131
132 /* FIXME: Return the off? */
133 uint64_t tdb_find_nonzero_off(struct tdb_context *tdb,
134                               tdb_off_t base, uint64_t start, uint64_t end)
135 {
136         uint64_t i;
137         const uint64_t *val;
138
139         /* Zero vs non-zero is the same unconverted: minor optimization. */
140         val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t),
141                               (end - start) * sizeof(tdb_off_t), false);
142         if (!val)
143                 return end;
144
145         for (i = 0; i < (end - start); i++) {
146                 if (val[i])
147                         break;
148         }
149         tdb_access_release(tdb, val);
150         return start + i;
151 }
152
153 /* Return first zero offset in num offset array, or num. */
154 uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
155                            uint64_t num)
156 {
157         uint64_t i;
158         const uint64_t *val;
159
160         /* Zero vs non-zero is the same unconverted: minor optimization. */
161         val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false);
162         if (!val)
163                 return num;
164
165         for (i = 0; i < num; i++) {
166                 if (!val[i])
167                         break;
168         }
169         tdb_access_release(tdb, val);
170         return i;
171 }
172
173 int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
174 {
175         char buf[8192] = { 0 };
176         void *p = tdb->methods->direct(tdb, off, len, true);
177
178         assert(!tdb->read_only);
179         if (p) {
180                 memset(p, 0, len);
181                 return 0;
182         }
183         while (len) {
184                 unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
185                 if (tdb->methods->write(tdb, off, buf, todo) == -1)
186                         return -1;
187                 len -= todo;
188                 off += todo;
189         }
190         return 0;
191 }
192
193 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
194 {
195         tdb_off_t ret;
196
197         if (likely(!(tdb->flags & TDB_CONVERT))) {
198                 tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
199                                                     false);
200                 if (p)
201                         return *p;
202         }
203
204         if (tdb_read_convert(tdb, off, &ret, sizeof(ret)) == -1)
205                 return TDB_OFF_ERR;
206         return ret;
207 }
208
209 /* Even on files, we can get partial writes due to signals. */
210 bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off)
211 {
212         while (len) {
213                 ssize_t ret;
214                 ret = pwrite(fd, buf, len, off);
215                 if (ret < 0)
216                         return false;
217                 if (ret == 0) {
218                         errno = ENOSPC;
219                         return false;
220                 }
221                 buf = (char *)buf + ret;
222                 off += ret;
223                 len -= ret;
224         }
225         return true;
226 }
227
228 /* Even on files, we can get partial reads due to signals. */
229 bool tdb_pread_all(int fd, void *buf, size_t len, tdb_off_t off)
230 {
231         while (len) {
232                 ssize_t ret;
233                 ret = pread(fd, buf, len, off);
234                 if (ret < 0)
235                         return false;
236                 if (ret == 0) {
237                         /* ETOOSHORT? */
238                         errno = EWOULDBLOCK;
239                         return false;
240                 }
241                 buf = (char *)buf + ret;
242                 off += ret;
243                 len -= ret;
244         }
245         return true;
246 }
247
/* Read exactly "len" bytes from the current position of fd into buf,
 * repeating read() after short reads.  Returns false with errno set if
 * read() fails; hitting end of file early is reported as EWOULDBLOCK. */
bool tdb_read_all(int fd, void *buf, size_t len)
{
	char *dst = buf;

	while (len > 0) {
		ssize_t got = read(fd, dst, len);

		if (got < 0)
			return false;
		if (got == 0) {
			/* ETOOSHORT? */
			errno = EWOULDBLOCK;
			return false;
		}
		dst += got;
		len -= got;
	}
	return true;
}
265
266 /* write a lump of data at a specified offset */
267 static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
268                      const void *buf, tdb_len_t len)
269 {
270         if (tdb->read_only) {
271                 tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
272                            "Write to read-only database");
273                 return -1;
274         }
275
276         /* FIXME: Bogus optimization? */
277         if (len == 0) {
278                 return 0;
279         }
280
281         if (tdb->methods->oob(tdb, off + len, 0) != 0)
282                 return -1;
283
284         if (tdb->map_ptr) {
285                 memcpy(off + (char *)tdb->map_ptr, buf, len);
286         } else {
287                 if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
288                         tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
289                                    "tdb_write failed at %zu len=%zu (%s)",
290                                    (size_t)off, (size_t)len, strerror(errno));
291                         return -1;
292                 }
293         }
294         return 0;
295 }
296
297 /* read a lump of data at a specified offset */
298 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
299                     tdb_len_t len)
300 {
301         if (tdb->methods->oob(tdb, off + len, 0) != 0) {
302                 return -1;
303         }
304
305         if (tdb->map_ptr) {
306                 memcpy(buf, off + (char *)tdb->map_ptr, len);
307         } else {
308                 if (!tdb_pread_all(tdb->fd, buf, len, off)) {
309                         tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
310                                    "tdb_read failed at %zu "
311                                    "len=%zu (%s) map_size=%zu",
312                                  (size_t)off, (size_t)len,
313                                  strerror(errno),
314                                  (size_t)tdb->map_size);
315                         return -1;
316                 }
317         }
318         return 0;
319 }
320
321 int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
322                       const void *rec, size_t len)
323 {
324         int ret;
325         if (unlikely((tdb->flags & TDB_CONVERT))) {
326                 void *conv = malloc(len);
327                 if (!conv) {
328                         tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
329                                    "tdb_write: no memory converting"
330                                    " %zu bytes", len);
331                         return -1;
332                 }
333                 memcpy(conv, rec, len);
334                 ret = tdb->methods->write(tdb, off,
335                                           tdb_convert(tdb, conv, len), len);
336                 free(conv);
337         } else
338                 ret = tdb->methods->write(tdb, off, rec, len);
339
340         return ret;
341 }
342
343 int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
344                       void *rec, size_t len)
345 {
346         int ret = tdb->methods->read(tdb, off, rec, len);
347         tdb_convert(tdb, rec, len);
348         return ret;
349 }
350
351 int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
352 {
353         if (tdb->read_only) {
354                 tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
355                            "Write to read-only database");
356                 return -1;
357         }
358
359         if (likely(!(tdb->flags & TDB_CONVERT))) {
360                 tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
361                                                     true);
362                 if (p) {
363                         *p = val;
364                         return 0;
365                 }
366         }
367         return tdb_write_convert(tdb, off, &val, sizeof(val));
368 }
369
370 static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
371                              tdb_len_t len, unsigned int prefix)
372 {
373         void *buf;
374
375         /* some systems don't like zero length malloc */
376         buf = malloc(prefix + len ? prefix + len : 1);
377         if (!buf) {
378                 tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
379                            "tdb_alloc_read malloc failed len=%zu",
380                            (size_t)(prefix + len));
381         } else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix,
382                                                len) == -1)) {
383                 free(buf);
384                 buf = NULL;
385         }
386         return buf;
387 }
388
389 /* read a lump of data, allocating the space for it */
390 void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
391 {
392         return _tdb_alloc_read(tdb, offset, len, 0);
393 }
394
395 static int fill(struct tdb_context *tdb,
396                 const void *buf, size_t size,
397                 tdb_off_t off, tdb_len_t len)
398 {
399         while (len) {
400                 size_t n = len > size ? size : len;
401
402                 if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
403                         tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
404                                  "fill write failed: giving up!");
405                         return -1;
406                 }
407                 len -= n;
408                 off += n;
409         }
410         return 0;
411 }
412
413 /* expand a file.  we prefer to use ftruncate, as that is what posix
414   says to use for mmap expansion */
415 static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
416 {
417         char buf[8192];
418
419         if (tdb->read_only) {
420                 tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
421                            "Expand on read-only database");
422                 return -1;
423         }
424
425         if (tdb->flags & TDB_INTERNAL) {
426                 char *new = realloc(tdb->map_ptr, tdb->map_size + addition);
427                 if (!new) {
428                         tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
429                                    "No memory to expand database");
430                         return -1;
431                 }
432                 tdb->map_ptr = new;
433                 tdb->map_size += addition;
434         } else {
435                 /* Unmap before trying to write; old TDB claimed OpenBSD had
436                  * problem with this otherwise. */
437                 tdb_munmap(tdb);
438
439                 /* If this fails, we try to fill anyway. */
440                 if (ftruncate(tdb->fd, tdb->map_size + addition))
441                         ;
442
443                 /* now fill the file with something. This ensures that the
444                    file isn't sparse, which would be very bad if we ran out of
445                    disk. This must be done with write, not via mmap */
446                 memset(buf, 0x43, sizeof(buf));
447                 if (0 || fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1)
448                         return -1;
449                 tdb->map_size += addition;
450                 tdb_mmap(tdb);
451         }
452         return 0;
453 }
454
455 /* This is only neded for tdb_access_commit, but used everywhere to simplify. */
456 struct tdb_access_hdr {
457         tdb_off_t off;
458         tdb_len_t len;
459         bool convert;
460 };
461
462 const void *tdb_access_read(struct tdb_context *tdb,
463                             tdb_off_t off, tdb_len_t len, bool convert)
464 {
465         const void *ret = NULL; 
466
467         if (likely(!(tdb->flags & TDB_CONVERT)))
468                 ret = tdb->methods->direct(tdb, off, len, false);
469
470         if (!ret) {
471                 struct tdb_access_hdr *hdr;
472                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
473                 if (hdr) {
474                         ret = hdr + 1;
475                         if (convert)
476                                 tdb_convert(tdb, (void *)ret, len);
477                 }
478         } else
479                 tdb->direct_access++;
480
481         return ret;
482 }
483
484 void *tdb_access_write(struct tdb_context *tdb,
485                        tdb_off_t off, tdb_len_t len, bool convert)
486 {
487         void *ret = NULL;
488
489         if (tdb->read_only) {
490                 tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
491                            "Write to read-only database");
492                 return NULL;
493         }
494
495         if (likely(!(tdb->flags & TDB_CONVERT)))
496                 ret = tdb->methods->direct(tdb, off, len, true);
497
498         if (!ret) {
499                 struct tdb_access_hdr *hdr;
500                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
501                 if (hdr) {
502                         hdr->off = off;
503                         hdr->len = len;
504                         hdr->convert = convert;
505                         ret = hdr + 1;
506                         if (convert)
507                                 tdb_convert(tdb, (void *)ret, len);
508                 }
509         } else
510                 tdb->direct_access++;
511
512         return ret;
513 }
514
515 bool is_direct(const struct tdb_context *tdb, const void *p)
516 {
517         return (tdb->map_ptr
518                 && (char *)p >= (char *)tdb->map_ptr
519                 && (char *)p < (char *)tdb->map_ptr + tdb->map_size);
520 }
521
522 void tdb_access_release(struct tdb_context *tdb, const void *p)
523 {
524         if (is_direct(tdb, p))
525                 tdb->direct_access--;
526         else
527                 free((struct tdb_access_hdr *)p - 1);
528 }
529
530 int tdb_access_commit(struct tdb_context *tdb, void *p)
531 {
532         int ret = 0;
533
534         if (!tdb->map_ptr
535             || (char *)p < (char *)tdb->map_ptr
536             || (char *)p >= (char *)tdb->map_ptr + tdb->map_size) {
537                 struct tdb_access_hdr *hdr;
538
539                 hdr = (struct tdb_access_hdr *)p - 1;
540                 if (hdr->convert)
541                         ret = tdb_write_convert(tdb, hdr->off, p, hdr->len);
542                 else
543                         ret = tdb_write(tdb, hdr->off, p, hdr->len);
544                 free(hdr);
545         } else
546                 tdb->direct_access--;
547
548         return ret;
549 }
550
551 static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len,
552                         bool write)
553 {
554         if (unlikely(!tdb->map_ptr))
555                 return NULL;
556
557         if (unlikely(tdb_oob(tdb, off + len, true) == -1))
558                 return NULL;
559         return (char *)tdb->map_ptr + off;
560 }
561
562 void add_stat_(struct tdb_context *tdb, uint64_t *stat, size_t val)
563 {
564         if ((uintptr_t)stat < (uintptr_t)tdb->stats + tdb->stats->size)
565                 *stat += val;
566 }
567
568 static const struct tdb_methods io_methods = {
569         tdb_read,
570         tdb_write,
571         tdb_oob,
572         tdb_expand_file,
573         tdb_direct,
574 };
575
576 /*
577   initialise the default methods table
578 */
579 void tdb_io_init(struct tdb_context *tdb)
580 {
581         tdb->methods = &io_methods;
582 }