tdb2: remove looping for read on normal files.
[ccan] / ccan / tdb2 / io.c
1  /* 
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              1999-2005
7    Copyright (C) Paul `Rusty' Russell              2000
8    Copyright (C) Jeremy Allison                    2000-2003
9    Copyright (C) Rusty Russell                     2010
10
11      ** NOTE! The following LGPL license applies to the tdb
12      ** library. This does NOT imply that all of Samba is released
13      ** under the LGPL
14
15    This library is free software; you can redistribute it and/or
16    modify it under the terms of the GNU Lesser General Public
17    License as published by the Free Software Foundation; either
18    version 3 of the License, or (at your option) any later version.
19
20    This library is distributed in the hope that it will be useful,
21    but WITHOUT ANY WARRANTY; without even the implied warranty of
22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23    Lesser General Public License for more details.
24
25    You should have received a copy of the GNU Lesser General Public
26    License along with this library; if not, see <http://www.gnu.org/licenses/>.
27 */
28 #include "private.h"
29 #include <assert.h>
30 #include <ccan/likely/likely.h>
31
32 void tdb_munmap(struct tdb_context *tdb)
33 {
34         if (tdb->flags & TDB_INTERNAL)
35                 return;
36
37         if (tdb->map_ptr) {
38                 munmap(tdb->map_ptr, tdb->map_size);
39                 tdb->map_ptr = NULL;
40         }
41 }
42
43 void tdb_mmap(struct tdb_context *tdb)
44 {
45         if (tdb->flags & TDB_INTERNAL)
46                 return;
47
48         if (tdb->flags & TDB_NOMMAP)
49                 return;
50
51         tdb->map_ptr = mmap(NULL, tdb->map_size, tdb->mmap_flags,
52                             MAP_SHARED, tdb->fd, 0);
53
54         /*
55          * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
56          */
57         if (tdb->map_ptr == MAP_FAILED) {
58                 tdb->map_ptr = NULL;
59                 tdb_logerr(tdb, TDB_SUCCESS, TDB_DEBUG_WARNING,
60                            "tdb_mmap failed for size %lld (%s)",
61                            (long long)tdb->map_size, strerror(errno));
62         }
63 }
64
65 /* check for an out of bounds access - if it is out of bounds then
66    see if the database has been expanded by someone else and expand
67    if necessary 
68    note that "len" is the minimum length needed for the db
69 */
70 static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
71 {
72         struct stat st;
73
74         /* We can't hold pointers during this: we could unmap! */
75         assert(!tdb->direct_access
76                || (tdb->flags & TDB_NOLOCK)
77                || tdb_has_expansion_lock(tdb));
78
79         if (len <= tdb->map_size)
80                 return 0;
81         if (tdb->flags & TDB_INTERNAL) {
82                 if (!probe) {
83                         tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
84                                  "tdb_oob len %lld beyond internal"
85                                  " malloc size %lld",
86                                  (long long)len,
87                                  (long long)tdb->map_size);
88                 }
89                 return -1;
90         }
91
92         if (tdb_lock_expand(tdb, F_RDLCK) != 0)
93                 return -1;
94
95         if (fstat(tdb->fd, &st) != 0) {
96                 tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
97                            "Failed to fstat file: %s", strerror(errno));
98                 tdb_unlock_expand(tdb, F_RDLCK);
99                 return -1;
100         }
101
102         tdb_unlock_expand(tdb, F_RDLCK);
103
104         if (st.st_size < (size_t)len) {
105                 if (!probe) {
106                         tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
107                                    "tdb_oob len %zu beyond eof at %zu",
108                                    (size_t)len, st.st_size);
109                 }
110                 return -1;
111         }
112
113         /* Unmap, update size, remap */
114         tdb_munmap(tdb);
115
116         tdb->map_size = st.st_size;
117         tdb_mmap(tdb);
118         return 0;
119 }
120
121 /* Endian conversion: we only ever deal with 8 byte quantities */
122 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
123 {
124         if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
125                 uint64_t i, *p = (uint64_t *)buf;
126                 for (i = 0; i < size / 8; i++)
127                         p[i] = bswap_64(p[i]);
128         }
129         return buf;
130 }
131
132 /* FIXME: Return the off? */
133 uint64_t tdb_find_nonzero_off(struct tdb_context *tdb,
134                               tdb_off_t base, uint64_t start, uint64_t end)
135 {
136         uint64_t i;
137         const uint64_t *val;
138
139         /* Zero vs non-zero is the same unconverted: minor optimization. */
140         val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t),
141                               (end - start) * sizeof(tdb_off_t), false);
142         if (!val)
143                 return end;
144
145         for (i = 0; i < (end - start); i++) {
146                 if (val[i])
147                         break;
148         }
149         tdb_access_release(tdb, val);
150         return start + i;
151 }
152
153 /* Return first zero offset in num offset array, or num. */
154 uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
155                            uint64_t num)
156 {
157         uint64_t i;
158         const uint64_t *val;
159
160         /* Zero vs non-zero is the same unconverted: minor optimization. */
161         val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false);
162         if (!val)
163                 return num;
164
165         for (i = 0; i < num; i++) {
166                 if (!val[i])
167                         break;
168         }
169         tdb_access_release(tdb, val);
170         return i;
171 }
172
173 int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
174 {
175         char buf[8192] = { 0 };
176         void *p = tdb->methods->direct(tdb, off, len, true);
177
178         assert(!tdb->read_only);
179         if (p) {
180                 memset(p, 0, len);
181                 return 0;
182         }
183         while (len) {
184                 unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
185                 if (tdb->methods->write(tdb, off, buf, todo) == -1)
186                         return -1;
187                 len -= todo;
188                 off += todo;
189         }
190         return 0;
191 }
192
193 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
194 {
195         tdb_off_t ret;
196
197         if (likely(!(tdb->flags & TDB_CONVERT))) {
198                 tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
199                                                     false);
200                 if (p)
201                         return *p;
202         }
203
204         if (tdb_read_convert(tdb, off, &ret, sizeof(ret)) == -1)
205                 return TDB_OFF_ERR;
206         return ret;
207 }
208
209 /* Even on files, we can get partial writes due to signals. */
210 bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off)
211 {
212         while (len) {
213                 ssize_t ret;
214                 ret = pwrite(fd, buf, len, off);
215                 if (ret < 0)
216                         return false;
217                 if (ret == 0) {
218                         errno = ENOSPC;
219                         return false;
220                 }
221                 buf = (char *)buf + ret;
222                 off += ret;
223                 len -= ret;
224         }
225         return true;
226 }
227
228 /* write a lump of data at a specified offset */
229 static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
230                      const void *buf, tdb_len_t len)
231 {
232         if (tdb->read_only) {
233                 tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
234                            "Write to read-only database");
235                 return -1;
236         }
237
238         /* FIXME: Bogus optimization? */
239         if (len == 0) {
240                 return 0;
241         }
242
243         if (tdb->methods->oob(tdb, off + len, 0) != 0)
244                 return -1;
245
246         if (tdb->map_ptr) {
247                 memcpy(off + (char *)tdb->map_ptr, buf, len);
248         } else {
249                 if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
250                         tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
251                                    "tdb_write failed at %zu len=%zu (%s)",
252                                    (size_t)off, (size_t)len, strerror(errno));
253                         return -1;
254                 }
255         }
256         return 0;
257 }
258
259 /* read a lump of data at a specified offset */
260 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
261                     tdb_len_t len)
262 {
263         if (tdb->methods->oob(tdb, off + len, 0) != 0) {
264                 return -1;
265         }
266
267         if (tdb->map_ptr) {
268                 memcpy(buf, off + (char *)tdb->map_ptr, len);
269         } else {
270                 ssize_t r = pread(tdb->fd, buf, len, off);
271                 if (r != len) {
272                         tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
273                                    "tdb_read failed with %zi at %zu "
274                                    "len=%zu (%s) map_size=%zu",
275                                    r, (size_t)off, (size_t)len,
276                                    strerror(errno),
277                                    (size_t)tdb->map_size);
278                         return -1;
279                 }
280         }
281         return 0;
282 }
283
284 int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
285                       const void *rec, size_t len)
286 {
287         int ret;
288         if (unlikely((tdb->flags & TDB_CONVERT))) {
289                 void *conv = malloc(len);
290                 if (!conv) {
291                         tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
292                                    "tdb_write: no memory converting"
293                                    " %zu bytes", len);
294                         return -1;
295                 }
296                 memcpy(conv, rec, len);
297                 ret = tdb->methods->write(tdb, off,
298                                           tdb_convert(tdb, conv, len), len);
299                 free(conv);
300         } else
301                 ret = tdb->methods->write(tdb, off, rec, len);
302
303         return ret;
304 }
305
306 int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
307                       void *rec, size_t len)
308 {
309         int ret = tdb->methods->read(tdb, off, rec, len);
310         tdb_convert(tdb, rec, len);
311         return ret;
312 }
313
314 int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
315 {
316         if (tdb->read_only) {
317                 tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
318                            "Write to read-only database");
319                 return -1;
320         }
321
322         if (likely(!(tdb->flags & TDB_CONVERT))) {
323                 tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
324                                                     true);
325                 if (p) {
326                         *p = val;
327                         return 0;
328                 }
329         }
330         return tdb_write_convert(tdb, off, &val, sizeof(val));
331 }
332
333 static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
334                              tdb_len_t len, unsigned int prefix)
335 {
336         void *buf;
337
338         /* some systems don't like zero length malloc */
339         buf = malloc(prefix + len ? prefix + len : 1);
340         if (!buf) {
341                 tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_ERROR,
342                            "tdb_alloc_read malloc failed len=%zu",
343                            (size_t)(prefix + len));
344         } else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix,
345                                                len) == -1)) {
346                 free(buf);
347                 buf = NULL;
348         }
349         return buf;
350 }
351
352 /* read a lump of data, allocating the space for it */
353 void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
354 {
355         return _tdb_alloc_read(tdb, offset, len, 0);
356 }
357
358 static int fill(struct tdb_context *tdb,
359                 const void *buf, size_t size,
360                 tdb_off_t off, tdb_len_t len)
361 {
362         while (len) {
363                 size_t n = len > size ? size : len;
364
365                 if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
366                         tdb_logerr(tdb, TDB_ERR_IO, TDB_DEBUG_FATAL,
367                                  "fill write failed: giving up!");
368                         return -1;
369                 }
370                 len -= n;
371                 off += n;
372         }
373         return 0;
374 }
375
376 /* expand a file.  we prefer to use ftruncate, as that is what posix
377   says to use for mmap expansion */
378 static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
379 {
380         char buf[8192];
381
382         if (tdb->read_only) {
383                 tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
384                            "Expand on read-only database");
385                 return -1;
386         }
387
388         if (tdb->flags & TDB_INTERNAL) {
389                 char *new = realloc(tdb->map_ptr, tdb->map_size + addition);
390                 if (!new) {
391                         tdb_logerr(tdb, TDB_ERR_OOM, TDB_DEBUG_FATAL,
392                                    "No memory to expand database");
393                         return -1;
394                 }
395                 tdb->map_ptr = new;
396                 tdb->map_size += addition;
397         } else {
398                 /* Unmap before trying to write; old TDB claimed OpenBSD had
399                  * problem with this otherwise. */
400                 tdb_munmap(tdb);
401
402                 /* If this fails, we try to fill anyway. */
403                 if (ftruncate(tdb->fd, tdb->map_size + addition))
404                         ;
405
406                 /* now fill the file with something. This ensures that the
407                    file isn't sparse, which would be very bad if we ran out of
408                    disk. This must be done with write, not via mmap */
409                 memset(buf, 0x43, sizeof(buf));
410                 if (0 || fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1)
411                         return -1;
412                 tdb->map_size += addition;
413                 tdb_mmap(tdb);
414         }
415         return 0;
416 }
417
418 const void *tdb_access_read(struct tdb_context *tdb,
419                             tdb_off_t off, tdb_len_t len, bool convert)
420 {
421         const void *ret = NULL; 
422
423         if (likely(!(tdb->flags & TDB_CONVERT)))
424                 ret = tdb->methods->direct(tdb, off, len, false);
425
426         if (!ret) {
427                 struct tdb_access_hdr *hdr;
428                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
429                 if (hdr) {
430                         hdr->next = tdb->access;
431                         tdb->access = hdr;
432                         ret = hdr + 1;
433                         if (convert)
434                                 tdb_convert(tdb, (void *)ret, len);
435                 }
436         } else
437                 tdb->direct_access++;
438
439         return ret;
440 }
441
442 void *tdb_access_write(struct tdb_context *tdb,
443                        tdb_off_t off, tdb_len_t len, bool convert)
444 {
445         void *ret = NULL;
446
447         if (tdb->read_only) {
448                 tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_DEBUG_WARNING,
449                            "Write to read-only database");
450                 return NULL;
451         }
452
453         if (likely(!(tdb->flags & TDB_CONVERT)))
454                 ret = tdb->methods->direct(tdb, off, len, true);
455
456         if (!ret) {
457                 struct tdb_access_hdr *hdr;
458                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
459                 if (hdr) {
460                         hdr->next = tdb->access;
461                         tdb->access = hdr;
462                         hdr->off = off;
463                         hdr->len = len;
464                         hdr->convert = convert;
465                         ret = hdr + 1;
466                         if (convert)
467                                 tdb_convert(tdb, (void *)ret, len);
468                 }
469         } else
470                 tdb->direct_access++;
471
472         return ret;
473 }
474
475 static struct tdb_access_hdr **find_hdr(struct tdb_context *tdb, const void *p)
476 {
477         struct tdb_access_hdr **hp;
478
479         for (hp = &tdb->access; *hp; hp = &(*hp)->next) {
480                 if (*hp + 1 == p)
481                         return hp;
482         }
483         return NULL;
484 }
485
486 void tdb_access_release(struct tdb_context *tdb, const void *p)
487 {
488         struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
489
490         if (hp) {
491                 hdr = *hp;
492                 *hp = hdr->next;
493                 free(hdr);
494         } else
495                 tdb->direct_access--;
496 }
497
498 int tdb_access_commit(struct tdb_context *tdb, void *p)
499 {
500         struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
501         int ret = 0;
502
503         if (hp) {
504                 hdr = *hp;
505                 if (hdr->convert)
506                         ret = tdb_write_convert(tdb, hdr->off, p, hdr->len);
507                 else
508                         ret = tdb_write(tdb, hdr->off, p, hdr->len);
509                 *hp = hdr->next;
510                 free(hdr);
511         } else
512                 tdb->direct_access--;
513
514         return ret;
515 }
516
517 static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len,
518                         bool write_mode)
519 {
520         if (unlikely(!tdb->map_ptr))
521                 return NULL;
522
523         if (unlikely(tdb_oob(tdb, off + len, true) == -1))
524                 return NULL;
525         return (char *)tdb->map_ptr + off;
526 }
527
528 void add_stat_(struct tdb_context *tdb, uint64_t *s, size_t val)
529 {
530         if ((uintptr_t)s < (uintptr_t)tdb->stats + tdb->stats->size)
531                 *s += val;
532 }
533
534 static const struct tdb_methods io_methods = {
535         tdb_read,
536         tdb_write,
537         tdb_oob,
538         tdb_expand_file,
539         tdb_direct,
540 };
541
542 /*
543   initialise the default methods table
544 */
545 void tdb_io_init(struct tdb_context *tdb)
546 {
547         tdb->methods = &io_methods;
548 }