56ccfcba67b7ab2dac531f5d3cf9a08d22644ca4
[ccan] / ccan / tdb2 / io.c
1  /* 
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              1999-2005
7    Copyright (C) Paul `Rusty' Russell              2000
8    Copyright (C) Jeremy Allison                    2000-2003
9    Copyright (C) Rusty Russell                     2010
10
11      ** NOTE! The following LGPL license applies to the tdb
12      ** library. This does NOT imply that all of Samba is released
13      ** under the LGPL
14
15    This library is free software; you can redistribute it and/or
16    modify it under the terms of the GNU Lesser General Public
17    License as published by the Free Software Foundation; either
18    version 3 of the License, or (at your option) any later version.
19
20    This library is distributed in the hope that it will be useful,
21    but WITHOUT ANY WARRANTY; without even the implied warranty of
22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23    Lesser General Public License for more details.
24
25    You should have received a copy of the GNU Lesser General Public
26    License along with this library; if not, see <http://www.gnu.org/licenses/>.
27 */
28 #include "private.h"
29 #include <assert.h>
30 #include <ccan/likely/likely.h>
31
32 void tdb_munmap(struct tdb_context *tdb)
33 {
34         if (tdb->flags & TDB_INTERNAL)
35                 return;
36
37         if (tdb->map_ptr) {
38                 munmap(tdb->map_ptr, tdb->map_size);
39                 tdb->map_ptr = NULL;
40         }
41 }
42
43 void tdb_mmap(struct tdb_context *tdb)
44 {
45         if (tdb->flags & TDB_INTERNAL)
46                 return;
47
48         if (tdb->flags & TDB_NOMMAP)
49                 return;
50
51         tdb->map_ptr = mmap(NULL, tdb->map_size, tdb->mmap_flags,
52                             MAP_SHARED, tdb->fd, 0);
53
54         /*
55          * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
56          */
57         if (tdb->map_ptr == MAP_FAILED) {
58                 tdb->map_ptr = NULL;
59                 tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
60                          "tdb_mmap failed for size %lld (%s)\n", 
61                          (long long)tdb->map_size, strerror(errno));
62         }
63 }
64
65 /* check for an out of bounds access - if it is out of bounds then
66    see if the database has been expanded by someone else and expand
67    if necessary 
68    note that "len" is the minimum length needed for the db
69 */
70 static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
71 {
72         struct stat st;
73         int ret;
74
75         /* We can't hold pointers during this: we could unmap! */
76         assert(!tdb->direct_access
77                || (tdb->flags & TDB_NOLOCK)
78                || tdb_has_expansion_lock(tdb));
79
80         if (len <= tdb->map_size)
81                 return 0;
82         if (tdb->flags & TDB_INTERNAL) {
83                 if (!probe) {
84                         /* Ensure ecode is set for log fn. */
85                         tdb->ecode = TDB_ERR_IO;
86                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
87                                  "tdb_oob len %lld beyond internal"
88                                  " malloc size %lld\n",
89                                  (long long)len,
90                                  (long long)tdb->map_size);
91                 }
92                 return -1;
93         }
94
95         if (tdb_lock_expand(tdb, F_RDLCK) != 0)
96                 return -1;
97
98         ret = fstat(tdb->fd, &st);
99
100         tdb_unlock_expand(tdb, F_RDLCK);
101
102         if (ret == -1) {
103                 tdb->ecode = TDB_ERR_IO;
104                 return -1;
105         }
106
107         if (st.st_size < (size_t)len) {
108                 if (!probe) {
109                         /* Ensure ecode is set for log fn. */
110                         tdb->ecode = TDB_ERR_IO;
111                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
112                                  "tdb_oob len %lld beyond eof at %lld\n",
113                                  (long long)len, (long long)st.st_size);
114                 }
115                 return -1;
116         }
117
118         /* Unmap, update size, remap */
119         tdb_munmap(tdb);
120
121         tdb->map_size = st.st_size;
122         tdb_mmap(tdb);
123         return 0;
124 }
125
126 /* Endian conversion: we only ever deal with 8 byte quantities */
127 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
128 {
129         if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
130                 uint64_t i, *p = (uint64_t *)buf;
131                 for (i = 0; i < size / 8; i++)
132                         p[i] = bswap_64(p[i]);
133         }
134         return buf;
135 }
136
137 /* FIXME: Return the off? */
138 uint64_t tdb_find_nonzero_off(struct tdb_context *tdb,
139                               tdb_off_t base, uint64_t start, uint64_t end)
140 {
141         uint64_t i;
142         const uint64_t *val;
143
144         /* Zero vs non-zero is the same unconverted: minor optimization. */
145         val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t),
146                               (end - start) * sizeof(tdb_off_t), false);
147         if (!val)
148                 return end;
149
150         for (i = 0; i < (end - start); i++) {
151                 if (val[i])
152                         break;
153         }
154         tdb_access_release(tdb, val);
155         return start + i;
156 }
157
158 /* Return first zero offset in num offset array, or num. */
159 uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
160                            uint64_t num)
161 {
162         uint64_t i;
163         const uint64_t *val;
164
165         /* Zero vs non-zero is the same unconverted: minor optimization. */
166         val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false);
167         if (!val)
168                 return num;
169
170         for (i = 0; i < num; i++) {
171                 if (!val[i])
172                         break;
173         }
174         tdb_access_release(tdb, val);
175         return i;
176 }
177
178 int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
179 {
180         char buf[8192] = { 0 };
181         void *p = tdb->methods->direct(tdb, off, len);
182
183         if (tdb->read_only) {
184                 tdb->ecode = TDB_ERR_RDONLY;
185                 return -1;
186         }
187
188         if (p) {
189                 memset(p, 0, len);
190                 return 0;
191         }
192         while (len) {
193                 unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
194                 if (tdb->methods->write(tdb, off, buf, todo) == -1)
195                         return -1;
196                 len -= todo;
197                 off += todo;
198         }
199         return 0;
200 }
201
202 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
203 {
204         tdb_off_t ret;
205
206         if (likely(!(tdb->flags & TDB_CONVERT))) {
207                 tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p));
208                 if (p)
209                         return *p;
210         }
211
212         if (tdb_read_convert(tdb, off, &ret, sizeof(ret)) == -1)
213                 return TDB_OFF_ERR;
214         return ret;
215 }
216
217 /* Even on files, we can get partial writes due to signals. */
218 bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off)
219 {
220         while (len) {
221                 ssize_t ret;
222                 ret = pwrite(fd, buf, len, off);
223                 if (ret < 0)
224                         return false;
225                 if (ret == 0) {
226                         errno = ENOSPC;
227                         return false;
228                 }
229                 buf = (char *)buf + ret;
230                 off += ret;
231                 len -= ret;
232         }
233         return true;
234 }
235
236 /* Even on files, we can get partial reads due to signals. */
237 bool tdb_pread_all(int fd, void *buf, size_t len, tdb_off_t off)
238 {
239         while (len) {
240                 ssize_t ret;
241                 ret = pread(fd, buf, len, off);
242                 if (ret < 0)
243                         return false;
244                 if (ret == 0) {
245                         /* ETOOSHORT? */
246                         errno = EWOULDBLOCK;
247                         return false;
248                 }
249                 buf = (char *)buf + ret;
250                 off += ret;
251                 len -= ret;
252         }
253         return true;
254 }
255
/* read() exactly "len" bytes from the current position of "fd" into
 * "buf", looping over short reads.
 *
 * Returns true once the buffer is full.  Returns false if read() fails
 * (errno as set by read) or hits end of file early (errno set to
 * EWOULDBLOCK, for want of an ETOOSHORT). */
bool tdb_read_all(int fd, void *buf, size_t len)
{
        char *dst = buf;
        size_t remaining = len;

        while (remaining) {
                ssize_t n = read(fd, dst, remaining);

                if (n < 0)
                        return false;
                if (n == 0) {
                        /* ETOOSHORT? */
                        errno = EWOULDBLOCK;
                        return false;
                }
                dst += n;
                remaining -= n;
        }
        return true;
}
273
274 /* write a lump of data at a specified offset */
275 static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
276                      const void *buf, tdb_len_t len)
277 {
278         if (len == 0) {
279                 return 0;
280         }
281
282         if (tdb->read_only) {
283                 tdb->ecode = TDB_ERR_RDONLY;
284                 return -1;
285         }
286
287         if (tdb->methods->oob(tdb, off + len, 0) != 0)
288                 return -1;
289
290         if (tdb->map_ptr) {
291                 memcpy(off + (char *)tdb->map_ptr, buf, len);
292         } else {
293                 if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
294                         tdb->ecode = TDB_ERR_IO;
295                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
296                                  "tdb_write failed at %llu len=%llu (%s)\n",
297                                  (long long)off, (long long)len,
298                                  strerror(errno));
299                         return -1;
300                 }
301         }
302         return 0;
303 }
304
305 /* read a lump of data at a specified offset */
306 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
307                     tdb_len_t len)
308 {
309         if (tdb->methods->oob(tdb, off + len, 0) != 0) {
310                 return -1;
311         }
312
313         if (tdb->map_ptr) {
314                 memcpy(buf, off + (char *)tdb->map_ptr, len);
315         } else {
316                 if (!tdb_pread_all(tdb->fd, buf, len, off)) {
317                         /* Ensure ecode is set for log fn. */
318                         tdb->ecode = TDB_ERR_IO;
319                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
320                                  "tdb_read failed at %lld "
321                                  "len=%lld (%s) map_size=%lld\n",
322                                  (long long)off, (long long)len,
323                                  strerror(errno),
324                                  (long long)tdb->map_size);
325                         return -1;
326                 }
327         }
328         return 0;
329 }
330
331 int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
332                       const void *rec, size_t len)
333 {
334         int ret;
335         if (unlikely((tdb->flags & TDB_CONVERT))) {
336                 void *conv = malloc(len);
337                 if (!conv) {
338                         tdb->ecode = TDB_ERR_OOM;
339                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
340                                  "tdb_write: no memory converting %zu bytes\n",
341                                  len);
342                         return -1;
343                 }
344                 memcpy(conv, rec, len);
345                 ret = tdb->methods->write(tdb, off,
346                                           tdb_convert(tdb, conv, len), len);
347                 free(conv);
348         } else
349                 ret = tdb->methods->write(tdb, off, rec, len);
350
351         return ret;
352 }
353
354 int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
355                       void *rec, size_t len)
356 {
357         int ret = tdb->methods->read(tdb, off, rec, len);
358         tdb_convert(tdb, rec, len);
359         return ret;
360 }
361
362 int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
363 {
364         if (tdb->read_only) {
365                 tdb->ecode = TDB_ERR_RDONLY;
366                 return -1;
367         }
368
369         if (likely(!(tdb->flags & TDB_CONVERT))) {
370                 tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p));
371                 if (p) {
372                         *p = val;
373                         return 0;
374                 }
375         }
376         return tdb_write_convert(tdb, off, &val, sizeof(val));
377 }
378
379 static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
380                              tdb_len_t len, unsigned int prefix)
381 {
382         void *buf;
383
384         /* some systems don't like zero length malloc */
385         buf = malloc(prefix + len ? prefix + len : 1);
386         if (unlikely(!buf)) {
387                 tdb->ecode = TDB_ERR_OOM;
388                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
389                          "tdb_alloc_read malloc failed len=%lld\n",
390                          (long long)prefix + len);
391         } else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix, len))) {
392                 free(buf);
393                 buf = NULL;
394         }
395         return buf;
396 }
397
398 /* read a lump of data, allocating the space for it */
399 void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
400 {
401         return _tdb_alloc_read(tdb, offset, len, 0);
402 }
403
404 static int fill(struct tdb_context *tdb,
405                 const void *buf, size_t size,
406                 tdb_off_t off, tdb_len_t len)
407 {
408         while (len) {
409                 size_t n = len > size ? size : len;
410
411                 if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
412                         tdb->ecode = TDB_ERR_IO;
413                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
414                                  "fill write failed: giving up!\n");
415                         return -1;
416                 }
417                 len -= n;
418                 off += n;
419         }
420         return 0;
421 }
422
423 /* expand a file.  we prefer to use ftruncate, as that is what posix
424   says to use for mmap expansion */
425 static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
426 {
427         char buf[8192];
428
429         if (tdb->read_only) {
430                 tdb->ecode = TDB_ERR_RDONLY;
431                 return -1;
432         }
433
434         if (tdb->flags & TDB_INTERNAL) {
435                 char *new = realloc(tdb->map_ptr, tdb->map_size + addition);
436                 if (!new) {
437                         tdb->ecode = TDB_ERR_OOM;
438                         return -1;
439                 }
440                 tdb->map_ptr = new;
441                 tdb->map_size += addition;
442         } else {
443                 /* Unmap before trying to write; old TDB claimed OpenBSD had
444                  * problem with this otherwise. */
445                 tdb_munmap(tdb);
446
447                 /* If this fails, we try to fill anyway. */
448                 if (ftruncate(tdb->fd, tdb->map_size + addition))
449                         ;
450
451                 /* now fill the file with something. This ensures that the
452                    file isn't sparse, which would be very bad if we ran out of
453                    disk. This must be done with write, not via mmap */
454                 memset(buf, 0x43, sizeof(buf));
455                 if (0 || fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1)
456                         return -1;
457                 tdb->map_size += addition;
458                 tdb_mmap(tdb);
459         }
460         return 0;
461 }
462
/* Header placed immediately in front of every heap copy handed out by
 * tdb_access_read() and tdb_access_write().
 *
 * This is only needed for tdb_access_commit, but used everywhere to
 * simplify: tdb_access_read() reserves the space without filling it in. */
struct tdb_access_hdr {
        /* Database offset the copy was read from (and is written back to). */
        tdb_off_t off;
        /* Number of data bytes following the header. */
        tdb_len_t len;
        /* Whether the data must be endian-converted when written back. */
        bool convert;
};
469
470 const void *tdb_access_read(struct tdb_context *tdb,
471                             tdb_off_t off, tdb_len_t len, bool convert)
472 {
473         const void *ret = NULL; 
474
475         if (likely(!(tdb->flags & TDB_CONVERT)))
476                 ret = tdb->methods->direct(tdb, off, len);
477
478         if (!ret) {
479                 struct tdb_access_hdr *hdr;
480                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
481                 if (hdr) {
482                         ret = hdr + 1;
483                         if (convert)
484                                 tdb_convert(tdb, (void *)ret, len);
485                 }
486         } else
487                 tdb->direct_access++;
488
489         return ret;
490 }
491
492 void *tdb_access_write(struct tdb_context *tdb,
493                        tdb_off_t off, tdb_len_t len, bool convert)
494 {
495         void *ret = NULL;
496
497         if (tdb->read_only) {
498                 tdb->ecode = TDB_ERR_RDONLY;
499                 return NULL;
500         }
501
502         if (likely(!(tdb->flags & TDB_CONVERT)))
503                 ret = tdb->methods->direct(tdb, off, len);
504
505         if (!ret) {
506                 struct tdb_access_hdr *hdr;
507                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
508                 if (hdr) {
509                         hdr->off = off;
510                         hdr->len = len;
511                         hdr->convert = convert;
512                         ret = hdr + 1;
513                         if (convert)
514                                 tdb_convert(tdb, (void *)ret, len);
515                 }
516         } else
517                 tdb->direct_access++;
518
519         return ret;
520 }
521
522 bool is_direct(const struct tdb_context *tdb, const void *p)
523 {
524         return (tdb->map_ptr
525                 && (char *)p >= (char *)tdb->map_ptr
526                 && (char *)p < (char *)tdb->map_ptr + tdb->map_size);
527 }
528
529 void tdb_access_release(struct tdb_context *tdb, const void *p)
530 {
531         if (is_direct(tdb, p))
532                 tdb->direct_access--;
533         else
534                 free((struct tdb_access_hdr *)p - 1);
535 }
536
537 int tdb_access_commit(struct tdb_context *tdb, void *p)
538 {
539         int ret = 0;
540
541         if (!tdb->map_ptr
542             || (char *)p < (char *)tdb->map_ptr
543             || (char *)p >= (char *)tdb->map_ptr + tdb->map_size) {
544                 struct tdb_access_hdr *hdr;
545
546                 hdr = (struct tdb_access_hdr *)p - 1;
547                 if (hdr->convert)
548                         ret = tdb_write_convert(tdb, hdr->off, p, hdr->len);
549                 else
550                         ret = tdb_write(tdb, hdr->off, p, hdr->len);
551                 free(hdr);
552         } else
553                 tdb->direct_access--;
554
555         return ret;
556 }
557
558 static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
559 {
560         if (unlikely(!tdb->map_ptr))
561                 return NULL;
562
563         if (unlikely(tdb_oob(tdb, off + len, true) == -1))
564                 return NULL;
565         return (char *)tdb->map_ptr + off;
566 }
567
568 void add_stat_(struct tdb_context *tdb, uint64_t *stat, size_t val)
569 {
570         if ((uintptr_t)stat < (uintptr_t)tdb->stats + tdb->stats->size)
571                 *stat += val;
572 }
573
/* Default I/O methods table, installed by tdb_io_init().  The initializer
 * is positional, so the order must match struct tdb_methods. */
static const struct tdb_methods io_methods = {
        tdb_read,               /* read a lump of data */
        tdb_write,              /* write a lump of data */
        tdb_oob,                /* out-of-bounds check / remap */
        tdb_expand_file,        /* grow the database */
        tdb_direct,             /* direct pointer into the mapping */
};
581
/*
  initialise the default methods table: point the context at io_methods,
  the plain file / mmap implementations defined in this file.
*/
void tdb_io_init(struct tdb_context *tdb)
{
        tdb->methods = &io_methods;
}