check_type: fix incorrect documentation.
[ccan] / ccan / tdb2 / io.c
1  /*
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              1999-2005
7    Copyright (C) Paul `Rusty' Russell              2000
8    Copyright (C) Jeremy Allison                    2000-2003
9    Copyright (C) Rusty Russell                     2010
10
11      ** NOTE! The following LGPL license applies to the tdb
12      ** library. This does NOT imply that all of Samba is released
13      ** under the LGPL
14
15    This library is free software; you can redistribute it and/or
16    modify it under the terms of the GNU Lesser General Public
17    License as published by the Free Software Foundation; either
18    version 3 of the License, or (at your option) any later version.
19
20    This library is distributed in the hope that it will be useful,
21    but WITHOUT ANY WARRANTY; without even the implied warranty of
22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23    Lesser General Public License for more details.
24
25    You should have received a copy of the GNU Lesser General Public
26    License along with this library; if not, see <http://www.gnu.org/licenses/>.
27 */
28 #include "private.h"
29 #include <assert.h>
30 #include <ccan/likely/likely.h>
31
32 void tdb_munmap(struct tdb_file *file)
33 {
34         if (file->fd == -1)
35                 return;
36
37         if (file->map_ptr) {
38                 munmap(file->map_ptr, file->map_size);
39                 file->map_ptr = NULL;
40         }
41 }
42
43 void tdb_mmap(struct tdb_context *tdb)
44 {
45         int mmap_flags;
46
47         if (tdb->flags & TDB_INTERNAL)
48                 return;
49
50         if (tdb->flags & TDB_NOMMAP)
51                 return;
52
53         if ((tdb->open_flags & O_ACCMODE) == O_RDONLY)
54                 mmap_flags = PROT_READ;
55         else
56                 mmap_flags = PROT_READ | PROT_WRITE;
57
58         /* size_t can be smaller than off_t. */
59         if ((size_t)tdb->file->map_size == tdb->file->map_size) {
60                 tdb->file->map_ptr = mmap(NULL, tdb->file->map_size,
61                                           mmap_flags,
62                                           MAP_SHARED, tdb->file->fd, 0);
63         } else
64                 tdb->file->map_ptr = MAP_FAILED;
65
66         /*
67          * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
68          */
69         if (tdb->file->map_ptr == MAP_FAILED) {
70                 tdb->file->map_ptr = NULL;
71                 tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
72                            "tdb_mmap failed for size %lld (%s)",
73                            (long long)tdb->file->map_size, strerror(errno));
74         }
75 }
76
77 /* check for an out of bounds access - if it is out of bounds then
78    see if the database has been expanded by someone else and expand
79    if necessary
80    note that "len" is the minimum length needed for the db.
81
82    If probe is true, len being too large isn't a failure.
83 */
84 static enum TDB_ERROR tdb_oob(struct tdb_context *tdb, tdb_off_t len,
85                               bool probe)
86 {
87         struct stat st;
88         enum TDB_ERROR ecode;
89
90         /* We can't hold pointers during this: we could unmap! */
91         assert(!tdb->tdb2.direct_access
92                || (tdb->flags & TDB_NOLOCK)
93                || tdb_has_expansion_lock(tdb));
94
95         if (len <= tdb->file->map_size)
96                 return TDB_SUCCESS;
97         if (tdb->flags & TDB_INTERNAL) {
98                 if (probe)
99                         return TDB_SUCCESS;
100
101                 tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
102                            "tdb_oob len %lld beyond internal"
103                            " malloc size %lld",
104                            (long long)len,
105                            (long long)tdb->file->map_size);
106                 return TDB_ERR_IO;
107         }
108
109         ecode = tdb_lock_expand(tdb, F_RDLCK);
110         if (ecode != TDB_SUCCESS) {
111                 return ecode;
112         }
113
114         if (fstat(tdb->file->fd, &st) != 0) {
115                 tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
116                            "Failed to fstat file: %s", strerror(errno));
117                 tdb_unlock_expand(tdb, F_RDLCK);
118                 return TDB_ERR_IO;
119         }
120
121         tdb_unlock_expand(tdb, F_RDLCK);
122
123         if (st.st_size < (size_t)len) {
124                 if (probe)
125                         return TDB_SUCCESS;
126
127                 tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
128                            "tdb_oob len %zu beyond eof at %zu",
129                            (size_t)len, st.st_size);
130                 return TDB_ERR_IO;
131         }
132
133         /* Unmap, update size, remap */
134         tdb_munmap(tdb->file);
135
136         tdb->file->map_size = st.st_size;
137         tdb_mmap(tdb);
138         return TDB_SUCCESS;
139 }
140
141 /* Endian conversion: we only ever deal with 8 byte quantities */
142 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
143 {
144         assert(size % 8 == 0);
145         if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
146                 uint64_t i, *p = (uint64_t *)buf;
147                 for (i = 0; i < size / 8; i++)
148                         p[i] = bswap_64(p[i]);
149         }
150         return buf;
151 }
152
153 /* Return first non-zero offset in offset array, or end, or -ve error. */
154 /* FIXME: Return the off? */
155 uint64_t tdb_find_nonzero_off(struct tdb_context *tdb,
156                               tdb_off_t base, uint64_t start, uint64_t end)
157 {
158         uint64_t i;
159         const uint64_t *val;
160
161         /* Zero vs non-zero is the same unconverted: minor optimization. */
162         val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t),
163                               (end - start) * sizeof(tdb_off_t), false);
164         if (TDB_PTR_IS_ERR(val)) {
165                 return TDB_ERR_TO_OFF(TDB_PTR_ERR(val));
166         }
167
168         for (i = 0; i < (end - start); i++) {
169                 if (val[i])
170                         break;
171         }
172         tdb_access_release(tdb, val);
173         return start + i;
174 }
175
176 /* Return first zero offset in num offset array, or num, or -ve error. */
177 uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
178                            uint64_t num)
179 {
180         uint64_t i;
181         const uint64_t *val;
182
183         /* Zero vs non-zero is the same unconverted: minor optimization. */
184         val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false);
185         if (TDB_PTR_IS_ERR(val)) {
186                 return TDB_ERR_TO_OFF(TDB_PTR_ERR(val));
187         }
188
189         for (i = 0; i < num; i++) {
190                 if (!val[i])
191                         break;
192         }
193         tdb_access_release(tdb, val);
194         return i;
195 }
196
197 enum TDB_ERROR zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
198 {
199         char buf[8192] = { 0 };
200         void *p = tdb->tdb2.io->direct(tdb, off, len, true);
201         enum TDB_ERROR ecode = TDB_SUCCESS;
202
203         assert(!(tdb->flags & TDB_RDONLY));
204         if (TDB_PTR_IS_ERR(p)) {
205                 return TDB_PTR_ERR(p);
206         }
207         if (p) {
208                 memset(p, 0, len);
209                 return ecode;
210         }
211         while (len) {
212                 unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
213                 ecode = tdb->tdb2.io->twrite(tdb, off, buf, todo);
214                 if (ecode != TDB_SUCCESS) {
215                         break;
216                 }
217                 len -= todo;
218                 off += todo;
219         }
220         return ecode;
221 }
222
223 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
224 {
225         tdb_off_t ret;
226         enum TDB_ERROR ecode;
227
228         if (likely(!(tdb->flags & TDB_CONVERT))) {
229                 tdb_off_t *p = tdb->tdb2.io->direct(tdb, off, sizeof(*p),
230                                                     false);
231                 if (TDB_PTR_IS_ERR(p)) {
232                         return TDB_ERR_TO_OFF(TDB_PTR_ERR(p));
233                 }
234                 if (p)
235                         return *p;
236         }
237
238         ecode = tdb_read_convert(tdb, off, &ret, sizeof(ret));
239         if (ecode != TDB_SUCCESS) {
240                 return TDB_ERR_TO_OFF(ecode);
241         }
242         return ret;
243 }
244
245 /* write a lump of data at a specified offset */
246 static enum TDB_ERROR tdb_write(struct tdb_context *tdb, tdb_off_t off,
247                                 const void *buf, tdb_len_t len)
248 {
249         enum TDB_ERROR ecode;
250
251         if (tdb->flags & TDB_RDONLY) {
252                 return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
253                                   "Write to read-only database");
254         }
255
256         ecode = tdb->tdb2.io->oob(tdb, off + len, false);
257         if (ecode != TDB_SUCCESS) {
258                 return ecode;
259         }
260
261         if (tdb->file->map_ptr) {
262                 memcpy(off + (char *)tdb->file->map_ptr, buf, len);
263         } else {
264                 ssize_t ret;
265                 ret = pwrite(tdb->file->fd, buf, len, off);
266                 if (ret != len) {
267                         /* This shouldn't happen: we avoid sparse files. */
268                         if (ret >= 0)
269                                 errno = ENOSPC;
270
271                         return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
272                                           "tdb_write: %zi at %zu len=%zu (%s)",
273                                           ret, (size_t)off, (size_t)len,
274                                           strerror(errno));
275                 }
276         }
277         return TDB_SUCCESS;
278 }
279
280 /* read a lump of data at a specified offset */
281 static enum TDB_ERROR tdb_read(struct tdb_context *tdb, tdb_off_t off,
282                                void *buf, tdb_len_t len)
283 {
284         enum TDB_ERROR ecode;
285
286         ecode = tdb->tdb2.io->oob(tdb, off + len, false);
287         if (ecode != TDB_SUCCESS) {
288                 return ecode;
289         }
290
291         if (tdb->file->map_ptr) {
292                 memcpy(buf, off + (char *)tdb->file->map_ptr, len);
293         } else {
294                 ssize_t r = pread(tdb->file->fd, buf, len, off);
295                 if (r != len) {
296                         return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
297                                           "tdb_read failed with %zi at %zu "
298                                           "len=%zu (%s) map_size=%zu",
299                                           r, (size_t)off, (size_t)len,
300                                           strerror(errno),
301                                           (size_t)tdb->file->map_size);
302                 }
303         }
304         return TDB_SUCCESS;
305 }
306
307 enum TDB_ERROR tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
308                                  const void *rec, size_t len)
309 {
310         enum TDB_ERROR ecode;
311
312         if (unlikely((tdb->flags & TDB_CONVERT))) {
313                 void *conv = malloc(len);
314                 if (!conv) {
315                         return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
316                                           "tdb_write: no memory converting"
317                                           " %zu bytes", len);
318                 }
319                 memcpy(conv, rec, len);
320                 ecode = tdb->tdb2.io->twrite(tdb, off,
321                                            tdb_convert(tdb, conv, len), len);
322                 free(conv);
323         } else {
324                 ecode = tdb->tdb2.io->twrite(tdb, off, rec, len);
325         }
326         return ecode;
327 }
328
329 enum TDB_ERROR tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
330                                 void *rec, size_t len)
331 {
332         enum TDB_ERROR ecode = tdb->tdb2.io->tread(tdb, off, rec, len);
333         tdb_convert(tdb, rec, len);
334         return ecode;
335 }
336
337 enum TDB_ERROR tdb_write_off(struct tdb_context *tdb,
338                              tdb_off_t off, tdb_off_t val)
339 {
340         if (tdb->flags & TDB_RDONLY) {
341                 return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
342                                   "Write to read-only database");
343         }
344
345         if (likely(!(tdb->flags & TDB_CONVERT))) {
346                 tdb_off_t *p = tdb->tdb2.io->direct(tdb, off, sizeof(*p),
347                                                     true);
348                 if (TDB_PTR_IS_ERR(p)) {
349                         return TDB_PTR_ERR(p);
350                 }
351                 if (p) {
352                         *p = val;
353                         return TDB_SUCCESS;
354                 }
355         }
356         return tdb_write_convert(tdb, off, &val, sizeof(val));
357 }
358
359 static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
360                              tdb_len_t len, unsigned int prefix)
361 {
362         unsigned char *buf;
363         enum TDB_ERROR ecode;
364
365         /* some systems don't like zero length malloc */
366         buf = malloc(prefix + len ? prefix + len : 1);
367         if (!buf) {
368                 tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_USE_ERROR,
369                            "tdb_alloc_read malloc failed len=%zu",
370                            (size_t)(prefix + len));
371                 return TDB_ERR_PTR(TDB_ERR_OOM);
372         } else {
373                 ecode = tdb->tdb2.io->tread(tdb, offset, buf+prefix, len);
374                 if (unlikely(ecode != TDB_SUCCESS)) {
375                         free(buf);
376                         return TDB_ERR_PTR(ecode);
377                 }
378         }
379         return buf;
380 }
381
382 /* read a lump of data, allocating the space for it */
383 void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
384 {
385         return _tdb_alloc_read(tdb, offset, len, 0);
386 }
387
388 static enum TDB_ERROR fill(struct tdb_context *tdb,
389                            const void *buf, size_t size,
390                            tdb_off_t off, tdb_len_t len)
391 {
392         while (len) {
393                 size_t n = len > size ? size : len;
394                 ssize_t ret = pwrite(tdb->file->fd, buf, n, off);
395                 if (ret != n) {
396                         if (ret >= 0)
397                                 errno = ENOSPC;
398
399                         return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
400                                           "fill failed:"
401                                           " %zi at %zu len=%zu (%s)",
402                                           ret, (size_t)off, (size_t)len,
403                                           strerror(errno));
404                 }
405                 len -= n;
406                 off += n;
407         }
408         return TDB_SUCCESS;
409 }
410
411 /* expand a file.  we prefer to use ftruncate, as that is what posix
412   says to use for mmap expansion */
413 static enum TDB_ERROR tdb_expand_file(struct tdb_context *tdb,
414                                       tdb_len_t addition)
415 {
416         char buf[8192];
417         enum TDB_ERROR ecode;
418
419         if (tdb->flags & TDB_RDONLY) {
420                 return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
421                                   "Expand on read-only database");
422         }
423
424         if (tdb->flags & TDB_INTERNAL) {
425                 char *new = realloc(tdb->file->map_ptr,
426                                     tdb->file->map_size + addition);
427                 if (!new) {
428                         return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
429                                           "No memory to expand database");
430                 }
431                 tdb->file->map_ptr = new;
432                 tdb->file->map_size += addition;
433         } else {
434                 /* Unmap before trying to write; old TDB claimed OpenBSD had
435                  * problem with this otherwise. */
436                 tdb_munmap(tdb->file);
437
438                 /* If this fails, we try to fill anyway. */
439                 if (ftruncate(tdb->file->fd, tdb->file->map_size + addition))
440                         ;
441
442                 /* now fill the file with something. This ensures that the
443                    file isn't sparse, which would be very bad if we ran out of
444                    disk. This must be done with write, not via mmap */
445                 memset(buf, 0x43, sizeof(buf));
446                 ecode = fill(tdb, buf, sizeof(buf), tdb->file->map_size,
447                              addition);
448                 if (ecode != TDB_SUCCESS)
449                         return ecode;
450                 tdb->file->map_size += addition;
451                 tdb_mmap(tdb);
452         }
453         return TDB_SUCCESS;
454 }
455
456 const void *tdb_access_read(struct tdb_context *tdb,
457                             tdb_off_t off, tdb_len_t len, bool convert)
458 {
459         void *ret = NULL;
460
461         if (likely(!(tdb->flags & TDB_CONVERT))) {
462                 ret = tdb->tdb2.io->direct(tdb, off, len, false);
463
464                 if (TDB_PTR_IS_ERR(ret)) {
465                         return ret;
466                 }
467         }
468         if (!ret) {
469                 struct tdb_access_hdr *hdr;
470                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
471                 if (TDB_PTR_IS_ERR(hdr)) {
472                         return hdr;
473                 }
474                 hdr->next = tdb->tdb2.access;
475                 tdb->tdb2.access = hdr;
476                 ret = hdr + 1;
477                 if (convert) {
478                         tdb_convert(tdb, (void *)ret, len);
479                 }
480         } else
481                 tdb->tdb2.direct_access++;
482
483         return ret;
484 }
485
486 void *tdb_access_write(struct tdb_context *tdb,
487                        tdb_off_t off, tdb_len_t len, bool convert)
488 {
489         void *ret = NULL;
490
491         if (tdb->flags & TDB_RDONLY) {
492                 tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
493                            "Write to read-only database");
494                 return TDB_ERR_PTR(TDB_ERR_RDONLY);
495         }
496
497         if (likely(!(tdb->flags & TDB_CONVERT))) {
498                 ret = tdb->tdb2.io->direct(tdb, off, len, true);
499
500                 if (TDB_PTR_IS_ERR(ret)) {
501                         return ret;
502                 }
503         }
504
505         if (!ret) {
506                 struct tdb_access_hdr *hdr;
507                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
508                 if (TDB_PTR_IS_ERR(hdr)) {
509                         return hdr;
510                 }
511                 hdr->next = tdb->tdb2.access;
512                 tdb->tdb2.access = hdr;
513                 hdr->off = off;
514                 hdr->len = len;
515                 hdr->convert = convert;
516                 ret = hdr + 1;
517                 if (convert)
518                         tdb_convert(tdb, (void *)ret, len);
519         } else
520                 tdb->tdb2.direct_access++;
521
522         return ret;
523 }
524
525 static struct tdb_access_hdr **find_hdr(struct tdb_context *tdb, const void *p)
526 {
527         struct tdb_access_hdr **hp;
528
529         for (hp = &tdb->tdb2.access; *hp; hp = &(*hp)->next) {
530                 if (*hp + 1 == p)
531                         return hp;
532         }
533         return NULL;
534 }
535
536 void tdb_access_release(struct tdb_context *tdb, const void *p)
537 {
538         struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
539
540         if (hp) {
541                 hdr = *hp;
542                 *hp = hdr->next;
543                 free(hdr);
544         } else
545                 tdb->tdb2.direct_access--;
546 }
547
548 enum TDB_ERROR tdb_access_commit(struct tdb_context *tdb, void *p)
549 {
550         struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
551         enum TDB_ERROR ecode;
552
553         if (hp) {
554                 hdr = *hp;
555                 if (hdr->convert)
556                         ecode = tdb_write_convert(tdb, hdr->off, p, hdr->len);
557                 else
558                         ecode = tdb_write(tdb, hdr->off, p, hdr->len);
559                 *hp = hdr->next;
560                 free(hdr);
561         } else {
562                 tdb->tdb2.direct_access--;
563                 ecode = TDB_SUCCESS;
564         }
565
566         return ecode;
567 }
568
569 static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len,
570                         bool write_mode)
571 {
572         enum TDB_ERROR ecode;
573
574         if (unlikely(!tdb->file->map_ptr))
575                 return NULL;
576
577         ecode = tdb_oob(tdb, off + len, false);
578         if (unlikely(ecode != TDB_SUCCESS))
579                 return TDB_ERR_PTR(ecode);
580         return (char *)tdb->file->map_ptr + off;
581 }
582
583 void tdb_inc_seqnum(struct tdb_context *tdb)
584 {
585         tdb_off_t seq;
586
587         if (tdb->flags & TDB_VERSION1) {
588                 tdb1_increment_seqnum_nonblock(tdb);
589                 return;
590         }
591
592         if (likely(!(tdb->flags & TDB_CONVERT))) {
593                 int64_t *direct;
594
595                 direct = tdb->tdb2.io->direct(tdb,
596                                               offsetof(struct tdb_header,
597                                                        seqnum),
598                                               sizeof(*direct), true);
599                 if (likely(direct)) {
600                         /* Don't let it go negative, even briefly */
601                         if (unlikely((*direct) + 1) < 0)
602                                 *direct = 0;
603                         (*direct)++;
604                         return;
605                 }
606         }
607
608         seq = tdb_read_off(tdb, offsetof(struct tdb_header, seqnum));
609         if (!TDB_OFF_IS_ERR(seq)) {
610                 seq++;
611                 if (unlikely((int64_t)seq < 0))
612                         seq = 0;
613                 tdb_write_off(tdb, offsetof(struct tdb_header, seqnum), seq);
614         }
615 }
616
617 static const struct tdb_methods io_methods = {
618         tdb_read,
619         tdb_write,
620         tdb_oob,
621         tdb_expand_file,
622         tdb_direct,
623 };
624
625 /*
626   initialise the default methods table
627 */
628 void tdb_io_init(struct tdb_context *tdb)
629 {
630         tdb->tdb2.io = &io_methods;
631 }