]> git.ozlabs.org Git - ccan/blob - ccan/tdb2/io.c
tdb2: careful on wrap.
[ccan] / ccan / tdb2 / io.c
1  /*
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              1999-2005
7    Copyright (C) Paul `Rusty' Russell              2000
8    Copyright (C) Jeremy Allison                    2000-2003
9    Copyright (C) Rusty Russell                     2010
10
11      ** NOTE! The following LGPL license applies to the tdb
12      ** library. This does NOT imply that all of Samba is released
13      ** under the LGPL
14
15    This library is free software; you can redistribute it and/or
16    modify it under the terms of the GNU Lesser General Public
17    License as published by the Free Software Foundation; either
18    version 3 of the License, or (at your option) any later version.
19
20    This library is distributed in the hope that it will be useful,
21    but WITHOUT ANY WARRANTY; without even the implied warranty of
22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23    Lesser General Public License for more details.
24
25    You should have received a copy of the GNU Lesser General Public
26    License along with this library; if not, see <http://www.gnu.org/licenses/>.
27 */
28 #include "private.h"
29 #include <assert.h>
30 #include <ccan/likely/likely.h>
31
32 void tdb_munmap(struct tdb_file *file)
33 {
34         if (file->fd == -1)
35                 return;
36
37         if (file->map_ptr) {
38                 munmap(file->map_ptr, file->map_size);
39                 file->map_ptr = NULL;
40         }
41 }
42
43 void tdb_mmap(struct tdb_context *tdb)
44 {
45         int mmap_flags;
46
47         if (tdb->flags & TDB_INTERNAL)
48                 return;
49
50         if (tdb->flags & TDB_NOMMAP)
51                 return;
52
53         if ((tdb->open_flags & O_ACCMODE) == O_RDONLY)
54                 mmap_flags = PROT_READ;
55         else
56                 mmap_flags = PROT_READ | PROT_WRITE;
57
58         /* size_t can be smaller than off_t. */
59         if ((size_t)tdb->file->map_size == tdb->file->map_size) {
60                 tdb->file->map_ptr = mmap(NULL, tdb->file->map_size,
61                                           mmap_flags,
62                                           MAP_SHARED, tdb->file->fd, 0);
63         } else
64                 tdb->file->map_ptr = MAP_FAILED;
65
66         /*
67          * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
68          */
69         if (tdb->file->map_ptr == MAP_FAILED) {
70                 tdb->file->map_ptr = NULL;
71                 tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
72                            "tdb_mmap failed for size %lld (%s)",
73                            (long long)tdb->file->map_size, strerror(errno));
74         }
75 }
76
77 /* check for an out of bounds access - if it is out of bounds then
78    see if the database has been expanded by someone else and expand
79    if necessary
80    note that "len" is the minimum length needed for the db.
81
82    If probe is true, len being too large isn't a failure.
83 */
84 static enum TDB_ERROR tdb_oob(struct tdb_context *tdb,
85                               tdb_off_t off, tdb_len_t len, bool probe)
86 {
87         struct stat st;
88         enum TDB_ERROR ecode;
89
90         /* We can't hold pointers during this: we could unmap! */
91         assert(!tdb->tdb2.direct_access
92                || (tdb->flags & TDB_NOLOCK)
93                || tdb_has_expansion_lock(tdb));
94
95         if (len + off < len) {
96                 if (probe)
97                         return TDB_SUCCESS;
98
99                 return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
100                                   "tdb_oob off %llu len %llu wrap\n",
101                                   (long long)off, (long long)len);
102         }
103
104         if (len + off <= tdb->file->map_size)
105                 return TDB_SUCCESS;
106         if (tdb->flags & TDB_INTERNAL) {
107                 if (probe)
108                         return TDB_SUCCESS;
109
110                 tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
111                            "tdb_oob len %lld beyond internal"
112                            " malloc size %lld",
113                            (long long)(off + len),
114                            (long long)tdb->file->map_size);
115                 return TDB_ERR_IO;
116         }
117
118         ecode = tdb_lock_expand(tdb, F_RDLCK);
119         if (ecode != TDB_SUCCESS) {
120                 return ecode;
121         }
122
123         if (fstat(tdb->file->fd, &st) != 0) {
124                 tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
125                            "Failed to fstat file: %s", strerror(errno));
126                 tdb_unlock_expand(tdb, F_RDLCK);
127                 return TDB_ERR_IO;
128         }
129
130         tdb_unlock_expand(tdb, F_RDLCK);
131
132         if (st.st_size < off + len) {
133                 if (probe)
134                         return TDB_SUCCESS;
135
136                 tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
137                            "tdb_oob len %llu beyond eof at %zu",
138                            (long long)(off + len), st.st_size);
139                 return TDB_ERR_IO;
140         }
141
142         /* Unmap, update size, remap */
143         tdb_munmap(tdb->file);
144
145         tdb->file->map_size = st.st_size;
146         tdb_mmap(tdb);
147         return TDB_SUCCESS;
148 }
149
150 /* Endian conversion: we only ever deal with 8 byte quantities */
151 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
152 {
153         assert(size % 8 == 0);
154         if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
155                 uint64_t i, *p = (uint64_t *)buf;
156                 for (i = 0; i < size / 8; i++)
157                         p[i] = bswap_64(p[i]);
158         }
159         return buf;
160 }
161
162 /* Return first non-zero offset in offset array, or end, or -ve error. */
163 /* FIXME: Return the off? */
164 uint64_t tdb_find_nonzero_off(struct tdb_context *tdb,
165                               tdb_off_t base, uint64_t start, uint64_t end)
166 {
167         uint64_t i;
168         const uint64_t *val;
169
170         /* Zero vs non-zero is the same unconverted: minor optimization. */
171         val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t),
172                               (end - start) * sizeof(tdb_off_t), false);
173         if (TDB_PTR_IS_ERR(val)) {
174                 return TDB_ERR_TO_OFF(TDB_PTR_ERR(val));
175         }
176
177         for (i = 0; i < (end - start); i++) {
178                 if (val[i])
179                         break;
180         }
181         tdb_access_release(tdb, val);
182         return start + i;
183 }
184
185 /* Return first zero offset in num offset array, or num, or -ve error. */
186 uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
187                            uint64_t num)
188 {
189         uint64_t i;
190         const uint64_t *val;
191
192         /* Zero vs non-zero is the same unconverted: minor optimization. */
193         val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false);
194         if (TDB_PTR_IS_ERR(val)) {
195                 return TDB_ERR_TO_OFF(TDB_PTR_ERR(val));
196         }
197
198         for (i = 0; i < num; i++) {
199                 if (!val[i])
200                         break;
201         }
202         tdb_access_release(tdb, val);
203         return i;
204 }
205
206 enum TDB_ERROR zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
207 {
208         char buf[8192] = { 0 };
209         void *p = tdb->tdb2.io->direct(tdb, off, len, true);
210         enum TDB_ERROR ecode = TDB_SUCCESS;
211
212         assert(!(tdb->flags & TDB_RDONLY));
213         if (TDB_PTR_IS_ERR(p)) {
214                 return TDB_PTR_ERR(p);
215         }
216         if (p) {
217                 memset(p, 0, len);
218                 return ecode;
219         }
220         while (len) {
221                 unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
222                 ecode = tdb->tdb2.io->twrite(tdb, off, buf, todo);
223                 if (ecode != TDB_SUCCESS) {
224                         break;
225                 }
226                 len -= todo;
227                 off += todo;
228         }
229         return ecode;
230 }
231
232 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
233 {
234         tdb_off_t ret;
235         enum TDB_ERROR ecode;
236
237         if (likely(!(tdb->flags & TDB_CONVERT))) {
238                 tdb_off_t *p = tdb->tdb2.io->direct(tdb, off, sizeof(*p),
239                                                     false);
240                 if (TDB_PTR_IS_ERR(p)) {
241                         return TDB_ERR_TO_OFF(TDB_PTR_ERR(p));
242                 }
243                 if (p)
244                         return *p;
245         }
246
247         ecode = tdb_read_convert(tdb, off, &ret, sizeof(ret));
248         if (ecode != TDB_SUCCESS) {
249                 return TDB_ERR_TO_OFF(ecode);
250         }
251         return ret;
252 }
253
254 /* write a lump of data at a specified offset */
255 static enum TDB_ERROR tdb_write(struct tdb_context *tdb, tdb_off_t off,
256                                 const void *buf, tdb_len_t len)
257 {
258         enum TDB_ERROR ecode;
259
260         if (tdb->flags & TDB_RDONLY) {
261                 return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
262                                   "Write to read-only database");
263         }
264
265         ecode = tdb->tdb2.io->oob(tdb, off, len, false);
266         if (ecode != TDB_SUCCESS) {
267                 return ecode;
268         }
269
270         if (tdb->file->map_ptr) {
271                 memcpy(off + (char *)tdb->file->map_ptr, buf, len);
272         } else {
273                 ssize_t ret;
274                 ret = pwrite(tdb->file->fd, buf, len, off);
275                 if (ret != len) {
276                         /* This shouldn't happen: we avoid sparse files. */
277                         if (ret >= 0)
278                                 errno = ENOSPC;
279
280                         return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
281                                           "tdb_write: %zi at %zu len=%zu (%s)",
282                                           ret, (size_t)off, (size_t)len,
283                                           strerror(errno));
284                 }
285         }
286         return TDB_SUCCESS;
287 }
288
289 /* read a lump of data at a specified offset */
290 static enum TDB_ERROR tdb_read(struct tdb_context *tdb, tdb_off_t off,
291                                void *buf, tdb_len_t len)
292 {
293         enum TDB_ERROR ecode;
294
295         ecode = tdb->tdb2.io->oob(tdb, off, len, false);
296         if (ecode != TDB_SUCCESS) {
297                 return ecode;
298         }
299
300         if (tdb->file->map_ptr) {
301                 memcpy(buf, off + (char *)tdb->file->map_ptr, len);
302         } else {
303                 ssize_t r = pread(tdb->file->fd, buf, len, off);
304                 if (r != len) {
305                         return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
306                                           "tdb_read failed with %zi at %zu "
307                                           "len=%zu (%s) map_size=%zu",
308                                           r, (size_t)off, (size_t)len,
309                                           strerror(errno),
310                                           (size_t)tdb->file->map_size);
311                 }
312         }
313         return TDB_SUCCESS;
314 }
315
316 enum TDB_ERROR tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
317                                  const void *rec, size_t len)
318 {
319         enum TDB_ERROR ecode;
320
321         if (unlikely((tdb->flags & TDB_CONVERT))) {
322                 void *conv = malloc(len);
323                 if (!conv) {
324                         return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
325                                           "tdb_write: no memory converting"
326                                           " %zu bytes", len);
327                 }
328                 memcpy(conv, rec, len);
329                 ecode = tdb->tdb2.io->twrite(tdb, off,
330                                            tdb_convert(tdb, conv, len), len);
331                 free(conv);
332         } else {
333                 ecode = tdb->tdb2.io->twrite(tdb, off, rec, len);
334         }
335         return ecode;
336 }
337
338 enum TDB_ERROR tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
339                                 void *rec, size_t len)
340 {
341         enum TDB_ERROR ecode = tdb->tdb2.io->tread(tdb, off, rec, len);
342         tdb_convert(tdb, rec, len);
343         return ecode;
344 }
345
346 enum TDB_ERROR tdb_write_off(struct tdb_context *tdb,
347                              tdb_off_t off, tdb_off_t val)
348 {
349         if (tdb->flags & TDB_RDONLY) {
350                 return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
351                                   "Write to read-only database");
352         }
353
354         if (likely(!(tdb->flags & TDB_CONVERT))) {
355                 tdb_off_t *p = tdb->tdb2.io->direct(tdb, off, sizeof(*p),
356                                                     true);
357                 if (TDB_PTR_IS_ERR(p)) {
358                         return TDB_PTR_ERR(p);
359                 }
360                 if (p) {
361                         *p = val;
362                         return TDB_SUCCESS;
363                 }
364         }
365         return tdb_write_convert(tdb, off, &val, sizeof(val));
366 }
367
368 static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
369                              tdb_len_t len, unsigned int prefix)
370 {
371         unsigned char *buf;
372         enum TDB_ERROR ecode;
373
374         /* some systems don't like zero length malloc */
375         buf = malloc(prefix + len ? prefix + len : 1);
376         if (!buf) {
377                 tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_USE_ERROR,
378                            "tdb_alloc_read malloc failed len=%zu",
379                            (size_t)(prefix + len));
380                 return TDB_ERR_PTR(TDB_ERR_OOM);
381         } else {
382                 ecode = tdb->tdb2.io->tread(tdb, offset, buf+prefix, len);
383                 if (unlikely(ecode != TDB_SUCCESS)) {
384                         free(buf);
385                         return TDB_ERR_PTR(ecode);
386                 }
387         }
388         return buf;
389 }
390
391 /* read a lump of data, allocating the space for it */
392 void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
393 {
394         return _tdb_alloc_read(tdb, offset, len, 0);
395 }
396
397 static enum TDB_ERROR fill(struct tdb_context *tdb,
398                            const void *buf, size_t size,
399                            tdb_off_t off, tdb_len_t len)
400 {
401         while (len) {
402                 size_t n = len > size ? size : len;
403                 ssize_t ret = pwrite(tdb->file->fd, buf, n, off);
404                 if (ret != n) {
405                         if (ret >= 0)
406                                 errno = ENOSPC;
407
408                         return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
409                                           "fill failed:"
410                                           " %zi at %zu len=%zu (%s)",
411                                           ret, (size_t)off, (size_t)len,
412                                           strerror(errno));
413                 }
414                 len -= n;
415                 off += n;
416         }
417         return TDB_SUCCESS;
418 }
419
420 /* expand a file.  we prefer to use ftruncate, as that is what posix
421   says to use for mmap expansion */
422 static enum TDB_ERROR tdb_expand_file(struct tdb_context *tdb,
423                                       tdb_len_t addition)
424 {
425         char buf[8192];
426         enum TDB_ERROR ecode;
427
428         if (tdb->flags & TDB_RDONLY) {
429                 return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
430                                   "Expand on read-only database");
431         }
432
433         if (tdb->flags & TDB_INTERNAL) {
434                 char *new = realloc(tdb->file->map_ptr,
435                                     tdb->file->map_size + addition);
436                 if (!new) {
437                         return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
438                                           "No memory to expand database");
439                 }
440                 tdb->file->map_ptr = new;
441                 tdb->file->map_size += addition;
442         } else {
443                 /* Unmap before trying to write; old TDB claimed OpenBSD had
444                  * problem with this otherwise. */
445                 tdb_munmap(tdb->file);
446
447                 /* If this fails, we try to fill anyway. */
448                 if (ftruncate(tdb->file->fd, tdb->file->map_size + addition))
449                         ;
450
451                 /* now fill the file with something. This ensures that the
452                    file isn't sparse, which would be very bad if we ran out of
453                    disk. This must be done with write, not via mmap */
454                 memset(buf, 0x43, sizeof(buf));
455                 ecode = fill(tdb, buf, sizeof(buf), tdb->file->map_size,
456                              addition);
457                 if (ecode != TDB_SUCCESS)
458                         return ecode;
459                 tdb->file->map_size += addition;
460                 tdb_mmap(tdb);
461         }
462         return TDB_SUCCESS;
463 }
464
465 const void *tdb_access_read(struct tdb_context *tdb,
466                             tdb_off_t off, tdb_len_t len, bool convert)
467 {
468         void *ret = NULL;
469
470         if (likely(!(tdb->flags & TDB_CONVERT))) {
471                 ret = tdb->tdb2.io->direct(tdb, off, len, false);
472
473                 if (TDB_PTR_IS_ERR(ret)) {
474                         return ret;
475                 }
476         }
477         if (!ret) {
478                 struct tdb_access_hdr *hdr;
479                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
480                 if (TDB_PTR_IS_ERR(hdr)) {
481                         return hdr;
482                 }
483                 hdr->next = tdb->tdb2.access;
484                 tdb->tdb2.access = hdr;
485                 ret = hdr + 1;
486                 if (convert) {
487                         tdb_convert(tdb, (void *)ret, len);
488                 }
489         } else
490                 tdb->tdb2.direct_access++;
491
492         return ret;
493 }
494
495 void *tdb_access_write(struct tdb_context *tdb,
496                        tdb_off_t off, tdb_len_t len, bool convert)
497 {
498         void *ret = NULL;
499
500         if (tdb->flags & TDB_RDONLY) {
501                 tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
502                            "Write to read-only database");
503                 return TDB_ERR_PTR(TDB_ERR_RDONLY);
504         }
505
506         if (likely(!(tdb->flags & TDB_CONVERT))) {
507                 ret = tdb->tdb2.io->direct(tdb, off, len, true);
508
509                 if (TDB_PTR_IS_ERR(ret)) {
510                         return ret;
511                 }
512         }
513
514         if (!ret) {
515                 struct tdb_access_hdr *hdr;
516                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
517                 if (TDB_PTR_IS_ERR(hdr)) {
518                         return hdr;
519                 }
520                 hdr->next = tdb->tdb2.access;
521                 tdb->tdb2.access = hdr;
522                 hdr->off = off;
523                 hdr->len = len;
524                 hdr->convert = convert;
525                 ret = hdr + 1;
526                 if (convert)
527                         tdb_convert(tdb, (void *)ret, len);
528         } else
529                 tdb->tdb2.direct_access++;
530
531         return ret;
532 }
533
534 static struct tdb_access_hdr **find_hdr(struct tdb_context *tdb, const void *p)
535 {
536         struct tdb_access_hdr **hp;
537
538         for (hp = &tdb->tdb2.access; *hp; hp = &(*hp)->next) {
539                 if (*hp + 1 == p)
540                         return hp;
541         }
542         return NULL;
543 }
544
545 void tdb_access_release(struct tdb_context *tdb, const void *p)
546 {
547         struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
548
549         if (hp) {
550                 hdr = *hp;
551                 *hp = hdr->next;
552                 free(hdr);
553         } else
554                 tdb->tdb2.direct_access--;
555 }
556
557 enum TDB_ERROR tdb_access_commit(struct tdb_context *tdb, void *p)
558 {
559         struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
560         enum TDB_ERROR ecode;
561
562         if (hp) {
563                 hdr = *hp;
564                 if (hdr->convert)
565                         ecode = tdb_write_convert(tdb, hdr->off, p, hdr->len);
566                 else
567                         ecode = tdb_write(tdb, hdr->off, p, hdr->len);
568                 *hp = hdr->next;
569                 free(hdr);
570         } else {
571                 tdb->tdb2.direct_access--;
572                 ecode = TDB_SUCCESS;
573         }
574
575         return ecode;
576 }
577
578 static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len,
579                         bool write_mode)
580 {
581         enum TDB_ERROR ecode;
582
583         if (unlikely(!tdb->file->map_ptr))
584                 return NULL;
585
586         ecode = tdb_oob(tdb, off, len, false);
587         if (unlikely(ecode != TDB_SUCCESS))
588                 return TDB_ERR_PTR(ecode);
589         return (char *)tdb->file->map_ptr + off;
590 }
591
592 void tdb_inc_seqnum(struct tdb_context *tdb)
593 {
594         tdb_off_t seq;
595
596         if (tdb->flags & TDB_VERSION1) {
597                 tdb1_increment_seqnum_nonblock(tdb);
598                 return;
599         }
600
601         if (likely(!(tdb->flags & TDB_CONVERT))) {
602                 int64_t *direct;
603
604                 direct = tdb->tdb2.io->direct(tdb,
605                                               offsetof(struct tdb_header,
606                                                        seqnum),
607                                               sizeof(*direct), true);
608                 if (likely(direct)) {
609                         /* Don't let it go negative, even briefly */
610                         if (unlikely((*direct) + 1) < 0)
611                                 *direct = 0;
612                         (*direct)++;
613                         return;
614                 }
615         }
616
617         seq = tdb_read_off(tdb, offsetof(struct tdb_header, seqnum));
618         if (!TDB_OFF_IS_ERR(seq)) {
619                 seq++;
620                 if (unlikely((int64_t)seq < 0))
621                         seq = 0;
622                 tdb_write_off(tdb, offsetof(struct tdb_header, seqnum), seq);
623         }
624 }
625
/* Default I/O method table for this file's read, write, bounds-check,
 * expand and direct-access routines.
 *
 * The initializer is positional, so the entries must stay in the member
 * order of struct tdb_methods.  NOTE(review): that struct is declared
 * outside this file; the order is presumably tread, twrite, oob,
 * expand_file, direct -- confirm against the header before reordering. */
static const struct tdb_methods io_methods = {
	tdb_read,
	tdb_write,
	tdb_oob,
	tdb_expand_file,
	tdb_direct,
};
633
/*
  initialise the default methods table: point tdb->tdb2.io at this
  file's io_methods, so later io->... calls dispatch to the functions
  listed there.
*/
void tdb_io_init(struct tdb_context *tdb)
{
	tdb->tdb2.io = &io_methods;
}