tdb2: reduce transaction before writing to recovery area.
[ccan] / ccan / tdb2 / io.c
1  /*
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              1999-2005
7    Copyright (C) Paul `Rusty' Russell              2000
8    Copyright (C) Jeremy Allison                    2000-2003
9    Copyright (C) Rusty Russell                     2010
10
11      ** NOTE! The following LGPL license applies to the tdb
12      ** library. This does NOT imply that all of Samba is released
13      ** under the LGPL
14
15    This library is free software; you can redistribute it and/or
16    modify it under the terms of the GNU Lesser General Public
17    License as published by the Free Software Foundation; either
18    version 3 of the License, or (at your option) any later version.
19
20    This library is distributed in the hope that it will be useful,
21    but WITHOUT ANY WARRANTY; without even the implied warranty of
22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23    Lesser General Public License for more details.
24
25    You should have received a copy of the GNU Lesser General Public
26    License along with this library; if not, see <http://www.gnu.org/licenses/>.
27 */
28 #include "private.h"
29 #include <assert.h>
30 #include <ccan/likely/likely.h>
31
32 void tdb_munmap(struct tdb_file *file)
33 {
34         if (file->fd == -1)
35                 return;
36
37         if (file->map_ptr) {
38                 munmap(file->map_ptr, file->map_size);
39                 file->map_ptr = NULL;
40         }
41 }
42
43 void tdb_mmap(struct tdb_context *tdb)
44 {
45         if (tdb->flags & TDB_INTERNAL)
46                 return;
47
48         if (tdb->flags & TDB_NOMMAP)
49                 return;
50
51         /* size_t can be smaller than off_t. */
52         if ((size_t)tdb->file->map_size == tdb->file->map_size) {
53                 tdb->file->map_ptr = mmap(NULL, tdb->file->map_size,
54                                           tdb->mmap_flags,
55                                           MAP_SHARED, tdb->file->fd, 0);
56         } else
57                 tdb->file->map_ptr = MAP_FAILED;
58
59         /*
60          * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
61          */
62         if (tdb->file->map_ptr == MAP_FAILED) {
63                 tdb->file->map_ptr = NULL;
64                 tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
65                            "tdb_mmap failed for size %lld (%s)",
66                            (long long)tdb->file->map_size, strerror(errno));
67         }
68 }
69
70 /* check for an out of bounds access - if it is out of bounds then
71    see if the database has been expanded by someone else and expand
72    if necessary
73    note that "len" is the minimum length needed for the db
74 */
75 static enum TDB_ERROR tdb_oob(struct tdb_context *tdb, tdb_off_t len,
76                               bool probe)
77 {
78         struct stat st;
79         enum TDB_ERROR ecode;
80
81         /* We can't hold pointers during this: we could unmap! */
82         assert(!tdb->direct_access
83                || (tdb->flags & TDB_NOLOCK)
84                || tdb_has_expansion_lock(tdb));
85
86         if (len <= tdb->file->map_size)
87                 return 0;
88         if (tdb->flags & TDB_INTERNAL) {
89                 if (!probe) {
90                         tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
91                                  "tdb_oob len %lld beyond internal"
92                                  " malloc size %lld",
93                                  (long long)len,
94                                  (long long)tdb->file->map_size);
95                 }
96                 return TDB_ERR_IO;
97         }
98
99         ecode = tdb_lock_expand(tdb, F_RDLCK);
100         if (ecode != TDB_SUCCESS) {
101                 return ecode;
102         }
103
104         if (fstat(tdb->file->fd, &st) != 0) {
105                 tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
106                            "Failed to fstat file: %s", strerror(errno));
107                 tdb_unlock_expand(tdb, F_RDLCK);
108                 return TDB_ERR_IO;
109         }
110
111         tdb_unlock_expand(tdb, F_RDLCK);
112
113         if (st.st_size < (size_t)len) {
114                 if (!probe) {
115                         tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
116                                    "tdb_oob len %zu beyond eof at %zu",
117                                    (size_t)len, st.st_size);
118                 }
119                 return TDB_ERR_IO;
120         }
121
122         /* Unmap, update size, remap */
123         tdb_munmap(tdb->file);
124
125         tdb->file->map_size = st.st_size;
126         tdb_mmap(tdb);
127         return TDB_SUCCESS;
128 }
129
130 /* Endian conversion: we only ever deal with 8 byte quantities */
131 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
132 {
133         assert(size % 8 == 0);
134         if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
135                 uint64_t i, *p = (uint64_t *)buf;
136                 for (i = 0; i < size / 8; i++)
137                         p[i] = bswap_64(p[i]);
138         }
139         return buf;
140 }
141
142 /* Return first non-zero offset in offset array, or end, or -ve error. */
143 /* FIXME: Return the off? */
144 uint64_t tdb_find_nonzero_off(struct tdb_context *tdb,
145                               tdb_off_t base, uint64_t start, uint64_t end)
146 {
147         uint64_t i;
148         const uint64_t *val;
149
150         /* Zero vs non-zero is the same unconverted: minor optimization. */
151         val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t),
152                               (end - start) * sizeof(tdb_off_t), false);
153         if (TDB_PTR_IS_ERR(val)) {
154                 return TDB_PTR_ERR(val);
155         }
156
157         for (i = 0; i < (end - start); i++) {
158                 if (val[i])
159                         break;
160         }
161         tdb_access_release(tdb, val);
162         return start + i;
163 }
164
165 /* Return first zero offset in num offset array, or num, or -ve error. */
166 uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
167                            uint64_t num)
168 {
169         uint64_t i;
170         const uint64_t *val;
171
172         /* Zero vs non-zero is the same unconverted: minor optimization. */
173         val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false);
174         if (TDB_PTR_IS_ERR(val)) {
175                 return TDB_PTR_ERR(val);
176         }
177
178         for (i = 0; i < num; i++) {
179                 if (!val[i])
180                         break;
181         }
182         tdb_access_release(tdb, val);
183         return i;
184 }
185
186 enum TDB_ERROR zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
187 {
188         char buf[8192] = { 0 };
189         void *p = tdb->methods->direct(tdb, off, len, true);
190         enum TDB_ERROR ecode = TDB_SUCCESS;
191
192         assert(!tdb->read_only);
193         if (TDB_PTR_IS_ERR(p)) {
194                 return TDB_PTR_ERR(p);
195         }
196         if (p) {
197                 memset(p, 0, len);
198                 return ecode;
199         }
200         while (len) {
201                 unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
202                 ecode = tdb->methods->twrite(tdb, off, buf, todo);
203                 if (ecode != TDB_SUCCESS) {
204                         break;
205                 }
206                 len -= todo;
207                 off += todo;
208         }
209         return ecode;
210 }
211
212 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
213 {
214         tdb_off_t ret;
215         enum TDB_ERROR ecode;
216
217         if (likely(!(tdb->flags & TDB_CONVERT))) {
218                 tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
219                                                     false);
220                 if (TDB_PTR_IS_ERR(p)) {
221                         return TDB_PTR_ERR(p);
222                 }
223                 if (p)
224                         return *p;
225         }
226
227         ecode = tdb_read_convert(tdb, off, &ret, sizeof(ret));
228         if (ecode != TDB_SUCCESS) {
229                 return ecode;
230         }
231         return ret;
232 }
233
234 /* write a lump of data at a specified offset */
235 static enum TDB_ERROR tdb_write(struct tdb_context *tdb, tdb_off_t off,
236                                 const void *buf, tdb_len_t len)
237 {
238         enum TDB_ERROR ecode;
239
240         if (tdb->read_only) {
241                 return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
242                                   "Write to read-only database");
243         }
244
245         ecode = tdb->methods->oob(tdb, off + len, 0);
246         if (ecode != TDB_SUCCESS) {
247                 return ecode;
248         }
249
250         if (tdb->file->map_ptr) {
251                 memcpy(off + (char *)tdb->file->map_ptr, buf, len);
252         } else {
253                 ssize_t ret;
254                 ret = pwrite(tdb->file->fd, buf, len, off);
255                 if (ret != len) {
256                         /* This shouldn't happen: we avoid sparse files. */
257                         if (ret >= 0)
258                                 errno = ENOSPC;
259
260                         return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
261                                           "tdb_write: %zi at %zu len=%zu (%s)",
262                                           ret, (size_t)off, (size_t)len,
263                                           strerror(errno));
264                 }
265         }
266         return TDB_SUCCESS;
267 }
268
269 /* read a lump of data at a specified offset */
270 static enum TDB_ERROR tdb_read(struct tdb_context *tdb, tdb_off_t off,
271                                void *buf, tdb_len_t len)
272 {
273         enum TDB_ERROR ecode;
274
275         ecode = tdb->methods->oob(tdb, off + len, 0);
276         if (ecode != TDB_SUCCESS) {
277                 return ecode;
278         }
279
280         if (tdb->file->map_ptr) {
281                 memcpy(buf, off + (char *)tdb->file->map_ptr, len);
282         } else {
283                 ssize_t r = pread(tdb->file->fd, buf, len, off);
284                 if (r != len) {
285                         return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
286                                           "tdb_read failed with %zi at %zu "
287                                           "len=%zu (%s) map_size=%zu",
288                                           r, (size_t)off, (size_t)len,
289                                           strerror(errno),
290                                           (size_t)tdb->file->map_size);
291                 }
292         }
293         return TDB_SUCCESS;
294 }
295
296 enum TDB_ERROR tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
297                                  const void *rec, size_t len)
298 {
299         enum TDB_ERROR ecode;
300
301         if (unlikely((tdb->flags & TDB_CONVERT))) {
302                 void *conv = malloc(len);
303                 if (!conv) {
304                         return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
305                                           "tdb_write: no memory converting"
306                                           " %zu bytes", len);
307                 }
308                 memcpy(conv, rec, len);
309                 ecode = tdb->methods->twrite(tdb, off,
310                                            tdb_convert(tdb, conv, len), len);
311                 free(conv);
312         } else {
313                 ecode = tdb->methods->twrite(tdb, off, rec, len);
314         }
315         return ecode;
316 }
317
318 enum TDB_ERROR tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
319                                 void *rec, size_t len)
320 {
321         enum TDB_ERROR ecode = tdb->methods->tread(tdb, off, rec, len);
322         tdb_convert(tdb, rec, len);
323         return ecode;
324 }
325
326 enum TDB_ERROR tdb_write_off(struct tdb_context *tdb,
327                              tdb_off_t off, tdb_off_t val)
328 {
329         if (tdb->read_only) {
330                 return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
331                                   "Write to read-only database");
332         }
333
334         if (likely(!(tdb->flags & TDB_CONVERT))) {
335                 tdb_off_t *p = tdb->methods->direct(tdb, off, sizeof(*p),
336                                                     true);
337                 if (TDB_PTR_IS_ERR(p)) {
338                         return TDB_PTR_ERR(p);
339                 }
340                 if (p) {
341                         *p = val;
342                         return TDB_SUCCESS;
343                 }
344         }
345         return tdb_write_convert(tdb, off, &val, sizeof(val));
346 }
347
348 static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
349                              tdb_len_t len, unsigned int prefix)
350 {
351         unsigned char *buf;
352         enum TDB_ERROR ecode;
353
354         /* some systems don't like zero length malloc */
355         buf = malloc(prefix + len ? prefix + len : 1);
356         if (!buf) {
357                 tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_USE_ERROR,
358                            "tdb_alloc_read malloc failed len=%zu",
359                            (size_t)(prefix + len));
360                 return TDB_ERR_PTR(TDB_ERR_OOM);
361         } else {
362                 ecode = tdb->methods->tread(tdb, offset, buf+prefix, len);
363                 if (unlikely(ecode != TDB_SUCCESS)) {
364                         free(buf);
365                         return TDB_ERR_PTR(ecode);
366                 }
367         }
368         return buf;
369 }
370
371 /* read a lump of data, allocating the space for it */
372 void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
373 {
374         return _tdb_alloc_read(tdb, offset, len, 0);
375 }
376
377 static enum TDB_ERROR fill(struct tdb_context *tdb,
378                            const void *buf, size_t size,
379                            tdb_off_t off, tdb_len_t len)
380 {
381         while (len) {
382                 size_t n = len > size ? size : len;
383                 ssize_t ret = pwrite(tdb->file->fd, buf, n, off);
384                 if (ret != n) {
385                         if (ret >= 0)
386                                 errno = ENOSPC;
387
388                         return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
389                                           "fill failed:"
390                                           " %zi at %zu len=%zu (%s)",
391                                           ret, (size_t)off, (size_t)len,
392                                           strerror(errno));
393                 }
394                 len -= n;
395                 off += n;
396         }
397         return TDB_SUCCESS;
398 }
399
400 /* expand a file.  we prefer to use ftruncate, as that is what posix
401   says to use for mmap expansion */
402 static enum TDB_ERROR tdb_expand_file(struct tdb_context *tdb,
403                                       tdb_len_t addition)
404 {
405         char buf[8192];
406         enum TDB_ERROR ecode;
407
408         if (tdb->read_only) {
409                 return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
410                                   "Expand on read-only database");
411         }
412
413         if (tdb->flags & TDB_INTERNAL) {
414                 char *new = realloc(tdb->file->map_ptr,
415                                     tdb->file->map_size + addition);
416                 if (!new) {
417                         return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
418                                           "No memory to expand database");
419                 }
420                 tdb->file->map_ptr = new;
421                 tdb->file->map_size += addition;
422         } else {
423                 /* Unmap before trying to write; old TDB claimed OpenBSD had
424                  * problem with this otherwise. */
425                 tdb_munmap(tdb->file);
426
427                 /* If this fails, we try to fill anyway. */
428                 if (ftruncate(tdb->file->fd, tdb->file->map_size + addition))
429                         ;
430
431                 /* now fill the file with something. This ensures that the
432                    file isn't sparse, which would be very bad if we ran out of
433                    disk. This must be done with write, not via mmap */
434                 memset(buf, 0x43, sizeof(buf));
435                 ecode = fill(tdb, buf, sizeof(buf), tdb->file->map_size,
436                              addition);
437                 if (ecode != TDB_SUCCESS)
438                         return ecode;
439                 tdb->file->map_size += addition;
440                 tdb_mmap(tdb);
441         }
442         return TDB_SUCCESS;
443 }
444
445 const void *tdb_access_read(struct tdb_context *tdb,
446                             tdb_off_t off, tdb_len_t len, bool convert)
447 {
448         void *ret = NULL;
449
450         if (likely(!(tdb->flags & TDB_CONVERT))) {
451                 ret = tdb->methods->direct(tdb, off, len, false);
452
453                 if (TDB_PTR_IS_ERR(ret)) {
454                         return ret;
455                 }
456         }
457         if (!ret) {
458                 struct tdb_access_hdr *hdr;
459                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
460                 if (TDB_PTR_IS_ERR(hdr)) {
461                         return hdr;
462                 }
463                 hdr->next = tdb->access;
464                 tdb->access = hdr;
465                 ret = hdr + 1;
466                 if (convert) {
467                         tdb_convert(tdb, (void *)ret, len);
468                 }
469         } else
470                 tdb->direct_access++;
471
472         return ret;
473 }
474
475 void *tdb_access_write(struct tdb_context *tdb,
476                        tdb_off_t off, tdb_len_t len, bool convert)
477 {
478         void *ret = NULL;
479
480         if (tdb->read_only) {
481                 tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
482                            "Write to read-only database");
483                 return TDB_ERR_PTR(TDB_ERR_RDONLY);
484         }
485
486         if (likely(!(tdb->flags & TDB_CONVERT))) {
487                 ret = tdb->methods->direct(tdb, off, len, true);
488
489                 if (TDB_PTR_IS_ERR(ret)) {
490                         return ret;
491                 }
492         }
493
494         if (!ret) {
495                 struct tdb_access_hdr *hdr;
496                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
497                 if (TDB_PTR_IS_ERR(hdr)) {
498                         return hdr;
499                 }
500                 hdr->next = tdb->access;
501                 tdb->access = hdr;
502                 hdr->off = off;
503                 hdr->len = len;
504                 hdr->convert = convert;
505                 ret = hdr + 1;
506                 if (convert)
507                         tdb_convert(tdb, (void *)ret, len);
508         } else
509                 tdb->direct_access++;
510
511         return ret;
512 }
513
514 static struct tdb_access_hdr **find_hdr(struct tdb_context *tdb, const void *p)
515 {
516         struct tdb_access_hdr **hp;
517
518         for (hp = &tdb->access; *hp; hp = &(*hp)->next) {
519                 if (*hp + 1 == p)
520                         return hp;
521         }
522         return NULL;
523 }
524
525 void tdb_access_release(struct tdb_context *tdb, const void *p)
526 {
527         struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
528
529         if (hp) {
530                 hdr = *hp;
531                 *hp = hdr->next;
532                 free(hdr);
533         } else
534                 tdb->direct_access--;
535 }
536
537 enum TDB_ERROR tdb_access_commit(struct tdb_context *tdb, void *p)
538 {
539         struct tdb_access_hdr *hdr, **hp = find_hdr(tdb, p);
540         enum TDB_ERROR ecode;
541
542         if (hp) {
543                 hdr = *hp;
544                 if (hdr->convert)
545                         ecode = tdb_write_convert(tdb, hdr->off, p, hdr->len);
546                 else
547                         ecode = tdb_write(tdb, hdr->off, p, hdr->len);
548                 *hp = hdr->next;
549                 free(hdr);
550         } else {
551                 tdb->direct_access--;
552                 ecode = TDB_SUCCESS;
553         }
554
555         return ecode;
556 }
557
558 static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len,
559                         bool write_mode)
560 {
561         enum TDB_ERROR ecode;
562
563         if (unlikely(!tdb->file->map_ptr))
564                 return NULL;
565
566         ecode = tdb_oob(tdb, off + len, true);
567         if (unlikely(ecode != TDB_SUCCESS))
568                 return TDB_ERR_PTR(ecode);
569         return (char *)tdb->file->map_ptr + off;
570 }
571
572 void tdb_inc_seqnum(struct tdb_context *tdb)
573 {
574         tdb_off_t seq;
575
576         if (likely(!(tdb->flags & TDB_CONVERT))) {
577                 int64_t *direct;
578
579                 direct = tdb->methods->direct(tdb,
580                                               offsetof(struct tdb_header,
581                                                        seqnum),
582                                               sizeof(*direct), true);
583                 if (likely(direct)) {
584                         /* Don't let it go negative, even briefly */
585                         if (unlikely((*direct) + 1) < 0)
586                                 *direct = 0;
587                         (*direct)++;
588                         return;
589                 }
590         }
591
592         seq = tdb_read_off(tdb, offsetof(struct tdb_header, seqnum));
593         if (!TDB_OFF_IS_ERR(seq)) {
594                 seq++;
595                 if (unlikely((int64_t)seq < 0))
596                         seq = 0;
597                 tdb_write_off(tdb, offsetof(struct tdb_header, seqnum), seq);
598         }
599 }
600
/* Default I/O method table, installed by tdb_io_init().
 * This is a positional initializer, so the entries must stay in the
 * member order of struct tdb_methods (declared outside this file).
 * NOTE(review): the member names noted below are inferred from how
 * tdb->methods is called in this file -- confirm against the struct. */
static const struct tdb_methods io_methods = {
        tdb_read,               /* presumably ->tread */
        tdb_write,              /* presumably ->twrite */
        tdb_oob,                /* presumably ->oob */
        tdb_expand_file,        /* file expansion hook */
        tdb_direct,             /* presumably ->direct */
};
608
/*
  Initialise the default methods table: point tdb->methods at the
  file-backed I/O implementations collected in io_methods.
*/
void tdb_io_init(struct tdb_context *tdb)
{
        tdb->methods = &io_methods;
}