tdb2: 64 bit fixes.
[ccan] / ccan / tdb2 / io.c
1  /* 
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              1999-2005
7    Copyright (C) Paul `Rusty' Russell              2000
8    Copyright (C) Jeremy Allison                    2000-2003
9    Copyright (C) Rusty Russell                     2010
10
11      ** NOTE! The following LGPL license applies to the tdb
12      ** library. This does NOT imply that all of Samba is released
13      ** under the LGPL
14
15    This library is free software; you can redistribute it and/or
16    modify it under the terms of the GNU Lesser General Public
17    License as published by the Free Software Foundation; either
18    version 3 of the License, or (at your option) any later version.
19
20    This library is distributed in the hope that it will be useful,
21    but WITHOUT ANY WARRANTY; without even the implied warranty of
22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23    Lesser General Public License for more details.
24
25    You should have received a copy of the GNU Lesser General Public
26    License along with this library; if not, see <http://www.gnu.org/licenses/>.
27 */
28 #include "private.h"
29 #include <assert.h>
30 #include <ccan/likely/likely.h>
31
32 void tdb_munmap(struct tdb_context *tdb)
33 {
34         if (tdb->flags & TDB_INTERNAL)
35                 return;
36
37         if (tdb->map_ptr) {
38                 munmap(tdb->map_ptr, tdb->map_size);
39                 tdb->map_ptr = NULL;
40         }
41 }
42
43 void tdb_mmap(struct tdb_context *tdb)
44 {
45         if (tdb->flags & TDB_INTERNAL)
46                 return;
47
48         if (tdb->flags & TDB_NOMMAP)
49                 return;
50
51         tdb->map_ptr = mmap(NULL, tdb->map_size, tdb->mmap_flags,
52                             MAP_SHARED, tdb->fd, 0);
53
54         /*
55          * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
56          */
57         if (tdb->map_ptr == MAP_FAILED) {
58                 tdb->map_ptr = NULL;
59                 tdb->log(tdb, TDB_DEBUG_WARNING, tdb->log_priv,
60                          "tdb_mmap failed for size %lld (%s)\n", 
61                          (long long)tdb->map_size, strerror(errno));
62         }
63 }
64
65 /* check for an out of bounds access - if it is out of bounds then
66    see if the database has been expanded by someone else and expand
67    if necessary 
68    note that "len" is the minimum length needed for the db
69 */
70 static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
71 {
72         struct stat st;
73         int ret;
74
75         /* We can't hold pointers during this: we could unmap! */
76         assert(!tdb->direct_access
77                || (tdb->flags & TDB_NOLOCK)
78                || tdb_has_expansion_lock(tdb));
79
80         if (len <= tdb->map_size)
81                 return 0;
82         if (tdb->flags & TDB_INTERNAL) {
83                 if (!probe) {
84                         /* Ensure ecode is set for log fn. */
85                         tdb->ecode = TDB_ERR_IO;
86                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
87                                  "tdb_oob len %lld beyond internal"
88                                  " malloc size %lld\n",
89                                  (long long)len,
90                                  (long long)tdb->map_size);
91                 }
92                 return -1;
93         }
94
95         if (tdb_lock_expand(tdb, F_RDLCK) != 0)
96                 return -1;
97
98         ret = fstat(tdb->fd, &st);
99
100         tdb_unlock_expand(tdb, F_RDLCK);
101
102         if (ret == -1) {
103                 tdb->ecode = TDB_ERR_IO;
104                 return -1;
105         }
106
107         if (st.st_size < (size_t)len) {
108                 if (!probe) {
109                         /* Ensure ecode is set for log fn. */
110                         tdb->ecode = TDB_ERR_IO;
111                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
112                                  "tdb_oob len %lld beyond eof at %lld\n",
113                                  (long long)len, (long long)st.st_size);
114                 }
115                 return -1;
116         }
117
118         /* Unmap, update size, remap */
119         tdb_munmap(tdb);
120
121         tdb->map_size = st.st_size;
122         tdb_mmap(tdb);
123         return 0;
124 }
125
126 static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len)
127 {
128         if (unlikely(!tdb->map_ptr))
129                 return NULL;
130
131         /* FIXME: We can do a subset of this! */
132         if (tdb->transaction)
133                 return NULL;
134
135         if (unlikely(tdb_oob(tdb, off + len, true) == -1))
136                 return NULL;
137         return (char *)tdb->map_ptr + off;
138 }
139
140 /* Either make a copy into pad and return that, or return ptr into mmap. */
141 /* Note: pad has to be a real object, so we can't get here if len
142  * overflows size_t */
143 void *tdb_get(struct tdb_context *tdb, tdb_off_t off, void *pad, size_t len)
144 {
145         if (likely(!(tdb->flags & TDB_CONVERT))) {
146                 void *ret = tdb_direct(tdb, off, len);
147                 if (ret)
148                         return ret;
149         }
150         return tdb_read_convert(tdb, off, pad, len) == -1 ? NULL : pad;
151 }
152
153 /* Endian conversion: we only ever deal with 8 byte quantities */
154 void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
155 {
156         if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
157                 uint64_t i, *p = (uint64_t *)buf;
158                 for (i = 0; i < size / 8; i++)
159                         p[i] = bswap_64(p[i]);
160         }
161         return buf;
162 }
163
164 /* FIXME: Return the off? */
165 uint64_t tdb_find_nonzero_off(struct tdb_context *tdb,
166                               tdb_off_t base, uint64_t start, uint64_t end)
167 {
168         uint64_t i;
169         const uint64_t *val;
170
171         /* Zero vs non-zero is the same unconverted: minor optimization. */
172         val = tdb_access_read(tdb, base + start * sizeof(tdb_off_t),
173                               (end - start) * sizeof(tdb_off_t), false);
174         if (!val)
175                 return end;
176
177         for (i = 0; i < (end - start); i++) {
178                 if (val[i])
179                         break;
180         }
181         tdb_access_release(tdb, val);
182         return start + i;
183 }
184
185 /* Return first zero offset in num offset array, or num. */
186 uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
187                            uint64_t num)
188 {
189         uint64_t i;
190         const uint64_t *val;
191
192         /* Zero vs non-zero is the same unconverted: minor optimization. */
193         val = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false);
194         if (!val)
195                 return num;
196
197         for (i = 0; i < num; i++) {
198                 if (!val[i])
199                         break;
200         }
201         tdb_access_release(tdb, val);
202         return i;
203 }
204
205 int zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
206 {
207         char buf[8192] = { 0 };
208         void *p = tdb_direct(tdb, off, len);
209         if (p) {
210                 memset(p, 0, len);
211                 return 0;
212         }
213         while (len) {
214                 unsigned todo = len < sizeof(buf) ? len : sizeof(buf);
215                 if (tdb->methods->write(tdb, off, buf, todo) == -1)
216                         return -1;
217                 len -= todo;
218                 off += todo;
219         }
220         return 0;
221 }
222
223 tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
224 {
225         tdb_off_t pad, *ret;
226
227         ret = tdb_get(tdb, off, &pad, sizeof(pad));
228         if (!ret) {
229                 return TDB_OFF_ERR;
230         }
231         return *ret;
232 }
233
234 /* Even on files, we can get partial writes due to signals. */
235 bool tdb_pwrite_all(int fd, const void *buf, size_t len, tdb_off_t off)
236 {
237         while (len) {
238                 ssize_t ret;
239                 ret = pwrite(fd, buf, len, off);
240                 if (ret < 0)
241                         return false;
242                 if (ret == 0) {
243                         errno = ENOSPC;
244                         return false;
245                 }
246                 buf = (char *)buf + ret;
247                 off += ret;
248                 len -= ret;
249         }
250         return true;
251 }
252
253 /* Even on files, we can get partial reads due to signals. */
254 bool tdb_pread_all(int fd, void *buf, size_t len, tdb_off_t off)
255 {
256         while (len) {
257                 ssize_t ret;
258                 ret = pread(fd, buf, len, off);
259                 if (ret < 0)
260                         return false;
261                 if (ret == 0) {
262                         /* ETOOSHORT? */
263                         errno = EWOULDBLOCK;
264                         return false;
265                 }
266                 buf = (char *)buf + ret;
267                 off += ret;
268                 len -= ret;
269         }
270         return true;
271 }
272
/* Read exactly len bytes from the current position of fd into buf,
 * retrying after short reads.
 *
 * Returns true once everything is read.  Returns false if read fails
 * (errno as set by read) or hits end of file early (errno set to
 * EWOULDBLOCK, for want of an ETOOSHORT).
 */
bool tdb_read_all(int fd, void *buf, size_t len)
{
        char *dst = buf;

        while (len != 0) {
                ssize_t got = read(fd, dst, len);

                if (got < 0)
                        return false;
                if (got == 0) {
                        /* ETOOSHORT? */
                        errno = EWOULDBLOCK;
                        return false;
                }
                dst += got;
                len -= got;
        }
        return true;
}
290
291 /* write a lump of data at a specified offset */
292 static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
293                      const void *buf, tdb_len_t len)
294 {
295         if (len == 0) {
296                 return 0;
297         }
298
299         if (tdb->read_only) {
300                 tdb->ecode = TDB_ERR_RDONLY;
301                 return -1;
302         }
303
304         if (tdb->methods->oob(tdb, off + len, 0) != 0)
305                 return -1;
306
307         if (tdb->map_ptr) {
308                 memcpy(off + (char *)tdb->map_ptr, buf, len);
309         } else {
310                 if (!tdb_pwrite_all(tdb->fd, buf, len, off)) {
311                         tdb->ecode = TDB_ERR_IO;
312                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
313                                  "tdb_write failed at %llu len=%llu (%s)\n",
314                                  (long long)off, (long long)len,
315                                  strerror(errno));
316                         return -1;
317                 }
318         }
319         return 0;
320 }
321
322 /* read a lump of data at a specified offset */
323 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
324                     tdb_len_t len)
325 {
326         if (tdb->methods->oob(tdb, off + len, 0) != 0) {
327                 return -1;
328         }
329
330         if (tdb->map_ptr) {
331                 memcpy(buf, off + (char *)tdb->map_ptr, len);
332         } else {
333                 if (!tdb_pread_all(tdb->fd, buf, len, off)) {
334                         /* Ensure ecode is set for log fn. */
335                         tdb->ecode = TDB_ERR_IO;
336                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
337                                  "tdb_read failed at %lld "
338                                  "len=%lld (%s) map_size=%lld\n",
339                                  (long long)off, (long long)len,
340                                  strerror(errno),
341                                  (long long)tdb->map_size);
342                         return -1;
343                 }
344         }
345         return 0;
346 }
347
348 int tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
349                       const void *rec, size_t len)
350 {
351         int ret;
352         if (unlikely((tdb->flags & TDB_CONVERT))) {
353                 void *conv = malloc(len);
354                 if (!conv) {
355                         tdb->ecode = TDB_ERR_OOM;
356                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
357                                  "tdb_write: no memory converting %zu bytes\n",
358                                  len);
359                         return -1;
360                 }
361                 memcpy(conv, rec, len);
362                 ret = tdb->methods->write(tdb, off,
363                                           tdb_convert(tdb, conv, len), len);
364                 free(conv);
365         } else
366                 ret = tdb->methods->write(tdb, off, rec, len);
367
368         return ret;
369 }
370
371 int tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
372                       void *rec, size_t len)
373 {
374         int ret = tdb->methods->read(tdb, off, rec, len);
375         tdb_convert(tdb, rec, len);
376         return ret;
377 }
378
379 int tdb_write_off(struct tdb_context *tdb, tdb_off_t off, tdb_off_t val)
380 {
381         return tdb_write_convert(tdb, off, &val, sizeof(val));
382 }
383
384 static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
385                              tdb_len_t len, unsigned int prefix)
386 {
387         void *buf;
388
389         /* some systems don't like zero length malloc */
390         buf = malloc(prefix + len ? prefix + len : 1);
391         if (unlikely(!buf)) {
392                 tdb->ecode = TDB_ERR_OOM;
393                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
394                          "tdb_alloc_read malloc failed len=%lld\n",
395                          (long long)prefix + len);
396         } else if (unlikely(tdb->methods->read(tdb, offset, buf+prefix, len))) {
397                 free(buf);
398                 buf = NULL;
399         }
400         return buf;
401 }
402
403 /* read a lump of data, allocating the space for it */
404 void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
405 {
406         return _tdb_alloc_read(tdb, offset, len, 0);
407 }
408
409 static int fill(struct tdb_context *tdb,
410                 const void *buf, size_t size,
411                 tdb_off_t off, tdb_len_t len)
412 {
413         while (len) {
414                 size_t n = len > size ? size : len;
415
416                 if (!tdb_pwrite_all(tdb->fd, buf, n, off)) {
417                         tdb->ecode = TDB_ERR_IO;
418                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
419                                  "fill write failed: giving up!\n");
420                         return -1;
421                 }
422                 len -= n;
423                 off += n;
424         }
425         return 0;
426 }
427
428 /* expand a file.  we prefer to use ftruncate, as that is what posix
429   says to use for mmap expansion */
430 static int tdb_expand_file(struct tdb_context *tdb, tdb_len_t addition)
431 {
432         char buf[8192];
433
434         if (tdb->read_only) {
435                 tdb->ecode = TDB_ERR_RDONLY;
436                 return -1;
437         }
438
439         if (tdb->flags & TDB_INTERNAL) {
440                 char *new = realloc(tdb->map_ptr, tdb->map_size + addition);
441                 if (!new) {
442                         tdb->ecode = TDB_ERR_OOM;
443                         return -1;
444                 }
445                 tdb->map_ptr = new;
446                 tdb->map_size += addition;
447         } else {
448                 /* Unmap before trying to write; old TDB claimed OpenBSD had
449                  * problem with this otherwise. */
450                 tdb_munmap(tdb);
451
452                 /* If this fails, we try to fill anyway. */
453                 if (ftruncate(tdb->fd, tdb->map_size + addition))
454                         ;
455
456                 /* now fill the file with something. This ensures that the
457                    file isn't sparse, which would be very bad if we ran out of
458                    disk. This must be done with write, not via mmap */
459                 memset(buf, 0x43, sizeof(buf));
460                 if (fill(tdb, buf, sizeof(buf), tdb->map_size, addition) == -1)
461                         return -1;
462                 tdb->map_size += addition;
463                 tdb_mmap(tdb);
464         }
465         return 0;
466 }
467
468 /* This is only neded for tdb_access_commit, but used everywhere to simplify. */
469 struct tdb_access_hdr {
470         tdb_off_t off;
471         tdb_len_t len;
472         bool convert;
473 };
474
475 const void *tdb_access_read(struct tdb_context *tdb,
476                             tdb_off_t off, tdb_len_t len, bool convert)
477 {
478         const void *ret = NULL; 
479
480         if (likely(!(tdb->flags & TDB_CONVERT)))
481                 ret = tdb_direct(tdb, off, len);
482
483         if (!ret) {
484                 struct tdb_access_hdr *hdr;
485                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
486                 if (hdr) {
487                         ret = hdr + 1;
488                         if (convert)
489                                 tdb_convert(tdb, (void *)ret, len);
490                 }
491         } else
492                 tdb->direct_access++;
493
494         return ret;
495 }
496
497 void *tdb_access_write(struct tdb_context *tdb,
498                        tdb_off_t off, tdb_len_t len, bool convert)
499 {
500         void *ret = NULL;
501
502         if (likely(!(tdb->flags & TDB_CONVERT)))
503                 ret = tdb_direct(tdb, off, len);
504
505         if (!ret) {
506                 struct tdb_access_hdr *hdr;
507                 hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
508                 if (hdr) {
509                         hdr->off = off;
510                         hdr->len = len;
511                         hdr->convert = convert;
512                         ret = hdr + 1;
513                         if (convert)
514                                 tdb_convert(tdb, (void *)ret, len);
515                 }
516         } else
517                 tdb->direct_access++;
518
519         return ret;
520 }
521
522 void tdb_access_release(struct tdb_context *tdb, const void *p)
523 {
524         if (!tdb->map_ptr
525             || (char *)p < (char *)tdb->map_ptr
526             || (char *)p >= (char *)tdb->map_ptr + tdb->map_size)
527                 free((struct tdb_access_hdr *)p - 1);
528         else
529                 tdb->direct_access--;
530 }
531
532 int tdb_access_commit(struct tdb_context *tdb, void *p)
533 {
534         int ret = 0;
535
536         if (!tdb->map_ptr
537             || (char *)p < (char *)tdb->map_ptr
538             || (char *)p >= (char *)tdb->map_ptr + tdb->map_size) {
539                 struct tdb_access_hdr *hdr;
540
541                 hdr = (struct tdb_access_hdr *)p - 1;
542                 if (hdr->convert)
543                         ret = tdb_write_convert(tdb, hdr->off, p, hdr->len);
544                 else
545                         ret = tdb_write(tdb, hdr->off, p, hdr->len);
546                 free(hdr);
547         } else
548                 tdb->direct_access--;
549
550         return ret;
551 }
552
/*
 * Everything between this #if 0 and the matching #endif is compiled out.
 *
 * NOTE(review): the code below uses interfaces that the live code in this
 * file does not (TDB_LOG, DOCONV, CONVERT, tdb->methods->tdb_read /
 * tdb_write / tdb_oob, tdb->header, struct tdb_record) and redefines
 * tdb_write; it will not build as-is if re-enabled.  Presumably kept as
 * reference material from an older implementation - confirm before
 * reviving or deleting it.
 */
#if 0
/* write a lump of data at a specified offset */
/* Unlike the live tdb_write, this variant also refuses to write while
 * traverse_read is set, and retries a short pwrite exactly once. */
static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
                     const void *buf, tdb_len_t len)
{
        if (len == 0) {
                return 0;
        }

        if (tdb->read_only || tdb->traverse_read) {
                tdb->ecode = TDB_ERR_RDONLY;
                return -1;
        }

        if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0)
                return -1;

        if (tdb->map_ptr) {
                memcpy(off + (char *)tdb->map_ptr, buf, len);
        } else {
                ssize_t written = pwrite(tdb->fd, buf, len, off);
                if ((written != (ssize_t)len) && (written != -1)) {
                        /* try once more */
                        tdb->ecode = TDB_ERR_IO;
                        TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only "
                                 "%d of %d bytes at %d, trying once more\n",
                                 (int)written, len, off));
                        written = pwrite(tdb->fd, (const char *)buf+written,
                                         len-written,
                                         off+written);
                }
                if (written == -1) {
                        /* Ensure ecode is set for log fn. */
                        tdb->ecode = TDB_ERR_IO;
                        TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d "
                                 "len=%d (%s)\n", off, len, strerror(errno)));
                        return -1;
                } else if (written != (ssize_t)len) {
                        tdb->ecode = TDB_ERR_IO;
                        TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to "
                                 "write %d bytes at %d in two attempts\n",
                                 len, off));
                        return -1;
                }
        }
        return 0;
}



/*
  do an unlocked scan of the hash table heads to find the next non-zero head. The value
  will then be confirmed with the lock held
*/              
/* Starts at *chain and stores the index it stopped at back into *chain
 * (header.hash_size if no non-zero head was found).  Without a mapping
 * the scan also stops at the first head that cannot be read. */
static void tdb_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
{
        uint32_t h = *chain;
        if (tdb->map_ptr) {
                for (;h < tdb->header.hash_size;h++) {
                        if (0 != *(uint32_t *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) {
                                break;
                        }
                }
        } else {
                uint32_t off=0;
                for (;h < tdb->header.hash_size;h++) {
                        if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) {
                                break;
                        }
                }
        }
        (*chain) = h;
}

/* read/write a tdb_off_t */
/* Reads one tdb_off_t at offset into *d through the methods table. */
int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
{
        return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
}

/* Writes *d at offset through the methods table; works on a local copy so
 * that CONVERT() is not applied to the caller's value. */
int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
{
        tdb_off_t off = *d;
        return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
}


/* read/write a record */
/* Reads a record header at offset, rejects it (ecode TDB_ERR_CORRUPT) if
 * TDB_BAD_MAGIC says so, then bounds-checks rec->next + sizeof(*rec). */
int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
{
        if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
                return -1;
        if (TDB_BAD_MAGIC(rec)) {
                /* Ensure ecode is set for log fn. */
                tdb->ecode = TDB_ERR_CORRUPT;
                TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
                return -1;
        }
        return tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0);
}

/* Writes a copy of *rec at offset; the copy keeps CONVERT() away from the
 * caller's record. */
int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
{
        struct tdb_record r = *rec;
        return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
}
#endif
660
661 static const struct tdb_methods io_methods = {
662         tdb_read,
663         tdb_write,
664         tdb_oob,
665         tdb_expand_file,
666 };
667
668 /*
669   initialise the default methods table
670 */
671 void tdb_io_init(struct tdb_context *tdb)
672 {
673         tdb->methods = &io_methods;
674 }