tdb2: make tests work in parallel.
[ccan] / ccan / tdb2 / tdb1_io.c
1  /*
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              1999-2005
7    Copyright (C) Paul `Rusty' Russell              2000
8    Copyright (C) Jeremy Allison                    2000-2003
9
10      ** NOTE! The following LGPL license applies to the tdb
11      ** library. This does NOT imply that all of Samba is released
12      ** under the LGPL
13
14    This library is free software; you can redistribute it and/or
15    modify it under the terms of the GNU Lesser General Public
16    License as published by the Free Software Foundation; either
17    version 3 of the License, or (at your option) any later version.
18
19    This library is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22    Lesser General Public License for more details.
23
24    You should have received a copy of the GNU Lesser General Public
25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
26 */
27
28
29 #include "tdb1_private.h"
/*
 * Fallback definition of MAX() for when no earlier header supplied one.
 * Both arguments appear more than once in the expansion, so an argument
 * with side effects (e.g. i++) may be evaluated more than once.
 */
#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
33
34 /* check for an out of bounds access - if it is out of bounds then
35    see if the database has been expanded by someone else and expand
36    if necessary
37    note that "len" is the minimum length needed for the db
38 */
39 static int tdb1_oob(struct tdb_context *tdb, tdb1_off_t len, int probe)
40 {
41         struct stat st;
42         if (len <= tdb->file->map_size)
43                 return 0;
44         if (tdb->flags & TDB_INTERNAL) {
45                 if (!probe) {
46                         tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
47                                                 "tdb1_oob len %d beyond internal malloc size %d",
48                                                 (int)len, (int)tdb->file->map_size);
49                 }
50                 return -1;
51         }
52
53         if (fstat(tdb->file->fd, &st) == -1) {
54                 tdb->last_error = TDB_ERR_IO;
55                 return -1;
56         }
57
58         if (st.st_size < (size_t)len) {
59                 if (!probe) {
60                         tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
61                                                 "tdb1_oob len %d beyond eof at %d",
62                                                 (int)len, (int)st.st_size);
63                 }
64                 return -1;
65         }
66
67         /* Unmap, update size, remap */
68         if (tdb1_munmap(tdb) == -1) {
69                 tdb->last_error = TDB_ERR_IO;
70                 return -1;
71         }
72         tdb->file->map_size = st.st_size;
73         tdb1_mmap(tdb);
74         return 0;
75 }
76
77 /* write a lump of data at a specified offset */
78 static int tdb1_write(struct tdb_context *tdb, tdb1_off_t off,
79                      const void *buf, tdb1_len_t len)
80 {
81         if (len == 0) {
82                 return 0;
83         }
84
85         if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) {
86                 tdb->last_error = TDB_ERR_RDONLY;
87                 return -1;
88         }
89
90         if (tdb->tdb1.io->tdb1_oob(tdb, off + len, 0) != 0)
91                 return -1;
92
93         if (tdb->file->map_ptr) {
94                 memcpy(off + (char *)tdb->file->map_ptr, buf, len);
95         } else {
96                 ssize_t written = pwrite(tdb->file->fd, buf, len, off);
97                 if ((written != (ssize_t)len) && (written != -1)) {
98                         tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_WARNING,
99                                    "tdb1_write: wrote only "
100                                    "%d of %d bytes at %d, trying once more",
101                                    (int)written, len, off);
102                         written = pwrite(tdb->file->fd,
103                                          (const char *)buf+written,
104                                          len-written,
105                                          off+written);
106                 }
107                 if (written == -1) {
108                         /* Ensure ecode is set for log fn. */
109                         tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
110                                                 "tdb1_write failed at %d "
111                                                 "len=%d (%s)",
112                                                 off, len, strerror(errno));
113                         return -1;
114                 } else if (written != (ssize_t)len) {
115                         tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
116                                                 "tdb1_write: failed to "
117                                                 "write %d bytes at %d in two attempts",
118                                                 len, off);
119                         return -1;
120                 }
121         }
122         return 0;
123 }
124
/*
 * Endian conversion: we only ever deal with 4 byte quantities.
 *
 * Applies TDB1_BYTEREV in place to each complete 32-bit word in the
 * first "size" bytes of "buf" (trailing bytes beyond a multiple of four
 * are left untouched) and returns "buf".
 */
void *tdb1_convert(void *buf, uint32_t size)
{
	uint32_t *word = (uint32_t *)buf;
	uint32_t remaining = size / 4;

	while (remaining != 0) {
		*word = TDB1_BYTEREV(*word);
		word++;
		remaining--;
	}
	return buf;
}
133
134
135 /* read a lump of data at a specified offset, maybe convert */
136 static int tdb1_read(struct tdb_context *tdb, tdb1_off_t off, void *buf,
137                     tdb1_len_t len, int cv)
138 {
139         if (tdb->tdb1.io->tdb1_oob(tdb, off + len, 0) != 0) {
140                 return -1;
141         }
142
143         if (tdb->file->map_ptr) {
144                 memcpy(buf, off + (char *)tdb->file->map_ptr, len);
145         } else {
146                 ssize_t ret = pread(tdb->file->fd, buf, len, off);
147                 if (ret != (ssize_t)len) {
148                         /* Ensure ecode is set for log fn. */
149                         tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
150                                                 "tdb1_read failed at %d "
151                                                 "len=%d ret=%d (%s) map_size=%d",
152                                                 (int)off, (int)len, (int)ret,
153                                                 strerror(errno),
154                                                 (int)tdb->file->map_size);
155                         return -1;
156                 }
157         }
158         if (cv) {
159                 tdb1_convert(buf, len);
160         }
161         return 0;
162 }
163
164
165
166 /*
167   do an unlocked scan of the hash table heads to find the next non-zero head. The value
168   will then be confirmed with the lock held
169 */
170 static void tdb1_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
171 {
172         uint32_t h = *chain;
173         if (tdb->file->map_ptr) {
174                 for (;h < tdb->tdb1.header.hash_size;h++) {
175                         if (0 != *(uint32_t *)(TDB1_HASH_TOP(h) + (unsigned char *)tdb->file->map_ptr)) {
176                                 break;
177                         }
178                 }
179         } else {
180                 uint32_t off=0;
181                 for (;h < tdb->tdb1.header.hash_size;h++) {
182                         if (tdb1_ofs_read(tdb, TDB1_HASH_TOP(h), &off) != 0 || off != 0) {
183                                 break;
184                         }
185                 }
186         }
187         (*chain) = h;
188 }
189
190
191 int tdb1_munmap(struct tdb_context *tdb)
192 {
193         if (tdb->flags & TDB_INTERNAL)
194                 return 0;
195
196 #if HAVE_MMAP
197         if (tdb->file->map_ptr) {
198                 int ret;
199
200                 ret = munmap(tdb->file->map_ptr, tdb->file->map_size);
201                 if (ret != 0)
202                         return ret;
203         }
204 #endif
205         tdb->file->map_ptr = NULL;
206         return 0;
207 }
208
209 void tdb1_mmap(struct tdb_context *tdb)
210 {
211         if (tdb->flags & TDB_INTERNAL)
212                 return;
213
214 #if HAVE_MMAP
215         if (!(tdb->flags & TDB_NOMMAP)) {
216                 int mmap_flags;
217                 if ((tdb->open_flags & O_ACCMODE) == O_RDONLY)
218                         mmap_flags = PROT_READ;
219                 else
220                         mmap_flags = PROT_READ | PROT_WRITE;
221
222                 tdb->file->map_ptr = mmap(NULL, tdb->file->map_size,
223                                     mmap_flags,
224                                     MAP_SHARED|MAP_FILE, tdb->file->fd, 0);
225
226                 /*
227                  * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
228                  */
229
230                 if (tdb->file->map_ptr == MAP_FAILED) {
231                         tdb->file->map_ptr = NULL;
232                         tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_WARNING,
233                                    "tdb1_mmap failed for size %d (%s)",
234                                    tdb->file->map_size, strerror(errno));
235                 }
236         } else {
237                 tdb->file->map_ptr = NULL;
238         }
239 #else
240         tdb->file->map_ptr = NULL;
241 #endif
242 }
243
244 /* expand a file.  we prefer to use ftruncate, as that is what posix
245   says to use for mmap expansion */
246 static int tdb1_expand_file(struct tdb_context *tdb, tdb1_off_t size, tdb1_off_t addition)
247 {
248         char buf[8192];
249
250         if ((tdb->flags & TDB_RDONLY) || tdb->tdb1.traverse_read) {
251                 tdb->last_error = TDB_ERR_RDONLY;
252                 return -1;
253         }
254
255         if (ftruncate(tdb->file->fd, size+addition) == -1) {
256                 char b = 0;
257                 ssize_t written = pwrite(tdb->file->fd, &b, 1,
258                                          (size+addition) - 1);
259                 if (written == 0) {
260                         /* try once more, potentially revealing errno */
261                         written = pwrite(tdb->file->fd, &b, 1,
262                                          (size+addition) - 1);
263                 }
264                 if (written == 0) {
265                         /* again - give up, guessing errno */
266                         errno = ENOSPC;
267                 }
268                 if (written != 1) {
269                         tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
270                                                 "expand_file to %d failed (%s)",
271                                                 size+addition,
272                                                 strerror(errno));
273                         return -1;
274                 }
275         }
276
277         /* now fill the file with something. This ensures that the
278            file isn't sparse, which would be very bad if we ran out of
279            disk. This must be done with write, not via mmap */
280         memset(buf, TDB1_PAD_BYTE, sizeof(buf));
281         while (addition) {
282                 size_t n = addition>sizeof(buf)?sizeof(buf):addition;
283                 ssize_t written = pwrite(tdb->file->fd, buf, n, size);
284                 if (written == 0) {
285                         /* prevent infinite loops: try _once_ more */
286                         written = pwrite(tdb->file->fd, buf, n, size);
287                 }
288                 if (written == 0) {
289                         /* give up, trying to provide a useful errno */
290                         tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
291                                                 "expand_file write "
292                                                 "returned 0 twice: giving up!");
293                         errno = ENOSPC;
294                         return -1;
295                 } else if (written == -1) {
296                         tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
297                                                 "expand_file write of "
298                                                 "%d bytes failed (%s)", (int)n,
299                                                 strerror(errno));
300                         return -1;
301                 } else if (written != n) {
302                         tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_WARNING,
303                                    "expand_file: wrote "
304                                    "only %d of %d bytes - retrying",
305                                    (int)written, (int)n);
306                 }
307                 addition -= written;
308                 size += written;
309         }
310         tdb->stats.expands++;
311         return 0;
312 }
313
314
315 /* expand the database at least size bytes by expanding the underlying
316    file and doing the mmap again if necessary */
317 int tdb1_expand(struct tdb_context *tdb, tdb1_off_t size)
318 {
319         struct tdb1_record rec;
320         tdb1_off_t offset, new_size, top_size, map_size;
321
322         if (tdb1_lock(tdb, -1, F_WRLCK) == -1) {
323                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
324                            "lock failed in tdb1_expand");
325                 return -1;
326         }
327
328         /* must know about any previous expansions by another process */
329         tdb->tdb1.io->tdb1_oob(tdb, tdb->file->map_size + 1, 1);
330
331         /* limit size in order to avoid using up huge amounts of memory for
332          * in memory tdbs if an oddball huge record creeps in */
333         if (size > 100 * 1024) {
334                 top_size = tdb->file->map_size + size * 2;
335         } else {
336                 top_size = tdb->file->map_size + size * 100;
337         }
338
339         /* always make room for at least top_size more records, and at
340            least 25% more space. if the DB is smaller than 100MiB,
341            otherwise grow it by 10% only. */
342         if (tdb->file->map_size > 100 * 1024 * 1024) {
343                 map_size = tdb->file->map_size * 1.10;
344         } else {
345                 map_size = tdb->file->map_size * 1.25;
346         }
347
348         /* Round the database up to a multiple of the page size */
349         new_size = MAX(top_size, map_size);
350         size = TDB1_ALIGN(new_size, tdb->tdb1.page_size) - tdb->file->map_size;
351
352         if (!(tdb->flags & TDB_INTERNAL))
353                 tdb1_munmap(tdb);
354
355         /*
356          * We must ensure the file is unmapped before doing this
357          * to ensure consistency with systems like OpenBSD where
358          * writes and mmaps are not consistent.
359          */
360
361         /* expand the file itself */
362         if (!(tdb->flags & TDB_INTERNAL)) {
363                 if (tdb->tdb1.io->tdb1_expand_file(tdb, tdb->file->map_size, size) != 0)
364                         goto fail;
365         }
366
367         tdb->file->map_size += size;
368
369         if (tdb->flags & TDB_INTERNAL) {
370                 char *new_map_ptr = (char *)realloc(tdb->file->map_ptr,
371                                                     tdb->file->map_size);
372                 if (!new_map_ptr) {
373                         tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM,
374                                                      TDB_LOG_ERROR,
375                                                      "tdb1_expand: no memory");
376                         tdb->file->map_size -= size;
377                         goto fail;
378                 }
379                 tdb->file->map_ptr = new_map_ptr;
380         } else {
381                 /*
382                  * We must ensure the file is remapped before adding the space
383                  * to ensure consistency with systems like OpenBSD where
384                  * writes and mmaps are not consistent.
385                  */
386
387                 /* We're ok if the mmap fails as we'll fallback to read/write */
388                 tdb1_mmap(tdb);
389         }
390
391         /* form a new freelist record */
392         memset(&rec,'\0',sizeof(rec));
393         rec.rec_len = size - sizeof(rec);
394
395         /* link it into the free list */
396         offset = tdb->file->map_size - size;
397         if (tdb1_free(tdb, offset, &rec) == -1)
398                 goto fail;
399
400         tdb1_unlock(tdb, -1, F_WRLCK);
401         return 0;
402  fail:
403         tdb1_unlock(tdb, -1, F_WRLCK);
404         return -1;
405 }
406
407 /* read/write a tdb1_off_t */
408 int tdb1_ofs_read(struct tdb_context *tdb, tdb1_off_t offset, tdb1_off_t *d)
409 {
410         return tdb->tdb1.io->tdb1_read(tdb, offset, (char*)d, sizeof(*d), TDB1_DOCONV());
411 }
412
413 int tdb1_ofs_write(struct tdb_context *tdb, tdb1_off_t offset, tdb1_off_t *d)
414 {
415         tdb1_off_t off = *d;
416         return tdb->tdb1.io->tdb1_write(tdb, offset, TDB1_CONV(off), sizeof(*d));
417 }
418
419
420 /* read a lump of data, allocating the space for it */
421 unsigned char *tdb1_alloc_read(struct tdb_context *tdb, tdb1_off_t offset, tdb1_len_t len)
422 {
423         unsigned char *buf;
424
425         /* some systems don't like zero length malloc */
426
427         if (!(buf = (unsigned char *)malloc(len ? len : 1))) {
428                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
429                                              "tdb1_alloc_read malloc failed"
430                                              " len=%d (%s)",
431                                              len, strerror(errno));
432                 return NULL;
433         }
434         if (tdb->tdb1.io->tdb1_read(tdb, offset, buf, len, 0) == -1) {
435                 SAFE_FREE(buf);
436                 return NULL;
437         }
438         return buf;
439 }
440
441 /* Give a piece of tdb data to a parser */
442 enum TDB_ERROR tdb1_parse_data(struct tdb_context *tdb, TDB_DATA key,
443                                tdb1_off_t offset, tdb1_len_t len,
444                                enum TDB_ERROR (*parser)(TDB_DATA key,
445                                                         TDB_DATA data,
446                                                         void *private_data),
447                                void *private_data)
448 {
449         TDB_DATA data;
450         enum TDB_ERROR result;
451
452         data.dsize = len;
453
454         if ((tdb->tdb1.transaction == NULL) && (tdb->file->map_ptr != NULL)) {
455                 /*
456                  * Optimize by avoiding the malloc/memcpy/free, point the
457                  * parser directly at the mmap area.
458                  */
459                 if (tdb->tdb1.io->tdb1_oob(tdb, offset+len, 0) != 0) {
460                         return tdb->last_error;
461                 }
462                 data.dptr = offset + (unsigned char *)tdb->file->map_ptr;
463                 return parser(key, data, private_data);
464         }
465
466         if (!(data.dptr = tdb1_alloc_read(tdb, offset, len))) {
467                 return tdb->last_error;
468         }
469
470         result = parser(key, data, private_data);
471         free(data.dptr);
472         return result;
473 }
474
475 /* read/write a record */
476 int tdb1_rec_read(struct tdb_context *tdb, tdb1_off_t offset, struct tdb1_record *rec)
477 {
478         if (tdb->tdb1.io->tdb1_read(tdb, offset, rec, sizeof(*rec),TDB1_DOCONV()) == -1)
479                 return -1;
480         if (TDB1_BAD_MAGIC(rec)) {
481                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
482                                         "tdb1_rec_read bad magic 0x%x at offset=%d",
483                                         rec->magic, offset);
484                 return -1;
485         }
486         return tdb->tdb1.io->tdb1_oob(tdb, rec->next+sizeof(*rec), 0);
487 }
488
489 int tdb1_rec_write(struct tdb_context *tdb, tdb1_off_t offset, struct tdb1_record *rec)
490 {
491         struct tdb1_record r = *rec;
492         return tdb->tdb1.io->tdb1_write(tdb, offset, TDB1_CONV(r), sizeof(r));
493 }
494
/*
 * Default I/O method table, installed by tdb1_io_init().
 * The initializer is positional, so the entries must stay in the same
 * order as the members of struct tdb1_methods (declared elsewhere).
 */
static const struct tdb1_methods io1_methods = {
	tdb1_read,
	tdb1_write,
	tdb1_next_hash_chain,
	tdb1_oob,
	tdb1_expand_file,
};
502
/*
  Initialise the default methods table: point tdb->tdb1.io at the
  file-local io1_methods so that calls made through tdb->tdb1.io use the
  plain read/write/oob/expand implementations in this file.
*/
void tdb1_io_init(struct tdb_context *tdb)
{
	tdb->tdb1.io = &io1_methods;
}
510
511 enum TDB_ERROR tdb1_probe_length(struct tdb_context *tdb)
512 {
513         tdb->last_error = TDB_SUCCESS;
514         tdb->tdb1.io->tdb1_oob(tdb, tdb->file->map_size + 1, true);
515         return tdb->last_error;
516 }