]> git.ozlabs.org Git - ccan/blob - junkcode/rusty@rustcorp.com.au-ntdb/open.c
io: fix another leak path for always array.
[ccan] / junkcode / rusty@rustcorp.com.au-ntdb / open.c
1  /*
2    Trivial Database 2: opening and closing TDBs
3    Copyright (C) Rusty Russell 2010
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/build_assert/build_assert.h>
20
/* all tdbs, to detect double-opens (fcntl file locks don't nest!) */
22 static struct ntdb_context *tdbs = NULL;
23
24 static struct ntdb_file *find_file(dev_t device, ino_t ino)
25 {
26         struct ntdb_context *i;
27
28         for (i = tdbs; i; i = i->next) {
29                 if (i->file->device == device && i->file->inode == ino) {
30                         i->file->refcnt++;
31                         return i->file;
32                 }
33         }
34         return NULL;
35 }
36
/*
 * Read exactly len bytes from fd into buf.
 *
 * Keeps calling read() until the whole request is satisfied.  A read()
 * interrupted by a signal (EINTR) is retried rather than reported as a
 * failure.
 *
 * Returns true once len bytes have been stored.  Returns false on a read
 * error (errno as set by read()) or on end-of-file before len bytes were
 * available (errno is then set to EWOULDBLOCK).
 */
static bool read_all(int fd, void *buf, size_t len)
{
        char *dst = buf;

        while (len) {
                ssize_t ret = read(fd, dst, len);

                if (ret < 0) {
                        /* A signal arrived before any data: just retry. */
                        if (errno == EINTR)
                                continue;
                        return false;
                }
                if (ret == 0) {
                        /* ETOOSHORT? */
                        errno = EWOULDBLOCK;
                        return false;
                }
                dst += ret;
                len -= ret;
        }
        return true;
}
54
55 static uint32_t random_number(struct ntdb_context *ntdb)
56 {
57         int fd;
58         uint32_t ret = 0;
59         struct timeval now;
60
61         fd = open("/dev/urandom", O_RDONLY);
62         if (fd >= 0) {
63                 if (read_all(fd, &ret, sizeof(ret))) {
64                         close(fd);
65                         return ret;
66                 }
67                 close(fd);
68         }
69         /* FIXME: Untested!  Based on Wikipedia protocol description! */
70         fd = open("/dev/egd-pool", O_RDWR);
71         if (fd >= 0) {
72                 /* Command is 1, next byte is size we want to read. */
73                 char cmd[2] = { 1, sizeof(uint32_t) };
74                 if (write(fd, cmd, sizeof(cmd)) == sizeof(cmd)) {
75                         char reply[1 + sizeof(uint32_t)];
76                         int r = read(fd, reply, sizeof(reply));
77                         if (r > 1) {
78                                 /* Copy at least some bytes. */
79                                 memcpy(&ret, reply+1, r - 1);
80                                 if (reply[0] == sizeof(uint32_t)
81                                     && r == sizeof(reply)) {
82                                         close(fd);
83                                         return ret;
84                                 }
85                         }
86                 }
87                 close(fd);
88         }
89
90         /* Fallback: pid and time. */
91         gettimeofday(&now, NULL);
92         ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec;
93         ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
94                    "ntdb_open: random from getpid and time");
95         return ret;
96 }
97
98 static void ntdb_context_init(struct ntdb_context *ntdb)
99 {
100         /* Initialize the NTDB fields here */
101         ntdb_io_init(ntdb);
102         ntdb->transaction = NULL;
103         ntdb->access = NULL;
104 }
105
/* initialise a new database:
 *
 *      struct ntdb_header;
 *      struct {
 *              struct ntdb_used_record hash_header;
 *              ntdb_off_t hash_buckets[1 << ntdb->hash_bits];
 *      } hash;
 *      struct ntdb_freetable ftable;
 *      struct {
 *              struct ntdb_free_record free_header;
 *              char forty_three[...];
 *      } remainder;
 *
 * NEW_DATABASE_HDR_SIZE(hbits) is the number of bytes taken by everything
 * above up to and including free_header, for a hash table of 1 << hbits
 * buckets.  The argument is parenthesized so that any expression may be
 * passed without changing what gets shifted.
 */
#define NEW_DATABASE_HDR_SIZE(hbits)                                    \
        (sizeof(struct ntdb_header)                                     \
         + sizeof(struct ntdb_used_record)                              \
         + (sizeof(ntdb_off_t) << (hbits))                              \
         + sizeof(struct ntdb_freetable)                                \
         + sizeof(struct ntdb_free_record))
124
125 static enum NTDB_ERROR ntdb_new_database(struct ntdb_context *ntdb,
126                                          struct ntdb_attribute_seed *seed,
127                                          struct ntdb_header *rhdr)
128 {
129         /* We make it up in memory, then write it out if not internal */
130         struct ntdb_freetable *ftable;
131         struct ntdb_used_record *htable;
132         struct ntdb_header *hdr;
133         struct ntdb_free_record *remainder;
134         char *mem;
135         unsigned int magic_len;
136         ssize_t rlen;
137         size_t dbsize, hashsize, hdrsize, remaindersize;
138         enum NTDB_ERROR ecode;
139
140         hashsize = sizeof(ntdb_off_t) << ntdb->hash_bits;
141
142         /* Always make db a multiple of NTDB_PGSIZE */
143         hdrsize = NEW_DATABASE_HDR_SIZE(ntdb->hash_bits);
144         dbsize = (hdrsize + NTDB_PGSIZE-1) & ~(NTDB_PGSIZE-1);
145
146         mem = ntdb->alloc_fn(ntdb, dbsize, ntdb->alloc_data);
147         if (!mem) {
148                 return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
149                                    "ntdb_new_database: failed to allocate");
150         }
151
152         hdr = (void *)mem;
153         htable = (void *)(mem + sizeof(*hdr));
154         ftable = (void *)(mem + sizeof(*hdr) + sizeof(*htable) + hashsize);
155         remainder = (void *)(mem + sizeof(*hdr) + sizeof(*htable) + hashsize
156                              + sizeof(*ftable));
157
158         /* Fill in the header */
159         hdr->version = NTDB_VERSION;
160         if (seed)
161                 hdr->hash_seed = seed->seed;
162         else
163                 hdr->hash_seed = random_number(ntdb);
164         hdr->hash_test = NTDB_HASH_MAGIC;
165         hdr->hash_test = ntdb->hash_fn(&hdr->hash_test,
166                                        sizeof(hdr->hash_test),
167                                        hdr->hash_seed,
168                                        ntdb->hash_data);
169         hdr->hash_bits = ntdb->hash_bits;
170         hdr->recovery = 0;
171         hdr->features_used = hdr->features_offered = NTDB_FEATURE_MASK;
172         hdr->seqnum = 0;
173         hdr->capabilities = 0;
174         memset(hdr->reserved, 0, sizeof(hdr->reserved));
175
176         /* Hash is all zero after header. */
177         set_header(NULL, htable, NTDB_HTABLE_MAGIC, 0, hashsize, hashsize);
178         memset(htable + 1, 0, hashsize);
179
180         /* Free is empty. */
181         hdr->free_table = (char *)ftable - (char *)hdr;
182         memset(ftable, 0, sizeof(*ftable));
183         ecode = set_header(NULL, &ftable->hdr, NTDB_FTABLE_MAGIC, 0,
184                            sizeof(*ftable) - sizeof(ftable->hdr),
185                            sizeof(*ftable) - sizeof(ftable->hdr));
186         if (ecode != NTDB_SUCCESS) {
187                 goto out;
188         }
189
190         /* Rest of database is a free record, containing junk. */
191         remaindersize = dbsize - hdrsize;
192         remainder->ftable_and_len
193                 = (remaindersize + sizeof(*remainder)
194                    - sizeof(struct ntdb_used_record));
195         remainder->next = 0;
196         remainder->magic_and_prev
197                 = (NTDB_FREE_MAGIC << (64-NTDB_OFF_UPPER_STEAL))
198                 | ((char *)remainder - (char *)hdr);
199         memset(remainder + 1, 0x43, remaindersize);
200
201         /* Put in our single free entry. */
202         ftable->buckets[size_to_bucket(remaindersize)] =
203                 (char *)remainder - (char *)hdr;
204
205         /* Magic food */
206         memset(hdr->magic_food, 0, sizeof(hdr->magic_food));
207         strcpy(hdr->magic_food, NTDB_MAGIC_FOOD);
208
209         /* This creates an endian-converted database, as if read from disk */
210         magic_len = sizeof(hdr->magic_food);
211         ntdb_convert(ntdb, (char *)hdr + magic_len, hdrsize - magic_len);
212
213         /* Return copy of header. */
214         *rhdr = *hdr;
215
216         if (ntdb->flags & NTDB_INTERNAL) {
217                 ntdb->file->map_size = dbsize;
218                 ntdb->file->map_ptr = hdr;
219                 return NTDB_SUCCESS;
220         }
221         if (lseek(ntdb->file->fd, 0, SEEK_SET) == -1) {
222                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
223                                     "ntdb_new_database:"
224                                     " failed to seek: %s", strerror(errno));
225                 goto out;
226         }
227
228         if (ftruncate(ntdb->file->fd, 0) == -1) {
229                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
230                                     "ntdb_new_database:"
231                                     " failed to truncate: %s", strerror(errno));
232                 goto out;
233         }
234
235         rlen = write(ntdb->file->fd, hdr, dbsize);
236         if (rlen != dbsize) {
237                 if (rlen >= 0)
238                         errno = ENOSPC;
239                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
240                                     "ntdb_new_database: %zi writing header: %s",
241                                     rlen, strerror(errno));
242                 goto out;
243         }
244
245 out:
246         ntdb->free_fn(hdr, ntdb->alloc_data);
247         return ecode;
248 }
249
250 static enum NTDB_ERROR ntdb_new_file(struct ntdb_context *ntdb)
251 {
252         ntdb->file = ntdb->alloc_fn(NULL, sizeof(*ntdb->file), ntdb->alloc_data);
253         if (!ntdb->file)
254                 return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
255                                   "ntdb_open: cannot alloc ntdb_file structure");
256         ntdb->file->num_lockrecs = 0;
257         ntdb->file->lockrecs = NULL;
258         ntdb->file->allrecord_lock.count = 0;
259         ntdb->file->refcnt = 1;
260         ntdb->file->map_ptr = NULL;
261         ntdb->file->direct_count = 0;
262         ntdb->file->old_mmaps = NULL;
263         return NTDB_SUCCESS;
264 }
265
266 _PUBLIC_ enum NTDB_ERROR ntdb_set_attribute(struct ntdb_context *ntdb,
267                                  const union ntdb_attribute *attr)
268 {
269         switch (attr->base.attr) {
270         case NTDB_ATTRIBUTE_LOG:
271                 ntdb->log_fn = attr->log.fn;
272                 ntdb->log_data = attr->log.data;
273                 break;
274         case NTDB_ATTRIBUTE_HASH:
275         case NTDB_ATTRIBUTE_SEED:
276         case NTDB_ATTRIBUTE_OPENHOOK:
277         case NTDB_ATTRIBUTE_HASHSIZE:
278                 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
279                                    NTDB_LOG_USE_ERROR,
280                                    "ntdb_set_attribute:"
281                                    " cannot set %s after opening",
282                                    attr->base.attr == NTDB_ATTRIBUTE_HASH
283                                    ? "NTDB_ATTRIBUTE_HASH"
284                                    : attr->base.attr == NTDB_ATTRIBUTE_SEED
285                                    ? "NTDB_ATTRIBUTE_SEED"
286                                    : attr->base.attr == NTDB_ATTRIBUTE_OPENHOOK
287                                    ? "NTDB_ATTRIBUTE_OPENHOOK"
288                                    : "NTDB_ATTRIBUTE_HASHSIZE");
289         case NTDB_ATTRIBUTE_STATS:
290                 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
291                                    NTDB_LOG_USE_ERROR,
292                                    "ntdb_set_attribute:"
293                                    " cannot set NTDB_ATTRIBUTE_STATS");
294         case NTDB_ATTRIBUTE_FLOCK:
295                 ntdb->lock_fn = attr->flock.lock;
296                 ntdb->unlock_fn = attr->flock.unlock;
297                 ntdb->lock_data = attr->flock.data;
298                 break;
299         case NTDB_ATTRIBUTE_ALLOCATOR:
300                 ntdb->alloc_fn = attr->alloc.alloc;
301                 ntdb->expand_fn = attr->alloc.expand;
302                 ntdb->free_fn = attr->alloc.free;
303                 ntdb->alloc_data = attr->alloc.priv_data;
304                 break;
305         default:
306                 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
307                                    NTDB_LOG_USE_ERROR,
308                                    "ntdb_set_attribute:"
309                                    " unknown attribute type %u",
310                                    attr->base.attr);
311         }
312         return NTDB_SUCCESS;
313 }
314
315 _PUBLIC_ enum NTDB_ERROR ntdb_get_attribute(struct ntdb_context *ntdb,
316                                  union ntdb_attribute *attr)
317 {
318         switch (attr->base.attr) {
319         case NTDB_ATTRIBUTE_LOG:
320                 if (!ntdb->log_fn)
321                         return NTDB_ERR_NOEXIST;
322                 attr->log.fn = ntdb->log_fn;
323                 attr->log.data = ntdb->log_data;
324                 break;
325         case NTDB_ATTRIBUTE_HASH:
326                 attr->hash.fn = ntdb->hash_fn;
327                 attr->hash.data = ntdb->hash_data;
328                 break;
329         case NTDB_ATTRIBUTE_SEED:
330                 attr->seed.seed = ntdb->hash_seed;
331                 break;
332         case NTDB_ATTRIBUTE_OPENHOOK:
333                 if (!ntdb->openhook)
334                         return NTDB_ERR_NOEXIST;
335                 attr->openhook.fn = ntdb->openhook;
336                 attr->openhook.data = ntdb->openhook_data;
337                 break;
338         case NTDB_ATTRIBUTE_STATS: {
339                 size_t size = attr->stats.size;
340                 if (size > ntdb->stats.size)
341                         size = ntdb->stats.size;
342                 memcpy(&attr->stats, &ntdb->stats, size);
343                 break;
344         }
345         case NTDB_ATTRIBUTE_FLOCK:
346                 attr->flock.lock = ntdb->lock_fn;
347                 attr->flock.unlock = ntdb->unlock_fn;
348                 attr->flock.data = ntdb->lock_data;
349                 break;
350         case NTDB_ATTRIBUTE_ALLOCATOR:
351                 attr->alloc.alloc = ntdb->alloc_fn;
352                 attr->alloc.expand = ntdb->expand_fn;
353                 attr->alloc.free = ntdb->free_fn;
354                 attr->alloc.priv_data = ntdb->alloc_data;
355                 break;
356         case NTDB_ATTRIBUTE_HASHSIZE:
357                 attr->hashsize.size = 1 << ntdb->hash_bits;
358                 break;
359         default:
360                 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
361                                    NTDB_LOG_USE_ERROR,
362                                    "ntdb_get_attribute:"
363                                    " unknown attribute type %u",
364                                    attr->base.attr);
365         }
366         attr->base.next = NULL;
367         return NTDB_SUCCESS;
368 }
369
370 _PUBLIC_ void ntdb_unset_attribute(struct ntdb_context *ntdb,
371                          enum ntdb_attribute_type type)
372 {
373         switch (type) {
374         case NTDB_ATTRIBUTE_LOG:
375                 ntdb->log_fn = NULL;
376                 break;
377         case NTDB_ATTRIBUTE_OPENHOOK:
378                 ntdb->openhook = NULL;
379                 break;
380         case NTDB_ATTRIBUTE_HASH:
381         case NTDB_ATTRIBUTE_SEED:
382                 ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
383                            "ntdb_unset_attribute: cannot unset %s after opening",
384                            type == NTDB_ATTRIBUTE_HASH
385                            ? "NTDB_ATTRIBUTE_HASH"
386                            : "NTDB_ATTRIBUTE_SEED");
387                 break;
388         case NTDB_ATTRIBUTE_STATS:
389                 ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
390                            NTDB_LOG_USE_ERROR,
391                            "ntdb_unset_attribute:"
392                            "cannot unset NTDB_ATTRIBUTE_STATS");
393                 break;
394         case NTDB_ATTRIBUTE_FLOCK:
395                 ntdb->lock_fn = ntdb_fcntl_lock;
396                 ntdb->unlock_fn = ntdb_fcntl_unlock;
397                 break;
398         default:
399                 ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
400                            NTDB_LOG_USE_ERROR,
401                            "ntdb_unset_attribute: unknown attribute type %u",
402                            type);
403         }
404 }
405
406 /* The top three bits of the capability tell us whether it matters. */
407 enum NTDB_ERROR unknown_capability(struct ntdb_context *ntdb, const char *caller,
408                                   ntdb_off_t type)
409 {
410         if (type & NTDB_CAP_NOOPEN) {
411                 return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
412                                   "%s: file has unknown capability %llu",
413                                   caller, type & NTDB_CAP_NOOPEN);
414         }
415
416         if ((type & NTDB_CAP_NOWRITE) && !(ntdb->flags & NTDB_RDONLY)) {
417                 return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_ERROR,
418                                   "%s: file has unknown capability %llu"
419                                   " (cannot write to it)",
420                                   caller, type & NTDB_CAP_NOOPEN);
421         }
422
423         if (type & NTDB_CAP_NOCHECK) {
424                 ntdb->flags |= NTDB_CANT_CHECK;
425         }
426         return NTDB_SUCCESS;
427 }
428
429 static enum NTDB_ERROR capabilities_ok(struct ntdb_context *ntdb,
430                                       ntdb_off_t capabilities)
431 {
432         ntdb_off_t off, next;
433         enum NTDB_ERROR ecode = NTDB_SUCCESS;
434         const struct ntdb_capability *cap;
435
436         /* Check capability list. */
437         for (off = capabilities; off && ecode == NTDB_SUCCESS; off = next) {
438                 cap = ntdb_access_read(ntdb, off, sizeof(*cap), true);
439                 if (NTDB_PTR_IS_ERR(cap)) {
440                         return NTDB_PTR_ERR(cap);
441                 }
442
443                 switch (cap->type & NTDB_CAP_TYPE_MASK) {
444                 /* We don't understand any capabilities (yet). */
445                 default:
446                         ecode = unknown_capability(ntdb, "ntdb_open", cap->type);
447                 }
448                 next = cap->next;
449                 ntdb_access_release(ntdb, cap);
450         }
451         return ecode;
452 }
453
/* Default allocator: plain malloc(); owner and priv_data are not used. */
static void *default_alloc(const void *owner, size_t len, void *priv_data)
{
        (void)owner;
        (void)priv_data;

        return malloc(len);
}
458
/* Default resizer: plain realloc(); priv_data is not used. */
static void *default_expand(void *ptr, size_t len, void *priv_data)
{
        (void)priv_data;

        return realloc(ptr, len);
}
463
/* Default release function: plain free(); priv_data is not used. */
static void default_free(void *ptr, void *priv_data)
{
        (void)priv_data;

        free(ptr);
}
468
469 /* First allocation needs manual search of attributes. */
470 static struct ntdb_context *alloc_ntdb(const union ntdb_attribute *attr,
471                                        const char *name)
472 {
473         size_t len = sizeof(struct ntdb_context) + strlen(name) + 1;
474
475         while (attr) {
476                 if  (attr->base.attr == NTDB_ATTRIBUTE_ALLOCATOR) {
477                         return attr->alloc.alloc(NULL, len,
478                                                  attr->alloc.priv_data);
479                 }
480                 attr = attr->base.next;
481         }
482         return default_alloc(NULL, len, NULL);
483 }
484
/*
 * Return the smallest number of bits b, with b >= 1, such that
 * (1 << b) >= size.
 *
 * Sizes of 0, 1 and 2 all yield 1.  Sizes above 2^63 cannot be reached by
 * any 64-bit power of two; 64 is returned for those, and the loop stops
 * there so that the shift count never reaches the width of the type.
 */
static unsigned int next_pow2(uint64_t size)
{
        unsigned int bits = 1;

        while (bits < 64 && (1ULL << bits) < size)
                bits++;
        return bits;
}
493
494 _PUBLIC_ struct ntdb_context *ntdb_open(const char *name, int ntdb_flags,
495                                         int open_flags, mode_t mode,
496                                         union ntdb_attribute *attr)
497 {
498         struct ntdb_context *ntdb;
499         struct stat st;
500         int saved_errno = 0;
501         uint64_t hash_test;
502         unsigned v;
503         ssize_t rlen;
504         struct ntdb_header hdr;
505         struct ntdb_attribute_seed *seed = NULL;
506         ntdb_bool_err berr;
507         enum NTDB_ERROR ecode;
508         int openlock;
509
510         ntdb = alloc_ntdb(attr, name);
511         if (!ntdb) {
512                 /* Can't log this */
513                 errno = ENOMEM;
514                 return NULL;
515         }
516         /* Set name immediately for logging functions. */
517         ntdb->name = strcpy((char *)(ntdb + 1), name);
518         ntdb->flags = ntdb_flags;
519         ntdb->log_fn = NULL;
520         ntdb->open_flags = open_flags;
521         ntdb->file = NULL;
522         ntdb->openhook = NULL;
523         ntdb->lock_fn = ntdb_fcntl_lock;
524         ntdb->unlock_fn = ntdb_fcntl_unlock;
525         ntdb->hash_fn = ntdb_jenkins_hash;
526         memset(&ntdb->stats, 0, sizeof(ntdb->stats));
527         ntdb->stats.base.attr = NTDB_ATTRIBUTE_STATS;
528         ntdb->stats.size = sizeof(ntdb->stats);
529         ntdb->alloc_fn = default_alloc;
530         ntdb->expand_fn = default_expand;
531         ntdb->free_fn = default_free;
532         ntdb->hash_bits = NTDB_DEFAULT_HBITS; /* 64k of hash by default. */
533
534         while (attr) {
535                 switch (attr->base.attr) {
536                 case NTDB_ATTRIBUTE_HASH:
537                         ntdb->hash_fn = attr->hash.fn;
538                         ntdb->hash_data = attr->hash.data;
539                         break;
540                 case NTDB_ATTRIBUTE_SEED:
541                         seed = &attr->seed;
542                         break;
543                 case NTDB_ATTRIBUTE_OPENHOOK:
544                         ntdb->openhook = attr->openhook.fn;
545                         ntdb->openhook_data = attr->openhook.data;
546                         break;
547                 case NTDB_ATTRIBUTE_HASHSIZE:
548                         ntdb->hash_bits = next_pow2(attr->hashsize.size);
549                         if (ntdb->hash_bits > 31) {
550                                 ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
551                                                     NTDB_LOG_USE_ERROR,
552                                                     "ntdb_open: hash_size %u"
553                                                     " too large",
554                                                     attr->hashsize.size);
555                                 goto fail;
556                         }
557                         break;
558                 default:
559                         /* These are set as normal. */
560                         ecode = ntdb_set_attribute(ntdb, attr);
561                         if (ecode != NTDB_SUCCESS)
562                                 goto fail;
563                 }
564                 attr = attr->base.next;
565         }
566
567         if (ntdb_flags & ~(NTDB_INTERNAL | NTDB_NOLOCK | NTDB_NOMMAP | NTDB_CONVERT
568                           | NTDB_NOSYNC | NTDB_SEQNUM | NTDB_ALLOW_NESTING
569                           | NTDB_RDONLY)) {
570                 ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
571                                    "ntdb_open: unknown flags %u", ntdb_flags);
572                 goto fail;
573         }
574
575         if (seed) {
576                 if (!(ntdb_flags & NTDB_INTERNAL) && !(open_flags & O_CREAT)) {
577                         ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
578                                            NTDB_LOG_USE_ERROR,
579                                            "ntdb_open:"
580                                            " cannot set NTDB_ATTRIBUTE_SEED"
581                                            " without O_CREAT.");
582                         goto fail;
583                 }
584         }
585
586         if ((open_flags & O_ACCMODE) == O_WRONLY) {
587                 ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
588                                    "ntdb_open: can't open ntdb %s write-only",
589                                    name);
590                 goto fail;
591         }
592
593         if ((open_flags & O_ACCMODE) == O_RDONLY) {
594                 openlock = F_RDLCK;
595                 ntdb->flags |= NTDB_RDONLY;
596         } else {
597                 if (ntdb_flags & NTDB_RDONLY) {
598                         ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
599                                            NTDB_LOG_USE_ERROR,
600                                            "ntdb_open: can't use NTDB_RDONLY"
601                                            " without O_RDONLY");
602                         goto fail;
603                 }
604                 openlock = F_WRLCK;
605         }
606
607         /* internal databases don't need any of the rest. */
608         if (ntdb->flags & NTDB_INTERNAL) {
609                 ntdb->flags |= (NTDB_NOLOCK | NTDB_NOMMAP);
610                 ecode = ntdb_new_file(ntdb);
611                 if (ecode != NTDB_SUCCESS) {
612                         goto fail;
613                 }
614                 ntdb->file->fd = -1;
615                 ecode = ntdb_new_database(ntdb, seed, &hdr);
616                 if (ecode == NTDB_SUCCESS) {
617                         ntdb_convert(ntdb, &hdr.hash_seed,
618                                     sizeof(hdr.hash_seed));
619                         ntdb->hash_seed = hdr.hash_seed;
620                         ntdb_context_init(ntdb);
621                         ntdb_ftable_init(ntdb);
622                 }
623                 if (ecode != NTDB_SUCCESS) {
624                         goto fail;
625                 }
626                 return ntdb;
627         }
628
629         if (stat(name, &st) != -1)
630                 ntdb->file = find_file(st.st_dev, st.st_ino);
631
632         if (!ntdb->file) {
633                 ecode = ntdb_new_file(ntdb);
634                 if (ecode != NTDB_SUCCESS) {
635                         goto fail;
636                 }
637
638                 /* Set this now, as ntdb_nest_lock examines it. */
639                 ntdb->file->map_size = 0;
640
641                 if ((ntdb->file->fd = open(name, open_flags, mode)) == -1) {
642                         enum ntdb_log_level lvl;
643                         /* errno set by open(2) */
644                         saved_errno = errno;
645
646                         /* Probing for files like this is a common pattern. */
647                         if (!(open_flags & O_CREAT) && errno == ENOENT) {
648                                 lvl = NTDB_LOG_WARNING;
649                         } else {
650                                 lvl = NTDB_LOG_ERROR;
651                         }
652                         ntdb_logerr(ntdb, NTDB_ERR_IO, lvl,
653                                    "ntdb_open: could not open file %s: %s",
654                                    name, strerror(errno));
655
656                         goto fail_errno;
657                 }
658
659                 /* ensure there is only one process initialising at once:
660                  * do it immediately to reduce the create/openlock race. */
661                 ecode = ntdb_lock_open(ntdb, openlock,
662                                        NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
663                 if (ecode != NTDB_SUCCESS) {
664                         saved_errno = errno;
665                         goto fail_errno;
666                 }
667
668                 /* on exec, don't inherit the fd */
669                 v = fcntl(ntdb->file->fd, F_GETFD, 0);
670                 fcntl(ntdb->file->fd, F_SETFD, v | FD_CLOEXEC);
671
672                 if (fstat(ntdb->file->fd, &st) == -1) {
673                         saved_errno = errno;
674                         ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
675                                    "ntdb_open: could not stat open %s: %s",
676                                    name, strerror(errno));
677                         goto fail_errno;
678                 }
679
680                 ntdb->file->device = st.st_dev;
681                 ntdb->file->inode = st.st_ino;
682
683                 /* call their open hook if they gave us one. */
684                 if (ntdb->openhook) {
685                         ecode = ntdb->openhook(ntdb->file->fd, ntdb->openhook_data);
686                         if (ecode != NTDB_SUCCESS) {
687                                 ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
688                                             "ntdb_open: open hook failed");
689                                 goto fail;
690                         }
691                         open_flags |= O_CREAT;
692                 }
693         } else {
694                 /* ensure there is only one process initialising at once */
695                 ecode = ntdb_lock_open(ntdb, openlock,
696                                        NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
697                 if (ecode != NTDB_SUCCESS) {
698                         saved_errno = errno;
699                         goto fail_errno;
700                 }
701         }
702
703         /* If they used O_TRUNC, read will return 0. */
704         rlen = pread(ntdb->file->fd, &hdr, sizeof(hdr), 0);
705         if (rlen == 0 && (open_flags & O_CREAT)) {
706                 ecode = ntdb_new_database(ntdb, seed, &hdr);
707                 if (ecode != NTDB_SUCCESS) {
708                         goto fail;
709                 }
710         } else if (rlen < 0) {
711                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
712                                    "ntdb_open: error %s reading %s",
713                                    strerror(errno), name);
714                 goto fail;
715         } else if (rlen < sizeof(hdr)
716                    || strcmp(hdr.magic_food, NTDB_MAGIC_FOOD) != 0) {
717                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
718                                    "ntdb_open: %s is not a ntdb file", name);
719                 goto fail;
720         }
721
722         if (hdr.version != NTDB_VERSION) {
723                 if (hdr.version == bswap_64(NTDB_VERSION))
724                         ntdb->flags |= NTDB_CONVERT;
725                 else {
726                         /* wrong version */
727                         ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
728                                            "ntdb_open:"
729                                            " %s is unknown version 0x%llx",
730                                            name, (long long)hdr.version);
731                         goto fail;
732                 }
733         } else if (ntdb->flags & NTDB_CONVERT) {
734                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
735                                    "ntdb_open:"
736                                    " %s does not need NTDB_CONVERT",
737                                    name);
738                 goto fail;
739         }
740
741         ntdb_context_init(ntdb);
742
743         ntdb_convert(ntdb, &hdr, sizeof(hdr));
744         ntdb->hash_bits = hdr.hash_bits;
745         ntdb->hash_seed = hdr.hash_seed;
746         hash_test = NTDB_HASH_MAGIC;
747         hash_test = ntdb_hash(ntdb, &hash_test, sizeof(hash_test));
748         if (hdr.hash_test != hash_test) {
749                 /* wrong hash variant */
750                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
751                                    "ntdb_open:"
752                                    " %s uses a different hash function",
753                                    name);
754                 goto fail;
755         }
756
757         ecode = capabilities_ok(ntdb, hdr.capabilities);
758         if (ecode != NTDB_SUCCESS) {
759                 goto fail;
760         }
761
762         /* Clear any features we don't understand. */
763         if ((open_flags & O_ACCMODE) != O_RDONLY) {
764                 hdr.features_used &= NTDB_FEATURE_MASK;
765                 ecode = ntdb_write_convert(ntdb, offsetof(struct ntdb_header,
766                                                         features_used),
767                                           &hdr.features_used,
768                                           sizeof(hdr.features_used));
769                 if (ecode != NTDB_SUCCESS)
770                         goto fail;
771         }
772
773         ntdb_unlock_open(ntdb, openlock);
774
775         /* This makes sure we have current map_size and mmap. */
776         ecode = ntdb_oob(ntdb, ntdb->file->map_size, 1, true);
777         if (unlikely(ecode != NTDB_SUCCESS))
778                 goto fail;
779
780         if (ntdb->file->map_size % NTDB_PGSIZE != 0) {
781                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
782                                     "ntdb_open:"
783                                     " %s size %llu isn't a multiple of %u",
784                                     name, (long long)ntdb->file->map_size,
785                                     NTDB_PGSIZE);
786                 goto fail;
787         }
788
789         /* Now it's fully formed, recover if necessary. */
790         berr = ntdb_needs_recovery(ntdb);
791         if (unlikely(berr != false)) {
792                 if (berr < 0) {
793                         ecode = NTDB_OFF_TO_ERR(berr);
794                         goto fail;
795                 }
796                 ecode = ntdb_lock_and_recover(ntdb);
797                 if (ecode != NTDB_SUCCESS) {
798                         goto fail;
799                 }
800         }
801
802         ecode = ntdb_ftable_init(ntdb);
803         if (ecode != NTDB_SUCCESS) {
804                 goto fail;
805         }
806
807         ntdb->next = tdbs;
808         tdbs = ntdb;
809         return ntdb;
810
811  fail:
812         /* Map ecode to some logical errno. */
813         switch (NTDB_ERR_TO_OFF(ecode)) {
814         case NTDB_ERR_TO_OFF(NTDB_ERR_CORRUPT):
815         case NTDB_ERR_TO_OFF(NTDB_ERR_IO):
816                 saved_errno = EIO;
817                 break;
818         case NTDB_ERR_TO_OFF(NTDB_ERR_LOCK):
819                 saved_errno = EWOULDBLOCK;
820                 break;
821         case NTDB_ERR_TO_OFF(NTDB_ERR_OOM):
822                 saved_errno = ENOMEM;
823                 break;
824         case NTDB_ERR_TO_OFF(NTDB_ERR_EINVAL):
825                 saved_errno = EINVAL;
826                 break;
827         default:
828                 saved_errno = EINVAL;
829                 break;
830         }
831
832 fail_errno:
833 #ifdef NTDB_TRACE
834         close(ntdb->tracefd);
835 #endif
836         if (ntdb->file) {
837                 ntdb_lock_cleanup(ntdb);
838                 if (--ntdb->file->refcnt == 0) {
839                         assert(ntdb->file->num_lockrecs == 0);
840                         if (ntdb->file->map_ptr) {
841                                 if (ntdb->flags & NTDB_INTERNAL) {
842                                         ntdb->free_fn(ntdb->file->map_ptr,
843                                                       ntdb->alloc_data);
844                                 } else
845                                         ntdb_munmap(ntdb);
846                         }
847                         if (ntdb->file->fd != -1 && close(ntdb->file->fd) != 0)
848                                 ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
849                                            "ntdb_open: failed to close ntdb fd"
850                                            " on error: %s", strerror(errno));
851                         ntdb->free_fn(ntdb->file->lockrecs, ntdb->alloc_data);
852                         ntdb->free_fn(ntdb->file, ntdb->alloc_data);
853                 }
854         }
855
856         ntdb->free_fn(ntdb, ntdb->alloc_data);
857         errno = saved_errno;
858         return NULL;
859 }
860
861 _PUBLIC_ int ntdb_close(struct ntdb_context *ntdb)
862 {
863         int ret = 0;
864         struct ntdb_context **i;
865
866         ntdb_trace(ntdb, "ntdb_close");
867
868         if (ntdb->transaction) {
869                 ntdb_transaction_cancel(ntdb);
870         }
871
872         ntdb_lock_cleanup(ntdb);
873         if (--ntdb->file->refcnt == 0) {
874                 if (ntdb->file->map_ptr) {
875                         if (ntdb->flags & NTDB_INTERNAL) {
876                                 ntdb->free_fn(ntdb->file->map_ptr,
877                                               ntdb->alloc_data);
878                         } else {
879                                 ntdb_munmap(ntdb);
880                         }
881                 }
882                 ret = close(ntdb->file->fd);
883                 ntdb->free_fn(ntdb->file->lockrecs, ntdb->alloc_data);
884                 ntdb->free_fn(ntdb->file, ntdb->alloc_data);
885         }
886
887         /* Remove from tdbs list */
888         for (i = &tdbs; *i; i = &(*i)->next) {
889                 if (*i == ntdb) {
890                         *i = ntdb->next;
891                         break;
892                 }
893         }
894
895 #ifdef NTDB_TRACE
896         close(ntdb->tracefd);
897 #endif
898         ntdb->free_fn(ntdb, ntdb->alloc_data);
899
900         return ret;
901 }
902
903 _PUBLIC_ void ntdb_foreach_(int (*fn)(struct ntdb_context *, void *), void *p)
904 {
905         struct ntdb_context *i;
906
907         for (i = tdbs; i; i = i->next) {
908                 if (fn(i, p) != 0)
909                         break;
910         }
911 }