]> git.ozlabs.org Git - ccan/blob - ccan/tdb2/transaction.c
tdb2: rework some io.c functions to return enum TDB_ERROR.
[ccan] / ccan / tdb2 / transaction.c
1  /*
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              2005
7    Copyright (C) Rusty Russell                2010
8
9      ** NOTE! The following LGPL license applies to the tdb
10      ** library. This does NOT imply that all of Samba is released
11      ** under the LGPL
12
13    This library is free software; you can redistribute it and/or
14    modify it under the terms of the GNU Lesser General Public
15    License as published by the Free Software Foundation; either
16    version 3 of the License, or (at your option) any later version.
17
18    This library is distributed in the hope that it will be useful,
19    but WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21    Lesser General Public License for more details.
22
23    You should have received a copy of the GNU Lesser General Public
24    License along with this library; if not, see <http://www.gnu.org/licenses/>.
25 */
26
27 #include "private.h"
/* Release x and reset it to NULL so the stale pointer cannot be reused;
   free() already ignores NULL, so no explicit guard is required. */
#define SAFE_FREE(x) do { free(x); (x) = NULL; } while (0)
29
30 /*
31   transaction design:
32
33   - only allow a single transaction at a time per database. This makes
34     using the transaction API simpler, as otherwise the caller would
35     have to cope with temporary failures in transactions that conflict
36     with other current transactions
37
38   - keep the transaction recovery information in the same file as the
39     database, using a special 'transaction recovery' record pointed at
40     by the header. This removes the need for extra journal files as
41     used by some other databases
42
43   - dynamically allocate the transaction recovery record, re-using it
44     for subsequent transactions. If a larger record is needed then
45     tdb_free() the old record to place it on the normal tdb freelist
46     before allocating the new record
47
48   - during transactions, keep a linked list of all writes that have
49     been performed by intercepting all tdb_write() calls. The hooked
50     transaction versions of tdb_read() and tdb_write() check this
51     linked list and try to use the elements of the list in preference
52     to the real database.
53
54   - don't allow any locks to be held when a transaction starts,
55     otherwise we can end up with deadlock (plus lack of lock nesting
56     in posix locks would mean the lock is lost)
57
58   - if the caller gains a lock during the transaction but doesn't
59     release it then fail the commit
60
61   - allow for nested calls to tdb_transaction_start(), re-using the
62     existing transaction record. If the inner transaction is cancelled
63     then a subsequent commit will fail
64
65   - keep a mirrored copy of the tdb hash chain heads to allow for the
66     fast hash heads scan on traverse, updating the mirrored copy in
67     the transaction version of tdb_write
68
69   - allow callers to mix transaction and non-transaction use of tdb,
70     although once a transaction is started then an exclusive lock is
71     gained until the transaction is committed or cancelled
72
73   - the commit strategy involves first saving away all modified data
74     into a linearised buffer in the transaction recovery area, then
75     marking the transaction recovery area with a magic value to
76     indicate a valid recovery record. In total 4 fsync/msync calls are
77     needed per commit to prevent race conditions. It might be possible
78     to reduce this to 3 or even 2 with some more work.
79
80   - check for a valid recovery record on open of the tdb, while the
81     open lock is held. Automatically recover from the transaction
82     recovery area if needed, then continue with the open as
83     usual. This allows for smooth crash recovery with no administrator
84     intervention.
85
86   - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
87     still available, but no transaction recovery area is used and no
88     fsync/msync calls are made.
89 */
90
91
/*
  Holds the context of the current transaction.  One instance is
  allocated (zeroed) by tdb_transaction_start() and hangs off
  tdb->transaction until the transaction is cancelled.
*/
struct tdb_transaction {
        /* the original io methods - used to do IOs to the real db.
           Saved when the transaction methods are hooked in and
           restored on cancel */
        const struct tdb_methods *io_methods;

        /* the array of transaction blocks, indexed by page number
           (offset / getpagesize()). An entry is NULL until that page
           is first written to, at which point a page-sized copy is
           created */
        uint8_t **blocks;
        /* number of entries in blocks[] */
        size_t num_blocks;
        size_t last_block_size; /* number of valid bytes in the last block */

        /* non-zero when an internal transaction error has
           occurred. All write operations will then fail until the
           transaction is ended */
        int transaction_error;

        /* nesting depth of the transaction.
           NOTE(review): tdb_transaction_start() in this file rejects
           nested calls, so only the cancel path consults this -
           confirm whether anything still increments it */
        int nesting;

        /* set when a prepare has already occurred; transaction_write()
           refuses further writes once this is true */
        bool prepared;
        /* offset of the recovery magic value; when non-zero, cancel
           overwrites it with TDB_RECOVERY_INVALID_MAGIC */
        tdb_off_t magic_offset;

        /* old file size before transaction; cancel restores
           tdb->map_size from this */
        tdb_len_t old_map_size;
};
122
123
124 /*
125   read while in a transaction. We need to check first if the data is in our list
126   of transaction elements, then if not do a real read
127 */
128 static enum TDB_ERROR transaction_read(struct tdb_context *tdb, tdb_off_t off,
129                                        void *buf, tdb_len_t len)
130 {
131         size_t blk;
132         enum TDB_ERROR ecode;
133
134         /* break it down into block sized ops */
135         while (len + (off % getpagesize()) > getpagesize()) {
136                 tdb_len_t len2 = getpagesize() - (off % getpagesize());
137                 ecode = transaction_read(tdb, off, buf, len2);
138                 if (ecode != TDB_SUCCESS) {
139                         return ecode;
140                 }
141                 len -= len2;
142                 off += len2;
143                 buf = (void *)(len2 + (char *)buf);
144         }
145
146         if (len == 0) {
147                 return TDB_SUCCESS;
148         }
149
150         blk = off / getpagesize();
151
152         /* see if we have it in the block list */
153         if (tdb->transaction->num_blocks <= blk ||
154             tdb->transaction->blocks[blk] == NULL) {
155                 /* nope, do a real read */
156                 ecode = tdb->transaction->io_methods->tread(tdb, off, buf, len);
157                 if (ecode != TDB_SUCCESS) {
158                         goto fail;
159                 }
160                 return 0;
161         }
162
163         /* it is in the block list. Now check for the last block */
164         if (blk == tdb->transaction->num_blocks-1) {
165                 if (len > tdb->transaction->last_block_size) {
166                         ecode = TDB_ERR_IO;
167                         goto fail;
168                 }
169         }
170
171         /* now copy it out of this block */
172         memcpy(buf, tdb->transaction->blocks[blk] + (off % getpagesize()), len);
173         return TDB_SUCCESS;
174
175 fail:
176         tdb->transaction->transaction_error = 1;
177         return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
178                           "transaction_read: failed at off=%zu len=%zu",
179                           (size_t)off, (size_t)len);
180 }
181
182
183 /*
184   write while in a transaction
185 */
186 static enum TDB_ERROR transaction_write(struct tdb_context *tdb, tdb_off_t off,
187                                         const void *buf, tdb_len_t len)
188 {
189         size_t blk;
190         enum TDB_ERROR ecode;
191
192         /* Only a commit is allowed on a prepared transaction */
193         if (tdb->transaction->prepared) {
194                 ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR,
195                                    "transaction_write: transaction already"
196                                    " prepared, write not allowed");
197                 goto fail;
198         }
199
200         /* break it up into block sized chunks */
201         while (len + (off % getpagesize()) > getpagesize()) {
202                 tdb_len_t len2 = getpagesize() - (off % getpagesize());
203                 ecode = transaction_write(tdb, off, buf, len2);
204                 if (ecode != TDB_SUCCESS) {
205                         return -1;
206                 }
207                 len -= len2;
208                 off += len2;
209                 if (buf != NULL) {
210                         buf = (const void *)(len2 + (const char *)buf);
211                 }
212         }
213
214         if (len == 0) {
215                 return TDB_SUCCESS;
216         }
217
218         blk = off / getpagesize();
219         off = off % getpagesize();
220
221         if (tdb->transaction->num_blocks <= blk) {
222                 uint8_t **new_blocks;
223                 /* expand the blocks array */
224                 if (tdb->transaction->blocks == NULL) {
225                         new_blocks = (uint8_t **)malloc(
226                                 (blk+1)*sizeof(uint8_t *));
227                 } else {
228                         new_blocks = (uint8_t **)realloc(
229                                 tdb->transaction->blocks,
230                                 (blk+1)*sizeof(uint8_t *));
231                 }
232                 if (new_blocks == NULL) {
233                         ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
234                                            "transaction_write:"
235                                            " failed to allocate");
236                         goto fail;
237                 }
238                 memset(&new_blocks[tdb->transaction->num_blocks], 0,
239                        (1+(blk - tdb->transaction->num_blocks))*sizeof(uint8_t *));
240                 tdb->transaction->blocks = new_blocks;
241                 tdb->transaction->num_blocks = blk+1;
242                 tdb->transaction->last_block_size = 0;
243         }
244
245         /* allocate and fill a block? */
246         if (tdb->transaction->blocks[blk] == NULL) {
247                 tdb->transaction->blocks[blk] = (uint8_t *)calloc(getpagesize(), 1);
248                 if (tdb->transaction->blocks[blk] == NULL) {
249                         ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
250                                            "transaction_write:"
251                                            " failed to allocate");
252                         goto fail;
253                 }
254                 if (tdb->transaction->old_map_size > blk * getpagesize()) {
255                         tdb_len_t len2 = getpagesize();
256                         if (len2 + (blk * getpagesize()) > tdb->transaction->old_map_size) {
257                                 len2 = tdb->transaction->old_map_size - (blk * getpagesize());
258                         }
259                         ecode = tdb->transaction->io_methods->tread(tdb,
260                                         blk * getpagesize(),
261                                         tdb->transaction->blocks[blk],
262                                         len2);
263                         if (ecode != TDB_SUCCESS) {
264                                 ecode = tdb_logerr(tdb, ecode,
265                                                    TDB_LOG_ERROR,
266                                                    "transaction_write:"
267                                                    " failed to"
268                                                    " read old block: %s",
269                                                    strerror(errno));
270                                 SAFE_FREE(tdb->transaction->blocks[blk]);
271                                 goto fail;
272                         }
273                         if (blk == tdb->transaction->num_blocks-1) {
274                                 tdb->transaction->last_block_size = len2;
275                         }
276                 }
277         }
278
279         /* overwrite part of an existing block */
280         if (buf == NULL) {
281                 memset(tdb->transaction->blocks[blk] + off, 0, len);
282         } else {
283                 memcpy(tdb->transaction->blocks[blk] + off, buf, len);
284         }
285         if (blk == tdb->transaction->num_blocks-1) {
286                 if (len + off > tdb->transaction->last_block_size) {
287                         tdb->transaction->last_block_size = len + off;
288                 }
289         }
290
291         return TDB_SUCCESS;
292
293 fail:
294         tdb->transaction->transaction_error = 1;
295         return ecode;
296 }
297
298
299 /*
300   write while in a transaction - this varient never expands the transaction blocks, it only
301   updates existing blocks. This means it cannot change the recovery size
302 */
303 static void transaction_write_existing(struct tdb_context *tdb, tdb_off_t off,
304                                        const void *buf, tdb_len_t len)
305 {
306         size_t blk;
307
308         /* break it up into block sized chunks */
309         while (len + (off % getpagesize()) > getpagesize()) {
310                 tdb_len_t len2 = getpagesize() - (off % getpagesize());
311                 transaction_write_existing(tdb, off, buf, len2);
312                 len -= len2;
313                 off += len2;
314                 if (buf != NULL) {
315                         buf = (const void *)(len2 + (const char *)buf);
316                 }
317         }
318
319         if (len == 0) {
320                 return;
321         }
322
323         blk = off / getpagesize();
324         off = off % getpagesize();
325
326         if (tdb->transaction->num_blocks <= blk ||
327             tdb->transaction->blocks[blk] == NULL) {
328                 return;
329         }
330
331         if (blk == tdb->transaction->num_blocks-1 &&
332             off + len > tdb->transaction->last_block_size) {
333                 if (off >= tdb->transaction->last_block_size) {
334                         return;
335                 }
336                 len = tdb->transaction->last_block_size - off;
337         }
338
339         /* overwrite part of an existing block */
340         memcpy(tdb->transaction->blocks[blk] + off, buf, len);
341 }
342
343
344 /*
345   out of bounds check during a transaction
346 */
347 static enum TDB_ERROR transaction_oob(struct tdb_context *tdb, tdb_off_t len,
348                                       bool probe)
349 {
350         if (len <= tdb->map_size) {
351                 return TDB_SUCCESS;
352         }
353         if (!probe) {
354                 tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
355                            "tdb_oob len %lld beyond transaction size %lld",
356                            (long long)len,
357                            (long long)tdb->map_size);
358         }
359         return TDB_ERR_IO;
360 }
361
362 /*
363   transaction version of tdb_expand().
364 */
365 static enum TDB_ERROR transaction_expand_file(struct tdb_context *tdb,
366                                               tdb_off_t addition)
367 {
368         enum TDB_ERROR ecode;
369
370         /* add a write to the transaction elements, so subsequent
371            reads see the zero data */
372         ecode = transaction_write(tdb, tdb->map_size, NULL, addition);
373         if (ecode != TDB_SUCCESS) {
374                 tdb->ecode = ecode;
375                 return ecode;
376         }
377         tdb->map_size += addition;
378         return ecode;
379 }
380
381 static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off,
382                                 size_t len, bool write_mode)
383 {
384         size_t blk = off / getpagesize(), end_blk;
385
386         /* This is wrong for zero-length blocks, but will fail gracefully */
387         end_blk = (off + len - 1) / getpagesize();
388
389         /* Can only do direct if in single block and we've already copied. */
390         if (write_mode) {
391                 if (blk != end_blk)
392                         return NULL;
393                 if (blk >= tdb->transaction->num_blocks)
394                         return NULL;
395                 if (tdb->transaction->blocks[blk] == NULL)
396                         return NULL;
397                 return tdb->transaction->blocks[blk] + off % getpagesize();
398         }
399
400         /* Single which we have copied? */
401         if (blk == end_blk
402             && blk < tdb->transaction->num_blocks
403             && tdb->transaction->blocks[blk])
404                 return tdb->transaction->blocks[blk] + off % getpagesize();
405
406         /* Otherwise must be all not copied. */
407         while (blk < end_blk) {
408                 if (blk >= tdb->transaction->num_blocks)
409                         break;
410                 if (tdb->transaction->blocks[blk])
411                         return NULL;
412                 blk++;
413         }
414         return tdb->transaction->io_methods->direct(tdb, off, len, false);
415 }
416
417 static const struct tdb_methods transaction_methods = {
418         transaction_read,
419         transaction_write,
420         transaction_oob,
421         transaction_expand_file,
422         transaction_direct,
423 };
424
425 /*
426   sync to disk
427 */
428 static enum TDB_ERROR transaction_sync(struct tdb_context *tdb,
429                                        tdb_off_t offset, tdb_len_t length)
430 {
431         if (tdb->flags & TDB_NOSYNC) {
432                 return TDB_SUCCESS;
433         }
434
435         if (fsync(tdb->fd) != 0) {
436                 return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
437                                   "tdb_transaction: fsync failed: %s",
438                                   strerror(errno));
439         }
440 #ifdef MS_SYNC
441         if (tdb->map_ptr) {
442                 tdb_off_t moffset = offset & ~(getpagesize()-1);
443                 if (msync(moffset + (char *)tdb->map_ptr,
444                           length + (offset - moffset), MS_SYNC) != 0) {
445                         return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
446                                           "tdb_transaction: msync failed: %s",
447                                           strerror(errno));
448                 }
449         }
450 #endif
451         return TDB_SUCCESS;
452 }
453
454
455 static void _tdb_transaction_cancel(struct tdb_context *tdb)
456 {
457         int i;
458         enum TDB_ERROR ecode;
459
460         if (tdb->transaction == NULL) {
461                 tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
462                            "tdb_transaction_cancel: no transaction");
463                 return;
464         }
465
466         if (tdb->transaction->nesting != 0) {
467                 tdb->transaction->transaction_error = 1;
468                 tdb->transaction->nesting--;
469                 return;
470         }
471
472         tdb->map_size = tdb->transaction->old_map_size;
473
474         /* free all the transaction blocks */
475         for (i=0;i<tdb->transaction->num_blocks;i++) {
476                 if (tdb->transaction->blocks[i] != NULL) {
477                         free(tdb->transaction->blocks[i]);
478                 }
479         }
480         SAFE_FREE(tdb->transaction->blocks);
481
482         if (tdb->transaction->magic_offset) {
483                 const struct tdb_methods *methods = tdb->transaction->io_methods;
484                 uint64_t invalid = TDB_RECOVERY_INVALID_MAGIC;
485
486                 /* remove the recovery marker */
487                 ecode = methods->twrite(tdb, tdb->transaction->magic_offset,
488                                         &invalid, sizeof(invalid));
489                 if (ecode == TDB_SUCCESS)
490                         ecode = transaction_sync(tdb,
491                                                  tdb->transaction->magic_offset,
492                                                  sizeof(invalid));
493                 if (ecode != TDB_SUCCESS) {
494                         tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
495                                    "tdb_transaction_cancel: failed to remove"
496                                    " recovery magic");
497                 }
498         }
499
500         if (tdb->allrecord_lock.count)
501                 tdb_allrecord_unlock(tdb, tdb->allrecord_lock.ltype);
502
503         /* restore the normal io methods */
504         tdb->methods = tdb->transaction->io_methods;
505
506         tdb_transaction_unlock(tdb, F_WRLCK);
507
508         if (tdb_has_open_lock(tdb))
509                 tdb_unlock_open(tdb);
510
511         SAFE_FREE(tdb->transaction);
512 }
513
514 /*
515   start a tdb transaction. No token is returned, as only a single
516   transaction is allowed to be pending per tdb_context
517 */
518 int tdb_transaction_start(struct tdb_context *tdb)
519 {
520         enum TDB_ERROR ecode;
521
522         /* some sanity checks */
523         if (tdb->read_only || (tdb->flags & TDB_INTERNAL)) {
524                 tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
525                            "tdb_transaction_start: cannot start a transaction"
526                            " on a read-only or internal db");
527                 return -1;
528         }
529
530         /* cope with nested tdb_transaction_start() calls */
531         if (tdb->transaction != NULL) {
532                 tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_USE_ERROR,
533                            "tdb_transaction_start:"
534                            " already inside transaction");
535                 return -1;
536         }
537
538         if (tdb_has_hash_locks(tdb)) {
539                 /* the caller must not have any locks when starting a
540                    transaction as otherwise we'll be screwed by lack
541                    of nested locks in posix */
542                 tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
543                            "tdb_transaction_start: cannot start a transaction"
544                            " with locks held");
545                 return -1;
546         }
547
548         tdb->transaction = (struct tdb_transaction *)
549                 calloc(sizeof(struct tdb_transaction), 1);
550         if (tdb->transaction == NULL) {
551                 tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
552                            "tdb_transaction_start: cannot allocate");
553                 return -1;
554         }
555
556         /* get the transaction write lock. This is a blocking lock. As
557            discussed with Volker, there are a number of ways we could
558            make this async, which we will probably do in the future */
559         ecode = tdb_transaction_lock(tdb, F_WRLCK);
560         if (ecode != TDB_SUCCESS) {
561                 tdb->ecode = ecode;
562                 SAFE_FREE(tdb->transaction->blocks);
563                 SAFE_FREE(tdb->transaction);
564                 return -1;
565         }
566
567         /* get a read lock over entire file. This is upgraded to a write
568            lock during the commit */
569         ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true);
570         if (ecode != TDB_SUCCESS) {
571                 tdb->ecode = ecode;
572                 goto fail_allrecord_lock;
573         }
574
575         /* make sure we know about any file expansions already done by
576            anyone else */
577         tdb->methods->oob(tdb, tdb->map_size + 1, true);
578         tdb->transaction->old_map_size = tdb->map_size;
579
580         /* finally hook the io methods, replacing them with
581            transaction specific methods */
582         tdb->transaction->io_methods = tdb->methods;
583         tdb->methods = &transaction_methods;
584         return 0;
585
586 fail_allrecord_lock:
587         tdb_transaction_unlock(tdb, F_WRLCK);
588         SAFE_FREE(tdb->transaction->blocks);
589         SAFE_FREE(tdb->transaction);
590         return -1;
591 }
592
593
/*
  Public entry point for cancelling the current transaction; all of
  the work is done by _tdb_transaction_cancel().
*/
void tdb_transaction_cancel(struct tdb_context *tdb)
{
        _tdb_transaction_cancel(tdb);
}
601
602 /*
603   work out how much space the linearised recovery data will consume
604 */
605 static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
606 {
607         tdb_len_t recovery_size = 0;
608         int i;
609
610         recovery_size = sizeof(tdb_len_t);
611         for (i=0;i<tdb->transaction->num_blocks;i++) {
612                 if (i * getpagesize() >= tdb->transaction->old_map_size) {
613                         break;
614                 }
615                 if (tdb->transaction->blocks[i] == NULL) {
616                         continue;
617                 }
618                 recovery_size += 2*sizeof(tdb_off_t);
619                 if (i == tdb->transaction->num_blocks-1) {
620                         recovery_size += tdb->transaction->last_block_size;
621                 } else {
622                         recovery_size += getpagesize();
623                 }
624         }
625
626         return recovery_size;
627 }
628
629 /*
630   allocate the recovery area, or use an existing recovery area if it is
631   large enough
632 */
633 static int tdb_recovery_allocate(struct tdb_context *tdb,
634                                  tdb_len_t *recovery_size,
635                                  tdb_off_t *recovery_offset,
636                                  tdb_len_t *recovery_max_size)
637 {
638         struct tdb_recovery_record rec;
639         const struct tdb_methods *methods = tdb->transaction->io_methods;
640         tdb_off_t recovery_head;
641         size_t addition;
642         enum TDB_ERROR ecode;
643
644         recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
645         if (recovery_head == TDB_OFF_ERR) {
646                 tdb_logerr(tdb, tdb->ecode, TDB_LOG_ERROR,
647                          "tdb_recovery_allocate:"
648                          " failed to read recovery head");
649                 return -1;
650         }
651
652         if (recovery_head != 0) {
653                 ecode = methods->tread(tdb, recovery_head, &rec, sizeof(rec));
654                 if (ecode != TDB_SUCCESS) {
655                         tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
656                                  "tdb_recovery_allocate:"
657                                  " failed to read recovery record");
658                         return -1;
659                 }
660                 tdb_convert(tdb, &rec, sizeof(rec));
661                 /* ignore invalid recovery regions: can happen in crash */
662                 if (rec.magic != TDB_RECOVERY_MAGIC &&
663                     rec.magic != TDB_RECOVERY_INVALID_MAGIC) {
664                         recovery_head = 0;
665                 }
666         }
667
668         *recovery_size = tdb_recovery_size(tdb);
669
670         if (recovery_head != 0 && *recovery_size <= rec.max_len) {
671                 /* it fits in the existing area */
672                 *recovery_max_size = rec.max_len;
673                 *recovery_offset = recovery_head;
674                 return 0;
675         }
676
677         /* we need to free up the old recovery area, then allocate a
678            new one at the end of the file. Note that we cannot use
679            normal allocation to allocate the new one as that might return
680            us an area that is being currently used (as of the start of
681            the transaction) */
682         if (recovery_head != 0) {
683                 add_stat(tdb, frees, 1);
684                 if (add_free_record(tdb, recovery_head,
685                                     sizeof(rec) + rec.max_len) != 0) {
686                         tdb_logerr(tdb, tdb->ecode, TDB_LOG_ERROR,
687                                    "tdb_recovery_allocate:"
688                                    " failed to free previous recovery area");
689                         return -1;
690                 }
691         }
692
693         /* the tdb_free() call might have increased the recovery size */
694         *recovery_size = tdb_recovery_size(tdb);
695
696         /* round up to a multiple of page size */
697         *recovery_max_size
698                 = (((sizeof(rec) + *recovery_size) + getpagesize()-1)
699                    & ~(getpagesize()-1))
700                 - sizeof(rec);
701         *recovery_offset = tdb->map_size;
702         recovery_head = *recovery_offset;
703
704         /* Restore ->map_size before calling underlying expand_file.
705            Also so that we don't try to expand the file again in the
706            transaction commit, which would destroy the recovery
707            area */
708         addition = (tdb->map_size - tdb->transaction->old_map_size) +
709                 sizeof(rec) + *recovery_max_size;
710         tdb->map_size = tdb->transaction->old_map_size;
711         ecode = methods->expand_file(tdb, addition);
712         if (ecode != TDB_SUCCESS) {
713                 tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
714                          "tdb_recovery_allocate:"
715                          " failed to create recovery area");
716                 return -1;
717         }
718
719         /* we have to reset the old map size so that we don't try to
720            expand the file again in the transaction commit, which
721            would destroy the recovery area */
722         tdb->transaction->old_map_size = tdb->map_size;
723
724         /* write the recovery header offset and sync - we can sync without a race here
725            as the magic ptr in the recovery record has not been set */
726         tdb_convert(tdb, &recovery_head, sizeof(recovery_head));
727         ecode = methods->twrite(tdb, offsetof(struct tdb_header, recovery),
728                                 &recovery_head, sizeof(tdb_off_t));
729         if (ecode != TDB_SUCCESS) {
730                 tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
731                          "tdb_recovery_allocate:"
732                          " failed to write recovery head");
733                 return -1;
734         }
735         transaction_write_existing(tdb, offsetof(struct tdb_header, recovery),
736                                    &recovery_head,
737                                    sizeof(tdb_off_t));
738         return 0;
739 }
740
741 /* Set up header for the recovery record. */
742 static void set_recovery_header(struct tdb_recovery_record *rec,
743                                 uint64_t magic,
744                                 uint64_t datalen, uint64_t actuallen,
745                                 uint64_t oldsize)
746 {
747         rec->magic = magic;
748         rec->max_len = actuallen;
749         rec->len = datalen;
750         rec->eof = oldsize;
751 }
752
753 /*
754   setup the recovery data that will be used on a crash during commit
755 */
756 static int transaction_setup_recovery(struct tdb_context *tdb,
757                                       tdb_off_t *magic_offset)
758 {
759         tdb_len_t recovery_size;
760         unsigned char *data, *p;
761         const struct tdb_methods *methods = tdb->transaction->io_methods;
762         struct tdb_recovery_record *rec;
763         tdb_off_t recovery_offset, recovery_max_size;
764         tdb_off_t old_map_size = tdb->transaction->old_map_size;
765         uint64_t magic, tailer;
766         int i;
767         enum TDB_ERROR ecode;
768
769         /*
770           check that the recovery area has enough space
771         */
772         if (tdb_recovery_allocate(tdb, &recovery_size,
773                                   &recovery_offset, &recovery_max_size) == -1) {
774                 return -1;
775         }
776
777         data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
778         if (data == NULL) {
779                 tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
780                            "transaction_setup_recovery: cannot allocate");
781                 return -1;
782         }
783
784         rec = (struct tdb_recovery_record *)data;
785         set_recovery_header(rec, TDB_RECOVERY_INVALID_MAGIC,
786                             recovery_size, recovery_max_size, old_map_size);
787         tdb_convert(tdb, rec, sizeof(*rec));
788
789         /* build the recovery data into a single blob to allow us to do a single
790            large write, which should be more efficient */
791         p = data + sizeof(*rec);
792         for (i=0;i<tdb->transaction->num_blocks;i++) {
793                 tdb_off_t offset;
794                 tdb_len_t length;
795
796                 if (tdb->transaction->blocks[i] == NULL) {
797                         continue;
798                 }
799
800                 offset = i * getpagesize();
801                 length = getpagesize();
802                 if (i == tdb->transaction->num_blocks-1) {
803                         length = tdb->transaction->last_block_size;
804                 }
805
806                 if (offset >= old_map_size) {
807                         continue;
808                 }
809                 if (offset + length > tdb->map_size) {
810                         tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
811                                    "tdb_transaction_setup_recovery:"
812                                    " transaction data over new region boundary");
813                         free(data);
814                         return -1;
815                 }
816                 memcpy(p, &offset, sizeof(offset));
817                 memcpy(p + sizeof(offset), &length, sizeof(length));
818                 tdb_convert(tdb, p, sizeof(offset) + sizeof(length));
819
820                 /* the recovery area contains the old data, not the
821                    new data, so we have to call the original tdb_read
822                    method to get it */
823                 ecode = methods->tread(tdb, offset,
824                                        p + sizeof(offset) + sizeof(length),
825                                        length);
826                 if (ecode != TDB_SUCCESS) {
827                         tdb->ecode = ecode;
828                         free(data);
829                         return -1;
830                 }
831                 p += sizeof(offset) + sizeof(length) + length;
832         }
833
834         /* and the tailer */
835         tailer = sizeof(*rec) + recovery_max_size;
836         memcpy(p, &tailer, sizeof(tailer));
837         tdb_convert(tdb, p, sizeof(tailer));
838
839         /* write the recovery data to the recovery area */
840         ecode = methods->twrite(tdb, recovery_offset, data,
841                                 sizeof(*rec) + recovery_size);
842         if (ecode != TDB_SUCCESS) {
843                 tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
844                          "tdb_transaction_setup_recovery:"
845                          " failed to write recovery data");
846                 free(data);
847                 return -1;
848         }
849         transaction_write_existing(tdb, recovery_offset, data,
850                                    sizeof(*rec) + recovery_size);
851
852         /* as we don't have ordered writes, we have to sync the recovery
853            data before we update the magic to indicate that the recovery
854            data is present */
855         ecode = transaction_sync(tdb, recovery_offset,
856                                  sizeof(*rec) + recovery_size);
857         if (ecode != TDB_SUCCESS) {
858                 free(data);
859                 tdb->ecode = ecode;
860                 return -1;
861         }
862
863         free(data);
864
865         magic = TDB_RECOVERY_MAGIC;
866         tdb_convert(tdb, &magic, sizeof(magic));
867
868         *magic_offset = recovery_offset + offsetof(struct tdb_recovery_record,
869                                                    magic);
870
871         ecode = methods->twrite(tdb, *magic_offset, &magic, sizeof(magic));
872         if (ecode != TDB_SUCCESS) {
873                 tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
874                          "tdb_transaction_setup_recovery:"
875                          " failed to write recovery magic");
876                 return -1;
877         }
878         transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic));
879
880         /* ensure the recovery magic marker is on disk */
881         ecode = transaction_sync(tdb, *magic_offset, sizeof(magic));
882         if (ecode != TDB_SUCCESS) {
883                 tdb->ecode = ecode;
884                 return -1;
885         }
886
887         return 0;
888 }
889
890 static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
891 {
892         const struct tdb_methods *methods;
893         enum TDB_ERROR ecode;
894
895         if (tdb->transaction == NULL) {
896                 tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
897                            "tdb_transaction_prepare_commit: no transaction");
898                 return -1;
899         }
900
901         if (tdb->transaction->prepared) {
902                 _tdb_transaction_cancel(tdb);
903                 tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
904                            "tdb_transaction_prepare_commit:"
905                            " transaction already prepared");
906                 return -1;
907         }
908
909         if (tdb->transaction->transaction_error) {
910                 _tdb_transaction_cancel(tdb);
911                 tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR,
912                            "tdb_transaction_prepare_commit:"
913                            " transaction error pending");
914                 return -1;
915         }
916
917
918         if (tdb->transaction->nesting != 0) {
919                 tdb->transaction->nesting--;
920                 return 0;
921         }
922
923         /* check for a null transaction */
924         if (tdb->transaction->blocks == NULL) {
925                 return 0;
926         }
927
928         methods = tdb->transaction->io_methods;
929
930         /* upgrade the main transaction lock region to a write lock */
931         ecode = tdb_allrecord_upgrade(tdb);
932         if (ecode != TDB_SUCCESS) {
933                 tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
934                          "tdb_transaction_prepare_commit:"
935                          " failed to upgrade hash locks");
936                 _tdb_transaction_cancel(tdb);
937                 return -1;
938         }
939
940         /* get the open lock - this prevents new users attaching to the database
941            during the commit */
942         ecode = tdb_lock_open(tdb, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK);
943         if (ecode != TDB_SUCCESS) {
944                 tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
945                            "tdb_transaction_prepare_commit:"
946                            " failed to get open lock");
947                 _tdb_transaction_cancel(tdb);
948                 return -1;
949         }
950
951         /* Since we have whole db locked, we don't need the expansion lock. */
952         if (!(tdb->flags & TDB_NOSYNC)) {
953                 /* write the recovery data to the end of the file */
954                 if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) {
955                         tdb_logerr(tdb, tdb->ecode, TDB_LOG_ERROR,
956                                  "tdb_transaction_prepare_commit:"
957                                  " failed to setup recovery data");
958                         _tdb_transaction_cancel(tdb);
959                         return -1;
960                 }
961         }
962
963         tdb->transaction->prepared = true;
964
965         /* expand the file to the new size if needed */
966         if (tdb->map_size != tdb->transaction->old_map_size) {
967                 tdb_len_t add = tdb->map_size - tdb->transaction->old_map_size;
968                 /* Restore original map size for tdb_expand_file */
969                 tdb->map_size = tdb->transaction->old_map_size;
970                 ecode = methods->expand_file(tdb, add);
971                 if (ecode != TDB_SUCCESS) {
972                         tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
973                                  "tdb_transaction_prepare_commit:"
974                                  " expansion failed");
975                         _tdb_transaction_cancel(tdb);
976                         return -1;
977                 }
978         }
979
980         /* Keep the open lock until the actual commit */
981
982         return 0;
983 }
984
/*
   Public entry point for the first phase of a commit.

   Simply forwards to _tdb_transaction_prepare_commit() and hands its
   result back unchanged.
*/
int tdb_transaction_prepare_commit(struct tdb_context *tdb)
{
        int status;

        status = _tdb_transaction_prepare_commit(tdb);
        return status;
}
992
993 /*
994   commit the current transaction
995 */
996 int tdb_transaction_commit(struct tdb_context *tdb)
997 {
998         const struct tdb_methods *methods;
999         int i;
1000         enum TDB_ERROR ecode;
1001
1002         if (tdb->transaction == NULL) {
1003                 tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
1004                          "tdb_transaction_commit: no transaction");
1005                 return -1;
1006         }
1007
1008         tdb_trace(tdb, "tdb_transaction_commit");
1009
1010         if (tdb->transaction->nesting != 0) {
1011                 tdb->transaction->nesting--;
1012                 return 0;
1013         }
1014
1015         /* check for a null transaction */
1016         if (tdb->transaction->blocks == NULL) {
1017                 _tdb_transaction_cancel(tdb);
1018                 return 0;
1019         }
1020
1021         if (!tdb->transaction->prepared) {
1022                 int ret = _tdb_transaction_prepare_commit(tdb);
1023                 if (ret)
1024                         return ret;
1025         }
1026
1027         methods = tdb->transaction->io_methods;
1028
1029         /* perform all the writes */
1030         for (i=0;i<tdb->transaction->num_blocks;i++) {
1031                 tdb_off_t offset;
1032                 tdb_len_t length;
1033
1034                 if (tdb->transaction->blocks[i] == NULL) {
1035                         continue;
1036                 }
1037
1038                 offset = i * getpagesize();
1039                 length = getpagesize();
1040                 if (i == tdb->transaction->num_blocks-1) {
1041                         length = tdb->transaction->last_block_size;
1042                 }
1043
1044                 ecode = methods->twrite(tdb, offset,
1045                                         tdb->transaction->blocks[i], length);
1046                 if (ecode != TDB_SUCCESS) {
1047                         tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1048                                    "tdb_transaction_commit:"
1049                                    " write failed during commit");
1050
1051                         /* we've overwritten part of the data and
1052                            possibly expanded the file, so we need to
1053                            run the crash recovery code */
1054                         tdb->methods = methods;
1055                         tdb_transaction_recover(tdb);
1056
1057                         _tdb_transaction_cancel(tdb);
1058
1059                         return -1;
1060                 }
1061                 SAFE_FREE(tdb->transaction->blocks[i]);
1062         }
1063
1064         SAFE_FREE(tdb->transaction->blocks);
1065         tdb->transaction->num_blocks = 0;
1066
1067         /* ensure the new data is on disk */
1068         ecode = transaction_sync(tdb, 0, tdb->map_size);
1069         if (ecode != TDB_SUCCESS) {
1070                 tdb->ecode = ecode;
1071                 return -1;
1072         }
1073
1074         /*
1075           TODO: maybe write to some dummy hdr field, or write to magic
1076           offset without mmap, before the last sync, instead of the
1077           utime() call
1078         */
1079
1080         /* on some systems (like Linux 2.6.x) changes via mmap/msync
1081            don't change the mtime of the file, this means the file may
1082            not be backed up (as tdb rounding to block sizes means that
1083            file size changes are quite rare too). The following forces
1084            mtime changes when a transaction completes */
1085 #if HAVE_UTIME
1086         utime(tdb->name, NULL);
1087 #endif
1088
1089         /* use a transaction cancel to free memory and remove the
1090            transaction locks */
1091         _tdb_transaction_cancel(tdb);
1092
1093         return 0;
1094 }
1095
1096
1097 /*
1098   recover from an aborted transaction. Must be called with exclusive
1099   database write access already established (including the open
1100   lock to prevent new processes attaching)
1101 */
1102 int tdb_transaction_recover(struct tdb_context *tdb)
1103 {
1104         tdb_off_t recovery_head, recovery_eof;
1105         unsigned char *data, *p;
1106         struct tdb_recovery_record rec;
1107         enum TDB_ERROR ecode;
1108
1109         /* find the recovery area */
1110         recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
1111         if (recovery_head == TDB_OFF_ERR) {
1112                 tdb_logerr(tdb, tdb->ecode, TDB_LOG_ERROR,
1113                          "tdb_transaction_recover:"
1114                          " failed to read recovery head");
1115                 return -1;
1116         }
1117
1118         if (recovery_head == 0) {
1119                 /* we have never allocated a recovery record */
1120                 return 0;
1121         }
1122
1123         /* read the recovery record */
1124         ecode = tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec));
1125         if (ecode != TDB_SUCCESS) {
1126                 tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1127                            "tdb_transaction_recover:"
1128                            " failed to read recovery record");
1129                 return -1;
1130         }
1131
1132         if (rec.magic != TDB_RECOVERY_MAGIC) {
1133                 /* there is no valid recovery data */
1134                 return 0;
1135         }
1136
1137         if (tdb->read_only) {
1138                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
1139                            "tdb_transaction_recover:"
1140                            " attempt to recover read only database");
1141                 return -1;
1142         }
1143
1144         recovery_eof = rec.eof;
1145
1146         data = (unsigned char *)malloc(rec.len);
1147         if (data == NULL) {
1148                 tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
1149                            "tdb_transaction_recover:"
1150                            " failed to allocate recovery data");
1151                 return -1;
1152         }
1153
1154         /* read the full recovery data */
1155         ecode = tdb->methods->tread(tdb, recovery_head + sizeof(rec), data,
1156                                     rec.len);
1157         if (ecode != TDB_SUCCESS) {
1158                 tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1159                            "tdb_transaction_recover:"
1160                            " failed to read recovery data");
1161                 return -1;
1162         }
1163
1164         /* recover the file data */
1165         p = data;
1166         while (p+sizeof(tdb_off_t)+sizeof(tdb_len_t) < data + rec.len) {
1167                 tdb_off_t ofs;
1168                 tdb_len_t len;
1169                 tdb_convert(tdb, p, sizeof(ofs) + sizeof(len));
1170                 memcpy(&ofs, p, sizeof(ofs));
1171                 memcpy(&len, p + sizeof(ofs), sizeof(len));
1172                 p += sizeof(ofs) + sizeof(len);
1173
1174                 ecode = tdb->methods->twrite(tdb, ofs, p, len);
1175                 if (ecode != TDB_SUCCESS) {
1176                         free(data);
1177                         tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1178                                  "tdb_transaction_recover:"
1179                                  " failed to recover %zu bytes at offset %zu",
1180                                  (size_t)len, (size_t)ofs);
1181                         return -1;
1182                 }
1183                 p += len;
1184         }
1185
1186         free(data);
1187
1188         ecode = transaction_sync(tdb, 0, tdb->map_size);
1189         if (ecode != TDB_SUCCESS) {
1190                 tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1191                            "tdb_transaction_recover: failed to sync recovery");
1192                 return -1;
1193         }
1194
1195         /* if the recovery area is after the recovered eof then remove it */
1196         if (recovery_eof <= recovery_head) {
1197                 ecode = tdb_write_off(tdb, offsetof(struct tdb_header,
1198                                                     recovery),
1199                                       0);
1200                 if (ecode != TDB_SUCCESS) {
1201                         tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1202                                  "tdb_transaction_recover:"
1203                                  " failed to remove recovery head");
1204                         return -1;
1205                 }
1206         }
1207
1208         /* remove the recovery magic */
1209         ecode = tdb_write_off(tdb,
1210                               recovery_head
1211                               + offsetof(struct tdb_recovery_record, magic),
1212                               TDB_RECOVERY_INVALID_MAGIC);
1213         if (ecode != TDB_SUCCESS) {
1214                 tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1215                          "tdb_transaction_recover:"
1216                          " failed to remove recovery magic");
1217                 return -1;
1218         }
1219
1220         ecode = transaction_sync(tdb, 0, recovery_eof);
1221         if (ecode != TDB_SUCCESS) {
1222                 tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
1223                          "tdb_transaction_recover: failed to sync2 recovery");
1224                 return -1;
1225         }
1226
1227         tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
1228                    "tdb_transaction_recover: recovered %zu byte database",
1229                    (size_t)recovery_eof);
1230
1231         /* all done */
1232         return 0;
1233 }
1234
1235 /* Any I/O failures we say "needs recovery". */
1236 bool tdb_needs_recovery(struct tdb_context *tdb)
1237 {
1238         tdb_off_t recovery_head;
1239         struct tdb_recovery_record rec;
1240         enum TDB_ERROR ecode;
1241
1242         /* find the recovery area */
1243         recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
1244         if (recovery_head == TDB_OFF_ERR) {
1245                 return true;
1246         }
1247
1248         if (recovery_head == 0) {
1249                 /* we have never allocated a recovery record */
1250                 return false;
1251         }
1252
1253         /* read the recovery record */
1254         ecode = tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec));
1255         if (ecode != TDB_SUCCESS) {
1256                 tdb->ecode = ecode;
1257                 return true;
1258         }
1259
1260         return (rec.magic == TDB_RECOVERY_MAGIC);
1261 }