]> git.ozlabs.org Git - ccan/blob - ccan/tdb2/tdb1_transaction.c
tdb2: don't cancel transactions on lock failures in tdb1 backend.
[ccan] / ccan / tdb2 / tdb1_transaction.c
1  /*
2    Unix SMB/CIFS implementation.
3
4    trivial database library
5
6    Copyright (C) Andrew Tridgell              2005
7
8      ** NOTE! The following LGPL license applies to the tdb
9      ** library. This does NOT imply that all of Samba is released
10      ** under the LGPL
11
12    This library is free software; you can redistribute it and/or
13    modify it under the terms of the GNU Lesser General Public
14    License as published by the Free Software Foundation; either
15    version 3 of the License, or (at your option) any later version.
16
17    This library is distributed in the hope that it will be useful,
18    but WITHOUT ANY WARRANTY; without even the implied warranty of
19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20    Lesser General Public License for more details.
21
22    You should have received a copy of the GNU Lesser General Public
23    License along with this library; if not, see <http://www.gnu.org/licenses/>.
24 */
25
26 #include "tdb1_private.h"
27
28 /*
29   transaction design:
30
31   - only allow a single transaction at a time per database. This makes
32     using the transaction API simpler, as otherwise the caller would
33     have to cope with temporary failures in transactions that conflict
34     with other current transactions
35
36   - keep the transaction recovery information in the same file as the
37     database, using a special 'transaction recovery' record pointed at
38     by the header. This removes the need for extra journal files as
39     used by some other databases
40
41   - dynamically allocate the transaction recovery record, re-using it
42     for subsequent transactions. If a larger record is needed then
43     tdb1_free() the old record to place it on the normal tdb freelist
44     before allocating the new record
45
46   - during transactions, keep a linked list of all writes that have
47     been performed by intercepting all tdb1_write() calls. The hooked
48     transaction versions of tdb1_read() and tdb1_write() check this
49     linked list and try to use the elements of the list in preference
50     to the real database.
51
52   - don't allow any locks to be held when a transaction starts,
53     otherwise we can end up with deadlock (plus lack of lock nesting
54     in posix locks would mean the lock is lost)
55
56   - if the caller gains a lock during the transaction but doesn't
57     release it then fail the commit
58
59   - allow for nested calls to tdb1_transaction_start(), re-using the
60     existing transaction record. If the inner transaction is cancelled
61     then a subsequent commit will fail
62
63   - keep a mirrored copy of the tdb hash chain heads to allow for the
64     fast hash heads scan on traverse, updating the mirrored copy in
65     the transaction version of tdb1_write
66
67   - allow callers to mix transaction and non-transaction use of tdb,
68     although once a transaction is started then an exclusive lock is
69     gained until the transaction is committed or cancelled
70
71   - the commit strategy involves first saving away all modified data
72     into a linearised buffer in the transaction recovery area, then
73     marking the transaction recovery area with a magic value to
74     indicate a valid recovery record. In total 4 fsync/msync calls are
75     needed per commit to prevent race conditions. It might be possible
76     to reduce this to 3 or even 2 with some more work.
77
78   - check for a valid recovery record on open of the tdb, while the
79     open lock is held. Automatically recover from the transaction
80     recovery area if needed, then continue with the open as
81     usual. This allows for smooth crash recovery with no administrator
82     intervention.
83
84   - if TDB_NOSYNC is passed to flags in tdb1_open then transactions are
85     still available, but no transaction recovery area is used and no
86     fsync/msync calls are made.
87
88   - if TDB_ALLOW_NESTING is passed to flags in tdb open, or added using
89     tdb1_add_flags() transaction nesting is enabled.
90     The default is that transaction nesting is NOT allowed.
91
92     Beware. when transactions are nested a transaction successfully
93     completed with tdb1_transaction_commit() can be silently unrolled later.
94 */
95
96
97 /*
98   hold the context of any current transaction
99 */
100 struct tdb1_transaction {
101         /* we keep a mirrored copy of the tdb hash heads here so
102            tdb1_next_hash_chain() can operate efficiently */
103         uint32_t *hash_heads;
104
105         /* the original io methods - used to do IOs to the real db */
106         const struct tdb1_methods *io_methods;
107
108         /* the list of transaction blocks. When a block is first
109            written to, it gets created in this list */
110         uint8_t **blocks;
111         uint32_t num_blocks;
112         uint32_t block_size;      /* bytes in each block */
113         uint32_t last_block_size; /* number of valid bytes in the last block */
114
115         /* non-zero when an internal transaction error has
116            occurred. All write operations will then fail until the
117            transaction is ended */
118         int transaction_error;
119
120         /* when inside a transaction we need to keep track of any
121            nested tdb1_transaction_start() calls, as these are allowed,
122            but don't create a new transaction */
123         int nesting;
124
125         /* set when a prepare has already occurred */
126         bool prepared;
127         tdb1_off_t magic_offset;
128
129         /* old file size before transaction */
130         tdb1_len_t old_map_size;
131
132         /* did we expand in this transaction */
133         bool expanded;
134 };
135
136
137 /*
138   read while in a transaction. We need to check first if the data is in our list
139   of transaction elements, then if not do a real read
140 */
141 static int transaction1_read(struct tdb1_context *tdb, tdb1_off_t off, void *buf,
142                              tdb1_len_t len, int cv)
143 {
144         uint32_t blk;
145
146         /* break it down into block sized ops */
147         while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
148                 tdb1_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
149                 if (transaction1_read(tdb, off, buf, len2, cv) != 0) {
150                         return -1;
151                 }
152                 len -= len2;
153                 off += len2;
154                 buf = (void *)(len2 + (char *)buf);
155         }
156
157         if (len == 0) {
158                 return 0;
159         }
160
161         blk = off / tdb->transaction->block_size;
162
163         /* see if we have it in the block list */
164         if (tdb->transaction->num_blocks <= blk ||
165             tdb->transaction->blocks[blk] == NULL) {
166                 /* nope, do a real read */
167                 if (tdb->transaction->io_methods->tdb1_read(tdb, off, buf, len, cv) != 0) {
168                         goto fail;
169                 }
170                 return 0;
171         }
172
173         /* it is in the block list. Now check for the last block */
174         if (blk == tdb->transaction->num_blocks-1) {
175                 if (len > tdb->transaction->last_block_size) {
176                         goto fail;
177                 }
178         }
179
180         /* now copy it out of this block */
181         memcpy(buf, tdb->transaction->blocks[blk] + (off % tdb->transaction->block_size), len);
182         if (cv) {
183                 tdb1_convert(buf, len);
184         }
185         return 0;
186
187 fail:
188         tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
189                                 "transaction_read: failed at off=%d len=%d",
190                                 off, len);
191         tdb->transaction->transaction_error = 1;
192         return -1;
193 }
194
195
196 /*
197   write while in a transaction
198 */
199 static int transaction1_write(struct tdb1_context *tdb, tdb1_off_t off,
200                              const void *buf, tdb1_len_t len)
201 {
202         uint32_t blk;
203
204         /* Only a commit is allowed on a prepared transaction */
205         if (tdb->transaction->prepared) {
206                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
207                                         "transaction_write: transaction already"
208                                         " prepared, write not allowed");
209                 tdb->transaction->transaction_error = 1;
210                 return -1;
211         }
212
213         /* if the write is to a hash head, then update the transaction
214            hash heads */
215         if (len == sizeof(tdb1_off_t) && off >= TDB1_FREELIST_TOP &&
216             off < TDB1_FREELIST_TOP+TDB1_HASHTABLE_SIZE(tdb)) {
217                 uint32_t chain = (off-TDB1_FREELIST_TOP) / sizeof(tdb1_off_t);
218                 memcpy(&tdb->transaction->hash_heads[chain], buf, len);
219         }
220
221         /* break it up into block sized chunks */
222         while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
223                 tdb1_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
224                 if (transaction1_write(tdb, off, buf, len2) != 0) {
225                         return -1;
226                 }
227                 len -= len2;
228                 off += len2;
229                 if (buf != NULL) {
230                         buf = (const void *)(len2 + (const char *)buf);
231                 }
232         }
233
234         if (len == 0) {
235                 return 0;
236         }
237
238         blk = off / tdb->transaction->block_size;
239         off = off % tdb->transaction->block_size;
240
241         if (tdb->transaction->num_blocks <= blk) {
242                 uint8_t **new_blocks;
243                 /* expand the blocks array */
244                 if (tdb->transaction->blocks == NULL) {
245                         new_blocks = (uint8_t **)malloc(
246                                 (blk+1)*sizeof(uint8_t *));
247                 } else {
248                         new_blocks = (uint8_t **)realloc(
249                                 tdb->transaction->blocks,
250                                 (blk+1)*sizeof(uint8_t *));
251                 }
252                 if (new_blocks == NULL) {
253                         tdb->last_error = TDB_ERR_OOM;
254                         goto fail;
255                 }
256                 memset(&new_blocks[tdb->transaction->num_blocks], 0,
257                        (1+(blk - tdb->transaction->num_blocks))*sizeof(uint8_t *));
258                 tdb->transaction->blocks = new_blocks;
259                 tdb->transaction->num_blocks = blk+1;
260                 tdb->transaction->last_block_size = 0;
261         }
262
263         /* allocate and fill a block? */
264         if (tdb->transaction->blocks[blk] == NULL) {
265                 tdb->transaction->blocks[blk] = (uint8_t *)calloc(tdb->transaction->block_size, 1);
266                 if (tdb->transaction->blocks[blk] == NULL) {
267                         tdb->last_error = TDB_ERR_OOM;
268                         tdb->transaction->transaction_error = 1;
269                         return -1;
270                 }
271                 if (tdb->transaction->old_map_size > blk * tdb->transaction->block_size) {
272                         tdb1_len_t len2 = tdb->transaction->block_size;
273                         if (len2 + (blk * tdb->transaction->block_size) > tdb->transaction->old_map_size) {
274                                 len2 = tdb->transaction->old_map_size - (blk * tdb->transaction->block_size);
275                         }
276                         if (tdb->transaction->io_methods->tdb1_read(tdb, blk * tdb->transaction->block_size,
277                                                                    tdb->transaction->blocks[blk],
278                                                                    len2, 0) != 0) {
279                                 SAFE_FREE(tdb->transaction->blocks[blk]);
280                                 tdb->last_error = TDB_ERR_IO;
281                                 goto fail;
282                         }
283                         if (blk == tdb->transaction->num_blocks-1) {
284                                 tdb->transaction->last_block_size = len2;
285                         }
286                 }
287         }
288
289         /* overwrite part of an existing block */
290         if (buf == NULL) {
291                 memset(tdb->transaction->blocks[blk] + off, 0, len);
292         } else {
293                 memcpy(tdb->transaction->blocks[blk] + off, buf, len);
294         }
295         if (blk == tdb->transaction->num_blocks-1) {
296                 if (len + off > tdb->transaction->last_block_size) {
297                         tdb->transaction->last_block_size = len + off;
298                 }
299         }
300
301         return 0;
302
303 fail:
304         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
305                    "transaction_write: failed at off=%d len=%d",
306                    (blk*tdb->transaction->block_size) + off, len);
307         tdb->transaction->transaction_error = 1;
308         return -1;
309 }
310
311
312 /*
313   write while in a transaction - this varient never expands the transaction blocks, it only
314   updates existing blocks. This means it cannot change the recovery size
315 */
316 static int transaction1_write_existing(struct tdb1_context *tdb, tdb1_off_t off,
317                                       const void *buf, tdb1_len_t len)
318 {
319         uint32_t blk;
320
321         /* break it up into block sized chunks */
322         while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
323                 tdb1_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
324                 if (transaction1_write_existing(tdb, off, buf, len2) != 0) {
325                         return -1;
326                 }
327                 len -= len2;
328                 off += len2;
329                 if (buf != NULL) {
330                         buf = (const void *)(len2 + (const char *)buf);
331                 }
332         }
333
334         if (len == 0) {
335                 return 0;
336         }
337
338         blk = off / tdb->transaction->block_size;
339         off = off % tdb->transaction->block_size;
340
341         if (tdb->transaction->num_blocks <= blk ||
342             tdb->transaction->blocks[blk] == NULL) {
343                 return 0;
344         }
345
346         if (blk == tdb->transaction->num_blocks-1 &&
347             off + len > tdb->transaction->last_block_size) {
348                 if (off >= tdb->transaction->last_block_size) {
349                         return 0;
350                 }
351                 len = tdb->transaction->last_block_size - off;
352         }
353
354         /* overwrite part of an existing block */
355         memcpy(tdb->transaction->blocks[blk] + off, buf, len);
356
357         return 0;
358 }
359
360
361 /*
362   accelerated hash chain head search, using the cached hash heads
363 */
364 static void transaction1_next_hash_chain(struct tdb1_context *tdb, uint32_t *chain)
365 {
366         uint32_t h = *chain;
367         for (;h < tdb->header.hash_size;h++) {
368                 /* the +1 takes account of the freelist */
369                 if (0 != tdb->transaction->hash_heads[h+1]) {
370                         break;
371                 }
372         }
373         (*chain) = h;
374 }
375
376 /*
377   out of bounds check during a transaction
378 */
379 static int transaction1_oob(struct tdb1_context *tdb, tdb1_off_t len, int probe)
380 {
381         if (len <= tdb->file->map_size) {
382                 return 0;
383         }
384         tdb->last_error = TDB_ERR_IO;
385         return -1;
386 }
387
388 /*
389   transaction version of tdb1_expand().
390 */
391 static int transaction1_expand_file(struct tdb1_context *tdb, tdb1_off_t size,
392                                     tdb1_off_t addition)
393 {
394         /* add a write to the transaction elements, so subsequent
395            reads see the zero data */
396         if (transaction1_write(tdb, size, NULL, addition) != 0) {
397                 return -1;
398         }
399
400         tdb->transaction->expanded = true;
401
402         return 0;
403 }
404
405 static const struct tdb1_methods transaction1_methods = {
406         transaction1_read,
407         transaction1_write,
408         transaction1_next_hash_chain,
409         transaction1_oob,
410         transaction1_expand_file,
411 };
412
413
414 /*
415   start a tdb transaction. No token is returned, as only a single
416   transaction is allowed to be pending per tdb1_context
417 */
418 static int _tdb1_transaction_start(struct tdb1_context *tdb)
419 {
420         /* some sanity checks */
421         if (tdb->read_only || (tdb->flags & TDB_INTERNAL) || tdb->traverse_read) {
422                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
423                                         "tdb1_transaction_start: cannot start a"
424                                         " transaction on a read-only or"
425                                         " internal db");
426                 return -1;
427         }
428
429         /* cope with nested tdb1_transaction_start() calls */
430         if (tdb->transaction != NULL) {
431                 if (!(tdb->flags & TDB_ALLOW_NESTING)) {
432                         tdb->last_error = TDB_ERR_EINVAL;
433                         return -1;
434                 }
435                 tdb->transaction->nesting++;
436                 return 0;
437         }
438
439         if (tdb1_have_extra_locks(tdb)) {
440                 /* the caller must not have any locks when starting a
441                    transaction as otherwise we'll be screwed by lack
442                    of nested locks in posix */
443                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
444                                         "tdb1_transaction_start: cannot start a"
445                                         " transaction with locks held");
446                 return -1;
447         }
448
449         if (tdb->travlocks.next != NULL) {
450                 /* you cannot use transactions inside a traverse (although you can use
451                    traverse inside a transaction) as otherwise you can end up with
452                    deadlock */
453                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
454                                         "tdb1_transaction_start: cannot start a"
455                                         " transaction within a traverse");
456                 return -1;
457         }
458
459         tdb->transaction = (struct tdb1_transaction *)
460                 calloc(sizeof(struct tdb1_transaction), 1);
461         if (tdb->transaction == NULL) {
462                 tdb->last_error = TDB_ERR_OOM;
463                 return -1;
464         }
465
466         /* a page at a time seems like a reasonable compromise between compactness and efficiency */
467         tdb->transaction->block_size = tdb->page_size;
468
469         /* get the transaction write lock. This is a blocking lock. As
470            discussed with Volker, there are a number of ways we could
471            make this async, which we will probably do in the future */
472         if (tdb1_transaction_lock(tdb, F_WRLCK, TDB_LOCK_WAIT) == -1) {
473                 SAFE_FREE(tdb->transaction->blocks);
474                 SAFE_FREE(tdb->transaction);
475                 return -1;
476         }
477
478         /* get a read lock from the freelist to the end of file. This
479            is upgraded to a write lock during the commit */
480         if (tdb1_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true) == -1) {
481                 if (errno != EAGAIN && errno != EINTR) {
482                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
483                                    "tdb1_transaction_start:"
484                                    " failed to get hash locks");
485                 }
486                 goto fail_allrecord_lock;
487         }
488
489         /* setup a copy of the hash table heads so the hash scan in
490            traverse can be fast */
491         tdb->transaction->hash_heads = (uint32_t *)
492                 calloc(tdb->header.hash_size+1, sizeof(uint32_t));
493         if (tdb->transaction->hash_heads == NULL) {
494                 tdb->last_error = TDB_ERR_OOM;
495                 goto fail;
496         }
497         if (tdb->methods->tdb1_read(tdb, TDB1_FREELIST_TOP, tdb->transaction->hash_heads,
498                                    TDB1_HASHTABLE_SIZE(tdb), 0) != 0) {
499                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
500                            "tdb1_transaction_start: failed to read hash heads");
501                 goto fail;
502         }
503
504         /* make sure we know about any file expansions already done by
505            anyone else */
506         tdb->methods->tdb1_oob(tdb, tdb->file->map_size + 1, 1);
507         tdb->transaction->old_map_size = tdb->file->map_size;
508
509         /* finally hook the io methods, replacing them with
510            transaction specific methods */
511         tdb->transaction->io_methods = tdb->methods;
512         tdb->methods = &transaction1_methods;
513
514         return 0;
515
516 fail:
517         tdb1_allrecord_unlock(tdb, F_RDLCK);
518 fail_allrecord_lock:
519         tdb1_transaction_unlock(tdb, F_WRLCK);
520         SAFE_FREE(tdb->transaction->blocks);
521         SAFE_FREE(tdb->transaction->hash_heads);
522         SAFE_FREE(tdb->transaction);
523         return -1;
524 }
525
/* Public entry point: forwards to _tdb1_transaction_start(). */
int tdb1_transaction_start(struct tdb1_context *tdb)
{
	int rc = _tdb1_transaction_start(tdb);

	return rc;
}
530
531 /*
532   sync to disk
533 */
534 static int transaction1_sync(struct tdb1_context *tdb, tdb1_off_t offset, tdb1_len_t length)
535 {
536         if (tdb->flags & TDB_NOSYNC) {
537                 return 0;
538         }
539
540 #if HAVE_FDATASYNC
541         if (fdatasync(tdb->file->fd) != 0) {
542 #else
543         if (fsync(tdb->file->fd) != 0) {
544 #endif
545                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
546                                         "tdb1_transaction: fsync failed");
547                 return -1;
548         }
549 #if HAVE_MMAP
550         if (tdb->file->map_ptr) {
551                 tdb1_off_t moffset = offset & ~(tdb->page_size-1);
552                 if (msync(moffset + (char *)tdb->file->map_ptr,
553                           length + (offset - moffset), MS_SYNC) != 0) {
554                         tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
555                                                 "tdb1_transaction:"
556                                                 " msync failed - %s",
557                                                 strerror(errno));
558                         return -1;
559                 }
560         }
561 #endif
562         return 0;
563 }
564
565
566 static int _tdb1_transaction_cancel(struct tdb1_context *tdb)
567 {
568         int i, ret = 0;
569
570         if (tdb->transaction == NULL) {
571                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
572                                         "tdb1_transaction_cancel:"
573                                         " no transaction");
574                 return -1;
575         }
576
577         if (tdb->transaction->nesting != 0) {
578                 tdb->transaction->transaction_error = 1;
579                 tdb->transaction->nesting--;
580                 return 0;
581         }
582
583         tdb->file->map_size = tdb->transaction->old_map_size;
584
585         /* free all the transaction blocks */
586         for (i=0;i<tdb->transaction->num_blocks;i++) {
587                 if (tdb->transaction->blocks[i] != NULL) {
588                         free(tdb->transaction->blocks[i]);
589                 }
590         }
591         SAFE_FREE(tdb->transaction->blocks);
592
593         if (tdb->transaction->magic_offset) {
594                 const struct tdb1_methods *methods = tdb->transaction->io_methods;
595                 const uint32_t invalid = TDB1_RECOVERY_INVALID_MAGIC;
596
597                 /* remove the recovery marker */
598                 if (methods->tdb1_write(tdb, tdb->transaction->magic_offset, &invalid, 4) == -1 ||
599                 transaction1_sync(tdb, tdb->transaction->magic_offset, 4) == -1) {
600                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
601                                    "tdb1_transaction_cancel: failed to"
602                                    " remove recovery magic");
603                         ret = -1;
604                 }
605         }
606
607         /* This also removes the OPEN_LOCK, if we have it. */
608         tdb1_release_transaction_locks(tdb);
609
610         /* restore the normal io methods */
611         tdb->methods = tdb->transaction->io_methods;
612
613         SAFE_FREE(tdb->transaction->hash_heads);
614         SAFE_FREE(tdb->transaction);
615
616         return ret;
617 }
618
/*
  Cancel the current transaction. Public entry point: forwards to
  _tdb1_transaction_cancel().
*/
int tdb1_transaction_cancel(struct tdb1_context *tdb)
{
	int rc = _tdb1_transaction_cancel(tdb);

	return rc;
}
626
627 /*
628   work out how much space the linearised recovery data will consume
629 */
630 static tdb1_len_t tdb1_recovery_size(struct tdb1_context *tdb)
631 {
632         tdb1_len_t recovery_size = 0;
633         int i;
634
635         recovery_size = sizeof(uint32_t);
636         for (i=0;i<tdb->transaction->num_blocks;i++) {
637                 if (i * tdb->transaction->block_size >= tdb->transaction->old_map_size) {
638                         break;
639                 }
640                 if (tdb->transaction->blocks[i] == NULL) {
641                         continue;
642                 }
643                 recovery_size += 2*sizeof(tdb1_off_t);
644                 if (i == tdb->transaction->num_blocks-1) {
645                         recovery_size += tdb->transaction->last_block_size;
646                 } else {
647                         recovery_size += tdb->transaction->block_size;
648                 }
649         }
650
651         return recovery_size;
652 }
653
654 int tdb1_recovery_area(struct tdb1_context *tdb,
655                       const struct tdb1_methods *methods,
656                       tdb1_off_t *recovery_offset,
657                       struct tdb1_record *rec)
658 {
659         if (tdb1_ofs_read(tdb, TDB1_RECOVERY_HEAD, recovery_offset) == -1) {
660                 return -1;
661         }
662
663         if (*recovery_offset == 0) {
664                 rec->rec_len = 0;
665                 return 0;
666         }
667
668         if (methods->tdb1_read(tdb, *recovery_offset, rec, sizeof(*rec),
669                               TDB1_DOCONV()) == -1) {
670                 return -1;
671         }
672
673         /* ignore invalid recovery regions: can happen in crash */
674         if (rec->magic != TDB1_RECOVERY_MAGIC &&
675             rec->magic != TDB1_RECOVERY_INVALID_MAGIC) {
676                 *recovery_offset = 0;
677                 rec->rec_len = 0;
678         }
679         return 0;
680 }
681
682 /*
683   allocate the recovery area, or use an existing recovery area if it is
684   large enough
685 */
686 static int tdb1_recovery_allocate(struct tdb1_context *tdb,
687                                  tdb1_len_t *recovery_size,
688                                  tdb1_off_t *recovery_offset,
689                                  tdb1_len_t *recovery_max_size)
690 {
691         struct tdb1_record rec;
692         const struct tdb1_methods *methods = tdb->transaction->io_methods;
693         tdb1_off_t recovery_head;
694
695         if (tdb1_recovery_area(tdb, methods, &recovery_head, &rec) == -1) {
696                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
697                            "tdb1_recovery_allocate:"
698                            " failed to read recovery head");
699                 return -1;
700         }
701
702         *recovery_size = tdb1_recovery_size(tdb);
703
704         if (recovery_head != 0 && *recovery_size <= rec.rec_len) {
705                 /* it fits in the existing area */
706                 *recovery_max_size = rec.rec_len;
707                 *recovery_offset = recovery_head;
708                 return 0;
709         }
710
711         /* we need to free up the old recovery area, then allocate a
712            new one at the end of the file. Note that we cannot use
713            tdb1_allocate() to allocate the new one as that might return
714            us an area that is being currently used (as of the start of
715            the transaction) */
716         if (recovery_head != 0) {
717                 if (tdb1_free(tdb, recovery_head, &rec) == -1) {
718                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
719                                    "tdb1_recovery_allocate: failed to free"
720                                    " previous recovery area");
721                         return -1;
722                 }
723         }
724
725         /* the tdb1_free() call might have increased the recovery size */
726         *recovery_size = tdb1_recovery_size(tdb);
727
728         /* round up to a multiple of page size */
729         *recovery_max_size = TDB1_ALIGN(sizeof(rec) + *recovery_size, tdb->page_size) - sizeof(rec);
730         *recovery_offset = tdb->file->map_size;
731         recovery_head = *recovery_offset;
732
733         if (methods->tdb1_expand_file(tdb, tdb->transaction->old_map_size,
734                                      (tdb->file->map_size - tdb->transaction->old_map_size) +
735                                      sizeof(rec) + *recovery_max_size) == -1) {
736                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
737                            "tdb1_recovery_allocate:"
738                            " failed to create recovery area");
739                 return -1;
740         }
741
742         /* remap the file (if using mmap) */
743         methods->tdb1_oob(tdb, tdb->file->map_size + 1, 1);
744
745         /* we have to reset the old map size so that we don't try to expand the file
746            again in the transaction commit, which would destroy the recovery area */
747         tdb->transaction->old_map_size = tdb->file->map_size;
748
749         /* write the recovery header offset and sync - we can sync without a race here
750            as the magic ptr in the recovery record has not been set */
751         TDB1_CONV(recovery_head);
752         if (methods->tdb1_write(tdb, TDB1_RECOVERY_HEAD,
753                                &recovery_head, sizeof(tdb1_off_t)) == -1) {
754                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
755                            "tdb1_recovery_allocate:"
756                            " failed to write recovery head");
757                 return -1;
758         }
759         if (transaction1_write_existing(tdb, TDB1_RECOVERY_HEAD, &recovery_head, sizeof(tdb1_off_t)) == -1) {
760                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
761                            "tdb1_recovery_allocate:"
762                            " failed to write recovery head");
763                 return -1;
764         }
765
766         return 0;
767 }
768
769
770 /*
771   setup the recovery data that will be used on a crash during commit
772 */
773 static int transaction1_setup_recovery(struct tdb1_context *tdb,
774                                        tdb1_off_t *magic_offset)
775 {
776         tdb1_len_t recovery_size;
777         unsigned char *data, *p;
778         const struct tdb1_methods *methods = tdb->transaction->io_methods;
779         struct tdb1_record *rec;
780         tdb1_off_t recovery_offset, recovery_max_size;
781         tdb1_off_t old_map_size = tdb->transaction->old_map_size;
782         uint32_t magic, tailer;
783         int i;
784
785         /*
786           check that the recovery area has enough space
787         */
788         if (tdb1_recovery_allocate(tdb, &recovery_size,
789                                   &recovery_offset, &recovery_max_size) == -1) {
790                 return -1;
791         }
792
793         data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
794         if (data == NULL) {
795                 tdb->last_error = TDB_ERR_OOM;
796                 return -1;
797         }
798
799         rec = (struct tdb1_record *)data;
800         memset(rec, 0, sizeof(*rec));
801
802         rec->magic    = TDB1_RECOVERY_INVALID_MAGIC;
803         rec->data_len = recovery_size;
804         rec->rec_len  = recovery_max_size;
805         rec->key_len  = old_map_size;
806         TDB1_CONV(*rec);
807
808         /* build the recovery data into a single blob to allow us to do a single
809            large write, which should be more efficient */
810         p = data + sizeof(*rec);
811         for (i=0;i<tdb->transaction->num_blocks;i++) {
812                 tdb1_off_t offset;
813                 tdb1_len_t length;
814
815                 if (tdb->transaction->blocks[i] == NULL) {
816                         continue;
817                 }
818
819                 offset = i * tdb->transaction->block_size;
820                 length = tdb->transaction->block_size;
821                 if (i == tdb->transaction->num_blocks-1) {
822                         length = tdb->transaction->last_block_size;
823                 }
824
825                 if (offset >= old_map_size) {
826                         continue;
827                 }
828                 if (offset + length > tdb->transaction->old_map_size) {
829                         tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT,
830                                                 TDB_LOG_ERROR,
831                                                 "tdb1_transaction_setup_recovery: transaction data over new region boundary");
832                         free(data);
833                         return -1;
834                 }
835                 memcpy(p, &offset, 4);
836                 memcpy(p+4, &length, 4);
837                 if (TDB1_DOCONV()) {
838                         tdb1_convert(p, 8);
839                 }
840                 /* the recovery area contains the old data, not the
841                    new data, so we have to call the original tdb1_read
842                    method to get it */
843                 if (methods->tdb1_read(tdb, offset, p + 8, length, 0) != 0) {
844                         free(data);
845                         tdb->last_error = TDB_ERR_IO;
846                         return -1;
847                 }
848                 p += 8 + length;
849         }
850
851         /* and the tailer */
852         tailer = sizeof(*rec) + recovery_max_size;
853         memcpy(p, &tailer, 4);
854         if (TDB1_DOCONV()) {
855                 tdb1_convert(p, 4);
856         }
857
858         /* write the recovery data to the recovery area */
859         if (methods->tdb1_write(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
860                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
861                            "tdb1_transaction_setup_recovery:"
862                            " failed to write recovery data");
863                 free(data);
864                 return -1;
865         }
866         if (transaction1_write_existing(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
867                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
868                            "tdb1_transaction_setup_recovery: failed to write"
869                            " secondary recovery data");
870                 free(data);
871                 return -1;
872         }
873
874         /* as we don't have ordered writes, we have to sync the recovery
875            data before we update the magic to indicate that the recovery
876            data is present */
877         if (transaction1_sync(tdb, recovery_offset, sizeof(*rec) + recovery_size) == -1) {
878                 free(data);
879                 return -1;
880         }
881
882         free(data);
883
884         magic = TDB1_RECOVERY_MAGIC;
885         TDB1_CONV(magic);
886
887         *magic_offset = recovery_offset + offsetof(struct tdb1_record, magic);
888
889         if (methods->tdb1_write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
890                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
891                            "tdb1_transaction_setup_recovery:"
892                            " failed to write recovery magic");
893                 return -1;
894         }
895         if (transaction1_write_existing(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
896                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
897                            "tdb1_transaction_setup_recovery:"
898                            " failed to write secondary recovery magic");
899                 return -1;
900         }
901
902         /* ensure the recovery magic marker is on disk */
903         if (transaction1_sync(tdb, *magic_offset, sizeof(magic)) == -1) {
904                 return -1;
905         }
906
907         return 0;
908 }
909
910 static int _tdb1_transaction_prepare_commit(struct tdb1_context *tdb)
911 {
912         const struct tdb1_methods *methods;
913
914         if (tdb->transaction == NULL) {
915                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
916                                         "tdb1_transaction_prepare_commit:"
917                                         " no transaction");
918                 return -1;
919         }
920
921         if (tdb->transaction->prepared) {
922                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
923                                         "tdb1_transaction_prepare_commit:"
924                                         " transaction already prepared");
925                 _tdb1_transaction_cancel(tdb);
926                 return -1;
927         }
928
929         if (tdb->transaction->transaction_error) {
930                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
931                                         "tdb1_transaction_prepare_commit:"
932                                         " transaction error pending");
933                 _tdb1_transaction_cancel(tdb);
934                 return -1;
935         }
936
937
938         if (tdb->transaction->nesting != 0) {
939                 return 0;
940         }
941
942         /* check for a null transaction */
943         if (tdb->transaction->blocks == NULL) {
944                 return 0;
945         }
946
947         methods = tdb->transaction->io_methods;
948
949         /* if there are any locks pending then the caller has not
950            nested their locks properly, so fail the transaction */
951         if (tdb1_have_extra_locks(tdb)) {
952                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
953                                         "tdb1_transaction_prepare_commit:"
954                                         " locks pending on commit");
955                 _tdb1_transaction_cancel(tdb);
956                 return -1;
957         }
958
959         /* upgrade the main transaction lock region to a write lock */
960         if (tdb1_allrecord_upgrade(tdb) == -1) {
961                 if (errno != EAGAIN && errno != EINTR) {
962                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
963                                    "tdb1_transaction_prepare_commit:"
964                                    " failed to upgrade hash locks");
965                 }
966                 return -1;
967         }
968
969         /* get the open lock - this prevents new users attaching to the database
970            during the commit */
971         if (tdb1_nest_lock(tdb, TDB1_OPEN_LOCK, F_WRLCK, TDB_LOCK_WAIT) == -1) {
972                 if (errno != EAGAIN && errno != EINTR) {
973                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
974                                    "tdb1_transaction_prepare_commit:"
975                                    " failed to get open lock");
976                 }
977                 return -1;
978         }
979
980         if (!(tdb->flags & TDB_NOSYNC)) {
981                 /* write the recovery data to the end of the file */
982                 if (transaction1_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) {
983                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
984                                    "tdb1_transaction_prepare_commit:"
985                                    " failed to setup recovery data");
986                         return -1;
987                 }
988         }
989
990         tdb->transaction->prepared = true;
991
992         /* expand the file to the new size if needed */
993         if (tdb->file->map_size != tdb->transaction->old_map_size) {
994                 if (methods->tdb1_expand_file(tdb, tdb->transaction->old_map_size,
995                                              tdb->file->map_size -
996                                              tdb->transaction->old_map_size) == -1) {
997                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
998                                    "tdb1_transaction_prepare_commit:"
999                                    " expansion failed");
1000                         return -1;
1001                 }
1002                 tdb->file->map_size = tdb->transaction->old_map_size;
1003                 methods->tdb1_oob(tdb, tdb->file->map_size + 1, 1);
1004         }
1005
1006         /* Keep the open lock until the actual commit */
1007
1008         return 0;
1009 }
1010
/*
   Public entry point for the prepare phase of a commit.  All of the
   work is done by _tdb1_transaction_prepare_commit(); its result
   (0 on success, -1 on failure) is handed back unchanged.
*/
int tdb1_transaction_prepare_commit(struct tdb1_context *tdb)
{
	const int rc = _tdb1_transaction_prepare_commit(tdb);

	return rc;
}
1018
1019 /* A repack is worthwhile if the largest is less than half total free. */
1020 static bool repack_worthwhile(struct tdb1_context *tdb)
1021 {
1022         tdb1_off_t ptr;
1023         struct tdb1_record rec;
1024         tdb1_len_t total = 0, largest = 0;
1025
1026         if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP, &ptr) == -1) {
1027                 return false;
1028         }
1029
1030         while (ptr != 0 && tdb1_rec_free_read(tdb, ptr, &rec) == 0) {
1031                 total += rec.rec_len;
1032                 if (rec.rec_len > largest) {
1033                         largest = rec.rec_len;
1034                 }
1035                 ptr = rec.next;
1036         }
1037
1038         return total > largest * 2;
1039 }
1040
1041 /*
1042   commit the current transaction
1043 */
1044 int tdb1_transaction_commit(struct tdb1_context *tdb)
1045 {
1046         const struct tdb1_methods *methods;
1047         int i;
1048         bool need_repack = false;
1049
1050         if (tdb->transaction == NULL) {
1051                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
1052                                         "tdb1_transaction_commit:"
1053                                         " no transaction");
1054                 return -1;
1055         }
1056
1057         if (tdb->transaction->transaction_error) {
1058                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
1059                                         "tdb1_transaction_commit:"
1060                                         " transaction error pending");
1061                 _tdb1_transaction_cancel(tdb);
1062                 return -1;
1063         }
1064
1065
1066         if (tdb->transaction->nesting != 0) {
1067                 tdb->transaction->nesting--;
1068                 return 0;
1069         }
1070
1071         /* check for a null transaction */
1072         if (tdb->transaction->blocks == NULL) {
1073                 _tdb1_transaction_cancel(tdb);
1074                 return 0;
1075         }
1076
1077         if (!tdb->transaction->prepared) {
1078                 int ret = _tdb1_transaction_prepare_commit(tdb);
1079                 if (ret) {
1080                         _tdb1_transaction_cancel(tdb);
1081                         return ret;
1082                 }
1083         }
1084
1085         methods = tdb->transaction->io_methods;
1086
1087         /* perform all the writes */
1088         for (i=0;i<tdb->transaction->num_blocks;i++) {
1089                 tdb1_off_t offset;
1090                 tdb1_len_t length;
1091
1092                 if (tdb->transaction->blocks[i] == NULL) {
1093                         continue;
1094                 }
1095
1096                 offset = i * tdb->transaction->block_size;
1097                 length = tdb->transaction->block_size;
1098                 if (i == tdb->transaction->num_blocks-1) {
1099                         length = tdb->transaction->last_block_size;
1100                 }
1101
1102                 if (methods->tdb1_write(tdb, offset, tdb->transaction->blocks[i], length) == -1) {
1103                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
1104                                    "tdb1_transaction_commit:"
1105                                    " write failed during commit");
1106
1107                         /* we've overwritten part of the data and
1108                            possibly expanded the file, so we need to
1109                            run the crash recovery code */
1110                         tdb->methods = methods;
1111                         tdb1_transaction_recover(tdb);
1112
1113                         _tdb1_transaction_cancel(tdb);
1114
1115                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
1116                                    "tdb1_transaction_commit: write failed");
1117                         return -1;
1118                 }
1119                 SAFE_FREE(tdb->transaction->blocks[i]);
1120         }
1121
1122         /* Do this before we drop lock or blocks. */
1123         if (tdb->transaction->expanded) {
1124                 need_repack = repack_worthwhile(tdb);
1125         }
1126
1127         SAFE_FREE(tdb->transaction->blocks);
1128         tdb->transaction->num_blocks = 0;
1129
1130         /* ensure the new data is on disk */
1131         if (transaction1_sync(tdb, 0, tdb->file->map_size) == -1) {
1132                 return -1;
1133         }
1134
1135         /*
1136           TODO: maybe write to some dummy hdr field, or write to magic
1137           offset without mmap, before the last sync, instead of the
1138           utime() call
1139         */
1140
1141         /* on some systems (like Linux 2.6.x) changes via mmap/msync
1142            don't change the mtime of the file, this means the file may
1143            not be backed up (as tdb rounding to block sizes means that
1144            file size changes are quite rare too). The following forces
1145            mtime changes when a transaction completes */
1146 #if HAVE_UTIME
1147         utime(tdb->name, NULL);
1148 #endif
1149
1150         /* use a transaction cancel to free memory and remove the
1151            transaction locks */
1152         _tdb1_transaction_cancel(tdb);
1153
1154         if (need_repack) {
1155                 return tdb1_repack(tdb);
1156         }
1157
1158         return 0;
1159 }
1160
1161
1162 /*
1163   recover from an aborted transaction. Must be called with exclusive
1164   database write access already established (including the open
1165   lock to prevent new processes attaching)
1166 */
1167 int tdb1_transaction_recover(struct tdb1_context *tdb)
1168 {
1169         tdb1_off_t recovery_head, recovery_eof;
1170         unsigned char *data, *p;
1171         uint32_t zero = 0;
1172         struct tdb1_record rec;
1173
1174         /* find the recovery area */
1175         if (tdb1_ofs_read(tdb, TDB1_RECOVERY_HEAD, &recovery_head) == -1) {
1176                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
1177                            "tdb1_transaction_recover:"
1178                            " failed to read recovery head");
1179                 return -1;
1180         }
1181
1182         if (recovery_head == 0) {
1183                 /* we have never allocated a recovery record */
1184                 return 0;
1185         }
1186
1187         /* read the recovery record */
1188         if (tdb->methods->tdb1_read(tdb, recovery_head, &rec,
1189                                    sizeof(rec), TDB1_DOCONV()) == -1) {
1190                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
1191                            "tdb1_transaction_recover:"
1192                            " failed to read recovery record");
1193                 return -1;
1194         }
1195
1196         if (rec.magic != TDB1_RECOVERY_MAGIC) {
1197                 /* there is no valid recovery data */
1198                 return 0;
1199         }
1200
1201         if (tdb->read_only) {
1202                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
1203                                         "tdb1_transaction_recover:"
1204                                         " attempt to recover read only"
1205                                         " database");
1206                 return -1;
1207         }
1208
1209         recovery_eof = rec.key_len;
1210
1211         data = (unsigned char *)malloc(rec.data_len);
1212         if (data == NULL) {
1213                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
1214                                         "tdb1_transaction_recover:"
1215                                         " failed to allocate recovery data");
1216                 return -1;
1217         }
1218
1219         /* read the full recovery data */
1220         if (tdb->methods->tdb1_read(tdb, recovery_head + sizeof(rec), data,
1221                                    rec.data_len, 0) == -1) {
1222                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
1223                            "tdb1_transaction_recover:"
1224                            " failed to read recovery data");
1225                 return -1;
1226         }
1227
1228         /* recover the file data */
1229         p = data;
1230         while (p+8 < data + rec.data_len) {
1231                 uint32_t ofs, len;
1232                 if (TDB1_DOCONV()) {
1233                         tdb1_convert(p, 8);
1234                 }
1235                 memcpy(&ofs, p, 4);
1236                 memcpy(&len, p+4, 4);
1237
1238                 if (tdb->methods->tdb1_write(tdb, ofs, p+8, len) == -1) {
1239                         free(data);
1240                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
1241                                    "tdb1_transaction_recover: failed to recover"
1242                                    " %d bytes at offset %d", len, ofs);
1243                         return -1;
1244                 }
1245                 p += 8 + len;
1246         }
1247
1248         free(data);
1249
1250         if (transaction1_sync(tdb, 0, tdb->file->map_size) == -1) {
1251                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
1252                            "tdb1_transaction_recover: failed to sync recovery");
1253                 return -1;
1254         }
1255
1256         /* if the recovery area is after the recovered eof then remove it */
1257         if (recovery_eof <= recovery_head) {
1258                 if (tdb1_ofs_write(tdb, TDB1_RECOVERY_HEAD, &zero) == -1) {
1259                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
1260                                    "tdb1_transaction_recover: failed to remove"
1261                                    " recovery head");
1262                         return -1;
1263                 }
1264         }
1265
1266         /* remove the recovery magic */
1267         if (tdb1_ofs_write(tdb, recovery_head + offsetof(struct tdb1_record, magic),
1268                           &zero) == -1) {
1269                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
1270                            "tdb1_transaction_recover: failed to remove"
1271                            " recovery magic");
1272                 return -1;
1273         }
1274
1275         if (transaction1_sync(tdb, 0, recovery_eof) == -1) {
1276                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
1277                            "tdb1_transaction_recover:"
1278                            " failed to sync2 recovery");
1279                 return -1;
1280         }
1281
1282         tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
1283                    "tdb1_transaction_recover: recovered %d byte database",
1284                    recovery_eof);
1285
1286         /* all done */
1287         return 0;
1288 }
1289
1290 /* Any I/O failures we say "needs recovery". */
1291 bool tdb1_needs_recovery(struct tdb1_context *tdb)
1292 {
1293         tdb1_off_t recovery_head;
1294         struct tdb1_record rec;
1295
1296         /* find the recovery area */
1297         if (tdb1_ofs_read(tdb, TDB1_RECOVERY_HEAD, &recovery_head) == -1) {
1298                 return true;
1299         }
1300
1301         if (recovery_head == 0) {
1302                 /* we have never allocated a recovery record */
1303                 return false;
1304         }
1305
1306         /* read the recovery record */
1307         if (tdb->methods->tdb1_read(tdb, recovery_head, &rec,
1308                                    sizeof(rec), TDB1_DOCONV()) == -1) {
1309                 return true;
1310         }
1311
1312         return (rec.magic == TDB1_RECOVERY_MAGIC);
1313 }