]> git.ozlabs.org Git - ccan/blob - ccan/tdb/tools/replay_trace.c
651a67e97b1cc5a17f4b732960dfaf7e0c0c7495
[ccan] / ccan / tdb / tools / replay_trace.c
1 #include <ccan/tdb/tdb.h>
2 #include <ccan/grab_file/grab_file.h>
3 #include <ccan/hash/hash.h>
4 #include <ccan/talloc/talloc.h>
5 #include <ccan/str_talloc/str_talloc.h>
6 #include <ccan/str/str.h>
7 #include <ccan/list/list.h>
8 #include <err.h>
9 #include <ctype.h>
10 #include <string.h>
11 #include <unistd.h>
12 #include <sys/types.h>
13 #include <sys/wait.h>
14 #include <sys/time.h>
15 #include <errno.h>
16 #include <signal.h>
17 #include <assert.h>
18 #include <fcntl.h>
19
20 #define STRINGIFY2(x) #x
21 #define STRINGIFY(x) STRINGIFY2(x)
22
23 /* Avoid mod by zero */
24 static unsigned int total_keys = 1;
25
26 /* #define DEBUG_DEPS 1 */
27
28 /* Traversals block transactions in the current implementation. */
29 #define TRAVERSALS_TAKE_TRANSACTION_LOCK 1
30
/* A pair of pipe file descriptors.  do_post() writes dependency pointers
 * to fd[1]; fd[0] is presumably the matching read end handed to do_pre()
 * as pre_fd (TODO confirm against the code that creates the pipes). */
struct pipe {
	int fd[2];
};
/* Array indexed by trace-file number (see do_post()). */
static struct pipe *pipes;
/* do_pre() writes a struct op_desc here when it backs off to avoid a
 * deadlock.  Starts out as -1; it is set up outside this section. */
static int backoff_fd = -1;
36
/* Report a fatal problem at filename:line on stderr (printf-style
 * message) and exit with status 1.  Never returns. */
static void __attribute__((noreturn)) fail(const char *filename,
					   unsigned int line,
					   const char *fmt, ...)
{
	va_list args;

	fprintf(stderr, "%s:%u: FAIL: ", filename, line);

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);

	fputc('\n', stderr);
	exit(1);
}
50         
/* Try or die: evaluate expr and fail() unless it yields expect.
 * Relies on `filename`, `file` and `i` being in scope at the call site.
 * The stringified expression is passed as a %s argument rather than
 * pasted into the format string, so a '%' inside expr cannot be
 * misread as a conversion specifier. */
#define try(expr, expect)						\
	do {								\
		int ret = (expr);					\
		if (ret != (expect))					\
			fail(filename[file], i+1,			\
			     "%s = %i", STRINGIFY(expr), ret);		\
	} while (0)
59
/* Try or imitate results: evaluate expr; if it does not yield expect,
 * warn on stderr and then steer towards the recorded outcome:
 *  - recorded result was 0 but we got something else: run `force`;
 *  - recorded result was non-zero but we differed:    run `undo`.
 * Relies on `filename`, `file` and `i` being in scope at the call site.
 * expr and expect are each evaluated exactly once. */
#define unreliable(expr, expect, force, undo)				\
	do {								\
		int ret = (expr);					\
		int expected_ = (expect);				\
		if (ret != expected_) {					\
			fprintf(stderr, "%s:%u: %s gave %i not %i\n",	\
				filename[file], i+1, STRINGIFY(expr),	\
				ret, expected_);			\
			if (expected_ == 0) {				\
				force;					\
			} else {					\
				undo;					\
			}						\
		}							\
	} while (0)
74
75 static bool key_eq(TDB_DATA a, TDB_DATA b)
76 {
77         if (a.dsize != b.dsize)
78                 return false;
79         return memcmp(a.dptr, b.dptr, a.dsize) == 0;
80 }
81
82 /* This is based on the hash algorithm from gdbm */
83 static unsigned int hash_key(TDB_DATA *key)
84 {
85         uint32_t value; /* Used to compute the hash value.  */
86         uint32_t   i;   /* Used to cycle through random values. */
87
88         /* Set the initial value from the key size. */
89         for (value = 0x238F13AF ^ key->dsize, i=0; i < key->dsize; i++)
90                 value = (value + (key->dptr[i] << (i*5 % 24)));
91
92         return (1103515243 * value + 12345);  
93 }
94
/* The kind of operation a trace entry records.  run_ops() switches on
 * this value to decide which tdb call to replay. */
enum op_type {
	OP_TDB_LOCKALL,
	OP_TDB_LOCKALL_MARK,
	OP_TDB_LOCKALL_UNMARK,
	OP_TDB_LOCKALL_NONBLOCK,
	OP_TDB_UNLOCKALL,
	OP_TDB_LOCKALL_READ,
	OP_TDB_LOCKALL_READ_NONBLOCK,
	OP_TDB_UNLOCKALL_READ,
	OP_TDB_CHAINLOCK,
	OP_TDB_CHAINLOCK_NONBLOCK,
	OP_TDB_CHAINLOCK_MARK,
	OP_TDB_CHAINLOCK_UNMARK,
	OP_TDB_CHAINUNLOCK,
	OP_TDB_CHAINLOCK_READ,
	OP_TDB_CHAINUNLOCK_READ,
	OP_TDB_PARSE_RECORD,
	OP_TDB_EXISTS,
	OP_TDB_STORE,
	OP_TDB_APPEND,
	OP_TDB_GET_SEQNUM,
	OP_TDB_WIPE_ALL,
	OP_TDB_TRANSACTION_START,
	OP_TDB_TRANSACTION_CANCEL,
	OP_TDB_TRANSACTION_PREPARE_COMMIT,
	OP_TDB_TRANSACTION_COMMIT,
	OP_TDB_TRAVERSE_READ_START,
	OP_TDB_TRAVERSE_START,
	OP_TDB_TRAVERSE_END,
	/* One callback invocation inside a traverse group. */
	OP_TDB_TRAVERSE,
	OP_TDB_TRAVERSE_END_EARLY,
	OP_TDB_FIRSTKEY,
	OP_TDB_NEXTKEY,
	OP_TDB_FETCH,
	OP_TDB_DELETE,
};
131
/* One parsed trace entry.  Entries live in a per-file array that
 * add_op() grows one element at a time. */
struct op {
	/* Sequence number recorded for this entry. */
	unsigned int seqnum;
	enum op_type type;
	TDB_DATA key;
	TDB_DATA data;
	/* Recorded return value that the replay is checked against. */
	int ret;

	/* Who is waiting for us? */
	struct list_head post;
	/* What are we waiting for? */
	struct list_head pre;

	/* If I'm part of a group (traverse/transaction) where is
	 * start?  (Otherwise, 0) */
	unsigned int group_start;

	/* Per-type extras; which member is valid depends on `type`. */
	union {
		int flag; /* open and store */
		struct {  /* append */
			TDB_DATA pre;
			TDB_DATA post;
		} append;
		/* transaction/traverse start/chainlock */
		unsigned int group_len;
	};
};
158
/* Identifies one op: the trace-file number and the index of the op
 * within that file's op array. */
struct op_desc {
	unsigned int file;
	unsigned int op_num;
};
163
/* Convert one hex digit (either case) to its value 0-15.
 * Calls fail() with filename:line if c is not a hex digit; the message
 * shows the character exactly as it appeared in the input. */
static unsigned char hex_char(const char *filename, unsigned int line, char c)
{
	/* toupper() requires an unsigned char value (or EOF); a plain
	 * char may be negative. */
	int upper = toupper((unsigned char)c);

	if (upper >= 'A' && upper <= 'F')
		return upper - 'A' + 10;
	if (upper >= '0' && upper <= '9')
		return upper - '0';
	fail(filename, line, "invalid hex character '%c'", c);
}
173
174 /* TDB data is <size>:<%02x>* */
175 static TDB_DATA make_tdb_data(const void *ctx,
176                               const char *filename, unsigned int line,
177                               const char *word)
178 {
179         TDB_DATA data;
180         unsigned int i;
181         const char *p;
182
183         if (streq(word, "NULL"))
184                 return tdb_null;
185
186         data.dsize = atoi(word);
187         data.dptr = talloc_array(ctx, unsigned char, data.dsize);
188         p = strchr(word, ':');
189         if (!p)
190                 fail(filename, line, "invalid tdb data '%s'", word);
191         p++;
192         for (i = 0; i < data.dsize; i++)
193                 data.dptr[i] = hex_char(filename, line, p[i*2])*16
194                         + hex_char(filename, line, p[i*2+1]);
195
196         return data;
197 }
198
199 static void add_op(const char *filename, struct op **op, unsigned int i,
200                    unsigned int seqnum, enum op_type type)
201 {
202         struct op *new;
203         *op = talloc_realloc(NULL, *op, struct op, i+1);
204         new = (*op) + i;
205         new->type = type;
206         new->seqnum = seqnum;
207         new->ret = 0;
208         new->group_start = 0;
209 }
210
211 static void op_add_nothing(const char *filename,
212                            struct op op[], unsigned int op_num, char *words[])
213 {
214         if (words[2])
215                 fail(filename, op_num+1, "Expected no arguments");
216         op[op_num].key = tdb_null;
217 }
218
219 static void op_add_key(const char *filename,
220                        struct op op[], unsigned int op_num, char *words[])
221 {
222         if (words[2] == NULL || words[3])
223                 fail(filename, op_num+1, "Expected just a key");
224
225         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
226         total_keys++;
227 }
228
229 static void op_add_key_ret(const char *filename,
230                            struct op op[], unsigned int op_num, char *words[])
231 {
232         if (!words[2] || !words[3] || !words[4] || words[5]
233             || !streq(words[3], "="))
234                 fail(filename, op_num+1, "Expected <key> = <ret>");
235         op[op_num].ret = atoi(words[4]);
236         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
237         /* May only be a unique key if it fails */
238         if (op[op_num].ret != 0)
239                 total_keys++;
240 }
241
242 static void op_add_key_data(const char *filename,
243                             struct op op[], unsigned int op_num, char *words[])
244 {
245         if (!words[2] || !words[3] || !words[4] || words[5]
246             || !streq(words[3], "="))
247                 fail(filename, op_num+1, "Expected <key> = <data>");
248         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
249         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[4]);
250         /* May only be a unique key if it fails */
251         if (!op[op_num].data.dptr)
252                 total_keys++;
253 }
254
255 /* We don't record the keys or data for a traverse, as we don't use them. */
256 static void op_add_traverse(const char *filename,
257                             struct op op[], unsigned int op_num, char *words[])
258 {
259         if (!words[2] || !words[3] || !words[4] || words[5]
260             || !streq(words[3], "="))
261                 fail(filename, op_num+1, "Expected <key> = <data>");
262         op[op_num].key = tdb_null;
263 }
264
265 /* Full traverse info is useful for debugging, but changing it to
266  * "traversefn" without the data makes the traces *much* smaller! */
267 static void op_add_traversefn(const char *filename,
268                             struct op op[], unsigned int op_num, char *words[])
269 {
270         if (words[2])
271                 fail(filename, op_num+1, "Expected no values");
272         op[op_num].key = tdb_null;
273 }
274
275 /* <seqnum> tdb_store <rec> <rec> <flag> = <ret> */
276 static void op_add_store(const char *filename,
277                          struct op op[], unsigned int op_num, char *words[])
278 {
279         if (!words[2] || !words[3] || !words[4] || !words[5] || !words[6]
280             || words[7] || !streq(words[5], "="))
281                 fail(filename, op_num+1, "Expect <key> <data> <flag> = <ret>");
282
283         op[op_num].flag = strtoul(words[4], NULL, 0);
284         op[op_num].ret = atoi(words[6]);
285         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
286         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[3]);
287         total_keys++;
288 }
289
290 /* <seqnum> tdb_append <rec> <rec> = <rec> */
291 static void op_add_append(const char *filename,
292                           struct op op[], unsigned int op_num, char *words[])
293 {
294         if (!words[2] || !words[3] || !words[4] || !words[5] || words[6]
295             || !streq(words[4], "="))
296                 fail(filename, op_num+1, "Expect <key> <data> = <rec>");
297
298         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
299         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[3]);
300
301         op[op_num].append.post
302                 = make_tdb_data(op, filename, op_num+1, words[5]);
303
304         /* By subtraction, figure out what previous data was. */
305         op[op_num].append.pre.dptr = op[op_num].append.post.dptr;
306         op[op_num].append.pre.dsize
307                 = op[op_num].append.post.dsize - op[op_num].data.dsize;
308         total_keys++;
309 }
310
311 /* <seqnum> tdb_get_seqnum = <ret> */
312 static void op_add_seqnum(const char *filename,
313                           struct op op[], unsigned int op_num, char *words[])
314 {
315         if (!words[2] || !words[3] || words[4] || !streq(words[2], "="))
316                 fail(filename, op_num+1, "Expect = <ret>");
317
318         op[op_num].key = tdb_null;
319         op[op_num].ret = atoi(words[3]);
320 }
321
322 static void op_add_traverse_start(const char *filename,
323                                   struct op op[],
324                                   unsigned int op_num, char *words[])
325 {
326         if (words[2])
327                 fail(filename, op_num+1, "Expect no arguments");
328
329         op[op_num].key = tdb_null;
330         op[op_num].group_len = 0;
331 }
332
333 static void op_add_transaction(const char *filename, struct op op[],
334                                unsigned int op_num, char *words[])
335 {
336         if (words[2])
337                 fail(filename, op_num+1, "Expect no arguments");
338
339         op[op_num].key = tdb_null;
340         op[op_num].group_len = 0;
341 }
342
343 static void op_add_chainlock(const char *filename,
344                              struct op op[], unsigned int op_num, char *words[])
345 {
346         if (words[2] == NULL || words[3])
347                 fail(filename, op_num+1, "Expected just a key");
348
349         /* A chainlock key isn't a key in the normal sense; it doesn't
350          * have to be in the db at all.  Also, we don't want to hash this op. */
351         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[2]);
352         op[op_num].key = tdb_null;
353         op[op_num].group_len = 0;
354 }
355
356 static void op_add_chainlock_ret(const char *filename,
357                                  struct op op[], unsigned int op_num,
358                                  char *words[])
359 {
360         if (!words[2] || !words[3] || !words[4] || words[5]
361             || !streq(words[3], "="))
362                 fail(filename, op_num+1, "Expected <key> = <ret>");
363         op[op_num].ret = atoi(words[4]);
364         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[2]);
365         op[op_num].key = tdb_null;
366         op[op_num].group_len = 0;
367         total_keys++;
368 }
369
370 static int op_find_start(struct op op[], unsigned int op_num, enum op_type type)
371 {
372         unsigned int i;
373
374         for (i = op_num-1; i > 0; i--) {
375                 if (op[i].type == type && !op[i].group_len)
376                         return i;
377         }
378         return 0;
379 }
380
381 static void op_analyze_transaction(const char *filename,
382                                    struct op op[], unsigned int op_num,
383                                    char *words[])
384 {
385         unsigned int start, i;
386
387         op[op_num].key = tdb_null;
388
389         if (words[2])
390                 fail(filename, op_num+1, "Expect no arguments");
391
392         start = op_find_start(op, op_num, OP_TDB_TRANSACTION_START);
393         if (!start)
394                 fail(filename, op_num+1, "no transaction start found");
395
396         op[start].group_len = op_num - start;
397
398         /* This rolls in nested transactions.  I think that's right. */
399         for (i = start; i <= op_num; i++)
400                 op[i].group_start = start;
401 }
402
403 /* We treat chainlocks a lot like transactions, even though that's overkill */
404 static void op_analyze_chainlock(const char *filename,
405                                  struct op op[], unsigned int op_num,
406                                  char *words[])
407 {
408         unsigned int i, start;
409
410         if (words[2] == NULL || words[3])
411                 fail(filename, op_num+1, "Expected just a key");
412
413         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[2]);
414         op[op_num].key = tdb_null;
415         total_keys++;
416
417         start = op_find_start(op, op_num, OP_TDB_CHAINLOCK);
418         if (!start)
419                 start = op_find_start(op, op_num, OP_TDB_CHAINLOCK_READ);
420         if (!start)
421                 fail(filename, op_num+1, "no initial chainlock found");
422
423         /* FIXME: We'd have to do something clever to make this work
424          * vs. deadlock. */
425         if (!key_eq(op[start].data, op[op_num].data))
426                 fail(filename, op_num+1, "nested chainlock calls?");
427
428         op[start].group_len = op_num - start;
429         for (i = start; i <= op_num; i++)
430                 op[i].group_start = start;
431 }
432
433 static void op_analyze_traverse(const char *filename,
434                                 struct op op[], unsigned int op_num,
435                                 char *words[])
436 {
437         int i, start;
438
439         op[op_num].key = tdb_null;
440
441         /* = %u means traverse function terminated. */
442         if (words[2]) {
443                 if (!streq(words[2], "=") || !words[3] || words[4])
444                         fail(filename, op_num+1, "expect = <num>");
445                 op[op_num].ret = atoi(words[3]);
446         } else
447                 op[op_num].ret = 0;
448
449         start = op_find_start(op, op_num, OP_TDB_TRAVERSE_START);
450         if (!start)
451                 start = op_find_start(op, op_num, OP_TDB_TRAVERSE_READ_START);
452         if (!start)
453                 fail(filename, op_num+1, "no traversal start found");
454
455         op[start].group_len = op_num - start;
456
457         /* Don't roll in nested traverse/chainlock */
458         for (i = start; i <= op_num; i++)
459                 if (!op[i].group_start)
460                         op[i].group_start = start;
461 }
462
/* Keep -Wmissing-declarations happy: */
/* NOTE(review): presumably defined by the keywords.c included right
 * after this prototype - confirm. */
const struct op_table *
find_keyword (register const char *str, register unsigned int len);
466
467 #include "keywords.c"
468
/* One dependency edge between two ops: `needs` must wait until
 * `prereq` has run.  A pointer to this struct is what do_post() writes
 * down the pipe and do_pre() reads back. */
struct depend {
	/* We can have more than one */
	/* Link in the waiting op's `pre` list (walked by dump_pre()). */
	struct list_node pre_list;
	/* Link in the prerequisite op's `post` list (walked by do_post()). */
	struct list_node post_list;
	/* The op that is waiting. */
	struct op_desc needs;
	/* The op being waited for. */
	struct op_desc prereq;
};
476
477 static void check_deps(const char *filename, struct op op[], unsigned int num)
478 {
479 #ifdef DEBUG_DEPS
480         unsigned int i;
481
482         for (i = 1; i < num; i++)
483                 if (!list_empty(&op[i].pre))
484                         fail(filename, i+1, "Still has dependencies");
485 #endif
486 }
487
488 static void dump_pre(char *filename[], struct op *op[],
489                      unsigned int file, unsigned int i)
490 {
491         struct depend *dep;
492
493         printf("%s:%u (%u) still waiting for:\n", filename[file], i+1,
494                 op[file][i].seqnum);
495         list_for_each(&op[file][i].pre, dep, pre_list)
496                 printf("    %s:%u (%u)\n",
497                        filename[dep->prereq.file], dep->prereq.op_num+1,
498                        op[dep->prereq.file][dep->prereq.op_num].seqnum);
499         check_deps(filename[file], op[file], i);
500 }
501
502 /* We simply read/write pointers, since we all are children. */
503 static bool do_pre(struct tdb_context *tdb,
504                    char *filename[], struct op *op[],
505                    unsigned int file, int pre_fd, unsigned int i,
506                    bool backoff)
507 {
508         while (!list_empty(&op[file][i].pre)) {
509                 struct depend *dep;
510
511 #if DEBUG_DEPS
512                 printf("%s:%u:waiting for pre\n", filename[file], i+1);
513                 fflush(stdout);
514 #endif
515                 if (backoff)
516                         alarm(2);
517                 else
518                         alarm(10);
519                 while (read(pre_fd, &dep, sizeof(dep)) != sizeof(dep)) {
520                         if (errno == EINTR) {
521                                 if (backoff) {
522                                         struct op_desc desc = { file,i };
523                                         warnx("%s:%u:avoiding deadlock",
524                                               filename[file], i+1);
525                                         if (write(backoff_fd, &desc,
526                                                   sizeof(desc)) != sizeof(desc))
527                                                 err(1, "writing backoff_fd");
528                                         return false;
529                                 }
530                                 dump_pre(filename, op, file, i);
531                                 exit(1);
532                         } else
533                                 errx(1, "Reading from pipe");
534                 }
535                 alarm(0);
536
537 #if DEBUG_DEPS
538                 printf("%s:%u:got pre %u from %s:%u\n", filename[file], i+1,
539                        dep->needs.op_num+1, filename[dep->prereq.file],
540                        dep->prereq.op_num+1);
541                 fflush(stdout);
542 #endif
543                 /* This could be any op, not just this one. */
544                 talloc_free(dep);
545         }
546         return true;
547 }
548
549 static void do_post(char *filename[], struct op *op[],
550                     unsigned int file, unsigned int i)
551 {
552         struct depend *dep;
553
554         list_for_each(&op[file][i].post, dep, post_list) {
555 #if DEBUG_DEPS
556                 printf("%s:%u:sending to file %s:%u\n", filename[file], i+1,
557                        filename[dep->needs.file], dep->needs.op_num+1);
558 #endif
559                 if (write(pipes[dep->needs.file].fd[1], &dep, sizeof(dep))
560                     != sizeof(dep))
561                         err(1, "%s:%u failed to tell file %s",
562                             filename[file], i+1, filename[dep->needs.file]);
563         }
564 }
565
566 static int get_len(TDB_DATA key, TDB_DATA data, void *private_data)
567 {
568         return data.dsize;
569 }
570
/* Forward declaration: nontrivial_traverse() and op_traverse() below
 * call run_ops(), which in turn calls op_traverse(). */
static unsigned run_ops(struct tdb_context *tdb,
			int pre_fd,
			char *filename[],
			struct op *op[],
			unsigned int file,
			unsigned int start, unsigned int stop,
			bool backoff);
578
/* State handed to nontrivial_traverse() through the traverse's private
 * pointer. */
struct traverse_info {
	/* Per-file op arrays and file names, as passed to run_ops(). */
	struct op **op;
	char **filename;
	/* Which trace file is being replayed. */
	unsigned file;
	/* Descriptor that dependency notifications are read from. */
	int pre_fd;
	/* Index of the traverse-start op. */
	unsigned int start;
	/* Index of the next op to handle inside the traverse group. */
	unsigned int i;
};
587
588 /* More complex.  Just do whatever's they did at the n'th entry. */
589 static int nontrivial_traverse(struct tdb_context *tdb,
590                                TDB_DATA key, TDB_DATA data,
591                                void *_tinfo)
592 {
593         struct traverse_info *tinfo = _tinfo;
594         unsigned int trav_len = tinfo->op[tinfo->file][tinfo->start].group_len;
595         bool avoid_deadlock = false;
596
597         if (tinfo->i == tinfo->start + trav_len) {
598                 /* This can happen if traverse expects to be empty. */
599                 if (trav_len == 1)
600                         return 1;
601                 fail(tinfo->filename[tinfo->file], tinfo->start + 1,
602                      "traverse did not terminate");
603         }
604
605         if (tinfo->op[tinfo->file][tinfo->i].type != OP_TDB_TRAVERSE)
606                 fail(tinfo->filename[tinfo->file], tinfo->start + 1,
607                      "%s:%u:traverse terminated early");
608
609 #if TRAVERSALS_TAKE_TRANSACTION_LOCK
610         avoid_deadlock = true;
611 #endif
612
613         /* Run any normal ops. */
614         tinfo->i = run_ops(tdb, tinfo->pre_fd, tinfo->filename, tinfo->op,
615                            tinfo->file, tinfo->i+1, tinfo->start + trav_len,
616                            avoid_deadlock);
617
618         /* We backed off, or we hit OP_TDB_TRAVERSE_END/EARLY. */
619         if (tinfo->op[tinfo->file][tinfo->i].type != OP_TDB_TRAVERSE)
620                 return 1;
621
622         return 0;
623 }
624
625 static unsigned op_traverse(struct tdb_context *tdb,
626                             int pre_fd,
627                             char *filename[],
628                             unsigned int file,
629                             int (*traversefn)(struct tdb_context *,
630                                               tdb_traverse_func, void *),
631                             struct op *op[],
632                             unsigned int start)
633 {
634         struct traverse_info tinfo = { op, filename, file, pre_fd,
635                                        start, start+1 };
636
637         traversefn(tdb, nontrivial_traverse, &tinfo);
638
639         /* Traversing in wrong order can have strange effects: eg. if
640          * original traverse went A (delete A), B, we might do B
641          * (delete A).  So if we have ops left over, we do it now. */
642         while (tinfo.i != start + op[file][start].group_len) {
643                 if (op[file][tinfo.i].type == OP_TDB_TRAVERSE
644                     || op[file][tinfo.i].type == OP_TDB_TRAVERSE_END_EARLY)
645                         tinfo.i++;
646                 else
647                         tinfo.i = run_ops(tdb, pre_fd, filename, op, file,
648                                           tinfo.i,
649                                           start + op[file][start].group_len,
650                                           false);
651         }
652
653         return tinfo.i;
654 }
655
/* SIGALRM handler: deliberately does nothing.  run_ops() installs it
 * for SIGALRM. */
static void break_out(int sig)
{
	(void)sig;
}
659
660 static __attribute__((noinline))
661 unsigned run_ops(struct tdb_context *tdb,
662                  int pre_fd,
663                  char *filename[],
664                  struct op *op[],
665                  unsigned int file,
666                  unsigned int start, unsigned int stop,
667                  bool backoff)
668 {
669         unsigned int i;
670         struct sigaction sa;
671
672         sa.sa_handler = break_out;
673         sa.sa_flags = 0;
674
675         sigaction(SIGALRM, &sa, NULL);
676         for (i = start; i < stop; i++) {
677                 if (!do_pre(tdb, filename, op, file, pre_fd, i, backoff))
678                         return i;
679
680                 switch (op[file][i].type) {
681                 case OP_TDB_LOCKALL:
682                         try(tdb_lockall(tdb), op[file][i].ret);
683                         break;
684                 case OP_TDB_LOCKALL_MARK:
685                         try(tdb_lockall_mark(tdb), op[file][i].ret);
686                         break;
687                 case OP_TDB_LOCKALL_UNMARK:
688                         try(tdb_lockall_unmark(tdb), op[file][i].ret);
689                         break;
690                 case OP_TDB_LOCKALL_NONBLOCK:
691                         unreliable(tdb_lockall_nonblock(tdb), op[file][i].ret,
692                                    tdb_lockall(tdb), tdb_unlockall(tdb));
693                         break;
694                 case OP_TDB_UNLOCKALL:
695                         try(tdb_unlockall(tdb), op[file][i].ret);
696                         break;
697                 case OP_TDB_LOCKALL_READ:
698                         try(tdb_lockall_read(tdb), op[file][i].ret);
699                         break;
700                 case OP_TDB_LOCKALL_READ_NONBLOCK:
701                         unreliable(tdb_lockall_read_nonblock(tdb),
702                                    op[file][i].ret,
703                                    tdb_lockall_read(tdb),
704                                    tdb_unlockall_read(tdb));
705                         break;
706                 case OP_TDB_UNLOCKALL_READ:
707                         try(tdb_unlockall_read(tdb), op[file][i].ret);
708                         break;
709                 case OP_TDB_CHAINLOCK:
710                         try(tdb_chainlock(tdb, op[file][i].key),
711                             op[file][i].ret);
712                         break;
713                 case OP_TDB_CHAINLOCK_NONBLOCK:
714                         unreliable(tdb_chainlock_nonblock(tdb, op[file][i].key),
715                                    op[file][i].ret,
716                                    tdb_chainlock(tdb, op[file][i].key),
717                                    tdb_chainunlock(tdb, op[file][i].key));
718                         break;
719                 case OP_TDB_CHAINLOCK_MARK:
720                         try(tdb_chainlock_mark(tdb, op[file][i].key),
721                             op[file][i].ret);
722                         break;
723                 case OP_TDB_CHAINLOCK_UNMARK:
724                         try(tdb_chainlock_unmark(tdb, op[file][i].key),
725                             op[file][i].ret);
726                         break;
727                 case OP_TDB_CHAINUNLOCK:
728                         try(tdb_chainunlock(tdb, op[file][i].key),
729                             op[file][i].ret);
730                         break;
731                 case OP_TDB_CHAINLOCK_READ:
732                         try(tdb_chainlock_read(tdb, op[file][i].key),
733                             op[file][i].ret);
734                         break;
735                 case OP_TDB_CHAINUNLOCK_READ:
736                         try(tdb_chainunlock_read(tdb, op[file][i].key),
737                             op[file][i].ret);
738                         break;
739                 case OP_TDB_PARSE_RECORD:
740                         try(tdb_parse_record(tdb, op[file][i].key, get_len,
741                                              NULL),
742                             op[file][i].ret);
743                         break;
744                 case OP_TDB_EXISTS:
745                         try(tdb_exists(tdb, op[file][i].key), op[file][i].ret);
746                         break;
747                 case OP_TDB_STORE:
748                         try(tdb_store(tdb, op[file][i].key, op[file][i].data,
749                                       op[file][i].flag),
750                             op[file][i].ret);
751                         break;
752                 case OP_TDB_APPEND:
753                         try(tdb_append(tdb, op[file][i].key, op[file][i].data),
754                             op[file][i].ret);
755                         break;
756                 case OP_TDB_GET_SEQNUM:
757                         try(tdb_get_seqnum(tdb), op[file][i].ret);
758                         break;
759                 case OP_TDB_WIPE_ALL:
760                         try(tdb_wipe_all(tdb), op[file][i].ret);
761                         break;
762                 case OP_TDB_TRANSACTION_START:
763                         try(tdb_transaction_start(tdb), op[file][i].ret);
764                         break;
765                 case OP_TDB_TRANSACTION_CANCEL:
766                         try(tdb_transaction_cancel(tdb), op[file][i].ret);
767                         break;
768                 case OP_TDB_TRANSACTION_PREPARE_COMMIT:
769                         try(tdb_transaction_prepare_commit(tdb),
770                             op[file][i].ret);
771                         break;
772                 case OP_TDB_TRANSACTION_COMMIT:
773                         try(tdb_transaction_commit(tdb), op[file][i].ret);
774                         break;
775                 case OP_TDB_TRAVERSE_READ_START:
776                         i = op_traverse(tdb, pre_fd, filename, file,
777                                         tdb_traverse_read, op, i);
778                         break;
779                 case OP_TDB_TRAVERSE_START:
780                         i = op_traverse(tdb, pre_fd, filename, file,
781                                         tdb_traverse, op, i);
782                         break;
783                 case OP_TDB_TRAVERSE:
784                 case OP_TDB_TRAVERSE_END_EARLY:
785                         /* Terminate: we're in a traverse, and we've
786                          * done our ops. */
787                         return i;
788                 case OP_TDB_TRAVERSE_END:
789                         fail(filename[file], i+1, "unexpected end traverse");
790                 /* FIXME: These must be treated like traverse. */
791                 case OP_TDB_FIRSTKEY:
792                         if (!key_eq(tdb_firstkey(tdb), op[file][i].data))
793                                 fail(filename[file], i+1, "bad firstkey");
794                         break;
795                 case OP_TDB_NEXTKEY:
796                         if (!key_eq(tdb_nextkey(tdb, op[file][i].key),
797                                     op[file][i].data))
798                                 fail(filename[file], i+1, "bad nextkey");
799                         break;
800                 case OP_TDB_FETCH: {
801                         TDB_DATA f = tdb_fetch(tdb, op[file][i].key);
802                         if (!key_eq(f, op[file][i].data))
803                                 fail(filename[file], i+1, "bad fetch %u",
804                                      f.dsize);
805                         break;
806                 }
807                 case OP_TDB_DELETE:
808                         try(tdb_delete(tdb, op[file][i].key), op[file][i].ret);
809                         break;
810                 }
811                 do_post(filename, op, file, i);
812         }
813         return i;
814 }
815
816 /* tdbtorture, in particular, can do a tdb_close with a transaction in
817  * progress. */
818 static struct op *maybe_cancel_transaction(const char *filename,
819                                            struct op *op, unsigned int *num)
820 {
821         unsigned int start = op_find_start(op, *num, OP_TDB_TRANSACTION_START);
822
823         if (start) {
824                 char *words[] = { "<unknown>", "tdb_close", NULL };
825                 add_op(filename, &op, *num, op[start].seqnum,
826                        OP_TDB_TRANSACTION_CANCEL);
827                 op_analyze_transaction(filename, op, *num, words);
828                 (*num)++;
829         }
830         return op;
831 }
832
833 static struct op *load_tracefile(const char *filename, unsigned int *num,
834                                  unsigned int *hashsize,
835                                  unsigned int *tdb_flags,
836                                  unsigned int *open_flags)
837 {
838         unsigned int i;
839         struct op *op = talloc_array(NULL, struct op, 1);
840         char **words;
841         char **lines;
842         char *file;
843
844         file = grab_file(NULL, filename, NULL);
845         if (!file)
846                 err(1, "Reading %s", filename);
847
848         lines = strsplit(file, file, "\n", NULL);
849         if (!lines[0])
850                 errx(1, "%s is empty", filename);
851
852         words = strsplit(lines, lines[0], " ", NULL);
853         if (!streq(words[1], "tdb_open"))
854                 fail(filename, 1, "does not start with tdb_open");
855
856         *hashsize = atoi(words[2]);
857         *tdb_flags = strtoul(words[3], NULL, 0);
858         *open_flags = strtoul(words[4], NULL, 0);
859
860         for (i = 1; lines[i]; i++) {
861                 const struct op_table *opt;
862
863                 words = strsplit(lines, lines[i], " ", NULL);
864                 if (!words[0] || !words[1])
865                         fail(filename, i+1, "Expected seqnum number and op");
866                
867                 opt = find_keyword(words[1], strlen(words[1]));
868                 if (!opt) {
869                         if (streq(words[1], "tdb_close")) {
870                                 if (lines[i+1])
871                                         fail(filename, i+2,
872                                              "lines after tdb_close");
873                                 *num = i;
874                                 talloc_free(lines);
875                                 return maybe_cancel_transaction(filename,
876                                                                 op, num);
877                         }
878                         fail(filename, i+1, "Unknown operation '%s'", words[1]);
879                 }
880
881                 add_op(filename, &op, i, atoi(words[0]), opt->type);
882                 opt->enhance_op(filename, op, i, words);
883         }
884
885         fprintf(stderr, "%s:%u:last operation is not tdb_close: incomplete?",
886               filename, i);
887         talloc_free(lines);
888         *num = i - 1;
889         return maybe_cancel_transaction(filename, op, num);
890 }
891
892 /* We remember all the keys we've ever seen, and who has them. */
893 struct keyinfo {
894         TDB_DATA key;
895         unsigned int num_users;
896         struct op_desc *user;
897 };
898
899 static const TDB_DATA must_not_exist;
900 static const TDB_DATA must_exist;
901 static const TDB_DATA not_exists_or_empty;
902
903 /* NULL means doesn't care if it exists or not, &must_exist means
904  * it must exist but we don't care what, &must_not_exist means it must
905  * not exist, otherwise the data it needs. */
906 static const TDB_DATA *needs(const struct op *op)
907 {
908         switch (op->type) {
909         /* FIXME: Pull forward deps, since we can deadlock */
910         case OP_TDB_CHAINLOCK:
911         case OP_TDB_CHAINLOCK_NONBLOCK:
912         case OP_TDB_CHAINLOCK_MARK:
913         case OP_TDB_CHAINLOCK_UNMARK:
914         case OP_TDB_CHAINUNLOCK:
915         case OP_TDB_CHAINLOCK_READ:
916         case OP_TDB_CHAINUNLOCK_READ:
917                 return NULL;
918
919         case OP_TDB_APPEND:
920                 if (op->append.pre.dsize == 0)
921                         return &not_exists_or_empty;
922                 return &op->append.pre;
923
924         case OP_TDB_STORE:
925                 if (op->flag == TDB_INSERT) {
926                         if (op->ret < 0)
927                                 return &must_exist;
928                         else
929                                 return &must_not_exist;
930                 } else if (op->flag == TDB_MODIFY) {
931                         if (op->ret < 0)
932                                 return &must_not_exist;
933                         else
934                                 return &must_exist;
935                 }
936                 /* No flags?  Don't care */
937                 return NULL;
938
939         case OP_TDB_EXISTS:
940                 if (op->ret == 1)
941                         return &must_exist;
942                 else
943                         return &must_not_exist;
944
945         case OP_TDB_PARSE_RECORD:
946                 if (op->ret < 0)
947                         return &must_not_exist;
948                 return &must_exist;
949
950         /* FIXME: handle these. */
951         case OP_TDB_WIPE_ALL:
952         case OP_TDB_FIRSTKEY:
953         case OP_TDB_NEXTKEY:
954         case OP_TDB_GET_SEQNUM:
955         case OP_TDB_TRAVERSE:
956         case OP_TDB_TRANSACTION_COMMIT:
957         case OP_TDB_TRANSACTION_CANCEL:
958         case OP_TDB_TRANSACTION_START:
959                 return NULL;
960
961         case OP_TDB_FETCH:
962                 if (!op->data.dptr)
963                         return &must_not_exist;
964                 return &op->data;
965
966         case OP_TDB_DELETE:
967                 if (op->ret < 0)
968                         return &must_not_exist;
969                 return &must_exist;
970
971         default:
972                 errx(1, "Unexpected op type %i", op->type);
973         }
974         
975 }
976
977 static bool starts_transaction(const struct op *op)
978 {
979         return op->type == OP_TDB_TRANSACTION_START;
980 }
981
982 static bool in_transaction(const struct op op[], unsigned int i)
983 {
984         return op[i].group_start && starts_transaction(&op[op[i].group_start]);
985 }
986
987 static bool successful_transaction(const struct op *op)
988 {
989         return starts_transaction(op)
990                 && op[op->group_len].type == OP_TDB_TRANSACTION_COMMIT;
991 }
992
993 static bool starts_traverse(const struct op *op)
994 {
995         return op->type == OP_TDB_TRAVERSE_START
996                 || op->type == OP_TDB_TRAVERSE_READ_START;
997 }
998
999 static bool in_traverse(const struct op op[], unsigned int i)
1000 {
1001         return op[i].group_start && starts_traverse(&op[op[i].group_start]);
1002 }
1003
1004 static bool starts_chainlock(const struct op *op)
1005 {
1006         return op->type == OP_TDB_CHAINLOCK_READ
1007                 || op->type == OP_TDB_CHAINLOCK;
1008 }
1009
1010 static bool in_chainlock(const struct op op[], unsigned int i)
1011 {
1012         return op[i].group_start && starts_chainlock(&op[op[i].group_start]);
1013 }
1014
1015 /* What's the data after this op?  pre if nothing changed. */
1016 static const TDB_DATA *gives(const TDB_DATA *key, const TDB_DATA *pre,
1017                              const struct op *op)
1018 {
1019         if (starts_transaction(op) || starts_chainlock(op)) {
1020                 unsigned int i;
1021
1022                 /* Cancelled transactions don't change anything. */
1023                 if (op[op->group_len].type == OP_TDB_TRANSACTION_CANCEL)
1024                         return pre;
1025                 assert(op[op->group_len].type == OP_TDB_TRANSACTION_COMMIT
1026                        || op[op->group_len].type == OP_TDB_CHAINUNLOCK_READ
1027                        || op[op->group_len].type == OP_TDB_CHAINUNLOCK);
1028
1029                 for (i = 1; i < op->group_len; i++) {
1030                         /* This skips nested transactions, too */
1031                         if (key_eq(op[i].key, *key))
1032                                 pre = gives(key, pre, &op[i]);
1033                 }
1034                 return pre;
1035         }
1036
1037         /* Failed ops don't change state of db. */
1038         if (op->ret < 0)
1039                 return pre;
1040
1041         if (op->type == OP_TDB_DELETE || op->type == OP_TDB_WIPE_ALL)
1042                 return &tdb_null;
1043
1044         if (op->type == OP_TDB_APPEND)
1045                 return &op->append.post;
1046
1047         if (op->type == OP_TDB_STORE)
1048                 return &op->data;
1049
1050         return pre;
1051 }
1052
1053 static struct keyinfo *hash_ops(struct op *op[], unsigned int num_ops[],
1054                                 unsigned int num)
1055 {
1056         unsigned int i, j, h;
1057         struct keyinfo *hash;
1058
1059         hash = talloc_zero_array(op[0], struct keyinfo, total_keys*2);
1060         for (i = 0; i < num; i++) {
1061                 for (j = 1; j < num_ops[i]; j++) {
1062                         /* We can't do this on allocation, due to realloc. */
1063                         list_head_init(&op[i][j].post);
1064                         list_head_init(&op[i][j].pre);
1065
1066                         if (!op[i][j].key.dptr)
1067                                 continue;
1068
1069                         h = hash_key(&op[i][j].key) % (total_keys * 2);
1070                         while (!key_eq(hash[h].key, op[i][j].key)) {
1071                                 if (!hash[h].key.dptr) {
1072                                         hash[h].key = op[i][j].key;
1073                                         break;
1074                                 }
1075                                 h = (h + 1) % (total_keys * 2);
1076                         }
1077                         /* Might as well save some memory if we can. */
1078                         if (op[i][j].key.dptr != hash[h].key.dptr) {
1079                                 talloc_free(op[i][j].key.dptr);
1080                                 op[i][j].key.dptr = hash[h].key.dptr;
1081                         }
1082                         hash[h].user = talloc_realloc(hash, hash[h].user,
1083                                                      struct op_desc,
1084                                                      hash[h].num_users+1);
1085
1086                         /* If it's in a transaction, it's the transaction which
1087                          * matters from an analysis POV. */
1088                         if (in_transaction(op[i], j)
1089                             || in_chainlock(op[i], j)) {
1090                                 unsigned start = op[i][j].group_start;
1091
1092                                 /* Don't include twice. */
1093                                 if (hash[h].num_users
1094                                     && hash[h].user[hash[h].num_users-1].file
1095                                         == i
1096                                     && hash[h].user[hash[h].num_users-1].op_num
1097                                         == start)
1098                                         continue;
1099
1100                                 hash[h].user[hash[h].num_users].op_num = start;
1101                         } else
1102                                 hash[h].user[hash[h].num_users].op_num = j;
1103                         hash[h].user[hash[h].num_users].file = i;
1104                         hash[h].num_users++;
1105                 }
1106         }
1107
1108         return hash;
1109 }
1110
1111 static bool satisfies(const TDB_DATA *key, const TDB_DATA *data,
1112                       const struct op *op)
1113 {
1114         const TDB_DATA *need = NULL;
1115
1116         if (starts_transaction(op) || starts_chainlock(op)) {
1117                 unsigned int i;
1118
1119                 /* Look through for an op in this transaction which
1120                  * needs this key. */
1121                 for (i = 1; i < op->group_len; i++) {
1122                         if (key_eq(op[i].key, *key)) {
1123                                 need = needs(&op[i]);
1124                                 /* tdb_exists() is special: there might be
1125                                  * something in the transaction with more
1126                                  * specific requirements.  Other ops don't have
1127                                  * specific requirements (eg. store or delete),
1128                                  * but they change the value so we can't get
1129                                  * more information from future ops. */
1130                                 if (op[i].type != OP_TDB_EXISTS)
1131                                         break;
1132                         }
1133                 }
1134         } else
1135                 need = needs(op);
1136
1137         /* Don't need anything?  Cool. */
1138         if (!need)
1139                 return true;
1140
1141         /* This should be tdb_null or a real value. */
1142         assert(data != &must_exist);
1143         assert(data != &must_not_exist);
1144         assert(data != &not_exists_or_empty);
1145
1146         /* Must not exist?  data must not exist. */
1147         if (need == &must_not_exist)
1148                 return data == &tdb_null;
1149
1150         /* Must exist? */
1151         if (need == &must_exist)
1152                 return data != &tdb_null;
1153
1154         /* Either noexist or empty. */
1155         if (need == &not_exists_or_empty)
1156                 return data->dsize == 0;
1157
1158         /* Needs something specific. */
1159         return key_eq(*data, *need);
1160 }
1161
1162 static void move_to_front(struct op_desc res[], unsigned off, unsigned elem)
1163 {
1164         if (elem != off) {
1165                 struct op_desc tmp = res[elem];
1166                 memmove(res + off + 1, res + off, (elem - off)*sizeof(res[0]));
1167                 res[off] = tmp;
1168         }
1169 }
1170
1171 static void restore_to_pos(struct op_desc res[], unsigned off, unsigned elem)
1172 {
1173         if (elem != off) {
1174                 struct op_desc tmp = res[off];
1175                 memmove(res + off, res + off + 1, (elem - off)*sizeof(res[0]));
1176                 res[elem] = tmp;
1177         }
1178 }
1179
1180 static bool sort_deps(char *filename[], struct op *op[],
1181                       struct op_desc res[],
1182                       unsigned off, unsigned num,
1183                       const TDB_DATA *key, const TDB_DATA *data,
1184                       unsigned num_files, unsigned fuzz)
1185 {
1186         unsigned int i, files_done;
1187         struct op *this_op;
1188         bool done[num_files];
1189
1190         /* None left?  We're sorted. */
1191         if (off == num)
1192                 return true;
1193
1194         /* Does this make sequence number go backwards?  Allow a little fuzz. */
1195         if (off > 0) {
1196                 int seqnum1 = op[res[off-1].file][res[off-1].op_num].seqnum;
1197                 int seqnum2 = op[res[off].file][res[off].op_num].seqnum;
1198
1199                 if (seqnum1 - seqnum2 > (int)fuzz) {
1200 #if DEBUG_DEPS
1201                         printf("Seqnum jump too far (%u -> %u)\n",
1202                                seqnum1, seqnum2);
1203 #endif
1204                         return false;
1205                 }
1206         }
1207
1208         memset(done, 0, sizeof(done));
1209
1210         /* Since ops within a trace file are ordered, we just need to figure
1211          * out which file to try next.  Since we don't take into account
1212          * inter-key relationships (which exist by virtue of trace file order),
1213          * we minimize the chance of harm by trying to keep in seqnum order. */
1214         for (files_done = 0, i = off; i < num && files_done < num_files; i++) {
1215                 if (done[res[i].file])
1216                         continue;
1217
1218                 this_op = &op[res[i].file][res[i].op_num];
1219
1220                 /* Is what we have good enough for this op? */
1221                 if (satisfies(key, data, this_op)) {
1222                         move_to_front(res, off, i);
1223                         if (sort_deps(filename, op, res, off+1, num,
1224                                       key, gives(key, data, this_op),
1225                                       num_files, fuzz))
1226                                 return true;
1227                         restore_to_pos(res, off, i);
1228                 }
1229                 done[res[i].file] = true;
1230                 files_done++;
1231         }
1232
1233         /* No combination worked. */
1234         return false;
1235 }
1236
1237 static void check_dep_sorting(struct op_desc user[], unsigned num_users,
1238                               unsigned num_files)
1239 {
1240 #if DEBUG_DEPS
1241         unsigned int i;
1242         unsigned minima[num_files];
1243
1244         memset(minima, 0, sizeof(minima));
1245         for (i = 0; i < num_users; i++) {
1246                 assert(minima[user[i].file] < user[i].op_num);
1247                 minima[user[i].file] = user[i].op_num;
1248         }
1249 #endif
1250 }
1251
1252 /* All these ops happen on the same key.  Which comes first?
1253  *
1254  * This can happen both because read ops or failed write ops don't
1255  * change sequence number, and also due to race since we access the
1256  * number unlocked (the race can cause less detectable ordering problems,
1257  * in which case we'll deadlock and report: fix manually in that case).
1258  */
1259 static void figure_deps(char *filename[], struct op *op[],
1260                         const TDB_DATA *key, struct op_desc user[],
1261                         unsigned num_users, unsigned num_files)
1262 {
1263         /* We assume database starts empty. */
1264         const struct TDB_DATA *data = &tdb_null;
1265         unsigned int fuzz;
1266
1267         /* We prefer to keep strict seqnum order if possible: it's the
1268          * most likely.  We get more lax if that fails. */
1269         for (fuzz = 0; fuzz < 100; fuzz = (fuzz + 1)*2) {
1270                 if (sort_deps(filename, op, user, 0, num_users, key, data,
1271                               num_files, fuzz))
1272                         break;
1273         }
1274
1275         if (fuzz >= 100)
1276                 fail(filename[user[0].file], user[0].op_num+1,
1277                      "Could not resolve inter-dependencies");
1278
1279         check_dep_sorting(user, num_users, num_files);
1280 }
1281
1282 static void sort_ops(struct keyinfo hash[], char *filename[], struct op *op[],
1283                      unsigned int num)
1284 {
1285         unsigned int h;
1286
1287         /* Gcc nexted function extension.  How cool is this? */
1288         int compare_seqnum(const void *_a, const void *_b)
1289         {
1290                 const struct op_desc *a = _a, *b = _b;
1291
1292                 /* First, maintain order within any trace file. */
1293                 if (a->file == b->file)
1294                         return a->op_num - b->op_num;
1295
1296                 /* Otherwise, arrange by seqnum order. */
1297                 if (op[a->file][a->op_num].seqnum !=
1298                     op[b->file][b->op_num].seqnum)
1299                         return op[a->file][a->op_num].seqnum
1300                                 - op[b->file][b->op_num].seqnum;
1301
1302                 /* Cancelled transactions are assumed to happen first. */
1303                 if (starts_transaction(&op[a->file][a->op_num])
1304                     && !successful_transaction(&op[a->file][a->op_num]))
1305                         return -1;
1306                 if (starts_transaction(&op[b->file][b->op_num])
1307                     && !successful_transaction(&op[b->file][b->op_num]))
1308                         return 1;
1309
1310                 /* No idea. */
1311                 return 0;
1312         }
1313
1314         /* Now sort into seqnum order. */
1315         for (h = 0; h < total_keys * 2; h++) {
1316                 struct op_desc *user = hash[h].user;
1317
1318                 qsort(user, hash[h].num_users, sizeof(user[0]), compare_seqnum);
1319                 figure_deps(filename, op, &hash[h].key, user, hash[h].num_users,
1320                             num);
1321         }
1322 }
1323
1324 static int destroy_depend(struct depend *dep)
1325 {
1326         list_del(&dep->pre_list);
1327         list_del(&dep->post_list);
1328         return 0;
1329 }
1330
1331 static void add_dependency(void *ctx,
1332                            struct op *op[],
1333                            char *filename[],
1334                            const struct op_desc *needs,
1335                            const struct op_desc *prereq)
1336 {
1337         struct depend *dep;
1338
1339         /* We don't depend on ourselves. */
1340         if (needs->file == prereq->file) {
1341                 assert(prereq->op_num < needs->op_num);
1342                 return;
1343         }
1344
1345 #if DEBUG_DEPS
1346         printf("%s:%u: depends on %s:%u\n",
1347                filename[needs->file], needs->op_num+1,
1348                filename[prereq->file], prereq->op_num+1);
1349 #endif
1350
1351         dep = talloc(ctx, struct depend);
1352         dep->needs = *needs;
1353         dep->prereq = *prereq;
1354
1355 #if TRAVERSALS_TAKE_TRANSACTION_LOCK
1356         /* If something in a traverse depends on something in another
1357          * traverse/transaction, it creates a dependency between the
1358          * two groups. */
1359         if ((in_traverse(op[prereq->file], prereq->op_num)
1360              && (starts_transaction(&op[needs->file][needs->op_num])
1361                  || starts_traverse(&op[needs->file][needs->op_num])))
1362             || (in_traverse(op[needs->file], needs->op_num)
1363                 && (starts_transaction(&op[prereq->file][prereq->op_num])
1364                     || starts_traverse(&op[prereq->file][prereq->op_num])))) {
1365                 unsigned int start;
1366
1367                 /* We are satisfied by end of group. */
1368                 start = op[prereq->file][prereq->op_num].group_start;
1369                 dep->prereq.op_num = start + op[prereq->file][start].group_len;
1370                 /* And we need that done by start of our group. */
1371                 dep->needs.op_num = op[needs->file][needs->op_num].group_start;
1372         }
1373
1374         /* There is also this case:
1375          *  <traverse> <read foo> ...
1376          *  <transaction> ... </transaction> <create foo>
1377          * Where if we start the traverse then wait, we could block
1378          * the transaction and deadlock.
1379          *
1380          * We try to address this by ensuring that where seqnum indicates it's
1381          * possible, we wait for <create foo> before *starting* traverse.
1382          */
1383         else if (in_traverse(op[needs->file], needs->op_num)) {
1384                 struct op *need = &op[needs->file][needs->op_num];
1385                 if (op[needs->file][need->group_start].seqnum >
1386                     op[prereq->file][prereq->op_num].seqnum) {
1387                         dep->needs.op_num = need->group_start;
1388                 }
1389         }
1390 #endif
1391
1392         /* If you depend on a transaction or chainlock, you actually
1393          * depend on it ending. */
1394         if (starts_transaction(&op[prereq->file][dep->prereq.op_num])
1395             || starts_chainlock(&op[prereq->file][dep->prereq.op_num])) {
1396                 dep->prereq.op_num
1397                         += op[dep->prereq.file][dep->prereq.op_num].group_len;
1398 #if DEBUG_DEPS
1399                 printf("-> Actually end of transaction %s:%u\n",
1400                        filename[dep->prereq->file], dep->prereq->op_num+1);
1401 #endif
1402         } else
1403                 /* We should never create a dependency from middle of
1404                  * a transaction. */
1405                 assert(!in_transaction(op[prereq->file], dep->prereq.op_num)
1406                        || op[prereq->file][dep->prereq.op_num].type
1407                        == OP_TDB_TRANSACTION_COMMIT
1408                        || op[prereq->file][dep->prereq.op_num].type
1409                        == OP_TDB_TRANSACTION_CANCEL);
1410
1411         list_add(&op[dep->prereq.file][dep->prereq.op_num].post,
1412                  &dep->post_list);
1413         list_add(&op[dep->needs.file][dep->needs.op_num].pre,
1414                  &dep->pre_list);
1415         talloc_set_destructor(dep, destroy_depend);
1416 }
1417
1418 static bool changes_db(const TDB_DATA *key, const struct op *op)
1419 {
1420         return gives(key, NULL, op) != NULL;
1421 }
1422
1423 static void depend_on_previous(struct op *op[],
1424                                char *filename[],
1425                                unsigned int num,
1426                                struct op_desc user[],
1427                                unsigned int i,
1428                                int prev)
1429 {
1430         bool deps[num];
1431         int j;
1432
1433         if (i == 0)
1434                 return;
1435
1436         if (prev == i - 1) {
1437                 /* Just depend on previous. */
1438                 add_dependency(NULL, op, filename, &user[i], &user[prev]);
1439                 return;
1440         }
1441
1442         /* We have to wait for the readers.  Find last one in *each* file. */
1443         memset(deps, 0, sizeof(deps));
1444         deps[user[i].file] = true;
1445         for (j = i - 1; j > prev; j--) {
1446                 if (!deps[user[j].file]) {
1447                         add_dependency(NULL, op, filename, &user[i], &user[j]);
1448                         deps[user[j].file] = true;
1449                 }
1450         }
1451 }
1452
1453 /* This is simple, but not complete.  We don't take into account
1454  * indirect dependencies. */
1455 static void optimize_dependencies(struct op *op[], unsigned int num_ops[],
1456                                   unsigned int num)
1457 {
1458         unsigned int i, j;
1459
1460         /* There can only be one real dependency on each file */
1461         for (i = 0; i < num; i++) {
1462                 for (j = 1; j < num_ops[i]; j++) {
1463                         struct depend *dep, *next;
1464                         struct depend *prev[num];
1465
1466                         memset(prev, 0, sizeof(prev));
1467
1468                         list_for_each_safe(&op[i][j].pre, dep, next, pre_list) {
1469                                 if (!prev[dep->prereq.file]) {
1470                                         prev[dep->prereq.file] = dep;
1471                                         continue;
1472                                 }
1473                                 if (prev[dep->prereq.file]->prereq.op_num
1474                                     < dep->prereq.op_num) {
1475                                         talloc_free(prev[dep->prereq.file]);
1476                                         prev[dep->prereq.file] = dep;
1477                                 } else
1478                                         talloc_free(dep);
1479                         }
1480                 }
1481         }
1482
1483         for (i = 0; i < num; i++) {
1484                 int deps[num];
1485
1486                 for (j = 0; j < num; j++)
1487                         deps[j] = -1;
1488
1489                 for (j = 1; j < num_ops[i]; j++) {
1490                         struct depend *dep, *next;
1491
1492                         list_for_each_safe(&op[i][j].pre, dep, next, pre_list) {
1493                                 if (deps[dep->prereq.file]
1494                                     >= (int)dep->prereq.op_num)
1495                                         talloc_free(dep);
1496                                 else
1497                                         deps[dep->prereq.file]
1498                                                 = dep->prereq.op_num;
1499                         }
1500                 }
1501         }
1502 }
1503
1504 #if TRAVERSALS_TAKE_TRANSACTION_LOCK
1505 /* Force an order among the traversals, so they don't deadlock (as much) */
1506 static void make_traverse_depends(char *filename[],
1507                                   struct op *op[], unsigned int num_ops[],
1508                                   unsigned int num)
1509 {
1510         unsigned int i, num_traversals = 0;
1511         int j;
1512         struct op_desc *desc;
1513
1514         /* Sort by which one runs first. */
1515         int compare_traverse_desc(const void *_a, const void *_b)
1516         {
1517                 const struct op_desc *da = _a, *db = _b;
1518                 const struct op *a = &op[da->file][da->op_num],
1519                         *b = &op[db->file][db->op_num];
1520
1521                 if (a->seqnum != b->seqnum)
1522                         return a->seqnum - b->seqnum;
1523
1524                 /* If they have same seqnum, it means one didn't make any
1525                  * changes.  Thus sort by end in that case. */
1526                 return a[a->group_len].seqnum - b[b->group_len].seqnum;
1527         }
1528
1529         desc = talloc_array(NULL, struct op_desc, 1);
1530
1531         /* Count them. */
1532         for (i = 0; i < num; i++) {
1533                 for (j = 1; j < num_ops[i]; j++) {
1534                         /* Traverse start (ignore those in
1535                          * transactions; they're already covered by
1536                          * transaction dependencies). */
1537                         if (starts_traverse(&op[i][j])
1538                             && !in_transaction(op[i], j)) {
1539                                 desc = talloc_realloc(NULL, desc,
1540                                                       struct op_desc,
1541                                                       num_traversals+1);
1542                                 desc[num_traversals].file = i;
1543                                 desc[num_traversals].op_num = j;
1544                                 num_traversals++;
1545                         }
1546                 }
1547         }
1548         qsort(desc, num_traversals, sizeof(desc[0]), compare_traverse_desc);
1549
1550         for (i = 1; i < num_traversals; i++) {
1551                 const struct op *prev = &op[desc[i-1].file][desc[i-1].op_num];
1552                 const struct op *curr = &op[desc[i].file][desc[i].op_num];
1553
1554                 /* Read traverses don't depend on each other (read lock). */
1555                 if (prev->type == OP_TDB_TRAVERSE_READ_START
1556                     && curr->type == OP_TDB_TRAVERSE_READ_START)
1557                         continue;
1558
1559                 /* Only make dependency if it's clear. */
1560                 if (compare_traverse_desc(&desc[i], &desc[i-1])) {
1561                         /* i depends on end of traverse i-1. */
1562                         struct op_desc end = desc[i-1];
1563                         end.op_num += prev->group_len;
1564                         add_dependency(NULL, op, filename, &desc[i], &end);
1565                 }
1566         }
1567         talloc_free(desc);
1568 }
1569
/* Switch @fd to non-blocking mode, keeping its other file status flags.
 * Exits via err() if the flags cannot be read or written. */
static void set_nonblock(int fd)
{
	int flags = fcntl(fd, F_GETFL);

	/* Check F_GETFL separately: a -1 failure must not be OR-ed into
	 * the flag word and handed to F_SETFL. */
	if (flags == -1 || fcntl(fd, F_SETFL, flags | O_NONBLOCK) != 0)
		err(1, "Setting pipe nonblocking");
}
1575
1576 static bool handle_backoff(struct op *op[], int fd)
1577 {
1578         struct op_desc desc;
1579         bool handled = false;
1580
1581         /* Sloppy coding: we assume PIPEBUF never fills. */
1582         while (read(fd, &desc, sizeof(desc)) != -1) {
1583                 unsigned int i;
1584                 handled = true;
1585                 for (i = desc.op_num; i > 0; i--) {
1586                         if (op[desc.file][i].type == OP_TDB_TRAVERSE) {
1587                                 /* We insert a fake end here. */
1588                                 op[desc.file][i].type
1589                                         = OP_TDB_TRAVERSE_END_EARLY;
1590                                 break;
1591                         } else if (starts_traverse(&op[desc.file][i])) {
1592                                 unsigned int start = i;
1593                                 struct op tmp = op[desc.file][i];
1594                                 /* Move the ops outside traverse. */
1595                                 memmove(&op[desc.file][i],
1596                                         &op[desc.file][i+1],
1597                                         (desc.op_num-i-1) * sizeof(op[0][0]));
1598                                 op[desc.file][desc.op_num] = tmp;
1599                                 while (op[desc.file][i].group_start == start) {
1600                                         op[desc.file][i++].group_start
1601                                                 = desc.op_num;
1602                                 }
1603                                 break;
1604                         }
1605                 }
1606         }
1607         return handled;
1608 }
1609
1610 #else /* !TRAVERSALS_TAKE_TRANSACTION_LOCK */
/* Stub for builds without TRAVERSALS_TAKE_TRANSACTION_LOCK: both
 * arguments are ignored and nothing is ever reported as handled. */
static bool handle_backoff(struct op *op[], int fd)
{
	(void)op;
	(void)fd;

	return false;
}
1615 #endif
1616
1617 static void derive_dependencies(char *filename[],
1618                                 struct op *op[], unsigned int num_ops[],
1619                                 unsigned int num)
1620 {
1621         struct keyinfo *hash;
1622         unsigned int h, i;
1623
1624         /* Create hash table for faster key lookup. */
1625         hash = hash_ops(op, num_ops, num);
1626
1627         /* Sort them by sequence number. */
1628         sort_ops(hash, filename, op, num);
1629
1630         /* Create dependencies back to the last change, rather than
1631          * creating false dependencies by naively making each one
1632          * depend on the previous.  This has two purposes: it makes
1633          * later optimization simpler, and it also avoids deadlock with
1634          * same sequence number ops inside traversals (if one
1635          * traversal doesn't write anything, two ops can have the same
1636          * sequence number yet we can create a traversal dependency
1637          * the other way). */
1638         for (h = 0; h < total_keys * 2; h++) {
1639                 int prev = -1;
1640
1641                 if (hash[h].num_users < 2)
1642                         continue;
1643
1644                 for (i = 0; i < hash[h].num_users; i++) {
1645                         if (changes_db(&hash[h].key, &op[hash[h].user[i].file]
1646                                        [hash[h].user[i].op_num])) {
1647                                 depend_on_previous(op, filename, num,
1648                                                    hash[h].user, i, prev);
1649                                 prev = i;
1650                         } else if (prev >= 0)
1651                                 add_dependency(hash, op, filename,
1652                                                &hash[h].user[i],
1653                                                &hash[h].user[prev]);
1654                 }
1655         }
1656
1657 #if TRAVERSALS_TAKE_TRANSACTION_LOCK
1658         make_traverse_depends(filename, op, num_ops, num);
1659 #endif
1660
1661         optimize_dependencies(op, num_ops, num);
1662 }
1663
1664 static struct timeval run_test(char *argv[],
1665                                unsigned int num_ops[],
1666                                unsigned int hashsize[],
1667                                unsigned int tdb_flags[],
1668                                unsigned int open_flags[],
1669                                struct op *op[],
1670                                int fds[2])
1671 {
1672         unsigned int i;
1673         struct timeval start, end, diff;
1674         bool ok = true;
1675
1676         for (i = 0; argv[i+2]; i++) {
1677                 struct tdb_context *tdb;
1678                 char c;
1679
1680                 switch (fork()) {
1681                 case -1:
1682                         err(1, "fork failed");
1683                 case 0:
1684                         close(fds[1]);
1685                         tdb = tdb_open_ex(argv[1], hashsize[i],
1686                                           tdb_flags[i]|TDB_NOSYNC,
1687                                           open_flags[i], 0600, NULL, hash_key);
1688                         if (!tdb)
1689                                 err(1, "Opening tdb %s", argv[1]);
1690
1691                         /* This catches parent exiting. */
1692                         if (read(fds[0], &c, 1) != 1)
1693                                 exit(1);
1694                         run_ops(tdb, pipes[i].fd[0], argv+2, op, i, 1,
1695                                 num_ops[i], false);
1696                         check_deps(argv[2+i], op[i], num_ops[i]);
1697                         exit(0);
1698                 default:
1699                         break;
1700                 }
1701         }
1702
1703         /* Let everything settle. */
1704         sleep(1);
1705
1706         printf("Starting run...");
1707         fflush(stdout);
1708         gettimeofday(&start, NULL);
1709         /* Tell them all to go!  Any write of sufficient length will do. */
1710         if (write(fds[1], hashsize, i) != i)
1711                 err(1, "Writing to wakeup pipe");
1712
1713         for (i = 0; argv[i + 2]; i++) {
1714                 int status;
1715                 wait(&status);
1716                 if (!WIFEXITED(status)) {
1717                         warnx("Child died with signal %i", WTERMSIG(status));
1718                         ok = false;
1719                 } else if (WEXITSTATUS(status) != 0)
1720                         /* Assume child spat out error. */
1721                         ok = false;
1722         }
1723         if (!ok)
1724                 exit(1);
1725
1726         gettimeofday(&end, NULL);
1727         printf("done\n");
1728
1729         if (end.tv_usec < start.tv_usec) {
1730                 end.tv_usec += 1000000;
1731                 end.tv_sec--;
1732         }
1733         diff.tv_sec = end.tv_sec - start.tv_sec;
1734         diff.tv_usec = end.tv_usec - start.tv_usec;
1735         return diff;
1736 }
1737
1738 int main(int argc, char *argv[])
1739 {
1740         struct timeval diff;
1741         unsigned int i, num_ops[argc], hashsize[argc], tdb_flags[argc], open_flags[argc];
1742         struct op *op[argc];
1743         int fds[2];
1744
1745         if (argc < 3)
1746                 errx(1, "Usage: %s <tdbfile> <tracefile>...", argv[0]);
1747
1748         pipes = talloc_array(NULL, struct pipe, argc - 1);
1749         for (i = 0; i < argc - 2; i++) {
1750                 printf("Loading tracefile %s...", argv[2+i]);
1751                 fflush(stdout);
1752                 op[i] = load_tracefile(argv[2+i], &num_ops[i], &hashsize[i],
1753                                        &tdb_flags[i], &open_flags[i]);
1754                 if (pipe(pipes[i].fd) != 0)
1755                         err(1, "creating pipe");
1756                 printf("done\n");
1757         }
1758
1759         printf("Calculating inter-dependencies...");
1760         fflush(stdout);
1761         derive_dependencies(argv+2, op, num_ops, i);
1762         printf("done\n");
1763
1764         /* Don't fork for single arg case: simple debugging. */
1765         if (argc == 3) {
1766                 struct tdb_context *tdb;
1767                 tdb = tdb_open_ex(argv[1], hashsize[0], tdb_flags[0]|TDB_NOSYNC,
1768                                   open_flags[0], 0600, NULL, hash_key);
1769                 printf("Single threaded run...");
1770                 fflush(stdout);
1771
1772                 run_ops(tdb, pipes[0].fd[0], argv+2, op, 0, 1, num_ops[0],
1773                         false);
1774                 check_deps(argv[2], op[0], num_ops[0]);
1775
1776                 printf("done\n");
1777                 exit(0);
1778         }
1779
1780         if (pipe(fds) != 0)
1781                 err(1, "creating pipe");
1782
1783 #if TRAVERSALS_TAKE_TRANSACTION_LOCK
1784         if (pipe(pipes[argc-2].fd) != 0)
1785                 err(1, "creating pipe");
1786         backoff_fd = pipes[argc-2].fd[1];
1787         set_nonblock(pipes[argc-2].fd[1]);
1788         set_nonblock(pipes[argc-2].fd[0]);
1789 #endif
1790
1791         do {
1792                 diff = run_test(argv, num_ops, hashsize, tdb_flags, open_flags,
1793                                 op, fds);
1794         } while (handle_backoff(op, pipes[argc-2].fd[0]));
1795
1796         printf("Time replaying: %lu usec\n",
1797                diff.tv_sec * 1000000UL + diff.tv_usec);
1798         
1799         exit(0);
1800 }