]> git.ozlabs.org Git - ccan/blob - ccan/tdb/tools/replay_trace.c
2099dafbf1f64d38ef986b661aaae718abd29a2c
[ccan] / ccan / tdb / tools / replay_trace.c
1 #include <ccan/tdb/tdb.h>
2 #include <ccan/grab_file/grab_file.h>
3 #include <ccan/hash/hash.h>
4 #include <ccan/talloc/talloc.h>
5 #include <ccan/str_talloc/str_talloc.h>
6 #include <ccan/str/str.h>
7 #include <ccan/list/list.h>
8 #include <err.h>
9 #include <ctype.h>
10 #include <string.h>
11 #include <unistd.h>
12 #include <sys/types.h>
13 #include <sys/wait.h>
14 #include <sys/time.h>
15 #include <errno.h>
16 #include <signal.h>
17 #include <assert.h>
18
19 #define STRINGIFY2(x) #x
20 #define STRINGIFY(x) STRINGIFY2(x)
21
22 /* Avoid mod by zero */
23 static unsigned int total_keys = 1;
24
25 /* #define DEBUG_DEPS 1 */
26
27 /* Traversals block transactions in the current implementation. */
28 #define TRAVERSALS_TAKE_TRANSACTION_LOCK 1
29
30 struct pipe {
31         int fd[2];
32 };
33 static struct pipe *pipes;
34
35 static void __attribute__((noreturn)) fail(const char *filename,
36                                            unsigned int line,
37                                            const char *fmt, ...)
38 {
39         va_list ap;
40
41         va_start(ap, fmt);
42         fprintf(stderr, "%s:%u: FAIL: ", filename, line);
43         vfprintf(stderr, fmt, ap);
44         fprintf(stderr, "\n");
45         va_end(ap);
46         exit(1);
47 }
48         
49 /* Try or die. */
50 #define try(expr, expect)                                               \
51         do {                                                            \
52                 int ret = (expr);                                       \
53                 if (ret != (expect))                                    \
54                         fail(filename[file], i+1,                       \
55                              STRINGIFY(expr) "= %i", ret);              \
56         } while (0)
57
58 /* Try or imitate results. */
59 #define unreliable(expr, expect, force, undo)                           \
60         do {                                                            \
61                 int ret = expr;                                         \
62                 if (ret != expect) {                                    \
63                         fprintf(stderr, "%s:%u: %s gave %i not %i",     \
64                                 filename[file], i+1, STRINGIFY(expr),   \
65                                 ret, expect);                           \
66                         if (expect == 0)                                \
67                                 force;                                  \
68                         else                                            \
69                                 undo;                                   \
70                 }                                                       \
71         } while (0)
72
73 static bool key_eq(TDB_DATA a, TDB_DATA b)
74 {
75         if (a.dsize != b.dsize)
76                 return false;
77         return memcmp(a.dptr, b.dptr, a.dsize) == 0;
78 }
79
80 /* This is based on the hash algorithm from gdbm */
81 static unsigned int hash_key(TDB_DATA *key)
82 {
83         uint32_t value; /* Used to compute the hash value.  */
84         uint32_t   i;   /* Used to cycle through random values. */
85
86         /* Set the initial value from the key size. */
87         for (value = 0x238F13AF ^ key->dsize, i=0; i < key->dsize; i++)
88                 value = (value + (key->dptr[i] << (i*5 % 24)));
89
90         return (1103515243 * value + 12345);  
91 }
92
93 enum op_type {
94         OP_TDB_LOCKALL,
95         OP_TDB_LOCKALL_MARK,
96         OP_TDB_LOCKALL_UNMARK,
97         OP_TDB_LOCKALL_NONBLOCK,
98         OP_TDB_UNLOCKALL,
99         OP_TDB_LOCKALL_READ,
100         OP_TDB_LOCKALL_READ_NONBLOCK,
101         OP_TDB_UNLOCKALL_READ,
102         OP_TDB_CHAINLOCK,
103         OP_TDB_CHAINLOCK_NONBLOCK,
104         OP_TDB_CHAINLOCK_MARK,
105         OP_TDB_CHAINLOCK_UNMARK,
106         OP_TDB_CHAINUNLOCK,
107         OP_TDB_CHAINLOCK_READ,
108         OP_TDB_CHAINUNLOCK_READ,
109         OP_TDB_PARSE_RECORD,
110         OP_TDB_EXISTS,
111         OP_TDB_STORE,
112         OP_TDB_APPEND,
113         OP_TDB_GET_SEQNUM,
114         OP_TDB_WIPE_ALL,
115         OP_TDB_TRANSACTION_START,
116         OP_TDB_TRANSACTION_CANCEL,
117         OP_TDB_TRANSACTION_COMMIT,
118         OP_TDB_TRAVERSE_READ_START,
119         OP_TDB_TRAVERSE_START,
120         OP_TDB_TRAVERSE_END,
121         OP_TDB_TRAVERSE,
122         OP_TDB_FIRSTKEY,
123         OP_TDB_NEXTKEY,
124         OP_TDB_FETCH,
125         OP_TDB_DELETE,
126 };
127
128 struct op {
129         unsigned int seqnum;
130         enum op_type type;
131         TDB_DATA key;
132         TDB_DATA data;
133         int ret;
134
135         /* Who is waiting for us? */
136         struct list_head post;
137         /* What are we waiting for? */
138         struct list_head pre;
139
140         /* If I'm part of a group (traverse/transaction) where is
141          * start?  (Otherwise, 0) */
142         unsigned int group_start;
143
144         union {
145                 int flag; /* open and store */
146                 struct {  /* append */
147                         TDB_DATA pre;
148                         TDB_DATA post;
149                 } append;
150                 /* transaction/traverse start/chainlock */
151                 unsigned int group_len;
152         };
153 };
154
155 struct op_desc {
156         unsigned int file;
157         unsigned int op_num;
158 };
159
160 static unsigned char hex_char(const char *filename, unsigned int line, char c)
161 {
162         c = toupper(c);
163         if (c >= 'A' && c <= 'F')
164                 return c - 'A' + 10;
165         if (c >= '0' && c <= '9')
166                 return c - '0';
167         fail(filename, line, "invalid hex character '%c'", c);
168 }
169
170 /* TDB data is <size>:<%02x>* */
171 static TDB_DATA make_tdb_data(const void *ctx,
172                               const char *filename, unsigned int line,
173                               const char *word)
174 {
175         TDB_DATA data;
176         unsigned int i;
177         const char *p;
178
179         if (streq(word, "NULL"))
180                 return tdb_null;
181
182         data.dsize = atoi(word);
183         data.dptr = talloc_array(ctx, unsigned char, data.dsize);
184         p = strchr(word, ':');
185         if (!p)
186                 fail(filename, line, "invalid tdb data '%s'", word);
187         p++;
188         for (i = 0; i < data.dsize; i++)
189                 data.dptr[i] = hex_char(filename, line, p[i*2])*16
190                         + hex_char(filename, line, p[i*2+1]);
191
192         return data;
193 }
194
195 static void add_op(const char *filename, struct op **op, unsigned int i,
196                    unsigned int seqnum, enum op_type type)
197 {
198         struct op *new;
199         *op = talloc_realloc(NULL, *op, struct op, i+1);
200         new = (*op) + i;
201         new->type = type;
202         new->seqnum = seqnum;
203         new->ret = 0;
204         new->group_start = 0;
205 }
206
207 static void op_add_nothing(const char *filename,
208                            struct op op[], unsigned int op_num, char *words[])
209 {
210         if (words[2])
211                 fail(filename, op_num+1, "Expected no arguments");
212         op[op_num].key = tdb_null;
213 }
214
215 static void op_add_key(const char *filename,
216                        struct op op[], unsigned int op_num, char *words[])
217 {
218         if (words[2] == NULL || words[3])
219                 fail(filename, op_num+1, "Expected just a key");
220
221         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
222         total_keys++;
223 }
224
225 static void op_add_key_ret(const char *filename,
226                            struct op op[], unsigned int op_num, char *words[])
227 {
228         if (!words[2] || !words[3] || !words[4] || words[5]
229             || !streq(words[3], "="))
230                 fail(filename, op_num+1, "Expected <key> = <ret>");
231         op[op_num].ret = atoi(words[4]);
232         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
233         /* May only be a unique key if it fails */
234         if (op[op_num].ret != 0)
235                 total_keys++;
236 }
237
238 static void op_add_key_data(const char *filename,
239                             struct op op[], unsigned int op_num, char *words[])
240 {
241         if (!words[2] || !words[3] || !words[4] || words[5]
242             || !streq(words[3], "="))
243                 fail(filename, op_num+1, "Expected <key> = <data>");
244         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
245         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[4]);
246         /* May only be a unique key if it fails */
247         if (!op[op_num].data.dptr)
248                 total_keys++;
249 }
250
251 /* We don't record the keys or data for a traverse, as we don't use them. */
252 static void op_add_traverse(const char *filename,
253                             struct op op[], unsigned int op_num, char *words[])
254 {
255         if (!words[2] || !words[3] || !words[4] || words[5]
256             || !streq(words[3], "="))
257                 fail(filename, op_num+1, "Expected <key> = <data>");
258         op[op_num].key = tdb_null;
259 }
260
261 /* Full traverse info is useful for debugging, but changing it to
262  * "traversefn" without the data makes the traces *much* smaller! */
263 static void op_add_traversefn(const char *filename,
264                             struct op op[], unsigned int op_num, char *words[])
265 {
266         if (words[2])
267                 fail(filename, op_num+1, "Expected no values");
268         op[op_num].key = tdb_null;
269 }
270
271 /* <seqnum> tdb_store <rec> <rec> <flag> = <ret> */
272 static void op_add_store(const char *filename,
273                          struct op op[], unsigned int op_num, char *words[])
274 {
275         if (!words[2] || !words[3] || !words[4] || !words[5] || !words[6]
276             || words[7] || !streq(words[5], "="))
277                 fail(filename, op_num+1, "Expect <key> <data> <flag> = <ret>");
278
279         op[op_num].flag = strtoul(words[4], NULL, 0);
280         op[op_num].ret = atoi(words[6]);
281         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
282         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[3]);
283         total_keys++;
284 }
285
286 /* <seqnum> tdb_append <rec> <rec> = <rec> */
287 static void op_add_append(const char *filename,
288                           struct op op[], unsigned int op_num, char *words[])
289 {
290         if (!words[2] || !words[3] || !words[4] || !words[5] || words[6]
291             || !streq(words[4], "="))
292                 fail(filename, op_num+1, "Expect <key> <data> = <rec>");
293
294         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
295         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[3]);
296
297         op[op_num].append.post
298                 = make_tdb_data(op, filename, op_num+1, words[5]);
299
300         /* By subtraction, figure out what previous data was. */
301         op[op_num].append.pre.dptr = op[op_num].append.post.dptr;
302         op[op_num].append.pre.dsize
303                 = op[op_num].append.post.dsize - op[op_num].data.dsize;
304         total_keys++;
305 }
306
307 /* <seqnum> tdb_get_seqnum = <ret> */
308 static void op_add_seqnum(const char *filename,
309                           struct op op[], unsigned int op_num, char *words[])
310 {
311         if (!words[2] || !words[3] || words[4] || !streq(words[2], "="))
312                 fail(filename, op_num+1, "Expect = <ret>");
313
314         op[op_num].key = tdb_null;
315         op[op_num].ret = atoi(words[3]);
316 }
317
318 static void op_add_traverse_start(const char *filename,
319                                   struct op op[],
320                                   unsigned int op_num, char *words[])
321 {
322         if (words[2])
323                 fail(filename, op_num+1, "Expect no arguments");
324
325         op[op_num].key = tdb_null;
326         op[op_num].group_len = 0;
327 }
328
329 static void op_add_transaction(const char *filename, struct op op[],
330                                unsigned int op_num, char *words[])
331 {
332         if (words[2])
333                 fail(filename, op_num+1, "Expect no arguments");
334
335         op[op_num].key = tdb_null;
336         op[op_num].group_len = 0;
337 }
338
339 static void op_add_chainlock(const char *filename,
340                              struct op op[], unsigned int op_num, char *words[])
341 {
342         if (words[2] == NULL || words[3])
343                 fail(filename, op_num+1, "Expected just a key");
344
345         /* A chainlock key isn't a key in the normal sense; it doesn't
346          * have to be in the db at all.  Also, we don't want to hash this op. */
347         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[2]);
348         op[op_num].key = tdb_null;
349         op[op_num].group_len = 0;
350 }
351
352 static void op_add_chainlock_ret(const char *filename,
353                                  struct op op[], unsigned int op_num,
354                                  char *words[])
355 {
356         if (!words[2] || !words[3] || !words[4] || words[5]
357             || !streq(words[3], "="))
358                 fail(filename, op_num+1, "Expected <key> = <ret>");
359         op[op_num].ret = atoi(words[4]);
360         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[2]);
361         op[op_num].key = tdb_null;
362         op[op_num].group_len = 0;
363         total_keys++;
364 }
365
366 static int op_find_start(struct op op[], unsigned int op_num, enum op_type type)
367 {
368         unsigned int i;
369
370         for (i = op_num-1; i > 0; i--) {
371                 if (op[i].type == type && !op[i].group_len)
372                         return i;
373         }
374         return 0;
375 }
376
377 static void op_analyze_transaction(const char *filename,
378                                    struct op op[], unsigned int op_num,
379                                    char *words[])
380 {
381         unsigned int start, i;
382
383         op[op_num].key = tdb_null;
384
385         if (words[2])
386                 fail(filename, op_num+1, "Expect no arguments");
387
388         start = op_find_start(op, op_num, OP_TDB_TRANSACTION_START);
389         if (!start)
390                 fail(filename, op_num+1, "no transaction start found");
391
392         op[start].group_len = op_num - start;
393
394         /* This rolls in nested transactions.  I think that's right. */
395         for (i = start; i <= op_num; i++)
396                 op[i].group_start = start;
397 }
398
399 /* We treat chainlocks a lot like transactions, even though that's overkill */
400 static void op_analyze_chainlock(const char *filename,
401                                  struct op op[], unsigned int op_num,
402                                  char *words[])
403 {
404         unsigned int i, start;
405
406         if (words[2] == NULL || words[3])
407                 fail(filename, op_num+1, "Expected just a key");
408
409         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[2]);
410         op[op_num].key = tdb_null;
411         total_keys++;
412
413         start = op_find_start(op, op_num, OP_TDB_CHAINLOCK);
414         if (!start)
415                 start = op_find_start(op, op_num, OP_TDB_CHAINLOCK_READ);
416         if (!start)
417                 fail(filename, op_num+1, "no initial chainlock found");
418
419         /* FIXME: We'd have to do something clever to make this work
420          * vs. deadlock. */
421         if (!key_eq(op[start].data, op[op_num].data))
422                 fail(filename, op_num+1, "nested chainlock calls?");
423
424         op[start].group_len = op_num - start;
425         for (i = start; i <= op_num; i++)
426                 op[i].group_start = start;
427 }
428
429 static void op_analyze_traverse(const char *filename,
430                                 struct op op[], unsigned int op_num,
431                                 char *words[])
432 {
433         int i, start;
434
435         op[op_num].key = tdb_null;
436
437         /* = %u means traverse function terminated. */
438         if (words[2]) {
439                 if (!streq(words[2], "=") || !words[3] || words[4])
440                         fail(filename, op_num+1, "expect = <num>");
441                 op[op_num].ret = atoi(words[3]);
442         } else
443                 op[op_num].ret = 0;
444
445         start = op_find_start(op, op_num, OP_TDB_TRAVERSE_START);
446         if (!start)
447                 start = op_find_start(op, op_num, OP_TDB_TRAVERSE_READ_START);
448         if (!start)
449                 fail(filename, op_num+1, "no traversal start found");
450
451         op[start].group_len = op_num - start;
452
453         /* Don't roll in nested traverse/chainlock */
454         for (i = start; i <= op_num; i++)
455                 if (!op[i].group_start)
456                         op[i].group_start = start;
457 }
458
459 /* Keep -Wmissing-declarations happy: */
460 const struct op_table *
461 find_keyword (register const char *str, register unsigned int len);
462
463 #include "keywords.c"
464
465 struct depend {
466         /* We can have more than one */
467         struct list_node pre_list;
468         struct list_node post_list;
469         struct op_desc needs;
470         struct op_desc prereq;
471 };
472
473 static void check_deps(const char *filename, struct op op[], unsigned int num)
474 {
475 #ifdef DEBUG_DEPS
476         unsigned int i;
477
478         for (i = 1; i < num; i++)
479                 if (!list_empty(&op[i].pre))
480                         fail(filename, i+1, "Still has dependencies");
481 #endif
482 }
483
484 static void dump_pre(char *filename[], struct op *op[],
485                      unsigned int file, unsigned int i)
486 {
487         struct depend *dep;
488
489         printf("%s:%u (%u) still waiting for:\n", filename[file], i+1,
490                 op[file][i].seqnum);
491         list_for_each(&op[file][i].pre, dep, pre_list)
492                 printf("    %s:%u (%u)\n",
493                        filename[dep->prereq.file], dep->prereq.op_num+1,
494                        op[dep->prereq.file][dep->prereq.op_num].seqnum);
495         check_deps(filename[file], op[file], i);
496 }
497
498 /* We simply read/write pointers, since we all are children. */
499 static bool do_pre(struct tdb_context *tdb,
500                    char *filename[], struct op *op[],
501                    unsigned int file, int pre_fd, unsigned int i,
502                    bool backoff)
503 {
504         while (!list_empty(&op[file][i].pre)) {
505                 struct depend *dep;
506
507 #if DEBUG_DEPS
508                 printf("%s:%u:waiting for pre\n", filename[file], i+1);
509                 fflush(stdout);
510 #endif
511                 if (backoff)
512                         alarm(2);
513                 else
514                         alarm(10);
515                 while (read(pre_fd, &dep, sizeof(dep)) != sizeof(dep)) {
516                         if (errno == EINTR) {
517                                 if (backoff) {
518                                         warnx("%s:%u:avoiding deadlock",
519                                               filename[file], i+1);
520                                         return false;
521                                 }
522                                 dump_pre(filename, op, file, i);
523                                 exit(1);
524                         } else
525                                 errx(1, "Reading from pipe");
526                 }
527                 alarm(0);
528
529 #if DEBUG_DEPS
530                 printf("%s:%u:got pre %u from %s:%u\n", filename[file], i+1,
531                        dep->needs.op_num+1, filename[dep->prereq.file],
532                        dep->prereq.op_num+1);
533                 fflush(stdout);
534 #endif
535                 /* This could be any op, not just this one. */
536                 talloc_free(dep);
537         }
538         return true;
539 }
540
541 static void do_post(char *filename[], struct op *op[],
542                     unsigned int file, unsigned int i)
543 {
544         struct depend *dep;
545
546         list_for_each(&op[file][i].post, dep, post_list) {
547 #if DEBUG_DEPS
548                 printf("%s:%u:sending to file %s:%u\n", filename[file], i+1,
549                        filename[dep->needs.file], dep->needs.op_num+1);
550 #endif
551                 if (write(pipes[dep->needs.file].fd[1], &dep, sizeof(dep))
552                     != sizeof(dep))
553                         err(1, "%s:%u failed to tell file %s",
554                             filename[file], i+1, filename[dep->needs.file]);
555         }
556 }
557
558 static int get_len(TDB_DATA key, TDB_DATA data, void *private_data)
559 {
560         return data.dsize;
561 }
562
563 static unsigned run_ops(struct tdb_context *tdb,
564                         int pre_fd,
565                         char *filename[],
566                         struct op *op[],
567                         unsigned int file,
568                         unsigned int start, unsigned int stop,
569                         bool backoff);
570
571 struct traverse_info {
572         struct op **op;
573         char **filename;
574         unsigned file;
575         int pre_fd;
576         unsigned int start;
577         unsigned int i;
578 };
579
580 /* More complex.  Just do whatever's they did at the n'th entry. */
581 static int nontrivial_traverse(struct tdb_context *tdb,
582                                TDB_DATA key, TDB_DATA data,
583                                void *_tinfo)
584 {
585         struct traverse_info *tinfo = _tinfo;
586         unsigned int trav_len = tinfo->op[tinfo->file][tinfo->start].group_len;
587         bool avoid_deadlock = false;
588
589         if (tinfo->i == tinfo->start + trav_len) {
590                 /* This can happen if traverse expects to be empty. */
591                 if (trav_len == 1)
592                         return 1;
593                 fail(tinfo->filename[tinfo->file], tinfo->start + 1,
594                      "traverse did not terminate");
595         }
596
597         if (tinfo->op[tinfo->file][tinfo->i].type != OP_TDB_TRAVERSE)
598                 fail(tinfo->filename[tinfo->file], tinfo->start + 1,
599                      "%s:%u:traverse terminated early");
600
601 #if TRAVERSALS_TAKE_TRANSACTION_LOCK
602         avoid_deadlock = true;
603 #endif
604
605         /* Run any normal ops. */
606         tinfo->i = run_ops(tdb, tinfo->pre_fd, tinfo->filename, tinfo->op,
607                            tinfo->file, tinfo->i+1, tinfo->start + trav_len,
608                            avoid_deadlock);
609
610         /* We backed off, or we hit OP_TDB_TRAVERSE_END. */
611         if (tinfo->op[tinfo->file][tinfo->i].type != OP_TDB_TRAVERSE)
612                 return 1;
613
614         return 0;
615 }
616
617 static unsigned op_traverse(struct tdb_context *tdb,
618                             int pre_fd,
619                             char *filename[],
620                             unsigned int file,
621                             int (*traversefn)(struct tdb_context *,
622                                               tdb_traverse_func, void *),
623                             struct op *op[],
624                             unsigned int start)
625 {
626         struct traverse_info tinfo = { op, filename, file, pre_fd,
627                                        start, start+1 };
628
629         traversefn(tdb, nontrivial_traverse, &tinfo);
630
631         /* Traversing in wrong order can have strange effects: eg. if
632          * original traverse went A (delete A), B, we might do B
633          * (delete A).  So if we have ops left over, we do it now. */
634         while (tinfo.i != start + op[file][start].group_len) {
635                 if (op[file][tinfo.i].type == OP_TDB_TRAVERSE)
636                         tinfo.i++;
637                 else
638                         tinfo.i = run_ops(tdb, pre_fd, filename, op, file,
639                                           tinfo.i,
640                                           start + op[file][start].group_len,
641                                           false);
642         }
643
644         return tinfo.i;
645 }
646
647 static void break_out(int sig)
648 {
649 }
650
651 static __attribute__((noinline))
652 unsigned run_ops(struct tdb_context *tdb,
653                  int pre_fd,
654                  char *filename[],
655                  struct op *op[],
656                  unsigned int file,
657                  unsigned int start, unsigned int stop,
658                  bool backoff)
659 {
660         unsigned int i;
661         struct sigaction sa;
662
663         sa.sa_handler = break_out;
664         sa.sa_flags = 0;
665
666         sigaction(SIGALRM, &sa, NULL);
667         for (i = start; i < stop; i++) {
668                 if (!do_pre(tdb, filename, op, file, pre_fd, i, backoff))
669                         return i;
670
671                 switch (op[file][i].type) {
672                 case OP_TDB_LOCKALL:
673                         try(tdb_lockall(tdb), op[file][i].ret);
674                         break;
675                 case OP_TDB_LOCKALL_MARK:
676                         try(tdb_lockall_mark(tdb), op[file][i].ret);
677                         break;
678                 case OP_TDB_LOCKALL_UNMARK:
679                         try(tdb_lockall_unmark(tdb), op[file][i].ret);
680                         break;
681                 case OP_TDB_LOCKALL_NONBLOCK:
682                         unreliable(tdb_lockall_nonblock(tdb), op[file][i].ret,
683                                    tdb_lockall(tdb), tdb_unlockall(tdb));
684                         break;
685                 case OP_TDB_UNLOCKALL:
686                         try(tdb_unlockall(tdb), op[file][i].ret);
687                         break;
688                 case OP_TDB_LOCKALL_READ:
689                         try(tdb_lockall_read(tdb), op[file][i].ret);
690                         break;
691                 case OP_TDB_LOCKALL_READ_NONBLOCK:
692                         unreliable(tdb_lockall_read_nonblock(tdb),
693                                    op[file][i].ret,
694                                    tdb_lockall_read(tdb),
695                                    tdb_unlockall_read(tdb));
696                         break;
697                 case OP_TDB_UNLOCKALL_READ:
698                         try(tdb_unlockall_read(tdb), op[file][i].ret);
699                         break;
700                 case OP_TDB_CHAINLOCK:
701                         try(tdb_chainlock(tdb, op[file][i].key),
702                             op[file][i].ret);
703                         break;
704                 case OP_TDB_CHAINLOCK_NONBLOCK:
705                         unreliable(tdb_chainlock_nonblock(tdb, op[file][i].key),
706                                    op[file][i].ret,
707                                    tdb_chainlock(tdb, op[file][i].key),
708                                    tdb_chainunlock(tdb, op[file][i].key));
709                         break;
710                 case OP_TDB_CHAINLOCK_MARK:
711                         try(tdb_chainlock_mark(tdb, op[file][i].key),
712                             op[file][i].ret);
713                         break;
714                 case OP_TDB_CHAINLOCK_UNMARK:
715                         try(tdb_chainlock_unmark(tdb, op[file][i].key),
716                             op[file][i].ret);
717                         break;
718                 case OP_TDB_CHAINUNLOCK:
719                         try(tdb_chainunlock(tdb, op[file][i].key),
720                             op[file][i].ret);
721                         break;
722                 case OP_TDB_CHAINLOCK_READ:
723                         try(tdb_chainlock_read(tdb, op[file][i].key),
724                             op[file][i].ret);
725                         break;
726                 case OP_TDB_CHAINUNLOCK_READ:
727                         try(tdb_chainunlock_read(tdb, op[file][i].key),
728                             op[file][i].ret);
729                         break;
730                 case OP_TDB_PARSE_RECORD:
731                         try(tdb_parse_record(tdb, op[file][i].key, get_len,
732                                              NULL),
733                             op[file][i].ret);
734                         break;
735                 case OP_TDB_EXISTS:
736                         try(tdb_exists(tdb, op[file][i].key), op[file][i].ret);
737                         break;
738                 case OP_TDB_STORE:
739                         try(tdb_store(tdb, op[file][i].key, op[file][i].data,
740                                       op[file][i].flag),
741                             op[file][i].ret);
742                         break;
743                 case OP_TDB_APPEND:
744                         try(tdb_append(tdb, op[file][i].key, op[file][i].data),
745                             op[file][i].ret);
746                         break;
747                 case OP_TDB_GET_SEQNUM:
748                         try(tdb_get_seqnum(tdb), op[file][i].ret);
749                         break;
750                 case OP_TDB_WIPE_ALL:
751                         try(tdb_wipe_all(tdb), op[file][i].ret);
752                         break;
753                 case OP_TDB_TRANSACTION_START:
754                         try(tdb_transaction_start(tdb), op[file][i].ret);
755                         break;
756                 case OP_TDB_TRANSACTION_CANCEL:
757                         try(tdb_transaction_cancel(tdb), op[file][i].ret);
758                         break;
759                 case OP_TDB_TRANSACTION_COMMIT:
760                         try(tdb_transaction_commit(tdb), op[file][i].ret);
761                         break;
762                 case OP_TDB_TRAVERSE_READ_START:
763                         i = op_traverse(tdb, pre_fd, filename, file,
764                                         tdb_traverse_read, op, i);
765                         break;
766                 case OP_TDB_TRAVERSE_START:
767                         i = op_traverse(tdb, pre_fd, filename, file,
768                                         tdb_traverse, op, i);
769                         break;
770                 case OP_TDB_TRAVERSE:
771                         /* Terminate: we're in a traverse, and we've
772                          * done our ops. */
773                         return i;
774                 case OP_TDB_TRAVERSE_END:
775                         fail(filename[file], i+1, "unexpected end traverse");
776                 /* FIXME: These must be treated like traverse. */
777                 case OP_TDB_FIRSTKEY:
778                         if (!key_eq(tdb_firstkey(tdb), op[file][i].data))
779                                 fail(filename[file], i+1, "bad firstkey");
780                         break;
781                 case OP_TDB_NEXTKEY:
782                         if (!key_eq(tdb_nextkey(tdb, op[file][i].key),
783                                     op[file][i].data))
784                                 fail(filename[file], i+1, "bad nextkey");
785                         break;
786                 case OP_TDB_FETCH: {
787                         TDB_DATA f = tdb_fetch(tdb, op[file][i].key);
788                         if (!key_eq(f, op[file][i].data))
789                                 fail(filename[file], i+1, "bad fetch %u",
790                                      f.dsize);
791                         break;
792                 }
793                 case OP_TDB_DELETE:
794                         try(tdb_delete(tdb, op[file][i].key), op[file][i].ret);
795                         break;
796                 }
797                 do_post(filename, op, file, i);
798         }
799         return i;
800 }
801
802 /* tdbtorture, in particular, can do a tdb_close with a transaction in
803  * progress. */
804 static struct op *maybe_cancel_transaction(const char *filename,
805                                            struct op *op, unsigned int *num)
806 {
807         unsigned int start = op_find_start(op, *num, OP_TDB_TRANSACTION_START);
808
809         if (start) {
810                 char *words[] = { "<unknown>", "tdb_close", NULL };
811                 add_op(filename, &op, *num, op[start].seqnum,
812                        OP_TDB_TRANSACTION_CANCEL);
813                 op_analyze_transaction(filename, op, *num, words);
814                 (*num)++;
815         }
816         return op;
817 }
818
819 static struct op *load_tracefile(const char *filename, unsigned int *num,
820                                  unsigned int *hashsize,
821                                  unsigned int *tdb_flags,
822                                  unsigned int *open_flags)
823 {
824         unsigned int i;
825         struct op *op = talloc_array(NULL, struct op, 1);
826         char **words;
827         char **lines;
828         char *file;
829
830         file = grab_file(NULL, filename, NULL);
831         if (!file)
832                 err(1, "Reading %s", filename);
833
834         lines = strsplit(file, file, "\n", NULL);
835         if (!lines[0])
836                 errx(1, "%s is empty", filename);
837
838         words = strsplit(lines, lines[0], " ", NULL);
839         if (!streq(words[1], "tdb_open"))
840                 fail(filename, 1, "does not start with tdb_open");
841
842         *hashsize = atoi(words[2]);
843         *tdb_flags = strtoul(words[3], NULL, 0);
844         *open_flags = strtoul(words[4], NULL, 0);
845
846         for (i = 1; lines[i]; i++) {
847                 const struct op_table *opt;
848
849                 words = strsplit(lines, lines[i], " ", NULL);
850                 if (!words[0] || !words[1])
851                         fail(filename, i+1, "Expected seqnum number and op");
852                
853                 opt = find_keyword(words[1], strlen(words[1]));
854                 if (!opt) {
855                         if (streq(words[1], "tdb_close")) {
856                                 if (lines[i+1])
857                                         fail(filename, i+2,
858                                              "lines after tdb_close");
859                                 *num = i;
860                                 talloc_free(lines);
861                                 return maybe_cancel_transaction(filename,
862                                                                 op, num);
863                         }
864                         fail(filename, i+1, "Unknown operation '%s'", words[1]);
865                 }
866
867                 add_op(filename, &op, i, atoi(words[0]), opt->type);
868                 opt->enhance_op(filename, op, i, words);
869         }
870
871         fprintf(stderr, "%s:%u:last operation is not tdb_close: incomplete?",
872               filename, i);
873         talloc_free(lines);
874         *num = i - 1;
875         return maybe_cancel_transaction(filename, op, num);
876 }
877
878 /* We remember all the keys we've ever seen, and who has them. */
879 struct keyinfo {
880         TDB_DATA key;
881         unsigned int num_users;
882         struct op_desc *user;
883 };
884
885 static const TDB_DATA must_not_exist;
886 static const TDB_DATA must_exist;
887 static const TDB_DATA not_exists_or_empty;
888
889 /* NULL means doesn't care if it exists or not, &must_exist means
890  * it must exist but we don't care what, &must_not_exist means it must
891  * not exist, otherwise the data it needs. */
892 static const TDB_DATA *needs(const struct op *op)
893 {
894         switch (op->type) {
895         /* FIXME: Pull forward deps, since we can deadlock */
896         case OP_TDB_CHAINLOCK:
897         case OP_TDB_CHAINLOCK_NONBLOCK:
898         case OP_TDB_CHAINLOCK_MARK:
899         case OP_TDB_CHAINLOCK_UNMARK:
900         case OP_TDB_CHAINUNLOCK:
901         case OP_TDB_CHAINLOCK_READ:
902         case OP_TDB_CHAINUNLOCK_READ:
903                 return NULL;
904
905         case OP_TDB_APPEND:
906                 if (op->append.pre.dsize == 0)
907                         return &not_exists_or_empty;
908                 return &op->append.pre;
909
910         case OP_TDB_STORE:
911                 if (op->flag == TDB_INSERT) {
912                         if (op->ret < 0)
913                                 return &must_exist;
914                         else
915                                 return &must_not_exist;
916                 } else if (op->flag == TDB_MODIFY) {
917                         if (op->ret < 0)
918                                 return &must_not_exist;
919                         else
920                                 return &must_exist;
921                 }
922                 /* No flags?  Don't care */
923                 return NULL;
924
925         case OP_TDB_EXISTS:
926                 if (op->ret == 1)
927                         return &must_exist;
928                 else
929                         return &must_not_exist;
930
931         case OP_TDB_PARSE_RECORD:
932                 if (op->ret < 0)
933                         return &must_not_exist;
934                 return &must_exist;
935
936         /* FIXME: handle these. */
937         case OP_TDB_WIPE_ALL:
938         case OP_TDB_FIRSTKEY:
939         case OP_TDB_NEXTKEY:
940         case OP_TDB_GET_SEQNUM:
941         case OP_TDB_TRAVERSE:
942         case OP_TDB_TRANSACTION_COMMIT:
943         case OP_TDB_TRANSACTION_CANCEL:
944         case OP_TDB_TRANSACTION_START:
945                 return NULL;
946
947         case OP_TDB_FETCH:
948                 if (!op->data.dptr)
949                         return &must_not_exist;
950                 return &op->data;
951
952         case OP_TDB_DELETE:
953                 if (op->ret < 0)
954                         return &must_not_exist;
955                 return &must_exist;
956
957         default:
958                 errx(1, "Unexpected op type %i", op->type);
959         }
960         
961 }
962
963 static bool starts_transaction(const struct op *op)
964 {
965         return op->type == OP_TDB_TRANSACTION_START;
966 }
967
968 static bool in_transaction(const struct op op[], unsigned int i)
969 {
970         return op[i].group_start && starts_transaction(&op[op[i].group_start]);
971 }
972
973 static bool successful_transaction(const struct op *op)
974 {
975         return starts_transaction(op)
976                 && op[op->group_len].type == OP_TDB_TRANSACTION_COMMIT;
977 }
978
979 static bool starts_traverse(const struct op *op)
980 {
981         return op->type == OP_TDB_TRAVERSE_START
982                 || op->type == OP_TDB_TRAVERSE_READ_START;
983 }
984
985 static bool in_traverse(const struct op op[], unsigned int i)
986 {
987         return op[i].group_start && starts_traverse(&op[op[i].group_start]);
988 }
989
990 static bool starts_chainlock(const struct op *op)
991 {
992         return op->type == OP_TDB_CHAINLOCK_READ
993                 || op->type == OP_TDB_CHAINLOCK;
994 }
995
996 static bool in_chainlock(const struct op op[], unsigned int i)
997 {
998         return op[i].group_start && starts_chainlock(&op[op[i].group_start]);
999 }
1000
1001 /* What's the data after this op?  pre if nothing changed. */
1002 static const TDB_DATA *gives(const TDB_DATA *key, const TDB_DATA *pre,
1003                              const struct op *op)
1004 {
1005         if (starts_transaction(op) || starts_chainlock(op)) {
1006                 unsigned int i;
1007
1008                 /* Cancelled transactions don't change anything. */
1009                 if (op[op->group_len].type == OP_TDB_TRANSACTION_CANCEL)
1010                         return pre;
1011                 assert(op[op->group_len].type == OP_TDB_TRANSACTION_COMMIT
1012                        || op[op->group_len].type == OP_TDB_CHAINUNLOCK_READ
1013                        || op[op->group_len].type == OP_TDB_CHAINUNLOCK);
1014
1015                 for (i = 1; i < op->group_len; i++) {
1016                         /* This skips nested transactions, too */
1017                         if (key_eq(op[i].key, *key))
1018                                 pre = gives(key, pre, &op[i]);
1019                 }
1020                 return pre;
1021         }
1022
1023         /* Failed ops don't change state of db. */
1024         if (op->ret < 0)
1025                 return pre;
1026
1027         if (op->type == OP_TDB_DELETE || op->type == OP_TDB_WIPE_ALL)
1028                 return &tdb_null;
1029
1030         if (op->type == OP_TDB_APPEND)
1031                 return &op->append.post;
1032
1033         if (op->type == OP_TDB_STORE)
1034                 return &op->data;
1035
1036         return pre;
1037 }
1038
1039 static struct keyinfo *hash_ops(struct op *op[], unsigned int num_ops[],
1040                                 unsigned int num)
1041 {
1042         unsigned int i, j, h;
1043         struct keyinfo *hash;
1044
1045         hash = talloc_zero_array(op[0], struct keyinfo, total_keys*2);
1046         for (i = 0; i < num; i++) {
1047                 for (j = 1; j < num_ops[i]; j++) {
1048                         /* We can't do this on allocation, due to realloc. */
1049                         list_head_init(&op[i][j].post);
1050                         list_head_init(&op[i][j].pre);
1051
1052                         if (!op[i][j].key.dptr)
1053                                 continue;
1054
1055                         h = hash_key(&op[i][j].key) % (total_keys * 2);
1056                         while (!key_eq(hash[h].key, op[i][j].key)) {
1057                                 if (!hash[h].key.dptr) {
1058                                         hash[h].key = op[i][j].key;
1059                                         break;
1060                                 }
1061                                 h = (h + 1) % (total_keys * 2);
1062                         }
1063                         /* Might as well save some memory if we can. */
1064                         if (op[i][j].key.dptr != hash[h].key.dptr) {
1065                                 talloc_free(op[i][j].key.dptr);
1066                                 op[i][j].key.dptr = hash[h].key.dptr;
1067                         }
1068                         hash[h].user = talloc_realloc(hash, hash[h].user,
1069                                                      struct op_desc,
1070                                                      hash[h].num_users+1);
1071
1072                         /* If it's in a transaction, it's the transaction which
1073                          * matters from an analysis POV. */
1074                         if (in_transaction(op[i], j)
1075                             || in_chainlock(op[i], j)) {
1076                                 unsigned start = op[i][j].group_start;
1077
1078                                 /* Don't include twice. */
1079                                 if (hash[h].num_users
1080                                     && hash[h].user[hash[h].num_users-1].file
1081                                         == i
1082                                     && hash[h].user[hash[h].num_users-1].op_num
1083                                         == start)
1084                                         continue;
1085
1086                                 hash[h].user[hash[h].num_users].op_num = start;
1087                         } else
1088                                 hash[h].user[hash[h].num_users].op_num = j;
1089                         hash[h].user[hash[h].num_users].file = i;
1090                         hash[h].num_users++;
1091                 }
1092         }
1093
1094         return hash;
1095 }
1096
1097 static bool satisfies(const TDB_DATA *key, const TDB_DATA *data,
1098                       const struct op *op)
1099 {
1100         const TDB_DATA *need = NULL;
1101
1102         if (starts_transaction(op) || starts_chainlock(op)) {
1103                 unsigned int i;
1104
1105                 /* Look through for an op in this transaction which
1106                  * needs this key. */
1107                 for (i = 1; i < op->group_len; i++) {
1108                         if (key_eq(op[i].key, *key)) {
1109                                 need = needs(&op[i]);
1110                                 /* tdb_exists() is special: there might be
1111                                  * something in the transaction with more
1112                                  * specific requirements.  Other ops don't have
1113                                  * specific requirements (eg. store or delete),
1114                                  * but they change the value so we can't get
1115                                  * more information from future ops. */
1116                                 if (op[i].type != OP_TDB_EXISTS)
1117                                         break;
1118                         }
1119                 }
1120         } else
1121                 need = needs(op);
1122
1123         /* Don't need anything?  Cool. */
1124         if (!need)
1125                 return true;
1126
1127         /* This should be tdb_null or a real value. */
1128         assert(data != &must_exist);
1129         assert(data != &must_not_exist);
1130         assert(data != &not_exists_or_empty);
1131
1132         /* Must not exist?  data must not exist. */
1133         if (need == &must_not_exist)
1134                 return data == &tdb_null;
1135
1136         /* Must exist? */
1137         if (need == &must_exist)
1138                 return data != &tdb_null;
1139
1140         /* Either noexist or empty. */
1141         if (need == &not_exists_or_empty)
1142                 return data->dsize == 0;
1143
1144         /* Needs something specific. */
1145         return key_eq(*data, *need);
1146 }
1147
1148 static void move_to_front(struct op_desc res[], unsigned off, unsigned elem)
1149 {
1150         if (elem != off) {
1151                 struct op_desc tmp = res[elem];
1152                 memmove(res + off + 1, res + off, (elem - off)*sizeof(res[0]));
1153                 res[off] = tmp;
1154         }
1155 }
1156
1157 static void restore_to_pos(struct op_desc res[], unsigned off, unsigned elem)
1158 {
1159         if (elem != off) {
1160                 struct op_desc tmp = res[off];
1161                 memmove(res + off, res + off + 1, (elem - off)*sizeof(res[0]));
1162                 res[elem] = tmp;
1163         }
1164 }
1165
1166 static bool sort_deps(char *filename[], struct op *op[],
1167                       struct op_desc res[],
1168                       unsigned off, unsigned num,
1169                       const TDB_DATA *key, const TDB_DATA *data,
1170                       unsigned num_files, unsigned fuzz)
1171 {
1172         unsigned int i, files_done;
1173         struct op *this_op;
1174         bool done[num_files];
1175
1176         /* None left?  We're sorted. */
1177         if (off == num)
1178                 return true;
1179
1180         /* Does this make sequence number go backwards?  Allow a little fuzz. */
1181         if (off > 0) {
1182                 int seqnum1 = op[res[off-1].file][res[off-1].op_num].seqnum;
1183                 int seqnum2 = op[res[off].file][res[off].op_num].seqnum;
1184
1185                 if (seqnum1 - seqnum2 > (int)fuzz) {
1186 #if DEBUG_DEPS
1187                         printf("Seqnum jump too far (%u -> %u)\n",
1188                                seqnum1, seqnum2);
1189 #endif
1190                         return false;
1191                 }
1192         }
1193
1194         memset(done, 0, sizeof(done));
1195
1196         /* Since ops within a trace file are ordered, we just need to figure
1197          * out which file to try next.  Since we don't take into account
1198          * inter-key relationships (which exist by virtue of trace file order),
1199          * we minimize the chance of harm by trying to keep in seqnum order. */
1200         for (files_done = 0, i = off; i < num && files_done < num_files; i++) {
1201                 if (done[res[i].file])
1202                         continue;
1203
1204                 this_op = &op[res[i].file][res[i].op_num];
1205
1206                 /* Is what we have good enough for this op? */
1207                 if (satisfies(key, data, this_op)) {
1208                         move_to_front(res, off, i);
1209                         if (sort_deps(filename, op, res, off+1, num,
1210                                       key, gives(key, data, this_op),
1211                                       num_files, fuzz))
1212                                 return true;
1213                         restore_to_pos(res, off, i);
1214                 }
1215                 done[res[i].file] = true;
1216                 files_done++;
1217         }
1218
1219         /* No combination worked. */
1220         return false;
1221 }
1222
1223 static void check_dep_sorting(struct op_desc user[], unsigned num_users,
1224                               unsigned num_files)
1225 {
1226 #if DEBUG_DEPS
1227         unsigned int i;
1228         unsigned minima[num_files];
1229
1230         memset(minima, 0, sizeof(minima));
1231         for (i = 0; i < num_users; i++) {
1232                 assert(minima[user[i].file] < user[i].op_num);
1233                 minima[user[i].file] = user[i].op_num;
1234         }
1235 #endif
1236 }
1237
1238 /* All these ops happen on the same key.  Which comes first?
1239  *
1240  * This can happen both because read ops or failed write ops don't
1241  * change sequence number, and also due to race since we access the
1242  * number unlocked (the race can cause less detectable ordering problems,
1243  * in which case we'll deadlock and report: fix manually in that case).
1244  */
1245 static void figure_deps(char *filename[], struct op *op[],
1246                         const TDB_DATA *key, struct op_desc user[],
1247                         unsigned num_users, unsigned num_files)
1248 {
1249         /* We assume database starts empty. */
1250         const struct TDB_DATA *data = &tdb_null;
1251         unsigned int fuzz;
1252
1253         /* We prefer to keep strict seqnum order if possible: it's the
1254          * most likely.  We get more lax if that fails. */
1255         for (fuzz = 0; fuzz < 100; fuzz = (fuzz + 1)*2) {
1256                 if (sort_deps(filename, op, user, 0, num_users, key, data,
1257                               num_files, fuzz))
1258                         break;
1259         }
1260
1261         if (fuzz >= 100)
1262                 fail(filename[user[0].file], user[0].op_num+1,
1263                      "Could not resolve inter-dependencies");
1264
1265         check_dep_sorting(user, num_users, num_files);
1266 }
1267
1268 static void sort_ops(struct keyinfo hash[], char *filename[], struct op *op[],
1269                      unsigned int num)
1270 {
1271         unsigned int h;
1272
1273         /* Gcc nexted function extension.  How cool is this? */
1274         int compare_seqnum(const void *_a, const void *_b)
1275         {
1276                 const struct op_desc *a = _a, *b = _b;
1277
1278                 /* First, maintain order within any trace file. */
1279                 if (a->file == b->file)
1280                         return a->op_num - b->op_num;
1281
1282                 /* Otherwise, arrange by seqnum order. */
1283                 if (op[a->file][a->op_num].seqnum !=
1284                     op[b->file][b->op_num].seqnum)
1285                         return op[a->file][a->op_num].seqnum
1286                                 - op[b->file][b->op_num].seqnum;
1287
1288                 /* Cancelled transactions are assumed to happen first. */
1289                 if (starts_transaction(&op[a->file][a->op_num])
1290                     && !successful_transaction(&op[a->file][a->op_num]))
1291                         return -1;
1292                 if (starts_transaction(&op[b->file][b->op_num])
1293                     && !successful_transaction(&op[b->file][b->op_num]))
1294                         return 1;
1295
1296                 /* No idea. */
1297                 return 0;
1298         }
1299
1300         /* Now sort into seqnum order. */
1301         for (h = 0; h < total_keys * 2; h++) {
1302                 struct op_desc *user = hash[h].user;
1303
1304                 qsort(user, hash[h].num_users, sizeof(user[0]), compare_seqnum);
1305                 figure_deps(filename, op, &hash[h].key, user, hash[h].num_users,
1306                             num);
1307         }
1308 }
1309
1310 static int destroy_depend(struct depend *dep)
1311 {
1312         list_del(&dep->pre_list);
1313         list_del(&dep->post_list);
1314         return 0;
1315 }
1316
1317 static void add_dependency(void *ctx,
1318                            struct op *op[],
1319                            char *filename[],
1320                            const struct op_desc *needs,
1321                            const struct op_desc *prereq)
1322 {
1323         struct depend *dep;
1324
1325         /* We don't depend on ourselves. */
1326         if (needs->file == prereq->file) {
1327                 assert(prereq->op_num < needs->op_num);
1328                 return;
1329         }
1330
1331 #if DEBUG_DEPS
1332         printf("%s:%u: depends on %s:%u\n",
1333                filename[needs->file], needs->op_num+1,
1334                filename[prereq->file], prereq->op_num+1);
1335 #endif
1336
1337         dep = talloc(ctx, struct depend);
1338         dep->needs = *needs;
1339         dep->prereq = *prereq;
1340
1341 #if TRAVERSALS_TAKE_TRANSACTION_LOCK
1342         /* If something in a traverse depends on something in another
1343          * traverse/transaction, it creates a dependency between the
1344          * two groups. */
1345         if ((in_traverse(op[prereq->file], prereq->op_num)
1346              && (starts_transaction(&op[needs->file][needs->op_num])
1347                  || starts_traverse(&op[needs->file][needs->op_num])))
1348             || (in_traverse(op[needs->file], needs->op_num)
1349                 && (starts_transaction(&op[prereq->file][prereq->op_num])
1350                     || starts_traverse(&op[prereq->file][prereq->op_num])))) {
1351                 unsigned int start;
1352
1353                 /* We are satisfied by end of group. */
1354                 start = op[prereq->file][prereq->op_num].group_start;
1355                 dep->prereq.op_num = start + op[prereq->file][start].group_len;
1356                 /* And we need that done by start of our group. */
1357                 dep->needs.op_num = op[needs->file][needs->op_num].group_start;
1358         }
1359
1360         /* There is also this case:
1361          *  <traverse> <read foo> ...
1362          *  <transaction> ... </transaction> <create foo>
1363          * Where if we start the traverse then wait, we could block
1364          * the transaction and deadlock.
1365          *
1366          * We try to address this by ensuring that where seqnum indicates it's
1367          * possible, we wait for <create foo> before *starting* traverse.
1368          */
1369         else if (in_traverse(op[needs->file], needs->op_num)) {
1370                 struct op *need = &op[needs->file][needs->op_num];
1371                 if (op[needs->file][need->group_start].seqnum >
1372                     op[prereq->file][prereq->op_num].seqnum) {
1373                         dep->needs.op_num = need->group_start;
1374                 }
1375         }
1376 #endif
1377
1378         /* If you depend on a transaction or chainlock, you actually
1379          * depend on it ending. */
1380         if (starts_transaction(&op[prereq->file][dep->prereq.op_num])
1381             || starts_chainlock(&op[prereq->file][dep->prereq.op_num])) {
1382                 dep->prereq.op_num
1383                         += op[dep->prereq.file][dep->prereq.op_num].group_len;
1384 #if DEBUG_DEPS
1385                 printf("-> Actually end of transaction %s:%u\n",
1386                        filename[dep->prereq->file], dep->prereq->op_num+1);
1387 #endif
1388         } else
1389                 /* We should never create a dependency from middle of
1390                  * a transaction. */
1391                 assert(!in_transaction(op[prereq->file], dep->prereq.op_num)
1392                        || op[prereq->file][dep->prereq.op_num].type
1393                        == OP_TDB_TRANSACTION_COMMIT
1394                        || op[prereq->file][dep->prereq.op_num].type
1395                        == OP_TDB_TRANSACTION_CANCEL);
1396
1397         list_add(&op[dep->prereq.file][dep->prereq.op_num].post,
1398                  &dep->post_list);
1399         list_add(&op[dep->needs.file][dep->needs.op_num].pre,
1400                  &dep->pre_list);
1401         talloc_set_destructor(dep, destroy_depend);
1402 }
1403
1404 static bool changes_db(const TDB_DATA *key, const struct op *op)
1405 {
1406         return gives(key, NULL, op) != NULL;
1407 }
1408
1409 static void depend_on_previous(struct op *op[],
1410                                char *filename[],
1411                                unsigned int num,
1412                                struct op_desc user[],
1413                                unsigned int i,
1414                                int prev)
1415 {
1416         bool deps[num];
1417         int j;
1418
1419         if (i == 0)
1420                 return;
1421
1422         if (prev == i - 1) {
1423                 /* Just depend on previous. */
1424                 add_dependency(NULL, op, filename, &user[i], &user[prev]);
1425                 return;
1426         }
1427
1428         /* We have to wait for the readers.  Find last one in *each* file. */
1429         memset(deps, 0, sizeof(deps));
1430         deps[user[i].file] = true;
1431         for (j = i - 1; j > prev; j--) {
1432                 if (!deps[user[j].file]) {
1433                         add_dependency(NULL, op, filename, &user[i], &user[j]);
1434                         deps[user[j].file] = true;
1435                 }
1436         }
1437 }
1438
1439 /* This is simple, but not complete.  We don't take into account
1440  * indirect dependencies. */
1441 static void optimize_dependencies(struct op *op[], unsigned int num_ops[],
1442                                   unsigned int num)
1443 {
1444         unsigned int i, j;
1445
1446         /* There can only be one real dependency on each file */
1447         for (i = 0; i < num; i++) {
1448                 for (j = 1; j < num_ops[i]; j++) {
1449                         struct depend *dep, *next;
1450                         struct depend *prev[num];
1451
1452                         memset(prev, 0, sizeof(prev));
1453
1454                         list_for_each_safe(&op[i][j].pre, dep, next, pre_list) {
1455                                 if (!prev[dep->prereq.file]) {
1456                                         prev[dep->prereq.file] = dep;
1457                                         continue;
1458                                 }
1459                                 if (prev[dep->prereq.file]->prereq.op_num
1460                                     < dep->prereq.op_num) {
1461                                         talloc_free(prev[dep->prereq.file]);
1462                                         prev[dep->prereq.file] = dep;
1463                                 } else
1464                                         talloc_free(dep);
1465                         }
1466                 }
1467         }
1468
1469         for (i = 0; i < num; i++) {
1470                 int deps[num];
1471
1472                 for (j = 0; j < num; j++)
1473                         deps[j] = -1;
1474
1475                 for (j = 1; j < num_ops[i]; j++) {
1476                         struct depend *dep, *next;
1477
1478                         list_for_each_safe(&op[i][j].pre, dep, next, pre_list) {
1479                                 if (deps[dep->prereq.file]
1480                                     >= (int)dep->prereq.op_num)
1481                                         talloc_free(dep);
1482                                 else
1483                                         deps[dep->prereq.file]
1484                                                 = dep->prereq.op_num;
1485                         }
1486                 }
1487         }
1488 }
1489
1490 #if TRAVERSALS_TAKE_TRANSACTION_LOCK
1491 /* Force an order among the traversals, so they don't deadlock (as much) */
1492 static void make_traverse_depends(char *filename[],
1493                                   struct op *op[], unsigned int num_ops[],
1494                                   unsigned int num)
1495 {
1496         unsigned int i, num_traversals = 0;
1497         int j;
1498         struct op_desc *desc;
1499
1500         /* Sort by which one runs first. */
1501         int compare_traverse_desc(const void *_a, const void *_b)
1502         {
1503                 const struct op_desc *da = _a, *db = _b;
1504                 const struct op *a = &op[da->file][da->op_num],
1505                         *b = &op[db->file][db->op_num];
1506
1507                 if (a->seqnum != b->seqnum)
1508                         return a->seqnum - b->seqnum;
1509
1510                 /* If they have same seqnum, it means one didn't make any
1511                  * changes.  Thus sort by end in that case. */
1512                 return a[a->group_len].seqnum - b[b->group_len].seqnum;
1513         }
1514
1515         desc = talloc_array(NULL, struct op_desc, 1);
1516
1517         /* Count them. */
1518         for (i = 0; i < num; i++) {
1519                 for (j = 1; j < num_ops[i]; j++) {
1520                         /* Traverse start (ignore those in
1521                          * transactions; they're already covered by
1522                          * transaction dependencies). */
1523                         if (starts_traverse(&op[i][j])
1524                             && !in_transaction(op[i], j)) {
1525                                 desc = talloc_realloc(NULL, desc,
1526                                                       struct op_desc,
1527                                                       num_traversals+1);
1528                                 desc[num_traversals].file = i;
1529                                 desc[num_traversals].op_num = j;
1530                                 num_traversals++;
1531                         }
1532                 }
1533         }
1534         qsort(desc, num_traversals, sizeof(desc[0]), compare_traverse_desc);
1535
1536         for (i = 1; i < num_traversals; i++) {
1537                 const struct op *prev = &op[desc[i-1].file][desc[i-1].op_num];
1538                 const struct op *curr = &op[desc[i].file][desc[i].op_num];
1539
1540                 /* Read traverses don't depend on each other (read lock). */
1541                 if (prev->type == OP_TDB_TRAVERSE_READ_START
1542                     && curr->type == OP_TDB_TRAVERSE_READ_START)
1543                         continue;
1544
1545                 /* Only make dependency if it's clear. */
1546                 if (compare_traverse_desc(&desc[i], &desc[i-1])) {
1547                         /* i depends on end of traverse i-1. */
1548                         struct op_desc end = desc[i-1];
1549                         end.op_num += prev->group_len;
1550                         add_dependency(NULL, op, filename, &desc[i], &end);
1551                 }
1552         }
1553         talloc_free(desc);
1554 }
1555 #endif
1556
1557 static void derive_dependencies(char *filename[],
1558                                 struct op *op[], unsigned int num_ops[],
1559                                 unsigned int num)
1560 {
1561         struct keyinfo *hash;
1562         unsigned int h, i;
1563
1564         /* Create hash table for faster key lookup. */
1565         hash = hash_ops(op, num_ops, num);
1566
1567         /* Sort them by sequence number. */
1568         sort_ops(hash, filename, op, num);
1569
1570         /* Create dependencies back to the last change, rather than
1571          * creating false dependencies by naively making each one
1572          * depend on the previous.  This has two purposes: it makes
1573          * later optimization simpler, and it also avoids deadlock with
1574          * same sequence number ops inside traversals (if one
1575          * traversal doesn't write anything, two ops can have the same
1576          * sequence number yet we can create a traversal dependency
1577          * the other way). */
1578         for (h = 0; h < total_keys * 2; h++) {
1579                 int prev = -1;
1580
1581                 if (hash[h].num_users < 2)
1582                         continue;
1583
1584                 for (i = 0; i < hash[h].num_users; i++) {
1585                         if (changes_db(&hash[h].key, &op[hash[h].user[i].file]
1586                                        [hash[h].user[i].op_num])) {
1587                                 depend_on_previous(op, filename, num,
1588                                                    hash[h].user, i, prev);
1589                                 prev = i;
1590                         } else if (prev >= 0)
1591                                 add_dependency(hash, op, filename,
1592                                                &hash[h].user[i],
1593                                                &hash[h].user[prev]);
1594                 }
1595         }
1596
1597 #if TRAVERSALS_TAKE_TRANSACTION_LOCK
1598         make_traverse_depends(filename, op, num_ops, num);
1599 #endif
1600
1601         optimize_dependencies(op, num_ops, num);
1602 }
1603
1604 int main(int argc, char *argv[])
1605 {
1606         struct timeval start, end;
1607         unsigned int i, num_ops[argc], hashsize[argc], tdb_flags[argc], open_flags[argc];
1608         struct op *op[argc];
1609         int fds[2];
1610         char c;
1611         bool ok = true;
1612
1613         if (argc < 3)
1614                 errx(1, "Usage: %s <tdbfile> <tracefile>...", argv[0]);
1615
1616         pipes = talloc_array(NULL, struct pipe, argc - 2);
1617         for (i = 0; i < argc - 2; i++) {
1618                 printf("Loading tracefile %s...", argv[2+i]);
1619                 fflush(stdout);
1620                 op[i] = load_tracefile(argv[2+i], &num_ops[i], &hashsize[i],
1621                                        &tdb_flags[i], &open_flags[i]);
1622                 if (pipe(pipes[i].fd) != 0)
1623                         err(1, "creating pipe");
1624                 printf("done\n");
1625         }
1626
1627         printf("Calculating inter-dependencies...");
1628         fflush(stdout);
1629         derive_dependencies(argv+2, op, num_ops, i);
1630         printf("done\n");
1631
1632         /* Don't fork for single arg case: simple debugging. */
1633         if (argc == 3) {
1634                 struct tdb_context *tdb;
1635                 tdb = tdb_open_ex(argv[1], hashsize[0], tdb_flags[0]|TDB_NOSYNC,
1636                                   open_flags[0], 0600, NULL, hash_key);
1637                 printf("Single threaded run...");
1638                 fflush(stdout);
1639
1640                 run_ops(tdb, pipes[0].fd[0], argv+2, op, 0, 1, num_ops[0],
1641                         false);
1642                 check_deps(argv[2], op[0], num_ops[0]);
1643
1644                 printf("done\n");
1645                 exit(0);
1646         }
1647
1648         if (pipe(fds) != 0)
1649                 err(1, "creating pipe");
1650
1651         for (i = 0; i < argc - 2; i++) {
1652                 struct tdb_context *tdb;
1653
1654                 switch (fork()) {
1655                 case -1:
1656                         err(1, "fork failed");
1657                 case 0:
1658                         close(fds[1]);
1659                         tdb = tdb_open_ex(argv[1], hashsize[i],
1660                                           tdb_flags[i]|TDB_NOSYNC,
1661                                           open_flags[i], 0600, NULL, hash_key);
1662                         if (!tdb)
1663                                 err(1, "Opening tdb %s", argv[1]);
1664
1665                         /* This catches parent exiting. */
1666                         if (read(fds[0], &c, 1) != 1)
1667                                 exit(1);
1668                         run_ops(tdb, pipes[i].fd[0], argv+2, op, i, 1,
1669                                 num_ops[i], false);
1670                         check_deps(argv[2+i], op[i], num_ops[i]);
1671                         exit(0);
1672                 default:
1673                         break;
1674                 }
1675         }
1676
1677         /* Let everything settle. */
1678         sleep(1);
1679
1680         printf("Starting run...");
1681         fflush(stdout);
1682         gettimeofday(&start, NULL);
1683         /* Tell them all to go!  Any write of sufficient length will do. */
1684         if (write(fds[1], hashsize, i) != i)
1685                 err(1, "Writing to wakeup pipe");
1686
1687         for (i = 0; i < argc - 2; i++) {
1688                 int status;
1689                 wait(&status);
1690                 if (!WIFEXITED(status)) {
1691                         warnx("Child died with signal %i", WTERMSIG(status));
1692                         ok = false;
1693                 } else if (WEXITSTATUS(status) != 0)
1694                         /* Assume child spat out error. */
1695                         ok = false;
1696         }
1697         if (!ok)
1698                 exit(1);
1699
1700         gettimeofday(&end, NULL);
1701         printf("done\n");
1702
1703         end.tv_sec -= start.tv_sec;
1704         printf("Time replaying: %lu usec\n",
1705                end.tv_sec * 1000000UL + (end.tv_usec - start.tv_usec));
1706         
1707         exit(0);
1708 }