git.ozlabs.org Git - ccan/blob - ccan/tdb/tools/replay_trace.c
Traverses seem to work now. Also, much better reporting of deadlocks.
[ccan] / ccan / tdb / tools / replay_trace.c
1 #include <ccan/tdb/tdb.h>
2 #include <ccan/grab_file/grab_file.h>
3 #include <ccan/hash/hash.h>
4 #include <ccan/talloc/talloc.h>
5 #include <ccan/str_talloc/str_talloc.h>
6 #include <ccan/str/str.h>
7 #include <ccan/list/list.h>
8 #include <err.h>
9 #include <ctype.h>
10 #include <string.h>
11 #include <unistd.h>
12 #include <sys/types.h>
13 #include <sys/wait.h>
14 #include <sys/time.h>
15 #include <errno.h>
16 #include <signal.h>
17
/* Two-level stringification: STRINGIFY() expands its argument before
 * STRINGIFY2() turns it into a string literal. */
#define STRINGIFY2(x) #x
#define STRINGIFY(x) STRINGIFY2(x)

/* Starts at one, not zero, so that it can safely be used as a modulus. */
static unsigned int total_keys = 1;

/* #define DEBUG_DEPS 1 */

/* In the current implementation a traversal blocks transactions. */
#define TRAVERSALS_TAKE_TRANSACTION_LOCK 1
28
/* A pair of pipe file descriptors.  The pipes array is indexed by trace
 * file number; do_post() writes dependency notifications to fd[1]. */
struct pipe {
	int fd[2];
};
static struct pipe *pipes;
33
/* Report a fatal replay error as "<filename>:<line>: FAIL: <message>" on
 * stderr and terminate the process with exit status 1.  The message is
 * built printf-style from fmt and the trailing arguments. */
static void __attribute__((noreturn)) fail(const char *filename,
					   unsigned int line,
					   const char *fmt, ...)
{
	va_list args;

	fprintf(stderr, "%s:%u: FAIL: ", filename, line);

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);

	fprintf(stderr, "\n");
	exit(1);
}
47         
/* Evaluate expr once and die via fail() unless it equals expect.
 * Relies on `filename', `file' and `i' being in scope at the call site. */
#define try(expr, expect)						\
	do {								\
		int try_ret = (expr);					\
		if (try_ret != (expect))				\
			fail(filename[file], i+1,			\
			     STRINGIFY(expr) "= %i", try_ret);		\
	} while (0)
56
/* Try or imitate results.
 *
 * Evaluates expr once and compares it with expect (also evaluated once).
 * On a mismatch a diagnostic line is written to stderr and the recorded
 * outcome is imitated: if the trace expected 0, `force' is executed;
 * otherwise `undo' is executed.
 * Relies on `filename', `file' and `i' being in scope at the call site. */
#define unreliable(expr, expect, force, undo)				\
	do {								\
		int unreliable_got = (expr);				\
		int unreliable_want = (expect);				\
		if (unreliable_got != unreliable_want) {		\
			fprintf(stderr, "%s:%u: %s gave %i not %i\n",	\
				filename[file], i+1, STRINGIFY(expr),	\
				unreliable_got, unreliable_want);	\
			if (unreliable_want == 0)			\
				force;					\
			else						\
				undo;					\
		}							\
	} while (0)
71
72 static bool key_eq(TDB_DATA a, TDB_DATA b)
73 {
74         if (a.dsize != b.dsize)
75                 return false;
76         return memcmp(a.dptr, b.dptr, a.dsize) == 0;
77 }
78
79 /* This is based on the hash algorithm from gdbm */
80 static unsigned int hash_key(TDB_DATA *key)
81 {
82         uint32_t value; /* Used to compute the hash value.  */
83         uint32_t   i;   /* Used to cycle through random values. */
84
85         /* Set the initial value from the key size. */
86         for (value = 0x238F13AF ^ key->dsize, i=0; i < key->dsize; i++)
87                 value = (value + (key->dptr[i] << (i*5 % 24)));
88
89         return (1103515243 * value + 12345);  
90 }
91
/* Every kind of trace operation that can be replayed. */
enum op_type {
	/* Whole-database locks. */
	OP_TDB_LOCKALL = 0,
	OP_TDB_LOCKALL_MARK,
	OP_TDB_LOCKALL_UNMARK,
	OP_TDB_LOCKALL_NONBLOCK,
	OP_TDB_UNLOCKALL,
	OP_TDB_LOCKALL_READ,
	OP_TDB_LOCKALL_READ_NONBLOCK,
	OP_TDB_UNLOCKALL_READ,

	/* Per-chain locks. */
	OP_TDB_CHAINLOCK,
	OP_TDB_CHAINLOCK_NONBLOCK,
	OP_TDB_CHAINLOCK_MARK,
	OP_TDB_CHAINLOCK_UNMARK,
	OP_TDB_CHAINUNLOCK,
	OP_TDB_CHAINLOCK_READ,
	OP_TDB_CHAINUNLOCK_READ,

	/* Record and database operations. */
	OP_TDB_PARSE_RECORD,
	OP_TDB_EXISTS,
	OP_TDB_STORE,
	OP_TDB_APPEND,
	OP_TDB_GET_SEQNUM,
	OP_TDB_WIPE_ALL,

	/* Transactions. */
	OP_TDB_TRANSACTION_START,
	OP_TDB_TRANSACTION_CANCEL,
	OP_TDB_TRANSACTION_COMMIT,

	/* Traversals. */
	OP_TDB_TRAVERSE_READ_START,
	OP_TDB_TRAVERSE_START,
	OP_TDB_TRAVERSE_END,
	OP_TDB_TRAVERSE,

	/* Key iteration and single-record access. */
	OP_TDB_FIRSTKEY,
	OP_TDB_NEXTKEY,
	OP_TDB_FETCH,
	OP_TDB_DELETE,
};
126
127 struct op {
128         unsigned int serial;
129         enum op_type op;
130         TDB_DATA key;
131         TDB_DATA data;
132         int ret;
133
134         /* Who is waiting for us? */
135         struct list_head post;
136         /* What are we waiting for? */
137         struct list_head pre;
138
139         /* If I'm part of a group (traverse/transaction) where is
140          * start?  (Otherwise, 0) */
141         unsigned int group_start;
142
143         union {
144                 int flag; /* open and store */
145                 struct traverse *trav; /* traverse start */
146                 TDB_DATA pre_append; /* append */
147                 unsigned int transaction_end; /* transaction start */
148         };
149 };
150
/* Convert one hexadecimal digit (either case) to its value 0-15.
 * Any other character is a fatal error reported against filename:line. */
static unsigned char hex_char(const char *filename, unsigned int line, char c)
{
	/* <ctype.h> functions require a value representable as unsigned
	 * char; a negative plain char would be undefined behaviour. */
	int upper = toupper((unsigned char)c);

	if (upper >= 'A' && upper <= 'F')
		return upper - 'A' + 10;
	if (upper >= '0' && upper <= '9')
		return upper - '0';
	/* Report the character as it appears in the trace. */
	fail(filename, line, "invalid hex character '%c'", c);
}
160
161 /* TDB data is <size>:<%02x>* */
162 static TDB_DATA make_tdb_data(const void *ctx,
163                               const char *filename, unsigned int line,
164                               const char *word)
165 {
166         TDB_DATA data;
167         unsigned int i;
168         const char *p;
169
170         if (streq(word, "NULL"))
171                 return tdb_null;
172
173         data.dsize = atoi(word);
174         data.dptr = talloc_array(ctx, unsigned char, data.dsize);
175         p = strchr(word, ':');
176         if (!p)
177                 fail(filename, line, "invalid tdb data '%s'", word);
178         p++;
179         for (i = 0; i < data.dsize; i++)
180                 data.dptr[i] = hex_char(filename, line, p[i*2])*16
181                         + hex_char(filename, line, p[i*2+1]);
182
183         return data;
184 }
185
186 static void add_op(const char *filename, struct op **op, unsigned int i,
187                    unsigned int serial, enum op_type type)
188 {
189         struct op *new;
190         *op = talloc_realloc(NULL, *op, struct op, i+1);
191         new = (*op) + i;
192         new->op = type;
193         new->serial = serial;
194         new->ret = 0;
195         new->group_start = 0;
196 }
197
198 static void op_add_nothing(const char *filename,
199                            struct op op[], unsigned int op_num, char *words[])
200 {
201         if (words[2])
202                 fail(filename, op_num+1, "Expected no arguments");
203         op[op_num].key = tdb_null;
204 }
205
206 static void op_add_key(const char *filename,
207                        struct op op[], unsigned int op_num, char *words[])
208 {
209         if (words[2] == NULL || words[3])
210                 fail(filename, op_num+1, "Expected just a key");
211
212         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
213         if (op[op_num].op != OP_TDB_TRAVERSE)
214                 total_keys++;
215 }
216
217 static void op_add_key_ret(const char *filename,
218                            struct op op[], unsigned int op_num, char *words[])
219 {
220         if (!words[2] || !words[3] || !words[4] || words[5]
221             || !streq(words[3], "="))
222                 fail(filename, op_num+1, "Expected <key> = <ret>");
223         op[op_num].ret = atoi(words[4]);
224         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
225         /* May only be a unique key if it fails */
226         if (op[op_num].ret != 0)
227                 total_keys++;
228 }
229
230 static void op_add_key_data(const char *filename,
231                             struct op op[], unsigned int op_num, char *words[])
232 {
233         if (!words[2] || !words[3] || !words[4] || words[5]
234             || !streq(words[3], "="))
235                 fail(filename, op_num+1, "Expected <key> = <data>");
236         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
237         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[4]);
238         /* May only be a unique key if it fails */
239         if (!op[op_num].data.dptr)
240                 total_keys++;
241 }
242
243 /* <serial> tdb_store <rec> <rec> <flag> = <ret> */
244 static void op_add_store(const char *filename,
245                          struct op op[], unsigned int op_num, char *words[])
246 {
247         if (!words[2] || !words[3] || !words[4] || !words[5] || !words[6]
248             || words[7] || !streq(words[5], "="))
249                 fail(filename, op_num+1, "Expect <key> <data> <flag> = <ret>");
250
251         op[op_num].flag = strtoul(words[4], NULL, 0);
252         op[op_num].ret = atoi(words[6]);
253         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
254         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[3]);
255         total_keys++;
256 }
257
258 /* <serial> tdb_append <rec> <rec> = <rec> */
259 static void op_add_append(const char *filename,
260                           struct op op[], unsigned int op_num, char *words[])
261 {
262         TDB_DATA post_append;
263
264         if (!words[2] || !words[3] || !words[4] || !words[5] || words[6]
265             || !streq(words[4], "="))
266                 fail(filename, op_num+1, "Expect <key> <data> = <rec>");
267
268         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
269         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[3]);
270
271         post_append = make_tdb_data(op, filename, op_num+1, words[5]);
272
273         /* By subtraction, figure out what previous data was. */
274         op[op_num].pre_append.dptr = post_append.dptr;
275         op[op_num].pre_append.dsize = post_append.dsize - op[op_num].data.dsize;
276         total_keys++;
277 }
278
279 /* <serial> tdb_get_seqnum = <ret> */
280 static void op_add_seqnum(const char *filename,
281                           struct op op[], unsigned int op_num, char *words[])
282 {
283         if (!words[2] || !words[3] || words[4] || !streq(words[2], "="))
284                 fail(filename, op_num+1, "Expect = <ret>");
285
286         op[op_num].key = tdb_null;
287         op[op_num].ret = atoi(words[3]);
288 }
289
290 static void op_add_traverse(const char *filename,
291                             struct op op[], unsigned int op_num, char *words[])
292 {
293         if (words[2])
294                 fail(filename, op_num+1, "Expect no arguments");
295
296         op[op_num].key = tdb_null;
297         op[op_num].trav = NULL;
298 }
299
300 static void op_add_transaction(const char *filename, struct op op[],
301                                unsigned int op_num, char *words[])
302 {
303         if (words[2])
304                 fail(filename, op_num+1, "Expect no arguments");
305
306         op[op_num].key = tdb_null;
307         op[op_num].transaction_end = 0;
308 }
309
310 static void op_analyze_transaction(const char *filename,
311                                    struct op op[], unsigned int op_num,
312                                    char *words[])
313 {
314         int i, start;
315
316         op[op_num].key = tdb_null;
317
318         if (words[2])
319                 fail(filename, op_num+1, "Expect no arguments");
320
321         for (i = op_num-1; i >= 0; i--) {
322                 if (op[i].op == OP_TDB_TRANSACTION_START &&
323                     !op[i].transaction_end)
324                         break;
325         }
326
327         if (i < 0)
328                 fail(filename, op_num+1, "no transaction start found");
329
330         start = i;
331         op[start].transaction_end = op_num;
332
333         /* This rolls in nested transactions.  I think that's right. */
334         for (i++; i <= op_num; i++)
335                 op[i].group_start = start;
336 }
337
338 struct traverse_hash {
339         TDB_DATA key;
340         unsigned int index;
341 };
342
/* State attached to a traverse start op: a hash of keys, each one
 * associated with ops. */
struct traverse {
	/* Number of traversal callouts to perform. */
	unsigned int num;

	/* Op number of the traversal end. */
	unsigned int end;

	/* Key table for trivial traversals; NULL otherwise. */
	struct traverse_hash *hash;
};
354
355 /* A trivial traversal is one which doesn't terminate early and only
356  * plays with its own record.  We can reliably replay these even if
357  * traverse order changes. */
358 static bool is_trivial_traverse(struct op op[], unsigned int end)
359 {
360 #if 0
361         unsigned int i;
362         TDB_DATA cur = tdb_null;
363
364         if (op[end].ret != 0)
365                 return false;
366
367         for (i = 0; i < end; i++) {
368                 if (!op[i].key.dptr)
369                         continue;
370                 if (op[i].op == OP_TDB_TRAVERSE)
371                         cur = op[i].key;
372                 if (!key_eq(cur, op[i].key))
373                         return false;
374         }
375         return true;
376 #endif
377         /* With multiple things happening at once, no traverse is trivial. */
378         return false;
379 }
380
381 static void op_analyze_traverse(const char *filename,
382                                 struct op op[], unsigned int op_num,
383                                 char *words[])
384 {
385         int i, start;
386         struct traverse *trav = talloc(op, struct traverse);
387
388         op[op_num].key = tdb_null;
389
390         /* = %u means traverse function terminated. */
391         if (words[2]) {
392                 if (!streq(words[2], "=") || !words[3] || words[4])
393                         fail(filename, op_num+1, "expect = <num>");
394                 op[op_num].ret = atoi(words[3]);
395         } else
396                 op[op_num].ret = 0;
397
398         trav->num = 0;
399         trav->end = op_num;
400         for (i = op_num-1; i >= 0; i--) {
401                 if (op[i].op == OP_TDB_TRAVERSE)
402                         trav->num++;
403                 if (op[i].op != OP_TDB_TRAVERSE_READ_START
404                     && op[i].op != OP_TDB_TRAVERSE_START)
405                         continue;
406                 if (op[i].trav)
407                         continue;
408                 break;
409         }
410
411         if (i < 0)
412                 fail(filename, op_num+1, "no traversal start found");
413
414         start = i;
415         op[start].trav = trav;
416
417         for (i = start; i <= op_num; i++)
418                 op[i].group_start = start;
419
420         if (is_trivial_traverse(op+i, op_num-i)) {
421                 /* Fill in a plentiful hash table. */
422                 op[start].trav->hash = talloc_zero_array(op[i].trav,
423                                                          struct traverse_hash,
424                                                          trav->num * 2);
425                 for (i = start; i < op_num; i++) {
426                         unsigned int h;
427                         if (op[i].op != OP_TDB_TRAVERSE)
428                                 continue;
429                         h = hash_key(&op[i].key) % (trav->num * 2);
430                         while (trav->hash[h].index)
431                                 h = (h + 1) % (trav->num * 2);
432                         trav->hash[h].index = i+1;
433                         trav->hash[h].key = op[i].key;
434                 }
435         } else
436                 trav->hash = NULL;
437 }
438
439 /* Keep -Wmissing-declarations happy: */
440 const struct op_table *
441 find_keyword (register const char *str, register unsigned int len);
442
443 #include "keywords.c"
444
445 struct depend {
446         /* We can have more than one */
447         struct list_node list;
448         unsigned int file;
449         unsigned int op;
450 };
451
/* Dependency notification sent through a pipe: written by do_post() and
 * read by do_pre().  It says that op src_op of file src_file is done,
 * satisfying a dependency of the receiver's op dst_op. */
struct depend_xmit {
	unsigned int dst_op;
	unsigned int src_file, src_op;
};
456
457 static void remove_matching_dep(struct list_head *deps,
458                                 unsigned int file, unsigned int op)
459 {
460         struct depend *dep;
461
462         list_for_each(deps, dep, list) {
463                 if (dep->file == file && dep->op == op) {
464                         list_del(&dep->list);
465                         return;
466                 }
467         }
468         errx(1, "Failed to find depend on file %u line %u\n", file, op+1);
469 }
470
471 static void check_deps(const char *filename, struct op op[], unsigned int num)
472 {
473 #ifdef DEBUG_DEPS
474         unsigned int i;
475
476         for (i = 1; i < num; i++)
477                 if (!list_empty(&op[i].pre))
478                         fail(filename, i+1, "Still has dependencies");
479 #endif
480 }
481
482 static void dump_pre(char *filename[], unsigned int file,
483                      struct op op[], unsigned int i)
484 {
485         struct depend *dep;
486
487         printf("%s:%u still waiting for:\n", filename[file], i+1);
488         list_for_each(&op[i].pre, dep, list)
489                 printf("    %s:%u\n", filename[dep->file], dep->op+1);
490         check_deps(filename[file], op, i);
491 }
492
493 static void do_pre(char *filename[], unsigned int file, int pre_fd,
494                    struct op op[], unsigned int i)
495 {
496         while (!list_empty(&op[i].pre)) {
497                 struct depend_xmit dep;
498
499 #if DEBUG_DEPS
500                 printf("%s:%u:waiting for pre\n", filename[file], i+1);
501                 fflush(stdout);
502 #endif
503                 alarm(10);
504                 while (read(pre_fd, &dep, sizeof(dep)) != sizeof(dep)) {
505                         if (errno == EINTR) {
506                                 dump_pre(filename, file, op, i);
507                                 exit(1);
508                         } else
509                                 errx(1, "Reading from pipe");
510                 }
511                 alarm(0);
512
513 #if DEBUG_DEPS
514                 printf("%s:%u:got pre %u from %s:%u\n", filename[file], i+1,
515                        dep.dst_op+1, filename[dep.src_file], dep.src_op+1);
516                 fflush(stdout);
517 #endif
518                 /* This could be any op, not just this one. */
519                 remove_matching_dep(&op[dep.dst_op].pre,
520                                     dep.src_file, dep.src_op);
521         }
522 }
523
524 static void do_post(char *filename[], unsigned int file,
525                     const struct op op[], unsigned int i)
526 {
527         struct depend *dep;
528
529         list_for_each(&op[i].post, dep, list) {
530                 struct depend_xmit dx;
531
532                 dx.src_file = file;
533                 dx.src_op = i;
534                 dx.dst_op = dep->op;
535 #if DEBUG_DEPS
536                 printf("%s:%u:sending to file %s:%u\n", filename[file], i+1,
537                        filename[dep->file], dep->op+1);
538 #endif
539                 if (write(pipes[dep->file].fd[1], &dx, sizeof(dx))
540                     != sizeof(dx))
541                         err(1, "%s:%u failed to tell file %s",
542                             filename[file], i+1, filename[dep->file]);
543         }
544 }
545
546 static int get_len(TDB_DATA key, TDB_DATA data, void *private_data)
547 {
548         return data.dsize;
549 }
550
551 static unsigned run_ops(struct tdb_context *tdb,
552                         int pre_fd,
553                         char *filename[],
554                         unsigned int file,
555                         struct op op[],
556                         unsigned int start, unsigned int stop);
557
/* Context handed to the traverse callbacks through their void pointer.
 * op_traverse() initialises it positionally, so the member order matters. */
struct traverse_info {
	/* The op array and trace file being replayed. */
	struct op *op;
	char **filename;
	unsigned file;
	/* Pipe on which dependency notifications arrive. */
	int pre_fd;
	/* Op number of the traverse start. */
	unsigned int start;
	/* Current position; starts at start + 1. */
	unsigned int i;
};
566
567 /* Trivial case: do whatever they did for this key. */
568 static int trivial_traverse(struct tdb_context *tdb,
569                             TDB_DATA key, TDB_DATA data,
570                             void *_tinfo)
571 {
572         struct traverse_info *tinfo = _tinfo;
573         struct traverse *trav = tinfo->op[tinfo->start].trav;
574         unsigned int h = hash_key(&key) % (trav->num * 2);
575
576         while (trav->hash[h].index) {
577                 if (key_eq(trav->hash[h].key, key)) {
578                         run_ops(tdb, tinfo->pre_fd, tinfo->filename,
579                                 tinfo->file, tinfo->op, trav->hash[h].index,
580                                 trav->end);
581                         tinfo->i++;
582                         return 0;
583                 }
584                 h = (h + 1) % (trav->num * 2);
585         }
586         fail(tinfo->filename[tinfo->file], tinfo->start + 1,
587              "unexpected traverse key");
588 }
589
590 /* More complex.  Just do whatever's they did at the n'th entry. */
591 static int nontrivial_traverse(struct tdb_context *tdb,
592                                TDB_DATA key, TDB_DATA data,
593                                void *_tinfo)
594 {
595         struct traverse_info *tinfo = _tinfo;
596         struct traverse *trav = tinfo->op[tinfo->start].trav;
597
598         if (tinfo->i == trav->end) {
599                 /* This can happen if traverse expects to be empty. */
600                 if (tinfo->start + 1 == trav->end)
601                         return 1;
602                 fail(tinfo->filename[tinfo->file], tinfo->start + 1,
603                      "traverse did not terminate");
604         }
605
606         if (tinfo->op[tinfo->i].op != OP_TDB_TRAVERSE)
607                 fail(tinfo->filename[tinfo->file], tinfo->start + 1,
608                      "%s:%u:traverse terminated early");
609
610         /* Run any normal ops. */
611         tinfo->i = run_ops(tdb, tinfo->pre_fd, tinfo->filename, tinfo->file,
612                            tinfo->op, tinfo->i+1, trav->end);
613
614         if (tinfo->i == trav->end)
615                 return 1;
616
617         return 0;
618 }
619
620 static unsigned op_traverse(struct tdb_context *tdb,
621                             int pre_fd,
622                             char *filename[],
623                             unsigned int file,
624                             int (*traversefn)(struct tdb_context *,
625                                               tdb_traverse_func, void *),
626                             struct op op[],
627                             unsigned int start)
628 {
629         struct traverse *trav = op[start].trav;
630         struct traverse_info tinfo = { op, filename, file, pre_fd,
631                                        start, start+1 };
632
633         /* Trivial case. */
634         if (trav->hash) {
635                 int ret = traversefn(tdb, trivial_traverse, &tinfo);
636                 if (ret != trav->num)
637                         fail(filename[file], start+1,
638                              "short traversal %i", ret);
639                 return trav->end;
640         }
641
642         traversefn(tdb, nontrivial_traverse, &tinfo);
643
644         /* Traversing in wrong order can have strange effects: eg. if
645          * original traverse went A (delete A), B, we might do B
646          * (delete A).  So if we have ops left over, we do it now. */
647         while (tinfo.i != trav->end) {
648                 if (op[tinfo.i].op == OP_TDB_TRAVERSE)
649                         tinfo.i++;
650                 else
651                         tinfo.i = run_ops(tdb, pre_fd, filename, file, op,
652                                           tinfo.i, trav->end);
653         }
654
655         return trav->end;
656 }
657
/* SIGALRM handler installed by run_ops().  It deliberately does nothing;
 * do_pre() detects the interrupted read via EINTR. */
static void break_out(int sig)
{
	(void)sig;
}
661
662 static __attribute__((noinline))
663 unsigned run_ops(struct tdb_context *tdb,
664                  int pre_fd,
665                  char *filename[],
666                  unsigned int file,
667                  struct op op[], unsigned int start, unsigned int stop)
668 {
669         unsigned int i;
670         struct sigaction sa;
671
672         sa.sa_handler = break_out;
673         sa.sa_flags = 0;
674
675         sigaction(SIGALRM, &sa, NULL);
676         for (i = start; i < stop; i++) {
677                 do_pre(filename, file, pre_fd, op, i);
678
679                 switch (op[i].op) {
680                 case OP_TDB_LOCKALL:
681                         try(tdb_lockall(tdb), op[i].ret);
682                         break;
683                 case OP_TDB_LOCKALL_MARK:
684                         try(tdb_lockall_mark(tdb), op[i].ret);
685                         break;
686                 case OP_TDB_LOCKALL_UNMARK:
687                         try(tdb_lockall_unmark(tdb), op[i].ret);
688                         break;
689                 case OP_TDB_LOCKALL_NONBLOCK:
690                         unreliable(tdb_lockall_nonblock(tdb), op[i].ret,
691                                    tdb_lockall(tdb), tdb_unlockall(tdb));
692                         break;
693                 case OP_TDB_UNLOCKALL:
694                         try(tdb_unlockall(tdb), op[i].ret);
695                         break;
696                 case OP_TDB_LOCKALL_READ:
697                         try(tdb_lockall_read(tdb), op[i].ret);
698                         break;
699                 case OP_TDB_LOCKALL_READ_NONBLOCK:
700                         unreliable(tdb_lockall_read_nonblock(tdb), op[i].ret,
701                                    tdb_lockall_read(tdb),
702                                    tdb_unlockall_read(tdb));
703                         break;
704                 case OP_TDB_UNLOCKALL_READ:
705                         try(tdb_unlockall_read(tdb), op[i].ret);
706                         break;
707                 case OP_TDB_CHAINLOCK:
708                         try(tdb_chainlock(tdb, op[i].key), op[i].ret);
709                         break;
710                 case OP_TDB_CHAINLOCK_NONBLOCK:
711                         unreliable(tdb_chainlock_nonblock(tdb, op[i].key),
712                                    op[i].ret,
713                                    tdb_chainlock(tdb, op[i].key),
714                                    tdb_chainunlock(tdb, op[i].key));
715                         break;
716                 case OP_TDB_CHAINLOCK_MARK:
717                         try(tdb_chainlock_mark(tdb, op[i].key), op[i].ret);
718                         break;
719                 case OP_TDB_CHAINLOCK_UNMARK:
720                         try(tdb_chainlock_unmark(tdb, op[i].key), op[i].ret);
721                         break;
722                 case OP_TDB_CHAINUNLOCK:
723                         try(tdb_chainunlock(tdb, op[i].key), op[i].ret);
724                         break;
725                 case OP_TDB_CHAINLOCK_READ:
726                         try(tdb_chainlock_read(tdb, op[i].key), op[i].ret);
727                         break;
728                 case OP_TDB_CHAINUNLOCK_READ:
729                         try(tdb_chainunlock_read(tdb, op[i].key), op[i].ret);
730                         break;
731                 case OP_TDB_PARSE_RECORD:
732                         try(tdb_parse_record(tdb, op[i].key, get_len, NULL),
733                             op[i].ret);
734                         break;
735                 case OP_TDB_EXISTS:
736                         try(tdb_exists(tdb, op[i].key), op[i].ret);
737                         break;
738                 case OP_TDB_STORE:
739                         try(tdb_store(tdb, op[i].key, op[i].data, op[i].flag),
740                             op[i].ret < 0 ? op[i].ret : 0);
741                         break;
742                 case OP_TDB_APPEND:
743                         try(tdb_append(tdb, op[i].key, op[i].data),
744                             op[i].ret < 0 ? op[i].ret : 0);
745                         break;
746                 case OP_TDB_GET_SEQNUM:
747                         try(tdb_get_seqnum(tdb), op[i].ret);
748                         break;
749                 case OP_TDB_WIPE_ALL:
750                         try(tdb_wipe_all(tdb), op[i].ret);
751                         break;
752                 case OP_TDB_TRANSACTION_START:
753                         try(tdb_transaction_start(tdb), op[i].ret);
754                         break;
755                 case OP_TDB_TRANSACTION_CANCEL:
756                         try(tdb_transaction_cancel(tdb), op[i].ret);
757                         break;
758                 case OP_TDB_TRANSACTION_COMMIT:
759                         try(tdb_transaction_commit(tdb), op[i].ret);
760                         break;
761                 case OP_TDB_TRAVERSE_READ_START:
762                         i = op_traverse(tdb, pre_fd, filename, file,
763                                         tdb_traverse_read, op, i);
764                         break;
765                 case OP_TDB_TRAVERSE_START:
766                         i = op_traverse(tdb, pre_fd, filename, file,
767                                         tdb_traverse, op, i);
768                         break;
769                 case OP_TDB_TRAVERSE:
770                         /* Terminate: we're in a traverse, and we've
771                          * done our ops. */
772                         return i;
773                 case OP_TDB_TRAVERSE_END:
774                         fail(filename[file], i+1, "unepxected end traverse");
775                 /* FIXME: These must be treated like traverse. */
776                 case OP_TDB_FIRSTKEY:
777                         if (!key_eq(tdb_firstkey(tdb), op[i].data))
778                                 fail(filename[file], i+1, "bad firstkey");
779                         break;
780                 case OP_TDB_NEXTKEY:
781                         if (!key_eq(tdb_nextkey(tdb, op[i].key), op[i].data))
782                                 fail(filename[file], i+1, "bad nextkey");
783                         break;
784                 case OP_TDB_FETCH: {
785                         TDB_DATA f = tdb_fetch(tdb, op[i].key);
786                         if (!key_eq(f, op[i].data))
787                                 fail(filename[file], i+1, "bad fetch %u",
788                                      f.dsize);
789                         break;
790                 }
791                 case OP_TDB_DELETE:
792                         try(tdb_delete(tdb, op[i].key), op[i].ret);
793                         break;
794                 }
795                 do_post(filename, file, op, i);
796         }
797         return i;
798 }
799
800 static struct op *load_tracefile(const char *filename, unsigned int *num,
801                                  unsigned int *hashsize,
802                                  unsigned int *tdb_flags,
803                                  unsigned int *open_flags)
804 {
805         unsigned int i;
806         struct op *op = talloc_array(NULL, struct op, 1);
807         char **words;
808         char **lines;
809         char *file;
810
811         file = grab_file(NULL, filename, NULL);
812         if (!file)
813                 err(1, "Reading %s", filename);
814
815         lines = strsplit(file, file, "\n", NULL);
816         if (!lines[0])
817                 errx(1, "%s is empty", filename);
818
819         words = strsplit(lines, lines[0], " ", NULL);
820         if (!streq(words[1], "tdb_open"))
821                 fail(filename, 1, "does not start with tdb_open");
822
823         *hashsize = atoi(words[2]);
824         *tdb_flags = strtoul(words[3], NULL, 0);
825         *open_flags = strtoul(words[4], NULL, 0);
826
827         for (i = 1; lines[i]; i++) {
828                 const struct op_table *opt;
829
830                 words = strsplit(lines, lines[i], " ", NULL);
831                 if (!words[0] || !words[1])
832                         fail(filename, i+1, "Expected serial number and op");
833                
834                 opt = find_keyword(words[1], strlen(words[1]));
835                 if (!opt) {
836                         if (streq(words[1], "tdb_close")) {
837                                 if (lines[i+1])
838                                         fail(filename, i+2,
839                                              "lines after tdb_close");
840                                 *num = i;
841                                 talloc_free(lines);
842                                 return op;
843                         }
844                         fail(filename, i+1, "Unknown operation '%s'", words[1]);
845                 }
846
847                 add_op(filename, &op, i, atoi(words[0]), opt->type);
848                 opt->enhance_op(filename, op, i, words);
849         }
850
851         fprintf(stderr, "%s:%u:last operation is not tdb_close: incomplete?",
852               filename, i);
853         talloc_free(lines);
854         *num = i - 1;
855         return op;
856 }
857
/* Every key ever seen is remembered together with the operations that
 * touch it.  One key_user names a single such operation. */
struct key_user {
        /* Index of the trace file (first index into the op[] arrays). */
        unsigned int file;
        /* Index of the operation within that trace file. */
        unsigned int op_num;
};
863
864 struct keyinfo {
865         TDB_DATA key;
866         unsigned int num_users;
867         struct key_user *user;
868 };
869
870 static const TDB_DATA must_not_exist;
871 static const TDB_DATA must_exist;
872 static const TDB_DATA not_exists_or_empty;
873
874 /* NULL means doesn't care if it exists or not, &must_exist means
875  * it must exist but we don't care what, &must_not_exist means it must
876  * not exist, otherwise the data it needs. */
877 static const TDB_DATA *needs(const struct op *op)
878 {
879         switch (op->op) {
880         /* FIXME: Pull forward deps, since we can deadlock */
881         case OP_TDB_CHAINLOCK:
882         case OP_TDB_CHAINLOCK_NONBLOCK:
883         case OP_TDB_CHAINLOCK_MARK:
884         case OP_TDB_CHAINLOCK_UNMARK:
885         case OP_TDB_CHAINUNLOCK:
886         case OP_TDB_CHAINLOCK_READ:
887         case OP_TDB_CHAINUNLOCK_READ:
888                 return NULL;
889
890         case OP_TDB_APPEND:
891                 if (op->pre_append.dsize == 0)
892                         return &not_exists_or_empty;
893                 return &op->pre_append;
894
895         case OP_TDB_STORE:
896                 if (op->flag == TDB_INSERT) {
897                         if (op->ret < 0)
898                                 return &must_exist;
899                         else
900                                 return &must_not_exist;
901                 } else if (op->flag == TDB_MODIFY) {
902                         if (op->ret < 0)
903                                 return &must_not_exist;
904                         else
905                                 return &must_exist;
906                 }
907                 /* No flags?  Don't care */
908                 return NULL;
909
910         case OP_TDB_EXISTS:
911                 if (op->ret == 1)
912                         return &must_exist;
913                 else
914                         return &must_not_exist;
915
916         case OP_TDB_PARSE_RECORD:
917                 if (op->ret < 0)
918                         return &must_not_exist;
919                 return &must_exist;
920
921         /* FIXME: handle these. */
922         case OP_TDB_WIPE_ALL:
923         case OP_TDB_FIRSTKEY:
924         case OP_TDB_NEXTKEY:
925         case OP_TDB_GET_SEQNUM:
926         case OP_TDB_TRAVERSE:
927         case OP_TDB_TRANSACTION_COMMIT:
928         case OP_TDB_TRANSACTION_CANCEL:
929         case OP_TDB_TRANSACTION_START:
930                 return NULL;
931
932         case OP_TDB_FETCH:
933                 if (!op->data.dptr)
934                         return &must_not_exist;
935                 return &op->data;
936
937         case OP_TDB_DELETE:
938                 if (op->ret < 0)
939                         return &must_not_exist;
940                 return &must_exist;
941
942         default:
943                 errx(1, "Unexpected op %i", op->op);
944         }
945         
946 }
947
/* Effect one operation has on another operation's requirement. */
enum satisfaction {
        SATISFIES,      /* This op makes the other one possible. */
        DISSATISFIES,   /* This op makes the other one impossible. */
        NO_CHANGE       /* This op makes no difference. */
};
956
957 static enum satisfaction satisfies(const struct op *op, const TDB_DATA *need)
958 {
959         bool deletes, creates;
960
961         /* Failed ops don't change state of db. */
962         if (op->ret < 0)
963                 return NO_CHANGE;
964
965         deletes = (op->op == OP_TDB_DELETE || op->op == OP_TDB_WIPE_ALL);
966         /* Append/store is creating the record if ret == 0 (1 means replaced) */
967         if (op->op == OP_TDB_APPEND || op->op == OP_TDB_STORE)
968                 creates = (op->ret == 0);
969         else
970                 creates = false;
971
972         if (need == &must_not_exist) {
973                 if (deletes)
974                         return SATISFIES;
975                 if (creates)
976                         return DISSATISFIES;
977                 return NO_CHANGE;
978         }
979
980         if (need == &must_exist) {
981                 if (deletes)
982                         return DISSATISFIES;
983                 if (creates) 
984                         return SATISFIES;
985                 return NO_CHANGE;
986         }
987
988         if (need == &not_exists_or_empty) {
989                 if (deletes)
990                         return SATISFIES;
991                 if (!creates)
992                         return NO_CHANGE;
993         }
994
995         /* OK, we need an exact match. */
996         if (deletes)
997                 return DISSATISFIES;
998
999         /* An append which results in the wrong data dissatisfies. */
1000         if (op->op == OP_TDB_APPEND) {
1001                 if (op->pre_append.dsize + op->data.dsize != need->dsize)
1002                         return DISSATISFIES;
1003                 if (memcmp(op->pre_append.dptr, need->dptr,
1004                            op->pre_append.dsize) != 0)
1005                         return DISSATISFIES;
1006                 if (memcmp(op->data.dptr, need->dptr + op->pre_append.dsize,
1007                            op->data.dsize) != 0)
1008                         return DISSATISFIES;
1009                 return SATISFIES;
1010         } else if (op->op == OP_TDB_STORE) {
1011                 if (key_eq(op->data, *need))
1012                         return SATISFIES;
1013                 return DISSATISFIES;
1014         }
1015         return NO_CHANGE;
1016 }
1017
1018 static struct keyinfo *hash_ops(struct op *op[], unsigned int num_ops[],
1019                                 unsigned int num)
1020 {
1021         unsigned int i, j, h;
1022         struct keyinfo *hash;
1023
1024         /* Gcc nexted function extension.  How cool is this? */
1025         int compare_user_serial(const void *_a, const void *_b)
1026         {
1027                 const struct key_user *a = _a, *b = _b;
1028                 int ret = op[a->file][a->op_num].serial
1029                         - op[b->file][b->op_num].serial;
1030
1031                 /* Serial is not completely reliable.  First, fetches don't
1032                  * inc serial, second we don't lock to get seq number.
1033                  * This smooths things a little for simple cases. */
1034                 if (ret == 0) {
1035                         const TDB_DATA *a_needs, *b_needs;
1036
1037                         b_needs = needs(&op[b->file][b->op_num]);
1038                         switch (satisfies(&op[a->file][a->op_num], b_needs)) {
1039                         case SATISFIES:
1040                                 /* A comes first: it satisfies B. */
1041                                 return -1;
1042                         case DISSATISFIES:
1043                                 /* A doesn't come first: it messes up B. */
1044                                 return 1;
1045                         default:
1046                                 break;
1047                         }
1048
1049                         a_needs = needs(&op[a->file][a->op_num]);
1050                         switch (satisfies(&op[b->file][b->op_num], a_needs)) {
1051                         case SATISFIES:
1052                                 /* B comes first: it satisfies A. */
1053                                 return 1;
1054                         case DISSATISFIES:
1055                                 /* B doesn't come first: it messes up A. */
1056                                 return -1;
1057                         default:
1058                                 break;
1059                         }
1060                 }
1061                 return ret;
1062         }
1063
1064         hash = talloc_zero_array(op[0], struct keyinfo, total_keys*2);
1065         for (i = 0; i < num; i++) {
1066                 for (j = 1; j < num_ops[i]; j++) {
1067                         /* We can't do this on allocation, due to realloc. */
1068                         list_head_init(&op[i][j].post);
1069                         list_head_init(&op[i][j].pre);
1070
1071                         if (!op[i][j].key.dptr)
1072                                 continue;
1073
1074                         /* We don't wait for traverse keys */
1075                         /* FIXME: We should, for trivial traversals. */
1076                         if (op[i][j].op == OP_TDB_TRAVERSE)
1077                                 continue;
1078
1079                         h = hash_key(&op[i][j].key) % (total_keys * 2);
1080                         while (!key_eq(hash[h].key, op[i][j].key)) {
1081                                 if (!hash[h].key.dptr) {
1082                                         hash[h].key = op[i][j].key;
1083                                         break;
1084                                 }
1085                                 h = (h + 1) % (total_keys * 2);
1086                         }
1087                         /* Might as well save some memory if we can. */
1088                         if (op[i][j].key.dptr != hash[h].key.dptr) {
1089                                 talloc_free(op[i][j].key.dptr);
1090                                 op[i][j].key.dptr = hash[h].key.dptr;
1091                         }
1092                         hash[h].user = talloc_realloc(hash, hash[h].user,
1093                                                      struct key_user,
1094                                                      hash[h].num_users+1);
1095                         hash[h].user[hash[h].num_users].op_num = j;
1096                         hash[h].user[hash[h].num_users].file = i;
1097                         hash[h].num_users++;
1098                 }
1099         }
1100
1101         /* Now sort into seqnum order. */
1102         for (h = 0; h < total_keys * 2; h++)
1103                 qsort(hash[h].user, hash[h].num_users, sizeof(hash[h].user[0]),
1104                       compare_user_serial);
1105
1106         return hash;
1107 }
1108
1109 static void add_dependency(void *ctx,
1110                            struct op *op[],
1111                            char *filename[],
1112                            unsigned int needs_file,
1113                            unsigned int needs_opnum,
1114                            unsigned int satisfies_file,
1115                            unsigned int satisfies_opnum)
1116 {
1117         struct depend *post, *pre;
1118         unsigned int needs_start, sat_start;
1119
1120         /* We don't depend on ourselves. */
1121         if (needs_file == satisfies_file)
1122                 return;
1123
1124 #if DEBUG_DEPS
1125         printf("%s:%u: depends on %s:%u\n",
1126                filename[needs_file], needs_opnum+1,
1127                filename[satisfies_file], satisfies_opnum+1);
1128 #endif
1129
1130         needs_start = op[needs_file][needs_opnum].group_start;
1131         sat_start = op[satisfies_file][satisfies_opnum].group_start;
1132
1133         /* If needs is in a transaction, we need it before start. */
1134         if (needs_start) {
1135                 switch (op[needs_file][needs_start].op) {
1136                 case OP_TDB_TRANSACTION_START:
1137                         needs_opnum = needs_start;
1138 #ifdef DEBUG_DEPS
1139                         printf("  -> Back to %u\n", needs_start+1);
1140                         fflush(stdout);
1141 #endif
1142                         break;
1143                 default:
1144                         break;
1145                 }
1146         }
1147
1148         /* If satisfies is in a transaction, we wait until after commit. */
1149         /* FIXME: If transaction is cancelled, don't need dependency. */
1150         if (sat_start) {
1151                 if (op[satisfies_file][sat_start].op
1152                     == OP_TDB_TRANSACTION_START) {
1153                         satisfies_opnum
1154                                 = op[satisfies_file][sat_start].transaction_end;
1155 #ifdef DEBUG_DEPS
1156                         printf("  -> Depends on %u\n", satisfies_opnum+1);
1157                         fflush(stdout);
1158 #endif
1159                 }
1160         }
1161
1162         post = talloc(ctx, struct depend);
1163         post->file = needs_file;
1164         post->op = needs_opnum;
1165         list_add(&op[satisfies_file][satisfies_opnum].post, &post->list);
1166
1167         pre = talloc(ctx, struct depend);
1168         pre->file = satisfies_file;
1169         pre->op = satisfies_opnum;
1170         list_add(&op[needs_file][needs_opnum].pre, &pre->list);
1171 }
1172
1173 #if TRAVERSALS_TAKE_TRANSACTION_LOCK
/* One traversal start found while scanning the trace files. */
struct traverse_dep {
        /* Trace file the traversal lives in. */
        unsigned int file;
        /* Index of the traverse-start operation within that file. */
        unsigned int op_num;
        /* The traverse-start operation itself, i.e. &op[file][op_num]. */
        const struct op *op;
};
1179
1180 /* Sort by which one runs first. */
1181 static int compare_traverse_dep(const void *_a, const void *_b)
1182 {
1183         const struct traverse_dep *a = _a, *b = _b;
1184         const struct traverse *trava = a->op->trav, *travb = b->op->trav;
1185
1186         if (a->op->serial != b->op->serial)
1187                 return a->op->serial - b->op->serial;
1188
1189         /* If they have same serial, it means one didn't make any changes.
1190          * Thus sort by end in that case. */
1191         return a->op[trava->end - a->op_num].serial
1192                 - b->op[travb->end - b->op_num].serial;
1193 }
1194
1195 /* Traversals can deadlock against each other.  Force order. */
1196 static void make_traverse_depends(char *filename[],
1197                                   struct op *op[], unsigned int num_ops[],
1198                                   unsigned int num)
1199 {
1200         unsigned int i, j, num_traversals = 0;
1201         struct traverse_dep *dep;
1202
1203         dep = talloc_array(NULL, struct traverse_dep, 1);
1204
1205         /* Count them. */
1206         for (i = 0; i < num; i++) {
1207                 for (j = 0; j < num_ops[i]; j++) {
1208                         if (op[i][j].op == OP_TDB_TRAVERSE_START
1209                             || op[i][j].op == OP_TDB_TRAVERSE_READ_START) {
1210                                 dep = talloc_realloc(NULL, dep,
1211                                                      struct traverse_dep,
1212                                                      num_traversals+1);
1213                                 dep[num_traversals].file = i;
1214                                 dep[num_traversals].op_num = j;
1215                                 dep[num_traversals].op = &op[i][j];
1216                                 num_traversals++;
1217                         }
1218                 }
1219         }
1220         qsort(dep, num_traversals, sizeof(dep[0]), compare_traverse_dep);
1221         for (i = 1; i < num_traversals; i++) {
1222                 /* i depends on end of traverse i-1. */
1223                 add_dependency(NULL, op, filename, dep[i].file, dep[i].op_num,
1224                                dep[i-1].file, dep[i-1].op->trav->end);
1225         }
1226         talloc_free(dep);
1227 }
1228 #endif /* TRAVERSALS_TAKE_TRANSACTION_LOCK */
1229
1230 static void derive_dependencies(char *filename[],
1231                                 struct op *op[], unsigned int num_ops[],
1232                                 unsigned int num)
1233 {
1234         struct keyinfo *hash;
1235         unsigned int i, j;
1236
1237         /* Create hash table for faster key lookup. */
1238         hash = hash_ops(op, num_ops, num);
1239
1240         /* We make the naive assumption that two ops on the same key
1241          * have to be ordered; it's overkill. */
1242         for (i = 0; i < total_keys * 2; i++) {
1243                 for (j = 1; j < hash[i].num_users; j++) {
1244                         add_dependency(hash, op, filename,
1245                                        hash[i].user[j].file,
1246                                        hash[i].user[j].op_num,
1247                                        hash[i].user[j-1].file,
1248                                        hash[i].user[j-1].op_num);
1249                 }
1250         }
1251
1252 #if TRAVERSALS_TAKE_TRANSACTION_LOCK
1253         make_traverse_depends(filename, op, num_ops, num);
1254 #endif
1255 }
1256
1257 int main(int argc, char *argv[])
1258 {
1259         struct timeval start, end;
1260         unsigned int i, num_ops[argc], hashsize[argc], tdb_flags[argc], open_flags[argc];
1261         struct op *op[argc];
1262         int fds[2];
1263         char c;
1264         bool ok = true;
1265
1266         if (argc < 3)
1267                 errx(1, "Usage: %s <tdbfile> <tracefile>...", argv[0]);
1268
1269         pipes = talloc_array(NULL, struct pipe, argc - 2);
1270         for (i = 0; i < argc - 2; i++) {
1271                 printf("Loading tracefile %s...", argv[2+i]);
1272                 fflush(stdout);
1273                 op[i] = load_tracefile(argv[2+i], &num_ops[i], &hashsize[i],
1274                                        &tdb_flags[i], &open_flags[i]);
1275                 if (pipe(pipes[i].fd) != 0)
1276                         err(1, "creating pipe");
1277                 printf("done\n");
1278         }
1279
1280         printf("Calculating inter-dependencies...");
1281         fflush(stdout);
1282         derive_dependencies(argv+2, op, num_ops, i);
1283         printf("done\n");
1284
1285         /* Don't fork for single arg case: simple debugging. */
1286         if (argc == 3) {
1287                 struct tdb_context *tdb;
1288                 tdb = tdb_open_ex(argv[1], hashsize[0], tdb_flags[0],
1289                                   open_flags[0], 0600,
1290                                   NULL, hash_key);
1291                 printf("Single threaded run...");
1292                 fflush(stdout);
1293
1294                 run_ops(tdb, pipes[0].fd[0], argv+2, 0, op[0], 1, num_ops[0]);
1295                 check_deps(argv[2], op[0], num_ops[0]);
1296
1297                 printf("done\n");
1298                 exit(0);
1299         }
1300
1301         if (pipe(fds) != 0)
1302                 err(1, "creating pipe");
1303
1304         for (i = 0; i < argc - 2; i++) {
1305                 struct tdb_context *tdb;
1306
1307                 switch (fork()) {
1308                 case -1:
1309                         err(1, "fork failed");
1310                 case 0:
1311                         close(fds[1]);
1312                         tdb = tdb_open_ex(argv[1], hashsize[i], tdb_flags[i],
1313                                           open_flags[i], 0600,
1314                                           NULL, hash_key);
1315                         if (!tdb)
1316                                 err(1, "Opening tdb %s", argv[1]);
1317
1318                         /* This catches parent exiting. */
1319                         if (read(fds[0], &c, 1) != 1)
1320                                 exit(1);
1321                         run_ops(tdb, pipes[i].fd[0], argv+2, i, op[i], 1,
1322                                 num_ops[i]);
1323                         check_deps(argv[2+i], op[i], num_ops[i]);
1324                         exit(0);
1325                 default:
1326                         break;
1327                 }
1328         }
1329
1330         /* Let everything settle. */
1331         sleep(1);
1332
1333         printf("Starting run...");
1334         fflush(stdout);
1335         gettimeofday(&start, NULL);
1336         /* Tell them all to go!  Any write of sufficient length will do. */
1337         if (write(fds[1], hashsize, i) != i)
1338                 err(1, "Writing to wakeup pipe");
1339
1340         for (i = 0; i < argc - 2; i++) {
1341                 int status;
1342                 wait(&status);
1343                 if (!WIFEXITED(status)) {
1344                         warnx("Child died with signal %i", WTERMSIG(status));
1345                         ok = false;
1346                 } else if (WEXITSTATUS(status) != 0)
1347                         /* Assume child spat out error. */
1348                         ok = false;
1349         }
1350         if (!ok)
1351                 exit(1);
1352
1353         gettimeofday(&end, NULL);
1354         printf("done\n");
1355
1356         end.tv_sec -= start.tv_sec;
1357         printf("Time replaying: %lu usec\n",
1358                end.tv_sec * 1000000UL + (end.tv_usec - start.tv_usec));
1359         
1360         exit(0);
1361 }