]> git.ozlabs.org Git - ccan/blob - ccan/tdb/tools/replay_trace.c
First attempt at transactions & traverse (deadlocks still).
[ccan] / ccan / tdb / tools / replay_trace.c
1 #include <ccan/tdb/tdb.h>
2 #include <ccan/grab_file/grab_file.h>
3 #include <ccan/hash/hash.h>
4 #include <ccan/talloc/talloc.h>
5 #include <ccan/str_talloc/str_talloc.h>
6 #include <ccan/str/str.h>
7 #include <ccan/list/list.h>
8 #include <err.h>
9 #include <ctype.h>
10 #include <string.h>
11 #include <unistd.h>
12 #include <sys/types.h>
13 #include <sys/wait.h>
14 #include <sys/time.h>
15
16 #define STRINGIFY2(x) #x
17 #define STRINGIFY(x) STRINGIFY2(x)
18
19 /* Avoid mod by zero */
20 static unsigned int total_keys = 1;
21
22 #define DEBUG_DEPS 1
23
24 /* Traversals block transactions in the current implementation. */
25 #define TRAVERSALS_TAKE_TRANSACTION_LOCK 1
26
27 struct pipe {
28         int fd[2];
29 };
30 static struct pipe *pipes;
31
32 static void __attribute__((noreturn)) fail(const char *filename,
33                                            unsigned int line,
34                                            const char *fmt, ...)
35 {
36         va_list ap;
37
38         va_start(ap, fmt);
39         fprintf(stderr, "%s:%u: FAIL: ", filename, line);
40         vfprintf(stderr, fmt, ap);
41         fprintf(stderr, "\n");
42         va_end(ap);
43         exit(1);
44 }
45         
46 /* Try or die. */
47 #define try(expr, expect)                                               \
48         do {                                                            \
49                 int ret = (expr);                                       \
50                 if (ret != (expect))                                    \
51                         fail(filename, i+1, STRINGIFY(expr) "= %i", ret); \
52         } while (0)
53
54 /* Try or imitate results. */
55 #define unreliable(expr, expect, force, undo)                           \
56         do {                                                            \
57                 int ret = expr;                                         \
58                 if (ret != expect) {                                    \
59                         fprintf(stderr, "%s:%u: %s gave %i not %i",     \
60                               filename, i+1, STRINGIFY(expr), ret, expect); \
61                         if (expect == 0)                                \
62                                 force;                                  \
63                         else                                            \
64                                 undo;                                   \
65                 }                                                       \
66         } while (0)
67
68 static bool key_eq(TDB_DATA a, TDB_DATA b)
69 {
70         if (a.dsize != b.dsize)
71                 return false;
72         return memcmp(a.dptr, b.dptr, a.dsize) == 0;
73 }
74
75 /* This is based on the hash algorithm from gdbm */
76 static unsigned int hash_key(TDB_DATA *key)
77 {
78         uint32_t value; /* Used to compute the hash value.  */
79         uint32_t   i;   /* Used to cycle through random values. */
80
81         /* Set the initial value from the key size. */
82         for (value = 0x238F13AF ^ key->dsize, i=0; i < key->dsize; i++)
83                 value = (value + (key->dptr[i] << (i*5 % 24)));
84
85         return (1103515243 * value + 12345);  
86 }
87
88 enum op_type {
89         OP_TDB_LOCKALL,
90         OP_TDB_LOCKALL_MARK,
91         OP_TDB_LOCKALL_UNMARK,
92         OP_TDB_LOCKALL_NONBLOCK,
93         OP_TDB_UNLOCKALL,
94         OP_TDB_LOCKALL_READ,
95         OP_TDB_LOCKALL_READ_NONBLOCK,
96         OP_TDB_UNLOCKALL_READ,
97         OP_TDB_CHAINLOCK,
98         OP_TDB_CHAINLOCK_NONBLOCK,
99         OP_TDB_CHAINLOCK_MARK,
100         OP_TDB_CHAINLOCK_UNMARK,
101         OP_TDB_CHAINUNLOCK,
102         OP_TDB_CHAINLOCK_READ,
103         OP_TDB_CHAINUNLOCK_READ,
104         OP_TDB_PARSE_RECORD,
105         OP_TDB_EXISTS,
106         OP_TDB_STORE,
107         OP_TDB_APPEND,
108         OP_TDB_GET_SEQNUM,
109         OP_TDB_WIPE_ALL,
110         OP_TDB_TRANSACTION_START,
111         OP_TDB_TRANSACTION_CANCEL,
112         OP_TDB_TRANSACTION_COMMIT,
113         OP_TDB_TRAVERSE_READ_START,
114         OP_TDB_TRAVERSE_START,
115         OP_TDB_TRAVERSE_END,
116         OP_TDB_TRAVERSE,
117         OP_TDB_FIRSTKEY,
118         OP_TDB_NEXTKEY,
119         OP_TDB_FETCH,
120         OP_TDB_DELETE,
121 };
122
123 struct op {
124         unsigned int serial;
125         enum op_type op;
126         TDB_DATA key;
127         TDB_DATA data;
128         int ret;
129
130         /* Who is waiting for us? */
131         struct list_head post;
132         /* How many are we waiting for? */
133         unsigned int pre;
134
135         /* If I'm part of a group (traverse/transaction) where is
136          * start?  (Otherwise, 0) */
137         unsigned int group_start;
138
139         union {
140                 int flag; /* open and store */
141                 struct traverse *trav; /* traverse start */
142                 TDB_DATA post_append; /* append */
143                 unsigned int transaction_end; /* transaction start */
144         };
145 };
146
147 static unsigned char hex_char(const char *filename, unsigned int line, char c)
148 {
149         c = toupper(c);
150         if (c >= 'A' && c <= 'F')
151                 return c - 'A' + 10;
152         if (c >= '0' && c <= '9')
153                 return c - '0';
154         fail(filename, line, "invalid hex character '%c'", c);
155 }
156
157 /* TDB data is <size>:<%02x>* */
158 static TDB_DATA make_tdb_data(const void *ctx,
159                               const char *filename, unsigned int line,
160                               const char *word)
161 {
162         TDB_DATA data;
163         unsigned int i;
164         const char *p;
165
166         if (streq(word, "NULL"))
167                 return tdb_null;
168
169         data.dsize = atoi(word);
170         data.dptr = talloc_array(ctx, unsigned char, data.dsize);
171         p = strchr(word, ':');
172         if (!p)
173                 fail(filename, line, "invalid tdb data '%s'", word);
174         p++;
175         for (i = 0; i < data.dsize; i++)
176                 data.dptr[i] = hex_char(filename, line, p[i*2])*16
177                         + hex_char(filename, line, p[i*2+1]);
178
179         return data;
180 }
181
182 static void add_op(const char *filename, struct op **op, unsigned int i,
183                    unsigned int serial, enum op_type type)
184 {
185         struct op *new;
186         *op = talloc_realloc(NULL, *op, struct op, i+1);
187         new = (*op) + i;
188         new->op = type;
189         new->serial = serial;
190         new->pre = 0;
191         new->ret = 0;
192         new->group_start = 0;
193 }
194
195 static void op_add_nothing(const char *filename,
196                            struct op op[], unsigned int op_num, char *words[])
197 {
198         if (words[2])
199                 fail(filename, op_num+1, "Expected no arguments");
200         op[op_num].key = tdb_null;
201 }
202
203 static void op_add_key(const char *filename,
204                        struct op op[], unsigned int op_num, char *words[])
205 {
206         if (words[2] == NULL || words[3])
207                 fail(filename, op_num+1, "Expected just a key");
208
209         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
210         if (op[op_num].op != OP_TDB_TRAVERSE)
211                 total_keys++;
212 }
213
214 static void op_add_key_ret(const char *filename,
215                            struct op op[], unsigned int op_num, char *words[])
216 {
217         if (!words[2] || !words[3] || !words[4] || words[5]
218             || !streq(words[3], "="))
219                 fail(filename, op_num+1, "Expected <key> = <ret>");
220         op[op_num].ret = atoi(words[4]);
221         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
222         /* May only be a unique key if it fails */
223         if (op[op_num].ret != 0)
224                 total_keys++;
225 }
226
227 static void op_add_key_data(const char *filename,
228                             struct op op[], unsigned int op_num, char *words[])
229 {
230         if (!words[2] || !words[3] || !words[4] || words[5]
231             || !streq(words[3], "="))
232                 fail(filename, op_num+1, "Expected <key> = <data>");
233         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
234         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[4]);
235         /* May only be a unique key if it fails */
236         if (!op[op_num].data.dptr)
237                 total_keys++;
238 }
239
240 /* <serial> tdb_store <rec> <rec> <flag> = <ret> */
241 static void op_add_store(const char *filename,
242                          struct op op[], unsigned int op_num, char *words[])
243 {
244         if (!words[2] || !words[3] || !words[4] || !words[5] || !words[6]
245             || words[7] || !streq(words[5], "="))
246                 fail(filename, op_num+1, "Expect <key> <data> <flag> = <ret>");
247
248         op[op_num].flag = strtoul(words[4], NULL, 0);
249         op[op_num].ret = atoi(words[6]);
250         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
251         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[3]);
252         total_keys++;
253 }
254
255 /* <serial> tdb_append <rec> <rec> = <rec> */
256 static void op_add_append(const char *filename,
257                           struct op op[], unsigned int op_num, char *words[])
258 {
259         if (!words[2] || !words[3] || !words[4] || !words[5] || words[6]
260             || !streq(words[4], "="))
261                 fail(filename, op_num+1, "Expect <key> <data> = <rec>");
262
263         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
264         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[3]);
265         op[op_num].post_append
266                 = make_tdb_data(op, filename, op_num+1, words[5]);
267         total_keys++;
268 }
269
270 /* <serial> tdb_get_seqnum = <ret> */
271 static void op_add_seqnum(const char *filename,
272                           struct op op[], unsigned int op_num, char *words[])
273 {
274         if (!words[2] || !words[3] || words[4] || !streq(words[2], "="))
275                 fail(filename, op_num+1, "Expect = <ret>");
276
277         op[op_num].key = tdb_null;
278         op[op_num].ret = atoi(words[3]);
279 }
280
281 static void op_add_traverse(const char *filename,
282                             struct op op[], unsigned int op_num, char *words[])
283 {
284         if (words[2])
285                 fail(filename, op_num+1, "Expect no arguments");
286
287         op[op_num].key = tdb_null;
288         op[op_num].trav = NULL;
289 }
290
291 static void op_add_transaction(const char *filename, struct op op[],
292                                unsigned int op_num, char *words[])
293 {
294         if (words[2])
295                 fail(filename, op_num+1, "Expect no arguments");
296
297         op[op_num].key = tdb_null;
298         op[op_num].transaction_end = 0;
299 }
300
301 static void op_analyze_transaction(const char *filename,
302                                    struct op op[], unsigned int op_num,
303                                    char *words[])
304 {
305         int i, start;
306
307         op[op_num].key = tdb_null;
308
309         if (words[2])
310                 fail(filename, op_num+1, "Expect no arguments");
311
312         for (i = op_num-1; i >= 0; i--) {
313                 if (op[i].op == OP_TDB_TRANSACTION_START &&
314                     !op[i].transaction_end)
315                         break;
316         }
317
318         if (i < 0)
319                 fail(filename, op_num+1, "no transaction start found");
320
321         start = i;
322         op[start].transaction_end = op_num;
323
324         /* This rolls in nested transactions.  I think that's right. */
325         for (i++; i <= op_num; i++)
326                 op[i].group_start = start;
327 }
328
329 struct traverse_hash {
330         TDB_DATA key;
331         unsigned int index;
332 };
333
334 /* A traverse is a hash of keys, each one associated with ops. */
335 struct traverse {
336         /* How many traversal callouts should I do? */
337         unsigned int num;
338
339         /* Where is traversal end op? */
340         unsigned int end;
341
342         /* For trivial traversals. */
343         struct traverse_hash *hash;
344 };
345
346 /* A trivial traversal is one which doesn't terminate early and only
347  * plays with its own record.  We can reliably replay these even if
348  * traverse order changes. */
349 static bool is_trivial_traverse(struct op op[], unsigned int end)
350 {
351 #if 0
352         unsigned int i;
353         TDB_DATA cur = tdb_null;
354
355         if (op[end].ret != 0)
356                 return false;
357
358         for (i = 0; i < end; i++) {
359                 if (!op[i].key.dptr)
360                         continue;
361                 if (op[i].op == OP_TDB_TRAVERSE)
362                         cur = op[i].key;
363                 if (!key_eq(cur, op[i].key))
364                         return false;
365         }
366         return true;
367 #endif
368         /* With multiple things happening at once, no traverse is trivial. */
369         return false;
370 }
371
372 static void op_analyze_traverse(const char *filename,
373                                 struct op op[], unsigned int op_num,
374                                 char *words[])
375 {
376         int i, start;
377         struct traverse *trav = talloc(op, struct traverse);
378
379         op[op_num].key = tdb_null;
380
381         /* = %u means traverse function terminated. */
382         if (words[2]) {
383                 if (!streq(words[2], "=") || !words[3] || words[4])
384                         fail(filename, op_num+1, "expect = <num>");
385                 op[op_num].ret = atoi(words[3]);
386         } else
387                 op[op_num].ret = 0;
388
389         trav->num = 0;
390         trav->end = op_num;
391         for (i = op_num-1; i >= 0; i--) {
392                 if (op[i].op == OP_TDB_TRAVERSE)
393                         trav->num++;
394                 if (op[i].op != OP_TDB_TRAVERSE_READ_START
395                     && op[i].op != OP_TDB_TRAVERSE_START)
396                         continue;
397                 if (op[i].trav)
398                         continue;
399                 break;
400         }
401
402         if (i < 0)
403                 fail(filename, op_num+1, "no traversal start found");
404
405         start = i;
406         op[start].trav = trav;
407
408         for (i = start; i <= op_num; i++)
409                 op[i].group_start = start;
410
411         if (is_trivial_traverse(op+i, op_num-i)) {
412                 /* Fill in a plentiful hash table. */
413                 op[start].trav->hash = talloc_zero_array(op[i].trav,
414                                                          struct traverse_hash,
415                                                          trav->num * 2);
416                 for (i = start; i < op_num; i++) {
417                         unsigned int h;
418                         if (op[i].op != OP_TDB_TRAVERSE)
419                                 continue;
420                         h = hash_key(&op[i].key) % (trav->num * 2);
421                         while (trav->hash[h].index)
422                                 h = (h + 1) % (trav->num * 2);
423                         trav->hash[h].index = i+1;
424                         trav->hash[h].key = op[i].key;
425                 }
426         } else
427                 trav->hash = NULL;
428 }
429
430 /* Keep -Wmissing-declarations happy: */
431 const struct op_table *
432 find_keyword (register const char *str, register unsigned int len);
433
434 #include "keywords.c"
435
436 static int get_len(TDB_DATA key, TDB_DATA data, void *private_data)
437 {
438         return data.dsize;
439 }
440
441 static unsigned run_ops(struct tdb_context *tdb,
442                         int pre_fd,
443                         const char *filename,
444                         struct op op[],
445                         unsigned int start, unsigned int stop);
446
447 struct traverse_info {
448         struct op *op;
449         const char *filename;
450         int pre_fd;
451         unsigned int start;
452         unsigned int i;
453 };
454
455 /* Trivial case: do whatever they did for this key. */
456 static int trivial_traverse(struct tdb_context *tdb,
457                             TDB_DATA key, TDB_DATA data,
458                             void *_tinfo)
459 {
460         struct traverse_info *tinfo = _tinfo;
461         struct traverse *trav = tinfo->op[tinfo->start].trav;
462         unsigned int h = hash_key(&key) % (trav->num * 2);
463
464         while (trav->hash[h].index) {
465                 if (key_eq(trav->hash[h].key, key)) {
466                         run_ops(tdb, tinfo->pre_fd, tinfo->filename, tinfo->op,
467                                 trav->hash[h].index, trav->end);
468                         tinfo->i++;
469                         return 0;
470                 }
471                 h = (h + 1) % (trav->num * 2);
472         }
473         fail(tinfo->filename, tinfo->start + 1, "unexpected traverse key");
474 }
475
476 /* More complex.  Just do whatever's they did at the n'th entry. */
477 static int nontrivial_traverse(struct tdb_context *tdb,
478                                TDB_DATA key, TDB_DATA data,
479                                void *_tinfo)
480 {
481         struct traverse_info *tinfo = _tinfo;
482         struct traverse *trav = tinfo->op[tinfo->start].trav;
483
484         if (tinfo->i == trav->end) {
485                 /* This can happen if traverse expects to be empty. */
486                 if (tinfo->start + 1 == trav->end)
487                         return 1;
488                 fail(tinfo->filename, tinfo->start + 1,
489                      "traverse did not terminate");
490         }
491
492         if (tinfo->op[tinfo->i].op != OP_TDB_TRAVERSE)
493                 fail(tinfo->filename, tinfo->start + 1,
494                      "%s:%u:traverse terminated early");
495
496         /* Run any normal ops. */
497         tinfo->i = run_ops(tdb, tinfo->pre_fd, tinfo->filename, tinfo->op,
498                            tinfo->i+1, trav->end);
499
500         if (tinfo->i == trav->end)
501                 return 1;
502
503         return 0;
504 }
505
506 static unsigned op_traverse(struct tdb_context *tdb,
507                             int pre_fd,
508                             const char *filename,
509                             int (*traversefn)(struct tdb_context *,
510                                               tdb_traverse_func, void *),
511                             struct op op[],
512                             unsigned int start)
513 {
514         struct traverse *trav = op[start].trav;
515         struct traverse_info tinfo = { op, filename, pre_fd, start, start+1 };
516
517         /* Trivial case. */
518         if (trav->hash) {
519                 int ret = traversefn(tdb, trivial_traverse, &tinfo);
520                 if (ret != trav->num)
521                         fail(filename, start+1, "short traversal %i", ret);
522                 return trav->end;
523         }
524
525         traversefn(tdb, nontrivial_traverse, &tinfo);
526
527         /* Traversing in wrong order can have strange effects: eg. if
528          * original traverse went A (delete A), B, we might do B
529          * (delete A).  So if we have ops left over, we do it now. */
530         while (tinfo.i != trav->end) {
531                 if (op[tinfo.i].op == OP_TDB_TRAVERSE)
532                         tinfo.i++;
533                 else
534                         tinfo.i = run_ops(tdb, pre_fd, filename, op,
535                                           tinfo.i, trav->end);
536         }
537         return trav->end;
538 }
539
540 struct depend {
541         /* We can have more than one */
542         struct list_node list;
543         unsigned int file;
544         unsigned int op;
545 };
546
547 static void do_pre(const char *filename, int pre_fd,
548                    struct op op[], unsigned int i)
549 {
550         while (op[i].pre != 0) {
551                 unsigned int opnum;
552
553 #if DEBUG_DEPS
554                 printf("%s:%u:waiting for pre\n", filename, i+1);
555                 fflush(stdout);
556 #endif
557                 if (read(pre_fd, &opnum, sizeof(opnum)) != sizeof(opnum))
558                         errx(1, "Reading from pipe");
559
560 #if DEBUG_DEPS
561                 printf("%s:%u:got pre %u (%u)\n", filename, i+1, opnum+1,
562                        op[opnum].pre);
563                 fflush(stdout);
564 #endif
565                 /* This could be any op, not just this one. */
566                 if (op[opnum].pre == 0)
567                         errx(1, "Got extra notification for op line %u",
568                              opnum + 1);
569                 if (opnum < i)
570                         errx(1, "%s:%u: got late notification for line %u",
571                              filename, i + 1, opnum + 1);
572                 op[opnum].pre--;
573         }
574 }
575
576 static void do_post(const char *filename, const struct op op[], unsigned int i)
577 {
578         struct depend *dep;
579
580         list_for_each(&op[i].post, dep, list) {
581 #if DEBUG_DEPS
582                 printf("%s:%u:sending %u to file %u\n", filename, i+1,
583                        dep->op+1, dep->file);
584 #endif
585                 if (write(pipes[dep->file].fd[1], &dep->op, sizeof(dep->op))
586                     != sizeof(dep->op))
587                         err(1, "Failed to tell file %u", dep->file);
588         }
589 }
590
591 static __attribute__((noinline))
592 unsigned run_ops(struct tdb_context *tdb,
593                  int pre_fd,
594                  const char *filename,
595                  struct op op[], unsigned int start, unsigned int stop)
596 {
597         unsigned int i;
598
599         for (i = start; i < stop; i++) {
600                 do_pre(filename, pre_fd, op, i);
601
602                 switch (op[i].op) {
603                 case OP_TDB_LOCKALL:
604                         try(tdb_lockall(tdb), op[i].ret);
605                         break;
606                 case OP_TDB_LOCKALL_MARK:
607                         try(tdb_lockall_mark(tdb), op[i].ret);
608                         break;
609                 case OP_TDB_LOCKALL_UNMARK:
610                         try(tdb_lockall_unmark(tdb), op[i].ret);
611                         break;
612                 case OP_TDB_LOCKALL_NONBLOCK:
613                         unreliable(tdb_lockall_nonblock(tdb), op[i].ret,
614                                    tdb_lockall(tdb), tdb_unlockall(tdb));
615                         break;
616                 case OP_TDB_UNLOCKALL:
617                         try(tdb_unlockall(tdb), op[i].ret);
618                         break;
619                 case OP_TDB_LOCKALL_READ:
620                         try(tdb_lockall_read(tdb), op[i].ret);
621                         break;
622                 case OP_TDB_LOCKALL_READ_NONBLOCK:
623                         unreliable(tdb_lockall_read_nonblock(tdb), op[i].ret,
624                                    tdb_lockall_read(tdb),
625                                    tdb_unlockall_read(tdb));
626                         break;
627                 case OP_TDB_UNLOCKALL_READ:
628                         try(tdb_unlockall_read(tdb), op[i].ret);
629                         break;
630                 case OP_TDB_CHAINLOCK:
631                         try(tdb_chainlock(tdb, op[i].key), op[i].ret);
632                         break;
633                 case OP_TDB_CHAINLOCK_NONBLOCK:
634                         unreliable(tdb_chainlock_nonblock(tdb, op[i].key),
635                                    op[i].ret,
636                                    tdb_chainlock(tdb, op[i].key),
637                                    tdb_chainunlock(tdb, op[i].key));
638                         break;
639                 case OP_TDB_CHAINLOCK_MARK:
640                         try(tdb_chainlock_mark(tdb, op[i].key), op[i].ret);
641                         break;
642                 case OP_TDB_CHAINLOCK_UNMARK:
643                         try(tdb_chainlock_unmark(tdb, op[i].key), op[i].ret);
644                         break;
645                 case OP_TDB_CHAINUNLOCK:
646                         try(tdb_chainunlock(tdb, op[i].key), op[i].ret);
647                         break;
648                 case OP_TDB_CHAINLOCK_READ:
649                         try(tdb_chainlock_read(tdb, op[i].key), op[i].ret);
650                         break;
651                 case OP_TDB_CHAINUNLOCK_READ:
652                         try(tdb_chainunlock_read(tdb, op[i].key), op[i].ret);
653                         break;
654                 case OP_TDB_PARSE_RECORD:
655                         try(tdb_parse_record(tdb, op[i].key, get_len, NULL),
656                             op[i].ret);
657                         break;
658                 case OP_TDB_EXISTS:
659                         try(tdb_exists(tdb, op[i].key), op[i].ret);
660                         break;
661                 case OP_TDB_STORE:
662                         try(tdb_store(tdb, op[i].key, op[i].data, op[i].flag),
663                             op[i].ret < 0 ? op[i].ret : 0);
664                         break;
665                 case OP_TDB_APPEND:
666                         try(tdb_append(tdb, op[i].key, op[i].data),
667                             op[i].ret < 0 ? op[i].ret : 0);
668                         break;
669                 case OP_TDB_GET_SEQNUM:
670                         try(tdb_get_seqnum(tdb), op[i].ret);
671                         break;
672                 case OP_TDB_WIPE_ALL:
673                         try(tdb_wipe_all(tdb), op[i].ret);
674                         break;
675                 case OP_TDB_TRANSACTION_START:
676                         try(tdb_transaction_start(tdb), op[i].ret);
677                         break;
678                 case OP_TDB_TRANSACTION_CANCEL:
679                         try(tdb_transaction_cancel(tdb), op[i].ret);
680                         break;
681                 case OP_TDB_TRANSACTION_COMMIT:
682                         try(tdb_transaction_commit(tdb), op[i].ret);
683                         break;
684                 case OP_TDB_TRAVERSE_READ_START:
685                         i = op_traverse(tdb, pre_fd, filename,
686                                         tdb_traverse_read, op, i);
687                         break;
688                 case OP_TDB_TRAVERSE_START:
689                         i = op_traverse(tdb, pre_fd, filename,
690                                         tdb_traverse, op, i);
691                         break;
692                 case OP_TDB_TRAVERSE:
693                         /* Terminate: we're in a traverse, and we've
694                          * done our ops. */
695                         return i;
696                 case OP_TDB_TRAVERSE_END:
697                         fail(filename, i+1, "unepxected end traverse");
698                 /* FIXME: These must be treated like traverse. */
699                 case OP_TDB_FIRSTKEY:
700                         if (!key_eq(tdb_firstkey(tdb), op[i].data))
701                                 fail(filename, i+1, "bad firstkey");
702                         break;
703                 case OP_TDB_NEXTKEY:
704                         if (!key_eq(tdb_nextkey(tdb, op[i].key), op[i].data))
705                                 fail(filename, i+1, "bad nextkey");
706                         break;
707                 case OP_TDB_FETCH: {
708                         TDB_DATA f = tdb_fetch(tdb, op[i].key);
709                         if (!key_eq(f, op[i].data))
710                                 fail(filename, i+1, "bad fetch %u", f.dsize);
711                         break;
712                 }
713                 case OP_TDB_DELETE:
714                         try(tdb_delete(tdb, op[i].key), op[i].ret);
715                         break;
716                 }
717                 do_post(filename, op, i);
718         }
719         return i;
720 }
721
722 static struct op *load_tracefile(const char *filename, unsigned int *num,
723                                  unsigned int *hashsize,
724                                  unsigned int *tdb_flags,
725                                  unsigned int *open_flags)
726 {
727         unsigned int i;
728         struct op *op = talloc_array(NULL, struct op, 1);
729         char **words;
730         char **lines;
731         char *file;
732
733         file = grab_file(NULL, filename, NULL);
734         if (!file)
735                 err(1, "Reading %s", filename);
736
737         lines = strsplit(file, file, "\n", NULL);
738         if (!lines[0])
739                 errx(1, "%s is empty", filename);
740
741         words = strsplit(lines, lines[0], " ", NULL);
742         if (!streq(words[1], "tdb_open"))
743                 fail(filename, 1, "does not start with tdb_open");
744
745         *hashsize = atoi(words[2]);
746         *tdb_flags = strtoul(words[3], NULL, 0);
747         *open_flags = strtoul(words[4], NULL, 0);
748
749         for (i = 1; lines[i]; i++) {
750                 const struct op_table *opt;
751
752                 words = strsplit(lines, lines[i], " ", NULL);
753                 if (!words[0] || !words[1])
754                         fail(filename, i+1, "Expected serial number and op");
755                
756                 opt = find_keyword(words[1], strlen(words[1]));
757                 if (!opt) {
758                         if (streq(words[1], "tdb_close")) {
759                                 if (lines[i+1])
760                                         fail(filename, i+2,
761                                              "lines after tdb_close");
762                                 *num = i;
763                                 talloc_free(lines);
764                                 return op;
765                         }
766                         fail(filename, i+1, "Unknown operation '%s'", words[1]);
767                 }
768
769                 add_op(filename, &op, i, atoi(words[0]), opt->type);
770                 opt->enhance_op(filename, op, i, words);
771         }
772
773         fprintf(stderr, "%s:%u:last operation is not tdb_close: incomplete?",
774               filename, i);
775         talloc_free(lines);
776         *num = i - 1;
777         return op;
778 }
779
780 /* We remember all the keys we've ever seen, and who has them. */
781 struct key_user {
782         unsigned int file;
783         unsigned int op_num;
784 };
785
786 struct keyinfo {
787         TDB_DATA key;
788         unsigned int num_users;
789         struct key_user *user;
790 };
791
792 static bool changes_db(const struct op *op)
793 {
794         if (op->ret != 0)
795                 return false;
796
797         return op->op == OP_TDB_STORE
798                 || op->op == OP_TDB_APPEND
799                 || op->op == OP_TDB_WIPE_ALL
800                 || op->op == OP_TDB_TRANSACTION_COMMIT
801                 || op->op == OP_TDB_DELETE;
802 }
803
804 static struct keyinfo *hash_ops(struct op *op[], unsigned int num_ops[],
805                                 unsigned int num)
806 {
807         unsigned int i, j, h;
808         struct keyinfo *hash;
809
810         /* Gcc nexted function extension.  How cool is this? */
811         int compare_user_serial(const void *_a, const void *_b)
812         {
813                 const struct key_user *a = _a, *b = _b;
814                 int ret = op[a->file][a->op_num].serial
815                         - op[b->file][b->op_num].serial;
816
817                 /* Fetches don't inc serial, so we put changes first. */
818                 if (ret == 0) {
819                         if (changes_db(&op[a->file][a->op_num])
820                             && !changes_db(&op[b->file][b->op_num]))
821                                 return -1;
822                         if (changes_db(&op[b->file][b->op_num])
823                             && !changes_db(&op[a->file][a->op_num]))
824                                 return 1;
825                 }
826                 return ret;
827         }
828
829         hash = talloc_zero_array(op[0], struct keyinfo, total_keys*2);
830         for (i = 0; i < num; i++) {
831                 for (j = 1; j < num_ops[i]; j++) {
832                         /* We can't do this on allocation, due to realloc. */
833                         list_head_init(&op[i][j].post);
834
835                         if (!op[i][j].key.dptr)
836                                 continue;
837
838                         /* We don't wait for traverse keys */
839                         /* FIXME: We should, for trivial traversals. */
840                         if (op[i][j].op == OP_TDB_TRAVERSE)
841                                 continue;
842
843                         h = hash_key(&op[i][j].key) % (total_keys * 2);
844                         while (!key_eq(hash[h].key, op[i][j].key)) {
845                                 if (!hash[h].key.dptr) {
846                                         hash[h].key = op[i][j].key;
847                                         break;
848                                 }
849                                 h = (h + 1) % (total_keys * 2);
850                         }
851                         /* Might as well save some memory if we can. */
852                         if (op[i][j].key.dptr != hash[h].key.dptr) {
853                                 talloc_free(op[i][j].key.dptr);
854                                 op[i][j].key.dptr = hash[h].key.dptr;
855                         }
856                         hash[h].user = talloc_realloc(hash, hash[h].user,
857                                                      struct key_user,
858                                                      hash[h].num_users+1);
859                         hash[h].user[hash[h].num_users].op_num = j;
860                         hash[h].user[hash[h].num_users].file = i;
861                         hash[h].num_users++;
862                 }
863         }
864
865         /* Now sort into seqnum order. */
866         for (h = 0; h < total_keys * 2; h++)
867                 qsort(hash[h].user, hash[h].num_users, sizeof(hash[h].user[0]),
868                       compare_user_serial);
869
870         return hash;
871 }
872
873 static void add_dependency(void *ctx,
874                            struct op *op[],
875                            unsigned int needs_file,
876                            unsigned int needs_opnum,
877                            unsigned int satisfies_file,
878                            unsigned int satisfies_opnum)
879 {
880         struct depend *post;
881         unsigned int needs_start, sat_start;
882
883         needs_start = op[needs_file][needs_opnum].group_start;
884         sat_start = op[satisfies_file][satisfies_opnum].group_start;
885
886         /* If needs is in a transaction, we need it before start. */
887         if (needs_start) {
888                 switch (op[needs_file][needs_start].op) {
889                 case OP_TDB_TRANSACTION_START:
890 #if TRAVERSALS_TAKE_TRANSACTION_LOCK
891                 case OP_TDB_TRAVERSE_START:
892                 case OP_TDB_TRAVERSE_READ_START:
893 #endif
894                         needs_opnum = needs_start;
895 #ifdef DEBUG_DEPS
896                         printf("  -> Back to %u\n", needs_start+1);
897                         fflush(stdout);
898 #endif
899                         break;
900                 default:
901                         break;
902                 }
903         }
904
905         /* If satisfies is in a transaction, we wait until after commit. */
906         /* FIXME: If transaction is cancelled, don't need dependency. */
907         if (sat_start) {
908                 if (op[satisfies_file][sat_start].op
909                     == OP_TDB_TRANSACTION_START) {
910                         satisfies_opnum
911                                 = op[satisfies_file][sat_start].transaction_end;
912 #ifdef DEBUG_DEPS
913                         printf("  -> Depends on %u\n", satisfies_opnum+1);
914                         fflush(stdout);
915 #endif
916                 }
917         }
918
919         post = talloc(ctx, struct depend);
920         post->file = needs_file;
921         post->op = needs_opnum;
922         list_add(&op[satisfies_file][satisfies_opnum].post, &post->list);
923
924         op[needs_file][needs_opnum].pre++;
925 #ifdef DEBUG_DEPS
926         if (op[needs_file][needs_opnum].pre > 1) {
927                 printf("   (File %u opnum %u hash %u needs)\n",
928                        needs_file, needs_opnum+1,
929                        op[needs_file][needs_opnum].pre);
930                 fflush(stdout);
931         }
932 #endif
933 }
934
935 static void derive_dependencies(char *filename[],
936                                 struct op *op[], unsigned int num_ops[],
937                                 unsigned int num)
938 {
939         struct keyinfo *hash;
940         unsigned int i;
941
942         /* Create hash table for faster key lookup. */
943         hash = hash_ops(op, num_ops, num);
944
945         /* We make the naive assumption that two ops on the same key
946          * have to be ordered; it's overkill. */
947         for (i = 0; i < total_keys * 2; i++) {
948                 unsigned int j;
949
950                 for (j = 1; j < hash[i].num_users; j++) {
951                         /* We don't depend on ourselves. */
952                         if (hash[i].user[j].file == hash[i].user[j-1].file)
953                                 continue;
954 #if DEBUG_DEPS
955                         printf("%s:%u: depends on %s:%u\n",
956                                filename[hash[i].user[j].file],
957                                hash[i].user[j].op_num+1,
958                                filename[hash[i].user[j-1].file],
959                                hash[i].user[j-1].op_num+1);
960                         fflush(stdout);
961 #endif
962                         add_dependency(hash, op,
963                                        hash[i].user[j].file,
964                                        hash[i].user[j].op_num,
965                                        hash[i].user[j-1].file,
966                                        hash[i].user[j-1].op_num);
967                 }
968         }
969 }
970
971 int main(int argc, char *argv[])
972 {
973         struct timeval start, end;
974         unsigned int i, num_ops[argc], hashsize[argc], tdb_flags[argc], open_flags[argc];
975         struct op *op[argc];
976         int fds[2];
977         char c;
978
979         if (argc < 3)
980                 errx(1, "Usage: %s <tdbfile> <tracefile>...", argv[0]);
981
982         pipes = talloc_array(NULL, struct pipe, argc - 2);
983         for (i = 0; i < argc - 2; i++) {
984                 op[i] = load_tracefile(argv[2+i], &num_ops[i], &hashsize[i],
985                                        &tdb_flags[i], &open_flags[i]);
986                 if (pipe(pipes[i].fd) != 0)
987                         err(1, "creating pipe");
988         }
989
990         derive_dependencies(argv+2, op, num_ops, i);
991
992         /* Don't fork for single arg case: simple debugging. */
993         if (argc == 3) {
994                 struct tdb_context *tdb;
995                 tdb = tdb_open_ex(argv[1], hashsize[0], tdb_flags[0],
996                                   open_flags[0], 0600,
997                                   NULL, hash_key);
998                 run_ops(tdb, pipes[0].fd[0], argv[2],
999                         op[0], 1, num_ops[0]);
1000                 exit(0);
1001         }
1002
1003         if (pipe(fds) != 0)
1004                 err(1, "creating pipe");
1005
1006         for (i = 0; i < argc - 2; i++) {
1007                 struct tdb_context *tdb;
1008
1009                 switch (fork()) {
1010                 case -1:
1011                         err(1, "fork failed");
1012                 case 0:
1013                         close(fds[1]);
1014                         tdb = tdb_open_ex(argv[1], hashsize[i], tdb_flags[i],
1015                                           open_flags[i], 0600,
1016                                           NULL, hash_key);
1017                         if (!tdb)
1018                                 err(1, "Opening tdb %s", argv[1]);
1019
1020                         /* This catches parent exiting. */
1021                         if (read(fds[0], &c, 1) != 1)
1022                                 exit(1);
1023                         run_ops(tdb, pipes[i].fd[0], argv[2+i],
1024                                 op[i], 1, num_ops[i]);
1025                         exit(0);
1026                 default:
1027                         break;
1028                 }
1029         }
1030
1031         /* Let everything settle. */
1032         sleep(1);
1033
1034         gettimeofday(&start, NULL);
1035         /* Tell them all to go!  Any write of sufficient length will do. */
1036         if (write(fds[1], hashsize, i) != i)
1037                 err(1, "Writing to wakeup pipe");
1038
1039         for (i = 0; i < argc - 2; i++) {
1040                 int status;
1041                 wait(&status);
1042                 if (!WIFEXITED(status))
1043                         errx(1, "Child died with signal %i", WTERMSIG(status));
1044                 if (WEXITSTATUS(status) != 0)
1045                         errx(1, "Child died with error code");
1046         }
1047         gettimeofday(&end, NULL);
1048
1049         end.tv_sec -= start.tv_sec;
1050         printf("Time replaying: %lu usec\n",
1051                end.tv_sec * 1000000UL + (end.tv_usec - start.tv_usec));
1052         
1053         exit(0);
1054 }