git.ozlabs.org Git - ccan/blob - ccan/tdb/tools/replay_trace.c

#include <ccan/tdb/tdb.h>
#include <ccan/grab_file/grab_file.h>
#include <ccan/hash/hash.h>
#include <ccan/talloc/talloc.h>
#include <ccan/str_talloc/str_talloc.h>
#include <ccan/str/str.h>
#include <ccan/list/list.h>
#include <err.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <errno.h>
#include <signal.h>
#include <assert.h>
  18
/* Two-level stringification: STRINGIFY(x) lets x be macro-expanded
 * first, then STRINGIFY2 turns the result into a string literal. */
#define STRINGIFY2(x) #x
#define STRINGIFY(x) STRINGIFY2(x)
  21
  22 /* Avoid mod by zero */
  23 static unsigned int total_keys = 1;
  24
  25 /* #define DEBUG_DEPS 1 */
  26
  27 /* Traversals block transactions in the current implementation. */
  28 #define TRAVERSALS_TAKE_TRANSACTION_LOCK 1
  29
/* A pair of file descriptors; do_post() writes to fd[1]. */
struct pipe {
        int fd[2];
};
/* Array of pipes, indexed by file number (see do_post()). */
static struct pipe *pipes;
  34
  35 static void __attribute__((noreturn)) fail(const char *filename,
  36                                            unsigned int line,
  37                                            const char *fmt, ...)
  38 {
  39         va_list ap;
  40
  41         va_start(ap, fmt);
  42         fprintf(stderr, "%s:%u: FAIL: ", filename, line);
  43         vfprintf(stderr, fmt, ap);
  44         fprintf(stderr, "\n");
  45         va_end(ap);
  46         exit(1);
  47 }
  48
  49 /* Try or die. */
  50 #define try(expr, expect)                                               \
  51         do {                                                            \
  52                 int ret = (expr);                                       \
  53                 if (ret != (expect))                                    \
  54                         fail(filename[file], i+1,                       \
  55                              STRINGIFY(expr) "= %i", ret);              \
  56         } while (0)
  57
  58 /* Try or imitate results. */
  59 #define unreliable(expr, expect, force, undo)                           \
  60         do {                                                            \
  61                 int ret = expr;                                         \
  62                 if (ret != expect) {                                    \
  63                         fprintf(stderr, "%s:%u: %s gave %i not %i",     \
  64                                 filename[file], i+1, STRINGIFY(expr),   \
  65                                 ret, expect);                           \
  66                         if (expect == 0)                                \
  67                                 force;                                  \
  68                         else                                            \
  69                                 undo;                                   \
  70                 }                                                       \
  71         } while (0)
  72
  73 static bool key_eq(TDB_DATA a, TDB_DATA b)
  74 {
  75         if (a.dsize != b.dsize)
  76                 return false;
  77         return memcmp(a.dptr, b.dptr, a.dsize) == 0;
  78 }
  79
  80 /* This is based on the hash algorithm from gdbm */
  81 static unsigned int hash_key(TDB_DATA *key)
  82 {
  83         uint32_t value; /* Used to compute the hash value.  */
  84         uint32_t   i;   /* Used to cycle through random values. */
  85
  86         /* Set the initial value from the key size. */
  87         for (value = 0x238F13AF ^ key->dsize, i=0; i < key->dsize; i++)
  88                 value = (value + (key->dptr[i] << (i*5 % 24)));
  89
  90         return (1103515243 * value + 12345);
  91 }
  92
/* Kind of operation recorded for one trace line; run_ops() switches on
 * this value to decide which tdb call to replay. */
enum op_type {
        /* Whole-database locks */
        OP_TDB_LOCKALL,
        OP_TDB_LOCKALL_MARK,
        OP_TDB_LOCKALL_UNMARK,
        OP_TDB_LOCKALL_NONBLOCK,
        OP_TDB_UNLOCKALL,
        OP_TDB_LOCKALL_READ,
        OP_TDB_LOCKALL_READ_NONBLOCK,
        OP_TDB_UNLOCKALL_READ,
        /* Per-key chain locks */
        OP_TDB_CHAINLOCK,
        OP_TDB_CHAINLOCK_NONBLOCK,
        OP_TDB_CHAINLOCK_MARK,
        OP_TDB_CHAINLOCK_UNMARK,
        OP_TDB_CHAINUNLOCK,
        OP_TDB_CHAINLOCK_READ,
        OP_TDB_CHAINUNLOCK_READ,
        /* Record and database operations */
        OP_TDB_PARSE_RECORD,
        OP_TDB_EXISTS,
        OP_TDB_STORE,
        OP_TDB_APPEND,
        OP_TDB_GET_SEQNUM,
        OP_TDB_WIPE_ALL,
        /* Transactions */
        OP_TDB_TRANSACTION_START,
        OP_TDB_TRANSACTION_CANCEL,
        OP_TDB_TRANSACTION_COMMIT,
        /* Traversals: start and end markers, and one entry per callback */
        OP_TDB_TRAVERSE_READ_START,
        OP_TDB_TRAVERSE_START,
        OP_TDB_TRAVERSE_END,
        OP_TDB_TRAVERSE,
        /* Key iteration and record access */
        OP_TDB_FIRSTKEY,
        OP_TDB_NEXTKEY,
        OP_TDB_FETCH,
        OP_TDB_DELETE,
};
 127
/* One operation parsed from a line of a trace file. */
struct op {
        /* Serial number: the first word of the trace line. */
        unsigned int serial;
        /* Which call this line records. */
        enum op_type op;
        /* Key argument; tdb_null for operations which take none. */
        TDB_DATA key;
        /* Data argument, or the expected result for fetch/firstkey/nextkey. */
        TDB_DATA data;
        /* Return value recorded in the trace (add_op() defaults it to 0). */
        int ret;

        /* Who is waiting for us? */
        struct list_head post;
        /* What are we waiting for? */
        struct list_head pre;

        /* If I'm part of a group (traverse/transaction) where is
         * start?  (Otherwise, 0) */
        unsigned int group_start;

        /* Per-operation extras; which member is valid depends on op. */
        union {
                int flag; /* open and store */
                struct {  /* append */
                        /* Record contents before the append: shares
                         * post's buffer, with a shorter length. */
                        TDB_DATA pre;
                        /* Record contents after the append. */
                        TDB_DATA post;
                } append;
                unsigned int group_len; /* transaction/traverse start */
        };
};
 153
 154 static unsigned char hex_char(const char *filename, unsigned int line, char c)
 155 {
 156         c = toupper(c);
 157         if (c >= 'A' && c <= 'F')
 158                 return c - 'A' + 10;
 159         if (c >= '0' && c <= '9')
 160                 return c - '0';
 161         fail(filename, line, "invalid hex character '%c'", c);
 162 }
 163
 164 /* TDB data is <size>:<%02x>* */
 165 static TDB_DATA make_tdb_data(const void *ctx,
 166                               const char *filename, unsigned int line,
 167                               const char *word)
 168 {
 169         TDB_DATA data;
 170         unsigned int i;
 171         const char *p;
 172
 173         if (streq(word, "NULL"))
 174                 return tdb_null;
 175
 176         data.dsize = atoi(word);
 177         data.dptr = talloc_array(ctx, unsigned char, data.dsize);
 178         p = strchr(word, ':');
 179         if (!p)
 180                 fail(filename, line, "invalid tdb data '%s'", word);
 181         p++;
 182         for (i = 0; i < data.dsize; i++)
 183                 data.dptr[i] = hex_char(filename, line, p[i*2])*16
 184                         + hex_char(filename, line, p[i*2+1]);
 185
 186         return data;
 187 }
 188
 189 static void add_op(const char *filename, struct op **op, unsigned int i,
 190                    unsigned int serial, enum op_type type)
 191 {
 192         struct op *new;
 193         *op = talloc_realloc(NULL, *op, struct op, i+1);
 194         new = (*op) + i;
 195         new->op = type;
 196         new->serial = serial;
 197         new->ret = 0;
 198         new->group_start = 0;
 199 }
 200
 201 static void op_add_nothing(const char *filename,
 202                            struct op op[], unsigned int op_num, char *words[])
 203 {
 204         if (words[2])
 205                 fail(filename, op_num+1, "Expected no arguments");
 206         op[op_num].key = tdb_null;
 207 }
 208
 209 static void op_add_key(const char *filename,
 210                        struct op op[], unsigned int op_num, char *words[])
 211 {
 212         if (words[2] == NULL || words[3])
 213                 fail(filename, op_num+1, "Expected just a key");
 214
 215         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
 216         if (op[op_num].op != OP_TDB_TRAVERSE)
 217                 total_keys++;
 218 }
 219
 220 static void op_add_key_ret(const char *filename,
 221                            struct op op[], unsigned int op_num, char *words[])
 222 {
 223         if (!words[2] || !words[3] || !words[4] || words[5]
 224             || !streq(words[3], "="))
 225                 fail(filename, op_num+1, "Expected <key> = <ret>");
 226         op[op_num].ret = atoi(words[4]);
 227         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
 228         /* May only be a unique key if it fails */
 229         if (op[op_num].ret != 0)
 230                 total_keys++;
 231 }
 232
 233 static void op_add_key_data(const char *filename,
 234                             struct op op[], unsigned int op_num, char *words[])
 235 {
 236         if (!words[2] || !words[3] || !words[4] || words[5]
 237             || !streq(words[3], "="))
 238                 fail(filename, op_num+1, "Expected <key> = <data>");
 239         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
 240         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[4]);
 241         /* May only be a unique key if it fails */
 242         if (!op[op_num].data.dptr)
 243                 total_keys++;
 244 }
 245
 246 /* <serial> tdb_store <rec> <rec> <flag> = <ret> */
 247 static void op_add_store(const char *filename,
 248                          struct op op[], unsigned int op_num, char *words[])
 249 {
 250         if (!words[2] || !words[3] || !words[4] || !words[5] || !words[6]
 251             || words[7] || !streq(words[5], "="))
 252                 fail(filename, op_num+1, "Expect <key> <data> <flag> = <ret>");
 253
 254         op[op_num].flag = strtoul(words[4], NULL, 0);
 255         op[op_num].ret = atoi(words[6]);
 256         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
 257         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[3]);
 258         total_keys++;
 259 }
 260
 261 /* <serial> tdb_append <rec> <rec> = <rec> */
 262 static void op_add_append(const char *filename,
 263                           struct op op[], unsigned int op_num, char *words[])
 264 {
 265         if (!words[2] || !words[3] || !words[4] || !words[5] || words[6]
 266             || !streq(words[4], "="))
 267                 fail(filename, op_num+1, "Expect <key> <data> = <rec>");
 268
 269         op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
 270         op[op_num].data = make_tdb_data(op, filename, op_num+1, words[3]);
 271
 272         op[op_num].append.post
 273                 = make_tdb_data(op, filename, op_num+1, words[5]);
 274
 275         /* By subtraction, figure out what previous data was. */
 276         op[op_num].append.pre.dptr = op[op_num].append.post.dptr;
 277         op[op_num].append.pre.dsize
 278                 = op[op_num].append.post.dsize - op[op_num].data.dsize;
 279         total_keys++;
 280 }
 281
 282 /* <serial> tdb_get_seqnum = <ret> */
 283 static void op_add_seqnum(const char *filename,
 284                           struct op op[], unsigned int op_num, char *words[])
 285 {
 286         if (!words[2] || !words[3] || words[4] || !streq(words[2], "="))
 287                 fail(filename, op_num+1, "Expect = <ret>");
 288
 289         op[op_num].key = tdb_null;
 290         op[op_num].ret = atoi(words[3]);
 291 }
 292
 293 static void op_add_traverse(const char *filename,
 294                             struct op op[], unsigned int op_num, char *words[])
 295 {
 296         if (words[2])
 297                 fail(filename, op_num+1, "Expect no arguments");
 298
 299         op[op_num].key = tdb_null;
 300         op[op_num].group_len = 0;
 301 }
 302
 303 static void op_add_transaction(const char *filename, struct op op[],
 304                                unsigned int op_num, char *words[])
 305 {
 306         if (words[2])
 307                 fail(filename, op_num+1, "Expect no arguments");
 308
 309         op[op_num].key = tdb_null;
 310         op[op_num].group_len = 0;
 311 }
 312
 313 static int op_transaction_start(struct op op[], unsigned int op_num)
 314 {
 315         unsigned int i;
 316
 317         for (i = op_num-1; i > 0; i--) {
 318                 if (op[i].op == OP_TDB_TRANSACTION_START && !op[i].group_len)
 319                         return i;
 320         }
 321         return 0;
 322 }
 323
 324 static void op_analyze_transaction(const char *filename,
 325                                    struct op op[], unsigned int op_num,
 326                                    char *words[])
 327 {
 328         unsigned int start, i;
 329
 330         op[op_num].key = tdb_null;
 331
 332         if (words[2])
 333                 fail(filename, op_num+1, "Expect no arguments");
 334
 335         start = op_transaction_start(op, op_num);
 336         if (!start)
 337                 fail(filename, op_num+1, "no transaction start found");
 338
 339         op[start].group_len = op_num - start;
 340
 341         /* This rolls in nested transactions.  I think that's right. */
 342         for (i = start; i <= op_num; i++)
 343                 op[i].group_start = start;
 344 }
 345
/* A key paired with an index.
 * NOTE(review): no user is visible in this part of the file; presumably
 * a hash-table entry used when analyzing traversals. */
struct traverse_hash {
        TDB_DATA key;
        unsigned int index;
};
 350
 351 static void op_analyze_traverse(const char *filename,
 352                                 struct op op[], unsigned int op_num,
 353                                 char *words[])
 354 {
 355         int i, start;
 356
 357         op[op_num].key = tdb_null;
 358
 359         /* = %u means traverse function terminated. */
 360         if (words[2]) {
 361                 if (!streq(words[2], "=") || !words[3] || words[4])
 362                         fail(filename, op_num+1, "expect = <num>");
 363                 op[op_num].ret = atoi(words[3]);
 364         } else
 365                 op[op_num].ret = 0;
 366
 367         for (i = op_num-1; i >= 0; i--) {
 368                 if (op[i].op != OP_TDB_TRAVERSE_READ_START
 369                     && op[i].op != OP_TDB_TRAVERSE_START)
 370                         continue;
 371                 if (op[i].group_len)
 372                         continue;
 373                 break;
 374         }
 375
 376         if (i < 0)
 377                 fail(filename, op_num+1, "no traversal start found");
 378
 379         start = i;
 380         op[start].group_len = op_num - start;
 381
 382         for (i = start; i <= op_num; i++)
 383                 op[i].group_start = start;
 384 }
 385
 386 /* Keep -Wmissing-declarations happy: */
 387 const struct op_table *
 388 find_keyword (register const char *str, register unsigned int len);
 389
 390 #include "keywords.c"
 391
/* A dependency between two operations: the op at
 * (needs_file, needs_opnum) waits for the op at
 * (satisfies_file, satisfies_opnum). */
struct depend {
        /* We can have more than one */
        /* Links on the waiting op's "pre" list and the satisfying
         * op's "post" list respectively. */
        struct list_node pre_list;
        struct list_node post_list;
        /* The waiter: do_post() writes to this file's pipe. */
        unsigned int needs_file;
        unsigned int needs_opnum;
        /* The op being waited for (printed by dump_pre()). */
        unsigned int satisfies_file;
        unsigned int satisfies_opnum;
};
 401
 402 static void check_deps(const char *filename, struct op op[], unsigned int num)
 403 {
 404 #ifdef DEBUG_DEPS
 405         unsigned int i;
 406
 407         for (i = 1; i < num; i++)
 408                 if (!list_empty(&op[i].pre))
 409                         fail(filename, i+1, "Still has dependencies");
 410 #endif
 411 }
 412
 413 static void dump_pre(char *filename[], struct op *op[],
 414                      unsigned int file, unsigned int i)
 415 {
 416         struct depend *dep;
 417
 418         printf("%s:%u (%u) still waiting for:\n", filename[file], i+1,
 419                 op[file][i].serial);
 420         list_for_each(&op[file][i].pre, dep, pre_list)
 421                 printf("    %s:%u (%u)\n",
 422                        filename[dep->satisfies_file], dep->satisfies_opnum+1,
 423                        op[dep->satisfies_file][dep->satisfies_opnum].serial);
 424         check_deps(filename[file], op[file], i);
 425 }
 426
 427 /* We simply read/write pointers, since we all are children. */
 428 static bool do_pre(struct tdb_context *tdb,
 429                    char *filename[], struct op *op[],
 430                    unsigned int file, int pre_fd, unsigned int i,
 431                    bool backoff)
 432 {
 433         while (!list_empty(&op[file][i].pre)) {
 434                 struct depend *dep;
 435
 436 #if DEBUG_DEPS
 437                 printf("%s:%u:waiting for pre\n", filename[file], i+1);
 438                 fflush(stdout);
 439 #endif
 440                 if (backoff)
 441                         alarm(2);
 442                 else
 443                         alarm(10);
 444                 while (read(pre_fd, &dep, sizeof(dep)) != sizeof(dep)) {
 445                         if (errno == EINTR) {
 446                                 if (backoff) {
 447                                         warnx("%s:%u:avoiding deadlock",
 448                                               filename[file], i+1);
 449                                         return false;
 450                                 }
 451                                 dump_pre(filename, op, file, i);
 452                                 exit(1);
 453                         } else
 454                                 errx(1, "Reading from pipe");
 455                 }
 456                 alarm(0);
 457
 458 #if DEBUG_DEPS
 459                 printf("%s:%u:got pre %u from %s:%u\n", filename[file], i+1,
 460                        dep->needs_opnum+1, filename[dep->satisfies_file],
 461                        dep->satisfies_opnum+1);
 462                 fflush(stdout);
 463 #endif
 464                 /* This could be any op, not just this one. */
 465                 talloc_free(dep);
 466         }
 467         return true;
 468 }
 469
 470 static void do_post(char *filename[], struct op *op[],
 471                     unsigned int file, unsigned int i)
 472 {
 473         struct depend *dep;
 474
 475         list_for_each(&op[file][i].post, dep, post_list) {
 476 #if DEBUG_DEPS
 477                 printf("%s:%u:sending to file %s:%u\n", filename[file], i+1,
 478                        filename[dep->needs_file], dep->needs_opnum+1);
 479 #endif
 480                 if (write(pipes[dep->needs_file].fd[1], &dep, sizeof(dep))
 481                     != sizeof(dep))
 482                         err(1, "%s:%u failed to tell file %s",
 483                             filename[file], i+1, filename[dep->needs_file]);
 484         }
 485 }
 486
 487 static int get_len(TDB_DATA key, TDB_DATA data, void *private_data)
 488 {
 489         return data.dsize;
 490 }
 491
 492 static unsigned run_ops(struct tdb_context *tdb,
 493                         int pre_fd,
 494                         char *filename[],
 495                         struct op *op[],
 496                         unsigned int file,
 497                         unsigned int start, unsigned int stop,
 498                         bool backoff);
 499
/* State handed to nontrivial_traverse() through the traverse's private
 * pointer; set up by op_traverse(). */
struct traverse_info {
        /* Per-file op arrays and the matching trace file names. */
        struct op **op;
        char **filename;
        /* Which file this traverse belongs to. */
        unsigned file;
        /* Descriptor passed on to run_ops() for reading dependencies. */
        int pre_fd;
        /* Index of the traverse start op. */
        unsigned int start;
        /* Index of the next op to look at; starts at start + 1. */
        unsigned int i;
};
 508
 509 /* More complex.  Just do whatever's they did at the n'th entry. */
 510 static int nontrivial_traverse(struct tdb_context *tdb,
 511                                TDB_DATA key, TDB_DATA data,
 512                                void *_tinfo)
 513 {
 514         struct traverse_info *tinfo = _tinfo;
 515         unsigned int trav_len = tinfo->op[tinfo->file][tinfo->start].group_len;
 516         bool avoid_deadlock = false;
 517
 518         if (tinfo->i == tinfo->start + trav_len) {
 519                 /* This can happen if traverse expects to be empty. */
 520                 if (trav_len == 1)
 521                         return 1;
 522                 fail(tinfo->filename[tinfo->file], tinfo->start + 1,
 523                      "traverse did not terminate");
 524         }
 525
 526         if (tinfo->op[tinfo->file][tinfo->i].op != OP_TDB_TRAVERSE)
 527                 fail(tinfo->filename[tinfo->file], tinfo->start + 1,
 528                      "%s:%u:traverse terminated early");
 529
 530 #if TRAVERSALS_TAKE_TRANSACTION_LOCK
 531         avoid_deadlock = true;
 532 #endif
 533
 534         /* Run any normal ops. */
 535         tinfo->i = run_ops(tdb, tinfo->pre_fd, tinfo->filename, tinfo->op,
 536                            tinfo->file, tinfo->i+1, tinfo->start + trav_len,
 537                            avoid_deadlock);
 538
 539         /* We backed off, or we hit OP_TDB_TRAVERSE_END. */
 540         if (tinfo->op[tinfo->file][tinfo->i].op != OP_TDB_TRAVERSE)
 541                 return 1;
 542
 543         return 0;
 544 }
 545
 546 static unsigned op_traverse(struct tdb_context *tdb,
 547                             int pre_fd,
 548                             char *filename[],
 549                             unsigned int file,
 550                             int (*traversefn)(struct tdb_context *,
 551                                               tdb_traverse_func, void *),
 552                             struct op *op[],
 553                             unsigned int start)
 554 {
 555         struct traverse_info tinfo = { op, filename, file, pre_fd,
 556                                        start, start+1 };
 557
 558         traversefn(tdb, nontrivial_traverse, &tinfo);
 559
 560         /* Traversing in wrong order can have strange effects: eg. if
 561          * original traverse went A (delete A), B, we might do B
 562          * (delete A).  So if we have ops left over, we do it now. */
 563         while (tinfo.i != start + op[file][start].group_len) {
 564                 if (op[file][tinfo.i].op == OP_TDB_TRAVERSE)
 565                         tinfo.i++;
 566                 else
 567                         tinfo.i = run_ops(tdb, pre_fd, filename, op, file,
 568                                           tinfo.i,
 569                                           start + op[file][start].group_len,
 570                                           false);
 571         }
 572
 573         return tinfo.i;
 574 }
 575
 576 static void break_out(int sig)
 577 {
 578 }
 579
 580 static __attribute__((noinline))
 581 unsigned run_ops(struct tdb_context *tdb,
 582                  int pre_fd,
 583                  char *filename[],
 584                  struct op *op[],
 585                  unsigned int file,
 586                  unsigned int start, unsigned int stop,
 587                  bool backoff)
 588 {
 589         unsigned int i;
 590         struct sigaction sa;
 591
 592         sa.sa_handler = break_out;
 593         sa.sa_flags = 0;
 594
 595         sigaction(SIGALRM, &sa, NULL);
 596         for (i = start; i < stop; i++) {
 597                 if (!do_pre(tdb, filename, op, file, pre_fd, i, backoff))
 598                         return i;
 599
 600                 switch (op[file][i].op) {
 601                 case OP_TDB_LOCKALL:
 602                         try(tdb_lockall(tdb), op[file][i].ret);
 603                         break;
 604                 case OP_TDB_LOCKALL_MARK:
 605                         try(tdb_lockall_mark(tdb), op[file][i].ret);
 606                         break;
 607                 case OP_TDB_LOCKALL_UNMARK:
 608                         try(tdb_lockall_unmark(tdb), op[file][i].ret);
 609                         break;
 610                 case OP_TDB_LOCKALL_NONBLOCK:
 611                         unreliable(tdb_lockall_nonblock(tdb), op[file][i].ret,
 612                                    tdb_lockall(tdb), tdb_unlockall(tdb));
 613                         break;
 614                 case OP_TDB_UNLOCKALL:
 615                         try(tdb_unlockall(tdb), op[file][i].ret);
 616                         break;
 617                 case OP_TDB_LOCKALL_READ:
 618                         try(tdb_lockall_read(tdb), op[file][i].ret);
 619                         break;
 620                 case OP_TDB_LOCKALL_READ_NONBLOCK:
 621                         unreliable(tdb_lockall_read_nonblock(tdb),
 622                                    op[file][i].ret,
 623                                    tdb_lockall_read(tdb),
 624                                    tdb_unlockall_read(tdb));
 625                         break;
 626                 case OP_TDB_UNLOCKALL_READ:
 627                         try(tdb_unlockall_read(tdb), op[file][i].ret);
 628                         break;
 629                 case OP_TDB_CHAINLOCK:
 630                         try(tdb_chainlock(tdb, op[file][i].key),
 631                             op[file][i].ret);
 632                         break;
 633                 case OP_TDB_CHAINLOCK_NONBLOCK:
 634                         unreliable(tdb_chainlock_nonblock(tdb, op[file][i].key),
 635                                    op[file][i].ret,
 636                                    tdb_chainlock(tdb, op[file][i].key),
 637                                    tdb_chainunlock(tdb, op[file][i].key));
 638                         break;
 639                 case OP_TDB_CHAINLOCK_MARK:
 640                         try(tdb_chainlock_mark(tdb, op[file][i].key),
 641                             op[file][i].ret);
 642                         break;
 643                 case OP_TDB_CHAINLOCK_UNMARK:
 644                         try(tdb_chainlock_unmark(tdb, op[file][i].key),
 645                             op[file][i].ret);
 646                         break;
 647                 case OP_TDB_CHAINUNLOCK:
 648                         try(tdb_chainunlock(tdb, op[file][i].key),
 649                             op[file][i].ret);
 650                         break;
 651                 case OP_TDB_CHAINLOCK_READ:
 652                         try(tdb_chainlock_read(tdb, op[file][i].key),
 653                             op[file][i].ret);
 654                         break;
 655                 case OP_TDB_CHAINUNLOCK_READ:
 656                         try(tdb_chainunlock_read(tdb, op[file][i].key),
 657                             op[file][i].ret);
 658                         break;
 659                 case OP_TDB_PARSE_RECORD:
 660                         try(tdb_parse_record(tdb, op[file][i].key, get_len,
 661                                              NULL),
 662                             op[file][i].ret);
 663                         break;
 664                 case OP_TDB_EXISTS:
 665                         try(tdb_exists(tdb, op[file][i].key), op[file][i].ret);
 666                         break;
 667                 case OP_TDB_STORE:
 668                         try(tdb_store(tdb, op[file][i].key, op[file][i].data,
 669                                       op[file][i].flag),
 670                             op[file][i].ret);
 671                         break;
 672                 case OP_TDB_APPEND:
 673                         try(tdb_append(tdb, op[file][i].key, op[file][i].data),
 674                             op[file][i].ret);
 675                         break;
 676                 case OP_TDB_GET_SEQNUM:
 677                         try(tdb_get_seqnum(tdb), op[file][i].ret);
 678                         break;
 679                 case OP_TDB_WIPE_ALL:
 680                         try(tdb_wipe_all(tdb), op[file][i].ret);
 681                         break;
 682                 case OP_TDB_TRANSACTION_START:
 683                         try(tdb_transaction_start(tdb), op[file][i].ret);
 684                         break;
 685                 case OP_TDB_TRANSACTION_CANCEL:
 686                         try(tdb_transaction_cancel(tdb), op[file][i].ret);
 687                         break;
 688                 case OP_TDB_TRANSACTION_COMMIT:
 689                         try(tdb_transaction_commit(tdb), op[file][i].ret);
 690                         break;
 691                 case OP_TDB_TRAVERSE_READ_START:
 692                         i = op_traverse(tdb, pre_fd, filename, file,
 693                                         tdb_traverse_read, op, i);
 694                         break;
 695                 case OP_TDB_TRAVERSE_START:
 696                         i = op_traverse(tdb, pre_fd, filename, file,
 697                                         tdb_traverse, op, i);
 698                         break;
 699                 case OP_TDB_TRAVERSE:
 700                         /* Terminate: we're in a traverse, and we've
 701                          * done our ops. */
 702                         return i;
 703                 case OP_TDB_TRAVERSE_END:
 704                         fail(filename[file], i+1, "unexpected end traverse");
 705                 /* FIXME: These must be treated like traverse. */
 706                 case OP_TDB_FIRSTKEY:
 707                         if (!key_eq(tdb_firstkey(tdb), op[file][i].data))
 708                                 fail(filename[file], i+1, "bad firstkey");
 709                         break;
 710                 case OP_TDB_NEXTKEY:
 711                         if (!key_eq(tdb_nextkey(tdb, op[file][i].key),
 712                                     op[file][i].data))
 713                                 fail(filename[file], i+1, "bad nextkey");
 714                         break;
 715                 case OP_TDB_FETCH: {
 716                         TDB_DATA f = tdb_fetch(tdb, op[file][i].key);
 717                         if (!key_eq(f, op[file][i].data))
 718                                 fail(filename[file], i+1, "bad fetch %u",
 719                                      f.dsize);
 720                         break;
 721                 }
 722                 case OP_TDB_DELETE:
 723                         try(tdb_delete(tdb, op[file][i].key), op[file][i].ret);
 724                         break;
 725                 }
 726                 do_post(filename, op, file, i);
 727         }
 728         return i;
 729 }
 730
 731 /* tdbtorture, in particular, can do a tdb_close with a transaction in
 732  * progress. */
 733 static struct op *maybe_cancel_transaction(const char *filename,
 734                                            struct op *op, unsigned int *num)
 735 {
 736         unsigned int start = op_transaction_start(op, *num);
 737
 738         if (start) {
 739                 char *words[] = { "<unknown>", "tdb_close", NULL };
 740                 add_op(filename, &op, *num, op[start].serial,
 741                        OP_TDB_TRANSACTION_CANCEL);
 742                 op_analyze_transaction(filename, op, *num, words);
 743                 (*num)++;
 744         }
 745         return op;
 746 }
 747
 748 static struct op *load_tracefile(const char *filename, unsigned int *num,
 749                                  unsigned int *hashsize,
 750                                  unsigned int *tdb_flags,
 751                                  unsigned int *open_flags)
 752 {
 753         unsigned int i;
 754         struct op *op = talloc_array(NULL, struct op, 1);
 755         char **words;
 756         char **lines;
 757         char *file;
 758
 759         file = grab_file(NULL, filename, NULL);
 760         if (!file)
 761                 err(1, "Reading %s", filename);
 762
 763         lines = strsplit(file, file, "\n", NULL);
 764         if (!lines[0])
 765                 errx(1, "%s is empty", filename);
 766
 767         words = strsplit(lines, lines[0], " ", NULL);
 768         if (!streq(words[1], "tdb_open"))
 769                 fail(filename, 1, "does not start with tdb_open");
 770
 771         *hashsize = atoi(words[2]);
 772         *tdb_flags = strtoul(words[3], NULL, 0);
 773         *open_flags = strtoul(words[4], NULL, 0);
 774
 775         for (i = 1; lines[i]; i++) {
 776                 const struct op_table *opt;
 777
 778                 words = strsplit(lines, lines[i], " ", NULL);
 779                 if (!words[0] || !words[1])
 780                         fail(filename, i+1, "Expected serial number and op");
 781
 782                 opt = find_keyword(words[1], strlen(words[1]));
 783                 if (!opt) {
 784                         if (streq(words[1], "tdb_close")) {
 785                                 if (lines[i+1])
 786                                         fail(filename, i+2,
 787                                              "lines after tdb_close");
 788                                 *num = i;
 789                                 talloc_free(lines);
 790                                 return maybe_cancel_transaction(filename,
 791                                                                 op, num);
 792                         }
 793                         fail(filename, i+1, "Unknown operation '%s'", words[1]);
 794                 }
 795
 796                 add_op(filename, &op, i, atoi(words[0]), opt->type);
 797                 opt->enhance_op(filename, op, i, words);
 798         }
 799
 800         fprintf(stderr, "%s:%u:last operation is not tdb_close: incomplete?",
 801               filename, i);
 802         talloc_free(lines);
 803         *num = i - 1;
 804         return maybe_cancel_transaction(filename, op, num);
 805 }
 806
 807 /* We remember all the keys we've ever seen, and who has them. */
 808 struct key_user {
 809         unsigned int file;
 810         unsigned int op_num;
 811 };
 812
 813 struct keyinfo {
 814         TDB_DATA key;
 815         unsigned int num_users;
 816         struct key_user *user;
 817 };
 818
 819 static const TDB_DATA must_not_exist;
 820 static const TDB_DATA must_exist;
 821 static const TDB_DATA not_exists_or_empty;
 822
 823 /* NULL means doesn't care if it exists or not, &must_exist means
 824  * it must exist but we don't care what, &must_not_exist means it must
 825  * not exist, otherwise the data it needs. */
 826 static const TDB_DATA *needs(const struct op *op)
 827 {
 828         switch (op->op) {
 829         /* FIXME: Pull forward deps, since we can deadlock */
 830         case OP_TDB_CHAINLOCK:
 831         case OP_TDB_CHAINLOCK_NONBLOCK:
 832         case OP_TDB_CHAINLOCK_MARK:
 833         case OP_TDB_CHAINLOCK_UNMARK:
 834         case OP_TDB_CHAINUNLOCK:
 835         case OP_TDB_CHAINLOCK_READ:
 836         case OP_TDB_CHAINUNLOCK_READ:
 837                 return NULL;
 838
 839         case OP_TDB_APPEND:
 840                 if (op->append.pre.dsize == 0)
 841                         return &not_exists_or_empty;
 842                 return &op->append.pre;
 843
 844         case OP_TDB_STORE:
 845                 if (op->flag == TDB_INSERT) {
 846                         if (op->ret < 0)
 847                                 return &must_exist;
 848                         else
 849                                 return &must_not_exist;
 850                 } else if (op->flag == TDB_MODIFY) {
 851                         if (op->ret < 0)
 852                                 return &must_not_exist;
 853                         else
 854                                 return &must_exist;
 855                 }
 856                 /* No flags?  Don't care */
 857                 return NULL;
 858
 859         case OP_TDB_EXISTS:
 860                 if (op->ret == 1)
 861                         return &must_exist;
 862                 else
 863                         return &must_not_exist;
 864
 865         case OP_TDB_PARSE_RECORD:
 866                 if (op->ret < 0)
 867                         return &must_not_exist;
 868                 return &must_exist;
 869
 870         /* FIXME: handle these. */
 871         case OP_TDB_WIPE_ALL:
 872         case OP_TDB_FIRSTKEY:
 873         case OP_TDB_NEXTKEY:
 874         case OP_TDB_GET_SEQNUM:
 875         case OP_TDB_TRAVERSE:
 876         case OP_TDB_TRANSACTION_COMMIT:
 877         case OP_TDB_TRANSACTION_CANCEL:
 878         case OP_TDB_TRANSACTION_START:
 879                 return NULL;
 880
 881         case OP_TDB_FETCH:
 882                 if (!op->data.dptr)
 883                         return &must_not_exist;
 884                 return &op->data;
 885
 886         case OP_TDB_DELETE:
 887                 if (op->ret < 0)
 888                         return &must_not_exist;
 889                 return &must_exist;
 890
 891         default:
 892                 errx(1, "Unexpected op %i", op->op);
 893         }
 894
 895 }
 896
 897 static bool is_transaction(const struct op *op)
 898 {
 899         return op->op == OP_TDB_TRANSACTION_START;
 900 }
 901
 902 /* What's the data after this op?  pre if nothing changed. */
 903 static const TDB_DATA *gives(const TDB_DATA *key, const TDB_DATA *pre,
 904                              const struct op *op)
 905 {
 906         if (is_transaction(op)) {
 907                 unsigned int i;
 908
 909                 /* Cancelled transactions don't change anything. */
 910                 if (op[op->group_len].op == OP_TDB_TRANSACTION_CANCEL)
 911                         return pre;
 912                 assert(op[op->group_len].op == OP_TDB_TRANSACTION_COMMIT);
 913
 914                 for (i = 1; i < op->group_len; i++) {
 915                         /* This skips nested transactions, too */
 916                         if (op[i].op != OP_TDB_TRAVERSE
 917                             && key_eq(op[i].key, *key))
 918                                 pre = gives(key, pre, &op[i]);
 919                 }
 920                 return pre;
 921         }
 922
 923         /* Failed ops don't change state of db. */
 924         if (op->ret < 0)
 925                 return pre;
 926
 927         if (op->op == OP_TDB_DELETE || op->op == OP_TDB_WIPE_ALL)
 928                 return &tdb_null;
 929
 930         if (op->op == OP_TDB_APPEND)
 931                 return &op->append.post;
 932
 933         if (op->op == OP_TDB_STORE)
 934                 return &op->data;
 935
 936         return pre;
 937 }
 938
 939 static bool in_transaction(const struct op op[], unsigned int i)
 940 {
 941         return op[i].group_start && is_transaction(&op[op[i].group_start]);
 942 }
 943
 944 static bool in_traverse(const struct op op[], unsigned int i)
 945 {
 946         return op[i].group_start && !is_transaction(&op[op[i].group_start]);
 947 }
 948
 949 static struct keyinfo *hash_ops(struct op *op[], unsigned int num_ops[],
 950                                 unsigned int num)
 951 {
 952         unsigned int i, j, h;
 953         struct keyinfo *hash;
 954
 955         hash = talloc_zero_array(op[0], struct keyinfo, total_keys*2);
 956         for (i = 0; i < num; i++) {
 957                 for (j = 1; j < num_ops[i]; j++) {
 958                         /* We can't do this on allocation, due to realloc. */
 959                         list_head_init(&op[i][j].post);
 960                         list_head_init(&op[i][j].pre);
 961
 962                         if (!op[i][j].key.dptr)
 963                                 continue;
 964
 965                         /* We don't wait for traverse keys */
 966                         /* FIXME: We should, for trivial traversals. */
 967                         if (op[i][j].op == OP_TDB_TRAVERSE)
 968                                 continue;
 969
 970                         h = hash_key(&op[i][j].key) % (total_keys * 2);
 971                         while (!key_eq(hash[h].key, op[i][j].key)) {
 972                                 if (!hash[h].key.dptr) {
 973                                         hash[h].key = op[i][j].key;
 974                                         break;
 975                                 }
 976                                 h = (h + 1) % (total_keys * 2);
 977                         }
 978                         /* Might as well save some memory if we can. */
 979                         if (op[i][j].key.dptr != hash[h].key.dptr) {
 980                                 talloc_free(op[i][j].key.dptr);
 981                                 op[i][j].key.dptr = hash[h].key.dptr;
 982                         }
 983                         hash[h].user = talloc_realloc(hash, hash[h].user,
 984                                                      struct key_user,
 985                                                      hash[h].num_users+1);
 986
 987                         /* If it's in a transaction, it's the transaction which
 988                          * matters from an analysis POV. */
 989                         if (in_transaction(op[i], j)) {
 990                                 unsigned start = op[i][j].group_start;
 991
 992                                 /* Don't include twice. */
 993                                 if (hash[h].num_users
 994                                     && hash[h].user[hash[h].num_users-1].file
 995                                         == i
 996                                     && hash[h].user[hash[h].num_users-1].op_num
 997                                         == start)
 998                                         continue;
 999
1000                                 hash[h].user[hash[h].num_users].op_num = start;
1001                         } else
1002                                 hash[h].user[hash[h].num_users].op_num = j;
1003                         hash[h].user[hash[h].num_users].file = i;
1004                         hash[h].num_users++;
1005                 }
1006         }
1007
1008         return hash;
1009 }
1010
1011 static bool satisfies(const TDB_DATA *key, const TDB_DATA *data,
1012                       const struct op *op)
1013 {
1014         const TDB_DATA *need = NULL;
1015
1016         if (is_transaction(op)) {
1017                 unsigned int i;
1018
1019                 /* Look through for an op in this transaction which
1020                  * needs this key. */
1021                 for (i = 1; i < op->group_len; i++) {
1022                         if (op[i].op != OP_TDB_TRAVERSE
1023                             && key_eq(op[i].key, *key)) {
1024                                 need = needs(&op[i]);
1025                                 /* tdb_exists() is special: there might be
1026                                  * something in the transaction with more
1027                                  * specific requirements.  Other ops don't have
1028                                  * specific requirements (eg. store or delete),
1029                                  * but they change the value so we can't get
1030                                  * more information from future ops. */
1031                                 if (op[i].op != OP_TDB_EXISTS)
1032                                         break;
1033                         }
1034                 }
1035         } else
1036                 need = needs(op);
1037
1038         /* Don't need anything?  Cool. */
1039         if (!need)
1040                 return true;
1041
1042         /* This should be tdb_null or a real value. */
1043         assert(data != &must_exist);
1044         assert(data != &must_not_exist);
1045         assert(data != &not_exists_or_empty);
1046
1047         /* Must not exist?  data must not exist. */
1048         if (need == &must_not_exist)
1049                 return data == &tdb_null;
1050
1051         /* Must exist? */
1052         if (need == &must_exist)
1053                 return data != &tdb_null;
1054
1055         /* Either noexist or empty. */
1056         if (need == &not_exists_or_empty)
1057                 return data->dsize == 0;
1058
1059         /* Needs something specific. */
1060         return key_eq(*data, *need);
1061 }
1062
1063 static void move_to_front(struct key_user res[], unsigned off, unsigned elem)
1064 {
1065         if (elem != off) {
1066                 struct key_user tmp = res[elem];
1067                 memmove(res + off + 1, res + off, (elem - off)*sizeof(res[0]));
1068                 res[off] = tmp;
1069         }
1070 }
1071
1072 static void restore_to_pos(struct key_user res[], unsigned off, unsigned elem)
1073 {
1074         if (elem != off) {
1075                 struct key_user tmp = res[off];
1076                 memmove(res + off, res + off + 1, (elem - off)*sizeof(res[0]));
1077                 res[elem] = tmp;
1078         }
1079 }
1080
1081 static bool sort_deps(char *filename[], struct op *op[],
1082                       struct key_user res[],
1083                       unsigned off, unsigned num,
1084                       const TDB_DATA *key, const TDB_DATA *data,
1085                       unsigned num_files, unsigned fuzz)
1086 {
1087         unsigned int i, files_done;
1088         struct op *this_op;
1089         bool done[num_files];
1090
1091         /* Does this make serial numbers go backwards?  Allow a little fuzz. */
1092         if (off > 0) {
1093                 int serial1 = op[res[off-1].file][res[off-1].op_num].serial;
1094                 int serial2 = op[res[off].file][res[off].op_num].serial;
1095
1096                 if (serial1 - serial2 > (int)fuzz) {
1097 #if DEBUG_DEPS
1098                         printf("Serial jump too far (%u -> %u)\n",
1099                                serial1, serial2);
1100 #endif
1101                         return false;
1102                 }
1103         }
1104
1105         /* One or none left?  We're sorted. */
1106         if (off + 1 >= num)
1107                 return true;
1108
1109         memset(done, 0, sizeof(done));
1110
1111         /* Since ops within a trace file are ordered, we just need to figure
1112          * out which file to try next.  Since we don't take into account
1113          * inter-key relationships (which exist by virtue of trace file order),
1114          * we minimize the chance of harm by trying to keep in serial order. */
1115         for (files_done = 0, i = off; i < num && files_done < num_files; i++) {
1116                 if (done[res[i].file])
1117                         continue;
1118
1119                 this_op = &op[res[i].file][res[i].op_num];
1120
1121                 /* Is what we have good enough for this op? */
1122                 if (satisfies(key, data, this_op)) {
1123                         move_to_front(res, off, i);
1124                         if (sort_deps(filename, op, res, off+1, num,
1125                                       key, gives(key, data, this_op),
1126                                       num_files, fuzz))
1127                                 return true;
1128                         restore_to_pos(res, off, i);
1129                 }
1130                 done[res[i].file] = true;
1131                 files_done++;
1132         }
1133
1134         /* No combination worked. */
1135         return false;
1136 }
1137
1138 static void check_dep_sorting(struct key_user user[], unsigned num_users,
1139                               unsigned num_files)
1140 {
1141 #if DEBUG_DEPS
1142         unsigned int i;
1143         unsigned minima[num_files];
1144
1145         memset(minima, 0, sizeof(minima));
1146         for (i = 0; i < num_users; i++) {
1147                 assert(minima[user[i].file] < user[i].op_num);
1148                 minima[user[i].file] = user[i].op_num;
1149         }
1150 #endif
1151 }
1152
1153 /* All these ops happen on the same key.  Which comes first?
1154  *
1155  * This can happen both because read ops or failed write ops don't
1156  * change serial number, and also due to race since we access the
1157  * number unlocked (the race can cause less detectable ordering problems,
1158  * in which case we'll deadlock and report: fix manually in that case).
1159  */
1160 static void figure_deps(char *filename[], struct op *op[],
1161                         const TDB_DATA *key, struct key_user user[],
1162                         unsigned num_users, unsigned num_files)
1163 {
1164         /* We assume database starts empty. */
1165         const struct TDB_DATA *data = &tdb_null;
1166         unsigned int fuzz;
1167
1168         /* We prefer to keep strict serial order if possible: it's the
1169          * most likely.  We get more lax if that fails. */
1170         for (fuzz = 0; fuzz < 100; fuzz = (fuzz + 1)*2) {
1171                 if (sort_deps(filename, op, user, 0, num_users, key, data,
1172                               num_files, fuzz))
1173                         break;
1174         }
1175
1176         if (fuzz >= 100)
1177                 fail(filename[user[0].file], user[0].op_num+1,
1178                      "Could not resolve inter-dependencies");
1179
1180         check_dep_sorting(user, num_users, num_files);
1181 }
1182
1183 static void sort_ops(struct keyinfo hash[], char *filename[], struct op *op[],
1184                      unsigned int num)
1185 {
1186         unsigned int h;
1187
1188         /* Gcc nexted function extension.  How cool is this? */
1189         int compare_serial(const void *_a, const void *_b)
1190         {
1191                 const struct key_user *a = _a, *b = _b;
1192
1193                 /* First, maintain order within any trace file. */
1194                 if (a->file == b->file)
1195                         return a->op_num - b->op_num;
1196
1197                 /* Otherwise, arrange by serial order. */
1198                 return op[a->file][a->op_num].serial
1199                         - op[b->file][b->op_num].serial;
1200         }
1201
1202         /* Now sort into serial order. */
1203         for (h = 0; h < total_keys * 2; h++) {
1204                 struct key_user *user = hash[h].user;
1205
1206                 qsort(user, hash[h].num_users, sizeof(user[0]), compare_serial);
1207                 figure_deps(filename, op, &hash[h].key, user, hash[h].num_users,
1208                             num);
1209         }
1210 }
1211
1212 static int destroy_depend(struct depend *dep)
1213 {
1214         list_del(&dep->pre_list);
1215         list_del(&dep->post_list);
1216         return 0;
1217 }
1218
1219 static void add_dependency(void *ctx,
1220                            struct op *op[],
1221                            char *filename[],
1222                            unsigned int needs_file,
1223                            unsigned int needs_opnum,
1224                            unsigned int satisfies_file,
1225                            unsigned int satisfies_opnum)
1226 {
1227         struct depend *dep;
1228
1229         /* We don't depend on ourselves. */
1230         if (needs_file == satisfies_file) {
1231                 assert(satisfies_opnum < needs_opnum);
1232                 return;
1233         }
1234
1235 #if DEBUG_DEPS
1236         printf("%s:%u: depends on %s:%u\n",
1237                filename[needs_file], needs_opnum+1,
1238                filename[satisfies_file], satisfies_opnum+1);
1239 #endif
1240
1241 #if TRAVERSALS_TAKE_TRANSACTION_LOCK
1242         /* If something in a traverse depends on something in another
1243          * traverse/transaction, it creates a dependency between the
1244          * two groups. */
1245         if ((in_traverse(op[satisfies_file], satisfies_opnum)
1246              && op[needs_file][needs_opnum].group_start)
1247             || (in_traverse(op[needs_file], needs_opnum)
1248                 && op[satisfies_file][satisfies_opnum].group_start)) {
1249                 unsigned int sat;
1250
1251                 /* We are satisfied by end of group. */
1252                 sat = op[satisfies_file][satisfies_opnum].group_start;
1253                 satisfies_opnum = sat + op[satisfies_file][sat].group_len;
1254                 /* And we need that done by start of our group. */
1255                 needs_opnum = op[needs_file][needs_opnum].group_start;
1256         }
1257
1258         /* There is also this case:
1259          *  <traverse> <read foo> ...
1260          *  <transaction> ... </transaction> <create foo>
1261          * Where if we start the traverse then wait, we could block
1262          * the transaction and deadlock.
1263          *
1264          * We try to address this by ensuring that where seqnum indicates it's
1265          * possible, we wait for <create foo> before *starting* traverse.
1266          */
1267         else if (in_traverse(op[needs_file], needs_opnum)) {
1268                 struct op *need = &op[needs_file][needs_opnum];
1269                 if (op[needs_file][need->group_start].serial >
1270                     op[satisfies_file][satisfies_opnum].serial) {
1271                         needs_opnum = need->group_start;
1272                 }
1273         }
1274 #endif
1275
1276         /* If you depend on a transaction, you actually depend on it ending. */
1277         if (is_transaction(&op[satisfies_file][satisfies_opnum])) {
1278                 satisfies_opnum
1279                         += op[satisfies_file][satisfies_opnum].group_len;
1280 #if DEBUG_DEPS
1281                 printf("-> Actually end of transaction %s:%u\n",
1282                        filename[satisfies_file], satisfies_opnum+1);
1283 #endif
1284         } else
1285                 /* We should never create a dependency from middle of
1286                  * a transaction. */
1287                 assert(!in_transaction(op[satisfies_file], satisfies_opnum)
1288                        || op[satisfies_file][satisfies_opnum].op
1289                        == OP_TDB_TRANSACTION_COMMIT
1290                        || op[satisfies_file][satisfies_opnum].op
1291                        == OP_TDB_TRANSACTION_CANCEL);
1292
1293         assert(op[needs_file][needs_opnum].op != OP_TDB_TRAVERSE);
1294         assert(op[satisfies_file][satisfies_opnum].op != OP_TDB_TRAVERSE);
1295
1296         dep = talloc(ctx, struct depend);
1297         dep->needs_file = needs_file;
1298         dep->needs_opnum = needs_opnum;
1299         dep->satisfies_file = satisfies_file;
1300         dep->satisfies_opnum = satisfies_opnum;
1301         list_add(&op[satisfies_file][satisfies_opnum].post, &dep->post_list);
1302         list_add(&op[needs_file][needs_opnum].pre, &dep->pre_list);
1303         talloc_set_destructor(dep, destroy_depend);
1304 }
1305
1306 static bool changes_db(const TDB_DATA *key, const struct op *op)
1307 {
1308         return gives(key, NULL, op) != NULL;
1309 }
1310
1311 static void depend_on_previous(struct op *op[],
1312                                char *filename[],
1313                                unsigned int num,
1314                                struct key_user user[],
1315                                unsigned int i,
1316                                int prev)
1317 {
1318         bool deps[num];
1319         int j;
1320
1321         if (i == 0)
1322                 return;
1323
1324         if (prev == i - 1) {
1325                 /* Just depend on previous. */
1326                 add_dependency(NULL, op, filename,
1327                                user[i].file, user[i].op_num,
1328                                user[prev].file, user[prev].op_num);
1329                 return;
1330         }
1331
1332         /* We have to wait for the readers.  Find last one in *each* file. */
1333         memset(deps, 0, sizeof(deps));
1334         deps[user[i].file] = true;
1335         for (j = i - 1; j > prev; j--) {
1336                 if (!deps[user[j].file]) {
1337                         add_dependency(NULL, op, filename,
1338                                        user[i].file, user[i].op_num,
1339                                        user[j].file, user[j].op_num);
1340                         deps[user[j].file] = true;
1341                 }
1342         }
1343 }
1344
1345 /* This is simple, but not complete.  We don't take into account
1346  * indirect dependencies. */
1347 static void optimize_dependencies(struct op *op[], unsigned int num_ops[],
1348                                   unsigned int num)
1349 {
1350         unsigned int i, j;
1351
1352         /* There can only be one real dependency on each file */
1353         for (i = 0; i < num; i++) {
1354                 for (j = 1; j < num_ops[i]; j++) {
1355                         struct depend *dep, *next;
1356                         struct depend *prev[num];
1357
1358                         memset(prev, 0, sizeof(prev));
1359
1360                         list_for_each_safe(&op[i][j].pre, dep, next, pre_list) {
1361                                 if (!prev[dep->satisfies_file]) {
1362                                         prev[dep->satisfies_file] = dep;
1363                                         continue;
1364                                 }
1365                                 if (prev[dep->satisfies_file]->satisfies_opnum
1366                                     < dep->satisfies_opnum) {
1367                                         talloc_free(prev[dep->satisfies_file]);
1368                                         prev[dep->satisfies_file] = dep;
1369                                 } else
1370                                         talloc_free(dep);
1371                         }
1372                 }
1373         }
1374
1375         for (i = 0; i < num; i++) {
1376                 int deps[num];
1377
1378                 for (j = 0; j < num; j++)
1379                         deps[j] = -1;
1380
1381                 for (j = 1; j < num_ops[i]; j++) {
1382                         struct depend *dep, *next;
1383
1384                         list_for_each_safe(&op[i][j].pre, dep, next, pre_list) {
1385                                 if (deps[dep->satisfies_file]
1386                                     >= (int)dep->satisfies_opnum)
1387                                         talloc_free(dep);
1388                                 else
1389                                         deps[dep->satisfies_file]
1390                                                 = dep->satisfies_opnum;
1391                         }
1392                 }
1393         }
1394 }
1395
1396 static void derive_dependencies(char *filename[],
1397                                 struct op *op[], unsigned int num_ops[],
1398                                 unsigned int num)
1399 {
1400         struct keyinfo *hash;
1401         unsigned int h, i;
1402
1403         /* Create hash table for faster key lookup. */
1404         hash = hash_ops(op, num_ops, num);
1405
1406         /* Sort them by serial number. */
1407         sort_ops(hash, filename, op, num);
1408
1409         /* Create dependencies back to the last change, rather than
1410          * creating false dependencies by naively making each one
1411          * depend on the previous.  This has two purposes: it makes
1412          * later optimization simpler, and it also avoids deadlock with
1413          * same sequence number ops inside traversals (if one
1414          * traversal doesn't write anything, two ops can have the same
1415          * sequence number yet we can create a traversal dependency
1416          * the other way). */
1417         for (h = 0; h < total_keys * 2; h++) {
1418                 int prev = -1;
1419
1420                 if (hash[h].num_users < 2)
1421                         continue;
1422
1423                 for (i = 0; i < hash[h].num_users; i++) {
1424                         if (changes_db(&hash[h].key, &op[hash[h].user[i].file]
1425                                        [hash[h].user[i].op_num])) {
1426                                 depend_on_previous(op, filename, num,
1427                                                    hash[h].user, i, prev);
1428                                 prev = i;
1429                         } else if (prev >= 0)
1430                                 add_dependency(hash, op, filename,
1431                                                hash[h].user[i].file,
1432                                                hash[h].user[i].op_num,
1433                                                hash[h].user[prev].file,
1434                                                hash[h].user[prev].op_num);
1435                 }
1436         }
1437
1438         optimize_dependencies(op, num_ops, num);
1439 }
1440
1441 int main(int argc, char *argv[])
1442 {
1443         struct timeval start, end;
1444         unsigned int i, num_ops[argc], hashsize[argc], tdb_flags[argc], open_flags[argc];
1445         struct op *op[argc];
1446         int fds[2];
1447         char c;
1448         bool ok = true;
1449
1450         if (argc < 3)
1451                 errx(1, "Usage: %s <tdbfile> <tracefile>...", argv[0]);
1452
1453         pipes = talloc_array(NULL, struct pipe, argc - 2);
1454         for (i = 0; i < argc - 2; i++) {
1455                 printf("Loading tracefile %s...", argv[2+i]);
1456                 fflush(stdout);
1457                 op[i] = load_tracefile(argv[2+i], &num_ops[i], &hashsize[i],
1458                                        &tdb_flags[i], &open_flags[i]);
1459                 if (pipe(pipes[i].fd) != 0)
1460                         err(1, "creating pipe");
1461                 printf("done\n");
1462         }
1463
1464         printf("Calculating inter-dependencies...");
1465         fflush(stdout);
1466         derive_dependencies(argv+2, op, num_ops, i);
1467         printf("done\n");
1468
1469         /* Don't fork for single arg case: simple debugging. */
1470         if (argc == 3) {
1471                 struct tdb_context *tdb;
1472                 tdb = tdb_open_ex(argv[1], hashsize[0], tdb_flags[0]|TDB_NOSYNC,
1473                                   open_flags[0], 0600, NULL, hash_key);
1474                 printf("Single threaded run...");
1475                 fflush(stdout);
1476
1477                 run_ops(tdb, pipes[0].fd[0], argv+2, op, 0, 1, num_ops[0],
1478                         false);
1479                 check_deps(argv[2], op[0], num_ops[0]);
1480
1481                 printf("done\n");
1482                 exit(0);
1483         }
1484
1485         if (pipe(fds) != 0)
1486                 err(1, "creating pipe");
1487
1488         for (i = 0; i < argc - 2; i++) {
1489                 struct tdb_context *tdb;
1490
1491                 switch (fork()) {
1492                 case -1:
1493                         err(1, "fork failed");
1494                 case 0:
1495                         close(fds[1]);
1496                         tdb = tdb_open_ex(argv[1], hashsize[i],
1497                                           tdb_flags[i]|TDB_NOSYNC,
1498                                           open_flags[i], 0600, NULL, hash_key);
1499                         if (!tdb)
1500                                 err(1, "Opening tdb %s", argv[1]);
1501
1502                         /* This catches parent exiting. */
1503                         if (read(fds[0], &c, 1) != 1)
1504                                 exit(1);
1505                         run_ops(tdb, pipes[i].fd[0], argv+2, op, i, 1,
1506                                 num_ops[i], false);
1507                         check_deps(argv[2+i], op[i], num_ops[i]);
1508                         exit(0);
1509                 default:
1510                         break;
1511                 }
1512         }
1513
1514         /* Let everything settle. */
1515         sleep(1);
1516
1517         printf("Starting run...");
1518         fflush(stdout);
1519         gettimeofday(&start, NULL);
1520         /* Tell them all to go!  Any write of sufficient length will do. */
1521         if (write(fds[1], hashsize, i) != i)
1522                 err(1, "Writing to wakeup pipe");
1523
1524         for (i = 0; i < argc - 2; i++) {
1525                 int status;
1526                 wait(&status);
1527                 if (!WIFEXITED(status)) {
1528                         warnx("Child died with signal %i", WTERMSIG(status));
1529                         ok = false;
1530                 } else if (WEXITSTATUS(status) != 0)
1531                         /* Assume child spat out error. */
1532                         ok = false;
1533         }
1534         if (!ok)
1535                 exit(1);
1536
1537         gettimeofday(&end, NULL);
1538         printf("done\n");
1539
1540         end.tv_sec -= start.tv_sec;
1541         printf("Time replaying: %lu usec\n",
1542                end.tv_sec * 1000000UL + (end.tv_usec - start.tv_usec));
1543
1544         exit(0);
1545 }