+ if (op->op == OP_TDB_DELETE || op->op == OP_TDB_WIPE_ALL)
+ return &tdb_null;
+
+ if (op->op == OP_TDB_APPEND)
+ return &op->append.post;
+
+ if (op->op == OP_TDB_STORE)
+ return &op->data;
+
+ return pre;
+}
+
+/* Build a linear-probing hash table (total_keys * 2 slots) which maps each
+ * distinct key to the list of (file, op number) pairs that use it.
+ *
+ * Every op from index 1 onwards in each file also gets its pre/post
+ * dependency lists initialised here.  The table is allocated as a talloc
+ * child of op[0] and returned to the caller. */
+static struct keyinfo *hash_ops(struct op *op[], unsigned int num_ops[],
+				unsigned int num)
+{
+	unsigned int file, idx, slot;
+	struct keyinfo *table;
+
+	table = talloc_zero_array(op[0], struct keyinfo, total_keys*2);
+	for (file = 0; file < num; file++) {
+		for (idx = 1; idx < num_ops[file]; idx++) {
+			struct op *cur = &op[file][idx];
+			struct keyinfo *entry;
+			struct key_user *user;
+
+			/* We can't do this on allocation, due to realloc. */
+			list_head_init(&cur->post);
+			list_head_init(&cur->pre);
+
+			/* Keyless ops are not hashed, and we don't wait for
+			 * traverse keys either. */
+			/* FIXME: We should, for trivial traversals. */
+			if (!cur->key.dptr || cur->op == OP_TDB_TRAVERSE)
+				continue;
+
+			/* Probe until we hit this key or an unused slot. */
+			slot = hash_key(&cur->key) % (total_keys * 2);
+			for (;;) {
+				if (key_eq(table[slot].key, cur->key))
+					break;
+				if (!table[slot].key.dptr) {
+					table[slot].key = cur->key;
+					break;
+				}
+				slot = (slot + 1) % (total_keys * 2);
+			}
+			entry = &table[slot];
+
+			/* Might as well save some memory if we can: share
+			 * the key bytes already held by the table. */
+			if (cur->key.dptr != entry->key.dptr) {
+				talloc_free(cur->key.dptr);
+				cur->key.dptr = entry->key.dptr;
+			}
+
+			/* Append this op to the key's list of users. */
+			entry->user = talloc_realloc(table, entry->user,
+						     struct key_user,
+						     entry->num_users+1);
+			user = &entry->user[entry->num_users];
+			user->op_num = idx;
+			user->file = file;
+			entry->num_users++;
+		}
+	}
+
+	return table;
+}
+
+/* Does the current database value for a key meet an op's precondition?
+ *
+ * data: the value currently held; must be tdb_null or a real value, never
+ *	 one of the sentinel requirements (asserted below).
+ * need: NULL (no requirement), one of the sentinels must_exist,
+ *	 must_not_exist or not_exists_or_empty, or a specific value.
+ *
+ * Returns true if data is acceptable for need. */
+static bool satisfies(const TDB_DATA *data, const TDB_DATA *need)
+{
+	/* Don't need anything?  Cool. */
+	if (!need)
+		return true;
+
+	/* This should be tdb_null or a real value. */
+	assert(data != &must_exist);
+	assert(data != &must_not_exist);
+	assert(data != &not_exists_or_empty);
+
+	/* Same size and same buffer: trivially identical. */
+	if (data->dsize == need->dsize && data->dptr == need->dptr)
+		return true;
+
+	/* Must not exist?  data must not exist. */
+	if (need == &must_not_exist)
+		return data->dptr == NULL;
+
+	/* Must exist? */
+	if (need == &must_exist)
+		return data->dptr != NULL;
+
+	/* Either noexist or empty. */
+	if (need == &not_exists_or_empty)
+		return data->dsize == 0;
+
+	/* Needs something specific. */
+	return key_eq(*data, *need);
+}
+
+/* Rotate res[0..elem] right by one, so that res[elem] becomes res[0] and
+ * the entries that were in front of it each shift up one place. */
+static void move_to_front(struct key_user res[], unsigned int elem)
+{
+	struct key_user moved;
+
+	/* Already at the front: nothing to do. */
+	if (elem == 0)
+		return;
+
+	moved = res[elem];
+	memmove(&res[1], &res[0], elem * sizeof(*res));
+	res[0] = moved;
+}
+
+/* Inverse of move_to_front(): rotate res[0..elem] left by one, so that
+ * res[0] goes back to index elem and the others shift down one place. */
+static void restore_to_pos(struct key_user res[], unsigned int elem)
+{
+	struct key_user moved;
+
+	/* It never left the front: nothing to undo. */
+	if (elem == 0)
+		return;
+
+	moved = res[0];
+	memmove(&res[0], &res[1], elem * sizeof(*res));
+	res[elem] = moved;
+}
+
+/* Recursively reorder res[0..num) so that each op's requirement (needs())
+ * is satisfied by the value left behind by the op before it (gives()),
+ * starting from the value *data.
+ *
+ * Returns true with res[] in a workable order, or false, with res[] back
+ * in its original order, if no candidate ordering worked. */
+static bool sort_deps(char *filename[], struct op *op[],
+		      struct key_user res[], unsigned num,
+		      const TDB_DATA *data, unsigned num_files)
+{
+	unsigned int pos, tried = 0;
+	bool seen[num_files];
+
+	/* Nothing left?  We're sorted. */
+	if (num == 0)
+		return true;
+
+	memset(seen, 0, sizeof(seen));
+
+	/* Since ops within a trace file are ordered, only the first
+	 * remaining op of each file is a candidate for "next".  Since we
+	 * don't take into account inter-key relationships (which exist by
+	 * virtue of trace file order), we minimize the chance of harm by
+	 * trying candidates in their current (serial) order. */
+	for (pos = 0; pos < num && tried < num_files; pos++) {
+		const unsigned int file = res[pos].file;
+		struct op *candidate;
+
+		/* A later op of a file we have already considered. */
+		if (seen[file])
+			continue;
+
+		candidate = &op[file][res[pos].op_num];
+
+		/* Is what we have good enough for this op? */
+		if (satisfies(data, needs(candidate))) {
+			move_to_front(res, pos);
+			if (sort_deps(filename, op, res + 1, num - 1,
+				      gives(candidate, data), num_files))
+				return true;
+			/* Dead end: put it back and try another file. */
+			restore_to_pos(res, pos);
+		}
+
+		seen[file] = true;
+		tried++;
+	}
+
+	/* No combination worked. */
+	return false;
+}
+
+/* Debug-only sanity check (compiled in when DEBUG_DEPS is non-zero):
+ * assert that, within the user list, the op numbers belonging to any one
+ * file are strictly increasing.  Otherwise a no-op. */
+static void check_dep_sorting(struct key_user user[], unsigned num_users,
+			      unsigned num_files)
+{
+#if DEBUG_DEPS
+	unsigned int n;
+	unsigned last_op[num_files];
+
+	memset(last_op, 0, sizeof(last_op));
+	for (n = 0; n < num_users; n++) {
+		const struct key_user *u = &user[n];
+
+		assert(last_op[u->file] < u->op_num);
+		last_op[u->file] = u->op_num;
+	}
+#endif
+}
+
+/* Several ops can carry the same serial number.  Which comes first?
+ *
+ * This happens both because read ops and failed write ops don't change
+ * the serial number, and because of a race: the number is accessed
+ * unlocked.  (The race can also cause less detectable ordering problems,
+ * in which case we'll deadlock and report: fix manually in that case.)
+ *
+ * Reorders user[] via sort_deps(); calls fail() if no order works.
+ */
+static void figure_deps(char *filename[], struct op *op[],
+			struct key_user user[], unsigned num_users,
+			unsigned num_files)
+{
+	bool sorted;
+
+	/* We assume database starts empty, hence tdb_null. */
+	sorted = sort_deps(filename, op, user, num_users, &tdb_null,
+			   num_files);
+	if (!sorted)
+		fail(filename[user[0].file], user[0].op_num+1,
+		     "Could not resolve inter-dependencies");
+
+	check_dep_sorting(user, num_users, num_files);
+}
+
+/* For every slot of the key hash table, sort that key's users into serial
+ * order (preserving trace-file order within a file) and then pass the
+ * list to figure_deps().
+ *
+ * hash:     table built by hash_ops() (total_keys * 2 slots).
+ * filename: trace file names, passed through to figure_deps().
+ * op:	     per-file op arrays.
+ * num:	     number of trace files. */
+static void sort_ops(struct keyinfo hash[], char *filename[], struct op *op[],
+		     unsigned int num)
+{
+	unsigned int h;
+
+	/* Gcc nested function extension.  How cool is this? */
+	int compare_serial(const void *_a, const void *_b)
+	{
+		const struct key_user *a = _a, *b = _b;
+
+		/* First, maintain order within any trace file. */
+		if (a->file == b->file)
+			return a->op_num - b->op_num;
+
+		/* Otherwise, arrange by serial order. */
+		return op[a->file][a->op_num].serial
+			- op[b->file][b->op_num].serial;
+	}
+
+	/* Now sort into serial order. */
+	for (h = 0; h < total_keys * 2; h++) {
+		struct key_user *user = hash[h].user;
+
+		/* Unused slots have no user array at all: there is nothing
+		 * to order, and qsort() must not be handed a null base
+		 * pointer even for zero elements. */
+		if (hash[h].num_users == 0)
+			continue;
+
+		qsort(user, hash[h].num_users, sizeof(user[0]), compare_serial);
+		figure_deps(filename, op, user, hash[h].num_users, num);
+	}
+}
+
+/* talloc destructor for a struct depend (installed by add_dependency()):
+ * unlink it from both lists it sits on.  Always returns 0. */
+static int destroy_depend(struct depend *dep)
+{
+	/* The two list nodes are independent of each other. */
+	list_del(&dep->post_list);
+	list_del(&dep->pre_list);
+
+	return 0;
+}
+
+/* Record that op[needs_file][needs_opnum] must wait for
+ * op[satisfies_file][satisfies_opnum].
+ *
+ * ctx:      talloc context the new struct depend is allocated from.
+ * op:	     per-file op arrays.
+ * filename: trace file names (only used for DEBUG_DEPS output).
+ *
+ * Dependencies within one file are implied by file order and are not
+ * recorded.  If the needing op is inside a transaction, the dependency is
+ * moved back to the transaction start; if the satisfying op is inside a
+ * transaction, it is moved forward to the end of that group. */
+static void add_dependency(void *ctx,
+			   struct op *op[],
+			   char *filename[],
+			   unsigned int needs_file,
+			   unsigned int needs_opnum,
+			   unsigned int satisfies_file,
+			   unsigned int satisfies_opnum)
+{
+	struct depend *dep;
+	unsigned int needs_start, sat_start;
+
+	/* We don't depend on ourselves. */
+	if (needs_file == satisfies_file) {
+		assert(satisfies_opnum < needs_opnum);
+		return;
+	}
+
+#if DEBUG_DEPS
+	printf("%s:%u: depends on %s:%u\n",
+	       filename[needs_file], needs_opnum+1,
+	       filename[satisfies_file], satisfies_opnum+1);
+#endif
+
+	needs_start = op[needs_file][needs_opnum].group_start;
+	sat_start = op[satisfies_file][satisfies_opnum].group_start;
+
+	/* If needs is in a transaction, we need it before start. */
+	if (needs_start
+	    && op[needs_file][needs_start].op == OP_TDB_TRANSACTION_START) {
+		needs_opnum = needs_start;
+		/* Use #if, like the block above, so that DEBUG_DEPS
+		 * defined as 0 silences all of the tracing. */
+#if DEBUG_DEPS
+		printf("  -> Back to %u\n", needs_start+1);
+		fflush(stdout);
+#endif
+	}
+
+	/* If satisfies is in a transaction, we wait until after commit. */
+	/* FIXME: If transaction is cancelled, don't need dependency. */
+	if (sat_start
+	    && op[satisfies_file][sat_start].op == OP_TDB_TRANSACTION_START) {
+		satisfies_opnum = sat_start
+			+ op[satisfies_file][sat_start].group_len;
+#if DEBUG_DEPS
+		printf("  -> Depends on %u\n", satisfies_opnum+1);
+		fflush(stdout);
+#endif
+	}
+
+	assert(op[needs_file][needs_opnum].op != OP_TDB_TRAVERSE);
+	assert(op[satisfies_file][satisfies_opnum].op != OP_TDB_TRAVERSE);
+
+	dep = talloc(ctx, struct depend);
+	dep->needs_file = needs_file;
+	dep->needs_opnum = needs_opnum;
+	dep->satisfies_file = satisfies_file;
+	dep->satisfies_opnum = satisfies_opnum;
+
+	/* Link onto the satisfier's "post" list and the needer's "pre"
+	 * list; the destructor unlinks from both when dep is freed. */
+	list_add(&op[satisfies_file][satisfies_opnum].post, &dep->post_list);
+	list_add(&op[needs_file][needs_opnum].pre, &dep->pre_list);
+	talloc_set_destructor(dep, destroy_depend);
+}
+
+#if TRAVERSALS_TAKE_TRANSACTION_LOCK
+/* Identifies the op which starts a group: which file, which op index. */
+struct traverse_dep {
+	unsigned int file;
+	unsigned int op_num;
+};
+
+/* Traversals can deadlock against each other, and transactions.  Force
+ * order: gather every group start, sort them by serial number, and make
+ * each one depend on the end of the group sorted just before it. */
+static void make_traverse_depends(char *filename[],
+				  struct op *op[], unsigned int num_ops[],
+				  unsigned int num)
+{
+	unsigned int file, idx, n, count = 0;
+	struct traverse_dep *starts;
+
+	/* Sort by which one runs first. */
+	int compare_traverse_dep(const void *_a, const void *_b)
+	{
+		const struct traverse_dep *ta = _a, *tb = _b;
+		const struct op *a = &op[ta->file][ta->op_num],
+			*b = &op[tb->file][tb->op_num];
+
+		/* If they have same serial, it means one didn't make any
+		 * changes.  Thus sort by end in that case. */
+		if (a->serial == b->serial)
+			return a[a->group_len].serial
+				- b[b->group_len].serial;
+
+		return a->serial - b->serial;
+	}
+
+	starts = talloc_array(NULL, struct traverse_dep, 1);
+
+	/* Collect them. */
+	for (file = 0; file < num; file++) {
+		for (idx = 1; idx < num_ops[file]; idx++) {
+			/* Transaction or traverse start. */
+			if (op[file][idx].group_start != idx)
+				continue;
+
+			starts = talloc_realloc(NULL, starts,
+						struct traverse_dep,
+						count+1);
+			starts[count].file = file;
+			starts[count].op_num = idx;
+			count++;
+		}
+	}
+
+	qsort(starts, count, sizeof(starts[0]), compare_traverse_dep);
+
+	for (n = 1; n < count; n++) {
+		const struct traverse_dep *earlier = &starts[n-1];
+
+		/* n depends on end of traverse n-1. */
+		add_dependency(NULL, op, filename,
+			       starts[n].file, starts[n].op_num,
+			       earlier->file,
+			       earlier->op_num
+			       + op[earlier->file][earlier->op_num].group_len);
+	}
+	talloc_free(starts);
+}
+#endif /* TRAVERSALS_TAKE_TRANSACTION_LOCK */
+
+/* True if gives() reports a resulting value for this op even when it is
+ * handed no prior value (NULL), i.e. the op itself determines the data. */
+static bool changes_db(const struct op *op)
+{
+	if (gives(op, NULL) == NULL)
+		return false;
+
+	return true;
+}
+
+/* Make user[i] wait for what came before it in the user list.
+ *
+ * prev is the index of the previous entry the caller singled out (or -1
+ * if there is none).  If that is the entry immediately before i, depend
+ * on it directly.  Otherwise depend on the last entry of each other file
+ * found strictly between prev and i.
+ *
+ * num is the number of trace files. */
+static void depend_on_previous(struct op *op[],
+				char *filename[],
+				unsigned int num,
+				struct key_user user[],
+				unsigned int i,
+				int prev)
+{
+	bool covered[num];
+	int r;
+
+	/* First entry: nothing before it. */
+	if (i == 0)
+		return;
+
+	if (prev == i - 1) {
+		/* Just depend on previous. */
+		add_dependency(NULL, op, filename,
+			       user[i].file, user[i].op_num,
+			       user[prev].file, user[prev].op_num);
+		return;
+	}
+
+	/* We have to wait for the readers.  Find last one in *each* file. */
+	memset(covered, 0, sizeof(covered));
+
+	/* Our own file needs no explicit dependency. */
+	covered[user[i].file] = true;
+
+	/* Walk backwards so the first hit per file is its last reader. */
+	for (r = i - 1; r > prev; r--) {
+		const struct key_user *reader = &user[r];
+
+		if (covered[reader->file])
+			continue;
+
+		add_dependency(NULL, op, filename,
+			       user[i].file, user[i].op_num,
+			       reader->file, reader->op_num);
+		covered[reader->file] = true;
+	}
+}
+
+/* Drop redundant dependencies.  This is simple, but not complete: we
+ * don't take into account indirect dependencies.
+ *
+ * Pass 1: for each op, keep at most one dependency per satisfying file,
+ *	   the one with the highest satisfying op number.
+ * Pass 2: for each file, walking its ops in order, drop a dependency if
+ *	   an earlier-visited one already waits on the same or a later op
+ *	   of that satisfying file. */
+static void optimize_dependencies(struct op *op[], unsigned int num_ops[],
+				  unsigned int num)
+{
+	unsigned int file, idx;
+
+	/* There can only be one real dependency on each file */
+	for (file = 0; file < num; file++) {
+		for (idx = 1; idx < num_ops[file]; idx++) {
+			struct depend *dep, *next;
+			struct depend *latest[num];
+
+			memset(latest, 0, sizeof(latest));
+
+			list_for_each_safe(&op[file][idx].pre, dep, next,
+					   pre_list) {
+				struct depend **best
+					= &latest[dep->satisfies_file];
+
+				/* First one seen for that file. */
+				if (!*best) {
+					*best = dep;
+					continue;
+				}
+
+				/* Keep whichever waits on the later op. */
+				if ((*best)->satisfies_opnum
+				    < dep->satisfies_opnum) {
+					talloc_free(*best);
+					*best = dep;
+				} else
+					talloc_free(dep);
+			}
+		}
+	}
+
+	for (file = 0; file < num; file++) {
+		/* Highest op number of each other file already waited on
+		 * by this file; -1 means none yet. */
+		int waited[num];
+
+		for (idx = 0; idx < num; idx++)
+			waited[idx] = -1;
+
+		for (idx = 1; idx < num_ops[file]; idx++) {
+			struct depend *dep, *next;
+
+			list_for_each_safe(&op[file][idx].pre, dep, next,
+					   pre_list) {
+				int *seen = &waited[dep->satisfies_file];
+
+				if (*seen >= (int)dep->satisfies_opnum)
+					talloc_free(dep);
+				else
+					*seen = dep->satisfies_opnum;
+			}
+		}
+	}
+}
+
+/* Work out the cross-file dependencies between ops: hash the ops by key,
+ * order each key's users, link each user to the ops it must wait for,
+ * then prune redundant links. */
+static void derive_dependencies(char *filename[],
+				struct op *op[], unsigned int num_ops[],
+				unsigned int num)
+{
+	struct keyinfo *hash;
+	unsigned int slot, n;
+
+	/* Create hash table for faster key lookup. */
+	hash = hash_ops(op, num_ops, num);
+
+	/* Sort them by serial number. */
+	sort_ops(hash, filename, op, num);
+
+	/* Create dependencies back to the last change, rather than
+	 * creating false dependencies by naively making each one
+	 * depend on the previous.  This has two purposes: it makes
+	 * later optimization simpler, and it also avoids deadlock with
+	 * same sequence number ops inside traversals (if one
+	 * traversal doesn't write anything, two ops can have the same
+	 * sequence number yet we can create a traversal dependency
+	 * the other way). */
+	for (slot = 0; slot < total_keys * 2; slot++) {
+		struct keyinfo *ki = &hash[slot];
+		int last_change = -1;
+
+		/* A key with fewer than two users has nothing to order. */
+		if (ki->num_users < 2)
+			continue;
+
+		for (n = 0; n < ki->num_users; n++) {
+			const struct key_user *u = &ki->user[n];
+
+			if (changes_db(&op[u->file][u->op_num])) {
+				depend_on_previous(op, filename, num,
+						   ki->user, n, last_change);
+				last_change = n;
+				continue;
+			}
+
+			/* Non-changing op with no change before it. */
+			if (last_change < 0)
+				continue;
+
+			/* Non-changing op: wait for the last change. */
+			add_dependency(hash, op, filename,
+				       u->file,
+				       u->op_num,
+				       ki->user[last_change].file,
+				       ki->user[last_change].op_num);
+		}
+	}
+
+#if TRAVERSALS_TAKE_TRANSACTION_LOCK
+	make_traverse_depends(filename, op, num_ops, num);
+#endif
+
+	optimize_dependencies(op, num_ops, num);
+}
+
+int main(int argc, char *argv[])
+{
+ struct timeval start, end;
+ unsigned int i, num_ops[argc], hashsize[argc], tdb_flags[argc], open_flags[argc];
+ struct op *op[argc];
+ int fds[2];
+ char c;
+ bool ok = true;
+
+ if (argc < 3)
+ errx(1, "Usage: %s <tdbfile> <tracefile>...", argv[0]);
+
+ pipes = talloc_array(NULL, struct pipe, argc - 2);
+ for (i = 0; i < argc - 2; i++) {
+ printf("Loading tracefile %s...", argv[2+i]);
+ fflush(stdout);
+ op[i] = load_tracefile(argv[2+i], &num_ops[i], &hashsize[i],
+ &tdb_flags[i], &open_flags[i]);
+ if (pipe(pipes[i].fd) != 0)
+ err(1, "creating pipe");
+ printf("done\n");
+ }
+
+ printf("Calculating inter-dependencies...");
+ fflush(stdout);
+ derive_dependencies(argv+2, op, num_ops, i);
+ printf("done\n");
+
+ /* Don't fork for single arg case: simple debugging. */
+ if (argc == 3) {
+ struct tdb_context *tdb;
+ tdb = tdb_open_ex(argv[1], hashsize[0], tdb_flags[0],
+ open_flags[0], 0600,
+ NULL, hash_key);
+ printf("Single threaded run...");
+ fflush(stdout);
+
+ run_ops(tdb, pipes[0].fd[0], argv+2, op, 0, 1, num_ops[0]);
+ check_deps(argv[2], op[0], num_ops[0]);
+
+ printf("done\n");
+ exit(0);
+ }
+
+ if (pipe(fds) != 0)
+ err(1, "creating pipe");
+
+ for (i = 0; i < argc - 2; i++) {
+ struct tdb_context *tdb;
+
+ switch (fork()) {
+ case -1:
+ err(1, "fork failed");
+ case 0:
+ close(fds[1]);
+ tdb = tdb_open_ex(argv[1], hashsize[i], tdb_flags[i],
+ open_flags[i], 0600,
+ NULL, hash_key);
+ if (!tdb)
+ err(1, "Opening tdb %s", argv[1]);
+
+ /* This catches parent exiting. */
+ if (read(fds[0], &c, 1) != 1)
+ exit(1);
+ run_ops(tdb, pipes[i].fd[0], argv+2, op, i, 1,
+ num_ops[i]);
+ check_deps(argv[2+i], op[i], num_ops[i]);
+ exit(0);
+ default:
+ break;
+ }
+ }
+
+ /* Let everything settle. */
+ sleep(1);
+
+ printf("Starting run...");
+ fflush(stdout);