git.ozlabs.org Git - ccan/blobdiff - ccan/tdb/tools/replay_trace.c
Leave groups of chainlock inside traverse.
[ccan] / ccan / tdb / tools / replay_trace.c
index 8fc0ce52a63379cdf325ee4fd2ced797afb8edb0..087bd876acfd636f3af218423d93ccd0dd4e9cee 100644 (file)
@@ -22,7 +22,7 @@
 /* Avoid mod by zero */
 static unsigned int total_keys = 1;
 
-#define DEBUG_DEPS 1
+/* #define DEBUG_DEPS 1 */
 
 /* Traversals block transactions in the current implementation. */
 #define TRAVERSALS_TAKE_TRANSACTION_LOCK 1
@@ -147,7 +147,8 @@ struct op {
                        TDB_DATA pre;
                        TDB_DATA post;
                } append;
-               unsigned int group_len; /* transaction/traverse start */
+               /* transaction/traverse start/chainlock */
+               unsigned int group_len;
        };
 };
 
@@ -213,8 +214,7 @@ static void op_add_key(const char *filename,
                fail(filename, op_num+1, "Expected just a key");
 
        op[op_num].key = make_tdb_data(op, filename, op_num+1, words[2]);
-       if (op[op_num].op != OP_TDB_TRAVERSE)
-               total_keys++;
+       total_keys++;
 }
 
 static void op_add_key_ret(const char *filename,
@@ -243,6 +243,16 @@ static void op_add_key_data(const char *filename,
                total_keys++;
 }
 
+/* We don't record the keys or data for a traverse, as we don't use them. */
+static void op_add_traverse(const char *filename,
+                           struct op op[], unsigned int op_num, char *words[])
+{
+       if (!words[2] || !words[3] || !words[4] || words[5]
+           || !streq(words[3], "="))
+               fail(filename, op_num+1, "Expected <key> = <data>");
+       op[op_num].key = tdb_null;
+}
+
 /* <serial> tdb_store <rec> <rec> <flag> = <ret> */
 static void op_add_store(const char *filename,
                         struct op op[], unsigned int op_num, char *words[])
@@ -290,8 +300,9 @@ static void op_add_seqnum(const char *filename,
        op[op_num].ret = atoi(words[3]);
 }
 
-static void op_add_traverse(const char *filename,
-                           struct op op[], unsigned int op_num, char *words[])
+static void op_add_traverse_start(const char *filename,
+                                 struct op op[],
+                                 unsigned int op_num, char *words[])
 {
        if (words[2])
                fail(filename, op_num+1, "Expect no arguments");
@@ -310,12 +321,39 @@ static void op_add_transaction(const char *filename, struct op op[],
        op[op_num].group_len = 0;
 }
 
-static int op_transaction_start(struct op op[], unsigned int op_num)
+static void op_add_chainlock(const char *filename,
+                            struct op op[], unsigned int op_num, char *words[])
+{
+       if (words[2] == NULL || words[3])
+               fail(filename, op_num+1, "Expected just a key");
+
+       /* A chainlock key isn't a key in the normal sense; it doesn't
+        * have to be in the db at all.  Also, we don't want to hash this op. */
+       op[op_num].data = make_tdb_data(op, filename, op_num+1, words[2]);
+       op[op_num].key = tdb_null;
+       op[op_num].group_len = 0;
+}
+
+static void op_add_chainlock_ret(const char *filename,
+                                struct op op[], unsigned int op_num,
+                                char *words[])
+{
+       if (!words[2] || !words[3] || !words[4] || words[5]
+           || !streq(words[3], "="))
+               fail(filename, op_num+1, "Expected <key> = <ret>");
+       op[op_num].ret = atoi(words[4]);
+       op[op_num].data = make_tdb_data(op, filename, op_num+1, words[2]);
+       op[op_num].key = tdb_null;
+       op[op_num].group_len = 0;
+       total_keys++;
+}
+
+static int op_find_start(struct op op[], unsigned int op_num, enum op_type type)
 {
        unsigned int i;
 
        for (i = op_num-1; i > 0; i--) {
-               if (op[i].op == OP_TDB_TRANSACTION_START && !op[i].group_len)
+               if (op[i].op == type && !op[i].group_len)
                        return i;
        }
        return 0;
@@ -332,7 +370,7 @@ static void op_analyze_transaction(const char *filename,
        if (words[2])
                fail(filename, op_num+1, "Expect no arguments");
 
-       start = op_transaction_start(op, op_num);
+       start = op_find_start(op, op_num, OP_TDB_TRANSACTION_START);
        if (!start)
                fail(filename, op_num+1, "no transaction start found");
 
@@ -343,10 +381,35 @@ static void op_analyze_transaction(const char *filename,
                op[i].group_start = start;
 }
 
-struct traverse_hash {
-       TDB_DATA key;
-       unsigned int index;
-};
+/* We treat chainlocks a lot like transactions, even though that's overkill */
+static void op_analyze_chainlock(const char *filename,
+                                struct op op[], unsigned int op_num,
+                                char *words[])
+{
+       unsigned int i, start;
+
+       if (words[2] == NULL || words[3])
+               fail(filename, op_num+1, "Expected just a key");
+
+       op[op_num].data = make_tdb_data(op, filename, op_num+1, words[2]);
+       op[op_num].key = tdb_null;
+       total_keys++;
+
+       start = op_find_start(op, op_num, OP_TDB_CHAINLOCK);
+       if (!start)
+               start = op_find_start(op, op_num, OP_TDB_CHAINLOCK_READ);
+       if (!start)
+               fail(filename, op_num+1, "no initial chainlock found");
+
+       /* FIXME: We'd have to do something clever to make this work
+        * vs. deadlock. */
+       if (!key_eq(op[start].data, op[op_num].data))
+               fail(filename, op_num+1, "nested chainlock calls?");
+
+       op[start].group_len = op_num - start;
+       for (i = start; i <= op_num; i++)
+               op[i].group_start = start;
+}
 
 static void op_analyze_traverse(const char *filename,
                                struct op op[], unsigned int op_num,
@@ -364,23 +427,18 @@ static void op_analyze_traverse(const char *filename,
        } else
                op[op_num].ret = 0;
 
-       for (i = op_num-1; i >= 0; i--) {
-               if (op[i].op != OP_TDB_TRAVERSE_READ_START
-                   && op[i].op != OP_TDB_TRAVERSE_START)
-                       continue;
-               if (op[i].group_len)
-                       continue;
-               break;
-       }
-
-       if (i < 0)
+       start = op_find_start(op, op_num, OP_TDB_TRAVERSE_START);
+       if (!start)
+               start = op_find_start(op, op_num, OP_TDB_TRAVERSE_READ_START);
+       if (!start)
                fail(filename, op_num+1, "no traversal start found");
 
-       start = i;
        op[start].group_len = op_num - start;
 
+       /* Don't roll in nested traverse/chainlock */
        for (i = start; i <= op_num; i++)
-               op[i].group_start = start;
+               if (!op[i].group_start)
+                       op[i].group_start = start;
 }
 
 /* Keep -Wmissing-declarations happy: */
@@ -425,9 +483,10 @@ static void dump_pre(char *filename[], struct op *op[],
 }
 
 /* We simply read/write pointers, since we all are children. */
-static void do_pre(struct tdb_context *tdb,
+static bool do_pre(struct tdb_context *tdb,
                   char *filename[], struct op *op[],
-                  unsigned int file, int pre_fd, unsigned int i)
+                  unsigned int file, int pre_fd, unsigned int i,
+                  bool backoff)
 {
        while (!list_empty(&op[file][i].pre)) {
                struct depend *dep;
@@ -436,9 +495,17 @@ static void do_pre(struct tdb_context *tdb,
                printf("%s:%u:waiting for pre\n", filename[file], i+1);
                fflush(stdout);
 #endif
-               alarm(10);
+               if (backoff)
+                       alarm(2);
+               else
+                       alarm(10);
                while (read(pre_fd, &dep, sizeof(dep)) != sizeof(dep)) {
                        if (errno == EINTR) {
+                               if (backoff) {
+                                       warnx("%s:%u:avoiding deadlock",
+                                             filename[file], i+1);
+                                       return false;
+                               }
                                dump_pre(filename, op, file, i);
                                exit(1);
                        } else
@@ -455,6 +522,7 @@ static void do_pre(struct tdb_context *tdb,
                /* This could be any op, not just this one. */
                talloc_free(dep);
        }
+       return true;
 }
 
 static void do_post(char *filename[], struct op *op[],
@@ -484,7 +552,8 @@ static unsigned run_ops(struct tdb_context *tdb,
                        char *filename[],
                        struct op *op[],
                        unsigned int file,
-                       unsigned int start, unsigned int stop);
+                       unsigned int start, unsigned int stop,
+                       bool backoff);
 
 struct traverse_info {
        struct op **op;
@@ -502,6 +571,7 @@ static int nontrivial_traverse(struct tdb_context *tdb,
 {
        struct traverse_info *tinfo = _tinfo;
        unsigned int trav_len = tinfo->op[tinfo->file][tinfo->start].group_len;
+       bool avoid_deadlock = false;
 
        if (tinfo->i == tinfo->start + trav_len) {
                /* This can happen if traverse expects to be empty. */
@@ -515,11 +585,17 @@ static int nontrivial_traverse(struct tdb_context *tdb,
                fail(tinfo->filename[tinfo->file], tinfo->start + 1,
                     "%s:%u:traverse terminated early");
 
+#if TRAVERSALS_TAKE_TRANSACTION_LOCK
+       avoid_deadlock = true;
+#endif
+
        /* Run any normal ops. */
        tinfo->i = run_ops(tdb, tinfo->pre_fd, tinfo->filename, tinfo->op,
-                          tinfo->file, tinfo->i+1, tinfo->start + trav_len);
+                          tinfo->file, tinfo->i+1, tinfo->start + trav_len,
+                          avoid_deadlock);
 
-       if (tinfo->i == tinfo->start + trav_len)
+       /* We backed off, or we hit OP_TDB_TRAVERSE_END. */
+       if (tinfo->op[tinfo->file][tinfo->i].op != OP_TDB_TRAVERSE)
                return 1;
 
        return 0;
@@ -548,7 +624,8 @@ static unsigned op_traverse(struct tdb_context *tdb,
                else
                        tinfo.i = run_ops(tdb, pre_fd, filename, op, file,
                                          tinfo.i,
-                                         start + op[file][start].group_len);
+                                         start + op[file][start].group_len,
+                                         false);
        }
 
        return tinfo.i;
@@ -564,7 +641,8 @@ unsigned run_ops(struct tdb_context *tdb,
                 char *filename[],
                 struct op *op[],
                 unsigned int file,
-                unsigned int start, unsigned int stop)
+                unsigned int start, unsigned int stop,
+                bool backoff)
 {
        unsigned int i;
        struct sigaction sa;
@@ -574,7 +652,8 @@ unsigned run_ops(struct tdb_context *tdb,
 
        sigaction(SIGALRM, &sa, NULL);
        for (i = start; i < stop; i++) {
-               do_pre(tdb, filename, op, file, pre_fd, i);
+               if (!do_pre(tdb, filename, op, file, pre_fd, i, backoff))
+                       return i;
 
                switch (op[file][i].op) {
                case OP_TDB_LOCKALL:
@@ -712,7 +791,7 @@ unsigned run_ops(struct tdb_context *tdb,
 static struct op *maybe_cancel_transaction(const char *filename,
                                           struct op *op, unsigned int *num)
 {
-       unsigned int start = op_transaction_start(op, *num);
+       unsigned int start = op_find_start(op, *num, OP_TDB_TRANSACTION_START);
 
        if (start) {
                char *words[] = { "<unknown>", "tdb_close", NULL };
@@ -873,27 +952,54 @@ static const TDB_DATA *needs(const struct op *op)
        
 }
 
-static bool is_transaction(const struct op *op)
+static bool starts_transaction(const struct op *op)
 {
        return op->op == OP_TDB_TRANSACTION_START;
 }
 
+static bool in_transaction(const struct op op[], unsigned int i)
+{
+       return op[i].group_start && starts_transaction(&op[op[i].group_start]);
+}
+
+static bool starts_traverse(const struct op *op)
+{
+       return op->op == OP_TDB_TRAVERSE_START
+               || op->op == OP_TDB_TRAVERSE_READ_START;
+}
+
+static bool in_traverse(const struct op op[], unsigned int i)
+{
+       return op[i].group_start && starts_traverse(&op[op[i].group_start]);
+}
+
+static bool starts_chainlock(const struct op *op)
+{
+       return op->op == OP_TDB_CHAINLOCK_READ || op->op == OP_TDB_CHAINLOCK;
+}
+
+static bool in_chainlock(const struct op op[], unsigned int i)
+{
+       return op[i].group_start && starts_chainlock(&op[op[i].group_start]);
+}
+
 /* What's the data after this op?  pre if nothing changed. */
 static const TDB_DATA *gives(const TDB_DATA *key, const TDB_DATA *pre,
                             const struct op *op)
 {
-       if (is_transaction(op)) {
+       if (starts_transaction(op) || starts_chainlock(op)) {
                unsigned int i;
 
                /* Cancelled transactions don't change anything. */
                if (op[op->group_len].op == OP_TDB_TRANSACTION_CANCEL)
                        return pre;
-               assert(op[op->group_len].op == OP_TDB_TRANSACTION_COMMIT);
+               assert(op[op->group_len].op == OP_TDB_TRANSACTION_COMMIT
+                      || op[op->group_len].op == OP_TDB_CHAINUNLOCK_READ
+                      || op[op->group_len].op == OP_TDB_CHAINUNLOCK);
 
                for (i = 1; i < op->group_len; i++) {
                        /* This skips nested transactions, too */
-                       if (op[i].op != OP_TDB_TRAVERSE
-                           && key_eq(op[i].key, *key))
+                       if (key_eq(op[i].key, *key))
                                pre = gives(key, pre, &op[i]);
                }
                return pre;
@@ -915,16 +1021,6 @@ static const TDB_DATA *gives(const TDB_DATA *key, const TDB_DATA *pre,
        return pre;
 }
 
-static bool in_transaction(const struct op op[], unsigned int i)
-{
-       return op[i].group_start && is_transaction(&op[op[i].group_start]);
-}
-
-static bool in_traverse(const struct op op[], unsigned int i)
-{
-       return op[i].group_start && !is_transaction(&op[op[i].group_start]);
-}
-
 static struct keyinfo *hash_ops(struct op *op[], unsigned int num_ops[],
                                unsigned int num)
 {
@@ -941,11 +1037,6 @@ static struct keyinfo *hash_ops(struct op *op[], unsigned int num_ops[],
                        if (!op[i][j].key.dptr)
                                continue;
 
-                       /* We don't wait for traverse keys */
-                       /* FIXME: We should, for trivial traversals. */
-                       if (op[i][j].op == OP_TDB_TRAVERSE)
-                               continue;
-
                        h = hash_key(&op[i][j].key) % (total_keys * 2);
                        while (!key_eq(hash[h].key, op[i][j].key)) {
                                if (!hash[h].key.dptr) {
@@ -965,7 +1056,8 @@ static struct keyinfo *hash_ops(struct op *op[], unsigned int num_ops[],
 
                        /* If it's in a transaction, it's the transaction which
                         * matters from an analysis POV. */
-                       if (in_transaction(op[i], j)) {
+                       if (in_transaction(op[i], j)
+                           || in_chainlock(op[i], j)) {
                                unsigned start = op[i][j].group_start;
 
                                /* Don't include twice. */
@@ -992,14 +1084,13 @@ static bool satisfies(const TDB_DATA *key, const TDB_DATA *data,
 {
        const TDB_DATA *need = NULL;
 
-       if (is_transaction(op)) {
+       if (starts_transaction(op) || starts_chainlock(op)) {
                unsigned int i;
 
                /* Look through for an op in this transaction which
                 * needs this key. */
                for (i = 1; i < op->group_len; i++) {
-                       if (op[i].op != OP_TDB_TRAVERSE
-                           && key_eq(op[i].key, *key)) {
+                       if (key_eq(op[i].key, *key)) {
                                need = needs(&op[i]);
                                /* tdb_exists() is special: there might be
                                 * something in the transaction with more
@@ -1067,6 +1158,10 @@ static bool sort_deps(char *filename[], struct op *op[],
        struct op *this_op;
        bool done[num_files];
 
+       /* None left?  We're sorted. */
+       if (off == num)
+               return true;
+
        /* Does this make serial numbers go backwards?  Allow a little fuzz. */
        if (off > 0) {
                int serial1 = op[res[off-1].file][res[off-1].op_num].serial;
@@ -1081,10 +1176,6 @@ static bool sort_deps(char *filename[], struct op *op[],
                }
        }
 
-       /* One or none left?  We're sorted. */
-       if (off + 1 >= num)
-               return true;
-
        memset(done, 0, sizeof(done));
 
        /* Since ops within a trace file are ordered, we just need to figure
@@ -1222,9 +1313,11 @@ static void add_dependency(void *ctx,
         * traverse/transaction, it creates a dependency between the
         * two groups. */
        if ((in_traverse(op[satisfies_file], satisfies_opnum)
-            && op[needs_file][needs_opnum].group_start)
+            && (starts_transaction(&op[needs_file][needs_opnum])
+                || starts_traverse(&op[needs_file][needs_opnum])))
            || (in_traverse(op[needs_file], needs_opnum)
-               && op[satisfies_file][satisfies_opnum].group_start)) {
+               && (starts_transaction(&op[satisfies_file][satisfies_opnum])
+                   || starts_traverse(&op[satisfies_file][satisfies_opnum])))){
                unsigned int sat;
 
                /* We are satisfied by end of group. */
@@ -1252,8 +1345,10 @@ static void add_dependency(void *ctx,
        }
 #endif
 
-       /* If you depend on a transaction, you actually depend on it ending. */
-       if (is_transaction(&op[satisfies_file][satisfies_opnum])) {
+       /* If you depend on a transaction or chainlock, you actually
+        * depend on it ending. */
+       if (starts_transaction(&op[satisfies_file][satisfies_opnum])
+           || starts_chainlock(&op[satisfies_file][satisfies_opnum])) {
                satisfies_opnum
                        += op[satisfies_file][satisfies_opnum].group_len;
 #if DEBUG_DEPS
@@ -1372,6 +1467,79 @@ static void optimize_dependencies(struct op *op[], unsigned int num_ops[],
        }
 }
 
+#if TRAVERSALS_TAKE_TRANSACTION_LOCK
+struct traverse_dep {
+       unsigned int file;
+       unsigned int op_num;
+};
+
+/* Force an order among the traversals, so they don't deadlock (as much) */
+static void make_traverse_depends(char *filename[],
+                                 struct op *op[], unsigned int num_ops[],
+                                 unsigned int num)
+{
+       unsigned int i, num_traversals = 0;
+       int j;
+       struct traverse_dep *dep;
+
+       /* Sort by which one runs first. */
+       int compare_traverse_dep(const void *_a, const void *_b)
+       {
+               const struct traverse_dep *ta = _a, *tb = _b;
+               const struct op *a = &op[ta->file][ta->op_num],
+                       *b = &op[tb->file][tb->op_num];
+
+               if (a->serial != b->serial)
+                       return a->serial - b->serial;
+
+               /* If they have same serial, it means one didn't make any
+                * changes.  Thus sort by end in that case. */
+               return a[a->group_len].serial - b[b->group_len].serial;
+       }
+
+       dep = talloc_array(NULL, struct traverse_dep, 1);
+
+       /* Count them. */
+       for (i = 0; i < num; i++) {
+               for (j = 1; j < num_ops[i]; j++) {
+                       /* Traverse start (ignore those in
+                        * transactions; they're already covered by
+                        * transaction dependencies). */
+                       if (starts_traverse(&op[i][j])
+                           && !in_transaction(op[i], j)) {
+                               dep = talloc_realloc(NULL, dep,
+                                                    struct traverse_dep,
+                                                    num_traversals+1);
+                               dep[num_traversals].file = i;
+                               dep[num_traversals].op_num = j;
+                               num_traversals++;
+                       }
+               }
+       }
+       qsort(dep, num_traversals, sizeof(dep[0]), compare_traverse_dep);
+
+       for (i = 1; i < num_traversals; i++) {
+               const struct op *prev = &op[dep[i-1].file][dep[i-1].op_num];
+               const struct op *curr = &op[dep[i].file][dep[i].op_num];
+
+               /* Read traverses don't depend on each other (read lock). */
+               if (prev->op == OP_TDB_TRAVERSE_READ_START
+                   && curr->op == OP_TDB_TRAVERSE_READ_START)
+                       continue;
+
+               /* Only make dependency if it's clear. */
+               if (compare_traverse_dep(&dep[i], &dep[i-1])) {
+                       /* i depends on end of traverse i-1. */
+                       add_dependency(NULL, op, filename,
+                                      dep[i].file, dep[i].op_num,
+                                      dep[i-1].file, dep[i-1].op_num
+                                      + prev->group_len);
+               }
+       }
+       talloc_free(dep);
+}
+#endif
+
 static void derive_dependencies(char *filename[],
                                struct op *op[], unsigned int num_ops[],
                                unsigned int num)
@@ -1414,6 +1582,10 @@ static void derive_dependencies(char *filename[],
                }
        }
 
+#if TRAVERSALS_TAKE_TRANSACTION_LOCK
+       make_traverse_depends(filename, op, num_ops, num);
+#endif
+
        optimize_dependencies(op, num_ops, num);
 }
 
@@ -1453,7 +1625,8 @@ int main(int argc, char *argv[])
                printf("Single threaded run...");
                fflush(stdout);
 
-               run_ops(tdb, pipes[0].fd[0], argv+2, op, 0, 1, num_ops[0]);
+               run_ops(tdb, pipes[0].fd[0], argv+2, op, 0, 1, num_ops[0],
+                       false);
                check_deps(argv[2], op[0], num_ops[0]);
 
                printf("done\n");
@@ -1481,7 +1654,7 @@ int main(int argc, char *argv[])
                        if (read(fds[0], &c, 1) != 1)
                                exit(1);
                        run_ops(tdb, pipes[i].fd[0], argv+2, op, i, 1,
-                               num_ops[i]);
+                               num_ops[i], false);
                        check_deps(argv[2+i], op[i], num_ops[i]);
                        exit(0);
                default: