X-Git-Url: https://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Ffailtest%2Ffailtest.c;h=dae3024ef97f55c24be4169a6589e7ef01258373;hp=a0c825399d03881ab0338075a0a46fb2c4ec1898;hb=e18e80fe175422d26efe689addc0f67bdba0e097;hpb=2006aa032d6f72599165e50242d06df35428d43a

diff --git a/ccan/failtest/failtest.c b/ccan/failtest/failtest.c
index a0c82539..dae3024e 100644
--- a/ccan/failtest/failtest.c
+++ b/ccan/failtest/failtest.c
@@ -14,6 +14,7 @@
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/mman.h>
+#include <sys/resource.h>
 #include <signal.h>
 #include <assert.h>
 #include <ccan/time/time.h>
@@ -23,11 +24,12 @@
 #include <ccan/hash/hash.h>
 #include <ccan/htable/htable_type.h>
 #include <ccan/str/str.h>
+#include <ccan/compiler/compiler.h>
 
 enum failtest_result (*failtest_hook)(struct tlist_calls *);
 
-static int tracefd = -1;
-static int warnfd;
+static FILE *tracef = NULL, *warnf;
+static int traceindent = 0;
 
 unsigned int failtest_timeout_ms = 20000;
 
@@ -82,26 +84,53 @@ HTABLE_DEFINE_TYPE(struct failtest_call, (struct failtest_call *), hash_call,
 
 bool (*failtest_exit_check)(struct tlist_calls *history);
 
+/* The entire history of all calls. */
 static struct tlist_calls history = TLIST_INIT(history);
+/* If we're a child, the fd two write control info to the parent. */
 static int control_fd = -1;
+/* If we're a child, this is the first call we did ourselves. */
+static struct failtest_call *our_history_start = NULL;
+/* For printing runtime with --trace. */
 static struct timeval start;
+/* Set when failtest_hook returns FAIL_PROBE */
 static bool probing = false;
+/* Table to track duplicates. */
 static struct failtable failtable;
 
+/* Array of writes which our child did.  We report them on failure. */
 static struct write_call *child_writes = NULL;
 static unsigned int child_writes_num = 0;
 
+/* fcntl locking info. */
 static pid_t lock_owner;
 static struct lock_info *locks = NULL;
 static unsigned int lock_num = 0;
 
+/* Our original pid, which we return to anyone who asks. */
 static pid_t orig_pid;
 
-static const char info_to_arg[] = "mceoxprwfa";
+/* Mapping from failtest_type to char. */
+static const char info_to_arg[] = "mceoxprwfal";
 
 /* Dummy call used for failtest_undo wrappers. */
 static struct failtest_call unrecorded_call;
 
+struct contents_saved {
+	size_t count;
+	off_t off;
+	off_t old_len;
+	char contents[1];
+};
+
+/* File contents, saved in this child only. */
+struct saved_mmapped_file {
+	struct saved_mmapped_file *next;
+	struct failtest_call *opener;
+	struct contents_saved *s;
+};
+
+static struct saved_mmapped_file *saved_mmapped_files;
+
 #if HAVE_BACKTRACE
 #include <execinfo.h>
 
@@ -131,6 +160,7 @@ static void **get_backtrace(unsigned int *num)
 #endif /* HAVE_BACKTRACE */
 
 static struct failtest_call *add_history_(enum failtest_call_type type,
+					  bool can_leak,
 					  const char *file,
 					  unsigned int line,
 					  const void *elem,
@@ -144,6 +174,7 @@ static struct failtest_call *add_history_(enum failtest_call_type type,
 
 	call = malloc(sizeof *call);
 	call->type = type;
+	call->can_leak = can_leak;
 	call->file = file;
 	call->line = line;
 	call->cleanup = NULL;
@@ -153,23 +184,32 @@ static struct failtest_call *add_history_(enum failtest_call_type type,
 	return call;
 }
 
-#define add_history(type, file, line, elem) \
-	add_history_((type), (file), (line), (elem), sizeof(*(elem)))
+#define add_history(type, can_leak, file, line, elem)		\
+	add_history_((type), (can_leak), (file), (line), (elem), sizeof(*(elem)))
 
 /* We do a fake call inside a sizeof(), to check types. */
 #define set_cleanup(call, clean, type)			\
-	(call)->cleanup = (void *)((void)sizeof(clean((type *)NULL),1), (clean))
-
+	(call)->cleanup = (void *)((void)sizeof(clean((type *)NULL, false),1), (clean))
 
 /* Dup the fd to a high value (out of the way I hope!), and close the old fd. */
 static int move_fd_to_high(int fd)
 {
 	int i;
+	struct rlimit lim;
+	int max;
+
+	if (getrlimit(RLIMIT_NOFILE, &lim) == 0) {
+		max = lim.rlim_cur;
+		printf("Max is %i\n", max);
+	} else
+		max = FD_SETSIZE;
 
-	for (i = FD_SETSIZE - 1; i >= 0; i--) {
+	for (i = max - 1; i > fd; i--) {
 		if (fcntl(i, F_GETFL) == -1 && errno == EBADF) {
-			if (dup2(fd, i) == -1)
-				err(1, "Failed to dup fd %i to %i", fd, i);
+			if (dup2(fd, i) == -1) {
+				warn("Failed to dup fd %i to %i", fd, i);
+				continue;
+			}
 			close(fd);
 			return i;
 		}
@@ -215,14 +255,14 @@ static char *failpath_string(void)
 	return ret;
 }
 
-static void warn_via_fd(int e, const char *fmt, va_list ap)
+static void do_warn(int e, const char *fmt, va_list ap)
 {
 	char *p = failpath_string();
 
-	vdprintf(warnfd, fmt, ap);
+	vfprintf(warnf, fmt, ap);
 	if (e != -1)
-		dprintf(warnfd, ": %s", strerror(e));
-	dprintf(warnfd, " [%s]\n", p);
+		fprintf(warnf, ": %s", strerror(e));
+	fprintf(warnf, " [%s]\n", p);
 	free(p);
 }
 
@@ -232,7 +272,7 @@ static void fwarn(const char *fmt, ...)
 	int e = errno;
 
 	va_start(ap, fmt);
-	warn_via_fd(e, fmt, ap);
+	do_warn(e, fmt, ap);
 	va_end(ap);
 }
 
@@ -242,7 +282,7 @@ static void fwarnx(const char *fmt, ...)
 	va_list ap;
 
 	va_start(ap, fmt);
-	warn_via_fd(-1, fmt, ap);
+	do_warn(-1, fmt, ap);
 	va_end(ap);
 }
 
@@ -268,16 +308,25 @@ static void child_fail(const char *out, size_t outlen, const char *fmt, ...)
 	exit(1);
 }
 
-static void trace(const char *fmt, ...)
+static void PRINTF_FMT(1, 2) trace(const char *fmt, ...)
 {
 	va_list ap;
+	unsigned int i;
+	char *p;
+	static int idx;
 
-	if (tracefd == -1)
+	if (!tracef)
 		return;
 
+	for (i = 0; i < traceindent; i++)
+		fprintf(tracef, "  ");
+
+	p = failpath_string();
+	fprintf(tracef, "%i: %u: %s ", idx++, getpid(), p);
 	va_start(ap, fmt);
-	vdprintf(tracefd, fmt, ap);
+	vfprintf(tracef, fmt, ap);
 	va_end(ap);
+	free(p);
 }
 
 static pid_t child;
@@ -302,6 +351,7 @@ static void release_locks(void)
 		fl.l_start = 0;
 		fl.l_len = 0;
 
+		trace("Releasing %u locks\n", lock_num);
 		for (i = 0; i < lock_num; i++)
 			fcntl(locks[i].fd, F_SETLK, &fl);
 	} else {
@@ -334,6 +384,7 @@ static void get_locks(void)
 	if (lock_owner != 0) {
 		enum info_type type = RELEASE_LOCKS;
 		assert(control_fd != -1);
+		trace("Asking parent to release locks\n");
 		write_all(control_fd, &type, sizeof(type));
 	}
 
@@ -350,96 +401,171 @@ static void get_locks(void)
 		if (fcntl(locks[i].fd, F_SETLKW, &fl) != 0)
 			abort();
 	}
+	trace("Acquired %u locks\n", lock_num);
 	lock_owner = getpid();
 }
 
-struct saved_file {
-	struct saved_file *next;
-	int fd;
-	void *contents;
-	off_t off, len;
-};
 
-static struct saved_file *save_file(struct saved_file *next, int fd)
-{
-	struct saved_file *s = malloc(sizeof(*s));
-
-	s->next = next;
-	s->fd = fd;
-	s->off = lseek(fd, 0, SEEK_CUR);
-	/* Special file?  Erk... */
-	assert(s->off != -1);
-	s->len = lseek(fd, 0, SEEK_END);
-	lseek(fd, 0, SEEK_SET);
-	s->contents = malloc(s->len);
-	if (read(fd, s->contents, s->len) != s->len)
-		err(1, "Failed to save %zu bytes", (size_t)s->len);
-	lseek(fd, s->off, SEEK_SET);
+static struct contents_saved *save_contents(const char *filename,
+					    int fd, size_t count, off_t off,
+					    const char *why)
+{
+	struct contents_saved *s = malloc(sizeof(*s) + count);
+	ssize_t ret;
+
+	s->off = off;
+
+	ret = pread(fd, s->contents, count, off);
+	if (ret < 0) {
+		fwarn("failtest_write: failed to save old contents!");
+		s->count = 0;
+	} else
+		s->count = ret;
+
+	/* Use lseek to get the size of file, but we have to restore
+	 * file offset */
+	off = lseek(fd, 0, SEEK_CUR);
+	s->old_len = lseek(fd, 0, SEEK_END);
+	lseek(fd, off, SEEK_SET);
+
+	trace("Saving %p %s %zu@%llu after %s (filelength %llu) via fd %i\n",
+	      s, filename, s->count, (long long)s->off, why,
+	      (long long)s->old_len, fd);
 	return s;
 }
-	
-/* We have little choice but to save and restore open files: mmap means we
- * can really intercept changes in the child.
- *
- * We could do non-mmap'ed files on demand, however. */
-static struct saved_file *save_files(void)
-{
-	struct saved_file *files = NULL;
-	struct failtest_call *i;
 
-	/* Figure out the set of live fds. */
-	tlist_for_each_rev(&history, i, list) {
-		if (i->type == FAILTEST_OPEN) {
-			int fd = i->u.open.ret;
-			/* Only do successful, writable fds. */
-			if (fd < 0)
-				continue;
+static void restore_contents(struct failtest_call *opener,
+			     struct contents_saved *s,
+			     bool restore_offset,
+			     const char *caller)
+{
+	int fd;
 
-			/* If it was closed, cleanup == NULL. */
-			if (!i->cleanup)
-				continue;
+	/* The top parent doesn't need to restore. */
+	if (control_fd == -1)
+		return;
 
-			if ((i->u.open.flags & O_RDWR) == O_RDWR) {
-				files = save_file(files, fd);
-			} else if ((i->u.open.flags & O_WRONLY)
-				   == O_WRONLY) {
-				/* FIXME: Handle O_WRONLY.  Open with O_RDWR? */
-				abort();
-			}
+	/* Has the fd been closed? */
+	if (opener->u.open.closed) {
+		/* Reopen, replace fd, close silently as we clean up. */
+		fd = open(opener->u.open.pathname, O_RDWR);
+		if (fd < 0) {
+			fwarn("failtest: could not reopen %s to clean up %s!",
+			      opener->u.open.pathname, caller);
+			return;
 		}
+		/* Make it clearly distinguisable from a "normal" fd. */
+		fd = move_fd_to_high(fd);
+		trace("Reopening %s to restore it (was fd %i, now %i)\n",
+		      opener->u.open.pathname, opener->u.open.ret, fd);
+		opener->u.open.ret = fd;
+		opener->u.open.closed = false;
+	}
+	fd = opener->u.open.ret;
+
+	trace("Restoring %p %s %zu@%llu after %s (filelength %llu) via fd %i\n",
+	      s, opener->u.open.pathname, s->count, (long long)s->off, caller,
+	      (long long)s->old_len, fd);
+	if (pwrite(fd, s->contents, s->count, s->off) != s->count) {
+		fwarn("failtest: write failed cleaning up %s for %s!",
+		      opener->u.open.pathname, caller);
 	}
 
-	return files;
+	if (ftruncate(fd, s->old_len) != 0) {
+		fwarn("failtest_write: truncate failed cleaning up %s for %s!",
+		      opener->u.open.pathname, caller);
+	}
+
+	if (restore_offset) {
+		trace("Restoring offset of fd %i to %llu\n",
+		      fd, (long long)s->off);
+		lseek(fd, s->off, SEEK_SET);
+	}
 }
 
-static void restore_files(struct saved_file *s)
+/* We save/restore most things on demand, but always do mmaped files. */
+static void save_mmapped_files(void)
 {
-	while (s) {
-		struct saved_file *next = s->next;
+	struct failtest_call *i;
+	trace("Saving mmapped files in child\n");
+
+	tlist_for_each_rev(&history, i, list) {
+		struct mmap_call *m = &i->u.mmap;
+		struct saved_mmapped_file *s;
+
+		if (i->type != FAILTEST_MMAP)
+			continue;
+
+		/* FIXME: We only handle mmapped files where fd is still open. */
+		if (m->opener->u.open.closed)
+			continue;
 
-		lseek(s->fd, 0, SEEK_SET);
-		if (write(s->fd, s->contents, s->len) != s->len)
-			err(1, "Failed to restore %zu bytes", (size_t)s->len);
-		if (ftruncate(s->fd, s->len) != 0)
-			err(1, "Failed to trim file to length %zu",
-			    (size_t)s->len);
-		free(s->contents);
-		lseek(s->fd, s->off, SEEK_SET);
-		free(s);
-		s = next;
+		s = malloc(sizeof *s);
+		s->s = save_contents(m->opener->u.open.pathname,
+				     m->fd, m->length, m->offset,
+				     "mmapped file before fork");
+		s->opener = m->opener;
+		s->next = saved_mmapped_files;
+		saved_mmapped_files = s;
 	}
 }
 
-static void free_files(struct saved_file *s)
+static void free_mmapped_files(bool restore)
 {
-	while (s) {
-		struct saved_file *next = s->next;
-		free(s->contents);
-		free(s);
-		s = next;
+	trace("%s mmapped files in child\n",
+	      restore ? "Restoring" : "Discarding");
+	while (saved_mmapped_files) {
+		struct saved_mmapped_file *next = saved_mmapped_files->next;
+		if (restore)
+			restore_contents(saved_mmapped_files->opener,
+					 saved_mmapped_files->s, false,
+					 "saved mmap");
+		free(saved_mmapped_files->s);
+		free(saved_mmapped_files);
+		saved_mmapped_files = next;
 	}
 }
 
+/* Returns a FAILTEST_OPEN, FAILTEST_PIPE or NULL. */
+static struct failtest_call *opener_of(int fd)
+{
+	struct failtest_call *i;
+
+	/* Don't get confused and match genuinely failed opens. */
+	if (fd < 0)
+		return NULL;
+
+	/* Figure out the set of live fds. */
+	tlist_for_each_rev(&history, i, list) {
+		if (i->fail)
+			continue;
+		switch (i->type) {
+		case FAILTEST_CLOSE:
+			if (i->u.close.fd == fd) {
+				return NULL;
+			}
+			break;
+		case FAILTEST_OPEN:
+			if (i->u.open.ret == fd) {
+				if (i->u.open.closed)
+					return NULL;
+				return i;
+			}
+			break;
+		case FAILTEST_PIPE:
+			if (i->u.pipe.fds[0] == fd || i->u.pipe.fds[1] == fd) {
+				return i;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* FIXME: socket, dup, etc are untracked! */
+	return NULL;
+}
+
 static void free_call(struct failtest_call *call)
 {
 	/* We don't do this in cleanup: needed even for failed opens. */
@@ -455,7 +581,7 @@ static void free_everything(void)
 {
 	struct failtest_call *i;
 
-	while ((i = tlist_top(&history, struct failtest_call, list)) != NULL)
+	while ((i = tlist_top(&history, list)) != NULL)
 		free_call(i);
 
 	failtable_clear(&failtable);
@@ -464,29 +590,38 @@ static void free_everything(void)
 static NORETURN void failtest_cleanup(bool forced_cleanup, int status)
 {
 	struct failtest_call *i;
+	bool restore = true;
 
 	/* For children, we don't care if they "failed" the testing. */
 	if (control_fd != -1)
 		status = 0;
-
-	if (forced_cleanup) {
-		/* We didn't actually do final operation: remove it. */
-		i = tlist_tail(&history, struct failtest_call, list);
-		free_call(i);
-	}
+	else
+		/* We don't restore contents for original parent. */
+		restore = false;
 
 	/* Cleanup everything, in reverse order. */
 	tlist_for_each_rev(&history, i, list) {
-		if (!i->cleanup)
+		/* Don't restore things our parent did. */
+		if (i == our_history_start)
+			restore = false;
+
+		if (i->fail)
 			continue;
-		if (!forced_cleanup) {
+
+		if (i->cleanup)
+			i->cleanup(&i->u, restore);
+
+		/* But their program shouldn't leak, even on failure. */
+		if (!forced_cleanup && i->can_leak) {
 			printf("Leak at %s:%u: --failpath=%s\n",
 			       i->file, i->line, failpath_string());
 			status = 1;
 		}
-		i->cleanup(&i->u);
 	}
 
+	/* Put back mmaped files the way our parent (if any) expects. */
+	free_mmapped_files(true);
+
 	free_everything();
 	if (status == 0)
 		tell_parent(SUCCESS);
@@ -495,6 +630,34 @@ static NORETURN void failtest_cleanup(bool forced_cleanup, int status)
 	exit(status);
 }
 
+static bool following_path(void)
+{
+	if (!failpath)
+		return false;
+	/* + means continue after end, like normal. */
+	if (*failpath == '+') {
+		failpath = NULL;
+		return false;
+	}
+	return true;
+}
+
+static bool follow_path(struct failtest_call *call)
+{
+	if (*failpath == '\0') {
+		/* Continue, but don't inject errors. */
+		return call->fail = false;
+	}
+
+	if (tolower((unsigned char)*failpath) != info_to_arg[call->type])
+		errx(1, "Failpath expected '%s' got '%c'\n",
+		     failpath, info_to_arg[call->type]);
+	call->fail = cisupper(*(failpath++));
+			if (call->fail)
+				call->can_leak = false;
+	return call->fail;
+}
+
 static bool should_fail(struct failtest_call *call)
 {
 	int status;
@@ -502,28 +665,13 @@ static bool should_fail(struct failtest_call *call)
 	enum info_type type = UNEXPECTED;
 	char *out = NULL;
 	size_t outlen = 0;
-	struct saved_file *files;
 	struct failtest_call *dup;
 
 	if (call == &unrecorded_call)
 		return false;
 
-	if (failpath) {
-		/* + means continue after end, like normal. */
-		if (*failpath == '+')
-			failpath = NULL;
-		else if (*failpath == '\0') {
-			/* Continue, but don't inject errors. */
-			return call->fail = false;
-		} else {
-			if (tolower((unsigned char)*failpath)
-			    != info_to_arg[call->type])
-				errx(1, "Failpath expected '%s' got '%c'\n",
-				     failpath, info_to_arg[call->type]);
-			call->fail = cisupper(*(failpath++));
-			return call->fail;
-		}
-	}
+	if (following_path())
+		return follow_path(call);
 
 	/* Attach debugger if they asked for it. */
 	if (debugpath) {
@@ -556,13 +704,19 @@ static bool should_fail(struct failtest_call *call)
 	}
 
 	/* Are we probing?  If so, we never fail twice. */
-	if (probing)
+	if (probing) {
+		trace("Not failing %c due to FAIL_PROBE return\n",
+		      info_to_arg[call->type]);
 		return call->fail = false;
+	}
 
-	/* Don't more than once in the same place. */
+	/* Don't fail more than once in the same place. */
 	dup = failtable_get(&failtable, call);
-	if (dup)
+	if (dup) {
+		trace("Not failing %c due to duplicate\n",
+		      info_to_arg[call->type]);
 		return call->fail = false;
+	}
 
 	if (failtest_hook) {
 		switch (failtest_hook(&history)) {
@@ -572,6 +726,8 @@ static bool should_fail(struct failtest_call *call)
 			probing = true;
 			break;
 		case FAIL_DONT_FAIL:
+			trace("Not failing %c due to failhook return\n",
+			      info_to_arg[call->type]);
 			call->fail = false;
 			return false;
 		default:
@@ -582,49 +738,71 @@ static bool should_fail(struct failtest_call *call)
 	/* Add it to our table of calls. */
 	failtable_add(&failtable, call);
 
-	files = save_files();
-
 	/* We're going to fail in the child. */
 	call->fail = true;
 	if (pipe(control) != 0 || pipe(output) != 0)
 		err(1, "opening pipe");
 
+	/* Move out the way, to high fds. */
+	control[0] = move_fd_to_high(control[0]);
+	control[1] = move_fd_to_high(control[1]);
+	output[0] = move_fd_to_high(output[0]);
+	output[1] = move_fd_to_high(output[1]);
+
 	/* Prevent double-printing (in child and parent) */
 	fflush(stdout);
+	fflush(warnf);
+	if (tracef)
+		fflush(tracef);
 	child = fork();
 	if (child == -1)
 		err(1, "forking failed");
 
 	if (child == 0) {
-		if (tracefd != -1) {
+		traceindent++;
+		if (tracef) {
 			struct timeval diff;
 			const char *p;
 			char *failpath;
 			struct failtest_call *c;
 
-			c = tlist_tail(&history, struct failtest_call, list);
+			c = tlist_tail(&history, list);
 			diff = time_sub(time_now(), start);
 			failpath = failpath_string();
-			trace("%u->%u (%u.%02u): %s (", getppid(), getpid(),
-			      (int)diff.tv_sec, (int)diff.tv_usec / 10000,
-			      failpath);
-			free(failpath);
 			p = strrchr(c->file, '/');
 			if (p)
-				trace("%s", p+1);
+				p++;
 			else
-				trace("%s", c->file);
-			trace(":%u)\n", c->line);
+				p = c->file;
+			trace("%u->%u (%u.%02u): %s (%s:%u)\n",
+			      getppid(), getpid(),
+			      (int)diff.tv_sec, (int)diff.tv_usec / 10000,
+			      failpath, p, c->line);
+			free(failpath);
 		}
+		/* From here on, we have to clean up! */
+		our_history_start = tlist_tail(&history, list);
 		close(control[0]);
 		close(output[0]);
-		dup2(output[1], STDOUT_FILENO);
-		dup2(output[1], STDERR_FILENO);
-		if (output[1] != STDOUT_FILENO && output[1] != STDERR_FILENO)
-			close(output[1]);
+		/* Don't swallow stderr if we're tracing. */
+		if (!tracef) {
+			dup2(output[1], STDOUT_FILENO);
+			dup2(output[1], STDERR_FILENO);
+			if (output[1] != STDOUT_FILENO
+			    && output[1] != STDERR_FILENO)
+				close(output[1]);
+		}
 		control_fd = move_fd_to_high(control[1]);
-		/* Valgrind spots the leak if we don't free these. */
-		free_files(files);
+
+		/* Forget any of our parent's saved files. */
+		free_mmapped_files(false);
+
+		/* Now, save any files we need to. */
+		save_mmapped_files();
+
+		/* Failed calls can't leak. */
+		call->can_leak = false;
+
 		return true;
 	}
 
@@ -701,8 +879,6 @@ static bool should_fail(struct failtest_call *call)
 	free(out);
 	signal(SIGUSR1, SIG_DFL);
 
-	restore_files(files);
-
 	/* Only child does probe. */
 	probing = false;
 
@@ -711,8 +887,9 @@ static bool should_fail(struct failtest_call *call)
 	return false;
 }
 
-static void cleanup_calloc(struct calloc_call *call)
+static void cleanup_calloc(struct calloc_call *call, bool restore)
 {
+	trace("undoing calloc %p\n", call->ret);
 	free(call->ret);
 }
 
@@ -723,7 +900,7 @@ void *failtest_calloc(size_t nmemb, size_t size,
 	struct calloc_call call;
 	call.nmemb = nmemb;
 	call.size = size;
-	p = add_history(FAILTEST_CALLOC, file, line, &call);
+	p = add_history(FAILTEST_CALLOC, true, file, line, &call);
 
 	if (should_fail(p)) {
 		p->u.calloc.ret = NULL;
@@ -732,12 +909,15 @@ void *failtest_calloc(size_t nmemb, size_t size,
 		p->u.calloc.ret = calloc(nmemb, size);
 		set_cleanup(p, cleanup_calloc, struct calloc_call);
 	}
+	trace("calloc %zu x %zu %s:%u -> %p\n",
+	      nmemb, size, file, line, p->u.calloc.ret);
 	errno = p->error;
 	return p->u.calloc.ret;
 }
 
-static void cleanup_malloc(struct malloc_call *call)
+static void cleanup_malloc(struct malloc_call *call, bool restore)
 {
+	trace("undoing malloc %p\n", call->ret);
 	free(call->ret);
 }
 
@@ -747,7 +927,7 @@ void *failtest_malloc(size_t size, const char *file, unsigned line)
 	struct malloc_call call;
 	call.size = size;
 
-	p = add_history(FAILTEST_MALLOC, file, line, &call);
+	p = add_history(FAILTEST_MALLOC, true, file, line, &call);
 	if (should_fail(p)) {
 		p->u.malloc.ret = NULL;
 		p->error = ENOMEM;
@@ -755,17 +935,20 @@ void *failtest_malloc(size_t size, const char *file, unsigned line)
 		p->u.malloc.ret = malloc(size);
 		set_cleanup(p, cleanup_malloc, struct malloc_call);
 	}
+	trace("malloc %zu %s:%u -> %p\n",
+	      size, file, line, p->u.malloc.ret);
 	errno = p->error;
 	return p->u.malloc.ret;
 }
 
-static void cleanup_realloc(struct realloc_call *call)
+static void cleanup_realloc(struct realloc_call *call, bool restore)
 {
+	trace("undoing realloc %p\n", call->ret);
 	free(call->ret);
 }
 
 /* Walk back and find out if we got this ptr from a previous routine. */
-static void fixup_ptr_history(void *ptr)
+static void fixup_ptr_history(void *ptr, const char *why)
 {
 	struct failtest_call *i;
 
@@ -774,19 +957,28 @@ static void fixup_ptr_history(void *ptr)
 		switch (i->type) {
 		case FAILTEST_REALLOC:
 			if (i->u.realloc.ret == ptr) {
+				trace("found realloc %p %s:%u matching %s\n",
+				      ptr, i->file, i->line, why);
 				i->cleanup = NULL;
+				i->can_leak = false;
 				return;
 			}
 			break;
 		case FAILTEST_MALLOC:
 			if (i->u.malloc.ret == ptr) {
+				trace("found malloc %p %s:%u matching %s\n",
+				      ptr, i->file, i->line, why);
 				i->cleanup = NULL;
+				i->can_leak = false;
 				return;
 			}
 			break;
 		case FAILTEST_CALLOC:
 			if (i->u.calloc.ret == ptr) {
+				trace("found calloc %p %s:%u matching %s\n",
+				      ptr, i->file, i->line, why);
 				i->cleanup = NULL;
+				i->can_leak = false;
 				return;
 			}
 			break;
@@ -794,6 +986,7 @@ static void fixup_ptr_history(void *ptr)
 			break;
 		}
 	}
+	trace("Did not find %p matching %s\n", ptr, why);
 }
 
 void *failtest_realloc(void *ptr, size_t size, const char *file, unsigned line)
@@ -801,7 +994,7 @@ void *failtest_realloc(void *ptr, size_t size, const char *file, unsigned line)
 	struct failtest_call *p;
 	struct realloc_call call;
 	call.size = size;
-	p = add_history(FAILTEST_REALLOC, file, line, &call);
+	p = add_history(FAILTEST_REALLOC, true, file, line, &call);
 
 	/* FIXME: Try one child moving allocation, one not. */
 	if (should_fail(p)) {
@@ -810,23 +1003,72 @@ void *failtest_realloc(void *ptr, size_t size, const char *file, unsigned line)
 	} else {
 		/* Don't catch this one in the history fixup... */
 		p->u.realloc.ret = NULL;
-		fixup_ptr_history(ptr);
+		fixup_ptr_history(ptr, "realloc");
 		p->u.realloc.ret = realloc(ptr, size);
 		set_cleanup(p, cleanup_realloc, struct realloc_call);
 	}
+	trace("realloc %p %s:%u -> %p\n",
+	      ptr, file, line, p->u.realloc.ret);
 	errno = p->error;
 	return p->u.realloc.ret;
 }
 
+/* FIXME: Record free, so we can terminate fixup_ptr_history correctly.
+ * If there's an alloc we don't see, it could get confusing if it matches
+ * a previous allocation we did see. */
 void failtest_free(void *ptr)
 {
-	fixup_ptr_history(ptr);
+	fixup_ptr_history(ptr, "free");
+	trace("free %p\n", ptr);
 	free(ptr);
 }
 
-static void cleanup_open(struct open_call *call)
+
+static struct contents_saved *save_file(const char *pathname)
+{
+	int fd;
+	struct contents_saved *s;
+
+	fd = open(pathname, O_RDONLY);
+	if (fd < 0)
+		return NULL;
+
+	s = save_contents(pathname, fd, lseek(fd, 0, SEEK_END), 0,
+			  "open with O_TRUNC");
+	close(fd);
+	return s;
+}
+
+/* Optimization: don't create a child for an open which *we know*
+ * would fail anyway. */
+static bool open_would_fail(const char *pathname, int flags)
+{
+	if ((flags & O_ACCMODE) == O_RDONLY)
+		return access(pathname, R_OK) != 0;
+	if (!(flags & O_CREAT)) {
+		if ((flags & O_ACCMODE) == O_WRONLY)
+			return access(pathname, W_OK) != 0;
+		if ((flags & O_ACCMODE) == O_RDWR)
+			return access(pathname, W_OK) != 0
+				|| access(pathname, R_OK) != 0;
+	}
+	/* FIXME: We could check if it exists, for O_CREAT|O_EXCL */
+	return false;
+}
+
+static void cleanup_open(struct open_call *call, bool restore)
 {
-	close(call->ret);
+	if (restore && call->saved)
+		restore_contents(container_of(call, struct failtest_call,
+					      u.open),
+				 call->saved, false, "open with O_TRUNC");
+	if (!call->closed) {
+		trace("Cleaning up open %s by closing fd %i\n",
+		      call->pathname, call->ret);
+		close(call->ret);
+		call->closed = true;
+	}
+	free(call->saved);
 }
 
 int failtest_open(const char *pathname,
@@ -839,31 +1081,55 @@ int failtest_open(const char *pathname,
 	call.pathname = strdup(pathname);
 	va_start(ap, line);
 	call.flags = va_arg(ap, int);
+	call.always_save = false;
+	call.closed = false;
 	if (call.flags & O_CREAT) {
 		call.mode = va_arg(ap, int);
 		va_end(ap);
 	}
-	p = add_history(FAILTEST_OPEN, file, line, &call);
+	p = add_history(FAILTEST_OPEN, true, file, line, &call);
 	/* Avoid memory leak! */
 	if (p == &unrecorded_call)
 		free((char *)call.pathname);
-	p->u.open.ret = open(pathname, call.flags, call.mode);
 
-	if (p->u.open.ret == -1) {
-		p->fail = false;
-		p->error = errno;
-	} else if (should_fail(p)) {
-		close(p->u.open.ret);
+	if (should_fail(p)) {
+		/* Don't bother inserting failures that would happen anyway. */
+		if (open_would_fail(pathname, call.flags)) {
+			trace("Open would have failed anyway: stopping\n");
+			failtest_cleanup(true, 0);
+		}
 		p->u.open.ret = -1;
 		/* FIXME: Play with error codes? */
 		p->error = EACCES;
 	} else {
-		set_cleanup(p, cleanup_open, struct open_call);
+		/* Save the old version if they're truncating it. */
+		if (call.flags & O_TRUNC)
+			p->u.open.saved = save_file(pathname);
+		else
+			p->u.open.saved = NULL;
+		p->u.open.ret = open(pathname, call.flags, call.mode);
+		if (p->u.open.ret == -1) {
+			p->u.open.closed = true;
+			p->can_leak = false;
+		} else {
+			set_cleanup(p, cleanup_open, struct open_call);
+		}
 	}
+	trace("open %s %s:%u -> %i (opener %p)\n",
+	      pathname, file, line, p->u.open.ret, &p->u.open);
 	errno = p->error;
 	return p->u.open.ret;
 }
 
+static void cleanup_mmap(struct mmap_call *mmap, bool restore)
+{
+	trace("cleaning up mmap @%p (opener %p)\n",
+	      mmap->ret, mmap->opener);
+	if (restore)
+		restore_contents(mmap->opener, mmap->saved, false, "mmap");
+	free(mmap->saved);
+}
+
 void *failtest_mmap(void *addr, size_t length, int prot, int flags,
 		    int fd, off_t offset, const char *file, unsigned line)
 {
@@ -876,20 +1142,55 @@ void *failtest_mmap(void *addr, size_t length, int prot, int flags,
 	call.flags = flags;
 	call.offset = offset;
 	call.fd = fd;
+	call.opener = opener_of(fd);
+
+	/* If we don't know what file it was, don't fail. */
+	if (!call.opener) {
+		if (fd != -1) {
+			fwarnx("failtest_mmap: couldn't figure out source for"
+			       " fd %i at %s:%u", fd, file, line);
+		}
+		addr = mmap(addr, length, prot, flags, fd, offset);
+		trace("mmap of fd %i -> %p (opener = NULL)\n", fd, addr);
+		return addr;
+	}
 
-	p = add_history(FAILTEST_MMAP, file, line, &call);
+	p = add_history(FAILTEST_MMAP, false, file, line, &call);
 	if (should_fail(p)) {
 		p->u.mmap.ret = MAP_FAILED;
 		p->error = ENOMEM;
 	} else {
 		p->u.mmap.ret = mmap(addr, length, prot, flags, fd, offset);
+		/* Save contents if we're writing to a normal file */
+		if (p->u.mmap.ret != MAP_FAILED
+		    && (prot & PROT_WRITE)
+		    && call.opener->type == FAILTEST_OPEN) {
+			const char *fname = call.opener->u.open.pathname;
+			p->u.mmap.saved = save_contents(fname, fd, length,
+							offset, "being mmapped");
+			set_cleanup(p, cleanup_mmap, struct mmap_call);
+		}
 	}
+	trace("mmap of fd %i %s:%u -> %p (opener = %p)\n",
+	      fd, file, line, addr, call.opener);
 	errno = p->error;
 	return p->u.mmap.ret;
 }
 
-static void cleanup_pipe(struct pipe_call *call)
+/* Since OpenBSD can't handle adding args, we use this file and line.
+ * This will make all mmaps look the same, reducing coverage. */
+void *failtest_mmap_noloc(void *addr, size_t length, int prot, int flags,
+			  int fd, off_t offset)
+{
+	return failtest_mmap(addr, length, prot, flags, fd, offset,
+			     __FILE__, __LINE__);
+}
+
+static void cleanup_pipe(struct pipe_call *call, bool restore)
 {
+	trace("cleaning up pipe fd=%i%s,%i%s\n",
+	      call->fds[0], call->closed[0] ? "(already closed)" : "",
+	      call->fds[1], call->closed[1] ? "(already closed)" : "");
 	if (!call->closed[0])
 		close(call->fds[0]);
 	if (!call->closed[1])
@@ -901,7 +1202,7 @@ int failtest_pipe(int pipefd[2], const char *file, unsigned line)
 	struct failtest_call *p;
 	struct pipe_call call;
 
-	p = add_history(FAILTEST_PIPE, file, line, &call);
+	p = add_history(FAILTEST_PIPE, true, file, line, &call);
 	if (should_fail(p)) {
 		p->u.open.ret = -1;
 		/* FIXME: Play with error codes? */
@@ -911,14 +1212,32 @@ int failtest_pipe(int pipefd[2], const char *file, unsigned line)
 		p->u.pipe.closed[0] = p->u.pipe.closed[1] = false;
 		set_cleanup(p, cleanup_pipe, struct pipe_call);
 	}
+
+	trace("pipe %s:%u -> %i,%i\n", file, line,
+	      p->u.pipe.ret ? -1 : p->u.pipe.fds[0],
+	      p->u.pipe.ret ? -1 : p->u.pipe.fds[1]);
+
 	/* This causes valgrind to notice if they use pipefd[] after failure */
 	memcpy(pipefd, p->u.pipe.fds, sizeof(p->u.pipe.fds));
 	errno = p->error;
 	return p->u.pipe.ret;
 }
 
-ssize_t failtest_pread(int fd, void *buf, size_t count, off_t off,
-		       const char *file, unsigned line)
+static void cleanup_read(struct read_call *call, bool restore)
+{
+	if (restore) {
+		trace("cleaning up read on fd %i: seeking to %llu\n",
+		      call->fd, (long long)call->off);
+
+		/* Read (not readv!) moves file offset! */
+		if (lseek(call->fd, call->off, SEEK_SET) != call->off) {
+			fwarn("Restoring lseek pointer failed (read)");
+		}
+	}
+}
+
+static ssize_t failtest_add_read(int fd, void *buf, size_t count, off_t off,
+				 bool is_pread, const char *file, unsigned line)
 {
 	struct failtest_call *p;
 	struct read_call call;
@@ -926,21 +1245,41 @@ ssize_t failtest_pread(int fd, void *buf, size_t count, off_t off,
 	call.buf = buf;
 	call.count = count;
 	call.off = off;
-	p = add_history(FAILTEST_READ, file, line, &call);
+	p = add_history(FAILTEST_READ, false, file, line, &call);
 
 	/* FIXME: Try partial read returns. */
 	if (should_fail(p)) {
 		p->u.read.ret = -1;
 		p->error = EIO;
 	} else {
-		p->u.read.ret = pread(fd, buf, count, off);
+		if (is_pread)
+			p->u.read.ret = pread(fd, buf, count, off);
+		else {
+			p->u.read.ret = read(fd, buf, count);
+			if (p->u.read.ret != -1)
+				set_cleanup(p, cleanup_read, struct read_call);
+		}
 	}
+	trace("%sread %s:%u fd %i %zu@%llu -> %i\n",
+	      is_pread ? "p" : "", file, line, fd, count, (long long)off,
+	      p->u.read.ret);
 	errno = p->error;
 	return p->u.read.ret;
 }
 
-ssize_t failtest_pwrite(int fd, const void *buf, size_t count, off_t off,
-			const char *file, unsigned line)
+static void cleanup_write(struct write_call *write, bool restore)
+{
+	trace("cleaning up write on %s\n", write->opener->u.open.pathname);
+	if (restore)
+		restore_contents(write->opener, write->saved, !write->is_pwrite,
+				 "write");
+	free(write->saved);
+}
+
+static ssize_t failtest_add_write(int fd, const void *buf,
+				  size_t count, off_t off,
+				  bool is_pwrite,
+				  const char *file, unsigned line)
 {
 	struct failtest_call *p;
 	struct write_call call;
@@ -949,7 +1288,9 @@ ssize_t failtest_pwrite(int fd, const void *buf, size_t count, off_t off,
 	call.buf = buf;
 	call.count = count;
 	call.off = off;
-	p = add_history(FAILTEST_WRITE, file, line, &call);
+	call.is_pwrite = is_pwrite;
+	call.opener = opener_of(fd);
+	p = add_history(FAILTEST_WRITE, false, file, line, &call);
 
 	/* If we're a child, we need to make sure we write the same thing
 	 * to non-files as the parent does, so tell it. */
@@ -966,8 +1307,19 @@ ssize_t failtest_pwrite(int fd, const void *buf, size_t count, off_t off,
 		p->u.write.ret = -1;
 		p->error = EIO;
 	} else {
+		bool is_file;
+		assert(call.opener == p->u.write.opener);
+
+		if (p->u.write.opener) {
+			is_file = (p->u.write.opener->type == FAILTEST_OPEN);
+		} else {
+			/* We can't unwind it, so at least check same
+			 * in parent and child. */
+			is_file = false;
+		}
+
 		/* FIXME: We assume same write order in parent and child */
-		if (off == (off_t)-1 && child_writes_num != 0) {
+		if (!is_file && child_writes_num != 0) {
 			if (child_writes[0].fd != fd)
 				errx(1, "Child wrote to fd %u, not %u?",
 				     child_writes[0].fd, fd);
@@ -988,32 +1340,59 @@ ssize_t failtest_pwrite(int fd, const void *buf, size_t count, off_t off,
 			memmove(&child_writes[0], &child_writes[1],
 				sizeof(child_writes[0]) * child_writes_num);
 
-			/* Is this is a socket or pipe, child wrote it
-			   already. */
-			if (p->u.write.off == (off_t)-1) {
-				p->u.write.ret = count;
-				errno = p->error;
-				return p->u.write.ret;
-			}
+			/* Child wrote it already. */
+			trace("write %s:%i on fd %i already done by child\n",
+			      file, line, fd);
+			p->u.write.ret = count;
+			errno = p->error;
+			return p->u.write.ret;
 		}
-		p->u.write.ret = pwrite(fd, buf, count, off);
+
+		if (is_file) {
+			p->u.write.saved = save_contents(call.opener->u.open.pathname,
+							 fd, count, off,
+							 "being overwritten");
+			set_cleanup(p, cleanup_write, struct write_call);
+		}
+
+		/* Though off is current seek ptr for write case, we need to
+		 * move it.  write() does that for us. */
+		if (p->u.write.is_pwrite)
+			p->u.write.ret = pwrite(fd, buf, count, off);
+		else
+			p->u.write.ret = write(fd, buf, count);
 	}
+	trace("%swrite %s:%i %zu@%llu on fd %i -> %i\n",
+	      p->u.write.is_pwrite ? "p" : "",
+	      file, line, count, (long long)off, fd, p->u.write.ret);
 	errno = p->error;
 	return p->u.write.ret;
 }
 
-ssize_t failtest_read(int fd, void *buf, size_t count,
-		      const char *file, unsigned line)
+ssize_t failtest_pwrite(int fd, const void *buf, size_t count, off_t offset,
+			const char *file, unsigned line)
 {
-	return failtest_pread(fd, buf, count, lseek(fd, 0, SEEK_CUR),
-			      file, line);
+	return failtest_add_write(fd, buf, count, offset, true, file, line);
 }
 
 ssize_t failtest_write(int fd, const void *buf, size_t count,
 		       const char *file, unsigned line)
 {
-	return failtest_pwrite(fd, buf, count, lseek(fd, 0, SEEK_CUR),
-			       file, line);
+	return failtest_add_write(fd, buf, count, lseek(fd, 0, SEEK_CUR), false,
+				  file, line);
+}
+
+ssize_t failtest_pread(int fd, void *buf, size_t count, off_t off,
+		       const char *file, unsigned line)
+{
+	return failtest_add_read(fd, buf, count, off, true, file, line);
+}
+
+ssize_t failtest_read(int fd, void *buf, size_t count,
+		      const char *file, unsigned line)
+{
+	return failtest_add_read(fd, buf, count, lseek(fd, 0, SEEK_CUR), false,
+				 file, line);
 }
 
 static struct lock_info *WARN_UNUSED_RESULT
@@ -1046,23 +1425,38 @@ add_lock(struct lock_info *locks, int fd, off_t start, off_t end, int type)
 			off_t new_start, new_end;
 			new_start = end + 1;
 			new_end = l->end;
+			trace("splitting lock on fd %i from %llu-%llu"
+			      " to %llu-%llu\n",
+			      fd, (long long)l->start, (long long)l->end,
+			      (long long)l->start, (long long)start - 1);
 			l->end = start - 1;
 			locks = add_lock(locks,
 					 fd, new_start, new_end, l->type);
 			l = &locks[i];
 		} else if (start <= l->start && end >= l->end) {
 			/* Total overlap: eliminate entry. */
+			trace("erasing lock on fd %i %llu-%llu\n",
+			      fd, (long long)l->start, (long long)l->end);
 			l->end = 0;
 			l->start = 1;
 		} else if (end >= l->start && end < l->end) {
+			trace("trimming lock on fd %i from %llu-%llu"
+			      " to %llu-%llu\n",
+			      fd, (long long)l->start, (long long)l->end,
+			      (long long)end + 1, (long long)l->end);
 			/* Start overlap: trim entry. */
 			l->start = end + 1;
 		} else if (start > l->start && start <= l->end) {
+			trace("trimming lock on fd %i from %llu-%llu"
+			      " to %llu-%llu\n",
+			      fd, (long long)l->start, (long long)l->end,
+			      (long long)l->start, (long long)start - 1);
 			/* End overlap: trim entry. */
 			l->end = start-1;
 		}
 		/* Nothing left?  Remove it. */
 		if (l->end < l->start) {
+			trace("forgetting lock on fd %i\n", fd);
 			memmove(l, l + 1, (--lock_num - i) * sizeof(l[0]));
 			i--;
 		}
@@ -1075,6 +1469,8 @@ add_lock(struct lock_info *locks, int fd, off_t start, off_t end, int type)
 		l->start = start;
 		l->end = end;
 		l->type = type;
+		trace("new lock on fd %i %llu-%llu\n",
+		      fd, (long long)l->start, (long long)l->end);
 	}
 	return locks;
 }
@@ -1082,56 +1478,87 @@ add_lock(struct lock_info *locks, int fd, off_t start, off_t end, int type)
 /* We trap this so we can record it: we don't fail it. */
 int failtest_close(int fd, const char *file, unsigned line)
 {
-	struct failtest_call *i;
 	struct close_call call;
-	struct failtest_call *p;
+	struct failtest_call *p, *opener;
+
+	/* Do this before we add ourselves to history! */
+	opener = opener_of(fd);
 
 	call.fd = fd;
-	p = add_history(FAILTEST_CLOSE, file, line, &call);
+	p = add_history(FAILTEST_CLOSE, false, file, line, &call);
 	p->fail = false;
 
-	/* Consume close from failpath. */
-	if (failpath)
-		if (should_fail(p))
+	/* Consume close from failpath (shouldn't tell us to fail). */
+	if (following_path()) {
+		if (follow_path(p))
 			abort();
+	}
 
+	trace("close on fd %i\n", fd);
 	if (fd < 0)
 		return close(fd);
 
-	/* Trace history to find source of fd. */
-	tlist_for_each_rev(&history, i, list) {
-		switch (i->type) {
-		case FAILTEST_PIPE:
+	/* Mark opener as not leaking, remove its cleanup function. */
+	if (opener) {
+		trace("close on fd %i found opener %p\n", fd, opener);
+		if (opener->type == FAILTEST_PIPE) {
 			/* From a pipe? */
-			if (i->u.pipe.fds[0] == fd) {
-				assert(!i->u.pipe.closed[0]);
-				i->u.pipe.closed[0] = true;
-				if (i->u.pipe.closed[1])
-					i->cleanup = NULL;
-				goto out;
-			}
-			if (i->u.pipe.fds[1] == fd) {
-				assert(!i->u.pipe.closed[1]);
-				i->u.pipe.closed[1] = true;
-				if (i->u.pipe.closed[0])
-					i->cleanup = NULL;
-				goto out;
-			}
-			break;
-		case FAILTEST_OPEN:
-			if (i->u.open.ret == fd) {
-				assert((void *)i->cleanup
-				       == (void *)cleanup_open);
+			if (opener->u.pipe.fds[0] == fd) {
+				assert(!opener->u.pipe.closed[0]);
+				opener->u.pipe.closed[0] = true;
+			} else if (opener->u.pipe.fds[1] == fd) {
+				assert(!opener->u.pipe.closed[1]);
+				opener->u.pipe.closed[1] = true;
+			} else
+				abort();
+			opener->can_leak = (!opener->u.pipe.closed[0]
+					    || !opener->u.pipe.closed[1]);
+		} else if (opener->type == FAILTEST_OPEN) {
+			opener->u.open.closed = true;
+			opener->can_leak = false;
+		} else
+			abort();
+	}
+
+	/* Restore offset now, in case parent shared (can't do after close!). */
+	if (control_fd != -1) {
+		struct failtest_call *i;
+
+		tlist_for_each_rev(&history, i, list) {
+			if (i == our_history_start)
+				break;
+			if (i == opener)
+				break;
+			if (i->type == FAILTEST_LSEEK && i->u.lseek.fd == fd) {
+				trace("close on fd %i undoes lseek\n", fd);
+				/* This seeks back. */
+				i->cleanup(&i->u, true);
 				i->cleanup = NULL;
-				goto out;
+			} else if (i->type == FAILTEST_WRITE
+				   && i->u.write.fd == fd
+				   && !i->u.write.is_pwrite) {
+				trace("close on fd %i undoes write"
+				      " offset change\n", fd);
+				/* Write (not pwrite!) moves file offset! */
+				if (lseek(fd, i->u.write.off, SEEK_SET)
+				    != i->u.write.off) {
+					fwarn("Restoring lseek pointer failed (write)");
+				}
+			} else if (i->type == FAILTEST_READ
+				   && i->u.read.fd == fd) {
+				/* preads don't *have* cleanups */
+				if (i->cleanup) {
+					trace("close on fd %i undoes read"
+					      " offset change\n", fd);
+					/* This seeks back. */
+					i->cleanup(&i->u, true);
+					i->cleanup = NULL;
+				}
 			}
-			break;
-		default:
-			break;
 		}
 	}
 
-out:
+	/* Close unlocks everything. */
 	locks = add_lock(locks, fd, 0, off_max(), F_UNLCK);
 	return close(fd);
 }
@@ -1161,11 +1588,14 @@ int failtest_fcntl(int fd, const char *file, unsigned line, int cmd, ...)
 		va_start(ap, cmd);
 		call.arg.l = va_arg(ap, long);
 		va_end(ap);
+		trace("fcntl on fd %i F_SETFL/F_SETFD\n", fd);
 		return fcntl(fd, cmd, call.arg.l);
 	case F_GETFD:
 	case F_GETFL:
+		trace("fcntl on fd %i F_GETFL/F_GETFD\n", fd);
 		return fcntl(fd, cmd);
 	case F_GETLK:
+		trace("fcntl on fd %i F_GETLK\n", fd);
 		get_locks();
 		va_start(ap, cmd);
 		call.arg.fl = *va_arg(ap, struct flock *);
@@ -1173,6 +1603,8 @@ int failtest_fcntl(int fd, const char *file, unsigned line, int cmd, ...)
 		return fcntl(fd, cmd, &call.arg.fl);
 	case F_SETLK:
 	case F_SETLKW:
+		trace("fcntl on fd %i F_SETLK%s\n",
+		      fd, cmd == F_SETLKW ? "W" : "");
 		va_start(ap, cmd);
 		call.arg.fl = *va_arg(ap, struct flock *);
 		va_end(ap);
@@ -1182,7 +1614,7 @@ int failtest_fcntl(int fd, const char *file, unsigned line, int cmd, ...)
 		err(1, "failtest: unknown fcntl %u", cmd);
 	}
 
-	p = add_history(FAILTEST_FCNTL, file, line, &call);
+	p = add_history(FAILTEST_FCNTL, false, file, line, &call);
 
 	if (should_fail(p)) {
 		p->u.fcntl.ret = -1;
@@ -1207,10 +1639,54 @@ int failtest_fcntl(int fd, const char *file, unsigned line, int cmd, ...)
 					 p->u.fcntl.arg.fl.l_type);
 		}
 	}
+	trace("fcntl on fd %i -> %i\n", fd, p->u.fcntl.ret);
 	errno = p->error;
 	return p->u.fcntl.ret;
 }
 
+static void cleanup_lseek(struct lseek_call *call, bool restore)
+{
+	if (restore) {
+		trace("cleaning up lseek on fd %i -> %llu\n",
+		      call->fd, (long long)call->old_off);
+		if (lseek(call->fd, call->old_off, SEEK_SET) != call->old_off)
+			fwarn("Restoring lseek pointer failed");
+	}
+}
+
+/* We trap this so we can undo it: we don't fail it. */
+off_t failtest_lseek(int fd, off_t offset, int whence, const char *file,
+		     unsigned int line)
+{
+	struct failtest_call *p;
+	struct lseek_call call;
+	call.fd = fd;
+	call.offset = offset;
+	call.whence = whence;
+	call.old_off = lseek(fd, 0, SEEK_CUR);
+
+	p = add_history(FAILTEST_LSEEK, false, file, line, &call);
+	p->fail = false;
+
+	/* Consume lseek from failpath. */
+	if (failpath)
+		if (should_fail(p))
+			abort();
+
+	p->u.lseek.ret = lseek(fd, offset, whence);
+
+	if (p->u.lseek.ret != (off_t)-1)
+		set_cleanup(p, cleanup_lseek, struct lseek_call);
+
+	trace("lseek %s:%u on fd %i from %llu to %llu%s\n",
+	      file, line, fd, (long long)call.old_off, (long long)offset,
+	      whence == SEEK_CUR ? " (from current off)" :
+	      whence == SEEK_END ? " (from end)" :
+	      whence == SEEK_SET ? "" : " (invalid whence)");
+	return p->u.lseek.ret;
+}
+
+
 pid_t failtest_getpid(const char *file, unsigned line)
 {
 	/* You must call failtest_init first! */
@@ -1224,12 +1700,12 @@ void failtest_init(int argc, char *argv[])
 
 	orig_pid = getpid();
 
-	warnfd = move_fd_to_high(dup(STDERR_FILENO));
+	warnf = fdopen(move_fd_to_high(dup(STDERR_FILENO)), "w");
 	for (i = 1; i < argc; i++) {
 		if (!strncmp(argv[i], "--failpath=", strlen("--failpath="))) {
 			failpath = argv[i] + strlen("--failpath=");
-		} else if (strcmp(argv[i], "--tracepath") == 0) {
-			tracefd = warnfd;
+		} else if (strcmp(argv[i], "--trace") == 0) {
+			tracef = warnf;
 			failtest_timeout_ms = -1;
 		} else if (!strncmp(argv[i], "--debugpath=",
 				    strlen("--debugpath="))) {
@@ -1247,6 +1723,7 @@ bool failtest_has_failed(void)
 
 void failtest_exit(int status)
 {
+	trace("failtest_exit with status %i\n", status);
 	if (failtest_exit_check) {
 		if (!failtest_exit_check(&history))
 			child_fail(NULL, 0, "failtest_exit_check failed\n");