X-Git-Url: https://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Ftdb2%2Ffree.c;h=8ff5d74a3fd7a8a400b243b9dd8a22c1dfe44af6;hp=df2792ecf19da0841e732bb764cdfd40bafa1880;hb=20defbbcfa088a7574d9897b533d1bc600b2df53;hpb=d2a4d6b49bc260bd0979965f4e4ef62b40b19efe

diff --git a/ccan/tdb2/free.c b/ccan/tdb2/free.c
index df2792ec..8ff5d74a 100644
--- a/ccan/tdb2/free.c
+++ b/ccan/tdb2/free.c
@@ -28,7 +28,7 @@ static unsigned fls64(uint64_t val)
 }
 
 /* In which bucket would we find a particular record size? (ignoring header) */
-unsigned int size_to_bucket(unsigned int zone_bits, tdb_len_t data_len)
+unsigned int size_to_bucket(tdb_len_t data_len)
 {
 	unsigned int bucket;
 
@@ -44,90 +44,62 @@ unsigned int size_to_bucket(unsigned int zone_bits, tdb_len_t data_len)
 		bucket = fls64(data_len - TDB_MIN_DATA_LEN) + 2;
 	}
 
-	if (unlikely(bucket > BUCKETS_FOR_ZONE(zone_bits)))
-		bucket = BUCKETS_FOR_ZONE(zone_bits);
+	if (unlikely(bucket >= TDB_FREE_BUCKETS))
+		bucket = TDB_FREE_BUCKETS - 1;
 	return bucket;
 }
 
-/* Subtract 1-byte tailer and header.  Then round up to next power of 2. */
-static unsigned max_zone_bits(struct tdb_context *tdb)
+tdb_off_t first_flist(struct tdb_context *tdb)
 {
-	return fls64(tdb->map_size-1-sizeof(struct tdb_header)-1) + 1;
+	return tdb_read_off(tdb, offsetof(struct tdb_header, free_list));
 }
 
-/* Start by using a random zone to spread the load: returns the offset. */
-static uint64_t random_zone(struct tdb_context *tdb)
+tdb_off_t next_flist(struct tdb_context *tdb, tdb_off_t flist)
 {
-	struct free_zone_header zhdr;
-	tdb_off_t off = sizeof(struct tdb_header);
-	tdb_len_t half_bits;
-	uint64_t randbits = 0;
-	unsigned int i;
-
-	for (i = 0; i < 64; i += fls64(RAND_MAX)) 
-		randbits ^= ((uint64_t)random()) << i;
-
-	/* FIXME: Does this work?  Test! */
-	half_bits = max_zone_bits(tdb) - 1;
-	do {
-		/* Pick left or right side (not outside file) */
-		if ((randbits & 1)
-		    && !tdb->methods->oob(tdb, off + (1ULL << half_bits)
-					  + sizeof(zhdr), true)) {
-			off += 1ULL << half_bits;
-		}
-		randbits >>= 1;
+	return tdb_read_off(tdb, flist + offsetof(struct tdb_freelist, next));
+}
 
-		if (tdb_read_convert(tdb, off, &zhdr, sizeof(zhdr)) == -1) 
-			return TDB_OFF_ERR;
+int tdb_flist_init(struct tdb_context *tdb)
+{
+	/* Use reservoir sampling algorithm to select a free list at random. */
+	unsigned int rnd, max = 0, count = 0;
+	tdb_off_t off;
 
-		if (zhdr.zone_bits == half_bits)
-			return off;
+	tdb->flist_off = off = first_flist(tdb);
+	tdb->flist = 0;
 
-		half_bits--;
-	} while (half_bits >= INITIAL_ZONE_BITS);
+	while (off) {
+		if (off == TDB_OFF_ERR)
+			return -1;
 
-	tdb->ecode = TDB_ERR_CORRUPT;
-	tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
-		 "random_zone: zone at %llu smaller than %u bits?",
-		 (long long)off, INITIAL_ZONE_BITS);
-	return TDB_OFF_ERR;
-}
+		rnd = random();
+		if (rnd >= max) {
+			tdb->flist_off = off;
+			tdb->flist = count;
+			max = rnd;
+		}
 
-int tdb_zone_init(struct tdb_context *tdb)
-{
-	tdb->zone_off = random_zone(tdb);
-	if (tdb->zone_off == TDB_OFF_ERR)
-		return -1;
-	if (tdb_read_convert(tdb, tdb->zone_off,
-			     &tdb->zhdr, sizeof(tdb->zhdr)) == -1) 
-		return -1;
+		off = next_flist(tdb, off);
+		count++;
+	}
 	return 0;
 }
 
-/* Where's the header, given a zone size of 1 << zone_bits? */
-static tdb_off_t zone_off(tdb_off_t off, unsigned int zone_bits)
-{
-	off -= sizeof(struct tdb_header);
-	return (off & ~((1ULL << zone_bits) - 1)) + sizeof(struct tdb_header);
-}
-
 /* Offset of a given bucket. */
-/* FIXME: bucket can be "unsigned" everywhere, or even uint8/16. */
-tdb_off_t bucket_off(tdb_off_t zone_off, tdb_off_t bucket)
+tdb_off_t bucket_off(tdb_off_t flist_off, unsigned bucket)
 {
-	return zone_off
-		+ sizeof(struct free_zone_header)
+	return flist_off + offsetof(struct tdb_freelist, buckets)
 		+ bucket * sizeof(tdb_off_t);
 }
 
 /* Returns free_buckets + 1, or list number to search. */
-static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
+static tdb_off_t find_free_head(struct tdb_context *tdb,
+				tdb_off_t flist_off,
+				tdb_off_t bucket)
 {
 	/* Speculatively search for a non-zero bucket. */
-	return tdb_find_nonzero_off(tdb, bucket_off(tdb->zone_off, 0),
-				    bucket,
-				    BUCKETS_FOR_ZONE(tdb->zhdr.zone_bits) + 1);
+	return tdb_find_nonzero_off(tdb, bucket_off(flist_off, 0),
+				    bucket, TDB_FREE_BUCKETS);
 }
 
 /* Remove from free bucket. */
@@ -138,10 +110,10 @@ static int remove_from_list(struct tdb_context *tdb,
 	tdb_off_t off;
 
 	/* Front of list? */
-	if (r->prev == 0) {
+	if (frec_prev(r) == 0) {
 		off = b_off;
 	} else {
-		off = r->prev + offsetof(struct tdb_free_record, next);
+		off = frec_prev(r) + offsetof(struct tdb_free_record, next);
 	}
 
 #ifdef DEBUG
@@ -159,11 +131,11 @@ static int remove_from_list(struct tdb_context *tdb,
 	}
 
 	if (r->next != 0) {
-		off = r->next + offsetof(struct tdb_free_record, prev);
+		off = r->next + offsetof(struct tdb_free_record,magic_and_prev);
 		/* r->next->prev = r->prev */
 
 #ifdef DEBUG
-		if (tdb_read_off(tdb, off) != r_off) {
+		if ((tdb_read_off(tdb, off) & TDB_OFF_MASK) != r_off) {
 			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
 				 "remove_from_list: %llu bad list %llu\n",
 				 (long long)r_off, (long long)b_off);
@@ -171,7 +143,7 @@ static int remove_from_list(struct tdb_context *tdb,
 		}
 #endif
 
-		if (tdb_write_off(tdb, off, r->prev)) {
+		if (tdb_write_off(tdb, off, r->magic_and_prev)) {
 			return -1;
 		}
 	}
@@ -182,72 +154,73 @@ static int remove_from_list(struct tdb_context *tdb,
 static int enqueue_in_free(struct tdb_context *tdb,
 			   tdb_off_t b_off,
 			   tdb_off_t off,
-			   struct tdb_free_record *new)
+			   tdb_len_t len)
 {
-	new->prev = 0;
+	struct tdb_free_record new;
+	uint64_t magic = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL));
+
+	/* flist number goes in the top TDB_OFF_UPPER_STEAL bits, len below. */
+	new.flist_and_len = ((uint64_t)tdb->flist << (64 - TDB_OFF_UPPER_STEAL))
+		| len;
+	/* prev = 0. */
+	new.magic_and_prev = magic;
+
 	/* new->next = head. */
-	new->next = tdb_read_off(tdb, b_off);
-	if (new->next == TDB_OFF_ERR)
+	new.next = tdb_read_off(tdb, b_off);
+	if (new.next == TDB_OFF_ERR)
 		return -1;
 
-	if (new->next) {
+	if (new.next) {
 #ifdef DEBUG
 		if (tdb_read_off(tdb,
-				 new->next
-				 + offsetof(struct tdb_free_record, prev))
-		    != 0) {
+				 new.next + offsetof(struct tdb_free_record,
+						     magic_and_prev))
+		    != magic) {
 			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
 				 "enqueue_in_free: %llu bad head prev %llu\n",
-				 (long long)new->next, (long long)b_off);
+				 (long long)new.next, (long long)b_off);
 			return -1;
 		}
 #endif
 		/* next->prev = new. */
-		if (tdb_write_off(tdb, new->next
-				  + offsetof(struct tdb_free_record, prev),
-				  off) != 0)
+		if (tdb_write_off(tdb, new.next
+				  + offsetof(struct tdb_free_record,
+					     magic_and_prev),
+				  off | magic) != 0)
 			return -1;
 	}
 	/* head = new */
 	if (tdb_write_off(tdb, b_off, off) != 0)
 		return -1;
 
-	return tdb_write_convert(tdb, off, new, sizeof(*new));
+	return tdb_write_convert(tdb, off, &new, sizeof(new));
 }
 
 /* List need not be locked. */
 int add_free_record(struct tdb_context *tdb,
-		    unsigned int zone_bits,
 		    tdb_off_t off, tdb_len_t len_with_header)
 {
-	struct tdb_free_record new;
 	tdb_off_t b_off;
+	tdb_len_t len;
 	int ret;
 
-	assert(len_with_header >= sizeof(new));
-	assert(zone_bits < (1 << 6));
+	assert(len_with_header >= sizeof(struct tdb_free_record));
 
-	new.magic_and_meta = TDB_FREE_MAGIC | zone_bits;
-	new.data_len = len_with_header - sizeof(struct tdb_used_record);
+	len = len_with_header - sizeof(struct tdb_used_record);
 
-	b_off = bucket_off(zone_off(off, zone_bits),
-			   size_to_bucket(zone_bits, new.data_len));
+	b_off = bucket_off(tdb->flist_off, size_to_bucket(len));
 	if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0)
 		return -1;
 
-	ret = enqueue_in_free(tdb, b_off, off, &new);
+	ret = enqueue_in_free(tdb, b_off, off, len);
 	tdb_unlock_free_bucket(tdb, b_off);
 	return ret;
 }
 
-static size_t adjust_size(size_t keylen, size_t datalen, bool want_extra)
+static size_t adjust_size(size_t keylen, size_t datalen)
 {
 	size_t size = keylen + datalen;
 
-	/* We want at least 50% growth for data. */
-	if (want_extra)
-		size += datalen/2;
-
 	if (size < TDB_MIN_DATA_LEN)
 		size = TDB_MIN_DATA_LEN;
 
@@ -261,28 +234,39 @@ static size_t record_leftover(size_t keylen, size_t datalen,
 {
 	ssize_t leftover;
 
-	/* We might *want* extra, but not have it, so leftover is negative. */
-	leftover = total_len - adjust_size(keylen, datalen, want_extra);
-	if (leftover < (ssize_t)sizeof(struct tdb_free_record))
-		return 0;
+	if (want_extra)
+		datalen += datalen / 2;
+	leftover = total_len - adjust_size(keylen, datalen);
 
-	/* If we want extra anwyay, don't split unless we have 2x size. */
-	if (want_extra && leftover <= datalen / 2)
+	if (leftover < (ssize_t)sizeof(struct tdb_free_record))
 		return 0;
 
 	return leftover;
 }
 
+/* FIXME: Shortcut common case where tdb->flist == flist */
+static tdb_off_t flist_offset(struct tdb_context *tdb, unsigned int flist)
+{
+	tdb_off_t off = first_flist(tdb);
+	unsigned int i;
+
+	for (i = 0; i < flist; i++)
+		off = next_flist(tdb, off);
+	return off;
+}
+
 /* Note: we unlock the current bucket if we coalesce or fail. */
 static int coalesce(struct tdb_context *tdb,
-		    tdb_off_t zone_off, unsigned zone_bits,
 		    tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
 {
 	struct tdb_free_record pad, *r;
-	tdb_off_t end = off + sizeof(struct tdb_used_record) + data_len;
+	tdb_off_t end;
+
+	end = off + sizeof(struct tdb_used_record) + data_len;
 
-	while (end < (zone_off + (1ULL << zone_bits))) {
+	while (end < tdb->map_size) {
 		tdb_off_t nb_off;
+		unsigned flist, bucket;
 
 		/* FIXME: do tdb_get here and below really win? */
 		r = tdb_get(tdb, end, &pad, sizeof(pad));
@@ -292,8 +276,9 @@ static int coalesce(struct tdb_context *tdb,
 		if (frec_magic(r) != TDB_FREE_MAGIC)
 			break;
 
-		nb_off = bucket_off(zone_off,
-				    size_to_bucket(zone_bits, r->data_len));
+		flist = frec_flist(r);
+		bucket = size_to_bucket(frec_len(r));
+		nb_off = bucket_off(flist_offset(tdb, flist), bucket);
 
 		/* We may be violating lock order here, so best effort. */
 		if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1)
@@ -311,9 +296,8 @@ static int coalesce(struct tdb_context *tdb,
 			break;
 		}
 
-		if (unlikely(bucket_off(zone_off,
-					size_to_bucket(zone_bits, r->data_len))
-			     != nb_off)) {
+		if (unlikely(frec_flist(r) != flist)
+		    || unlikely(size_to_bucket(frec_len(r)) != bucket)) {
 			tdb_unlock_free_bucket(tdb, nb_off);
 			break;
 		}
@@ -323,7 +307,7 @@ static int coalesce(struct tdb_context *tdb,
 			goto err;
 		}
 
-		end += sizeof(struct tdb_used_record) + r->data_len;
+		end += sizeof(struct tdb_used_record) + frec_len(r);
 		tdb_unlock_free_bucket(tdb, nb_off);
 	}
 
@@ -336,21 +320,32 @@ static int coalesce(struct tdb_context *tdb,
 	if (!r)
 		goto err;
 
-	if (r->data_len != data_len) {
+	if (frec_len(r) != data_len) {
 		tdb->ecode = TDB_ERR_CORRUPT;
 		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
 			 "coalesce: expected data len %llu not %llu\n",
-			 (long long)data_len, (long long)r->data_len);
+			 (long long)data_len, (long long)frec_len(r));
 		goto err;
 	}
 
 	if (remove_from_list(tdb, b_off, off, r) == -1)
 		goto err;
 
-	/* We have to drop this to avoid deadlocks. */
+	r = tdb_access_write(tdb, off, sizeof(*r), true);
+	if (!r)
+		goto err;
+
+	/* We have to drop this to avoid deadlocks, so make sure record
+	 * doesn't get coalesced by someone else! */
+	r->magic_and_prev = TDB_COALESCING_MAGIC << (64 - TDB_OFF_UPPER_STEAL);
+	/* FIXME: Use 255 as invalid free list? */
+	r->flist_and_len = end - off - sizeof(struct tdb_used_record);
+	if (tdb_access_commit(tdb, r) != 0)
+		goto err;
+
 	tdb_unlock_free_bucket(tdb, b_off);
 
-	if (add_free_record(tdb, zone_bits, off, end - off) == -1)
+	if (add_free_record(tdb, off, end - off) == -1)
 		return -1;
 	return 1;
 
@@ -362,8 +357,7 @@ err:
 
 /* We need size bytes to put our key and data in. */
 static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
-				tdb_off_t zone_off,
-				unsigned zone_bits,
+				tdb_off_t flist_off,
 				tdb_off_t bucket,
 				size_t keylen, size_t datalen,
 				bool want_extra,
@@ -372,20 +366,18 @@ static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
 	tdb_off_t off, b_off,best_off;
 	struct tdb_free_record pad, best = { 0 }, *r;
 	double multiplier;
-	size_t size = keylen + datalen;
+	size_t size = adjust_size(keylen, datalen);
 
 again:
-	b_off = bucket_off(zone_off, bucket);
+	b_off = bucket_off(flist_off, bucket);
 
-	/* FIXME: Try non-blocking wait first, to measure contention.
-	 * If we're contented, try switching zones, and don't enlarge zone
-	 * next time (we want more zones). */
+	/* FIXME: Try non-blocking wait first, to measure contention. */
 	/* Lock this bucket. */
 	if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == -1) {
 		return TDB_OFF_ERR;
 	}
 
-	best.data_len = -1ULL;
+	best.flist_and_len = -1ULL;
 	best_off = 0;
 
 	/* Get slack if we're after extra. */
@@ -409,23 +401,22 @@ again:
 		if (frec_magic(r) != TDB_FREE_MAGIC) {
 			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
 				 "lock_and_alloc: %llu non-free 0x%llx\n",
-				 (long long)off, (long long)r->magic_and_meta);
+				 (long long)off, (long long)r->magic_and_prev);
 			goto unlock_err;
 		}
 
-		if (r->data_len >= size && r->data_len < best.data_len) {
+		if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
 			best_off = off;
 			best = *r;
 		}
 
-		if (best.data_len < size * multiplier && best_off)
+		if (frec_len(&best) < size * multiplier && best_off)
 			break;
 
 		multiplier *= 1.01;
 
 		/* Since we're going slow anyway, try coalescing here. */
-		switch (coalesce(tdb, zone_off, zone_bits, off, b_off,
-				 r->data_len)) {
+		switch (coalesce(tdb, off, b_off, frec_len(r))) {
 		case -1:
 			/* This has already unlocked on error. */
 			return -1;
@@ -446,13 +437,14 @@ again:
 			goto unlock_err;
 
 		leftover = record_leftover(keylen, datalen, want_extra,
-					   best.data_len);
+					   frec_len(&best));
 
+		assert(keylen + datalen + leftover <= frec_len(&best));
 		/* We need to mark non-free before we drop lock, otherwise
 		 * coalesce() could try to merge it! */
-		if (set_header(tdb, &rec, keylen, datalen,
-			       best.data_len - leftover,
-			       hashlow, zone_bits) != 0)
+		if (set_used_header(tdb, &rec, keylen, datalen,
+				    frec_len(&best) - leftover,
+				    hashlow) != 0)
 			goto unlock_err;
 
 		if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0)
@@ -461,9 +453,9 @@ again:
 		tdb_unlock_free_bucket(tdb, b_off);
 
 		if (leftover) {
-			if (add_free_record(tdb, zone_bits,
+			if (add_free_record(tdb,
 					    best_off + sizeof(rec)
-					    + best.data_len - leftover,
+					    + frec_len(&best) - leftover,
 					    leftover))
 				return TDB_OFF_ERR;
 		}
@@ -478,85 +470,66 @@ unlock_err:
 	return TDB_OFF_ERR;
 }
 
-static bool next_zone(struct tdb_context *tdb)
-{
-	tdb_off_t next = tdb->zone_off + (1ULL << tdb->zhdr.zone_bits);
-
-	/* We must have a header. */
-	if (tdb->methods->oob(tdb, next + sizeof(tdb->zhdr), true))
-		return false;
-
-	tdb->zone_off = next;
-	return tdb_read_convert(tdb, next, &tdb->zhdr, sizeof(tdb->zhdr)) == 0;
-}
-
-/* Offset returned is within current zone (which it may alter). */
+/* Get a free block from current free list, or 0 if none. */
 static tdb_off_t get_free(struct tdb_context *tdb,
 			  size_t keylen, size_t datalen, bool want_extra,
 			  unsigned hashlow)
 {
-	tdb_off_t start_zone = tdb->zone_off, off;
+	tdb_off_t off, flist_off;
+	unsigned start_b, b, flist;
 	bool wrapped = false;
-	size_t size = adjust_size(keylen, datalen, want_extra);
 
 	/* If they are growing, add 50% to get to higher bucket. */
 	if (want_extra)
-		size += datalen / 2;
-
-	/* FIXME: If we don't get a hit in the first bucket we want,
-	 * try changing zones for next time.  That should help wear
-	 * zones evenly, so we don't need to search all of them before
-	 * expanding. */
-	while (!wrapped || tdb->zone_off != start_zone) {
-		tdb_off_t b;
-
-		/* Shortcut for really huge allocations... */
-		if ((size >> tdb->zhdr.zone_bits) != 0)
-			goto next;
+		start_b = size_to_bucket(adjust_size(keylen,
+						     datalen + datalen / 2));
+	else
+		start_b = size_to_bucket(adjust_size(keylen, datalen));
 
+	flist_off = tdb->flist_off;
+	flist = tdb->flist;
+	while (!wrapped || flist_off != tdb->flist_off) {
 		/* Start at exact size bucket, and search up... */
-		b = size_to_bucket(tdb->zhdr.zone_bits, size);
-		for (b = find_free_head(tdb, b);
-		     b <= BUCKETS_FOR_ZONE(tdb->zhdr.zone_bits);
-		     b += find_free_head(tdb, b + 1)) {
+		for (b = find_free_head(tdb, flist_off, start_b);
+		     b < TDB_FREE_BUCKETS;
+		     b = find_free_head(tdb, flist_off, b + 1)) {
 			/* Try getting one from list. */
-			off = lock_and_alloc(tdb, tdb->zone_off,
-					     tdb->zhdr.zone_bits,
+			off = lock_and_alloc(tdb, flist_off,
 					     b, keylen, datalen, want_extra,
 					     hashlow);
 			if (off == TDB_OFF_ERR)
 				return TDB_OFF_ERR;
-			if (off != 0)
+			if (off != 0) {
+				/* Worked?  Stay using this list. */
+				tdb->flist_off = flist_off;
+				tdb->flist = flist;
 				return off;
+			}
 			/* Didn't work.  Try next bucket. */
 		}
 
-	next:
-		/* Didn't work, try next zone, if it exists. */
-		if (!next_zone(tdb)) {
+		/* Hmm, try next list. */
+		flist_off = next_flist(tdb, flist_off);
+		flist++;
+		if (flist_off == 0) {
 			wrapped = true;
-			tdb->zone_off = sizeof(struct tdb_header);
-			if (tdb_read_convert(tdb, tdb->zone_off,
-					     &tdb->zhdr, sizeof(tdb->zhdr))) {
-				return TDB_OFF_ERR;
-			}
+			flist_off = first_flist(tdb);
+			flist = 0;
 		}
 	}
+
 	return 0;
 }
 
-int set_header(struct tdb_context *tdb,
-	       struct tdb_used_record *rec,
-	       uint64_t keylen, uint64_t datalen,
-	       uint64_t actuallen, unsigned hashlow,
-	       unsigned int zone_bits)
+int set_used_header(struct tdb_context *tdb,
+		    struct tdb_used_record *rec,
+		    uint64_t keylen, uint64_t datalen,
+		    uint64_t actuallen, unsigned hashlow)
 {
 	uint64_t keybits = (fls64(keylen) + 1) / 2;
 
 	/* Use bottom bits of hash, so it's independent of hash table size. */
-	rec->magic_and_meta
-		= zone_bits
-		| ((hashlow & ((1 << 5)-1)) << 6)
+	rec->magic_and_meta = (hashlow & ((1 << 11)-1))
 		| ((actuallen - (keylen + datalen)) << 11)
 		| (keybits << 43)
 		| (TDB_MAGIC << 48);
@@ -576,33 +549,31 @@ int set_header(struct tdb_context *tdb,
 	return 0;
 }
 
-static bool zones_happy(struct tdb_context *tdb)
-{
-	/* FIXME: look at distribution of zones. */
-	return true;
-}
-
-/* Assume we want buckets up to the comfort factor. */
-static tdb_len_t overhead(unsigned int zone_bits)
-{
-	return sizeof(struct free_zone_header)
-		+ (BUCKETS_FOR_ZONE(zone_bits) + 1) * sizeof(tdb_off_t);
-}
-
-/* Expand the database (by adding a zone). */
+/* Expand the database. */
 static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 {
 	uint64_t old_size;
-	tdb_off_t off;
-	uint8_t zone_bits;
-	unsigned int num_buckets;
 	tdb_len_t wanted;
-	struct free_zone_header zhdr;
-	bool enlarge_zone;
 
 	/* We need room for the record header too. */
 	wanted = sizeof(struct tdb_used_record) + size;
 
+	/* Need to hold a hash lock to expand DB: transactions rely on it. */
+	if (!(tdb->flags & TDB_NOLOCK)
+	    && !tdb->allrecord_lock.count && !tdb_has_hash_locks(tdb)) {
+		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+			 "tdb_expand: must hold lock during expand\n");
+		return -1;
+	}
+
+	/* always make room for at least 100 more records, and at
+           least 25% more space. */
+	if (size * TDB_EXTENSION_FACTOR > tdb->map_size / 4)
+		wanted = size * TDB_EXTENSION_FACTOR;
+	else
+		wanted = tdb->map_size / 4;
+	wanted = adjust_size(0, wanted);
+
 	/* Only one person can expand file at a time. */
 	if (tdb_lock_expand(tdb, F_WRLCK) != 0)
 		return -1;
@@ -610,70 +581,20 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 	/* Someone else may have expanded the file, so retry. */
 	old_size = tdb->map_size;
 	tdb->methods->oob(tdb, tdb->map_size + 1, true);
-	if (tdb->map_size != old_size)
-		goto success;
-
-	/* FIXME: Tailer is a bogus optimization, remove it. */
-	/* zone bits tailer char is protected by EXPAND lock. */
-	if (tdb->methods->read(tdb, old_size - 1, &zone_bits, 1) == -1)
-		goto fail;
-
-	/* If zones aren't working well, add larger zone if possible. */
-	enlarge_zone = !zones_happy(tdb);
-
-	/* New zone can be between zone_bits or larger if we're on the right
-	 * boundary. */
-	for (;;) {
-		/* Does this fit the allocation comfortably? */
-		if ((1ULL << zone_bits) >= overhead(zone_bits) + wanted) {
-			/* Only let enlarge_zone enlarge us once. */
-			if (!enlarge_zone)
-				break;
-			enlarge_zone = false;
-		}
-		if ((old_size - 1 - sizeof(struct tdb_header))
-		    & (1 << zone_bits))
-			break;
-		zone_bits++;
+	if (tdb->map_size != old_size) {
+		tdb_unlock_expand(tdb, F_WRLCK);
+		return 0;
 	}
 
-	zhdr.zone_bits = zone_bits;
-	num_buckets = BUCKETS_FOR_ZONE(zone_bits);
-
-	/* FIXME: I don't think we need to expand to full zone, do we? */
-	if (tdb->methods->expand_file(tdb, 1ULL << zone_bits) == -1)
-		goto fail;
-
-	/* Write new tailer. */
-	if (tdb->methods->write(tdb, tdb->map_size - 1, &zone_bits, 1) == -1)
-		goto fail;
-
-	/* Write new zone header (just before old tailer). */
-	off = old_size - 1;
-	if (tdb_write_convert(tdb, off, &zhdr, sizeof(zhdr)) == -1)
-		goto fail;
-
-	/* Now write empty buckets. */
-	off += sizeof(zhdr);
-	if (zero_out(tdb, off, (num_buckets+1) * sizeof(tdb_off_t)) == -1)
-		goto fail;
-	off += (num_buckets+1) * sizeof(tdb_off_t);
-
-	/* Now add the rest as our free record. */
-	if (add_free_record(tdb, zone_bits, off, tdb->map_size-1-off) == -1)
-		goto fail;
-
-	/* Try allocating from this zone now. */
-	tdb->zone_off = old_size - 1;
-	tdb->zhdr = zhdr;
+	if (tdb->methods->expand_file(tdb, wanted) == -1) {
+		tdb_unlock_expand(tdb, F_WRLCK);
+		return -1;
+	}
 
-success:
+	/* We need to drop this lock before adding free record. */
 	tdb_unlock_expand(tdb, F_WRLCK);
-	return 0;
 
-fail:
-	tdb_unlock_expand(tdb, F_WRLCK);
-	return -1;
+	return add_free_record(tdb, old_size, wanted);
 }
 
 /* This won't fail: it will expand the database if it has to. */
@@ -690,7 +611,7 @@ tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
 		if (likely(off != 0))
 			break;
 
-		if (tdb_expand(tdb, adjust_size(keylen, datalen, growing)))
+		if (tdb_expand(tdb, adjust_size(keylen, datalen)))
 			return TDB_OFF_ERR;
 	}