X-Git-Url: http://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Ftdb2%2Ffree.c;h=be2b91f1dec0a988c539e3dc2b86874fe53e9b7d;hp=83d916fc6986ceec757131b51b44a8c13eaa022a;hb=453d3fe2bdec8e430b85f4cd595a9ad2f8ebcbb3;hpb=39f01834db9b6a21d076e67d1e3143ab99aaf43e

diff --git a/ccan/tdb2/free.c b/ccan/tdb2/free.c
index 83d916fc..be2b91f1 100644
--- a/ccan/tdb2/free.c
+++ b/ccan/tdb2/free.c
@@ -45,10 +45,17 @@ static unsigned int quick_random(struct tdb_context *tdb)
 }
 
 /* Start by using a random zone to spread the load. */
-uint64_t random_free_zone(struct tdb_context *tdb)
+void tdb_zone_init(struct tdb_context *tdb)
 {
-	/* num_zones might be out of date, but can only increase */
-	return quick_random(tdb) % tdb->header.v.num_zones;
+	/*
+	 * We read num_zones without a proper lock, so we could have
+	 * gotten a partial read.  Since zone_bits is 1 byte long, we
+	 * can trust that; even if it's increased, the number of zones
+	 * cannot have decreased.  And using the map size means we
+	 * will not start with a zone which hasn't been filled yet.
+	 */
+	tdb->last_zone = quick_random(tdb)
+		% ((tdb->map_size >> tdb->header.v.zone_bits) + 1);
 }
 
 static unsigned fls64(uint64_t val)
@@ -123,7 +130,7 @@ tdb_off_t zone_of(struct tdb_context *tdb, tdb_off_t off)
 	return off >> tdb->header.v.zone_bits;
 }
 
-/* Returns fl->max_bucket + 1, or list number to search. */
+/* Returns free_buckets + 1, or list number to search. */
 static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
 {
 	tdb_off_t first, off;
@@ -131,7 +138,7 @@ static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
 	/* Speculatively search for a non-zero bucket. */
 	first = tdb->last_zone * (tdb->header.v.free_buckets+1) + bucket;
 	off = tdb_find_nonzero_off(tdb, free_list_off(tdb, first),
-				   tdb->header.v.free_buckets - bucket);
+				   tdb->header.v.free_buckets + 1 - bucket);
 	return bucket + off;
 }
 
@@ -203,7 +210,7 @@ int add_free_record(struct tdb_context *tdb,
 	tdb->last_zone = zone_of(tdb, off);
 	list = tdb->last_zone * (tdb->header.v.free_buckets+1)
 		+ size_to_bucket(tdb, new.data_len);
-	
+
 	if (tdb_lock_free_list(tdb, list, TDB_LOCK_WAIT) != 0)
 		return -1;
 
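An aside on the bucket arithmetic in the hunks above and below: the free
lists form one flat array, with each zone owning free_buckets+1 lists (one
per size bucket, plus a final bucket that huge_alloc uses for the largest
blocks). That layout is why the code keeps multiplying by (free_buckets+1),
and why find_free_head must now scan free_buckets + 1 - bucket entries. A
standalone sketch, with a made-up FREE_BUCKETS constant standing in for
tdb->header.v.free_buckets:

#include <assert.h>
#include <stdint.h>

#define FREE_BUCKETS 32	/* stand-in for tdb->header.v.free_buckets */

static uint64_t free_list_index(uint64_t zone, uint64_t bucket)
{
	/* Zone N's lists start right after zone N-1's final bucket. */
	return zone * (FREE_BUCKETS + 1) + bucket;
}

int main(void)
{
	/* Bucket 0 of zone 1 follows zone 0's last (huge-block) bucket. */
	assert(free_list_index(1, 0) == FREE_BUCKETS + 1);
	assert(free_list_index(0, FREE_BUCKETS) == FREE_BUCKETS);
	return 0;
}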
@@ -312,7 +319,7 @@ static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
 				tdb_len_t *actual)
 {
 	tdb_off_t list;
-	tdb_off_t off, prev, best_off;
+	tdb_off_t off, best_off;
 	struct tdb_free_record pad, best = { 0 }, *r;
 	double multiplier;
 
@@ -324,27 +331,21 @@ again:
 		return TDB_OFF_ERR;
 	}
 
-	prev = free_list_off(tdb, list);
-	off = tdb_read_off(tdb, prev);
-
-	if (unlikely(off == TDB_OFF_ERR))
-		goto unlock_err;
-
 	best.data_len = -1ULL;
 	best_off = 0;
 	multiplier = 1.0;
 
 	/* Walk the list to see if any are large enough, getting less fussy
 	 * as we go. */
-	while (off) {
-		prev = off;
-		off = tdb_read_off(tdb, prev);
-		if (unlikely(off == TDB_OFF_ERR))
-			goto unlock_err;
+	off = tdb_read_off(tdb, free_list_off(tdb, list));
+	if (unlikely(off == TDB_OFF_ERR))
+		goto unlock_err;
 
+	while (off) {
 		r = tdb_get(tdb, off, &pad, sizeof(*r));
 		if (!r)
 			goto unlock_err;
+
 		if (r->magic != TDB_FREE_MAGIC) {
 			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
 				 "lock_and_alloc: %llu non-free 0x%llx\n",
@@ -357,18 +358,9 @@ again:
 			best = *r;
 		}
 
-		if (best.data_len < size * multiplier && best_off) {
-			/* We're happy with this size: take it. */
-			if (remove_from_list(tdb, list, &best) != 0)
-				goto unlock_err;
-			tdb_unlock_free_list(tdb, list);
+		if (best.data_len < size * multiplier && best_off)
+			goto use_best;
 
-			if (to_used_record(tdb, best_off, size, best.data_len,
-					   actual)) {
-				return -1;
-			}
-			return best_off;
-		}
 		multiplier *= 1.01;
 
 		/* Since we're going slow anyway, try coalescing here. */
@@ -380,6 +372,22 @@ again:
 		/* This has unlocked list, restart. */
 		goto again;
 	}
+		off = r->next;
+	}
+
+	/* If we found anything at all, use it. */
+	if (best_off) {
+	use_best:
+		/* We're happy with this size: take it. */
+		if (remove_from_list(tdb, list, &best) != 0)
+			goto unlock_err;
+		tdb_unlock_free_list(tdb, list);
+
+		if (to_used_record(tdb, best_off, size, best.data_len,
+				   actual)) {
+			return -1;
+		}
+		return best_off;
 	}
 
 	tdb_unlock_free_list(tdb, list);
@@ -396,20 +404,17 @@ static tdb_off_t huge_alloc(struct tdb_context *tdb, size_t size,
 {
 	tdb_off_t i, off;
 
-	do {
-		for (i = 0; i < tdb->header.v.num_zones; i++) {
-			/* Try getting one from list. */
-			off = lock_and_alloc(tdb, tdb->header.v.free_buckets,
-					     size, actual);
-			if (off == TDB_OFF_ERR)
-				return TDB_OFF_ERR;
-			if (off != 0)
-				return off;
-			/* FIXME: Coalesce! */
-		}
-	} while (tdb_expand(tdb, 0, size, false) == 0);
-
-	return TDB_OFF_ERR;
+	for (i = 0; i < tdb->header.v.num_zones; i++) {
+		/* Try getting one from list. */
+		off = lock_and_alloc(tdb, tdb->header.v.free_buckets,
+				     size, actual);
+		if (off == TDB_OFF_ERR)
+			return TDB_OFF_ERR;
+		if (off != 0)
+			return off;
+		/* FIXME: Coalesce! */
+	}
+	return 0;
 }
 
 static tdb_off_t get_free(struct tdb_context *tdb, size_t size,
@@ -433,11 +438,11 @@ static tdb_off_t get_free(struct tdb_context *tdb, size_t size,
 	tdb_off_t b;
 
 	/* Start at exact size bucket, and search up... */
-	for (b = bucket; b <= tdb->header.v.num_zones; b++) {
+	for (b = bucket; b <= tdb->header.v.free_buckets; b++) {
 		b = find_free_head(tdb, b);
 
 		/* Non-empty list? Try getting block. */
-		if (b <= tdb->header.v.num_zones) {
+		if (b <= tdb->header.v.free_buckets) {
 			/* Try getting one from list. */
 			off = lock_and_alloc(tdb, b, size, actual);
 			if (off == TDB_OFF_ERR)
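A toy illustration (not tdb code) of the "less fussy" walk in
lock_and_alloc() above: a candidate is only remembered if it is at least
size bytes, and the walk stops once the best candidate wastes less than
(multiplier - 1) * size, with multiplier growing by 1% per record examined:

#include <stdio.h>

int main(void)
{
	double multiplier = 1.0;
	unsigned int walked;

	/* lock_and_alloc() takes the best fit so far once
	 * best.data_len < size * multiplier; multiplier starts at 1.0
	 * and is multiplied by 1.01 after every record walked. */
	for (walked = 0; walked <= 100; walked++) {
		if (walked % 25 == 0)
			printf("after %3u records: accept best < %.2f * size\n",
			       walked, multiplier);
		multiplier *= 1.01;
	}
	return 0;
}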
@@ -554,82 +559,39 @@ static bool zones_happy(struct tdb_context *tdb)
 	return true;
 }
 
-/* Expand the database. */
-int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
-	       bool growing)
+/* Returns how much extra room we get, or TDB_OFF_ERR. */
+static tdb_len_t expand_to_fill_zones(struct tdb_context *tdb)
 {
-	uint64_t new_num_buckets, new_num_zones, new_zone_bits;
-	uint64_t old_num_total, i;
-	tdb_len_t add, freebucket_size, needed;
-	tdb_off_t off, old_free_off;
-	const tdb_off_t *oldf;
-	struct tdb_used_record fhdr;
-
-	/* We need room for the record header too. */
-	needed = sizeof(struct tdb_used_record)
-		+ adjust_size(klen, dlen, growing);
-
-	/* FIXME: this is overkill. An expand lock? */
-	if (tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false) == -1)
-		return -1;
-
-	/* Someone may have expanded for us. */
-	if (update_header(tdb))
-		goto success;
-
-	/* Make sure we have the latest size. */
-	tdb->methods->oob(tdb, tdb->map_size + 1, true);
-
-	/* Did we enlarge zones without enlarging file? */
-	if (tdb->map_size < tdb->header.v.num_zones << tdb->header.v.zone_bits) {
-		add = (tdb->header.v.num_zones << tdb->header.v.zone_bits)
-			- tdb->map_size;
-		/* Updates tdb->map_size. */
-		if (tdb->methods->expand_file(tdb, tdb->map_size, add) == -1)
-			goto fail;
-		if (add_free_record(tdb, tdb->map_size - add, add) == -1)
-			goto fail;
-		if (add >= needed) {
-			/* Allocate from this zone. */
-			tdb->last_zone = zone_of(tdb, tdb->map_size - add);
-			goto success;
-		}
-	}
+	tdb_len_t add;
 
-	/* Slow path. Should we increase the number of buckets? */
-	new_num_buckets = tdb->header.v.free_buckets;
-	if (larger_buckets_might_help(tdb))
-		new_num_buckets++;
-
-	/* Now we'll need room for the new free buckets, too. Assume
-	 * worst case (zones expand). */
-	needed += sizeof(fhdr)
-		+ ((tdb->header.v.num_zones+1)
-		   * (new_num_buckets+1) * sizeof(tdb_off_t));
+	/* We can enlarge zones without enlarging file to match. */
+	add = (tdb->header.v.num_zones << tdb->header.v.zone_bits)
+		- tdb->map_size;
+	if (add <= sizeof(struct tdb_free_record))
+		return 0;
 
-	/* If we need less that one zone, and they're working well, just add
-	 * another one. */
-	if (needed < (1UL << tdb->header.v.zone_bits) && zones_happy(tdb)) {
-		new_num_zones = tdb->header.v.num_zones+1;
-		new_zone_bits = tdb->header.v.zone_bits;
-		add = 1ULL << tdb->header.v.zone_bits;
-	} else {
-		/* Increase the zone size. */
-		new_num_zones = tdb->header.v.num_zones;
-		new_zone_bits = tdb->header.v.zone_bits+1;
-		while ((new_num_zones << new_zone_bits) - tdb->map_size
-		       < needed) {
-			new_zone_bits++;
-		}
+	/* Updates tdb->map_size. */
+	if (tdb->methods->expand_file(tdb, add) == -1)
+		return TDB_OFF_ERR;
+	if (add_free_record(tdb, tdb->map_size - add, add) == -1)
+		return TDB_OFF_ERR;
+	return add;
+}
 
-		/* We expand by enough zones to meet the need. */
-		add = (needed + (1ULL << new_zone_bits)-1)
-			& ~((1ULL << new_zone_bits)-1);
-	}
+static int update_zones(struct tdb_context *tdb,
+			uint64_t new_num_zones,
+			uint64_t new_zone_bits,
+			uint64_t new_num_buckets,
+			tdb_len_t add)
+{
+	tdb_len_t freebucket_size;
+	const tdb_off_t *oldf;
+	tdb_off_t i, off, old_num_total, old_free_off;
+	struct tdb_used_record fhdr;
 
 	/* Updates tdb->map_size. */
-	if (tdb->methods->expand_file(tdb, tdb->map_size, add) == -1)
-		goto fail;
+	if (tdb->methods->expand_file(tdb, add) == -1)
+		return -1;
 
 	/* Use first part as new free bucket array. */
 	off = tdb->map_size - add;
@@ -638,9 +600,9 @@ int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
 
 	/* Write header. */
 	if (set_header(tdb, &fhdr, 0, freebucket_size, freebucket_size, 0))
-		goto fail;
+		return -1;
 	if (tdb_write_convert(tdb, off, &fhdr, sizeof(fhdr)) == -1)
-		goto fail;
+		return -1;
 
 	/* Adjust off to point to start of buckets, add to be remainder. */
 	add -= freebucket_size + sizeof(fhdr);
@@ -650,15 +612,17 @@ int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
 	old_num_total = tdb->header.v.num_zones*(tdb->header.v.free_buckets+1);
 	old_free_off = tdb->header.v.free_off;
 	oldf = tdb_access_read(tdb, old_free_off,
-			       old_num_total * sizeof(tdb_off_t));
+			       old_num_total * sizeof(tdb_off_t), true);
 	if (!oldf)
-		goto fail;
+		return -1;
 
 	/* Switch to using our new zone. */
-	if (zero_out(tdb, off, new_num_zones * (new_num_buckets + 1)) == -1)
+	if (zero_out(tdb, off, freebucket_size) == -1)
 		goto fail_release;
+
 	tdb->header.v.free_off = off;
 	tdb->header.v.num_zones = new_num_zones;
+	tdb->header.v.zone_bits = new_zone_bits;
 	tdb->header.v.free_buckets = new_num_buckets;
 
 	/* FIXME: If zone size hasn't changed, can simply copy pointers. */
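The expand_to_fill_zones() hunk above exploits the fact that the zone
geometry in the header may legitimately describe more bytes than the file
holds: another process can bump num_zones or zone_bits first and grow the
file afterwards. A standalone sketch (hypothetical helper name) of the
shortfall it computes:

#include <assert.h>
#include <stdint.h>

/* How many bytes the file is short of covering every zone the header
 * describes; expand_to_fill_zones() grows the file by this much and
 * hands the new space to add_free_record(). */
static uint64_t zone_shortfall(uint64_t num_zones, unsigned int zone_bits,
			       uint64_t map_size)
{
	return (num_zones << zone_bits) - map_size;
}

int main(void)
{
	/* Four 1MB zones declared, 3MB actually mapped: 1MB short. */
	assert(zone_shortfall(4, 20, 3 << 20) == 1 << 20);
	return 0;
}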
@@ -682,13 +646,14 @@ int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
 		}
 	}
 
-	/* Free up the old free buckets. */
 	old_free_off -= sizeof(fhdr);
 	if (tdb_read_convert(tdb, old_free_off, &fhdr, sizeof(fhdr)) == -1)
 		goto fail_release;
 	if (add_free_record(tdb, old_free_off,
-			    rec_data_length(&fhdr)+rec_extra_padding(&fhdr)))
+			    sizeof(fhdr)
+			    + rec_data_length(&fhdr)
+			    + rec_extra_padding(&fhdr)))
 		goto fail_release;
 
 	/* Add the rest as a new free record. */
@@ -698,12 +663,95 @@ int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
 	/* Start allocating from where the new space is. */
 	tdb->last_zone = zone_of(tdb, tdb->map_size - add);
 	tdb_access_release(tdb, oldf);
+	return write_header(tdb);
+
+fail_release:
+	tdb_access_release(tdb, oldf);
+	return -1;
+}
+
+/* Expand the database. */
+int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
+	       bool growing)
+{
+	uint64_t new_num_buckets, new_num_zones, new_zone_bits;
+	uint64_t old_num_zones, old_size, old_zone_bits;
+	tdb_len_t add, needed;
+
+	/* We need room for the record header too. */
+	needed = sizeof(struct tdb_used_record)
+		+ adjust_size(klen, dlen, growing);
+
+	/* tdb_allrecord_lock will update header; did zones change? */
+	old_zone_bits = tdb->header.v.zone_bits;
+	old_num_zones = tdb->header.v.num_zones;
+
+	/* FIXME: this is overkill. An expand lock? */
+	if (tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false) == -1)
+		return -1;
+
+	/* Someone may have expanded for us. */
+	if (old_zone_bits != tdb->header.v.zone_bits
+	    || old_num_zones != tdb->header.v.num_zones)
+		goto success;
+
+	/* They may have also expanded the underlying size (otherwise we'd
+	 * have expanded our mmap to look at those offsets already). */
+	old_size = tdb->map_size;
+	tdb->methods->oob(tdb, tdb->map_size + 1, true);
+	if (tdb->map_size != old_size)
+		goto success;
+
+	add = expand_to_fill_zones(tdb);
+	if (add == TDB_OFF_ERR)
+		goto fail;
+
+	if (add >= needed) {
+		/* Allocate from this zone. */
+		tdb->last_zone = zone_of(tdb, tdb->map_size - add);
+		goto success;
+	}
+
+	/* Slow path. Should we increase the number of buckets? */
+	new_num_buckets = tdb->header.v.free_buckets;
+	if (larger_buckets_might_help(tdb))
+		new_num_buckets++;
+
+	/* Now we'll need room for the new free buckets, too. Assume
+	 * worst case (zones expand). */
+	needed += sizeof(struct tdb_used_record)
+		+ ((tdb->header.v.num_zones+1)
+		   * (new_num_buckets+1) * sizeof(tdb_off_t));
+
+	/* If we need less than one zone, and they're working well, just add
+	 * another one. */
+	if (needed < (1UL << tdb->header.v.zone_bits) && zones_happy(tdb)) {
+		new_num_zones = tdb->header.v.num_zones+1;
+		new_zone_bits = tdb->header.v.zone_bits;
+		add = 1ULL << tdb->header.v.zone_bits;
+	} else {
+		/* Increase the zone size. */
+		new_num_zones = tdb->header.v.num_zones;
+		new_zone_bits = tdb->header.v.zone_bits+1;
+		while ((new_num_zones << new_zone_bits)
+		       < tdb->map_size + needed) {
+			new_zone_bits++;
+		}
+
+		/* We expand by enough full zones to meet the need. */
+		add = ((tdb->map_size + needed + (1ULL << new_zone_bits)-1)
+		       & ~((1ULL << new_zone_bits)-1))
+			- tdb->map_size;
+	}
+
+	if (update_zones(tdb, new_num_zones, new_zone_bits, new_num_buckets,
+			 add) == -1)
+		goto fail;
+
 success:
 	tdb_allrecord_unlock(tdb, F_WRLCK);
 	return 0;
 
-fail_release:
-	tdb_access_release(tdb, oldf);
 fail:
 	tdb_allrecord_unlock(tdb, F_WRLCK);
 	return -1;
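Finally, the rounding in the new tdb_expand()'s else branch grows the file
to the next multiple of the (possibly enlarged) zone size that covers
map_size + needed; the mask trick works because zone sizes are powers of
two. A standalone check (hypothetical helper, not tdb code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Round map_size + needed up to a multiple of the zone size, and
 * return how many bytes that adds; valid because 1ULL << zone_bits
 * is a power of two. */
static uint64_t expand_amount(uint64_t map_size, uint64_t needed,
			      unsigned int zone_bits)
{
	uint64_t zone_size = 1ULL << zone_bits;

	return ((map_size + needed + zone_size - 1) & ~(zone_size - 1))
		- map_size;
}

int main(void)
{
	/* A 16MB map needing 100 more bytes with 16MB zones grows by
	 * exactly one zone. */
	assert(expand_amount(1ULL << 24, 100, 24) == 1ULL << 24);
	printf("grow by %llu bytes\n",
	       (unsigned long long)expand_amount(1ULL << 24, 100, 24));
	return 0;
}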