X-Git-Url: https://git.ozlabs.org/?a=blobdiff_plain;f=ccan%2Ftdb2%2Ffree.c;h=dca8ff10709c81e3848b0c1fb9e3e4431ed0996f;hb=afc3c1e723b4eca0b32f7c5b656f5b070eb1c9fb;hp=3fc8bef68dd97cce9c0b0f3b26052fe3f55c3e39;hpb=d70577b6aff24ccf6815896509dabb8c9ac07904;p=ccan

diff --git a/ccan/tdb2/free.c b/ccan/tdb2/free.c
index 3fc8bef6..dca8ff10 100644
--- a/ccan/tdb2/free.c
+++ b/ccan/tdb2/free.c
@@ -49,12 +49,39 @@ unsigned int size_to_bucket(tdb_len_t data_len)
 	return bucket;
 }
 
+tdb_off_t first_flist(struct tdb_context *tdb)
+{
+	return tdb_read_off(tdb, offsetof(struct tdb_header, free_list));
+}
+
+tdb_off_t next_flist(struct tdb_context *tdb, tdb_off_t flist)
+{
+	return tdb_read_off(tdb, flist + offsetof(struct tdb_freelist, next));
+}
+
 int tdb_flist_init(struct tdb_context *tdb)
 {
-	tdb->flist_off = tdb_read_off(tdb,
-				      offsetof(struct tdb_header, free_list));
-	if (tdb->flist_off == TDB_OFF_ERR)
-		return -1;
+	/* One-pass random pick: the list with the largest random() draw wins. */
+	unsigned int rnd, max = 0, count = 0;
+	tdb_off_t off;
+
+	tdb->flist_off = off = first_flist(tdb);
+	tdb->flist = 0;	/* index of flist_off within the chain */
+
+	while (off) {
+		if (off == TDB_OFF_ERR)	/* offset read reported an error */
+			return -1;
+
+		rnd = random();
+		if (rnd >= max) {	/* ties go to the later list */
+			tdb->flist_off = off;
+			tdb->flist = count;
+			max = rnd;
+		}
+
+		off = next_flist(tdb, off);
+		count++;
+	}
 	return 0;
 }
 
@@ -66,10 +93,12 @@ tdb_off_t bucket_off(tdb_off_t flist_off, unsigned bucket)
 }
 
 /* Returns free_buckets + 1, or list number to search. */
-static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
+static tdb_off_t find_free_head(struct tdb_context *tdb,
+				tdb_off_t flist_off,
+				tdb_off_t bucket)
 {
 	/* Speculatively search for a non-zero bucket. */
-	return tdb_find_nonzero_off(tdb, bucket_off(tdb->flist_off, 0),
+	return tdb_find_nonzero_off(tdb, bucket_off(flist_off, 0),
 				    bucket, TDB_FREE_BUCKETS);
 }
 
@@ -81,10 +110,10 @@ static int remove_from_list(struct tdb_context *tdb,
 	tdb_off_t off;
 
 	/* Front of list? */
-	if (r->prev == 0) {
+	if (frec_prev(r) == 0) {
 		off = b_off;
 	} else {
-		off = r->prev + offsetof(struct tdb_free_record, next);
+		off = frec_prev(r) + offsetof(struct tdb_free_record, next);
 	}
 
 #ifdef DEBUG
@@ -102,11 +131,11 @@ static int remove_from_list(struct tdb_context *tdb,
 	}
 
 	if (r->next != 0) {
-		off = r->next + offsetof(struct tdb_free_record, prev);
+		off = r->next + offsetof(struct tdb_free_record,magic_and_prev);
 		/* r->next->prev = r->prev */
 
 #ifdef DEBUG
-		if (tdb_read_off(tdb, off) != r_off) {
+		if ((tdb_read_off(tdb, off) & TDB_OFF_MASK) != r_off) {
 			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
 				 "remove_from_list: %llu bad list %llu\n",
 				 (long long)r_off, (long long)b_off);
@@ -114,7 +143,7 @@ static int remove_from_list(struct tdb_context *tdb,
 		}
 #endif
 
-		if (tdb_write_off(tdb, off, r->prev)) {
+		if (tdb_write_off(tdb, off, r->magic_and_prev)) {
 			return -1;
 		}
 	}
@@ -125,57 +154,65 @@ static int remove_from_list(struct tdb_context *tdb,
 static int enqueue_in_free(struct tdb_context *tdb,
 			   tdb_off_t b_off,
 			   tdb_off_t off,
-			   struct tdb_free_record *new)
+			   tdb_len_t len)
 {
-	new->prev = 0;
+	struct tdb_free_record new;
+	uint64_t magic = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL));
+
+	/* Pack the free-list index into the bits above len in flist_and_len. */
+	new.flist_and_len = ((uint64_t)tdb->flist << (64 - TDB_OFF_UPPER_STEAL))
+		| len;
+	/* prev = 0: only the free magic is stored in magic_and_prev. */
+	new.magic_and_prev = magic;
+
 	/* new->next = head. */
-	new->next = tdb_read_off(tdb, b_off);
-	if (new->next == TDB_OFF_ERR)
+	new.next = tdb_read_off(tdb, b_off);
+	if (new.next == TDB_OFF_ERR)
 		return -1;
 
-	if (new->next) {
+	if (new.next) {
 #ifdef DEBUG
 		if (tdb_read_off(tdb,
-				 new->next
-				 + offsetof(struct tdb_free_record, prev))
-		    != 0) {
+				 new.next + offsetof(struct tdb_free_record,
+						     magic_and_prev))
+		    != magic) {
 			tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
 				 "enqueue_in_free: %llu bad head prev %llu\n",
-				 (long long)new->next, (long long)b_off);
+				 (long long)new.next, (long long)b_off);
 			return -1;
 		}
 #endif
 		/* next->prev = new. */
-		if (tdb_write_off(tdb, new->next
-				  + offsetof(struct tdb_free_record, prev),
-				  off) != 0)
+		if (tdb_write_off(tdb, new.next
+				  + offsetof(struct tdb_free_record,
+					     magic_and_prev),
+				  off | magic) != 0)
 			return -1;
 	}
 	/* head = new */
 	if (tdb_write_off(tdb, b_off, off) != 0)
 		return -1;
 
-	return tdb_write_convert(tdb, off, new, sizeof(*new));
+	return tdb_write_convert(tdb, off, &new, sizeof(new));
 }
 
 /* List need not be locked. */
 int add_free_record(struct tdb_context *tdb,
 		    tdb_off_t off, tdb_len_t len_with_header)
 {
-	struct tdb_free_record new;
 	tdb_off_t b_off;
+	tdb_len_t len;	/* len_with_header minus the used-record header size */
 	int ret;
 
-	assert(len_with_header >= sizeof(new));
+	assert(len_with_header >= sizeof(struct tdb_free_record));
 
-	new.magic_and_meta = TDB_FREE_MAGIC;
-	new.data_len = len_with_header - sizeof(struct tdb_used_record);
+	len = len_with_header - sizeof(struct tdb_used_record);
 
-	b_off = bucket_off(tdb->flist_off, size_to_bucket(new.data_len));
+	b_off = bucket_off(tdb->flist_off, size_to_bucket(len));
 	if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0)
 		return -1;
 
-	ret = enqueue_in_free(tdb, b_off, off, &new);
+	ret = enqueue_in_free(tdb, b_off, off, len);
 	tdb_unlock_free_bucket(tdb, b_off);
 	return ret;
 }
@@ -207,6 +244,20 @@ static size_t record_leftover(size_t keylen, size_t datalen,
 	return leftover;
 }
 
+static tdb_off_t flist_offset(struct tdb_context *tdb, unsigned int flist)
+{
+	tdb_off_t off;
+	unsigned int i;
+	/* Fast path: the wanted list is the one cached in tdb->flist_off. */
+	if (likely(tdb->flist == flist))
+		return tdb->flist_off;
+	/* Else walk flist links from the head; read errors are not checked. */
+	off = first_flist(tdb);
+	for (i = 0; i < flist; i++)
+		off = next_flist(tdb, off);
+	return off;
+}
+
 /* Note: we unlock the current bucket if we coalesce or fail. */
 static int coalesce(struct tdb_context *tdb,
 		    tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
@@ -214,10 +265,12 @@ static int coalesce(struct tdb_context *tdb,
 	struct tdb_free_record pad, *r;
 	tdb_off_t end;
 
+	add_stat(tdb, alloc_coalesce_tried, 1);
 	end = off + sizeof(struct tdb_used_record) + data_len;
 
 	while (end < tdb->map_size) {
 		tdb_off_t nb_off;
+		unsigned flist, bucket;
 
 		/* FIXME: do tdb_get here and below really win? */
 		r = tdb_get(tdb, end, &pad, sizeof(pad));
@@ -227,12 +280,15 @@ static int coalesce(struct tdb_context *tdb,
 		if (frec_magic(r) != TDB_FREE_MAGIC)
 			break;
 
-		/* FIXME: Use flist from record */
-		nb_off = bucket_off(tdb->flist_off,size_to_bucket(r->data_len));
+		flist = frec_flist(r);
+		bucket = size_to_bucket(frec_len(r));
+		nb_off = bucket_off(flist_offset(tdb, flist), bucket);
 
 		/* We may be violating lock order here, so best effort. */
-		if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1)
+		if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1) {
+			add_stat(tdb, alloc_coalesce_lockfail, 1);
 			break;
+		}
 
 		/* Now we have lock, re-check. */
 		r = tdb_get(tdb, end, &pad, sizeof(pad));
@@ -242,13 +298,14 @@ static int coalesce(struct tdb_context *tdb,
 		}
 
 		if (unlikely(frec_magic(r) != TDB_FREE_MAGIC)) {
+			add_stat(tdb, alloc_coalesce_race, 1);
 			tdb_unlock_free_bucket(tdb, nb_off);
 			break;
 		}
 
-		if (unlikely(bucket_off(tdb->flist_off,
-					size_to_bucket(r->data_len))
-			     != nb_off)) {
+		if (unlikely(frec_flist(r) != flist)
+		    || unlikely(size_to_bucket(frec_len(r)) != bucket)) {
+			add_stat(tdb, alloc_coalesce_race, 1);
 			tdb_unlock_free_bucket(tdb, nb_off);
 			break;
 		}
@@ -258,8 +315,9 @@ static int coalesce(struct tdb_context *tdb,
 			goto err;
 		}
 
-		end += sizeof(struct tdb_used_record) + r->data_len;
+		end += sizeof(struct tdb_used_record) + frec_len(r);
 		tdb_unlock_free_bucket(tdb, nb_off);
+		add_stat(tdb, alloc_coalesce_num_merged, 1);
 	}
 
 	/* Didn't find any adjacent free? */
@@ -271,11 +329,11 @@ static int coalesce(struct tdb_context *tdb,
 	if (!r)
 		goto err;
 
-	if (r->data_len != data_len) {
+	if (frec_len(r) != data_len) {
 		tdb->ecode = TDB_ERR_CORRUPT;
 		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
 			 "coalesce: expected data len %llu not %llu\n",
-			 (long long)data_len, (long long)r->data_len);
+			 (long long)data_len, (long long)frec_len(r));
 		goto err;
 	}
 
@@ -288,11 +346,13 @@ static int coalesce(struct tdb_context *tdb,
 
 	/* We have to drop this to avoid deadlocks, so make sure record
 	 * doesn't get coalesced by someone else! */
-	r->magic_and_meta = TDB_COALESCING_MAGIC;
-	r->data_len = end - off - sizeof(struct tdb_used_record);
+	r->magic_and_prev = TDB_COALESCING_MAGIC << (64 - TDB_OFF_UPPER_STEAL);
+	/* FIXME: Use 255 as invalid free list? */
+	r->flist_and_len = end - off - sizeof(struct tdb_used_record);
 	if (tdb_access_commit(tdb, r) != 0)
 		goto err;
 
+	add_stat(tdb, alloc_coalesce_succeeded, 1);
 	tdb_unlock_free_bucket(tdb, b_off);
 
 	if (add_free_record(tdb, off, end - off) == -1)
@@ -318,6 +378,7 @@ static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
 	double multiplier;
 	size_t size = adjust_size(keylen, datalen);
 
+	add_stat(tdb, allocs, 1);
 again:
 	b_off = bucket_off(flist_off, bucket);
 
@@ -327,7 +388,7 @@ again:
 		return TDB_OFF_ERR;
 	}
 
-	best.data_len = -1ULL;
+	best.flist_and_len = -1ULL;
 	best_off = 0;
 
 	/* Get slack if we're after extra. */
@@ -351,22 +412,22 @@ again:
 		if (frec_magic(r) != TDB_FREE_MAGIC) {
 			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
 				 "lock_and_alloc: %llu non-free 0x%llx\n",
-				 (long long)off, (long long)r->magic_and_meta);
+				 (long long)off, (long long)r->magic_and_prev);
 			goto unlock_err;
 		}
 
-		if (r->data_len >= size && r->data_len < best.data_len) {
+		if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
 			best_off = off;
 			best = *r;
 		}
 
-		if (best.data_len < size * multiplier && best_off)
+		if (frec_len(&best) < size * multiplier && best_off)
 			break;
 
 		multiplier *= 1.01;
 
 		/* Since we're going slow anyway, try coalescing here. */
-		switch (coalesce(tdb, off, b_off, r->data_len)) {
+		switch (coalesce(tdb, off, b_off, frec_len(r))) {
 		case -1:
 			/* This has already unlocked on error. */
 			return -1;
@@ -387,28 +448,31 @@ again:
 			goto unlock_err;
 
 		leftover = record_leftover(keylen, datalen, want_extra,
-					   best.data_len);
+					   frec_len(&best));
 
-		assert(keylen + datalen + leftover <= best.data_len);
+		assert(keylen + datalen + leftover <= frec_len(&best));
 		/* We need to mark non-free before we drop lock, otherwise
 		 * coalesce() could try to merge it! */
-		if (set_header(tdb, &rec, keylen, datalen,
-			       best.data_len - leftover,
-			       hashlow) != 0)
+		if (set_used_header(tdb, &rec, keylen, datalen,
+				    frec_len(&best) - leftover,
+				    hashlow) != 0)
 			goto unlock_err;
 
 		if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0)
 			goto unlock_err;
 
-		tdb_unlock_free_bucket(tdb, b_off);
-
+		/* Bucket of leftover will be <= current bucket, so nested
+		 * locking is allowed. */
 		if (leftover) {
+			add_stat(tdb, alloc_leftover, 1);
 			if (add_free_record(tdb,
 					    best_off + sizeof(rec)
-					    + best.data_len - leftover,
+					    + frec_len(&best) - leftover,
 					    leftover))
-				return TDB_OFF_ERR;
+				best_off = TDB_OFF_ERR;
 		}
+		tdb_unlock_free_bucket(tdb, b_off);
+
 		return best_off;
 	}
 
@@ -425,8 +489,9 @@ static tdb_off_t get_free(struct tdb_context *tdb,
 			  size_t keylen, size_t datalen, bool want_extra,
 			  unsigned hashlow)
 {
-	tdb_off_t off;
-	unsigned start_b, b;
+	tdb_off_t off, flist_off;
+	unsigned start_b, b, flist;
+	bool wrapped = false;
 
 	/* If they are growing, add 50% to get to higher bucket. */
 	if (want_extra)
@@ -435,27 +500,50 @@ static tdb_off_t get_free(struct tdb_context *tdb,
 	else
 		start_b = size_to_bucket(adjust_size(keylen, datalen));
 
-	/* Start at exact size bucket, and search up... */
-	for (b = find_free_head(tdb, start_b);
-	     b < TDB_FREE_BUCKETS;
-	     b = find_free_head(tdb, b + 1)) {
-		/* Try getting one from list. */
-		off = lock_and_alloc(tdb, tdb->flist_off,
-				     b, keylen, datalen, want_extra,
-				     hashlow);
-		if (off == TDB_OFF_ERR)
-			return TDB_OFF_ERR;
-		if (off != 0)
-			return off;
-		/* Didn't work.  Try next bucket. */
+	flist_off = tdb->flist_off;
+	flist = tdb->flist;
+	while (!wrapped || flist_off != tdb->flist_off) {
+		/* Start at exact size bucket, and search up... */
+		for (b = find_free_head(tdb, flist_off, start_b);
+		     b < TDB_FREE_BUCKETS;
+		     b = find_free_head(tdb, flist_off, b + 1)) {
+			/* Try getting one from list. */
+			off = lock_and_alloc(tdb, flist_off,
+					     b, keylen, datalen, want_extra,
+					     hashlow);
+			if (off == TDB_OFF_ERR)
+				return TDB_OFF_ERR;
+			if (off != 0) {
+				if (b == start_b)
+					add_stat(tdb, alloc_bucket_exact, 1);
+				if (b == TDB_FREE_BUCKETS - 1)
+					add_stat(tdb, alloc_bucket_max, 1);
+				/* Worked?  Stay using this list. */
+				tdb->flist_off = flist_off;
+				tdb->flist = flist;
+				return off;
+			}
+			/* Didn't work.  Try next bucket. */
+		}
+
+		/* Hmm, try next list. */
+		flist_off = next_flist(tdb, flist_off);
+		flist++;
+
+		if (flist_off == 0) {
+			wrapped = true;
+			flist_off = first_flist(tdb);
+			flist = 0;
+		}
 	}
+
 	return 0;
 }
 
-int set_header(struct tdb_context *tdb,
-	       struct tdb_used_record *rec,
-	       uint64_t keylen, uint64_t datalen,
-	       uint64_t actuallen, unsigned hashlow)
+int set_used_header(struct tdb_context *tdb,
+		    struct tdb_used_record *rec,
+		    uint64_t keylen, uint64_t datalen,
+		    uint64_t actuallen, unsigned hashlow)
 {
 	uint64_t keybits = (fls64(keylen) + 1) / 2;
 
@@ -489,6 +577,22 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 	/* We need room for the record header too. */
 	wanted = sizeof(struct tdb_used_record) + size;
 
+	/* Need to hold a hash lock to expand DB: transactions rely on it. */
+	if (!(tdb->flags & TDB_NOLOCK)
+	    && !tdb->allrecord_lock.count && !tdb_has_hash_locks(tdb)) {
+		tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
+			 "tdb_expand: must hold lock during expand\n");
+		return -1;
+	}
+
+	/* Always make room for at least TDB_EXTENSION_FACTOR more records of
+	 * this size, and for at least 25% more space. */
+	if (size * TDB_EXTENSION_FACTOR > tdb->map_size / 4)
+		wanted = size * TDB_EXTENSION_FACTOR;
+	else
+		wanted = tdb->map_size / 4;
+	wanted = adjust_size(0, wanted);
+
 	/* Only one person can expand file at a time. */
 	if (tdb_lock_expand(tdb, F_WRLCK) != 0)
 		return -1;
@@ -501,7 +605,7 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 		return 0;
 	}
 
-	if (tdb->methods->expand_file(tdb, wanted*TDB_EXTENSION_FACTOR) == -1) {
+	if (tdb->methods->expand_file(tdb, wanted) == -1) {
 		tdb_unlock_expand(tdb, F_WRLCK);
 		return -1;
 	}
@@ -509,7 +613,8 @@ static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 	/* We need to drop this lock before adding free record. */
 	tdb_unlock_expand(tdb, F_WRLCK);
 
-	return add_free_record(tdb, old_size, wanted * TDB_EXTENSION_FACTOR);
+	add_stat(tdb, expands, 1);
+	return add_free_record(tdb, old_size, wanted);
 }
 
 /* This won't fail: it will expand the database if it has to. */