X-Git-Url: http://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Ftdb2%2Ffree.c;h=be2b91f1dec0a988c539e3dc2b86874fe53e9b7d;hp=83d916fc6986ceec757131b51b44a8c13eaa022a;hb=453d3fe2bdec8e430b85f4cd595a9ad2f8ebcbb3;hpb=39f01834db9b6a21d076e67d1e3143ab99aaf43e

diff --git a/ccan/tdb2/free.c b/ccan/tdb2/free.c
index 83d916fc..be2b91f1 100644
--- a/ccan/tdb2/free.c
+++ b/ccan/tdb2/free.c
@@ -45,10 +45,17 @@ static unsigned int quick_random(struct tdb_context *tdb)
 }
 
 /* Start by using a random zone to spread the load. */
-uint64_t random_free_zone(struct tdb_context *tdb)
+void tdb_zone_init(struct tdb_context *tdb)
 {
-	/* num_zones might be out of date, but can only increase */
-	return quick_random(tdb) % tdb->header.v.num_zones;
+	/*
+	 * We read num_zones without a proper lock, so we could have
+	 * gotten a partial read.  Since zone_bits is 1 byte long, we
+	 * can trust that; even if it's increased, the number of zones
+	 * cannot have decreased.  And using the map size means we
+	 * will not start with a zone which hasn't been filled yet.
+	 */
+	tdb->last_zone = quick_random(tdb)
+		% ((tdb->map_size >> tdb->header.v.zone_bits) + 1);
 }
 
 static unsigned fls64(uint64_t val)
@@ -123,7 +130,7 @@ tdb_off_t zone_of(struct tdb_context *tdb, tdb_off_t off)
 	return off >> tdb->header.v.zone_bits;
 }
 
-/* Returns fl->max_bucket + 1, or list number to search. */
+/* Returns free_buckets + 1, or list number to search. */
 static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
 {
 	tdb_off_t first, off;
@@ -131,7 +138,7 @@ static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
 	/* Speculatively search for a non-zero bucket. */
 	first = tdb->last_zone * (tdb->header.v.free_buckets+1) + bucket;
 	off = tdb_find_nonzero_off(tdb, free_list_off(tdb, first),
-				   tdb->header.v.free_buckets - bucket);
+				   tdb->header.v.free_buckets + 1 - bucket);
 	return bucket + off;
 }
 
@@ -203,7 +210,7 @@ int add_free_record(struct tdb_context *tdb,
 	tdb->last_zone = zone_of(tdb, off);
 	list = tdb->last_zone * (tdb->header.v.free_buckets+1)
 		+ size_to_bucket(tdb, new.data_len);
-	
+
 	if (tdb_lock_free_list(tdb, list, TDB_LOCK_WAIT) != 0)
 		return -1;
 
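An aside on the bucket arithmetic in the hunks above and below: the free
lists form one flat array, with each zone owning free_buckets+1 lists (one
per size bucket, plus a final bucket that huge_alloc uses for the largest
blocks). That layout is why the code keeps multiplying by (free_buckets+1),
and why find_free_head must now scan free_buckets + 1 - bucket entries. A
standalone sketch, with a made-up FREE_BUCKETS constant standing in for
tdb->header.v.free_buckets:

#include <assert.h>
#include <stdint.h>

#define FREE_BUCKETS 32	/* stand-in for tdb->header.v.free_buckets */

static uint64_t free_list_index(uint64_t zone, uint64_t bucket)
{
	/* Zone N's lists start right after zone N-1's final bucket. */
	return zone * (FREE_BUCKETS + 1) + bucket;
}

int main(void)
{
	/* Bucket 0 of zone 1 follows zone 0's last (huge-block) bucket. */
	assert(free_list_index(1, 0) == FREE_BUCKETS + 1);
	assert(free_list_index(0, FREE_BUCKETS) == FREE_BUCKETS);
	return 0;
}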
@@ -312,7 +319,7 @@ static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
 				tdb_len_t *actual)
 {
 	tdb_off_t list;
-	tdb_off_t off, prev, best_off;
+	tdb_off_t off, best_off;
 	struct tdb_free_record pad, best = { 0 }, *r;
 	double multiplier;
 
@@ -324,27 +331,21 @@ again:
 		return TDB_OFF_ERR;
 	}
 
-	prev = free_list_off(tdb, list);
-	off = tdb_read_off(tdb, prev);
-
-	if (unlikely(off == TDB_OFF_ERR))
-		goto unlock_err;
-
 	best.data_len = -1ULL;
 	best_off = 0;
 	multiplier = 1.0;
 
 	/* Walk the list to see if any are large enough, getting less fussy
 	 * as we go. */
-	while (off) {
-		prev = off;
-		off = tdb_read_off(tdb, prev);
-		if (unlikely(off == TDB_OFF_ERR))
-			goto unlock_err;
+	off = tdb_read_off(tdb, free_list_off(tdb, list));
+	if (unlikely(off == TDB_OFF_ERR))
+		goto unlock_err;
 
+	while (off) {
 		r = tdb_get(tdb, off, &pad, sizeof(*r));
 		if (!r)
 			goto unlock_err;
+
 		if (r->magic != TDB_FREE_MAGIC) {
 			tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
 				 "lock_and_alloc: %llu non-free 0x%llx\n",
@@ -357,18 +358,9 @@ again:
 			best = *r;
 		}
 
-		if (best.data_len < size * multiplier && best_off) {
-			/* We're happy with this size: take it. */
-			if (remove_from_list(tdb, list, &best) != 0)
-				goto unlock_err;
-			tdb_unlock_free_list(tdb, list);
+		if (best.data_len < size * multiplier && best_off)
+			goto use_best;
 
-			if (to_used_record(tdb, best_off, size, best.data_len,
-					   actual)) {
-				return -1;
-			}
-			return best_off;
-		}
 		multiplier *= 1.01;
 
 		/* Since we're going slow anyway, try coalescing here. */
@@ -380,6 +372,22 @@ again:
 		/* This has unlocked list, restart. */
 		goto again;
 	}
+		off = r->next;
+	}
+
+	/* If we found anything at all, use it. */
+	if (best_off) {
+	use_best:
+		/* We're happy with this size: take it. */
+		if (remove_from_list(tdb, list, &best) != 0)
+			goto unlock_err;
+		tdb_unlock_free_list(tdb, list);
+
+		if (to_used_record(tdb, best_off, size, best.data_len,
+				   actual)) {
+			return -1;
+		}
+		return best_off;
 	}
 
 	tdb_unlock_free_list(tdb, list);
@@ -396,20 +404,17 @@ static tdb_off_t huge_alloc(struct tdb_context *tdb, size_t size,
 {
 	tdb_off_t i, off;
 
-	do {
-		for (i = 0; i < tdb->header.v.num_zones; i++) {
-			/* Try getting one from list. */
-			off = lock_and_alloc(tdb, tdb->header.v.free_buckets,
-					     size, actual);
-			if (off == TDB_OFF_ERR)
-				return TDB_OFF_ERR;
-			if (off != 0)
-				return off;
-			/* FIXME: Coalesce! */
-		}
-	} while (tdb_expand(tdb, 0, size, false) == 0);
-
-	return TDB_OFF_ERR;
+	for (i = 0; i < tdb->header.v.num_zones; i++) {
+		/* Try getting one from list. */
+		off = lock_and_alloc(tdb, tdb->header.v.free_buckets,
+				     size, actual);
+		if (off == TDB_OFF_ERR)
+			return TDB_OFF_ERR;
+		if (off != 0)
+			return off;
+		/* FIXME: Coalesce! */
+	}
+	return 0;
 }
 
 static tdb_off_t get_free(struct tdb_context *tdb, size_t size,
@@ -433,11 +438,11 @@ static tdb_off_t get_free(struct tdb_context *tdb, size_t size,
 	tdb_off_t b;
 
 	/* Start at exact size bucket, and search up... */
-	for (b = bucket; b <= tdb->header.v.num_zones; b++) {
+	for (b = bucket; b <= tdb->header.v.free_buckets; b++) {
 		b = find_free_head(tdb, b);
 
 		/* Non-empty list? Try getting block. */
-		if (b <= tdb->header.v.num_zones) {
+		if (b <= tdb->header.v.free_buckets) {
 			/* Try getting one from list. */
 			off = lock_and_alloc(tdb, b, size, actual);
 			if (off == TDB_OFF_ERR)
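A toy illustration (not tdb code) of the "less fussy" walk in
lock_and_alloc() above: a candidate is only remembered if it is at least
size bytes, and the walk stops once the best candidate wastes less than
(multiplier - 1) * size, with multiplier growing by 1% per record examined:

#include <stdio.h>

int main(void)
{
	double multiplier = 1.0;
	unsigned int walked;

	/* lock_and_alloc() takes the best fit so far once
	 * best.data_len < size * multiplier; multiplier starts at 1.0
	 * and is multiplied by 1.01 after every record walked. */
	for (walked = 0; walked <= 100; walked++) {
		if (walked % 25 == 0)
			printf("after %3u records: accept best < %.2f * size\n",
			       walked, multiplier);
		multiplier *= 1.01;
	}
	return 0;
}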
@@ -554,82 +559,39 @@ static bool zones_happy(struct tdb_context *tdb)
 	return true;
 }
 
-/* Expand the database. */
-int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
-	       bool growing)
+/* Returns how much extra room we get, or TDB_OFF_ERR. */
+static tdb_len_t expand_to_fill_zones(struct tdb_context *tdb)
 {
-	uint64_t new_num_buckets, new_num_zones, new_zone_bits;
-	uint64_t old_num_total, i;
-	tdb_len_t add, freebucket_size, needed;
-	tdb_off_t off, old_free_off;
-	const tdb_off_t *oldf;
-	struct tdb_used_record fhdr;
-
-	/* We need room for the record header too. */
-	needed = sizeof(struct tdb_used_record)
-		+ adjust_size(klen, dlen, growing);
-
-	/* FIXME: this is overkill. An expand lock? */
-	if (tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false) == -1)
-		return -1;
-
-	/* Someone may have expanded for us. */
-	if (update_header(tdb))
-		goto success;
-
-	/* Make sure we have the latest size. */
-	tdb->methods->oob(tdb, tdb->map_size + 1, true);
-
-	/* Did we enlarge zones without enlarging file? */
-	if (tdb->map_size < tdb->header.v.num_zones << tdb->header.v.zone_bits) {
-		add = (tdb->header.v.num_zones << tdb->header.v.zone_bits)
-			- tdb->map_size;
-		/* Updates tdb->map_size. */
-		if (tdb->methods->expand_file(tdb, tdb->map_size, add) == -1)
-			goto fail;
-		if (add_free_record(tdb, tdb->map_size - add, add) == -1)
-			goto fail;
-		if (add >= needed) {
-			/* Allocate from this zone. */
-			tdb->last_zone = zone_of(tdb, tdb->map_size - add);
-			goto success;
-		}
-	}
+	tdb_len_t add;
 
-	/* Slow path. Should we increase the number of buckets? */
-	new_num_buckets = tdb->header.v.free_buckets;
-	if (larger_buckets_might_help(tdb))
-		new_num_buckets++;
-
-	/* Now we'll need room for the new free buckets, too. Assume
-	 * worst case (zones expand). */
-	needed += sizeof(fhdr)
-		+ ((tdb->header.v.num_zones+1)
-		   * (new_num_buckets+1) * sizeof(tdb_off_t));
+	/* We can enlarge zones without enlarging file to match. */
+	add = (tdb->header.v.num_zones << tdb->header.v.zone_bits)
+		- tdb->map_size;
+	if (add <= sizeof(struct tdb_free_record))
+		return 0;
 
-	/* If we need less that one zone, and they're working well, just add
-	 * another one. */
-	if (needed < (1UL << tdb->header.v.zone_bits) && zones_happy(tdb)) {
-		new_num_zones = tdb->header.v.num_zones+1;
-		new_zone_bits = tdb->header.v.zone_bits;
-		add = 1ULL << tdb->header.v.zone_bits;
-	} else {
-		/* Increase the zone size. */
-		new_num_zones = tdb->header.v.num_zones;
-		new_zone_bits = tdb->header.v.zone_bits+1;
-		while ((new_num_zones << new_zone_bits) - tdb->map_size
-		       < needed) {
-			new_zone_bits++;
-		}
+	/* Updates tdb->map_size. */
+	if (tdb->methods->expand_file(tdb, add) == -1)
+		return TDB_OFF_ERR;
+	if (add_free_record(tdb, tdb->map_size - add, add) == -1)
+		return TDB_OFF_ERR;
+	return add;
+}
 
-		/* We expand by enough zones to meet the need. */
-		add = (needed + (1ULL << new_zone_bits)-1)
-			& ~((1ULL << new_zone_bits)-1);
-	}
+static int update_zones(struct tdb_context *tdb,
+			uint64_t new_num_zones,
+			uint64_t new_zone_bits,
+			uint64_t new_num_buckets,
+			tdb_len_t add)
+{
+	tdb_len_t freebucket_size;
+	const tdb_off_t *oldf;
+	tdb_off_t i, off, old_num_total, old_free_off;
+	struct tdb_used_record fhdr;
 
 	/* Updates tdb->map_size. */
-	if (tdb->methods->expand_file(tdb, tdb->map_size, add) == -1)
-		goto fail;
+	if (tdb->methods->expand_file(tdb, add) == -1)
+		return -1;
 
 	/* Use first part as new free bucket array. */
 	off = tdb->map_size - add;
@@ -638,9 +600,9 @@ int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
 
 	/* Write header. */
 	if (set_header(tdb, &fhdr, 0, freebucket_size, freebucket_size, 0))
-		goto fail;
+		return -1;
 	if (tdb_write_convert(tdb, off, &fhdr, sizeof(fhdr)) == -1)
-		goto fail;
+		return -1;
 
 	/* Adjust off to point to start of buckets, add to be remainder. */
 	add -= freebucket_size + sizeof(fhdr);
@@ -650,15 +612,17 @@ int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
 	old_num_total = tdb->header.v.num_zones*(tdb->header.v.free_buckets+1);
 	old_free_off = tdb->header.v.free_off;
 	oldf = tdb_access_read(tdb, old_free_off,
-			       old_num_total * sizeof(tdb_off_t));
+			       old_num_total * sizeof(tdb_off_t), true);
 	if (!oldf)
-		goto fail;
+		return -1;
 
 	/* Switch to using our new zone. */
-	if (zero_out(tdb, off, new_num_zones * (new_num_buckets + 1)) == -1)
+	if (zero_out(tdb, off, freebucket_size) == -1)
 		goto fail_release;
+
 	tdb->header.v.free_off = off;
 	tdb->header.v.num_zones = new_num_zones;
+	tdb->header.v.zone_bits = new_zone_bits;
 	tdb->header.v.free_buckets = new_num_buckets;
 
 	/* FIXME: If zone size hasn't changed, can simply copy pointers. */
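The expand_to_fill_zones() hunk above exploits the fact that the zone
geometry in the header may legitimately describe more bytes than the file
holds: another process can bump num_zones or zone_bits first and grow the
file afterwards. A standalone sketch (hypothetical helper name) of the
shortfall it computes:

#include <assert.h>
#include <stdint.h>

/* How many bytes the file is short of covering every zone the header
 * describes; expand_to_fill_zones() grows the file by this much and
 * hands the new space to add_free_record(). */
static uint64_t zone_shortfall(uint64_t num_zones, unsigned int zone_bits,
			       uint64_t map_size)
{
	return (num_zones << zone_bits) - map_size;
}

int main(void)
{
	/* Four 1MB zones declared, 3MB actually mapped: 1MB short. */
	assert(zone_shortfall(4, 20, 3 << 20) == 1 << 20);
	return 0;
}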
@@ -682,13 +646,14 @@ int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
 		}
 	}
 
-	/* Free up the old free buckets. */
 	old_free_off -= sizeof(fhdr);
 	if (tdb_read_convert(tdb, old_free_off, &fhdr, sizeof(fhdr)) == -1)
 		goto fail_release;
 	if (add_free_record(tdb, old_free_off,
-			    rec_data_length(&fhdr)+rec_extra_padding(&fhdr)))
+			    sizeof(fhdr)
+			    + rec_data_length(&fhdr)
+			    + rec_extra_padding(&fhdr)))
 		goto fail_release;
 
 	/* Add the rest as a new free record. */
@@ -698,12 +663,95 @@ int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
 	/* Start allocating from where the new space is. */
 	tdb->last_zone = zone_of(tdb, tdb->map_size - add);
 	tdb_access_release(tdb, oldf);
+	return write_header(tdb);
+
+fail_release:
+	tdb_access_release(tdb, oldf);
+	return -1;
+}
+
+/* Expand the database. */
+int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
+	       bool growing)
+{
+	uint64_t new_num_buckets, new_num_zones, new_zone_bits;
+	uint64_t old_num_zones, old_size, old_zone_bits;
+	tdb_len_t add, needed;
+
+	/* We need room for the record header too. */
+	needed = sizeof(struct tdb_used_record)
+		+ adjust_size(klen, dlen, growing);
+
+	/* tdb_allrecord_lock will update header; did zones change? */
+	old_zone_bits = tdb->header.v.zone_bits;
+	old_num_zones = tdb->header.v.num_zones;
+
+	/* FIXME: this is overkill. An expand lock? */
+	if (tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false) == -1)
+		return -1;
+
+	/* Someone may have expanded for us. */
+	if (old_zone_bits != tdb->header.v.zone_bits
+	    || old_num_zones != tdb->header.v.num_zones)
+		goto success;
+
+	/* They may have also expanded the underlying size (otherwise we'd
+	 * have expanded our mmap to look at those offsets already). */
+	old_size = tdb->map_size;
+	tdb->methods->oob(tdb, tdb->map_size + 1, true);
+	if (tdb->map_size != old_size)
+		goto success;
+
+	add = expand_to_fill_zones(tdb);
+	if (add == TDB_OFF_ERR)
+		goto fail;
+
+	if (add >= needed) {
+		/* Allocate from this zone. */
+		tdb->last_zone = zone_of(tdb, tdb->map_size - add);
+		goto success;
+	}
+
+	/* Slow path. Should we increase the number of buckets? */
+	new_num_buckets = tdb->header.v.free_buckets;
+	if (larger_buckets_might_help(tdb))
+		new_num_buckets++;
+
+	/* Now we'll need room for the new free buckets, too. Assume
+	 * worst case (zones expand). */
+	needed += sizeof(struct tdb_used_record)
+		+ ((tdb->header.v.num_zones+1)
+		   * (new_num_buckets+1) * sizeof(tdb_off_t));
+
+	/* If we need less than one zone, and they're working well, just add
+	 * another one. */
+	if (needed < (1UL << tdb->header.v.zone_bits) && zones_happy(tdb)) {
+		new_num_zones = tdb->header.v.num_zones+1;
+		new_zone_bits = tdb->header.v.zone_bits;
+		add = 1ULL << tdb->header.v.zone_bits;
+	} else {
+		/* Increase the zone size. */
+		new_num_zones = tdb->header.v.num_zones;
+		new_zone_bits = tdb->header.v.zone_bits+1;
+		while ((new_num_zones << new_zone_bits)
+		       < tdb->map_size + needed) {
+			new_zone_bits++;
+		}
+
+		/* We expand by enough full zones to meet the need. */
+		add = ((tdb->map_size + needed + (1ULL << new_zone_bits)-1)
+		       & ~((1ULL << new_zone_bits)-1))
+			- tdb->map_size;
+	}
+
+	if (update_zones(tdb, new_num_zones, new_zone_bits, new_num_buckets,
+			 add) == -1)
+		goto fail;
+
 success:
 	tdb_allrecord_unlock(tdb, F_WRLCK);
 	return 0;
 
-fail_release:
-	tdb_access_release(tdb, oldf);
 fail:
 	tdb_allrecord_unlock(tdb, F_WRLCK);
 	return -1;
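Finally, the rounding in the new tdb_expand()'s else branch grows the file
to the next multiple of the (possibly enlarged) zone size that covers
map_size + needed; the mask trick works because zone sizes are powers of
two. A standalone check (hypothetical helper, not tdb code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Round map_size + needed up to a multiple of the zone size, and
 * return how many bytes that adds; valid because 1ULL << zone_bits
 * is a power of two. */
static uint64_t expand_amount(uint64_t map_size, uint64_t needed,
			      unsigned int zone_bits)
{
	uint64_t zone_size = 1ULL << zone_bits;

	return ((map_size + needed + zone_size - 1) & ~(zone_size - 1))
		- map_size;
}

int main(void)
{
	/* A 16MB map needing 100 more bytes with 16MB zones grows by
	 * exactly one zone. */
	assert(expand_amount(1ULL << 24, 100, 24) == 1ULL << 24);
	printf("grow by %llu bytes\n",
	       (unsigned long long)expand_amount(1ULL << 24, 100, 24));
	return 0;
}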