f5107fadfbd888efae8094b1ff8d3cc4010c3ca8
[ccan] / ccan / tdb2 / free.c
1  /* 
2    Trivial Database 2: free list/block handling
3    Copyright (C) Rusty Russell 2010
4    
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/likely/likely.h>
20 #include <ccan/ilog/ilog.h>
21 #include <time.h>
22 #include <assert.h>
23 #include <limits.h>
24
/* Find last (highest) set bit, 1-based; 0 if val is zero.
 * ccan's ilog64() returns the number of bits needed to hold val, which
 * is exactly "fls" semantics. */
static unsigned fls64(uint64_t val)
{
        return ilog64(val);
}
29
/* Find first (lowest) set bit, 1-based; 0 if val is zero.
 *
 * Must match __builtin_ffsll()/POSIX ffs() semantics: callers (e.g. the
 * zone-size computation in tdb_expand()) subtract 1 from the result to
 * get a bit index.  The previous fallback returned a 0-based index
 * (and returned 0 for both val == 0 and val == 1), disagreeing with the
 * HAVE_BUILTIN_FFSLL path; fixed by starting the accumulator at 1. */
static unsigned ffs64(uint64_t val)
{
#if HAVE_BUILTIN_FFSLL
        return __builtin_ffsll(val);
#else
        unsigned r = 1;

        if (!val)
                return 0;

        /* Binary chop: skip over low all-zero halves, accumulating the
         * position of the lowest set bit. */
        if (!(val & 0xffffffff)) {
                val >>= 32;
                r += 32;
        }
        if (!(val & 0xffff)) {
                val >>= 16;
                r += 16;
        }
        if (!(val & 0xff)) {
                val >>= 8;
                r += 8;
        }
        if (!(val & 0xf)) {
                val >>= 4;
                r += 4;
        }
        if (!(val & 0x3)) {
                val >>= 2;
                r += 2;
        }
        if (!(val & 0x1)) {
                val >>= 1;
                r += 1;
        }
        return r;
#endif
}
67
68 /* In which bucket would we find a particular record size? (ignoring header) */
69 unsigned int size_to_bucket(unsigned int zone_bits, tdb_len_t data_len)
70 {
71         unsigned int bucket;
72
73         /* We can't have records smaller than this. */
74         assert(data_len >= TDB_MIN_DATA_LEN);
75
76         /* Ignoring the header... */
77         if (data_len - TDB_MIN_DATA_LEN <= 64) {
78                 /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
79                 bucket = (data_len - TDB_MIN_DATA_LEN) / 8;
80         } else {
81                 /* After that we go power of 2. */
82                 bucket = fls64(data_len - TDB_MIN_DATA_LEN) + 2;
83         }
84
85         if (unlikely(bucket > BUCKETS_FOR_ZONE(zone_bits)))
86                 bucket = BUCKETS_FOR_ZONE(zone_bits);
87         return bucket;
88 }
89
/* Binary search for the zone for this offset.
 *
 * Zones follow the tdb header and have power-of-two sizes, so we can
 * halve a power-of-two span covering the whole file until the zone
 * header we read at `start` covers `off`.  On success returns the zone
 * start and fills in *zhdr; returns TDB_OFF_ERR on read failure. */
static tdb_off_t off_to_zone(struct tdb_context *tdb, tdb_off_t off,
                             struct free_zone_header *zhdr)
{
        tdb_off_t start, end;

        start = sizeof(struct tdb_header);
        end = start + (1ULL << fls64(tdb->map_size - start));

        for (;;) {
                /* Read the candidate zone's header (also handed to caller). */
                if (tdb_read_convert(tdb, start, zhdr, sizeof(*zhdr)) == -1)
                        return TDB_OFF_ERR;

                /* Is it inside this zone? */
                if (off < start + (1ULL << zhdr->zone_bits))
                        return start;

                /* In practice, start + end won't overflow. */
                if (off >= (start + end) / 2)
                        start = (start + end) / 2;
                else
                        end = (start + end) / 2;
        }
}
114
115 static tdb_off_t last_zone(struct tdb_context *tdb,
116                            struct free_zone_header *zhdr)
117 {
118         return off_to_zone(tdb, tdb->map_size - 1, zhdr);
119 }
120
121 int tdb_zone_init(struct tdb_context *tdb)
122 {
123         unsigned int i;
124         uint64_t randoff = 0;
125
126         /* We start in a random zone, to spread the load. */
127         for (i = 0; i < 64; i += fls64(RAND_MAX))
128                 randoff ^= ((uint64_t)random()) << i;
129         randoff = sizeof(struct tdb_header)
130                 + (randoff % (tdb->map_size - sizeof(struct tdb_header)));
131
132         tdb->zone_off = off_to_zone(tdb, randoff, &tdb->zhdr);
133         if (tdb->zone_off == TDB_OFF_ERR)
134                 return -1;
135         return 0;
136 }
137
138 /* Where's the header, given a zone size of 1 << zone_bits? */
139 static tdb_off_t zone_off(tdb_off_t off, unsigned int zone_bits)
140 {
141         off -= sizeof(struct tdb_header);
142         return (off & ~((1ULL << zone_bits) - 1)) + sizeof(struct tdb_header);
143 }
144
145 /* Offset of a given bucket. */
146 /* FIXME: bucket can be "unsigned" everywhere, or even uint8/16. */
147 tdb_off_t bucket_off(tdb_off_t zone_off, tdb_off_t bucket)
148 {
149         return zone_off
150                 + sizeof(struct free_zone_header)
151                 + bucket * sizeof(tdb_off_t);
152 }
153
/* Returns free_buckets + 1, or list number to search.
 * Scans the current zone's bucket heads from `bucket` upward; the
 * out-of-range result BUCKETS_FOR_ZONE(...) + 1 means every remaining
 * bucket is empty. */
static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
{
        /* Speculatively search for a non-zero bucket. */
        return tdb_find_nonzero_off(tdb, bucket_off(tdb->zone_off, 0),
                                    bucket,
                                    BUCKETS_FOR_ZONE(tdb->zhdr.zone_bits) + 1);
}
162
/* Remove from free bucket.
 *
 * Unlinks the free record *r (on disk at r_off) from the doubly-linked
 * list whose head pointer lives at b_off.  All callers in this file
 * hold the bucket lock for b_off.  Returns 0 on success, -1 on error. */
static int remove_from_list(struct tdb_context *tdb,
                            tdb_off_t b_off, tdb_off_t r_off,
                            struct tdb_free_record *r)
{
        tdb_off_t off;

        /* Front of list? */
        if (r->prev == 0) {
                /* Then the pointer to rewrite is the list head itself. */
                off = b_off;
        } else {
                off = r->prev + offsetof(struct tdb_free_record, next);
        }

#ifdef DEBUG
        /* Sanity check: whatever precedes us must point at us. */
        if (tdb_read_off(tdb, off) != r_off) {
                tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
                         "remove_from_list: %llu bad prev in list %llu\n",
                         (long long)r_off, (long long)b_off);
                return -1;
        }
#endif

        /* r->prev->next = r->next */
        if (tdb_write_off(tdb, off, r->next)) {
                return -1;
        }

        if (r->next != 0) {
                off = r->next + offsetof(struct tdb_free_record, prev);
                /* r->next->prev = r->prev */

#ifdef DEBUG
                /* Sanity check: our successor must point back at us. */
                if (tdb_read_off(tdb, off) != r_off) {
                        tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
                                 "remove_from_list: %llu bad list %llu\n",
                                 (long long)r_off, (long long)b_off);
                        return -1;
                }
#endif

                if (tdb_write_off(tdb, off, r->prev)) {
                        return -1;
                }
        }
        return 0;
}
210
/* Enqueue in this free bucket.
 *
 * Pushes the free record *new (destined for disk offset `off`) onto the
 * front of the list headed at b_off, then writes the record itself.
 * Caller holds the bucket lock for b_off.  Returns 0 or -1. */
static int enqueue_in_free(struct tdb_context *tdb,
                           tdb_off_t b_off,
                           tdb_off_t off,
                           struct tdb_free_record *new)
{
        new->prev = 0;
        /* new->next = head. */
        new->next = tdb_read_off(tdb, b_off);
        if (new->next == TDB_OFF_ERR)
                return -1;

        if (new->next) {
#ifdef DEBUG
                /* Sanity check: current head must have no predecessor. */
                if (tdb_read_off(tdb,
                                 new->next
                                 + offsetof(struct tdb_free_record, prev))
                    != 0) {
                        tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
                                 "enqueue_in_free: %llu bad head prev %llu\n",
                                 (long long)new->next, (long long)b_off);
                        return -1;
                }
#endif
                /* next->prev = new. */
                if (tdb_write_off(tdb, new->next
                                  + offsetof(struct tdb_free_record, prev),
                                  off) != 0)
                        return -1;
        }
        /* head = new */
        if (tdb_write_off(tdb, b_off, off) != 0)
                return -1;

        return tdb_write_convert(tdb, off, new, sizeof(*new));
}
247
248 /* List need not be locked. */
249 int add_free_record(struct tdb_context *tdb,
250                     unsigned int zone_bits,
251                     tdb_off_t off, tdb_len_t len_with_header)
252 {
253         struct tdb_free_record new;
254         tdb_off_t b_off;
255         int ret;
256
257         assert(len_with_header >= sizeof(new));
258         assert(zone_bits < 64);
259
260         new.magic_and_meta = TDB_FREE_MAGIC | zone_bits;
261         new.data_len = len_with_header - sizeof(struct tdb_used_record);
262
263         b_off = bucket_off(zone_off(off, zone_bits),
264                            size_to_bucket(zone_bits, new.data_len));
265         if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0)
266                 return -1;
267
268         ret = enqueue_in_free(tdb, b_off, off, &new);
269         tdb_unlock_free_bucket(tdb, b_off);
270         return ret;
271 }
272
273 static size_t adjust_size(size_t keylen, size_t datalen)
274 {
275         size_t size = keylen + datalen;
276
277         if (size < TDB_MIN_DATA_LEN)
278                 size = TDB_MIN_DATA_LEN;
279
280         /* Round to next uint64_t boundary. */
281         return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
282 }
283
284 /* If we have enough left over to be useful, split that off. */
285 static size_t record_leftover(size_t keylen, size_t datalen,
286                               bool want_extra, size_t total_len)
287 {
288         ssize_t leftover;
289
290         if (want_extra)
291                 datalen += datalen / 2;
292         leftover = total_len - adjust_size(keylen, datalen);
293
294         if (leftover < (ssize_t)sizeof(struct tdb_free_record))
295                 return 0;
296
297         return leftover;
298 }
299
/* Note: we unlock the current bucket if we coalesce or fail.
 *
 * Try to merge the free record at `off` (data_len bytes of data, listed
 * in the bucket at b_off which the caller has locked) with free records
 * physically following it in the same zone.  Returns 1 if a merge
 * happened (b_off unlocked, merged record handed to add_free_record()),
 * 0 if nothing adjacent was free (b_off still locked), or -1 on error
 * (b_off unlocked). */
static int coalesce(struct tdb_context *tdb,
                    tdb_off_t zone_off, unsigned zone_bits,
                    tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
{
        struct tdb_free_record pad, *r;
        tdb_off_t zone_end, end;

        end = off + sizeof(struct tdb_used_record) + data_len;
        zone_end = zone_off + (1ULL << zone_bits);

        /* The zone's nominal end may lie beyond the current file size. */
        if (tdb->methods->oob(tdb, zone_end, true))
                zone_end = tdb->map_size;

        /* Swallow consecutive free records directly after ours. */
        while (end < zone_end) {
                tdb_off_t nb_off;

                /* FIXME: do tdb_get here and below really win? */
                r = tdb_get(tdb, end, &pad, sizeof(pad));
                if (!r)
                        goto err;

                if (frec_magic(r) != TDB_FREE_MAGIC)
                        break;

                nb_off = bucket_off(zone_off,
                                    size_to_bucket(zone_bits, r->data_len));

                /* We may be violating lock order here, so best effort. */
                if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1)
                        break;

                /* Now we have lock, re-check. */
                r = tdb_get(tdb, end, &pad, sizeof(pad));
                if (!r) {
                        tdb_unlock_free_bucket(tdb, nb_off);
                        goto err;
                }

                /* Someone may have allocated it before we got the lock. */
                if (unlikely(frec_magic(r) != TDB_FREE_MAGIC)) {
                        tdb_unlock_free_bucket(tdb, nb_off);
                        break;
                }

                /* ... or resized it into a bucket other than the one we
                 * locked. */
                if (unlikely(bucket_off(zone_off,
                                        size_to_bucket(zone_bits, r->data_len))
                             != nb_off)) {
                        tdb_unlock_free_bucket(tdb, nb_off);
                        break;
                }

                if (remove_from_list(tdb, nb_off, end, r) == -1) {
                        tdb_unlock_free_bucket(tdb, nb_off);
                        goto err;
                }

                end += sizeof(struct tdb_used_record) + r->data_len;
                tdb_unlock_free_bucket(tdb, nb_off);
        }

        /* Didn't find any adjacent free? */
        if (end == off + sizeof(struct tdb_used_record) + data_len)
                return 0;

        /* OK, expand record */
        r = tdb_get(tdb, off, &pad, sizeof(pad));
        if (!r)
                goto err;

        /* Our own record must not have changed: we hold its bucket lock. */
        if (r->data_len != data_len) {
                tdb->ecode = TDB_ERR_CORRUPT;
                tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
                         "coalesce: expected data len %llu not %llu\n",
                         (long long)data_len, (long long)r->data_len);
                goto err;
        }

        if (remove_from_list(tdb, b_off, off, r) == -1)
                goto err;

        r = tdb_access_write(tdb, off, sizeof(*r), true);
        if (!r)
                goto err;

        /* We have to drop this to avoid deadlocks, so make sure record
         * doesn't get coalesced by someone else! */
        r->magic_and_meta = TDB_COALESCING_MAGIC | zone_bits;
        r->data_len = end - off - sizeof(struct tdb_used_record);
        if (tdb_access_commit(tdb, r) != 0)
                goto err;

        tdb_unlock_free_bucket(tdb, b_off);

        /* Re-add the enlarged record (re-locks its new bucket itself). */
        if (add_free_record(tdb, zone_bits, off, end - off) == -1)
                return -1;
        return 1;

err:
        /* To unify error paths, we *always* unlock bucket on error. */
        tdb_unlock_free_bucket(tdb, b_off);
        return -1;
}
402
403 /* We need size bytes to put our key and data in. */
404 static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
405                                 tdb_off_t zone_off,
406                                 unsigned zone_bits,
407                                 tdb_off_t bucket,
408                                 size_t keylen, size_t datalen,
409                                 bool want_extra,
410                                 unsigned hashlow)
411 {
412         tdb_off_t off, b_off,best_off;
413         struct tdb_free_record pad, best = { 0 }, *r;
414         double multiplier;
415         size_t size = adjust_size(keylen, datalen);
416
417 again:
418         b_off = bucket_off(zone_off, bucket);
419
420         /* FIXME: Try non-blocking wait first, to measure contention.
421          * If we're contented, try switching zones, and don't enlarge zone
422          * next time (we want more zones). */
423         /* Lock this bucket. */
424         if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == -1) {
425                 return TDB_OFF_ERR;
426         }
427
428         best.data_len = -1ULL;
429         best_off = 0;
430
431         /* Get slack if we're after extra. */
432         if (want_extra)
433                 multiplier = 1.5;
434         else
435                 multiplier = 1.0;
436
437         /* Walk the list to see if any are large enough, getting less fussy
438          * as we go. */
439         off = tdb_read_off(tdb, b_off);
440         if (unlikely(off == TDB_OFF_ERR))
441                 goto unlock_err;
442
443         while (off) {
444                 /* FIXME: Does tdb_get win anything here? */
445                 r = tdb_get(tdb, off, &pad, sizeof(*r));
446                 if (!r)
447                         goto unlock_err;
448
449                 if (frec_magic(r) != TDB_FREE_MAGIC) {
450                         tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
451                                  "lock_and_alloc: %llu non-free 0x%llx\n",
452                                  (long long)off, (long long)r->magic_and_meta);
453                         goto unlock_err;
454                 }
455
456                 if (r->data_len >= size && r->data_len < best.data_len) {
457                         best_off = off;
458                         best = *r;
459                 }
460
461                 if (best.data_len < size * multiplier && best_off)
462                         break;
463
464                 multiplier *= 1.01;
465
466                 /* Since we're going slow anyway, try coalescing here. */
467                 switch (coalesce(tdb, zone_off, zone_bits, off, b_off,
468                                  r->data_len)) {
469                 case -1:
470                         /* This has already unlocked on error. */
471                         return -1;
472                 case 1:
473                         /* This has unlocked list, restart. */
474                         goto again;
475                 }
476                 off = r->next;
477         }
478
479         /* If we found anything at all, use it. */
480         if (best_off) {
481                 struct tdb_used_record rec;
482                 size_t leftover;
483
484                 /* We're happy with this size: take it. */
485                 if (remove_from_list(tdb, b_off, best_off, &best) != 0)
486                         goto unlock_err;
487
488                 leftover = record_leftover(keylen, datalen, want_extra,
489                                            best.data_len);
490
491                 assert(keylen + datalen + leftover <= best.data_len);
492                 /* We need to mark non-free before we drop lock, otherwise
493                  * coalesce() could try to merge it! */
494                 if (set_header(tdb, &rec, keylen, datalen,
495                                best.data_len - leftover,
496                                hashlow, zone_bits) != 0)
497                         goto unlock_err;
498
499                 if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0)
500                         goto unlock_err;
501
502                 tdb_unlock_free_bucket(tdb, b_off);
503
504                 if (leftover) {
505                         if (add_free_record(tdb, zone_bits,
506                                             best_off + sizeof(rec)
507                                             + best.data_len - leftover,
508                                             leftover))
509                                 return TDB_OFF_ERR;
510                 }
511                 return best_off;
512         }
513
514         tdb_unlock_free_bucket(tdb, b_off);
515         return 0;
516
517 unlock_err:
518         tdb_unlock_free_bucket(tdb, b_off);
519         return TDB_OFF_ERR;
520 }
521
522 static bool next_zone(struct tdb_context *tdb)
523 {
524         tdb_off_t next = tdb->zone_off + (1ULL << tdb->zhdr.zone_bits);
525
526         /* We must have a header. */
527         if (tdb->methods->oob(tdb, next + sizeof(tdb->zhdr), true))
528                 return false;
529
530         tdb->zone_off = next;
531         return tdb_read_convert(tdb, next, &tdb->zhdr, sizeof(tdb->zhdr)) == 0;
532 }
533
/* Offset returned is within current zone (which it may alter).
 *
 * Searches every zone, starting at the current one and wrapping past
 * the end of file back to the first, for a free record big enough for
 * key+data.  Returns the record offset, 0 if every zone was tried
 * without success, or TDB_OFF_ERR on error. */
static tdb_off_t get_free(struct tdb_context *tdb,
                          size_t keylen, size_t datalen, bool want_extra,
                          unsigned hashlow)
{
        tdb_off_t start_zone = tdb->zone_off, off;
        bool wrapped = false;
        size_t size = adjust_size(keylen, datalen);

        /* If they are growing, add 50% to get to higher bucket. */
        if (want_extra)
                size += datalen / 2;

        /* FIXME: If we don't get a hit in the first bucket we want,
         * try changing zones for next time.  That should help wear
         * zones evenly, so we don't need to search all of them before
         * expanding. */
        while (!wrapped || tdb->zone_off != start_zone) {
                tdb_off_t b;

                /* Shortcut for really huge allocations... */
                if ((size >> tdb->zhdr.zone_bits) != 0)
                        goto next;

                /* Start at exact size bucket, and search up... */
                b = size_to_bucket(tdb->zhdr.zone_bits, size);
                for (b = find_free_head(tdb, b);
                     b <= BUCKETS_FOR_ZONE(tdb->zhdr.zone_bits);
                     b = find_free_head(tdb, b + 1)) {
                        /* Try getting one from list. */
                        off = lock_and_alloc(tdb, tdb->zone_off,
                                             tdb->zhdr.zone_bits,
                                             b, keylen, datalen, want_extra,
                                             hashlow);
                        if (off == TDB_OFF_ERR)
                                return TDB_OFF_ERR;
                        if (off != 0)
                                return off;
                        /* Didn't work.  Try next bucket. */
                }

        next:
                /* Didn't work, try next zone, if it exists. */
                if (!next_zone(tdb)) {
                        /* Fell off the end: wrap back to the first zone. */
                        wrapped = true;
                        tdb->zone_off = sizeof(struct tdb_header);
                        if (tdb_read_convert(tdb, tdb->zone_off,
                                             &tdb->zhdr, sizeof(tdb->zhdr))) {
                                return TDB_OFF_ERR;
                        }
                }
        }
        return 0;
}
588
589 int set_header(struct tdb_context *tdb,
590                struct tdb_used_record *rec,
591                uint64_t keylen, uint64_t datalen,
592                uint64_t actuallen, unsigned hashlow,
593                unsigned int zone_bits)
594 {
595         uint64_t keybits = (fls64(keylen) + 1) / 2;
596
597         /* Use bottom bits of hash, so it's independent of hash table size. */
598         rec->magic_and_meta
599                 = zone_bits
600                 | ((hashlow & ((1 << 5)-1)) << 6)
601                 | ((actuallen - (keylen + datalen)) << 11)
602                 | (keybits << 43)
603                 | (TDB_MAGIC << 48);
604         rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
605
606         /* Encoding can fail on big values. */
607         if (rec_key_length(rec) != keylen
608             || rec_data_length(rec) != datalen
609             || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
610                 tdb->ecode = TDB_ERR_IO;
611                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
612                          "Could not encode k=%llu,d=%llu,a=%llu\n",
613                          (long long)keylen, (long long)datalen,
614                          (long long)actuallen);
615                 return -1;
616         }
617         return 0;
618 }
619
/* Should tdb_expand() prefer many small zones over few large ones?
 * FIXME: stub — contention is never measured, so this always says no
 * and expansion always favours large zones. */
static bool zones_contended(struct tdb_context *tdb)
{
        return false;
}
624
625 /* Assume we want buckets up to the comfort factor. */
626 static tdb_len_t overhead(unsigned int zone_bits)
627 {
628         return sizeof(struct free_zone_header)
629                 + (BUCKETS_FOR_ZONE(zone_bits) + 1) * sizeof(tdb_off_t);
630 }
631
/* Expand the database (by adding a zone).
 *
 * Guarantees room for a used-record header plus `size` bytes: either
 * finishes mapping the final, partially-expanded zone or appends a new
 * one, and hands the fresh space to add_free_record().  Serialized via
 * the expand lock.  Returns 0 on success, -1 on error. */
static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
{
        uint64_t old_size;
        tdb_off_t off;
        unsigned int num_buckets, zone_bits;
        tdb_len_t wanted, expand;
        struct free_zone_header zhdr;

        /* We need room for the record header too. */
        wanted = sizeof(struct tdb_used_record) + size;

        /* Only one person can expand file at a time. */
        if (tdb_lock_expand(tdb, F_WRLCK) != 0)
                return -1;

        /* Someone else may have expanded the file, so retry. */
        old_size = tdb->map_size;
        tdb->methods->oob(tdb, tdb->map_size + 1, true);
        if (tdb->map_size != old_size)
                goto success;

        /* Treat last zone as minimum reasonable zone size. */
        off = last_zone(tdb, &zhdr);
        if (off == TDB_OFF_ERR)
                goto fail;

        /* Zone isn't fully expanded? */
        if (tdb->map_size < off + (1ULL << zhdr.zone_bits)) {
                /* Grow toward the zone's full size, capped at a
                 * comfortable multiple of what was asked for. */
                expand = off + (1ULL << zhdr.zone_bits) - tdb->map_size;
                /* Expand more than we want. */
                if (expand > (wanted << TDB_COMFORT_FACTOR_BITS))
                        expand = (wanted << TDB_COMFORT_FACTOR_BITS);
                if (tdb->methods->expand_file(tdb, expand) == -1)
                        goto fail;
                /* We need to drop this lock before adding free record. */
                tdb_unlock_expand(tdb, F_WRLCK);

                /* Allocate from here. */
                tdb->zone_off = off;
                tdb->zhdr = zhdr;

                /* FIXME: If this isn't sufficient, we search again... */
                return add_free_record(tdb, zhdr.zone_bits,
                                       tdb->map_size - expand, expand);
        }

        /* We are never allowed to cross a power-of-two boundary, and our
         * minimum zone size is 1 << INITIAL_ZONE_BITS.
         *
         * If our filesize is 128k, we can add a 64k or a 128k zone.  If it's
         * 192k, we can only add a 64k zone.
         *
         * In other words, our max zone size is (1 << (ffs(filesize) - 1)) */
        zone_bits = ffs64(old_size - sizeof(struct tdb_header)) - 1;
        assert(zone_bits >= INITIAL_ZONE_BITS);

        /* Big zones generally good, but more zones wanted if contended. */
        if (zones_contended(tdb)) {
                /* If it suffices, make zone same size as last one. */
                if (zhdr.zone_bits < zone_bits
                    && (1ULL << zhdr.zone_bits) >= overhead(zone_bits)+wanted)
                        zone_bits = zhdr.zone_bits;
        }

        zhdr.zone_bits = zone_bits;
        num_buckets = BUCKETS_FOR_ZONE(zone_bits);

        /* Expand the file by more than we need right now. */
        expand = 1ULL << zone_bits;
        if (expand > overhead(zone_bits) + (wanted << TDB_COMFORT_FACTOR_BITS))
                expand = overhead(zone_bits)
                        + (wanted << TDB_COMFORT_FACTOR_BITS);

        if (tdb->methods->expand_file(tdb, expand) == -1)
                goto fail;

        /* Write new zone header (at old end). */
        off = old_size;
        if (tdb_write_convert(tdb, off, &zhdr, sizeof(zhdr)) == -1)
                goto fail;

        /* Now write empty buckets. */
        off += sizeof(zhdr);
        if (zero_out(tdb, off, (num_buckets+1) * sizeof(tdb_off_t)) == -1)
                goto fail;
        off += (num_buckets+1) * sizeof(tdb_off_t);

        /* Now add the rest as our free record. */
        if (add_free_record(tdb, zone_bits, off, expand - overhead(zone_bits))
            == -1)
                goto fail;

        /* Try allocating from this zone now. */
        tdb->zone_off = old_size;
        tdb->zhdr = zhdr;

success:
        tdb_unlock_expand(tdb, F_WRLCK);
        return 0;

fail:
        tdb_unlock_expand(tdb, F_WRLCK);
        return -1;
}
737
738 /* This won't fail: it will expand the database if it has to. */
739 tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
740                 uint64_t hash, bool growing)
741 {
742         tdb_off_t off;
743
744         /* We can't hold pointers during this: we could unmap! */
745         assert(!tdb->direct_access);
746
747         for (;;) {
748                 off = get_free(tdb, keylen, datalen, growing, hash);
749                 if (likely(off != 0))
750                         break;
751
752                 if (tdb_expand(tdb, adjust_size(keylen, datalen)))
753                         return TDB_OFF_ERR;
754         }
755
756         return off;
757 }