3917fdaa9333c4eea47fbbffa578b4068fbdaacb
[ccan] / ccan / tdb2 / free.c
1  /* 
2    Trivial Database 2: free list/block handling
3    Copyright (C) Rusty Russell 2010
4    
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/likely/likely.h>
20 #include <ccan/ilog/ilog.h>
21 #include <time.h>
22 #include <assert.h>
23 #include <limits.h>
24
25 static unsigned fls64(uint64_t val)
26 {
27         return ilog64(val);
28 }
29
30 static unsigned ffs64(uint64_t val)
31 {
32 #if HAVE_BUILTIN_FFSLL
33         return __builtin_ffsll(val);
34 #else
35         unsigned r = 0;
36
37         if (!val)
38                 return 0;
39
40         if (!(val & 0xffffffff)) {
41                 val >>= 32;
42                 r += 32;
43         }
44         if (!(val & 0xffff)) {
45                 val >>= 16;
46                 r += 16;
47         }
48         if (!(val & 0xff)) {
49                 val >>= 8;
50                 r += 8;
51         }
52         if (!(val & 0xf)) {
53                 val >>= 4;
54                 r += 4;
55         }
56         if (!(val & 0x3)) {
57                 val >>= 2;
58                 r += 2;
59         }
60         if (!(val & 0x1)) {
61                 val >>= 1;
62                 r += 1;
63         }
64         return r;
65 #endif
66 }
67
68 /* In which bucket would we find a particular record size? (ignoring header) */
69 unsigned int size_to_bucket(unsigned int zone_bits, tdb_len_t data_len)
70 {
71         unsigned int bucket;
72
73         /* We can't have records smaller than this. */
74         assert(data_len >= TDB_MIN_DATA_LEN);
75
76         /* Ignoring the header... */
77         if (data_len - TDB_MIN_DATA_LEN <= 64) {
78                 /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
79                 bucket = (data_len - TDB_MIN_DATA_LEN) / 8;
80         } else {
81                 /* After that we go power of 2. */
82                 bucket = fls64(data_len - TDB_MIN_DATA_LEN) + 2;
83         }
84
85         if (unlikely(bucket > BUCKETS_FOR_ZONE(zone_bits)))
86                 bucket = BUCKETS_FOR_ZONE(zone_bits);
87         return bucket;
88 }
89
90 /* Binary search for the zone for this offset. */
91 static tdb_off_t off_to_zone(struct tdb_context *tdb, tdb_off_t off,
92                              struct free_zone_header *zhdr)
93 {
94         tdb_off_t start, end;
95
96         start = sizeof(struct tdb_header);
97         end = start + (1ULL << fls64(tdb->map_size - start));
98
99         for (;;) {
100                 if (tdb_read_convert(tdb, start, zhdr, sizeof(*zhdr)) == -1)
101                         return TDB_OFF_ERR;
102
103                 /* Is it inside this zone? */
104                 if (off < start + (1ULL << zhdr->zone_bits))
105                         return start;
106
107                 /* In practice, start + end won't overflow. */
108                 if (off >= (start + end) / 2)
109                         start = (start + end) / 2;
110                 else
111                         end = (start + end) / 2;
112         }
113 }
114
115 static tdb_off_t last_zone(struct tdb_context *tdb,
116                            struct free_zone_header *zhdr)
117 {
118         return off_to_zone(tdb, tdb->map_size - 1, zhdr);
119 }
120
121 int tdb_zone_init(struct tdb_context *tdb)
122 {
123         unsigned int i;
124         uint64_t randoff = 0;
125
126         /* We start in a random zone, to spread the load. */
127         for (i = 0; i < 64; i += fls64(RAND_MAX))
128                 randoff ^= ((uint64_t)random()) << i;
129         randoff = sizeof(struct tdb_header)
130                 + (randoff % (tdb->map_size - sizeof(struct tdb_header)));
131
132         tdb->zone_off = off_to_zone(tdb, randoff, &tdb->zhdr);
133         if (tdb->zone_off == TDB_OFF_ERR)
134                 return -1;
135         return 0;
136 }
137
138 /* Where's the header, given a zone size of 1 << zone_bits? */
139 static tdb_off_t zone_off(tdb_off_t off, unsigned int zone_bits)
140 {
141         off -= sizeof(struct tdb_header);
142         return (off & ~((1ULL << zone_bits) - 1)) + sizeof(struct tdb_header);
143 }
144
145 /* Offset of a given bucket. */
146 /* FIXME: bucket can be "unsigned" everywhere, or even uint8/16. */
147 tdb_off_t bucket_off(tdb_off_t zone_off, tdb_off_t bucket)
148 {
149         return zone_off
150                 + sizeof(struct free_zone_header)
151                 + bucket * sizeof(tdb_off_t);
152 }
153
154 /* Returns free_buckets + 1, or list number to search. */
155 static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
156 {
157         /* Speculatively search for a non-zero bucket. */
158         return tdb_find_nonzero_off(tdb, bucket_off(tdb->zone_off, 0),
159                                     bucket,
160                                     BUCKETS_FOR_ZONE(tdb->zhdr.zone_bits) + 1);
161 }
162
163 /* Remove from free bucket. */
164 static int remove_from_list(struct tdb_context *tdb,
165                             tdb_off_t b_off, tdb_off_t r_off,
166                             struct tdb_free_record *r)
167 {
168         tdb_off_t off;
169
170         /* Front of list? */
171         if (r->prev == 0) {
172                 off = b_off;
173         } else {
174                 off = r->prev + offsetof(struct tdb_free_record, next);
175         }
176
177 #ifdef DEBUG
178         if (tdb_read_off(tdb, off) != r_off) {
179                 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
180                          "remove_from_list: %llu bad prev in list %llu\n",
181                          (long long)r_off, (long long)b_off);
182                 return -1;
183         }
184 #endif
185
186         /* r->prev->next = r->next */
187         if (tdb_write_off(tdb, off, r->next)) {
188                 return -1;
189         }
190
191         if (r->next != 0) {
192                 off = r->next + offsetof(struct tdb_free_record, prev);
193                 /* r->next->prev = r->prev */
194
195 #ifdef DEBUG
196                 if (tdb_read_off(tdb, off) != r_off) {
197                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
198                                  "remove_from_list: %llu bad list %llu\n",
199                                  (long long)r_off, (long long)b_off);
200                         return -1;
201                 }
202 #endif
203
204                 if (tdb_write_off(tdb, off, r->prev)) {
205                         return -1;
206                 }
207         }
208         return 0;
209 }
210
211 /* Enqueue in this free bucket. */
212 static int enqueue_in_free(struct tdb_context *tdb,
213                            tdb_off_t b_off,
214                            tdb_off_t off,
215                            struct tdb_free_record *new)
216 {
217         new->prev = 0;
218         /* new->next = head. */
219         new->next = tdb_read_off(tdb, b_off);
220         if (new->next == TDB_OFF_ERR)
221                 return -1;
222
223         if (new->next) {
224 #ifdef DEBUG
225                 if (tdb_read_off(tdb,
226                                  new->next
227                                  + offsetof(struct tdb_free_record, prev))
228                     != 0) {
229                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
230                                  "enqueue_in_free: %llu bad head prev %llu\n",
231                                  (long long)new->next, (long long)b_off);
232                         return -1;
233                 }
234 #endif
235                 /* next->prev = new. */
236                 if (tdb_write_off(tdb, new->next
237                                   + offsetof(struct tdb_free_record, prev),
238                                   off) != 0)
239                         return -1;
240         }
241         /* head = new */
242         if (tdb_write_off(tdb, b_off, off) != 0)
243                 return -1;
244
245         return tdb_write_convert(tdb, off, new, sizeof(*new));
246 }
247
248 /* List need not be locked. */
249 int add_free_record(struct tdb_context *tdb,
250                     unsigned int zone_bits,
251                     tdb_off_t off, tdb_len_t len_with_header)
252 {
253         struct tdb_free_record new;
254         tdb_off_t b_off;
255         int ret;
256
257         assert(len_with_header >= sizeof(new));
258         assert(zone_bits < 64);
259
260         new.magic_and_meta = TDB_FREE_MAGIC | zone_bits;
261         new.data_len = len_with_header - sizeof(struct tdb_used_record);
262
263         b_off = bucket_off(zone_off(off, zone_bits),
264                            size_to_bucket(zone_bits, new.data_len));
265         if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0)
266                 return -1;
267
268         ret = enqueue_in_free(tdb, b_off, off, &new);
269         tdb_unlock_free_bucket(tdb, b_off);
270         return ret;
271 }
272
273 static size_t adjust_size(size_t keylen, size_t datalen, bool want_extra)
274 {
275         size_t size = keylen + datalen;
276
277         /* We want at least 50% growth for data. */
278         if (want_extra)
279                 size += datalen/2;
280
281         if (size < TDB_MIN_DATA_LEN)
282                 size = TDB_MIN_DATA_LEN;
283
284         /* Round to next uint64_t boundary. */
285         return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
286 }
287
288 /* If we have enough left over to be useful, split that off. */
289 static size_t record_leftover(size_t keylen, size_t datalen,
290                               bool want_extra, size_t total_len)
291 {
292         ssize_t leftover;
293
294         /* We might *want* extra, but not have it, so leftover is negative. */
295         leftover = total_len - adjust_size(keylen, datalen, want_extra);
296         if (leftover < (ssize_t)sizeof(struct tdb_free_record))
297                 return 0;
298
299         /* If we want extra anwyay, don't split unless we have 2x size. */
300         if (want_extra && leftover <= datalen / 2)
301                 return 0;
302
303         return leftover;
304 }
305
306 /* Note: we unlock the current bucket if we coalesce or fail. */
307 static int coalesce(struct tdb_context *tdb,
308                     tdb_off_t zone_off, unsigned zone_bits,
309                     tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
310 {
311         struct tdb_free_record pad, *r;
312         tdb_off_t zone_end, end;
313
314         end = off + sizeof(struct tdb_used_record) + data_len;
315         zone_end = zone_off + (1ULL << zone_bits);
316
317         if (tdb->methods->oob(tdb, zone_end, true))
318                 zone_end = tdb->map_size;
319
320         while (end < zone_end) {
321                 tdb_off_t nb_off;
322
323                 /* FIXME: do tdb_get here and below really win? */
324                 r = tdb_get(tdb, end, &pad, sizeof(pad));
325                 if (!r)
326                         goto err;
327
328                 if (frec_magic(r) != TDB_FREE_MAGIC)
329                         break;
330
331                 nb_off = bucket_off(zone_off,
332                                     size_to_bucket(zone_bits, r->data_len));
333
334                 /* We may be violating lock order here, so best effort. */
335                 if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1)
336                         break;
337
338                 /* Now we have lock, re-check. */
339                 r = tdb_get(tdb, end, &pad, sizeof(pad));
340                 if (!r) {
341                         tdb_unlock_free_bucket(tdb, nb_off);
342                         goto err;
343                 }
344
345                 if (unlikely(frec_magic(r) != TDB_FREE_MAGIC)) {
346                         tdb_unlock_free_bucket(tdb, nb_off);
347                         break;
348                 }
349
350                 if (unlikely(bucket_off(zone_off,
351                                         size_to_bucket(zone_bits, r->data_len))
352                              != nb_off)) {
353                         tdb_unlock_free_bucket(tdb, nb_off);
354                         break;
355                 }
356
357                 if (remove_from_list(tdb, nb_off, end, r) == -1) {
358                         tdb_unlock_free_bucket(tdb, nb_off);
359                         goto err;
360                 }
361
362                 end += sizeof(struct tdb_used_record) + r->data_len;
363                 tdb_unlock_free_bucket(tdb, nb_off);
364         }
365
366         /* Didn't find any adjacent free? */
367         if (end == off + sizeof(struct tdb_used_record) + data_len)
368                 return 0;
369
370         /* OK, expand record */
371         r = tdb_get(tdb, off, &pad, sizeof(pad));
372         if (!r)
373                 goto err;
374
375         if (r->data_len != data_len) {
376                 tdb->ecode = TDB_ERR_CORRUPT;
377                 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
378                          "coalesce: expected data len %llu not %llu\n",
379                          (long long)data_len, (long long)r->data_len);
380                 goto err;
381         }
382
383         if (remove_from_list(tdb, b_off, off, r) == -1)
384                 goto err;
385
386         r = tdb_access_write(tdb, off, sizeof(*r), true);
387         if (!r)
388                 goto err;
389
390         /* We have to drop this to avoid deadlocks, so make sure record
391          * doesn't get coalesced by someone else! */
392         r->magic_and_meta = TDB_COALESCING_MAGIC | zone_bits;
393         r->data_len = end - off - sizeof(struct tdb_used_record);
394         if (tdb_access_commit(tdb, r) != 0)
395                 goto err;
396
397         tdb_unlock_free_bucket(tdb, b_off);
398
399         if (add_free_record(tdb, zone_bits, off, end - off) == -1)
400                 return -1;
401         return 1;
402
403 err:
404         /* To unify error paths, we *always* unlock bucket on error. */
405         tdb_unlock_free_bucket(tdb, b_off);
406         return -1;
407 }
408
409 /* We need size bytes to put our key and data in. */
410 static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
411                                 tdb_off_t zone_off,
412                                 unsigned zone_bits,
413                                 tdb_off_t bucket,
414                                 size_t keylen, size_t datalen,
415                                 bool want_extra,
416                                 unsigned hashlow)
417 {
418         tdb_off_t off, b_off,best_off;
419         struct tdb_free_record pad, best = { 0 }, *r;
420         double multiplier;
421         size_t size = keylen + datalen;
422
423 again:
424         b_off = bucket_off(zone_off, bucket);
425
426         /* FIXME: Try non-blocking wait first, to measure contention.
427          * If we're contented, try switching zones, and don't enlarge zone
428          * next time (we want more zones). */
429         /* Lock this bucket. */
430         if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == -1) {
431                 return TDB_OFF_ERR;
432         }
433
434         best.data_len = -1ULL;
435         best_off = 0;
436
437         /* Get slack if we're after extra. */
438         if (want_extra)
439                 multiplier = 1.5;
440         else
441                 multiplier = 1.0;
442
443         /* Walk the list to see if any are large enough, getting less fussy
444          * as we go. */
445         off = tdb_read_off(tdb, b_off);
446         if (unlikely(off == TDB_OFF_ERR))
447                 goto unlock_err;
448
449         while (off) {
450                 /* FIXME: Does tdb_get win anything here? */
451                 r = tdb_get(tdb, off, &pad, sizeof(*r));
452                 if (!r)
453                         goto unlock_err;
454
455                 if (frec_magic(r) != TDB_FREE_MAGIC) {
456                         tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
457                                  "lock_and_alloc: %llu non-free 0x%llx\n",
458                                  (long long)off, (long long)r->magic_and_meta);
459                         goto unlock_err;
460                 }
461
462                 if (r->data_len >= size && r->data_len < best.data_len) {
463                         best_off = off;
464                         best = *r;
465                 }
466
467                 if (best.data_len < size * multiplier && best_off)
468                         break;
469
470                 multiplier *= 1.01;
471
472                 /* Since we're going slow anyway, try coalescing here. */
473                 switch (coalesce(tdb, zone_off, zone_bits, off, b_off,
474                                  r->data_len)) {
475                 case -1:
476                         /* This has already unlocked on error. */
477                         return -1;
478                 case 1:
479                         /* This has unlocked list, restart. */
480                         goto again;
481                 }
482                 off = r->next;
483         }
484
485         /* If we found anything at all, use it. */
486         if (best_off) {
487                 struct tdb_used_record rec;
488                 size_t leftover;
489
490                 /* We're happy with this size: take it. */
491                 if (remove_from_list(tdb, b_off, best_off, &best) != 0)
492                         goto unlock_err;
493
494                 leftover = record_leftover(keylen, datalen, want_extra,
495                                            best.data_len);
496
497                 /* We need to mark non-free before we drop lock, otherwise
498                  * coalesce() could try to merge it! */
499                 if (set_header(tdb, &rec, keylen, datalen,
500                                best.data_len - leftover,
501                                hashlow, zone_bits) != 0)
502                         goto unlock_err;
503
504                 if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0)
505                         goto unlock_err;
506
507                 tdb_unlock_free_bucket(tdb, b_off);
508
509                 if (leftover) {
510                         if (add_free_record(tdb, zone_bits,
511                                             best_off + sizeof(rec)
512                                             + best.data_len - leftover,
513                                             leftover))
514                                 return TDB_OFF_ERR;
515                 }
516                 return best_off;
517         }
518
519         tdb_unlock_free_bucket(tdb, b_off);
520         return 0;
521
522 unlock_err:
523         tdb_unlock_free_bucket(tdb, b_off);
524         return TDB_OFF_ERR;
525 }
526
527 static bool next_zone(struct tdb_context *tdb)
528 {
529         tdb_off_t next = tdb->zone_off + (1ULL << tdb->zhdr.zone_bits);
530
531         /* We must have a header. */
532         if (tdb->methods->oob(tdb, next + sizeof(tdb->zhdr), true))
533                 return false;
534
535         tdb->zone_off = next;
536         return tdb_read_convert(tdb, next, &tdb->zhdr, sizeof(tdb->zhdr)) == 0;
537 }
538
539 /* Offset returned is within current zone (which it may alter). */
540 static tdb_off_t get_free(struct tdb_context *tdb,
541                           size_t keylen, size_t datalen, bool want_extra,
542                           unsigned hashlow)
543 {
544         tdb_off_t start_zone = tdb->zone_off, off;
545         bool wrapped = false;
546         size_t size = adjust_size(keylen, datalen, want_extra);
547
548         /* If they are growing, add 50% to get to higher bucket. */
549         if (want_extra)
550                 size += datalen / 2;
551
552         /* FIXME: If we don't get a hit in the first bucket we want,
553          * try changing zones for next time.  That should help wear
554          * zones evenly, so we don't need to search all of them before
555          * expanding. */
556         while (!wrapped || tdb->zone_off != start_zone) {
557                 tdb_off_t b;
558
559                 /* Shortcut for really huge allocations... */
560                 if ((size >> tdb->zhdr.zone_bits) != 0)
561                         goto next;
562
563                 /* Start at exact size bucket, and search up... */
564                 b = size_to_bucket(tdb->zhdr.zone_bits, size);
565                 for (b = find_free_head(tdb, b);
566                      b <= BUCKETS_FOR_ZONE(tdb->zhdr.zone_bits);
567                      b += find_free_head(tdb, b + 1)) {
568                         /* Try getting one from list. */
569                         off = lock_and_alloc(tdb, tdb->zone_off,
570                                              tdb->zhdr.zone_bits,
571                                              b, keylen, datalen, want_extra,
572                                              hashlow);
573                         if (off == TDB_OFF_ERR)
574                                 return TDB_OFF_ERR;
575                         if (off != 0)
576                                 return off;
577                         /* Didn't work.  Try next bucket. */
578                 }
579
580         next:
581                 /* Didn't work, try next zone, if it exists. */
582                 if (!next_zone(tdb)) {
583                         wrapped = true;
584                         tdb->zone_off = sizeof(struct tdb_header);
585                         if (tdb_read_convert(tdb, tdb->zone_off,
586                                              &tdb->zhdr, sizeof(tdb->zhdr))) {
587                                 return TDB_OFF_ERR;
588                         }
589                 }
590         }
591         return 0;
592 }
593
594 int set_header(struct tdb_context *tdb,
595                struct tdb_used_record *rec,
596                uint64_t keylen, uint64_t datalen,
597                uint64_t actuallen, unsigned hashlow,
598                unsigned int zone_bits)
599 {
600         uint64_t keybits = (fls64(keylen) + 1) / 2;
601
602         /* Use bottom bits of hash, so it's independent of hash table size. */
603         rec->magic_and_meta
604                 = zone_bits
605                 | ((hashlow & ((1 << 5)-1)) << 6)
606                 | ((actuallen - (keylen + datalen)) << 11)
607                 | (keybits << 43)
608                 | (TDB_MAGIC << 48);
609         rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
610
611         /* Encoding can fail on big values. */
612         if (rec_key_length(rec) != keylen
613             || rec_data_length(rec) != datalen
614             || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
615                 tdb->ecode = TDB_ERR_IO;
616                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
617                          "Could not encode k=%llu,d=%llu,a=%llu\n",
618                          (long long)keylen, (long long)datalen,
619                          (long long)actuallen);
620                 return -1;
621         }
622         return 0;
623 }
624
625 static bool zones_contended(struct tdb_context *tdb)
626 {
627         return false;
628 }
629
630 /* Assume we want buckets up to the comfort factor. */
631 static tdb_len_t overhead(unsigned int zone_bits)
632 {
633         return sizeof(struct free_zone_header)
634                 + (BUCKETS_FOR_ZONE(zone_bits) + 1) * sizeof(tdb_off_t);
635 }
636
637 /* Expand the database (by adding a zone). */
638 static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
639 {
640         uint64_t old_size;
641         tdb_off_t off;
642         unsigned int num_buckets, zone_bits;
643         tdb_len_t wanted, expand;
644         struct free_zone_header zhdr;
645
646         /* We need room for the record header too. */
647         wanted = sizeof(struct tdb_used_record) + size;
648
649         /* Only one person can expand file at a time. */
650         if (tdb_lock_expand(tdb, F_WRLCK) != 0)
651                 return -1;
652
653         /* Someone else may have expanded the file, so retry. */
654         old_size = tdb->map_size;
655         tdb->methods->oob(tdb, tdb->map_size + 1, true);
656         if (tdb->map_size != old_size)
657                 goto success;
658
659         /* Treat last zone as minimum reasonable zone size. */
660         off = last_zone(tdb, &zhdr);
661         if (off == TDB_OFF_ERR)
662                 goto fail;
663
664         /* Zone isn't fully expanded? */
665         if (tdb->map_size < off + (1ULL << zhdr.zone_bits)) {
666                 expand = off + (1ULL << zhdr.zone_bits) - tdb->map_size;
667                 /* Expand more than we want. */
668                 if (expand > (wanted << TDB_COMFORT_FACTOR_BITS))
669                         expand = (wanted << TDB_COMFORT_FACTOR_BITS);
670                 if (tdb->methods->expand_file(tdb, expand) == -1)
671                         goto fail;
672                 /* We need to drop this lock before adding free record. */
673                 tdb_unlock_expand(tdb, F_WRLCK);
674
675                 /* Allocate from here. */
676                 tdb->zone_off = off;
677                 tdb->zhdr = zhdr;
678
679                 /* FIXME: If this isn't sufficient, we search again... */
680                 return add_free_record(tdb, zhdr.zone_bits,
681                                        tdb->map_size - expand, expand);
682         }
683
684         /* We are never allowed to cross a power-of-two boundary, and our
685          * minimum zone size is 1 << INITIAL_ZONE_BITS.
686          *
687          * If our filesize is 128k, we can add a 64k or a 128k zone.  If it's
688          * 192k, we can only add a 64k zone.
689          *
690          * In other words, our max zone size is (1 << (ffs(filesize) - 1)) */
691         zone_bits = ffs64(old_size - sizeof(struct tdb_header)) - 1;
692         assert(zone_bits >= INITIAL_ZONE_BITS);
693
694         /* Big zones generally good, but more zones wanted if contended. */
695         if (zones_contended(tdb)) {
696                 /* If it suffices, make zone same size as last one. */
697                 if (zhdr.zone_bits < zone_bits
698                     && (1ULL << zhdr.zone_bits) >= overhead(zone_bits)+wanted)
699                         zone_bits = zhdr.zone_bits;
700         }
701
702         zhdr.zone_bits = zone_bits;
703         num_buckets = BUCKETS_FOR_ZONE(zone_bits);
704
705         /* Expand the file by more than we need right now. */
706         expand = 1ULL << zone_bits;
707         if (expand > overhead(zone_bits) + (wanted << TDB_COMFORT_FACTOR_BITS))
708                 expand = overhead(zone_bits)
709                         + (wanted << TDB_COMFORT_FACTOR_BITS);
710
711         if (tdb->methods->expand_file(tdb, expand) == -1)
712                 goto fail;
713
714         /* Write new zone header (at old end). */
715         off = old_size;
716         if (tdb_write_convert(tdb, off, &zhdr, sizeof(zhdr)) == -1)
717                 goto fail;
718
719         /* Now write empty buckets. */
720         off += sizeof(zhdr);
721         if (zero_out(tdb, off, (num_buckets+1) * sizeof(tdb_off_t)) == -1)
722                 goto fail;
723         off += (num_buckets+1) * sizeof(tdb_off_t);
724
725         /* Now add the rest as our free record. */
726         if (add_free_record(tdb, zone_bits, off, expand - overhead(zone_bits))
727             == -1)
728                 goto fail;
729
730         /* Try allocating from this zone now. */
731         tdb->zone_off = old_size;
732         tdb->zhdr = zhdr;
733
734 success:
735         tdb_unlock_expand(tdb, F_WRLCK);
736         return 0;
737
738 fail:
739         tdb_unlock_expand(tdb, F_WRLCK);
740         return -1;
741 }
742
743 /* This won't fail: it will expand the database if it has to. */
744 tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
745                 uint64_t hash, bool growing)
746 {
747         tdb_off_t off;
748
749         /* We can't hold pointers during this: we could unmap! */
750         assert(!tdb->direct_access);
751
752         for (;;) {
753                 off = get_free(tdb, keylen, datalen, growing, hash);
754                 if (likely(off != 0))
755                         break;
756
757                 if (tdb_expand(tdb, adjust_size(keylen, datalen, growing)))
758                         return TDB_OFF_ERR;
759         }
760
761         return off;
762 }