git.ozlabs.org Git - ccan/blob - ccan/tdb2/free.c

   1  /*
   2    Trivial Database 2: free list/block handling
   3    Copyright (C) Rusty Russell 2010
   4
   5    This library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 3 of the License, or (at your option) any later version.
   9
  10    This library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  17 */
  18 #include "private.h"
  19 #include <ccan/likely/likely.h>
  20 #include <ccan/ilog/ilog.h>
  21 #include <time.h>
  22 #include <assert.h>
  23 #include <limits.h>
  24
  25 static unsigned fls64(uint64_t val)
  26 {
  27         return ilog64(val);
  28 }
  29
  30 static unsigned ffs64(uint64_t val)
  31 {
  32 #if HAVE_BUILTIN_FFSLL
  33         return __builtin_ffsll(val);
  34 #else
  35         unsigned r = 0;
  36
  37         if (!val)
  38                 return 0;
  39
  40         if (!(val & 0xffffffff)) {
  41                 val >>= 32;
  42                 r += 32;
  43         }
  44         if (!(val & 0xffff)) {
  45                 val >>= 16;
  46                 r += 16;
  47         }
  48         if (!(val & 0xff)) {
  49                 val >>= 8;
  50                 r += 8;
  51         }
  52         if (!(val & 0xf)) {
  53                 val >>= 4;
  54                 r += 4;
  55         }
  56         if (!(val & 0x3)) {
  57                 val >>= 2;
  58                 r += 2;
  59         }
  60         if (!(val & 0x1)) {
  61                 val >>= 1;
  62                 r += 1;
  63         }
  64         return r;
  65 #endif
  66 }
  67
  68 /* In which bucket would we find a particular record size? (ignoring header) */
  69 unsigned int size_to_bucket(unsigned int zone_bits, tdb_len_t data_len)
  70 {
  71         unsigned int bucket;
  72
  73         /* We can't have records smaller than this. */
  74         assert(data_len >= TDB_MIN_DATA_LEN);
  75
  76         /* Ignoring the header... */
  77         if (data_len - TDB_MIN_DATA_LEN <= 64) {
  78                 /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
  79                 bucket = (data_len - TDB_MIN_DATA_LEN) / 8;
  80         } else {
  81                 /* After that we go power of 2. */
  82                 bucket = fls64(data_len - TDB_MIN_DATA_LEN) + 2;
  83         }
  84
  85         if (unlikely(bucket > BUCKETS_FOR_ZONE(zone_bits)))
  86                 bucket = BUCKETS_FOR_ZONE(zone_bits);
  87         return bucket;
  88 }
  89
  90 /* Binary search for the zone for this offset. */
  91 static tdb_off_t off_to_zone(struct tdb_context *tdb, tdb_off_t off,
  92                              struct free_zone_header *zhdr)
  93 {
  94         tdb_off_t start, end;
  95
  96         start = sizeof(struct tdb_header);
  97         end = start + (1ULL << fls64(tdb->map_size - start));
  98
  99         for (;;) {
 100                 if (tdb_read_convert(tdb, start, zhdr, sizeof(*zhdr)) == -1)
 101                         return TDB_OFF_ERR;
 102
 103                 /* Is it inside this zone? */
 104                 if (off < start + (1ULL << zhdr->zone_bits))
 105                         return start;
 106
 107                 /* In practice, start + end won't overflow. */
 108                 if (off >= (start + end) / 2)
 109                         start = (start + end) / 2;
 110                 else
 111                         end = (start + end) / 2;
 112         }
 113 }
 114
 115 static tdb_off_t last_zone(struct tdb_context *tdb,
 116                            struct free_zone_header *zhdr)
 117 {
 118         return off_to_zone(tdb, tdb->map_size - 1, zhdr);
 119 }
 120
 121 int tdb_zone_init(struct tdb_context *tdb)
 122 {
 123         unsigned int i;
 124         uint64_t randoff = 0;
 125
 126         /* We start in a random zone, to spread the load. */
 127         for (i = 0; i < 64; i += fls64(RAND_MAX))
 128                 randoff ^= ((uint64_t)random()) << i;
 129         randoff = sizeof(struct tdb_header)
 130                 + (randoff % (tdb->map_size - sizeof(struct tdb_header)));
 131
 132         tdb->zone_off = off_to_zone(tdb, randoff, &tdb->zhdr);
 133         if (tdb->zone_off == TDB_OFF_ERR)
 134                 return -1;
 135         return 0;
 136 }
 137
 138 /* Where's the header, given a zone size of 1 << zone_bits? */
 139 static tdb_off_t zone_off(tdb_off_t off, unsigned int zone_bits)
 140 {
 141         off -= sizeof(struct tdb_header);
 142         return (off & ~((1ULL << zone_bits) - 1)) + sizeof(struct tdb_header);
 143 }
 144
 145 /* Offset of a given bucket. */
 146 /* FIXME: bucket can be "unsigned" everywhere, or even uint8/16. */
 147 tdb_off_t bucket_off(tdb_off_t zone_off, tdb_off_t bucket)
 148 {
 149         return zone_off
 150                 + sizeof(struct free_zone_header)
 151                 + bucket * sizeof(tdb_off_t);
 152 }
 153
 154 /* Returns free_buckets + 1, or list number to search. */
 155 static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
 156 {
 157         /* Speculatively search for a non-zero bucket. */
 158         return tdb_find_nonzero_off(tdb, bucket_off(tdb->zone_off, 0),
 159                                     bucket,
 160                                     BUCKETS_FOR_ZONE(tdb->zhdr.zone_bits) + 1);
 161 }
 162
 163 /* Remove from free bucket. */
 164 static int remove_from_list(struct tdb_context *tdb,
 165                             tdb_off_t b_off, tdb_off_t r_off,
 166                             struct tdb_free_record *r)
 167 {
 168         tdb_off_t off;
 169
 170         /* Front of list? */
 171         if (r->prev == 0) {
 172                 off = b_off;
 173         } else {
 174                 off = r->prev + offsetof(struct tdb_free_record, next);
 175         }
 176
 177 #ifdef DEBUG
 178         if (tdb_read_off(tdb, off) != r_off) {
 179                 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
 180                          "remove_from_list: %llu bad prev in list %llu\n",
 181                          (long long)r_off, (long long)b_off);
 182                 return -1;
 183         }
 184 #endif
 185
 186         /* r->prev->next = r->next */
 187         if (tdb_write_off(tdb, off, r->next)) {
 188                 return -1;
 189         }
 190
 191         if (r->next != 0) {
 192                 off = r->next + offsetof(struct tdb_free_record, prev);
 193                 /* r->next->prev = r->prev */
 194
 195 #ifdef DEBUG
 196                 if (tdb_read_off(tdb, off) != r_off) {
 197                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
 198                                  "remove_from_list: %llu bad list %llu\n",
 199                                  (long long)r_off, (long long)b_off);
 200                         return -1;
 201                 }
 202 #endif
 203
 204                 if (tdb_write_off(tdb, off, r->prev)) {
 205                         return -1;
 206                 }
 207         }
 208         return 0;
 209 }
 210
 211 /* Enqueue in this free bucket. */
 212 static int enqueue_in_free(struct tdb_context *tdb,
 213                            tdb_off_t b_off,
 214                            tdb_off_t off,
 215                            struct tdb_free_record *new)
 216 {
 217         new->prev = 0;
 218         /* new->next = head. */
 219         new->next = tdb_read_off(tdb, b_off);
 220         if (new->next == TDB_OFF_ERR)
 221                 return -1;
 222
 223         if (new->next) {
 224 #ifdef DEBUG
 225                 if (tdb_read_off(tdb,
 226                                  new->next
 227                                  + offsetof(struct tdb_free_record, prev))
 228                     != 0) {
 229                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
 230                                  "enqueue_in_free: %llu bad head prev %llu\n",
 231                                  (long long)new->next, (long long)b_off);
 232                         return -1;
 233                 }
 234 #endif
 235                 /* next->prev = new. */
 236                 if (tdb_write_off(tdb, new->next
 237                                   + offsetof(struct tdb_free_record, prev),
 238                                   off) != 0)
 239                         return -1;
 240         }
 241         /* head = new */
 242         if (tdb_write_off(tdb, b_off, off) != 0)
 243                 return -1;
 244
 245         return tdb_write_convert(tdb, off, new, sizeof(*new));
 246 }
 247
 248 /* List need not be locked. */
 249 int add_free_record(struct tdb_context *tdb,
 250                     unsigned int zone_bits,
 251                     tdb_off_t off, tdb_len_t len_with_header)
 252 {
 253         struct tdb_free_record new;
 254         tdb_off_t b_off;
 255         int ret;
 256
 257         assert(len_with_header >= sizeof(new));
 258         assert(zone_bits < 64);
 259
 260         new.magic_and_meta = TDB_FREE_MAGIC | zone_bits;
 261         new.data_len = len_with_header - sizeof(struct tdb_used_record);
 262
 263         b_off = bucket_off(zone_off(off, zone_bits),
 264                            size_to_bucket(zone_bits, new.data_len));
 265         if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0)
 266                 return -1;
 267
 268         ret = enqueue_in_free(tdb, b_off, off, &new);
 269         tdb_unlock_free_bucket(tdb, b_off);
 270         return ret;
 271 }
 272
 273 static size_t adjust_size(size_t keylen, size_t datalen, bool want_extra)
 274 {
 275         size_t size = keylen + datalen;
 276
 277         /* We want at least 50% growth for data. */
 278         if (want_extra)
 279                 size += datalen/2;
 280
 281         if (size < TDB_MIN_DATA_LEN)
 282                 size = TDB_MIN_DATA_LEN;
 283
 284         /* Round to next uint64_t boundary. */
 285         return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
 286 }
 287
 288 /* If we have enough left over to be useful, split that off. */
 289 static size_t record_leftover(size_t keylen, size_t datalen,
 290                               bool want_extra, size_t total_len)
 291 {
 292         ssize_t leftover;
 293
 294         /* We might *want* extra, but not have it, so leftover is negative. */
 295         leftover = total_len - adjust_size(keylen, datalen, want_extra);
 296         if (leftover < (ssize_t)sizeof(struct tdb_free_record))
 297                 return 0;
 298
 299         /* If we want extra anwyay, don't split unless we have 2x size. */
 300         if (want_extra && leftover <= datalen / 2)
 301                 return 0;
 302
 303         return leftover;
 304 }
 305
 306 /* Note: we unlock the current bucket if we coalesce or fail. */
 307 static int coalesce(struct tdb_context *tdb,
 308                     tdb_off_t zone_off, unsigned zone_bits,
 309                     tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
 310 {
 311         struct tdb_free_record pad, *r;
 312         tdb_off_t zone_end, end;
 313
 314         end = off + sizeof(struct tdb_used_record) + data_len;
 315         zone_end = zone_off + (1ULL << zone_bits);
 316
 317         if (tdb->methods->oob(tdb, zone_end, true))
 318                 zone_end = tdb->map_size;
 319
 320         while (end < zone_end) {
 321                 tdb_off_t nb_off;
 322
 323                 /* FIXME: do tdb_get here and below really win? */
 324                 r = tdb_get(tdb, end, &pad, sizeof(pad));
 325                 if (!r)
 326                         goto err;
 327
 328                 if (frec_magic(r) != TDB_FREE_MAGIC)
 329                         break;
 330
 331                 nb_off = bucket_off(zone_off,
 332                                     size_to_bucket(zone_bits, r->data_len));
 333
 334                 /* We may be violating lock order here, so best effort. */
 335                 if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1)
 336                         break;
 337
 338                 /* Now we have lock, re-check. */
 339                 r = tdb_get(tdb, end, &pad, sizeof(pad));
 340                 if (!r) {
 341                         tdb_unlock_free_bucket(tdb, nb_off);
 342                         goto err;
 343                 }
 344
 345                 if (unlikely(frec_magic(r) != TDB_FREE_MAGIC)) {
 346                         tdb_unlock_free_bucket(tdb, nb_off);
 347                         break;
 348                 }
 349
 350                 if (unlikely(bucket_off(zone_off,
 351                                         size_to_bucket(zone_bits, r->data_len))
 352                              != nb_off)) {
 353                         tdb_unlock_free_bucket(tdb, nb_off);
 354                         break;
 355                 }
 356
 357                 if (remove_from_list(tdb, nb_off, end, r) == -1) {
 358                         tdb_unlock_free_bucket(tdb, nb_off);
 359                         goto err;
 360                 }
 361
 362                 end += sizeof(struct tdb_used_record) + r->data_len;
 363                 tdb_unlock_free_bucket(tdb, nb_off);
 364         }
 365
 366         /* Didn't find any adjacent free? */
 367         if (end == off + sizeof(struct tdb_used_record) + data_len)
 368                 return 0;
 369
 370         /* OK, expand record */
 371         r = tdb_get(tdb, off, &pad, sizeof(pad));
 372         if (!r)
 373                 goto err;
 374
 375         if (r->data_len != data_len) {
 376                 tdb->ecode = TDB_ERR_CORRUPT;
 377                 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
 378                          "coalesce: expected data len %llu not %llu\n",
 379                          (long long)data_len, (long long)r->data_len);
 380                 goto err;
 381         }
 382
 383         if (remove_from_list(tdb, b_off, off, r) == -1)
 384                 goto err;
 385
 386         r = tdb_access_write(tdb, off, sizeof(*r), true);
 387         if (!r)
 388                 goto err;
 389
 390         /* We have to drop this to avoid deadlocks, so make sure record
 391          * doesn't get coalesced by someone else! */
 392         r->magic_and_meta = TDB_COALESCING_MAGIC | zone_bits;
 393         r->data_len = end - off - sizeof(struct tdb_used_record);
 394         if (tdb_access_commit(tdb, r) != 0)
 395                 goto err;
 396
 397         tdb_unlock_free_bucket(tdb, b_off);
 398
 399         if (add_free_record(tdb, zone_bits, off, end - off) == -1)
 400                 return -1;
 401         return 1;
 402
 403 err:
 404         /* To unify error paths, we *always* unlock bucket on error. */
 405         tdb_unlock_free_bucket(tdb, b_off);
 406         return -1;
 407 }
 408
 409 /* We need size bytes to put our key and data in. */
 410 static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
 411                                 tdb_off_t zone_off,
 412                                 unsigned zone_bits,
 413                                 tdb_off_t bucket,
 414                                 size_t keylen, size_t datalen,
 415                                 bool want_extra,
 416                                 unsigned hashlow)
 417 {
 418         tdb_off_t off, b_off,best_off;
 419         struct tdb_free_record pad, best = { 0 }, *r;
 420         double multiplier;
 421         size_t size = keylen + datalen;
 422
 423 again:
 424         b_off = bucket_off(zone_off, bucket);
 425
 426         /* FIXME: Try non-blocking wait first, to measure contention.
 427          * If we're contented, try switching zones, and don't enlarge zone
 428          * next time (we want more zones). */
 429         /* Lock this bucket. */
 430         if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == -1) {
 431                 return TDB_OFF_ERR;
 432         }
 433
 434         best.data_len = -1ULL;
 435         best_off = 0;
 436
 437         /* Get slack if we're after extra. */
 438         if (want_extra)
 439                 multiplier = 1.5;
 440         else
 441                 multiplier = 1.0;
 442
 443         /* Walk the list to see if any are large enough, getting less fussy
 444          * as we go. */
 445         off = tdb_read_off(tdb, b_off);
 446         if (unlikely(off == TDB_OFF_ERR))
 447                 goto unlock_err;
 448
 449         while (off) {
 450                 /* FIXME: Does tdb_get win anything here? */
 451                 r = tdb_get(tdb, off, &pad, sizeof(*r));
 452                 if (!r)
 453                         goto unlock_err;
 454
 455                 if (frec_magic(r) != TDB_FREE_MAGIC) {
 456                         tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
 457                                  "lock_and_alloc: %llu non-free 0x%llx\n",
 458                                  (long long)off, (long long)r->magic_and_meta);
 459                         goto unlock_err;
 460                 }
 461
 462                 if (r->data_len >= size && r->data_len < best.data_len) {
 463                         best_off = off;
 464                         best = *r;
 465                 }
 466
 467                 if (best.data_len < size * multiplier && best_off)
 468                         break;
 469
 470                 multiplier *= 1.01;
 471
 472                 /* Since we're going slow anyway, try coalescing here. */
 473                 switch (coalesce(tdb, zone_off, zone_bits, off, b_off,
 474                                  r->data_len)) {
 475                 case -1:
 476                         /* This has already unlocked on error. */
 477                         return -1;
 478                 case 1:
 479                         /* This has unlocked list, restart. */
 480                         goto again;
 481                 }
 482                 off = r->next;
 483         }
 484
 485         /* If we found anything at all, use it. */
 486         if (best_off) {
 487                 struct tdb_used_record rec;
 488                 size_t leftover;
 489
 490                 /* We're happy with this size: take it. */
 491                 if (remove_from_list(tdb, b_off, best_off, &best) != 0)
 492                         goto unlock_err;
 493
 494                 leftover = record_leftover(keylen, datalen, want_extra,
 495                                            best.data_len);
 496
 497                 /* We need to mark non-free before we drop lock, otherwise
 498                  * coalesce() could try to merge it! */
 499                 if (set_header(tdb, &rec, keylen, datalen,
 500                                best.data_len - leftover,
 501                                hashlow, zone_bits) != 0)
 502                         goto unlock_err;
 503
 504                 if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0)
 505                         goto unlock_err;
 506
 507                 tdb_unlock_free_bucket(tdb, b_off);
 508
 509                 if (leftover) {
 510                         if (add_free_record(tdb, zone_bits,
 511                                             best_off + sizeof(rec)
 512                                             + best.data_len - leftover,
 513                                             leftover))
 514                                 return TDB_OFF_ERR;
 515                 }
 516                 return best_off;
 517         }
 518
 519         tdb_unlock_free_bucket(tdb, b_off);
 520         return 0;
 521
 522 unlock_err:
 523         tdb_unlock_free_bucket(tdb, b_off);
 524         return TDB_OFF_ERR;
 525 }
 526
 527 static bool next_zone(struct tdb_context *tdb)
 528 {
 529         tdb_off_t next = tdb->zone_off + (1ULL << tdb->zhdr.zone_bits);
 530
 531         /* We must have a header. */
 532         if (tdb->methods->oob(tdb, next + sizeof(tdb->zhdr), true))
 533                 return false;
 534
 535         tdb->zone_off = next;
 536         return tdb_read_convert(tdb, next, &tdb->zhdr, sizeof(tdb->zhdr)) == 0;
 537 }
 538
 539 /* Offset returned is within current zone (which it may alter). */
 540 static tdb_off_t get_free(struct tdb_context *tdb,
 541                           size_t keylen, size_t datalen, bool want_extra,
 542                           unsigned hashlow)
 543 {
 544         tdb_off_t start_zone = tdb->zone_off, off;
 545         bool wrapped = false;
 546         size_t size = adjust_size(keylen, datalen, want_extra);
 547
 548         /* If they are growing, add 50% to get to higher bucket. */
 549         if (want_extra)
 550                 size += datalen / 2;
 551
 552         /* FIXME: If we don't get a hit in the first bucket we want,
 553          * try changing zones for next time.  That should help wear
 554          * zones evenly, so we don't need to search all of them before
 555          * expanding. */
 556         while (!wrapped || tdb->zone_off != start_zone) {
 557                 tdb_off_t b;
 558
 559                 /* Shortcut for really huge allocations... */
 560                 if ((size >> tdb->zhdr.zone_bits) != 0)
 561                         goto next;
 562
 563                 /* Start at exact size bucket, and search up... */
 564                 b = size_to_bucket(tdb->zhdr.zone_bits, size);
 565                 for (b = find_free_head(tdb, b);
 566                      b <= BUCKETS_FOR_ZONE(tdb->zhdr.zone_bits);
 567                      b += find_free_head(tdb, b + 1)) {
 568                         /* Try getting one from list. */
 569                         off = lock_and_alloc(tdb, tdb->zone_off,
 570                                              tdb->zhdr.zone_bits,
 571                                              b, keylen, datalen, want_extra,
 572                                              hashlow);
 573                         if (off == TDB_OFF_ERR)
 574                                 return TDB_OFF_ERR;
 575                         if (off != 0)
 576                                 return off;
 577                         /* Didn't work.  Try next bucket. */
 578                 }
 579
 580         next:
 581                 /* Didn't work, try next zone, if it exists. */
 582                 if (!next_zone(tdb)) {
 583                         wrapped = true;
 584                         tdb->zone_off = sizeof(struct tdb_header);
 585                         if (tdb_read_convert(tdb, tdb->zone_off,
 586                                              &tdb->zhdr, sizeof(tdb->zhdr))) {
 587                                 return TDB_OFF_ERR;
 588                         }
 589                 }
 590         }
 591         return 0;
 592 }
 593
 594 int set_header(struct tdb_context *tdb,
 595                struct tdb_used_record *rec,
 596                uint64_t keylen, uint64_t datalen,
 597                uint64_t actuallen, unsigned hashlow,
 598                unsigned int zone_bits)
 599 {
 600         uint64_t keybits = (fls64(keylen) + 1) / 2;
 601
 602         /* Use bottom bits of hash, so it's independent of hash table size. */
 603         rec->magic_and_meta
 604                 = zone_bits
 605                 | ((hashlow & ((1 << 5)-1)) << 6)
 606                 | ((actuallen - (keylen + datalen)) << 11)
 607                 | (keybits << 43)
 608                 | (TDB_MAGIC << 48);
 609         rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
 610
 611         /* Encoding can fail on big values. */
 612         if (rec_key_length(rec) != keylen
 613             || rec_data_length(rec) != datalen
 614             || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
 615                 tdb->ecode = TDB_ERR_IO;
 616                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
 617                          "Could not encode k=%llu,d=%llu,a=%llu\n",
 618                          (long long)keylen, (long long)datalen,
 619                          (long long)actuallen);
 620                 return -1;
 621         }
 622         return 0;
 623 }
 624
 625 static bool zones_contended(struct tdb_context *tdb)
 626 {
 627         return false;
 628 }
 629
 630 /* Assume we want buckets up to the comfort factor. */
 631 static tdb_len_t overhead(unsigned int zone_bits)
 632 {
 633         return sizeof(struct free_zone_header)
 634                 + (BUCKETS_FOR_ZONE(zone_bits) + 1) * sizeof(tdb_off_t);
 635 }
 636
 637 /* Expand the database (by adding a zone). */
 638 static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 639 {
 640         uint64_t old_size;
 641         tdb_off_t off;
 642         unsigned int num_buckets, zone_bits;
 643         tdb_len_t wanted, expand;
 644         struct free_zone_header zhdr;
 645
 646         /* We need room for the record header too. */
 647         wanted = sizeof(struct tdb_used_record) + size;
 648
 649         /* Only one person can expand file at a time. */
 650         if (tdb_lock_expand(tdb, F_WRLCK) != 0)
 651                 return -1;
 652
 653         /* Someone else may have expanded the file, so retry. */
 654         old_size = tdb->map_size;
 655         tdb->methods->oob(tdb, tdb->map_size + 1, true);
 656         if (tdb->map_size != old_size)
 657                 goto success;
 658
 659         /* Treat last zone as minimum reasonable zone size. */
 660         off = last_zone(tdb, &zhdr);
 661         if (off == TDB_OFF_ERR)
 662                 goto fail;
 663
 664         /* Zone isn't fully expanded? */
 665         if (tdb->map_size < off + (1ULL << zhdr.zone_bits)) {
 666                 expand = off + (1ULL << zhdr.zone_bits) - tdb->map_size;
 667                 /* Expand more than we want. */
 668                 if (expand > (wanted << TDB_COMFORT_FACTOR_BITS))
 669                         expand = (wanted << TDB_COMFORT_FACTOR_BITS);
 670                 if (tdb->methods->expand_file(tdb, expand) == -1)
 671                         goto fail;
 672                 /* We need to drop this lock before adding free record. */
 673                 tdb_unlock_expand(tdb, F_WRLCK);
 674
 675                 /* Allocate from here. */
 676                 tdb->zone_off = off;
 677                 tdb->zhdr = zhdr;
 678
 679                 /* FIXME: If this isn't sufficient, we search again... */
 680                 return add_free_record(tdb, zhdr.zone_bits,
 681                                        tdb->map_size - expand, expand);
 682         }
 683
 684         /* We are never allowed to cross a power-of-two boundary, and our
 685          * minimum zone size is 1 << INITIAL_ZONE_BITS.
 686          *
 687          * If our filesize is 128k, we can add a 64k or a 128k zone.  If it's
 688          * 192k, we can only add a 64k zone.
 689          *
 690          * In other words, our max zone size is (1 << (ffs(filesize) - 1)) */
 691         zone_bits = ffs64(old_size - sizeof(struct tdb_header)) - 1;
 692         assert(zone_bits >= INITIAL_ZONE_BITS);
 693
 694         /* Big zones generally good, but more zones wanted if contended. */
 695         if (zones_contended(tdb)) {
 696                 /* If it suffices, make zone same size as last one. */
 697                 if (zhdr.zone_bits < zone_bits
 698                     && (1ULL << zhdr.zone_bits) >= overhead(zone_bits)+wanted)
 699                         zone_bits = zhdr.zone_bits;
 700         }
 701
 702         zhdr.zone_bits = zone_bits;
 703         num_buckets = BUCKETS_FOR_ZONE(zone_bits);
 704
 705         /* Expand the file by more than we need right now. */
 706         expand = 1ULL << zone_bits;
 707         if (expand > overhead(zone_bits) + (wanted << TDB_COMFORT_FACTOR_BITS))
 708                 expand = overhead(zone_bits)
 709                         + (wanted << TDB_COMFORT_FACTOR_BITS);
 710
 711         if (tdb->methods->expand_file(tdb, expand) == -1)
 712                 goto fail;
 713
 714         /* Write new zone header (at old end). */
 715         off = old_size;
 716         if (tdb_write_convert(tdb, off, &zhdr, sizeof(zhdr)) == -1)
 717                 goto fail;
 718
 719         /* Now write empty buckets. */
 720         off += sizeof(zhdr);
 721         if (zero_out(tdb, off, (num_buckets+1) * sizeof(tdb_off_t)) == -1)
 722                 goto fail;
 723         off += (num_buckets+1) * sizeof(tdb_off_t);
 724
 725         /* Now add the rest as our free record. */
 726         if (add_free_record(tdb, zone_bits, off, expand - overhead(zone_bits))
 727             == -1)
 728                 goto fail;
 729
 730         /* Try allocating from this zone now. */
 731         tdb->zone_off = old_size;
 732         tdb->zhdr = zhdr;
 733
 734 success:
 735         tdb_unlock_expand(tdb, F_WRLCK);
 736         return 0;
 737
 738 fail:
 739         tdb_unlock_expand(tdb, F_WRLCK);
 740         return -1;
 741 }
 742
 743 /* This won't fail: it will expand the database if it has to. */
 744 tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
 745                 uint64_t hash, bool growing)
 746 {
 747         tdb_off_t off;
 748
 749         /* We can't hold pointers during this: we could unmap! */
 750         assert(!tdb->direct_access);
 751
 752         for (;;) {
 753                 off = get_free(tdb, keylen, datalen, growing, hash);
 754                 if (likely(off != 0))
 755                         break;
 756
 757                 if (tdb_expand(tdb, adjust_size(keylen, datalen, growing)))
 758                         return TDB_OFF_ERR;
 759         }
 760
 761         return off;
 762 }