git.ozlabs.org Git - ccan/blob - ccan/tdb2/free.c

   1  /*
   2    Trivial Database 2: free list/block handling
   3    Copyright (C) Rusty Russell 2010
   4
   5    This library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 3 of the License, or (at your option) any later version.
   9
  10    This library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  17 */
  18 #include "private.h"
  19 #include <ccan/likely/likely.h>
  20 #include <ccan/ilog/ilog.h>
  21 #include <time.h>
  22 #include <assert.h>
  23 #include <limits.h>
  24
  25 static unsigned fls64(uint64_t val)
  26 {
  27         return ilog64(val);
  28 }
  29
  30 /* In which bucket would we find a particular record size? (ignoring header) */
  31 unsigned int size_to_bucket(unsigned int zone_bits, tdb_len_t data_len)
  32 {
  33         unsigned int bucket;
  34
  35         /* We can't have records smaller than this. */
  36         assert(data_len >= TDB_MIN_DATA_LEN);
  37
  38         /* Ignoring the header... */
  39         if (data_len - TDB_MIN_DATA_LEN <= 64) {
  40                 /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
  41                 bucket = (data_len - TDB_MIN_DATA_LEN) / 8;
  42         } else {
  43                 /* After that we go power of 2. */
  44                 bucket = fls64(data_len - TDB_MIN_DATA_LEN) + 2;
  45         }
  46
  47         if (unlikely(bucket > BUCKETS_FOR_ZONE(zone_bits)))
  48                 bucket = BUCKETS_FOR_ZONE(zone_bits);
  49         return bucket;
  50 }
  51
  52 /* Subtract 1-byte tailer and header.  Then round up to next power of 2. */
  53 static unsigned max_zone_bits(struct tdb_context *tdb)
  54 {
  55         return fls64(tdb->map_size-1-sizeof(struct tdb_header)-1) + 1;
  56 }
  57
  58 /* Start by using a random zone to spread the load: returns the offset. */
  59 static uint64_t random_zone(struct tdb_context *tdb)
  60 {
  61         struct free_zone_header zhdr;
  62         tdb_off_t off = sizeof(struct tdb_header);
  63         tdb_len_t half_bits;
  64         uint64_t randbits = 0;
  65         unsigned int i;
  66
  67         for (i = 0; i < 64; i += fls64(RAND_MAX))
  68                 randbits ^= ((uint64_t)random()) << i;
  69
  70         /* FIXME: Does this work?  Test! */
  71         half_bits = max_zone_bits(tdb) - 1;
  72         do {
  73                 /* Pick left or right side (not outside file) */
  74                 if ((randbits & 1)
  75                     && !tdb->methods->oob(tdb, off + (1ULL << half_bits)
  76                                           + sizeof(zhdr), true)) {
  77                         off += 1ULL << half_bits;
  78                 }
  79                 randbits >>= 1;
  80
  81                 if (tdb_read_convert(tdb, off, &zhdr, sizeof(zhdr)) == -1)
  82                         return TDB_OFF_ERR;
  83
  84                 if (zhdr.zone_bits == half_bits)
  85                         return off;
  86
  87                 half_bits--;
  88         } while (half_bits >= INITIAL_ZONE_BITS);
  89
  90         tdb->ecode = TDB_ERR_CORRUPT;
  91         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
  92                  "random_zone: zone at %llu smaller than %u bits?",
  93                  (long long)off, INITIAL_ZONE_BITS);
  94         return TDB_OFF_ERR;
  95 }
  96
  97 int tdb_zone_init(struct tdb_context *tdb)
  98 {
  99         tdb->zone_off = random_zone(tdb);
 100         if (tdb->zone_off == TDB_OFF_ERR)
 101                 return -1;
 102         if (tdb_read_convert(tdb, tdb->zone_off,
 103                              &tdb->zhdr, sizeof(tdb->zhdr)) == -1)
 104                 return -1;
 105         return 0;
 106 }
 107
 108 /* Where's the header, given a zone size of 1 << zone_bits? */
 109 static tdb_off_t zone_off(tdb_off_t off, unsigned int zone_bits)
 110 {
 111         off -= sizeof(struct tdb_header);
 112         return (off & ~((1ULL << zone_bits) - 1)) + sizeof(struct tdb_header);
 113 }
 114
 115 /* Offset of a given bucket. */
 116 /* FIXME: bucket can be "unsigned" everywhere, or even uint8/16. */
 117 tdb_off_t bucket_off(tdb_off_t zone_off, tdb_off_t bucket)
 118 {
 119         return zone_off
 120                 + sizeof(struct free_zone_header)
 121                 + bucket * sizeof(tdb_off_t);
 122 }
 123
 124 /* Returns free_buckets + 1, or list number to search. */
 125 static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
 126 {
 127         /* Speculatively search for a non-zero bucket. */
 128         return tdb_find_nonzero_off(tdb, bucket_off(tdb->zone_off, 0),
 129                                     bucket,
 130                                     BUCKETS_FOR_ZONE(tdb->zhdr.zone_bits) + 1);
 131 }
 132
 133 /* Remove from free bucket. */
 134 static int remove_from_list(struct tdb_context *tdb,
 135                             tdb_off_t b_off, tdb_off_t r_off,
 136                             struct tdb_free_record *r)
 137 {
 138         tdb_off_t off;
 139
 140         /* Front of list? */
 141         if (r->prev == 0) {
 142                 off = b_off;
 143         } else {
 144                 off = r->prev + offsetof(struct tdb_free_record, next);
 145         }
 146
 147 #ifdef DEBUG
 148         if (tdb_read_off(tdb, off) != r_off) {
 149                 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
 150                          "remove_from_list: %llu bad prev in list %llu\n",
 151                          (long long)r_off, (long long)b_off);
 152                 return -1;
 153         }
 154 #endif
 155
 156         /* r->prev->next = r->next */
 157         if (tdb_write_off(tdb, off, r->next)) {
 158                 return -1;
 159         }
 160
 161         if (r->next != 0) {
 162                 off = r->next + offsetof(struct tdb_free_record, prev);
 163                 /* r->next->prev = r->prev */
 164
 165 #ifdef DEBUG
 166                 if (tdb_read_off(tdb, off) != r_off) {
 167                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
 168                                  "remove_from_list: %llu bad list %llu\n",
 169                                  (long long)r_off, (long long)b_off);
 170                         return -1;
 171                 }
 172 #endif
 173
 174                 if (tdb_write_off(tdb, off, r->prev)) {
 175                         return -1;
 176                 }
 177         }
 178         return 0;
 179 }
 180
 181 /* Enqueue in this free bucket. */
 182 static int enqueue_in_free(struct tdb_context *tdb,
 183                            tdb_off_t b_off,
 184                            tdb_off_t off,
 185                            struct tdb_free_record *new)
 186 {
 187         new->prev = 0;
 188         /* new->next = head. */
 189         new->next = tdb_read_off(tdb, b_off);
 190         if (new->next == TDB_OFF_ERR)
 191                 return -1;
 192
 193         if (new->next) {
 194 #ifdef DEBUG
 195                 if (tdb_read_off(tdb,
 196                                  new->next
 197                                  + offsetof(struct tdb_free_record, prev))
 198                     != 0) {
 199                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
 200                                  "enqueue_in_free: %llu bad head prev %llu\n",
 201                                  (long long)new->next, (long long)b_off);
 202                         return -1;
 203                 }
 204 #endif
 205                 /* next->prev = new. */
 206                 if (tdb_write_off(tdb, new->next
 207                                   + offsetof(struct tdb_free_record, prev),
 208                                   off) != 0)
 209                         return -1;
 210         }
 211         /* head = new */
 212         if (tdb_write_off(tdb, b_off, off) != 0)
 213                 return -1;
 214
 215         return tdb_write_convert(tdb, off, new, sizeof(*new));
 216 }
 217
 218 /* List need not be locked. */
 219 int add_free_record(struct tdb_context *tdb,
 220                     unsigned int zone_bits,
 221                     tdb_off_t off, tdb_len_t len_with_header)
 222 {
 223         struct tdb_free_record new;
 224         tdb_off_t b_off;
 225         int ret;
 226
 227         assert(len_with_header >= sizeof(new));
 228         assert(zone_bits < (1 << 6));
 229
 230         new.magic_and_meta = TDB_FREE_MAGIC | zone_bits;
 231         new.data_len = len_with_header - sizeof(struct tdb_used_record);
 232
 233         b_off = bucket_off(zone_off(off, zone_bits),
 234                            size_to_bucket(zone_bits, new.data_len));
 235         if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0)
 236                 return -1;
 237
 238         ret = enqueue_in_free(tdb, b_off, off, &new);
 239         tdb_unlock_free_bucket(tdb, b_off);
 240         return ret;
 241 }
 242
 243 static size_t adjust_size(size_t keylen, size_t datalen, bool want_extra)
 244 {
 245         size_t size = keylen + datalen;
 246
 247         /* We want at least 50% growth for data. */
 248         if (want_extra)
 249                 size += datalen/2;
 250
 251         if (size < TDB_MIN_DATA_LEN)
 252                 size = TDB_MIN_DATA_LEN;
 253
 254         /* Round to next uint64_t boundary. */
 255         return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
 256 }
 257
 258 /* If we have enough left over to be useful, split that off. */
 259 static size_t record_leftover(size_t keylen, size_t datalen,
 260                               bool want_extra, size_t total_len)
 261 {
 262         ssize_t leftover;
 263
 264         /* We might *want* extra, but not have it, so leftover is negative. */
 265         leftover = total_len - adjust_size(keylen, datalen, want_extra);
 266         if (leftover < (ssize_t)sizeof(struct tdb_free_record))
 267                 return 0;
 268
 269         /* If we want extra anwyay, don't split unless we have 2x size. */
 270         if (want_extra && leftover <= datalen / 2)
 271                 return 0;
 272
 273         return leftover;
 274 }
 275
 276 /* Note: we unlock the current bucket if we coalesce or fail. */
 277 static int coalesce(struct tdb_context *tdb,
 278                     tdb_off_t zone_off, unsigned zone_bits,
 279                     tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
 280 {
 281         struct tdb_free_record pad, *r;
 282         tdb_off_t end = off + sizeof(struct tdb_used_record) + data_len;
 283
 284         while (end < (zone_off + (1ULL << zone_bits))) {
 285                 tdb_off_t nb_off;
 286
 287                 /* FIXME: do tdb_get here and below really win? */
 288                 r = tdb_get(tdb, end, &pad, sizeof(pad));
 289                 if (!r)
 290                         goto err;
 291
 292                 if (frec_magic(r) != TDB_FREE_MAGIC)
 293                         break;
 294
 295                 nb_off = bucket_off(zone_off,
 296                                     size_to_bucket(zone_bits, r->data_len));
 297
 298                 /* We may be violating lock order here, so best effort. */
 299                 if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1)
 300                         break;
 301
 302                 /* Now we have lock, re-check. */
 303                 r = tdb_get(tdb, end, &pad, sizeof(pad));
 304                 if (!r) {
 305                         tdb_unlock_free_bucket(tdb, nb_off);
 306                         goto err;
 307                 }
 308
 309                 if (unlikely(frec_magic(r) != TDB_FREE_MAGIC)) {
 310                         tdb_unlock_free_bucket(tdb, nb_off);
 311                         break;
 312                 }
 313
 314                 if (unlikely(bucket_off(zone_off,
 315                                         size_to_bucket(zone_bits, r->data_len))
 316                              != nb_off)) {
 317                         tdb_unlock_free_bucket(tdb, nb_off);
 318                         break;
 319                 }
 320
 321                 if (remove_from_list(tdb, nb_off, end, r) == -1) {
 322                         tdb_unlock_free_bucket(tdb, nb_off);
 323                         goto err;
 324                 }
 325
 326                 end += sizeof(struct tdb_used_record) + r->data_len;
 327                 tdb_unlock_free_bucket(tdb, nb_off);
 328         }
 329
 330         /* Didn't find any adjacent free? */
 331         if (end == off + sizeof(struct tdb_used_record) + data_len)
 332                 return 0;
 333
 334         /* OK, expand record */
 335         r = tdb_get(tdb, off, &pad, sizeof(pad));
 336         if (!r)
 337                 goto err;
 338
 339         if (r->data_len != data_len) {
 340                 tdb->ecode = TDB_ERR_CORRUPT;
 341                 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
 342                          "coalesce: expected data len %llu not %llu\n",
 343                          (long long)data_len, (long long)r->data_len);
 344                 goto err;
 345         }
 346
 347         if (remove_from_list(tdb, b_off, off, r) == -1)
 348                 goto err;
 349
 350         /* We have to drop this to avoid deadlocks. */
 351         tdb_unlock_free_bucket(tdb, b_off);
 352
 353         if (add_free_record(tdb, zone_bits, off, end - off) == -1)
 354                 return -1;
 355         return 1;
 356
 357 err:
 358         /* To unify error paths, we *always* unlock bucket on error. */
 359         tdb_unlock_free_bucket(tdb, b_off);
 360         return -1;
 361 }
 362
 363 /* We need size bytes to put our key and data in. */
 364 static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
 365                                 tdb_off_t zone_off,
 366                                 unsigned zone_bits,
 367                                 tdb_off_t bucket,
 368                                 size_t keylen, size_t datalen,
 369                                 bool want_extra,
 370                                 unsigned hashlow)
 371 {
 372         tdb_off_t off, b_off,best_off;
 373         struct tdb_free_record pad, best = { 0 }, *r;
 374         double multiplier;
 375         size_t size = keylen + datalen;
 376
 377 again:
 378         b_off = bucket_off(zone_off, bucket);
 379
 380         /* FIXME: Try non-blocking wait first, to measure contention.
 381          * If we're contented, try switching zones, and don't enlarge zone
 382          * next time (we want more zones). */
 383         /* Lock this bucket. */
 384         if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == -1) {
 385                 return TDB_OFF_ERR;
 386         }
 387
 388         best.data_len = -1ULL;
 389         best_off = 0;
 390
 391         /* Get slack if we're after extra. */
 392         if (want_extra)
 393                 multiplier = 1.5;
 394         else
 395                 multiplier = 1.0;
 396
 397         /* Walk the list to see if any are large enough, getting less fussy
 398          * as we go. */
 399         off = tdb_read_off(tdb, b_off);
 400         if (unlikely(off == TDB_OFF_ERR))
 401                 goto unlock_err;
 402
 403         while (off) {
 404                 /* FIXME: Does tdb_get win anything here? */
 405                 r = tdb_get(tdb, off, &pad, sizeof(*r));
 406                 if (!r)
 407                         goto unlock_err;
 408
 409                 if (frec_magic(r) != TDB_FREE_MAGIC) {
 410                         tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
 411                                  "lock_and_alloc: %llu non-free 0x%llx\n",
 412                                  (long long)off, (long long)r->magic_and_meta);
 413                         goto unlock_err;
 414                 }
 415
 416                 if (r->data_len >= size && r->data_len < best.data_len) {
 417                         best_off = off;
 418                         best = *r;
 419                 }
 420
 421                 if (best.data_len < size * multiplier && best_off)
 422                         break;
 423
 424                 multiplier *= 1.01;
 425
 426                 /* Since we're going slow anyway, try coalescing here. */
 427                 switch (coalesce(tdb, zone_off, zone_bits, off, b_off,
 428                                  r->data_len)) {
 429                 case -1:
 430                         /* This has already unlocked on error. */
 431                         return -1;
 432                 case 1:
 433                         /* This has unlocked list, restart. */
 434                         goto again;
 435                 }
 436                 off = r->next;
 437         }
 438
 439         /* If we found anything at all, use it. */
 440         if (best_off) {
 441                 struct tdb_used_record rec;
 442                 size_t leftover;
 443
 444                 /* We're happy with this size: take it. */
 445                 if (remove_from_list(tdb, b_off, best_off, &best) != 0)
 446                         goto unlock_err;
 447
 448                 leftover = record_leftover(keylen, datalen, want_extra,
 449                                            best.data_len);
 450
 451                 /* We need to mark non-free before we drop lock, otherwise
 452                  * coalesce() could try to merge it! */
 453                 if (set_header(tdb, &rec, keylen, datalen,
 454                                best.data_len - leftover,
 455                                hashlow, zone_bits) != 0)
 456                         goto unlock_err;
 457
 458                 if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0)
 459                         goto unlock_err;
 460
 461                 tdb_unlock_free_bucket(tdb, b_off);
 462
 463                 if (leftover) {
 464                         if (add_free_record(tdb, zone_bits,
 465                                             best_off + sizeof(rec)
 466                                             + best.data_len - leftover,
 467                                             leftover))
 468                                 return TDB_OFF_ERR;
 469                 }
 470                 return best_off;
 471         }
 472
 473         tdb_unlock_free_bucket(tdb, b_off);
 474         return 0;
 475
 476 unlock_err:
 477         tdb_unlock_free_bucket(tdb, b_off);
 478         return TDB_OFF_ERR;
 479 }
 480
 481 static bool next_zone(struct tdb_context *tdb)
 482 {
 483         tdb_off_t next = tdb->zone_off + (1ULL << tdb->zhdr.zone_bits);
 484
 485         /* We must have a header. */
 486         if (tdb->methods->oob(tdb, next + sizeof(tdb->zhdr), true))
 487                 return false;
 488
 489         tdb->zone_off = next;
 490         return tdb_read_convert(tdb, next, &tdb->zhdr, sizeof(tdb->zhdr)) == 0;
 491 }
 492
 493 /* Offset returned is within current zone (which it may alter). */
 494 static tdb_off_t get_free(struct tdb_context *tdb,
 495                           size_t keylen, size_t datalen, bool want_extra,
 496                           unsigned hashlow)
 497 {
 498         tdb_off_t start_zone = tdb->zone_off, off;
 499         bool wrapped = false;
 500         size_t size = adjust_size(keylen, datalen, want_extra);
 501
 502         /* If they are growing, add 50% to get to higher bucket. */
 503         if (want_extra)
 504                 size += datalen / 2;
 505
 506         /* FIXME: If we don't get a hit in the first bucket we want,
 507          * try changing zones for next time.  That should help wear
 508          * zones evenly, so we don't need to search all of them before
 509          * expanding. */
 510         while (!wrapped || tdb->zone_off != start_zone) {
 511                 tdb_off_t b;
 512
 513                 /* Shortcut for really huge allocations... */
 514                 if ((size >> tdb->zhdr.zone_bits) != 0)
 515                         goto next;
 516
 517                 /* Start at exact size bucket, and search up... */
 518                 b = size_to_bucket(tdb->zhdr.zone_bits, size);
 519                 for (b = find_free_head(tdb, b);
 520                      b <= BUCKETS_FOR_ZONE(tdb->zhdr.zone_bits);
 521                      b += find_free_head(tdb, b + 1)) {
 522                         /* Try getting one from list. */
 523                         off = lock_and_alloc(tdb, tdb->zone_off,
 524                                              tdb->zhdr.zone_bits,
 525                                              b, keylen, datalen, want_extra,
 526                                              hashlow);
 527                         if (off == TDB_OFF_ERR)
 528                                 return TDB_OFF_ERR;
 529                         if (off != 0)
 530                                 return off;
 531                         /* Didn't work.  Try next bucket. */
 532                 }
 533
 534         next:
 535                 /* Didn't work, try next zone, if it exists. */
 536                 if (!next_zone(tdb)) {
 537                         wrapped = true;
 538                         tdb->zone_off = sizeof(struct tdb_header);
 539                         if (tdb_read_convert(tdb, tdb->zone_off,
 540                                              &tdb->zhdr, sizeof(tdb->zhdr))) {
 541                                 return TDB_OFF_ERR;
 542                         }
 543                 }
 544         }
 545         return 0;
 546 }
 547
 548 int set_header(struct tdb_context *tdb,
 549                struct tdb_used_record *rec,
 550                uint64_t keylen, uint64_t datalen,
 551                uint64_t actuallen, unsigned hashlow,
 552                unsigned int zone_bits)
 553 {
 554         uint64_t keybits = (fls64(keylen) + 1) / 2;
 555
 556         /* Use bottom bits of hash, so it's independent of hash table size. */
 557         rec->magic_and_meta
 558                 = zone_bits
 559                 | ((hashlow & ((1 << 5)-1)) << 6)
 560                 | ((actuallen - (keylen + datalen)) << 11)
 561                 | (keybits << 43)
 562                 | (TDB_MAGIC << 48);
 563         rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
 564
 565         /* Encoding can fail on big values. */
 566         if (rec_key_length(rec) != keylen
 567             || rec_data_length(rec) != datalen
 568             || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
 569                 tdb->ecode = TDB_ERR_IO;
 570                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
 571                          "Could not encode k=%llu,d=%llu,a=%llu\n",
 572                          (long long)keylen, (long long)datalen,
 573                          (long long)actuallen);
 574                 return -1;
 575         }
 576         return 0;
 577 }
 578
 579 static bool zones_happy(struct tdb_context *tdb)
 580 {
 581         /* FIXME: look at distribution of zones. */
 582         return true;
 583 }
 584
 585 /* Assume we want buckets up to the comfort factor. */
 586 static tdb_len_t overhead(unsigned int zone_bits)
 587 {
 588         return sizeof(struct free_zone_header)
 589                 + (BUCKETS_FOR_ZONE(zone_bits) + 1) * sizeof(tdb_off_t);
 590 }
 591
 592 /* Expand the database (by adding a zone). */
 593 static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 594 {
 595         uint64_t old_size;
 596         tdb_off_t off;
 597         uint8_t zone_bits;
 598         unsigned int num_buckets;
 599         tdb_len_t wanted;
 600         struct free_zone_header zhdr;
 601         bool enlarge_zone;
 602
 603         /* We need room for the record header too. */
 604         wanted = sizeof(struct tdb_used_record) + size;
 605
 606         /* Only one person can expand file at a time. */
 607         if (tdb_lock_expand(tdb, F_WRLCK) != 0)
 608                 return -1;
 609
 610         /* Someone else may have expanded the file, so retry. */
 611         old_size = tdb->map_size;
 612         tdb->methods->oob(tdb, tdb->map_size + 1, true);
 613         if (tdb->map_size != old_size)
 614                 goto success;
 615
 616         /* FIXME: Tailer is a bogus optimization, remove it. */
 617         /* zone bits tailer char is protected by EXPAND lock. */
 618         if (tdb->methods->read(tdb, old_size - 1, &zone_bits, 1) == -1)
 619                 goto fail;
 620
 621         /* If zones aren't working well, add larger zone if possible. */
 622         enlarge_zone = !zones_happy(tdb);
 623
 624         /* New zone can be between zone_bits or larger if we're on the right
 625          * boundary. */
 626         for (;;) {
 627                 /* Does this fit the allocation comfortably? */
 628                 if ((1ULL << zone_bits) >= overhead(zone_bits) + wanted) {
 629                         /* Only let enlarge_zone enlarge us once. */
 630                         if (!enlarge_zone)
 631                                 break;
 632                         enlarge_zone = false;
 633                 }
 634                 if ((old_size - 1 - sizeof(struct tdb_header))
 635                     & (1 << zone_bits))
 636                         break;
 637                 zone_bits++;
 638         }
 639
 640         zhdr.zone_bits = zone_bits;
 641         num_buckets = BUCKETS_FOR_ZONE(zone_bits);
 642
 643         /* FIXME: I don't think we need to expand to full zone, do we? */
 644         if (tdb->methods->expand_file(tdb, 1ULL << zone_bits) == -1)
 645                 goto fail;
 646
 647         /* Write new tailer. */
 648         if (tdb->methods->write(tdb, tdb->map_size - 1, &zone_bits, 1) == -1)
 649                 goto fail;
 650
 651         /* Write new zone header (just before old tailer). */
 652         off = old_size - 1;
 653         if (tdb_write_convert(tdb, off, &zhdr, sizeof(zhdr)) == -1)
 654                 goto fail;
 655
 656         /* Now write empty buckets. */
 657         off += sizeof(zhdr);
 658         if (zero_out(tdb, off, (num_buckets+1) * sizeof(tdb_off_t)) == -1)
 659                 goto fail;
 660         off += (num_buckets+1) * sizeof(tdb_off_t);
 661
 662         /* Now add the rest as our free record. */
 663         if (add_free_record(tdb, zone_bits, off, tdb->map_size-1-off) == -1)
 664                 goto fail;
 665
 666         /* Try allocating from this zone now. */
 667         tdb->zone_off = old_size - 1;
 668         tdb->zhdr = zhdr;
 669
 670 success:
 671         tdb_unlock_expand(tdb, F_WRLCK);
 672         return 0;
 673
 674 fail:
 675         tdb_unlock_expand(tdb, F_WRLCK);
 676         return -1;
 677 }
 678
 679 /* This won't fail: it will expand the database if it has to. */
 680 tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
 681                 uint64_t hash, bool growing)
 682 {
 683         tdb_off_t off;
 684
 685         /* We can't hold pointers during this: we could unmap! */
 686         assert(!tdb->direct_access);
 687
 688         for (;;) {
 689                 off = get_free(tdb, keylen, datalen, growing, hash);
 690                 if (likely(off != 0))
 691                         break;
 692
 693                 if (tdb_expand(tdb, adjust_size(keylen, datalen, growing)))
 694                         return TDB_OFF_ERR;
 695         }
 696
 697         return off;
 698 }