ccan/tdb2/free.c  (commit: tdb2: fix coalesce race #3)

/*
   Trivial Database 2: free list/block handling
   Copyright (C) Rusty Russell 2010

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 3 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "private.h"
#include <ccan/likely/likely.h>
#include <ccan/ilog/ilog.h>
#include <time.h>
#include <assert.h>
#include <limits.h>

static unsigned fls64(uint64_t val)
{
        return ilog64(val);
}

/* In which bucket would we find a particular record size? (ignoring header) */
unsigned int size_to_bucket(unsigned int zone_bits, tdb_len_t data_len)
{
        unsigned int bucket;

        /* We can't have records smaller than this. */
        assert(data_len >= TDB_MIN_DATA_LEN);

        /* Ignoring the header... */
        if (data_len - TDB_MIN_DATA_LEN <= 64) {
                /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
                bucket = (data_len - TDB_MIN_DATA_LEN) / 8;
        } else {
                /* After that we go power of 2. */
                bucket = fls64(data_len - TDB_MIN_DATA_LEN) + 2;
        }

        if (unlikely(bucket > BUCKETS_FOR_ZONE(zone_bits)))
                bucket = BUCKETS_FOR_ZONE(zone_bits);
        return bucket;
}

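/* Rough worked example of the mapping above (illustrative only; exact
 * values depend on TDB_MIN_DATA_LEN and are capped by BUCKETS_FOR_ZONE):
 *
 *   excess = data_len - TDB_MIN_DATA_LEN
 *   excess = 0   -> bucket 0
 *   excess = 8   -> bucket 1
 *   excess = 64  -> bucket 8
 *   excess = 65  -> fls64(65) + 2  = 7 + 2 = 9
 *   excess = 128 -> fls64(128) + 2 = 8 + 2 = 10
 *
 * i.e. linear 8-byte steps up to 64 bytes of excess, then one bucket per
 * power of two. */
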
/* Subtract 1-byte tailer and header.  Then round up to next power of 2. */
static unsigned max_zone_bits(struct tdb_context *tdb)
{
        return fls64(tdb->map_size-1-sizeof(struct tdb_header)-1) + 1;
}

/* Start by using a random zone to spread the load: returns the offset. */
static uint64_t random_zone(struct tdb_context *tdb)
{
        struct free_zone_header zhdr;
        tdb_off_t off = sizeof(struct tdb_header);
        tdb_len_t half_bits;
        uint64_t randbits = 0;
        unsigned int i;

        for (i = 0; i < 64; i += fls64(RAND_MAX))
                randbits ^= ((uint64_t)random()) << i;

        /* FIXME: Does this work?  Test! */
        half_bits = max_zone_bits(tdb) - 1;
        do {
                /* Pick left or right side (not outside file) */
                if ((randbits & 1)
                    && !tdb->methods->oob(tdb, off + (1ULL << half_bits)
                                          + sizeof(zhdr), true)) {
                        off += 1ULL << half_bits;
                }
                randbits >>= 1;

                if (tdb_read_convert(tdb, off, &zhdr, sizeof(zhdr)) == -1)
                        return TDB_OFF_ERR;

                if (zhdr.zone_bits == half_bits)
                        return off;

                half_bits--;
        } while (half_bits >= INITIAL_ZONE_BITS);

        tdb->ecode = TDB_ERR_CORRUPT;
        tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
                 "random_zone: zone at %llu smaller than %u bits?",
                 (long long)off, INITIAL_ZONE_BITS);
        return TDB_OFF_ERR;
}

int tdb_zone_init(struct tdb_context *tdb)
{
        tdb->zone_off = random_zone(tdb);
        if (tdb->zone_off == TDB_OFF_ERR)
                return -1;
        if (tdb_read_convert(tdb, tdb->zone_off,
                             &tdb->zhdr, sizeof(tdb->zhdr)) == -1)
                return -1;
        return 0;
}

/* Where's the header, given a zone size of 1 << zone_bits? */
static tdb_off_t zone_off(tdb_off_t off, unsigned int zone_bits)
{
        off -= sizeof(struct tdb_header);
        return (off & ~((1ULL << zone_bits) - 1)) + sizeof(struct tdb_header);
}

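/* Illustrative example (figures are not from the original source): with
 * zone_bits = 16 and H = sizeof(struct tdb_header), a record at offset
 * H + 0x1a234 lies in the zone starting at H + 0x10000, so zone_off()
 * masks off the low 16 bits of (off - H) and returns H + 0x10000, the
 * offset of that zone's free_zone_header. */
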
/* Offset of a given bucket. */
/* FIXME: bucket can be "unsigned" everywhere, or even uint8/16. */
tdb_off_t bucket_off(tdb_off_t zone_off, tdb_off_t bucket)
{
        return zone_off
                + sizeof(struct free_zone_header)
                + bucket * sizeof(tdb_off_t);
}

/* Returns free_buckets + 1, or list number to search. */
static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
{
        /* Speculatively search for a non-zero bucket. */
        return tdb_find_nonzero_off(tdb, bucket_off(tdb->zone_off, 0),
                                    bucket,
                                    BUCKETS_FOR_ZONE(tdb->zhdr.zone_bits) + 1);
}

/* Remove from free bucket. */
static int remove_from_list(struct tdb_context *tdb,
                            tdb_off_t b_off, tdb_off_t r_off,
                            struct tdb_free_record *r)
{
        tdb_off_t off;

        /* Front of list? */
        if (r->prev == 0) {
                off = b_off;
        } else {
                off = r->prev + offsetof(struct tdb_free_record, next);
        }

#ifdef DEBUG
        if (tdb_read_off(tdb, off) != r_off) {
                tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
                         "remove_from_list: %llu bad prev in list %llu\n",
                         (long long)r_off, (long long)b_off);
                return -1;
        }
#endif

        /* r->prev->next = r->next */
        if (tdb_write_off(tdb, off, r->next)) {
                return -1;
        }

        if (r->next != 0) {
                off = r->next + offsetof(struct tdb_free_record, prev);
                /* r->next->prev = r->prev */

#ifdef DEBUG
                if (tdb_read_off(tdb, off) != r_off) {
                        tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
                                 "remove_from_list: %llu bad list %llu\n",
                                 (long long)r_off, (long long)b_off);
                        return -1;
                }
#endif

                if (tdb_write_off(tdb, off, r->prev)) {
                        return -1;
                }
        }
        return 0;
}

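/* Sketch of the unlink above (illustrative only).  Each bucket slot holds
 * the offset of the first free record; records are doubly linked:
 *
 *   bucket slot -> A -> B -> C -> 0      (next pointers)
 *            0 <- A <- B <- C            (prev pointers; 0 means "front")
 *
 * Removing B writes B->next into A's next field (or into the bucket slot
 * itself if B is at the front), and writes B->prev into C's prev field if
 * C exists. */
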
/* Enqueue in this free bucket. */
static int enqueue_in_free(struct tdb_context *tdb,
                           tdb_off_t b_off,
                           tdb_off_t off,
                           struct tdb_free_record *new)
{
        new->prev = 0;
        /* new->next = head. */
        new->next = tdb_read_off(tdb, b_off);
        if (new->next == TDB_OFF_ERR)
                return -1;

        if (new->next) {
#ifdef DEBUG
                if (tdb_read_off(tdb,
                                 new->next
                                 + offsetof(struct tdb_free_record, prev))
                    != 0) {
                        tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
                                 "enqueue_in_free: %llu bad head prev %llu\n",
                                 (long long)new->next, (long long)b_off);
                        return -1;
                }
#endif
                /* next->prev = new. */
                if (tdb_write_off(tdb, new->next
                                  + offsetof(struct tdb_free_record, prev),
                                  off) != 0)
                        return -1;
        }
        /* head = new */
        if (tdb_write_off(tdb, b_off, off) != 0)
                return -1;

        return tdb_write_convert(tdb, off, new, sizeof(*new));
}

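/* Illustrative note: insertion is always at the head, so each bucket acts
 * as a LIFO stack of free records:
 *
 *   before:  bucket slot -> A -> B
 *   after:   bucket slot -> new -> A -> B
 *
 * The caller is expected to hold the bucket lock (add_free_record below
 * takes it), so the head read, the old head's prev update and the head
 * write appear atomic to other users of this bucket. */
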
/* List need not be locked. */
int add_free_record(struct tdb_context *tdb,
                    unsigned int zone_bits,
                    tdb_off_t off, tdb_len_t len_with_header)
{
        struct tdb_free_record new;
        tdb_off_t b_off;
        int ret;

        assert(len_with_header >= sizeof(new));
        assert(zone_bits < (1 << 6));

        new.magic_and_meta = TDB_FREE_MAGIC | zone_bits;
        new.data_len = len_with_header - sizeof(struct tdb_used_record);

        b_off = bucket_off(zone_off(off, zone_bits),
                           size_to_bucket(zone_bits, new.data_len));
        if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0)
                return -1;

        ret = enqueue_in_free(tdb, b_off, off, &new);
        tdb_unlock_free_bucket(tdb, b_off);
        return ret;
}

static size_t adjust_size(size_t keylen, size_t datalen, bool want_extra)
{
        size_t size = keylen + datalen;

        /* We want at least 50% growth for data. */
        if (want_extra)
                size += datalen/2;

        if (size < TDB_MIN_DATA_LEN)
                size = TDB_MIN_DATA_LEN;

        /* Round to next uint64_t boundary. */
        return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
}

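/* Rough worked example for adjust_size() (illustrative figures only):
 * keylen = 5, datalen = 11, want_extra = true:
 *   size = 5 + 11 = 16, plus 11/2 = 5 of slack -> 21
 *   rounded up to the next uint64_t boundary   -> 24
 * With want_extra = false the same record needs just 16 bytes (assuming
 * TDB_MIN_DATA_LEN is no larger than that). */
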
/* If we have enough left over to be useful, split that off. */
static size_t record_leftover(size_t keylen, size_t datalen,
                              bool want_extra, size_t total_len)
{
        ssize_t leftover;

        /* We might *want* extra, but not have it, so leftover is negative. */
        leftover = total_len - adjust_size(keylen, datalen, want_extra);
        if (leftover < (ssize_t)sizeof(struct tdb_free_record))
                return 0;

        /* If we want extra anyway, don't split unless we have 2x size. */
        if (want_extra && leftover <= datalen / 2)
                return 0;

        return leftover;
}

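/* Rough worked example (illustrative figures only), continuing the
 * adjust_size() numbers above: keylen = 5, datalen = 11, want_extra = true,
 * and a free record with total_len = 120:
 *   leftover = 120 - 24 = 96
 * 96 is at least sizeof(struct tdb_free_record) and exceeds datalen/2 = 5,
 * so 96 bytes get split off as a new free record.  With total_len = 28 the
 * leftover would be only 4 bytes, too small to hold a free record, so the
 * whole thing is used and record_leftover() returns 0. */
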
/* Note: we unlock the current bucket if we coalesce or fail. */
static int coalesce(struct tdb_context *tdb,
                    tdb_off_t zone_off, unsigned zone_bits,
                    tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
{
        struct tdb_free_record pad, *r;
        tdb_off_t end = off + sizeof(struct tdb_used_record) + data_len;

        while (end < (zone_off + (1ULL << zone_bits))) {
                tdb_off_t nb_off;

                /* FIXME: do tdb_get here and below really win? */
                r = tdb_get(tdb, end, &pad, sizeof(pad));
                if (!r)
                        goto err;

                if (frec_magic(r) != TDB_FREE_MAGIC)
                        break;

                nb_off = bucket_off(zone_off,
                                    size_to_bucket(zone_bits, r->data_len));

                /* We may be violating lock order here, so best effort. */
                if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1)
                        break;

                /* Now we have lock, re-check. */
                r = tdb_get(tdb, end, &pad, sizeof(pad));
                if (!r) {
                        tdb_unlock_free_bucket(tdb, nb_off);
                        goto err;
                }

                if (unlikely(frec_magic(r) != TDB_FREE_MAGIC)) {
                        tdb_unlock_free_bucket(tdb, nb_off);
                        break;
                }

                if (unlikely(bucket_off(zone_off,
                                        size_to_bucket(zone_bits, r->data_len))
                             != nb_off)) {
                        tdb_unlock_free_bucket(tdb, nb_off);
                        break;
                }

                if (remove_from_list(tdb, nb_off, end, r) == -1) {
                        tdb_unlock_free_bucket(tdb, nb_off);
                        goto err;
                }

                end += sizeof(struct tdb_used_record) + r->data_len;
                tdb_unlock_free_bucket(tdb, nb_off);
        }

        /* Didn't find any adjacent free? */
        if (end == off + sizeof(struct tdb_used_record) + data_len)
                return 0;

        /* OK, expand record */
        r = tdb_get(tdb, off, &pad, sizeof(pad));
        if (!r)
                goto err;

        if (r->data_len != data_len) {
                tdb->ecode = TDB_ERR_CORRUPT;
                tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
                         "coalesce: expected data len %llu not %llu\n",
                         (long long)data_len, (long long)r->data_len);
                goto err;
        }

        if (remove_from_list(tdb, b_off, off, r) == -1)
                goto err;

        r = tdb_access_write(tdb, off, sizeof(*r), true);
        if (!r)
                goto err;

        /* We have to drop this to avoid deadlocks, so make sure record
         * doesn't get coalesced by someone else! */
        r->magic_and_meta = TDB_COALESCING_MAGIC | zone_bits;
        r->data_len = end - off - sizeof(struct tdb_used_record);
        if (tdb_access_commit(tdb, r) != 0)
                goto err;

        tdb_unlock_free_bucket(tdb, b_off);

        if (add_free_record(tdb, zone_bits, off, end - off) == -1)
                return -1;
        return 1;

err:
        /* To unify error paths, we *always* unlock bucket on error. */
        tdb_unlock_free_bucket(tdb, b_off);
        return -1;
}

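/* Sketch of a successful coalesce (illustrative only).  Suppose two
 * adjacent records in the zone are both free:
 *
 *   off: [hdr | data_len bytes free] [hdr | neighbour, also free] ...
 *
 * The loop locks the neighbour's bucket with TDB_LOCK_NOWAIT (we may be
 * taking bucket locks out of order, hence "best effort"), re-checks that
 * it is still free and still belongs to that bucket, unlinks it, and
 * advances "end" past it.  The first record is then unlinked from its own
 * bucket, stamped with TDB_COALESCING_MAGIC so nobody else merges it while
 * its bucket lock is dropped, and finally re-added as one larger free
 * record via add_free_record(). */
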
/* We need size bytes to put our key and data in. */
static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
                                tdb_off_t zone_off,
                                unsigned zone_bits,
                                tdb_off_t bucket,
                                size_t keylen, size_t datalen,
                                bool want_extra,
                                unsigned hashlow)
{
        tdb_off_t off, b_off, best_off;
        struct tdb_free_record pad, best = { 0 }, *r;
        double multiplier;
        size_t size = keylen + datalen;

again:
        b_off = bucket_off(zone_off, bucket);

        /* FIXME: Try non-blocking wait first, to measure contention.
         * If we're contended, try switching zones, and don't enlarge zone
         * next time (we want more zones). */
        /* Lock this bucket. */
        if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == -1) {
                return TDB_OFF_ERR;
        }

        best.data_len = -1ULL;
        best_off = 0;

        /* Get slack if we're after extra. */
        if (want_extra)
                multiplier = 1.5;
        else
                multiplier = 1.0;

        /* Walk the list to see if any are large enough, getting less fussy
         * as we go. */
        off = tdb_read_off(tdb, b_off);
        if (unlikely(off == TDB_OFF_ERR))
                goto unlock_err;

        while (off) {
                /* FIXME: Does tdb_get win anything here? */
                r = tdb_get(tdb, off, &pad, sizeof(*r));
                if (!r)
                        goto unlock_err;

                if (frec_magic(r) != TDB_FREE_MAGIC) {
                        tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
                                 "lock_and_alloc: %llu non-free 0x%llx\n",
                                 (long long)off, (long long)r->magic_and_meta);
                        goto unlock_err;
                }

                if (r->data_len >= size && r->data_len < best.data_len) {
                        best_off = off;
                        best = *r;
                }

                if (best.data_len < size * multiplier && best_off)
                        break;

                multiplier *= 1.01;

                /* Since we're going slow anyway, try coalescing here. */
                switch (coalesce(tdb, zone_off, zone_bits, off, b_off,
                                 r->data_len)) {
                case -1:
                        /* This has already unlocked on error. */
                        return -1;
                case 1:
                        /* This has unlocked list, restart. */
                        goto again;
                }
                off = r->next;
        }

        /* If we found anything at all, use it. */
        if (best_off) {
                struct tdb_used_record rec;
                size_t leftover;

                /* We're happy with this size: take it. */
                if (remove_from_list(tdb, b_off, best_off, &best) != 0)
                        goto unlock_err;

                leftover = record_leftover(keylen, datalen, want_extra,
                                           best.data_len);

                /* We need to mark non-free before we drop lock, otherwise
                 * coalesce() could try to merge it! */
                if (set_header(tdb, &rec, keylen, datalen,
                               best.data_len - leftover,
                               hashlow, zone_bits) != 0)
                        goto unlock_err;

                if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0)
                        goto unlock_err;

                tdb_unlock_free_bucket(tdb, b_off);

                if (leftover) {
                        if (add_free_record(tdb, zone_bits,
                                            best_off + sizeof(rec)
                                            + best.data_len - leftover,
                                            leftover))
                                return TDB_OFF_ERR;
                }
                return best_off;
        }

        tdb_unlock_free_bucket(tdb, b_off);
        return 0;

unlock_err:
        tdb_unlock_free_bucket(tdb, b_off);
        return TDB_OFF_ERR;
}

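/* Illustrative note on the walk above: "multiplier" makes the best-fit
 * search progressively less fussy.  We stop as soon as the best record
 * found so far is smaller than size * multiplier.  With want_extra the
 * threshold starts at 1.5 and grows by 1% per record examined, so if,
 * say, the best candidate found early is 2x the needed size, the walk
 * settles for it after roughly 29 further records (1.5 * 1.01^29 > 2)
 * rather than scanning the whole bucket for a tighter fit. */
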
static bool next_zone(struct tdb_context *tdb)
{
        tdb_off_t next = tdb->zone_off + (1ULL << tdb->zhdr.zone_bits);

        /* We must have a header. */
        if (tdb->methods->oob(tdb, next + sizeof(tdb->zhdr), true))
                return false;

        tdb->zone_off = next;
        return tdb_read_convert(tdb, next, &tdb->zhdr, sizeof(tdb->zhdr)) == 0;
}

/* Offset returned is within current zone (which it may alter). */
static tdb_off_t get_free(struct tdb_context *tdb,
                          size_t keylen, size_t datalen, bool want_extra,
                          unsigned hashlow)
{
        tdb_off_t start_zone = tdb->zone_off, off;
        bool wrapped = false;
        size_t size = adjust_size(keylen, datalen, want_extra);

        /* If they are growing, add 50% to get to higher bucket. */
        if (want_extra)
                size += datalen / 2;

        /* FIXME: If we don't get a hit in the first bucket we want,
         * try changing zones for next time.  That should help wear
         * zones evenly, so we don't need to search all of them before
         * expanding. */
        while (!wrapped || tdb->zone_off != start_zone) {
                tdb_off_t b;

                /* Shortcut for really huge allocations... */
                if ((size >> tdb->zhdr.zone_bits) != 0)
                        goto next;

                /* Start at exact size bucket, and search up... */
                b = size_to_bucket(tdb->zhdr.zone_bits, size);
                for (b = find_free_head(tdb, b);
                     b <= BUCKETS_FOR_ZONE(tdb->zhdr.zone_bits);
                     b = find_free_head(tdb, b + 1)) {
                        /* Try getting one from list. */
                        off = lock_and_alloc(tdb, tdb->zone_off,
                                             tdb->zhdr.zone_bits,
                                             b, keylen, datalen, want_extra,
                                             hashlow);
                        if (off == TDB_OFF_ERR)
                                return TDB_OFF_ERR;
                        if (off != 0)
                                return off;
                        /* Didn't work.  Try next bucket. */
                }

        next:
                /* Didn't work, try next zone, if it exists. */
                if (!next_zone(tdb)) {
                        wrapped = true;
                        tdb->zone_off = sizeof(struct tdb_header);
                        if (tdb_read_convert(tdb, tdb->zone_off,
                                             &tdb->zhdr, sizeof(tdb->zhdr))) {
                                return TDB_OFF_ERR;
                        }
                }
        }
        return 0;
}

int set_header(struct tdb_context *tdb,
               struct tdb_used_record *rec,
               uint64_t keylen, uint64_t datalen,
               uint64_t actuallen, unsigned hashlow,
               unsigned int zone_bits)
{
        uint64_t keybits = (fls64(keylen) + 1) / 2;

        /* Use bottom bits of hash, so it's independent of hash table size. */
        rec->magic_and_meta
                = zone_bits
                | ((hashlow & ((1 << 5)-1)) << 6)
                | ((actuallen - (keylen + datalen)) << 11)
                | (keybits << 43)
                | (TDB_MAGIC << 48);
        rec->key_and_data_len = (keylen | (datalen << (keybits*2)));

        /* Encoding can fail on big values. */
        if (rec_key_length(rec) != keylen
            || rec_data_length(rec) != datalen
            || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
                tdb->ecode = TDB_ERR_IO;
                tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
                         "Could not encode k=%llu,d=%llu,a=%llu\n",
                         (long long)keylen, (long long)datalen,
                         (long long)actuallen);
                return -1;
        }
        return 0;
}

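/* Rough sketch of the magic_and_meta packing above, as implied by the
 * shifts in set_header() (the rec_*() accessors elsewhere are the
 * authoritative decoders):
 *
 *   bits  0-5    zone_bits
 *   bits  6-10   low 5 bits of the hash
 *   bits 11-42   extra padding (actuallen - (keylen + datalen))
 *   bits 43-47   keybits
 *   bits 48-63   TDB_MAGIC
 *
 * key_and_data_len stores keylen in its low 2*keybits bits and datalen
 * above that, which is why oversized values can fail the round-trip
 * check in set_header() and return -1. */
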
static bool zones_happy(struct tdb_context *tdb)
{
        /* FIXME: look at distribution of zones. */
        return true;
}

/* Assume we want buckets up to the comfort factor. */
static tdb_len_t overhead(unsigned int zone_bits)
{
        return sizeof(struct free_zone_header)
                + (BUCKETS_FOR_ZONE(zone_bits) + 1) * sizeof(tdb_off_t);
}

/* Expand the database (by adding a zone). */
static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
{
        uint64_t old_size;
        tdb_off_t off;
        uint8_t zone_bits;
        unsigned int num_buckets;
        tdb_len_t wanted;
        struct free_zone_header zhdr;
        bool enlarge_zone;

        /* We need room for the record header too. */
        wanted = sizeof(struct tdb_used_record) + size;

        /* Only one person can expand file at a time. */
        if (tdb_lock_expand(tdb, F_WRLCK) != 0)
                return -1;

        /* Someone else may have expanded the file, so retry. */
        old_size = tdb->map_size;
        tdb->methods->oob(tdb, tdb->map_size + 1, true);
        if (tdb->map_size != old_size)
                goto success;

        /* FIXME: Tailer is a bogus optimization, remove it. */
        /* zone bits tailer char is protected by EXPAND lock. */
        if (tdb->methods->read(tdb, old_size - 1, &zone_bits, 1) == -1)
                goto fail;

        /* If zones aren't working well, add larger zone if possible. */
        enlarge_zone = !zones_happy(tdb);

        /* New zone can be sized zone_bits, or larger if we're on the right
         * boundary. */
        for (;;) {
                /* Does this fit the allocation comfortably? */
                if ((1ULL << zone_bits) >= overhead(zone_bits) + wanted) {
                        /* Only let enlarge_zone enlarge us once. */
                        if (!enlarge_zone)
                                break;
                        enlarge_zone = false;
                }
                if ((old_size - 1 - sizeof(struct tdb_header))
                    & (1 << zone_bits))
                        break;
                zone_bits++;
        }

        zhdr.zone_bits = zone_bits;
        num_buckets = BUCKETS_FOR_ZONE(zone_bits);

        /* FIXME: I don't think we need to expand to full zone, do we? */
        if (tdb->methods->expand_file(tdb, 1ULL << zone_bits) == -1)
                goto fail;

        /* Write new tailer. */
        if (tdb->methods->write(tdb, tdb->map_size - 1, &zone_bits, 1) == -1)
                goto fail;

        /* Write new zone header (just before old tailer). */
        off = old_size - 1;
        if (tdb_write_convert(tdb, off, &zhdr, sizeof(zhdr)) == -1)
                goto fail;

        /* Now write empty buckets. */
        off += sizeof(zhdr);
        if (zero_out(tdb, off, (num_buckets+1) * sizeof(tdb_off_t)) == -1)
                goto fail;
        off += (num_buckets+1) * sizeof(tdb_off_t);

        /* Now add the rest as our free record. */
        if (add_free_record(tdb, zone_bits, off, tdb->map_size-1-off) == -1)
                goto fail;

        /* Try allocating from this zone now. */
        tdb->zone_off = old_size - 1;
        tdb->zhdr = zhdr;

success:
        tdb_unlock_expand(tdb, F_WRLCK);
        return 0;

fail:
        tdb_unlock_expand(tdb, F_WRLCK);
        return -1;
}

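/* Illustrative layout after a successful expand (not from the original
 * source).  If the file was old_size bytes and the new zone has zb bits:
 *
 *   old_size - 1            free_zone_header for the new zone (overwrites
 *                           the old 1-byte tailer)
 *   + sizeof(zhdr)          num_buckets + 1 empty bucket slots
 *   ...                     one big free record covering the remainder
 *   new map_size - 1        fresh 1-byte tailer holding zb
 */
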
/* This won't fail: it will expand the database if it has to. */
tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
                uint64_t hash, bool growing)
{
        tdb_off_t off;

        /* We can't hold pointers during this: we could unmap! */
        assert(!tdb->direct_access);

        for (;;) {
                off = get_free(tdb, keylen, datalen, growing, hash);
                if (likely(off != 0))
                        break;

                if (tdb_expand(tdb, adjust_size(keylen, datalen, growing)))
                        return TDB_OFF_ERR;
        }

        return off;
}
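
#if 0
/* Minimal usage sketch (illustrative only, not part of free.c): roughly
 * how a caller elsewhere in tdb2 might obtain space for a new record.
 * "key", "data" and the hash value are assumed to exist; the real callers
 * live in tdb.c and also write the key/data bytes and link the record
 * into its hash chain afterwards. */
static tdb_off_t example_alloc(struct tdb_context *tdb,
                               struct tdb_data key, struct tdb_data data,
                               uint64_t hash)
{
        /* Offset of a tdb_used_record header with room for
         * key.dsize + data.dsize bytes; expands the file if needed. */
        tdb_off_t off = alloc(tdb, key.dsize, data.dsize, hash, false);
        if (off == TDB_OFF_ERR)
                return TDB_OFF_ERR;     /* expansion or I/O failed */
        return off;
}
#endif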