/*
   Trivial Database 2: free list/block handling
   Copyright (C) Rusty Russell 2010

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 3 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "private.h"
#include <ccan/likely/likely.h>
#include <ccan/ilog/ilog.h>
#include <time.h>
#include <assert.h>
#include <limits.h>

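/* Highest set bit, 1-based: fls64(8) == 4, fls64(0) == 0 (ilog64 already
 * has exactly these semantics). */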
static unsigned fls64(uint64_t val)
{
        return ilog64(val);
}

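/* Lowest set bit, 1-based, matching __builtin_ffsll(): ffs64(8) == 4,
 * ffs64(0) == 0. */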
static unsigned ffs64(uint64_t val)
{
#if HAVE_BUILTIN_FFSLL
        return __builtin_ffsll(val);
#else
        unsigned r = 0;

        if (!val)
                return 0;

        if (!(val & 0xffffffff)) {
                val >>= 32;
                r += 32;
        }
        if (!(val & 0xffff)) {
                val >>= 16;
                r += 16;
        }
        if (!(val & 0xff)) {
                val >>= 8;
                r += 8;
        }
        if (!(val & 0xf)) {
                val >>= 4;
                r += 4;
        }
        if (!(val & 0x3)) {
                val >>= 2;
                r += 2;
        }
        if (!(val & 0x1)) {
                val >>= 1;
                r += 1;
        }
        /* r is the 0-based bit index; return 1-based to match the builtin. */
        return r + 1;
#endif
}

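/*
 * Sketch of the mapping: with excess = data_len - TDB_MIN_DATA_LEN, an
 * excess of 0..64 maps linearly onto buckets 0..8 in 8-byte steps; beyond
 * that each power-of-two range of excess shares one bucket, e.g. excess
 * 65..127 gives fls64(excess) + 2 == 9, excess 128..255 gives 10, and so
 * on, capped at BUCKETS_FOR_ZONE(zone_bits).
 */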
/* In which bucket would we find a particular record size? (ignoring header) */
unsigned int size_to_bucket(unsigned int zone_bits, tdb_len_t data_len)
{
        unsigned int bucket;

        /* We can't have records smaller than this. */
        assert(data_len >= TDB_MIN_DATA_LEN);

        /* Ignoring the header... */
        if (data_len - TDB_MIN_DATA_LEN <= 64) {
                /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
                bucket = (data_len - TDB_MIN_DATA_LEN) / 8;
        } else {
                /* After that we go power of 2. */
                bucket = fls64(data_len - TDB_MIN_DATA_LEN) + 2;
        }

        if (unlikely(bucket > BUCKETS_FOR_ZONE(zone_bits)))
                bucket = BUCKETS_FOR_ZONE(zone_bits);
        return bucket;
}

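/*
 * Zones sit contiguously after the header, each a power-of-two size that
 * never crosses a power-of-two file boundary (see tdb_expand below), so
 * repeatedly halving the candidate range converges on the zone containing
 * the offset.
 */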
/* Binary search for the zone for this offset. */
static tdb_off_t off_to_zone(struct tdb_context *tdb, tdb_off_t off,
                             struct free_zone_header *zhdr)
{
        tdb_off_t start, end;

        start = sizeof(struct tdb_header);
        end = start + (1ULL << fls64(tdb->map_size - start));

        for (;;) {
                if (tdb_read_convert(tdb, start, zhdr, sizeof(*zhdr)) == -1)
                        return TDB_OFF_ERR;

                /* Is it inside this zone? */
                if (off < start + (1ULL << zhdr->zone_bits))
                        return start;

                /* In practice, start + end won't overflow. */
                if (off >= (start + end) / 2)
                        start = (start + end) / 2;
                else
                        end = (start + end) / 2;
        }
}

static tdb_off_t last_zone(struct tdb_context *tdb,
                           struct free_zone_header *zhdr)
{
        return off_to_zone(tdb, tdb->map_size - 1, zhdr);
}

int tdb_zone_init(struct tdb_context *tdb)
{
        unsigned int i;
        uint64_t randoff = 0;

        /* We start in a random zone, to spread the load. */
        for (i = 0; i < 64; i += fls64(RAND_MAX))
                randoff ^= ((uint64_t)random()) << i;
        randoff = sizeof(struct tdb_header)
                + (randoff % (tdb->map_size - sizeof(struct tdb_header)));

        tdb->zone_off = off_to_zone(tdb, randoff, &tdb->zhdr);
        if (tdb->zone_off == TDB_OFF_ERR)
                return -1;
        return 0;
}

/* Where's the header, given a zone size of 1 << zone_bits? */
static tdb_off_t zone_off(tdb_off_t off, unsigned int zone_bits)
{
        off -= sizeof(struct tdb_header);
        return (off & ~((1ULL << zone_bits) - 1)) + sizeof(struct tdb_header);
}

/* Offset of a given bucket. */
/* FIXME: bucket can be "unsigned" everywhere, or even uint8/16. */
tdb_off_t bucket_off(tdb_off_t zone_off, tdb_off_t bucket)
{
        return zone_off
                + sizeof(struct free_zone_header)
                + bucket * sizeof(tdb_off_t);
}

/* Returns the first bucket >= bucket with a non-empty list, or
 * BUCKETS_FOR_ZONE(zone_bits) + 1 if all the remaining buckets are empty. */
static tdb_off_t find_free_head(struct tdb_context *tdb, tdb_off_t bucket)
{
        /* Speculatively search for a non-zero bucket. */
        return tdb_find_nonzero_off(tdb, bucket_off(tdb->zone_off, 0),
                                    bucket,
                                    BUCKETS_FOR_ZONE(tdb->zhdr.zone_bits) + 1);
}

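/*
 * Each bucket heads a doubly-linked list threaded through the file: the
 * bucket slot at b_off holds the offset of the first free record, and each
 * record's prev/next fields hold file offsets.  prev == 0 marks the list
 * head; next == 0 terminates the list.
 */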
/* Remove from free bucket. */
static int remove_from_list(struct tdb_context *tdb,
                            tdb_off_t b_off, tdb_off_t r_off,
                            struct tdb_free_record *r)
{
        tdb_off_t off;

        /* Front of list? */
        if (r->prev == 0) {
                off = b_off;
        } else {
                off = r->prev + offsetof(struct tdb_free_record, next);
        }

#ifdef DEBUG
        if (tdb_read_off(tdb, off) != r_off) {
                tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
                         "remove_from_list: %llu bad prev in list %llu\n",
                         (long long)r_off, (long long)b_off);
                return -1;
        }
#endif

        /* r->prev->next = r->next */
        if (tdb_write_off(tdb, off, r->next)) {
                return -1;
        }

        if (r->next != 0) {
                off = r->next + offsetof(struct tdb_free_record, prev);
                /* r->next->prev = r->prev */

#ifdef DEBUG
                if (tdb_read_off(tdb, off) != r_off) {
                        tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
                                 "remove_from_list: %llu bad list %llu\n",
                                 (long long)r_off, (long long)b_off);
                        return -1;
                }
#endif

                if (tdb_write_off(tdb, off, r->prev)) {
                        return -1;
                }
        }
        return 0;
}

/* Enqueue in this free bucket. */
static int enqueue_in_free(struct tdb_context *tdb,
                           tdb_off_t b_off,
                           tdb_off_t off,
                           struct tdb_free_record *new)
{
        new->prev = 0;
        /* new->next = head. */
        new->next = tdb_read_off(tdb, b_off);
        if (new->next == TDB_OFF_ERR)
                return -1;

        if (new->next) {
#ifdef DEBUG
                if (tdb_read_off(tdb,
                                 new->next
                                 + offsetof(struct tdb_free_record, prev))
                    != 0) {
                        tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
                                 "enqueue_in_free: %llu bad head prev %llu\n",
                                 (long long)new->next, (long long)b_off);
                        return -1;
                }
#endif
                /* next->prev = new. */
                if (tdb_write_off(tdb, new->next
                                  + offsetof(struct tdb_free_record, prev),
                                  off) != 0)
                        return -1;
        }
        /* head = new */
        if (tdb_write_off(tdb, b_off, off) != 0)
                return -1;

        return tdb_write_convert(tdb, off, new, sizeof(*new));
}

/* List need not be locked. */
int add_free_record(struct tdb_context *tdb,
                    unsigned int zone_bits,
                    tdb_off_t off, tdb_len_t len_with_header)
{
        struct tdb_free_record new;
        tdb_off_t b_off;
        int ret;

        assert(len_with_header >= sizeof(new));
        assert(zone_bits < 64);

        new.magic_and_meta = TDB_FREE_MAGIC | zone_bits;
        new.data_len = len_with_header - sizeof(struct tdb_used_record);

        b_off = bucket_off(zone_off(off, zone_bits),
                           size_to_bucket(zone_bits, new.data_len));
        if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0)
                return -1;

        ret = enqueue_in_free(tdb, b_off, off, &new);
        tdb_unlock_free_bucket(tdb, b_off);
        return ret;
}

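/* Sizes below TDB_MIN_DATA_LEN are raised to it (a freed record must be
 * able to hold the free-list fields; cf. the assert in add_free_record),
 * then rounded up to a multiple of 8. */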
static size_t adjust_size(size_t keylen, size_t datalen)
{
        size_t size = keylen + datalen;

        if (size < TDB_MIN_DATA_LEN)
                size = TDB_MIN_DATA_LEN;

        /* Round to next uint64_t boundary. */
        return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
}

/* If we have enough left over to be useful, split that off. */
static size_t record_leftover(size_t keylen, size_t datalen,
                              bool want_extra, size_t total_len)
{
        ssize_t leftover;

        if (want_extra)
                datalen += datalen / 2;
        leftover = total_len - adjust_size(keylen, datalen);

        if (leftover < (ssize_t)sizeof(struct tdb_free_record))
                return 0;

        return leftover;
}

/* Note: we unlock the current bucket if we coalesce or fail. */
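/* Returns 1 if we coalesced (current bucket unlocked, merged record re-added
 * to a free list), 0 if nothing adjacent was free (bucket still locked), or
 * -1 on error (bucket unlocked). */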
static int coalesce(struct tdb_context *tdb,
                    tdb_off_t zone_off, unsigned zone_bits,
                    tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
{
        struct tdb_free_record pad, *r;
        tdb_off_t zone_end, end;

        end = off + sizeof(struct tdb_used_record) + data_len;
        zone_end = zone_off + (1ULL << zone_bits);

        if (tdb->methods->oob(tdb, zone_end, true))
                zone_end = tdb->map_size;

        while (end < zone_end) {
                tdb_off_t nb_off;

                /* FIXME: do tdb_get here and below really win? */
                r = tdb_get(tdb, end, &pad, sizeof(pad));
                if (!r)
                        goto err;

                if (frec_magic(r) != TDB_FREE_MAGIC)
                        break;

                nb_off = bucket_off(zone_off,
                                    size_to_bucket(zone_bits, r->data_len));

                /* We may be violating lock order here, so best effort. */
                if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1)
                        break;

                /* Now we have lock, re-check. */
                r = tdb_get(tdb, end, &pad, sizeof(pad));
                if (!r) {
                        tdb_unlock_free_bucket(tdb, nb_off);
                        goto err;
                }

                if (unlikely(frec_magic(r) != TDB_FREE_MAGIC)) {
                        tdb_unlock_free_bucket(tdb, nb_off);
                        break;
                }

                if (unlikely(bucket_off(zone_off,
                                        size_to_bucket(zone_bits, r->data_len))
                             != nb_off)) {
                        tdb_unlock_free_bucket(tdb, nb_off);
                        break;
                }

                if (remove_from_list(tdb, nb_off, end, r) == -1) {
                        tdb_unlock_free_bucket(tdb, nb_off);
                        goto err;
                }

                end += sizeof(struct tdb_used_record) + r->data_len;
                tdb_unlock_free_bucket(tdb, nb_off);
        }

        /* Didn't find any adjacent free? */
        if (end == off + sizeof(struct tdb_used_record) + data_len)
                return 0;

        /* OK, expand record */
        r = tdb_get(tdb, off, &pad, sizeof(pad));
        if (!r)
                goto err;

        if (r->data_len != data_len) {
                tdb->ecode = TDB_ERR_CORRUPT;
                tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
                         "coalesce: expected data len %llu not %llu\n",
                         (long long)data_len, (long long)r->data_len);
                goto err;
        }

        if (remove_from_list(tdb, b_off, off, r) == -1)
                goto err;

        r = tdb_access_write(tdb, off, sizeof(*r), true);
        if (!r)
                goto err;

        /* We have to drop this to avoid deadlocks, so make sure record
         * doesn't get coalesced by someone else! */
        r->magic_and_meta = TDB_COALESCING_MAGIC | zone_bits;
        r->data_len = end - off - sizeof(struct tdb_used_record);
        if (tdb_access_commit(tdb, r) != 0)
                goto err;

        tdb_unlock_free_bucket(tdb, b_off);

        if (add_free_record(tdb, zone_bits, off, end - off) == -1)
                return -1;
        return 1;

err:
        /* To unify error paths, we *always* unlock bucket on error. */
        tdb_unlock_free_bucket(tdb, b_off);
        return -1;
}

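/* The walk below is best-fit with decaying fussiness: we take the smallest
 * record no smaller than `size` once it is within `multiplier` of what we
 * need, and multiplier grows 1% per record examined, so a long list gives
 * up the hunt for a tight fit early. */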
/* We need size bytes to put our key and data in. */
static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
                                tdb_off_t zone_off,
                                unsigned zone_bits,
                                tdb_off_t bucket,
                                size_t keylen, size_t datalen,
                                bool want_extra,
                                unsigned hashlow)
{
        tdb_off_t off, b_off, best_off;
        struct tdb_free_record pad, best = { 0 }, *r;
        double multiplier;
        size_t size = adjust_size(keylen, datalen);

again:
        b_off = bucket_off(zone_off, bucket);

        /* FIXME: Try non-blocking wait first, to measure contention.
         * If we're contended, try switching zones, and don't enlarge zone
         * next time (we want more zones). */
        /* Lock this bucket. */
        if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == -1) {
                return TDB_OFF_ERR;
        }

        best.data_len = -1ULL;
        best_off = 0;

        /* Get slack if we're after extra. */
        if (want_extra)
                multiplier = 1.5;
        else
                multiplier = 1.0;

        /* Walk the list to see if any are large enough, getting less fussy
         * as we go. */
        off = tdb_read_off(tdb, b_off);
        if (unlikely(off == TDB_OFF_ERR))
                goto unlock_err;

        while (off) {
                /* FIXME: Does tdb_get win anything here? */
                r = tdb_get(tdb, off, &pad, sizeof(*r));
                if (!r)
                        goto unlock_err;

                if (frec_magic(r) != TDB_FREE_MAGIC) {
                        tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
                                 "lock_and_alloc: %llu non-free 0x%llx\n",
                                 (long long)off, (long long)r->magic_and_meta);
                        goto unlock_err;
                }

                if (r->data_len >= size && r->data_len < best.data_len) {
                        best_off = off;
                        best = *r;
                }

                if (best.data_len < size * multiplier && best_off)
                        break;

                multiplier *= 1.01;

                /* Since we're going slow anyway, try coalescing here. */
                switch (coalesce(tdb, zone_off, zone_bits, off, b_off,
                                 r->data_len)) {
                case -1:
                        /* This has already unlocked on error. */
                        return TDB_OFF_ERR;
                case 1:
                        /* This has unlocked list, restart. */
                        goto again;
                }
                off = r->next;
        }

        /* If we found anything at all, use it. */
        if (best_off) {
                struct tdb_used_record rec;
                size_t leftover;

                /* We're happy with this size: take it. */
                if (remove_from_list(tdb, b_off, best_off, &best) != 0)
                        goto unlock_err;

                leftover = record_leftover(keylen, datalen, want_extra,
                                           best.data_len);

                assert(keylen + datalen + leftover <= best.data_len);
                /* We need to mark non-free before we drop lock, otherwise
                 * coalesce() could try to merge it! */
                if (set_header(tdb, &rec, keylen, datalen,
                               best.data_len - leftover,
                               hashlow, zone_bits) != 0)
                        goto unlock_err;

                if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0)
                        goto unlock_err;

                tdb_unlock_free_bucket(tdb, b_off);

                if (leftover) {
                        if (add_free_record(tdb, zone_bits,
                                            best_off + sizeof(rec)
                                            + best.data_len - leftover,
                                            leftover))
                                return TDB_OFF_ERR;
                }
                return best_off;
        }

        tdb_unlock_free_bucket(tdb, b_off);
        return 0;

unlock_err:
        tdb_unlock_free_bucket(tdb, b_off);
        return TDB_OFF_ERR;
}

static bool next_zone(struct tdb_context *tdb)
{
        tdb_off_t next = tdb->zone_off + (1ULL << tdb->zhdr.zone_bits);

        /* We must have a header. */
        if (tdb->methods->oob(tdb, next + sizeof(tdb->zhdr), true))
                return false;

        tdb->zone_off = next;
        return tdb_read_convert(tdb, next, &tdb->zhdr, sizeof(tdb->zhdr)) == 0;
}

/* Offset returned is within current zone (which it may alter). */
static tdb_off_t get_free(struct tdb_context *tdb,
                          size_t keylen, size_t datalen, bool want_extra,
                          unsigned hashlow)
{
        tdb_off_t start_zone = tdb->zone_off, off;
        bool wrapped = false;
        size_t size = adjust_size(keylen, datalen);

        /* If they are growing, add 50% to get to higher bucket. */
        if (want_extra)
                size += datalen / 2;

        /* FIXME: If we don't get a hit in the first bucket we want,
         * try changing zones for next time.  That should help wear
         * zones evenly, so we don't need to search all of them before
         * expanding. */
        while (!wrapped || tdb->zone_off != start_zone) {
                tdb_off_t b;

                /* Shortcut for really huge allocations... */
                if ((size >> tdb->zhdr.zone_bits) != 0)
                        goto next;

                /* Start at exact size bucket, and search up... */
                b = size_to_bucket(tdb->zhdr.zone_bits, size);
                for (b = find_free_head(tdb, b);
                     b <= BUCKETS_FOR_ZONE(tdb->zhdr.zone_bits);
                     b = find_free_head(tdb, b + 1)) {
                        /* Try getting one from list. */
                        off = lock_and_alloc(tdb, tdb->zone_off,
                                             tdb->zhdr.zone_bits,
                                             b, keylen, datalen, want_extra,
                                             hashlow);
                        if (off == TDB_OFF_ERR)
                                return TDB_OFF_ERR;
                        if (off != 0)
                                return off;
                        /* Didn't work.  Try next bucket. */
                }

        next:
                /* Didn't work, try next zone, if it exists. */
                if (!next_zone(tdb)) {
                        wrapped = true;
                        tdb->zone_off = sizeof(struct tdb_header);
                        if (tdb_read_convert(tdb, tdb->zone_off,
                                             &tdb->zhdr, sizeof(tdb->zhdr))) {
                                return TDB_OFF_ERR;
                        }
                }
        }
        return 0;
}

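/*
 * magic_and_meta layout, as encoded below:
 *   bits  0-5   zone_bits
 *   bits  6-10  low bits of the hash
 *   bits 11-42  extra (padding) length
 *   bits 43-47  keybits (the key length occupies 2*keybits bits of
 *               key_and_data_len; the data length gets the rest)
 *   bits 48-63  TDB_MAGIC
 */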
int set_header(struct tdb_context *tdb,
               struct tdb_used_record *rec,
               uint64_t keylen, uint64_t datalen,
               uint64_t actuallen, unsigned hashlow,
               unsigned int zone_bits)
{
        uint64_t keybits = (fls64(keylen) + 1) / 2;

        /* Use bottom bits of hash, so it's independent of hash table size. */
        rec->magic_and_meta
                = zone_bits
                | ((hashlow & ((1 << 5)-1)) << 6)
                | ((actuallen - (keylen + datalen)) << 11)
                | (keybits << 43)
                | (TDB_MAGIC << 48);
        rec->key_and_data_len = (keylen | (datalen << (keybits*2)));

        /* Encoding can fail on big values. */
        if (rec_key_length(rec) != keylen
            || rec_data_length(rec) != datalen
            || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
                tdb->ecode = TDB_ERR_IO;
                tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
                         "Could not encode k=%llu,d=%llu,a=%llu\n",
                         (long long)keylen, (long long)datalen,
                         (long long)actuallen);
                return -1;
        }
        return 0;
}

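/* Stub: contention isn't measured yet (cf. the FIXME in lock_and_alloc),
 * so we always report uncontended. */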
static bool zones_contended(struct tdb_context *tdb)
{
        return false;
}

/* Assume we want buckets up to the comfort factor. */
static tdb_len_t overhead(unsigned int zone_bits)
{
        return sizeof(struct free_zone_header)
                + (BUCKETS_FOR_ZONE(zone_bits) + 1) * sizeof(tdb_off_t);
}

/* Expand the database (by adding a zone). */
static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
{
        uint64_t old_size;
        tdb_off_t off;
        unsigned int num_buckets, zone_bits;
        tdb_len_t wanted, expand;
        struct free_zone_header zhdr;

        /* We need room for the record header too. */
        wanted = sizeof(struct tdb_used_record) + size;

        /* Only one person can expand file at a time. */
        if (tdb_lock_expand(tdb, F_WRLCK) != 0)
                return -1;

        /* Someone else may have expanded the file, so retry. */
        old_size = tdb->map_size;
        tdb->methods->oob(tdb, tdb->map_size + 1, true);
        if (tdb->map_size != old_size)
                goto success;

        /* Treat last zone as minimum reasonable zone size. */
        off = last_zone(tdb, &zhdr);
        if (off == TDB_OFF_ERR)
                goto fail;

        /* Zone isn't fully expanded? */
        if (tdb->map_size < off + (1ULL << zhdr.zone_bits)) {
                expand = off + (1ULL << zhdr.zone_bits) - tdb->map_size;
                /* Don't expand more than we want. */
                if (expand > (wanted << TDB_COMFORT_FACTOR_BITS))
                        expand = (wanted << TDB_COMFORT_FACTOR_BITS);
                if (tdb->methods->expand_file(tdb, expand) == -1)
                        goto fail;
                /* We need to drop this lock before adding free record. */
                tdb_unlock_expand(tdb, F_WRLCK);

                /* Allocate from here. */
                tdb->zone_off = off;
                tdb->zhdr = zhdr;

                /* FIXME: If this isn't sufficient, we search again... */
                return add_free_record(tdb, zhdr.zone_bits,
                                       tdb->map_size - expand, expand);
        }

        /* We are never allowed to cross a power-of-two boundary, and our
         * minimum zone size is 1 << INITIAL_ZONE_BITS.
         *
         * If our filesize is 128k, we can add a 64k or a 128k zone.  If it's
         * 192k, we can only add a 64k zone.
         *
         * In other words, our max zone size is (1 << (ffs(filesize) - 1)) */
        zone_bits = ffs64(old_size - sizeof(struct tdb_header)) - 1;
        assert(zone_bits >= INITIAL_ZONE_BITS);

        /* Big zones generally good, but more zones wanted if contended. */
        if (zones_contended(tdb)) {
                /* If it suffices, make zone same size as last one. */
                if (zhdr.zone_bits < zone_bits
                    && (1ULL << zhdr.zone_bits) >= overhead(zone_bits)+wanted)
                        zone_bits = zhdr.zone_bits;
        }

        zhdr.zone_bits = zone_bits;
        num_buckets = BUCKETS_FOR_ZONE(zone_bits);

        /* Expand the file by more than we need right now. */
        expand = 1ULL << zone_bits;
        if (expand > overhead(zone_bits) + (wanted << TDB_COMFORT_FACTOR_BITS))
                expand = overhead(zone_bits)
                        + (wanted << TDB_COMFORT_FACTOR_BITS);

        if (tdb->methods->expand_file(tdb, expand) == -1)
                goto fail;

        /* Write new zone header (at old end). */
        off = old_size;
        if (tdb_write_convert(tdb, off, &zhdr, sizeof(zhdr)) == -1)
                goto fail;

        /* Now write empty buckets. */
        off += sizeof(zhdr);
        if (zero_out(tdb, off, (num_buckets+1) * sizeof(tdb_off_t)) == -1)
                goto fail;
        off += (num_buckets+1) * sizeof(tdb_off_t);

        /* Now add the rest as our free record. */
        if (add_free_record(tdb, zone_bits, off, expand - overhead(zone_bits))
            == -1)
                goto fail;

        /* Try allocating from this zone now. */
        tdb->zone_off = old_size;
        tdb->zhdr = zhdr;

success:
        tdb_unlock_expand(tdb, F_WRLCK);
        return 0;

fail:
        tdb_unlock_expand(tdb, F_WRLCK);
        return -1;
}

/* This won't fail for lack of space: it expands the database if it has to
 * (though it can still return TDB_OFF_ERR on I/O or locking errors). */
tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
                uint64_t hash, bool growing)
{
        tdb_off_t off;

        /* We can't hold pointers during this: we could unmap! */
        assert(!tdb->direct_access);

        for (;;) {
                off = get_free(tdb, keylen, datalen, growing, hash);
                if (likely(off != 0))
                        break;

                if (tdb_expand(tdb, adjust_size(keylen, datalen)))
                        return TDB_OFF_ERR;
        }

        return off;
}