git.ozlabs.org Git - ccan/blobdiff - ccan/tdb2/free.c
configurator: HAVE_SECTION_START_STOP
[ccan] / ccan / tdb2 / free.c
index 7482daa2153de5f61773353a4607eb8646a895d8..1b2c552a66fecea4a5e3190a4b4d3e1f5fe51cc3 100644 (file)
@@ -65,18 +65,18 @@ enum TDB_ERROR tdb_ftable_init(struct tdb_context *tdb)
        unsigned int rnd, max = 0, count = 0;
        tdb_off_t off;
 
-       tdb->ftable_off = off = first_ftable(tdb);
-       tdb->ftable = 0;
+       tdb->tdb2.ftable_off = off = first_ftable(tdb);
+       tdb->tdb2.ftable = 0;
 
        while (off) {
                if (TDB_OFF_IS_ERR(off)) {
-                       return off;
+                       return TDB_OFF_TO_ERR(off);
                }
 
                rnd = random();
                if (rnd >= max) {
-                       tdb->ftable_off = off;
-                       tdb->ftable = count;
+                       tdb->tdb2.ftable_off = off;
+                       tdb->tdb2.ftable = count;
                        max = rnd;
                }
 
@@ -146,14 +146,14 @@ static enum TDB_ERROR remove_from_list(struct tdb_context *tdb,
        /* Get prev->next */
        prev_next = tdb_read_off(tdb, off);
        if (TDB_OFF_IS_ERR(prev_next))
-               return prev_next;
+               return TDB_OFF_TO_ERR(prev_next);
 
        /* If prev->next == 0, we were head: update bucket to point to next. */
        if (prev_next == 0) {
                /* We must preserve upper bits. */
                head = tdb_read_off(tdb, b_off);
                if (TDB_OFF_IS_ERR(head))
-                       return head;
+                       return TDB_OFF_TO_ERR(head);
 
                if ((head & TDB_OFF_MASK) != r_off) {
                        return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
@@ -178,7 +178,7 @@ static enum TDB_ERROR remove_from_list(struct tdb_context *tdb,
        if (r->next == 0) {
                head = tdb_read_off(tdb, b_off);
                if (TDB_OFF_IS_ERR(head))
-                       return head;
+                       return TDB_OFF_TO_ERR(head);
                head &= TDB_OFF_MASK;
                off = head + offsetof(struct tdb_free_record, magic_and_prev);
        } else {
@@ -215,10 +215,10 @@ static enum TDB_ERROR enqueue_in_free(struct tdb_context *tdb,
 
        head = tdb_read_off(tdb, b_off);
        if (TDB_OFF_IS_ERR(head))
-               return head;
+               return TDB_OFF_TO_ERR(head);
 
        /* We only need to set ftable_and_len; rest is set in enqueue_in_free */
-       new.ftable_and_len = ((uint64_t)tdb->ftable << (64 - TDB_OFF_UPPER_STEAL))
+       new.ftable_and_len = ((uint64_t)tdb->tdb2.ftable << (64 - TDB_OFF_UPPER_STEAL))
                | len;
 
        /* new->next = head. */
@@ -287,8 +287,8 @@ static tdb_off_t ftable_offset(struct tdb_context *tdb, unsigned int ftable)
        tdb_off_t off;
        unsigned int i;
 
-       if (likely(tdb->ftable == ftable))
-               return tdb->ftable_off;
+       if (likely(tdb->tdb2.ftable == ftable))
+               return tdb->tdb2.ftable_off;
 
        off = first_ftable(tdb);
        for (i = 0; i < ftable; i++) {
@@ -336,7 +336,7 @@ static tdb_len_t coalesce(struct tdb_context *tdb,
                nb_off = ftable_offset(tdb, ftable);
                if (TDB_OFF_IS_ERR(nb_off)) {
                        tdb_access_release(tdb, r);
-                       ecode = nb_off;
+                       ecode = TDB_OFF_TO_ERR(nb_off);
                        goto err;
                }
                nb_off = bucket_off(nb_off, bucket);
@@ -370,8 +370,10 @@ static tdb_len_t coalesce(struct tdb_context *tdb,
                }
 
                /* Did we just mess up a record you were hoping to use? */
-               if (end == *protect)
-                       *protect = TDB_ERR_NOEXIST;
+               if (end == *protect) {
+                       tdb->stats.alloc_coalesce_iterate_clash++;
+                       *protect = TDB_ERR_TO_OFF(TDB_ERR_NOEXIST);
+               }
 
                ecode = remove_from_list(tdb, nb_off, end, &rec);
                check_list(tdb, nb_off);
@@ -390,8 +392,10 @@ static tdb_len_t coalesce(struct tdb_context *tdb,
                return 0;
 
        /* Before we expand, check this isn't one you wanted protected? */
-       if (off == *protect)
-               *protect = TDB_ERR_EXISTS;
+       if (off == *protect) {
+               *protect = TDB_ERR_TO_OFF(TDB_ERR_EXISTS);
+               tdb->stats.alloc_coalesce_iterate_clash++;
+       }
 
        /* OK, expand initial record */
        ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
@@ -416,7 +420,8 @@ static tdb_len_t coalesce(struct tdb_context *tdb,
        ecode = add_free_record(tdb, off, end - off, TDB_LOCK_NOWAIT, false);
        if (ecode != TDB_SUCCESS) {
                /* Need to drop lock.  Can't rely on anything stable. */
-               *protect = TDB_ERR_CORRUPT;
+               tdb->stats.alloc_coalesce_lockfail++;
+               *protect = TDB_ERR_TO_OFF(TDB_ERR_CORRUPT);
 
                /* We have to drop this to avoid deadlocks, so make sure record
                 * doesn't get coalesced by someone else! */
@@ -431,18 +436,18 @@ static tdb_len_t coalesce(struct tdb_context *tdb,
                        goto err;
                }
 
-               tdb->stats.alloc_coalesce_succeeded++;
                tdb_unlock_free_bucket(tdb, b_off);
 
                ecode = add_free_record(tdb, off, end - off, TDB_LOCK_WAIT,
                                        false);
                if (ecode != TDB_SUCCESS) {
-                       return ecode;
+                       return TDB_ERR_TO_OFF(ecode);
                }
        } else if (TDB_OFF_IS_ERR(*protect)) {
                /* For simplicity, we always drop lock if they can't continue */
                tdb_unlock_free_bucket(tdb, b_off);
        }
+       tdb->stats.alloc_coalesce_succeeded++;
 
        /* Return usable length. */
        return end - off - sizeof(struct tdb_used_record);
@@ -450,25 +455,27 @@ static tdb_len_t coalesce(struct tdb_context *tdb,
 err:
        /* To unify error paths, we *always* unlock bucket on error. */
        tdb_unlock_free_bucket(tdb, b_off);
-       return ecode;
+       return TDB_ERR_TO_OFF(ecode);
 }
 
 /* List is locked: we unlock it. */
 static enum TDB_ERROR coalesce_list(struct tdb_context *tdb,
-                                   tdb_off_t ftable_off, tdb_off_t b_off)
+                                   tdb_off_t ftable_off,
+                                   tdb_off_t b_off,
+                                   unsigned int limit)
 {
        enum TDB_ERROR ecode;
        tdb_off_t off;
 
        off = tdb_read_off(tdb, b_off);
        if (TDB_OFF_IS_ERR(off)) {
-               ecode = off;
+               ecode = TDB_OFF_TO_ERR(off);
                goto unlock_err;
        }
-       /* A little bit of paranoia */
+       /* A little bit of paranoia: counter should be 0. */
        off &= TDB_OFF_MASK;
 
-       while (off) {
+       while (off && limit--) {
                struct tdb_free_record rec;
                tdb_len_t coal;
                tdb_off_t next;
@@ -481,15 +488,91 @@ static enum TDB_ERROR coalesce_list(struct tdb_context *tdb,
                coal = coalesce(tdb, off, b_off, frec_len(&rec), &next);
                if (TDB_OFF_IS_ERR(coal)) {
                        /* This has already unlocked on error. */
-                       return coal;
+                       return TDB_OFF_TO_ERR(coal);
                }
                if (TDB_OFF_IS_ERR(next)) {
                        /* Coalescing had to unlock, so stop. */
                        return TDB_SUCCESS;
                }
+               /* Keep going if we're doing well... */
+               limit += size_to_bucket(coal / 16 + TDB_MIN_DATA_LEN);
                off = next;
        }
 
+       /* Now, move those elements to the tail of the list so we get something
+        * else next time. */
+       if (off) {
+               struct tdb_free_record oldhrec, newhrec, oldtrec, newtrec;
+               tdb_off_t oldhoff, oldtoff, newtoff;
+
+               /* The record we were up to is the new head. */
+               ecode = tdb_read_convert(tdb, off, &newhrec, sizeof(newhrec));
+               if (ecode != TDB_SUCCESS)
+                       goto unlock_err;
+
+               /* Get the new tail. */
+               newtoff = frec_prev(&newhrec);
+               ecode = tdb_read_convert(tdb, newtoff, &newtrec,
+                                        sizeof(newtrec));
+               if (ecode != TDB_SUCCESS)
+                       goto unlock_err;
+
+               /* Get the old head. */
+               oldhoff = tdb_read_off(tdb, b_off);
+               if (TDB_OFF_IS_ERR(oldhoff)) {
+                       ecode = TDB_OFF_TO_ERR(oldhoff);
+                       goto unlock_err;
+               }
+
+               /* This could happen if they all coalesced away. */
+               if (oldhoff == off)
+                       goto out;
+
+               ecode = tdb_read_convert(tdb, oldhoff, &oldhrec,
+                                        sizeof(oldhrec));
+               if (ecode != TDB_SUCCESS)
+                       goto unlock_err;
+
+               /* Get the old tail. */
+               oldtoff = frec_prev(&oldhrec);
+               ecode = tdb_read_convert(tdb, oldtoff, &oldtrec,
+                                        sizeof(oldtrec));
+               if (ecode != TDB_SUCCESS)
+                       goto unlock_err;
+
+               /* Old tail's next points to old head. */
+               oldtrec.next = oldhoff;
+
+               /* Old head's prev points to old tail. */
+               oldhrec.magic_and_prev
+                       = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL))
+                       | oldtoff;
+
+               /* New tail's next is 0. */
+               newtrec.next = 0;
+
+               /* Write out the modified versions. */
+               ecode = tdb_write_convert(tdb, oldtoff, &oldtrec,
+                                         sizeof(oldtrec));
+               if (ecode != TDB_SUCCESS)
+                       goto unlock_err;
+
+               ecode = tdb_write_convert(tdb, oldhoff, &oldhrec,
+                                         sizeof(oldhrec));
+               if (ecode != TDB_SUCCESS)
+                       goto unlock_err;
+
+               ecode = tdb_write_convert(tdb, newtoff, &newtrec,
+                                         sizeof(newtrec));
+               if (ecode != TDB_SUCCESS)
+                       goto unlock_err;
+               
+               /* And finally link in new head. */
+               ecode = tdb_write_off(tdb, b_off, off);
+               if (ecode != TDB_SUCCESS)
+                       goto unlock_err;
+       }
+out:
        tdb_unlock_free_bucket(tdb, b_off);
        return TDB_SUCCESS;
 
@@ -512,7 +595,7 @@ enum TDB_ERROR add_free_record(struct tdb_context *tdb,
 
        len = len_with_header - sizeof(struct tdb_used_record);
 
-       b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
+       b_off = bucket_off(tdb->tdb2.ftable_off, size_to_bucket(len));
        ecode = tdb_lock_free_bucket(tdb, b_off, waitflag);
        if (ecode != TDB_SUCCESS) {
                return ecode;
@@ -523,7 +606,7 @@ enum TDB_ERROR add_free_record(struct tdb_context *tdb,
 
        /* Coalescing unlocks free list. */
        if (!ecode && coalesce)
-               ecode = coalesce_list(tdb, tdb->ftable_off, b_off);
+               ecode = coalesce_list(tdb, tdb->tdb2.ftable_off, b_off, 2);
        else
                tdb_unlock_free_bucket(tdb, b_off);
        return ecode;
@@ -578,7 +661,7 @@ static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
        /* Lock this bucket. */
        ecode = tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT);
        if (ecode != TDB_SUCCESS) {
-               return ecode;
+               return TDB_ERR_TO_OFF(ecode);
        }
 
        best.ftable_and_len = -1ULL;
@@ -594,7 +677,7 @@ static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
         * as we go. */
        off = tdb_read_off(tdb, b_off);
        if (TDB_OFF_IS_ERR(off)) {
-               ecode = off;
+               ecode = TDB_OFF_TO_ERR(off);
                goto unlock_err;
        }
        off &= TDB_OFF_MASK;
@@ -669,7 +752,7 @@ static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
 
                /* For futureproofing, we put a 0 in any unused space. */
                if (rec_extra_padding(&rec)) {
-                       ecode = tdb->methods->twrite(tdb, best_off + sizeof(rec)
+                       ecode = tdb->tdb2.io->twrite(tdb, best_off + sizeof(rec)
                                                     + keylen + datalen, "", 1);
                        if (ecode != TDB_SUCCESS) {
                                goto unlock_err;
@@ -685,7 +768,7 @@ static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
                                                + frec_len(&best) - leftover,
                                                leftover, TDB_LOCK_WAIT, false);
                        if (ecode != TDB_SUCCESS) {
-                               best_off = ecode;
+                               best_off = TDB_ERR_TO_OFF(ecode);
                        }
                }
                tdb_unlock_free_bucket(tdb, b_off);
@@ -698,7 +781,7 @@ static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
 
 unlock_err:
        tdb_unlock_free_bucket(tdb, b_off);
-       return ecode;
+       return TDB_ERR_TO_OFF(ecode);
 }
 
 /* Get a free block from current free list, or 0 if none, -ve on error. */
@@ -717,9 +800,9 @@ static tdb_off_t get_free(struct tdb_context *tdb,
        else
                start_b = size_to_bucket(adjust_size(keylen, datalen));
 
-       ftable_off = tdb->ftable_off;
-       ftable = tdb->ftable;
-       while (!wrapped || ftable_off != tdb->ftable_off) {
+       ftable_off = tdb->tdb2.ftable_off;
+       ftable = tdb->tdb2.ftable;
+       while (!wrapped || ftable_off != tdb->tdb2.ftable_off) {
                /* Start at exact size bucket, and search up... */
                for (b = find_free_head(tdb, ftable_off, start_b);
                     b < TDB_FREE_BUCKETS;
@@ -736,8 +819,8 @@ static tdb_off_t get_free(struct tdb_context *tdb,
                                if (b == TDB_FREE_BUCKETS - 1)
                                        tdb->stats.alloc_bucket_max++;
                                /* Worked?  Stay using this list. */
-                               tdb->ftable_off = ftable_off;
-                               tdb->ftable = ftable;
+                               tdb->tdb2.ftable_off = ftable_off;
+                               tdb->tdb2.ftable = ftable;
                                return off;
                        }
                        /* Didn't work.  Try next bucket. */
@@ -815,7 +898,7 @@ static enum TDB_ERROR tdb_expand(struct tdb_context *tdb, tdb_len_t size)
 
        /* Someone else may have expanded the file, so retry. */
        old_size = tdb->file->map_size;
-       tdb->methods->oob(tdb, tdb->file->map_size + 1, true);
+       tdb->tdb2.io->oob(tdb, tdb->file->map_size + 1, true);
        if (tdb->file->map_size != old_size) {
                tdb_unlock_expand(tdb, F_WRLCK);
                return TDB_SUCCESS;
@@ -847,7 +930,7 @@ static enum TDB_ERROR tdb_expand(struct tdb_context *tdb, tdb_len_t size)
        /* We need room for the record header too. */
        wanted = adjust_size(0, sizeof(struct tdb_used_record) + wanted);
 
-       ecode = tdb->methods->expand_file(tdb, wanted);
+       ecode = tdb->tdb2.io->expand_file(tdb, wanted);
        if (ecode != TDB_SUCCESS) {
                tdb_unlock_expand(tdb, F_WRLCK);
                return ecode;
@@ -867,7 +950,7 @@ tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
        tdb_off_t off;
 
        /* We can't hold pointers during this: we could unmap! */
-       assert(!tdb->direct_access);
+       assert(!tdb->tdb2.direct_access);
 
        for (;;) {
                enum TDB_ERROR ecode;
@@ -877,7 +960,7 @@ tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
 
                ecode = tdb_expand(tdb, adjust_size(keylen, datalen));
                if (ecode != TDB_SUCCESS) {
-                       return ecode;
+                       return TDB_ERR_TO_OFF(ecode);
                }
        }