2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 3 of the License, or (at your option) any later version.
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, see <http://www.gnu.org/licenses/>.
28 #include "tdb1_private.h"
30 /* read a freelist record and check for simple errors */
/* NOTE(review): this extract is missing several original lines (opening
 * brace, "return -1;" bodies, closing braces); the integer at the start of
 * each line is a pasted-in line number, not code. */
31 int tdb1_rec_free_read(struct tdb1_context *tdb, tdb1_off_t off, struct tdb1_record *rec)
/* Pull the raw record header at 'off' into *rec (TDB1_DOCONV handles any
 * endian conversion). */
33 if (tdb->methods->tdb1_read(tdb, off, rec, sizeof(*rec),TDB1_DOCONV()) == -1)
/* A live-data magic on the freelist is recoverable: it happens when a
 * process died mid-delete, so repair rather than fail hard. */
36 if (rec->magic == TDB1_MAGIC) {
37 /* this happens when an app is shut down while deleting a record - we should
38 not completely fail when this happens */
39 TDB1_LOG((tdb, TDB1_DEBUG_WARNING, "tdb1_rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
/* Repair in place: stamp the record as free and write the header back. */
41 rec->magic = TDB1_FREE_MAGIC;
42 if (tdb->methods->tdb1_write(tdb, off, rec, sizeof(*rec)) == -1)
/* Any other magic value is genuine corruption - set ecode and report. */
46 if (rec->magic != TDB1_FREE_MAGIC) {
47 /* Ensure ecode is set for log fn. */
48 tdb->ecode = TDB1_ERR_CORRUPT;
49 TDB1_LOG((tdb, TDB1_DEBUG_WARNING, "tdb1_rec_free_read bad magic 0x%x at offset=%d\n",
/* Sanity-check that the next-pointer plus one header still lies inside
 * the mapped file. */
53 if (tdb->methods->tdb1_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
59 /* update a record tailer (must hold allocation lock) */
/* The tailer lives in the final tdb1_off_t slot of the record
 * (offset + totalsize - sizeof(tdb1_off_t)); tdb1_free reads it to jump
 * back to the left neighbour's header when merging adjacent free records.
 * NOTE(review): the value argument of the tdb1_ofs_write call is on a line
 * missing from this extract - presumably &totalsize; confirm against the
 * upstream source. */
60 static int update_tailer(struct tdb1_context *tdb, tdb1_off_t offset,
61 const struct tdb1_record *rec)
65 /* Offset of tailer from record header */
66 totalsize = sizeof(*rec) + rec->rec_len;
67 return tdb1_ofs_write(tdb, offset + totalsize - sizeof(tdb1_off_t),
71 /* Add an element into the freelist. Merge adjacent records if
/* NOTE(review): the tail of the comment above (likely "possible */") and a
 * number of body lines (braces, gotos/returns, local declarations) are
 * missing from this extract; the leading integers are pasted line numbers. */
73 int tdb1_free(struct tdb1_context *tdb, tdb1_off_t offset, struct tdb1_record *rec)
75 /* Allocation and tailer lock */
76 if (tdb1_lock(tdb, -1, F_WRLCK) != 0)
79 /* set an initial tailer, so if we fail we don't leave a bogus record */
80 if (update_tailer(tdb, offset, rec) != 0) {
81 TDB1_LOG((tdb, TDB1_DEBUG_FATAL, "tdb1_free: update_tailer failed!\n"));
/* Try to merge with the record physically to the left: its tailer sits in
 * the tdb1_off_t immediately before our header. Only attempted when that
 * position is past the start of the data area. */
86 if (offset - sizeof(tdb1_off_t) > TDB1_DATA_START(tdb->header.hash_size)) {
87 tdb1_off_t left = offset - sizeof(tdb1_off_t);
91 /* Read in tailer and jump back to header */
92 if (tdb1_ofs_read(tdb, left, &leftsize) == -1) {
93 TDB1_LOG((tdb, TDB1_DEBUG_FATAL, "tdb1_free: left offset read failed at %u\n", left));
97 /* it could be uninitialised data */
98 if (leftsize == 0 || leftsize == TDB1_PAD_U32) {
/* the tailer holds the left record's total size, so subtracting it from
 * our offset lands on the left record's header */
102 left = offset - leftsize;
/* reject nonsense tailer values that would point before the data area
 * or before the start of the file */
104 if (leftsize > offset ||
105 left < TDB1_DATA_START(tdb->header.hash_size)) {
109 /* Now read in the left record */
110 if (tdb->methods->tdb1_read(tdb, left, &l, sizeof(l), TDB1_DOCONV()) == -1) {
111 TDB1_LOG((tdb, TDB1_DEBUG_FATAL, "tdb1_free: left read failed at %u (%u)\n", left, leftsize));
115 /* If it's free, expand to include it. */
116 if (l.magic == TDB1_FREE_MAGIC) {
117 /* we now merge the new record into the left record, rather than the other
118 way around. This makes the operation O(1) instead of O(n). This change
119 prevents traverse from being O(n^2) after a lot of deletes */
120 l.rec_len += sizeof(*rec) + rec->rec_len;
121 if (tdb1_rec_write(tdb, left, &l) == -1) {
122 TDB1_LOG((tdb, TDB1_DEBUG_FATAL, "tdb1_free: update_left failed at %u\n", left));
125 if (update_tailer(tdb, left, &l) == -1) {
126 TDB1_LOG((tdb, TDB1_DEBUG_FATAL, "tdb1_free: update_tailer failed at %u\n", offset));
/* merged into the left free record - drop the lock and we are done */
129 tdb1_unlock(tdb, -1, F_WRLCK);
136 /* Now, prepend to free list */
137 rec->magic = TDB1_FREE_MAGIC;
/* link at the head: rec->next <- old freelist top, then top <- offset */
139 if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP, &rec->next) == -1 ||
140 tdb1_rec_write(tdb, offset, rec) == -1 ||
141 tdb1_ofs_write(tdb, TDB1_FREELIST_TOP, &offset) == -1) {
142 TDB1_LOG((tdb, TDB1_DEBUG_FATAL, "tdb1_free record write failed at offset=%d\n", offset));
146 /* And we're done. */
147 tdb1_unlock(tdb, -1, F_WRLCK);
/* error path: release the allocation lock before returning */
151 tdb1_unlock(tdb, -1, F_WRLCK);
158 the core of tdb1_allocate - called when we have decided which
159 free list entry to use
161 Note that we try to allocate by grabbing data from the end of an existing record,
162 not the beginning. This is so the left merge in a free is more likely to be
163 able to free up the record without fragmentation
/* NOTE(review): the comment delimiters around the text above and several
 * body lines (returns, braces, else) are missing from this extract. */
165 static tdb1_off_t tdb1_allocate_ofs(struct tdb1_context *tdb,
166 tdb1_len_t length, tdb1_off_t rec_ptr,
167 struct tdb1_record *rec, tdb1_off_t last_ptr)
/* smallest leftover worth keeping as its own free record: one header,
 * one tailer, and 8 bytes of data */
169 #define MIN_REC_SIZE (sizeof(struct tdb1_record) + sizeof(tdb1_off_t) + 8)
171 if (rec->rec_len < length + MIN_REC_SIZE) {
172 /* we have to grab the whole record */
174 /* unlink it from the previous record */
175 if (tdb1_ofs_write(tdb, last_ptr, &rec->next) == -1) {
179 /* mark it not free */
180 rec->magic = TDB1_MAGIC;
181 if (tdb1_rec_write(tdb, rec_ptr, rec) == -1) {
187 /* we're going to just shorten the existing record */
188 rec->rec_len -= (length + sizeof(*rec));
189 if (tdb1_rec_write(tdb, rec_ptr, rec) == -1) {
192 if (update_tailer(tdb, rec_ptr, rec) == -1) {
196 /* and setup the new record */
/* the new allocation is carved from the END of the shortened record,
 * immediately after its remaining data */
197 rec_ptr += sizeof(*rec) + rec->rec_len;
199 memset(rec, '\0', sizeof(*rec));
200 rec->rec_len = length;
201 rec->magic = TDB1_MAGIC;
203 if (tdb1_rec_write(tdb, rec_ptr, rec) == -1) {
207 if (update_tailer(tdb, rec_ptr, rec) == -1) {
214 /* allocate some space from the free list. The offset returned points
215 to a unconnected tdb1_record within the database with room for at
216 least length bytes of total data
218 0 is returned if the space could not be allocated
220 tdb1_off_t tdb1_allocate(struct tdb1_context *tdb, tdb1_len_t length, struct tdb1_record *rec)
222 tdb1_off_t rec_ptr, last_ptr, newrec_ptr;
/* NOTE(review): the declarations below apparently belong to a local
 * "struct { tdb1_off_t rec_ptr, last_ptr; tdb1_len_t rec_len; } bestfit;"
 * whose wrapper lines are missing from this extract - the bestfit.* uses
 * further down support that; confirm against the upstream source. */
224 tdb1_off_t rec_ptr, last_ptr;
227 float multiplier = 1.0;
229 if (tdb1_lock(tdb, -1, F_WRLCK) == -1)
232 /* over-allocate to reduce fragmentation */
235 /* Extra bytes required for tailer */
236 length += sizeof(tdb1_off_t);
237 length = TDB1_ALIGN(length, TDB1_ALIGNMENT);
/* start the scan at the head of the freelist */
240 last_ptr = TDB1_FREELIST_TOP;
242 /* read in the freelist top */
243 if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP, &rec_ptr) == -1)
247 bestfit.last_ptr = 0;
251 this is a best fit allocation strategy. Originally we used
252 a first fit strategy, but it suffered from massive fragmentation
253 issues when faced with a slowly increasing record size.
/* walk the free list, remembering the smallest record that still fits */
256 if (tdb1_rec_free_read(tdb, rec_ptr, rec) == -1) {
260 if (rec->rec_len >= length) {
261 if (bestfit.rec_ptr == 0 ||
262 rec->rec_len < bestfit.rec_len) {
263 bestfit.rec_len = rec->rec_len;
264 bestfit.rec_ptr = rec_ptr;
265 bestfit.last_ptr = last_ptr;
269 /* move to the next record */
273 /* if we've found a record that is big enough, then
274 stop searching if its also not too big. The
275 definition of 'too big' changes as we scan
277 if (bestfit.rec_len > 0 &&
278 bestfit.rec_len < length * multiplier) {
282 /* this multiplier means we only extremely rarely
283 search more than 50 or so records. At 50 records we
284 accept records up to 11 times larger than what we
289 if (bestfit.rec_ptr != 0) {
/* re-read the winner's header (rec was reused as scratch during the
 * scan) and carve the allocation out of it */
290 if (tdb1_rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
294 newrec_ptr = tdb1_allocate_ofs(tdb, length, bestfit.rec_ptr,
295 rec, bestfit.last_ptr);
296 tdb1_unlock(tdb, -1, F_WRLCK);
300 /* we didn't find enough space. See if we can expand the
301 database and if we can then try again */
302 if (tdb1_expand(tdb, length + sizeof(*rec)) == 0)
305 tdb1_unlock(tdb, -1, F_WRLCK);