git.ozlabs.org Git - ccan/blob - ccan/tdb2/tdb1_freelist.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
  28 #include "tdb1_private.h"
  29
  30 /* read a freelist record and check for simple errors */
  31 int tdb1_rec_free_read(struct tdb_context *tdb, tdb1_off_t off, struct tdb1_record *rec)
  32 {
  33         if (tdb->tdb1.io->tdb1_read(tdb, off, rec, sizeof(*rec),TDB1_DOCONV()) == -1)
  34                 return -1;
  35
  36         if (rec->magic == TDB1_MAGIC) {
  37                 /* this happens when a app is showdown while deleting a record - we should
  38                    not completely fail when this happens */
  39                 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_WARNING,
  40                            "tdb1_rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
  41                            rec->magic, off);
  42                 rec->magic = TDB1_FREE_MAGIC;
  43                 if (tdb->tdb1.io->tdb1_write(tdb, off, rec, sizeof(*rec)) == -1)
  44                         return -1;
  45         }
  46
  47         if (rec->magic != TDB1_FREE_MAGIC) {
  48                 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
  49                                         "tdb1_rec_free_read bad magic 0x%x at offset=%d\n",
  50                                         rec->magic, off);
  51                 return -1;
  52         }
  53         if (tdb->tdb1.io->tdb1_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
  54                 return -1;
  55         return 0;
  56 }
  57
  58
  59 /* update a record tailer (must hold allocation lock) */
  60 static int update_tailer(struct tdb_context *tdb, tdb1_off_t offset,
  61                          const struct tdb1_record *rec)
  62 {
  63         tdb1_off_t totalsize;
  64
  65         /* Offset of tailer from record header */
  66         totalsize = sizeof(*rec) + rec->rec_len;
  67         return tdb1_ofs_write(tdb, offset + totalsize - sizeof(tdb1_off_t),
  68                          &totalsize);
  69 }
  70
  71 /* Add an element into the freelist. Merge adjacent records if
  72    necessary. */
  73 int tdb1_free(struct tdb_context *tdb, tdb1_off_t offset, struct tdb1_record *rec)
  74 {
  75         /* Allocation and tailer lock */
  76         if (tdb1_lock(tdb, -1, F_WRLCK) != 0)
  77                 return -1;
  78
  79         /* set an initial tailer, so if we fail we don't leave a bogus record */
  80         if (update_tailer(tdb, offset, rec) != 0) {
  81                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
  82                            "tdb_free: update_tailer failed!\n");
  83                 goto fail;
  84         }
  85
  86         /* Look left */
  87         if (offset - sizeof(tdb1_off_t) > TDB1_DATA_START(tdb->tdb1.header.hash_size)) {
  88                 tdb1_off_t left = offset - sizeof(tdb1_off_t);
  89                 struct tdb1_record l;
  90                 tdb1_off_t leftsize;
  91
  92                 /* Read in tailer and jump back to header */
  93                 if (tdb1_ofs_read(tdb, left, &leftsize) == -1) {
  94                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
  95                                    "tdb1_free: left offset read failed at %u", left);
  96                         goto update;
  97                 }
  98
  99                 /* it could be uninitialised data */
 100                 if (leftsize == 0 || leftsize == TDB1_PAD_U32) {
 101                         goto update;
 102                 }
 103
 104                 left = offset - leftsize;
 105
 106                 if (leftsize > offset ||
 107                     left < TDB1_DATA_START(tdb->tdb1.header.hash_size)) {
 108                         goto update;
 109                 }
 110
 111                 /* Now read in the left record */
 112                 if (tdb->tdb1.io->tdb1_read(tdb, left, &l, sizeof(l), TDB1_DOCONV()) == -1) {
 113                         tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 114                                    "tdb1_free: left read failed at %u (%u)", left, leftsize);
 115                         goto update;
 116                 }
 117
 118                 /* If it's free, expand to include it. */
 119                 if (l.magic == TDB1_FREE_MAGIC) {
 120                         /* we now merge the new record into the left record, rather than the other
 121                            way around. This makes the operation O(1) instead of O(n). This change
 122                            prevents traverse from being O(n^2) after a lot of deletes */
 123                         l.rec_len += sizeof(*rec) + rec->rec_len;
 124                         if (tdb1_rec_write(tdb, left, &l) == -1) {
 125                                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 126                                            "tdb1_free: update_left failed at %u", left);
 127                                 goto fail;
 128                         }
 129                         if (update_tailer(tdb, left, &l) == -1) {
 130                                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 131                                            "tdb1_free: update_tailer failed at %u", offset);
 132                                 goto fail;
 133                         }
 134                         tdb1_unlock(tdb, -1, F_WRLCK);
 135                         return 0;
 136                 }
 137         }
 138
 139 update:
 140
 141         /* Now, prepend to free list */
 142         rec->magic = TDB1_FREE_MAGIC;
 143
 144         if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP, &rec->next) == -1 ||
 145             tdb1_rec_write(tdb, offset, rec) == -1 ||
 146             tdb1_ofs_write(tdb, TDB1_FREELIST_TOP, &offset) == -1) {
 147                 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
 148                            "tdb1_free record write failed at offset=%d",
 149                            offset);
 150                 goto fail;
 151         }
 152
 153         /* And we're done. */
 154         tdb1_unlock(tdb, -1, F_WRLCK);
 155         return 0;
 156
 157  fail:
 158         tdb1_unlock(tdb, -1, F_WRLCK);
 159         return -1;
 160 }
 161
 162
 163
 164 /*
 165    the core of tdb1_allocate - called when we have decided which
 166    free list entry to use
 167
 168    Note that we try to allocate by grabbing data from the end of an existing record,
 169    not the beginning. This is so the left merge in a free is more likely to be
 170    able to free up the record without fragmentation
 171  */
 172 static tdb1_off_t tdb1_allocate_ofs(struct tdb_context *tdb,
 173                                   tdb1_len_t length, tdb1_off_t rec_ptr,
 174                                   struct tdb1_record *rec, tdb1_off_t last_ptr)
 175 {
 176 #define MIN_REC_SIZE (sizeof(struct tdb1_record) + sizeof(tdb1_off_t) + 8)
 177
 178         if (rec->rec_len < length + MIN_REC_SIZE) {
 179                 /* we have to grab the whole record */
 180
 181                 /* unlink it from the previous record */
 182                 if (tdb1_ofs_write(tdb, last_ptr, &rec->next) == -1) {
 183                         return 0;
 184                 }
 185
 186                 /* mark it not free */
 187                 rec->magic = TDB1_MAGIC;
 188                 if (tdb1_rec_write(tdb, rec_ptr, rec) == -1) {
 189                         return 0;
 190                 }
 191                 return rec_ptr;
 192         }
 193
 194         /* we're going to just shorten the existing record */
 195         rec->rec_len -= (length + sizeof(*rec));
 196         if (tdb1_rec_write(tdb, rec_ptr, rec) == -1) {
 197                 return 0;
 198         }
 199         if (update_tailer(tdb, rec_ptr, rec) == -1) {
 200                 return 0;
 201         }
 202
 203         /* and setup the new record */
 204         rec_ptr += sizeof(*rec) + rec->rec_len;
 205
 206         memset(rec, '\0', sizeof(*rec));
 207         rec->rec_len = length;
 208         rec->magic = TDB1_MAGIC;
 209
 210         if (tdb1_rec_write(tdb, rec_ptr, rec) == -1) {
 211                 return 0;
 212         }
 213
 214         if (update_tailer(tdb, rec_ptr, rec) == -1) {
 215                 return 0;
 216         }
 217
 218         return rec_ptr;
 219 }
 220
 221 /* allocate some space from the free list. The offset returned points
 222    to a unconnected tdb1_record within the database with room for at
 223    least length bytes of total data
 224
 225    0 is returned if the space could not be allocated
 226  */
 227 tdb1_off_t tdb1_allocate(struct tdb_context *tdb, tdb1_len_t length, struct tdb1_record *rec)
 228 {
 229         tdb1_off_t rec_ptr, last_ptr, newrec_ptr;
 230         struct {
 231                 tdb1_off_t rec_ptr, last_ptr;
 232                 tdb1_len_t rec_len;
 233         } bestfit;
 234         float multiplier = 1.0;
 235
 236         if (tdb1_lock(tdb, -1, F_WRLCK) == -1)
 237                 return 0;
 238
 239         /* over-allocate to reduce fragmentation */
 240         length *= 1.25;
 241
 242         /* Extra bytes required for tailer */
 243         length += sizeof(tdb1_off_t);
 244         length = TDB1_ALIGN(length, TDB1_ALIGNMENT);
 245
 246  again:
 247         last_ptr = TDB1_FREELIST_TOP;
 248
 249         /* read in the freelist top */
 250         if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP, &rec_ptr) == -1)
 251                 goto fail;
 252
 253         bestfit.rec_ptr = 0;
 254         bestfit.last_ptr = 0;
 255         bestfit.rec_len = 0;
 256
 257         /*
 258            this is a best fit allocation strategy. Originally we used
 259            a first fit strategy, but it suffered from massive fragmentation
 260            issues when faced with a slowly increasing record size.
 261          */
 262         while (rec_ptr) {
 263                 if (tdb1_rec_free_read(tdb, rec_ptr, rec) == -1) {
 264                         goto fail;
 265                 }
 266
 267                 if (rec->rec_len >= length) {
 268                         if (bestfit.rec_ptr == 0 ||
 269                             rec->rec_len < bestfit.rec_len) {
 270                                 bestfit.rec_len = rec->rec_len;
 271                                 bestfit.rec_ptr = rec_ptr;
 272                                 bestfit.last_ptr = last_ptr;
 273                         }
 274                 }
 275
 276                 /* move to the next record */
 277                 last_ptr = rec_ptr;
 278                 rec_ptr = rec->next;
 279
 280                 /* if we've found a record that is big enough, then
 281                    stop searching if its also not too big. The
 282                    definition of 'too big' changes as we scan
 283                    through */
 284                 if (bestfit.rec_len > 0 &&
 285                     bestfit.rec_len < length * multiplier) {
 286                         break;
 287                 }
 288
 289                 /* this multiplier means we only extremely rarely
 290                    search more than 50 or so records. At 50 records we
 291                    accept records up to 11 times larger than what we
 292                    want */
 293                 multiplier *= 1.05;
 294         }
 295
 296         if (bestfit.rec_ptr != 0) {
 297                 if (tdb1_rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
 298                         goto fail;
 299                 }
 300
 301                 newrec_ptr = tdb1_allocate_ofs(tdb, length, bestfit.rec_ptr,
 302                                               rec, bestfit.last_ptr);
 303                 tdb1_unlock(tdb, -1, F_WRLCK);
 304                 return newrec_ptr;
 305         }
 306
 307         /* we didn't find enough space. See if we can expand the
 308            database and if we can then try again */
 309         if (tdb1_expand(tdb, length + sizeof(*rec)) == 0)
 310                 goto again;
 311  fail:
 312         tdb1_unlock(tdb, -1, F_WRLCK);
 313         return 0;
 314 }