pppd/tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2004
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 2 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, write to the Free Software
  26    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  27 */
  28
  29
  30 /* NOTE: If you use tdbs under valgrind, and in particular if you run
  31  * tdbtorture, you may get spurious "uninitialized value" warnings.  I
  32  * think this is because valgrind doesn't understand that the mmap'd
  33  * area may be written to by other processes.  Memory can, from the
  34  * point of view of the grinded process, spontaneously become
  35  * initialized.
  36  *
  37  * I can think of a few solutions.  [mbp 20030311]
  38  *
  39  * 1 - Write suppressions for Valgrind so that it doesn't complain
  40  * about this.  Probably the most reasonable but people need to
  41  * remember to use them.
  42  *
  43  * 2 - Use IO not mmap when running under valgrind.  Not so nice.
  44  *
  45  * 3 - Use the special valgrind macros to mark memory as valid at the
  46  * right time.  Probably too hard -- the process just doesn't know.
  47  */
  48
  49 #ifdef HAVE_CONFIG_H
  50 #include "config.h"
  51 #endif
  52
  53 #include <stdlib.h>
  54 #include <stdio.h>
  55 #include <fcntl.h>
  56 #include <unistd.h>
  57 #include <string.h>
  58 #include <fcntl.h>
  59 #include <errno.h>
  60 #include <sys/mman.h>
  61 #include <sys/stat.h>
  62 #include <signal.h>
  63 #include "tdb.h"
  64 #include "spinlock.h"
  65
/* Identification string; presumably stored at the start of the file header
 * -- confirm against the open/create code, which is not in this part of the
 * file. */
#define TDB_MAGIC_FOOD "TDB file\n"
/* On-disk format version number (NOTE(review): where it is checked is not
 * visible here). */
#define TDB_VERSION (0x26011967 + 6)
/* Value of list_struct.magic for a live record (see TDB_BAD_MAGIC). */
#define TDB_MAGIC (0x26011999U)
/* Bitwise complement of TDB_MAGIC: value of list_struct.magic for a record
 * that sits on the free list (see rec_free_read / tdb_free). */
#define TDB_FREE_MAGIC (~TDB_MAGIC)
/* Alternative magic that TDB_BAD_MAGIC also accepts; tested by TDB_DEAD(). */
#define TDB_DEAD_MAGIC (0xFEE1DEAD)
#define TDB_ALIGNMENT 4
#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
#define DEFAULT_HASH_SIZE 131
/* tdb_expand() rounds the database size up to a multiple of this. */
#define TDB_PAGE_SIZE 0x2000
/* File offset immediately after the header; the head pointer of the free
 * list is read from and written to this offset. */
#define FREELIST_TOP (sizeof(struct tdb_header))
/* Round x up to a multiple of a.  The mask arithmetic is only correct when
 * a is a power of two. */
#define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
/* Reverse the byte order of a 32-bit value.  x is evaluated several times,
 * and must be an unsigned 32-bit quantity for the shifts to give the
 * intended result. */
#define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
/* True if record header r carries the "dead" magic. */
#define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
/* True if record header r is neither a live record nor a dead one. */
#define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
/* File offset of the chain-head pointer for the given hash value; the
 * per-bucket pointers follow the free list head at FREELIST_TOP.  Uses
 * BUCKET(), so a local variable named "tdb" must be in scope. */
#define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
/* Offset just past the last chain head plus the spinlock area (whose size
 * comes from TDB_SPINLOCK_SIZE, defined outside this file). */
#define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1) + TDB_SPINLOCK_SIZE(hash_size))
  82
  83
/* Call the context's log function, if one is installed.
 *
 * NB assumes there is a local variable called "tdb" that is the
 * current context, also takes doubly-parenthesized print-style
 * argument, e.g. TDB_LOG((tdb, 0, "message %d\n", value)).  The macro is an
 * expression that always evaluates to 0. */
#define TDB_LOG(x) (tdb->log_fn?((tdb->log_fn x),0) : 0)

/* lock offsets (byte positions in the file used for fcntl byte-range
 * locks; NOTE(review): their users are outside this part of the file) */
#define GLOBAL_LOCK 0
#define ACTIVE_LOCK 4

/* Fallback for platforms whose <sys/mman.h> does not define MAP_FILE. */
#ifndef MAP_FILE
#define MAP_FILE 0
#endif

/* Fallback for platforms whose <sys/mman.h> does not define MAP_FAILED. */
#ifndef MAP_FAILED
#define MAP_FAILED ((void *)-1)
#endif

/* free memory if the pointer is valid and zero the pointer; x is evaluated
 * more than once, so it must not have side effects */
#ifndef SAFE_FREE
#define SAFE_FREE(x) do { if ((x) != NULL) {free((x)); (x)=NULL;} } while(0)
#endif

/* Reduce a hash value to a bucket index.  Requires a local variable named
 * "tdb" to be in scope. */
#define BUCKET(hash) ((hash) % tdb->header.hash_size)
/* File-scope object without an initializer, hence zero-initialized. */
TDB_DATA tdb_null;

/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
static TDB_CONTEXT *tdbs = NULL;
 111
 112 static int tdb_munmap(TDB_CONTEXT *tdb)
 113 {
 114         if (tdb->flags & TDB_INTERNAL)
 115                 return 0;
 116
 117 #ifdef HAVE_MMAP
 118         if (tdb->map_ptr) {
 119                 int ret = munmap(tdb->map_ptr, tdb->map_size);
 120                 if (ret != 0)
 121                         return ret;
 122         }
 123 #endif
 124         tdb->map_ptr = NULL;
 125         return 0;
 126 }
 127
 128 static void tdb_mmap(TDB_CONTEXT *tdb)
 129 {
 130         if (tdb->flags & TDB_INTERNAL)
 131                 return;
 132
 133 #ifdef HAVE_MMAP
 134         if (!(tdb->flags & TDB_NOMMAP)) {
 135                 tdb->map_ptr = mmap(NULL, tdb->map_size,
 136                                     PROT_READ|(tdb->read_only? 0:PROT_WRITE),
 137                                     MAP_SHARED|MAP_FILE, tdb->fd, 0);
 138
 139                 /*
 140                  * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
 141                  */
 142
 143                 if (tdb->map_ptr == MAP_FAILED) {
 144                         tdb->map_ptr = NULL;
 145                         TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
 146                                  tdb->map_size, strerror(errno)));
 147                 }
 148         } else {
 149                 tdb->map_ptr = NULL;
 150         }
 151 #else
 152         tdb->map_ptr = NULL;
 153 #endif
 154 }
 155
 156 /* Endian conversion: we only ever deal with 4 byte quantities */
 157 static void *convert(void *buf, u32 size)
 158 {
 159         u32 i, *p = buf;
 160         for (i = 0; i < size / 4; i++)
 161                 p[i] = TDB_BYTEREV(p[i]);
 162         return buf;
 163 }
 164 #define DOCONV() (tdb->flags & TDB_CONVERT)
 165 #define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
 166
/* the body of the database is made of one list_struct for the free space
   plus a separate data list for each hash value.

   Each record starts with this fixed-size header.  The header is read and
   written as a sequence of 32-bit words (see rec_read / rec_write), which
   are byte-swapped when the context has TDB_CONVERT set. */
struct list_struct {
        tdb_off next; /* offset of the next record in the list */
        tdb_len rec_len; /* total byte length of record */
        tdb_len key_len; /* byte length of key */
        tdb_len data_len; /* byte length of data */
        u32 full_hash; /* the full 32 bit hash of the key */
        u32 magic;   /* try to catch errors: TDB_MAGIC, TDB_FREE_MAGIC or
                        TDB_DEAD_MAGIC */
        /* the following union is implied:
                union {
                        char record[rec_len];
                        struct {
                                char key[key_len];
                                char data[data_len];
                        }
                        u32 totalsize; (tailer)
                }
           The tailer occupies the last sizeof(tdb_off) bytes of the record
           and holds sizeof(struct list_struct) + rec_len (see
           update_tailer).
        */
};
 187
 188 /***************************************************************
 189  Allow a caller to set a "alarm" flag that tdb can check to abort
 190  a blocking lock on SIGALRM.
 191 ***************************************************************/
 192
 193 static sig_atomic_t *palarm_fired;
 194
 195 void tdb_set_lock_alarm(sig_atomic_t *palarm)
 196 {
 197         palarm_fired = palarm;
 198 }
 199
 200 /* a byte range locking function - return 0 on success
 201    this functions locks/unlocks 1 byte at the specified offset.
 202
 203    On error, errno is also set so that errors are passed back properly
 204    through tdb_open(). */
 205 static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
 206                       int rw_type, int lck_type, int probe)
 207 {
 208         struct flock fl;
 209         int ret;
 210
 211         if (tdb->flags & TDB_NOLOCK)
 212                 return 0;
 213         if ((rw_type == F_WRLCK) && (tdb->read_only)) {
 214                 errno = EACCES;
 215                 return -1;
 216         }
 217
 218         fl.l_type = rw_type;
 219         fl.l_whence = SEEK_SET;
 220         fl.l_start = offset;
 221         fl.l_len = 1;
 222         fl.l_pid = 0;
 223
 224         do {
 225                 ret = fcntl(tdb->fd,lck_type,&fl);
 226                 if (ret == -1 && errno == EINTR && palarm_fired && *palarm_fired)
 227                         break;
 228         } while (ret == -1 && errno == EINTR);
 229
 230         if (ret == -1) {
 231                 if (!probe && lck_type != F_SETLK) {
 232                         /* Ensure error code is set for log fun to examine. */
 233                         if (errno == EINTR && palarm_fired && *palarm_fired)
 234                                 tdb->ecode = TDB_ERR_LOCK_TIMEOUT;
 235                         else
 236                                 tdb->ecode = TDB_ERR_LOCK;
 237                         TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
 238                                  tdb->fd, offset, rw_type, lck_type));
 239                 }
 240                 /* Was it an alarm timeout ? */
 241                 if (errno == EINTR && palarm_fired && *palarm_fired) {
 242                         TDB_LOG((tdb, 5, "tdb_brlock timed out (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
 243                                  tdb->fd, offset, rw_type, lck_type));
 244                         return TDB_ERRCODE(TDB_ERR_LOCK_TIMEOUT, -1);
 245                 }
 246                 /* Otherwise - generic lock error. errno set by fcntl.
 247                  * EAGAIN is an expected return from non-blocking
 248                  * locks. */
 249                 if (errno != EAGAIN) {
 250                         TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n",
 251                                  tdb->fd, offset, rw_type, lck_type,
 252                                  strerror(errno)));
 253                 }
 254                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
 255         }
 256         return 0;
 257 }
 258
 259 /* lock a list in the database. list -1 is the alloc list */
 260 static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
 261 {
 262         if (list < -1 || list >= (int)tdb->header.hash_size) {
 263                 TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
 264                            list, ltype));
 265                 return -1;
 266         }
 267         if (tdb->flags & TDB_NOLOCK)
 268                 return 0;
 269
 270         /* Since fcntl locks don't nest, we do a lock for the first one,
 271            and simply bump the count for future ones */
 272         if (tdb->locked[list+1].count == 0) {
 273                 if (!tdb->read_only && tdb->header.rwlocks) {
 274                         if (tdb_spinlock(tdb, list, ltype)) {
 275                                 TDB_LOG((tdb, 0, "tdb_lock spinlock failed on list %d ltype=%d\n",
 276                                            list, ltype));
 277                                 return -1;
 278                         }
 279                 } else if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
 280                         TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
 281                                            list, ltype, strerror(errno)));
 282                         return -1;
 283                 }
 284                 tdb->locked[list+1].ltype = ltype;
 285         }
 286         tdb->locked[list+1].count++;
 287         return 0;
 288 }
 289
 290 /* unlock the database: returns void because it's too late for errors. */
 291         /* changed to return int it may be interesting to know there
 292            has been an error  --simo */
 293 static int tdb_unlock(TDB_CONTEXT *tdb, int list, int ltype)
 294 {
 295         int ret = -1;
 296
 297         if (tdb->flags & TDB_NOLOCK)
 298                 return 0;
 299
 300         /* Sanity checks */
 301         if (list < -1 || list >= (int)tdb->header.hash_size) {
 302                 TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
 303                 return ret;
 304         }
 305
 306         if (tdb->locked[list+1].count==0) {
 307                 TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
 308                 return ret;
 309         }
 310
 311         if (tdb->locked[list+1].count == 1) {
 312                 /* Down to last nested lock: unlock underneath */
 313                 if (!tdb->read_only && tdb->header.rwlocks) {
 314                         ret = tdb_spinunlock(tdb, list, ltype);
 315                 } else {
 316                         ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
 317                 }
 318         } else {
 319                 ret = 0;
 320         }
 321         tdb->locked[list+1].count--;
 322
 323         if (ret)
 324                 TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
 325         return ret;
 326 }
 327
 328 /* check for an out of bounds access - if it is out of bounds then
 329    see if the database has been expanded by someone else and expand
 330    if necessary
 331    note that "len" is the minimum length needed for the db
 332 */
 333 static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
 334 {
 335         struct stat st;
 336         if (len <= tdb->map_size)
 337                 return 0;
 338         if (tdb->flags & TDB_INTERNAL) {
 339                 if (!probe) {
 340                         /* Ensure ecode is set for log fn. */
 341                         tdb->ecode = TDB_ERR_IO;
 342                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
 343                                  (int)len, (int)tdb->map_size));
 344                 }
 345                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 346         }
 347
 348         if (fstat(tdb->fd, &st) == -1)
 349                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 350
 351         if (st.st_size < (size_t)len) {
 352                 if (!probe) {
 353                         /* Ensure ecode is set for log fn. */
 354                         tdb->ecode = TDB_ERR_IO;
 355                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
 356                                  (int)len, (int)st.st_size));
 357                 }
 358                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 359         }
 360
 361         /* Unmap, update size, remap */
 362         if (tdb_munmap(tdb) == -1)
 363                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 364         tdb->map_size = st.st_size;
 365         tdb_mmap(tdb);
 366         return 0;
 367 }
 368
 369 /* write a lump of data at a specified offset */
 370 static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
 371 {
 372         if (tdb_oob(tdb, off + len, 0) != 0)
 373                 return -1;
 374
 375         if (tdb->map_ptr)
 376                 memcpy(off + (char *)tdb->map_ptr, buf, len);
 377 #ifdef HAVE_PWRITE
 378         else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
 379 #else
 380         else if (lseek(tdb->fd, off, SEEK_SET) != off
 381                  || write(tdb->fd, buf, len) != (ssize_t)len) {
 382 #endif
 383                 /* Ensure ecode is set for log fn. */
 384                 tdb->ecode = TDB_ERR_IO;
 385                 TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
 386                            off, len, strerror(errno)));
 387                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 388         }
 389         return 0;
 390 }
 391
 392 /* read a lump of data at a specified offset, maybe convert */
 393 static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
 394 {
 395         if (tdb_oob(tdb, off + len, 0) != 0)
 396                 return -1;
 397
 398         if (tdb->map_ptr)
 399                 memcpy(buf, off + (char *)tdb->map_ptr, len);
 400 #ifdef HAVE_PREAD
 401         else if (pread(tdb->fd, buf, len, off) != (ssize_t)len) {
 402 #else
 403         else if (lseek(tdb->fd, off, SEEK_SET) != off
 404                  || read(tdb->fd, buf, len) != (ssize_t)len) {
 405 #endif
 406                 /* Ensure ecode is set for log fn. */
 407                 tdb->ecode = TDB_ERR_IO;
 408                 TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
 409                            off, len, strerror(errno)));
 410                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 411         }
 412         if (cv)
 413                 convert(buf, len);
 414         return 0;
 415 }
 416
 417 /* read a lump of data, allocating the space for it */
 418 static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
 419 {
 420         char *buf;
 421
 422         if (!(buf = malloc(len))) {
 423                 /* Ensure ecode is set for log fn. */
 424                 tdb->ecode = TDB_ERR_OOM;
 425                 TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
 426                            len, strerror(errno)));
 427                 return TDB_ERRCODE(TDB_ERR_OOM, buf);
 428         }
 429         if (tdb_read(tdb, offset, buf, len, 0) == -1) {
 430                 SAFE_FREE(buf);
 431                 return NULL;
 432         }
 433         return buf;
 434 }
 435
 436 /* read/write a tdb_off */
 437 static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 438 {
 439         return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
 440 }
 441 static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 442 {
 443         tdb_off off = *d;
 444         return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
 445 }
 446
 447 /* read/write a record */
 448 static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 449 {
 450         if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
 451                 return -1;
 452         if (TDB_BAD_MAGIC(rec)) {
 453                 /* Ensure ecode is set for log fn. */
 454                 tdb->ecode = TDB_ERR_CORRUPT;
 455                 TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
 456                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 457         }
 458         return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
 459 }
 460 static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 461 {
 462         struct list_struct r = *rec;
 463         return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
 464 }
 465
 466 /* read a freelist record and check for simple errors */
 467 static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
 468 {
 469         if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
 470                 return -1;
 471
 472         if (rec->magic == TDB_MAGIC) {
 473                 /* this happens when a app is showdown while deleting a record - we should
 474                    not completely fail when this happens */
 475                 TDB_LOG((tdb, 0,"rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
 476                          rec->magic, off));
 477                 rec->magic = TDB_FREE_MAGIC;
 478                 if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
 479                         return -1;
 480         }
 481
 482         if (rec->magic != TDB_FREE_MAGIC) {
 483                 /* Ensure ecode is set for log fn. */
 484                 tdb->ecode = TDB_ERR_CORRUPT;
 485                 TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
 486                            rec->magic, off));
 487                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 488         }
 489         if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
 490                 return -1;
 491         return 0;
 492 }
 493
 494 /* update a record tailer (must hold allocation lock) */
 495 static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
 496                          const struct list_struct *rec)
 497 {
 498         tdb_off totalsize;
 499
 500         /* Offset of tailer from record header */
 501         totalsize = sizeof(*rec) + rec->rec_len;
 502         return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
 503                          &totalsize);
 504 }
 505
 506 static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
 507 {
 508         struct list_struct rec;
 509         tdb_off tailer_ofs, tailer;
 510
 511         if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 512                 printf("ERROR: failed to read record at %u\n", offset);
 513                 return 0;
 514         }
 515
 516         printf(" rec: offset=%u next=%d rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
 517                offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
 518
 519         tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
 520         if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
 521                 printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
 522                 return rec.next;
 523         }
 524
 525         if (tailer != rec.rec_len + sizeof(rec)) {
 526                 printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
 527                                 (unsigned)tailer, (unsigned)(rec.rec_len + sizeof(rec)));
 528         }
 529         return rec.next;
 530 }
 531
 532 static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
 533 {
 534         tdb_off rec_ptr, top;
 535
 536         top = TDB_HASH_TOP(i);
 537
 538         if (tdb_lock(tdb, i, F_WRLCK) != 0)
 539                 return -1;
 540
 541         if (ofs_read(tdb, top, &rec_ptr) == -1)
 542                 return tdb_unlock(tdb, i, F_WRLCK);
 543
 544         if (rec_ptr)
 545                 printf("hash=%d\n", i);
 546
 547         while (rec_ptr) {
 548                 rec_ptr = tdb_dump_record(tdb, rec_ptr);
 549         }
 550
 551         return tdb_unlock(tdb, i, F_WRLCK);
 552 }
 553
 554 void tdb_dump_all(TDB_CONTEXT *tdb)
 555 {
 556         int i;
 557         for (i=0;i<tdb->header.hash_size;i++) {
 558                 tdb_dump_chain(tdb, i);
 559         }
 560         printf("freelist:\n");
 561         tdb_dump_chain(tdb, -1);
 562 }
 563
 564 int tdb_printfreelist(TDB_CONTEXT *tdb)
 565 {
 566         int ret;
 567         long total_free = 0;
 568         tdb_off offset, rec_ptr;
 569         struct list_struct rec;
 570
 571         if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
 572                 return ret;
 573
 574         offset = FREELIST_TOP;
 575
 576         /* read in the freelist top */
 577         if (ofs_read(tdb, offset, &rec_ptr) == -1) {
 578                 tdb_unlock(tdb, -1, F_WRLCK);
 579                 return 0;
 580         }
 581
 582         printf("freelist top=[0x%08x]\n", rec_ptr );
 583         while (rec_ptr) {
 584                 if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 585                         tdb_unlock(tdb, -1, F_WRLCK);
 586                         return -1;
 587                 }
 588
 589                 if (rec.magic != TDB_FREE_MAGIC) {
 590                         printf("bad magic 0x%08x in free list\n", rec.magic);
 591                         tdb_unlock(tdb, -1, F_WRLCK);
 592                         return -1;
 593                 }
 594
 595                 printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)]\n", rec.next, rec.rec_len, rec.rec_len );
 596                 total_free += rec.rec_len;
 597
 598                 /* move to the next record */
 599                 rec_ptr = rec.next;
 600         }
 601         printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
 602                (int)total_free);
 603
 604         return tdb_unlock(tdb, -1, F_WRLCK);
 605 }
 606
 607 /* Remove an element from the freelist.  Must have alloc lock. */
 608 static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
 609 {
 610         tdb_off last_ptr, i;
 611
 612         /* read in the freelist top */
 613         last_ptr = FREELIST_TOP;
 614         while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
 615                 if (i == off) {
 616                         /* We've found it! */
 617                         return ofs_write(tdb, last_ptr, &next);
 618                 }
 619                 /* Follow chain (next offset is at start of record) */
 620                 last_ptr = i;
 621         }
 622         TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
 623         return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 624 }
 625
 626 /* Add an element into the freelist. Merge adjacent records if
 627    neccessary. */
 628 static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 629 {
 630         tdb_off right, left;
 631
 632         /* Allocation and tailer lock */
 633         if (tdb_lock(tdb, -1, F_WRLCK) != 0)
 634                 return -1;
 635
 636         /* set an initial tailer, so if we fail we don't leave a bogus record */
 637         if (update_tailer(tdb, offset, rec) != 0) {
 638                 TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
 639                 goto fail;
 640         }
 641
 642         /* Look right first (I'm an Australian, dammit) */
 643         right = offset + sizeof(*rec) + rec->rec_len;
 644         if (right + sizeof(*rec) <= tdb->map_size) {
 645                 struct list_struct r;
 646
 647                 if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
 648                         TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
 649                         goto left;
 650                 }
 651
 652                 /* If it's free, expand to include it. */
 653                 if (r.magic == TDB_FREE_MAGIC) {
 654                         if (remove_from_freelist(tdb, right, r.next) == -1) {
 655                                 TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
 656                                 goto left;
 657                         }
 658                         rec->rec_len += sizeof(r) + r.rec_len;
 659                 }
 660         }
 661
 662 left:
 663         /* Look left */
 664         left = offset - sizeof(tdb_off);
 665         if (left > TDB_DATA_START(tdb->header.hash_size)) {
 666                 struct list_struct l;
 667                 tdb_off leftsize;
 668
 669                 /* Read in tailer and jump back to header */
 670                 if (ofs_read(tdb, left, &leftsize) == -1) {
 671                         TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
 672                         goto update;
 673                 }
 674                 left = offset - leftsize;
 675
 676                 /* Now read in record */
 677                 if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
 678                         TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
 679                         goto update;
 680                 }
 681
 682                 /* If it's free, expand to include it. */
 683                 if (l.magic == TDB_FREE_MAGIC) {
 684                         if (remove_from_freelist(tdb, left, l.next) == -1) {
 685                                 TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
 686                                 goto update;
 687                         } else {
 688                                 offset = left;
 689                                 rec->rec_len += leftsize;
 690                         }
 691                 }
 692         }
 693
 694 update:
 695         if (update_tailer(tdb, offset, rec) == -1) {
 696                 TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
 697                 goto fail;
 698         }
 699
 700         /* Now, prepend to free list */
 701         rec->magic = TDB_FREE_MAGIC;
 702
 703         if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
 704             rec_write(tdb, offset, rec) == -1 ||
 705             ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
 706                 TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
 707                 goto fail;
 708         }
 709
 710         /* And we're done. */
 711         tdb_unlock(tdb, -1, F_WRLCK);
 712         return 0;
 713
 714  fail:
 715         tdb_unlock(tdb, -1, F_WRLCK);
 716         return -1;
 717 }
 718
 719
 720 /* expand a file.  we prefer to use ftruncate, as that is what posix
 721   says to use for mmap expansion */
 722 static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
 723 {
 724         char buf[1024];
 725 #if HAVE_FTRUNCATE_EXTEND
 726         if (ftruncate(tdb->fd, size+addition) != 0) {
 727                 TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
 728                            size+addition, strerror(errno)));
 729                 return -1;
 730         }
 731 #else
 732         char b = 0;
 733
 734 #ifdef HAVE_PWRITE
 735         if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
 736 #else
 737         if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (size+addition) - 1 ||
 738             write(tdb->fd, &b, 1) != 1) {
 739 #endif
 740                 TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
 741                            size+addition, strerror(errno)));
 742                 return -1;
 743         }
 744 #endif
 745
 746         /* now fill the file with something. This ensures that the file isn't sparse, which would be
 747            very bad if we ran out of disk. This must be done with write, not via mmap */
 748         memset(buf, 0x42, sizeof(buf));
 749         while (addition) {
 750                 int n = addition>sizeof(buf)?sizeof(buf):addition;
 751 #ifdef HAVE_PWRITE
 752                 int ret = pwrite(tdb->fd, buf, n, size);
 753 #else
 754                 int ret;
 755                 if (lseek(tdb->fd, size, SEEK_SET) != size)
 756                         return -1;
 757                 ret = write(tdb->fd, buf, n);
 758 #endif
 759                 if (ret != n) {
 760                         TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
 761                                    n, strerror(errno)));
 762                         return -1;
 763                 }
 764                 addition -= n;
 765                 size += n;
 766         }
 767         return 0;
 768 }
 769
 770
 771 /* expand the database at least size bytes by expanding the underlying
 772    file and doing the mmap again if necessary */
 773 static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
 774 {
 775         struct list_struct rec;
 776         tdb_off offset;
 777
 778         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
 779                 TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
 780                 return -1;
 781         }
 782
 783         /* must know about any previous expansions by another process */
 784         tdb_oob(tdb, tdb->map_size + 1, 1);
 785
 786         /* always make room for at least 10 more records, and round
 787            the database up to a multiple of TDB_PAGE_SIZE */
 788         size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
 789
 790         if (!(tdb->flags & TDB_INTERNAL))
 791                 tdb_munmap(tdb);
 792
 793         /*
 794          * We must ensure the file is unmapped before doing this
 795          * to ensure consistency with systems like OpenBSD where
 796          * writes and mmaps are not consistent.
 797          */
 798
 799         /* expand the file itself */
 800         if (!(tdb->flags & TDB_INTERNAL)) {
 801                 if (expand_file(tdb, tdb->map_size, size) != 0)
 802                         goto fail;
 803         }
 804
 805         tdb->map_size += size;
 806
 807         if (tdb->flags & TDB_INTERNAL)
 808                 tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
 809         else {
 810                 /*
 811                  * We must ensure the file is remapped before adding the space
 812                  * to ensure consistency with systems like OpenBSD where
 813                  * writes and mmaps are not consistent.
 814                  */
 815
 816                 /* We're ok if the mmap fails as we'll fallback to read/write */
 817                 tdb_mmap(tdb);
 818         }
 819
 820         /* form a new freelist record */
 821         memset(&rec,'\0',sizeof(rec));
 822         rec.rec_len = size - sizeof(rec);
 823
 824         /* link it into the free list */
 825         offset = tdb->map_size - size;
 826         if (tdb_free(tdb, offset, &rec) == -1)
 827                 goto fail;
 828
 829         tdb_unlock(tdb, -1, F_WRLCK);
 830         return 0;
 831  fail:
 832         tdb_unlock(tdb, -1, F_WRLCK);
 833         return -1;
 834 }
 835
 836 /* allocate some space from the free list. The offset returned points
 837    to a unconnected list_struct within the database with room for at
 838    least length bytes of total data
 839
 840    0 is returned if the space could not be allocated
 841  */
 842 static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
 843                             struct list_struct *rec)
 844 {
 845         tdb_off rec_ptr, last_ptr, newrec_ptr;
 846         struct list_struct newrec;
 847
 848         memset(&newrec, '\0', sizeof(newrec));
 849
 850         if (tdb_lock(tdb, -1, F_WRLCK) == -1)
 851                 return 0;
 852
 853         /* Extra bytes required for tailer */
 854         length += sizeof(tdb_off);
 855
 856  again:
 857         last_ptr = FREELIST_TOP;
 858
 859         /* read in the freelist top */
 860         if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
 861                 goto fail;
 862
 863         /* keep looking until we find a freelist record big enough */
 864         while (rec_ptr) {
 865                 if (rec_free_read(tdb, rec_ptr, rec) == -1)
 866                         goto fail;
 867
 868                 if (rec->rec_len >= length) {
 869                         /* found it - now possibly split it up  */
 870                         if (rec->rec_len > length + MIN_REC_SIZE) {
 871                                 /* Length of left piece */
 872                                 length = TDB_ALIGN(length, TDB_ALIGNMENT);
 873
 874                                 /* Right piece to go on free list */
 875                                 newrec.rec_len = rec->rec_len
 876                                         - (sizeof(*rec) + length);
 877                                 newrec_ptr = rec_ptr + sizeof(*rec) + length;
 878
 879                                 /* And left record is shortened */
 880                                 rec->rec_len = length;
 881                         } else
 882                                 newrec_ptr = 0;
 883
 884                         /* Remove allocated record from the free list */
 885                         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
 886                                 goto fail;
 887
 888                         /* Update header: do this before we drop alloc
 889                            lock, otherwise tdb_free() might try to
 890                            merge with us, thinking we're free.
 891                            (Thanks Jeremy Allison). */
 892                         rec->magic = TDB_MAGIC;
 893                         if (rec_write(tdb, rec_ptr, rec) == -1)
 894                                 goto fail;
 895
 896                         /* Did we create new block? */
 897                         if (newrec_ptr) {
 898                                 /* Update allocated record tailer (we
 899                                    shortened it). */
 900                                 if (update_tailer(tdb, rec_ptr, rec) == -1)
 901                                         goto fail;
 902
 903                                 /* Free new record */
 904                                 if (tdb_free(tdb, newrec_ptr, &newrec) == -1)
 905                                         goto fail;
 906                         }
 907
 908                         /* all done - return the new record offset */
 909                         tdb_unlock(tdb, -1, F_WRLCK);
 910                         return rec_ptr;
 911                 }
 912                 /* move to the next record */
 913                 last_ptr = rec_ptr;
 914                 rec_ptr = rec->next;
 915         }
 916         /* we didn't find enough space. See if we can expand the
 917            database and if we can then try again */
 918         if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
 919                 goto again;
 920  fail:
 921         tdb_unlock(tdb, -1, F_WRLCK);
 922         return 0;
 923 }
 924
 925 /* initialise a new database with a specified hash size */
 926 static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
 927 {
 928         struct tdb_header *newdb;
 929         int size, ret = -1;
 930
 931         /* We make it up in memory, then write it out if not internal */
 932         size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
 933         if (!(newdb = calloc(size, 1)))
 934                 return TDB_ERRCODE(TDB_ERR_OOM, -1);
 935
 936         /* Fill in the header */
 937         newdb->version = TDB_VERSION;
 938         newdb->hash_size = hash_size;
 939         if (tdb->flags & TDB_INTERNAL) {
 940                 tdb->map_size = size;
 941                 tdb->map_ptr = (char *)newdb;
 942                 memcpy(&tdb->header, newdb, sizeof(tdb->header));
 943                 /* Convert the `ondisk' version if asked. */
 944                 CONVERT(*newdb);
 945                 return 0;
 946         }
 947         if (lseek(tdb->fd, 0, SEEK_SET) == -1)
 948                 goto fail;
 949
 950         if (ftruncate(tdb->fd, 0) == -1)
 951                 goto fail;
 952
 953         /* This creates an endian-converted header, as if read from disk */
 954         CONVERT(*newdb);
 955         memcpy(&tdb->header, newdb, sizeof(tdb->header));
 956         /* Don't endian-convert the magic food! */
 957         memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
 958         if (write(tdb->fd, newdb, size) != size)
 959                 ret = -1;
 960         else
 961                 ret = tdb_create_rwlocks(tdb->fd, hash_size);
 962
 963   fail:
 964         SAFE_FREE(newdb);
 965         return ret;
 966 }
 967
 968 /* Returns 0 on fail.  On success, return offset of record, and fills
 969    in rec */
 970 static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
 971                         struct list_struct *r)
 972 {
 973         tdb_off rec_ptr;
 974
 975         /* read in the hash top */
 976         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 977                 return 0;
 978
 979         /* keep looking until we find the right record */
 980         while (rec_ptr) {
 981                 if (rec_read(tdb, rec_ptr, r) == -1)
 982                         return 0;
 983
 984                 if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
 985                         char *k;
 986                         /* a very likely hit - read the key */
 987                         k = tdb_alloc_read(tdb, rec_ptr + sizeof(*r),
 988                                            r->key_len);
 989                         if (!k)
 990                                 return 0;
 991
 992                         if (memcmp(key.dptr, k, key.dsize) == 0) {
 993                                 SAFE_FREE(k);
 994                                 return rec_ptr;
 995                         }
 996                         SAFE_FREE(k);
 997                 }
 998                 rec_ptr = r->next;
 999         }
1000         return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
1001 }
1002
1003 /* As tdb_find, but if you succeed, keep the lock */
1004 static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, int locktype,
1005                              struct list_struct *rec)
1006 {
1007         u32 rec_ptr;
1008
1009         if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
1010                 return 0;
1011         if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
1012                 tdb_unlock(tdb, BUCKET(hash), locktype);
1013         return rec_ptr;
1014 }
1015
1016 enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
1017 {
1018         return tdb->ecode;
1019 }
1020
1021 static struct tdb_errname {
1022         enum TDB_ERROR ecode; const char *estring;
1023 } emap[] = { {TDB_SUCCESS, "Success"},
1024              {TDB_ERR_CORRUPT, "Corrupt database"},
1025              {TDB_ERR_IO, "IO Error"},
1026              {TDB_ERR_LOCK, "Locking error"},
1027              {TDB_ERR_OOM, "Out of memory"},
1028              {TDB_ERR_EXISTS, "Record exists"},
1029              {TDB_ERR_NOLOCK, "Lock exists on other keys"},
1030              {TDB_ERR_NOEXIST, "Record does not exist"} };
1031
1032 /* Error string for the last tdb error */
1033 const char *tdb_errorstr(TDB_CONTEXT *tdb)
1034 {
1035         u32 i;
1036         for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
1037                 if (tdb->ecode == emap[i].ecode)
1038                         return emap[i].estring;
1039         return "Invalid error code";
1040 }
1041
1042 /* update an entry in place - this only works if the new data size
1043    is <= the old data size and the key exists.
1044    on failure return -1.
1045 */
1046
1047 static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
1048 {
1049         struct list_struct rec;
1050         tdb_off rec_ptr;
1051
1052         /* find entry */
1053         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1054                 return -1;
1055
1056         /* must be long enough key, data and tailer */
1057         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
1058                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1059                 return -1;
1060         }
1061
1062         if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1063                       dbuf.dptr, dbuf.dsize) == -1)
1064                 return -1;
1065
1066         if (dbuf.dsize != rec.data_len) {
1067                 /* update size */
1068                 rec.data_len = dbuf.dsize;
1069                 return rec_write(tdb, rec_ptr, &rec);
1070         }
1071
1072         return 0;
1073 }
1074
1075 /* find an entry in the database given a key */
1076 /* If an entry doesn't exist tdb_err will be set to
1077  * TDB_ERR_NOEXIST. If a key has no data attached
1078  * tdb_err will not be set. Both will return a
1079  * zero pptr and zero dsize.
1080  */
1081
1082 TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
1083 {
1084         tdb_off rec_ptr;
1085         struct list_struct rec;
1086         TDB_DATA ret;
1087         u32 hash;
1088
1089         /* find which hash bucket it is in */
1090         hash = tdb->hash_fn(&key);
1091         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
1092                 return tdb_null;
1093
1094         if (rec.data_len)
1095                 ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1096                                           rec.data_len);
1097         else
1098                 ret.dptr = NULL;
1099         ret.dsize = rec.data_len;
1100         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1101         return ret;
1102 }
1103
1104 /* check if an entry in the database exists
1105
1106    note that 1 is returned if the key is found and 0 is returned if not found
1107    this doesn't match the conventions in the rest of this module, but is
1108    compatible with gdbm
1109 */
1110 static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1111 {
1112         struct list_struct rec;
1113
1114         if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
1115                 return 0;
1116         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1117         return 1;
1118 }
1119
1120 int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
1121 {
1122         u32 hash = tdb->hash_fn(&key);
1123         return tdb_exists_hash(tdb, key, hash);
1124 }
1125
1126 /* record lock stops delete underneath */
1127 static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1128 {
1129         return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1130 }
1131 /*
1132   Write locks override our own fcntl readlocks, so check it here.
1133   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1134   an error to fail to get the lock here.
1135 */
1136
1137 static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1138 {
1139         struct tdb_traverse_lock *i;
1140         for (i = &tdb->travlocks; i; i = i->next)
1141                 if (i->off == off)
1142                         return -1;
1143         return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1144 }
1145
1146 /*
1147   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1148   an error to fail to get the lock here.
1149 */
1150
1151 static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1152 {
1153         return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1154 }
1155 /* fcntl locks don't stack: avoid unlocking someone else's */
1156 static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1157 {
1158         struct tdb_traverse_lock *i;
1159         u32 count = 0;
1160
1161         if (off == 0)
1162                 return 0;
1163         for (i = &tdb->travlocks; i; i = i->next)
1164                 if (i->off == off)
1165                         count++;
1166         return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1167 }
1168
1169 /* actually delete an entry in the database given the offset */
1170 static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1171 {
1172         tdb_off last_ptr, i;
1173         struct list_struct lastrec;
1174
1175         if (tdb->read_only) return -1;
1176
1177         if (write_lock_record(tdb, rec_ptr) == -1) {
1178                 /* Someone traversing here: mark it as dead */
1179                 rec->magic = TDB_DEAD_MAGIC;
1180                 return rec_write(tdb, rec_ptr, rec);
1181         }
1182         if (write_unlock_record(tdb, rec_ptr) != 0)
1183                 return -1;
1184
1185         /* find previous record in hash chain */
1186         if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1187                 return -1;
1188         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1189                 if (rec_read(tdb, i, &lastrec) == -1)
1190                         return -1;
1191
1192         /* unlink it: next ptr is at start of record. */
1193         if (last_ptr == 0)
1194                 last_ptr = TDB_HASH_TOP(rec->full_hash);
1195         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1196                 return -1;
1197
1198         /* recover the space */
1199         if (tdb_free(tdb, rec_ptr, rec) == -1)
1200                 return -1;
1201         return 0;
1202 }
1203
1204 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
1205 static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
1206                          struct list_struct *rec)
1207 {
1208         int want_next = (tlock->off != 0);
1209
1210         /* Lock each chain from the start one. */
1211         for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
1212                 if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
1213                         return -1;
1214
1215                 /* No previous record?  Start at top of chain. */
1216                 if (!tlock->off) {
1217                         if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
1218                                      &tlock->off) == -1)
1219                                 goto fail;
1220                 } else {
1221                         /* Otherwise unlock the previous record. */
1222                         if (unlock_record(tdb, tlock->off) != 0)
1223                                 goto fail;
1224                 }
1225
1226                 if (want_next) {
1227                         /* We have offset of old record: grab next */
1228                         if (rec_read(tdb, tlock->off, rec) == -1)
1229                                 goto fail;
1230                         tlock->off = rec->next;
1231                 }
1232
1233                 /* Iterate through chain */
1234                 while( tlock->off) {
1235                         tdb_off current;
1236                         if (rec_read(tdb, tlock->off, rec) == -1)
1237                                 goto fail;
1238                         if (!TDB_DEAD(rec)) {
1239                                 /* Woohoo: we found one! */
1240                                 if (lock_record(tdb, tlock->off) != 0)
1241                                         goto fail;
1242                                 return tlock->off;
1243                         }
1244                         /* Try to clean dead ones from old traverses */
1245                         current = tlock->off;
1246                         tlock->off = rec->next;
1247                         if (!tdb->read_only &&
1248                             do_delete(tdb, current, rec) != 0)
1249                                 goto fail;
1250                 }
1251                 tdb_unlock(tdb, tlock->hash, F_WRLCK);
1252                 want_next = 0;
1253         }
1254         /* We finished iteration without finding anything */
1255         return TDB_ERRCODE(TDB_SUCCESS, 0);
1256
1257  fail:
1258         tlock->off = 0;
1259         if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
1260                 TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
1261         return -1;
1262 }
1263
1264 /* traverse the entire database - calling fn(tdb, key, data) on each element.
1265    return -1 on error or the record count traversed
1266    if fn is NULL then it is not called
1267    a non-zero return value from fn() indicates that the traversal should stop
1268   */
1269 int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *private)
1270 {
1271         TDB_DATA key, dbuf;
1272         struct list_struct rec;
1273         struct tdb_traverse_lock tl = { NULL, 0, 0 };
1274         int ret, count = 0;
1275
1276         /* This was in the initializaton, above, but the IRIX compiler
1277          * did not like it.  crh
1278          */
1279         tl.next = tdb->travlocks.next;
1280
1281         /* fcntl locks don't stack: beware traverse inside traverse */
1282         tdb->travlocks.next = &tl;
1283
1284         /* tdb_next_lock places locks on the record returned, and its chain */
1285         while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
1286                 count++;
1287                 /* now read the full record */
1288                 key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
1289                                           rec.key_len + rec.data_len);
1290                 if (!key.dptr) {
1291                         ret = -1;
1292                         if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
1293                                 goto out;
1294                         if (unlock_record(tdb, tl.off) != 0)
1295                                 TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
1296                         goto out;
1297                 }
1298                 key.dsize = rec.key_len;
1299                 dbuf.dptr = key.dptr + rec.key_len;
1300                 dbuf.dsize = rec.data_len;
1301
1302                 /* Drop chain lock, call out */
1303                 if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
1304                         ret = -1;
1305                         goto out;
1306                 }
1307                 if (fn && fn(tdb, key, dbuf, private)) {
1308                         /* They want us to terminate traversal */
1309                         ret = count;
1310                         if (unlock_record(tdb, tl.off) != 0) {
1311                                 TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));;
1312                                 ret = -1;
1313                         }
1314                         tdb->travlocks.next = tl.next;
1315                         SAFE_FREE(key.dptr);
1316                         return count;
1317                 }
1318                 SAFE_FREE(key.dptr);
1319         }
1320 out:
1321         tdb->travlocks.next = tl.next;
1322         if (ret < 0)
1323                 return -1;
1324         else
1325                 return count;
1326 }
1327
1328 /* find the first entry in the database and return its key */
1329 TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
1330 {
1331         TDB_DATA key;
1332         struct list_struct rec;
1333
1334         /* release any old lock */
1335         if (unlock_record(tdb, tdb->travlocks.off) != 0)
1336                 return tdb_null;
1337         tdb->travlocks.off = tdb->travlocks.hash = 0;
1338
1339         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
1340                 return tdb_null;
1341         /* now read the key */
1342         key.dsize = rec.key_len;
1343         key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
1344         if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
1345                 TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
1346         return key;
1347 }
1348
1349 /* find the next entry in the database, returning its key */
1350 TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
1351 {
1352         u32 oldhash;
1353         TDB_DATA key = tdb_null;
1354         struct list_struct rec;
1355         char *k = NULL;
1356
1357         /* Is locked key the old key?  If so, traverse will be reliable. */
1358         if (tdb->travlocks.off) {
1359                 if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
1360                         return tdb_null;
1361                 if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
1362                     || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
1363                                             rec.key_len))
1364                     || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
1365                         /* No, it wasn't: unlock it and start from scratch */
1366                         if (unlock_record(tdb, tdb->travlocks.off) != 0)
1367                                 return tdb_null;
1368                         if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1369                                 return tdb_null;
1370                         tdb->travlocks.off = 0;
1371                 }
1372
1373                 SAFE_FREE(k);
1374         }
1375
1376         if (!tdb->travlocks.off) {
1377                 /* No previous element: do normal find, and lock record */
1378                 tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), F_WRLCK, &rec);
1379                 if (!tdb->travlocks.off)
1380                         return tdb_null;
1381                 tdb->travlocks.hash = BUCKET(rec.full_hash);
1382                 if (lock_record(tdb, tdb->travlocks.off) != 0) {
1383                         TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
1384                         return tdb_null;
1385                 }
1386         }
1387         oldhash = tdb->travlocks.hash;
1388
1389         /* Grab next record: locks chain and returned record,
1390            unlocks old record */
1391         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
1392                 key.dsize = rec.key_len;
1393                 key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
1394                                           key.dsize);
1395                 /* Unlock the chain of this new record */
1396                 if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1397                         TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1398         }
1399         /* Unlock the chain of old record */
1400         if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
1401                 TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1402         return key;
1403 }
1404
1405 /* delete an entry in the database given a key */
1406 static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1407 {
1408         tdb_off rec_ptr;
1409         struct list_struct rec;
1410         int ret;
1411
1412         if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
1413                 return -1;
1414         ret = do_delete(tdb, rec_ptr, &rec);
1415         if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
1416                 TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
1417         return ret;
1418 }
1419
1420 int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1421 {
1422         u32 hash = tdb->hash_fn(&key);
1423         return tdb_delete_hash(tdb, key, hash);
1424 }
1425
1426 /* store an element in the database, replacing any existing element
1427    with the same key
1428
1429    return 0 on success, -1 on failure
1430 */
1431 int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1432 {
1433         struct list_struct rec;
1434         u32 hash;
1435         tdb_off rec_ptr;
1436         char *p = NULL;
1437         int ret = 0;
1438
1439         /* find which hash bucket it is in */
1440         hash = tdb->hash_fn(&key);
1441         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1442                 return -1;
1443
1444         /* check for it existing, on insert. */
1445         if (flag == TDB_INSERT) {
1446                 if (tdb_exists_hash(tdb, key, hash)) {
1447                         tdb->ecode = TDB_ERR_EXISTS;
1448                         goto fail;
1449                 }
1450         } else {
1451                 /* first try in-place update, on modify or replace. */
1452                 if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
1453                         goto out;
1454                 if (tdb->ecode == TDB_ERR_NOEXIST &&
1455                     flag == TDB_MODIFY) {
1456                         /* if the record doesn't exist and we are in TDB_MODIFY mode then
1457                          we should fail the store */
1458                         goto fail;
1459         }
1460         }
1461         /* reset the error code potentially set by the tdb_update() */
1462         tdb->ecode = TDB_SUCCESS;
1463
1464         /* delete any existing record - if it doesn't exist we don't
1465            care.  Doing this first reduces fragmentation, and avoids
1466            coalescing with `allocated' block before it's updated. */
1467         if (flag != TDB_INSERT)
1468                 tdb_delete_hash(tdb, key, hash);
1469
1470         /* Copy key+value *before* allocating free space in case malloc
1471            fails and we are left with a dead spot in the tdb. */
1472
1473         if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
1474                 tdb->ecode = TDB_ERR_OOM;
1475                 goto fail;
1476         }
1477
1478         memcpy(p, key.dptr, key.dsize);
1479         if (dbuf.dsize)
1480                 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
1481
1482         /* we have to allocate some space */
1483         if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
1484                 goto fail;
1485
1486         /* Read hash top into next ptr */
1487         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1488                 goto fail;
1489
1490         rec.key_len = key.dsize;
1491         rec.data_len = dbuf.dsize;
1492         rec.full_hash = hash;
1493         rec.magic = TDB_MAGIC;
1494
1495         /* write out and point the top of the hash chain at it */
1496         if (rec_write(tdb, rec_ptr, &rec) == -1
1497             || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
1498             || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1499                 /* Need to tdb_unallocate() here */
1500                 goto fail;
1501         }
1502  out:
1503         SAFE_FREE(p);
1504         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1505         return ret;
1506 fail:
1507         ret = -1;
1508         goto out;
1509 }
1510
1511 /* Attempt to append data to an entry in place - this only works if the new data size
1512    is <= the old data size and the key exists.
1513    on failure return -1. Record must be locked before calling.
1514 */
1515 static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA new_dbuf)
1516 {
1517         struct list_struct rec;
1518         tdb_off rec_ptr;
1519
1520         /* find entry */
1521         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1522                 return -1;
1523
1524         /* Append of 0 is always ok. */
1525         if (new_dbuf.dsize == 0)
1526                 return 0;
1527
1528         /* must be long enough for key, old data + new data and tailer */
1529         if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
1530                 /* No room. */
1531                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1532                 return -1;
1533         }
1534
1535         if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len,
1536                       new_dbuf.dptr, new_dbuf.dsize) == -1)
1537                 return -1;
1538
1539         /* update size */
1540         rec.data_len += new_dbuf.dsize;
1541         return rec_write(tdb, rec_ptr, &rec);
1542 }
1543
1544 /* Append to an entry. Create if not exist. */
1545
1546 int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
1547 {
1548         struct list_struct rec;
1549         u32 hash;
1550         tdb_off rec_ptr;
1551         char *p = NULL;
1552         int ret = 0;
1553         size_t new_data_size = 0;
1554
1555         /* find which hash bucket it is in */
1556         hash = tdb->hash_fn(&key);
1557         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1558                 return -1;
1559
1560         /* first try in-place. */
1561         if (tdb_append_inplace(tdb, key, hash, new_dbuf) == 0)
1562                 goto out;
1563
1564         /* reset the error code potentially set by the tdb_append_inplace() */
1565         tdb->ecode = TDB_SUCCESS;
1566
1567         /* find entry */
1568         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
1569                 if (tdb->ecode != TDB_ERR_NOEXIST)
1570                         goto fail;
1571
1572                 /* Not found - create. */
1573
1574                 ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
1575                 goto out;
1576         }
1577
1578         new_data_size = rec.data_len + new_dbuf.dsize;
1579
1580         /* Copy key+old_value+value *before* allocating free space in case malloc
1581            fails and we are left with a dead spot in the tdb. */
1582
1583         if (!(p = (char *)malloc(key.dsize + new_data_size))) {
1584                 tdb->ecode = TDB_ERR_OOM;
1585                 goto fail;
1586         }
1587
1588         /* Copy the key in place. */
1589         memcpy(p, key.dptr, key.dsize);
1590
1591         /* Now read the old data into place. */
1592         if (rec.data_len &&
1593                 tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
1594                         goto fail;
1595
1596         /* Finally append the new data. */
1597         if (new_dbuf.dsize)
1598                 memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);
1599
1600         /* delete any existing record - if it doesn't exist we don't
1601            care.  Doing this first reduces fragmentation, and avoids
1602            coalescing with `allocated' block before it's updated. */
1603
1604         tdb_delete_hash(tdb, key, hash);
1605
1606         if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
1607                 goto fail;
1608
1609         /* Read hash top into next ptr */
1610         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1611                 goto fail;
1612
1613         rec.key_len = key.dsize;
1614         rec.data_len = new_data_size;
1615         rec.full_hash = hash;
1616         rec.magic = TDB_MAGIC;
1617
1618         /* write out and point the top of the hash chain at it */
1619         if (rec_write(tdb, rec_ptr, &rec) == -1
1620             || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
1621             || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1622                 /* Need to tdb_unallocate() here */
1623                 goto fail;
1624         }
1625
1626  out:
1627         SAFE_FREE(p);
1628         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1629         return ret;
1630
1631 fail:
1632         ret = -1;
1633         goto out;
1634 }
1635
1636 static int tdb_already_open(dev_t device,
1637                             ino_t ino)
1638 {
1639         TDB_CONTEXT *i;
1640
1641         for (i = tdbs; i; i = i->next) {
1642                 if (i->device == device && i->inode == ino) {
1643                         return 1;
1644                 }
1645         }
1646
1647         return 0;
1648 }
1649
1650 /* This is based on the hash algorithm from gdbm */
1651 static u32 default_tdb_hash(TDB_DATA *key)
1652 {
1653         u32 value;      /* Used to compute the hash value.  */
1654         u32   i;        /* Used to cycle through random values. */
1655
1656         /* Set the initial value from the key size. */
1657         for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
1658                 value = (value + (key->dptr[i] << (i*5 % 24)));
1659
1660         return (1103515243 * value + 12345);
1661 }
1662
1663 /* open the database, creating it if necessary
1664
1665    The open_flags and mode are passed straight to the open call on the
1666    database file. A flags value of O_WRONLY is invalid. The hash size
1667    is advisory, use zero for a default value.
1668
1669    Return is NULL on error, in which case errno is also set.  Don't
1670    try to call tdb_error or tdb_errname, just do strerror(errno).
1671
1672    @param name may be NULL for internal databases. */
1673 TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
1674                       int open_flags, mode_t mode)
1675 {
1676         return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL, NULL);
1677 }
1678
1679
1680 TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
1681                          int open_flags, mode_t mode,
1682                          tdb_log_func log_fn,
1683                          tdb_hash_func hash_fn)
1684 {
1685         TDB_CONTEXT *tdb;
1686         struct stat st;
1687         int rev = 0, locked = 0;
1688         unsigned char *vp;
1689         u32 vertest;
1690
1691         if (!(tdb = calloc(1, sizeof *tdb))) {
1692                 /* Can't log this */
1693                 errno = ENOMEM;
1694                 goto fail;
1695         }
1696         tdb->fd = -1;
1697         tdb->name = NULL;
1698         tdb->map_ptr = NULL;
1699         tdb->flags = tdb_flags;
1700         tdb->open_flags = open_flags;
1701         tdb->log_fn = log_fn;
1702         tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
1703
1704         if ((open_flags & O_ACCMODE) == O_WRONLY) {
1705                 TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
1706                          name));
1707                 errno = EINVAL;
1708                 goto fail;
1709         }
1710
1711         if (hash_size == 0)
1712                 hash_size = DEFAULT_HASH_SIZE;
1713         if ((open_flags & O_ACCMODE) == O_RDONLY) {
1714                 tdb->read_only = 1;
1715                 /* read only databases don't do locking or clear if first */
1716                 tdb->flags |= TDB_NOLOCK;
1717                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1718         }
1719
1720         /* internal databases don't mmap or lock, and start off cleared */
1721         if (tdb->flags & TDB_INTERNAL) {
1722                 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
1723                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1724                 if (tdb_new_database(tdb, hash_size) != 0) {
1725                         TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
1726                         goto fail;
1727                 }
1728                 goto internal;
1729         }
1730
1731         if ((tdb->fd = open(name, open_flags, mode)) == -1) {
1732                 TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
1733                          name, strerror(errno)));
1734                 goto fail;      /* errno set by open(2) */
1735         }
1736
1737         /* ensure there is only one process initialising at once */
1738         if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
1739                 TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
1740                          name, strerror(errno)));
1741                 goto fail;      /* errno set by tdb_brlock */
1742         }
1743
1744         /* we need to zero database if we are the only one with it open */
1745         if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
1746                 (locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
1747                 open_flags |= O_CREAT;
1748                 if (ftruncate(tdb->fd, 0) == -1) {
1749                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1750                                  "failed to truncate %s: %s\n",
1751                                  name, strerror(errno)));
1752                         goto fail; /* errno set by ftruncate */
1753                 }
1754         }
1755
1756         if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
1757             || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
1758             || (tdb->header.version != TDB_VERSION
1759                 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
1760                 /* its not a valid database - possibly initialise it */
1761                 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
1762                         errno = EIO; /* ie bad format or something */
1763                         goto fail;
1764                 }
1765                 rev = (tdb->flags & TDB_CONVERT);
1766         }
1767         vp = (unsigned char *)&tdb->header.version;
1768         vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
1769                   (((u32)vp[2]) << 8) | (u32)vp[3];
1770         tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
1771         if (!rev)
1772                 tdb->flags &= ~TDB_CONVERT;
1773         else {
1774                 tdb->flags |= TDB_CONVERT;
1775                 convert(&tdb->header, sizeof(tdb->header));
1776         }
1777         if (fstat(tdb->fd, &st) == -1)
1778                 goto fail;
1779
1780         /* Is it already in the open list?  If so, fail. */
1781         if (tdb_already_open(st.st_dev, st.st_ino)) {
1782                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1783                          "%s (%d,%d) is already open in this process\n",
1784                          name, (int)st.st_dev, (int)st.st_ino));
1785                 errno = EBUSY;
1786                 goto fail;
1787         }
1788
1789         if (!(tdb->name = (char *)strdup(name))) {
1790                 errno = ENOMEM;
1791                 goto fail;
1792         }
1793
1794         tdb->map_size = st.st_size;
1795         tdb->device = st.st_dev;
1796         tdb->inode = st.st_ino;
1797         tdb->locked = calloc(tdb->header.hash_size+1, sizeof(tdb->locked[0]));
1798         if (!tdb->locked) {
1799                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1800                          "failed to allocate lock structure for %s\n",
1801                          name));
1802                 errno = ENOMEM;
1803                 goto fail;
1804         }
1805         tdb_mmap(tdb);
1806         if (locked) {
1807                 if (!tdb->read_only)
1808                         if (tdb_clear_spinlocks(tdb) != 0) {
1809                                 TDB_LOG((tdb, 0, "tdb_open_ex: "
1810                                 "failed to clear spinlock\n"));
1811                                 goto fail;
1812                         }
1813                 if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
1814                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1815                                  "failed to take ACTIVE_LOCK on %s: %s\n",
1816                                  name, strerror(errno)));
1817                         goto fail;
1818                 }
1819
1820         }
1821
1822         /* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
1823            we didn't get the initial exclusive lock as we need to let all other
1824            users know we're using it. */
1825
1826         if (tdb_flags & TDB_CLEAR_IF_FIRST) {
1827                 /* leave this lock in place to indicate it's in use */
1828                 if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
1829                         goto fail;
1830         }
1831
1832
1833  internal:
1834         /* Internal (memory-only) databases skip all the code above to
1835          * do with disk files, and resume here by releasing their
1836          * global lock and hooking into the active list. */
1837         if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
1838                 goto fail;
1839         tdb->next = tdbs;
1840         tdbs = tdb;
1841         return tdb;
1842
1843  fail:
1844         { int save_errno = errno;
1845
1846         if (!tdb)
1847                 return NULL;
1848
1849         if (tdb->map_ptr) {
1850                 if (tdb->flags & TDB_INTERNAL)
1851                         SAFE_FREE(tdb->map_ptr);
1852                 else
1853                         tdb_munmap(tdb);
1854         }
1855         SAFE_FREE(tdb->name);
1856         if (tdb->fd != -1)
1857                 if (close(tdb->fd) != 0)
1858                         TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
1859         SAFE_FREE(tdb->locked);
1860         SAFE_FREE(tdb);
1861         errno = save_errno;
1862         return NULL;
1863         }
1864 }
1865
1866 /**
1867  * Close a database.
1868  *
1869  * @returns -1 for error; 0 for success.
1870  **/
1871 int tdb_close(TDB_CONTEXT *tdb)
1872 {
1873         TDB_CONTEXT **i;
1874         int ret = 0;
1875
1876         if (tdb->map_ptr) {
1877                 if (tdb->flags & TDB_INTERNAL)
1878                         SAFE_FREE(tdb->map_ptr);
1879                 else
1880                         tdb_munmap(tdb);
1881         }
1882         SAFE_FREE(tdb->name);
1883         if (tdb->fd != -1)
1884                 ret = close(tdb->fd);
1885         SAFE_FREE(tdb->locked);
1886
1887         /* Remove from contexts list */
1888         for (i = &tdbs; *i; i = &(*i)->next) {
1889                 if (*i == tdb) {
1890                         *i = tdb->next;
1891                         break;
1892                 }
1893         }
1894
1895         memset(tdb, 0, sizeof(*tdb));
1896         SAFE_FREE(tdb);
1897
1898         return ret;
1899 }
1900
1901 /* lock/unlock entire database */
1902 int tdb_lockall(TDB_CONTEXT *tdb)
1903 {
1904         u32 i;
1905
1906         /* There are no locks on read-only dbs */
1907         if (tdb->read_only)
1908                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
1909         for (i = 0; i < tdb->header.hash_size; i++)
1910                 if (tdb_lock(tdb, i, F_WRLCK))
1911                         break;
1912
1913         /* If error, release locks we have... */
1914         if (i < tdb->header.hash_size) {
1915                 u32 j;
1916
1917                 for ( j = 0; j < i; j++)
1918                         tdb_unlock(tdb, j, F_WRLCK);
1919                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1920         }
1921
1922         return 0;
1923 }
1924 void tdb_unlockall(TDB_CONTEXT *tdb)
1925 {
1926         u32 i;
1927         for (i=0; i < tdb->header.hash_size; i++)
1928                 tdb_unlock(tdb, i, F_WRLCK);
1929 }
1930
1931 /* lock/unlock one hash chain. This is meant to be used to reduce
1932    contention - it cannot guarantee how many records will be locked */
1933 int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
1934 {
1935         return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
1936 }
1937
1938 int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
1939 {
1940         return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
1941 }
1942
1943 int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
1944 {
1945         return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
1946 }
1947
1948 int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
1949 {
1950         return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
1951 }
1952
1953
1954 /* register a loging function */
1955 void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
1956 {
1957         tdb->log_fn = fn;
1958 }
1959
1960 /* reopen a tdb - this can be used after a fork to ensure that we have an independent
1961    seek pointer from our parent and to re-establish locks */
1962 int tdb_reopen(TDB_CONTEXT *tdb)
1963 {
1964         struct stat st;
1965
1966         if (tdb->flags & TDB_INTERNAL)
1967                 return 0; /* Nothing to do. */
1968         if (tdb_munmap(tdb) != 0) {
1969                 TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
1970                 goto fail;
1971         }
1972         if (close(tdb->fd) != 0)
1973                 TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
1974         tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
1975         if (tdb->fd == -1) {
1976                 TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
1977                 goto fail;
1978         }
1979         if (fstat(tdb->fd, &st) != 0) {
1980                 TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
1981                 goto fail;
1982         }
1983         if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
1984                 TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
1985                 goto fail;
1986         }
1987         tdb_mmap(tdb);
1988         if ((tdb->flags & TDB_CLEAR_IF_FIRST) && (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)) {
1989                 TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
1990                 goto fail;
1991         }
1992
1993         return 0;
1994
1995 fail:
1996         tdb_close(tdb);
1997         return -1;
1998 }
1999
2000 /* reopen all tdb's */
2001 int tdb_reopen_all(void)
2002 {
2003         TDB_CONTEXT *tdb;
2004
2005         for (tdb=tdbs; tdb; tdb = tdb->next) {
2006                 /* Ensure no clear-if-first. */
2007                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
2008                 if (tdb_reopen(tdb) != 0)
2009                         return -1;
2010         }
2011
2012         return 0;
2013 }