pppd/tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2004
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 2 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, write to the Free Software
  26    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  27 */
  28
  29
  30 /* NOTE: If you use tdbs under valgrind, and in particular if you run
  31  * tdbtorture, you may get spurious "uninitialized value" warnings.  I
  32  * think this is because valgrind doesn't understand that the mmap'd
  33  * area may be written to by other processes.  Memory can, from the
  34  * point of view of the grinded process, spontaneously become
  35  * initialized.
  36  *
  37  * I can think of a few solutions.  [mbp 20030311]
  38  *
  39  * 1 - Write suppressions for Valgrind so that it doesn't complain
  40  * about this.  Probably the most reasonable but people need to
  41  * remember to use them.
  42  *
  43  * 2 - Use IO not mmap when running under valgrind.  Not so nice.
  44  *
  45  * 3 - Use the special valgrind macros to mark memory as valid at the
  46  * right time.  Probably too hard -- the process just doesn't know.
  47  */
  48
  49 #ifdef HAVE_CONFIG_H
  50 #include "config.h"
  51 #endif
  52
  53 #include <stdlib.h>
  54 #include <stdio.h>
  55 #include <fcntl.h>
  56 #include <unistd.h>
  57 #include <string.h>
  58 #include <fcntl.h>
  59 #include <errno.h>
  60 #include <sys/mman.h>
  61 #include <sys/stat.h>
  62 #include <signal.h>
  63
  64 #include "pppd-private.h"
  65 #include "tdb.h"
  66 #include "spinlock.h"
  67 #include "pathnames.h"
  68
  69 #define TDB_MAGIC_FOOD "TDB file\n"
  70 #define TDB_VERSION (0x26011967 + 6)
  71 #define TDB_MAGIC (0x26011999U)
  72 #define TDB_FREE_MAGIC (~TDB_MAGIC)
  73 #define TDB_DEAD_MAGIC (0xFEE1DEAD)
  74 #define TDB_ALIGNMENT 4
  75 #define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
  76 #define DEFAULT_HASH_SIZE 131
  77 #define TDB_PAGE_SIZE 0x2000
  78 #define FREELIST_TOP (sizeof(struct tdb_header))
  79 #define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
  80 #define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
  81 #define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
  82 #define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
  83 #define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
  84 #define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1) + TDB_SPINLOCK_SIZE(hash_size))
  85
  86
  87 /* NB assumes there is a local variable called "tdb" that is the
  88  * current context, also takes doubly-parenthesized print-style
  89  * argument. */
  90 #define TDB_LOG(x) (tdb->log_fn?((tdb->log_fn x),0) : 0)
  91
  92 /* lock offsets */
  93 #define GLOBAL_LOCK 0
  94 #define ACTIVE_LOCK 4
  95
  96 #ifndef MAP_FILE
  97 #define MAP_FILE 0
  98 #endif
  99
 100 #ifndef MAP_FAILED
 101 #define MAP_FAILED ((void *)-1)
 102 #endif
 103
 104 /* free memory if the pointer is valid and zero the pointer */
 105 #ifndef SAFE_FREE
 106 #define SAFE_FREE(x) do { if ((x) != NULL) {free((x)); (x)=NULL;} } while(0)
 107 #endif
 108
 109 #define BUCKET(hash) ((hash) % tdb->header.hash_size)
 110 TDB_DATA tdb_null;
 111
 112 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
 113 static TDB_CONTEXT *tdbs = NULL;
 114
 115 static int tdb_munmap(TDB_CONTEXT *tdb)
 116 {
 117         if (tdb->flags & TDB_INTERNAL)
 118                 return 0;
 119
 120 #ifdef HAVE_MMAP
 121         if (tdb->map_ptr) {
 122                 int ret = munmap(tdb->map_ptr, tdb->map_size);
 123                 if (ret != 0)
 124                         return ret;
 125         }
 126 #endif
 127         tdb->map_ptr = NULL;
 128         return 0;
 129 }
 130
 131 static void tdb_mmap(TDB_CONTEXT *tdb)
 132 {
 133         if (tdb->flags & TDB_INTERNAL)
 134                 return;
 135
 136 #ifdef HAVE_MMAP
 137         if (!(tdb->flags & TDB_NOMMAP)) {
 138                 tdb->map_ptr = mmap(NULL, tdb->map_size,
 139                                     PROT_READ|(tdb->read_only? 0:PROT_WRITE),
 140                                     MAP_SHARED|MAP_FILE, tdb->fd, 0);
 141
 142                 /*
 143                  * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
 144                  */
 145
 146                 if (tdb->map_ptr == MAP_FAILED) {
 147                         tdb->map_ptr = NULL;
 148                         TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
 149                                  tdb->map_size, strerror(errno)));
 150                 }
 151         } else {
 152                 tdb->map_ptr = NULL;
 153         }
 154 #else
 155         tdb->map_ptr = NULL;
 156 #endif
 157 }
 158
 159 /* Endian conversion: we only ever deal with 4 byte quantities */
 160 static void *convert(void *buf, u32 size)
 161 {
 162         u32 i, *p = buf;
 163         for (i = 0; i < size / 4; i++)
 164                 p[i] = TDB_BYTEREV(p[i]);
 165         return buf;
 166 }
 167 #define DOCONV() (tdb->flags & TDB_CONVERT)
 168 #define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
 169
 170 /* the body of the database is made of one list_struct for the free space
 171    plus a separate data list for each hash value */
 172 struct list_struct {
 173         tdb_off next; /* offset of the next record in the list */
 174         tdb_len rec_len; /* total byte length of record */
 175         tdb_len key_len; /* byte length of key */
 176         tdb_len data_len; /* byte length of data */
 177         u32 full_hash; /* the full 32 bit hash of the key */
 178         u32 magic;   /* try to catch errors */
 179         /* the following union is implied:
 180                 union {
 181                         char record[rec_len];
 182                         struct {
 183                                 char key[key_len];
 184                                 char data[data_len];
 185                         }
 186                         u32 totalsize; (tailer)
 187                 }
 188         */
 189 };
 190
 191 /***************************************************************
 192  Allow a caller to set a "alarm" flag that tdb can check to abort
 193  a blocking lock on SIGALRM.
 194 ***************************************************************/
 195
 196 static sig_atomic_t *palarm_fired;
 197
 198 void tdb_set_lock_alarm(sig_atomic_t *palarm)
 199 {
 200         palarm_fired = palarm;
 201 }
 202
 203 /* a byte range locking function - return 0 on success
 204    this functions locks/unlocks 1 byte at the specified offset.
 205
 206    On error, errno is also set so that errors are passed back properly
 207    through tdb_open(). */
 208 static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
 209                       int rw_type, int lck_type, int probe)
 210 {
 211         struct flock fl;
 212         int ret;
 213
 214         if (tdb->flags & TDB_NOLOCK)
 215                 return 0;
 216         if ((rw_type == F_WRLCK) && (tdb->read_only)) {
 217                 errno = EACCES;
 218                 return -1;
 219         }
 220
 221         fl.l_type = rw_type;
 222         fl.l_whence = SEEK_SET;
 223         fl.l_start = offset;
 224         fl.l_len = 1;
 225         fl.l_pid = 0;
 226
 227         do {
 228                 ret = fcntl(tdb->fd,lck_type,&fl);
 229                 if (ret == -1 && errno == EINTR && palarm_fired && *palarm_fired)
 230                         break;
 231         } while (ret == -1 && errno == EINTR);
 232
 233         if (ret == -1) {
 234                 if (!probe && lck_type != F_SETLK) {
 235                         /* Ensure error code is set for log fun to examine. */
 236                         if (errno == EINTR && palarm_fired && *palarm_fired)
 237                                 tdb->ecode = TDB_ERR_LOCK_TIMEOUT;
 238                         else
 239                                 tdb->ecode = TDB_ERR_LOCK;
 240                         TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
 241                                  tdb->fd, offset, rw_type, lck_type));
 242                 }
 243                 /* Was it an alarm timeout ? */
 244                 if (errno == EINTR && palarm_fired && *palarm_fired) {
 245                         TDB_LOG((tdb, 5, "tdb_brlock timed out (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
 246                                  tdb->fd, offset, rw_type, lck_type));
 247                         return TDB_ERRCODE(TDB_ERR_LOCK_TIMEOUT, -1);
 248                 }
 249                 /* Otherwise - generic lock error. errno set by fcntl.
 250                  * EAGAIN is an expected return from non-blocking
 251                  * locks. */
 252                 if (errno != EAGAIN) {
 253                         TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n",
 254                                  tdb->fd, offset, rw_type, lck_type,
 255                                  strerror(errno)));
 256                 }
 257                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
 258         }
 259         return 0;
 260 }
 261
 262 /* lock a list in the database. list -1 is the alloc list */
 263 static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
 264 {
 265         if (list < -1 || list >= (int)tdb->header.hash_size) {
 266                 TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
 267                            list, ltype));
 268                 return -1;
 269         }
 270         if (tdb->flags & TDB_NOLOCK)
 271                 return 0;
 272
 273         /* Since fcntl locks don't nest, we do a lock for the first one,
 274            and simply bump the count for future ones */
 275         if (tdb->locked[list+1].count == 0) {
 276                 if (!tdb->read_only && tdb->header.rwlocks) {
 277                         if (tdb_spinlock(tdb, list, ltype)) {
 278                                 TDB_LOG((tdb, 0, "tdb_lock spinlock failed on list %d ltype=%d\n",
 279                                            list, ltype));
 280                                 return -1;
 281                         }
 282                 } else if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
 283                         TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
 284                                            list, ltype, strerror(errno)));
 285                         return -1;
 286                 }
 287                 tdb->locked[list+1].ltype = ltype;
 288         }
 289         tdb->locked[list+1].count++;
 290         return 0;
 291 }
 292
 293 /* unlock the database: returns void because it's too late for errors. */
 294         /* changed to return int it may be interesting to know there
 295            has been an error  --simo */
 296 static int tdb_unlock(TDB_CONTEXT *tdb, int list, int ltype)
 297 {
 298         int ret = -1;
 299
 300         if (tdb->flags & TDB_NOLOCK)
 301                 return 0;
 302
 303         /* Sanity checks */
 304         if (list < -1 || list >= (int)tdb->header.hash_size) {
 305                 TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
 306                 return ret;
 307         }
 308
 309         if (tdb->locked[list+1].count==0) {
 310                 TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
 311                 return ret;
 312         }
 313
 314         if (tdb->locked[list+1].count == 1) {
 315                 /* Down to last nested lock: unlock underneath */
 316                 if (!tdb->read_only && tdb->header.rwlocks) {
 317                         ret = tdb_spinunlock(tdb, list, ltype);
 318                 } else {
 319                         ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
 320                 }
 321         } else {
 322                 ret = 0;
 323         }
 324         tdb->locked[list+1].count--;
 325
 326         if (ret)
 327                 TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
 328         return ret;
 329 }
 330
 331 /* check for an out of bounds access - if it is out of bounds then
 332    see if the database has been expanded by someone else and expand
 333    if necessary
 334    note that "len" is the minimum length needed for the db
 335 */
 336 static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
 337 {
 338         struct stat st;
 339         if (len <= tdb->map_size)
 340                 return 0;
 341         if (tdb->flags & TDB_INTERNAL) {
 342                 if (!probe) {
 343                         /* Ensure ecode is set for log fn. */
 344                         tdb->ecode = TDB_ERR_IO;
 345                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
 346                                  (int)len, (int)tdb->map_size));
 347                 }
 348                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 349         }
 350
 351         if (fstat(tdb->fd, &st) == -1)
 352                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 353
 354         if (st.st_size < (size_t)len) {
 355                 if (!probe) {
 356                         /* Ensure ecode is set for log fn. */
 357                         tdb->ecode = TDB_ERR_IO;
 358                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
 359                                  (int)len, (int)st.st_size));
 360                 }
 361                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 362         }
 363
 364         /* Unmap, update size, remap */
 365         if (tdb_munmap(tdb) == -1)
 366                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 367         tdb->map_size = st.st_size;
 368         tdb_mmap(tdb);
 369         return 0;
 370 }
 371
 372 /* write a lump of data at a specified offset */
 373 static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
 374 {
 375         if (tdb_oob(tdb, off + len, 0) != 0)
 376                 return -1;
 377
 378         if (tdb->map_ptr)
 379                 memcpy(off + (char *)tdb->map_ptr, buf, len);
 380 #ifdef HAVE_PWRITE
 381         else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
 382 #else
 383         else if (lseek(tdb->fd, off, SEEK_SET) != off
 384                  || write(tdb->fd, buf, len) != (ssize_t)len) {
 385 #endif
 386                 /* Ensure ecode is set for log fn. */
 387                 tdb->ecode = TDB_ERR_IO;
 388                 TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
 389                            off, len, strerror(errno)));
 390                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 391         }
 392         return 0;
 393 }
 394
 395 /* read a lump of data at a specified offset, maybe convert */
 396 static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
 397 {
 398         if (tdb_oob(tdb, off + len, 0) != 0)
 399                 return -1;
 400
 401         if (tdb->map_ptr)
 402                 memcpy(buf, off + (char *)tdb->map_ptr, len);
 403 #ifdef HAVE_PREAD
 404         else if (pread(tdb->fd, buf, len, off) != (ssize_t)len) {
 405 #else
 406         else if (lseek(tdb->fd, off, SEEK_SET) != off
 407                  || read(tdb->fd, buf, len) != (ssize_t)len) {
 408 #endif
 409                 /* Ensure ecode is set for log fn. */
 410                 tdb->ecode = TDB_ERR_IO;
 411                 TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
 412                            off, len, strerror(errno)));
 413                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 414         }
 415         if (cv)
 416                 convert(buf, len);
 417         return 0;
 418 }
 419
 420 /* read a lump of data, allocating the space for it */
 421 static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
 422 {
 423         char *buf;
 424
 425         if (!(buf = malloc(len))) {
 426                 /* Ensure ecode is set for log fn. */
 427                 tdb->ecode = TDB_ERR_OOM;
 428                 TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
 429                            len, strerror(errno)));
 430                 return TDB_ERRCODE(TDB_ERR_OOM, buf);
 431         }
 432         if (tdb_read(tdb, offset, buf, len, 0) == -1) {
 433                 SAFE_FREE(buf);
 434                 return NULL;
 435         }
 436         return buf;
 437 }
 438
 439 /* read/write a tdb_off */
 440 static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 441 {
 442         return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
 443 }
 444 static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 445 {
 446         tdb_off off = *d;
 447         return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
 448 }
 449
 450 /* read/write a record */
 451 static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 452 {
 453         if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
 454                 return -1;
 455         if (TDB_BAD_MAGIC(rec)) {
 456                 /* Ensure ecode is set for log fn. */
 457                 tdb->ecode = TDB_ERR_CORRUPT;
 458                 TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
 459                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 460         }
 461         return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
 462 }
 463 static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 464 {
 465         struct list_struct r = *rec;
 466         return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
 467 }
 468
 469 /* read a freelist record and check for simple errors */
 470 static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
 471 {
 472         if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
 473                 return -1;
 474
 475         if (rec->magic == TDB_MAGIC) {
 476                 /* this happens when a app is showdown while deleting a record - we should
 477                    not completely fail when this happens */
 478                 TDB_LOG((tdb, 0,"rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
 479                          rec->magic, off));
 480                 rec->magic = TDB_FREE_MAGIC;
 481                 if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
 482                         return -1;
 483         }
 484
 485         if (rec->magic != TDB_FREE_MAGIC) {
 486                 /* Ensure ecode is set for log fn. */
 487                 tdb->ecode = TDB_ERR_CORRUPT;
 488                 TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
 489                            rec->magic, off));
 490                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 491         }
 492         if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
 493                 return -1;
 494         return 0;
 495 }
 496
 497 /* update a record tailer (must hold allocation lock) */
 498 static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
 499                          const struct list_struct *rec)
 500 {
 501         tdb_off totalsize;
 502
 503         /* Offset of tailer from record header */
 504         totalsize = sizeof(*rec) + rec->rec_len;
 505         return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
 506                          &totalsize);
 507 }
 508
 509 static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
 510 {
 511         struct list_struct rec;
 512         tdb_off tailer_ofs, tailer;
 513
 514         if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 515                 printf("ERROR: failed to read record at %u\n", offset);
 516                 return 0;
 517         }
 518
 519         printf(" rec: offset=%u next=%d rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
 520                offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
 521
 522         tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
 523         if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
 524                 printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
 525                 return rec.next;
 526         }
 527
 528         if (tailer != rec.rec_len + sizeof(rec)) {
 529                 printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
 530                                 (unsigned)tailer, (unsigned)(rec.rec_len + sizeof(rec)));
 531         }
 532         return rec.next;
 533 }
 534
 535 static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
 536 {
 537         tdb_off rec_ptr, top;
 538
 539         top = TDB_HASH_TOP(i);
 540
 541         if (tdb_lock(tdb, i, F_WRLCK) != 0)
 542                 return -1;
 543
 544         if (ofs_read(tdb, top, &rec_ptr) == -1)
 545                 return tdb_unlock(tdb, i, F_WRLCK);
 546
 547         if (rec_ptr)
 548                 printf("hash=%d\n", i);
 549
 550         while (rec_ptr) {
 551                 rec_ptr = tdb_dump_record(tdb, rec_ptr);
 552         }
 553
 554         return tdb_unlock(tdb, i, F_WRLCK);
 555 }
 556
 557 void tdb_dump_all(TDB_CONTEXT *tdb)
 558 {
 559         int i;
 560         for (i=0;i<tdb->header.hash_size;i++) {
 561                 tdb_dump_chain(tdb, i);
 562         }
 563         printf("freelist:\n");
 564         tdb_dump_chain(tdb, -1);
 565 }
 566
 567 int tdb_printfreelist(TDB_CONTEXT *tdb)
 568 {
 569         int ret;
 570         long total_free = 0;
 571         tdb_off offset, rec_ptr;
 572         struct list_struct rec;
 573
 574         if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
 575                 return ret;
 576
 577         offset = FREELIST_TOP;
 578
 579         /* read in the freelist top */
 580         if (ofs_read(tdb, offset, &rec_ptr) == -1) {
 581                 tdb_unlock(tdb, -1, F_WRLCK);
 582                 return 0;
 583         }
 584
 585         printf("freelist top=[0x%08x]\n", rec_ptr );
 586         while (rec_ptr) {
 587                 if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 588                         tdb_unlock(tdb, -1, F_WRLCK);
 589                         return -1;
 590                 }
 591
 592                 if (rec.magic != TDB_FREE_MAGIC) {
 593                         printf("bad magic 0x%08x in free list\n", rec.magic);
 594                         tdb_unlock(tdb, -1, F_WRLCK);
 595                         return -1;
 596                 }
 597
 598                 printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)]\n", rec.next, rec.rec_len, rec.rec_len );
 599                 total_free += rec.rec_len;
 600
 601                 /* move to the next record */
 602                 rec_ptr = rec.next;
 603         }
 604         printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
 605                (int)total_free);
 606
 607         return tdb_unlock(tdb, -1, F_WRLCK);
 608 }
 609
 610 /* Remove an element from the freelist.  Must have alloc lock. */
 611 static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
 612 {
 613         tdb_off last_ptr, i;
 614
 615         /* read in the freelist top */
 616         last_ptr = FREELIST_TOP;
 617         while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
 618                 if (i == off) {
 619                         /* We've found it! */
 620                         return ofs_write(tdb, last_ptr, &next);
 621                 }
 622                 /* Follow chain (next offset is at start of record) */
 623                 last_ptr = i;
 624         }
 625         TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
 626         return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 627 }
 628
 629 /* Add an element into the freelist. Merge adjacent records if
 630    neccessary. */
 631 static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 632 {
 633         tdb_off right, left;
 634
 635         /* Allocation and tailer lock */
 636         if (tdb_lock(tdb, -1, F_WRLCK) != 0)
 637                 return -1;
 638
 639         /* set an initial tailer, so if we fail we don't leave a bogus record */
 640         if (update_tailer(tdb, offset, rec) != 0) {
 641                 TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
 642                 goto fail;
 643         }
 644
 645         /* Look right first (I'm an Australian, dammit) */
 646         right = offset + sizeof(*rec) + rec->rec_len;
 647         if (right + sizeof(*rec) <= tdb->map_size) {
 648                 struct list_struct r;
 649
 650                 if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
 651                         TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
 652                         goto left;
 653                 }
 654
 655                 /* If it's free, expand to include it. */
 656                 if (r.magic == TDB_FREE_MAGIC) {
 657                         if (remove_from_freelist(tdb, right, r.next) == -1) {
 658                                 TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
 659                                 goto left;
 660                         }
 661                         rec->rec_len += sizeof(r) + r.rec_len;
 662                 }
 663         }
 664
 665 left:
 666         /* Look left */
 667         left = offset - sizeof(tdb_off);
 668         if (left > TDB_DATA_START(tdb->header.hash_size)) {
 669                 struct list_struct l;
 670                 tdb_off leftsize;
 671
 672                 /* Read in tailer and jump back to header */
 673                 if (ofs_read(tdb, left, &leftsize) == -1) {
 674                         TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
 675                         goto update;
 676                 }
 677                 left = offset - leftsize;
 678
 679                 /* Now read in record */
 680                 if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
 681                         TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
 682                         goto update;
 683                 }
 684
 685                 /* If it's free, expand to include it. */
 686                 if (l.magic == TDB_FREE_MAGIC) {
 687                         if (remove_from_freelist(tdb, left, l.next) == -1) {
 688                                 TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
 689                                 goto update;
 690                         } else {
 691                                 offset = left;
 692                                 rec->rec_len += leftsize;
 693                         }
 694                 }
 695         }
 696
 697 update:
 698         if (update_tailer(tdb, offset, rec) == -1) {
 699                 TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
 700                 goto fail;
 701         }
 702
 703         /* Now, prepend to free list */
 704         rec->magic = TDB_FREE_MAGIC;
 705
 706         if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
 707             rec_write(tdb, offset, rec) == -1 ||
 708             ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
 709                 TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
 710                 goto fail;
 711         }
 712
 713         /* And we're done. */
 714         tdb_unlock(tdb, -1, F_WRLCK);
 715         return 0;
 716
 717  fail:
 718         tdb_unlock(tdb, -1, F_WRLCK);
 719         return -1;
 720 }
 721
 722
 723 /* expand a file.  we prefer to use ftruncate, as that is what posix
 724   says to use for mmap expansion */
 725 static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
 726 {
 727         char buf[1024];
 728 #if HAVE_FTRUNCATE_EXTEND
 729         if (ftruncate(tdb->fd, size+addition) != 0) {
 730                 TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
 731                            size+addition, strerror(errno)));
 732                 return -1;
 733         }
 734 #else
 735         char b = 0;
 736
 737 #ifdef HAVE_PWRITE
 738         if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
 739 #else
 740         if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (size+addition) - 1 ||
 741             write(tdb->fd, &b, 1) != 1) {
 742 #endif
 743                 TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
 744                            size+addition, strerror(errno)));
 745                 return -1;
 746         }
 747 #endif
 748
 749         /* now fill the file with something. This ensures that the file isn't sparse, which would be
 750            very bad if we ran out of disk. This must be done with write, not via mmap */
 751         memset(buf, 0x42, sizeof(buf));
 752         while (addition) {
 753                 int n = addition>sizeof(buf)?sizeof(buf):addition;
 754 #ifdef HAVE_PWRITE
 755                 int ret = pwrite(tdb->fd, buf, n, size);
 756 #else
 757                 int ret;
 758                 if (lseek(tdb->fd, size, SEEK_SET) != size)
 759                         return -1;
 760                 ret = write(tdb->fd, buf, n);
 761 #endif
 762                 if (ret != n) {
 763                         TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
 764                                    n, strerror(errno)));
 765                         return -1;
 766                 }
 767                 addition -= n;
 768                 size += n;
 769         }
 770         return 0;
 771 }
 772
 773
 774 /* expand the database at least size bytes by expanding the underlying
 775    file and doing the mmap again if necessary */
 776 static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
 777 {
 778         struct list_struct rec;
 779         tdb_off offset;
 780
 781         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
 782                 TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
 783                 return -1;
 784         }
 785
 786         /* must know about any previous expansions by another process */
 787         tdb_oob(tdb, tdb->map_size + 1, 1);
 788
 789         /* always make room for at least 10 more records, and round
 790            the database up to a multiple of TDB_PAGE_SIZE */
 791         size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
 792
 793         if (!(tdb->flags & TDB_INTERNAL))
 794                 tdb_munmap(tdb);
 795
 796         /*
 797          * We must ensure the file is unmapped before doing this
 798          * to ensure consistency with systems like OpenBSD where
 799          * writes and mmaps are not consistent.
 800          */
 801
 802         /* expand the file itself */
 803         if (!(tdb->flags & TDB_INTERNAL)) {
 804                 if (expand_file(tdb, tdb->map_size, size) != 0)
 805                         goto fail;
 806         }
 807
 808         tdb->map_size += size;
 809
 810         if (tdb->flags & TDB_INTERNAL)
 811                 tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
 812         else {
 813                 /*
 814                  * We must ensure the file is remapped before adding the space
 815                  * to ensure consistency with systems like OpenBSD where
 816                  * writes and mmaps are not consistent.
 817                  */
 818
 819                 /* We're ok if the mmap fails as we'll fallback to read/write */
 820                 tdb_mmap(tdb);
 821         }
 822
 823         /* form a new freelist record */
 824         memset(&rec,'\0',sizeof(rec));
 825         rec.rec_len = size - sizeof(rec);
 826
 827         /* link it into the free list */
 828         offset = tdb->map_size - size;
 829         if (tdb_free(tdb, offset, &rec) == -1)
 830                 goto fail;
 831
 832         tdb_unlock(tdb, -1, F_WRLCK);
 833         return 0;
 834  fail:
 835         tdb_unlock(tdb, -1, F_WRLCK);
 836         return -1;
 837 }
 838
 839 /* allocate some space from the free list. The offset returned points
 840    to a unconnected list_struct within the database with room for at
 841    least length bytes of total data
 842
 843    0 is returned if the space could not be allocated
 844  */
 845 static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
 846                             struct list_struct *rec)
 847 {
 848         tdb_off rec_ptr, last_ptr, newrec_ptr;
 849         struct list_struct newrec;
 850
 851         memset(&newrec, '\0', sizeof(newrec));
 852
 853         if (tdb_lock(tdb, -1, F_WRLCK) == -1)
 854                 return 0;
 855
 856         /* Extra bytes required for tailer */
 857         length += sizeof(tdb_off);
 858
 859  again:
 860         last_ptr = FREELIST_TOP;
 861
 862         /* read in the freelist top */
 863         if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
 864                 goto fail;
 865
 866         /* keep looking until we find a freelist record big enough */
 867         while (rec_ptr) {
 868                 if (rec_free_read(tdb, rec_ptr, rec) == -1)
 869                         goto fail;
 870
 871                 if (rec->rec_len >= length) {
 872                         /* found it - now possibly split it up  */
 873                         if (rec->rec_len > length + MIN_REC_SIZE) {
 874                                 /* Length of left piece */
 875                                 length = TDB_ALIGN(length, TDB_ALIGNMENT);
 876
 877                                 /* Right piece to go on free list */
 878                                 newrec.rec_len = rec->rec_len
 879                                         - (sizeof(*rec) + length);
 880                                 newrec_ptr = rec_ptr + sizeof(*rec) + length;
 881
 882                                 /* And left record is shortened */
 883                                 rec->rec_len = length;
 884                         } else
 885                                 newrec_ptr = 0;
 886
 887                         /* Remove allocated record from the free list */
 888                         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
 889                                 goto fail;
 890
 891                         /* Update header: do this before we drop alloc
 892                            lock, otherwise tdb_free() might try to
 893                            merge with us, thinking we're free.
 894                            (Thanks Jeremy Allison). */
 895                         rec->magic = TDB_MAGIC;
 896                         if (rec_write(tdb, rec_ptr, rec) == -1)
 897                                 goto fail;
 898
 899                         /* Did we create new block? */
 900                         if (newrec_ptr) {
 901                                 /* Update allocated record tailer (we
 902                                    shortened it). */
 903                                 if (update_tailer(tdb, rec_ptr, rec) == -1)
 904                                         goto fail;
 905
 906                                 /* Free new record */
 907                                 if (tdb_free(tdb, newrec_ptr, &newrec) == -1)
 908                                         goto fail;
 909                         }
 910
 911                         /* all done - return the new record offset */
 912                         tdb_unlock(tdb, -1, F_WRLCK);
 913                         return rec_ptr;
 914                 }
 915                 /* move to the next record */
 916                 last_ptr = rec_ptr;
 917                 rec_ptr = rec->next;
 918         }
 919         /* we didn't find enough space. See if we can expand the
 920            database and if we can then try again */
 921         if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
 922                 goto again;
 923  fail:
 924         tdb_unlock(tdb, -1, F_WRLCK);
 925         return 0;
 926 }
 927
 928 /* initialise a new database with a specified hash size */
 929 static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
 930 {
 931         struct tdb_header *newdb;
 932         int size, ret = -1;
 933
 934         /* We make it up in memory, then write it out if not internal */
 935         size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
 936         if (!(newdb = calloc(size, 1)))
 937                 return TDB_ERRCODE(TDB_ERR_OOM, -1);
 938
 939         /* Fill in the header */
 940         newdb->version = TDB_VERSION;
 941         newdb->hash_size = hash_size;
 942         if (tdb->flags & TDB_INTERNAL) {
 943                 tdb->map_size = size;
 944                 tdb->map_ptr = (char *)newdb;
 945                 memcpy(&tdb->header, newdb, sizeof(tdb->header));
 946                 /* Convert the `ondisk' version if asked. */
 947                 CONVERT(*newdb);
 948                 return 0;
 949         }
 950         if (lseek(tdb->fd, 0, SEEK_SET) == -1)
 951                 goto fail;
 952
 953         if (ftruncate(tdb->fd, 0) == -1)
 954                 goto fail;
 955
 956         /* This creates an endian-converted header, as if read from disk */
 957         CONVERT(*newdb);
 958         memcpy(&tdb->header, newdb, sizeof(tdb->header));
 959         /* Don't endian-convert the magic food! */
 960         memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
 961         if (write(tdb->fd, newdb, size) != size)
 962                 ret = -1;
 963         else
 964                 ret = tdb_create_rwlocks(tdb->fd, hash_size);
 965
 966   fail:
 967         SAFE_FREE(newdb);
 968         return ret;
 969 }
 970
 971 /* Returns 0 on fail.  On success, return offset of record, and fills
 972    in rec */
 973 static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
 974                         struct list_struct *r)
 975 {
 976         tdb_off rec_ptr;
 977
 978         /* read in the hash top */
 979         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 980                 return 0;
 981
 982         /* keep looking until we find the right record */
 983         while (rec_ptr) {
 984                 if (rec_read(tdb, rec_ptr, r) == -1)
 985                         return 0;
 986
 987                 if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
 988                         char *k;
 989                         /* a very likely hit - read the key */
 990                         k = tdb_alloc_read(tdb, rec_ptr + sizeof(*r),
 991                                            r->key_len);
 992                         if (!k)
 993                                 return 0;
 994
 995                         if (memcmp(key.dptr, k, key.dsize) == 0) {
 996                                 SAFE_FREE(k);
 997                                 return rec_ptr;
 998                         }
 999                         SAFE_FREE(k);
1000                 }
1001                 rec_ptr = r->next;
1002         }
1003         return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
1004 }
1005
1006 /* As tdb_find, but if you succeed, keep the lock */
1007 static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, int locktype,
1008                              struct list_struct *rec)
1009 {
1010         u32 rec_ptr;
1011
1012         if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
1013                 return 0;
1014         if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
1015                 tdb_unlock(tdb, BUCKET(hash), locktype);
1016         return rec_ptr;
1017 }
1018
1019 enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
1020 {
1021         return tdb->ecode;
1022 }
1023
1024 static struct tdb_errname {
1025         enum TDB_ERROR ecode; const char *estring;
1026 } emap[] = { {TDB_SUCCESS, "Success"},
1027              {TDB_ERR_CORRUPT, "Corrupt database"},
1028              {TDB_ERR_IO, "IO Error"},
1029              {TDB_ERR_LOCK, "Locking error"},
1030              {TDB_ERR_OOM, "Out of memory"},
1031              {TDB_ERR_EXISTS, "Record exists"},
1032              {TDB_ERR_NOLOCK, "Lock exists on other keys"},
1033              {TDB_ERR_NOEXIST, "Record does not exist"} };
1034
1035 /* Error string for the last tdb error */
1036 const char *tdb_errorstr(TDB_CONTEXT *tdb)
1037 {
1038         u32 i;
1039         for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
1040                 if (tdb->ecode == emap[i].ecode)
1041                         return emap[i].estring;
1042         return "Invalid error code";
1043 }
1044
1045 /* update an entry in place - this only works if the new data size
1046    is <= the old data size and the key exists.
1047    on failure return -1.
1048 */
1049
1050 static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
1051 {
1052         struct list_struct rec;
1053         tdb_off rec_ptr;
1054
1055         /* find entry */
1056         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1057                 return -1;
1058
1059         /* must be long enough key, data and tailer */
1060         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
1061                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1062                 return -1;
1063         }
1064
1065         if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1066                       dbuf.dptr, dbuf.dsize) == -1)
1067                 return -1;
1068
1069         if (dbuf.dsize != rec.data_len) {
1070                 /* update size */
1071                 rec.data_len = dbuf.dsize;
1072                 return rec_write(tdb, rec_ptr, &rec);
1073         }
1074
1075         return 0;
1076 }
1077
1078 /* find an entry in the database given a key */
1079 /* If an entry doesn't exist tdb_err will be set to
1080  * TDB_ERR_NOEXIST. If a key has no data attached
1081  * tdb_err will not be set. Both will return a
1082  * zero pptr and zero dsize.
1083  */
1084
1085 TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
1086 {
1087         tdb_off rec_ptr;
1088         struct list_struct rec;
1089         TDB_DATA ret;
1090         u32 hash;
1091
1092         /* find which hash bucket it is in */
1093         hash = tdb->hash_fn(&key);
1094         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
1095                 return tdb_null;
1096
1097         if (rec.data_len)
1098                 ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1099                                           rec.data_len);
1100         else
1101                 ret.dptr = NULL;
1102         ret.dsize = rec.data_len;
1103         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1104         return ret;
1105 }
1106
1107 /* check if an entry in the database exists
1108
1109    note that 1 is returned if the key is found and 0 is returned if not found
1110    this doesn't match the conventions in the rest of this module, but is
1111    compatible with gdbm
1112 */
1113 static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1114 {
1115         struct list_struct rec;
1116
1117         if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
1118                 return 0;
1119         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1120         return 1;
1121 }
1122
1123 int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
1124 {
1125         u32 hash = tdb->hash_fn(&key);
1126         return tdb_exists_hash(tdb, key, hash);
1127 }
1128
1129 /* record lock stops delete underneath */
1130 static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1131 {
1132         return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1133 }
1134 /*
1135   Write locks override our own fcntl readlocks, so check it here.
1136   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1137   an error to fail to get the lock here.
1138 */
1139
1140 static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1141 {
1142         struct tdb_traverse_lock *i;
1143         for (i = &tdb->travlocks; i; i = i->next)
1144                 if (i->off == off)
1145                         return -1;
1146         return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1147 }
1148
1149 /*
1150   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1151   an error to fail to get the lock here.
1152 */
1153
1154 static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1155 {
1156         return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1157 }
1158 /* fcntl locks don't stack: avoid unlocking someone else's */
1159 static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1160 {
1161         struct tdb_traverse_lock *i;
1162         u32 count = 0;
1163
1164         if (off == 0)
1165                 return 0;
1166         for (i = &tdb->travlocks; i; i = i->next)
1167                 if (i->off == off)
1168                         count++;
1169         return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1170 }
1171
1172 /* actually delete an entry in the database given the offset */
1173 static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1174 {
1175         tdb_off last_ptr, i;
1176         struct list_struct lastrec;
1177
1178         if (tdb->read_only) return -1;
1179
1180         if (write_lock_record(tdb, rec_ptr) == -1) {
1181                 /* Someone traversing here: mark it as dead */
1182                 rec->magic = TDB_DEAD_MAGIC;
1183                 return rec_write(tdb, rec_ptr, rec);
1184         }
1185         if (write_unlock_record(tdb, rec_ptr) != 0)
1186                 return -1;
1187
1188         /* find previous record in hash chain */
1189         if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1190                 return -1;
1191         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1192                 if (rec_read(tdb, i, &lastrec) == -1)
1193                         return -1;
1194
1195         /* unlink it: next ptr is at start of record. */
1196         if (last_ptr == 0)
1197                 last_ptr = TDB_HASH_TOP(rec->full_hash);
1198         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1199                 return -1;
1200
1201         /* recover the space */
1202         if (tdb_free(tdb, rec_ptr, rec) == -1)
1203                 return -1;
1204         return 0;
1205 }
1206
1207 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
1208 static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
1209                          struct list_struct *rec)
1210 {
1211         int want_next = (tlock->off != 0);
1212
1213         /* Lock each chain from the start one. */
1214         for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
1215                 if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
1216                         return -1;
1217
1218                 /* No previous record?  Start at top of chain. */
1219                 if (!tlock->off) {
1220                         if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
1221                                      &tlock->off) == -1)
1222                                 goto fail;
1223                 } else {
1224                         /* Otherwise unlock the previous record. */
1225                         if (unlock_record(tdb, tlock->off) != 0)
1226                                 goto fail;
1227                 }
1228
1229                 if (want_next) {
1230                         /* We have offset of old record: grab next */
1231                         if (rec_read(tdb, tlock->off, rec) == -1)
1232                                 goto fail;
1233                         tlock->off = rec->next;
1234                 }
1235
1236                 /* Iterate through chain */
1237                 while( tlock->off) {
1238                         tdb_off current;
1239                         if (rec_read(tdb, tlock->off, rec) == -1)
1240                                 goto fail;
1241                         if (!TDB_DEAD(rec)) {
1242                                 /* Woohoo: we found one! */
1243                                 if (lock_record(tdb, tlock->off) != 0)
1244                                         goto fail;
1245                                 return tlock->off;
1246                         }
1247                         /* Try to clean dead ones from old traverses */
1248                         current = tlock->off;
1249                         tlock->off = rec->next;
1250                         if (!tdb->read_only &&
1251                             do_delete(tdb, current, rec) != 0)
1252                                 goto fail;
1253                 }
1254                 tdb_unlock(tdb, tlock->hash, F_WRLCK);
1255                 want_next = 0;
1256         }
1257         /* We finished iteration without finding anything */
1258         return TDB_ERRCODE(TDB_SUCCESS, 0);
1259
1260  fail:
1261         tlock->off = 0;
1262         if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
1263                 TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
1264         return -1;
1265 }
1266
1267 /* traverse the entire database - calling fn(tdb, key, data) on each element.
1268    return -1 on error or the record count traversed
1269    if fn is NULL then it is not called
1270    a non-zero return value from fn() indicates that the traversal should stop
1271   */
1272 int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *private)
1273 {
1274         TDB_DATA key, dbuf;
1275         struct list_struct rec;
1276         struct tdb_traverse_lock tl = { NULL, 0, 0 };
1277         int ret, count = 0;
1278
1279         /* This was in the initializaton, above, but the IRIX compiler
1280          * did not like it.  crh
1281          */
1282         tl.next = tdb->travlocks.next;
1283
1284         /* fcntl locks don't stack: beware traverse inside traverse */
1285         tdb->travlocks.next = &tl;
1286
1287         /* tdb_next_lock places locks on the record returned, and its chain */
1288         while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
1289                 count++;
1290                 /* now read the full record */
1291                 key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
1292                                           rec.key_len + rec.data_len);
1293                 if (!key.dptr) {
1294                         ret = -1;
1295                         if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
1296                                 goto out;
1297                         if (unlock_record(tdb, tl.off) != 0)
1298                                 TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
1299                         goto out;
1300                 }
1301                 key.dsize = rec.key_len;
1302                 dbuf.dptr = key.dptr + rec.key_len;
1303                 dbuf.dsize = rec.data_len;
1304
1305                 /* Drop chain lock, call out */
1306                 if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
1307                         ret = -1;
1308                         goto out;
1309                 }
1310                 if (fn && fn(tdb, key, dbuf, private)) {
1311                         /* They want us to terminate traversal */
1312                         ret = count;
1313                         if (unlock_record(tdb, tl.off) != 0) {
1314                                 TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));;
1315                                 ret = -1;
1316                         }
1317                         tdb->travlocks.next = tl.next;
1318                         SAFE_FREE(key.dptr);
1319                         return count;
1320                 }
1321                 SAFE_FREE(key.dptr);
1322         }
1323 out:
1324         tdb->travlocks.next = tl.next;
1325         if (ret < 0)
1326                 return -1;
1327         else
1328                 return count;
1329 }
1330
1331 /* find the first entry in the database and return its key */
1332 TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
1333 {
1334         TDB_DATA key;
1335         struct list_struct rec;
1336
1337         /* release any old lock */
1338         if (unlock_record(tdb, tdb->travlocks.off) != 0)
1339                 return tdb_null;
1340         tdb->travlocks.off = tdb->travlocks.hash = 0;
1341
1342         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
1343                 return tdb_null;
1344         /* now read the key */
1345         key.dsize = rec.key_len;
1346         key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
1347         if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
1348                 TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
1349         return key;
1350 }
1351
1352 /* find the next entry in the database, returning its key */
1353 TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
1354 {
1355         u32 oldhash;
1356         TDB_DATA key = tdb_null;
1357         struct list_struct rec;
1358         char *k = NULL;
1359
1360         /* Is locked key the old key?  If so, traverse will be reliable. */
1361         if (tdb->travlocks.off) {
1362                 if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
1363                         return tdb_null;
1364                 if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
1365                     || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
1366                                             rec.key_len))
1367                     || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
1368                         /* No, it wasn't: unlock it and start from scratch */
1369                         if (unlock_record(tdb, tdb->travlocks.off) != 0)
1370                                 return tdb_null;
1371                         if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1372                                 return tdb_null;
1373                         tdb->travlocks.off = 0;
1374                 }
1375
1376                 SAFE_FREE(k);
1377         }
1378
1379         if (!tdb->travlocks.off) {
1380                 /* No previous element: do normal find, and lock record */
1381                 tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), F_WRLCK, &rec);
1382                 if (!tdb->travlocks.off)
1383                         return tdb_null;
1384                 tdb->travlocks.hash = BUCKET(rec.full_hash);
1385                 if (lock_record(tdb, tdb->travlocks.off) != 0) {
1386                         TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
1387                         return tdb_null;
1388                 }
1389         }
1390         oldhash = tdb->travlocks.hash;
1391
1392         /* Grab next record: locks chain and returned record,
1393            unlocks old record */
1394         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
1395                 key.dsize = rec.key_len;
1396                 key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
1397                                           key.dsize);
1398                 /* Unlock the chain of this new record */
1399                 if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1400                         TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1401         }
1402         /* Unlock the chain of old record */
1403         if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
1404                 TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1405         return key;
1406 }
1407
1408 /* delete an entry in the database given a key */
1409 static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1410 {
1411         tdb_off rec_ptr;
1412         struct list_struct rec;
1413         int ret;
1414
1415         if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
1416                 return -1;
1417         ret = do_delete(tdb, rec_ptr, &rec);
1418         if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
1419                 TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
1420         return ret;
1421 }
1422
1423 int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1424 {
1425         u32 hash = tdb->hash_fn(&key);
1426         return tdb_delete_hash(tdb, key, hash);
1427 }
1428
1429 /* store an element in the database, replacing any existing element
1430    with the same key
1431
1432    return 0 on success, -1 on failure
1433 */
1434 int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1435 {
1436         struct list_struct rec;
1437         u32 hash;
1438         tdb_off rec_ptr;
1439         char *p = NULL;
1440         int ret = 0;
1441
1442         /* find which hash bucket it is in */
1443         hash = tdb->hash_fn(&key);
1444         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1445                 return -1;
1446
1447         /* check for it existing, on insert. */
1448         if (flag == TDB_INSERT) {
1449                 if (tdb_exists_hash(tdb, key, hash)) {
1450                         tdb->ecode = TDB_ERR_EXISTS;
1451                         goto fail;
1452                 }
1453         } else {
1454                 /* first try in-place update, on modify or replace. */
1455                 if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
1456                         goto out;
1457                 if (tdb->ecode == TDB_ERR_NOEXIST &&
1458                     flag == TDB_MODIFY) {
1459                         /* if the record doesn't exist and we are in TDB_MODIFY mode then
1460                          we should fail the store */
1461                         goto fail;
1462         }
1463         }
1464         /* reset the error code potentially set by the tdb_update() */
1465         tdb->ecode = TDB_SUCCESS;
1466
1467         /* delete any existing record - if it doesn't exist we don't
1468            care.  Doing this first reduces fragmentation, and avoids
1469            coalescing with `allocated' block before it's updated. */
1470         if (flag != TDB_INSERT)
1471                 tdb_delete_hash(tdb, key, hash);
1472
1473         /* Copy key+value *before* allocating free space in case malloc
1474            fails and we are left with a dead spot in the tdb. */
1475
1476         if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
1477                 tdb->ecode = TDB_ERR_OOM;
1478                 goto fail;
1479         }
1480
1481         memcpy(p, key.dptr, key.dsize);
1482         if (dbuf.dsize)
1483                 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
1484
1485         /* we have to allocate some space */
1486         if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
1487                 goto fail;
1488
1489         /* Read hash top into next ptr */
1490         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1491                 goto fail;
1492
1493         rec.key_len = key.dsize;
1494         rec.data_len = dbuf.dsize;
1495         rec.full_hash = hash;
1496         rec.magic = TDB_MAGIC;
1497
1498         /* write out and point the top of the hash chain at it */
1499         if (rec_write(tdb, rec_ptr, &rec) == -1
1500             || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
1501             || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1502                 /* Need to tdb_unallocate() here */
1503                 goto fail;
1504         }
1505  out:
1506         SAFE_FREE(p);
1507         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1508         return ret;
1509 fail:
1510         ret = -1;
1511         goto out;
1512 }
1513
1514 /* Attempt to append data to an entry in place - this only works if the new data size
1515    is <= the old data size and the key exists.
1516    on failure return -1. Record must be locked before calling.
1517 */
1518 static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA new_dbuf)
1519 {
1520         struct list_struct rec;
1521         tdb_off rec_ptr;
1522
1523         /* find entry */
1524         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1525                 return -1;
1526
1527         /* Append of 0 is always ok. */
1528         if (new_dbuf.dsize == 0)
1529                 return 0;
1530
1531         /* must be long enough for key, old data + new data and tailer */
1532         if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
1533                 /* No room. */
1534                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1535                 return -1;
1536         }
1537
1538         if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len,
1539                       new_dbuf.dptr, new_dbuf.dsize) == -1)
1540                 return -1;
1541
1542         /* update size */
1543         rec.data_len += new_dbuf.dsize;
1544         return rec_write(tdb, rec_ptr, &rec);
1545 }
1546
1547 /* Append to an entry. Create if not exist. */
1548
1549 int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
1550 {
1551         struct list_struct rec;
1552         u32 hash;
1553         tdb_off rec_ptr;
1554         char *p = NULL;
1555         int ret = 0;
1556         size_t new_data_size = 0;
1557
1558         /* find which hash bucket it is in */
1559         hash = tdb->hash_fn(&key);
1560         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1561                 return -1;
1562
1563         /* first try in-place. */
1564         if (tdb_append_inplace(tdb, key, hash, new_dbuf) == 0)
1565                 goto out;
1566
1567         /* reset the error code potentially set by the tdb_append_inplace() */
1568         tdb->ecode = TDB_SUCCESS;
1569
1570         /* find entry */
1571         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
1572                 if (tdb->ecode != TDB_ERR_NOEXIST)
1573                         goto fail;
1574
1575                 /* Not found - create. */
1576
1577                 ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
1578                 goto out;
1579         }
1580
1581         new_data_size = rec.data_len + new_dbuf.dsize;
1582
1583         /* Copy key+old_value+value *before* allocating free space in case malloc
1584            fails and we are left with a dead spot in the tdb. */
1585
1586         if (!(p = (char *)malloc(key.dsize + new_data_size))) {
1587                 tdb->ecode = TDB_ERR_OOM;
1588                 goto fail;
1589         }
1590
1591         /* Copy the key in place. */
1592         memcpy(p, key.dptr, key.dsize);
1593
1594         /* Now read the old data into place. */
1595         if (rec.data_len &&
1596                 tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
1597                         goto fail;
1598
1599         /* Finally append the new data. */
1600         if (new_dbuf.dsize)
1601                 memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);
1602
1603         /* delete any existing record - if it doesn't exist we don't
1604            care.  Doing this first reduces fragmentation, and avoids
1605            coalescing with `allocated' block before it's updated. */
1606
1607         tdb_delete_hash(tdb, key, hash);
1608
1609         if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
1610                 goto fail;
1611
1612         /* Read hash top into next ptr */
1613         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1614                 goto fail;
1615
1616         rec.key_len = key.dsize;
1617         rec.data_len = new_data_size;
1618         rec.full_hash = hash;
1619         rec.magic = TDB_MAGIC;
1620
1621         /* write out and point the top of the hash chain at it */
1622         if (rec_write(tdb, rec_ptr, &rec) == -1
1623             || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
1624             || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1625                 /* Need to tdb_unallocate() here */
1626                 goto fail;
1627         }
1628
1629  out:
1630         SAFE_FREE(p);
1631         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1632         return ret;
1633
1634 fail:
1635         ret = -1;
1636         goto out;
1637 }
1638
1639 static int tdb_already_open(dev_t device,
1640                             ino_t ino)
1641 {
1642         TDB_CONTEXT *i;
1643
1644         for (i = tdbs; i; i = i->next) {
1645                 if (i->device == device && i->inode == ino) {
1646                         return 1;
1647                 }
1648         }
1649
1650         return 0;
1651 }
1652
1653 /* This is based on the hash algorithm from gdbm */
1654 static u32 default_tdb_hash(TDB_DATA *key)
1655 {
1656         u32 value;      /* Used to compute the hash value.  */
1657         u32   i;        /* Used to cycle through random values. */
1658
1659         /* Set the initial value from the key size. */
1660         for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
1661                 value = (value + (key->dptr[i] << (i*5 % 24)));
1662
1663         return (1103515243 * value + 12345);
1664 }
1665
1666 /* open the database, creating it if necessary
1667
1668    The open_flags and mode are passed straight to the open call on the
1669    database file. A flags value of O_WRONLY is invalid. The hash size
1670    is advisory, use zero for a default value.
1671
1672    Return is NULL on error, in which case errno is also set.  Don't
1673    try to call tdb_error or tdb_errname, just do strerror(errno).
1674
1675    @param name may be NULL for internal databases. */
1676 TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
1677                       int open_flags, mode_t mode)
1678 {
1679         return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL, NULL);
1680 }
1681
1682
1683 TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
1684                          int open_flags, mode_t mode,
1685                          tdb_log_func log_fn,
1686                          tdb_hash_func hash_fn)
1687 {
1688         TDB_CONTEXT *tdb;
1689         struct stat st;
1690         int rev = 0, locked = 0;
1691         unsigned char *vp;
1692         u32 vertest;
1693
1694         if (!(tdb = calloc(1, sizeof *tdb))) {
1695                 /* Can't log this */
1696                 errno = ENOMEM;
1697                 goto fail;
1698         }
1699         tdb->fd = -1;
1700         tdb->name = NULL;
1701         tdb->map_ptr = NULL;
1702         tdb->flags = tdb_flags;
1703         tdb->open_flags = open_flags;
1704         tdb->log_fn = log_fn;
1705         tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
1706
1707         if ((open_flags & O_ACCMODE) == O_WRONLY) {
1708                 TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
1709                          name));
1710                 errno = EINVAL;
1711                 goto fail;
1712         }
1713
1714         if (hash_size == 0)
1715                 hash_size = DEFAULT_HASH_SIZE;
1716         if ((open_flags & O_ACCMODE) == O_RDONLY) {
1717                 tdb->read_only = 1;
1718                 /* read only databases don't do locking or clear if first */
1719                 tdb->flags |= TDB_NOLOCK;
1720                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1721         }
1722
1723         /* internal databases don't mmap or lock, and start off cleared */
1724         if (tdb->flags & TDB_INTERNAL) {
1725                 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
1726                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1727                 if (tdb_new_database(tdb, hash_size) != 0) {
1728                         TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
1729                         goto fail;
1730                 }
1731                 goto internal;
1732         }
1733
1734 again:
1735         if ((tdb->fd = open(name, open_flags, mode)) == -1) {
1736                 if ((open_flags & O_CREAT) && errno == ENOENT &&
1737                         mkdir_recursive(PPP_PATH_VARRUN) == 0)
1738                         goto again;
1739
1740                 TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
1741                          name, strerror(errno)));
1742                 goto fail;      /* errno set by open(2) */
1743         }
1744
1745         /* ensure there is only one process initialising at once */
1746         if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
1747                 TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
1748                          name, strerror(errno)));
1749                 goto fail;      /* errno set by tdb_brlock */
1750         }
1751
1752         /* we need to zero database if we are the only one with it open */
1753         if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
1754                 (locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
1755                 open_flags |= O_CREAT;
1756                 if (ftruncate(tdb->fd, 0) == -1) {
1757                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1758                                  "failed to truncate %s: %s\n",
1759                                  name, strerror(errno)));
1760                         goto fail; /* errno set by ftruncate */
1761                 }
1762         }
1763
1764         if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
1765             || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
1766             || (tdb->header.version != TDB_VERSION
1767                 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
1768                 /* its not a valid database - possibly initialise it */
1769                 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
1770                         errno = EIO; /* ie bad format or something */
1771                         goto fail;
1772                 }
1773                 rev = (tdb->flags & TDB_CONVERT);
1774         }
1775         vp = (unsigned char *)&tdb->header.version;
1776         vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
1777                   (((u32)vp[2]) << 8) | (u32)vp[3];
1778         tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
1779         if (!rev)
1780                 tdb->flags &= ~TDB_CONVERT;
1781         else {
1782                 tdb->flags |= TDB_CONVERT;
1783                 convert(&tdb->header, sizeof(tdb->header));
1784         }
1785         if (fstat(tdb->fd, &st) == -1)
1786                 goto fail;
1787
1788         /* Is it already in the open list?  If so, fail. */
1789         if (tdb_already_open(st.st_dev, st.st_ino)) {
1790                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1791                          "%s (%d,%d) is already open in this process\n",
1792                          name, (int)st.st_dev, (int)st.st_ino));
1793                 errno = EBUSY;
1794                 goto fail;
1795         }
1796
1797         if (!(tdb->name = (char *)strdup(name))) {
1798                 errno = ENOMEM;
1799                 goto fail;
1800         }
1801
1802         tdb->map_size = st.st_size;
1803         tdb->device = st.st_dev;
1804         tdb->inode = st.st_ino;
1805         tdb->locked = calloc(tdb->header.hash_size+1, sizeof(tdb->locked[0]));
1806         if (!tdb->locked) {
1807                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1808                          "failed to allocate lock structure for %s\n",
1809                          name));
1810                 errno = ENOMEM;
1811                 goto fail;
1812         }
1813         tdb_mmap(tdb);
1814         if (locked) {
1815                 if (!tdb->read_only)
1816                         if (tdb_clear_spinlocks(tdb) != 0) {
1817                                 TDB_LOG((tdb, 0, "tdb_open_ex: "
1818                                 "failed to clear spinlock\n"));
1819                                 goto fail;
1820                         }
1821                 if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
1822                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1823                                  "failed to take ACTIVE_LOCK on %s: %s\n",
1824                                  name, strerror(errno)));
1825                         goto fail;
1826                 }
1827
1828         }
1829
1830         /* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
1831            we didn't get the initial exclusive lock as we need to let all other
1832            users know we're using it. */
1833
1834         if (tdb_flags & TDB_CLEAR_IF_FIRST) {
1835                 /* leave this lock in place to indicate it's in use */
1836                 if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
1837                         goto fail;
1838         }
1839
1840
1841  internal:
1842         /* Internal (memory-only) databases skip all the code above to
1843          * do with disk files, and resume here by releasing their
1844          * global lock and hooking into the active list. */
1845         if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
1846                 goto fail;
1847         tdb->next = tdbs;
1848         tdbs = tdb;
1849         return tdb;
1850
1851  fail:
1852         { int save_errno = errno;
1853
1854         if (!tdb)
1855                 return NULL;
1856
1857         if (tdb->map_ptr) {
1858                 if (tdb->flags & TDB_INTERNAL)
1859                         SAFE_FREE(tdb->map_ptr);
1860                 else
1861                         tdb_munmap(tdb);
1862         }
1863         SAFE_FREE(tdb->name);
1864         if (tdb->fd != -1)
1865                 if (close(tdb->fd) != 0)
1866                         TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
1867         SAFE_FREE(tdb->locked);
1868         SAFE_FREE(tdb);
1869         errno = save_errno;
1870         return NULL;
1871         }
1872 }
1873
1874 /**
1875  * Close a database.
1876  *
1877  * @returns -1 for error; 0 for success.
1878  **/
1879 int tdb_close(TDB_CONTEXT *tdb)
1880 {
1881         TDB_CONTEXT **i;
1882         int ret = 0;
1883
1884         if (tdb->map_ptr) {
1885                 if (tdb->flags & TDB_INTERNAL)
1886                         SAFE_FREE(tdb->map_ptr);
1887                 else
1888                         tdb_munmap(tdb);
1889         }
1890         SAFE_FREE(tdb->name);
1891         if (tdb->fd != -1)
1892                 ret = close(tdb->fd);
1893         SAFE_FREE(tdb->locked);
1894
1895         /* Remove from contexts list */
1896         for (i = &tdbs; *i; i = &(*i)->next) {
1897                 if (*i == tdb) {
1898                         *i = tdb->next;
1899                         break;
1900                 }
1901         }
1902
1903         memset(tdb, 0, sizeof(*tdb));
1904         SAFE_FREE(tdb);
1905
1906         return ret;
1907 }
1908
1909 /* lock/unlock entire database */
1910 int tdb_lockall(TDB_CONTEXT *tdb)
1911 {
1912         u32 i;
1913
1914         /* There are no locks on read-only dbs */
1915         if (tdb->read_only)
1916                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
1917         for (i = 0; i < tdb->header.hash_size; i++)
1918                 if (tdb_lock(tdb, i, F_WRLCK))
1919                         break;
1920
1921         /* If error, release locks we have... */
1922         if (i < tdb->header.hash_size) {
1923                 u32 j;
1924
1925                 for ( j = 0; j < i; j++)
1926                         tdb_unlock(tdb, j, F_WRLCK);
1927                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1928         }
1929
1930         return 0;
1931 }
1932 void tdb_unlockall(TDB_CONTEXT *tdb)
1933 {
1934         u32 i;
1935         for (i=0; i < tdb->header.hash_size; i++)
1936                 tdb_unlock(tdb, i, F_WRLCK);
1937 }
1938
1939 /* lock/unlock one hash chain. This is meant to be used to reduce
1940    contention - it cannot guarantee how many records will be locked */
1941 int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
1942 {
1943         return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
1944 }
1945
1946 int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
1947 {
1948         return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
1949 }
1950
1951 int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
1952 {
1953         return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
1954 }
1955
1956 int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
1957 {
1958         return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
1959 }
1960
1961
1962 /* register a loging function */
1963 void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
1964 {
1965         tdb->log_fn = fn;
1966 }
1967
1968 /* reopen a tdb - this can be used after a fork to ensure that we have an independent
1969    seek pointer from our parent and to re-establish locks */
1970 int tdb_reopen(TDB_CONTEXT *tdb)
1971 {
1972         struct stat st;
1973
1974         if (tdb->flags & TDB_INTERNAL)
1975                 return 0; /* Nothing to do. */
1976         if (tdb_munmap(tdb) != 0) {
1977                 TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
1978                 goto fail;
1979         }
1980         if (close(tdb->fd) != 0)
1981                 TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
1982         tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
1983         if (tdb->fd == -1) {
1984                 TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
1985                 goto fail;
1986         }
1987         if (fstat(tdb->fd, &st) != 0) {
1988                 TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
1989                 goto fail;
1990         }
1991         if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
1992                 TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
1993                 goto fail;
1994         }
1995         tdb_mmap(tdb);
1996         if ((tdb->flags & TDB_CLEAR_IF_FIRST) && (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)) {
1997                 TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
1998                 goto fail;
1999         }
2000
2001         return 0;
2002
2003 fail:
2004         tdb_close(tdb);
2005         return -1;
2006 }
2007
2008 /* reopen all tdb's */
2009 int tdb_reopen_all(void)
2010 {
2011         TDB_CONTEXT *tdb;
2012
2013         for (tdb=tdbs; tdb; tdb = tdb->next) {
2014                 /* Ensure no clear-if-first. */
2015                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
2016                 if (tdb_reopen(tdb) != 0)
2017                         return -1;
2018         }
2019
2020         return 0;
2021 }