2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Rusty Russell 2009
8 ** NOTE! The following LGPL license applies to the tdb
9 ** library. This does NOT imply that all of Samba is released
12 This library is free software; you can redistribute it and/or
13 modify it under the terms of the GNU Lesser General Public
14 License as published by the Free Software Foundation; either
15 version 3 of the License, or (at your option) any later version.
17 This library is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 Lesser General Public License for more details.
22 You should have received a copy of the GNU Lesser General Public
23 License along with this library; if not, see <http://www.gnu.org/licenses/>.
25 #include "tdb_private.h"
28 /* Since we opened it, these shouldn't fail unless it's recent corruption. */
29 static bool tdb_check_header(struct tdb_context *tdb, tdb_off_t *recovery)
31 struct tdb_header hdr;
34 if (tdb->methods->tdb_read(tdb, 0, &hdr, sizeof(hdr), 0) == -1)
36 if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0)
40 if (hdr.version != TDB_VERSION)
43 if (hdr.rwlocks != 0 && hdr.rwlocks != TDB_HASH_RWLOCK_MAGIC)
46 tdb_header_hash(tdb, &h1, &h2);
47 if (hdr.magic1_hash && hdr.magic2_hash &&
48 (hdr.magic1_hash != h1 || hdr.magic2_hash != h2))
51 if (hdr.hash_size == 0)
54 if (hdr.hash_size != tdb->header.hash_size)
57 if (hdr.recovery_start != 0 &&
58 hdr.recovery_start < TDB_DATA_START(tdb->header.hash_size))
61 *recovery = hdr.recovery_start;
65 tdb->ecode = TDB_ERR_CORRUPT;
66 TDB_LOG((tdb, TDB_DEBUG_ERROR, "Header is corrupt\n"));
70 /* Generic record header check. */
71 static bool tdb_check_record(struct tdb_context *tdb,
73 const struct tdb_record *rec)
77 /* Check rec->next: 0 or points to record offset, aligned. */
78 if (rec->next > 0 && rec->next < TDB_DATA_START(tdb->header.hash_size)){
79 TDB_LOG((tdb, TDB_DEBUG_ERROR,
80 "Record offset %d too small next %d\n",
84 if (rec->next + sizeof(*rec) < rec->next) {
85 TDB_LOG((tdb, TDB_DEBUG_ERROR,
86 "Record offset %d too large next %d\n",
90 if ((rec->next % TDB_ALIGNMENT) != 0) {
91 TDB_LOG((tdb, TDB_DEBUG_ERROR,
92 "Record offset %d misaligned next %d\n",
96 if (tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0))
99 /* Check rec_len: similar to rec->next, implies next record. */
100 if ((rec->rec_len % TDB_ALIGNMENT) != 0) {
101 TDB_LOG((tdb, TDB_DEBUG_ERROR,
102 "Record offset %d misaligned length %d\n",
106 /* Must fit tailer. */
107 if (rec->rec_len < sizeof(tailer)) {
108 TDB_LOG((tdb, TDB_DEBUG_ERROR,
109 "Record offset %d too short length %d\n",
113 /* OOB allows "right at the end" access, so this works for last rec. */
114 if (tdb->methods->tdb_oob(tdb, off+sizeof(*rec)+rec->rec_len, 0))
118 if (tdb_ofs_read(tdb, off+sizeof(*rec)+rec->rec_len-sizeof(tailer),
121 if (tailer != sizeof(*rec) + rec->rec_len) {
122 TDB_LOG((tdb, TDB_DEBUG_ERROR,
123 "Record offset %d invalid tailer\n", off));
130 tdb->ecode = TDB_ERR_CORRUPT;
134 /* Grab some bytes: may copy if can't use mmap.
135 Caller has already done bounds check. */
136 static TDB_DATA get_bytes(struct tdb_context *tdb,
137 tdb_off_t off, tdb_len_t len)
143 if (tdb->transaction == NULL && tdb->map_ptr != NULL)
144 d.dptr = (unsigned char *)tdb->map_ptr + off;
146 d.dptr = tdb_alloc_read(tdb, off, d.dsize);
150 /* Frees data if we're not able to simply use mmap. */
151 static void put_bytes(struct tdb_context *tdb, TDB_DATA d)
153 if (tdb->transaction == NULL && tdb->map_ptr != NULL)
/* We use the excellent Jenkins lookup3 hash; this is based on hash_word2.
 * See: http://burtleburtle.net/bob/c/lookup3.c
 */
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
static void hash(uint32_t key, uint32_t *pc, uint32_t *pb)
{
	uint32_t a, b, c;

	/* Set up the internal state */
	a = b = c = 0xdeadbeef + *pc;
	c += *pb;
	a += key;

	/* This is lookup3's final() mixing step. */
	c ^= b; c -= rot(b,14);
	a ^= c; a -= rot(c,11);
	b ^= a; b -= rot(a,25);
	c ^= b; c -= rot(b,16);
	a ^= c; a -= rot(c,4);
	b ^= a; b -= rot(a,14);
	c ^= b; c -= rot(b,24);

	/* Report the two hash values back through *pc and *pb. */
	*pc = c;
	*pb = b;
}
181 We want to check that all free records are in the free list
182 (only once), and all free list entries are free records. Similarly
183 for each hash chain of used records.
185 Doing that naively (without walking hash chains, since we want to be
186 linear) means keeping a list of records which have been seen in each
187 hash chain, and another of records pointed to (ie. next pointers
188 from records and the initial hash chain heads). These two lists
189 should be equal. This will take 8 bytes per record, and require
192 So instead, we record each offset in a bitmap such a way that
193 recording it twice will cancel out. Since each offset should appear
194 exactly twice, the bitmap should be zero at the end.
196 The approach was inspired by Bloom Filters (see Wikipedia). For
197 each value, we flip K bits in a bitmap of size N. The number of
  distinct arrangements is:

	N! / (K! * (N-K)!)

  Of course, not all arrangements are actually distinct, but testing
  shows this formula to be close enough.
205 So, if K == 8 and N == 256, the probability of two things flipping the same
206 bits is 1 in 409,663,695,276,000.
208 Given that ldb uses a hash size of 10000, using 32 bytes per hash chain
209 (320k) seems reasonable.
/* K bits flipped per offset, in an N-bit-per-chain bitmap (see above). */
#define NUM_HASHES 8
#define BITMAP_BITS 256

/* Toggle one bit in the bitmap: flipping the same bit twice cancels out. */
static void bit_flip(unsigned char bits[], unsigned int idx)
{
	bits[idx / CHAR_BIT] ^= (1 << (idx % CHAR_BIT));
}
219 /* We record offsets in a bitmap for the particular chain it should be in. */
220 static void record_offset(unsigned char bits[], tdb_off_t off)
222 uint32_t h1 = off, h2 = 0;
225 /* We get two good hash values out of jhash2, so we use both. Then
226 * we keep going to produce further hash values. */
227 for (i = 0; i < NUM_HASHES / 2; i++) {
229 bit_flip(bits, h1 % BITMAP_BITS);
230 bit_flip(bits, h2 % BITMAP_BITS);
235 /* Check that an in-use record is valid. */
236 static bool tdb_check_used_record(struct tdb_context *tdb,
238 const struct tdb_record *rec,
239 unsigned char **hashes,
240 int (*check)(TDB_DATA, TDB_DATA, void *),
245 if (!tdb_check_record(tdb, off, rec))
248 /* key + data + tailer must fit in record */
249 if (rec->key_len + rec->data_len + sizeof(tdb_off_t) > rec->rec_len) {
250 TDB_LOG((tdb, TDB_DEBUG_ERROR,
251 "Record offset %d too short for contents\n", off));
255 key = get_bytes(tdb, off + sizeof(*rec), rec->key_len);
259 if (tdb->hash_fn(&key) != rec->full_hash) {
260 TDB_LOG((tdb, TDB_DEBUG_ERROR,
261 "Record offset %d has incorrect hash\n", off));
265 /* Mark this offset as a known value for this hash bucket. */
266 record_offset(hashes[BUCKET(rec->full_hash)+1], off);
267 /* And similarly if the next pointer is valid. */
269 record_offset(hashes[BUCKET(rec->full_hash)+1], rec->next);
271 /* If they supply a check function and this record isn't dead,
272 get data and feed it. */
273 if (check && rec->magic != TDB_DEAD_MAGIC) {
274 data = get_bytes(tdb, off + sizeof(*rec) + rec->key_len,
279 if (check(key, data, private_data) == -1)
281 put_bytes(tdb, data);
288 put_bytes(tdb, data);
294 /* Check that an unused record is valid. */
295 static bool tdb_check_free_record(struct tdb_context *tdb,
297 const struct tdb_record *rec,
298 unsigned char **hashes)
300 if (!tdb_check_record(tdb, off, rec))
303 /* Mark this offset as a known value for the free list. */
304 record_offset(hashes[0], off);
305 /* And similarly if the next pointer is valid. */
307 record_offset(hashes[0], rec->next);
311 /* Slow, but should be very rare. */
312 size_t tdb_dead_space(struct tdb_context *tdb, tdb_off_t off)
316 for (len = 0; off + len < tdb->map_size; len++) {
318 if (tdb->methods->tdb_read(tdb, off, &c, 1, 0))
320 if (c != 0 && c != 0x42)
326 int tdb_check(struct tdb_context *tdb,
327 int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
331 unsigned char **hashes;
332 tdb_off_t off, recovery_start;
333 struct tdb_record rec;
334 bool found_recovery = false;
338 /* Read-only databases use no locking at all: it's best-effort.
339 * We may have a write lock already, so skip that case too. */
340 if (tdb->read_only || tdb->allrecord_lock.count != 0) {
343 if (tdb_lockall_read(tdb) == -1)
348 /* Make sure we know true size of the underlying file. */
349 tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
351 /* Header must be OK: also gets us the recovery ptr, if any. */
352 if (!tdb_check_header(tdb, &recovery_start))
355 /* We should have the whole header, too. */
356 if (tdb->map_size < TDB_DATA_START(tdb->header.hash_size)) {
357 tdb->ecode = TDB_ERR_CORRUPT;
358 TDB_LOG((tdb, TDB_DEBUG_ERROR, "File too short for hashes\n"));
362 /* One big malloc: pointers then bit arrays. */
363 hashes = (unsigned char **)calloc(
364 1, sizeof(hashes[0]) * (1+tdb->header.hash_size)
365 + BITMAP_BITS / CHAR_BIT * (1+tdb->header.hash_size));
367 tdb->ecode = TDB_ERR_OOM;
371 /* Initialize pointers */
372 hashes[0] = (unsigned char *)(&hashes[1+tdb->header.hash_size]);
373 for (h = 1; h < 1+tdb->header.hash_size; h++)
374 hashes[h] = hashes[h-1] + BITMAP_BITS / CHAR_BIT;
376 /* Freelist and hash headers are all in a row: read them. */
377 for (h = 0; h < 1+tdb->header.hash_size; h++) {
378 if (tdb_ofs_read(tdb, FREELIST_TOP + h*sizeof(tdb_off_t),
382 record_offset(hashes[h], off);
385 /* For each record, read it in and check it's ok. */
386 for (off = TDB_DATA_START(tdb->header.hash_size);
388 off += sizeof(rec) + rec.rec_len) {
389 if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
395 if (!tdb_check_used_record(tdb, off, &rec, hashes,
396 check, private_data))
400 if (!tdb_check_free_record(tdb, off, &rec, hashes))
403 /* If we crash after ftruncate, we can get zeroes or fill. */
404 case TDB_RECOVERY_INVALID_MAGIC:
406 if (recovery_start == off) {
407 found_recovery = true;
410 dead = tdb_dead_space(tdb, off);
411 if (dead < sizeof(rec))
414 TDB_LOG((tdb, TDB_DEBUG_ERROR,
415 "Dead space at %d-%d (of %u)\n",
416 off, off + dead, tdb->map_size));
417 rec.rec_len = dead - sizeof(rec);
419 case TDB_RECOVERY_MAGIC:
420 if (recovery_start != off) {
421 TDB_LOG((tdb, TDB_DEBUG_ERROR,
422 "Unexpected recovery record at offset %d\n",
426 found_recovery = true;
430 tdb->ecode = TDB_ERR_CORRUPT;
431 TDB_LOG((tdb, TDB_DEBUG_ERROR,
432 "Bad magic 0x%x at offset %d\n",
438 /* Now, hashes should all be empty: each record exists and is referred
439 * to by one other. */
440 for (h = 0; h < 1+tdb->header.hash_size; h++) {
442 for (i = 0; i < BITMAP_BITS / CHAR_BIT; i++) {
443 if (hashes[h][i] != 0) {
444 tdb->ecode = TDB_ERR_CORRUPT;
445 TDB_LOG((tdb, TDB_DEBUG_ERROR,
446 "Hashes do not match records\n"));
452 /* We must have found recovery area if there was one. */
453 if (recovery_start != 0 && !found_recovery) {
454 TDB_LOG((tdb, TDB_DEBUG_ERROR,
455 "Expected a recovery area at %u\n",
462 tdb_unlockall_read(tdb);
470 tdb_unlockall_read(tdb);