2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Rusty Russell 2009
8 ** NOTE! The following LGPL license applies to the tdb
9 ** library. This does NOT imply that all of Samba is released
12 This library is free software; you can redistribute it and/or
13 modify it under the terms of the GNU Lesser General Public
14 License as published by the Free Software Foundation; either
15 version 3 of the License, or (at your option) any later version.
17 This library is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 Lesser General Public License for more details.
22 You should have received a copy of the GNU Lesser General Public
23 License along with this library; if not, see <http://www.gnu.org/licenses/>.
25 #include "tdb_private.h"
28 /* Since we opened it, these shouldn't fail unless it's recent corruption. */
29 static bool tdb_check_header(struct tdb_context *tdb, tdb_off_t *recovery)
31 struct tdb_header hdr;
33 if (tdb->methods->tdb_read(tdb, 0, &hdr, sizeof(hdr), DOCONV()) == -1)
35 if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0)
39 if (hdr.version != TDB_VERSION)
45 if (hdr.hash_size == 0)
48 if (hdr.hash_size != tdb->header.hash_size)
51 if (hdr.recovery_start != 0 &&
52 hdr.recovery_start < TDB_DATA_START(tdb->header.hash_size))
55 *recovery = hdr.recovery_start;
59 tdb->ecode = TDB_ERR_CORRUPT;
60 TDB_LOG((tdb, TDB_DEBUG_ERROR, "Header is corrupt\n"));
64 /* Generic record header check. */
65 static bool tdb_check_record(struct tdb_context *tdb,
67 const struct tdb_record *rec)
71 /* Check rec->next: 0 or points to record offset, aligned. */
72 if (rec->next > 0 && rec->next < TDB_DATA_START(tdb->header.hash_size)){
73 TDB_LOG((tdb, TDB_DEBUG_ERROR,
74 "Record offset %d too small next %d\n",
78 if (rec->next + sizeof(*rec) < rec->next) {
79 TDB_LOG((tdb, TDB_DEBUG_ERROR,
80 "Record offset %d too large next %d\n",
84 if ((rec->next % TDB_ALIGNMENT) != 0) {
85 TDB_LOG((tdb, TDB_DEBUG_ERROR,
86 "Record offset %d misaligned next %d\n",
90 if (tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0))
93 /* Check rec_len: similar to rec->next, implies next record. */
94 if ((rec->rec_len % TDB_ALIGNMENT) != 0) {
95 TDB_LOG((tdb, TDB_DEBUG_ERROR,
96 "Record offset %d misaligned length %d\n",
100 /* Must fit tailer. */
101 if (rec->rec_len < sizeof(tailer)) {
102 TDB_LOG((tdb, TDB_DEBUG_ERROR,
103 "Record offset %d too short length %d\n",
107 /* OOB allows "right at the end" access, so this works for last rec. */
108 if (tdb->methods->tdb_oob(tdb, off+sizeof(*rec)+rec->rec_len, 0))
112 if (tdb_ofs_read(tdb, off+sizeof(*rec)+rec->rec_len-sizeof(tailer),
115 if (tailer != sizeof(*rec) + rec->rec_len) {
116 TDB_LOG((tdb, TDB_DEBUG_ERROR,
117 "Record offset %d invalid tailer\n", off));
124 tdb->ecode = TDB_ERR_CORRUPT;
128 /* Grab some bytes: may copy if can't use mmap.
129 Caller has already done bounds check. */
130 static TDB_DATA get_bytes(struct tdb_context *tdb,
131 tdb_off_t off, tdb_len_t len)
137 if (tdb->transaction == NULL && tdb->map_ptr != NULL)
138 d.dptr = (unsigned char *)tdb->map_ptr + off;
140 d.dptr = tdb_alloc_read(tdb, off, d.dsize);
144 /* Frees data if we're not able to simply use mmap. */
145 static void put_bytes(struct tdb_context *tdb, TDB_DATA d)
147 if (tdb->transaction == NULL && tdb->map_ptr != NULL)
152 /* We use the excellent Jenkins lookup3 hash; this is based on hash_word2.
153 * See: http://burtleburtle.net/bob/c/lookup3.c
155 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
156 static void hash(uint32_t key, uint32_t *pc, uint32_t *pb)
160 /* Set up the internal state */
161 a = b = c = 0xdeadbeef + *pc;
164 c ^= b; c -= rot(b,14);
165 a ^= c; a -= rot(c,11);
166 b ^= a; b -= rot(a,25);
167 c ^= b; c -= rot(b,16);
168 a ^= c; a -= rot(c,4);
169 b ^= a; b -= rot(a,14);
170 c ^= b; c -= rot(b,24);
175 We want to check that all free records are in the free list
176 (only once), and all free list entries are free records. Similarly
177 for each hash chain of used records.
179 Doing that naively (without walking hash chains, since we want to be
180 linear) means keeping a list of records which have been seen in each
181 hash chain, and another of records pointed to (ie. next pointers
182 from records and the initial hash chain heads). These two lists
183 should be equal. This will take 8 bytes per record, and require
186 So instead, we record each offset in a bitmap such a way that
187 recording it twice will cancel out. Since each offset should appear
188 exactly twice, the bitmap should be zero at the end.
190 The approach was inspired by Bloom Filters (see Wikipedia). For
191 each value, we flip K bits in a bitmap of size N. The number of
192 distinct arrangements is:
196 Of course, not all arrangements are actually distinct, but testing
197 shows this formula to be close enough.
199 So, if K == 8 and N == 256, the probability of two things flipping the same
200 bits is 1 in 409,663,695,276,000.
202 Given that ldb uses a hash size of 10000, using 32 bytes per hash chain
203 (320k) seems reasonable.
206 #define BITMAP_BITS 256
208 static void bit_flip(unsigned char bits[], unsigned int idx)
210 bits[idx / CHAR_BIT] ^= (1 << (idx % CHAR_BIT));
213 /* We record offsets in a bitmap for the particular chain it should be in. */
214 static void record_offset(unsigned char bits[], tdb_off_t off)
216 uint32_t h1 = off, h2 = 0;
219 /* We get two good hash values out of jhash2, so we use both. Then
220 * we keep going to produce further hash values. */
221 for (i = 0; i < NUM_HASHES / 2; i++) {
223 bit_flip(bits, h1 % BITMAP_BITS);
224 bit_flip(bits, h2 % BITMAP_BITS);
229 /* Check that an in-use record is valid. */
230 static bool tdb_check_used_record(struct tdb_context *tdb,
232 const struct tdb_record *rec,
233 unsigned char **hashes,
234 int (*check)(TDB_DATA, TDB_DATA, void *),
239 if (!tdb_check_record(tdb, off, rec))
242 /* key + data + tailer must fit in record */
243 if (rec->key_len + rec->data_len + sizeof(tdb_off_t) > rec->rec_len) {
244 TDB_LOG((tdb, TDB_DEBUG_ERROR,
245 "Record offset %d too short for contents\n", off));
249 key = get_bytes(tdb, off + sizeof(*rec), rec->key_len);
253 if (tdb->hash_fn(&key) != rec->full_hash) {
254 TDB_LOG((tdb, TDB_DEBUG_ERROR,
255 "Record offset %d has incorrect hash\n", off));
259 /* Mark this offset as a known value for this hash bucket. */
260 record_offset(hashes[BUCKET(rec->full_hash)+1], off);
261 /* And similarly if the next pointer is valid. */
263 record_offset(hashes[BUCKET(rec->full_hash)+1], rec->next);
265 /* If they supply a check function and this record isn't dead,
266 get data and feed it. */
267 if (check && rec->magic != TDB_DEAD_MAGIC) {
268 data = get_bytes(tdb, off + sizeof(*rec) + rec->key_len,
273 if (check(key, data, private_data) == -1)
275 put_bytes(tdb, data);
282 put_bytes(tdb, data);
288 /* Check that an unused record is valid. */
289 static bool tdb_check_free_record(struct tdb_context *tdb,
291 const struct tdb_record *rec,
292 unsigned char **hashes)
294 if (!tdb_check_record(tdb, off, rec))
297 /* Mark this offset as a known value for the free list. */
298 record_offset(hashes[0], off);
299 /* And similarly if the next pointer is valid. */
301 record_offset(hashes[0], rec->next);
305 int tdb_check(struct tdb_context *tdb,
306 int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
310 unsigned char **hashes;
311 tdb_off_t off, recovery_start;
312 struct tdb_record rec;
313 bool found_recovery = false;
315 if (tdb_lockall(tdb) == -1)
318 /* Make sure we know true size of the underlying file. */
319 tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
321 /* Header must be OK: also gets us the recovery ptr, if any. */
322 if (!tdb_check_header(tdb, &recovery_start))
325 /* We should have the whole header, too. */
326 if (tdb->map_size < TDB_DATA_START(tdb->header.hash_size)) {
327 tdb->ecode = TDB_ERR_CORRUPT;
328 TDB_LOG((tdb, TDB_DEBUG_ERROR, "File too short for hashes\n"));
332 /* One big malloc: pointers then bit arrays. */
333 hashes = (unsigned char **)calloc(
334 1, sizeof(hashes[0]) * (1+tdb->header.hash_size)
335 + BITMAP_BITS / CHAR_BIT * (1+tdb->header.hash_size));
337 tdb->ecode = TDB_ERR_OOM;
341 /* Initialize pointers */
342 hashes[0] = (unsigned char *)(&hashes[1+tdb->header.hash_size]);
343 for (h = 1; h < 1+tdb->header.hash_size; h++)
344 hashes[h] = hashes[h-1] + BITMAP_BITS / CHAR_BIT;
346 /* Freelist and hash headers are all in a row: read them. */
347 for (h = 0; h < 1+tdb->header.hash_size; h++) {
348 if (tdb_ofs_read(tdb, FREELIST_TOP + h*sizeof(tdb_off_t),
352 record_offset(hashes[h], off);
355 /* For each record, read it in and check it's ok. */
356 for (off = TDB_DATA_START(tdb->header.hash_size);
358 off += sizeof(rec) + rec.rec_len) {
359 if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
365 if (!tdb_check_used_record(tdb, off, &rec, hashes,
366 check, private_data))
370 if (!tdb_check_free_record(tdb, off, &rec, hashes))
373 case TDB_RECOVERY_MAGIC:
374 case TDB_RECOVERY_INVALID_MAGIC:
375 if (recovery_start != off) {
376 TDB_LOG((tdb, TDB_DEBUG_ERROR,
377 "Unexpected recovery record at offset %d\n",
381 found_recovery = true;
384 tdb->ecode = TDB_ERR_CORRUPT;
385 TDB_LOG((tdb, TDB_DEBUG_ERROR,
386 "Bad magic 0x%x at offset %d\n",
392 /* Now, hashes should all be empty: each record exists and is referred
393 * to by one other. */
394 for (h = 0; h < 1+tdb->header.hash_size; h++) {
396 for (i = 0; i < BITMAP_BITS / CHAR_BIT; i++) {
397 if (hashes[h][i] != 0) {
398 tdb->ecode = TDB_ERR_CORRUPT;
399 TDB_LOG((tdb, TDB_DEBUG_ERROR,
400 "Hashes do not match records\n"));
406 /* We must have found recovery area if there was one. */
407 if (recovery_start != 0 && !found_recovery) {
408 TDB_LOG((tdb, TDB_DEBUG_ERROR,
409 "Expected %s recovery area, got %s\n",
410 recovery_start ? "a" : "no",
411 found_recovery ? "one" : "none"));