2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Rusty Russell 2009
8 ** NOTE! The following LGPL license applies to the tdb
9 ** library. This does NOT imply that all of Samba is released
12 This library is free software; you can redistribute it and/or
13 modify it under the terms of the GNU Lesser General Public
14 License as published by the Free Software Foundation; either
15 version 3 of the License, or (at your option) any later version.
17 This library is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 Lesser General Public License for more details.
22 You should have received a copy of the GNU Lesser General Public
23 License along with this library; if not, see <http://www.gnu.org/licenses/>.
25 #include "tdb_private.h"
28 /* Since we opened it, these shouldn't fail unless it's recent corruption. */
29 static bool tdb_check_header(struct tdb_context *tdb, tdb_off_t *recovery)
31 struct tdb_header hdr;
33 if (tdb->methods->tdb_read(tdb, 0, &hdr, sizeof(hdr), DOCONV()) == -1)
35 if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0)
39 if (hdr.version != TDB_VERSION)
45 if (hdr.hash_size == 0)
48 if (hdr.hash_size != tdb->header.hash_size)
51 if (hdr.recovery_start != 0 &&
52 hdr.recovery_start < TDB_DATA_START(tdb->header.hash_size))
55 *recovery = hdr.recovery_start;
59 tdb->ecode = TDB_ERR_CORRUPT;
63 /* Generic record header check. */
64 static bool tdb_check_record(struct tdb_context *tdb,
66 const struct list_struct *rec)
70 /* Check rec->next: 0 or points to record offset, aligned. */
71 if (rec->next > 0 && rec->next < TDB_DATA_START(tdb->header.hash_size))
73 if (rec->next + sizeof(*rec) < rec->next)
75 if ((rec->next % TDB_ALIGNMENT) != 0)
77 if (tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 1))
80 /* Check rec_len: similar to rec->next, implies next record. */
81 if ((rec->rec_len % TDB_ALIGNMENT) != 0)
83 /* Must fit tailer. */
84 if (rec->rec_len < sizeof(tailer))
86 /* OOB allows "right at the end" access, so this works for last rec. */
87 if (tdb->methods->tdb_oob(tdb, off+sizeof(*rec)+rec->rec_len, 1))
91 if (tdb_ofs_read(tdb, off+sizeof(*rec)+rec->rec_len-sizeof(tailer),
94 if (tailer != sizeof(*rec) + rec->rec_len)
100 tdb->ecode = TDB_ERR_CORRUPT;
104 /* Grab some bytes: may copy if can't use mmap.
105 Caller has already done bounds check. */
106 static TDB_DATA get_bytes(struct tdb_context *tdb,
107 tdb_off_t off, tdb_len_t len)
113 if (tdb->transaction == NULL && tdb->map_ptr != NULL)
114 d.dptr = (unsigned char *)tdb->map_ptr + off;
116 d.dptr = tdb_alloc_read(tdb, off, d.dsize);
120 /* Frees data if we're not able to simply use mmap. */
121 static void put_bytes(struct tdb_context *tdb, TDB_DATA d)
123 if (tdb->transaction == NULL && tdb->map_ptr != NULL)
128 /* We use the excellent Jenkins lookup3 hash; this is based on hash_word2.
129 * See: http://burtleburtle.net/bob/c/lookup3.c
131 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
132 static void hash(uint32_t key, uint32_t *pc, uint32_t *pb)
136 /* Set up the internal state */
137 a = b = c = 0xdeadbeef + *pc;
140 c ^= b; c -= rot(b,14);
141 a ^= c; a -= rot(c,11);
142 b ^= a; b -= rot(a,25);
143 c ^= b; c -= rot(b,16);
144 a ^= c; a -= rot(c,4);
145 b ^= a; b -= rot(a,14);
146 c ^= b; c -= rot(b,24);
151 We want to check that all free records are in the free list
152 (only once), and all free list entries are free records. Similarly
153 for each hash chain of used records.
155 Doing that naively (without walking hash chains, since we want to be
156 linear) means keeping a list of records which have been seen in each
157 hash chain, and another of records pointed to (ie. next pointers
158 from records and the initial hash chain heads). These two lists
159 should be equal. This will take 8 bytes per record, and require
162 So instead, we record each offset in a bitmap such a way that
163 recording it twice will cancel out. Since each offset should appear
164 exactly twice, the bitmap should be zero at the end.
166 The approach was inspired by Bloom Filters (see Wikipedia). For
167 each value, we flip K bits in a bitmap of size N. The number of
168 distinct arrangements is:
172 Of course, not all arrangements are actually distinct, but testing
173 shows this formula to be close enough.
175 So, if K == 8 and N == 256, the probability of two things flipping the same
176 bits is 1 in 409,663,695,276,000.
178 Given that ldb uses a hash size of 10000, using 32 bytes per hash chain
179 (320k) seems reasonable.
182 #define BITMAP_BITS 256
184 static void bit_flip(unsigned char bits[], unsigned int idx)
186 bits[idx / CHAR_BIT] ^= (1 << (idx % CHAR_BIT));
189 /* We record offsets in a bitmap for the particular chain it should be in. */
190 static void record_offset(unsigned char bits[], tdb_off_t off)
192 uint32_t h1 = off, h2 = 0;
195 /* We get two good hash values out of jhash2, so we use both. Then
196 * we keep going to produce further hash values. */
197 for (i = 0; i < NUM_HASHES / 2; i++) {
199 bit_flip(bits, h1 % BITMAP_BITS);
200 bit_flip(bits, h2 % BITMAP_BITS);
205 /* Check that an in-use record is valid. */
206 static bool tdb_check_used_record(struct tdb_context *tdb,
208 const struct list_struct *rec,
209 unsigned char **hashes,
210 int (*check)(TDB_DATA, TDB_DATA, void *),
215 if (!tdb_check_record(tdb, off, rec))
218 /* key + data + tailer must fit in record */
219 if (rec->key_len + rec->data_len + sizeof(tdb_off_t) > rec->rec_len)
222 key = get_bytes(tdb, off + sizeof(*rec), rec->key_len);
226 if (tdb->hash_fn(&key) != rec->full_hash)
229 /* Mark this offset as a known value for this hash bucket. */
230 record_offset(hashes[BUCKET(rec->full_hash)+1], off);
231 /* And similarly if the next pointer is valid. */
233 record_offset(hashes[BUCKET(rec->full_hash)+1], rec->next);
235 /* If they supply a check function and this record isn't dead,
236 get data and feed it. */
237 if (check && rec->magic != TDB_DEAD_MAGIC) {
238 data = get_bytes(tdb, off + sizeof(*rec) + rec->key_len,
243 if (check(key, data, private) == -1)
245 put_bytes(tdb, data);
252 put_bytes(tdb, data);
258 /* Check that an unused record is valid. */
259 static bool tdb_check_free_record(struct tdb_context *tdb,
261 const struct list_struct *rec,
262 unsigned char **hashes)
264 if (!tdb_check_record(tdb, off, rec))
267 /* Mark this offset as a known value for the free list. */
268 record_offset(hashes[0], off);
269 /* And similarly if the next pointer is valid. */
271 record_offset(hashes[0], rec->next);
275 int tdb_check(struct tdb_context *tdb,
276 int (*check)(TDB_DATA key, TDB_DATA data, void *private),
280 unsigned char **hashes;
281 tdb_off_t off, recovery_start;
282 struct list_struct rec;
283 bool found_recovery = false;
285 if (tdb_lockall(tdb) == -1)
288 /* Make sure we know true size of the underlying file. */
289 tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
291 /* Header must be OK: also gets us the recovery ptr, if any. */
292 if (!tdb_check_header(tdb, &recovery_start))
295 /* We should have the whole header, too. */
296 if (tdb->map_size < TDB_DATA_START(tdb->header.hash_size)) {
297 tdb->ecode = TDB_ERR_CORRUPT;
301 /* One big malloc: pointers then bit arrays. */
302 hashes = calloc(1, sizeof(hashes[0]) * (1+tdb->header.hash_size)
303 + BITMAP_BITS / CHAR_BIT * (1+tdb->header.hash_size));
305 tdb->ecode = TDB_ERR_OOM;
309 /* Initialize pointers */
310 hashes[0] = (unsigned char *)(&hashes[1+tdb->header.hash_size]);
311 for (h = 1; h < 1+tdb->header.hash_size; h++)
312 hashes[h] = hashes[h-1] + BITMAP_BITS / CHAR_BIT;
314 /* Freelist and hash headers are all in a row: read them. */
315 for (h = 0; h < 1+tdb->header.hash_size; h++) {
316 if (tdb_ofs_read(tdb, FREELIST_TOP + h*sizeof(tdb_off_t),
320 record_offset(hashes[h], off);
323 /* For each record, read it in and check it's ok. */
324 for (off = TDB_DATA_START(tdb->header.hash_size);
326 off += sizeof(rec) + rec.rec_len) {
327 if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
333 if (!tdb_check_used_record(tdb, off, &rec, hashes,
338 if (!tdb_check_free_record(tdb, off, &rec, hashes))
341 case TDB_RECOVERY_MAGIC:
342 if (recovery_start != off)
344 found_recovery = true;
347 tdb->ecode = TDB_ERR_CORRUPT;
352 /* Now, hashes should all be empty: each record exists and is referred
353 * to by one other. */
354 for (h = 0; h < 1+tdb->header.hash_size; h++) {
356 for (i = 0; i < BITMAP_BITS / CHAR_BIT; i++) {
357 if (hashes[h][i] != 0) {
358 tdb->ecode = TDB_ERR_CORRUPT;
364 /* We must have found recovery area if there was one. */
365 if (recovery_start != 0 && !found_recovery)