]> git.ozlabs.org Git - ccan/blob - ccan/ntdb/hash.c
tools/ccanlint: add args to main.
[ccan] / ccan / ntdb / hash.c
1  /*
2    Trivial Database 2: hash handling
3    Copyright (C) Rusty Russell 2010
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/hash/hash.h>
20
21 /* Default hash function. */
22 uint32_t ntdb_jenkins_hash(const void *key, size_t length, uint32_t seed,
23                           void *unused)
24 {
25         return hash_stable((const unsigned char *)key, length, seed);
26 }
27
28 uint32_t ntdb_hash(struct ntdb_context *ntdb, const void *ptr, size_t len)
29 {
30         return ntdb->hash_fn(ptr, len, ntdb->hash_seed, ntdb->hash_data);
31 }
32
33 static ntdb_bool_err key_matches(struct ntdb_context *ntdb,
34                                  const struct ntdb_used_record *rec,
35                                  ntdb_off_t off,
36                                  const NTDB_DATA *key,
37                                  const char **rptr)
38 {
39         ntdb_bool_err ret = false;
40         const char *rkey;
41
42         if (rec_key_length(rec) != key->dsize) {
43                 ntdb->stats.compare_wrong_keylen++;
44                 return ret;
45         }
46
47         rkey = ntdb_access_read(ntdb, off + sizeof(*rec),
48                                 key->dsize + rec_data_length(rec), false);
49         if (NTDB_PTR_IS_ERR(rkey)) {
50                 return (ntdb_bool_err)NTDB_PTR_ERR(rkey);
51         }
52         if (memcmp(rkey, key->dptr, key->dsize) == 0) {
53                 if (rptr) {
54                         *rptr = rkey;
55                 } else {
56                         ntdb_access_release(ntdb, rkey);
57                 }
58                 return true;
59         }
60         ntdb->stats.compare_wrong_keycmp++;
61         ntdb_access_release(ntdb, rkey);
62         return ret;
63 }
64
65 /* Does entry match? */
66 static ntdb_bool_err match(struct ntdb_context *ntdb,
67                            uint32_t hash,
68                            const NTDB_DATA *key,
69                            ntdb_off_t val,
70                            struct ntdb_used_record *rec,
71                            const char **rptr)
72 {
73         ntdb_off_t off;
74         enum NTDB_ERROR ecode;
75
76         ntdb->stats.compares++;
77
78         /* Top bits of offset == next bits of hash. */
79         if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL)
80             != bits_from(val, 64-NTDB_OFF_UPPER_STEAL, NTDB_OFF_UPPER_STEAL)) {
81                 ntdb->stats.compare_wrong_offsetbits++;
82                 return false;
83         }
84
85         off = val & NTDB_OFF_MASK;
86         ecode = ntdb_read_convert(ntdb, off, rec, sizeof(*rec));
87         if (ecode != NTDB_SUCCESS) {
88                 return (ntdb_bool_err)ecode;
89         }
90
91         return key_matches(ntdb, rec, off, key, rptr);
92 }
93
94 static bool is_chain(ntdb_off_t val)
95 {
96         return val & (1ULL << NTDB_OFF_CHAIN_BIT);
97 }
98
99 static ntdb_off_t hbucket_off(ntdb_off_t base, ntdb_len_t idx)
100 {
101         return base + sizeof(struct ntdb_used_record)
102                 + idx * sizeof(ntdb_off_t);
103 }
104
105 /* This is the core routine which searches the hashtable for an entry.
106  * On error, no locks are held and -ve is returned.
107  * Otherwise, hinfo is filled in.
108  * If not found, the return value is 0.
109  * If found, the return value is the offset, and *rec is the record. */
110 ntdb_off_t find_and_lock(struct ntdb_context *ntdb,
111                          NTDB_DATA key,
112                          int ltype,
113                          struct hash_info *h,
114                          struct ntdb_used_record *rec,
115                          const char **rptr)
116 {
117         ntdb_off_t off, val;
118         const ntdb_off_t *arr = NULL;
119         ntdb_len_t i;
120         bool found_empty;
121         enum NTDB_ERROR ecode;
122         struct ntdb_used_record chdr;
123         ntdb_bool_err berr;
124
125         h->h = ntdb_hash(ntdb, key.dptr, key.dsize);
126
127         h->table = NTDB_HASH_OFFSET;
128         h->table_size = 1 << ntdb->hash_bits;
129         h->bucket = bits_from(h->h, 0, ntdb->hash_bits);
130         h->old_val = 0;
131
132         ecode = ntdb_lock_hash(ntdb, h->bucket, ltype);
133         if (ecode != NTDB_SUCCESS) {
134                 return NTDB_ERR_TO_OFF(ecode);
135         }
136
137         off = hbucket_off(h->table, h->bucket);
138         val = ntdb_read_off(ntdb, off);
139         if (NTDB_OFF_IS_ERR(val)) {
140                 ecode = NTDB_OFF_TO_ERR(val);
141                 goto fail;
142         }
143
144         /* Directly in hash table? */
145         if (!likely(is_chain(val))) {
146                 if (val) {
147                         berr = match(ntdb, h->h, &key, val, rec, rptr);
148                         if (berr < 0) {
149                                 ecode = NTDB_OFF_TO_ERR(berr);
150                                 goto fail;
151                         }
152                         if (berr) {
153                                 return val & NTDB_OFF_MASK;
154                         }
155                         /* If you want to insert here, make a chain. */
156                         h->old_val = val;
157                 }
158                 return 0;
159         }
160
161         /* Nope?  Iterate through chain. */
162         h->table = val & NTDB_OFF_MASK;
163
164         ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
165         if (ecode != NTDB_SUCCESS) {
166                 goto fail;
167         }
168
169         if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
170                 ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
171                                     NTDB_LOG_ERROR,
172                                     "find_and_lock:"
173                                     " corrupt record %#x at %llu",
174                                     rec_magic(&chdr), (long long)off);
175                 goto fail;
176         }
177
178         h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
179
180         arr = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
181                                rec_data_length(&chdr), true);
182         if (NTDB_PTR_IS_ERR(arr)) {
183                 ecode = NTDB_PTR_ERR(arr);
184                 goto fail;
185         }
186
187         found_empty = false;
188         for (i = 0; i < h->table_size; i++) {
189                 if (arr[i] == 0) {
190                         if (!found_empty) {
191                                 h->bucket = i;
192                                 found_empty = true;
193                         }
194                 } else {
195                         berr = match(ntdb, h->h, &key, arr[i], rec, rptr);
196                         if (berr < 0) {
197                                 ecode = NTDB_OFF_TO_ERR(berr);
198                                 ntdb_access_release(ntdb, arr);
199                                 goto fail;
200                         }
201                         if (berr) {
202                                 /* We found it! */
203                                 h->bucket = i;
204                                 off = arr[i] & NTDB_OFF_MASK;
205                                 ntdb_access_release(ntdb, arr);
206                                 return off;
207                         }
208                 }
209         }
210         if (!found_empty) {
211                 /* Set to any non-zero value */
212                 h->old_val = 1;
213                 h->bucket = i;
214         }
215
216         ntdb_access_release(ntdb, arr);
217         return 0;
218
219 fail:
220         ntdb_unlock_hash(ntdb, h->bucket, ltype);
221         return NTDB_ERR_TO_OFF(ecode);
222 }
223
224 static ntdb_off_t encode_offset(const struct ntdb_context *ntdb,
225                                 ntdb_off_t new_off, uint32_t hash)
226 {
227         ntdb_off_t extra;
228
229         assert((new_off & (1ULL << NTDB_OFF_CHAIN_BIT)) == 0);
230         assert((new_off >> (64 - NTDB_OFF_UPPER_STEAL)) == 0);
231         /* We pack extra hash bits into the upper bits of the offset. */
232         extra = bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL);
233         extra <<= (64 - NTDB_OFF_UPPER_STEAL);
234
235         return new_off | extra;
236 }
237
238 /* Simply overwrite the hash entry we found before. */
239 enum NTDB_ERROR replace_in_hash(struct ntdb_context *ntdb,
240                                 const struct hash_info *h,
241                                 ntdb_off_t new_off)
242 {
243         return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket),
244                               encode_offset(ntdb, new_off, h->h));
245 }
246
247 enum NTDB_ERROR delete_from_hash(struct ntdb_context *ntdb,
248                                  const struct hash_info *h)
249 {
250         return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket), 0);
251 }
252
253
254 enum NTDB_ERROR add_to_hash(struct ntdb_context *ntdb,
255                             const struct hash_info *h,
256                             ntdb_off_t new_off)
257 {
258         enum NTDB_ERROR ecode;
259         ntdb_off_t chain;
260         struct ntdb_used_record chdr;
261         const ntdb_off_t *old;
262         ntdb_off_t *new;
263
264         /* We hit an empty bucket during search?  That's where it goes. */
265         if (!h->old_val) {
266                 return replace_in_hash(ntdb, h, new_off);
267         }
268
269         /* Full at top-level?  Create a 2-element chain. */
270         if (h->table == NTDB_HASH_OFFSET) {
271                 ntdb_off_t pair[2];
272
273                 /* One element is old value, the other is the new value. */
274                 pair[0] = h->old_val;
275                 pair[1] = encode_offset(ntdb, new_off, h->h);
276
277                 chain = alloc(ntdb, 0, sizeof(pair), NTDB_CHAIN_MAGIC, true);
278                 if (NTDB_OFF_IS_ERR(chain)) {
279                         return NTDB_OFF_TO_ERR(chain);
280                 }
281                 ecode = ntdb_write_convert(ntdb,
282                                            chain
283                                            + sizeof(struct ntdb_used_record),
284                                            pair, sizeof(pair));
285                 if (ecode == NTDB_SUCCESS) {
286                         ecode = ntdb_write_off(ntdb,
287                                                hbucket_off(h->table, h->bucket),
288                                                chain
289                                                | (1ULL << NTDB_OFF_CHAIN_BIT));
290                 }
291                 return ecode;
292         }
293
294         /* Full bucket.  Expand. */
295         ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
296         if (ecode != NTDB_SUCCESS) {
297                 return ecode;
298         }
299
300         if (rec_extra_padding(&chdr) >= sizeof(new_off)) {
301                 /* Expand in place. */
302                 uint64_t dlen = rec_data_length(&chdr);
303
304                 ecode = set_header(ntdb, &chdr, NTDB_CHAIN_MAGIC, 0,
305                                    dlen + sizeof(new_off),
306                                    dlen + rec_extra_padding(&chdr));
307
308                 if (ecode != NTDB_SUCCESS) {
309                         return ecode;
310                 }
311                 /* find_and_lock set up h to point to last bucket. */
312                 ecode = replace_in_hash(ntdb, h, new_off);
313                 if (ecode != NTDB_SUCCESS) {
314                         return ecode;
315                 }
316                 ecode = ntdb_write_convert(ntdb, h->table, &chdr, sizeof(chdr));
317                 if (ecode != NTDB_SUCCESS) {
318                         return ecode;
319                 }
320                 /* For futureproofing, we always make the first byte of padding
321                  * a zero. */
322                 if (rec_extra_padding(&chdr)) {
323                         ecode = ntdb->io->twrite(ntdb, h->table + sizeof(chdr)
324                                                  + dlen + sizeof(new_off),
325                                                  "", 1);
326                 }
327                 return ecode;
328         }
329
330         /* We need to reallocate the chain. */
331         chain = alloc(ntdb, 0, (h->table_size + 1) * sizeof(ntdb_off_t),
332                       NTDB_CHAIN_MAGIC, true);
333         if (NTDB_OFF_IS_ERR(chain)) {
334                 return NTDB_OFF_TO_ERR(chain);
335         }
336
337         /* Map both and copy across old buckets. */
338         old = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
339                                h->table_size*sizeof(ntdb_off_t), true);
340         if (NTDB_PTR_IS_ERR(old)) {
341                 return NTDB_PTR_ERR(old);
342         }
343         new = ntdb_access_write(ntdb, hbucket_off(chain, 0),
344                                 (h->table_size + 1)*sizeof(ntdb_off_t), true);
345         if (NTDB_PTR_IS_ERR(new)) {
346                 ntdb_access_release(ntdb, old);
347                 return NTDB_PTR_ERR(new);
348         }
349
350         memcpy(new, old, h->bucket * sizeof(ntdb_off_t));
351         new[h->bucket] = encode_offset(ntdb, new_off, h->h);
352         ntdb_access_release(ntdb, old);
353
354         ecode = ntdb_access_commit(ntdb, new);
355         if (ecode != NTDB_SUCCESS) {
356                 return ecode;
357         }
358
359         /* Free the old chain. */
360         ecode = add_free_record(ntdb, h->table,
361                                 sizeof(struct ntdb_used_record)
362                                 + rec_data_length(&chdr)
363                                 + rec_extra_padding(&chdr),
364                                 NTDB_LOCK_WAIT, true);
365
366         /* Replace top-level to point to new chain */
367         return ntdb_write_off(ntdb,
368                               hbucket_off(NTDB_HASH_OFFSET,
369                                           bits_from(h->h, 0, ntdb->hash_bits)),
370                               chain | (1ULL << NTDB_OFF_CHAIN_BIT));
371 }
372
373 /* Traverse support: returns offset of record, or 0 or -ve error. */
374 static ntdb_off_t iterate_chain(struct ntdb_context *ntdb,
375                                 ntdb_off_t val,
376                                 struct hash_info *h)
377 {
378         ntdb_off_t i;
379         enum NTDB_ERROR ecode;
380         struct ntdb_used_record chdr;
381
382         /* First load up chain header. */
383         h->table = val & NTDB_OFF_MASK;
384         ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
385         if (ecode != NTDB_SUCCESS) {
386                 return ecode;
387         }
388
389         if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
390                 return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
391                                    NTDB_LOG_ERROR,
392                                    "get_table:"
393                                    " corrupt record %#x at %llu",
394                                    rec_magic(&chdr),
395                                    (long long)h->table);
396         }
397
398         /* Chain length is implied by data length. */
399         h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
400
401         i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0), h->bucket,
402                                   h->table_size);
403         if (NTDB_OFF_IS_ERR(i)) {
404                 return i;
405         }
406
407         if (i != h->table_size) {
408                 /* Return to next bucket. */
409                 h->bucket = i + 1;
410                 val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
411                 if (NTDB_OFF_IS_ERR(val)) {
412                         return val;
413                 }
414                 return val & NTDB_OFF_MASK;
415         }
416
417         /* Go back up to hash table. */
418         h->table = NTDB_HASH_OFFSET;
419         h->table_size = 1 << ntdb->hash_bits;
420         h->bucket = bits_from(h->h, 0, ntdb->hash_bits) + 1;
421         return 0;
422 }
423
424 /* Keeps hash locked unless returns 0 or error. */
425 static ntdb_off_t lock_and_iterate_hash(struct ntdb_context *ntdb,
426                                         struct hash_info *h)
427 {
428         ntdb_off_t val, i;
429         enum NTDB_ERROR ecode;
430
431         if (h->table != NTDB_HASH_OFFSET) {
432                 /* We're in a chain. */
433                 i = bits_from(h->h, 0, ntdb->hash_bits);
434                 ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
435                 if (ecode != NTDB_SUCCESS) {
436                         return NTDB_ERR_TO_OFF(ecode);
437                 }
438
439                 /* We dropped lock, bucket might have moved! */
440                 val = ntdb_read_off(ntdb, hbucket_off(NTDB_HASH_OFFSET, i));
441                 if (NTDB_OFF_IS_ERR(val)) {
442                         goto unlock;
443                 }
444
445                 /* We don't remove chains: there should still be one there! */
446                 if (!val || !is_chain(val)) {
447                         ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
448                                             NTDB_LOG_ERROR,
449                                             "iterate_hash:"
450                                             " vanished hchain %llu at %llu",
451                                             (long long)val,
452                                             (long long)i);
453                         val = NTDB_ERR_TO_OFF(ecode);
454                         goto unlock;
455                 }
456
457                 /* Find next bucket in the chain. */
458                 val = iterate_chain(ntdb, val, h);
459                 if (NTDB_OFF_IS_ERR(val)) {
460                         goto unlock;
461                 }
462                 if (val != 0) {
463                         return val;
464                 }
465                 ntdb_unlock_hash(ntdb, i, F_RDLCK);
466
467                 /* OK, we've reset h back to top level. */
468         }
469
470         /* We do this unlocked, then re-check. */
471         for (i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
472                                        h->bucket, h->table_size);
473              i != h->table_size;
474              i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
475                                        i+1, h->table_size)) {
476                 ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
477                 if (ecode != NTDB_SUCCESS) {
478                         return NTDB_ERR_TO_OFF(ecode);
479                 }
480
481                 val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
482                 if (NTDB_OFF_IS_ERR(val)) {
483                         goto unlock;
484                 }
485
486                 /* Lost race, and it's empty? */
487                 if (!val) {
488                         ntdb->stats.traverse_val_vanished++;
489                         ntdb_unlock_hash(ntdb, i, F_RDLCK);
490                         continue;
491                 }
492
493                 if (!is_chain(val)) {
494                         /* So caller knows what lock to free. */
495                         h->h = i;
496                         /* Return to next bucket. */
497                         h->bucket = i + 1;
498                         val &= NTDB_OFF_MASK;
499                         return val;
500                 }
501
502                 /* Start at beginning of chain */
503                 h->bucket = 0;
504                 h->h = i;
505
506                 val = iterate_chain(ntdb, val, h);
507                 if (NTDB_OFF_IS_ERR(val)) {
508                         goto unlock;
509                 }
510                 if (val != 0) {
511                         return val;
512                 }
513
514                 /* Otherwise, bucket has been set to i+1 */
515                 ntdb_unlock_hash(ntdb, i, F_RDLCK);
516         }
517         return 0;
518
519 unlock:
520         ntdb_unlock_hash(ntdb, i, F_RDLCK);
521         return val;
522 }
523
524 /* Return success if we find something, NTDB_ERR_NOEXIST if none. */
525 enum NTDB_ERROR next_in_hash(struct ntdb_context *ntdb,
526                              struct hash_info *h,
527                              NTDB_DATA *kbuf, size_t *dlen)
528 {
529         ntdb_off_t off;
530         struct ntdb_used_record rec;
531         enum NTDB_ERROR ecode;
532
533         off = lock_and_iterate_hash(ntdb, h);
534
535         if (NTDB_OFF_IS_ERR(off)) {
536                 return NTDB_OFF_TO_ERR(off);
537         } else if (off == 0) {
538                 return NTDB_ERR_NOEXIST;
539         }
540
541         /* The hash for this key is still locked. */
542         ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
543         if (ecode != NTDB_SUCCESS) {
544                 goto unlock;
545         }
546         if (rec_magic(&rec) != NTDB_USED_MAGIC) {
547                 ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
548                                     NTDB_LOG_ERROR,
549                                     "next_in_hash:"
550                                     " corrupt record at %llu",
551                                     (long long)off);
552                 goto unlock;
553         }
554
555         kbuf->dsize = rec_key_length(&rec);
556
557         /* They want data as well? */
558         if (dlen) {
559                 *dlen = rec_data_length(&rec);
560                 kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
561                                              kbuf->dsize + *dlen);
562         } else {
563                 kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
564                                              kbuf->dsize);
565         }
566         if (NTDB_PTR_IS_ERR(kbuf->dptr)) {
567                 ecode = NTDB_PTR_ERR(kbuf->dptr);
568                 goto unlock;
569         }
570         ecode = NTDB_SUCCESS;
571
572 unlock:
573         ntdb_unlock_hash(ntdb, bits_from(h->h, 0, ntdb->hash_bits), F_RDLCK);
574         return ecode;
575
576 }
577
578 enum NTDB_ERROR first_in_hash(struct ntdb_context *ntdb,
579                              struct hash_info *h,
580                              NTDB_DATA *kbuf, size_t *dlen)
581 {
582         h->table = NTDB_HASH_OFFSET;
583         h->table_size = 1 << ntdb->hash_bits;
584         h->bucket = 0;
585
586         return next_in_hash(ntdb, h, kbuf, dlen);
587 }
588
589 /* Even if the entry isn't in this hash bucket, you'd have to lock this
590  * bucket to find it. */
591 static enum NTDB_ERROR chainlock(struct ntdb_context *ntdb,
592                                  const NTDB_DATA *key, int ltype)
593 {
594         uint32_t h = ntdb_hash(ntdb, key->dptr, key->dsize);
595
596         return ntdb_lock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), ltype);
597 }
598
599 /* lock/unlock one hash chain. This is meant to be used to reduce
600    contention - it cannot guarantee how many records will be locked */
601 _PUBLIC_ enum NTDB_ERROR ntdb_chainlock(struct ntdb_context *ntdb, NTDB_DATA key)
602 {
603         return chainlock(ntdb, &key, F_WRLCK);
604 }
605
606 _PUBLIC_ void ntdb_chainunlock(struct ntdb_context *ntdb, NTDB_DATA key)
607 {
608         uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
609
610         ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_WRLCK);
611 }
612
613 _PUBLIC_ enum NTDB_ERROR ntdb_chainlock_read(struct ntdb_context *ntdb,
614                                              NTDB_DATA key)
615 {
616         return chainlock(ntdb, &key, F_RDLCK);
617 }
618
619 _PUBLIC_ void ntdb_chainunlock_read(struct ntdb_context *ntdb, NTDB_DATA key)
620 {
621         uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
622
623         ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_RDLCK);
624 }