git.ozlabs.org Git - ccan/blob - ccan/tdb2/free.c
tdb2: stricter ordering on expansion lock
[ccan] / ccan / tdb2 / free.c
1  /* 
2    Trivial Database 2: free list/block handling
3    Copyright (C) Rusty Russell 2010
4    
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/likely/likely.h>
20 #include <ccan/ilog/ilog.h>
21 #include <time.h>
22 #include <assert.h>
23 #include <limits.h>
24
/* Index of the highest set bit in val, via ccan's ilog64. */
static unsigned fls64(uint64_t val)
{
	unsigned bits = ilog64(val);
	return bits;
}
29
30 /* In which bucket would we find a particular record size? (ignoring header) */
31 unsigned int size_to_bucket(tdb_len_t data_len)
32 {
33         unsigned int bucket;
34
35         /* We can't have records smaller than this. */
36         assert(data_len >= TDB_MIN_DATA_LEN);
37
38         /* Ignoring the header... */
39         if (data_len - TDB_MIN_DATA_LEN <= 64) {
40                 /* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
41                 bucket = (data_len - TDB_MIN_DATA_LEN) / 8;
42         } else {
43                 /* After that we go power of 2. */
44                 bucket = fls64(data_len - TDB_MIN_DATA_LEN) + 2;
45         }
46
47         if (unlikely(bucket >= TDB_FREE_BUCKETS))
48                 bucket = TDB_FREE_BUCKETS - 1;
49         return bucket;
50 }
51
52 tdb_off_t first_flist(struct tdb_context *tdb)
53 {
54         return tdb_read_off(tdb, offsetof(struct tdb_header, free_list));
55 }
56
57 tdb_off_t next_flist(struct tdb_context *tdb, tdb_off_t flist)
58 {
59         return tdb_read_off(tdb, flist + offsetof(struct tdb_freelist, next));
60 }
61
62 int tdb_flist_init(struct tdb_context *tdb)
63 {
64         /* Use reservoir sampling algorithm to select a free list at random. */
65         unsigned int rnd, max = 0;
66         tdb_off_t off;
67
68         tdb->flist_off = off = first_flist(tdb);
69
70         while (off) {
71                 if (off == TDB_OFF_ERR)
72                         return -1;
73
74                 rnd = random();
75                 if (rnd >= max) {
76                         tdb->flist_off = off;
77                         max = rnd;
78                 }
79
80                 off = next_flist(tdb, off);
81         }
82         return 0;
83 }
84
85 /* Offset of a given bucket. */
86 tdb_off_t bucket_off(tdb_off_t flist_off, unsigned bucket)
87 {
88         return flist_off + offsetof(struct tdb_freelist, buckets)
89                 + bucket * sizeof(tdb_off_t);
90 }
91
92 /* Returns free_buckets + 1, or list number to search. */
93 static tdb_off_t find_free_head(struct tdb_context *tdb,
94                                 tdb_off_t flist_off,
95                                 tdb_off_t bucket)
96 {
97         /* Speculatively search for a non-zero bucket. */
98         return tdb_find_nonzero_off(tdb, bucket_off(flist_off, 0),
99                                     bucket, TDB_FREE_BUCKETS);
100 }
101
102 /* Remove from free bucket. */
103 static int remove_from_list(struct tdb_context *tdb,
104                             tdb_off_t b_off, tdb_off_t r_off,
105                             struct tdb_free_record *r)
106 {
107         tdb_off_t off;
108
109         /* Front of list? */
110         if (r->prev == 0) {
111                 off = b_off;
112         } else {
113                 off = r->prev + offsetof(struct tdb_free_record, next);
114         }
115
116 #ifdef DEBUG
117         if (tdb_read_off(tdb, off) != r_off) {
118                 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
119                          "remove_from_list: %llu bad prev in list %llu\n",
120                          (long long)r_off, (long long)b_off);
121                 return -1;
122         }
123 #endif
124
125         /* r->prev->next = r->next */
126         if (tdb_write_off(tdb, off, r->next)) {
127                 return -1;
128         }
129
130         if (r->next != 0) {
131                 off = r->next + offsetof(struct tdb_free_record, prev);
132                 /* r->next->prev = r->prev */
133
134 #ifdef DEBUG
135                 if (tdb_read_off(tdb, off) != r_off) {
136                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
137                                  "remove_from_list: %llu bad list %llu\n",
138                                  (long long)r_off, (long long)b_off);
139                         return -1;
140                 }
141 #endif
142
143                 if (tdb_write_off(tdb, off, r->prev)) {
144                         return -1;
145                 }
146         }
147         return 0;
148 }
149
150 /* Enqueue in this free bucket. */
151 static int enqueue_in_free(struct tdb_context *tdb,
152                            tdb_off_t b_off,
153                            tdb_off_t off,
154                            struct tdb_free_record *new)
155 {
156         new->prev = 0;
157         /* new->next = head. */
158         new->next = tdb_read_off(tdb, b_off);
159         if (new->next == TDB_OFF_ERR)
160                 return -1;
161
162         if (new->next) {
163 #ifdef DEBUG
164                 if (tdb_read_off(tdb,
165                                  new->next
166                                  + offsetof(struct tdb_free_record, prev))
167                     != 0) {
168                         tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
169                                  "enqueue_in_free: %llu bad head prev %llu\n",
170                                  (long long)new->next, (long long)b_off);
171                         return -1;
172                 }
173 #endif
174                 /* next->prev = new. */
175                 if (tdb_write_off(tdb, new->next
176                                   + offsetof(struct tdb_free_record, prev),
177                                   off) != 0)
178                         return -1;
179         }
180         /* head = new */
181         if (tdb_write_off(tdb, b_off, off) != 0)
182                 return -1;
183
184         return tdb_write_convert(tdb, off, new, sizeof(*new));
185 }
186
187 /* List need not be locked. */
188 int add_free_record(struct tdb_context *tdb,
189                     tdb_off_t off, tdb_len_t len_with_header)
190 {
191         struct tdb_free_record new;
192         tdb_off_t b_off;
193         int ret;
194
195         assert(len_with_header >= sizeof(new));
196
197         new.magic_and_meta = TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL)
198                 | tdb->flist_off;
199         new.data_len = len_with_header - sizeof(struct tdb_used_record);
200
201         b_off = bucket_off(tdb->flist_off, size_to_bucket(new.data_len));
202         if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0)
203                 return -1;
204
205         ret = enqueue_in_free(tdb, b_off, off, &new);
206         tdb_unlock_free_bucket(tdb, b_off);
207         return ret;
208 }
209
210 static size_t adjust_size(size_t keylen, size_t datalen)
211 {
212         size_t size = keylen + datalen;
213
214         if (size < TDB_MIN_DATA_LEN)
215                 size = TDB_MIN_DATA_LEN;
216
217         /* Round to next uint64_t boundary. */
218         return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
219 }
220
221 /* If we have enough left over to be useful, split that off. */
222 static size_t record_leftover(size_t keylen, size_t datalen,
223                               bool want_extra, size_t total_len)
224 {
225         ssize_t leftover;
226
227         if (want_extra)
228                 datalen += datalen / 2;
229         leftover = total_len - adjust_size(keylen, datalen);
230
231         if (leftover < (ssize_t)sizeof(struct tdb_free_record))
232                 return 0;
233
234         return leftover;
235 }
236
237 /* Note: we unlock the current bucket if we coalesce or fail. */
238 static int coalesce(struct tdb_context *tdb,
239                     tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
240 {
241         struct tdb_free_record pad, *r;
242         tdb_off_t end;
243
244         end = off + sizeof(struct tdb_used_record) + data_len;
245
246         while (end < tdb->map_size) {
247                 tdb_off_t nb_off;
248
249                 /* FIXME: do tdb_get here and below really win? */
250                 r = tdb_get(tdb, end, &pad, sizeof(pad));
251                 if (!r)
252                         goto err;
253
254                 if (frec_magic(r) != TDB_FREE_MAGIC)
255                         break;
256
257                 nb_off = bucket_off(frec_flist(r), size_to_bucket(r->data_len));
258
259                 /* We may be violating lock order here, so best effort. */
260                 if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1)
261                         break;
262
263                 /* Now we have lock, re-check. */
264                 r = tdb_get(tdb, end, &pad, sizeof(pad));
265                 if (!r) {
266                         tdb_unlock_free_bucket(tdb, nb_off);
267                         goto err;
268                 }
269
270                 if (unlikely(frec_magic(r) != TDB_FREE_MAGIC)) {
271                         tdb_unlock_free_bucket(tdb, nb_off);
272                         break;
273                 }
274
275                 if (unlikely(bucket_off(frec_flist(r),
276                                         size_to_bucket(r->data_len))
277                              != nb_off)) {
278                         tdb_unlock_free_bucket(tdb, nb_off);
279                         break;
280                 }
281
282                 if (remove_from_list(tdb, nb_off, end, r) == -1) {
283                         tdb_unlock_free_bucket(tdb, nb_off);
284                         goto err;
285                 }
286
287                 end += sizeof(struct tdb_used_record) + r->data_len;
288                 tdb_unlock_free_bucket(tdb, nb_off);
289         }
290
291         /* Didn't find any adjacent free? */
292         if (end == off + sizeof(struct tdb_used_record) + data_len)
293                 return 0;
294
295         /* OK, expand record */
296         r = tdb_get(tdb, off, &pad, sizeof(pad));
297         if (!r)
298                 goto err;
299
300         if (r->data_len != data_len) {
301                 tdb->ecode = TDB_ERR_CORRUPT;
302                 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
303                          "coalesce: expected data len %llu not %llu\n",
304                          (long long)data_len, (long long)r->data_len);
305                 goto err;
306         }
307
308         if (remove_from_list(tdb, b_off, off, r) == -1)
309                 goto err;
310
311         r = tdb_access_write(tdb, off, sizeof(*r), true);
312         if (!r)
313                 goto err;
314
315         /* We have to drop this to avoid deadlocks, so make sure record
316          * doesn't get coalesced by someone else! */
317         r->magic_and_meta = TDB_COALESCING_MAGIC << (64 - TDB_OFF_UPPER_STEAL);
318         r->data_len = end - off - sizeof(struct tdb_used_record);
319         if (tdb_access_commit(tdb, r) != 0)
320                 goto err;
321
322         tdb_unlock_free_bucket(tdb, b_off);
323
324         if (add_free_record(tdb, off, end - off) == -1)
325                 return -1;
326         return 1;
327
328 err:
329         /* To unify error paths, we *always* unlock bucket on error. */
330         tdb_unlock_free_bucket(tdb, b_off);
331         return -1;
332 }
333
334 /* We need size bytes to put our key and data in. */
335 static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
336                                 tdb_off_t flist_off,
337                                 tdb_off_t bucket,
338                                 size_t keylen, size_t datalen,
339                                 bool want_extra,
340                                 unsigned hashlow)
341 {
342         tdb_off_t off, b_off,best_off;
343         struct tdb_free_record pad, best = { 0 }, *r;
344         double multiplier;
345         size_t size = adjust_size(keylen, datalen);
346
347 again:
348         b_off = bucket_off(flist_off, bucket);
349
350         /* FIXME: Try non-blocking wait first, to measure contention. */
351         /* Lock this bucket. */
352         if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == -1) {
353                 return TDB_OFF_ERR;
354         }
355
356         best.data_len = -1ULL;
357         best_off = 0;
358
359         /* Get slack if we're after extra. */
360         if (want_extra)
361                 multiplier = 1.5;
362         else
363                 multiplier = 1.0;
364
365         /* Walk the list to see if any are large enough, getting less fussy
366          * as we go. */
367         off = tdb_read_off(tdb, b_off);
368         if (unlikely(off == TDB_OFF_ERR))
369                 goto unlock_err;
370
371         while (off) {
372                 /* FIXME: Does tdb_get win anything here? */
373                 r = tdb_get(tdb, off, &pad, sizeof(*r));
374                 if (!r)
375                         goto unlock_err;
376
377                 if (frec_magic(r) != TDB_FREE_MAGIC) {
378                         tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
379                                  "lock_and_alloc: %llu non-free 0x%llx\n",
380                                  (long long)off, (long long)r->magic_and_meta);
381                         goto unlock_err;
382                 }
383
384                 if (r->data_len >= size && r->data_len < best.data_len) {
385                         best_off = off;
386                         best = *r;
387                 }
388
389                 if (best.data_len < size * multiplier && best_off)
390                         break;
391
392                 multiplier *= 1.01;
393
394                 /* Since we're going slow anyway, try coalescing here. */
395                 switch (coalesce(tdb, off, b_off, r->data_len)) {
396                 case -1:
397                         /* This has already unlocked on error. */
398                         return -1;
399                 case 1:
400                         /* This has unlocked list, restart. */
401                         goto again;
402                 }
403                 off = r->next;
404         }
405
406         /* If we found anything at all, use it. */
407         if (best_off) {
408                 struct tdb_used_record rec;
409                 size_t leftover;
410
411                 /* We're happy with this size: take it. */
412                 if (remove_from_list(tdb, b_off, best_off, &best) != 0)
413                         goto unlock_err;
414
415                 leftover = record_leftover(keylen, datalen, want_extra,
416                                            best.data_len);
417
418                 assert(keylen + datalen + leftover <= best.data_len);
419                 /* We need to mark non-free before we drop lock, otherwise
420                  * coalesce() could try to merge it! */
421                 if (set_header(tdb, &rec, keylen, datalen,
422                                best.data_len - leftover,
423                                hashlow) != 0)
424                         goto unlock_err;
425
426                 if (tdb_write_convert(tdb, best_off, &rec, sizeof(rec)) != 0)
427                         goto unlock_err;
428
429                 tdb_unlock_free_bucket(tdb, b_off);
430
431                 if (leftover) {
432                         if (add_free_record(tdb,
433                                             best_off + sizeof(rec)
434                                             + best.data_len - leftover,
435                                             leftover))
436                                 return TDB_OFF_ERR;
437                 }
438                 return best_off;
439         }
440
441         tdb_unlock_free_bucket(tdb, b_off);
442         return 0;
443
444 unlock_err:
445         tdb_unlock_free_bucket(tdb, b_off);
446         return TDB_OFF_ERR;
447 }
448
449 /* Get a free block from current free list, or 0 if none. */
450 static tdb_off_t get_free(struct tdb_context *tdb,
451                           size_t keylen, size_t datalen, bool want_extra,
452                           unsigned hashlow)
453 {
454         tdb_off_t off, flist;
455         unsigned start_b, b;
456         bool wrapped = false;
457
458         /* If they are growing, add 50% to get to higher bucket. */
459         if (want_extra)
460                 start_b = size_to_bucket(adjust_size(keylen,
461                                                      datalen + datalen / 2));
462         else
463                 start_b = size_to_bucket(adjust_size(keylen, datalen));
464
465         flist = tdb->flist_off;
466         while (!wrapped || flist != tdb->flist_off) {
467                 /* Start at exact size bucket, and search up... */
468                 for (b = find_free_head(tdb, flist, start_b);
469                      b < TDB_FREE_BUCKETS;
470                      b = find_free_head(tdb, flist, b + 1)) {
471                         /* Try getting one from list. */
472                         off = lock_and_alloc(tdb, flist,
473                                              b, keylen, datalen, want_extra,
474                                              hashlow);
475                         if (off == TDB_OFF_ERR)
476                                 return TDB_OFF_ERR;
477                         if (off != 0) {
478                                 /* Worked?  Stay using this list. */
479                                 tdb->flist_off = flist;
480                                 return off;
481                         }
482                         /* Didn't work.  Try next bucket. */
483                 }
484
485                 /* Hmm, try next list. */
486                 flist = next_flist(tdb, flist);
487                 if (flist == 0) {
488                         wrapped = true;
489                         flist = first_flist(tdb);
490                 }
491         }
492
493         return 0;
494 }
495
496 int set_header(struct tdb_context *tdb,
497                struct tdb_used_record *rec,
498                uint64_t keylen, uint64_t datalen,
499                uint64_t actuallen, unsigned hashlow)
500 {
501         uint64_t keybits = (fls64(keylen) + 1) / 2;
502
503         /* Use bottom bits of hash, so it's independent of hash table size. */
504         rec->magic_and_meta = (hashlow & ((1 << 11)-1))
505                 | ((actuallen - (keylen + datalen)) << 11)
506                 | (keybits << 43)
507                 | (TDB_MAGIC << 48);
508         rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
509
510         /* Encoding can fail on big values. */
511         if (rec_key_length(rec) != keylen
512             || rec_data_length(rec) != datalen
513             || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
514                 tdb->ecode = TDB_ERR_IO;
515                 tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
516                          "Could not encode k=%llu,d=%llu,a=%llu\n",
517                          (long long)keylen, (long long)datalen,
518                          (long long)actuallen);
519                 return -1;
520         }
521         return 0;
522 }
523
524 /* Expand the database. */
525 static int tdb_expand(struct tdb_context *tdb, tdb_len_t size)
526 {
527         uint64_t old_size;
528         tdb_len_t wanted;
529
530         /* We need room for the record header too. */
531         wanted = sizeof(struct tdb_used_record) + size;
532
533         /* Need to hold a hash lock to expand DB: transactions rely on it. */
534         if (!(tdb->flags & TDB_NOLOCK)
535             && !tdb->allrecord_lock.count && !tdb_has_hash_locks(tdb)) {
536                 tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
537                          "tdb_expand: must hold lock during expand\n");
538                 return -1;
539         }
540
541         /* Only one person can expand file at a time. */
542         if (tdb_lock_expand(tdb, F_WRLCK) != 0)
543                 return -1;
544
545         /* Someone else may have expanded the file, so retry. */
546         old_size = tdb->map_size;
547         tdb->methods->oob(tdb, tdb->map_size + 1, true);
548         if (tdb->map_size != old_size) {
549                 tdb_unlock_expand(tdb, F_WRLCK);
550                 return 0;
551         }
552
553         if (tdb->methods->expand_file(tdb, wanted*TDB_EXTENSION_FACTOR) == -1) {
554                 tdb_unlock_expand(tdb, F_WRLCK);
555                 return -1;
556         }
557
558         /* We need to drop this lock before adding free record. */
559         tdb_unlock_expand(tdb, F_WRLCK);
560
561         return add_free_record(tdb, old_size, wanted * TDB_EXTENSION_FACTOR);
562 }
563
564 /* This won't fail: it will expand the database if it has to. */
565 tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
566                 uint64_t hash, bool growing)
567 {
568         tdb_off_t off;
569
570         /* We can't hold pointers during this: we could unmap! */
571         assert(!tdb->direct_access);
572
573         for (;;) {
574                 off = get_free(tdb, keylen, datalen, growing, hash);
575                 if (likely(off != 0))
576                         break;
577
578                 if (tdb_expand(tdb, adjust_size(keylen, datalen)))
579                         return TDB_OFF_ERR;
580         }
581
582         return off;
583 }