47a597091d5a1df8f056a234298a48d143522c4c
[ccan] / ccan / alloc / alloc.c
1 /* Licensed under LGPLv2.1+ - see LICENSE file for details */
2 #include <unistd.h>
3 #include <stdint.h>
4 #include <string.h>
5 #include <limits.h>
6 #include <assert.h>
7 #include <stdlib.h>
8 #include "alloc.h"
9 #include "bitops.h"
10 #include "tiny.h"
11 #include <ccan/build_assert/build_assert.h>
12 #include <ccan/likely/likely.h>
13 #include <ccan/alignof/alignof.h>
14 #include <ccan/short_types/short_types.h>
15 #include <ccan/compiler/compiler.h>
16 #include "config.h"
17
18 /*
19    Inspired by (and parts taken from) Andrew Tridgell's alloc_mmap:
20    http://samba.org/~tridge/junkcode/alloc_mmap/
21
22    Copyright (C) Andrew Tridgell 2007
23    
24    This library is free software; you can redistribute it and/or
25    modify it under the terms of the GNU Lesser General Public
26    License as published by the Free Software Foundation; either
27    version 2 of the License, or (at your option) any later version.
28
29    This library is distributed in the hope that it will be useful,
30    but WITHOUT ANY WARRANTY; without even the implied warranty of
31    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
32    Lesser General Public License for more details.
33
34    You should have received a copy of the GNU Lesser General Public
35    License along with this library; if not, write to the Free Software
36    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
37  */
38
39 /* We divide the pool into this many large pages (nearest power of 2) */
40 #define MAX_LARGE_PAGES (256UL)
41
42 /* 32 small pages == 1 large page. */
43 #define BITS_FROM_SMALL_TO_LARGE_PAGE 5
44
45 #define MAX_SMALL_PAGES (MAX_LARGE_PAGES << BITS_FROM_SMALL_TO_LARGE_PAGE)
46
47 /* Smallest pool size for this scheme: 128-byte small pages.  That's
48  * 9/13% overhead for 32/64 bit. */
49 #define MIN_USEFUL_SIZE (MAX_SMALL_PAGES * 128)
50
51 /* Every 4 buckets, we jump up a power of 2. ...8 10 12 14 16 20 24 28 32... */
52 #define INTER_BUCKET_SPACE 4
53
54 #define SMALL_PAGES_PER_LARGE_PAGE (1 << BITS_FROM_SMALL_TO_LARGE_PAGE)
55
56 /* FIXME: Figure this out properly. */
57 #define MAX_SIZE (1 << 30)
58
59 /* How few object to fit in a page before using a larger one? (8) */
60 #define MAX_PAGE_OBJECT_ORDER   3
61
62 #define BITS_PER_LONG (sizeof(long) * CHAR_BIT)
63
/* Per-bucket allocation state. */
struct bucket_state {
        u32 elements_per_page;  /* Objects that fit in one page of this bucket. */
        u16 page_list;          /* Head (page number) of pages with free slots; 0 = none. */
        u16 full_list;          /* Head (page number) of completely-full pages; 0 = none. */
};
69
/* Pool header, stored at the very start of the pool memory. */
struct header {
        /* Bitmap of which pages are large. */
        unsigned long pagesize[MAX_LARGE_PAGES / BITS_PER_LONG];

        /* List of unused small/large pages (heads are page numbers; 0 = empty). */
        u16 small_free_list;
        u16 large_free_list;

        /* List of huge allocs (byte offset of first huge_alloc; 0 = empty). */
        unsigned long huge;

        /* This is less defined: we have two buckets for each power of 2 */
        struct bucket_state bs[1];
};
84
/* Tracking record for one huge allocation; itself allocated from the pool
 * and linked into head->huge.  All fields are byte offsets/lengths. */
struct huge_alloc {
        unsigned long next, prev;       /* Byte offsets of list neighbours; 0 terminates. */
        unsigned long off, len;         /* Byte offset and length of the allocation. */
};
89
/* Header at the front of every page that serves bucket allocations. */
struct page_header {
        u16 next, prev;         /* Page numbers for list links; 0 terminates. */
        /* FIXME: We can just count all-0 and all-1 used[] elements. */
        unsigned elements_used : 25;    /* Number of allocated elements in this page. */
        unsigned bucket : 7;            /* Which bucket this page belongs to. */
        unsigned long used[1]; /* One bit per element. */
};
97
98 /*
99  * Every 4 buckets, the size doubles.
100  * Between buckets, sizes increase linearly.
101  *
102  * eg. bucket 40 = 2^10                 = 1024
103  *     bucket 41 = 2^10 + 2^10*4        = 1024 + 256
104  *     bucket 42 = 2^10 + 2^10*4        = 1024 + 512
105  *     bucket 43 = 2^10 + 2^10*4        = 1024 + 768
106  *     bucket 45 = 2^11                 = 2048
107  *
108  * Care is taken to handle low numbered buckets, at cost of overflow.
109  */
110 static unsigned long bucket_to_size(unsigned int bucket)
111 {
112         unsigned long base = 1UL << (bucket / INTER_BUCKET_SPACE);
113         return base + ((bucket % INTER_BUCKET_SPACE)
114                        << (bucket / INTER_BUCKET_SPACE))
115                 / INTER_BUCKET_SPACE;
116 }
117
118 /*
119  * Say size is 10.
120  *   fls(size/2) == 3.  1 << 3 == 8, so we're 2 too large, out of a possible
121  * 8 too large.  That's 1/4 of the way to the next power of 2 == 1 bucket.
122  *
123  * We make sure we round up.  Note that this fails on 32 bit at size
124  * 1879048193 (around bucket 120).
125  */
126 static unsigned int size_to_bucket(unsigned long size)
127 {
128         unsigned int base = afls(size/2);
129         unsigned long overshoot;
130
131         overshoot = size - (1UL << base);
132         return base * INTER_BUCKET_SPACE
133                 + ((overshoot * INTER_BUCKET_SPACE + (1UL << base)-1) >> base);
134 }
135
/* log2 of the small page size for this pool: the smallest power of 2 such
 * that MAX_SMALL_PAGES pages of that size cover poolsize. */
static unsigned int small_page_bits(unsigned long poolsize)
{
        return afls(poolsize / MAX_SMALL_PAGES - 1);
}
140
/* Convert a (small) page number into its page_header address. */
static struct page_header *from_pgnum(struct header *head,
                                      unsigned long pgnum,
                                      unsigned sp_bits)
{
        char *base = (char *)head;

        return (struct page_header *)(base + (pgnum << sp_bits));
}
147
148 static u16 to_pgnum(struct header *head, void *p, unsigned sp_bits)
149 {
150         return ((char *)p - (char *)head) >> sp_bits;
151 }
152
/* Bytes needed for the used[] bitmap covering num_elements, rounded up to
 * whole longs (one bit per element). */
static size_t used_size(unsigned int num_elements)
{
        return align_up(num_elements, BITS_PER_LONG) / CHAR_BIT;
}
157
/*
 * We always align the first entry to the lower power of 2.
 * eg. the 12-byte bucket gets 8-byte aligned.  The 4096-byte bucket
 * gets 4096-byte aligned.
 */
static unsigned long page_header_size(unsigned int align_bits,
                                      unsigned long num_elements)
{
        unsigned long size;

        /* Fixed header, minus the used[1] placeholder, plus the real bitmap. */
        size = sizeof(struct page_header)
                - sizeof(((struct page_header *)0)->used)
                + used_size(num_elements);
        return align_up(size, 1UL << align_bits);
}
173
/* Push ph onto the front of a page list; *list holds the head page number
 * (0 means the list is empty). */
static void add_to_list(struct header *head,
                        u16 *list, struct page_header *ph, unsigned sp_bits)
{
        unsigned long h = *list, offset = to_pgnum(head, ph, sp_bits);

        ph->next = h;
        if (h) {
                /* The old head must really have been the front of the list. */
                struct page_header *prev = from_pgnum(head, h, sp_bits);
                assert(prev->prev == 0);
                prev->prev = offset;
        }
        *list = offset;
        ph->prev = 0;
}
188
/* Unlink ph from the page list whose head is *list. */
static void del_from_list(struct header *head,
                          u16 *list, struct page_header *ph, unsigned sp_bits)
{
        /* Front of list? */
        if (ph->prev == 0) {
                *list = ph->next;
        } else {
                struct page_header *prev = from_pgnum(head, ph->prev, sp_bits);
                prev->next = ph->next;
        }
        if (ph->next != 0) {
                struct page_header *next = from_pgnum(head, ph->next, sp_bits);
                next->prev = ph->prev;
        }
}
204
205 static u16 pop_from_list(struct header *head,
206                                    u16 *list,
207                                    unsigned int sp_bits)
208 {
209         u16 h = *list;
210         struct page_header *ph = from_pgnum(head, h, sp_bits);
211
212         if (likely(h)) {
213                 *list = ph->next;
214                 if (*list)
215                         from_pgnum(head, *list, sp_bits)->prev = 0;
216         }
217         return h;
218 }
219
/* Push ha onto the front of the huge-alloc list; links are byte offsets
 * from head, not page numbers. */
static void add_to_huge_list(struct header *head, struct huge_alloc *ha)
{
        unsigned long h = head->huge;
        unsigned long offset = (char *)ha - (char *)head;

        ha->next = h;
        if (h) {
                /* The old head must really have been the front of the list. */
                struct huge_alloc *prev = (void *)((char *)head + h);
                assert(prev->prev == 0);
                prev->prev = offset;
        }
        head->huge = offset;
        ha->prev = 0;
}
234
/* Unlink ha from the huge-alloc list (links are byte offsets from head). */
static void del_from_huge(struct header *head, struct huge_alloc *ha)
{
        /* Front of list? */
        if (ha->prev == 0) {
                head->huge = ha->next;
        } else {
                struct huge_alloc *prev = (void *)((char *)head + ha->prev);
                prev->next = ha->next;
        }
        if (ha->next != 0) {
                struct huge_alloc *next = (void *)((char *)head + ha->next);
                next->prev = ha->prev;
        }
}
249
/* Return an empty small page to the pool-wide small free list. */
static void add_small_page_to_freelist(struct header *head,
                                       struct page_header *ph,
                                       unsigned int sp_bits)
{
        add_to_list(head, &head->small_free_list, ph, sp_bits);
}
256
/* Return an empty large page to the pool-wide large free list. */
static void add_large_page_to_freelist(struct header *head,
                                       struct page_header *ph,
                                       unsigned int sp_bits)
{
        add_to_list(head, &head->large_free_list, ph, sp_bits);
}
263
/* Add a page with free slots to its bucket's page list. */
static void add_to_bucket_list(struct header *head,
                               struct bucket_state *bs,
                               struct page_header *ph,
                               unsigned int sp_bits)
{
        add_to_list(head, &bs->page_list, ph, sp_bits);
}
271
/* Remove a page from its bucket's page list. */
static void del_from_bucket_list(struct header *head,
                                 struct bucket_state *bs,
                                 struct page_header *ph,
                                 unsigned int sp_bits)
{
        del_from_list(head, &bs->page_list, ph, sp_bits);
}
279
/* Remove a page from its bucket's full list. */
static void del_from_bucket_full_list(struct header *head,
                                      struct bucket_state *bs,
                                      struct page_header *ph,
                                      unsigned int sp_bits)
{
        del_from_list(head, &bs->full_list, ph, sp_bits);
}
287
/* Add a completely-used page to its bucket's full list. */
static void add_to_bucket_full_list(struct header *head,
                                    struct bucket_state *bs,
                                    struct page_header *ph,
                                    unsigned int sp_bits)
{
        add_to_list(head, &bs->full_list, ph, sp_bits);
}
295
/* Clear bit 'off' in a multi-word bitmap. */
static void clear_bit(unsigned long bitmap[], unsigned int off)
{
        bitmap[off / BITS_PER_LONG] &= ~(1UL << (off % BITS_PER_LONG));
}
300
/* Test bit 'off' in a multi-word bitmap. */
static bool test_bit(const unsigned long bitmap[], unsigned int off)
{
        return bitmap[off / BITS_PER_LONG] & (1UL << (off % BITS_PER_LONG));
}
305
/* Set bit 'off' in a multi-word bitmap. */
static void set_bit(unsigned long bitmap[], unsigned int off)
{
        bitmap[off / BITS_PER_LONG] |= (1UL << (off % BITS_PER_LONG));
}
310
/* Index of the first zero bit.  There must be a bit to be found:
 * the loop has no bound, so a full bitmap would run off the end. */
static unsigned int find_free_bit(const unsigned long bitmap[])
{
        unsigned int i;

        /* Skip over all-ones words, then locate the zero bit within. */
        for (i = 0; bitmap[i] == -1UL; i++);
        return (i*BITS_PER_LONG) + affsl(~bitmap[i]) - 1;
}
319
/* How many elements can we fit in a page? */
static unsigned long elements_per_page(unsigned long align_bits,
                                       unsigned long esize,
                                       unsigned long psize)
{
        unsigned long num, overhead;

        /* First approximation: no extra room for bitmap. */
        overhead = align_up(sizeof(struct page_header), 1UL << align_bits);
        num = (psize - overhead) / esize;

        /* Shrink until the header (whose used[] bitmap grows with num)
         * plus the elements actually fit in the page. */
        while (page_header_size(align_bits, num) + esize * num > psize)
                num--;
        return num;
}
335
/* Should this bucket be served from large pages?  True when fewer than
 * 2^MAX_PAGE_OBJECT_ORDER objects of this bucket would fit in a small page. */
static bool large_page_bucket(unsigned int bucket, unsigned int sp_bits)
{
        unsigned long max_smallsize;

        /* Note: this doesn't take into account page header. */
        max_smallsize = (1UL << sp_bits) >> MAX_PAGE_OBJECT_ORDER;

        return bucket_to_size(bucket) > max_smallsize;
}
345
346 static unsigned int max_bucket(unsigned int lp_bits)
347 {
348         return (lp_bits - MAX_PAGE_OBJECT_ORDER) * INTER_BUCKET_SPACE;
349 }
350
/* Initialize 'pool' for use as an allocator.  Pools below MIN_USEFUL_SIZE
 * are delegated entirely to the tiny allocator. */
void alloc_init(void *pool, unsigned long poolsize)
{
        struct header *head = pool;
        struct page_header *ph;
        unsigned int lp_bits, sp_bits, num_buckets;
        unsigned long header_size, i;

        if (poolsize < MIN_USEFUL_SIZE) {
                tiny_alloc_init(pool, poolsize);
                return;
        }

        /* We rely on page numbers fitting in 16 bit. */
        BUILD_ASSERT(MAX_SMALL_PAGES < 65536);

        sp_bits = small_page_bits(poolsize);
        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        num_buckets = max_bucket(lp_bits);

        /* Header (including one bucket_state per bucket) lives at pool start. */
        head = pool;
        header_size = sizeof(*head) + sizeof(head->bs) * (num_buckets-1);

        memset(head, 0, header_size);
        for (i = 0; i < num_buckets; i++) {
                unsigned long pagesize;

                if (large_page_bucket(i, sp_bits))
                        pagesize = 1UL << lp_bits;
                else
                        pagesize = 1UL << sp_bits;

                head->bs[i].elements_per_page
                        = elements_per_page(i / INTER_BUCKET_SPACE,
                                            bucket_to_size(i),
                                            pagesize);
        }

        /* They start as all large pages. */
        memset(head->pagesize, 0xFF, sizeof(head->pagesize));
        /* FIXME: small pages for last bit? */

        /* Split first page into small pages. */
        assert(header_size < (1UL << lp_bits));
        clear_bit(head->pagesize, 0);

        /* Skip over page(s) used by header, add rest to free list */
        for (i = align_up(header_size, (1UL << sp_bits)) >> sp_bits;
             i < SMALL_PAGES_PER_LARGE_PAGE;
             i++) {
                ph = from_pgnum(head, i, sp_bits);
                ph->elements_used = 0;
                add_small_page_to_freelist(head, ph, sp_bits);
        }

        /* Add the rest of the pages as large pages. */
        i = SMALL_PAGES_PER_LARGE_PAGE;
        while ((i << sp_bits) + (1UL << lp_bits) <= poolsize) {
                assert(i < MAX_SMALL_PAGES);
                ph = from_pgnum(head, i, sp_bits);
                ph->elements_used = 0;
                add_large_page_to_freelist(head, ph, sp_bits);
                i += SMALL_PAGES_PER_LARGE_PAGE;
        }
}
416
/* A large page worth of small pages are free: delete them from free list. */
static void del_large_from_small_free_list(struct header *head,
                                           struct page_header *ph,
                                           unsigned int sp_bits)
{
        unsigned long i;

        /* Each constituent small page is on the small free list individually. */
        for (i = 0; i < SMALL_PAGES_PER_LARGE_PAGE; i++) {
                del_from_list(head, &head->small_free_list,
                              (struct page_header *)((char *)ph
                                                     + (i << sp_bits)),
                              sp_bits);
        }
}
431
432 static bool all_empty(struct header *head,
433                       unsigned long pgnum,
434                       unsigned sp_bits)
435 {
436         unsigned long i;
437
438         for (i = 0; i < SMALL_PAGES_PER_LARGE_PAGE; i++) {
439                 struct page_header *ph = from_pgnum(head, pgnum + i, sp_bits);
440                 if (ph->elements_used)
441                         return false;
442         }
443         return true;
444 }
445
/* Coalesce aligned runs of entirely-free small pages back into large pages. */
static void recombine_small_pages(struct header *head, unsigned long poolsize,
                                  unsigned int sp_bits)
{
        unsigned long i;
        unsigned int lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        /* Look for small pages to coalesce, after first large page. */
        for (i = SMALL_PAGES_PER_LARGE_PAGE;
             i < (poolsize >> lp_bits) << BITS_FROM_SMALL_TO_LARGE_PAGE;
             i += SMALL_PAGES_PER_LARGE_PAGE) {
                /* Already a large page? */
                if (test_bit(head->pagesize, i / SMALL_PAGES_PER_LARGE_PAGE))
                        continue;
                if (all_empty(head, i, sp_bits)) {
                        /* Flip the whole run to large, moving it from the
                         * small free list to the large free list. */
                        struct page_header *ph = from_pgnum(head, i, sp_bits);
                        set_bit(head->pagesize,
                                i / SMALL_PAGES_PER_LARGE_PAGE);
                        del_large_from_small_free_list(head, ph, sp_bits);
                        add_large_page_to_freelist(head, ph, sp_bits);
                }
        }
}
468
469 static u16 get_large_page(struct header *head, unsigned long poolsize,
470                           unsigned int sp_bits)
471 {
472         unsigned int page;
473
474         page = pop_from_list(head, &head->large_free_list, sp_bits);
475         if (likely(page))
476                 return page;
477
478         recombine_small_pages(head, poolsize, sp_bits);
479
480         return pop_from_list(head, &head->large_free_list, sp_bits);
481 }
482
/* Returns small page. */
static unsigned long break_up_large_page(struct header *head,
                                         unsigned int sp_bits,
                                         u16 lpage)
{
        unsigned int i;

        /* Mark this large page as a run of small pages now. */
        clear_bit(head->pagesize, lpage >> BITS_FROM_SMALL_TO_LARGE_PAGE);

        /* Free all but the first small page; that one goes to the caller. */
        for (i = 1; i < SMALL_PAGES_PER_LARGE_PAGE; i++) {
                struct page_header *ph = from_pgnum(head, lpage + i, sp_bits);
                /* Initialize this: huge_alloc reads it. */
                ph->elements_used = 0;
                add_small_page_to_freelist(head, ph, sp_bits);
        }

        return lpage;
}
501
502 static u16 get_small_page(struct header *head, unsigned long poolsize,
503                           unsigned int sp_bits)
504 {
505         u16 ret;
506
507         ret = pop_from_list(head, &head->small_free_list, sp_bits);
508         if (likely(ret))
509                 return ret;
510         ret = get_large_page(head, poolsize, sp_bits);
511         if (likely(ret))
512                 ret = break_up_large_page(head, sp_bits, ret);
513         return ret;
514 }
515
516 static bool huge_allocated(struct header *head, unsigned long offset)
517 {
518         unsigned long i;
519         struct huge_alloc *ha;
520
521         for (i = head->huge; i; i = ha->next) {
522                 ha = (void *)((char *)head + i);
523                 if (ha->off <= offset && ha->off + ha->len > offset)
524                         return true;
525         }
526         return false;
527 }
528
/* They want something really big.  Aim for contiguous pages (slow). */
static COLD void *huge_alloc(void *pool, unsigned long poolsize,
                             unsigned long size, unsigned long align)
{
        struct header *head = pool;
        struct huge_alloc *ha;
        unsigned long i, sp_bits, lp_bits, num, header_size;

        sp_bits = small_page_bits(poolsize);
        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        /* Allocate tracking structure optimistically. */
        ha = alloc_get(pool, poolsize, sizeof(*ha), ALIGNOF(*ha));
        if (!ha)
                return NULL;

        /* First search for contiguous small pages... */
        header_size = sizeof(*head) + sizeof(head->bs) * (max_bucket(lp_bits)-1);

        /* num counts the current run of consecutive usable pages. */
        num = 0;
        for (i = (header_size + (1UL << sp_bits) - 1) >> sp_bits;
             i << sp_bits < poolsize;
             i++) {
                struct page_header *pg;
                unsigned long off = (i << sp_bits);

                /* Skip over large pages. */
                if (test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE)) {
                        i += (1UL << BITS_FROM_SMALL_TO_LARGE_PAGE)-1;
                        continue;
                }

                /* Does this page meet alignment requirements? */
                if (!num && off % align != 0)
                        continue;

                /* FIXME: This makes us O(n^2). */
                if (huge_allocated(head, off)) {
                        num = 0;
                        continue;
                }

                pg = (struct page_header *)((char *)head + off);
                if (pg->elements_used) {
                        num = 0;
                        continue;
                }

                num++;
                if (num << sp_bits >= size) {
                        unsigned long pgnum;

                        /* Remove from free list. */
                        for (pgnum = i; pgnum > i - num; pgnum--) {
                                pg = from_pgnum(head, pgnum, sp_bits);
                                del_from_list(head,
                                              &head->small_free_list,
                                              pg, sp_bits);
                        }
                        ha->off = (i - num + 1) << sp_bits;
                        ha->len = num << sp_bits;
                        goto done;
                }
        }

        /* Now search for large pages... */
        recombine_small_pages(head, poolsize, sp_bits);

        num = 0;
        for (i = (header_size + (1UL << lp_bits) - 1) >> lp_bits;
             (i << lp_bits) < poolsize; i++) {
                struct page_header *pg;
                unsigned long off = (i << lp_bits);

                /* Ignore small pages. */
                if (!test_bit(head->pagesize, i))
                        continue;

                /* Does this page meet alignment requirements? */
                if (!num && off % align != 0)
                        continue;

                /* FIXME: This makes us O(n^2). */
                if (huge_allocated(head, off)) {
                        num = 0;
                        continue;
                }

                pg = (struct page_header *)((char *)head + off);
                if (pg->elements_used) {
                        num = 0;
                        continue;
                }

                num++;
                if (num << lp_bits >= size) {
                        unsigned long pgnum;

                        /* Remove from free list. */
                        for (pgnum = i; pgnum > i - num; pgnum--) {
                                pg = from_pgnum(head, pgnum, lp_bits);
                                del_from_list(head,
                                              &head->large_free_list,
                                              pg, sp_bits);
                        }
                        ha->off = (i - num + 1) << lp_bits;
                        ha->len = num << lp_bits;
                        goto done;
                }
        }

        /* Unable to satisfy: free huge alloc structure. */
        alloc_free(pool, poolsize, ha);
        return NULL;

done:
        add_to_huge_list(pool, ha);
        return (char *)pool + ha->off;
}
648
/* Free a huge allocation: return its pages to the free lists, then drop
 * the tracking structure. */
static COLD void
huge_free(struct header *head, unsigned long poolsize, void *free)
{
        unsigned long i, off, pgnum, free_off = (char *)free - (char *)head;
        unsigned int sp_bits, lp_bits;
        struct huge_alloc *ha;

        /* Find the tracking structure whose offset matches. */
        for (i = head->huge; i; i = ha->next) {
                ha = (void *)((char *)head + i);
                if (free_off == ha->off)
                        break;
        }
        /* Caller must pass a real huge allocation. */
        assert(i);

        /* Free up all the pages, delete and free ha */
        sp_bits = small_page_bits(poolsize);
        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;
        pgnum = free_off >> sp_bits;

        if (test_bit(head->pagesize, pgnum >> BITS_FROM_SMALL_TO_LARGE_PAGE)) {
                for (off = ha->off;
                     off < ha->off + ha->len;
                     off += 1UL << lp_bits) {
                        add_large_page_to_freelist(head,
                                                   (void *)((char *)head + off),
                                                   sp_bits);
                }
        } else {
                for (off = ha->off;
                     off < ha->off + ha->len;
                     off += 1UL << sp_bits) {
                        add_small_page_to_freelist(head,
                                                   (void *)((char *)head + off),
                                                   sp_bits);
                }
        }
        del_from_huge(head, ha);
        alloc_free(head, poolsize, ha);
}
688
689 static COLD unsigned long huge_size(struct header *head, void *p)
690 {
691         unsigned long i, off = (char *)p - (char *)head;
692         struct huge_alloc *ha;
693
694         for (i = head->huge; i; i = ha->next) {
695                 ha = (void *)((char *)head + i);
696                 if (off == ha->off) {
697                         return ha->len;
698                 }
699         }
700         abort();
701 }
702
/* Allocate size bytes with the given alignment from pool.
 * Returns NULL if the pool cannot satisfy the request. */
void *alloc_get(void *pool, unsigned long poolsize,
                unsigned long size, unsigned long align)
{
        struct header *head = pool;
        unsigned int bucket;
        unsigned long i;
        struct bucket_state *bs;
        struct page_header *ph;
        unsigned int sp_bits;

        if (poolsize < MIN_USEFUL_SIZE) {
                return tiny_alloc_get(pool, poolsize, size, align);
        }

        /* Rounding size up to align gives alignment within the page. */
        size = align_up(size, align);
        if (unlikely(!size))
                size = 1;
        bucket = size_to_bucket(size);

        sp_bits = small_page_bits(poolsize);

        /* Too big for any bucket: fall back to the slow huge-alloc path. */
        if (bucket >= max_bucket(sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE)) {
                return huge_alloc(pool, poolsize, size, align);
        }

        bs = &head->bs[bucket];

        /* No page with free slots?  Grab and initialize a fresh page. */
        if (!bs->page_list) {
                struct page_header *ph;

                if (large_page_bucket(bucket, sp_bits))
                        bs->page_list = get_large_page(head, poolsize,
                                                       sp_bits);
                else
                        bs->page_list = get_small_page(head, poolsize,
                                                       sp_bits);
                /* FIXME: Try large-aligned alloc?  Header stuffing? */
                if (unlikely(!bs->page_list))
                        return NULL;
                ph = from_pgnum(head, bs->page_list, sp_bits);
                ph->bucket = bucket;
                ph->elements_used = 0;
                ph->next = 0;
                memset(ph->used, 0, used_size(bs->elements_per_page));
        }

        ph = from_pgnum(head, bs->page_list, sp_bits);

        /* Claim a free element in this page. */
        i = find_free_bit(ph->used);
        set_bit(ph->used, i);
        ph->elements_used++;

        /* check if this page is now full */
        if (unlikely(ph->elements_used == bs->elements_per_page)) {
                del_from_bucket_list(head, bs, ph, sp_bits);
                add_to_bucket_full_list(head, bs, ph, sp_bits);
        }

        return (char *)ph + page_header_size(ph->bucket / INTER_BUCKET_SPACE,
                                             bs->elements_per_page)
               + i * bucket_to_size(bucket);
}
765
/* Return an allocation obtained from alloc_get to the pool. */
void alloc_free(void *pool, unsigned long poolsize, void *free)
{
        struct header *head = pool;
        struct bucket_state *bs;
        unsigned int sp_bits;
        unsigned long i, pgnum, pgoffset, offset = (char *)free - (char *)pool;
        bool smallpage;
        struct page_header *ph;

        if (poolsize < MIN_USEFUL_SIZE) {
                tiny_alloc_free(pool, poolsize, free);
                return;
        }

        /* Get page header. */
        sp_bits = small_page_bits(poolsize);
        pgnum = offset >> sp_bits;

        /* Big page? Round down further. */
        if (test_bit(head->pagesize, pgnum >> BITS_FROM_SMALL_TO_LARGE_PAGE)) {
                smallpage = false;
                pgnum &= ~(SMALL_PAGES_PER_LARGE_PAGE - 1);
        } else
                smallpage = true;

        /* Step back to page header. */
        ph = from_pgnum(head, pgnum, sp_bits);
        /* A pointer at the very start of a page must be a huge alloc. */
        if ((void *)ph == free) {
                huge_free(head, poolsize, free);
                return;
        }

        bs = &head->bs[ph->bucket];
        pgoffset = offset - (pgnum << sp_bits)
                - page_header_size(ph->bucket / INTER_BUCKET_SPACE,
                                   bs->elements_per_page);

        /* A full page gains a free slot: move it back to the page list. */
        if (unlikely(ph->elements_used == bs->elements_per_page)) {
                del_from_bucket_full_list(head, bs, ph, sp_bits);
                add_to_bucket_list(head, bs, ph, sp_bits);
        }

        /* Which element are we? */
        i = pgoffset / bucket_to_size(ph->bucket);
        clear_bit(ph->used, i);
        ph->elements_used--;

        /* Last element gone: the whole page goes back on a free list. */
        if (unlikely(ph->elements_used == 0)) {
                bs = &head->bs[ph->bucket];
                del_from_bucket_list(head, bs, ph, sp_bits);
                if (smallpage)
                        add_small_page_to_freelist(head, ph, sp_bits);
                else
                        add_large_page_to_freelist(head, ph, sp_bits);
        }
}
822
/* Usable size of allocation p: its bucket's size, or the exact length for
 * a huge allocation. */
unsigned long alloc_size(void *pool, unsigned long poolsize, void *p)
{
        struct header *head = pool;
        unsigned int pgnum, sp_bits;
        unsigned long offset = (char *)p - (char *)pool;
        struct page_header *ph;

        if (poolsize < MIN_USEFUL_SIZE)
                return tiny_alloc_size(pool, poolsize, p);

        /* Get page header. */
        sp_bits = small_page_bits(poolsize);
        pgnum = offset >> sp_bits;

        /* Big page? Round down further. */
        if (test_bit(head->pagesize, pgnum >> BITS_FROM_SMALL_TO_LARGE_PAGE))
                pgnum &= ~(SMALL_PAGES_PER_LARGE_PAGE - 1);

        /* Step back to page header. */
        ph = from_pgnum(head, pgnum, sp_bits);
        /* A pointer at the very start of a page must be a huge alloc. */
        if ((void *)ph == p)
                return huge_size(head, p);

        return bucket_to_size(ph->bucket);
}
848
/* Useful for gdb breakpoints. */
static bool check_fail(void)
{
        /* Every consistency-check failure funnels through here. */
        return false;
}
854
855 static unsigned long count_bits(const unsigned long bitmap[],
856                                 unsigned long limit)
857 {
858         unsigned long i, count = 0;
859
860         while (limit >= BITS_PER_LONG) {
861                 count += popcount(bitmap[0]);
862                 bitmap++;
863                 limit -= BITS_PER_LONG;
864         }
865
866         for (i = 0; i < limit; i++)
867                 if (test_bit(bitmap, i))
868                         count++;
869         return count;
870 }
871
/*
 * Does page @pgnum (with page size @pagesize bytes) lie outside the pool,
 * or does pgnum << sp_bits overflow an unsigned long?
 */
static bool out_of_bounds(unsigned long pgnum,
			  unsigned int sp_bits,
			  unsigned long pagesize,
			  unsigned long poolsize)
{
	unsigned long start = pgnum << sp_bits;

	/* Shift wrapped around? */
	if ((start >> sp_bits) != pgnum)
		return true;

	/* Page starts past the end of the pool? */
	if (start > poolsize)
		return true;

	/* Page runs off the end of the pool? */
	return start + pagesize > poolsize;
}
885
886 static bool check_bucket(struct header *head,
887                          unsigned long poolsize,
888                          unsigned long pages[],
889                          struct bucket_state *bs,
890                          unsigned int bindex)
891 {
892         bool lp_bucket;
893         struct page_header *ph;
894         unsigned long taken, i, prev, pagesize, sp_bits, lp_bits;
895
896         sp_bits = small_page_bits(poolsize);
897         lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;
898
899         lp_bucket = large_page_bucket(bindex, sp_bits);
900
901         pagesize = 1UL << (lp_bucket ? lp_bits : sp_bits);
902
903         /* This many elements fit? */
904         taken = page_header_size(bindex / INTER_BUCKET_SPACE,
905                                  bs->elements_per_page);
906         taken += bucket_to_size(bindex) * bs->elements_per_page;
907         if (taken > pagesize)
908                 return check_fail();
909
910         /* One more wouldn't fit? */
911         taken = page_header_size(bindex / INTER_BUCKET_SPACE,
912                                  bs->elements_per_page + 1);
913         taken += bucket_to_size(bindex) * (bs->elements_per_page + 1);
914         if (taken <= pagesize)
915                 return check_fail();
916
917         /* Walk used list. */
918         prev = 0;
919         for (i = bs->page_list; i; i = ph->next) {
920                 /* Bad pointer? */
921                 if (out_of_bounds(i, sp_bits, pagesize, poolsize))
922                         return check_fail();
923                 /* Wrong size page? */
924                 if (!!test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE)
925                     != lp_bucket)
926                         return check_fail();
927                 /* Large page not on boundary? */
928                 if (lp_bucket && (i % SMALL_PAGES_PER_LARGE_PAGE) != 0)
929                         return check_fail();
930                 ph = from_pgnum(head, i, sp_bits);
931                 /* Linked list corrupt? */
932                 if (ph->prev != prev)
933                         return check_fail();
934                 /* Already seen this page? */
935                 if (test_bit(pages, i))
936                         return check_fail();
937                 set_bit(pages, i);
938                 /* Empty or full? */
939                 if (ph->elements_used == 0)
940                         return check_fail();
941                 if (ph->elements_used >= bs->elements_per_page)
942                         return check_fail();
943                 /* Used bits don't agree? */
944                 if (ph->elements_used != count_bits(ph->used,
945                                                     bs->elements_per_page))
946                         return check_fail();
947                 /* Wrong bucket? */
948                 if (ph->bucket != bindex)
949                         return check_fail();
950                 prev = i;
951         }
952
953         /* Walk full list. */
954         prev = 0;
955         for (i = bs->full_list; i; i = ph->next) {
956                 /* Bad pointer? */
957                 if (out_of_bounds(i, sp_bits, pagesize, poolsize))
958                         return check_fail();
959                 /* Wrong size page? */
960                 if (!!test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE)
961                     != lp_bucket)
962                 /* Large page not on boundary? */
963                 if (lp_bucket && (i % SMALL_PAGES_PER_LARGE_PAGE) != 0)
964                         return check_fail();
965                 ph = from_pgnum(head, i, sp_bits);
966                 /* Linked list corrupt? */
967                 if (ph->prev != prev)
968                         return check_fail();
969                 /* Already seen this page? */
970                 if (test_bit(pages, i))
971                         return check_fail();
972                 set_bit(pages, i);
973                 /* Not full? */
974                 if (ph->elements_used != bs->elements_per_page)
975                         return check_fail();
976                 /* Used bits don't agree? */
977                 if (ph->elements_used != count_bits(ph->used,
978                                                     bs->elements_per_page))
979                         return check_fail();
980                 /* Wrong bucket? */
981                 if (ph->bucket != bindex)
982                         return check_fail();
983                 prev = i;
984         }
985         return true;
986 }
987
/*
 * alloc_check - full consistency check of a pool.
 *
 * Marks the pages used by the pool header, then walks the small-page free
 * list, the large-page free list, every bucket's page lists (via
 * check_bucket()) and the huge-allocation list, setting a bit in a local
 * bitmap for each small page encountered.  Finally verifies that every
 * page in the pool was seen exactly once.  Returns true if consistent;
 * every failure path goes through check_fail() (a gdb breakpoint hook).
 * Tiny pools are delegated to tiny_alloc_check().
 */
bool alloc_check(void *pool, unsigned long poolsize)
{
	struct header *head = pool;
	unsigned long prev, i, lp_bits, sp_bits, header_size, num_buckets;
	struct page_header *ph;
	struct huge_alloc *ha;
	/* One bit per small page: "this page is accounted for". */
	unsigned long pages[MAX_SMALL_PAGES / BITS_PER_LONG] = { 0 };

	if (poolsize < MIN_USEFUL_SIZE)
		return tiny_alloc_check(pool, poolsize);

	sp_bits = small_page_bits(poolsize);
	lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

	num_buckets = max_bucket(lp_bits);

	header_size = sizeof(*head) + sizeof(head->bs) * (num_buckets-1);

	/* First, set all bits taken by header. */
	for (i = 0; i < header_size; i += (1UL << sp_bits))
		set_bit(pages, i >> sp_bits);

	/* Check small page free list. */
	prev = 0;
	for (i = head->small_free_list; i; i = ph->next) {
		/* Bad pointer? */
		if (out_of_bounds(i, sp_bits, 1UL << sp_bits, poolsize))
			return check_fail();
		/* Large page? */
		if (test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE))
			return check_fail();
		ph = from_pgnum(head, i, sp_bits);
		/* Linked list corrupt? */
		if (ph->prev != prev)
			return check_fail();
		/* Already seen this page? */
		if (test_bit(pages, i))
			return check_fail();
		set_bit(pages, i);
		prev = i;
	}

	/* Check large page free list. */
	prev = 0;
	for (i = head->large_free_list; i; i = ph->next) {
		/* Bad pointer? */
		if (out_of_bounds(i, sp_bits, 1UL << lp_bits, poolsize))
			return check_fail();
		/* Not large page? */
		if (!test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE))
			return check_fail();
		/* Not page boundary? */
		if ((i % SMALL_PAGES_PER_LARGE_PAGE) != 0)
			return check_fail();
		ph = from_pgnum(head, i, sp_bits);
		/* Linked list corrupt? */
		if (ph->prev != prev)
			return check_fail();
		/* Already seen this page? */
		if (test_bit(pages, i))
			return check_fail();
		set_bit(pages, i);
		prev = i;
	}

	/* Check the buckets. */
	for (i = 0; i < max_bucket(lp_bits); i++) {
		struct bucket_state *bs = &head->bs[i];

		if (!check_bucket(head, poolsize, pages, bs, i))
			return false;
	}

	/* Check the huge alloc list. */
	prev = 0;
	for (i = head->huge; i; i = ha->next) {
		unsigned long pgbits, j;

		/* Bad pointer? */
		if (i >= poolsize || i + sizeof(*ha) > poolsize)
			return check_fail();
		ha = (void *)((char *)head + i);

		/* Check contents of ha.
		 * NOTE(review): ha->off + ha->len could wrap on corrupted
		 * input and pass this test — presumably tolerated since the
		 * per-page bitmap walk below would still catch it; confirm. */
		if (ha->off > poolsize || ha->off + ha->len > poolsize)
			return check_fail();

		/* Large or small page? */
		pgbits = test_bit(head->pagesize, ha->off >> lp_bits)
			? lp_bits : sp_bits;

		/* Not page boundary? */
		if ((ha->off % (1UL << pgbits)) != 0)
			return check_fail();

		/* Not page length? */
		if ((ha->len % (1UL << pgbits)) != 0)
			return check_fail();

		/* Linked list corrupt? */
		if (ha->prev != prev)
			return check_fail();

		/* Mark every small page covered by this huge allocation. */
		for (j = ha->off; j < ha->off + ha->len; j += (1UL<<sp_bits)) {
			/* Already seen this page? */
			if (test_bit(pages, j >> sp_bits))
				return check_fail();
			set_bit(pages, j >> sp_bits);
		}

		prev = i;
	}

	/* Make sure every page accounted for. */
	for (i = 0; i < poolsize >> sp_bits; i++) {
		if (!test_bit(pages, i))
			return check_fail();
		if (test_bit(head->pagesize,
			     i >> BITS_FROM_SMALL_TO_LARGE_PAGE)) {
			/* Large page, skip rest. */
			i += SMALL_PAGES_PER_LARGE_PAGE - 1;
		}
	}

	return true;
}
1114
/*
 * Print one overhead line ("Overhead (<desc>): N bytes (P%)") and return
 * @bytes so callers can accumulate a running total.
 */
static unsigned long print_overhead(FILE *out, const char *desc,
				    unsigned long bytes,
				    unsigned long poolsize)
{
	double pct = 100.0 * bytes / poolsize;

	fprintf(out, "Overhead (%s): %lu bytes (%.3g%%)\n", desc, bytes, pct);
	return bytes;
}
1123
1124 static unsigned long count_list(struct header *head,
1125                                 u16 pgnum,
1126                                 unsigned int sp_bits,
1127                                 unsigned long *total_elems)
1128 {
1129         struct page_header *p;
1130         unsigned long ret = 0;
1131
1132         while (pgnum) {
1133                 p = from_pgnum(head, pgnum, sp_bits);
1134                 if (total_elems)
1135                         (*total_elems) += p->elements_used;
1136                 ret++;
1137                 pgnum = p->next;
1138         }
1139         return ret;
1140 }
1141
/*
 * visualize_bucket - print usage and overhead statistics for one bucket.
 *
 * Counts the bucket's full and partial pages, prints a summary line, then
 * prints three overhead components (minimal page headers, header alignment
 * padding, and page tail waste).  Returns the total overhead in bytes, or
 * 0 if the bucket has no pages.
 */
static unsigned long visualize_bucket(FILE *out, struct header *head,
				      unsigned int bucket,
				      unsigned long poolsize,
				      unsigned int sp_bits)
{
	unsigned long num_full, num_partial, num_pages, page_size,
		elems, hdr_min, hdr_size, elems_per_page, overhead = 0;

	elems_per_page = head->bs[bucket].elements_per_page;

	/* If we used byte-based bitmaps, we could get pg hdr to: */
	hdr_min = sizeof(struct page_header)
		- sizeof(((struct page_header *)0)->used)
		+ align_up(elems_per_page, CHAR_BIT) / CHAR_BIT;
	hdr_size = page_header_size(bucket / INTER_BUCKET_SPACE,
				    elems_per_page);

	/* elems accumulates elements_used across both lists. */
	elems = 0;
	num_full = count_list(head, head->bs[bucket].full_list, sp_bits,
			      &elems);
	num_partial = count_list(head, head->bs[bucket].page_list, sp_bits,
				 &elems);
	num_pages = num_full + num_partial;
	if (!num_pages)
		return 0;

	fprintf(out, "Bucket %u (%lu bytes):"
		" %lu full, %lu partial = %lu elements\n",
		bucket, bucket_to_size(bucket), num_full, num_partial, elems);
	/* Strict requirement of page header size. */
	overhead += print_overhead(out, "page headers",
				   hdr_min * num_pages, poolsize);
	/* Gap between minimal page header and actual start. */
	overhead += print_overhead(out, "page post-header alignments",
				   (hdr_size - hdr_min) * num_pages, poolsize);
	/* Between last element and end of page. */
	page_size = (1UL << sp_bits);
	if (large_page_bucket(bucket, sp_bits))
		page_size <<= BITS_FROM_SMALL_TO_LARGE_PAGE;

	overhead += print_overhead(out, "page tails",
				   (page_size - (hdr_size
						 + (elems_per_page
						    * bucket_to_size(bucket))))
				   * num_pages, poolsize);
	return overhead;
}
1189
/*
 * alloc_visualize - dump a human-readable summary of pool state to @out.
 *
 * Prints page sizes, header overhead, large/small page utilization and a
 * per-bucket breakdown (via visualize_bucket()), finishing with the total
 * overhead.  Tiny pools are delegated to tiny_alloc_visualize().
 */
void alloc_visualize(FILE *out, void *pool, unsigned long poolsize)
{
	struct header *head = pool;
	unsigned long i, lp_bits, sp_bits, header_size, num_buckets, count,
		overhead = 0;

	fprintf(out, "Pool %p size %lu: (%s allocator)\n", pool, poolsize,
		poolsize < MIN_USEFUL_SIZE ? "tiny" : "standard");

	if (poolsize < MIN_USEFUL_SIZE) {
		tiny_alloc_visualize(out, pool, poolsize);
		return;
	}

	sp_bits = small_page_bits(poolsize);
	lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

	num_buckets = max_bucket(lp_bits);
	header_size = sizeof(*head) + sizeof(head->bs) * (num_buckets-1);

	fprintf(out, "Large page size %lu, small page size %lu.\n",
		1UL << lp_bits, 1UL << sp_bits);
	overhead += print_overhead(out, "unused pool tail",
				   poolsize % (1UL << lp_bits), poolsize);
	fprintf(out, "Main header %lu bytes (%lu small pages).\n",
		header_size, align_up(header_size, 1UL << sp_bits) >> sp_bits);
	overhead += print_overhead(out, "partial header page",
				   align_up(header_size, 1UL << sp_bits)
				   - header_size, poolsize);
	/* Total large pages. */
	i = count_bits(head->pagesize, poolsize >> lp_bits);
	/* Used pages. */
	count = i - count_list(head, head->large_free_list, sp_bits, NULL);
	fprintf(out, "%lu/%lu large pages used (%.3g%%)\n",
		count, i, count ? 100.0 * count / i : 0.0);

	/* Total small pages. */
	i = ((poolsize >> lp_bits) - i) << BITS_FROM_SMALL_TO_LARGE_PAGE;
	/* Used pages */
	count = i - count_list(head, head->small_free_list, sp_bits, NULL);
	fprintf(out, "%lu/%lu small pages used (%.3g%%)\n",
		count, i, count ? 100.0 * count / i : 0.0);

	/* Summary of each bucket. */
	fprintf(out, "%lu buckets:\n", num_buckets);
	for (i = 0; i < num_buckets; i++)
		overhead += visualize_bucket(out, head, i, poolsize, sp_bits);

	print_overhead(out, "total", overhead, poolsize);
}