#include <unistd.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>	/* FILE and fprintf, used by alloc_visualize below */
#include "alloc.h"
#include "bitops.h"
#include "tiny.h"
#include <ccan/build_assert/build_assert.h>
#include <ccan/likely/likely.h>
#include <ccan/alignof/alignof.h>
#include <ccan/short_types/short_types.h>
#include <ccan/compiler/compiler.h>
#include "config.h"

/*
   Inspired by (and parts taken from) Andrew Tridgell's alloc_mmap:
   http://samba.org/~tridge/junkcode/alloc_mmap/

   Copyright (C) Andrew Tridgell 2007

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/* We divide the pool into this many large pages (nearest power of 2) */
#define MAX_LARGE_PAGES (256UL)

/* 32 small pages == 1 large page. */
#define BITS_FROM_SMALL_TO_LARGE_PAGE 5

#define MAX_SMALL_PAGES (MAX_LARGE_PAGES << BITS_FROM_SMALL_TO_LARGE_PAGE)

/* Smallest pool size for this scheme: 128-byte small pages.  That's
 * 9/13% overhead for 32/64 bit. */
#define MIN_USEFUL_SIZE (MAX_SMALL_PAGES * 128)

/* Every 4 buckets, we jump up a power of 2. ...8 10 12 14 16 20 24 28 32... */
#define INTER_BUCKET_SPACE 4

#define SMALL_PAGES_PER_LARGE_PAGE (1 << BITS_FROM_SMALL_TO_LARGE_PAGE)

/* FIXME: Figure this out properly. */
#define MAX_SIZE (1 << 30)
/* How few objects fit in a page before we use a larger one? (8) */
#define MAX_PAGE_OBJECT_ORDER   3

#define BITS_PER_LONG (sizeof(long) * CHAR_BIT)

struct bucket_state {
        u32 elements_per_page;
        u16 page_list;
        u16 full_list;
};

struct header {
        /* Bitmap of which pages are large. */
        unsigned long pagesize[MAX_LARGE_PAGES / BITS_PER_LONG];

        /* List of unused small/large pages. */
        u16 small_free_list;
        u16 large_free_list;

        /* List of huge allocs. */
        unsigned long huge;

        /* This is less defined: we have INTER_BUCKET_SPACE (4) buckets
         * for each power of 2. */
        struct bucket_state bs[1];
};

struct huge_alloc {
        unsigned long next, prev;
        unsigned long off, len;
};

struct page_header {
        u16 next, prev;
        /* FIXME: We can just count all-0 and all-1 used[] elements. */
        unsigned elements_used : 25;
        unsigned bucket : 7;
        unsigned long used[1]; /* One bit per element. */
};

/*
 * Every 4 buckets, the size doubles.
 * Between buckets, sizes increase linearly.
 *
 * eg. bucket 40 = 2^10                 = 1024
 *     bucket 41 = 2^10 + 1*2^10/4      = 1024 + 256
 *     bucket 42 = 2^10 + 2*2^10/4      = 1024 + 512
 *     bucket 43 = 2^10 + 3*2^10/4      = 1024 + 768
 *     bucket 44 = 2^11                 = 2048
 *
 * Care is taken to handle low numbered buckets, at cost of overflow.
 */
static unsigned long bucket_to_size(unsigned int bucket)
{
        unsigned long base = 1UL << (bucket / INTER_BUCKET_SPACE);
        return base + ((bucket % INTER_BUCKET_SPACE)
                       << (bucket / INTER_BUCKET_SPACE))
                / INTER_BUCKET_SPACE;
}
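
/*
 * Worked example, following directly from the code above with
 * INTER_BUCKET_SPACE == 4:
 *   bucket_to_size(40) == 1024 + (0 << 10)/4 == 1024
 *   bucket_to_size(41) == 1024 + (1 << 10)/4 == 1280
 *   bucket_to_size(13) == 8 + (1 << 3)/4 == 10
 * which matches the ...8 10 12 14 16 20 24 28 32... progression above.
 */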

/*
 * Say size is 10.
 *   fls(size/2) == 3.  1 << 3 == 8, so we're 2 too large, out of a possible
 * 8 too large.  That's 1/4 of the way to the next power of 2 == 1 bucket.
 *
 * We make sure we round up.  Note that this fails on 32 bit at size
 * 1879048193 (around bucket 120).
 */
static unsigned int size_to_bucket(unsigned long size)
{
        unsigned int base = fls(size/2);
        unsigned long overshoot;

        overshoot = size - (1UL << base);
        return base * INTER_BUCKET_SPACE
                + ((overshoot * INTER_BUCKET_SPACE + (1UL << base)-1) >> base);
}
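
/*
 * Worked example: size_to_bucket(10) has fls(5) == 3, so base == 3 and
 * overshoot == 10 - 8 == 2, giving 3*4 + (2*4 + 7)/8 == 13; and indeed
 * bucket_to_size(13) == 10.  Rounding up means size_to_bucket(1025)
 * lands in bucket 41 (1280 bytes), not bucket 40.
 */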

static unsigned int small_page_bits(unsigned long poolsize)
{
        return fls(poolsize / MAX_SMALL_PAGES - 1);
}
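
/*
 * For example, the minimum pool (MIN_USEFUL_SIZE, 1MB) gives
 * fls(1048576 / 8192 - 1) == fls(127) == 7: 128-byte small pages and
 * (with BITS_FROM_SMALL_TO_LARGE_PAGE == 5) 4KB large pages.  A 64MB
 * pool gets 8KB small and 256KB large pages.
 */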

static struct page_header *from_pgnum(struct header *head,
                                      unsigned long pgnum,
                                      unsigned sp_bits)
{
        return (struct page_header *)((char *)head + (pgnum << sp_bits));
}

static u16 to_pgnum(struct header *head, void *p, unsigned sp_bits)
{
        return ((char *)p - (char *)head) >> sp_bits;
}

static size_t used_size(unsigned int num_elements)
{
        return align_up(num_elements, BITS_PER_LONG) / CHAR_BIT;
}

/*
 * We always align the first entry to the lower power of 2.
 * eg. the 12-byte bucket gets 8-byte aligned.  The 4096-byte bucket
 * gets 4096-byte aligned.
 */
static unsigned long page_header_size(unsigned int align_bits,
                                      unsigned long num_elements)
{
        unsigned long size;

        size = sizeof(struct page_header)
                - sizeof(((struct page_header *)0)->used)
                + used_size(num_elements);
        return align_up(size, 1UL << align_bits);
}
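
/*
 * Worked example (assuming a typical LP64 target, where
 * sizeof(struct page_header) == 16 and sizeof(used) == 8): 100 elements
 * need 8 + used_size(100) == 8 + 16 == 24 header bytes.  With
 * align_bits == 3 (the 12-byte bucket) that stays 24; with
 * align_bits == 12 (the 4096-byte bucket) it rounds up to 4096.
 */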

static void add_to_list(struct header *head,
                        u16 *list, struct page_header *ph, unsigned sp_bits)
{
        unsigned long h = *list, offset = to_pgnum(head, ph, sp_bits);

        ph->next = h;
        if (h) {
                struct page_header *prev = from_pgnum(head, h, sp_bits);
                assert(prev->prev == 0);
                prev->prev = offset;
        }
        *list = offset;
        ph->prev = 0;
}

static void del_from_list(struct header *head,
                          u16 *list, struct page_header *ph, unsigned sp_bits)
{
        /* Front of list? */
        if (ph->prev == 0) {
                *list = ph->next;
        } else {
                struct page_header *prev = from_pgnum(head, ph->prev, sp_bits);
                prev->next = ph->next;
        }
        if (ph->next != 0) {
                struct page_header *next = from_pgnum(head, ph->next, sp_bits);
                next->prev = ph->prev;
        }
}

static u16 pop_from_list(struct header *head,
                         u16 *list,
                         unsigned int sp_bits)
{
        u16 h = *list;
        struct page_header *ph = from_pgnum(head, h, sp_bits);

        if (likely(h)) {
                *list = ph->next;
                if (*list)
                        from_pgnum(head, *list, sp_bits)->prev = 0;
        }
        return h;
}

static void add_to_huge_list(struct header *head, struct huge_alloc *ha)
{
        unsigned long h = head->huge;
        unsigned long offset = (char *)ha - (char *)head;

        ha->next = h;
        if (h) {
                struct huge_alloc *prev = (void *)((char *)head + h);
                assert(prev->prev == 0);
                prev->prev = offset;
        }
        head->huge = offset;
        ha->prev = 0;
}

static void del_from_huge(struct header *head, struct huge_alloc *ha)
{
        /* Front of list? */
        if (ha->prev == 0) {
                head->huge = ha->next;
        } else {
                struct huge_alloc *prev = (void *)((char *)head + ha->prev);
                prev->next = ha->next;
        }
        if (ha->next != 0) {
                struct huge_alloc *next = (void *)((char *)head + ha->next);
                next->prev = ha->prev;
        }
}

static void add_small_page_to_freelist(struct header *head,
                                       struct page_header *ph,
                                       unsigned int sp_bits)
{
        add_to_list(head, &head->small_free_list, ph, sp_bits);
}

static void add_large_page_to_freelist(struct header *head,
                                       struct page_header *ph,
                                       unsigned int sp_bits)
{
        add_to_list(head, &head->large_free_list, ph, sp_bits);
}

static void add_to_bucket_list(struct header *head,
                               struct bucket_state *bs,
                               struct page_header *ph,
                               unsigned int sp_bits)
{
        add_to_list(head, &bs->page_list, ph, sp_bits);
}

static void del_from_bucket_list(struct header *head,
                                 struct bucket_state *bs,
                                 struct page_header *ph,
                                 unsigned int sp_bits)
{
        del_from_list(head, &bs->page_list, ph, sp_bits);
}

static void del_from_bucket_full_list(struct header *head,
                                      struct bucket_state *bs,
                                      struct page_header *ph,
                                      unsigned int sp_bits)
{
        del_from_list(head, &bs->full_list, ph, sp_bits);
}

static void add_to_bucket_full_list(struct header *head,
                                    struct bucket_state *bs,
                                    struct page_header *ph,
                                    unsigned int sp_bits)
{
        add_to_list(head, &bs->full_list, ph, sp_bits);
}

static void clear_bit(unsigned long bitmap[], unsigned int off)
{
        bitmap[off / BITS_PER_LONG] &= ~(1UL << (off % BITS_PER_LONG));
}

static bool test_bit(const unsigned long bitmap[], unsigned int off)
{
        return bitmap[off / BITS_PER_LONG] & (1UL << (off % BITS_PER_LONG));
}

static void set_bit(unsigned long bitmap[], unsigned int off)
{
        bitmap[off / BITS_PER_LONG] |= (1UL << (off % BITS_PER_LONG));
}

/* There must be a bit to be found. */
static unsigned int find_free_bit(const unsigned long bitmap[])
{
        unsigned int i;

        for (i = 0; bitmap[i] == -1UL; i++);
        return (i*BITS_PER_LONG) + ffsl(~bitmap[i]) - 1;
}

/* How many elements can we fit in a page? */
static unsigned long elements_per_page(unsigned long align_bits,
                                       unsigned long esize,
                                       unsigned long psize)
{
        unsigned long num, overhead;

        /* First approximation: no extra room for bitmap. */
        overhead = align_up(sizeof(struct page_header), 1UL << align_bits);
        num = (psize - overhead) / esize;

        while (page_header_size(align_bits, num) + esize * num > psize)
                num--;
        return num;
}
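
/*
 * Example (again assuming a 16-byte struct page_header): for 12-byte
 * elements in a 128-byte small page with align_bits == 3, the first
 * guess is (128 - 16) / 12 == 9, and page_header_size(3, 9) == 16 with
 * 16 + 9*12 == 124 <= 128, so 9 elements fit.
 */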

static bool large_page_bucket(unsigned int bucket, unsigned int sp_bits)
{
        unsigned long max_smallsize;

        /* Note: this doesn't take into account page header. */
        max_smallsize = (1UL << sp_bits) >> MAX_PAGE_OBJECT_ORDER;

        return bucket_to_size(bucket) > max_smallsize;
}

static unsigned int max_bucket(unsigned int lp_bits)
{
        return (lp_bits - MAX_PAGE_OBJECT_ORDER) * INTER_BUCKET_SPACE;
}
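
/*
 * For example, with 4KB large pages (lp_bits == 12), max_bucket() is
 * (12 - 3) * 4 == 36: buckets 0-35 (up to 448-byte objects) are served
 * from pages.  Anything bigger would fit fewer than
 * 1 << MAX_PAGE_OBJECT_ORDER == 8 objects per large page, so it is
 * passed to huge_alloc() instead (see alloc_get below).
 */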

void alloc_init(void *pool, unsigned long poolsize)
{
        struct header *head = pool;
        struct page_header *ph;
        unsigned int lp_bits, sp_bits, num_buckets;
        unsigned long header_size, i;

        if (poolsize < MIN_USEFUL_SIZE) {
                tiny_alloc_init(pool, poolsize);
                return;
        }

        /* We rely on page numbers fitting in 16 bit. */
        BUILD_ASSERT(MAX_SMALL_PAGES < 65536);

        sp_bits = small_page_bits(poolsize);
        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        num_buckets = max_bucket(lp_bits);

        head = pool;
        header_size = sizeof(*head) + sizeof(head->bs) * (num_buckets-1);

        memset(head, 0, header_size);
        for (i = 0; i < num_buckets; i++) {
                unsigned long pagesize;

                if (large_page_bucket(i, sp_bits))
                        pagesize = 1UL << lp_bits;
                else
                        pagesize = 1UL << sp_bits;

                head->bs[i].elements_per_page
                        = elements_per_page(i / INTER_BUCKET_SPACE,
                                            bucket_to_size(i),
                                            pagesize);
        }

        /* They start as all large pages. */
        memset(head->pagesize, 0xFF, sizeof(head->pagesize));
        /* FIXME: small pages for last bit? */

        /* Split first page into small pages. */
        assert(header_size < (1UL << lp_bits));
        clear_bit(head->pagesize, 0);

        /* Skip over page(s) used by header, add rest to free list */
        for (i = align_up(header_size, (1UL << sp_bits)) >> sp_bits;
             i < SMALL_PAGES_PER_LARGE_PAGE;
             i++) {
                ph = from_pgnum(head, i, sp_bits);
                ph->elements_used = 0;
                add_small_page_to_freelist(head, ph, sp_bits);
        }

        /* Add the rest of the pages as large pages. */
        i = SMALL_PAGES_PER_LARGE_PAGE;
        while ((i << sp_bits) + (1UL << lp_bits) <= poolsize) {
                assert(i < MAX_SMALL_PAGES);
                ph = from_pgnum(head, i, sp_bits);
                ph->elements_used = 0;
                add_large_page_to_freelist(head, ph, sp_bits);
                i += SMALL_PAGES_PER_LARGE_PAGE;
        }
}
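
/*
 * Example usage (an illustrative sketch, not part of the library): any
 * sufficiently large memory region can serve as the pool, e.g.:
 *
 *      static char pool[MIN_USEFUL_SIZE];
 *      void *p;
 *
 *      alloc_init(pool, sizeof(pool));
 *      p = alloc_get(pool, sizeof(pool), 100, 8);
 *      if (p)
 *              alloc_free(pool, sizeof(pool), p);
 *      assert(alloc_check(pool, sizeof(pool)));
 */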

/* A large page worth of small pages are free: delete them from free list. */
static void del_large_from_small_free_list(struct header *head,
                                           struct page_header *ph,
                                           unsigned int sp_bits)
{
        unsigned long i;

        for (i = 0; i < SMALL_PAGES_PER_LARGE_PAGE; i++) {
                del_from_list(head, &head->small_free_list,
                              (struct page_header *)
                              ((char *)ph + (i << sp_bits)),
                              sp_bits);
        }
}

static bool all_empty(struct header *head,
                      unsigned long pgnum,
                      unsigned sp_bits)
{
        unsigned long i;

        for (i = 0; i < SMALL_PAGES_PER_LARGE_PAGE; i++) {
                struct page_header *ph = from_pgnum(head, pgnum + i, sp_bits);
                if (ph->elements_used)
                        return false;
        }
        return true;
}

static void recombine_small_pages(struct header *head, unsigned long poolsize,
                                  unsigned int sp_bits)
{
        unsigned long i;
        unsigned int lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        /* Look for small pages to coalesce, after first large page. */
        for (i = SMALL_PAGES_PER_LARGE_PAGE;
             i < (poolsize >> lp_bits) << BITS_FROM_SMALL_TO_LARGE_PAGE;
             i += SMALL_PAGES_PER_LARGE_PAGE) {
                /* Already a large page? */
                if (test_bit(head->pagesize, i / SMALL_PAGES_PER_LARGE_PAGE))
                        continue;
                if (all_empty(head, i, sp_bits)) {
                        struct page_header *ph = from_pgnum(head, i, sp_bits);
                        set_bit(head->pagesize,
                                i / SMALL_PAGES_PER_LARGE_PAGE);
                        del_large_from_small_free_list(head, ph, sp_bits);
                        add_large_page_to_freelist(head, ph, sp_bits);
                }
        }
}

static u16 get_large_page(struct header *head, unsigned long poolsize,
                          unsigned int sp_bits)
{
        unsigned int page;

        page = pop_from_list(head, &head->large_free_list, sp_bits);
        if (likely(page))
                return page;

        recombine_small_pages(head, poolsize, sp_bits);

        return pop_from_list(head, &head->large_free_list, sp_bits);
}

/* Returns small page. */
static unsigned long break_up_large_page(struct header *head,
                                         unsigned int sp_bits,
                                         u16 lpage)
{
        unsigned int i;

        clear_bit(head->pagesize, lpage >> BITS_FROM_SMALL_TO_LARGE_PAGE);

        for (i = 1; i < SMALL_PAGES_PER_LARGE_PAGE; i++) {
                struct page_header *ph = from_pgnum(head, lpage + i, sp_bits);
                /* Initialize this: huge_alloc reads it. */
                ph->elements_used = 0;
                add_small_page_to_freelist(head, ph, sp_bits);
        }

        return lpage;
}

static u16 get_small_page(struct header *head, unsigned long poolsize,
                          unsigned int sp_bits)
{
        u16 ret;

        ret = pop_from_list(head, &head->small_free_list, sp_bits);
        if (likely(ret))
                return ret;
        ret = get_large_page(head, poolsize, sp_bits);
        if (likely(ret))
                ret = break_up_large_page(head, sp_bits, ret);
        return ret;
}

static bool huge_allocated(struct header *head, unsigned long offset)
{
        unsigned long i;
        struct huge_alloc *ha;

        for (i = head->huge; i; i = ha->next) {
                ha = (void *)((char *)head + i);
                if (ha->off <= offset && ha->off + ha->len > offset)
                        return true;
        }
        return false;
}

/* They want something really big.  Aim for contiguous pages (slow). */
static COLD_ATTRIBUTE
void *huge_alloc(void *pool, unsigned long poolsize,
                 unsigned long size, unsigned long align)
{
        struct header *head = pool;
        struct huge_alloc *ha;
        unsigned long i, sp_bits, lp_bits, num, header_size;

        sp_bits = small_page_bits(poolsize);
        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        /* Allocate tracking structure optimistically. */
        ha = alloc_get(pool, poolsize, sizeof(*ha), ALIGNOF(*ha));
        if (!ha)
                return NULL;

        /* First search for contiguous small pages... */
        header_size = sizeof(*head) + sizeof(head->bs) * (max_bucket(lp_bits)-1);

        num = 0;
        for (i = (header_size + (1UL << sp_bits) - 1) >> sp_bits;
             i << sp_bits < poolsize;
             i++) {
                struct page_header *pg;
                unsigned long off = (i << sp_bits);

                /* Skip over large pages. */
                if (test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE)) {
                        i += (1UL << BITS_FROM_SMALL_TO_LARGE_PAGE)-1;
                        continue;
                }

                /* Does this page meet alignment requirements? */
                if (!num && off % align != 0)
                        continue;

                /* FIXME: This makes us O(n^2). */
                if (huge_allocated(head, off)) {
                        num = 0;
                        continue;
                }

                pg = (struct page_header *)((char *)head + off);
                if (pg->elements_used) {
                        num = 0;
                        continue;
                }

                num++;
                if (num << sp_bits >= size) {
                        unsigned long pgnum;

                        /* Remove from free list. */
                        for (pgnum = i; pgnum > i - num; pgnum--) {
                                pg = from_pgnum(head, pgnum, sp_bits);
                                del_from_list(head,
                                              &head->small_free_list,
                                              pg, sp_bits);
                        }
                        ha->off = (i - num + 1) << sp_bits;
                        ha->len = num << sp_bits;
                        goto done;
                }
        }

        /* Now search for large pages... */
        recombine_small_pages(head, poolsize, sp_bits);

        num = 0;
        for (i = (header_size + (1UL << lp_bits) - 1) >> lp_bits;
             (i << lp_bits) < poolsize; i++) {
                struct page_header *pg;
                unsigned long off = (i << lp_bits);

                /* Ignore small pages. */
                if (!test_bit(head->pagesize, i))
                        continue;

                /* Does this page meet alignment requirements? */
                if (!num && off % align != 0)
                        continue;

                /* FIXME: This makes us O(n^2). */
                if (huge_allocated(head, off)) {
                        num = 0;
                        continue;
                }

                pg = (struct page_header *)((char *)head + off);
                if (pg->elements_used) {
                        num = 0;
                        continue;
                }

                num++;
                if (num << lp_bits >= size) {
                        unsigned long pgnum;

                        /* Remove from free list. */
                        for (pgnum = i; pgnum > i - num; pgnum--) {
                                pg = from_pgnum(head, pgnum, lp_bits);
                                del_from_list(head,
                                              &head->large_free_list,
                                              pg, sp_bits);
                        }
                        ha->off = (i - num + 1) << lp_bits;
                        ha->len = num << lp_bits;
                        goto done;
                }
        }

        /* Unable to satisfy: free huge alloc structure. */
        alloc_free(pool, poolsize, ha);
        return NULL;

done:
        add_to_huge_list(pool, ha);
        return (char *)pool + ha->off;
}

static COLD_ATTRIBUTE void
huge_free(struct header *head, unsigned long poolsize, void *free)
{
        unsigned long i, off, pgnum, free_off = (char *)free - (char *)head;
        unsigned int sp_bits, lp_bits;
        struct huge_alloc *ha;

        for (i = head->huge; i; i = ha->next) {
                ha = (void *)((char *)head + i);
                if (free_off == ha->off)
                        break;
        }
        assert(i);

        /* Free up all the pages, delete and free ha */
        sp_bits = small_page_bits(poolsize);
        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;
        pgnum = free_off >> sp_bits;

        if (test_bit(head->pagesize, pgnum >> BITS_FROM_SMALL_TO_LARGE_PAGE)) {
                for (off = ha->off;
                     off < ha->off + ha->len;
                     off += 1UL << lp_bits) {
                        add_large_page_to_freelist(head,
                                                   (void *)((char *)head + off),
                                                   sp_bits);
                }
        } else {
                for (off = ha->off;
                     off < ha->off + ha->len;
                     off += 1UL << sp_bits) {
                        add_small_page_to_freelist(head,
                                                   (void *)((char *)head + off),
                                                   sp_bits);
                }
        }
        del_from_huge(head, ha);
        alloc_free(head, poolsize, ha);
}

static COLD_ATTRIBUTE unsigned long
huge_size(struct header *head, void *p)
{
        unsigned long i, off = (char *)p - (char *)head;
        struct huge_alloc *ha;

        for (i = head->huge; i; i = ha->next) {
                ha = (void *)((char *)head + i);
                if (off == ha->off) {
                        return ha->len;
                }
        }
        abort();
}

void *alloc_get(void *pool, unsigned long poolsize,
                unsigned long size, unsigned long align)
{
        struct header *head = pool;
        unsigned int bucket;
        unsigned long i;
        struct bucket_state *bs;
        struct page_header *ph;
        unsigned int sp_bits;

        if (poolsize < MIN_USEFUL_SIZE) {
                return tiny_alloc_get(pool, poolsize, size, align);
        }

        size = align_up(size, align);
        if (unlikely(!size))
                size = 1;
        bucket = size_to_bucket(size);

        sp_bits = small_page_bits(poolsize);

        if (bucket >= max_bucket(sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE)) {
                return huge_alloc(pool, poolsize, size, align);
        }

        bs = &head->bs[bucket];

        if (!bs->page_list) {
                struct page_header *ph;

                if (large_page_bucket(bucket, sp_bits))
                        bs->page_list = get_large_page(head, poolsize,
                                                       sp_bits);
                else
                        bs->page_list = get_small_page(head, poolsize,
                                                       sp_bits);
                /* FIXME: Try large-aligned alloc?  Header stuffing? */
                if (unlikely(!bs->page_list))
                        return NULL;
                ph = from_pgnum(head, bs->page_list, sp_bits);
                ph->bucket = bucket;
                ph->elements_used = 0;
                ph->next = 0;
                memset(ph->used, 0, used_size(bs->elements_per_page));
        }

        ph = from_pgnum(head, bs->page_list, sp_bits);

        i = find_free_bit(ph->used);
        set_bit(ph->used, i);
        ph->elements_used++;

        /* check if this page is now full */
        if (unlikely(ph->elements_used == bs->elements_per_page)) {
                del_from_bucket_list(head, bs, ph, sp_bits);
                add_to_bucket_full_list(head, bs, ph, sp_bits);
        }

        return (char *)ph + page_header_size(ph->bucket / INTER_BUCKET_SPACE,
                                             bs->elements_per_page)
               + i * bucket_to_size(bucket);
}
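
/*
 * Continuing the earlier example: in a 128-byte small page of 12-byte
 * elements with a 16-byte header, taking free bit i == 2 returns
 * page start + 16 + 2*12 == byte 40 of the page.
 */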

void alloc_free(void *pool, unsigned long poolsize, void *free)
{
        struct header *head = pool;
        struct bucket_state *bs;
        unsigned int sp_bits;
        unsigned long i, pgnum, pgoffset, offset = (char *)free - (char *)pool;
        bool smallpage;
        struct page_header *ph;

        if (poolsize < MIN_USEFUL_SIZE) {
                tiny_alloc_free(pool, poolsize, free);
                return;
        }

        /* Get page header. */
        sp_bits = small_page_bits(poolsize);
        pgnum = offset >> sp_bits;

        /* Big page? Round down further. */
        if (test_bit(head->pagesize, pgnum >> BITS_FROM_SMALL_TO_LARGE_PAGE)) {
                smallpage = false;
                pgnum &= ~(SMALL_PAGES_PER_LARGE_PAGE - 1);
        } else
                smallpage = true;

        /* Step back to page header. */
        ph = from_pgnum(head, pgnum, sp_bits);
        if ((void *)ph == free) {
                huge_free(head, poolsize, free);
                return;
        }

        bs = &head->bs[ph->bucket];
        pgoffset = offset - (pgnum << sp_bits)
                - page_header_size(ph->bucket / INTER_BUCKET_SPACE,
                                   bs->elements_per_page);

        if (unlikely(ph->elements_used == bs->elements_per_page)) {
                del_from_bucket_full_list(head, bs, ph, sp_bits);
                add_to_bucket_list(head, bs, ph, sp_bits);
        }

        /* Which element are we? */
        i = pgoffset / bucket_to_size(ph->bucket);
        clear_bit(ph->used, i);
        ph->elements_used--;

        if (unlikely(ph->elements_used == 0)) {
                bs = &head->bs[ph->bucket];
                del_from_bucket_list(head, bs, ph, sp_bits);
                if (smallpage)
                        add_small_page_to_freelist(head, ph, sp_bits);
                else
                        add_large_page_to_freelist(head, ph, sp_bits);
        }
}

unsigned long alloc_size(void *pool, unsigned long poolsize, void *p)
{
        struct header *head = pool;
        unsigned int pgnum, sp_bits;
        unsigned long offset = (char *)p - (char *)pool;
        struct page_header *ph;

        if (poolsize < MIN_USEFUL_SIZE)
                return tiny_alloc_size(pool, poolsize, p);

        /* Get page header. */
        sp_bits = small_page_bits(poolsize);
        pgnum = offset >> sp_bits;

        /* Big page? Round down further. */
        if (test_bit(head->pagesize, pgnum >> BITS_FROM_SMALL_TO_LARGE_PAGE))
                pgnum &= ~(SMALL_PAGES_PER_LARGE_PAGE - 1);

        /* Step back to page header. */
        ph = from_pgnum(head, pgnum, sp_bits);
        if ((void *)ph == p)
                return huge_size(head, p);

        return bucket_to_size(ph->bucket);
}

/* Useful for gdb breakpoints. */
static bool check_fail(void)
{
        return false;
}

static unsigned long count_bits(const unsigned long bitmap[],
                                unsigned long limit)
{
        unsigned long i, count = 0;

        while (limit >= BITS_PER_LONG) {
                count += popcount(bitmap[0]);
                bitmap++;
                limit -= BITS_PER_LONG;
        }

        for (i = 0; i < limit; i++)
                if (test_bit(bitmap, i))
                        count++;
        return count;
}

static bool out_of_bounds(unsigned long pgnum,
                          unsigned int sp_bits,
                          unsigned long pagesize,
                          unsigned long poolsize)
{
        if (((pgnum << sp_bits) >> sp_bits) != pgnum)
                return true;

        if ((pgnum << sp_bits) > poolsize)
                return true;

        return ((pgnum << sp_bits) + pagesize > poolsize);
}

static bool check_bucket(struct header *head,
                         unsigned long poolsize,
                         unsigned long pages[],
                         struct bucket_state *bs,
                         unsigned int bindex)
{
        bool lp_bucket;
        struct page_header *ph;
        unsigned long taken, i, prev, pagesize, sp_bits, lp_bits;

        sp_bits = small_page_bits(poolsize);
        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        lp_bucket = large_page_bucket(bindex, sp_bits);

        pagesize = 1UL << (lp_bucket ? lp_bits : sp_bits);

        /* This many elements fit? */
        taken = page_header_size(bindex / INTER_BUCKET_SPACE,
                                 bs->elements_per_page);
        taken += bucket_to_size(bindex) * bs->elements_per_page;
        if (taken > pagesize)
                return check_fail();

        /* One more wouldn't fit? */
        taken = page_header_size(bindex / INTER_BUCKET_SPACE,
                                 bs->elements_per_page + 1);
        taken += bucket_to_size(bindex) * (bs->elements_per_page + 1);
        if (taken <= pagesize)
                return check_fail();

        /* Walk used list. */
        prev = 0;
        for (i = bs->page_list; i; i = ph->next) {
                /* Bad pointer? */
                if (out_of_bounds(i, sp_bits, pagesize, poolsize))
                        return check_fail();
                /* Wrong size page? */
                if (!!test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE)
                    != lp_bucket)
                        return check_fail();
                /* Large page not on boundary? */
                if (lp_bucket && (i % SMALL_PAGES_PER_LARGE_PAGE) != 0)
                        return check_fail();
                ph = from_pgnum(head, i, sp_bits);
                /* Linked list corrupt? */
                if (ph->prev != prev)
                        return check_fail();
                /* Already seen this page? */
                if (test_bit(pages, i))
                        return check_fail();
                set_bit(pages, i);
                /* Empty or full? */
                if (ph->elements_used == 0)
                        return check_fail();
                if (ph->elements_used >= bs->elements_per_page)
                        return check_fail();
                /* Used bits don't agree? */
                if (ph->elements_used != count_bits(ph->used,
                                                    bs->elements_per_page))
                        return check_fail();
                /* Wrong bucket? */
                if (ph->bucket != bindex)
                        return check_fail();
                prev = i;
        }

        /* Walk full list. */
        prev = 0;
        for (i = bs->full_list; i; i = ph->next) {
                /* Bad pointer? */
                if (out_of_bounds(i, sp_bits, pagesize, poolsize))
                        return check_fail();
                /* Wrong size page? */
                if (!!test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE)
                    != lp_bucket)
                        return check_fail();
                /* Large page not on boundary? */
                if (lp_bucket && (i % SMALL_PAGES_PER_LARGE_PAGE) != 0)
                        return check_fail();
                ph = from_pgnum(head, i, sp_bits);
                /* Linked list corrupt? */
                if (ph->prev != prev)
                        return check_fail();
                /* Already seen this page? */
                if (test_bit(pages, i))
                        return check_fail();
                set_bit(pages, i);
                /* Not full? */
                if (ph->elements_used != bs->elements_per_page)
                        return check_fail();
                /* Used bits don't agree? */
                if (ph->elements_used != count_bits(ph->used,
                                                    bs->elements_per_page))
                        return check_fail();
                /* Wrong bucket? */
                if (ph->bucket != bindex)
                        return check_fail();
                prev = i;
        }
        return true;
}

bool alloc_check(void *pool, unsigned long poolsize)
{
        struct header *head = pool;
        unsigned long prev, i, lp_bits, sp_bits, header_size, num_buckets;
        struct page_header *ph;
        struct huge_alloc *ha;
        unsigned long pages[MAX_SMALL_PAGES / BITS_PER_LONG] = { 0 };

        if (poolsize < MIN_USEFUL_SIZE)
                return tiny_alloc_check(pool, poolsize);

        sp_bits = small_page_bits(poolsize);
        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        num_buckets = max_bucket(lp_bits);

        header_size = sizeof(*head) + sizeof(head->bs) * (num_buckets-1);

        /* First, set all bits taken by header. */
        for (i = 0; i < header_size; i += (1UL << sp_bits))
                set_bit(pages, i >> sp_bits);

        /* Check small page free list. */
        prev = 0;
        for (i = head->small_free_list; i; i = ph->next) {
                /* Bad pointer? */
                if (out_of_bounds(i, sp_bits, 1UL << sp_bits, poolsize))
                        return check_fail();
                /* Large page? */
                if (test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE))
                        return check_fail();
                ph = from_pgnum(head, i, sp_bits);
                /* Linked list corrupt? */
                if (ph->prev != prev)
                        return check_fail();
                /* Already seen this page? */
                if (test_bit(pages, i))
                        return check_fail();
                set_bit(pages, i);
                prev = i;
        }

        /* Check large page free list. */
        prev = 0;
        for (i = head->large_free_list; i; i = ph->next) {
                /* Bad pointer? */
                if (out_of_bounds(i, sp_bits, 1UL << lp_bits, poolsize))
                        return check_fail();
                /* Not large page? */
                if (!test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE))
                        return check_fail();
                /* Not page boundary? */
                if ((i % SMALL_PAGES_PER_LARGE_PAGE) != 0)
                        return check_fail();
                ph = from_pgnum(head, i, sp_bits);
                /* Linked list corrupt? */
                if (ph->prev != prev)
                        return check_fail();
                /* Already seen this page? */
                if (test_bit(pages, i))
                        return check_fail();
                set_bit(pages, i);
                prev = i;
        }

        /* Check the buckets. */
        for (i = 0; i < max_bucket(lp_bits); i++) {
                struct bucket_state *bs = &head->bs[i];

                if (!check_bucket(head, poolsize, pages, bs, i))
                        return false;
        }

        /* Check the huge alloc list. */
        prev = 0;
        for (i = head->huge; i; i = ha->next) {
                unsigned long pgbits, j;

                /* Bad pointer? */
                if (i >= poolsize || i + sizeof(*ha) > poolsize)
                        return check_fail();
                ha = (void *)((char *)head + i);

                /* Check contents of ha. */
                if (ha->off > poolsize || ha->off + ha->len > poolsize)
                        return check_fail();

                /* Large or small page? */
                pgbits = test_bit(head->pagesize, ha->off >> lp_bits)
                        ? lp_bits : sp_bits;

                /* Not page boundary? */
                if ((ha->off % (1UL << pgbits)) != 0)
                        return check_fail();

                /* Not page length? */
                if ((ha->len % (1UL << pgbits)) != 0)
                        return check_fail();

                /* Linked list corrupt? */
                if (ha->prev != prev)
                        return check_fail();

                for (j = ha->off; j < ha->off + ha->len; j += (1UL<<sp_bits)) {
                        /* Already seen this page? */
                        if (test_bit(pages, j >> sp_bits))
                                return check_fail();
                        set_bit(pages, j >> sp_bits);
                }

                prev = i;
        }

        /* Make sure every page accounted for. */
        for (i = 0; i < poolsize >> sp_bits; i++) {
                if (!test_bit(pages, i))
                        return check_fail();
                if (test_bit(head->pagesize,
                             i >> BITS_FROM_SMALL_TO_LARGE_PAGE)) {
                        /* Large page, skip rest. */
                        i += SMALL_PAGES_PER_LARGE_PAGE - 1;
                }
        }

        return true;
}

static unsigned long print_overhead(FILE *out, const char *desc,
                                    unsigned long bytes,
                                    unsigned long poolsize)
{
        fprintf(out, "Overhead (%s): %lu bytes (%.3g%%)\n",
                desc, bytes, 100.0 * bytes / poolsize);
        return bytes;
}

static unsigned long count_list(struct header *head,
                                u16 pgnum,
                                unsigned int sp_bits,
                                unsigned long *total_elems)
{
        struct page_header *p;
        unsigned long ret = 0;

        while (pgnum) {
                p = from_pgnum(head, pgnum, sp_bits);
                if (total_elems)
                        (*total_elems) += p->elements_used;
                ret++;
                pgnum = p->next;
        }
        return ret;
}

static unsigned long visualize_bucket(FILE *out, struct header *head,
                                      unsigned int bucket,
                                      unsigned long poolsize,
                                      unsigned int sp_bits)
{
        unsigned long num_full, num_partial, num_pages, page_size,
                elems, hdr_min, hdr_size, elems_per_page, overhead = 0;

        elems_per_page = head->bs[bucket].elements_per_page;

        /* If we used byte-based bitmaps, we could get pg hdr to: */
        hdr_min = sizeof(struct page_header)
                - sizeof(((struct page_header *)0)->used)
                + align_up(elems_per_page, CHAR_BIT) / CHAR_BIT;
        hdr_size = page_header_size(bucket / INTER_BUCKET_SPACE,
                                    elems_per_page);

        elems = 0;
        num_full = count_list(head, head->bs[bucket].full_list, sp_bits,
                              &elems);
        num_partial = count_list(head, head->bs[bucket].page_list, sp_bits,
                                 &elems);
        num_pages = num_full + num_partial;
        if (!num_pages)
                return 0;

        fprintf(out, "Bucket %u (%lu bytes):"
                " %lu full, %lu partial = %lu elements\n",
                bucket, bucket_to_size(bucket), num_full, num_partial, elems);
        /* Strict requirement of page header size. */
        overhead += print_overhead(out, "page headers",
                                   hdr_min * num_pages, poolsize);
        /* Gap between minimal page header and actual start. */
        overhead += print_overhead(out, "page post-header alignments",
                                   (hdr_size - hdr_min) * num_pages, poolsize);
        /* Between last element and end of page. */
        page_size = (1UL << sp_bits);
        if (large_page_bucket(bucket, sp_bits))
                page_size <<= BITS_FROM_SMALL_TO_LARGE_PAGE;

        overhead += print_overhead(out, "page tails",
                                   (page_size - (hdr_size
                                                 + (elems_per_page
                                                    * bucket_to_size(bucket))))
                                   * num_pages, poolsize);
        return overhead;
}

void alloc_visualize(FILE *out, void *pool, unsigned long poolsize)
{
        struct header *head = pool;
        unsigned long i, lp_bits, sp_bits, header_size, num_buckets, count,
                overhead = 0;

        fprintf(out, "Pool %p size %lu: (%s allocator)\n", pool, poolsize,
                poolsize < MIN_USEFUL_SIZE ? "tiny" : "standard");

        if (poolsize < MIN_USEFUL_SIZE) {
                tiny_alloc_visualize(out, pool, poolsize);
                return;
        }

        sp_bits = small_page_bits(poolsize);
        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        num_buckets = max_bucket(lp_bits);
        header_size = sizeof(*head) + sizeof(head->bs) * (num_buckets-1);

        fprintf(out, "Large page size %lu, small page size %lu.\n",
                1UL << lp_bits, 1UL << sp_bits);
        overhead += print_overhead(out, "unused pool tail",
                                   poolsize % (1UL << lp_bits), poolsize);
        fprintf(out, "Main header %lu bytes (%lu small pages).\n",
                header_size, align_up(header_size, 1UL << sp_bits) >> sp_bits);
        overhead += print_overhead(out, "partial header page",
                                   align_up(header_size, 1UL << sp_bits)
                                   - header_size, poolsize);
        /* Total large pages. */
        i = count_bits(head->pagesize, poolsize >> lp_bits);
        /* Used pages. */
        count = i - count_list(head, head->large_free_list, sp_bits, NULL);
        fprintf(out, "%lu/%lu large pages used (%.3g%%)\n",
                count, i, count ? 100.0 * count / i : 0.0);

        /* Total small pages. */
        i = ((poolsize >> lp_bits) - i) << BITS_FROM_SMALL_TO_LARGE_PAGE;
        /* Used pages */
        count = i - count_list(head, head->small_free_list, sp_bits, NULL);
        fprintf(out, "%lu/%lu small pages used (%.3g%%)\n",
                count, i, count ? 100.0 * count / i : 0.0);

        /* Summary of each bucket. */
        fprintf(out, "%lu buckets:\n", num_buckets);
        for (i = 0; i < num_buckets; i++)
                overhead += visualize_bucket(out, head, i, poolsize, sp_bits);

        print_overhead(out, "total", overhead, poolsize);
}