#include <unistd.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include <stdlib.h>
#include "alloc.h"
#include "bitops.h"
#include "tiny.h"
#include <ccan/build_assert/build_assert.h>
#include <ccan/likely/likely.h>
#include <ccan/alignof/alignof.h>
#include <ccan/short_types/short_types.h>
#include "config.h"

/*
   Inspired by (and parts taken from) Andrew Tridgell's alloc_mmap:
   http://samba.org/~tridge/junkcode/alloc_mmap/

   Copyright (C) Andrew Tridgell 2007

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/* We divide the pool into this many large pages (nearest power of 2) */
#define MAX_LARGE_PAGES (1024UL)

/* 32 small pages == 1 large page. */
#define BITS_FROM_SMALL_TO_LARGE_PAGE 5

#define MAX_SMALL_PAGES (MAX_LARGE_PAGES << BITS_FROM_SMALL_TO_LARGE_PAGE)

/* Smallest pool size for this scheme: 128-byte small pages.  That's
 * 9/13% overhead for 32/64 bit. */
#define MIN_USEFUL_SIZE (MAX_SMALL_PAGES * 128)
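
/* eg. MAX_SMALL_PAGES = 1024 << 5 = 32768 small pages, so
 * MIN_USEFUL_SIZE = 32768 * 128 = 4194304 bytes (4MB); anything
 * smaller falls back to the tiny allocator. */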

/* Every 4 buckets, we jump up a power of 2. ...8 10 12 14 16 20 24 28 32... */
#define INTER_BUCKET_SPACE 4

#define SMALL_PAGES_PER_LARGE_PAGE (1 << BITS_FROM_SMALL_TO_LARGE_PAGE)

/* FIXME: Figure this out properly. */
#define MAX_SIZE (1 << 30)

/* How few objects fit in a page before we use a larger one? (8) */
#define MAX_PAGE_OBJECT_ORDER   3

#define BITS_PER_LONG (sizeof(long) * CHAR_BIT)

struct bucket_state {
        u32 elements_per_page;
        u16 page_list;
        u16 full_list;
};

struct header {
        /* Bitmap of which pages are large. */
        unsigned long pagesize[MAX_LARGE_PAGES / BITS_PER_LONG];

        /* List of unused small/large pages. */
        u16 small_free_list;
        u16 large_free_list;

        /* List of huge allocs. */
        unsigned long huge;

        /* Variable-length: one bucket_state per bucket, INTER_BUCKET_SPACE of them per power of 2. */
        struct bucket_state bs[1];
};

struct huge_alloc {
        unsigned long next, prev;
        unsigned long off, len;
};

struct page_header {
        u16 next, prev;
        /* FIXME: We can just count all-0 and all-1 used[] elements. */
        unsigned elements_used : 25;
        unsigned bucket : 7;
        unsigned long used[1]; /* One bit per element. */
};

/*
 * Every 4 buckets, the size doubles.
 * Between buckets, sizes increase linearly.
 *
 * eg. bucket 40 = 2^10                 = 1024
 *     bucket 41 = 2^10 + 1 * 2^10/4    = 1024 + 256
 *     bucket 42 = 2^10 + 2 * 2^10/4    = 1024 + 512
 *     bucket 43 = 2^10 + 3 * 2^10/4    = 1024 + 768
 *     bucket 44 = 2^11                 = 2048
 *
 * Care is taken to handle low numbered buckets, at cost of overflow.
 */
static unsigned long bucket_to_size(unsigned int bucket)
{
        /* 1UL: shifts must be done in unsigned long, not int. */
        unsigned long base = 1UL << (bucket / INTER_BUCKET_SPACE);
        return base + ((unsigned long)(bucket % INTER_BUCKET_SPACE)
                       << (bucket / INTER_BUCKET_SPACE))
                / INTER_BUCKET_SPACE;
}
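
/* eg. bucket 42: base = 1 << (42/4) = 1024, 42 % 4 = 2, so
 * bucket_to_size(42) = 1024 + 2*1024/4 = 1536.  Four buckets later,
 * bucket_to_size(44) = 1UL << 11 = 2048: the size has doubled. */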

/*
 * Say size is 10.
 *   fls(size/2) == 3.  1 << 3 == 8, so we're 2 too large, out of a possible
 * 8 too large.  That's 1/4 of the way to the next power of 2 == 1 bucket.
 *
 * We make sure we round up.  Note that this fails on 32 bit at size
 * 1879048193 (around bucket 120).
 */
static unsigned int size_to_bucket(unsigned long size)
{
        unsigned int base = fls(size/2);
        unsigned long overshoot;

        overshoot = size - (1UL << base);
        return base * INTER_BUCKET_SPACE
                + ((overshoot * INTER_BUCKET_SPACE + (1UL << base)-1) >> base);
}
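
/* eg. size 10: base = fls(5) = 3, overshoot = 10 - 8 = 2, so bucket
 * = 3*4 + (2*4 + 7)/8 = 13.  Round trip: bucket_to_size(13)
 * = 8 + 1*8/4 = 10. */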

static unsigned int small_page_bits(unsigned long poolsize)
{
        return fls(poolsize / MAX_SMALL_PAGES / 2);
}

static struct page_header *from_pgnum(struct header *head,
                                      unsigned long pgnum,
                                      unsigned sp_bits)
{
        return (struct page_header *)((char *)head + (pgnum << sp_bits));
}

static u16 to_pgnum(struct header *head, void *p, unsigned sp_bits)
{
        return ((char *)p - (char *)head) >> sp_bits;
}

static size_t used_size(unsigned int num_elements)
{
        return align_up(num_elements, BITS_PER_LONG) / CHAR_BIT;
}
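
/* eg. 100 elements: align_up(100, 64) = 128 bits on 64 bit, so the
 * used[] bitmap occupies 128 / 8 = 16 bytes. */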

/*
 * We always align the first entry to the lower power of 2.
 * eg. the 12-byte bucket gets 8-byte aligned.  The 4096-byte bucket
 * gets 4096-byte aligned.
 */
static unsigned long page_header_size(unsigned int align_bits,
                                      unsigned long num_elements)
{
        unsigned long size;

        size = sizeof(struct page_header)
                - sizeof(((struct page_header *)0)->used)
                + used_size(num_elements);
        return align_up(size, 1 << align_bits);
}
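
/* eg. the 12-byte bucket is bucket 14, so align_bits = 14/4 = 3.  On a
 * typical 64-bit ABI the fixed part of struct page_header is 8 bytes,
 * so for 100 elements: align_up(8 + 16, 1 << 3) = 24 bytes, leaving
 * the first element 8-byte aligned. */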

static void add_to_list(struct header *head,
                        u16 *list, struct page_header *ph, unsigned sp_bits)
{
        unsigned long h = *list, offset = to_pgnum(head, ph, sp_bits);

        ph->next = h;
        if (h) {
                struct page_header *prev = from_pgnum(head, h, sp_bits);
                assert(prev->prev == 0);
                prev->prev = offset;
        }
        *list = offset;
        ph->prev = 0;
}

static void del_from_list(struct header *head,
                          u16 *list, struct page_header *ph, unsigned sp_bits)
{
        /* Front of list? */
        if (ph->prev == 0) {
                *list = ph->next;
        } else {
                struct page_header *prev = from_pgnum(head, ph->prev, sp_bits);
                prev->next = ph->next;
        }
        if (ph->next != 0) {
                struct page_header *next = from_pgnum(head, ph->next, sp_bits);
                next->prev = ph->prev;
        }
}

static u16 pop_from_list(struct header *head,
                         u16 *list,
                         unsigned int sp_bits)
{
        u16 h = *list;
        struct page_header *ph = from_pgnum(head, h, sp_bits);

        if (likely(h)) {
                *list = ph->next;
                if (*list)
                        from_pgnum(head, *list, sp_bits)->prev = 0;
        }
        return h;
}

static void add_to_huge_list(struct header *head, struct huge_alloc *ha)
{
        unsigned long h = head->huge;
        unsigned long offset = (char *)ha - (char *)head;

        ha->next = h;
        if (h) {
                struct huge_alloc *prev = (void *)((char *)head + h);
                assert(prev->prev == 0);
                prev->prev = offset;
        }
        head->huge = offset;
        ha->prev = 0;
}

static void del_from_huge(struct header *head, struct huge_alloc *ha)
{
        /* Front of list? */
        if (ha->prev == 0) {
                head->huge = ha->next;
        } else {
                struct huge_alloc *prev = (void *)((char *)head + ha->prev);
                prev->next = ha->next;
        }
        if (ha->next != 0) {
                struct huge_alloc *next = (void *)((char *)head + ha->next);
                next->prev = ha->prev;
        }
}

static void add_small_page_to_freelist(struct header *head,
                                       struct page_header *ph,
                                       unsigned int sp_bits)
{
        add_to_list(head, &head->small_free_list, ph, sp_bits);
}

static void add_large_page_to_freelist(struct header *head,
                                       struct page_header *ph,
                                       unsigned int sp_bits)
{
        add_to_list(head, &head->large_free_list, ph, sp_bits);
}

static void add_to_bucket_list(struct header *head,
                               struct bucket_state *bs,
                               struct page_header *ph,
                               unsigned int sp_bits)
{
        add_to_list(head, &bs->page_list, ph, sp_bits);
}

static void del_from_bucket_list(struct header *head,
                                 struct bucket_state *bs,
                                 struct page_header *ph,
                                 unsigned int sp_bits)
{
        del_from_list(head, &bs->page_list, ph, sp_bits);
}

static void del_from_bucket_full_list(struct header *head,
                                      struct bucket_state *bs,
                                      struct page_header *ph,
                                      unsigned int sp_bits)
{
        del_from_list(head, &bs->full_list, ph, sp_bits);
}

static void add_to_bucket_full_list(struct header *head,
                                    struct bucket_state *bs,
                                    struct page_header *ph,
                                    unsigned int sp_bits)
{
        add_to_list(head, &bs->full_list, ph, sp_bits);
}

static void clear_bit(unsigned long bitmap[], unsigned int off)
{
        /* 1UL: off % BITS_PER_LONG can exceed 31, so shift in long. */
        bitmap[off / BITS_PER_LONG] &= ~(1UL << (off % BITS_PER_LONG));
}

static bool test_bit(const unsigned long bitmap[], unsigned int off)
{
        return bitmap[off / BITS_PER_LONG] & (1UL << (off % BITS_PER_LONG));
}

static void set_bit(unsigned long bitmap[], unsigned int off)
{
        bitmap[off / BITS_PER_LONG] |= (1UL << (off % BITS_PER_LONG));
}

/* There must be a bit to be found. */
static unsigned int find_free_bit(const unsigned long bitmap[])
{
        unsigned int i;

        for (i = 0; bitmap[i] == -1UL; i++);
        return (i*BITS_PER_LONG) + ffsl(~bitmap[i]) - 1;
}

/* How many elements can we fit in a page? */
static unsigned long elements_per_page(unsigned long align_bits,
                                       unsigned long esize,
                                       unsigned long psize)
{
        unsigned long num, overhead;

        /* First approximation: no extra room for bitmap. */
        overhead = align_up(sizeof(struct page_header), 1 << align_bits);
        num = (psize - overhead) / esize;

        while (page_header_size(align_bits, num) + esize * num > psize)
                num--;
        return num;
}
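
/* eg. align_bits = 3, esize = 16, psize = 512 on 64 bit: overhead =
 * align_up(16, 8) = 16, so the first guess is (512 - 16)/16 = 31.
 * page_header_size(3, 31) = 16, and 16 + 16*31 = 512 fits exactly, so
 * 31 elements it is. */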

static bool large_page_bucket(unsigned int bucket, unsigned int sp_bits)
{
        unsigned long max_smallsize;

        /* Note: this doesn't take into account page header. */
        max_smallsize = (1UL << sp_bits) >> MAX_PAGE_OBJECT_ORDER;

        return bucket_to_size(bucket) > max_smallsize;
}

static unsigned int max_bucket(unsigned int lp_bits)
{
        return (lp_bits - MAX_PAGE_OBJECT_ORDER) * INTER_BUCKET_SPACE;
}
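
/* eg. 128KB large pages (lp_bits = 17): max_bucket = (17 - 3) * 4 = 56,
 * so the largest bucketed size is bucket_to_size(55) = 14336 bytes.
 * Larger requests go through huge_alloc() below, keeping roughly
 * 2^MAX_PAGE_OBJECT_ORDER = 8 objects per large page. */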

void alloc_init(void *pool, unsigned long poolsize)
{
        struct header *head = pool;
        struct page_header *ph;
        unsigned int lp_bits, sp_bits, num_buckets;
        unsigned long header_size, i;

        if (poolsize < MIN_USEFUL_SIZE) {
                tiny_alloc_init(pool, poolsize);
                return;
        }

        /* We rely on page numbers fitting in 16 bit. */
        BUILD_ASSERT(MAX_SMALL_PAGES < 65536);

        sp_bits = small_page_bits(poolsize);
        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        num_buckets = max_bucket(lp_bits);

        head = pool;
        header_size = sizeof(*head) + sizeof(head->bs) * (num_buckets-1);

        memset(head, 0, header_size);
        for (i = 0; i < num_buckets; i++) {
                unsigned long pagesize;

                if (large_page_bucket(i, sp_bits))
                        pagesize = 1UL << lp_bits;
                else
                        pagesize = 1UL << sp_bits;

                head->bs[i].elements_per_page
                        = elements_per_page(i / INTER_BUCKET_SPACE,
                                            bucket_to_size(i),
                                            pagesize);
        }

        /* They start as all large pages. */
        memset(head->pagesize, 0xFF, sizeof(head->pagesize));
        /* FIXME: small pages for last bit? */

        /* Split first page into small pages. */
        assert(header_size < (1UL << lp_bits));
        clear_bit(head->pagesize, 0);

        /* Skip over page(s) used by header, add rest to free list */
        for (i = align_up(header_size, (1 << sp_bits)) >> sp_bits;
             i < SMALL_PAGES_PER_LARGE_PAGE;
             i++) {
                ph = from_pgnum(head, i, sp_bits);
                ph->elements_used = 0;
                add_small_page_to_freelist(head, ph, sp_bits);
        }

        /* Add the rest of the pages as large pages. */
        i = SMALL_PAGES_PER_LARGE_PAGE;
        while ((i << sp_bits) + (1 << lp_bits) <= poolsize) {
                ph = from_pgnum(head, i, sp_bits);
                ph->elements_used = 0;
                add_large_page_to_freelist(head, ph, sp_bits);
                i += SMALL_PAGES_PER_LARGE_PAGE;
        }
}

/* A large page worth of small pages is free: delete them from the free list. */
static void del_large_from_small_free_list(struct header *head,
                                           struct page_header *ph,
                                           unsigned int sp_bits)
{
        unsigned long i;

        for (i = 0; i < SMALL_PAGES_PER_LARGE_PAGE; i++) {
                del_from_list(head, &head->small_free_list,
                              (struct page_header *)
                              ((char *)ph + (i << sp_bits)),
                              sp_bits);
        }
}

static bool all_empty(struct header *head,
                      unsigned long pgnum,
                      unsigned sp_bits)
{
        unsigned long i;

        for (i = 0; i < SMALL_PAGES_PER_LARGE_PAGE; i++) {
                struct page_header *ph = from_pgnum(head, pgnum + i, sp_bits);
                if (ph->elements_used)
                        return false;
        }
        return true;
}

static void recombine_small_pages(struct header *head, unsigned long poolsize,
                                  unsigned int sp_bits)
{
        unsigned long i;
        unsigned int lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        /* Look for small pages to coalesce, after first large page. */
        for (i = SMALL_PAGES_PER_LARGE_PAGE;
             i < (poolsize >> lp_bits) << BITS_FROM_SMALL_TO_LARGE_PAGE;
             i += SMALL_PAGES_PER_LARGE_PAGE) {
                /* Already a large page? */
                if (test_bit(head->pagesize, i / SMALL_PAGES_PER_LARGE_PAGE))
                        continue;
                if (all_empty(head, i, sp_bits)) {
                        struct page_header *ph = from_pgnum(head, i, sp_bits);
                        set_bit(head->pagesize,
                                i / SMALL_PAGES_PER_LARGE_PAGE);
                        del_large_from_small_free_list(head, ph, sp_bits);
                        add_large_page_to_freelist(head, ph, sp_bits);
                }
        }
}

static u16 get_large_page(struct header *head, unsigned long poolsize,
                          unsigned int sp_bits)
{
        unsigned int lp_bits, page;

        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        page = pop_from_list(head, &head->large_free_list, sp_bits);
        if (likely(page))
                return page;

        recombine_small_pages(head, poolsize, sp_bits);

        return pop_from_list(head, &head->large_free_list, sp_bits);
}

/* Returns small page. */
static unsigned long break_up_large_page(struct header *head,
                                         unsigned int sp_bits,
                                         u16 lpage)
{
        unsigned int i;

        clear_bit(head->pagesize, lpage >> BITS_FROM_SMALL_TO_LARGE_PAGE);

        for (i = 1; i < SMALL_PAGES_PER_LARGE_PAGE; i++) {
                struct page_header *ph = from_pgnum(head, lpage + i, sp_bits);
                add_small_page_to_freelist(head, ph, sp_bits);
        }

        return lpage;
}

static u16 get_small_page(struct header *head, unsigned long poolsize,
                          unsigned int sp_bits)
{
        u16 ret;

        ret = pop_from_list(head, &head->small_free_list, sp_bits);
        if (likely(ret))
                return ret;
        ret = get_large_page(head, poolsize, sp_bits);
        if (likely(ret))
                ret = break_up_large_page(head, sp_bits, ret);
        return ret;
}

void where_is_page(struct header *head, struct page_header *where,
                   unsigned int sp_bits)
{
        struct page_header *pg;
        unsigned long off, bucket,
                num_buckets = max_bucket(sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE);

        for (off = head->small_free_list; off; off = pg->next) {
                pg = from_pgnum(head, off, sp_bits);
                if (pg == where) {
                        printf("It's in the small free list\n");
                        return;
                }
        }

        for (off = head->large_free_list; off; off = pg->next) {
                pg = from_pgnum(head, off, sp_bits);
                if (pg == where) {
                        printf("It's in the large free list\n");
                        return;
                }
        }

        for (bucket = 0; bucket < num_buckets; bucket++) {
                for (off = head->bs[bucket].page_list; off; off = pg->next) {
                        pg = from_pgnum(head, off, sp_bits);
                        if (pg == where) {
                                printf("It's in bucket %lu's page list\n", bucket);
                                return;
                        }
                }

                for (off = head->bs[bucket].full_list; off; off = pg->next) {
                        pg = from_pgnum(head, off, sp_bits);
                        if (pg == where) {
                                printf("It's in bucket %lu's full list\n", bucket);
                                return;
                        }
                }
        }
        printf("It's nowhere!\n");
}

static bool huge_allocated(struct header *head, unsigned long offset)
{
        unsigned long i;
        struct huge_alloc *ha;

        for (i = head->huge; i; i = ha->next) {
                ha = (void *)((char *)head + i);
                if (ha->off <= offset && ha->off + ha->len > offset)
                        return true;
        }
        return false;
}

/* They want something really big.  Aim for contiguous pages (slow). */
static void *unlikely_func huge_alloc(void *pool, unsigned long poolsize,
                                      unsigned long size, unsigned long align)
{
        struct header *head = pool;
        struct huge_alloc *ha;
        unsigned long i, sp_bits, lp_bits, num, header_size;

        sp_bits = small_page_bits(poolsize);
        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        /* Allocate tracking structure optimistically. */
        ha = alloc_get(pool, poolsize, sizeof(*ha), ALIGNOF(*ha));
        if (!ha)
                return NULL;

        /* First search for contiguous small pages... */
        header_size = sizeof(*head) + sizeof(head->bs) * (max_bucket(lp_bits)-1);

        num = 0;
        for (i = (header_size + (1 << sp_bits) - 1) >> sp_bits;
             i << sp_bits < poolsize;
             i++) {
                struct page_header *pg;
                unsigned long off = (i << sp_bits);

                /* Skip over large pages. */
                if (test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE)) {
                        i += (1 << BITS_FROM_SMALL_TO_LARGE_PAGE)-1;
                        continue;
                }

                /* Does this page meet alignment requirements? */
                if (!num && off % align != 0)
                        continue;

                /* FIXME: This makes us O(n^2). */
                if (huge_allocated(head, off)) {
                        num = 0;
                        continue;
                }

                pg = (struct page_header *)((char *)head + off);
                if (pg->elements_used) {
                        num = 0;
                        continue;
                }

                num++;
                if (num << sp_bits >= size) {
                        unsigned long pgnum;

                        /* Remove from free list. */
                        for (pgnum = i; pgnum > i - num; pgnum--) {
                                pg = from_pgnum(head, pgnum, sp_bits);
                                del_from_list(head,
                                              &head->small_free_list,
                                              pg, sp_bits);
                        }
                        ha->off = (i - num + 1) << sp_bits;
                        ha->len = num << sp_bits;
                        goto done;
                }
        }

        /* Now search for large pages... */
        recombine_small_pages(head, poolsize, sp_bits);

        num = 0;
        for (i = (header_size + (1 << lp_bits) - 1) >> lp_bits;
             (i << lp_bits) < poolsize; i++) {
                struct page_header *pg;
                unsigned long off = (i << lp_bits);

                /* Ignore small pages. */
                if (!test_bit(head->pagesize, i))
                        continue;

                /* Does this page meet alignment requirements? */
                if (!num && off % align != 0)
                        continue;

                /* FIXME: This makes us O(n^2). */
                if (huge_allocated(head, off)) {
                        num = 0;
                        continue;
                }

                pg = (struct page_header *)((char *)head + off);
                if (pg->elements_used) {
                        num = 0;
                        continue;
                }

                num++;
                if (num << lp_bits >= size) {
                        unsigned long pgnum;

                        /* Remove from free list. */
                        for (pgnum = i; pgnum > i - num; pgnum--) {
                                pg = from_pgnum(head, pgnum, lp_bits);
                                del_from_list(head,
                                              &head->large_free_list,
                                              pg, sp_bits);
                        }
                        ha->off = (i - num + 1) << lp_bits;
                        ha->len = num << lp_bits;
                        goto done;
                }
        }

        /* Unable to satisfy: free huge alloc structure. */
        alloc_free(pool, poolsize, ha);
        return NULL;

done:
        add_to_huge_list(pool, ha);
        return (char *)pool + ha->off;
}

static void unlikely_func huge_free(struct header *head,
                                    unsigned long poolsize, void *free)
{
        unsigned long i, off, pgnum, free_off = (char *)free - (char *)head;
        unsigned int sp_bits, lp_bits;
        struct huge_alloc *ha;

        for (i = head->huge; i; i = ha->next) {
                ha = (void *)((char *)head + i);
                if (free_off == ha->off)
                        break;
        }
        assert(i);

        /* Free up all the pages, delete and free ha */
        sp_bits = small_page_bits(poolsize);
        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;
        pgnum = free_off >> sp_bits;

        if (test_bit(head->pagesize, pgnum >> BITS_FROM_SMALL_TO_LARGE_PAGE)) {
                for (off = ha->off; off < ha->off + ha->len; off += 1 << lp_bits) {
                        add_large_page_to_freelist(head,
                                                   (void *)((char *)head + off),
                                                   sp_bits);
                }
        } else {
                for (off = ha->off; off < ha->off + ha->len; off += 1 << sp_bits) {
                        add_small_page_to_freelist(head,
                                                   (void *)((char *)head + off),
                                                   sp_bits);
                }
        }
        del_from_huge(head, ha);
        alloc_free(head, poolsize, ha);
}

static unsigned long unlikely_func huge_size(struct header *head, void *p)
{
        unsigned long i, off = (char *)p - (char *)head;
        struct huge_alloc *ha;

        for (i = head->huge; i; i = ha->next) {
                ha = (void *)((char *)head + i);
                if (off == ha->off) {
                        return ha->len;
                }
        }
        abort();
}

void *alloc_get(void *pool, unsigned long poolsize,
                unsigned long size, unsigned long align)
{
        struct header *head = pool;
        unsigned int bucket;
        unsigned long i;
        struct bucket_state *bs;
        struct page_header *ph;
        unsigned int sp_bits;

        if (poolsize < MIN_USEFUL_SIZE) {
                return tiny_alloc_get(pool, poolsize, size, align);
        }

        size = align_up(size, align);
        if (unlikely(!size))
                size = 1;
        bucket = size_to_bucket(size);

        sp_bits = small_page_bits(poolsize);

        if (bucket >= max_bucket(sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE)) {
                return huge_alloc(pool, poolsize, size, align);
        }

        bs = &head->bs[bucket];

        if (!bs->page_list) {
                struct page_header *ph;

                if (large_page_bucket(bucket, sp_bits))
                        bs->page_list = get_large_page(head, poolsize,
                                                       sp_bits);
                else
                        bs->page_list = get_small_page(head, poolsize,
                                                       sp_bits);
                /* FIXME: Try large-aligned alloc?  Header stuffing? */
                if (unlikely(!bs->page_list))
                        return NULL;
                ph = from_pgnum(head, bs->page_list, sp_bits);
                ph->bucket = bucket;
                ph->elements_used = 0;
                ph->next = 0;
                memset(ph->used, 0, used_size(bs->elements_per_page));
        }

        ph = from_pgnum(head, bs->page_list, sp_bits);

        i = find_free_bit(ph->used);
        set_bit(ph->used, i);
        ph->elements_used++;

        /* check if this page is now full */
        if (unlikely(ph->elements_used == bs->elements_per_page)) {
                del_from_bucket_list(head, bs, ph, sp_bits);
                add_to_bucket_full_list(head, bs, ph, sp_bits);
        }

        return (char *)ph + page_header_size(ph->bucket / INTER_BUCKET_SPACE,
                                             bs->elements_per_page)
               + i * bucket_to_size(bucket);
}

void alloc_free(void *pool, unsigned long poolsize, void *free)
{
        struct header *head = pool;
        struct bucket_state *bs;
        unsigned int sp_bits;
        unsigned long i, pgnum, pgoffset, offset = (char *)free - (char *)pool;
        bool smallpage;
        struct page_header *ph;

        if (poolsize < MIN_USEFUL_SIZE) {
                return tiny_alloc_free(pool, poolsize, free);
        }

        /* Get page header. */
        sp_bits = small_page_bits(poolsize);
        pgnum = offset >> sp_bits;

        /* Big page? Round down further. */
        if (test_bit(head->pagesize, pgnum >> BITS_FROM_SMALL_TO_LARGE_PAGE)) {
                smallpage = false;
                pgnum &= ~(SMALL_PAGES_PER_LARGE_PAGE - 1);
        } else
                smallpage = true;

        /* Step back to page header. */
        ph = from_pgnum(head, pgnum, sp_bits);
        if ((void *)ph == free) {
                huge_free(head, poolsize, free);
                return;
        }

        bs = &head->bs[ph->bucket];
        pgoffset = offset - (pgnum << sp_bits)
                - page_header_size(ph->bucket / INTER_BUCKET_SPACE,
                                   bs->elements_per_page);

        if (unlikely(ph->elements_used == bs->elements_per_page)) {
                del_from_bucket_full_list(head, bs, ph, sp_bits);
                add_to_bucket_list(head, bs, ph, sp_bits);
        }

        /* Which element are we? */
        i = pgoffset / bucket_to_size(ph->bucket);
        clear_bit(ph->used, i);
        ph->elements_used--;

        if (unlikely(ph->elements_used == 0)) {
                bs = &head->bs[ph->bucket];
                del_from_bucket_list(head, bs, ph, sp_bits);
                if (smallpage)
                        add_small_page_to_freelist(head, ph, sp_bits);
                else
                        add_large_page_to_freelist(head, ph, sp_bits);
        }
}

unsigned long alloc_size(void *pool, unsigned long poolsize, void *p)
{
        struct header *head = pool;
        unsigned int pgnum, sp_bits;
        unsigned long offset = (char *)p - (char *)pool;
        struct page_header *ph;

        if (poolsize < MIN_USEFUL_SIZE)
                return tiny_alloc_size(pool, poolsize, p);

        /* Get page header. */
        sp_bits = small_page_bits(poolsize);
        pgnum = offset >> sp_bits;

        /* Big page? Round down further. */
        if (test_bit(head->pagesize, pgnum >> BITS_FROM_SMALL_TO_LARGE_PAGE))
                pgnum &= ~(SMALL_PAGES_PER_LARGE_PAGE - 1);

        /* Step back to page header. */
        ph = from_pgnum(head, pgnum, sp_bits);
        if ((void *)ph == p)
                return huge_size(head, p);

        return bucket_to_size(ph->bucket);
}

/* Useful for gdb breakpoints. */
static bool check_fail(void)
{
        return false;
}

static unsigned long count_bits(const unsigned long bitmap[],
                                unsigned long limit)
{
        unsigned long i, count = 0;

        while (limit >= BITS_PER_LONG) {
                count += popcount(bitmap[0]);
                bitmap++;
                limit -= BITS_PER_LONG;
        }

        for (i = 0; i < limit; i++)
                if (test_bit(bitmap, i))
                        count++;
        return count;
}

static bool out_of_bounds(unsigned long pgnum,
                          unsigned int sp_bits,
                          unsigned long pagesize,
                          unsigned long poolsize)
{
        if (((pgnum << sp_bits) >> sp_bits) != pgnum)
                return true;

        if ((pgnum << sp_bits) > poolsize)
                return true;

        return ((pgnum << sp_bits) + pagesize > poolsize);
}

static bool check_bucket(struct header *head,
                         unsigned long poolsize,
                         unsigned long pages[],
                         struct bucket_state *bs,
                         unsigned int bindex)
{
        bool lp_bucket;
        struct page_header *ph;
        unsigned long taken, i, prev, pagesize, sp_bits, lp_bits;

        sp_bits = small_page_bits(poolsize);
        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        lp_bucket = large_page_bucket(bindex, sp_bits);

        pagesize = 1UL << (lp_bucket ? lp_bits : sp_bits);

        /* This many elements fit? */
        taken = page_header_size(bindex / INTER_BUCKET_SPACE,
                                 bs->elements_per_page);
        taken += bucket_to_size(bindex) * bs->elements_per_page;
        if (taken > pagesize)
                return check_fail();

        /* One more wouldn't fit? */
        taken = page_header_size(bindex / INTER_BUCKET_SPACE,
                                 bs->elements_per_page + 1);
        taken += bucket_to_size(bindex) * (bs->elements_per_page + 1);
        if (taken <= pagesize)
                return check_fail();

        /* Walk used list. */
        prev = 0;
        for (i = bs->page_list; i; i = ph->next) {
                /* Bad pointer? */
                if (out_of_bounds(i, sp_bits, pagesize, poolsize))
                        return check_fail();
                /* Wrong size page? */
                if (!!test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE)
                    != lp_bucket)
                        return check_fail();
                /* Large page not on boundary? */
                if (lp_bucket && (i % SMALL_PAGES_PER_LARGE_PAGE) != 0)
                        return check_fail();
                ph = from_pgnum(head, i, sp_bits);
                /* Linked list corrupt? */
                if (ph->prev != prev)
                        return check_fail();
                /* Already seen this page? */
                if (test_bit(pages, i))
                        return check_fail();
                set_bit(pages, i);
                /* Empty or full? */
                if (ph->elements_used == 0)
                        return check_fail();
                if (ph->elements_used >= bs->elements_per_page)
                        return check_fail();
                /* Used bits don't agree? */
                if (ph->elements_used != count_bits(ph->used,
                                                    bs->elements_per_page))
                        return check_fail();
                /* Wrong bucket? */
                if (ph->bucket != bindex)
                        return check_fail();
                prev = i;
        }

        /* Walk full list. */
        prev = 0;
        for (i = bs->full_list; i; i = ph->next) {
                /* Bad pointer? */
                if (out_of_bounds(i, sp_bits, pagesize, poolsize))
                        return check_fail();
                /* Wrong size page? */
                if (!!test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE)
                    != lp_bucket)
                        return check_fail();
                /* Large page not on boundary? */
                if (lp_bucket && (i % SMALL_PAGES_PER_LARGE_PAGE) != 0)
                        return check_fail();
                ph = from_pgnum(head, i, sp_bits);
                /* Linked list corrupt? */
                if (ph->prev != prev)
                        return check_fail();
                /* Already seen this page? */
                if (test_bit(pages, i))
                        return check_fail();
                set_bit(pages, i);
                /* Not full? */
                if (ph->elements_used != bs->elements_per_page)
                        return check_fail();
                /* Used bits don't agree? */
                if (ph->elements_used != count_bits(ph->used,
                                                    bs->elements_per_page))
                        return check_fail();
                /* Wrong bucket? */
                if (ph->bucket != bindex)
                        return check_fail();
                prev = i;
        }
        return true;
}

bool alloc_check(void *pool, unsigned long poolsize)
{
        struct header *head = pool;
        unsigned long prev, i, lp_bits, sp_bits, header_size, num_buckets;
        struct page_header *ph;
        struct huge_alloc *ha;
        unsigned long pages[MAX_SMALL_PAGES / BITS_PER_LONG] = { 0 };

        if (poolsize < MIN_USEFUL_SIZE)
                return tiny_alloc_check(pool, poolsize);

        sp_bits = small_page_bits(poolsize);
        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        num_buckets = max_bucket(lp_bits);

        header_size = sizeof(*head) + sizeof(head->bs) * (num_buckets-1);

        /* First, set all bits taken by header. */
        for (i = 0; i < header_size; i += (1UL << sp_bits))
                set_bit(pages, i >> sp_bits);

        /* Check small page free list. */
        prev = 0;
        for (i = head->small_free_list; i; i = ph->next) {
                /* Bad pointer? */
                if (out_of_bounds(i, sp_bits, 1 << sp_bits, poolsize))
                        return check_fail();
                /* Large page? */
                if (test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE))
                        return check_fail();
                ph = from_pgnum(head, i, sp_bits);
                /* Linked list corrupt? */
                if (ph->prev != prev)
                        return check_fail();
                /* Already seen this page? */
                if (test_bit(pages, i))
                        return check_fail();
                set_bit(pages, i);
                prev = i;
        }

        /* Check large page free list. */
        prev = 0;
        for (i = head->large_free_list; i; i = ph->next) {
                /* Bad pointer? */
                if (out_of_bounds(i, sp_bits, 1 << lp_bits, poolsize))
                        return check_fail();
                /* Not large page? */
                if (!test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE))
                        return check_fail();
                /* Not page boundary? */
                if ((i % SMALL_PAGES_PER_LARGE_PAGE) != 0)
                        return check_fail();
                ph = from_pgnum(head, i, sp_bits);
                /* Linked list corrupt? */
                if (ph->prev != prev)
                        return check_fail();
                /* Already seen this page? */
                if (test_bit(pages, i))
                        return check_fail();
                set_bit(pages, i);
                prev = i;
        }

        /* Check the buckets. */
        for (i = 0; i < max_bucket(lp_bits); i++) {
                struct bucket_state *bs = &head->bs[i];

                if (!check_bucket(head, poolsize, pages, bs, i))
                        return false;
        }

        /* Check the huge alloc list. */
        prev = 0;
        for (i = head->huge; i; i = ha->next) {
                unsigned long pgbits, j;

                /* Bad pointer? */
                if (i >= poolsize || i + sizeof(*ha) > poolsize)
                        return check_fail();
                ha = (void *)((char *)head + i);

                /* Check contents of ha. */
                if (ha->off > poolsize || ha->off + ha->len > poolsize)
                        return check_fail();

                /* Large or small page? */
                pgbits = test_bit(head->pagesize, ha->off >> lp_bits)
                        ? lp_bits : sp_bits;

                /* Not page boundary? */
                if ((ha->off % (1UL << pgbits)) != 0)
                        return check_fail();

                /* Not page length? */
                if ((ha->len % (1UL << pgbits)) != 0)
                        return check_fail();

                /* Linked list corrupt? */
                if (ha->prev != prev)
                        return check_fail();

                for (j = ha->off; j < ha->off + ha->len; j += (1 << sp_bits)) {
                        /* Already seen this page? */
                        if (test_bit(pages, j >> sp_bits))
                                return check_fail();
                        set_bit(pages, j >> sp_bits);
                }

                prev = i;
        }

        /* Make sure every page is accounted for. */
        for (i = 0; i < poolsize >> sp_bits; i++) {
                if (!test_bit(pages, i))
                        return check_fail();
                if (test_bit(head->pagesize,
                             i >> BITS_FROM_SMALL_TO_LARGE_PAGE)) {
                        /* Large page, skip rest. */
                        i += SMALL_PAGES_PER_LARGE_PAGE - 1;
                }
        }

        return true;
}

static unsigned long print_overhead(FILE *out, const char *desc,
                                    unsigned long bytes,
                                    unsigned long poolsize)
{
        fprintf(out, "Overhead (%s): %lu bytes (%.3g%%)\n",
                desc, bytes, 100.0 * bytes / poolsize);
        return bytes;
}

static unsigned long count_list(struct header *head,
                                u16 pgnum,
                                unsigned int sp_bits,
                                unsigned long *total_elems)
{
        struct page_header *p;
        unsigned long ret = 0;

        while (pgnum) {
                p = from_pgnum(head, pgnum, sp_bits);
                if (total_elems)
                        (*total_elems) += p->elements_used;
                ret++;
                pgnum = p->next;
        }
        return ret;
}

static unsigned long visualize_bucket(FILE *out, struct header *head,
                                      unsigned int bucket,
                                      unsigned long poolsize,
                                      unsigned int sp_bits)
{
        unsigned long num_full, num_partial, num_pages, page_size,
                elems, hdr_min, hdr_size, elems_per_page, overhead = 0;

        elems_per_page = head->bs[bucket].elements_per_page;

        /* If we used byte-based bitmaps, we could get pg hdr to: */
        hdr_min = sizeof(struct page_header)
                - sizeof(((struct page_header *)0)->used)
                + align_up(elems_per_page, CHAR_BIT) / CHAR_BIT;
        hdr_size = page_header_size(bucket / INTER_BUCKET_SPACE,
                                    elems_per_page);

        elems = 0;
        num_full = count_list(head, head->bs[bucket].full_list, sp_bits,
                              &elems);
        num_partial = count_list(head, head->bs[bucket].page_list, sp_bits,
                                 &elems);
        num_pages = num_full + num_partial;
        if (!num_pages)
                return 0;

        fprintf(out, "Bucket %u (%lu bytes):"
                " %lu full, %lu partial = %lu elements\n",
                bucket, bucket_to_size(bucket), num_full, num_partial, elems);
        /* Strict requirement of page header size. */
        overhead += print_overhead(out, "page headers",
                                   hdr_min * num_pages, poolsize);
        /* Gap between minimal page header and actual start. */
        overhead += print_overhead(out, "page post-header alignments",
                                   (hdr_size - hdr_min) * num_pages, poolsize);
        /* Between last element and end of page. */
        page_size = (1 << sp_bits);
        if (large_page_bucket(bucket, sp_bits))
                page_size <<= BITS_FROM_SMALL_TO_LARGE_PAGE;

        overhead += print_overhead(out, "page tails",
                                   (page_size - (hdr_size
                                                 + (elems_per_page
                                                    * bucket_to_size(bucket))))
                                   * num_pages, poolsize);
        return overhead;
}

void alloc_visualize(FILE *out, void *pool, unsigned long poolsize)
{
        struct header *head = pool;
        unsigned long i, lp_bits, sp_bits, header_size, num_buckets, count,
                overhead = 0;

        fprintf(out, "Pool %p size %lu: (%s allocator)\n", pool, poolsize,
                poolsize < MIN_USEFUL_SIZE ? "tiny" : "standard");

        if (poolsize < MIN_USEFUL_SIZE) {
                tiny_alloc_visualize(out, pool, poolsize);
                return;
        }

        sp_bits = small_page_bits(poolsize);
        lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

        num_buckets = max_bucket(lp_bits);
        header_size = sizeof(*head) + sizeof(head->bs) * (num_buckets-1);

        fprintf(out, "Large page size %lu, small page size %lu.\n",
                1UL << lp_bits, 1UL << sp_bits);
        overhead += print_overhead(out, "unused pool tail",
                                   poolsize % (1 << lp_bits), poolsize);
        fprintf(out, "Main header %lu bytes (%lu small pages).\n",
                header_size, align_up(header_size, 1 << sp_bits) >> sp_bits);
        overhead += print_overhead(out, "partial header page",
                                   align_up(header_size, 1 << sp_bits)
                                   - header_size, poolsize);
        /* Total large pages. */
        i = count_bits(head->pagesize, poolsize >> lp_bits);
        /* Used pages. */
        count = i - count_list(head, head->large_free_list, sp_bits, NULL);
        fprintf(out, "%lu/%lu large pages used (%.3g%%)\n",
                count, i, count ? 100.0 * count / i : 0.0);

        /* Total small pages. */
        i = ((poolsize >> lp_bits) - i) << BITS_FROM_SMALL_TO_LARGE_PAGE;
        /* Used pages */
        count = i - count_list(head, head->small_free_list, sp_bits, NULL);
        fprintf(out, "%lu/%lu small pages used (%.3g%%)\n",
                count, i, count ? 100.0 * count / i : 0.0);

        /* Summary of each bucket. */
        fprintf(out, "%lu buckets:\n", num_buckets);
        for (i = 0; i < num_buckets; i++)
                overhead += visualize_bucket(out, head, i, poolsize, sp_bits);

        print_overhead(out, "total", overhead, poolsize);
}
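
/* A minimal usage sketch (not part of the library): assumes a malloc'd
 * region and an arbitrary ALLOC_EXAMPLE_MAIN guard; compile this file
 * with -DALLOC_EXAMPLE_MAIN to try it. */
#ifdef ALLOC_EXAMPLE_MAIN
#include <stdio.h>

int main(void)
{
        unsigned long poolsize = 8UL * 1024 * 1024; /* >= MIN_USEFUL_SIZE */
        void *pool = malloc(poolsize);
        char *p;

        if (!pool)
                return 1;

        /* Carve the region into pages and bucket metadata. */
        alloc_init(pool, poolsize);

        /* A 1000-byte, 16-byte-aligned object from a bucketed page. */
        p = alloc_get(pool, poolsize, 1000, 16);
        assert(p && alloc_check(pool, poolsize));
        printf("usable size: %lu\n", alloc_size(pool, poolsize, p));

        alloc_free(pool, poolsize, p);
        assert(alloc_check(pool, poolsize));
        free(pool);
        return 0;
}
#endif /* ALLOC_EXAMPLE_MAIN */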