alloc: use page numbers in lists to reduce overhead.
[ccan] / ccan / alloc / alloc.c
1 #include <unistd.h>
2 #include <stdint.h>
3 #include <string.h>
4 #include <limits.h>
5 #include <assert.h>
6 #include <stdlib.h>
7 #include "alloc.h"
8 #include <ccan/build_assert/build_assert.h>
9 #include <ccan/likely/likely.h>
10 #include <ccan/short_types/short_types.h>
11 #include "config.h"
12
13 /*
14    Inspired by (and parts taken from) Andrew Tridgell's alloc_mmap:
15    http://samba.org/~tridge/junkcode/alloc_mmap/
16
17    Copyright (C) Andrew Tridgell 2007
18    
19    This library is free software; you can redistribute it and/or
20    modify it under the terms of the GNU Lesser General Public
21    License as published by the Free Software Foundation; either
22    version 2 of the License, or (at your option) any later version.
23
24    This library is distributed in the hope that it will be useful,
25    but WITHOUT ANY WARRANTY; without even the implied warranty of
26    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
27    Lesser General Public License for more details.
28
29    You should have received a copy of the GNU Lesser General Public
30    License along with this library; if not, write to the Free Software
31    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
32  */
33
#if 0 /* Until we have the tiny allocator working, go down to 1 MB */

/* We divide the pool into this many large pages (nearest power of 2) */
#define MAX_PAGES (1024UL)

/* 32 small pages == 1 large page. */
#define BITS_FROM_SMALL_TO_LARGE_PAGE 5

#else

/* Reduced scale while the tiny allocator is unimplemented:
 * 128 large pages, 16 small pages per large page. */
#define MAX_PAGES (128UL)
#define BITS_FROM_SMALL_TO_LARGE_PAGE 4

#endif

/* Smallest pool size for this scheme: 512-byte small pages.  That's
 * 3%/5% overhead for 32/64 bit respectively. */
#define MIN_USEFUL_SIZE (MAX_PAGES << (9 + BITS_FROM_SMALL_TO_LARGE_PAGE))

/* Every 4 buckets, we jump up a power of 2. ...8 10 12 14 16 20 24 28 32... */
#define INTER_BUCKET_SPACE 4

/* Number of small pages which make up one large page. */
#define SMALL_PAGES_PER_LARGE_PAGE (1 << BITS_FROM_SMALL_TO_LARGE_PAGE)

/* FIXME: Figure this out properly. */
#define MAX_SIZE (1 << 30)

/* How few objects to fit in a page before using a larger one? (8) */
#define MAX_PAGE_OBJECT_ORDER	3

/* Number of bits in an unsigned long (used for the pagesize bitmap). */
#define BITS_PER_LONG (sizeof(long) * CHAR_BIT)
65
/* Per-bucket allocation state, stored inside the pool header. */
struct bucket_state {
	/* How many objects of this bucket's size fit in one page. */
	unsigned long elements_per_page;
	/* Page number of first partially-used page (0 == empty list). */
	u16 page_list;
	/* Page number of first completely-full page (0 == empty list). */
	u16 full_list;
};

/* Pool header, placed at the very start of the pool.  All page lists
 * hold small-page numbers, not pointers; page 0 is the header itself,
 * so 0 doubles as the list terminator. */
struct header {
	/* Bitmap of which pages are large. */
	unsigned long pagesize[MAX_PAGES / BITS_PER_LONG];

	/* List of unused small/large pages. */
	u16 small_free_list;
	u16 large_free_list;

	/* This is less defined: we have two buckets for each power of 2 */
	struct bucket_state bs[1];
};

/* Header at the front of every in-use page (small or large). */
struct page_header {
	/* Doubly-linked list links, as small-page numbers (0 == end). */
	u16 next, prev;
	u32 elements_used;
	/* FIXME: Pack this in somewhere... */
	u8 bucket;
	unsigned long used[1]; /* One bit per element. */
};
91
/* 2 bit for every byte to allocate. */
/* Planned allocator for pools below MIN_USEFUL_SIZE; all stubs so far. */
static void tiny_alloc_init(void *pool, unsigned long poolsize)
{
/* FIXME */
}

/* FIXME: always fails, so tiny pools cannot allocate yet. */
static void *tiny_alloc_get(void *pool, unsigned long poolsize,
			    unsigned long size, unsigned long align)
{
/* FIXME */
	return NULL;
}

/* FIXME: no-op. */
static void tiny_alloc_free(void *pool, unsigned long poolsize, void *free)
{
/* FIXME */
}

/* FIXME: always reports size 0. */
static unsigned long tiny_alloc_size(void *pool, unsigned long poolsize,
				      void *p)
{
/* FIXME */
	return 0;
}

/* FIXME: trivially succeeds. */
static bool tiny_alloc_check(void *pool, unsigned long poolsize)
{
/* FIXME */
	return true;
}
122
/* Find last (most significant) set bit, 1-indexed: fls(0) == 0,
 * fls(1) == 1, fls(0x80000000) == 32.
 * Fix: the fallback ladder only examined the low 32 bits, so it gave
 * wrong answers for 64-bit longs with any of bits 32-63 set.  Reduce
 * to 32 bits first, as ffsl below already did. */
static unsigned int fls(unsigned long val)
{
#if HAVE_BUILTIN_CLZL
	/* This is significantly faster! */
	return val ? sizeof(long) * CHAR_BIT - __builtin_clzl(val) : 0;
#else
	unsigned int r = 32;

	if (!val)
		return 0;
	if (sizeof(long) == sizeof(uint64_t)) {
		/* Workaround gcc warning on 32-bit:
		   error: right shift count >= width of type */
		uint64_t tmp = val;
		if (tmp >> 32) {
			tmp >>= 32;
			val = tmp;
			r += 32;
		}
	}
	if (!(val & 0xffff0000u)) {
		val <<= 16;
		r -= 16;
	}
	if (!(val & 0xff000000u)) {
		val <<= 8;
		r -= 8;
	}
	if (!(val & 0xf0000000u)) {
		val <<= 4;
		r -= 4;
	}
	if (!(val & 0xc0000000u)) {
		val <<= 2;
		r -= 2;
	}
	if (!(val & 0x80000000u)) {
		val <<= 1;
		r -= 1;
	}
	return r;
#endif
}
156
/* FIXME: Move to bitops. */
/* Find first (least significant) set bit, 1-indexed: ffsl(0) == 0,
 * ffsl(1) == 1, ffsl(8) == 4.
 * Uses standard uint64_t (stdint.h is already included) instead of
 * ccan's u64 alias, so this helper has no short_types dependency. */
static unsigned int ffsl(unsigned long val)
{
#if HAVE_BUILTIN_FFSL
	/* This is significantly faster! */
	return __builtin_ffsl(val);
#else
	unsigned int r = 1;

	if (!val)
		return 0;
	if (sizeof(long) == sizeof(uint64_t)) {
		if (!(val & 0xffffffff)) {
			/* Workaround gcc warning on 32-bit:
			   error: right shift count >= width of type */
			uint64_t tmp = val;
			tmp >>= 32;
			val = tmp;
			r += 32;
		}
	}
	if (!(val & 0xffff)) {
		val >>= 16;
		r += 16;
	}
	if (!(val & 0xff)) {
		val >>= 8;
		r += 8;
	}
	if (!(val & 0xf)) {
		val >>= 4;
		r += 4;
	}
	if (!(val & 3)) {
		val >>= 2;
		r += 2;
	}
	if (!(val & 1)) {
		val >>= 1;
		r += 1;
	}
	return r;
#endif
}
201
/* Count set bits in val.
 * Fix: the fallback's parallel bit-count must double the shift each
 * round (1, 2, 4, 8, 16, 32); the old code shifted by 1 every round,
 * which only sums adjacent bit pairs correctly in the first step and
 * returns garbage for most inputs. */
static unsigned int popcount(unsigned long val)
{
#if HAVE_BUILTIN_POPCOUNTL
	return __builtin_popcountl(val);
#else
	if (sizeof(long) == sizeof(uint64_t)) {
		uint64_t v = val;
		v = (v & 0x5555555555555555ULL)
			+ ((v >> 1) & 0x5555555555555555ULL);
		v = (v & 0x3333333333333333ULL)
			+ ((v >> 2) & 0x3333333333333333ULL);
		v = (v & 0x0F0F0F0F0F0F0F0FULL)
			+ ((v >> 4) & 0x0F0F0F0F0F0F0F0FULL);
		v = (v & 0x00FF00FF00FF00FFULL)
			+ ((v >> 8) & 0x00FF00FF00FF00FFULL);
		v = (v & 0x0000FFFF0000FFFFULL)
			+ ((v >> 16) & 0x0000FFFF0000FFFFULL);
		v = (v & 0x00000000FFFFFFFFULL)
			+ ((v >> 32) & 0x00000000FFFFFFFFULL);
		return v;
	}
	val = (val & 0x55555555ULL) + ((val >> 1) & 0x55555555ULL);
	val = (val & 0x33333333ULL) + ((val >> 2) & 0x33333333ULL);
	val = (val & 0x0F0F0F0FULL) + ((val >> 4) & 0x0F0F0F0FULL);
	val = (val & 0x00FF00FFULL) + ((val >> 8) & 0x00FF00FFULL);
	val = (val & 0x0000FFFFULL) + ((val >> 16) & 0x0000FFFFULL);
	return val;
#endif
}
231
232 /*
233  * Every 4 buckets, the size doubles.
234  * Between buckets, sizes increase linearly.
235  *
236  * eg. bucket 40 = 2^10                 = 1024
237  *     bucket 41 = 2^10 + 2^10*4        = 1024 + 256
238  *     bucket 42 = 2^10 + 2^10*4        = 1024 + 512
239  *     bucket 43 = 2^10 + 2^10*4        = 1024 + 768
240  *     bucket 45 = 2^11                 = 2048
241  *
242  * Care is taken to handle low numbered buckets, at cost of overflow.
243  */
244 static unsigned long bucket_to_size(unsigned int bucket)
245 {
246         unsigned long base = 1 << (bucket / INTER_BUCKET_SPACE);
247         return base + ((bucket % INTER_BUCKET_SPACE)
248                        << (bucket / INTER_BUCKET_SPACE))
249                 / INTER_BUCKET_SPACE;
250 }
251
252 /*
253  * Say size is 10.
254  *   fls(size/2) == 3.  1 << 3 == 8, so we're 2 too large, out of a possible
255  * 8 too large.  That's 1/4 of the way to the next power of 2 == 1 bucket.
256  *
257  * We make sure we round up.  Note that this fails on 32 bit at size
258  * 1879048193 (around bucket 120).
259  */
260 static unsigned int size_to_bucket(unsigned long size)
261 {
262         unsigned int base = fls(size/2);
263         unsigned long overshoot;
264
265         overshoot = size - (1 << base);
266         return base * INTER_BUCKET_SPACE
267                 + ((overshoot * INTER_BUCKET_SPACE + (1 << base)-1) >> base);
268 }
269
270 static unsigned int large_page_bits(unsigned long poolsize)
271 {
272         return fls(poolsize / MAX_PAGES / 2);
273 }
274
/* Round x up to the next multiple of align (align must be a power of 2). */
static unsigned long align_up(unsigned long x, unsigned long align)
{
	unsigned long mask = align - 1;

	return (x + mask) & ~mask;
}
279
/* Convert a small-page number into its page_header address. */
static struct page_header *from_pgnum(struct header *head,
				      unsigned long pgnum,
				      unsigned sp_bits)
{
	return (struct page_header *)((char *)head + (pgnum << sp_bits));
}

/* Convert a page_header address back into its small-page number.
 * Page numbers fit in 16 bits (see the BUILD_ASSERT in alloc_init). */
static u16 to_pgnum(struct header *head, void *p, unsigned sp_bits)
{
	return ((char *)p - (char *)head) >> sp_bits;
}
291
292 static size_t used_size(unsigned int num_elements)
293 {
294         return align_up(num_elements, BITS_PER_LONG) / CHAR_BIT;
295 }
296
/*
 * We always align the first entry to the lower power of 2.
 * eg. the 12-byte bucket gets 8-byte aligned.  The 4096-byte bucket
 * gets 4096-byte aligned.
 */
static unsigned long page_header_size(unsigned int align_bits,
				      unsigned long num_elements)
{
	unsigned long size;

	/* Fixed header minus the used[1] placeholder, plus the real
	 * bitmap for num_elements elements. */
	size = sizeof(struct page_header)
		- sizeof(((struct page_header *)0)->used)
		+ used_size(num_elements);
	return align_up(size, 1 << align_bits);
}
312
/* Push ph onto the front of a doubly-linked page list.  Lists store
 * small-page numbers; 0 terminates (page 0 holds the pool header so it
 * can never be a member). */
static void add_to_list(struct header *head,
			u16 *list, struct page_header *ph, unsigned sp_bits)
{
	unsigned long h = *list, offset = to_pgnum(head, ph, sp_bits);

	ph->next = h;
	if (h) {
		struct page_header *prev = from_pgnum(head, h, sp_bits);
		/* Old head must really have been the front of the list. */
		assert(prev->prev == 0);
		prev->prev = offset;
	}
	*list = offset;
	ph->prev = 0;
}
327
/* Unlink ph from the doubly-linked page list headed by *list. */
static void del_from_list(struct header *head,
			  u16 *list, struct page_header *ph, unsigned sp_bits)
{
	/* Front of list? */
	if (ph->prev == 0) {
		*list = ph->next;
	} else {
		struct page_header *prev = from_pgnum(head, ph->prev, sp_bits);
		prev->next = ph->next;
	}
	if (ph->next != 0) {
		struct page_header *next = from_pgnum(head, ph->next, sp_bits);
		next->prev = ph->prev;
	}
}
343
/* Pop the front page off a list; returns its page number, or 0 if the
 * list is empty.  Note ph is computed before the emptiness check: when
 * h == 0 it merely points at the pool header and is never
 * dereferenced. */
static u16 pop_from_list(struct header *head,
				   u16 *list,
				   unsigned int sp_bits)
{
	u16 h = *list;
	struct page_header *ph = from_pgnum(head, h, sp_bits);

	if (likely(h)) {
		*list = ph->next;
		if (*list)
			from_pgnum(head, *list, sp_bits)->prev = 0;
	}
	return h;
}
358
/* Convenience wrappers around add_to_list/del_from_list for each of
 * the named page lists. */
static void add_small_page_to_freelist(struct header *head,
				       struct page_header *ph,
				       unsigned int sp_bits)
{
	add_to_list(head, &head->small_free_list, ph, sp_bits);
}

static void add_large_page_to_freelist(struct header *head,
				       struct page_header *ph,
				       unsigned int sp_bits)
{
	add_to_list(head, &head->large_free_list, ph, sp_bits);
}

/* Partially-used pages for a bucket. */
static void add_to_bucket_list(struct header *head,
			       struct bucket_state *bs,
			       struct page_header *ph,
			       unsigned int sp_bits)
{
	add_to_list(head, &bs->page_list, ph, sp_bits);
}

static void del_from_bucket_list(struct header *head,
				 struct bucket_state *bs,
				 struct page_header *ph,
				 unsigned int sp_bits)
{
	del_from_list(head, &bs->page_list, ph, sp_bits);
}

/* Completely-full pages for a bucket. */
static void del_from_bucket_full_list(struct header *head,
				      struct bucket_state *bs,
				      struct page_header *ph,
				      unsigned int sp_bits)
{
	del_from_list(head, &bs->full_list, ph, sp_bits);
}

static void add_to_bucket_full_list(struct header *head,
				    struct bucket_state *bs,
				    struct page_header *ph,
				    unsigned int sp_bits)
{
	add_to_list(head, &bs->full_list, ph, sp_bits);
}
404
405 static void clear_bit(unsigned long bitmap[], unsigned int off)
406 {
407         bitmap[off / BITS_PER_LONG] &= ~(1 << (off % BITS_PER_LONG));
408 }
409
410 static bool test_bit(const unsigned long bitmap[], unsigned int off)
411 {
412         return bitmap[off / BITS_PER_LONG] & (1 << (off % BITS_PER_LONG));
413 }
414
415 static void set_bit(unsigned long bitmap[], unsigned int off)
416 {
417         bitmap[off / BITS_PER_LONG] |= (1 << (off % BITS_PER_LONG));
418 }
419
/* There must be a bit to be found. */
/* Index of the first zero bit.  The caller guarantees the bitmap is
 * not full, so the word scan and the ffsl cannot run off the end. */
static unsigned int find_free_bit(const unsigned long bitmap[])
{
	unsigned int i;

	for (i = 0; bitmap[i] == -1UL; i++);
	return (i*BITS_PER_LONG) + ffsl(~bitmap[i]) - 1;
}
428
/* How many elements can we fit in a page? */
static unsigned long elements_per_page(unsigned long align_bits,
				       unsigned long esize,
				       unsigned long psize)
{
	unsigned long num, overhead;

	/* First approximation: no extra room for bitmap. */
	overhead = align_up(sizeof(struct page_header), 1 << align_bits);
	num = (psize - overhead) / esize;

	/* The used[] bitmap grows with num and may push the header over
	 * the page size; shrink until everything fits. */
	while (page_header_size(align_bits, num) + esize * num > psize)
		num--;
	return num;
}
444
445 static bool large_page_bucket(unsigned int bucket, unsigned int sp_bits)
446 {
447         unsigned long max_smallsize;
448
449         /* Note: this doesn't take into account page header. */
450         max_smallsize = (1UL << sp_bits) >> MAX_PAGE_OBJECT_ORDER;
451
452         return bucket_to_size(bucket) > max_smallsize;
453 }
454
455 static unsigned int max_bucket(unsigned int lp_bits)
456 {
457         return (lp_bits - MAX_PAGE_OBJECT_ORDER) * INTER_BUCKET_SPACE;
458 }
459
460 void alloc_init(void *pool, unsigned long poolsize)
461 {
462         struct header *head = pool;
463         struct page_header *ph;
464         unsigned int lp_bits, sp_bits, num_buckets;
465         unsigned long header_size, i;
466
467         if (poolsize < MIN_USEFUL_SIZE) {
468                 tiny_alloc_init(pool, poolsize);
469                 return;
470         }
471
472         /* We rely on page numbers fitting in 16 bit. */
473         BUILD_ASSERT((MAX_PAGES << BITS_FROM_SMALL_TO_LARGE_PAGE) < 65536);
474         
475         lp_bits = large_page_bits(poolsize);
476         sp_bits = lp_bits - BITS_FROM_SMALL_TO_LARGE_PAGE;
477
478         num_buckets = max_bucket(lp_bits);
479
480         head = pool;
481         header_size = sizeof(*head) + sizeof(head->bs) * (num_buckets-1);
482
483         memset(head, 0, header_size);
484         for (i = 0; i < num_buckets; i++) {
485                 unsigned long pagesize;
486
487                 if (large_page_bucket(i, sp_bits))
488                         pagesize = 1UL << lp_bits;
489                 else
490                         pagesize = 1UL << sp_bits;
491
492                 head->bs[i].elements_per_page
493                         = elements_per_page(i / INTER_BUCKET_SPACE,
494                                             bucket_to_size(i),
495                                             pagesize);
496         }
497
498         /* They start as all large pages. */
499         memset(head->pagesize, 0xFF, sizeof(head->pagesize));
500         /* FIXME: small pages for last bit? */
501
502         /* Split first page into small pages. */
503         assert(header_size << (1UL << lp_bits));
504         clear_bit(head->pagesize, 0);
505
506         /* Skip over page(s) used by header, add rest to free list */
507         for (i = align_up(header_size, (1 << sp_bits)) >> sp_bits;
508              i < SMALL_PAGES_PER_LARGE_PAGE;
509              i++) {
510                 ph = from_pgnum(head, i, sp_bits);
511                 ph->elements_used = 0;
512                 add_small_page_to_freelist(head, ph, sp_bits);
513         }
514
515         /* Add the rest of the pages as large pages. */
516         i = SMALL_PAGES_PER_LARGE_PAGE;
517         while ((i << sp_bits) + (1 << lp_bits) <= poolsize) {
518                 ph = from_pgnum(head, i, sp_bits);
519                 ph->elements_used = 0;
520                 add_large_page_to_freelist(head, ph, sp_bits);
521                 i += SMALL_PAGES_PER_LARGE_PAGE;
522         }
523 }
524
525 /* A large page worth of small pages are free: delete them from free list. */
526 static void del_large_from_small_free_list(struct header *head,
527                                            struct page_header *ph,
528                                            unsigned int sp_bits)
529 {
530         unsigned long i;
531
532         for (i = 0; i < SMALL_PAGES_PER_LARGE_PAGE; i++) {
533                 del_from_list(head, &head->small_free_list,
534                               (void *)ph + (i << sp_bits),
535                               sp_bits);
536         }
537 }
538
539 static bool all_empty(struct header *head,
540                       unsigned long pgnum,
541                       unsigned sp_bits)
542 {
543         unsigned long i;
544
545         for (i = 0; i < SMALL_PAGES_PER_LARGE_PAGE; i++) {
546                 struct page_header *ph = from_pgnum(head, pgnum + i, sp_bits);
547                 if (ph->elements_used)
548                         return false;
549         }
550         return true;
551 }
552
/* Get a free large page, coalescing runs of free small pages if the
 * large free list is empty.  Returns the page number, or 0 if none. */
static u16 get_large_page(struct header *head, unsigned long poolsize,
			  unsigned int sp_bits)
{
	unsigned int lp_bits, i, page;

	lp_bits = sp_bits + BITS_FROM_SMALL_TO_LARGE_PAGE;

	page = pop_from_list(head, &head->large_free_list, sp_bits);
	if (likely(page))
		return page;

	/* Look for small pages to coalesce, after first large page. */
	for (i = SMALL_PAGES_PER_LARGE_PAGE;
	     i < (poolsize >> lp_bits) << BITS_FROM_SMALL_TO_LARGE_PAGE;
	     i += SMALL_PAGES_PER_LARGE_PAGE) {
		/* Already a large page? */
		if (test_bit(head->pagesize, i / SMALL_PAGES_PER_LARGE_PAGE))
			continue;
		if (all_empty(head, i, sp_bits)) {
			struct page_header *ph = from_pgnum(head, i, sp_bits);
			/* Mark it large, move its small pages from the
			 * small free list onto the large free list. */
			set_bit(head->pagesize,
				i / SMALL_PAGES_PER_LARGE_PAGE);
			del_large_from_small_free_list(head, ph, sp_bits);
			add_large_page_to_freelist(head, ph, sp_bits);
		}
	}

	return pop_from_list(head, &head->large_free_list, sp_bits);
}
582
583 /* Returns small page. */
584 static unsigned long break_up_large_page(struct header *head,
585                                          unsigned int sp_bits,
586                                          u16 lpage)
587 {
588         unsigned int i;
589
590         clear_bit(head->pagesize, lpage >> BITS_FROM_SMALL_TO_LARGE_PAGE);
591
592         for (i = 1; i < SMALL_PAGES_PER_LARGE_PAGE; i++) {
593                 struct page_header *ph = from_pgnum(head, lpage + i, sp_bits);
594                 add_small_page_to_freelist(head, ph, sp_bits);
595         }
596
597         return lpage;
598 }
599
600 static u16 get_small_page(struct header *head, unsigned long poolsize,
601                           unsigned int sp_bits)
602 {
603         u16 ret;
604
605         ret = pop_from_list(head, &head->small_free_list, sp_bits);
606         if (likely(ret))
607                 return ret;
608         ret = get_large_page(head, poolsize, sp_bits);
609         if (likely(ret))
610                 ret = break_up_large_page(head, sp_bits, ret);
611         return ret;
612 }
613
614 void *alloc_get(void *pool, unsigned long poolsize,
615                 unsigned long size, unsigned long align)
616 {
617         struct header *head = pool;
618         unsigned int bucket;
619         unsigned long i;
620         struct bucket_state *bs;
621         struct page_header *ph;
622         unsigned int sp_bits;
623
624         if (poolsize < MIN_USEFUL_SIZE) {
625                 return tiny_alloc_get(pool, poolsize, size, align);
626         }
627
628         size = align_up(size, align);
629         if (unlikely(!size))
630                 size = 1;
631         bucket = size_to_bucket(size);
632
633         if (bucket >= max_bucket(large_page_bits(poolsize))) {
634                 /* FIXME: huge alloc. */
635                 return NULL;
636         }
637
638         bs = &head->bs[bucket];
639         sp_bits = large_page_bits(poolsize) - BITS_FROM_SMALL_TO_LARGE_PAGE;
640
641         if (!bs->page_list) {
642                 struct page_header *ph;
643
644                 if (large_page_bucket(bucket, sp_bits))
645                         bs->page_list = get_large_page(head, poolsize,
646                                                        sp_bits);
647                 else
648                         bs->page_list = get_small_page(head, poolsize,
649                                                        sp_bits);
650                 /* FIXME: Try large-aligned alloc?  Header stuffing? */
651                 if (unlikely(!bs->page_list))
652                         return NULL;
653                 ph = from_pgnum(head, bs->page_list, sp_bits);
654                 ph->bucket = bucket;
655                 ph->elements_used = 0;
656                 ph->next = 0;
657                 memset(ph->used, 0, used_size(bs->elements_per_page));
658         }
659
660         ph = from_pgnum(head, bs->page_list, sp_bits);
661
662         i = find_free_bit(ph->used);
663         set_bit(ph->used, i);
664         ph->elements_used++;
665
666         /* check if this page is now full */
667         if (unlikely(ph->elements_used == bs->elements_per_page)) {
668                 del_from_bucket_list(head, bs, ph, sp_bits);
669                 add_to_bucket_full_list(head, bs, ph, sp_bits);
670         }
671
672         return (char *)ph + page_header_size(ph->bucket / INTER_BUCKET_SPACE,
673                                              bs->elements_per_page)
674                + i * bucket_to_size(bucket);
675 }
676
/* Return an allocation obtained from alloc_get to the pool. */
void alloc_free(void *pool, unsigned long poolsize, void *free)
{
	struct header *head = pool;
	struct bucket_state *bs;
	unsigned int sp_bits;
	unsigned long i, pgnum, pgoffset, offset = (char *)free - (char *)pool;
	bool smallpage;
	struct page_header *ph;

	if (poolsize < MIN_USEFUL_SIZE) {
		return tiny_alloc_free(pool, poolsize, free);
	}

	/* Get page header. */
	sp_bits = large_page_bits(poolsize) - BITS_FROM_SMALL_TO_LARGE_PAGE;
	pgnum = offset >> sp_bits;

	/* Big page? Round down further. */
	if (test_bit(head->pagesize, pgnum >> BITS_FROM_SMALL_TO_LARGE_PAGE)) {
		smallpage = false;
		pgnum &= ~(SMALL_PAGES_PER_LARGE_PAGE - 1);
	} else
		smallpage = true;

	/* Step back to page header. */
	ph = from_pgnum(head, pgnum, sp_bits);
	bs = &head->bs[ph->bucket];
	pgoffset = offset - (pgnum << sp_bits)
		- page_header_size(ph->bucket / INTER_BUCKET_SPACE,
				   bs->elements_per_page);

	/* Page was full: move it back to the partially-used list
	 * (checked before the decrement below makes it non-full). */
	if (unlikely(ph->elements_used == bs->elements_per_page)) {
		del_from_bucket_full_list(head, bs, ph, sp_bits);
		add_to_bucket_list(head, bs, ph, sp_bits);
	}

	/* Which element are we? */
	i = pgoffset / bucket_to_size(ph->bucket);
	clear_bit(ph->used, i);
	ph->elements_used--;

	/* Last element gone: return the whole page to its free list. */
	if (unlikely(ph->elements_used == 0)) {
		bs = &head->bs[ph->bucket];
		del_from_bucket_list(head, bs, ph, sp_bits);
		if (smallpage)
			add_small_page_to_freelist(head, ph, sp_bits);
		else
			add_large_page_to_freelist(head, ph, sp_bits);
	}
}
727
/* Usable size of an allocation: the bucket size, which may exceed the
 * size originally requested. */
unsigned long alloc_size(void *pool, unsigned long poolsize, void *p)
{
	struct header *head = pool;
	unsigned int pgnum, sp_bits;
	unsigned long offset = (char *)p - (char *)pool;
	struct page_header *ph;

	if (poolsize < MIN_USEFUL_SIZE)
		return tiny_alloc_size(pool, poolsize, p);

	/* Get page header. */
	sp_bits = large_page_bits(poolsize) - BITS_FROM_SMALL_TO_LARGE_PAGE;
	pgnum = offset >> sp_bits;

	/* Big page? Round down further. */
	if (test_bit(head->pagesize, pgnum >> BITS_FROM_SMALL_TO_LARGE_PAGE))
		pgnum &= ~(SMALL_PAGES_PER_LARGE_PAGE - 1);

	/* Step back to page header. */
	ph = from_pgnum(head, pgnum, sp_bits);
	return bucket_to_size(ph->bucket);
}
750
/* Useful for gdb breakpoints: every consistency-check failure funnels
 * through here, so one breakpoint catches them all. */
static bool check_fail(void)
{
	bool failed = false;

	return failed;
}
756
757 static unsigned long count_bits(const unsigned long bitmap[],
758                                 unsigned long limit)
759 {
760         unsigned long i, count = 0;
761
762         while (limit >= BITS_PER_LONG) {
763                 count += popcount(bitmap[0]);
764                 bitmap++;
765                 limit -= BITS_PER_LONG;
766         }
767
768         for (i = 0; i < limit; i++)
769                 if (test_bit(bitmap, i))
770                         count++;
771         return count;
772 }
773
/* Does a page of pagesize bytes at page number pgnum lie fully inside
 * the pool (guarding against the shift itself overflowing)? */
static bool out_of_bounds(unsigned long pgnum,
			  unsigned int sp_bits,
			  unsigned long pagesize,
			  unsigned long poolsize)
{
	unsigned long start = pgnum << sp_bits;

	/* Shift wrapped around? */
	if ((start >> sp_bits) != pgnum)
		return true;

	/* Starts beyond the pool, or runs off its end? */
	return start > poolsize || start + pagesize > poolsize;
}
787
788 static bool check_bucket(struct header *head,
789                          unsigned long poolsize,
790                          unsigned long pages[],
791                          struct bucket_state *bs,
792                          unsigned int bindex)
793 {
794         bool lp_bucket;
795         struct page_header *ph;
796         unsigned long taken, i, prev, pagesize, sp_bits, lp_bits;
797
798         lp_bits = large_page_bits(poolsize);
799         sp_bits = lp_bits - BITS_FROM_SMALL_TO_LARGE_PAGE;
800
801         lp_bucket = large_page_bucket(bindex, sp_bits);
802
803         pagesize = 1UL << (lp_bucket ? lp_bits : sp_bits);
804
805         /* This many elements fit? */
806         taken = page_header_size(bindex / INTER_BUCKET_SPACE,
807                                  bs->elements_per_page);
808         taken += bucket_to_size(bindex) * bs->elements_per_page;
809         if (taken > pagesize)
810                 return check_fail();
811
812         /* One more wouldn't fit? */
813         taken = page_header_size(bindex / INTER_BUCKET_SPACE,
814                                  bs->elements_per_page + 1);
815         taken += bucket_to_size(bindex) * (bs->elements_per_page + 1);
816         if (taken <= pagesize)
817                 return check_fail();
818
819         /* Walk used list. */
820         prev = 0;
821         for (i = bs->page_list; i; i = ph->next) {
822                 /* Bad pointer? */
823                 if (out_of_bounds(i, sp_bits, pagesize, poolsize))
824                         return check_fail();
825                 /* Wrong size page? */
826                 if (!!test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE)
827                     != lp_bucket)
828                         return check_fail();
829                 /* Large page not on boundary? */
830                 if (lp_bucket && (i % SMALL_PAGES_PER_LARGE_PAGE) != 0)
831                         return check_fail();
832                 ph = from_pgnum(head, i, sp_bits);
833                 /* Linked list corrupt? */
834                 if (ph->prev != prev)
835                         return check_fail();
836                 /* Already seen this page? */
837                 if (test_bit(pages, i))
838                         return check_fail();
839                 set_bit(pages, i);
840                 /* Empty or full? */
841                 if (ph->elements_used == 0)
842                         return check_fail();
843                 if (ph->elements_used >= bs->elements_per_page)
844                         return check_fail();
845                 /* Used bits don't agree? */
846                 if (ph->elements_used != count_bits(ph->used,
847                                                     bs->elements_per_page))
848                         return check_fail();
849                 /* Wrong bucket? */
850                 if (ph->bucket != bindex)
851                         return check_fail();
852                 prev = i;
853         }
854
855         /* Walk full list. */
856         prev = 0;
857         for (i = bs->full_list; i; i = ph->next) {
858                 /* Bad pointer? */
859                 if (out_of_bounds(i, sp_bits, pagesize, poolsize))
860                         return check_fail();
861                 /* Wrong size page? */
862                 if (!!test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE)
863                     != lp_bucket)
864                 /* Large page not on boundary? */
865                 if (lp_bucket && (i % SMALL_PAGES_PER_LARGE_PAGE) != 0)
866                         return check_fail();
867                 ph = from_pgnum(head, i, sp_bits);
868                 /* Linked list corrupt? */
869                 if (ph->prev != prev)
870                         return check_fail();
871                 /* Already seen this page? */
872                 if (test_bit(pages, i))
873                         return check_fail();
874                 set_bit(pages, i);
875                 /* Not full? */
876                 if (ph->elements_used != bs->elements_per_page)
877                         return check_fail();
878                 /* Used bits don't agree? */
879                 if (ph->elements_used != count_bits(ph->used,
880                                                     bs->elements_per_page))
881                         return check_fail();
882                 /* Wrong bucket? */
883                 if (ph->bucket != bindex)
884                         return check_fail();
885                 prev = i;
886         }
887         return true;
888 }
889
890 bool alloc_check(void *pool, unsigned long poolsize)
891 {
892         struct header *head = pool;
893         unsigned long prev, i, lp_bits, sp_bits, header_size, num_buckets;
894         struct page_header *ph;
895         unsigned long pages[(MAX_PAGES << BITS_FROM_SMALL_TO_LARGE_PAGE)
896                             / BITS_PER_LONG] = { 0 };
897
898         if (poolsize < MIN_USEFUL_SIZE)
899                 return tiny_alloc_check(pool, poolsize);
900
901         lp_bits = large_page_bits(poolsize);
902         sp_bits = lp_bits - BITS_FROM_SMALL_TO_LARGE_PAGE;
903
904         num_buckets = max_bucket(lp_bits);
905
906         header_size = sizeof(*head) + sizeof(head->bs) * (num_buckets-1);
907
908         /* First, set all bits taken by header. */
909         for (i = 0; i < header_size; i += (1UL << sp_bits))
910                 set_bit(pages, i >> sp_bits);
911
912         /* Check small page free list. */
913         prev = 0;
914         for (i = head->small_free_list; i; i = ph->next) {
915                 /* Bad pointer? */
916                 if (out_of_bounds(i, sp_bits, 1 << sp_bits, poolsize))
917                         return check_fail();
918                 /* Large page? */
919                 if (test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE))
920                         return check_fail();
921                 ph = from_pgnum(head, i, sp_bits);
922                 /* Linked list corrupt? */
923                 if (ph->prev != prev)
924                         return check_fail();
925                 /* Already seen this page? */
926                 if (test_bit(pages, i))
927                         return check_fail();
928                 set_bit(pages, i);
929                 prev = i;
930         }
931
932         /* Check large page free list. */
933         prev = 0;
934         for (i = head->large_free_list; i; i = ph->next) {
935                 /* Bad pointer? */
936                 if (out_of_bounds(i, sp_bits, 1 << lp_bits, poolsize))
937                         return check_fail();
938                 /* Not large page? */
939                 if (!test_bit(head->pagesize, i >> BITS_FROM_SMALL_TO_LARGE_PAGE))
940                         return check_fail();
941                 /* Not page boundary? */
942                 if ((i % SMALL_PAGES_PER_LARGE_PAGE) != 0)
943                         return check_fail();
944                 ph = from_pgnum(head, i, sp_bits);
945                 /* Linked list corrupt? */
946                 if (ph->prev != prev)
947                         return check_fail();
948                 /* Already seen this page? */
949                 if (test_bit(pages, i))
950                         return check_fail();
951                 set_bit(pages, i);
952                 prev = i;
953         }
954
955         /* Check the buckets. */
956         for (i = 0; i < max_bucket(lp_bits); i++) {
957                 struct bucket_state *bs = &head->bs[i];
958
959                 if (!check_bucket(head, poolsize, pages, bs, i))
960                         return false;
961         }
962
963         /* Make sure every page accounted for. */
964         for (i = 0; i < poolsize >> sp_bits; i++) {
965                 if (!test_bit(pages, i))
966                         return check_fail();
967                 if (test_bit(head->pagesize,
968                              i >> BITS_FROM_SMALL_TO_LARGE_PAGE)) {
969                         /* Large page, skip rest. */
970                         i += SMALL_PAGES_PER_LARGE_PAGE - 1;
971                 }
972         }
973
974         return true;
975 }
976
/* Print one overhead line ("Overhead (<desc>): N bytes (P%)") to @out and
 * hand the byte count back so callers can accumulate a running total. */
static unsigned long print_overhead(FILE *out, const char *desc,
				    unsigned long bytes,
				    unsigned long poolsize)
{
	double percent = 100.0 * bytes / poolsize;

	fprintf(out, "Overhead (%s): %lu bytes (%.3g%%)\n",
		desc, bytes, percent);
	return bytes;
}
985
986 static unsigned long count_list(struct header *head,
987                                 u16 pgnum,
988                                 unsigned int sp_bits,
989                                 unsigned long *total_elems)
990 {
991         struct page_header *p;
992         unsigned long ret = 0;
993
994         while (pgnum) {
995                 p = from_pgnum(head, pgnum, sp_bits);
996                 if (total_elems)
997                         (*total_elems) += p->elements_used;
998                 ret++;
999                 pgnum = p->next;
1000         }
1001         return ret;
1002 }
1003
/* Print usage and overhead statistics for one bucket to @out, and return
 * the bucket's total overhead in bytes (0 if the bucket has no pages).
 * Overhead is reported in three parts: minimal page headers, alignment
 * padding after the header, and unused tail space at the end of each page. */
static unsigned long visualize_bucket(FILE *out, struct header *head,
				      unsigned int bucket,
				      unsigned long poolsize,
				      unsigned int sp_bits)
{
	unsigned long num_full, num_partial, num_pages, page_size,
		elems, hdr_min, hdr_size, elems_per_page, overhead = 0;

	elems_per_page = head->bs[bucket].elements_per_page;

	/* If we used byte-based bitmaps, we could get pg hdr to: */
	hdr_min = sizeof(struct page_header)
		- sizeof(((struct page_header *)0)->used)
		+ align_up(elems_per_page, CHAR_BIT) / CHAR_BIT;
	/* The header size actually in use for this bucket's pages. */
	hdr_size = page_header_size(bucket / INTER_BUCKET_SPACE,
				    elems_per_page);

	/* Sum element counts across both the full and partial page lists. */
	elems = 0;
	num_full = count_list(head, head->bs[bucket].full_list, sp_bits,
			      &elems);
	num_partial = count_list(head, head->bs[bucket].page_list, sp_bits,
				 &elems);
	num_pages = num_full + num_partial;
	/* Nothing allocated from this bucket: nothing to report. */
	if (!num_pages)
		return 0;

	fprintf(out, "Bucket %u (%lu bytes):"
		" %lu full, %lu partial = %lu elements\n",
		bucket, bucket_to_size(bucket), num_full, num_partial, elems);
	/* Strict requirement of page header size. */
	overhead += print_overhead(out, "page headers",
				   hdr_min * num_pages, poolsize);
	/* Gap between minimal page header and actual start. */
	overhead += print_overhead(out, "page post-header alignments",
				   (hdr_size - hdr_min) * num_pages, poolsize);
	/* Between last element and end of page. */
	page_size = (1 << sp_bits);
	if (large_page_bucket(bucket, sp_bits))
		page_size <<= BITS_FROM_SMALL_TO_LARGE_PAGE;

	overhead += print_overhead(out, "page tails",
				   (page_size - (hdr_size
						 + (elems_per_page
						    * bucket_to_size(bucket))))
				   * num_pages, poolsize);
	return overhead;
}
1051
/* Visualization for the tiny allocator: not yet implemented, so pools
 * below MIN_USEFUL_SIZE currently print only the summary line emitted by
 * alloc_visualize() before this is called. */
static void tiny_alloc_visualize(FILE *out, void *pool, unsigned long poolsize)
{
}
1055
1056 void alloc_visualize(FILE *out, void *pool, unsigned long poolsize)
1057 {
1058         struct header *head = pool;
1059         unsigned long i, lp_bits, sp_bits, header_size, num_buckets, count,
1060                 overhead = 0;
1061
1062         fprintf(out, "Pool %p size %lu: (%s allocator)\n", pool, poolsize,
1063                 poolsize < MIN_USEFUL_SIZE ? "tiny" : "standard");
1064
1065         if (poolsize < MIN_USEFUL_SIZE) {
1066                 tiny_alloc_visualize(out, pool, poolsize);
1067                 return;
1068         }
1069         
1070         lp_bits = large_page_bits(poolsize);
1071         sp_bits = lp_bits - BITS_FROM_SMALL_TO_LARGE_PAGE;
1072
1073         num_buckets = max_bucket(lp_bits);
1074         header_size = sizeof(*head) + sizeof(head->bs) * (num_buckets-1);
1075
1076         fprintf(out, "Large page size %lu, small page size %lu.\n",
1077                 1UL << lp_bits, 1UL << sp_bits);
1078         overhead += print_overhead(out, "unused pool tail",
1079                                    poolsize % (1 << lp_bits), poolsize);
1080         fprintf(out, "Main header %lu bytes (%lu small pages).\n",
1081                 header_size, align_up(header_size, 1 << sp_bits) >> sp_bits);
1082         overhead += print_overhead(out, "partial header page",
1083                                    align_up(header_size, 1 << sp_bits)
1084                                    - header_size, poolsize);
1085         /* Total large pages. */
1086         i = count_bits(head->pagesize, poolsize >> lp_bits);
1087         /* Used pages. */
1088         count = i - count_list(head, head->large_free_list, sp_bits, NULL);
1089         fprintf(out, "%lu/%lu large pages used (%.3g%%)\n",
1090                 count, i, count ? 100.0 * count / i : 0.0);
1091
1092         /* Total small pages. */
1093         i = (poolsize >> lp_bits) - i;
1094         /* Used pages */
1095         count = i - count_list(head, head->small_free_list, sp_bits, NULL);
1096         fprintf(out, "%lu/%lu small pages used (%.3g%%)\n",
1097                 count, i, count ? 100.0 * count / i : 0.0);
1098
1099         /* Summary of each bucket. */
1100         fprintf(out, "%lu buckets:\n", num_buckets);
1101         for (i = 0; i < num_buckets; i++)
1102                 overhead += visualize_bucket(out, head, i, poolsize, sp_bits);
1103
1104         print_overhead(out, "total", overhead, poolsize);
1105 }