path: root/mm/zsmalloc.c
author		Minchan Kim <minchan@kernel.org>	2014-01-30 18:45:50 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-01-30 19:56:55 -0500
commit		bcf1647d0899666f0fb90d176abf63bae22abb7c (patch)
tree		6682b0feec718f594829770b4287afa2da266a0f /mm/zsmalloc.c
parent		73f945505b9bf798d8c3ee830cb330dd6d7fb4c7 (diff)
zsmalloc: move it under mm
This patch moves zsmalloc under the mm directory. Before that, this description explains why we needed a custom allocator.

Zsmalloc is a new slab-based memory allocator for storing compressed pages. It is designed for low fragmentation and a high allocation success rate for large but <= PAGE_SIZE allocations.

zsmalloc differs from the kernel slab allocator in two primary ways to achieve these design goals.

zsmalloc never requires high-order page allocations to back slabs, or "size classes" in zsmalloc terms. Instead it allows multiple single-order pages to be stitched together into a "zspage" which backs the slab. This allows for a higher allocation success rate under memory pressure.

Also, zsmalloc allows objects to span page boundaries within the zspage. This allows for lower fragmentation than could be had with the kernel slab allocator for objects between PAGE_SIZE/2 and PAGE_SIZE. With the kernel slab allocator, if a page compresses to 60% of its original size, the memory savings gained through compression are lost in fragmentation because another object of the same size can't be stored in the leftover space.

This ability to span pages results in zsmalloc allocations not being directly addressable by the user. The user is given a non-dereferenceable handle in response to an allocation request. That handle must be mapped, using zs_map_object(), which returns a pointer to the mapped region that can be used. The mapping is necessary since the object data may reside in two different noncontiguous pages.

zsmalloc fulfills the allocation needs of zram perfectly.

[sjenning@linux.vnet.ibm.com: borrow Seth's quote]
Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: Nitin Gupta <ngupta@vflare.org>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Bob Liu <bob.liu@oracle.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Luigi Semenzato <semenzato@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Seth Jennings <sjenning@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
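For reference, a minimal usage sketch of the API this file exports (zs_create_pool, zs_malloc, zs_map_object, zs_unmap_object, zs_free, zs_destroy_pool; the mapping mode ZS_MM_WO is also taken from the code below). The caller, its arguments, and the error handling are illustrative only and not part of this patch:

	#include <linux/errno.h>
	#include <linux/gfp.h>
	#include <linux/string.h>
	#include <linux/zsmalloc.h>

	/* Hypothetical caller: store one compressed buffer and free it again. */
	static int zs_usage_sketch(const void *src, size_t len)
	{
		struct zs_pool *pool;
		unsigned long handle;
		void *dst;

		pool = zs_create_pool(GFP_KERNEL);	/* must be called before anything else */
		if (!pool)
			return -ENOMEM;

		handle = zs_malloc(pool, len);		/* opaque handle, not a dereferenceable pointer */
		if (!handle) {
			zs_destroy_pool(pool);
			return -ENOMEM;
		}

		dst = zs_map_object(pool, handle, ZS_MM_WO);	/* map before touching the object */
		memcpy(dst, src, len);
		zs_unmap_object(pool, handle);			/* unmap as soon as possible */

		zs_free(pool, handle);
		zs_destroy_pool(pool);
		return 0;
	}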
Diffstat (limited to 'mm/zsmalloc.c')
-rw-r--r--	mm/zsmalloc.c	1105
1 file changed, 1105 insertions, 0 deletions
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
new file mode 100644
index 000000000000..5d42adfcb67b
--- /dev/null
+++ b/mm/zsmalloc.c
@@ -0,0 +1,1105 @@
1/*
2 * zsmalloc memory allocator
3 *
4 * Copyright (C) 2011 Nitin Gupta
5 *
6 * This code is released using a dual license strategy: BSD/GPL
7 * You can choose the license that better fits your requirements.
8 *
9 * Released under the terms of 3-clause BSD License
10 * Released under the terms of GNU General Public License Version 2.0
11 */
12
13/*
14 * This allocator is designed for use with zram. Thus, the allocator is
15 * supposed to work well under low memory conditions. In particular, it
16 * never attempts higher order page allocation which is very likely to
17 * fail under memory pressure. On the other hand, if we just use single
18 * (0-order) pages, it would suffer from very high fragmentation --
19 * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
20 * This was one of the major issues with its predecessor (xvmalloc).
21 *
22 * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
23 * and links them together using various 'struct page' fields. These linked
24 * pages act as a single higher-order page i.e. an object can span 0-order
25 * page boundaries. The code refers to these linked pages as a single entity
26 * called zspage.
27 *
28 * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
29 * since this satisfies the requirements of all its current users (in the
30 * worst case, page is incompressible and is thus stored "as-is" i.e. in
31 * uncompressed form). For allocation requests larger than this size, failure
32 * is returned (see zs_malloc).
33 *
34 * Additionally, zs_malloc() does not return a dereferenceable pointer.
35 * Instead, it returns an opaque handle (unsigned long) which encodes actual
36 * location of the allocated object. The reason for this indirection is that
37 * zsmalloc does not keep zspages permanently mapped since that would cause
38 * issues on 32-bit systems where the VA region for kernel space mappings
39 * is very small. So, before using the allocated memory, the object has to
40 * be mapped using zs_map_object() to get a usable pointer and subsequently
41 * unmapped using zs_unmap_object().
42 *
43 * Following is how we use various fields and flags of underlying
44 * struct page(s) to form a zspage.
45 *
46 * Usage of struct page fields:
47 * page->first_page: points to the first component (0-order) page
48 * page->index (union with page->freelist): offset of the first object
49 * starting in this page. For the first page, this is
50 * always 0, so we use this field (aka freelist) to point
51 * to the first free object in zspage.
52 * page->lru: links together all component pages (except the first page)
53 * of a zspage
54 *
55 * For _first_ page only:
56 *
57 * page->private (union with page->first_page): refers to the
58 * component page after the first page
59 * page->freelist: points to the first free object in zspage.
60 * Free objects are linked together using in-place
61 * metadata.
62 * page->objects: maximum number of objects we can store in this
63 * zspage (class->zspage_order * PAGE_SIZE / class->size)
64 * page->lru: links together first pages of various zspages.
65 * Basically forming list of zspages in a fullness group.
66 * page->mapping: class index and fullness group of the zspage
67 *
68 * Usage of struct page flags:
69 * PG_private: identifies the first component page
70 * PG_private2: identifies the last component page
71 *
72 */
73
74#ifdef CONFIG_ZSMALLOC_DEBUG
75#define DEBUG
76#endif
77
78#include <linux/module.h>
79#include <linux/kernel.h>
80#include <linux/bitops.h>
81#include <linux/errno.h>
82#include <linux/highmem.h>
83#include <linux/string.h>
84#include <linux/slab.h>
85#include <asm/tlbflush.h>
86#include <asm/pgtable.h>
87#include <linux/cpumask.h>
88#include <linux/cpu.h>
89#include <linux/vmalloc.h>
90#include <linux/hardirq.h>
91#include <linux/spinlock.h>
92#include <linux/types.h>
93#include <linux/zsmalloc.h>
94
95/*
96 * This must be a power of 2 and greater than or equal to sizeof(link_free).
97 * These two conditions ensure that any 'struct link_free' itself doesn't
98 * span more than 1 page which avoids complex case of mapping 2 pages simply
99 * to restore link_free pointer values.
100 */
101#define ZS_ALIGN 8
102
103/*
104 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
105 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
106 */
107#define ZS_MAX_ZSPAGE_ORDER 2
108#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
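/*
 * Worked example (added for illustration): with ZS_MAX_ZSPAGE_ORDER == 2,
 * ZS_MAX_PAGES_PER_ZSPAGE is 1 << 2 = 4, i.e. a zspage is built from at most
 * four discontiguous 0-order pages.
 */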
109
110/*
111 * Object location (<PFN>, <obj_idx>) is encoded as
112 * a single (unsigned long) handle value.
113 *
114 * Note that object index <obj_idx> is relative to system
115 * page <PFN> it is stored in, so for each sub-page belonging
116 * to a zspage, obj_idx starts with 0.
117 *
118 * This is made more complicated by various memory models and PAE.
119 */
120
121#ifndef MAX_PHYSMEM_BITS
122#ifdef CONFIG_HIGHMEM64G
123#define MAX_PHYSMEM_BITS 36
124#else /* !CONFIG_HIGHMEM64G */
125/*
126 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
127 * be PAGE_SHIFT
128 */
129#define MAX_PHYSMEM_BITS BITS_PER_LONG
130#endif
131#endif
132#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
133#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
134#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
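/*
 * Illustrative examples (added for clarity): on a 64-bit build with 4K pages
 * and MAX_PHYSMEM_BITS == BITS_PER_LONG == 64, _PFN_BITS is 52 and
 * OBJ_INDEX_BITS is 12, so a handle packs <PFN:52 bits><obj_idx+1:12 bits>.
 * On 32-bit with CONFIG_HIGHMEM64G, MAX_PHYSMEM_BITS is 36, _PFN_BITS is 24
 * and OBJ_INDEX_BITS is 8, giving OBJ_INDEX_MASK == 0xff.
 */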
135
136#define MAX(a, b) ((a) >= (b) ? (a) : (b))
137/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
138#define ZS_MIN_ALLOC_SIZE \
139 MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
140#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
141
142/*
143 * On systems with 4K page size, this gives 255 size classes! There is a
144 * trade-off here:
145 * - Large number of size classes is potentially wasteful as free pages are
146 * spread across these classes
147 * - Small number of size classes causes large internal fragmentation
148 * - Probably it's better to use specific size classes (empirically
149 * determined). NOTE: all those class sizes must be set as multiple of
150 * ZS_ALIGN to make sure link_free itself never has to span 2 pages.
151 *
152 * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
153 * (reason above)
154 */
155#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8)
156#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
157 ZS_SIZE_CLASS_DELTA + 1)
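/*
 * Worked example (added for illustration, assuming 4K pages and a 64-bit
 * build where ZS_MIN_ALLOC_SIZE evaluates to 32): ZS_SIZE_CLASS_DELTA is
 * 4096 >> 8 = 16, so ZS_SIZE_CLASSES = (4096 - 32) / 16 + 1 = 255, covering
 * class sizes 32, 48, 64, ..., 4096.
 */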
158
159/*
160 * We do not maintain any list for completely empty or full pages
161 */
162enum fullness_group {
163 ZS_ALMOST_FULL,
164 ZS_ALMOST_EMPTY,
165 _ZS_NR_FULLNESS_GROUPS,
166
167 ZS_EMPTY,
168 ZS_FULL
169};
170
171/*
172 * We assign a page to ZS_ALMOST_EMPTY fullness group when:
173 * n <= N / f, where
174 * n = number of allocated objects
175 * N = total number of objects zspage can store
177 * f = fullness_threshold_frac
177 *
178 * Similarly, we assign zspage to:
179 * ZS_ALMOST_FULL when n > N / f
180 * ZS_EMPTY when n == 0
181 * ZS_FULL when n == N
182 *
183 * (see: fix_fullness_group())
184 */
185static const int fullness_threshold_frac = 4;
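/*
 * Worked example (added for illustration): with fullness_threshold_frac == 4
 * and a zspage that can hold N = 128 objects, the zspage is ZS_EMPTY at 0
 * live objects, ZS_ALMOST_EMPTY from 1 to 128 / 4 = 32, ZS_ALMOST_FULL from
 * 33 to 127, and ZS_FULL at 128 (see get_fullness_group() below).
 */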
186
187struct size_class {
188 /*
189 * Size of objects stored in this class. Must be multiple
190 * of ZS_ALIGN.
191 */
192 int size;
193 unsigned int index;
194
195 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
196 int pages_per_zspage;
197
198 spinlock_t lock;
199
200 /* stats */
201 u64 pages_allocated;
202
203 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
204};
205
206/*
207 * Placed within free objects to form a singly linked list.
208 * For every zspage, first_page->freelist gives head of this list.
209 *
210 * This must be a power of 2 and less than or equal to ZS_ALIGN
211 */
212struct link_free {
213 /* Handle of next free chunk (encodes <PFN, obj_idx>) */
214 void *next;
215};
216
217struct zs_pool {
218 struct size_class size_class[ZS_SIZE_CLASSES];
219
220 gfp_t flags; /* allocation flags used when growing pool */
221};
222
223/*
224 * A zspage's class index and fullness group
225 * are encoded in its (first)page->mapping
226 */
227#define CLASS_IDX_BITS 28
228#define FULLNESS_BITS 4
229#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1)
230#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1)
231
232struct mapping_area {
233#ifdef CONFIG_PGTABLE_MAPPING
234 struct vm_struct *vm; /* vm area for mapping objects that span pages */
235#else
236 char *vm_buf; /* copy buffer for objects that span pages */
237#endif
238 char *vm_addr; /* address of kmap_atomic()'ed pages */
239 enum zs_mapmode vm_mm; /* mapping mode */
240};
241
242
243/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
244static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
245
246static int is_first_page(struct page *page)
247{
248 return PagePrivate(page);
249}
250
251static int is_last_page(struct page *page)
252{
253 return PagePrivate2(page);
254}
255
256static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
257 enum fullness_group *fullness)
258{
259 unsigned long m;
260 BUG_ON(!is_first_page(page));
261
262 m = (unsigned long)page->mapping;
263 *fullness = m & FULLNESS_MASK;
264 *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
265}
266
267static void set_zspage_mapping(struct page *page, unsigned int class_idx,
268 enum fullness_group fullness)
269{
270 unsigned long m;
271 BUG_ON(!is_first_page(page));
272
273 m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
274 (fullness & FULLNESS_MASK);
275 page->mapping = (struct address_space *)m;
276}
277
278/*
279 * zsmalloc divides the pool into various size classes where each
280 * class maintains a list of zspages where each zspage is divided
281 * into equal sized chunks. Each allocation falls into one of these
282 * classes depending on its size. This function returns the index of the
283 * size class which has a chunk size big enough to hold the given size.
284 */
285static int get_size_class_index(int size)
286{
287 int idx = 0;
288
289 if (likely(size > ZS_MIN_ALLOC_SIZE))
290 idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
291 ZS_SIZE_CLASS_DELTA);
292
293 return idx;
294}
295
296/*
297 * For each size class, zspages are divided into different groups
298 * depending on how "full" they are. This was done so that we could
299 * easily find empty or nearly empty zspages when we try to shrink
300 * the pool (not yet implemented). This function returns fullness
301 * status of the given page.
302 */
303static enum fullness_group get_fullness_group(struct page *page)
304{
305 int inuse, max_objects;
306 enum fullness_group fg;
307 BUG_ON(!is_first_page(page));
308
309 inuse = page->inuse;
310 max_objects = page->objects;
311
312 if (inuse == 0)
313 fg = ZS_EMPTY;
314 else if (inuse == max_objects)
315 fg = ZS_FULL;
316 else if (inuse <= max_objects / fullness_threshold_frac)
317 fg = ZS_ALMOST_EMPTY;
318 else
319 fg = ZS_ALMOST_FULL;
320
321 return fg;
322}
323
324/*
325 * Each size class maintains various freelists and zspages are assigned
326 * to one of these freelists based on the number of live objects they
327 * have. This function inserts the given zspage into the freelist
328 * identified by <class, fullness_group>.
329 */
330static void insert_zspage(struct page *page, struct size_class *class,
331 enum fullness_group fullness)
332{
333 struct page **head;
334
335 BUG_ON(!is_first_page(page));
336
337 if (fullness >= _ZS_NR_FULLNESS_GROUPS)
338 return;
339
340 head = &class->fullness_list[fullness];
341 if (*head)
342 list_add_tail(&page->lru, &(*head)->lru);
343
344 *head = page;
345}
346
347/*
348 * This function removes the given zspage from the freelist identified
349 * by <class, fullness_group>.
350 */
351static void remove_zspage(struct page *page, struct size_class *class,
352 enum fullness_group fullness)
353{
354 struct page **head;
355
356 BUG_ON(!is_first_page(page));
357
358 if (fullness >= _ZS_NR_FULLNESS_GROUPS)
359 return;
360
361 head = &class->fullness_list[fullness];
362 BUG_ON(!*head);
363 if (list_empty(&(*head)->lru))
364 *head = NULL;
365 else if (*head == page)
366 *head = (struct page *)list_entry((*head)->lru.next,
367 struct page, lru);
368
369 list_del_init(&page->lru);
370}
371
372/*
373 * Each size class maintains zspages in different fullness groups depending
374 * on the number of live objects they contain. When allocating or freeing
375 * objects, the fullness status of the page can change, say, from ALMOST_FULL
376 * to ALMOST_EMPTY when freeing an object. This function checks if such
377 * a status change has occurred for the given page and accordingly moves the
378 * page from the freelist of the old fullness group to that of the new
379 * fullness group.
380 */
381static enum fullness_group fix_fullness_group(struct zs_pool *pool,
382 struct page *page)
383{
384 int class_idx;
385 struct size_class *class;
386 enum fullness_group currfg, newfg;
387
388 BUG_ON(!is_first_page(page));
389
390 get_zspage_mapping(page, &class_idx, &currfg);
391 newfg = get_fullness_group(page);
392 if (newfg == currfg)
393 goto out;
394
395 class = &pool->size_class[class_idx];
396 remove_zspage(page, class, currfg);
397 insert_zspage(page, class, newfg);
398 set_zspage_mapping(page, class_idx, newfg);
399
400out:
401 return newfg;
402}
403
404/*
405 * We have to decide on how many pages to link together
406 * to form a zspage for each size class. This is important
407 * to reduce wastage due to unusable space left at end of
408 * each zspage which is given as:
409 * wastage = Zp - Zp % size_class
410 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
411 *
412 * For example, for size class of 3/8 * PAGE_SIZE, we should
413 * link together 3 PAGE_SIZE sized pages to form a zspage
414 * since then we can perfectly fit in 8 such objects.
415 */
416static int get_pages_per_zspage(int class_size)
417{
418 int i, max_usedpc = 0;
419 /* zspage order which gives maximum used size per KB */
420 int max_usedpc_order = 1;
421
422 for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
423 int zspage_size;
424 int waste, usedpc;
425
426 zspage_size = i * PAGE_SIZE;
427 waste = zspage_size % class_size;
428 usedpc = (zspage_size - waste) * 100 / zspage_size;
429
430 if (usedpc > max_usedpc) {
431 max_usedpc = usedpc;
432 max_usedpc_order = i;
433 }
434 }
435
436 return max_usedpc_order;
437}
438
439/*
440 * A single 'zspage' is composed of many system pages which are
441 * linked together using fields in struct page. This function finds
442 * the first/head page, given any component page of a zspage.
443 */
444static struct page *get_first_page(struct page *page)
445{
446 if (is_first_page(page))
447 return page;
448 else
449 return page->first_page;
450}
451
452static struct page *get_next_page(struct page *page)
453{
454 struct page *next;
455
456 if (is_last_page(page))
457 next = NULL;
458 else if (is_first_page(page))
459 next = (struct page *)page_private(page);
460 else
461 next = list_entry(page->lru.next, struct page, lru);
462
463 return next;
464}
465
466/*
467 * Encode <page, obj_idx> as a single handle value.
468 * On hardware platforms with physical memory starting at 0x0 the pfn
469 * could be 0 so we ensure that the handle will never be 0 by adjusting the
470 * encoded obj_idx value before encoding.
471 */
472static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
473{
474 unsigned long handle;
475
476 if (!page) {
477 BUG_ON(obj_idx);
478 return NULL;
479 }
480
481 handle = page_to_pfn(page) << OBJ_INDEX_BITS;
482 handle |= ((obj_idx + 1) & OBJ_INDEX_MASK);
483
484 return (void *)handle;
485}
486
487/*
488 * Decode <page, obj_idx> pair from the given object handle. We adjust the
489 * decoded obj_idx back to its original value since it was adjusted in
490 * obj_location_to_handle().
491 */
492static void obj_handle_to_location(unsigned long handle, struct page **page,
493 unsigned long *obj_idx)
494{
495 *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
496 *obj_idx = (handle & OBJ_INDEX_MASK) - 1;
497}
498
499static unsigned long obj_idx_to_offset(struct page *page,
500 unsigned long obj_idx, int class_size)
501{
502 unsigned long off = 0;
503
504 if (!is_first_page(page))
505 off = page->index;
506
507 return off + obj_idx * class_size;
508}
509
510static void reset_page(struct page *page)
511{
512 clear_bit(PG_private, &page->flags);
513 clear_bit(PG_private_2, &page->flags);
514 set_page_private(page, 0);
515 page->mapping = NULL;
516 page->freelist = NULL;
517 page_mapcount_reset(page);
518}
519
520static void free_zspage(struct page *first_page)
521{
522 struct page *nextp, *tmp, *head_extra;
523
524 BUG_ON(!is_first_page(first_page));
525 BUG_ON(first_page->inuse);
526
527 head_extra = (struct page *)page_private(first_page);
528
529 reset_page(first_page);
530 __free_page(first_page);
531
532 /* zspage with only 1 system page */
533 if (!head_extra)
534 return;
535
536 list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
537 list_del(&nextp->lru);
538 reset_page(nextp);
539 __free_page(nextp);
540 }
541 reset_page(head_extra);
542 __free_page(head_extra);
543}
544
545/* Initialize a newly allocated zspage */
546static void init_zspage(struct page *first_page, struct size_class *class)
547{
548 unsigned long off = 0;
549 struct page *page = first_page;
550
551 BUG_ON(!is_first_page(first_page));
552 while (page) {
553 struct page *next_page;
554 struct link_free *link;
555 unsigned int i, objs_on_page;
556
557 /*
558 * page->index stores offset of first object starting
559 * in the page. For the first page, this is always 0,
560 * so we use first_page->index (aka ->freelist) to store
561 * head of corresponding zspage's freelist.
562 */
563 if (page != first_page)
564 page->index = off;
565
566 link = (struct link_free *)kmap_atomic(page) +
567 off / sizeof(*link);
568 objs_on_page = (PAGE_SIZE - off) / class->size;
569
570 for (i = 1; i <= objs_on_page; i++) {
571 off += class->size;
572 if (off < PAGE_SIZE) {
573 link->next = obj_location_to_handle(page, i);
574 link += class->size / sizeof(*link);
575 }
576 }
577
578 /*
579 * We now come to the last (full or partial) object on this
580 * page, which must point to the first object on the next
581 * page (if present)
582 */
583 next_page = get_next_page(page);
584 link->next = obj_location_to_handle(next_page, 0);
585 kunmap_atomic(link);
586 page = next_page;
587 off = (off + class->size) % PAGE_SIZE;
588 }
589}
590
591/*
592 * Allocate a zspage for the given size class
593 */
594static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
595{
596 int i, error;
597 struct page *first_page = NULL, *uninitialized_var(prev_page);
598
599 /*
600 * Allocate individual pages and link them together as:
601 * 1. first page->private = first sub-page
602 * 2. all sub-pages are linked together using page->lru
603 * 3. each sub-page is linked to the first page using page->first_page
604 *
605 * For each size class, First/Head pages are linked together using
606 * page->lru. Also, we set PG_private to identify the first page
607 * (i.e. no other sub-page has this flag set) and PG_private_2 to
608 * identify the last page.
609 */
610 error = -ENOMEM;
611 for (i = 0; i < class->pages_per_zspage; i++) {
612 struct page *page;
613
614 page = alloc_page(flags);
615 if (!page)
616 goto cleanup;
617
618 INIT_LIST_HEAD(&page->lru);
619 if (i == 0) { /* first page */
620 SetPagePrivate(page);
621 set_page_private(page, 0);
622 first_page = page;
623 first_page->inuse = 0;
624 }
625 if (i == 1)
626 set_page_private(first_page, (unsigned long)page);
627 if (i >= 1)
628 page->first_page = first_page;
629 if (i >= 2)
630 list_add(&page->lru, &prev_page->lru);
631 if (i == class->pages_per_zspage - 1) /* last page */
632 SetPagePrivate2(page);
633 prev_page = page;
634 }
635
636 init_zspage(first_page, class);
637
638 first_page->freelist = obj_location_to_handle(first_page, 0);
639 /* Maximum number of objects we can store in this zspage */
640 first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
641
642 error = 0; /* Success */
643
644cleanup:
645 if (unlikely(error) && first_page) {
646 free_zspage(first_page);
647 first_page = NULL;
648 }
649
650 return first_page;
651}
652
653static struct page *find_get_zspage(struct size_class *class)
654{
655 int i;
656 struct page *page;
657
658 for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
659 page = class->fullness_list[i];
660 if (page)
661 break;
662 }
663
664 return page;
665}
666
667#ifdef CONFIG_PGTABLE_MAPPING
668static inline int __zs_cpu_up(struct mapping_area *area)
669{
670 /*
671 * Make sure we don't leak memory if a cpu UP notification
672 * and zs_init() race and both call zs_cpu_up() on the same cpu
673 */
674 if (area->vm)
675 return 0;
676 area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
677 if (!area->vm)
678 return -ENOMEM;
679 return 0;
680}
681
682static inline void __zs_cpu_down(struct mapping_area *area)
683{
684 if (area->vm)
685 free_vm_area(area->vm);
686 area->vm = NULL;
687}
688
689static inline void *__zs_map_object(struct mapping_area *area,
690 struct page *pages[2], int off, int size)
691{
692 BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages));
693 area->vm_addr = area->vm->addr;
694 return area->vm_addr + off;
695}
696
697static inline void __zs_unmap_object(struct mapping_area *area,
698 struct page *pages[2], int off, int size)
699{
700 unsigned long addr = (unsigned long)area->vm_addr;
701
702 unmap_kernel_range(addr, PAGE_SIZE * 2);
703}
704
705#else /* CONFIG_PGTABLE_MAPPING */
706
707static inline int __zs_cpu_up(struct mapping_area *area)
708{
709 /*
710 * Make sure we don't leak memory if a cpu UP notification
711 * and zs_init() race and both call zs_cpu_up() on the same cpu
712 */
713 if (area->vm_buf)
714 return 0;
715 area->vm_buf = (char *)__get_free_page(GFP_KERNEL);
716 if (!area->vm_buf)
717 return -ENOMEM;
718 return 0;
719}
720
721static inline void __zs_cpu_down(struct mapping_area *area)
722{
723 if (area->vm_buf)
724 free_page((unsigned long)area->vm_buf);
725 area->vm_buf = NULL;
726}
727
728static void *__zs_map_object(struct mapping_area *area,
729 struct page *pages[2], int off, int size)
730{
731 int sizes[2];
732 void *addr;
733 char *buf = area->vm_buf;
734
735 /* disable page faults to match kmap_atomic() return conditions */
736 pagefault_disable();
737
738 /* no read fastpath */
739 if (area->vm_mm == ZS_MM_WO)
740 goto out;
741
742 sizes[0] = PAGE_SIZE - off;
743 sizes[1] = size - sizes[0];
744
745 /* copy object to per-cpu buffer */
746 addr = kmap_atomic(pages[0]);
747 memcpy(buf, addr + off, sizes[0]);
748 kunmap_atomic(addr);
749 addr = kmap_atomic(pages[1]);
750 memcpy(buf + sizes[0], addr, sizes[1]);
751 kunmap_atomic(addr);
752out:
753 return area->vm_buf;
754}
755
756static void __zs_unmap_object(struct mapping_area *area,
757 struct page *pages[2], int off, int size)
758{
759 int sizes[2];
760 void *addr;
761 char *buf = area->vm_buf;
762
763 /* no write fastpath */
764 if (area->vm_mm == ZS_MM_RO)
765 goto out;
766
767 sizes[0] = PAGE_SIZE - off;
768 sizes[1] = size - sizes[0];
769
770 /* copy per-cpu buffer to object */
771 addr = kmap_atomic(pages[0]);
772 memcpy(addr + off, buf, sizes[0]);
773 kunmap_atomic(addr);
774 addr = kmap_atomic(pages[1]);
775 memcpy(addr, buf + sizes[0], sizes[1]);
776 kunmap_atomic(addr);
777
778out:
779 /* enable page faults to match kunmap_atomic() return conditions */
780 pagefault_enable();
781}
782
783#endif /* CONFIG_PGTABLE_MAPPING */
784
785static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
786 void *pcpu)
787{
788 int ret, cpu = (long)pcpu;
789 struct mapping_area *area;
790
791 switch (action) {
792 case CPU_UP_PREPARE:
793 area = &per_cpu(zs_map_area, cpu);
794 ret = __zs_cpu_up(area);
795 if (ret)
796 return notifier_from_errno(ret);
797 break;
798 case CPU_DEAD:
799 case CPU_UP_CANCELED:
800 area = &per_cpu(zs_map_area, cpu);
801 __zs_cpu_down(area);
802 break;
803 }
804
805 return NOTIFY_OK;
806}
807
808static struct notifier_block zs_cpu_nb = {
809 .notifier_call = zs_cpu_notifier
810};
811
812static void zs_exit(void)
813{
814 int cpu;
815
816 for_each_online_cpu(cpu)
817 zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
818 unregister_cpu_notifier(&zs_cpu_nb);
819}
820
821static int zs_init(void)
822{
823 int cpu, ret;
824
825 register_cpu_notifier(&zs_cpu_nb);
826 for_each_online_cpu(cpu) {
827 ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
828 if (notifier_to_errno(ret))
829 goto fail;
830 }
831 return 0;
832fail:
833 zs_exit();
834 return notifier_to_errno(ret);
835}
836
837/**
838 * zs_create_pool - Creates an allocation pool to work from.
839 * @flags: allocation flags used to allocate pool metadata
840 *
841 * This function must be called before anything when using
842 * the zsmalloc allocator.
843 *
844 * On success, a pointer to the newly created pool is returned,
845 * otherwise NULL.
846 */
847struct zs_pool *zs_create_pool(gfp_t flags)
848{
849 int i, ovhd_size;
850 struct zs_pool *pool;
851
852 ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
853 pool = kzalloc(ovhd_size, GFP_KERNEL);
854 if (!pool)
855 return NULL;
856
857 for (i = 0; i < ZS_SIZE_CLASSES; i++) {
858 int size;
859 struct size_class *class;
860
861 size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
862 if (size > ZS_MAX_ALLOC_SIZE)
863 size = ZS_MAX_ALLOC_SIZE;
864
865 class = &pool->size_class[i];
866 class->size = size;
867 class->index = i;
868 spin_lock_init(&class->lock);
869 class->pages_per_zspage = get_pages_per_zspage(size);
870
871 }
872
873 pool->flags = flags;
874
875 return pool;
876}
877EXPORT_SYMBOL_GPL(zs_create_pool);
878
879void zs_destroy_pool(struct zs_pool *pool)
880{
881 int i;
882
883 for (i = 0; i < ZS_SIZE_CLASSES; i++) {
884 int fg;
885 struct size_class *class = &pool->size_class[i];
886
887 for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
888 if (class->fullness_list[fg]) {
889 pr_info("Freeing non-empty class with size %db, fullness group %d\n",
890 class->size, fg);
891 }
892 }
893 }
894 kfree(pool);
895}
896EXPORT_SYMBOL_GPL(zs_destroy_pool);
897
898/**
899 * zs_malloc - Allocate block of given size from pool.
900 * @pool: pool to allocate from
901 * @size: size of block to allocate
902 *
903 * On success, handle to the allocated object is returned,
904 * otherwise 0.
905 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
906 */
907unsigned long zs_malloc(struct zs_pool *pool, size_t size)
908{
909 unsigned long obj;
910 struct link_free *link;
911 int class_idx;
912 struct size_class *class;
913
914 struct page *first_page, *m_page;
915 unsigned long m_objidx, m_offset;
916
917 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
918 return 0;
919
920 class_idx = get_size_class_index(size);
921 class = &pool->size_class[class_idx];
922 BUG_ON(class_idx != class->index);
923
924 spin_lock(&class->lock);
925 first_page = find_get_zspage(class);
926
927 if (!first_page) {
928 spin_unlock(&class->lock);
929 first_page = alloc_zspage(class, pool->flags);
930 if (unlikely(!first_page))
931 return 0;
932
933 set_zspage_mapping(first_page, class->index, ZS_EMPTY);
934 spin_lock(&class->lock);
935 class->pages_allocated += class->pages_per_zspage;
936 }
937
938 obj = (unsigned long)first_page->freelist;
939 obj_handle_to_location(obj, &m_page, &m_objidx);
940 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
941
942 link = (struct link_free *)kmap_atomic(m_page) +
943 m_offset / sizeof(*link);
944 first_page->freelist = link->next;
945 memset(link, POISON_INUSE, sizeof(*link));
946 kunmap_atomic(link);
947
948 first_page->inuse++;
949 /* Now move the zspage to another fullness group, if required */
950 fix_fullness_group(pool, first_page);
951 spin_unlock(&class->lock);
952
953 return obj;
954}
955EXPORT_SYMBOL_GPL(zs_malloc);
956
957void zs_free(struct zs_pool *pool, unsigned long obj)
958{
959 struct link_free *link;
960 struct page *first_page, *f_page;
961 unsigned long f_objidx, f_offset;
962
963 int class_idx;
964 struct size_class *class;
965 enum fullness_group fullness;
966
967 if (unlikely(!obj))
968 return;
969
970 obj_handle_to_location(obj, &f_page, &f_objidx);
971 first_page = get_first_page(f_page);
972
973 get_zspage_mapping(first_page, &class_idx, &fullness);
974 class = &pool->size_class[class_idx];
975 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
976
977 spin_lock(&class->lock);
978
979 /* Insert this object in containing zspage's freelist */
980 link = (struct link_free *)((unsigned char *)kmap_atomic(f_page)
981 + f_offset);
982 link->next = first_page->freelist;
983 kunmap_atomic(link);
984 first_page->freelist = (void *)obj;
985
986 first_page->inuse--;
987 fullness = fix_fullness_group(pool, first_page);
988
989 if (fullness == ZS_EMPTY)
990 class->pages_allocated -= class->pages_per_zspage;
991
992 spin_unlock(&class->lock);
993
994 if (fullness == ZS_EMPTY)
995 free_zspage(first_page);
996}
997EXPORT_SYMBOL_GPL(zs_free);
998
999/**
1000 * zs_map_object - get address of allocated object from handle.
1001 * @pool: pool from which the object was allocated
1002 * @handle: handle returned from zs_malloc
1003 *
1004 * Before using an object allocated from zs_malloc, it must be mapped using
1005 * this function. When done with the object, it must be unmapped using
1006 * zs_unmap_object.
1007 *
1008 * Only one object can be mapped per cpu at a time. There is no protection
1009 * against nested mappings.
1010 *
1011 * This function returns with preemption and page faults disabled.
1012 */
1013void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1014 enum zs_mapmode mm)
1015{
1016 struct page *page;
1017 unsigned long obj_idx, off;
1018
1019 unsigned int class_idx;
1020 enum fullness_group fg;
1021 struct size_class *class;
1022 struct mapping_area *area;
1023 struct page *pages[2];
1024
1025 BUG_ON(!handle);
1026
1027 /*
1028 * Because we use per-cpu mapping areas shared among the
1029 * pools/users, we can't allow mapping in interrupt context
1030 * because it can corrupt another user's mappings.
1031 */
1032 BUG_ON(in_interrupt());
1033
1034 obj_handle_to_location(handle, &page, &obj_idx);
1035 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1036 class = &pool->size_class[class_idx];
1037 off = obj_idx_to_offset(page, obj_idx, class->size);
1038
1039 area = &get_cpu_var(zs_map_area);
1040 area->vm_mm = mm;
1041 if (off + class->size <= PAGE_SIZE) {
1042 /* this object is contained entirely within a page */
1043 area->vm_addr = kmap_atomic(page);
1044 return area->vm_addr + off;
1045 }
1046
1047 /* this object spans two pages */
1048 pages[0] = page;
1049 pages[1] = get_next_page(page);
1050 BUG_ON(!pages[1]);
1051
1052 return __zs_map_object(area, pages, off, class->size);
1053}
1054EXPORT_SYMBOL_GPL(zs_map_object);
1055
1056void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1057{
1058 struct page *page;
1059 unsigned long obj_idx, off;
1060
1061 unsigned int class_idx;
1062 enum fullness_group fg;
1063 struct size_class *class;
1064 struct mapping_area *area;
1065
1066 BUG_ON(!handle);
1067
1068 obj_handle_to_location(handle, &page, &obj_idx);
1069 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1070 class = &pool->size_class[class_idx];
1071 off = obj_idx_to_offset(page, obj_idx, class->size);
1072
1073 area = &__get_cpu_var(zs_map_area);
1074 if (off + class->size <= PAGE_SIZE)
1075 kunmap_atomic(area->vm_addr);
1076 else {
1077 struct page *pages[2];
1078
1079 pages[0] = page;
1080 pages[1] = get_next_page(page);
1081 BUG_ON(!pages[1]);
1082
1083 __zs_unmap_object(area, pages, off, class->size);
1084 }
1085 put_cpu_var(zs_map_area);
1086}
1087EXPORT_SYMBOL_GPL(zs_unmap_object);
1088
1089u64 zs_get_total_size_bytes(struct zs_pool *pool)
1090{
1091 int i;
1092 u64 npages = 0;
1093
1094 for (i = 0; i < ZS_SIZE_CLASSES; i++)
1095 npages += pool->size_class[i].pages_allocated;
1096
1097 return npages << PAGE_SHIFT;
1098}
1099EXPORT_SYMBOL_GPL(zs_get_total_size_bytes);
1100
1101module_init(zs_init);
1102module_exit(zs_exit);
1103
1104MODULE_LICENSE("Dual BSD/GPL");
1105MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");