aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-07-10 21:11:43 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-07-10 21:11:43 -0400
commitdb6e330490e448733e7836833e25e96034770058 (patch)
treeee23fbe6831bbd757328b3d80b4faa6651b2c314
parentae9249493049fd41fa52fc0470251ee1efaabe74 (diff)
parent98d1e64f95b177d0f14efbdf695a1b28e1428035 (diff)
Merge branch 'akpm' (patches from Andrew Morton)
Merge more patches from Andrew Morton: "The rest of MM" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: mm: remove free_area_cache zswap: add documentation zswap: add to mm/ zbud: add to mm/
-rw-r--r--Documentation/vm/zswap.txt68
-rw-r--r--arch/arm/mm/mmap.c2
-rw-r--r--arch/arm64/mm/mmap.c2
-rw-r--r--arch/mips/mm/mmap.c2
-rw-r--r--arch/powerpc/mm/mmap.c2
-rw-r--r--arch/s390/mm/mmap.c4
-rw-r--r--arch/sparc/kernel/sys_sparc_64.c2
-rw-r--r--arch/tile/mm/mmap.c2
-rw-r--r--arch/x86/ia32/ia32_aout.c2
-rw-r--r--arch/x86/mm/mmap.c2
-rw-r--r--fs/binfmt_aout.c2
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--include/linux/mm_types.h3
-rw-r--r--include/linux/sched.h2
-rw-r--r--include/linux/zbud.h22
-rw-r--r--kernel/fork.c4
-rw-r--r--mm/Kconfig30
-rw-r--r--mm/Makefile2
-rw-r--r--mm/mmap.c28
-rw-r--r--mm/nommu.c4
-rw-r--r--mm/util.c1
-rw-r--r--mm/zbud.c527
-rw-r--r--mm/zswap.c943
23 files changed, 1592 insertions, 66 deletions
diff --git a/Documentation/vm/zswap.txt b/Documentation/vm/zswap.txt
new file mode 100644
index 000000000000..7e492d8aaeaf
--- /dev/null
+++ b/Documentation/vm/zswap.txt
@@ -0,0 +1,68 @@
1Overview:
2
3Zswap is a lightweight compressed cache for swap pages. It takes pages that are
4in the process of being swapped out and attempts to compress them into a
5dynamically allocated RAM-based memory pool. zswap basically trades CPU cycles
6for potentially reduced swap I/O.  This trade-off can also result in a
7significant performance improvement if reads from the compressed cache are
8faster than reads from a swap device.
9
10NOTE: Zswap is a new feature as of v3.11 and interacts heavily with memory
11reclaim. This interaction has not been fully explored on the large set of
12potential configurations and workloads that exist. For this reason, zswap
13is a work in progress and should be considered experimental.
14
15Some potential benefits:
16* Desktop/laptop users with limited RAM capacities can mitigate the
17    performance impact of swapping.
18* Overcommitted guests that share a common I/O resource can
19    dramatically reduce their swap I/O pressure, avoiding heavy handed I/O
20 throttling by the hypervisor. This allows more work to get done with less
21 impact to the guest workload and guests sharing the I/O subsystem
22* Users with SSDs as swap devices can extend the life of the device by
23    drastically reducing life-shortening writes.
24
25Zswap evicts pages from compressed cache on an LRU basis to the backing swap
26device when the compressed pool reaches its size limit. This requirement had
27been identified in prior community discussions.
28
29To enable zswap, the "enabled" attribute must be set to 1 at boot time. e.g.
30zswap.enabled=1
31
32Design:
33
34Zswap receives pages for compression through the Frontswap API and is able to
35evict pages from its own compressed pool on an LRU basis and write them back to
36the backing swap device in the case that the compressed pool is full.
37
38Zswap makes use of zbud for managing the compressed memory pool. Each
39allocation in zbud is not directly accessible by address. Rather, a handle is
40returned by the allocation routine and that handle must be mapped before being
41accessed. The compressed memory pool grows on demand and shrinks as compressed
42pages are freed. The pool is not preallocated.
43
44When a swap page is passed from frontswap to zswap, zswap maintains a mapping
45of the swap entry, a combination of the swap type and swap offset, to the zbud
46handle that references that compressed swap page. This mapping is achieved
47with a red-black tree per swap type. The swap offset is the search key for the
48tree nodes.
49
50During a page fault on a PTE that is a swap entry, frontswap calls the zswap
51load function to decompress the page into the page allocated by the page fault
52handler.
53
54Once there are no PTEs referencing a swap page stored in zswap (i.e. the count
55in the swap_map goes to 0) the swap code calls the zswap invalidate function,
56via frontswap, to free the compressed entry.
57
58Zswap seeks to be simple in its policies. Sysfs attributes allow for one user
59controlled policy:
60* max_pool_percent - The maximum percentage of memory that the compressed
61 pool can occupy.
62
63Zswap allows the compressor to be selected at kernel boot time by setting the
64“compressor” attribute. The default compressor is lzo. e.g.
65zswap.compressor=deflate
66
67A debugfs interface is provided for various statistics about pool size, number
68of pages stored, and various counters for the reasons pages are rejected.
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 10062ceadd1c..0c6356255fe3 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -181,11 +181,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
181 if (mmap_is_legacy()) { 181 if (mmap_is_legacy()) {
182 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; 182 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
183 mm->get_unmapped_area = arch_get_unmapped_area; 183 mm->get_unmapped_area = arch_get_unmapped_area;
184 mm->unmap_area = arch_unmap_area;
185 } else { 184 } else {
186 mm->mmap_base = mmap_base(random_factor); 185 mm->mmap_base = mmap_base(random_factor);
187 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 186 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
188 mm->unmap_area = arch_unmap_area_topdown;
189 } 187 }
190} 188}
191 189
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 7c7be7855638..8ed6cb1a900f 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -90,11 +90,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
90 if (mmap_is_legacy()) { 90 if (mmap_is_legacy()) {
91 mm->mmap_base = TASK_UNMAPPED_BASE; 91 mm->mmap_base = TASK_UNMAPPED_BASE;
92 mm->get_unmapped_area = arch_get_unmapped_area; 92 mm->get_unmapped_area = arch_get_unmapped_area;
93 mm->unmap_area = arch_unmap_area;
94 } else { 93 } else {
95 mm->mmap_base = mmap_base(); 94 mm->mmap_base = mmap_base();
96 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 95 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
97 mm->unmap_area = arch_unmap_area_topdown;
98 } 96 }
99} 97}
100EXPORT_SYMBOL_GPL(arch_pick_mmap_layout); 98EXPORT_SYMBOL_GPL(arch_pick_mmap_layout);
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index 7e5fe2790d8a..f1baadd56e82 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -158,11 +158,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
158 if (mmap_is_legacy()) { 158 if (mmap_is_legacy()) {
159 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; 159 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
160 mm->get_unmapped_area = arch_get_unmapped_area; 160 mm->get_unmapped_area = arch_get_unmapped_area;
161 mm->unmap_area = arch_unmap_area;
162 } else { 161 } else {
163 mm->mmap_base = mmap_base(random_factor); 162 mm->mmap_base = mmap_base(random_factor);
164 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 163 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
165 mm->unmap_area = arch_unmap_area_topdown;
166 } 164 }
167} 165}
168 166
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 67a42ed0d2fc..cb8bdbe4972f 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -92,10 +92,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
92 if (mmap_is_legacy()) { 92 if (mmap_is_legacy()) {
93 mm->mmap_base = TASK_UNMAPPED_BASE; 93 mm->mmap_base = TASK_UNMAPPED_BASE;
94 mm->get_unmapped_area = arch_get_unmapped_area; 94 mm->get_unmapped_area = arch_get_unmapped_area;
95 mm->unmap_area = arch_unmap_area;
96 } else { 95 } else {
97 mm->mmap_base = mmap_base(); 96 mm->mmap_base = mmap_base();
98 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 97 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
99 mm->unmap_area = arch_unmap_area_topdown;
100 } 98 }
101} 99}
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 06bafec00278..40023290ee5b 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -91,11 +91,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
91 if (mmap_is_legacy()) { 91 if (mmap_is_legacy()) {
92 mm->mmap_base = TASK_UNMAPPED_BASE; 92 mm->mmap_base = TASK_UNMAPPED_BASE;
93 mm->get_unmapped_area = arch_get_unmapped_area; 93 mm->get_unmapped_area = arch_get_unmapped_area;
94 mm->unmap_area = arch_unmap_area;
95 } else { 94 } else {
96 mm->mmap_base = mmap_base(); 95 mm->mmap_base = mmap_base();
97 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 96 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
98 mm->unmap_area = arch_unmap_area_topdown;
99 } 97 }
100} 98}
101 99
@@ -176,11 +174,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
176 if (mmap_is_legacy()) { 174 if (mmap_is_legacy()) {
177 mm->mmap_base = TASK_UNMAPPED_BASE; 175 mm->mmap_base = TASK_UNMAPPED_BASE;
178 mm->get_unmapped_area = s390_get_unmapped_area; 176 mm->get_unmapped_area = s390_get_unmapped_area;
179 mm->unmap_area = arch_unmap_area;
180 } else { 177 } else {
181 mm->mmap_base = mmap_base(); 178 mm->mmap_base = mmap_base();
182 mm->get_unmapped_area = s390_get_unmapped_area_topdown; 179 mm->get_unmapped_area = s390_get_unmapped_area_topdown;
183 mm->unmap_area = arch_unmap_area_topdown;
184 } 180 }
185} 181}
186 182
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 2daaaa6eda23..51561b8b15ba 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -290,7 +290,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
290 sysctl_legacy_va_layout) { 290 sysctl_legacy_va_layout) {
291 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; 291 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
292 mm->get_unmapped_area = arch_get_unmapped_area; 292 mm->get_unmapped_area = arch_get_unmapped_area;
293 mm->unmap_area = arch_unmap_area;
294 } else { 293 } else {
295 /* We know it's 32-bit */ 294 /* We know it's 32-bit */
296 unsigned long task_size = STACK_TOP32; 295 unsigned long task_size = STACK_TOP32;
@@ -302,7 +301,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
302 301
303 mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor); 302 mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor);
304 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 303 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
305 mm->unmap_area = arch_unmap_area_topdown;
306 } 304 }
307} 305}
308 306
diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c
index f96f4cec602a..d67d91ebf63e 100644
--- a/arch/tile/mm/mmap.c
+++ b/arch/tile/mm/mmap.c
@@ -66,10 +66,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
66 if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) { 66 if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) {
67 mm->mmap_base = TASK_UNMAPPED_BASE; 67 mm->mmap_base = TASK_UNMAPPED_BASE;
68 mm->get_unmapped_area = arch_get_unmapped_area; 68 mm->get_unmapped_area = arch_get_unmapped_area;
69 mm->unmap_area = arch_unmap_area;
70 } else { 69 } else {
71 mm->mmap_base = mmap_base(mm); 70 mm->mmap_base = mmap_base(mm);
72 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 71 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
73 mm->unmap_area = arch_unmap_area_topdown;
74 } 72 }
75} 73}
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 52ff81cce008..bae3aba95b15 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -308,8 +308,6 @@ static int load_aout_binary(struct linux_binprm *bprm)
308 (current->mm->start_data = N_DATADDR(ex)); 308 (current->mm->start_data = N_DATADDR(ex));
309 current->mm->brk = ex.a_bss + 309 current->mm->brk = ex.a_bss +
310 (current->mm->start_brk = N_BSSADDR(ex)); 310 (current->mm->start_brk = N_BSSADDR(ex));
311 current->mm->free_area_cache = TASK_UNMAPPED_BASE;
312 current->mm->cached_hole_size = 0;
313 311
314 retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); 312 retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
315 if (retval < 0) { 313 if (retval < 0) {
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 845df6835f9f..62c29a5bfe26 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -115,10 +115,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
115 if (mmap_is_legacy()) { 115 if (mmap_is_legacy()) {
116 mm->mmap_base = mmap_legacy_base(); 116 mm->mmap_base = mmap_legacy_base();
117 mm->get_unmapped_area = arch_get_unmapped_area; 117 mm->get_unmapped_area = arch_get_unmapped_area;
118 mm->unmap_area = arch_unmap_area;
119 } else { 118 } else {
120 mm->mmap_base = mmap_base(); 119 mm->mmap_base = mmap_base();
121 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 120 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
122 mm->unmap_area = arch_unmap_area_topdown;
123 } 121 }
124} 122}
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index bce87694f7b0..89dec7f789a4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -255,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm)
255 (current->mm->start_data = N_DATADDR(ex)); 255 (current->mm->start_data = N_DATADDR(ex));
256 current->mm->brk = ex.a_bss + 256 current->mm->brk = ex.a_bss +
257 (current->mm->start_brk = N_BSSADDR(ex)); 257 (current->mm->start_brk = N_BSSADDR(ex));
258 current->mm->free_area_cache = current->mm->mmap_base;
259 current->mm->cached_hole_size = 0;
260 258
261 retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); 259 retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
262 if (retval < 0) { 260 if (retval < 0) {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f8a0b0efda44..100edcc5e312 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -738,8 +738,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
738 738
739 /* Do this so that we can load the interpreter, if need be. We will 739 /* Do this so that we can load the interpreter, if need be. We will
740 change some of these later */ 740 change some of these later */
741 current->mm->free_area_cache = current->mm->mmap_base;
742 current->mm->cached_hole_size = 0;
743 retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), 741 retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
744 executable_stack); 742 executable_stack);
745 if (retval < 0) { 743 if (retval < 0) {
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ace9a5f01c64..fb425aa16c01 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -330,12 +330,9 @@ struct mm_struct {
330 unsigned long (*get_unmapped_area) (struct file *filp, 330 unsigned long (*get_unmapped_area) (struct file *filp,
331 unsigned long addr, unsigned long len, 331 unsigned long addr, unsigned long len,
332 unsigned long pgoff, unsigned long flags); 332 unsigned long pgoff, unsigned long flags);
333 void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
334#endif 333#endif
335 unsigned long mmap_base; /* base of mmap area */ 334 unsigned long mmap_base; /* base of mmap area */
336 unsigned long task_size; /* size of task vm space */ 335 unsigned long task_size; /* size of task vm space */
337 unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
338 unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
339 unsigned long highest_vm_end; /* highest vma end address */ 336 unsigned long highest_vm_end; /* highest vma end address */
340 pgd_t * pgd; 337 pgd_t * pgd;
341 atomic_t mm_users; /* How many users with user space? */ 338 atomic_t mm_users; /* How many users with user space? */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f99d57e0ae47..50d04b92ceda 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -322,8 +322,6 @@ extern unsigned long
322arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, 322arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
323 unsigned long len, unsigned long pgoff, 323 unsigned long len, unsigned long pgoff,
324 unsigned long flags); 324 unsigned long flags);
325extern void arch_unmap_area(struct mm_struct *, unsigned long);
326extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
327#else 325#else
328static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} 326static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
329#endif 327#endif
diff --git a/include/linux/zbud.h b/include/linux/zbud.h
new file mode 100644
index 000000000000..2571a5cfa5fc
--- /dev/null
+++ b/include/linux/zbud.h
@@ -0,0 +1,22 @@
1#ifndef _ZBUD_H_
2#define _ZBUD_H_
3
4#include <linux/types.h>
5
6struct zbud_pool;
7
8struct zbud_ops {
9 int (*evict)(struct zbud_pool *pool, unsigned long handle);
10};
11
12struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops);
13void zbud_destroy_pool(struct zbud_pool *pool);
14int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
15 unsigned long *handle);
16void zbud_free(struct zbud_pool *pool, unsigned long handle);
17int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
18void *zbud_map(struct zbud_pool *pool, unsigned long handle);
19void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
20u64 zbud_get_pool_size(struct zbud_pool *pool);
21
22#endif /* _ZBUD_H_ */
diff --git a/kernel/fork.c b/kernel/fork.c
index 6e6a1c11b3e5..66635c80a813 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -365,8 +365,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
365 mm->locked_vm = 0; 365 mm->locked_vm = 0;
366 mm->mmap = NULL; 366 mm->mmap = NULL;
367 mm->mmap_cache = NULL; 367 mm->mmap_cache = NULL;
368 mm->free_area_cache = oldmm->mmap_base;
369 mm->cached_hole_size = ~0UL;
370 mm->map_count = 0; 368 mm->map_count = 0;
371 cpumask_clear(mm_cpumask(mm)); 369 cpumask_clear(mm_cpumask(mm));
372 mm->mm_rb = RB_ROOT; 370 mm->mm_rb = RB_ROOT;
@@ -540,8 +538,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
540 mm->nr_ptes = 0; 538 mm->nr_ptes = 0;
541 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); 539 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
542 spin_lock_init(&mm->page_table_lock); 540 spin_lock_init(&mm->page_table_lock);
543 mm->free_area_cache = TASK_UNMAPPED_BASE;
544 mm->cached_hole_size = ~0UL;
545 mm_init_aio(mm); 541 mm_init_aio(mm);
546 mm_init_owner(mm, p); 542 mm_init_owner(mm, p);
547 543
diff --git a/mm/Kconfig b/mm/Kconfig
index 7e28ecfa8aa4..8028dcc6615c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -478,6 +478,36 @@ config FRONTSWAP
478 478
479 If unsure, say Y to enable frontswap. 479 If unsure, say Y to enable frontswap.
480 480
481config ZBUD
482 tristate
483 default n
484 help
485 A special purpose allocator for storing compressed pages.
486 It is designed to store up to two compressed pages per physical
487 page. While this design limits storage density, it has simple and
488 deterministic reclaim properties that make it preferable to a higher
489 density approach when reclaim will be used.
490
491config ZSWAP
492 bool "Compressed cache for swap pages (EXPERIMENTAL)"
493 depends on FRONTSWAP && CRYPTO=y
494 select CRYPTO_LZO
495 select ZBUD
496 default n
497 help
498 A lightweight compressed cache for swap pages. It takes
499 pages that are in the process of being swapped out and attempts to
500 compress them into a dynamically allocated RAM-based memory pool.
501 This can result in a significant I/O reduction on swap device and,
502 in the case where decompressing from RAM is faster than swap device
503 reads, can also improve workload performance.
504
505 This is marked experimental because it is a new feature (as of
506 v3.11) that interacts heavily with memory reclaim. While these
507 interactions don't cause any known issues on simple memory setups,
508 they have not been fully explored on the large set of potential
509 configurations and workloads that exist.
510
481config MEM_SOFT_DIRTY 511config MEM_SOFT_DIRTY
482 bool "Track memory changes" 512 bool "Track memory changes"
483 depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY 513 depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
diff --git a/mm/Makefile b/mm/Makefile
index 72c5acb9345f..f00803386a67 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
32obj-$(CONFIG_BOUNCE) += bounce.o 32obj-$(CONFIG_BOUNCE) += bounce.o
33obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o 33obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
34obj-$(CONFIG_FRONTSWAP) += frontswap.o 34obj-$(CONFIG_FRONTSWAP) += frontswap.o
35obj-$(CONFIG_ZSWAP) += zswap.o
35obj-$(CONFIG_HAS_DMA) += dmapool.o 36obj-$(CONFIG_HAS_DMA) += dmapool.o
36obj-$(CONFIG_HUGETLBFS) += hugetlb.o 37obj-$(CONFIG_HUGETLBFS) += hugetlb.o
37obj-$(CONFIG_NUMA) += mempolicy.o 38obj-$(CONFIG_NUMA) += mempolicy.o
@@ -58,3 +59,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
58obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 59obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
59obj-$(CONFIG_CLEANCACHE) += cleancache.o 60obj-$(CONFIG_CLEANCACHE) += cleancache.o
60obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o 61obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
62obj-$(CONFIG_ZBUD) += zbud.o
diff --git a/mm/mmap.c b/mm/mmap.c
index f81311173b4d..fbad7b091090 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1878,15 +1878,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1878} 1878}
1879#endif 1879#endif
1880 1880
1881void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1882{
1883 /*
1884 * Is this a new hole at the lowest possible address?
1885 */
1886 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
1887 mm->free_area_cache = addr;
1888}
1889
1890/* 1881/*
1891 * This mmap-allocator allocates new areas top-down from below the 1882 * This mmap-allocator allocates new areas top-down from below the
1892 * stack's low limit (the base): 1883 * stack's low limit (the base):
@@ -1943,19 +1934,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1943} 1934}
1944#endif 1935#endif
1945 1936
1946void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
1947{
1948 /*
1949 * Is this a new hole at the highest possible address?
1950 */
1951 if (addr > mm->free_area_cache)
1952 mm->free_area_cache = addr;
1953
1954 /* dont allow allocations above current base */
1955 if (mm->free_area_cache > mm->mmap_base)
1956 mm->free_area_cache = mm->mmap_base;
1957}
1958
1959unsigned long 1937unsigned long
1960get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, 1938get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1961 unsigned long pgoff, unsigned long flags) 1939 unsigned long pgoff, unsigned long flags)
@@ -2376,7 +2354,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2376{ 2354{
2377 struct vm_area_struct **insertion_point; 2355 struct vm_area_struct **insertion_point;
2378 struct vm_area_struct *tail_vma = NULL; 2356 struct vm_area_struct *tail_vma = NULL;
2379 unsigned long addr;
2380 2357
2381 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2358 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
2382 vma->vm_prev = NULL; 2359 vma->vm_prev = NULL;
@@ -2393,11 +2370,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2393 } else 2370 } else
2394 mm->highest_vm_end = prev ? prev->vm_end : 0; 2371 mm->highest_vm_end = prev ? prev->vm_end : 0;
2395 tail_vma->vm_next = NULL; 2372 tail_vma->vm_next = NULL;
2396 if (mm->unmap_area == arch_unmap_area)
2397 addr = prev ? prev->vm_end : mm->mmap_base;
2398 else
2399 addr = vma ? vma->vm_start : mm->mmap_base;
2400 mm->unmap_area(mm, addr);
2401 mm->mmap_cache = NULL; /* Kill the cache. */ 2373 mm->mmap_cache = NULL; /* Kill the cache. */
2402} 2374}
2403 2375
diff --git a/mm/nommu.c b/mm/nommu.c
index e44e6e0a125c..ecd1f158548e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1871,10 +1871,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
1871 return -ENOMEM; 1871 return -ENOMEM;
1872} 1872}
1873 1873
1874void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1875{
1876}
1877
1878void unmap_mapping_range(struct address_space *mapping, 1874void unmap_mapping_range(struct address_space *mapping,
1879 loff_t const holebegin, loff_t const holelen, 1875 loff_t const holebegin, loff_t const holelen,
1880 int even_cows) 1876 int even_cows)
diff --git a/mm/util.c b/mm/util.c
index ab1424dbe2e6..7441c41d00f6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -295,7 +295,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
295{ 295{
296 mm->mmap_base = TASK_UNMAPPED_BASE; 296 mm->mmap_base = TASK_UNMAPPED_BASE;
297 mm->get_unmapped_area = arch_get_unmapped_area; 297 mm->get_unmapped_area = arch_get_unmapped_area;
298 mm->unmap_area = arch_unmap_area;
299} 298}
300#endif 299#endif
301 300
diff --git a/mm/zbud.c b/mm/zbud.c
new file mode 100644
index 000000000000..9bb4710e3589
--- /dev/null
+++ b/mm/zbud.c
@@ -0,0 +1,527 @@
1/*
2 * zbud.c
3 *
4 * Copyright (C) 2013, Seth Jennings, IBM
5 *
6 * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
7 *
 * zbud is a special purpose allocator for storing compressed pages. Contrary
9 * to what its name may suggest, zbud is not a buddy allocator, but rather an
10 * allocator that "buddies" two compressed pages together in a single memory
11 * page.
12 *
13 * While this design limits storage density, it has simple and deterministic
14 * reclaim properties that make it preferable to a higher density approach when
15 * reclaim will be used.
16 *
17 * zbud works by storing compressed pages, or "zpages", together in pairs in a
18 * single memory page called a "zbud page". The first buddy is "left
 * justified" at the beginning of the zbud page, and the last buddy is "right
20 * justified" at the end of the zbud page. The benefit is that if either
21 * buddy is freed, the freed buddy space, coalesced with whatever slack space
22 * that existed between the buddies, results in the largest possible free region
23 * within the zbud page.
24 *
25 * zbud also provides an attractive lower bound on density. The ratio of zpages
26 * to zbud pages can not be less than 1. This ensures that zbud can never "do
27 * harm" by using more pages to store zpages than the uncompressed zpages would
28 * have used on their own.
29 *
30 * zbud pages are divided into "chunks". The size of the chunks is fixed at
31 * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages
32 * into chunks allows organizing unbuddied zbud pages into a manageable number
33 * of unbuddied lists according to the number of free chunks available in the
34 * zbud page.
35 *
36 * The zbud API differs from that of conventional allocators in that the
37 * allocation function, zbud_alloc(), returns an opaque handle to the user,
38 * not a dereferenceable pointer. The user must map the handle using
39 * zbud_map() in order to get a usable pointer by which to access the
40 * allocation data and unmap the handle with zbud_unmap() when operations
41 * on the allocation data are complete.
42 */
43
44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
45
46#include <linux/atomic.h>
47#include <linux/list.h>
48#include <linux/mm.h>
49#include <linux/module.h>
50#include <linux/preempt.h>
51#include <linux/slab.h>
52#include <linux/spinlock.h>
53#include <linux/zbud.h>
54
55/*****************
56 * Structures
57*****************/
58/*
59 * NCHUNKS_ORDER determines the internal allocation granularity, effectively
60 * adjusting internal fragmentation. It also determines the number of
61 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
62 * allocation granularity will be in chunks of size PAGE_SIZE/64, and there
63 * will be 64 freelists per pool.
64 */
65#define NCHUNKS_ORDER 6
66
67#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
68#define CHUNK_SIZE (1 << CHUNK_SHIFT)
69#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
70#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
71
/**
 * struct zbud_pool - stores metadata for each zbud pool
 * @lock:	protects all pool fields and first|last_chunk fields of any
 *		zbud page in the pool
 * @unbuddied:	array of lists tracking zbud pages that only contain one buddy;
 *		the lists each zbud page is added to depends on the size of
 *		its free region.
 * @buddied:	list tracking the zbud pages that contain two buddies;
 *		these zbud pages are full
 * @lru:	list tracking the zbud pages in LRU order by most recently
 *		added buddy.
 * @pages_nr:	number of zbud pages in the pool; read without the lock by
 *		zbud_get_pool_size(), so it is only approximate to concurrent
 *		readers.
 * @ops:	pointer to a structure of user defined operations specified at
 *		pool creation time.
 *
 * This structure is allocated at pool creation time and maintains metadata
 * pertaining to a particular zbud pool.
 */
struct zbud_pool {
	spinlock_t lock;
	struct list_head unbuddied[NCHUNKS];
	struct list_head buddied;
	struct list_head lru;
	u64 pages_nr;
	struct zbud_ops *ops;
};
98
/*
 * struct zbud_header - zbud page metadata occupying the first chunk of each
 *			zbud page.
 * @buddy:	links the zbud page into the unbuddied/buddied lists in the pool
 * @lru:	links the zbud page into the lru list in the pool
 * @first_chunks:	the size of the first buddy in chunks, 0 if free
 * @last_chunks:	the size of the last buddy in chunks, 0 if free
 * @under_reclaim:	true while zbud_reclaim_page() has handed this page's
 *			handles to the eviction callback; tells zbud_free()
 *			to leave the final page free to the reclaim path.
 */
struct zbud_header {
	struct list_head buddy;
	struct list_head lru;
	unsigned int first_chunks;
	unsigned int last_chunks;
	bool under_reclaim;
};
114
115/*****************
116 * Helpers
117*****************/
/* Just to make the code easier to read */
enum buddy {
	FIRST,	/* left-justified buddy, stored right after the header chunk */
	LAST	/* right-justified buddy, stored against the end of the page */
};
123
124/* Converts an allocation size in bytes to size in zbud chunks */
125static int size_to_chunks(int size)
126{
127 return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
128}
129
/* Iterate over the unbuddied freelists that hold at least _begin free chunks. */
#define for_each_unbuddied_list(_iter, _begin) \
	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
132
133/* Initializes the zbud header of a newly allocated zbud page */
134static struct zbud_header *init_zbud_page(struct page *page)
135{
136 struct zbud_header *zhdr = page_address(page);
137 zhdr->first_chunks = 0;
138 zhdr->last_chunks = 0;
139 INIT_LIST_HEAD(&zhdr->buddy);
140 INIT_LIST_HEAD(&zhdr->lru);
141 zhdr->under_reclaim = 0;
142 return zhdr;
143}
144
/* Returns the backing page of a now-empty zbud page to the page allocator. */
static void free_zbud_page(struct zbud_header *zhdr)
{
	struct page *page = virt_to_page(zhdr);

	__free_page(page);
}
150
151/*
152 * Encodes the handle of a particular buddy within a zbud page
153 * Pool lock should be held as this function accesses first|last_chunks
154 */
155static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
156{
157 unsigned long handle;
158
159 /*
160 * For now, the encoded handle is actually just the pointer to the data
161 * but this might not always be the case. A little information hiding.
162 * Add CHUNK_SIZE to the handle if it is the first allocation to jump
163 * over the zbud header in the first chunk.
164 */
165 handle = (unsigned long)zhdr;
166 if (bud == FIRST)
167 /* skip over zbud header */
168 handle += ZHDR_SIZE_ALIGNED;
169 else /* bud == LAST */
170 handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
171 return handle;
172}
173
174/* Returns the zbud page where a given handle is stored */
175static struct zbud_header *handle_to_zbud_header(unsigned long handle)
176{
177 return (struct zbud_header *)(handle & PAGE_MASK);
178}
179
180/* Returns the number of free chunks in a zbud page */
181static int num_free_chunks(struct zbud_header *zhdr)
182{
183 /*
184 * Rather than branch for different situations, just use the fact that
185 * free buddies have a length of zero to simplify everything. -1 at the
186 * end for the zbud header.
187 */
188 return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1;
189}
190
191/*****************
192 * API Functions
193*****************/
194/**
195 * zbud_create_pool() - create a new zbud pool
196 * @gfp: gfp flags when allocating the zbud pool structure
197 * @ops: user-defined operations for the zbud pool
198 *
199 * Return: pointer to the new zbud pool or NULL if the metadata allocation
200 * failed.
201 */
202struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
203{
204 struct zbud_pool *pool;
205 int i;
206
207 pool = kmalloc(sizeof(struct zbud_pool), gfp);
208 if (!pool)
209 return NULL;
210 spin_lock_init(&pool->lock);
211 for_each_unbuddied_list(i, 0)
212 INIT_LIST_HEAD(&pool->unbuddied[i]);
213 INIT_LIST_HEAD(&pool->buddied);
214 INIT_LIST_HEAD(&pool->lru);
215 pool->pages_nr = 0;
216 pool->ops = ops;
217 return pool;
218}
219
/**
 * zbud_destroy_pool() - destroys an existing zbud pool
 * @pool:	the zbud pool to be destroyed
 *
 * The pool should be emptied before this function is called.
 */
void zbud_destroy_pool(struct zbud_pool *pool)
{
	/* Only the pool metadata is freed; pages still in the pool are not. */
	kfree(pool);
}
230
/**
 * zbud_alloc() - allocates a region of a given size
 * @pool:	zbud pool from which to allocate
 * @size:	size in bytes of the desired allocation
 * @gfp:	gfp flags used if the pool needs to grow
 * @handle:	handle of the new allocation
 *
 * This function will attempt to find a free region in the pool large enough to
 * satisfy the allocation request.  A search of the unbuddied lists is
 * performed first. If no suitable free region is found, then a new page is
 * allocated and added to the pool to satisfy the request.
 *
 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
 * as zbud pool pages.
 *
 * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
 * gfp arguments are invalid, -ENOSPC if the request can never fit in a zbud
 * page, or -ENOMEM if the pool was unable to allocate a new page.
 */
int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
			unsigned long *handle)
{
	int chunks, i, freechunks;
	struct zbud_header *zhdr = NULL;
	enum buddy bud;
	struct page *page;

	if (size <= 0 || gfp & __GFP_HIGHMEM)
		return -EINVAL;
	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED)
		return -ENOSPC;
	chunks = size_to_chunks(size);
	spin_lock(&pool->lock);

	/* First, try to find an unbuddied zbud page. */
	zhdr = NULL;
	for_each_unbuddied_list(i, chunks) {
		if (!list_empty(&pool->unbuddied[i])) {
			zhdr = list_first_entry(&pool->unbuddied[i],
					struct zbud_header, buddy);
			list_del(&zhdr->buddy);
			/* The free buddy is whichever one has zero length */
			if (zhdr->first_chunks == 0)
				bud = FIRST;
			else
				bud = LAST;
			goto found;
		}
	}

	/*
	 * Couldn't find unbuddied zbud page, create new one.  The lock is
	 * dropped so a potentially-sleeping page allocation (depending on
	 * @gfp) is not done under the spinlock.
	 */
	spin_unlock(&pool->lock);
	page = alloc_page(gfp);
	if (!page)
		return -ENOMEM;
	spin_lock(&pool->lock);
	pool->pages_nr++;
	zhdr = init_zbud_page(page);
	bud = FIRST;

found:
	if (bud == FIRST)
		zhdr->first_chunks = chunks;
	else
		zhdr->last_chunks = chunks;

	if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) {
		/* Add to unbuddied list */
		freechunks = num_free_chunks(zhdr);
		list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
	} else {
		/* Add to buddied list */
		list_add(&zhdr->buddy, &pool->buddied);
	}

	/* Add/move zbud page to beginning of LRU */
	if (!list_empty(&zhdr->lru))
		list_del(&zhdr->lru);
	list_add(&zhdr->lru, &pool->lru);

	*handle = encode_handle(zhdr, bud);
	spin_unlock(&pool->lock);

	return 0;
}
315
/**
 * zbud_free() - frees the allocation associated with the given handle
 * @pool:	pool in which the allocation resided
 * @handle:	handle associated with the allocation returned by zbud_alloc()
 *
 * In the case that the zbud page in which the allocation resides is under
 * reclaim, as indicated by the under_reclaim flag of the zbud header, this
 * function only sets the first|last_chunks to 0.  The page is actually
 * freed once both buddies are evicted (see zbud_reclaim_page() below).
 */
void zbud_free(struct zbud_pool *pool, unsigned long handle)
{
	struct zbud_header *zhdr;
	int freechunks;

	spin_lock(&pool->lock);
	zhdr = handle_to_zbud_header(handle);

	/* If first buddy, handle will be page aligned */
	if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
		zhdr->last_chunks = 0;
	else
		zhdr->first_chunks = 0;

	if (zhdr->under_reclaim) {
		/* zbud page is under reclaim, reclaim will free */
		spin_unlock(&pool->lock);
		return;
	}

	/* Remove from existing buddy list */
	list_del(&zhdr->buddy);

	if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
		/* zbud page is empty, free */
		list_del(&zhdr->lru);
		free_zbud_page(zhdr);
		pool->pages_nr--;
	} else {
		/* Add to unbuddied list */
		freechunks = num_free_chunks(zhdr);
		list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
	}

	spin_unlock(&pool->lock);
}
362
/* Grab the entry at the tail (least-recently-added end) of a list. */
#define list_tail_entry(ptr, type, member) \
	list_entry((ptr)->prev, type, member)
365
/**
 * zbud_reclaim_page() - evicts allocations from a pool page and frees it
 * @pool:	pool from which a page will attempt to be evicted
 * @retries:	number of pages on the LRU list for which eviction will
 *		be attempted before failing
 *
 * zbud reclaim is different from normal system reclaim in that the reclaim is
 * done from the bottom, up.  This is because only the bottom layer, zbud, has
 * information on how the allocations are organized within each zbud page. This
 * has the potential to create interesting locking situations between zbud and
 * the user, however.
 *
 * To avoid these, this is how zbud_reclaim_page() should be called:
 *
 * The user detects a page should be reclaimed and calls zbud_reclaim_page().
 * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
 * the user-defined eviction handler with the pool and handle as arguments.
 *
 * If the handle can not be evicted, the eviction handler should return
 * non-zero. zbud_reclaim_page() will add the zbud page back to the
 * appropriate list and try the next zbud page on the LRU up to
 * a user defined number of retries.
 *
 * If the handle is successfully evicted, the eviction handler should
 * return 0 _and_ should have called zbud_free() on the handle. zbud_free()
 * contains logic to delay freeing the page if the page is under reclaim,
 * as indicated by the under_reclaim flag of the zbud header.
 *
 * If all buddies in the zbud page are successfully evicted, then the
 * zbud page can be freed.
 *
 * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
 * no pages to evict or an eviction handler is not registered, -EAGAIN if
 * the retry limit was hit.
 */
int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
{
	int i, ret, freechunks;
	struct zbud_header *zhdr;
	unsigned long first_handle = 0, last_handle = 0;

	spin_lock(&pool->lock);
	if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
			retries == 0) {
		spin_unlock(&pool->lock);
		return -EINVAL;
	}
	for (i = 0; i < retries; i++) {
		/* Reclaim the least recently used zbud page first */
		zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
		list_del(&zhdr->lru);
		list_del(&zhdr->buddy);
		/* Protect zbud page against free */
		zhdr->under_reclaim = true;
		/*
		 * We need to encode the handles before unlocking, since we can
		 * race with free that will set (first|last)_chunks to 0
		 */
		first_handle = 0;
		last_handle = 0;
		if (zhdr->first_chunks)
			first_handle = encode_handle(zhdr, FIRST);
		if (zhdr->last_chunks)
			last_handle = encode_handle(zhdr, LAST);
		spin_unlock(&pool->lock);

		/* Issue the eviction callback(s) without the pool lock held */
		if (first_handle) {
			ret = pool->ops->evict(pool, first_handle);
			if (ret)
				goto next;
		}
		if (last_handle) {
			ret = pool->ops->evict(pool, last_handle);
			if (ret)
				goto next;
		}
next:
		spin_lock(&pool->lock);
		zhdr->under_reclaim = false;
		if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
			/*
			 * Both buddies are now free, free the zbud page and
			 * return success.
			 */
			free_zbud_page(zhdr);
			pool->pages_nr--;
			spin_unlock(&pool->lock);
			return 0;
		} else if (zhdr->first_chunks == 0 ||
				zhdr->last_chunks == 0) {
			/* add to unbuddied list */
			freechunks = num_free_chunks(zhdr);
			list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
		} else {
			/* add to buddied list */
			list_add(&zhdr->buddy, &pool->buddied);
		}

		/* add to beginning of LRU */
		list_add(&zhdr->lru, &pool->lru);
	}
	spin_unlock(&pool->lock);
	return -EAGAIN;
}
470
/**
 * zbud_map() - maps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be mapped
 *
 * While trivial for zbud, the mapping functions for other allocators
 * implementing this allocation API could have more complex information encoded
 * in the handle and could create temporary mappings to make the data
 * accessible to the user.
 *
 * Returns: a pointer to the mapped allocation
 */
void *zbud_map(struct zbud_pool *pool, unsigned long handle)
{
	/* The handle already is the address of the buddy's data. */
	return (void *)handle;
}
487
/**
 * zbud_unmap() - unmaps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be unmapped
 *
 * A no-op for zbud, since zbud_map() creates no temporary mapping.
 */
void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
{
}
496
/**
 * zbud_get_pool_size() - gets the zbud pool size in pages
 * @pool:	pool whose size is being queried
 *
 * Returns: size in pages of the given pool.  The pool lock need not be
 * taken to access pages_nr, so the value may be slightly stale with
 * respect to concurrent alloc/free activity.
 */
u64 zbud_get_pool_size(struct zbud_pool *pool)
{
	return pool->pages_nr;
}
508
/* Module init: only compile-time layout checking; pools are created on demand. */
static int __init init_zbud(void)
{
	/* Make sure the zbud header will fit in one chunk */
	BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
	pr_info("loaded\n");
	return 0;
}

static void __exit exit_zbud(void)
{
	pr_info("unloaded\n");
}

module_init(init_zbud);
module_exit(exit_zbud);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");
diff --git a/mm/zswap.c b/mm/zswap.c
new file mode 100644
index 000000000000..deda2b671e12
--- /dev/null
+++ b/mm/zswap.c
@@ -0,0 +1,943 @@
1/*
2 * zswap.c - zswap driver file
3 *
4 * zswap is a backend for frontswap that takes pages that are in the process
5 * of being swapped out and attempts to compress and store them in a
6 * RAM-based memory pool. This can result in a significant I/O reduction on
7 * the swap device and, in the case where decompressing from RAM is faster
8 * than reading from the swap device, can also improve workload performance.
9 *
10 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21*/
22
23#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
24
25#include <linux/module.h>
26#include <linux/cpu.h>
27#include <linux/highmem.h>
28#include <linux/slab.h>
29#include <linux/spinlock.h>
30#include <linux/types.h>
31#include <linux/atomic.h>
32#include <linux/frontswap.h>
33#include <linux/rbtree.h>
34#include <linux/swap.h>
35#include <linux/crypto.h>
36#include <linux/mempool.h>
37#include <linux/zbud.h>
38
39#include <linux/mm_types.h>
40#include <linux/page-flags.h>
41#include <linux/swapops.h>
42#include <linux/writeback.h>
43#include <linux/pagemap.h>
44
45/*********************************
46* statistics
47**********************************/
48/* Number of memory pages used by the compressed pool */
49static u64 zswap_pool_pages;
50/* The number of compressed pages currently stored in zswap */
51static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
52
/*
 * The statistics below are not protected from concurrent access for
 * performance reasons so they may not be 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
*/
59
60/* Pool limit was hit (see zswap_max_pool_percent) */
61static u64 zswap_pool_limit_hit;
62/* Pages written back when pool limit was reached */
63static u64 zswap_written_back_pages;
64/* Store failed due to a reclaim failure after pool limit was reached */
65static u64 zswap_reject_reclaim_fail;
66/* Compressed page was too big for the allocator to (optimally) store */
67static u64 zswap_reject_compress_poor;
68/* Store failed because underlying allocator could not get memory */
69static u64 zswap_reject_alloc_fail;
70/* Store failed because the entry metadata could not be allocated (rare) */
71static u64 zswap_reject_kmemcache_fail;
72/* Duplicate store was encountered (rare) */
73static u64 zswap_duplicate_entry;
74
75/*********************************
76* tunables
77**********************************/
78/* Enable/disable zswap (disabled by default, fixed at boot for now) */
79static bool zswap_enabled __read_mostly;
80module_param_named(enabled, zswap_enabled, bool, 0);
81
82/* Compressor to be used by zswap (fixed at boot for now) */
83#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
84static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
85module_param_named(compressor, zswap_compressor, charp, 0);
86
87/* The maximum percentage of memory that the compressed pool can occupy */
88static unsigned int zswap_max_pool_percent = 20;
89module_param_named(max_pool_percent,
90 zswap_max_pool_percent, uint, 0644);
91
92/*********************************
93* compression functions
94**********************************/
95/* per-cpu compression transforms */
96static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
97
/* Operation selector for zswap_comp_op(). */
enum comp_op {
	ZSWAP_COMPOP_COMPRESS,
	ZSWAP_COMPOP_DECOMPRESS
};
102
103static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
104 u8 *dst, unsigned int *dlen)
105{
106 struct crypto_comp *tfm;
107 int ret;
108
109 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
110 switch (op) {
111 case ZSWAP_COMPOP_COMPRESS:
112 ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
113 break;
114 case ZSWAP_COMPOP_DECOMPRESS:
115 ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
116 break;
117 default:
118 ret = -EINVAL;
119 }
120
121 put_cpu();
122 return ret;
123}
124
/*
 * Choose the compressor: the module parameter if the crypto layer provides
 * it, otherwise fall back to ZSWAP_COMPRESSOR_DEFAULT.  Then allocate the
 * per-cpu array of transform pointers; the transforms themselves are
 * created per-cpu by the hotplug notifier.
 *
 * Returns 0 on success, -ENODEV if no usable compressor exists, or
 * -ENOMEM if the per-cpu array cannot be allocated.
 */
static int __init zswap_comp_init(void)
{
	if (!crypto_has_comp(zswap_compressor, 0, 0)) {
		pr_info("%s compressor not available\n", zswap_compressor);
		/* fall back to default compressor */
		zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
		if (!crypto_has_comp(zswap_compressor, 0, 0))
			/* can't even load the default compressor */
			return -ENODEV;
	}
	pr_info("using %s compressor\n", zswap_compressor);

	/* alloc percpu transforms */
	zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
	if (!zswap_comp_pcpu_tfms)
		return -ENOMEM;
	return 0;
}
143
144static void zswap_comp_exit(void)
145{
146 /* free percpu transforms */
147 if (zswap_comp_pcpu_tfms)
148 free_percpu(zswap_comp_pcpu_tfms);
149}
150
151/*********************************
152* data structures
153**********************************/
/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * refcount - the number of outstanding references to the entry. This is
 *            needed to protect against premature freeing of the entry by
 *            concurrent calls to load, invalidate, and writeback.  The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount.  Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * offset - the swap offset for the entry.  Index into the red-black tree.
 * handle - zbud allocation handle that stores the compressed page data
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression
 */
struct zswap_entry {
	struct rb_node rbnode;
	pgoff_t offset;
	int refcount;
	unsigned int length;
	unsigned long handle;
};

/*
 * Stored in front of the compressed data so that writeback can recover the
 * swap slot — and hence the owning tree and offset — from a bare zbud
 * handle (see zswap_writeback_entry()).
 */
struct zswap_header {
	swp_entry_t swpentry;
};

/*
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 */
struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
	struct zbud_pool *pool;
};

/* One tree per swap device, indexed by swp_type(). */
static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
196
197/*********************************
198* zswap entry functions
199**********************************/
200static struct kmem_cache *zswap_entry_cache;
201
/* Create the slab cache backing struct zswap_entry; returns nonzero on failure. */
static int zswap_entry_cache_create(void)
{
	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
	return (zswap_entry_cache == NULL);
}
207
/*
 * Destroy the zswap_entry slab cache.
 * NOTE(review): name misspells "destroy"; kept as-is since callers elsewhere
 * in this file reference the existing symbol.
 */
static void zswap_entry_cache_destory(void)
{
	kmem_cache_destroy(zswap_entry_cache);
}
212
213static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
214{
215 struct zswap_entry *entry;
216 entry = kmem_cache_alloc(zswap_entry_cache, gfp);
217 if (!entry)
218 return NULL;
219 entry->refcount = 1;
220 return entry;
221}
222
/* Return an entry's metadata to the zswap_entry slab cache. */
static void zswap_entry_cache_free(struct zswap_entry *entry)
{
	kmem_cache_free(zswap_entry_cache, entry);
}
227
/* Take an additional reference on @entry; caller must hold the tree lock. */
static void zswap_entry_get(struct zswap_entry *entry)
{
	entry->refcount++;
}
233
234/* caller must hold the tree lock */
235static int zswap_entry_put(struct zswap_entry *entry)
236{
237 entry->refcount--;
238 return entry->refcount;
239}
240
241/*********************************
242* rbtree functions
243**********************************/
244static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
245{
246 struct rb_node *node = root->rb_node;
247 struct zswap_entry *entry;
248
249 while (node) {
250 entry = rb_entry(node, struct zswap_entry, rbnode);
251 if (entry->offset > offset)
252 node = node->rb_left;
253 else if (entry->offset < offset)
254 node = node->rb_right;
255 else
256 return entry;
257 }
258 return NULL;
259}
260
261/*
262 * In the case that a entry with the same offset is found, a pointer to
263 * the existing entry is stored in dupentry and the function returns -EEXIST
264 */
265static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
266 struct zswap_entry **dupentry)
267{
268 struct rb_node **link = &root->rb_node, *parent = NULL;
269 struct zswap_entry *myentry;
270
271 while (*link) {
272 parent = *link;
273 myentry = rb_entry(parent, struct zswap_entry, rbnode);
274 if (myentry->offset > entry->offset)
275 link = &(*link)->rb_left;
276 else if (myentry->offset < entry->offset)
277 link = &(*link)->rb_right;
278 else {
279 *dupentry = myentry;
280 return -EEXIST;
281 }
282 }
283 rb_link_node(&entry->rbnode, parent, link);
284 rb_insert_color(&entry->rbnode, root);
285 return 0;
286}
287
288/*********************************
289* per-cpu code
290**********************************/
/* Per-cpu destination buffer for compression output (see __zswap_cpu_notifier). */
static DEFINE_PER_CPU(u8 *, zswap_dstmem);

/*
 * Set up (CPU_UP_PREPARE) or tear down (CPU_DEAD/CPU_UP_CANCELED) the
 * per-cpu compression transform and scratch buffer for @cpu.
 *
 * Returns NOTIFY_BAD if either per-cpu allocation fails on bring-up,
 * NOTIFY_OK otherwise.
 */
static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
{
	struct crypto_comp *tfm;
	u8 *dst;

	switch (action) {
	case CPU_UP_PREPARE:
		tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
		if (IS_ERR(tfm)) {
			pr_err("can't allocate compressor transform\n");
			return NOTIFY_BAD;
		}
		*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
		/* two pages — presumably headroom for incompressible data; TODO confirm */
		dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
		if (!dst) {
			pr_err("can't allocate compressor buffer\n");
			crypto_free_comp(tfm);
			*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
			return NOTIFY_BAD;
		}
		per_cpu(zswap_dstmem, cpu) = dst;
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
		if (tfm) {
			crypto_free_comp(tfm);
			*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
		}
		dst = per_cpu(zswap_dstmem, cpu);
		kfree(dst);
		per_cpu(zswap_dstmem, cpu) = NULL;
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}
331
332static int zswap_cpu_notifier(struct notifier_block *nb,
333 unsigned long action, void *pcpu)
334{
335 unsigned long cpu = (unsigned long)pcpu;
336 return __zswap_cpu_notifier(action, cpu);
337}
338
339static struct notifier_block zswap_cpu_notifier_block = {
340 .notifier_call = zswap_cpu_notifier
341};
342
/*
 * Run the CPU_UP_PREPARE setup for every CPU that is already online and
 * register the hotplug notifier for CPUs that come up later.  On failure
 * the partially-completed per-cpu setup is unwound and -ENOMEM returned.
 */
static int zswap_cpu_init(void)
{
	unsigned long cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
			goto cleanup;
	register_cpu_notifier(&zswap_cpu_notifier_block);
	put_online_cpus();
	return 0;

cleanup:
	/* CPU_UP_CANCELED frees whatever transforms/buffers were set up */
	for_each_online_cpu(cpu)
		__zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
	put_online_cpus();
	return -ENOMEM;
}
361
362/*********************************
363* helpers
364**********************************/
365static bool zswap_is_full(void)
366{
367 return (totalram_pages * zswap_max_pool_percent / 100 <
368 zswap_pool_pages);
369}
370
/*
 * Carries out the common pattern of freeing an entry's zbud allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 */
static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
{
	zbud_free(tree->pool, entry->handle);
	zswap_entry_cache_free(entry);
	atomic_dec(&zswap_stored_pages);
	/* refresh the cached pool size after the free */
	zswap_pool_pages = zbud_get_pool_size(tree->pool);
}
382
383/*********************************
384* writeback code
385**********************************/
/* return enum for zswap_get_swap_cache_page */
enum zswap_get_swap_ret {
	ZSWAP_SWAPCACHE_NEW,	/* page newly added to swap cache; returned locked */
	ZSWAP_SWAPCACHE_EXIST,	/* page was already in swap cache; not locked */
	ZSWAP_SWAPCACHE_NOMEM	/* allocation failure */
};
392
/*
 * zswap_get_swap_cache_page
 *
 * This is an adaptation of read_swap_cache_async()
 *
 * This function tries to find a page with the given swap entry
 * in the swapper_space address space (the swap cache).  If the page
 * is found, it is returned in retpage.  Otherwise, a page is allocated,
 * added to the swap cache, and returned in retpage.
 *
 * If success, the swap cache page is returned in retpage
 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache,
 * page is not locked.
 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
 * page is locked.
 * Returns ZSWAP_SWAPCACHE_NOMEM on error.
 */
static int zswap_get_swap_cache_page(swp_entry_t entry,
				struct page **retpage)
{
	struct page *found_page, *new_page = NULL;
	struct address_space *swapper_space = &swapper_spaces[swp_type(entry)];
	int err;

	*retpage = NULL;
	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(swapper_space, entry.val);
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
			new_page = alloc_page(GFP_KERNEL);
			if (!new_page)
				break; /* Out of memory */
		}

		/*
		 * call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_preload(GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) { /* seems racy */
			radix_tree_preload_end();
			continue;
		}
		if (err) { /* swp entry is obsolete ? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__set_page_locked(new_page);
		SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			lru_cache_add_anon(new_page);
			*retpage = new_page;
			return ZSWAP_SWAPCACHE_NEW;
		}
		radix_tree_preload_end();
		/* insertion failed: undo the page state set above */
		ClearPageSwapBacked(new_page);
		__clear_page_locked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		swapcache_free(entry, NULL);
	} while (err != -ENOMEM);

	if (new_page)
		page_cache_release(new_page);
	if (!found_page)
		return ZSWAP_SWAPCACHE_NOMEM;
	*retpage = found_page;
	return ZSWAP_SWAPCACHE_EXIST;
}
482
/*
 * Attempts to free an entry by adding a page to the swap cache,
 * decompressing the entry data into the page, and issuing a
 * bio write to write the page back to the swap device.
 *
 * This can be thought of as a "resumed writeback" of the page
 * to the swap device. We are basically resuming the same swap
 * writeback path that was intercepted with the frontswap_store()
 * in the first place. After the page has been decompressed into
 * the swap cache, the compressed version stored by zswap can be
 * freed.
 *
 * Called by zbud as the pool's eviction callback (see zswap_zbud_ops).
 * Returns 0 if the entry was written back (or already invalidated),
 * -EAGAIN if a concurrent load will free it, -EEXIST if the page is
 * already in the swap cache, or -ENOMEM on allocation failure.
 */
static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
{
	struct zswap_header *zhdr;
	swp_entry_t swpentry;
	struct zswap_tree *tree;
	pgoff_t offset;
	struct zswap_entry *entry;
	struct page *page;
	u8 *src, *dst;
	unsigned int dlen;
	int ret, refcount;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
	};

	/* extract swpentry from data */
	zhdr = zbud_map(pool, handle);
	swpentry = zhdr->swpentry; /* here */
	zbud_unmap(pool, handle);
	tree = zswap_trees[swp_type(swpentry)];
	offset = swp_offset(swpentry);
	BUG_ON(pool != tree->pool);

	/* find and ref zswap entry; the ref keeps it alive while we
	 * work on it outside the tree lock */
	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		/* entry was invalidated */
		spin_unlock(&tree->lock);
		return 0;
	}
	zswap_entry_get(entry);
	spin_unlock(&tree->lock);
	BUG_ON(offset != entry->offset);

	/* try to allocate swap cache page */
	switch (zswap_get_swap_cache_page(swpentry, &page)) {
	case ZSWAP_SWAPCACHE_NOMEM: /* no memory */
		ret = -ENOMEM;
		goto fail;

	case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */
		/* page is already in the swap cache, ignore for now */
		page_cache_release(page);
		ret = -EEXIST;
		goto fail;

	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
		/* decompress the stored data (skipping the zswap_header)
		 * into the freshly added swap cache page */
		dlen = PAGE_SIZE;
		src = (u8 *)zbud_map(tree->pool, entry->handle) +
			sizeof(struct zswap_header);
		dst = kmap_atomic(page);
		ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
				entry->length, dst, &dlen);
		kunmap_atomic(dst);
		zbud_unmap(tree->pool, entry->handle);
		BUG_ON(ret);
		BUG_ON(dlen != PAGE_SIZE);

		/* page is up to date */
		SetPageUptodate(page);
	}

	/* start writeback; __swap_writepage unlocks the page */
	__swap_writepage(page, &wbc, end_swap_bio_write);
	page_cache_release(page);
	zswap_written_back_pages++;

	spin_lock(&tree->lock);

	/* drop local reference */
	zswap_entry_put(entry);
	/* drop the initial reference from entry creation */
	refcount = zswap_entry_put(entry);

	/*
	 * There are three possible values for refcount here:
	 * (1) refcount is 1, load is in progress, unlink from rbtree,
	 *     load will free
	 * (2) refcount is 0, (normal case) entry is valid,
	 *     remove from rbtree and free entry
	 * (3) refcount is -1, invalidate happened during writeback,
	 *     free entry
	 */
	if (refcount >= 0) {
		/* no invalidate yet, remove from rbtree */
		rb_erase(&entry->rbnode, &tree->rbroot);
	}
	spin_unlock(&tree->lock);
	if (refcount <= 0) {
		/* free the entry */
		zswap_free_entry(tree, entry);
		return 0;
	}
	return -EAGAIN;

fail:
	/* drop only our local reference; entry stays in the tree */
	spin_lock(&tree->lock);
	zswap_entry_put(entry);
	spin_unlock(&tree->lock);
	return ret;
}
598
599/*********************************
600* frontswap hooks
601**********************************/
/*
 * Attempts to compress and store a single page (frontswap store hook).
 * Returns 0 if the page was stored in zswap, or a negative errno if it
 * was rejected (the page then goes to the swap device normally).
 */
static int zswap_frontswap_store(unsigned type, pgoff_t offset,
				struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *dupentry;
	int ret;
	unsigned int dlen = PAGE_SIZE, len;
	unsigned long handle;
	char *buf;
	u8 *src, *dst;
	struct zswap_header *zhdr;

	/* no tree means zswap init failed for this swap type */
	if (!tree) {
		ret = -ENODEV;
		goto reject;
	}

	/* reclaim space if needed */
	if (zswap_is_full()) {
		zswap_pool_limit_hit++;
		/* 8 is the retry budget handed to zbud's evictor --
		 * TODO confirm against zbud_reclaim_page() */
		if (zbud_reclaim_page(tree->pool, 8)) {
			zswap_reject_reclaim_fail++;
			ret = -ENOMEM;
			goto reject;
		}
	}

	/* allocate entry */
	entry = zswap_entry_cache_alloc(GFP_KERNEL);
	if (!entry) {
		zswap_reject_kmemcache_fail++;
		ret = -ENOMEM;
		goto reject;
	}

	/* compress into the per-cpu buffer; preemption stays disabled
	 * from get_cpu_var() until put_cpu_var() below */
	dst = get_cpu_var(zswap_dstmem);
	src = kmap_atomic(page);
	ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
	kunmap_atomic(src);
	if (ret) {
		ret = -EINVAL;
		goto freepage;
	}

	/* store: compressed data prefixed by a zswap_header in zbud */
	len = dlen + sizeof(struct zswap_header);
	ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN,
		&handle);
	if (ret == -ENOSPC) {
		/* compressed poorly; not worth keeping in the pool */
		zswap_reject_compress_poor++;
		goto freepage;
	}
	if (ret) {
		zswap_reject_alloc_fail++;
		goto freepage;
	}
	zhdr = zbud_map(tree->pool, handle);
	/* the header lets zswap_writeback_entry() recover the swap entry */
	zhdr->swpentry = swp_entry(type, offset);
	buf = (u8 *)(zhdr + 1);
	memcpy(buf, dst, dlen);
	zbud_unmap(tree->pool, handle);
	put_cpu_var(zswap_dstmem);

	/* populate entry */
	entry->offset = offset;
	entry->handle = handle;
	entry->length = dlen;

	/* map: insert into the per-type rbtree, displacing any stale
	 * duplicate for the same offset */
	spin_lock(&tree->lock);
	do {
		ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
		if (ret == -EEXIST) {
			zswap_duplicate_entry++;
			/* remove from rbtree */
			rb_erase(&dupentry->rbnode, &tree->rbroot);
			if (!zswap_entry_put(dupentry)) {
				/* free */
				zswap_free_entry(tree, dupentry);
			}
		}
	} while (ret == -EEXIST);
	spin_unlock(&tree->lock);

	/* update stats */
	atomic_inc(&zswap_stored_pages);
	zswap_pool_pages = zbud_get_pool_size(tree->pool);

	return 0;

freepage:
	put_cpu_var(zswap_dstmem);
	zswap_entry_cache_free(entry);
reject:
	return ret;
}
700
/*
 * Frontswap load hook: decompress the stored copy of the page at
 * (type, offset) into @page.
 * Returns 0 if the page was successfully decompressed,
 * -1 if no entry was found (decompression failures BUG instead).
 */
static int zswap_frontswap_load(unsigned type, pgoff_t offset,
				struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;
	u8 *src, *dst;
	unsigned int dlen;
	int refcount, ret;

	/* find the entry and take a ref so it survives a concurrent
	 * invalidate/writeback while we decompress unlocked */
	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return -1;
	}
	zswap_entry_get(entry);
	spin_unlock(&tree->lock);

	/* decompress (data follows the zswap_header in the zbud buddy) */
	dlen = PAGE_SIZE;
	src = (u8 *)zbud_map(tree->pool, entry->handle) +
			sizeof(struct zswap_header);
	dst = kmap_atomic(page);
	ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
		dst, &dlen);
	kunmap_atomic(dst);
	zbud_unmap(tree->pool, entry->handle);
	BUG_ON(ret);

	/* drop our ref; nonzero refcount means someone else still
	 * owns the entry and will free it */
	spin_lock(&tree->lock);
	refcount = zswap_entry_put(entry);
	if (likely(refcount)) {
		spin_unlock(&tree->lock);
		return 0;
	}
	spin_unlock(&tree->lock);

	/*
	 * We don't have to unlink from the rbtree because
	 * zswap_writeback_entry() or zswap_frontswap_invalidate page()
	 * has already done this for us if we are the last reference.
	 */
	/* free */

	zswap_free_entry(tree, entry);

	return 0;
}
755
756/* frees an entry in zswap */
757static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
758{
759 struct zswap_tree *tree = zswap_trees[type];
760 struct zswap_entry *entry;
761 int refcount;
762
763 /* find */
764 spin_lock(&tree->lock);
765 entry = zswap_rb_search(&tree->rbroot, offset);
766 if (!entry) {
767 /* entry was written back */
768 spin_unlock(&tree->lock);
769 return;
770 }
771
772 /* remove from rbtree */
773 rb_erase(&entry->rbnode, &tree->rbroot);
774
775 /* drop the initial reference from entry creation */
776 refcount = zswap_entry_put(entry);
777
778 spin_unlock(&tree->lock);
779
780 if (refcount) {
781 /* writeback in progress, writeback will free */
782 return;
783 }
784
785 /* free */
786 zswap_free_entry(tree, entry);
787}
788
789/* frees all zswap entries for the given swap type */
790static void zswap_frontswap_invalidate_area(unsigned type)
791{
792 struct zswap_tree *tree = zswap_trees[type];
793 struct rb_node *node;
794 struct zswap_entry *entry;
795
796 if (!tree)
797 return;
798
799 /* walk the tree and free everything */
800 spin_lock(&tree->lock);
801 /*
802 * TODO: Even though this code should not be executed because
803 * the try_to_unuse() in swapoff should have emptied the tree,
804 * it is very wasteful to rebalance the tree after every
805 * removal when we are freeing the whole tree.
806 *
807 * If post-order traversal code is ever added to the rbtree
808 * implementation, it should be used here.
809 */
810 while ((node = rb_first(&tree->rbroot))) {
811 entry = rb_entry(node, struct zswap_entry, rbnode);
812 rb_erase(&entry->rbnode, &tree->rbroot);
813 zbud_free(tree->pool, entry->handle);
814 zswap_entry_cache_free(entry);
815 atomic_dec(&zswap_stored_pages);
816 }
817 tree->rbroot = RB_ROOT;
818 spin_unlock(&tree->lock);
819}
820
/* zbud calls back into zswap_writeback_entry() when evicting a page */
static struct zbud_ops zswap_zbud_ops = {
	.evict = zswap_writeback_entry
};
824
825static void zswap_frontswap_init(unsigned type)
826{
827 struct zswap_tree *tree;
828
829 tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
830 if (!tree)
831 goto err;
832 tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
833 if (!tree->pool)
834 goto freetree;
835 tree->rbroot = RB_ROOT;
836 spin_lock_init(&tree->lock);
837 zswap_trees[type] = tree;
838 return;
839
840freetree:
841 kfree(tree);
842err:
843 pr_err("alloc failed, zswap disabled for swap type %d\n", type);
844}
845
/* operation table registered with frontswap in init_zswap() */
static struct frontswap_ops zswap_frontswap_ops = {
	.store = zswap_frontswap_store,
	.load = zswap_frontswap_load,
	.invalidate_page = zswap_frontswap_invalidate_page,
	.invalidate_area = zswap_frontswap_invalidate_area,
	.init = zswap_frontswap_init
};
853
/*********************************
* debugfs functions
**********************************/
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static struct dentry *zswap_debugfs_root;

/*
 * Expose the zswap statistics counters read-only under a "zswap"
 * debugfs directory.  Returns 0 on success, -ENODEV if debugfs is not
 * available, -ENOMEM if the directory could not be created.
 */
static int __init zswap_debugfs_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
	if (!zswap_debugfs_root)
		return -ENOMEM;

	debugfs_create_u64("pool_limit_hit", S_IRUGO,
			zswap_debugfs_root, &zswap_pool_limit_hit);
	debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_reclaim_fail);
	debugfs_create_u64("reject_alloc_fail", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_alloc_fail);
	debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_kmemcache_fail);
	debugfs_create_u64("reject_compress_poor", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_compress_poor);
	debugfs_create_u64("written_back_pages", S_IRUGO,
			zswap_debugfs_root, &zswap_written_back_pages);
	debugfs_create_u64("duplicate_entry", S_IRUGO,
			zswap_debugfs_root, &zswap_duplicate_entry);
	debugfs_create_u64("pool_pages", S_IRUGO,
			zswap_debugfs_root, &zswap_pool_pages);
	debugfs_create_atomic_t("stored_pages", S_IRUGO,
			zswap_debugfs_root, &zswap_stored_pages);

	return 0;
}

static void __exit zswap_debugfs_exit(void)
{
	debugfs_remove_recursive(zswap_debugfs_root);
}
#else
/* no-op stubs when debugfs is not configured */
static int __init zswap_debugfs_init(void)
{
	return 0;
}

static void __exit zswap_debugfs_exit(void) { }
#endif
905
906/*********************************
907* module init and exit
908**********************************/
/*
 * Create the entry kmem cache, initialize the compressor and the
 * per-cpu buffers, then register the ops with frontswap.  Returns 0 on
 * success (or if zswap is disabled), -ENOMEM on any setup failure.
 */
static int __init init_zswap(void)
{
	if (!zswap_enabled)
		return 0;

	pr_info("loading zswap\n");
	if (zswap_entry_cache_create()) {
		pr_err("entry cache creation failed\n");
		goto error;
	}
	if (zswap_comp_init()) {
		pr_err("compressor initialization failed\n");
		goto compfail;
	}
	if (zswap_cpu_init()) {
		pr_err("per-cpu initialization failed\n");
		goto pcpufail;
	}
	frontswap_register_ops(&zswap_frontswap_ops);
	/* debugfs is optional; a failure here only loses statistics */
	if (zswap_debugfs_init())
		pr_warn("debugfs initialization failed\n");
	return 0;
	/* goto-chain unwinds setup in reverse order */
pcpufail:
	zswap_comp_exit();
compfail:
	/*
	 * NOTE(review): "destory" is misspelled, but it matches the
	 * function's definition elsewhere in this file; renaming
	 * requires changing both sites together.
	 */
	zswap_entry_cache_destory();
error:
	return -ENOMEM;
}
/* must be late so crypto has time to come up */
late_initcall(init_zswap);

/* module metadata */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("Compressed cache for swap pages");