aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorSeth Jennings <sjenning@linux.vnet.ibm.com>2013-07-10 19:05:03 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-07-10 21:11:34 -0400
commit2b2811178e85553405b86e3fe78357b9b95889ce (patch)
tree0d5b12e7013be79a8e89d145a46bc32e82f86a81 /mm
parent4e2e2770b1529edc5849c86b29a6febe27e2f083 (diff)
zswap: add to mm/
zswap is a thin backend for frontswap that takes pages that are in the process of being swapped out and attempts to compress them and store them in a RAM-based memory pool. This can result in a significant I/O reduction on the swap device and, in the case where decompressing from RAM is faster than reading from the swap device, can also improve workload performance. It also has support for evicting swap pages that are currently compressed in zswap to the swap device on an LRU(ish) basis. This functionality makes zswap a true cache in that, once the cache is full, the oldest pages can be moved out of zswap to the swap device so newer pages can be compressed and stored in zswap. This patch adds the zswap driver to mm/ Signed-off-by: Seth Jennings <sjenning@linux.vnet.ibm.com> Acked-by: Rik van Riel <riel@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Minchan Kim <minchan@kernel.org> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Robert Jennings <rcj@linux.vnet.ibm.com> Cc: Jenifer Hopper <jhopper@us.ibm.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Johannes Weiner <jweiner@redhat.com> Cc: Larry Woodman <lwoodman@redhat.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Dave Hansen <dave@sr71.net> Cc: Joe Perches <joe@perches.com> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Cody P Schafer <cody@linux.vnet.ibm.com> Cc: Hugh Dickens <hughd@google.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Fengguang Wu <fengguang.wu@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig20
-rw-r--r--mm/Makefile1
-rw-r--r--mm/zswap.c943
3 files changed, 964 insertions, 0 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 45503ed5f3aa..8028dcc6615c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -488,6 +488,26 @@ config ZBUD
488 deterministic reclaim properties that make it preferable to a higher 488 deterministic reclaim properties that make it preferable to a higher
489 density approach when reclaim will be used. 489 density approach when reclaim will be used.
490 490
491config ZSWAP
492 bool "Compressed cache for swap pages (EXPERIMENTAL)"
493 depends on FRONTSWAP && CRYPTO=y
494 select CRYPTO_LZO
495 select ZBUD
496 default n
497 help
498 A lightweight compressed cache for swap pages. It takes
499 pages that are in the process of being swapped out and attempts to
500 compress them into a dynamically allocated RAM-based memory pool.
501 This can result in a significant I/O reduction on swap device and,
502 in the case where decompressing from RAM is faster than swap device
503 reads, can also improve workload performance.
504
505 This is marked experimental because it is a new feature (as of
506 v3.11) that interacts heavily with memory reclaim. While these
507 interactions don't cause any known issues on simple memory setups,
508 they have not been fully explored on the large set of potential
509 configurations and workloads that exist.
510
491config MEM_SOFT_DIRTY 511config MEM_SOFT_DIRTY
492 bool "Track memory changes" 512 bool "Track memory changes"
493 depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY 513 depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
diff --git a/mm/Makefile b/mm/Makefile
index 95f0197ce3d3..f00803386a67 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
32obj-$(CONFIG_BOUNCE) += bounce.o 32obj-$(CONFIG_BOUNCE) += bounce.o
33obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o 33obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
34obj-$(CONFIG_FRONTSWAP) += frontswap.o 34obj-$(CONFIG_FRONTSWAP) += frontswap.o
35obj-$(CONFIG_ZSWAP) += zswap.o
35obj-$(CONFIG_HAS_DMA) += dmapool.o 36obj-$(CONFIG_HAS_DMA) += dmapool.o
36obj-$(CONFIG_HUGETLBFS) += hugetlb.o 37obj-$(CONFIG_HUGETLBFS) += hugetlb.o
37obj-$(CONFIG_NUMA) += mempolicy.o 38obj-$(CONFIG_NUMA) += mempolicy.o
diff --git a/mm/zswap.c b/mm/zswap.c
new file mode 100644
index 000000000000..deda2b671e12
--- /dev/null
+++ b/mm/zswap.c
@@ -0,0 +1,943 @@
1/*
2 * zswap.c - zswap driver file
3 *
4 * zswap is a backend for frontswap that takes pages that are in the process
5 * of being swapped out and attempts to compress and store them in a
6 * RAM-based memory pool. This can result in a significant I/O reduction on
7 * the swap device and, in the case where decompressing from RAM is faster
8 * than reading from the swap device, can also improve workload performance.
9 *
10 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21*/
22
23#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
24
25#include <linux/module.h>
26#include <linux/cpu.h>
27#include <linux/highmem.h>
28#include <linux/slab.h>
29#include <linux/spinlock.h>
30#include <linux/types.h>
31#include <linux/atomic.h>
32#include <linux/frontswap.h>
33#include <linux/rbtree.h>
34#include <linux/swap.h>
35#include <linux/crypto.h>
36#include <linux/mempool.h>
37#include <linux/zbud.h>
38
39#include <linux/mm_types.h>
40#include <linux/page-flags.h>
41#include <linux/swapops.h>
42#include <linux/writeback.h>
43#include <linux/pagemap.h>
44
45/*********************************
46* statistics
47**********************************/
48/* Number of memory pages used by the compressed pool */
49static u64 zswap_pool_pages;
50/* The number of compressed pages currently stored in zswap */
51static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
52
53/*
54 * The statistics below are not protected from concurrent access for
55 * performance reasons so they may not be 100% accurate. However,
56 * they do provide useful information on roughly how many times a
57 * certain event is occurring.
58*/
59
60/* Pool limit was hit (see zswap_max_pool_percent) */
61static u64 zswap_pool_limit_hit;
62/* Pages written back when pool limit was reached */
63static u64 zswap_written_back_pages;
64/* Store failed due to a reclaim failure after pool limit was reached */
65static u64 zswap_reject_reclaim_fail;
66/* Compressed page was too big for the allocator to (optimally) store */
67static u64 zswap_reject_compress_poor;
68/* Store failed because underlying allocator could not get memory */
69static u64 zswap_reject_alloc_fail;
70/* Store failed because the entry metadata could not be allocated (rare) */
71static u64 zswap_reject_kmemcache_fail;
72/* Duplicate store was encountered (rare) */
73static u64 zswap_duplicate_entry;
74
75/*********************************
76* tunables
77**********************************/
78/* Enable/disable zswap (disabled by default, fixed at boot for now) */
79static bool zswap_enabled __read_mostly;
80module_param_named(enabled, zswap_enabled, bool, 0);
81
82/* Compressor to be used by zswap (fixed at boot for now) */
83#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
84static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
85module_param_named(compressor, zswap_compressor, charp, 0);
86
87/* The maximum percentage of memory that the compressed pool can occupy */
88static unsigned int zswap_max_pool_percent = 20;
89module_param_named(max_pool_percent,
90 zswap_max_pool_percent, uint, 0644);
91
92/*********************************
93* compression functions
94**********************************/
95/* per-cpu compression transforms */
96static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
97
/* Direction selector passed to zswap_comp_op(). */
enum comp_op {
	ZSWAP_COMPOP_COMPRESS,	/* handled via crypto_comp_compress() */
	ZSWAP_COMPOP_DECOMPRESS	/* handled via crypto_comp_decompress() */
};
102
103static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
104 u8 *dst, unsigned int *dlen)
105{
106 struct crypto_comp *tfm;
107 int ret;
108
109 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
110 switch (op) {
111 case ZSWAP_COMPOP_COMPRESS:
112 ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
113 break;
114 case ZSWAP_COMPOP_DECOMPRESS:
115 ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
116 break;
117 default:
118 ret = -EINVAL;
119 }
120
121 put_cpu();
122 return ret;
123}
124
125static int __init zswap_comp_init(void)
126{
127 if (!crypto_has_comp(zswap_compressor, 0, 0)) {
128 pr_info("%s compressor not available\n", zswap_compressor);
129 /* fall back to default compressor */
130 zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
131 if (!crypto_has_comp(zswap_compressor, 0, 0))
132 /* can't even load the default compressor */
133 return -ENODEV;
134 }
135 pr_info("using %s compressor\n", zswap_compressor);
136
137 /* alloc percpu transforms */
138 zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
139 if (!zswap_comp_pcpu_tfms)
140 return -ENOMEM;
141 return 0;
142}
143
144static void zswap_comp_exit(void)
145{
146 /* free percpu transforms */
147 if (zswap_comp_pcpu_tfms)
148 free_percpu(zswap_comp_pcpu_tfms);
149}
150
151/*********************************
152* data structures
153**********************************/
/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * refcount - the number of outstanding references to the entry. This is
 *            needed to protect against premature freeing of the entry by
 *            concurrent calls to load, invalidate, and writeback. The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount. Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * offset - the swap offset for the entry. Index into the red-black tree.
 * handle - zbud allocation handle that stores the compressed page data
 *          (preceded by a struct zswap_header)
 * length - the length in bytes of the compressed page data. Needed during
 *          decompression
 */
struct zswap_entry {
	struct rb_node rbnode;
	pgoff_t offset;
	int refcount;
	unsigned int length;
	unsigned long handle;
};
179
/*
 * Header stored at the start of every zbud allocation made by zswap; the
 * compressed data follows it immediately. zswap_writeback_entry() reads
 * swpentry back from the allocation to locate the owning tree and offset.
 */
struct zswap_header {
	swp_entry_t swpentry;
};
183
/*
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 *
 * pool is the zbud pool that holds the compressed data for the entries
 * in this tree.
 */
struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
	struct zbud_pool *pool;
};

/*
 * One tree per swap type, indexed by type. A slot is filled in by
 * zswap_frontswap_init() and stays NULL if that initialization failed.
 */
static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
196
197/*********************************
198* zswap entry functions
199**********************************/
200static struct kmem_cache *zswap_entry_cache;
201
202static int zswap_entry_cache_create(void)
203{
204 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
205 return (zswap_entry_cache == NULL);
206}
207
/* Tear down the zswap_entry slab cache (name is spelled "destory" in this file). */
static void zswap_entry_cache_destory(void)
{
	kmem_cache_destroy(zswap_entry_cache);
}
212
213static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
214{
215 struct zswap_entry *entry;
216 entry = kmem_cache_alloc(zswap_entry_cache, gfp);
217 if (!entry)
218 return NULL;
219 entry->refcount = 1;
220 return entry;
221}
222
/* Return a zswap_entry to the slab cache; does not touch its zbud allocation. */
static void zswap_entry_cache_free(struct zswap_entry *entry)
{
	kmem_cache_free(zswap_entry_cache, entry);
}
227
228/* caller must hold the tree lock */
229static void zswap_entry_get(struct zswap_entry *entry)
230{
231 entry->refcount++;
232}
233
234/* caller must hold the tree lock */
235static int zswap_entry_put(struct zswap_entry *entry)
236{
237 entry->refcount--;
238 return entry->refcount;
239}
240
241/*********************************
242* rbtree functions
243**********************************/
244static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
245{
246 struct rb_node *node = root->rb_node;
247 struct zswap_entry *entry;
248
249 while (node) {
250 entry = rb_entry(node, struct zswap_entry, rbnode);
251 if (entry->offset > offset)
252 node = node->rb_left;
253 else if (entry->offset < offset)
254 node = node->rb_right;
255 else
256 return entry;
257 }
258 return NULL;
259}
260
261/*
262 * In the case that a entry with the same offset is found, a pointer to
263 * the existing entry is stored in dupentry and the function returns -EEXIST
264 */
265static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
266 struct zswap_entry **dupentry)
267{
268 struct rb_node **link = &root->rb_node, *parent = NULL;
269 struct zswap_entry *myentry;
270
271 while (*link) {
272 parent = *link;
273 myentry = rb_entry(parent, struct zswap_entry, rbnode);
274 if (myentry->offset > entry->offset)
275 link = &(*link)->rb_left;
276 else if (myentry->offset < entry->offset)
277 link = &(*link)->rb_right;
278 else {
279 *dupentry = myentry;
280 return -EEXIST;
281 }
282 }
283 rb_link_node(&entry->rbnode, parent, link);
284 rb_insert_color(&entry->rbnode, root);
285 return 0;
286}
287
288/*********************************
289* per-cpu code
290**********************************/
291static DEFINE_PER_CPU(u8 *, zswap_dstmem);
292
293static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
294{
295 struct crypto_comp *tfm;
296 u8 *dst;
297
298 switch (action) {
299 case CPU_UP_PREPARE:
300 tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
301 if (IS_ERR(tfm)) {
302 pr_err("can't allocate compressor transform\n");
303 return NOTIFY_BAD;
304 }
305 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
306 dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
307 if (!dst) {
308 pr_err("can't allocate compressor buffer\n");
309 crypto_free_comp(tfm);
310 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
311 return NOTIFY_BAD;
312 }
313 per_cpu(zswap_dstmem, cpu) = dst;
314 break;
315 case CPU_DEAD:
316 case CPU_UP_CANCELED:
317 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
318 if (tfm) {
319 crypto_free_comp(tfm);
320 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
321 }
322 dst = per_cpu(zswap_dstmem, cpu);
323 kfree(dst);
324 per_cpu(zswap_dstmem, cpu) = NULL;
325 break;
326 default:
327 break;
328 }
329 return NOTIFY_OK;
330}
331
/*
 * CPU notifier callback: the opaque pointer argument carries the CPU
 * number, which is unpacked and handed to __zswap_cpu_notifier().
 */
static int zswap_cpu_notifier(struct notifier_block *nb,
				unsigned long action, void *pcpu)
{
	return __zswap_cpu_notifier(action, (unsigned long)pcpu);
}
338
/* Notifier block registered by zswap_cpu_init() via register_cpu_notifier(). */
static struct notifier_block zswap_cpu_notifier_block = {
	.notifier_call = zswap_cpu_notifier
};
342
343static int zswap_cpu_init(void)
344{
345 unsigned long cpu;
346
347 get_online_cpus();
348 for_each_online_cpu(cpu)
349 if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
350 goto cleanup;
351 register_cpu_notifier(&zswap_cpu_notifier_block);
352 put_online_cpus();
353 return 0;
354
355cleanup:
356 for_each_online_cpu(cpu)
357 __zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
358 put_online_cpus();
359 return -ENOMEM;
360}
361
362/*********************************
363* helpers
364**********************************/
365static bool zswap_is_full(void)
366{
367 return (totalram_pages * zswap_max_pool_percent / 100 <
368 zswap_pool_pages);
369}
370
371/*
372 * Carries out the common pattern of freeing and entry's zsmalloc allocation,
373 * freeing the entry itself, and decrementing the number of stored pages.
374 */
375static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
376{
377 zbud_free(tree->pool, entry->handle);
378 zswap_entry_cache_free(entry);
379 atomic_dec(&zswap_stored_pages);
380 zswap_pool_pages = zbud_get_pool_size(tree->pool);
381}
382
383/*********************************
384* writeback code
385**********************************/
/* return enum for zswap_get_swap_cache_page */
enum zswap_get_swap_ret {
	ZSWAP_SWAPCACHE_NEW,	/* new page added to the swap cache, locked */
	ZSWAP_SWAPCACHE_EXIST,	/* page was already in the swap cache */
	ZSWAP_SWAPCACHE_NOMEM	/* no page could be obtained */
};
392
/*
 * zswap_get_swap_cache_page
 *
 * This is an adaptation of read_swap_cache_async()
 *
 * This function tries to find a page with the given swap entry
 * in the swapper_space address space (the swap cache). If the page
 * is found, it is returned in retpage. Otherwise, a page is allocated,
 * added to the swap cache, and returned in retpage.
 *
 * Returns ZSWAP_SWAPCACHE_EXIST if the page was already in the swap cache;
 *   the page is returned in retpage and is not locked by this function
 * Returns ZSWAP_SWAPCACHE_NEW if a new page was added to the swap cache and
 *   needs to be populated; the page is returned in retpage and is locked
 * Returns ZSWAP_SWAPCACHE_NOMEM if no page could be obtained (allocation
 *   failure or the swap entry is no longer usable); retpage is set to NULL
 */
static int zswap_get_swap_cache_page(swp_entry_t entry,
				struct page **retpage)
{
	struct page *found_page, *new_page = NULL;
	struct address_space *swapper_space = &swapper_spaces[swp_type(entry)];
	int err;

	*retpage = NULL;
	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(swapper_space, entry.val);
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.
		 * The page is kept across loop iterations and only
		 * allocated once.
		 */
		if (!new_page) {
			new_page = alloc_page(GFP_KERNEL);
			if (!new_page)
				break; /* Out of memory */
		}

		/*
		 * call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_preload(GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) { /* seems racy */
			radix_tree_preload_end();
			continue;
		}
		if (err) { /* swp entry is obsolete ? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__set_page_locked(new_page);
		SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			lru_cache_add_anon(new_page);
			*retpage = new_page;
			return ZSWAP_SWAPCACHE_NEW;
		}
		/* insertion failed: undo the page flags set above */
		radix_tree_preload_end();
		ClearPageSwapBacked(new_page);
		__clear_page_locked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		swapcache_free(entry, NULL);
	} while (err != -ENOMEM);

	/* drop the page we allocated but did not manage to insert */
	if (new_page)
		page_cache_release(new_page);
	if (!found_page)
		return ZSWAP_SWAPCACHE_NOMEM;
	*retpage = found_page;
	return ZSWAP_SWAPCACHE_EXIST;
}
482
483/*
484 * Attempts to free an entry by adding a page to the swap cache,
485 * decompressing the entry data into the page, and issuing a
486 * bio write to write the page back to the swap device.
487 *
488 * This can be thought of as a "resumed writeback" of the page
489 * to the swap device. We are basically resuming the same swap
490 * writeback path that was intercepted with the frontswap_store()
491 * in the first place. After the page has been decompressed into
492 * the swap cache, the compressed version stored by zswap can be
493 * freed.
494 */
495static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
496{
497 struct zswap_header *zhdr;
498 swp_entry_t swpentry;
499 struct zswap_tree *tree;
500 pgoff_t offset;
501 struct zswap_entry *entry;
502 struct page *page;
503 u8 *src, *dst;
504 unsigned int dlen;
505 int ret, refcount;
506 struct writeback_control wbc = {
507 .sync_mode = WB_SYNC_NONE,
508 };
509
510 /* extract swpentry from data */
511 zhdr = zbud_map(pool, handle);
512 swpentry = zhdr->swpentry; /* here */
513 zbud_unmap(pool, handle);
514 tree = zswap_trees[swp_type(swpentry)];
515 offset = swp_offset(swpentry);
516 BUG_ON(pool != tree->pool);
517
518 /* find and ref zswap entry */
519 spin_lock(&tree->lock);
520 entry = zswap_rb_search(&tree->rbroot, offset);
521 if (!entry) {
522 /* entry was invalidated */
523 spin_unlock(&tree->lock);
524 return 0;
525 }
526 zswap_entry_get(entry);
527 spin_unlock(&tree->lock);
528 BUG_ON(offset != entry->offset);
529
530 /* try to allocate swap cache page */
531 switch (zswap_get_swap_cache_page(swpentry, &page)) {
532 case ZSWAP_SWAPCACHE_NOMEM: /* no memory */
533 ret = -ENOMEM;
534 goto fail;
535
536 case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */
537 /* page is already in the swap cache, ignore for now */
538 page_cache_release(page);
539 ret = -EEXIST;
540 goto fail;
541
542 case ZSWAP_SWAPCACHE_NEW: /* page is locked */
543 /* decompress */
544 dlen = PAGE_SIZE;
545 src = (u8 *)zbud_map(tree->pool, entry->handle) +
546 sizeof(struct zswap_header);
547 dst = kmap_atomic(page);
548 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
549 entry->length, dst, &dlen);
550 kunmap_atomic(dst);
551 zbud_unmap(tree->pool, entry->handle);
552 BUG_ON(ret);
553 BUG_ON(dlen != PAGE_SIZE);
554
555 /* page is up to date */
556 SetPageUptodate(page);
557 }
558
559 /* start writeback */
560 __swap_writepage(page, &wbc, end_swap_bio_write);
561 page_cache_release(page);
562 zswap_written_back_pages++;
563
564 spin_lock(&tree->lock);
565
566 /* drop local reference */
567 zswap_entry_put(entry);
568 /* drop the initial reference from entry creation */
569 refcount = zswap_entry_put(entry);
570
571 /*
572 * There are three possible values for refcount here:
573 * (1) refcount is 1, load is in progress, unlink from rbtree,
574 * load will free
575 * (2) refcount is 0, (normal case) entry is valid,
576 * remove from rbtree and free entry
577 * (3) refcount is -1, invalidate happened during writeback,
578 * free entry
579 */
580 if (refcount >= 0) {
581 /* no invalidate yet, remove from rbtree */
582 rb_erase(&entry->rbnode, &tree->rbroot);
583 }
584 spin_unlock(&tree->lock);
585 if (refcount <= 0) {
586 /* free the entry */
587 zswap_free_entry(tree, entry);
588 return 0;
589 }
590 return -EAGAIN;
591
592fail:
593 spin_lock(&tree->lock);
594 zswap_entry_put(entry);
595 spin_unlock(&tree->lock);
596 return ret;
597}
598
599/*********************************
600* frontswap hooks
601**********************************/
602/* attempts to compress and store an single page */
603static int zswap_frontswap_store(unsigned type, pgoff_t offset,
604 struct page *page)
605{
606 struct zswap_tree *tree = zswap_trees[type];
607 struct zswap_entry *entry, *dupentry;
608 int ret;
609 unsigned int dlen = PAGE_SIZE, len;
610 unsigned long handle;
611 char *buf;
612 u8 *src, *dst;
613 struct zswap_header *zhdr;
614
615 if (!tree) {
616 ret = -ENODEV;
617 goto reject;
618 }
619
620 /* reclaim space if needed */
621 if (zswap_is_full()) {
622 zswap_pool_limit_hit++;
623 if (zbud_reclaim_page(tree->pool, 8)) {
624 zswap_reject_reclaim_fail++;
625 ret = -ENOMEM;
626 goto reject;
627 }
628 }
629
630 /* allocate entry */
631 entry = zswap_entry_cache_alloc(GFP_KERNEL);
632 if (!entry) {
633 zswap_reject_kmemcache_fail++;
634 ret = -ENOMEM;
635 goto reject;
636 }
637
638 /* compress */
639 dst = get_cpu_var(zswap_dstmem);
640 src = kmap_atomic(page);
641 ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
642 kunmap_atomic(src);
643 if (ret) {
644 ret = -EINVAL;
645 goto freepage;
646 }
647
648 /* store */
649 len = dlen + sizeof(struct zswap_header);
650 ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN,
651 &handle);
652 if (ret == -ENOSPC) {
653 zswap_reject_compress_poor++;
654 goto freepage;
655 }
656 if (ret) {
657 zswap_reject_alloc_fail++;
658 goto freepage;
659 }
660 zhdr = zbud_map(tree->pool, handle);
661 zhdr->swpentry = swp_entry(type, offset);
662 buf = (u8 *)(zhdr + 1);
663 memcpy(buf, dst, dlen);
664 zbud_unmap(tree->pool, handle);
665 put_cpu_var(zswap_dstmem);
666
667 /* populate entry */
668 entry->offset = offset;
669 entry->handle = handle;
670 entry->length = dlen;
671
672 /* map */
673 spin_lock(&tree->lock);
674 do {
675 ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
676 if (ret == -EEXIST) {
677 zswap_duplicate_entry++;
678 /* remove from rbtree */
679 rb_erase(&dupentry->rbnode, &tree->rbroot);
680 if (!zswap_entry_put(dupentry)) {
681 /* free */
682 zswap_free_entry(tree, dupentry);
683 }
684 }
685 } while (ret == -EEXIST);
686 spin_unlock(&tree->lock);
687
688 /* update stats */
689 atomic_inc(&zswap_stored_pages);
690 zswap_pool_pages = zbud_get_pool_size(tree->pool);
691
692 return 0;
693
694freepage:
695 put_cpu_var(zswap_dstmem);
696 zswap_entry_cache_free(entry);
697reject:
698 return ret;
699}
700
701/*
702 * returns 0 if the page was successfully decompressed
703 * return -1 on entry not found or error
704*/
705static int zswap_frontswap_load(unsigned type, pgoff_t offset,
706 struct page *page)
707{
708 struct zswap_tree *tree = zswap_trees[type];
709 struct zswap_entry *entry;
710 u8 *src, *dst;
711 unsigned int dlen;
712 int refcount, ret;
713
714 /* find */
715 spin_lock(&tree->lock);
716 entry = zswap_rb_search(&tree->rbroot, offset);
717 if (!entry) {
718 /* entry was written back */
719 spin_unlock(&tree->lock);
720 return -1;
721 }
722 zswap_entry_get(entry);
723 spin_unlock(&tree->lock);
724
725 /* decompress */
726 dlen = PAGE_SIZE;
727 src = (u8 *)zbud_map(tree->pool, entry->handle) +
728 sizeof(struct zswap_header);
729 dst = kmap_atomic(page);
730 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
731 dst, &dlen);
732 kunmap_atomic(dst);
733 zbud_unmap(tree->pool, entry->handle);
734 BUG_ON(ret);
735
736 spin_lock(&tree->lock);
737 refcount = zswap_entry_put(entry);
738 if (likely(refcount)) {
739 spin_unlock(&tree->lock);
740 return 0;
741 }
742 spin_unlock(&tree->lock);
743
744 /*
745 * We don't have to unlink from the rbtree because
746 * zswap_writeback_entry() or zswap_frontswap_invalidate page()
747 * has already done this for us if we are the last reference.
748 */
749 /* free */
750
751 zswap_free_entry(tree, entry);
752
753 return 0;
754}
755
756/* frees an entry in zswap */
757static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
758{
759 struct zswap_tree *tree = zswap_trees[type];
760 struct zswap_entry *entry;
761 int refcount;
762
763 /* find */
764 spin_lock(&tree->lock);
765 entry = zswap_rb_search(&tree->rbroot, offset);
766 if (!entry) {
767 /* entry was written back */
768 spin_unlock(&tree->lock);
769 return;
770 }
771
772 /* remove from rbtree */
773 rb_erase(&entry->rbnode, &tree->rbroot);
774
775 /* drop the initial reference from entry creation */
776 refcount = zswap_entry_put(entry);
777
778 spin_unlock(&tree->lock);
779
780 if (refcount) {
781 /* writeback in progress, writeback will free */
782 return;
783 }
784
785 /* free */
786 zswap_free_entry(tree, entry);
787}
788
789/* frees all zswap entries for the given swap type */
790static void zswap_frontswap_invalidate_area(unsigned type)
791{
792 struct zswap_tree *tree = zswap_trees[type];
793 struct rb_node *node;
794 struct zswap_entry *entry;
795
796 if (!tree)
797 return;
798
799 /* walk the tree and free everything */
800 spin_lock(&tree->lock);
801 /*
802 * TODO: Even though this code should not be executed because
803 * the try_to_unuse() in swapoff should have emptied the tree,
804 * it is very wasteful to rebalance the tree after every
805 * removal when we are freeing the whole tree.
806 *
807 * If post-order traversal code is ever added to the rbtree
808 * implementation, it should be used here.
809 */
810 while ((node = rb_first(&tree->rbroot))) {
811 entry = rb_entry(node, struct zswap_entry, rbnode);
812 rb_erase(&entry->rbnode, &tree->rbroot);
813 zbud_free(tree->pool, entry->handle);
814 zswap_entry_cache_free(entry);
815 atomic_dec(&zswap_stored_pages);
816 }
817 tree->rbroot = RB_ROOT;
818 spin_unlock(&tree->lock);
819}
820
/* zbud callbacks: zswap_writeback_entry() is installed as the evict handler. */
static struct zbud_ops zswap_zbud_ops = {
	.evict = zswap_writeback_entry
};
824
825static void zswap_frontswap_init(unsigned type)
826{
827 struct zswap_tree *tree;
828
829 tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
830 if (!tree)
831 goto err;
832 tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
833 if (!tree->pool)
834 goto freetree;
835 tree->rbroot = RB_ROOT;
836 spin_lock_init(&tree->lock);
837 zswap_trees[type] = tree;
838 return;
839
840freetree:
841 kfree(tree);
842err:
843 pr_err("alloc failed, zswap disabled for swap type %d\n", type);
844}
845
/* frontswap backend operations, registered from init_zswap(). */
static struct frontswap_ops zswap_frontswap_ops = {
	.store = zswap_frontswap_store,
	.load = zswap_frontswap_load,
	.invalidate_page = zswap_frontswap_invalidate_page,
	.invalidate_area = zswap_frontswap_invalidate_area,
	.init = zswap_frontswap_init
};
853
854/*********************************
855* debugfs functions
856**********************************/
857#ifdef CONFIG_DEBUG_FS
858#include <linux/debugfs.h>
859
860static struct dentry *zswap_debugfs_root;
861
/*
 * Create the "zswap" debugfs directory and expose each statistics counter
 * in it as a read-only file.
 *
 * Returns 0 on success, -ENODEV if debugfs is not initialized, or -ENOMEM
 * if the directory cannot be created.  The results of the individual file
 * creations are not checked.
 */
static int __init zswap_debugfs_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
	if (!zswap_debugfs_root)
		return -ENOMEM;

	debugfs_create_u64("pool_limit_hit", S_IRUGO,
			zswap_debugfs_root, &zswap_pool_limit_hit);
	debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_reclaim_fail);
	debugfs_create_u64("reject_alloc_fail", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_alloc_fail);
	debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_kmemcache_fail);
	debugfs_create_u64("reject_compress_poor", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_compress_poor);
	debugfs_create_u64("written_back_pages", S_IRUGO,
			zswap_debugfs_root, &zswap_written_back_pages);
	debugfs_create_u64("duplicate_entry", S_IRUGO,
			zswap_debugfs_root, &zswap_duplicate_entry);
	debugfs_create_u64("pool_pages", S_IRUGO,
			zswap_debugfs_root, &zswap_pool_pages);
	/* stored_pages is an atomic_t, unlike the plain u64 counters above */
	debugfs_create_atomic_t("stored_pages", S_IRUGO,
			zswap_debugfs_root, &zswap_stored_pages);

	return 0;
}
892
/* Remove the zswap debugfs directory together with everything below it. */
static void __exit zswap_debugfs_exit(void)
{
	debugfs_remove_recursive(zswap_debugfs_root);
}
897#else
/* Stub used when CONFIG_DEBUG_FS is not set: nothing to create, report success. */
static int __init zswap_debugfs_init(void)
{
	return 0;
}

/* Stub used when CONFIG_DEBUG_FS is not set: nothing to remove. */
static void __exit zswap_debugfs_exit(void) { }
904#endif
905
906/*********************************
907* module init and exit
908**********************************/
/*
 * Bring zswap up: create the entry cache, select the compressor, set up
 * the per-cpu state, register with frontswap and create the debugfs files.
 *
 * Does nothing and returns 0 when zswap_enabled is false.  A debugfs
 * failure only produces a warning.  Any other failure unwinds the steps
 * already completed and returns -ENOMEM, regardless of the error code the
 * failing step reported.
 */
static int __init init_zswap(void)
{
	if (!zswap_enabled)
		return 0;

	pr_info("loading zswap\n");
	if (zswap_entry_cache_create()) {
		pr_err("entry cache creation failed\n");
		goto error;
	}
	if (zswap_comp_init()) {
		pr_err("compressor initialization failed\n");
		goto compfail;
	}
	if (zswap_cpu_init()) {
		pr_err("per-cpu initialization failed\n");
		goto pcpufail;
	}
	frontswap_register_ops(&zswap_frontswap_ops);
	if (zswap_debugfs_init())
		pr_warn("debugfs initialization failed\n");
	return 0;
	/* unwind in reverse order of setup */
pcpufail:
	zswap_comp_exit();
compfail:
	zswap_entry_cache_destory();
error:
	return -ENOMEM;
}
938/* must be late so crypto has time to come up */
939late_initcall(init_zswap);
940
941MODULE_LICENSE("GPL");
942MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
943MODULE_DESCRIPTION("Compressed cache for swap pages");