path: root/mm
author     Linus Torvalds <torvalds@linux-foundation.org>   2015-09-10 21:19:42 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2015-09-10 21:19:42 -0400
commit     33e247c7e58d335d70ecb84fd869091e2e4b8dcb (patch)
tree       e8561e1993dff03f8e56d10a5795fe9d379a3390 /mm
parent     d71fc239b6915a8b750e9a447311029ff45b6580 (diff)
parent     452e06af1f0149b01201f94264d452cd7a95db7a (diff)
Merge branch 'akpm' (patches from Andrew)
Merge third patch-bomb from Andrew Morton:

 - even more of the rest of MM
 - lib/ updates
 - checkpatch updates
 - small changes to a few scruffy filesystems
 - kmod fixes/cleanups
 - kexec updates
 - a dma-mapping cleanup series from hch

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (81 commits)
  dma-mapping: consolidate dma_set_mask
  dma-mapping: consolidate dma_supported
  dma-mapping: cosolidate dma_mapping_error
  dma-mapping: consolidate dma_{alloc,free}_noncoherent
  dma-mapping: consolidate dma_{alloc,free}_{attrs,coherent}
  mm: use vma_is_anonymous() in create_huge_pmd() and wp_huge_pmd()
  mm: make sure all file VMAs have ->vm_ops set
  mm, mpx: add "vm_flags_t vm_flags" arg to do_mmap_pgoff()
  mm: mark most vm_operations_struct const
  namei: fix warning while make xmldocs caused by namei.c
  ipc: convert invalid scenarios to use WARN_ON
  zlib_deflate/deftree: remove bi_reverse()
  lib/decompress_unlzma: Do a NULL check for pointer
  lib/decompressors: use real out buf size for gunzip with kernel
  fs/affs: make root lookup from blkdev logical size
  sysctl: fix int -> unsigned long assignments in INT_MIN case
  kexec: export KERNEL_IMAGE_SIZE to vmcoreinfo
  kexec: align crash_notes allocation to make it be inside one physical page
  kexec: remove unnecessary test in kimage_alloc_crash_control_pages()
  kexec: split kexec_load syscall from kexec core code
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |  12
-rw-r--r--  mm/Makefile           |   1
-rw-r--r--  mm/debug.c            |   4
-rw-r--r--  mm/huge_memory.c      |  12
-rw-r--r--  mm/hwpoison-inject.c  |   5
-rw-r--r--  mm/kmemleak.c         |  21
-rw-r--r--  mm/memcontrol.c       |  76
-rw-r--r--  mm/memory-failure.c   |  16
-rw-r--r--  mm/memory.c           |   4
-rw-r--r--  mm/migrate.c          |   6
-rw-r--r--  mm/mmap.c             |  18
-rw-r--r--  mm/mmu_notifier.c     |  17
-rw-r--r--  mm/nommu.c            |  19
-rw-r--r--  mm/page_ext.c         |   4
-rw-r--r--  mm/page_idle.c        | 232
-rw-r--r--  mm/rmap.c             |   6
-rw-r--r--  mm/swap.c             |   3
-rw-r--r--  mm/zpool.c            |  33
-rw-r--r--  mm/zswap.c            | 688
19 files changed, 934 insertions(+), 243 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 3a4070f5ab79..6413d027c0b2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -649,6 +649,18 @@ config DEFERRED_STRUCT_PAGE_INIT
649 processes running early in the lifetime of the systemm until kswapd 649 processes running early in the lifetime of the systemm until kswapd
650 finishes the initialisation. 650 finishes the initialisation.
651 651
652config IDLE_PAGE_TRACKING
653 bool "Enable idle page tracking"
654 depends on SYSFS && MMU
655 select PAGE_EXTENSION if !64BIT
656 help
657 This feature allows to estimate the amount of user pages that have
658 not been touched during a given period of time. This information can
659 be useful to tune memory cgroup limits and/or for job placement
660 within a compute cluster.
661
662 See Documentation/vm/idle_page_tracking.txt for more details.
663
652config ZONE_DEVICE 664config ZONE_DEVICE
653 bool "Device memory (pmem, etc...) hotplug support" if EXPERT 665 bool "Device memory (pmem, etc...) hotplug support" if EXPERT
654 default !ZONE_DMA 666 default !ZONE_DMA
diff --git a/mm/Makefile b/mm/Makefile
index b424d5e5b6ff..56f8eed73f1a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -79,3 +79,4 @@ obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
79obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o 79obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
80obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o 80obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
81obj-$(CONFIG_USERFAULTFD) += userfaultfd.o 81obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
82obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
diff --git a/mm/debug.c b/mm/debug.c
index 76089ddf99ea..6c1b3ea61bfd 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -48,6 +48,10 @@ static const struct trace_print_flags pageflag_names[] = {
48#ifdef CONFIG_TRANSPARENT_HUGEPAGE 48#ifdef CONFIG_TRANSPARENT_HUGEPAGE
49 {1UL << PG_compound_lock, "compound_lock" }, 49 {1UL << PG_compound_lock, "compound_lock" },
50#endif 50#endif
51#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
52 {1UL << PG_young, "young" },
53 {1UL << PG_idle, "idle" },
54#endif
51}; 55};
52 56
53static void dump_flags(unsigned long flags, 57static void dump_flags(unsigned long flags,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b16279cbd91d..4b06b8db9df2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -25,6 +25,7 @@
25#include <linux/migrate.h> 25#include <linux/migrate.h>
26#include <linux/hashtable.h> 26#include <linux/hashtable.h>
27#include <linux/userfaultfd_k.h> 27#include <linux/userfaultfd_k.h>
28#include <linux/page_idle.h>
28 29
29#include <asm/tlb.h> 30#include <asm/tlb.h>
30#include <asm/pgalloc.h> 31#include <asm/pgalloc.h>
@@ -1757,6 +1758,11 @@ static void __split_huge_page_refcount(struct page *page,
1757 /* clear PageTail before overwriting first_page */ 1758 /* clear PageTail before overwriting first_page */
1758 smp_wmb(); 1759 smp_wmb();
1759 1760
1761 if (page_is_young(page))
1762 set_page_young(page_tail);
1763 if (page_is_idle(page))
1764 set_page_idle(page_tail);
1765
1760 /* 1766 /*
1761 * __split_huge_page_splitting() already set the 1767 * __split_huge_page_splitting() already set the
1762 * splitting bit in all pmd that could map this 1768 * splitting bit in all pmd that could map this
@@ -2262,7 +2268,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2262 VM_BUG_ON_PAGE(PageLRU(page), page); 2268 VM_BUG_ON_PAGE(PageLRU(page), page);
2263 2269
2264 /* If there is no mapped pte young don't collapse the page */ 2270 /* If there is no mapped pte young don't collapse the page */
2265 if (pte_young(pteval) || PageReferenced(page) || 2271 if (pte_young(pteval) ||
2272 page_is_young(page) || PageReferenced(page) ||
2266 mmu_notifier_test_young(vma->vm_mm, address)) 2273 mmu_notifier_test_young(vma->vm_mm, address))
2267 referenced = true; 2274 referenced = true;
2268 } 2275 }
@@ -2693,7 +2700,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2693 */ 2700 */
2694 if (page_count(page) != 1 + !!PageSwapCache(page)) 2701 if (page_count(page) != 1 + !!PageSwapCache(page))
2695 goto out_unmap; 2702 goto out_unmap;
2696 if (pte_young(pteval) || PageReferenced(page) || 2703 if (pte_young(pteval) ||
2704 page_is_young(page) || PageReferenced(page) ||
2697 mmu_notifier_test_young(vma->vm_mm, address)) 2705 mmu_notifier_test_young(vma->vm_mm, address))
2698 referenced = true; 2706 referenced = true;
2699 } 2707 }
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index aeba0edd6e44..9d26fd9fefe4 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -45,12 +45,9 @@ static int hwpoison_inject(void *data, u64 val)
45 /* 45 /*
46 * do a racy check with elevated page count, to make sure PG_hwpoison 46 * do a racy check with elevated page count, to make sure PG_hwpoison
47 * will only be set for the targeted owner (or on a free page). 47 * will only be set for the targeted owner (or on a free page).
48 * We temporarily take page lock for try_get_mem_cgroup_from_page().
49 * memory_failure() will redo the check reliably inside page lock. 48 * memory_failure() will redo the check reliably inside page lock.
50 */ 49 */
51 lock_page(hpage);
52 err = hwpoison_filter(hpage); 50 err = hwpoison_filter(hpage);
53 unlock_page(hpage);
54 if (err) 51 if (err)
55 goto put_out; 52 goto put_out;
56 53
@@ -126,7 +123,7 @@ static int pfn_inject_init(void)
126 if (!dentry) 123 if (!dentry)
127 goto fail; 124 goto fail;
128 125
129#ifdef CONFIG_MEMCG_SWAP 126#ifdef CONFIG_MEMCG
130 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, 127 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
131 hwpoison_dir, &hwpoison_filter_memcg); 128 hwpoison_dir, &hwpoison_filter_memcg);
132 if (!dentry) 129 if (!dentry)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f532f6a37b55..77191eccdc6f 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -302,23 +302,14 @@ static void hex_dump_object(struct seq_file *seq,
302 struct kmemleak_object *object) 302 struct kmemleak_object *object)
303{ 303{
304 const u8 *ptr = (const u8 *)object->pointer; 304 const u8 *ptr = (const u8 *)object->pointer;
305 int i, len, remaining; 305 size_t len;
306 unsigned char linebuf[HEX_ROW_SIZE * 5];
307 306
308 /* limit the number of lines to HEX_MAX_LINES */ 307 /* limit the number of lines to HEX_MAX_LINES */
309 remaining = len = 308 len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
310 min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE)); 309
311 310 seq_printf(seq, " hex dump (first %zu bytes):\n", len);
312 seq_printf(seq, " hex dump (first %d bytes):\n", len); 311 seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE,
313 for (i = 0; i < len; i += HEX_ROW_SIZE) { 312 HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
314 int linelen = min(remaining, HEX_ROW_SIZE);
315
316 remaining -= HEX_ROW_SIZE;
317 hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
318 HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
319 HEX_ASCII);
320 seq_printf(seq, " %s\n", linebuf);
321 }
322} 313}
323 314
324/* 315/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1742a2db89c7..6ddaeba34e09 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -441,6 +441,34 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
441 return &memcg->css; 441 return &memcg->css;
442} 442}
443 443
444/**
445 * page_cgroup_ino - return inode number of the memcg a page is charged to
446 * @page: the page
447 *
448 * Look up the closest online ancestor of the memory cgroup @page is charged to
449 * and return its inode number or 0 if @page is not charged to any cgroup. It
450 * is safe to call this function without holding a reference to @page.
451 *
452 * Note, this function is inherently racy, because there is nothing to prevent
453 * the cgroup inode from getting torn down and potentially reallocated a moment
454 * after page_cgroup_ino() returns, so it only should be used by callers that
455 * do not care (such as procfs interfaces).
456 */
457ino_t page_cgroup_ino(struct page *page)
458{
459 struct mem_cgroup *memcg;
460 unsigned long ino = 0;
461
462 rcu_read_lock();
463 memcg = READ_ONCE(page->mem_cgroup);
464 while (memcg && !(memcg->css.flags & CSS_ONLINE))
465 memcg = parent_mem_cgroup(memcg);
466 if (memcg)
467 ino = cgroup_ino(memcg->css.cgroup);
468 rcu_read_unlock();
469 return ino;
470}
471
444static struct mem_cgroup_per_zone * 472static struct mem_cgroup_per_zone *
445mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) 473mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
446{ 474{
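As the kernel-doc above stresses, page_cgroup_ino() is only meant for racy, best-effort consumers such as procfs-style interfaces. A minimal sketch of such a consumer follows; the helper name report_page_memcg is invented for illustration and is not part of this patch:

    /* Hypothetical debug helper, not from this series: report which memcg a
     * page is charged to.  page_cgroup_ino() returns 0 for an uncharged page
     * and may race with cgroup teardown, which is acceptable for a purely
     * informational printout.
     */
    static void report_page_memcg(struct page *page)
    {
    	ino_t ino = page_cgroup_ino(page);

    	if (ino)
    		pr_info("pfn %lx charged to memcg inode %lu\n",
    			page_to_pfn(page), (unsigned long)ino);
    	else
    		pr_info("pfn %lx not charged to any cgroup\n",
    			page_to_pfn(page));
    }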
@@ -2071,40 +2099,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2071 css_put_many(&memcg->css, nr_pages); 2099 css_put_many(&memcg->css, nr_pages);
2072} 2100}
2073 2101
2074/*
2075 * try_get_mem_cgroup_from_page - look up page's memcg association
2076 * @page: the page
2077 *
2078 * Look up, get a css reference, and return the memcg that owns @page.
2079 *
2080 * The page must be locked to prevent racing with swap-in and page
2081 * cache charges. If coming from an unlocked page table, the caller
2082 * must ensure the page is on the LRU or this can race with charging.
2083 */
2084struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2085{
2086 struct mem_cgroup *memcg;
2087 unsigned short id;
2088 swp_entry_t ent;
2089
2090 VM_BUG_ON_PAGE(!PageLocked(page), page);
2091
2092 memcg = page->mem_cgroup;
2093 if (memcg) {
2094 if (!css_tryget_online(&memcg->css))
2095 memcg = NULL;
2096 } else if (PageSwapCache(page)) {
2097 ent.val = page_private(page);
2098 id = lookup_swap_cgroup_id(ent);
2099 rcu_read_lock();
2100 memcg = mem_cgroup_from_id(id);
2101 if (memcg && !css_tryget_online(&memcg->css))
2102 memcg = NULL;
2103 rcu_read_unlock();
2104 }
2105 return memcg;
2106}
2107
2108static void lock_page_lru(struct page *page, int *isolated) 2102static void lock_page_lru(struct page *page, int *isolated)
2109{ 2103{
2110 struct zone *zone = page_zone(page); 2104 struct zone *zone = page_zone(page);
@@ -5301,8 +5295,20 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5301 * the page lock, which serializes swap cache removal, which 5295 * the page lock, which serializes swap cache removal, which
5302 * in turn serializes uncharging. 5296 * in turn serializes uncharging.
5303 */ 5297 */
5298 VM_BUG_ON_PAGE(!PageLocked(page), page);
5304 if (page->mem_cgroup) 5299 if (page->mem_cgroup)
5305 goto out; 5300 goto out;
5301
5302 if (do_swap_account) {
5303 swp_entry_t ent = { .val = page_private(page), };
5304 unsigned short id = lookup_swap_cgroup_id(ent);
5305
5306 rcu_read_lock();
5307 memcg = mem_cgroup_from_id(id);
5308 if (memcg && !css_tryget_online(&memcg->css))
5309 memcg = NULL;
5310 rcu_read_unlock();
5311 }
5306 } 5312 }
5307 5313
5308 if (PageTransHuge(page)) { 5314 if (PageTransHuge(page)) {
@@ -5310,8 +5316,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5310 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5316 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
5311 } 5317 }
5312 5318
5313 if (do_swap_account && PageSwapCache(page))
5314 memcg = try_get_mem_cgroup_from_page(page);
5315 if (!memcg) 5319 if (!memcg)
5316 memcg = get_mem_cgroup_from_mm(mm); 5320 memcg = get_mem_cgroup_from_mm(mm);
5317 5321
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index eeda6485e76c..95882692e747 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -130,27 +130,15 @@ static int hwpoison_filter_flags(struct page *p)
130 * can only guarantee that the page either belongs to the memcg tasks, or is 130 * can only guarantee that the page either belongs to the memcg tasks, or is
131 * a freed page. 131 * a freed page.
132 */ 132 */
133#ifdef CONFIG_MEMCG_SWAP 133#ifdef CONFIG_MEMCG
134u64 hwpoison_filter_memcg; 134u64 hwpoison_filter_memcg;
135EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); 135EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
136static int hwpoison_filter_task(struct page *p) 136static int hwpoison_filter_task(struct page *p)
137{ 137{
138 struct mem_cgroup *mem;
139 struct cgroup_subsys_state *css;
140 unsigned long ino;
141
142 if (!hwpoison_filter_memcg) 138 if (!hwpoison_filter_memcg)
143 return 0; 139 return 0;
144 140
145 mem = try_get_mem_cgroup_from_page(p); 141 if (page_cgroup_ino(p) != hwpoison_filter_memcg)
146 if (!mem)
147 return -EINVAL;
148
149 css = &mem->css;
150 ino = cgroup_ino(css->cgroup);
151 css_put(css);
152
153 if (ino != hwpoison_filter_memcg)
154 return -EINVAL; 142 return -EINVAL;
155 143
156 return 0; 144 return 0;
diff --git a/mm/memory.c b/mm/memory.c
index 6cd0b2160401..9cb27470fee9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3233,7 +3233,7 @@ out:
3233static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, 3233static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
3234 unsigned long address, pmd_t *pmd, unsigned int flags) 3234 unsigned long address, pmd_t *pmd, unsigned int flags)
3235{ 3235{
3236 if (!vma->vm_ops) 3236 if (vma_is_anonymous(vma))
3237 return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); 3237 return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
3238 if (vma->vm_ops->pmd_fault) 3238 if (vma->vm_ops->pmd_fault)
3239 return vma->vm_ops->pmd_fault(vma, address, pmd, flags); 3239 return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
@@ -3244,7 +3244,7 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
3244 unsigned long address, pmd_t *pmd, pmd_t orig_pmd, 3244 unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
3245 unsigned int flags) 3245 unsigned int flags)
3246{ 3246{
3247 if (!vma->vm_ops) 3247 if (vma_is_anonymous(vma))
3248 return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); 3248 return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
3249 if (vma->vm_ops->pmd_fault) 3249 if (vma->vm_ops->pmd_fault)
3250 return vma->vm_ops->pmd_fault(vma, address, pmd, flags); 3250 return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
diff --git a/mm/migrate.c b/mm/migrate.c
index 02ce25df16c2..c3cb566af3e2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -37,6 +37,7 @@
37#include <linux/gfp.h> 37#include <linux/gfp.h>
38#include <linux/balloon_compaction.h> 38#include <linux/balloon_compaction.h>
39#include <linux/mmu_notifier.h> 39#include <linux/mmu_notifier.h>
40#include <linux/page_idle.h>
40 41
41#include <asm/tlbflush.h> 42#include <asm/tlbflush.h>
42 43
@@ -524,6 +525,11 @@ void migrate_page_copy(struct page *newpage, struct page *page)
524 __set_page_dirty_nobuffers(newpage); 525 __set_page_dirty_nobuffers(newpage);
525 } 526 }
526 527
528 if (page_is_young(page))
529 set_page_young(newpage);
530 if (page_is_idle(page))
531 set_page_idle(newpage);
532
527 /* 533 /*
528 * Copy NUMA information to the new page, to prevent over-eager 534 * Copy NUMA information to the new page, to prevent over-eager
529 * future migrations of this same page. 535 * future migrations of this same page.
diff --git a/mm/mmap.c b/mm/mmap.c
index b6be3249f0a9..971dd2cb77d2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,6 +612,8 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
612void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 612void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
613 struct rb_node **rb_link, struct rb_node *rb_parent) 613 struct rb_node **rb_link, struct rb_node *rb_parent)
614{ 614{
615 WARN_ONCE(vma->vm_file && !vma->vm_ops, "missing vma->vm_ops");
616
615 /* Update tracking information for the gap following the new vma. */ 617 /* Update tracking information for the gap following the new vma. */
616 if (vma->vm_next) 618 if (vma->vm_next)
617 vma_gap_update(vma->vm_next); 619 vma_gap_update(vma->vm_next);
@@ -1260,14 +1262,12 @@ static inline int mlock_future_check(struct mm_struct *mm,
1260/* 1262/*
1261 * The caller must hold down_write(&current->mm->mmap_sem). 1263 * The caller must hold down_write(&current->mm->mmap_sem).
1262 */ 1264 */
1263 1265unsigned long do_mmap(struct file *file, unsigned long addr,
1264unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1265 unsigned long len, unsigned long prot, 1266 unsigned long len, unsigned long prot,
1266 unsigned long flags, unsigned long pgoff, 1267 unsigned long flags, vm_flags_t vm_flags,
1267 unsigned long *populate) 1268 unsigned long pgoff, unsigned long *populate)
1268{ 1269{
1269 struct mm_struct *mm = current->mm; 1270 struct mm_struct *mm = current->mm;
1270 vm_flags_t vm_flags;
1271 1271
1272 *populate = 0; 1272 *populate = 0;
1273 1273
@@ -1311,7 +1311,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1311 * to. we assume access permissions have been handled by the open 1311 * to. we assume access permissions have been handled by the open
1312 * of the memory object, so we don't do any here. 1312 * of the memory object, so we don't do any here.
1313 */ 1313 */
1314 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | 1314 vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
1315 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 1315 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1316 1316
1317 if (flags & MAP_LOCKED) 1317 if (flags & MAP_LOCKED)
@@ -1638,6 +1638,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1638 */ 1638 */
1639 WARN_ON_ONCE(addr != vma->vm_start); 1639 WARN_ON_ONCE(addr != vma->vm_start);
1640 1640
1641 /* All file mapping must have ->vm_ops set */
1642 if (!vma->vm_ops) {
1643 static const struct vm_operations_struct dummy_ops = {};
1644 vma->vm_ops = &dummy_ops;
1645 }
1646
1641 addr = vma->vm_start; 1647 addr = vma->vm_start;
1642 vm_flags = vma->vm_flags; 1648 vm_flags = vma->vm_flags;
1643 } else if (vm_flags & VM_SHARED) { 1649 } else if (vm_flags & VM_SHARED) {
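The change from "vm_flags =" to "vm_flags |=" works because callers now pass an initial vm_flags value into do_mmap(). The compatibility wrapper lives in include/linux/mm.h and is therefore outside this mm/-limited diff; it presumably looks like the following sketch, with legacy do_mmap_pgoff() callers passing 0 for the new argument:

    /* Sketch of the header-side wrapper (include/linux/mm.h), not shown in
     * this mm/ diff: existing do_mmap_pgoff() users are unaffected and simply
     * pass 0 for the new vm_flags argument.
     */
    static inline unsigned long
    do_mmap_pgoff(struct file *file, unsigned long addr,
    	      unsigned long len, unsigned long prot, unsigned long flags,
    	      unsigned long pgoff, unsigned long *populate)
    {
    	return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate);
    }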
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 3b9b3d0741b2..5fbdd367bbed 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -123,6 +123,23 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
123 return young; 123 return young;
124} 124}
125 125
126int __mmu_notifier_clear_young(struct mm_struct *mm,
127 unsigned long start,
128 unsigned long end)
129{
130 struct mmu_notifier *mn;
131 int young = 0, id;
132
133 id = srcu_read_lock(&srcu);
134 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
135 if (mn->ops->clear_young)
136 young |= mn->ops->clear_young(mn, mm, start, end);
137 }
138 srcu_read_unlock(&srcu, id);
139
140 return young;
141}
142
126int __mmu_notifier_test_young(struct mm_struct *mm, 143int __mmu_notifier_test_young(struct mm_struct *mm,
127 unsigned long address) 144 unsigned long address)
128{ 145{
diff --git a/mm/nommu.c b/mm/nommu.c
index 1cc0709fcaa5..ab14a2014dea 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1233,18 +1233,19 @@ enomem:
1233/* 1233/*
1234 * handle mapping creation for uClinux 1234 * handle mapping creation for uClinux
1235 */ 1235 */
1236unsigned long do_mmap_pgoff(struct file *file, 1236unsigned long do_mmap(struct file *file,
1237 unsigned long addr, 1237 unsigned long addr,
1238 unsigned long len, 1238 unsigned long len,
1239 unsigned long prot, 1239 unsigned long prot,
1240 unsigned long flags, 1240 unsigned long flags,
1241 unsigned long pgoff, 1241 vm_flags_t vm_flags,
1242 unsigned long *populate) 1242 unsigned long pgoff,
1243 unsigned long *populate)
1243{ 1244{
1244 struct vm_area_struct *vma; 1245 struct vm_area_struct *vma;
1245 struct vm_region *region; 1246 struct vm_region *region;
1246 struct rb_node *rb; 1247 struct rb_node *rb;
1247 unsigned long capabilities, vm_flags, result; 1248 unsigned long capabilities, result;
1248 int ret; 1249 int ret;
1249 1250
1250 *populate = 0; 1251 *populate = 0;
@@ -1262,7 +1263,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1262 1263
1263 /* we've determined that we can make the mapping, now translate what we 1264 /* we've determined that we can make the mapping, now translate what we
1264 * now know into VMA flags */ 1265 * now know into VMA flags */
1265 vm_flags = determine_vm_flags(file, prot, flags, capabilities); 1266 vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
1266 1267
1267 /* we're going to need to record the mapping */ 1268 /* we're going to need to record the mapping */
1268 region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); 1269 region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
diff --git a/mm/page_ext.c b/mm/page_ext.c
index d86fd2f5353f..292ca7b8debd 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -6,6 +6,7 @@
6#include <linux/vmalloc.h> 6#include <linux/vmalloc.h>
7#include <linux/kmemleak.h> 7#include <linux/kmemleak.h>
8#include <linux/page_owner.h> 8#include <linux/page_owner.h>
9#include <linux/page_idle.h>
9 10
10/* 11/*
11 * struct page extension 12 * struct page extension
@@ -59,6 +60,9 @@ static struct page_ext_operations *page_ext_ops[] = {
59#ifdef CONFIG_PAGE_OWNER 60#ifdef CONFIG_PAGE_OWNER
60 &page_owner_ops, 61 &page_owner_ops,
61#endif 62#endif
63#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
64 &page_idle_ops,
65#endif
62}; 66};
63 67
64static unsigned long total_usage; 68static unsigned long total_usage;
diff --git a/mm/page_idle.c b/mm/page_idle.c
new file mode 100644
index 000000000000..d5dd79041484
--- /dev/null
+++ b/mm/page_idle.c
@@ -0,0 +1,232 @@
1#include <linux/init.h>
2#include <linux/bootmem.h>
3#include <linux/fs.h>
4#include <linux/sysfs.h>
5#include <linux/kobject.h>
6#include <linux/mm.h>
7#include <linux/mmzone.h>
8#include <linux/pagemap.h>
9#include <linux/rmap.h>
10#include <linux/mmu_notifier.h>
11#include <linux/page_ext.h>
12#include <linux/page_idle.h>
13
14#define BITMAP_CHUNK_SIZE sizeof(u64)
15#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
16
17/*
18 * Idle page tracking only considers user memory pages, for other types of
19 * pages the idle flag is always unset and an attempt to set it is silently
20 * ignored.
21 *
22 * We treat a page as a user memory page if it is on an LRU list, because it is
23 * always safe to pass such a page to rmap_walk(), which is essential for idle
24 * page tracking. With such an indicator of user pages we can skip isolated
25 * pages, but since there are not usually many of them, it will hardly affect
26 * the overall result.
27 *
28 * This function tries to get a user memory page by pfn as described above.
29 */
30static struct page *page_idle_get_page(unsigned long pfn)
31{
32 struct page *page;
33 struct zone *zone;
34
35 if (!pfn_valid(pfn))
36 return NULL;
37
38 page = pfn_to_page(pfn);
39 if (!page || !PageLRU(page) ||
40 !get_page_unless_zero(page))
41 return NULL;
42
43 zone = page_zone(page);
44 spin_lock_irq(&zone->lru_lock);
45 if (unlikely(!PageLRU(page))) {
46 put_page(page);
47 page = NULL;
48 }
49 spin_unlock_irq(&zone->lru_lock);
50 return page;
51}
52
53static int page_idle_clear_pte_refs_one(struct page *page,
54 struct vm_area_struct *vma,
55 unsigned long addr, void *arg)
56{
57 struct mm_struct *mm = vma->vm_mm;
58 spinlock_t *ptl;
59 pmd_t *pmd;
60 pte_t *pte;
61 bool referenced = false;
62
63 if (unlikely(PageTransHuge(page))) {
64 pmd = page_check_address_pmd(page, mm, addr,
65 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
66 if (pmd) {
67 referenced = pmdp_clear_young_notify(vma, addr, pmd);
68 spin_unlock(ptl);
69 }
70 } else {
71 pte = page_check_address(page, mm, addr, &ptl, 0);
72 if (pte) {
73 referenced = ptep_clear_young_notify(vma, addr, pte);
74 pte_unmap_unlock(pte, ptl);
75 }
76 }
77 if (referenced) {
78 clear_page_idle(page);
79 /*
80 * We cleared the referenced bit in a mapping to this page. To
81 * avoid interference with page reclaim, mark it young so that
82 * page_referenced() will return > 0.
83 */
84 set_page_young(page);
85 }
86 return SWAP_AGAIN;
87}
88
89static void page_idle_clear_pte_refs(struct page *page)
90{
91 /*
92 * Since rwc.arg is unused, rwc is effectively immutable, so we
93 * can make it static const to save some cycles and stack.
94 */
95 static const struct rmap_walk_control rwc = {
96 .rmap_one = page_idle_clear_pte_refs_one,
97 .anon_lock = page_lock_anon_vma_read,
98 };
99 bool need_lock;
100
101 if (!page_mapped(page) ||
102 !page_rmapping(page))
103 return;
104
105 need_lock = !PageAnon(page) || PageKsm(page);
106 if (need_lock && !trylock_page(page))
107 return;
108
109 rmap_walk(page, (struct rmap_walk_control *)&rwc);
110
111 if (need_lock)
112 unlock_page(page);
113}
114
115static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
116 struct bin_attribute *attr, char *buf,
117 loff_t pos, size_t count)
118{
119 u64 *out = (u64 *)buf;
120 struct page *page;
121 unsigned long pfn, end_pfn;
122 int bit;
123
124 if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
125 return -EINVAL;
126
127 pfn = pos * BITS_PER_BYTE;
128 if (pfn >= max_pfn)
129 return 0;
130
131 end_pfn = pfn + count * BITS_PER_BYTE;
132 if (end_pfn > max_pfn)
133 end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
134
135 for (; pfn < end_pfn; pfn++) {
136 bit = pfn % BITMAP_CHUNK_BITS;
137 if (!bit)
138 *out = 0ULL;
139 page = page_idle_get_page(pfn);
140 if (page) {
141 if (page_is_idle(page)) {
142 /*
143 * The page might have been referenced via a
144 * pte, in which case it is not idle. Clear
145 * refs and recheck.
146 */
147 page_idle_clear_pte_refs(page);
148 if (page_is_idle(page))
149 *out |= 1ULL << bit;
150 }
151 put_page(page);
152 }
153 if (bit == BITMAP_CHUNK_BITS - 1)
154 out++;
155 cond_resched();
156 }
157 return (char *)out - buf;
158}
159
160static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
161 struct bin_attribute *attr, char *buf,
162 loff_t pos, size_t count)
163{
164 const u64 *in = (u64 *)buf;
165 struct page *page;
166 unsigned long pfn, end_pfn;
167 int bit;
168
169 if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
170 return -EINVAL;
171
172 pfn = pos * BITS_PER_BYTE;
173 if (pfn >= max_pfn)
174 return -ENXIO;
175
176 end_pfn = pfn + count * BITS_PER_BYTE;
177 if (end_pfn > max_pfn)
178 end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
179
180 for (; pfn < end_pfn; pfn++) {
181 bit = pfn % BITMAP_CHUNK_BITS;
182 if ((*in >> bit) & 1) {
183 page = page_idle_get_page(pfn);
184 if (page) {
185 page_idle_clear_pte_refs(page);
186 set_page_idle(page);
187 put_page(page);
188 }
189 }
190 if (bit == BITMAP_CHUNK_BITS - 1)
191 in++;
192 cond_resched();
193 }
194 return (char *)in - buf;
195}
196
197static struct bin_attribute page_idle_bitmap_attr =
198 __BIN_ATTR(bitmap, S_IRUSR | S_IWUSR,
199 page_idle_bitmap_read, page_idle_bitmap_write, 0);
200
201static struct bin_attribute *page_idle_bin_attrs[] = {
202 &page_idle_bitmap_attr,
203 NULL,
204};
205
206static struct attribute_group page_idle_attr_group = {
207 .bin_attrs = page_idle_bin_attrs,
208 .name = "page_idle",
209};
210
211#ifndef CONFIG_64BIT
212static bool need_page_idle(void)
213{
214 return true;
215}
216struct page_ext_operations page_idle_ops = {
217 .need = need_page_idle,
218};
219#endif
220
221static int __init page_idle_init(void)
222{
223 int err;
224
225 err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
226 if (err) {
227 pr_err("page_idle: register sysfs failed\n");
228 return err;
229 }
230 return 0;
231}
232subsys_initcall(page_idle_init);
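The bin attribute above is registered in the "page_idle" group under mm_kobj, so the interface appears as /sys/kernel/mm/page_idle/bitmap (one bit per pfn, accessed in 8-byte chunks, root only given the S_IRUSR | S_IWUSR mode). A rough userspace sketch of the intended usage, mark pages idle, let the workload run, then re-read to see which bits survived; see Documentation/vm/idle_page_tracking.txt for the authoritative description:

    /*
     * Userspace sketch (assumes root and CONFIG_IDLE_PAGE_TRACKING=y):
     * mark the first 64 * NCHUNKS pfns idle, sleep, then count how many
     * bits are still set.  Offsets and sizes must be multiples of 8 bytes,
     * as enforced by page_idle_bitmap_read()/write() above; bits for
     * non-LRU pfns are silently ignored on write and read back as 0.
     */
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    #define NCHUNKS 1024	/* covers pfns 0 .. 64*NCHUNKS-1 */

    int main(void)
    {
    	uint64_t buf[NCHUNKS];
    	long idle = 0;
    	int i, fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);

    	if (fd < 0)
    		return 1;
    	for (i = 0; i < NCHUNKS; i++)
    		buf[i] = ~0ULL;			/* mark pages idle */
    	if (pwrite(fd, buf, sizeof(buf), 0) < 0)
    		return 1;
    	sleep(60);				/* let the workload run */
    	if (pread(fd, buf, sizeof(buf), 0) < 0)
    		return 1;
    	for (i = 0; i < NCHUNKS; i++)
    		idle += __builtin_popcountll(buf[i]);
    	printf("%ld pfns still flagged idle out of %d checked\n",
    	       idle, NCHUNKS * 64);
    	close(fd);
    	return 0;
    }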
diff --git a/mm/rmap.c b/mm/rmap.c
index 0db38e7d0a72..f5b5c1f3dcd7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -59,6 +59,7 @@
59#include <linux/migrate.h> 59#include <linux/migrate.h>
60#include <linux/hugetlb.h> 60#include <linux/hugetlb.h>
61#include <linux/backing-dev.h> 61#include <linux/backing-dev.h>
62#include <linux/page_idle.h>
62 63
63#include <asm/tlbflush.h> 64#include <asm/tlbflush.h>
64 65
@@ -886,6 +887,11 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
886 pte_unmap_unlock(pte, ptl); 887 pte_unmap_unlock(pte, ptl);
887 } 888 }
888 889
890 if (referenced)
891 clear_page_idle(page);
892 if (test_and_clear_page_young(page))
893 referenced++;
894
889 if (referenced) { 895 if (referenced) {
890 pra->referenced++; 896 pra->referenced++;
891 pra->vm_flags |= vma->vm_flags; 897 pra->vm_flags |= vma->vm_flags;
diff --git a/mm/swap.c b/mm/swap.c
index a3a0a2f1f7c3..983f692a47fd 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -32,6 +32,7 @@
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/uio.h> 33#include <linux/uio.h>
34#include <linux/hugetlb.h> 34#include <linux/hugetlb.h>
35#include <linux/page_idle.h>
35 36
36#include "internal.h" 37#include "internal.h"
37 38
@@ -622,6 +623,8 @@ void mark_page_accessed(struct page *page)
622 } else if (!PageReferenced(page)) { 623 } else if (!PageReferenced(page)) {
623 SetPageReferenced(page); 624 SetPageReferenced(page);
624 } 625 }
626 if (page_is_idle(page))
627 clear_page_idle(page);
625} 628}
626EXPORT_SYMBOL(mark_page_accessed); 629EXPORT_SYMBOL(mark_page_accessed);
627 630
diff --git a/mm/zpool.c b/mm/zpool.c
index 68d2dd8ed2d8..8f670d3e8706 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -100,6 +100,39 @@ static void zpool_put_driver(struct zpool_driver *driver)
100} 100}
101 101
102/** 102/**
103 * zpool_has_pool() - Check if the pool driver is available
104 * @type The type of the zpool to check (e.g. zbud, zsmalloc)
105 *
106 * This checks if the @type pool driver is available. This will try to load
107 * the requested module, if needed, but there is no guarantee the module will
108 * still be loaded and available immediately after calling. If this returns
109 * true, the caller should assume the pool is available, but must be prepared
110 * to handle the @zpool_create_pool() returning failure. However if this
111 * returns false, the caller should assume the requested pool type is not
112 * available; either the requested pool type module does not exist, or could
113 * not be loaded, and calling @zpool_create_pool() with the pool type will
114 * fail.
115 *
116 * Returns: true if @type pool is available, false if not
117 */
118bool zpool_has_pool(char *type)
119{
120 struct zpool_driver *driver = zpool_get_driver(type);
121
122 if (!driver) {
123 request_module("zpool-%s", type);
124 driver = zpool_get_driver(type);
125 }
126
127 if (!driver)
128 return false;
129
130 zpool_put_driver(driver);
131 return true;
132}
133EXPORT_SYMBOL(zpool_has_pool);
134
135/**
103 * zpool_create_pool() - Create a new zpool 136 * zpool_create_pool() - Create a new zpool
104 * @type The type of the zpool to create (e.g. zbud, zsmalloc) 137 * @type The type of the zpool to create (e.g. zbud, zsmalloc)
105 * @name The name of the zpool (e.g. zram0, zswap) 138 * @name The name of the zpool (e.g. zram0, zswap)
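zpool_has_pool() lets a caller validate a pool type (and trigger module loading) before committing to it; zswap's __zswap_pool_create_fallback() further down is the in-tree user. A minimal sketch of the intended call pattern, the "mydev" name is made up for illustration and the NULL ops means no evict callback:

    /* Sketch: probe the zpool type first, then create the pool, still
     * handling creation failure because the driver module could have been
     * unloaded between the two calls (as the kernel-doc above warns).
     */
    static struct zpool *mydev_create_zpool(char *type)
    {
    	struct zpool *zpool;

    	if (!zpool_has_pool(type)) {
    		pr_err("zpool type %s not available\n", type);
    		return NULL;
    	}

    	zpool = zpool_create_pool(type, "mydev", GFP_KERNEL, NULL);
    	if (!zpool)
    		pr_err("failed to create %s zpool\n", type);

    	return zpool;
    }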
diff --git a/mm/zswap.c b/mm/zswap.c
index 48a1d081e2a5..4043df7c672f 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -80,85 +80,54 @@ static u64 zswap_duplicate_entry;
80static bool zswap_enabled; 80static bool zswap_enabled;
81module_param_named(enabled, zswap_enabled, bool, 0644); 81module_param_named(enabled, zswap_enabled, bool, 0644);
82 82
83/* Compressor to be used by zswap (fixed at boot for now) */ 83/* Crypto compressor to use */
84#define ZSWAP_COMPRESSOR_DEFAULT "lzo" 84#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
85static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; 85static char zswap_compressor[CRYPTO_MAX_ALG_NAME] = ZSWAP_COMPRESSOR_DEFAULT;
86module_param_named(compressor, zswap_compressor, charp, 0444); 86static struct kparam_string zswap_compressor_kparam = {
87 87 .string = zswap_compressor,
88/* The maximum percentage of memory that the compressed pool can occupy */ 88 .maxlen = sizeof(zswap_compressor),
89static unsigned int zswap_max_pool_percent = 20; 89};
90module_param_named(max_pool_percent, 90static int zswap_compressor_param_set(const char *,
91 zswap_max_pool_percent, uint, 0644); 91 const struct kernel_param *);
92static struct kernel_param_ops zswap_compressor_param_ops = {
93 .set = zswap_compressor_param_set,
94 .get = param_get_string,
95};
96module_param_cb(compressor, &zswap_compressor_param_ops,
97 &zswap_compressor_kparam, 0644);
92 98
93/* Compressed storage to use */ 99/* Compressed storage zpool to use */
94#define ZSWAP_ZPOOL_DEFAULT "zbud" 100#define ZSWAP_ZPOOL_DEFAULT "zbud"
95static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; 101static char zswap_zpool_type[32 /* arbitrary */] = ZSWAP_ZPOOL_DEFAULT;
96module_param_named(zpool, zswap_zpool_type, charp, 0444); 102static struct kparam_string zswap_zpool_kparam = {
103 .string = zswap_zpool_type,
104 .maxlen = sizeof(zswap_zpool_type),
105};
106static int zswap_zpool_param_set(const char *, const struct kernel_param *);
107static struct kernel_param_ops zswap_zpool_param_ops = {
108 .set = zswap_zpool_param_set,
109 .get = param_get_string,
110};
111module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_kparam, 0644);
97 112
98/* zpool is shared by all of zswap backend */ 113/* The maximum percentage of memory that the compressed pool can occupy */
99static struct zpool *zswap_pool; 114static unsigned int zswap_max_pool_percent = 20;
115module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
100 116
101/********************************* 117/*********************************
102* compression functions 118* data structures
103**********************************/ 119**********************************/
104/* per-cpu compression transforms */
105static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
106 120
107enum comp_op { 121struct zswap_pool {
108 ZSWAP_COMPOP_COMPRESS, 122 struct zpool *zpool;
109 ZSWAP_COMPOP_DECOMPRESS 123 struct crypto_comp * __percpu *tfm;
124 struct kref kref;
125 struct list_head list;
126 struct rcu_head rcu_head;
127 struct notifier_block notifier;
128 char tfm_name[CRYPTO_MAX_ALG_NAME];
110}; 129};
111 130
112static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
113 u8 *dst, unsigned int *dlen)
114{
115 struct crypto_comp *tfm;
116 int ret;
117
118 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
119 switch (op) {
120 case ZSWAP_COMPOP_COMPRESS:
121 ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
122 break;
123 case ZSWAP_COMPOP_DECOMPRESS:
124 ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
125 break;
126 default:
127 ret = -EINVAL;
128 }
129
130 put_cpu();
131 return ret;
132}
133
134static int __init zswap_comp_init(void)
135{
136 if (!crypto_has_comp(zswap_compressor, 0, 0)) {
137 pr_info("%s compressor not available\n", zswap_compressor);
138 /* fall back to default compressor */
139 zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
140 if (!crypto_has_comp(zswap_compressor, 0, 0))
141 /* can't even load the default compressor */
142 return -ENODEV;
143 }
144 pr_info("using %s compressor\n", zswap_compressor);
145
146 /* alloc percpu transforms */
147 zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
148 if (!zswap_comp_pcpu_tfms)
149 return -ENOMEM;
150 return 0;
151}
152
153static void __init zswap_comp_exit(void)
154{
155 /* free percpu transforms */
156 free_percpu(zswap_comp_pcpu_tfms);
157}
158
159/*********************************
160* data structures
161**********************************/
162/* 131/*
163 * struct zswap_entry 132 * struct zswap_entry
164 * 133 *
@@ -166,22 +135,24 @@ static void __init zswap_comp_exit(void)
166 * page within zswap. 135 * page within zswap.
167 * 136 *
168 * rbnode - links the entry into red-black tree for the appropriate swap type 137 * rbnode - links the entry into red-black tree for the appropriate swap type
138 * offset - the swap offset for the entry. Index into the red-black tree.
169 * refcount - the number of outstanding reference to the entry. This is needed 139 * refcount - the number of outstanding reference to the entry. This is needed
170 * to protect against premature freeing of the entry by code 140 * to protect against premature freeing of the entry by code
171 * concurrent calls to load, invalidate, and writeback. The lock 141 * concurrent calls to load, invalidate, and writeback. The lock
172 * for the zswap_tree structure that contains the entry must 142 * for the zswap_tree structure that contains the entry must
173 * be held while changing the refcount. Since the lock must 143 * be held while changing the refcount. Since the lock must
174 * be held, there is no reason to also make refcount atomic. 144 * be held, there is no reason to also make refcount atomic.
175 * offset - the swap offset for the entry. Index into the red-black tree.
176 * handle - zpool allocation handle that stores the compressed page data
177 * length - the length in bytes of the compressed page data. Needed during 145 * length - the length in bytes of the compressed page data. Needed during
178 * decompression 146 * decompression
147 * pool - the zswap_pool the entry's data is in
148 * handle - zpool allocation handle that stores the compressed page data
179 */ 149 */
180struct zswap_entry { 150struct zswap_entry {
181 struct rb_node rbnode; 151 struct rb_node rbnode;
182 pgoff_t offset; 152 pgoff_t offset;
183 int refcount; 153 int refcount;
184 unsigned int length; 154 unsigned int length;
155 struct zswap_pool *pool;
185 unsigned long handle; 156 unsigned long handle;
186}; 157};
187 158
@@ -201,6 +172,51 @@ struct zswap_tree {
201 172
202static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; 173static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
203 174
175/* RCU-protected iteration */
176static LIST_HEAD(zswap_pools);
177/* protects zswap_pools list modification */
178static DEFINE_SPINLOCK(zswap_pools_lock);
179
180/* used by param callback function */
181static bool zswap_init_started;
182
183/*********************************
184* helpers and fwd declarations
185**********************************/
186
187#define zswap_pool_debug(msg, p) \
188 pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
189 zpool_get_type((p)->zpool))
190
191static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
192static int zswap_pool_get(struct zswap_pool *pool);
193static void zswap_pool_put(struct zswap_pool *pool);
194
195static const struct zpool_ops zswap_zpool_ops = {
196 .evict = zswap_writeback_entry
197};
198
199static bool zswap_is_full(void)
200{
201 return totalram_pages * zswap_max_pool_percent / 100 <
202 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
203}
204
205static void zswap_update_total_size(void)
206{
207 struct zswap_pool *pool;
208 u64 total = 0;
209
210 rcu_read_lock();
211
212 list_for_each_entry_rcu(pool, &zswap_pools, list)
213 total += zpool_get_total_size(pool->zpool);
214
215 rcu_read_unlock();
216
217 zswap_pool_total_size = total;
218}
219
204/********************************* 220/*********************************
205* zswap entry functions 221* zswap entry functions
206**********************************/ 222**********************************/
@@ -294,10 +310,11 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
294 */ 310 */
295static void zswap_free_entry(struct zswap_entry *entry) 311static void zswap_free_entry(struct zswap_entry *entry)
296{ 312{
297 zpool_free(zswap_pool, entry->handle); 313 zpool_free(entry->pool->zpool, entry->handle);
314 zswap_pool_put(entry->pool);
298 zswap_entry_cache_free(entry); 315 zswap_entry_cache_free(entry);
299 atomic_dec(&zswap_stored_pages); 316 atomic_dec(&zswap_stored_pages);
300 zswap_pool_total_size = zpool_get_total_size(zswap_pool); 317 zswap_update_total_size();
301} 318}
302 319
303/* caller must hold the tree lock */ 320/* caller must hold the tree lock */
@@ -339,35 +356,21 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
339**********************************/ 356**********************************/
340static DEFINE_PER_CPU(u8 *, zswap_dstmem); 357static DEFINE_PER_CPU(u8 *, zswap_dstmem);
341 358
342static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) 359static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu)
343{ 360{
344 struct crypto_comp *tfm;
345 u8 *dst; 361 u8 *dst;
346 362
347 switch (action) { 363 switch (action) {
348 case CPU_UP_PREPARE: 364 case CPU_UP_PREPARE:
349 tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
350 if (IS_ERR(tfm)) {
351 pr_err("can't allocate compressor transform\n");
352 return NOTIFY_BAD;
353 }
354 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
355 dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); 365 dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
356 if (!dst) { 366 if (!dst) {
357 pr_err("can't allocate compressor buffer\n"); 367 pr_err("can't allocate compressor buffer\n");
358 crypto_free_comp(tfm);
359 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
360 return NOTIFY_BAD; 368 return NOTIFY_BAD;
361 } 369 }
362 per_cpu(zswap_dstmem, cpu) = dst; 370 per_cpu(zswap_dstmem, cpu) = dst;
363 break; 371 break;
364 case CPU_DEAD: 372 case CPU_DEAD:
365 case CPU_UP_CANCELED: 373 case CPU_UP_CANCELED:
366 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
367 if (tfm) {
368 crypto_free_comp(tfm);
369 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
370 }
371 dst = per_cpu(zswap_dstmem, cpu); 374 dst = per_cpu(zswap_dstmem, cpu);
372 kfree(dst); 375 kfree(dst);
373 per_cpu(zswap_dstmem, cpu) = NULL; 376 per_cpu(zswap_dstmem, cpu) = NULL;
@@ -378,43 +381,398 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
378 return NOTIFY_OK; 381 return NOTIFY_OK;
379} 382}
380 383
381static int zswap_cpu_notifier(struct notifier_block *nb, 384static int zswap_cpu_dstmem_notifier(struct notifier_block *nb,
382 unsigned long action, void *pcpu) 385 unsigned long action, void *pcpu)
383{ 386{
384 unsigned long cpu = (unsigned long)pcpu; 387 return __zswap_cpu_dstmem_notifier(action, (unsigned long)pcpu);
385 return __zswap_cpu_notifier(action, cpu);
386} 388}
387 389
388static struct notifier_block zswap_cpu_notifier_block = { 390static struct notifier_block zswap_dstmem_notifier = {
389 .notifier_call = zswap_cpu_notifier 391 .notifier_call = zswap_cpu_dstmem_notifier,
390}; 392};
391 393
392static int __init zswap_cpu_init(void) 394static int __init zswap_cpu_dstmem_init(void)
395{
396 unsigned long cpu;
397
398 cpu_notifier_register_begin();
399 for_each_online_cpu(cpu)
400 if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) ==
401 NOTIFY_BAD)
402 goto cleanup;
403 __register_cpu_notifier(&zswap_dstmem_notifier);
404 cpu_notifier_register_done();
405 return 0;
406
407cleanup:
408 for_each_online_cpu(cpu)
409 __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
410 cpu_notifier_register_done();
411 return -ENOMEM;
412}
413
414static void zswap_cpu_dstmem_destroy(void)
415{
416 unsigned long cpu;
417
418 cpu_notifier_register_begin();
419 for_each_online_cpu(cpu)
420 __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
421 __unregister_cpu_notifier(&zswap_dstmem_notifier);
422 cpu_notifier_register_done();
423}
424
425static int __zswap_cpu_comp_notifier(struct zswap_pool *pool,
426 unsigned long action, unsigned long cpu)
427{
428 struct crypto_comp *tfm;
429
430 switch (action) {
431 case CPU_UP_PREPARE:
432 if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
433 break;
434 tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
435 if (IS_ERR_OR_NULL(tfm)) {
436 pr_err("could not alloc crypto comp %s : %ld\n",
437 pool->tfm_name, PTR_ERR(tfm));
438 return NOTIFY_BAD;
439 }
440 *per_cpu_ptr(pool->tfm, cpu) = tfm;
441 break;
442 case CPU_DEAD:
443 case CPU_UP_CANCELED:
444 tfm = *per_cpu_ptr(pool->tfm, cpu);
445 if (!IS_ERR_OR_NULL(tfm))
446 crypto_free_comp(tfm);
447 *per_cpu_ptr(pool->tfm, cpu) = NULL;
448 break;
449 default:
450 break;
451 }
452 return NOTIFY_OK;
453}
454
455static int zswap_cpu_comp_notifier(struct notifier_block *nb,
456 unsigned long action, void *pcpu)
457{
458 unsigned long cpu = (unsigned long)pcpu;
459 struct zswap_pool *pool = container_of(nb, typeof(*pool), notifier);
460
461 return __zswap_cpu_comp_notifier(pool, action, cpu);
462}
463
464static int zswap_cpu_comp_init(struct zswap_pool *pool)
393{ 465{
394 unsigned long cpu; 466 unsigned long cpu;
395 467
468 memset(&pool->notifier, 0, sizeof(pool->notifier));
469 pool->notifier.notifier_call = zswap_cpu_comp_notifier;
470
396 cpu_notifier_register_begin(); 471 cpu_notifier_register_begin();
397 for_each_online_cpu(cpu) 472 for_each_online_cpu(cpu)
398 if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) 473 if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) ==
474 NOTIFY_BAD)
399 goto cleanup; 475 goto cleanup;
400 __register_cpu_notifier(&zswap_cpu_notifier_block); 476 __register_cpu_notifier(&pool->notifier);
401 cpu_notifier_register_done(); 477 cpu_notifier_register_done();
402 return 0; 478 return 0;
403 479
404cleanup: 480cleanup:
405 for_each_online_cpu(cpu) 481 for_each_online_cpu(cpu)
406 __zswap_cpu_notifier(CPU_UP_CANCELED, cpu); 482 __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
407 cpu_notifier_register_done(); 483 cpu_notifier_register_done();
408 return -ENOMEM; 484 return -ENOMEM;
409} 485}
410 486
487static void zswap_cpu_comp_destroy(struct zswap_pool *pool)
488{
489 unsigned long cpu;
490
491 cpu_notifier_register_begin();
492 for_each_online_cpu(cpu)
493 __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
494 __unregister_cpu_notifier(&pool->notifier);
495 cpu_notifier_register_done();
496}
497
411/********************************* 498/*********************************
412* helpers 499* pool functions
413**********************************/ 500**********************************/
414static bool zswap_is_full(void) 501
502static struct zswap_pool *__zswap_pool_current(void)
415{ 503{
416 return totalram_pages * zswap_max_pool_percent / 100 < 504 struct zswap_pool *pool;
417 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 505
506 pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
507 WARN_ON(!pool);
508
509 return pool;
510}
511
512static struct zswap_pool *zswap_pool_current(void)
513{
514 assert_spin_locked(&zswap_pools_lock);
515
516 return __zswap_pool_current();
517}
518
519static struct zswap_pool *zswap_pool_current_get(void)
520{
521 struct zswap_pool *pool;
522
523 rcu_read_lock();
524
525 pool = __zswap_pool_current();
526 if (!pool || !zswap_pool_get(pool))
527 pool = NULL;
528
529 rcu_read_unlock();
530
531 return pool;
532}
533
534static struct zswap_pool *zswap_pool_last_get(void)
535{
536 struct zswap_pool *pool, *last = NULL;
537
538 rcu_read_lock();
539
540 list_for_each_entry_rcu(pool, &zswap_pools, list)
541 last = pool;
542 if (!WARN_ON(!last) && !zswap_pool_get(last))
543 last = NULL;
544
545 rcu_read_unlock();
546
547 return last;
548}
549
550static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
551{
552 struct zswap_pool *pool;
553
554 assert_spin_locked(&zswap_pools_lock);
555
556 list_for_each_entry_rcu(pool, &zswap_pools, list) {
557 if (strncmp(pool->tfm_name, compressor, sizeof(pool->tfm_name)))
558 continue;
559 if (strncmp(zpool_get_type(pool->zpool), type,
560 sizeof(zswap_zpool_type)))
561 continue;
562 /* if we can't get it, it's about to be destroyed */
563 if (!zswap_pool_get(pool))
564 continue;
565 return pool;
566 }
567
568 return NULL;
569}
570
571static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
572{
573 struct zswap_pool *pool;
574 gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
575
576 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
577 if (!pool) {
578 pr_err("pool alloc failed\n");
579 return NULL;
580 }
581
582 pool->zpool = zpool_create_pool(type, "zswap", gfp, &zswap_zpool_ops);
583 if (!pool->zpool) {
584 pr_err("%s zpool not available\n", type);
585 goto error;
586 }
587 pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
588
589 strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
590 pool->tfm = alloc_percpu(struct crypto_comp *);
591 if (!pool->tfm) {
592 pr_err("percpu alloc failed\n");
593 goto error;
594 }
595
596 if (zswap_cpu_comp_init(pool))
597 goto error;
598 pr_debug("using %s compressor\n", pool->tfm_name);
599
600 /* being the current pool takes 1 ref; this func expects the
601 * caller to always add the new pool as the current pool
602 */
603 kref_init(&pool->kref);
604 INIT_LIST_HEAD(&pool->list);
605
606 zswap_pool_debug("created", pool);
607
608 return pool;
609
610error:
611 free_percpu(pool->tfm);
612 if (pool->zpool)
613 zpool_destroy_pool(pool->zpool);
614 kfree(pool);
615 return NULL;
616}
617
618static struct zswap_pool *__zswap_pool_create_fallback(void)
619{
620 if (!crypto_has_comp(zswap_compressor, 0, 0)) {
621 pr_err("compressor %s not available, using default %s\n",
622 zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
623 strncpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT,
624 sizeof(zswap_compressor));
625 }
626 if (!zpool_has_pool(zswap_zpool_type)) {
627 pr_err("zpool %s not available, using default %s\n",
628 zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
629 strncpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT,
630 sizeof(zswap_zpool_type));
631 }
632
633 return zswap_pool_create(zswap_zpool_type, zswap_compressor);
634}
635
636static void zswap_pool_destroy(struct zswap_pool *pool)
637{
638 zswap_pool_debug("destroying", pool);
639
640 zswap_cpu_comp_destroy(pool);
641 free_percpu(pool->tfm);
642 zpool_destroy_pool(pool->zpool);
643 kfree(pool);
644}
645
646static int __must_check zswap_pool_get(struct zswap_pool *pool)
647{
648 return kref_get_unless_zero(&pool->kref);
649}
650
651static void __zswap_pool_release(struct rcu_head *head)
652{
653 struct zswap_pool *pool = container_of(head, typeof(*pool), rcu_head);
654
655 /* nobody should have been able to get a kref... */
656 WARN_ON(kref_get_unless_zero(&pool->kref));
657
658 /* pool is now off zswap_pools list and has no references. */
659 zswap_pool_destroy(pool);
660}
661
662static void __zswap_pool_empty(struct kref *kref)
663{
664 struct zswap_pool *pool;
665
666 pool = container_of(kref, typeof(*pool), kref);
667
668 spin_lock(&zswap_pools_lock);
669
670 WARN_ON(pool == zswap_pool_current());
671
672 list_del_rcu(&pool->list);
673 call_rcu(&pool->rcu_head, __zswap_pool_release);
674
675 spin_unlock(&zswap_pools_lock);
676}
677
678static void zswap_pool_put(struct zswap_pool *pool)
679{
680 kref_put(&pool->kref, __zswap_pool_empty);
681}
682
683/*********************************
684* param callbacks
685**********************************/
686
687static int __zswap_param_set(const char *val, const struct kernel_param *kp,
688 char *type, char *compressor)
689{
690 struct zswap_pool *pool, *put_pool = NULL;
691 char str[kp->str->maxlen], *s;
692 int ret;
693
694 /*
695 * kp is either zswap_zpool_kparam or zswap_compressor_kparam, defined
696 * at the top of this file, so maxlen is CRYPTO_MAX_ALG_NAME (64) or
697 * 32 (arbitrary).
698 */
699 strlcpy(str, val, kp->str->maxlen);
700 s = strim(str);
701
702 /* if this is load-time (pre-init) param setting,
703 * don't create a pool; that's done during init.
704 */
705 if (!zswap_init_started)
706 return param_set_copystring(s, kp);
707
708 /* no change required */
709 if (!strncmp(kp->str->string, s, kp->str->maxlen))
710 return 0;
711
712 if (!type) {
713 type = s;
714 if (!zpool_has_pool(type)) {
715 pr_err("zpool %s not available\n", type);
716 return -ENOENT;
717 }
718 } else if (!compressor) {
719 compressor = s;
720 if (!crypto_has_comp(compressor, 0, 0)) {
721 pr_err("compressor %s not available\n", compressor);
722 return -ENOENT;
723 }
724 }
725
726 spin_lock(&zswap_pools_lock);
727
728 pool = zswap_pool_find_get(type, compressor);
729 if (pool) {
730 zswap_pool_debug("using existing", pool);
731 list_del_rcu(&pool->list);
732 } else {
733 spin_unlock(&zswap_pools_lock);
734 pool = zswap_pool_create(type, compressor);
735 spin_lock(&zswap_pools_lock);
736 }
737
738 if (pool)
739 ret = param_set_copystring(s, kp);
740 else
741 ret = -EINVAL;
742
743 if (!ret) {
744 put_pool = zswap_pool_current();
745 list_add_rcu(&pool->list, &zswap_pools);
746 } else if (pool) {
747 /* add the possibly pre-existing pool to the end of the pools
748 * list; if it's new (and empty) then it'll be removed and
749 * destroyed by the put after we drop the lock
750 */
751 list_add_tail_rcu(&pool->list, &zswap_pools);
752 put_pool = pool;
753 }
754
755 spin_unlock(&zswap_pools_lock);
756
757 /* drop the ref from either the old current pool,
758 * or the new pool we failed to add
759 */
760 if (put_pool)
761 zswap_pool_put(put_pool);
762
763 return ret;
764}
765
766static int zswap_compressor_param_set(const char *val,
767 const struct kernel_param *kp)
768{
769 return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
770}
771
772static int zswap_zpool_param_set(const char *val,
773 const struct kernel_param *kp)
774{
775 return __zswap_param_set(val, kp, NULL, zswap_compressor);
418} 776}
419 777
420/********************************* 778/*********************************
@@ -477,6 +835,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
477 pgoff_t offset; 835 pgoff_t offset;
478 struct zswap_entry *entry; 836 struct zswap_entry *entry;
479 struct page *page; 837 struct page *page;
838 struct crypto_comp *tfm;
480 u8 *src, *dst; 839 u8 *src, *dst;
481 unsigned int dlen; 840 unsigned int dlen;
482 int ret; 841 int ret;
@@ -517,13 +876,15 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
517 case ZSWAP_SWAPCACHE_NEW: /* page is locked */ 876 case ZSWAP_SWAPCACHE_NEW: /* page is locked */
518 /* decompress */ 877 /* decompress */
519 dlen = PAGE_SIZE; 878 dlen = PAGE_SIZE;
520 src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, 879 src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
521 ZPOOL_MM_RO) + sizeof(struct zswap_header); 880 ZPOOL_MM_RO) + sizeof(struct zswap_header);
522 dst = kmap_atomic(page); 881 dst = kmap_atomic(page);
523 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, 882 tfm = *get_cpu_ptr(entry->pool->tfm);
524 entry->length, dst, &dlen); 883 ret = crypto_comp_decompress(tfm, src, entry->length,
884 dst, &dlen);
885 put_cpu_ptr(entry->pool->tfm);
525 kunmap_atomic(dst); 886 kunmap_atomic(dst);
526 zpool_unmap_handle(zswap_pool, entry->handle); 887 zpool_unmap_handle(entry->pool->zpool, entry->handle);
527 BUG_ON(ret); 888 BUG_ON(ret);
528 BUG_ON(dlen != PAGE_SIZE); 889 BUG_ON(dlen != PAGE_SIZE);
529 890
@@ -572,6 +933,22 @@ end:
572 return ret; 933 return ret;
573} 934}
574 935
936static int zswap_shrink(void)
937{
938 struct zswap_pool *pool;
939 int ret;
940
941 pool = zswap_pool_last_get();
942 if (!pool)
943 return -ENOENT;
944
945 ret = zpool_shrink(pool->zpool, 1, NULL);
946
947 zswap_pool_put(pool);
948
949 return ret;
950}
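zswap_shrink() replaces the old direct zpool_shrink(zswap_pool, ...) call: with more than one pool on the list, reclaim is aimed at the last pool on zswap_pools, i.e. the one least recently made current, and zswap_pool_last_get() hands it back with a reference held so it cannot be destroyed while zpool_shrink() evicts from it. A plausible shape for that helper, hedged because its body lies outside this hunk:

static struct zswap_pool *zswap_pool_last_get(void)
{
        struct zswap_pool *pool, *last = NULL;

        rcu_read_lock();
        list_for_each_entry_rcu(pool, &zswap_pools, list)
                last = pool;                    /* remember the tail entry */
        if (last && !zswap_pool_get(last))      /* may already be being torn down */
                last = NULL;
        rcu_read_unlock();

        return last;
}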
951
575/********************************* 952/*********************************
576* frontswap hooks 953* frontswap hooks
577**********************************/ 954**********************************/
@@ -581,6 +958,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
581{ 958{
582 struct zswap_tree *tree = zswap_trees[type]; 959 struct zswap_tree *tree = zswap_trees[type];
583 struct zswap_entry *entry, *dupentry; 960 struct zswap_entry *entry, *dupentry;
961 struct crypto_comp *tfm;
584 int ret; 962 int ret;
585 unsigned int dlen = PAGE_SIZE, len; 963 unsigned int dlen = PAGE_SIZE, len;
586 unsigned long handle; 964 unsigned long handle;
@@ -596,7 +974,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
596 /* reclaim space if needed */ 974 /* reclaim space if needed */
597 if (zswap_is_full()) { 975 if (zswap_is_full()) {
598 zswap_pool_limit_hit++; 976 zswap_pool_limit_hit++;
599 if (zpool_shrink(zswap_pool, 1, NULL)) { 977 if (zswap_shrink()) {
600 zswap_reject_reclaim_fail++; 978 zswap_reject_reclaim_fail++;
601 ret = -ENOMEM; 979 ret = -ENOMEM;
602 goto reject; 980 goto reject;
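The store path still refuses new pages once zswap_is_full() says the compressed pool has reached its cap, and only continues if reclaim, now routed through zswap_shrink() above, actually frees something. The fullness check presumably boils down to comparing the zswap_pool_total_size counter (updated further down) against zswap_max_pool_percent of RAM, along these lines for a 4.3-era kernel where totalram_pages is a plain variable:

static bool zswap_is_full(void)
{
        /* compressed data may not exceed the configured share of RAM */
        return totalram_pages * zswap_max_pool_percent / 100 <
               DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}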
@@ -611,33 +989,42 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
611 goto reject; 989 goto reject;
612 } 990 }
613 991
992 /* if entry is successfully added, it keeps the reference */
993 entry->pool = zswap_pool_current_get();
994 if (!entry->pool) {
995 ret = -EINVAL;
996 goto freepage;
997 }
998
614 /* compress */ 999 /* compress */
615 dst = get_cpu_var(zswap_dstmem); 1000 dst = get_cpu_var(zswap_dstmem);
1001 tfm = *get_cpu_ptr(entry->pool->tfm);
616 src = kmap_atomic(page); 1002 src = kmap_atomic(page);
617 ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen); 1003 ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
618 kunmap_atomic(src); 1004 kunmap_atomic(src);
1005 put_cpu_ptr(entry->pool->tfm);
619 if (ret) { 1006 if (ret) {
620 ret = -EINVAL; 1007 ret = -EINVAL;
621 goto freepage; 1008 goto put_dstmem;
622 } 1009 }
623 1010
624 /* store */ 1011 /* store */
625 len = dlen + sizeof(struct zswap_header); 1012 len = dlen + sizeof(struct zswap_header);
626 ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, 1013 ret = zpool_malloc(entry->pool->zpool, len,
627 &handle); 1014 __GFP_NORETRY | __GFP_NOWARN, &handle);
628 if (ret == -ENOSPC) { 1015 if (ret == -ENOSPC) {
629 zswap_reject_compress_poor++; 1016 zswap_reject_compress_poor++;
630 goto freepage; 1017 goto put_dstmem;
631 } 1018 }
632 if (ret) { 1019 if (ret) {
633 zswap_reject_alloc_fail++; 1020 zswap_reject_alloc_fail++;
634 goto freepage; 1021 goto put_dstmem;
635 } 1022 }
636 zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW); 1023 zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
637 zhdr->swpentry = swp_entry(type, offset); 1024 zhdr->swpentry = swp_entry(type, offset);
638 buf = (u8 *)(zhdr + 1); 1025 buf = (u8 *)(zhdr + 1);
639 memcpy(buf, dst, dlen); 1026 memcpy(buf, dst, dlen);
640 zpool_unmap_handle(zswap_pool, handle); 1027 zpool_unmap_handle(entry->pool->zpool, handle);
641 put_cpu_var(zswap_dstmem); 1028 put_cpu_var(zswap_dstmem);
642 1029
643 /* populate entry */ 1030 /* populate entry */
@@ -660,12 +1047,14 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
660 1047
661 /* update stats */ 1048 /* update stats */
662 atomic_inc(&zswap_stored_pages); 1049 atomic_inc(&zswap_stored_pages);
663 zswap_pool_total_size = zpool_get_total_size(zswap_pool); 1050 zswap_update_total_size();
664 1051
665 return 0; 1052 return 0;
666 1053
667freepage: 1054put_dstmem:
668 put_cpu_var(zswap_dstmem); 1055 put_cpu_var(zswap_dstmem);
1056 zswap_pool_put(entry->pool);
1057freepage:
669 zswap_entry_cache_free(entry); 1058 zswap_entry_cache_free(entry);
670reject: 1059reject:
671 return ret; 1060 return ret;
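Note the relabelled error path: a failed compression or zpool allocation now jumps to put_dstmem, which releases the per-CPU destination buffer and drops the pool reference taken a few lines earlier, then falls through to freepage and reject, so teardown mirrors acquisition order exactly. A generic, self-contained illustration of that staged-goto idiom (plain userspace C, not zswap code):

#include <stdlib.h>

/* each label undoes only what was acquired since the previous label */
int store_like_pattern(int fail_compress)
{
        void *entry, *dstmem;

        entry = malloc(32);                     /* step 1: entry */
        if (!entry)
                goto reject;

        dstmem = malloc(4096);                  /* step 2: scratch buffer */
        if (!dstmem)
                goto freepage;

        if (fail_compress)                      /* e.g. compression error */
                goto put_dstmem;

        free(dstmem);                           /* success also returns the buffer */
        free(entry);                            /* freed here only to keep the demo leak-free */
        return 0;

put_dstmem:
        free(dstmem);
freepage:
        free(entry);
reject:
        return -1;
}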
@@ -680,6 +1069,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
680{ 1069{
681 struct zswap_tree *tree = zswap_trees[type]; 1070 struct zswap_tree *tree = zswap_trees[type];
682 struct zswap_entry *entry; 1071 struct zswap_entry *entry;
1072 struct crypto_comp *tfm;
683 u8 *src, *dst; 1073 u8 *src, *dst;
684 unsigned int dlen; 1074 unsigned int dlen;
685 int ret; 1075 int ret;
@@ -696,13 +1086,14 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
696 1086
697 /* decompress */ 1087 /* decompress */
698 dlen = PAGE_SIZE; 1088 dlen = PAGE_SIZE;
699 src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, 1089 src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
700 ZPOOL_MM_RO) + sizeof(struct zswap_header); 1090 ZPOOL_MM_RO) + sizeof(struct zswap_header);
701 dst = kmap_atomic(page); 1091 dst = kmap_atomic(page);
702 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, 1092 tfm = *get_cpu_ptr(entry->pool->tfm);
703 dst, &dlen); 1093 ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
1094 put_cpu_ptr(entry->pool->tfm);
704 kunmap_atomic(dst); 1095 kunmap_atomic(dst);
705 zpool_unmap_handle(zswap_pool, entry->handle); 1096 zpool_unmap_handle(entry->pool->zpool, entry->handle);
706 BUG_ON(ret); 1097 BUG_ON(ret);
707 1098
708 spin_lock(&tree->lock); 1099 spin_lock(&tree->lock);
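On the load side the only functional change is, again, which zpool the handle is mapped from: each entry records the pool it was stored into, so entry->pool->zpool replaces the old global zswap_pool and that same pool's per-CPU transform does the decompression. The ownership rule implied by the diff, shown with a stripped-down stand-in (the real struct zswap_entry has more fields):

/* stand-in for the fields this series relies on, not the full struct */
struct entry_sketch {
        struct zswap_pool *pool;        /* reference taken at store time */
        unsigned long handle;           /* allocation inside pool->zpool */
        unsigned int length;            /* compressed length of the page */
};

Presumably the matching zswap_pool_put(entry->pool) happens when the entry is freed, which is what lets a pool that has been switched away from linger until its last stored page is loaded back, written back or invalidated.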
@@ -755,10 +1146,6 @@ static void zswap_frontswap_invalidate_area(unsigned type)
755 zswap_trees[type] = NULL; 1146 zswap_trees[type] = NULL;
756} 1147}
757 1148
758static const struct zpool_ops zswap_zpool_ops = {
759 .evict = zswap_writeback_entry
760};
761
762static void zswap_frontswap_init(unsigned type) 1149static void zswap_frontswap_init(unsigned type)
763{ 1150{
764 struct zswap_tree *tree; 1151 struct zswap_tree *tree;
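Dropping the file-scope zswap_zpool_ops at this point only works because the evict callback is now supplied when each pool's zpool is created inside zswap_pool_create(): writeback is wired up per pool rather than once for a single global pool. Roughly as below; the creation code is outside this hunk, and reusing the __GFP_NORETRY | __GFP_NOWARN flags from the removed init code further down is an assumption:

static const struct zpool_ops zswap_zpool_ops = {
        .evict = zswap_writeback_entry,         /* same callback as before */
};

/* sketch of the per-pool creation step, not the full zswap_pool_create() */
static struct zpool *zswap_zpool_create_sketch(char *type)
{
        return zpool_create_pool(type, "zswap",
                                 __GFP_NORETRY | __GFP_NOWARN,
                                 &zswap_zpool_ops);
}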
@@ -839,49 +1226,40 @@ static void __exit zswap_debugfs_exit(void) { }
839**********************************/ 1226**********************************/
840static int __init init_zswap(void) 1227static int __init init_zswap(void)
841{ 1228{
842 gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; 1229 struct zswap_pool *pool;
843 1230
844 pr_info("loading zswap\n"); 1231 zswap_init_started = true;
845
846 zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
847 &zswap_zpool_ops);
848 if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
849 pr_info("%s zpool not available\n", zswap_zpool_type);
850 zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
851 zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
852 &zswap_zpool_ops);
853 }
854 if (!zswap_pool) {
855 pr_err("%s zpool not available\n", zswap_zpool_type);
856 pr_err("zpool creation failed\n");
857 goto error;
858 }
859 pr_info("using %s pool\n", zswap_zpool_type);
860 1232
861 if (zswap_entry_cache_create()) { 1233 if (zswap_entry_cache_create()) {
862 pr_err("entry cache creation failed\n"); 1234 pr_err("entry cache creation failed\n");
863 goto cachefail; 1235 goto cache_fail;
864 } 1236 }
865 if (zswap_comp_init()) { 1237
866 pr_err("compressor initialization failed\n"); 1238 if (zswap_cpu_dstmem_init()) {
867 goto compfail; 1239 pr_err("dstmem alloc failed\n");
1240 goto dstmem_fail;
868 } 1241 }
869 if (zswap_cpu_init()) { 1242
870 pr_err("per-cpu initialization failed\n"); 1243 pool = __zswap_pool_create_fallback();
871 goto pcpufail; 1244 if (!pool) {
1245 pr_err("pool creation failed\n");
1246 goto pool_fail;
872 } 1247 }
1248 pr_info("loaded using pool %s/%s\n", pool->tfm_name,
1249 zpool_get_type(pool->zpool));
1250
1251 list_add(&pool->list, &zswap_pools);
873 1252
874 frontswap_register_ops(&zswap_frontswap_ops); 1253 frontswap_register_ops(&zswap_frontswap_ops);
875 if (zswap_debugfs_init()) 1254 if (zswap_debugfs_init())
876 pr_warn("debugfs initialization failed\n"); 1255 pr_warn("debugfs initialization failed\n");
877 return 0; 1256 return 0;
878pcpufail: 1257
879 zswap_comp_exit(); 1258pool_fail:
880compfail: 1259 zswap_cpu_dstmem_destroy();
1260dstmem_fail:
881 zswap_entry_cache_destroy(); 1261 zswap_entry_cache_destroy();
882cachefail: 1262cache_fail:
883 zpool_destroy_pool(zswap_pool);
884error:
885 return -ENOMEM; 1263 return -ENOMEM;
886} 1264}
887/* must be late so crypto has time to come up */ 1265/* must be late so crypto has time to come up */
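init_zswap() no longer hand-rolls the fallback to the default zpool type; that moves into __zswap_pool_create_fallback(), which can now also fall back on the compressor side since the compressor became a per-pool property. A rough, hedged sketch of what such a helper does; defaults and buffer handling are illustrative and the real body is outside this hunk:

static __init struct zswap_pool *__zswap_pool_create_fallback(void)
{
        if (!crypto_has_comp(zswap_compressor, 0, 0)) {
                pr_err("compressor %s not available, using default %s\n",
                       zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
                strcpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
        }
        if (!zpool_has_pool(zswap_zpool_type)) {
                pr_err("zpool %s not available, using default %s\n",
                       zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
                strcpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
        }

        return zswap_pool_create(zswap_zpool_type, zswap_compressor);
}

The closing comment above refers to the registration that follows it, presumably still late_initcall(init_zswap), so the crypto API is up before the first compressor transform is allocated.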