author     Linus Torvalds <torvalds@linux-foundation.org>   2015-09-10 21:19:42 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2015-09-10 21:19:42 -0400
commit     33e247c7e58d335d70ecb84fd869091e2e4b8dcb (patch)
tree       e8561e1993dff03f8e56d10a5795fe9d379a3390 /mm
parent     d71fc239b6915a8b750e9a447311029ff45b6580 (diff)
parent     452e06af1f0149b01201f94264d452cd7a95db7a (diff)
Merge branch 'akpm' (patches from Andrew)
Merge third patch-bomb from Andrew Morton:
- even more of the rest of MM
- lib/ updates
- checkpatch updates
- small changes to a few scruffy filesystems
- kmod fixes/cleanups
- kexec updates
- a dma-mapping cleanup series from hch
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (81 commits)
dma-mapping: consolidate dma_set_mask
dma-mapping: consolidate dma_supported
dma-mapping: cosolidate dma_mapping_error
dma-mapping: consolidate dma_{alloc,free}_noncoherent
dma-mapping: consolidate dma_{alloc,free}_{attrs,coherent}
mm: use vma_is_anonymous() in create_huge_pmd() and wp_huge_pmd()
mm: make sure all file VMAs have ->vm_ops set
mm, mpx: add "vm_flags_t vm_flags" arg to do_mmap_pgoff()
mm: mark most vm_operations_struct const
namei: fix warning while make xmldocs caused by namei.c
ipc: convert invalid scenarios to use WARN_ON
zlib_deflate/deftree: remove bi_reverse()
lib/decompress_unlzma: Do a NULL check for pointer
lib/decompressors: use real out buf size for gunzip with kernel
fs/affs: make root lookup from blkdev logical size
sysctl: fix int -> unsigned long assignments in INT_MIN case
kexec: export KERNEL_IMAGE_SIZE to vmcoreinfo
kexec: align crash_notes allocation to make it be inside one physical page
kexec: remove unnecessary test in kimage_alloc_crash_control_pages()
kexec: split kexec_load syscall from kexec core code
...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             12
-rw-r--r--  mm/Makefile             1
-rw-r--r--  mm/debug.c              4
-rw-r--r--  mm/huge_memory.c       12
-rw-r--r--  mm/hwpoison-inject.c    5
-rw-r--r--  mm/kmemleak.c          21
-rw-r--r--  mm/memcontrol.c        76
-rw-r--r--  mm/memory-failure.c    16
-rw-r--r--  mm/memory.c             4
-rw-r--r--  mm/migrate.c            6
-rw-r--r--  mm/mmap.c              18
-rw-r--r--  mm/mmu_notifier.c      17
-rw-r--r--  mm/nommu.c             19
-rw-r--r--  mm/page_ext.c           4
-rw-r--r--  mm/page_idle.c        232
-rw-r--r--  mm/rmap.c               6
-rw-r--r--  mm/swap.c               3
-rw-r--r--  mm/zpool.c             33
-rw-r--r--  mm/zswap.c            688
19 files changed, 934 insertions, 243 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3a4070f5ab79..6413d027c0b2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -649,6 +649,18 @@ config DEFERRED_STRUCT_PAGE_INIT | |||
649 | processes running early in the lifetime of the systemm until kswapd | 649 | processes running early in the lifetime of the systemm until kswapd |
650 | finishes the initialisation. | 650 | finishes the initialisation. |
651 | 651 | ||
652 | config IDLE_PAGE_TRACKING | ||
653 | bool "Enable idle page tracking" | ||
654 | depends on SYSFS && MMU | ||
655 | select PAGE_EXTENSION if !64BIT | ||
656 | help | ||
657 | This feature allows to estimate the amount of user pages that have | ||
658 | not been touched during a given period of time. This information can | ||
659 | be useful to tune memory cgroup limits and/or for job placement | ||
660 | within a compute cluster. | ||
661 | |||
662 | See Documentation/vm/idle_page_tracking.txt for more details. | ||
663 | |||
652 | config ZONE_DEVICE | 664 | config ZONE_DEVICE |
653 | bool "Device memory (pmem, etc...) hotplug support" if EXPERT | 665 | bool "Device memory (pmem, etc...) hotplug support" if EXPERT |
654 | default !ZONE_DMA | 666 | default !ZONE_DMA |
diff --git a/mm/Makefile b/mm/Makefile
index b424d5e5b6ff..56f8eed73f1a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -79,3 +79,4 @@ obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o | |||
79 | obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o | 79 | obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o |
80 | obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o | 80 | obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o |
81 | obj-$(CONFIG_USERFAULTFD) += userfaultfd.o | 81 | obj-$(CONFIG_USERFAULTFD) += userfaultfd.o |
82 | obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o | ||
diff --git a/mm/debug.c b/mm/debug.c
index 76089ddf99ea..6c1b3ea61bfd 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -48,6 +48,10 @@ static const struct trace_print_flags pageflag_names[] = { | |||
48 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 48 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
49 | {1UL << PG_compound_lock, "compound_lock" }, | 49 | {1UL << PG_compound_lock, "compound_lock" }, |
50 | #endif | 50 | #endif |
51 | #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) | ||
52 | {1UL << PG_young, "young" }, | ||
53 | {1UL << PG_idle, "idle" }, | ||
54 | #endif | ||
51 | }; | 55 | }; |
52 | 56 | ||
53 | static void dump_flags(unsigned long flags, | 57 | static void dump_flags(unsigned long flags, |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b16279cbd91d..4b06b8db9df2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/migrate.h> | 25 | #include <linux/migrate.h> |
26 | #include <linux/hashtable.h> | 26 | #include <linux/hashtable.h> |
27 | #include <linux/userfaultfd_k.h> | 27 | #include <linux/userfaultfd_k.h> |
28 | #include <linux/page_idle.h> | ||
28 | 29 | ||
29 | #include <asm/tlb.h> | 30 | #include <asm/tlb.h> |
30 | #include <asm/pgalloc.h> | 31 | #include <asm/pgalloc.h> |
@@ -1757,6 +1758,11 @@ static void __split_huge_page_refcount(struct page *page, | |||
1757 | /* clear PageTail before overwriting first_page */ | 1758 | /* clear PageTail before overwriting first_page */ |
1758 | smp_wmb(); | 1759 | smp_wmb(); |
1759 | 1760 | ||
1761 | if (page_is_young(page)) | ||
1762 | set_page_young(page_tail); | ||
1763 | if (page_is_idle(page)) | ||
1764 | set_page_idle(page_tail); | ||
1765 | |||
1760 | /* | 1766 | /* |
1761 | * __split_huge_page_splitting() already set the | 1767 | * __split_huge_page_splitting() already set the |
1762 | * splitting bit in all pmd that could map this | 1768 | * splitting bit in all pmd that could map this |
@@ -2262,7 +2268,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
2262 | VM_BUG_ON_PAGE(PageLRU(page), page); | 2268 | VM_BUG_ON_PAGE(PageLRU(page), page); |
2263 | 2269 | ||
2264 | /* If there is no mapped pte young don't collapse the page */ | 2270 | /* If there is no mapped pte young don't collapse the page */ |
2265 | if (pte_young(pteval) || PageReferenced(page) || | 2271 | if (pte_young(pteval) || |
2272 | page_is_young(page) || PageReferenced(page) || | ||
2266 | mmu_notifier_test_young(vma->vm_mm, address)) | 2273 | mmu_notifier_test_young(vma->vm_mm, address)) |
2267 | referenced = true; | 2274 | referenced = true; |
2268 | } | 2275 | } |
@@ -2693,7 +2700,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2693 | */ | 2700 | */ |
2694 | if (page_count(page) != 1 + !!PageSwapCache(page)) | 2701 | if (page_count(page) != 1 + !!PageSwapCache(page)) |
2695 | goto out_unmap; | 2702 | goto out_unmap; |
2696 | if (pte_young(pteval) || PageReferenced(page) || | 2703 | if (pte_young(pteval) || |
2704 | page_is_young(page) || PageReferenced(page) || | ||
2697 | mmu_notifier_test_young(vma->vm_mm, address)) | 2705 | mmu_notifier_test_young(vma->vm_mm, address)) |
2698 | referenced = true; | 2706 | referenced = true; |
2699 | } | 2707 | } |
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index aeba0edd6e44..9d26fd9fefe4 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -45,12 +45,9 @@ static int hwpoison_inject(void *data, u64 val) | |||
45 | /* | 45 | /* |
46 | * do a racy check with elevated page count, to make sure PG_hwpoison | 46 | * do a racy check with elevated page count, to make sure PG_hwpoison |
47 | * will only be set for the targeted owner (or on a free page). | 47 | * will only be set for the targeted owner (or on a free page). |
48 | * We temporarily take page lock for try_get_mem_cgroup_from_page(). | ||
49 | * memory_failure() will redo the check reliably inside page lock. | 48 | * memory_failure() will redo the check reliably inside page lock. |
50 | */ | 49 | */ |
51 | lock_page(hpage); | ||
52 | err = hwpoison_filter(hpage); | 50 | err = hwpoison_filter(hpage); |
53 | unlock_page(hpage); | ||
54 | if (err) | 51 | if (err) |
55 | goto put_out; | 52 | goto put_out; |
56 | 53 | ||
@@ -126,7 +123,7 @@ static int pfn_inject_init(void) | |||
126 | if (!dentry) | 123 | if (!dentry) |
127 | goto fail; | 124 | goto fail; |
128 | 125 | ||
129 | #ifdef CONFIG_MEMCG_SWAP | 126 | #ifdef CONFIG_MEMCG |
130 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, | 127 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, |
131 | hwpoison_dir, &hwpoison_filter_memcg); | 128 | hwpoison_dir, &hwpoison_filter_memcg); |
132 | if (!dentry) | 129 | if (!dentry) |
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f532f6a37b55..77191eccdc6f 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -302,23 +302,14 @@ static void hex_dump_object(struct seq_file *seq, | |||
302 | struct kmemleak_object *object) | 302 | struct kmemleak_object *object) |
303 | { | 303 | { |
304 | const u8 *ptr = (const u8 *)object->pointer; | 304 | const u8 *ptr = (const u8 *)object->pointer; |
305 | int i, len, remaining; | 305 | size_t len; |
306 | unsigned char linebuf[HEX_ROW_SIZE * 5]; | ||
307 | 306 | ||
308 | /* limit the number of lines to HEX_MAX_LINES */ | 307 | /* limit the number of lines to HEX_MAX_LINES */ |
309 | remaining = len = | 308 | len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE); |
310 | min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE)); | 309 | |
311 | 310 | seq_printf(seq, " hex dump (first %zu bytes):\n", len); | |
312 | seq_printf(seq, " hex dump (first %d bytes):\n", len); | 311 | seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE, |
313 | for (i = 0; i < len; i += HEX_ROW_SIZE) { | 312 | HEX_GROUP_SIZE, ptr, len, HEX_ASCII); |
314 | int linelen = min(remaining, HEX_ROW_SIZE); | ||
315 | |||
316 | remaining -= HEX_ROW_SIZE; | ||
317 | hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE, | ||
318 | HEX_GROUP_SIZE, linebuf, sizeof(linebuf), | ||
319 | HEX_ASCII); | ||
320 | seq_printf(seq, " %s\n", linebuf); | ||
321 | } | ||
322 | } | 313 | } |
323 | 314 | ||
324 | /* | 315 | /* |
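The rewritten hex_dump_object() above delegates its formatting loop to seq_hex_dump(). That helper is introduced by another patch in this series (in fs/seq_file.c and include/linux/seq_file.h) and therefore does not appear in this mm/-limited diff; its declaration is assumed to be:

	/* Assumed declaration from include/linux/seq_file.h (not part of this
	 * mm/-limited diff): the seq_file analogue of print_hex_dump(). */
	void seq_hex_dump(struct seq_file *m, const char *prefix_str,
			  int prefix_type, int rowsize, int groupsize,
			  const void *buf, size_t len, bool ascii);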
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1742a2db89c7..6ddaeba34e09 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -441,6 +441,34 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) | |||
441 | return &memcg->css; | 441 | return &memcg->css; |
442 | } | 442 | } |
443 | 443 | ||
444 | /** | ||
445 | * page_cgroup_ino - return inode number of the memcg a page is charged to | ||
446 | * @page: the page | ||
447 | * | ||
448 | * Look up the closest online ancestor of the memory cgroup @page is charged to | ||
449 | * and return its inode number or 0 if @page is not charged to any cgroup. It | ||
450 | * is safe to call this function without holding a reference to @page. | ||
451 | * | ||
452 | * Note, this function is inherently racy, because there is nothing to prevent | ||
453 | * the cgroup inode from getting torn down and potentially reallocated a moment | ||
454 | * after page_cgroup_ino() returns, so it only should be used by callers that | ||
455 | * do not care (such as procfs interfaces). | ||
456 | */ | ||
457 | ino_t page_cgroup_ino(struct page *page) | ||
458 | { | ||
459 | struct mem_cgroup *memcg; | ||
460 | unsigned long ino = 0; | ||
461 | |||
462 | rcu_read_lock(); | ||
463 | memcg = READ_ONCE(page->mem_cgroup); | ||
464 | while (memcg && !(memcg->css.flags & CSS_ONLINE)) | ||
465 | memcg = parent_mem_cgroup(memcg); | ||
466 | if (memcg) | ||
467 | ino = cgroup_ino(memcg->css.cgroup); | ||
468 | rcu_read_unlock(); | ||
469 | return ino; | ||
470 | } | ||
471 | |||
444 | static struct mem_cgroup_per_zone * | 472 | static struct mem_cgroup_per_zone * |
445 | mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) | 473 | mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) |
446 | { | 474 | { |
@@ -2071,40 +2099,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) | |||
2071 | css_put_many(&memcg->css, nr_pages); | 2099 | css_put_many(&memcg->css, nr_pages); |
2072 | } | 2100 | } |
2073 | 2101 | ||
2074 | /* | ||
2075 | * try_get_mem_cgroup_from_page - look up page's memcg association | ||
2076 | * @page: the page | ||
2077 | * | ||
2078 | * Look up, get a css reference, and return the memcg that owns @page. | ||
2079 | * | ||
2080 | * The page must be locked to prevent racing with swap-in and page | ||
2081 | * cache charges. If coming from an unlocked page table, the caller | ||
2082 | * must ensure the page is on the LRU or this can race with charging. | ||
2083 | */ | ||
2084 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | ||
2085 | { | ||
2086 | struct mem_cgroup *memcg; | ||
2087 | unsigned short id; | ||
2088 | swp_entry_t ent; | ||
2089 | |||
2090 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
2091 | |||
2092 | memcg = page->mem_cgroup; | ||
2093 | if (memcg) { | ||
2094 | if (!css_tryget_online(&memcg->css)) | ||
2095 | memcg = NULL; | ||
2096 | } else if (PageSwapCache(page)) { | ||
2097 | ent.val = page_private(page); | ||
2098 | id = lookup_swap_cgroup_id(ent); | ||
2099 | rcu_read_lock(); | ||
2100 | memcg = mem_cgroup_from_id(id); | ||
2101 | if (memcg && !css_tryget_online(&memcg->css)) | ||
2102 | memcg = NULL; | ||
2103 | rcu_read_unlock(); | ||
2104 | } | ||
2105 | return memcg; | ||
2106 | } | ||
2107 | |||
2108 | static void lock_page_lru(struct page *page, int *isolated) | 2102 | static void lock_page_lru(struct page *page, int *isolated) |
2109 | { | 2103 | { |
2110 | struct zone *zone = page_zone(page); | 2104 | struct zone *zone = page_zone(page); |
@@ -5301,8 +5295,20 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | |||
5301 | * the page lock, which serializes swap cache removal, which | 5295 | * the page lock, which serializes swap cache removal, which |
5302 | * in turn serializes uncharging. | 5296 | * in turn serializes uncharging. |
5303 | */ | 5297 | */ |
5298 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
5304 | if (page->mem_cgroup) | 5299 | if (page->mem_cgroup) |
5305 | goto out; | 5300 | goto out; |
5301 | |||
5302 | if (do_swap_account) { | ||
5303 | swp_entry_t ent = { .val = page_private(page), }; | ||
5304 | unsigned short id = lookup_swap_cgroup_id(ent); | ||
5305 | |||
5306 | rcu_read_lock(); | ||
5307 | memcg = mem_cgroup_from_id(id); | ||
5308 | if (memcg && !css_tryget_online(&memcg->css)) | ||
5309 | memcg = NULL; | ||
5310 | rcu_read_unlock(); | ||
5311 | } | ||
5306 | } | 5312 | } |
5307 | 5313 | ||
5308 | if (PageTransHuge(page)) { | 5314 | if (PageTransHuge(page)) { |
@@ -5310,8 +5316,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | |||
5310 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | 5316 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
5311 | } | 5317 | } |
5312 | 5318 | ||
5313 | if (do_swap_account && PageSwapCache(page)) | ||
5314 | memcg = try_get_mem_cgroup_from_page(page); | ||
5315 | if (!memcg) | 5319 | if (!memcg) |
5316 | memcg = get_mem_cgroup_from_mm(mm); | 5320 | memcg = get_mem_cgroup_from_mm(mm); |
5317 | 5321 | ||
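page_cgroup_ino() is deliberately tolerant of the cgroup disappearing underneath it, so it suits interfaces that only report a number (the consumer added elsewhere in this series is believed to be the new /proc/kpagecgroup file in fs/proc/page.c, which is outside the mm/ diffstat). A minimal hedged sketch of such a caller, using only the function added above:

	/* Hedged sketch: report the memcg inode a pfn's page is charged to,
	 * tolerating the documented raciness (0 means not charged or gone). */
	static u64 memcg_ino_of_pfn(unsigned long pfn)
	{
		struct page *page = pfn_valid(pfn) ? pfn_to_page(pfn) : NULL;

		return page ? page_cgroup_ino(page) : 0;
	}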
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index eeda6485e76c..95882692e747 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -130,27 +130,15 @@ static int hwpoison_filter_flags(struct page *p) | |||
130 | * can only guarantee that the page either belongs to the memcg tasks, or is | 130 | * can only guarantee that the page either belongs to the memcg tasks, or is |
131 | * a freed page. | 131 | * a freed page. |
132 | */ | 132 | */ |
133 | #ifdef CONFIG_MEMCG_SWAP | 133 | #ifdef CONFIG_MEMCG |
134 | u64 hwpoison_filter_memcg; | 134 | u64 hwpoison_filter_memcg; |
135 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); | 135 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); |
136 | static int hwpoison_filter_task(struct page *p) | 136 | static int hwpoison_filter_task(struct page *p) |
137 | { | 137 | { |
138 | struct mem_cgroup *mem; | ||
139 | struct cgroup_subsys_state *css; | ||
140 | unsigned long ino; | ||
141 | |||
142 | if (!hwpoison_filter_memcg) | 138 | if (!hwpoison_filter_memcg) |
143 | return 0; | 139 | return 0; |
144 | 140 | ||
145 | mem = try_get_mem_cgroup_from_page(p); | 141 | if (page_cgroup_ino(p) != hwpoison_filter_memcg) |
146 | if (!mem) | ||
147 | return -EINVAL; | ||
148 | |||
149 | css = &mem->css; | ||
150 | ino = cgroup_ino(css->cgroup); | ||
151 | css_put(css); | ||
152 | |||
153 | if (ino != hwpoison_filter_memcg) | ||
154 | return -EINVAL; | 142 | return -EINVAL; |
155 | 143 | ||
156 | return 0; | 144 | return 0; |
diff --git a/mm/memory.c b/mm/memory.c
index 6cd0b2160401..9cb27470fee9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3233,7 +3233,7 @@ out: | |||
3233 | static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | 3233 | static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, |
3234 | unsigned long address, pmd_t *pmd, unsigned int flags) | 3234 | unsigned long address, pmd_t *pmd, unsigned int flags) |
3235 | { | 3235 | { |
3236 | if (!vma->vm_ops) | 3236 | if (vma_is_anonymous(vma)) |
3237 | return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); | 3237 | return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); |
3238 | if (vma->vm_ops->pmd_fault) | 3238 | if (vma->vm_ops->pmd_fault) |
3239 | return vma->vm_ops->pmd_fault(vma, address, pmd, flags); | 3239 | return vma->vm_ops->pmd_fault(vma, address, pmd, flags); |
@@ -3244,7 +3244,7 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3244 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd, | 3244 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd, |
3245 | unsigned int flags) | 3245 | unsigned int flags) |
3246 | { | 3246 | { |
3247 | if (!vma->vm_ops) | 3247 | if (vma_is_anonymous(vma)) |
3248 | return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); | 3248 | return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); |
3249 | if (vma->vm_ops->pmd_fault) | 3249 | if (vma->vm_ops->pmd_fault) |
3250 | return vma->vm_ops->pmd_fault(vma, address, pmd, flags); | 3250 | return vma->vm_ops->pmd_fault(vma, address, pmd, flags); |
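create_huge_pmd() and wp_huge_pmd() now spell the anonymous-VMA test through vma_is_anonymous(), a helper introduced in include/linux/mm.h by this series and therefore not visible in the mm/-limited diff. It is presumably just a named form of the old check:

	/* Assumed definition from include/linux/mm.h (not shown here): an
	 * anonymous VMA is one without vm_ops, which is also why mmap_region()
	 * in the mm/mmap.c hunks below now gives every file mapping a
	 * non-NULL, possibly empty, vm_ops. */
	static inline bool vma_is_anonymous(struct vm_area_struct *vma)
	{
		return !vma->vm_ops;
	}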
diff --git a/mm/migrate.c b/mm/migrate.c
index 02ce25df16c2..c3cb566af3e2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/gfp.h> | 37 | #include <linux/gfp.h> |
38 | #include <linux/balloon_compaction.h> | 38 | #include <linux/balloon_compaction.h> |
39 | #include <linux/mmu_notifier.h> | 39 | #include <linux/mmu_notifier.h> |
40 | #include <linux/page_idle.h> | ||
40 | 41 | ||
41 | #include <asm/tlbflush.h> | 42 | #include <asm/tlbflush.h> |
42 | 43 | ||
@@ -524,6 +525,11 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
524 | __set_page_dirty_nobuffers(newpage); | 525 | __set_page_dirty_nobuffers(newpage); |
525 | } | 526 | } |
526 | 527 | ||
528 | if (page_is_young(page)) | ||
529 | set_page_young(newpage); | ||
530 | if (page_is_idle(page)) | ||
531 | set_page_idle(newpage); | ||
532 | |||
527 | /* | 533 | /* |
528 | * Copy NUMA information to the new page, to prevent over-eager | 534 | * Copy NUMA information to the new page, to prevent over-eager |
529 | * future migrations of this same page. | 535 | * future migrations of this same page. |
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,6 +612,8 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
612 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | 612 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, |
613 | struct rb_node **rb_link, struct rb_node *rb_parent) | 613 | struct rb_node **rb_link, struct rb_node *rb_parent) |
614 | { | 614 | { |
615 | WARN_ONCE(vma->vm_file && !vma->vm_ops, "missing vma->vm_ops"); | ||
616 | |||
615 | /* Update tracking information for the gap following the new vma. */ | 617 | /* Update tracking information for the gap following the new vma. */ |
616 | if (vma->vm_next) | 618 | if (vma->vm_next) |
617 | vma_gap_update(vma->vm_next); | 619 | vma_gap_update(vma->vm_next); |
@@ -1260,14 +1262,12 @@ static inline int mlock_future_check(struct mm_struct *mm, | |||
1260 | /* | 1262 | /* |
1261 | * The caller must hold down_write(¤t->mm->mmap_sem). | 1263 | * The caller must hold down_write(¤t->mm->mmap_sem). |
1262 | */ | 1264 | */ |
1263 | 1265 | unsigned long do_mmap(struct file *file, unsigned long addr, | |
1264 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | ||
1265 | unsigned long len, unsigned long prot, | 1266 | unsigned long len, unsigned long prot, |
1266 | unsigned long flags, unsigned long pgoff, | 1267 | unsigned long flags, vm_flags_t vm_flags, |
1267 | unsigned long *populate) | 1268 | unsigned long pgoff, unsigned long *populate) |
1268 | { | 1269 | { |
1269 | struct mm_struct *mm = current->mm; | 1270 | struct mm_struct *mm = current->mm; |
1270 | vm_flags_t vm_flags; | ||
1271 | 1271 | ||
1272 | *populate = 0; | 1272 | *populate = 0; |
1273 | 1273 | ||
@@ -1311,7 +1311,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1311 | * to. we assume access permissions have been handled by the open | 1311 | * to. we assume access permissions have been handled by the open |
1312 | * of the memory object, so we don't do any here. | 1312 | * of the memory object, so we don't do any here. |
1313 | */ | 1313 | */ |
1314 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | | 1314 | vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | |
1315 | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; | 1315 | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; |
1316 | 1316 | ||
1317 | if (flags & MAP_LOCKED) | 1317 | if (flags & MAP_LOCKED) |
@@ -1638,6 +1638,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, | |||
1638 | */ | 1638 | */ |
1639 | WARN_ON_ONCE(addr != vma->vm_start); | 1639 | WARN_ON_ONCE(addr != vma->vm_start); |
1640 | 1640 | ||
1641 | /* All file mapping must have ->vm_ops set */ | ||
1642 | if (!vma->vm_ops) { | ||
1643 | static const struct vm_operations_struct dummy_ops = {}; | ||
1644 | vma->vm_ops = &dummy_ops; | ||
1645 | } | ||
1646 | |||
1641 | addr = vma->vm_start; | 1647 | addr = vma->vm_start; |
1642 | vm_flags = vma->vm_flags; | 1648 | vm_flags = vma->vm_flags; |
1643 | } else if (vm_flags & VM_SHARED) { | 1649 | } else if (vm_flags & VM_SHARED) { |
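do_mmap_pgoff() gains a vm_flags argument by being renamed to do_mmap(); existing callers are kept working by a compatibility wrapper in include/linux/mm.h, which is outside this diffstat. A sketch of the assumed wrapper:

	/* Assumed wrapper from include/linux/mm.h (not part of the mm/-limited
	 * diff): legacy callers pass no extra vm_flags. */
	static inline unsigned long do_mmap_pgoff(struct file *file,
		unsigned long addr, unsigned long len, unsigned long prot,
		unsigned long flags, unsigned long pgoff, unsigned long *populate)
	{
		return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate);
	}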
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 3b9b3d0741b2..5fbdd367bbed 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -123,6 +123,23 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | |||
123 | return young; | 123 | return young; |
124 | } | 124 | } |
125 | 125 | ||
126 | int __mmu_notifier_clear_young(struct mm_struct *mm, | ||
127 | unsigned long start, | ||
128 | unsigned long end) | ||
129 | { | ||
130 | struct mmu_notifier *mn; | ||
131 | int young = 0, id; | ||
132 | |||
133 | id = srcu_read_lock(&srcu); | ||
134 | hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { | ||
135 | if (mn->ops->clear_young) | ||
136 | young |= mn->ops->clear_young(mn, mm, start, end); | ||
137 | } | ||
138 | srcu_read_unlock(&srcu, id); | ||
139 | |||
140 | return young; | ||
141 | } | ||
142 | |||
126 | int __mmu_notifier_test_young(struct mm_struct *mm, | 143 | int __mmu_notifier_test_young(struct mm_struct *mm, |
127 | unsigned long address) | 144 | unsigned long address) |
128 | { | 145 | { |
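__mmu_notifier_clear_young() mirrors __mmu_notifier_clear_flush_young() but lets secondary MMUs clear the accessed bit without a TLB flush, which is what idle page tracking wants. The ptep_clear_young_notify()/pmdp_clear_young_notify() helpers used by mm/page_idle.c presumably reach it through an inline wrapper in include/linux/mmu_notifier.h along these lines:

	/* Assumed inline wrapper from include/linux/mmu_notifier.h (not shown
	 * in this mm/-limited diff). */
	static inline int mmu_notifier_clear_young(struct mm_struct *mm,
						   unsigned long start,
						   unsigned long end)
	{
		if (mm_has_notifiers(mm))
			return __mmu_notifier_clear_young(mm, start, end);
		return 0;
	}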
diff --git a/mm/nommu.c b/mm/nommu.c
index 1cc0709fcaa5..ab14a2014dea 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1233,18 +1233,19 @@ enomem: | |||
1233 | /* | 1233 | /* |
1234 | * handle mapping creation for uClinux | 1234 | * handle mapping creation for uClinux |
1235 | */ | 1235 | */ |
1236 | unsigned long do_mmap_pgoff(struct file *file, | 1236 | unsigned long do_mmap(struct file *file, |
1237 | unsigned long addr, | 1237 | unsigned long addr, |
1238 | unsigned long len, | 1238 | unsigned long len, |
1239 | unsigned long prot, | 1239 | unsigned long prot, |
1240 | unsigned long flags, | 1240 | unsigned long flags, |
1241 | unsigned long pgoff, | 1241 | vm_flags_t vm_flags, |
1242 | unsigned long *populate) | 1242 | unsigned long pgoff, |
1243 | unsigned long *populate) | ||
1243 | { | 1244 | { |
1244 | struct vm_area_struct *vma; | 1245 | struct vm_area_struct *vma; |
1245 | struct vm_region *region; | 1246 | struct vm_region *region; |
1246 | struct rb_node *rb; | 1247 | struct rb_node *rb; |
1247 | unsigned long capabilities, vm_flags, result; | 1248 | unsigned long capabilities, result; |
1248 | int ret; | 1249 | int ret; |
1249 | 1250 | ||
1250 | *populate = 0; | 1251 | *populate = 0; |
@@ -1262,7 +1263,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1262 | 1263 | ||
1263 | /* we've determined that we can make the mapping, now translate what we | 1264 | /* we've determined that we can make the mapping, now translate what we |
1264 | * now know into VMA flags */ | 1265 | * now know into VMA flags */ |
1265 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); | 1266 | vm_flags |= determine_vm_flags(file, prot, flags, capabilities); |
1266 | 1267 | ||
1267 | /* we're going to need to record the mapping */ | 1268 | /* we're going to need to record the mapping */ |
1268 | region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); | 1269 | region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); |
diff --git a/mm/page_ext.c b/mm/page_ext.c
index d86fd2f5353f..292ca7b8debd 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/vmalloc.h> | 6 | #include <linux/vmalloc.h> |
7 | #include <linux/kmemleak.h> | 7 | #include <linux/kmemleak.h> |
8 | #include <linux/page_owner.h> | 8 | #include <linux/page_owner.h> |
9 | #include <linux/page_idle.h> | ||
9 | 10 | ||
10 | /* | 11 | /* |
11 | * struct page extension | 12 | * struct page extension |
@@ -59,6 +60,9 @@ static struct page_ext_operations *page_ext_ops[] = { | |||
59 | #ifdef CONFIG_PAGE_OWNER | 60 | #ifdef CONFIG_PAGE_OWNER |
60 | &page_owner_ops, | 61 | &page_owner_ops, |
61 | #endif | 62 | #endif |
63 | #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) | ||
64 | &page_idle_ops, | ||
65 | #endif | ||
62 | }; | 66 | }; |
63 | 67 | ||
64 | static unsigned long total_usage; | 68 | static unsigned long total_usage; |
diff --git a/mm/page_idle.c b/mm/page_idle.c
new file mode 100644
index 000000000000..d5dd79041484
--- /dev/null
+++ b/mm/page_idle.c
@@ -0,0 +1,232 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/bootmem.h> | ||
3 | #include <linux/fs.h> | ||
4 | #include <linux/sysfs.h> | ||
5 | #include <linux/kobject.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/mmzone.h> | ||
8 | #include <linux/pagemap.h> | ||
9 | #include <linux/rmap.h> | ||
10 | #include <linux/mmu_notifier.h> | ||
11 | #include <linux/page_ext.h> | ||
12 | #include <linux/page_idle.h> | ||
13 | |||
14 | #define BITMAP_CHUNK_SIZE sizeof(u64) | ||
15 | #define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE) | ||
16 | |||
17 | /* | ||
18 | * Idle page tracking only considers user memory pages, for other types of | ||
19 | * pages the idle flag is always unset and an attempt to set it is silently | ||
20 | * ignored. | ||
21 | * | ||
22 | * We treat a page as a user memory page if it is on an LRU list, because it is | ||
23 | * always safe to pass such a page to rmap_walk(), which is essential for idle | ||
24 | * page tracking. With such an indicator of user pages we can skip isolated | ||
25 | * pages, but since there are not usually many of them, it will hardly affect | ||
26 | * the overall result. | ||
27 | * | ||
28 | * This function tries to get a user memory page by pfn as described above. | ||
29 | */ | ||
30 | static struct page *page_idle_get_page(unsigned long pfn) | ||
31 | { | ||
32 | struct page *page; | ||
33 | struct zone *zone; | ||
34 | |||
35 | if (!pfn_valid(pfn)) | ||
36 | return NULL; | ||
37 | |||
38 | page = pfn_to_page(pfn); | ||
39 | if (!page || !PageLRU(page) || | ||
40 | !get_page_unless_zero(page)) | ||
41 | return NULL; | ||
42 | |||
43 | zone = page_zone(page); | ||
44 | spin_lock_irq(&zone->lru_lock); | ||
45 | if (unlikely(!PageLRU(page))) { | ||
46 | put_page(page); | ||
47 | page = NULL; | ||
48 | } | ||
49 | spin_unlock_irq(&zone->lru_lock); | ||
50 | return page; | ||
51 | } | ||
52 | |||
53 | static int page_idle_clear_pte_refs_one(struct page *page, | ||
54 | struct vm_area_struct *vma, | ||
55 | unsigned long addr, void *arg) | ||
56 | { | ||
57 | struct mm_struct *mm = vma->vm_mm; | ||
58 | spinlock_t *ptl; | ||
59 | pmd_t *pmd; | ||
60 | pte_t *pte; | ||
61 | bool referenced = false; | ||
62 | |||
63 | if (unlikely(PageTransHuge(page))) { | ||
64 | pmd = page_check_address_pmd(page, mm, addr, | ||
65 | PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); | ||
66 | if (pmd) { | ||
67 | referenced = pmdp_clear_young_notify(vma, addr, pmd); | ||
68 | spin_unlock(ptl); | ||
69 | } | ||
70 | } else { | ||
71 | pte = page_check_address(page, mm, addr, &ptl, 0); | ||
72 | if (pte) { | ||
73 | referenced = ptep_clear_young_notify(vma, addr, pte); | ||
74 | pte_unmap_unlock(pte, ptl); | ||
75 | } | ||
76 | } | ||
77 | if (referenced) { | ||
78 | clear_page_idle(page); | ||
79 | /* | ||
80 | * We cleared the referenced bit in a mapping to this page. To | ||
81 | * avoid interference with page reclaim, mark it young so that | ||
82 | * page_referenced() will return > 0. | ||
83 | */ | ||
84 | set_page_young(page); | ||
85 | } | ||
86 | return SWAP_AGAIN; | ||
87 | } | ||
88 | |||
89 | static void page_idle_clear_pte_refs(struct page *page) | ||
90 | { | ||
91 | /* | ||
92 | * Since rwc.arg is unused, rwc is effectively immutable, so we | ||
93 | * can make it static const to save some cycles and stack. | ||
94 | */ | ||
95 | static const struct rmap_walk_control rwc = { | ||
96 | .rmap_one = page_idle_clear_pte_refs_one, | ||
97 | .anon_lock = page_lock_anon_vma_read, | ||
98 | }; | ||
99 | bool need_lock; | ||
100 | |||
101 | if (!page_mapped(page) || | ||
102 | !page_rmapping(page)) | ||
103 | return; | ||
104 | |||
105 | need_lock = !PageAnon(page) || PageKsm(page); | ||
106 | if (need_lock && !trylock_page(page)) | ||
107 | return; | ||
108 | |||
109 | rmap_walk(page, (struct rmap_walk_control *)&rwc); | ||
110 | |||
111 | if (need_lock) | ||
112 | unlock_page(page); | ||
113 | } | ||
114 | |||
115 | static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, | ||
116 | struct bin_attribute *attr, char *buf, | ||
117 | loff_t pos, size_t count) | ||
118 | { | ||
119 | u64 *out = (u64 *)buf; | ||
120 | struct page *page; | ||
121 | unsigned long pfn, end_pfn; | ||
122 | int bit; | ||
123 | |||
124 | if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE) | ||
125 | return -EINVAL; | ||
126 | |||
127 | pfn = pos * BITS_PER_BYTE; | ||
128 | if (pfn >= max_pfn) | ||
129 | return 0; | ||
130 | |||
131 | end_pfn = pfn + count * BITS_PER_BYTE; | ||
132 | if (end_pfn > max_pfn) | ||
133 | end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS); | ||
134 | |||
135 | for (; pfn < end_pfn; pfn++) { | ||
136 | bit = pfn % BITMAP_CHUNK_BITS; | ||
137 | if (!bit) | ||
138 | *out = 0ULL; | ||
139 | page = page_idle_get_page(pfn); | ||
140 | if (page) { | ||
141 | if (page_is_idle(page)) { | ||
142 | /* | ||
143 | * The page might have been referenced via a | ||
144 | * pte, in which case it is not idle. Clear | ||
145 | * refs and recheck. | ||
146 | */ | ||
147 | page_idle_clear_pte_refs(page); | ||
148 | if (page_is_idle(page)) | ||
149 | *out |= 1ULL << bit; | ||
150 | } | ||
151 | put_page(page); | ||
152 | } | ||
153 | if (bit == BITMAP_CHUNK_BITS - 1) | ||
154 | out++; | ||
155 | cond_resched(); | ||
156 | } | ||
157 | return (char *)out - buf; | ||
158 | } | ||
159 | |||
160 | static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, | ||
161 | struct bin_attribute *attr, char *buf, | ||
162 | loff_t pos, size_t count) | ||
163 | { | ||
164 | const u64 *in = (u64 *)buf; | ||
165 | struct page *page; | ||
166 | unsigned long pfn, end_pfn; | ||
167 | int bit; | ||
168 | |||
169 | if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE) | ||
170 | return -EINVAL; | ||
171 | |||
172 | pfn = pos * BITS_PER_BYTE; | ||
173 | if (pfn >= max_pfn) | ||
174 | return -ENXIO; | ||
175 | |||
176 | end_pfn = pfn + count * BITS_PER_BYTE; | ||
177 | if (end_pfn > max_pfn) | ||
178 | end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS); | ||
179 | |||
180 | for (; pfn < end_pfn; pfn++) { | ||
181 | bit = pfn % BITMAP_CHUNK_BITS; | ||
182 | if ((*in >> bit) & 1) { | ||
183 | page = page_idle_get_page(pfn); | ||
184 | if (page) { | ||
185 | page_idle_clear_pte_refs(page); | ||
186 | set_page_idle(page); | ||
187 | put_page(page); | ||
188 | } | ||
189 | } | ||
190 | if (bit == BITMAP_CHUNK_BITS - 1) | ||
191 | in++; | ||
192 | cond_resched(); | ||
193 | } | ||
194 | return (char *)in - buf; | ||
195 | } | ||
196 | |||
197 | static struct bin_attribute page_idle_bitmap_attr = | ||
198 | __BIN_ATTR(bitmap, S_IRUSR | S_IWUSR, | ||
199 | page_idle_bitmap_read, page_idle_bitmap_write, 0); | ||
200 | |||
201 | static struct bin_attribute *page_idle_bin_attrs[] = { | ||
202 | &page_idle_bitmap_attr, | ||
203 | NULL, | ||
204 | }; | ||
205 | |||
206 | static struct attribute_group page_idle_attr_group = { | ||
207 | .bin_attrs = page_idle_bin_attrs, | ||
208 | .name = "page_idle", | ||
209 | }; | ||
210 | |||
211 | #ifndef CONFIG_64BIT | ||
212 | static bool need_page_idle(void) | ||
213 | { | ||
214 | return true; | ||
215 | } | ||
216 | struct page_ext_operations page_idle_ops = { | ||
217 | .need = need_page_idle, | ||
218 | }; | ||
219 | #endif | ||
220 | |||
221 | static int __init page_idle_init(void) | ||
222 | { | ||
223 | int err; | ||
224 | |||
225 | err = sysfs_create_group(mm_kobj, &page_idle_attr_group); | ||
226 | if (err) { | ||
227 | pr_err("page_idle: register sysfs failed\n"); | ||
228 | return err; | ||
229 | } | ||
230 | return 0; | ||
231 | } | ||
232 | subsys_initcall(page_idle_init); | ||
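The bin attribute registered above exposes one bit per PFN, read and written in 64-bit chunks, so the file should appear as /sys/kernel/mm/page_idle/bitmap (mm_kobj plus the "page_idle" group name). A minimal hedged userspace sketch of the intended set-then-recheck cycle, assuming that path and root privileges:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
		uint64_t chunk = ~0ULL;		/* one bit per pfn: pfns 0..63 */

		if (fd < 0)
			return 1;
		pwrite(fd, &chunk, sizeof(chunk), 0);	/* mark the pages idle */
		/* ... let the workload run for the sampling period ... */
		pread(fd, &chunk, sizeof(chunk), 0);	/* set bits = still idle */
		printf("idle mask for pfns 0-63: %#llx\n",
		       (unsigned long long)chunk);
		close(fd);
		return 0;
	}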
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -59,6 +59,7 @@
59 | #include <linux/migrate.h> | 59 | #include <linux/migrate.h> |
60 | #include <linux/hugetlb.h> | 60 | #include <linux/hugetlb.h> |
61 | #include <linux/backing-dev.h> | 61 | #include <linux/backing-dev.h> |
62 | #include <linux/page_idle.h> | ||
62 | 63 | ||
63 | #include <asm/tlbflush.h> | 64 | #include <asm/tlbflush.h> |
64 | 65 | ||
@@ -886,6 +887,11 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
886 | pte_unmap_unlock(pte, ptl); | 887 | pte_unmap_unlock(pte, ptl); |
887 | } | 888 | } |
888 | 889 | ||
890 | if (referenced) | ||
891 | clear_page_idle(page); | ||
892 | if (test_and_clear_page_young(page)) | ||
893 | referenced++; | ||
894 | |||
889 | if (referenced) { | 895 | if (referenced) { |
890 | pra->referenced++; | 896 | pra->referenced++; |
891 | pra->vm_flags |= vma->vm_flags; | 897 | pra->vm_flags |= vma->vm_flags; |
diff --git a/mm/swap.c b/mm/swap.c
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -32,6 +32,7 @@
32 | #include <linux/gfp.h> | 32 | #include <linux/gfp.h> |
33 | #include <linux/uio.h> | 33 | #include <linux/uio.h> |
34 | #include <linux/hugetlb.h> | 34 | #include <linux/hugetlb.h> |
35 | #include <linux/page_idle.h> | ||
35 | 36 | ||
36 | #include "internal.h" | 37 | #include "internal.h" |
37 | 38 | ||
@@ -622,6 +623,8 @@ void mark_page_accessed(struct page *page) | |||
622 | } else if (!PageReferenced(page)) { | 623 | } else if (!PageReferenced(page)) { |
623 | SetPageReferenced(page); | 624 | SetPageReferenced(page); |
624 | } | 625 | } |
626 | if (page_is_idle(page)) | ||
627 | clear_page_idle(page); | ||
625 | } | 628 | } |
626 | EXPORT_SYMBOL(mark_page_accessed); | 629 | EXPORT_SYMBOL(mark_page_accessed); |
627 | 630 | ||
diff --git a/mm/zpool.c b/mm/zpool.c
index 68d2dd8ed2d8..8f670d3e8706 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -100,6 +100,39 @@ static void zpool_put_driver(struct zpool_driver *driver) | |||
100 | } | 100 | } |
101 | 101 | ||
102 | /** | 102 | /** |
103 | * zpool_has_pool() - Check if the pool driver is available | ||
104 | * @type The type of the zpool to check (e.g. zbud, zsmalloc) | ||
105 | * | ||
106 | * This checks if the @type pool driver is available. This will try to load | ||
107 | * the requested module, if needed, but there is no guarantee the module will | ||
108 | * still be loaded and available immediately after calling. If this returns | ||
109 | * true, the caller should assume the pool is available, but must be prepared | ||
110 | * to handle the @zpool_create_pool() returning failure. However if this | ||
111 | * returns false, the caller should assume the requested pool type is not | ||
112 | * available; either the requested pool type module does not exist, or could | ||
113 | * not be loaded, and calling @zpool_create_pool() with the pool type will | ||
114 | * fail. | ||
115 | * | ||
116 | * Returns: true if @type pool is available, false if not | ||
117 | */ | ||
118 | bool zpool_has_pool(char *type) | ||
119 | { | ||
120 | struct zpool_driver *driver = zpool_get_driver(type); | ||
121 | |||
122 | if (!driver) { | ||
123 | request_module("zpool-%s", type); | ||
124 | driver = zpool_get_driver(type); | ||
125 | } | ||
126 | |||
127 | if (!driver) | ||
128 | return false; | ||
129 | |||
130 | zpool_put_driver(driver); | ||
131 | return true; | ||
132 | } | ||
133 | EXPORT_SYMBOL(zpool_has_pool); | ||
134 | |||
135 | /** | ||
103 | * zpool_create_pool() - Create a new zpool | 136 | * zpool_create_pool() - Create a new zpool |
104 | * @type The type of the zpool to create (e.g. zbud, zsmalloc) | 137 | * @type The type of the zpool to create (e.g. zbud, zsmalloc) |
105 | * @name The name of the zpool (e.g. zram0, zswap) | 138 | * @name The name of the zpool (e.g. zram0, zswap) |
diff --git a/mm/zswap.c b/mm/zswap.c
index 48a1d081e2a5..4043df7c672f 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -80,85 +80,54 @@ static u64 zswap_duplicate_entry; | |||
80 | static bool zswap_enabled; | 80 | static bool zswap_enabled; |
81 | module_param_named(enabled, zswap_enabled, bool, 0644); | 81 | module_param_named(enabled, zswap_enabled, bool, 0644); |
82 | 82 | ||
83 | /* Compressor to be used by zswap (fixed at boot for now) */ | 83 | /* Crypto compressor to use */ |
84 | #define ZSWAP_COMPRESSOR_DEFAULT "lzo" | 84 | #define ZSWAP_COMPRESSOR_DEFAULT "lzo" |
85 | static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; | 85 | static char zswap_compressor[CRYPTO_MAX_ALG_NAME] = ZSWAP_COMPRESSOR_DEFAULT; |
86 | module_param_named(compressor, zswap_compressor, charp, 0444); | 86 | static struct kparam_string zswap_compressor_kparam = { |
87 | 87 | .string = zswap_compressor, | |
88 | /* The maximum percentage of memory that the compressed pool can occupy */ | 88 | .maxlen = sizeof(zswap_compressor), |
89 | static unsigned int zswap_max_pool_percent = 20; | 89 | }; |
90 | module_param_named(max_pool_percent, | 90 | static int zswap_compressor_param_set(const char *, |
91 | zswap_max_pool_percent, uint, 0644); | 91 | const struct kernel_param *); |
92 | static struct kernel_param_ops zswap_compressor_param_ops = { | ||
93 | .set = zswap_compressor_param_set, | ||
94 | .get = param_get_string, | ||
95 | }; | ||
96 | module_param_cb(compressor, &zswap_compressor_param_ops, | ||
97 | &zswap_compressor_kparam, 0644); | ||
92 | 98 | ||
93 | /* Compressed storage to use */ | 99 | /* Compressed storage zpool to use */ |
94 | #define ZSWAP_ZPOOL_DEFAULT "zbud" | 100 | #define ZSWAP_ZPOOL_DEFAULT "zbud" |
95 | static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; | 101 | static char zswap_zpool_type[32 /* arbitrary */] = ZSWAP_ZPOOL_DEFAULT; |
96 | module_param_named(zpool, zswap_zpool_type, charp, 0444); | 102 | static struct kparam_string zswap_zpool_kparam = { |
103 | .string = zswap_zpool_type, | ||
104 | .maxlen = sizeof(zswap_zpool_type), | ||
105 | }; | ||
106 | static int zswap_zpool_param_set(const char *, const struct kernel_param *); | ||
107 | static struct kernel_param_ops zswap_zpool_param_ops = { | ||
108 | .set = zswap_zpool_param_set, | ||
109 | .get = param_get_string, | ||
110 | }; | ||
111 | module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_kparam, 0644); | ||
97 | 112 | ||
98 | /* zpool is shared by all of zswap backend */ | 113 | /* The maximum percentage of memory that the compressed pool can occupy */ |
99 | static struct zpool *zswap_pool; | 114 | static unsigned int zswap_max_pool_percent = 20; |
115 | module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); | ||
100 | 116 | ||
101 | /********************************* | 117 | /********************************* |
102 | * compression functions | 118 | * data structures |
103 | **********************************/ | 119 | **********************************/ |
104 | /* per-cpu compression transforms */ | ||
105 | static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms; | ||
106 | 120 | ||
107 | enum comp_op { | 121 | struct zswap_pool { |
108 | ZSWAP_COMPOP_COMPRESS, | 122 | struct zpool *zpool; |
109 | ZSWAP_COMPOP_DECOMPRESS | 123 | struct crypto_comp * __percpu *tfm; |
124 | struct kref kref; | ||
125 | struct list_head list; | ||
126 | struct rcu_head rcu_head; | ||
127 | struct notifier_block notifier; | ||
128 | char tfm_name[CRYPTO_MAX_ALG_NAME]; | ||
110 | }; | 129 | }; |
111 | 130 | ||
112 | static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen, | ||
113 | u8 *dst, unsigned int *dlen) | ||
114 | { | ||
115 | struct crypto_comp *tfm; | ||
116 | int ret; | ||
117 | |||
118 | tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu()); | ||
119 | switch (op) { | ||
120 | case ZSWAP_COMPOP_COMPRESS: | ||
121 | ret = crypto_comp_compress(tfm, src, slen, dst, dlen); | ||
122 | break; | ||
123 | case ZSWAP_COMPOP_DECOMPRESS: | ||
124 | ret = crypto_comp_decompress(tfm, src, slen, dst, dlen); | ||
125 | break; | ||
126 | default: | ||
127 | ret = -EINVAL; | ||
128 | } | ||
129 | |||
130 | put_cpu(); | ||
131 | return ret; | ||
132 | } | ||
133 | |||
134 | static int __init zswap_comp_init(void) | ||
135 | { | ||
136 | if (!crypto_has_comp(zswap_compressor, 0, 0)) { | ||
137 | pr_info("%s compressor not available\n", zswap_compressor); | ||
138 | /* fall back to default compressor */ | ||
139 | zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; | ||
140 | if (!crypto_has_comp(zswap_compressor, 0, 0)) | ||
141 | /* can't even load the default compressor */ | ||
142 | return -ENODEV; | ||
143 | } | ||
144 | pr_info("using %s compressor\n", zswap_compressor); | ||
145 | |||
146 | /* alloc percpu transforms */ | ||
147 | zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *); | ||
148 | if (!zswap_comp_pcpu_tfms) | ||
149 | return -ENOMEM; | ||
150 | return 0; | ||
151 | } | ||
152 | |||
153 | static void __init zswap_comp_exit(void) | ||
154 | { | ||
155 | /* free percpu transforms */ | ||
156 | free_percpu(zswap_comp_pcpu_tfms); | ||
157 | } | ||
158 | |||
159 | /********************************* | ||
160 | * data structures | ||
161 | **********************************/ | ||
162 | /* | 131 | /* |
163 | * struct zswap_entry | 132 | * struct zswap_entry |
164 | * | 133 | * |
@@ -166,22 +135,24 @@ static void __init zswap_comp_exit(void) | |||
166 | * page within zswap. | 135 | * page within zswap. |
167 | * | 136 | * |
168 | * rbnode - links the entry into red-black tree for the appropriate swap type | 137 | * rbnode - links the entry into red-black tree for the appropriate swap type |
138 | * offset - the swap offset for the entry. Index into the red-black tree. | ||
169 | * refcount - the number of outstanding reference to the entry. This is needed | 139 | * refcount - the number of outstanding reference to the entry. This is needed |
170 | * to protect against premature freeing of the entry by code | 140 | * to protect against premature freeing of the entry by code |
171 | * concurrent calls to load, invalidate, and writeback. The lock | 141 | * concurrent calls to load, invalidate, and writeback. The lock |
172 | * for the zswap_tree structure that contains the entry must | 142 | * for the zswap_tree structure that contains the entry must |
173 | * be held while changing the refcount. Since the lock must | 143 | * be held while changing the refcount. Since the lock must |
174 | * be held, there is no reason to also make refcount atomic. | 144 | * be held, there is no reason to also make refcount atomic. |
175 | * offset - the swap offset for the entry. Index into the red-black tree. | ||
176 | * handle - zpool allocation handle that stores the compressed page data | ||
177 | * length - the length in bytes of the compressed page data. Needed during | 145 | * length - the length in bytes of the compressed page data. Needed during |
178 | * decompression | 146 | * decompression |
147 | * pool - the zswap_pool the entry's data is in | ||
148 | * handle - zpool allocation handle that stores the compressed page data | ||
179 | */ | 149 | */ |
180 | struct zswap_entry { | 150 | struct zswap_entry { |
181 | struct rb_node rbnode; | 151 | struct rb_node rbnode; |
182 | pgoff_t offset; | 152 | pgoff_t offset; |
183 | int refcount; | 153 | int refcount; |
184 | unsigned int length; | 154 | unsigned int length; |
155 | struct zswap_pool *pool; | ||
185 | unsigned long handle; | 156 | unsigned long handle; |
186 | }; | 157 | }; |
187 | 158 | ||
@@ -201,6 +172,51 @@ struct zswap_tree { | |||
201 | 172 | ||
202 | static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; | 173 | static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; |
203 | 174 | ||
175 | /* RCU-protected iteration */ | ||
176 | static LIST_HEAD(zswap_pools); | ||
177 | /* protects zswap_pools list modification */ | ||
178 | static DEFINE_SPINLOCK(zswap_pools_lock); | ||
179 | |||
180 | /* used by param callback function */ | ||
181 | static bool zswap_init_started; | ||
182 | |||
183 | /********************************* | ||
184 | * helpers and fwd declarations | ||
185 | **********************************/ | ||
186 | |||
187 | #define zswap_pool_debug(msg, p) \ | ||
188 | pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ | ||
189 | zpool_get_type((p)->zpool)) | ||
190 | |||
191 | static int zswap_writeback_entry(struct zpool *pool, unsigned long handle); | ||
192 | static int zswap_pool_get(struct zswap_pool *pool); | ||
193 | static void zswap_pool_put(struct zswap_pool *pool); | ||
194 | |||
195 | static const struct zpool_ops zswap_zpool_ops = { | ||
196 | .evict = zswap_writeback_entry | ||
197 | }; | ||
198 | |||
199 | static bool zswap_is_full(void) | ||
200 | { | ||
201 | return totalram_pages * zswap_max_pool_percent / 100 < | ||
202 | DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); | ||
203 | } | ||
204 | |||
205 | static void zswap_update_total_size(void) | ||
206 | { | ||
207 | struct zswap_pool *pool; | ||
208 | u64 total = 0; | ||
209 | |||
210 | rcu_read_lock(); | ||
211 | |||
212 | list_for_each_entry_rcu(pool, &zswap_pools, list) | ||
213 | total += zpool_get_total_size(pool->zpool); | ||
214 | |||
215 | rcu_read_unlock(); | ||
216 | |||
217 | zswap_pool_total_size = total; | ||
218 | } | ||
219 | |||
204 | /********************************* | 220 | /********************************* |
205 | * zswap entry functions | 221 | * zswap entry functions |
206 | **********************************/ | 222 | **********************************/ |
@@ -294,10 +310,11 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) | |||
294 | */ | 310 | */ |
295 | static void zswap_free_entry(struct zswap_entry *entry) | 311 | static void zswap_free_entry(struct zswap_entry *entry) |
296 | { | 312 | { |
297 | zpool_free(zswap_pool, entry->handle); | 313 | zpool_free(entry->pool->zpool, entry->handle); |
314 | zswap_pool_put(entry->pool); | ||
298 | zswap_entry_cache_free(entry); | 315 | zswap_entry_cache_free(entry); |
299 | atomic_dec(&zswap_stored_pages); | 316 | atomic_dec(&zswap_stored_pages); |
300 | zswap_pool_total_size = zpool_get_total_size(zswap_pool); | 317 | zswap_update_total_size(); |
301 | } | 318 | } |
302 | 319 | ||
303 | /* caller must hold the tree lock */ | 320 | /* caller must hold the tree lock */ |
@@ -339,35 +356,21 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, | |||
339 | **********************************/ | 356 | **********************************/ |
340 | static DEFINE_PER_CPU(u8 *, zswap_dstmem); | 357 | static DEFINE_PER_CPU(u8 *, zswap_dstmem); |
341 | 358 | ||
342 | static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) | 359 | static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu) |
343 | { | 360 | { |
344 | struct crypto_comp *tfm; | ||
345 | u8 *dst; | 361 | u8 *dst; |
346 | 362 | ||
347 | switch (action) { | 363 | switch (action) { |
348 | case CPU_UP_PREPARE: | 364 | case CPU_UP_PREPARE: |
349 | tfm = crypto_alloc_comp(zswap_compressor, 0, 0); | ||
350 | if (IS_ERR(tfm)) { | ||
351 | pr_err("can't allocate compressor transform\n"); | ||
352 | return NOTIFY_BAD; | ||
353 | } | ||
354 | *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; | ||
355 | dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); | 365 | dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); |
356 | if (!dst) { | 366 | if (!dst) { |
357 | pr_err("can't allocate compressor buffer\n"); | 367 | pr_err("can't allocate compressor buffer\n"); |
358 | crypto_free_comp(tfm); | ||
359 | *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; | ||
360 | return NOTIFY_BAD; | 368 | return NOTIFY_BAD; |
361 | } | 369 | } |
362 | per_cpu(zswap_dstmem, cpu) = dst; | 370 | per_cpu(zswap_dstmem, cpu) = dst; |
363 | break; | 371 | break; |
364 | case CPU_DEAD: | 372 | case CPU_DEAD: |
365 | case CPU_UP_CANCELED: | 373 | case CPU_UP_CANCELED: |
366 | tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu); | ||
367 | if (tfm) { | ||
368 | crypto_free_comp(tfm); | ||
369 | *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; | ||
370 | } | ||
371 | dst = per_cpu(zswap_dstmem, cpu); | 374 | dst = per_cpu(zswap_dstmem, cpu); |
372 | kfree(dst); | 375 | kfree(dst); |
373 | per_cpu(zswap_dstmem, cpu) = NULL; | 376 | per_cpu(zswap_dstmem, cpu) = NULL; |
@@ -378,43 +381,398 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) | |||
378 | return NOTIFY_OK; | 381 | return NOTIFY_OK; |
379 | } | 382 | } |
380 | 383 | ||
381 | static int zswap_cpu_notifier(struct notifier_block *nb, | 384 | static int zswap_cpu_dstmem_notifier(struct notifier_block *nb, |
382 | unsigned long action, void *pcpu) | 385 | unsigned long action, void *pcpu) |
383 | { | 386 | { |
384 | unsigned long cpu = (unsigned long)pcpu; | 387 | return __zswap_cpu_dstmem_notifier(action, (unsigned long)pcpu); |
385 | return __zswap_cpu_notifier(action, cpu); | ||
386 | } | 388 | } |
387 | 389 | ||
388 | static struct notifier_block zswap_cpu_notifier_block = { | 390 | static struct notifier_block zswap_dstmem_notifier = { |
389 | .notifier_call = zswap_cpu_notifier | 391 | .notifier_call = zswap_cpu_dstmem_notifier, |
390 | }; | 392 | }; |
391 | 393 | ||
392 | static int __init zswap_cpu_init(void) | 394 | static int __init zswap_cpu_dstmem_init(void) |
395 | { | ||
396 | unsigned long cpu; | ||
397 | |||
398 | cpu_notifier_register_begin(); | ||
399 | for_each_online_cpu(cpu) | ||
400 | if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) == | ||
401 | NOTIFY_BAD) | ||
402 | goto cleanup; | ||
403 | __register_cpu_notifier(&zswap_dstmem_notifier); | ||
404 | cpu_notifier_register_done(); | ||
405 | return 0; | ||
406 | |||
407 | cleanup: | ||
408 | for_each_online_cpu(cpu) | ||
409 | __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu); | ||
410 | cpu_notifier_register_done(); | ||
411 | return -ENOMEM; | ||
412 | } | ||
413 | |||
414 | static void zswap_cpu_dstmem_destroy(void) | ||
415 | { | ||
416 | unsigned long cpu; | ||
417 | |||
418 | cpu_notifier_register_begin(); | ||
419 | for_each_online_cpu(cpu) | ||
420 | __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu); | ||
421 | __unregister_cpu_notifier(&zswap_dstmem_notifier); | ||
422 | cpu_notifier_register_done(); | ||
423 | } | ||
424 | |||
425 | static int __zswap_cpu_comp_notifier(struct zswap_pool *pool, | ||
426 | unsigned long action, unsigned long cpu) | ||
427 | { | ||
428 | struct crypto_comp *tfm; | ||
429 | |||
430 | switch (action) { | ||
431 | case CPU_UP_PREPARE: | ||
432 | if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu))) | ||
433 | break; | ||
434 | tfm = crypto_alloc_comp(pool->tfm_name, 0, 0); | ||
435 | if (IS_ERR_OR_NULL(tfm)) { | ||
436 | pr_err("could not alloc crypto comp %s : %ld\n", | ||
437 | pool->tfm_name, PTR_ERR(tfm)); | ||
438 | return NOTIFY_BAD; | ||
439 | } | ||
440 | *per_cpu_ptr(pool->tfm, cpu) = tfm; | ||
441 | break; | ||
442 | case CPU_DEAD: | ||
443 | case CPU_UP_CANCELED: | ||
444 | tfm = *per_cpu_ptr(pool->tfm, cpu); | ||
445 | if (!IS_ERR_OR_NULL(tfm)) | ||
446 | crypto_free_comp(tfm); | ||
447 | *per_cpu_ptr(pool->tfm, cpu) = NULL; | ||
448 | break; | ||
449 | default: | ||
450 | break; | ||
451 | } | ||
452 | return NOTIFY_OK; | ||
453 | } | ||
454 | |||
455 | static int zswap_cpu_comp_notifier(struct notifier_block *nb, | ||
456 | unsigned long action, void *pcpu) | ||
457 | { | ||
458 | unsigned long cpu = (unsigned long)pcpu; | ||
459 | struct zswap_pool *pool = container_of(nb, typeof(*pool), notifier); | ||
460 | |||
461 | return __zswap_cpu_comp_notifier(pool, action, cpu); | ||
462 | } | ||
463 | |||
464 | static int zswap_cpu_comp_init(struct zswap_pool *pool) | ||
393 | { | 465 | { |
394 | unsigned long cpu; | 466 | unsigned long cpu; |
395 | 467 | ||
468 | memset(&pool->notifier, 0, sizeof(pool->notifier)); | ||
469 | pool->notifier.notifier_call = zswap_cpu_comp_notifier; | ||
470 | |||
396 | cpu_notifier_register_begin(); | 471 | cpu_notifier_register_begin(); |
397 | for_each_online_cpu(cpu) | 472 | for_each_online_cpu(cpu) |
398 | if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) | 473 | if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) == |
474 | NOTIFY_BAD) | ||
399 | goto cleanup; | 475 | goto cleanup; |
400 | __register_cpu_notifier(&zswap_cpu_notifier_block); | 476 | __register_cpu_notifier(&pool->notifier); |
401 | cpu_notifier_register_done(); | 477 | cpu_notifier_register_done(); |
402 | return 0; | 478 | return 0; |
403 | 479 | ||
404 | cleanup: | 480 | cleanup: |
405 | for_each_online_cpu(cpu) | 481 | for_each_online_cpu(cpu) |
406 | __zswap_cpu_notifier(CPU_UP_CANCELED, cpu); | 482 | __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu); |
407 | cpu_notifier_register_done(); | 483 | cpu_notifier_register_done(); |
408 | return -ENOMEM; | 484 | return -ENOMEM; |
409 | } | 485 | } |
410 | 486 | ||
487 | static void zswap_cpu_comp_destroy(struct zswap_pool *pool) | ||
488 | { | ||
489 | unsigned long cpu; | ||
490 | |||
491 | cpu_notifier_register_begin(); | ||
492 | for_each_online_cpu(cpu) | ||
493 | __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu); | ||
494 | __unregister_cpu_notifier(&pool->notifier); | ||
495 | cpu_notifier_register_done(); | ||
496 | } | ||
497 | |||
411 | /********************************* | 498 | /********************************* |
412 | * helpers | 499 | * pool functions |
413 | **********************************/ | 500 | **********************************/ |
414 | static bool zswap_is_full(void) | 501 | |
502 | static struct zswap_pool *__zswap_pool_current(void) | ||
415 | { | 503 | { |
416 | return totalram_pages * zswap_max_pool_percent / 100 < | 504 | struct zswap_pool *pool; |
417 | DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); | 505 | |
506 | pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); | ||
507 | WARN_ON(!pool); | ||
508 | |||
509 | return pool; | ||
510 | } | ||
511 | |||
512 | static struct zswap_pool *zswap_pool_current(void) | ||
513 | { | ||
514 | assert_spin_locked(&zswap_pools_lock); | ||
515 | |||
516 | return __zswap_pool_current(); | ||
517 | } | ||
518 | |||
519 | static struct zswap_pool *zswap_pool_current_get(void) | ||
520 | { | ||
521 | struct zswap_pool *pool; | ||
522 | |||
523 | rcu_read_lock(); | ||
524 | |||
525 | pool = __zswap_pool_current(); | ||
526 | if (!pool || !zswap_pool_get(pool)) | ||
527 | pool = NULL; | ||
528 | |||
529 | rcu_read_unlock(); | ||
530 | |||
531 | return pool; | ||
532 | } | ||
533 | |||
534 | static struct zswap_pool *zswap_pool_last_get(void) | ||
535 | { | ||
536 | struct zswap_pool *pool, *last = NULL; | ||
537 | |||
538 | rcu_read_lock(); | ||
539 | |||
540 | list_for_each_entry_rcu(pool, &zswap_pools, list) | ||
541 | last = pool; | ||
542 | if (!WARN_ON(!last) && !zswap_pool_get(last)) | ||
543 | last = NULL; | ||
544 | |||
545 | rcu_read_unlock(); | ||
546 | |||
547 | return last; | ||
548 | } | ||
549 | |||
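zswap_pool_last_get() hands back a reference that must be balanced with zswap_pool_put(); the shrink helper further down in this patch is its only caller and shows the intended pattern. Roughly (a sketch mirroring zswap_shrink() below):

    /* get/use/put pattern for the oldest pool */
    struct zswap_pool *pool = zswap_pool_last_get();   /* ref, or NULL */

    if (pool) {
            zpool_shrink(pool->zpool, 1, NULL);   /* evict one page */
            zswap_pool_put(pool);                 /* drop the ref taken above */
    }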
550 | static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) | ||
551 | { | ||
552 | struct zswap_pool *pool; | ||
553 | |||
554 | assert_spin_locked(&zswap_pools_lock); | ||
555 | |||
556 | list_for_each_entry_rcu(pool, &zswap_pools, list) { | ||
557 | if (strncmp(pool->tfm_name, compressor, sizeof(pool->tfm_name))) | ||
558 | continue; | ||
559 | if (strncmp(zpool_get_type(pool->zpool), type, | ||
560 | sizeof(zswap_zpool_type))) | ||
561 | continue; | ||
562 | /* if we can't get it, it's about to be destroyed */ | ||
563 | if (!zswap_pool_get(pool)) | ||
564 | continue; | ||
565 | return pool; | ||
566 | } | ||
567 | |||
568 | return NULL; | ||
569 | } | ||
570 | |||
571 | static struct zswap_pool *zswap_pool_create(char *type, char *compressor) | ||
572 | { | ||
573 | struct zswap_pool *pool; | ||
574 | gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; | ||
575 | |||
576 | pool = kzalloc(sizeof(*pool), GFP_KERNEL); | ||
577 | if (!pool) { | ||
578 | pr_err("pool alloc failed\n"); | ||
579 | return NULL; | ||
580 | } | ||
581 | |||
582 | pool->zpool = zpool_create_pool(type, "zswap", gfp, &zswap_zpool_ops); | ||
583 | if (!pool->zpool) { | ||
584 | pr_err("%s zpool not available\n", type); | ||
585 | goto error; | ||
586 | } | ||
587 | pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); | ||
588 | |||
589 | strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); | ||
590 | pool->tfm = alloc_percpu(struct crypto_comp *); | ||
591 | if (!pool->tfm) { | ||
592 | pr_err("percpu alloc failed\n"); | ||
593 | goto error; | ||
594 | } | ||
595 | |||
596 | if (zswap_cpu_comp_init(pool)) | ||
597 | goto error; | ||
598 | pr_debug("using %s compressor\n", pool->tfm_name); | ||
599 | |||
600 | /* being the current pool takes 1 ref; this func expects the | ||
601 | * caller to always add the new pool as the current pool | ||
602 | */ | ||
603 | kref_init(&pool->kref); | ||
604 | INIT_LIST_HEAD(&pool->list); | ||
605 | |||
606 | zswap_pool_debug("created", pool); | ||
607 | |||
608 | return pool; | ||
609 | |||
610 | error: | ||
611 | free_percpu(pool->tfm); | ||
612 | if (pool->zpool) | ||
613 | zpool_destroy_pool(pool->zpool); | ||
614 | kfree(pool); | ||
615 | return NULL; | ||
616 | } | ||
617 | |||
618 | static struct zswap_pool *__zswap_pool_create_fallback(void) | ||
619 | { | ||
620 | if (!crypto_has_comp(zswap_compressor, 0, 0)) { | ||
621 | pr_err("compressor %s not available, using default %s\n", | ||
622 | zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT); | ||
623 | strncpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT, | ||
624 | sizeof(zswap_compressor)); | ||
625 | } | ||
626 | if (!zpool_has_pool(zswap_zpool_type)) { | ||
627 | pr_err("zpool %s not available, using default %s\n", | ||
628 | zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT); | ||
629 | strncpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT, | ||
630 | sizeof(zswap_zpool_type)); | ||
631 | } | ||
632 | |||
633 | return zswap_pool_create(zswap_zpool_type, zswap_compressor); | ||
634 | } | ||
635 | |||
636 | static void zswap_pool_destroy(struct zswap_pool *pool) | ||
637 | { | ||
638 | zswap_pool_debug("destroying", pool); | ||
639 | |||
640 | zswap_cpu_comp_destroy(pool); | ||
641 | free_percpu(pool->tfm); | ||
642 | zpool_destroy_pool(pool->zpool); | ||
643 | kfree(pool); | ||
644 | } | ||
645 | |||
646 | static int __must_check zswap_pool_get(struct zswap_pool *pool) | ||
647 | { | ||
648 | return kref_get_unless_zero(&pool->kref); | ||
649 | } | ||
650 | |||
651 | static void __zswap_pool_release(struct rcu_head *head) | ||
652 | { | ||
653 | struct zswap_pool *pool = container_of(head, typeof(*pool), rcu_head); | ||
654 | |||
655 | /* nobody should have been able to get a kref... */ | ||
656 | WARN_ON(kref_get_unless_zero(&pool->kref)); | ||
657 | |||
658 | /* pool is now off zswap_pools list and has no references. */ | ||
659 | zswap_pool_destroy(pool); | ||
660 | } | ||
661 | |||
662 | static void __zswap_pool_empty(struct kref *kref) | ||
663 | { | ||
664 | struct zswap_pool *pool; | ||
665 | |||
666 | pool = container_of(kref, typeof(*pool), kref); | ||
667 | |||
668 | spin_lock(&zswap_pools_lock); | ||
669 | |||
670 | WARN_ON(pool == zswap_pool_current()); | ||
671 | |||
672 | list_del_rcu(&pool->list); | ||
673 | call_rcu(&pool->rcu_head, __zswap_pool_release); | ||
674 | |||
675 | spin_unlock(&zswap_pools_lock); | ||
676 | } | ||
677 | |||
678 | static void zswap_pool_put(struct zswap_pool *pool) | ||
679 | { | ||
680 | kref_put(&pool->kref, __zswap_pool_empty); | ||
681 | } | ||
682 | |||
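Pool teardown is driven entirely by the kref: the final put runs __zswap_pool_empty(), which unlinks the pool under zswap_pools_lock and defers the actual free past an RCU grace period. In outline, using only the functions defined above:

    /*
     * zswap_pool_put(pool)
     *   -> kref_put() hits zero -> __zswap_pool_empty()
     *        list_del_rcu(&pool->list)           (under zswap_pools_lock)
     *        call_rcu(..., __zswap_pool_release)
     *   -> after a grace period: __zswap_pool_release()
     *        WARN_ON(kref_get_unless_zero(...))  (no late references expected)
     *        zswap_pool_destroy(pool)            (cpu notifier, percpu tfm, zpool, kfree)
     */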
683 | /********************************* | ||
684 | * param callbacks | ||
685 | **********************************/ | ||
686 | |||
687 | static int __zswap_param_set(const char *val, const struct kernel_param *kp, | ||
688 | char *type, char *compressor) | ||
689 | { | ||
690 | struct zswap_pool *pool, *put_pool = NULL; | ||
691 | char str[kp->str->maxlen], *s; | ||
692 | int ret; | ||
693 | |||
694 | /* | ||
695 | * kp is either zswap_zpool_kparam or zswap_compressor_kparam, defined | ||
696 | * at the top of this file, so maxlen is CRYPTO_MAX_ALG_NAME (64) or | ||
697 | * 32 (arbitrary). | ||
698 | */ | ||
699 | strlcpy(str, val, kp->str->maxlen); | ||
700 | s = strim(str); | ||
701 | |||
702 | /* if this is load-time (pre-init) param setting, | ||
703 | * don't create a pool; that's done during init. | ||
704 | */ | ||
705 | if (!zswap_init_started) | ||
706 | return param_set_copystring(s, kp); | ||
707 | |||
708 | /* no change required */ | ||
709 | if (!strncmp(kp->str->string, s, kp->str->maxlen)) | ||
710 | return 0; | ||
711 | |||
712 | if (!type) { | ||
713 | type = s; | ||
714 | if (!zpool_has_pool(type)) { | ||
715 | pr_err("zpool %s not available\n", type); | ||
716 | return -ENOENT; | ||
717 | } | ||
718 | } else if (!compressor) { | ||
719 | compressor = s; | ||
720 | if (!crypto_has_comp(compressor, 0, 0)) { | ||
721 | pr_err("compressor %s not available\n", compressor); | ||
722 | return -ENOENT; | ||
723 | } | ||
724 | } | ||
725 | |||
726 | spin_lock(&zswap_pools_lock); | ||
727 | |||
728 | pool = zswap_pool_find_get(type, compressor); | ||
729 | if (pool) { | ||
730 | zswap_pool_debug("using existing", pool); | ||
731 | list_del_rcu(&pool->list); | ||
732 | } else { | ||
733 | spin_unlock(&zswap_pools_lock); | ||
734 | pool = zswap_pool_create(type, compressor); | ||
735 | spin_lock(&zswap_pools_lock); | ||
736 | } | ||
737 | |||
738 | if (pool) | ||
739 | ret = param_set_copystring(s, kp); | ||
740 | else | ||
741 | ret = -EINVAL; | ||
742 | |||
743 | if (!ret) { | ||
744 | put_pool = zswap_pool_current(); | ||
745 | list_add_rcu(&pool->list, &zswap_pools); | ||
746 | } else if (pool) { | ||
747 | /* add the possibly pre-existing pool to the end of the pools | ||
748 | * list; if it's new (and empty) then it'll be removed and | ||
749 | * destroyed by the put after we drop the lock | ||
750 | */ | ||
751 | list_add_tail_rcu(&pool->list, &zswap_pools); | ||
752 | put_pool = pool; | ||
753 | } | ||
754 | |||
755 | spin_unlock(&zswap_pools_lock); | ||
756 | |||
757 | /* drop the ref from either the old current pool, | ||
758 | * or the new pool we failed to add | ||
759 | */ | ||
760 | if (put_pool) | ||
761 | zswap_pool_put(put_pool); | ||
762 | |||
763 | return ret; | ||
764 | } | ||
765 | |||
766 | static int zswap_compressor_param_set(const char *val, | ||
767 | const struct kernel_param *kp) | ||
768 | { | ||
769 | return __zswap_param_set(val, kp, zswap_zpool_type, NULL); | ||
770 | } | ||
771 | |||
772 | static int zswap_zpool_param_set(const char *val, | ||
773 | const struct kernel_param *kp) | ||
774 | { | ||
775 | return __zswap_param_set(val, kp, NULL, zswap_compressor); | ||
418 | } | 776 | } |
419 | 777 | ||
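Both callbacks funnel into __zswap_param_set() and differ only in which half of the (type, compressor) pair is being replaced. For a runtime change of the compressor parameter (the zpool case is symmetric), the sequence sketched below follows the function above; the parameter definitions themselves live earlier in the file and are not part of this hunk:

    /*
     * zswap_compressor_param_set(), after init:
     *   crypto_has_comp(new_name)                 reject unknown algorithms
     *   zswap_pool_find_get(type, new_name)       reuse a matching pool if one survives
     *     (else) zswap_pool_create(...)           otherwise build a fresh one
     *   list_add_rcu(&pool->list, &zswap_pools)   new pool becomes the current (head) pool
     *   zswap_pool_put(old_current)               old pool is destroyed once its entries drain
     */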
420 | /********************************* | 778 | /********************************* |
@@ -477,6 +835,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) | |||
477 | pgoff_t offset; | 835 | pgoff_t offset; |
478 | struct zswap_entry *entry; | 836 | struct zswap_entry *entry; |
479 | struct page *page; | 837 | struct page *page; |
838 | struct crypto_comp *tfm; | ||
480 | u8 *src, *dst; | 839 | u8 *src, *dst; |
481 | unsigned int dlen; | 840 | unsigned int dlen; |
482 | int ret; | 841 | int ret; |
@@ -517,13 +876,15 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) | |||
517 | case ZSWAP_SWAPCACHE_NEW: /* page is locked */ | 876 | case ZSWAP_SWAPCACHE_NEW: /* page is locked */ |
518 | /* decompress */ | 877 | /* decompress */ |
519 | dlen = PAGE_SIZE; | 878 | dlen = PAGE_SIZE; |
520 | src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, | 879 | src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, |
521 | ZPOOL_MM_RO) + sizeof(struct zswap_header); | 880 | ZPOOL_MM_RO) + sizeof(struct zswap_header); |
522 | dst = kmap_atomic(page); | 881 | dst = kmap_atomic(page); |
523 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, | 882 | tfm = *get_cpu_ptr(entry->pool->tfm); |
524 | entry->length, dst, &dlen); | 883 | ret = crypto_comp_decompress(tfm, src, entry->length, |
884 | dst, &dlen); | ||
885 | put_cpu_ptr(entry->pool->tfm); | ||
525 | kunmap_atomic(dst); | 886 | kunmap_atomic(dst); |
526 | zpool_unmap_handle(zswap_pool, entry->handle); | 887 | zpool_unmap_handle(entry->pool->zpool, entry->handle); |
527 | BUG_ON(ret); | 888 | BUG_ON(ret); |
528 | BUG_ON(dlen != PAGE_SIZE); | 889 | BUG_ON(dlen != PAGE_SIZE); |
529 | 890 | ||
@@ -572,6 +933,22 @@ end: | |||
572 | return ret; | 933 | return ret; |
573 | } | 934 | } |
574 | 935 | ||
936 | static int zswap_shrink(void) | ||
937 | { | ||
938 | struct zswap_pool *pool; | ||
939 | int ret; | ||
940 | |||
941 | pool = zswap_pool_last_get(); | ||
942 | if (!pool) | ||
943 | return -ENOENT; | ||
944 | |||
945 | ret = zpool_shrink(pool->zpool, 1, NULL); | ||
946 | |||
947 | zswap_pool_put(pool); | ||
948 | |||
949 | return ret; | ||
950 | } | ||
951 | |||
575 | /********************************* | 952 | /********************************* |
576 | * frontswap hooks | 953 | * frontswap hooks |
577 | **********************************/ | 954 | **********************************/ |
@@ -581,6 +958,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
581 | { | 958 | { |
582 | struct zswap_tree *tree = zswap_trees[type]; | 959 | struct zswap_tree *tree = zswap_trees[type]; |
583 | struct zswap_entry *entry, *dupentry; | 960 | struct zswap_entry *entry, *dupentry; |
961 | struct crypto_comp *tfm; | ||
584 | int ret; | 962 | int ret; |
585 | unsigned int dlen = PAGE_SIZE, len; | 963 | unsigned int dlen = PAGE_SIZE, len; |
586 | unsigned long handle; | 964 | unsigned long handle; |
@@ -596,7 +974,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
596 | /* reclaim space if needed */ | 974 | /* reclaim space if needed */ |
597 | if (zswap_is_full()) { | 975 | if (zswap_is_full()) { |
598 | zswap_pool_limit_hit++; | 976 | zswap_pool_limit_hit++; |
599 | if (zpool_shrink(zswap_pool, 1, NULL)) { | 977 | if (zswap_shrink()) { |
600 | zswap_reject_reclaim_fail++; | 978 | zswap_reject_reclaim_fail++; |
601 | ret = -ENOMEM; | 979 | ret = -ENOMEM; |
602 | goto reject; | 980 | goto reject; |
@@ -611,33 +989,42 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
611 | goto reject; | 989 | goto reject; |
612 | } | 990 | } |
613 | 991 | ||
992 | /* if entry is successfully added, it keeps the reference */ | ||
993 | entry->pool = zswap_pool_current_get(); | ||
994 | if (!entry->pool) { | ||
995 | ret = -EINVAL; | ||
996 | goto freepage; | ||
997 | } | ||
998 | |||
614 | /* compress */ | 999 | /* compress */ |
615 | dst = get_cpu_var(zswap_dstmem); | 1000 | dst = get_cpu_var(zswap_dstmem); |
1001 | tfm = *get_cpu_ptr(entry->pool->tfm); | ||
616 | src = kmap_atomic(page); | 1002 | src = kmap_atomic(page); |
617 | ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen); | 1003 | ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen); |
618 | kunmap_atomic(src); | 1004 | kunmap_atomic(src); |
1005 | put_cpu_ptr(entry->pool->tfm); | ||
619 | if (ret) { | 1006 | if (ret) { |
620 | ret = -EINVAL; | 1007 | ret = -EINVAL; |
621 | goto freepage; | 1008 | goto put_dstmem; |
622 | } | 1009 | } |
623 | 1010 | ||
624 | /* store */ | 1011 | /* store */ |
625 | len = dlen + sizeof(struct zswap_header); | 1012 | len = dlen + sizeof(struct zswap_header); |
626 | ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, | 1013 | ret = zpool_malloc(entry->pool->zpool, len, |
627 | &handle); | 1014 | __GFP_NORETRY | __GFP_NOWARN, &handle); |
628 | if (ret == -ENOSPC) { | 1015 | if (ret == -ENOSPC) { |
629 | zswap_reject_compress_poor++; | 1016 | zswap_reject_compress_poor++; |
630 | goto freepage; | 1017 | goto put_dstmem; |
631 | } | 1018 | } |
632 | if (ret) { | 1019 | if (ret) { |
633 | zswap_reject_alloc_fail++; | 1020 | zswap_reject_alloc_fail++; |
634 | goto freepage; | 1021 | goto put_dstmem; |
635 | } | 1022 | } |
636 | zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW); | 1023 | zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW); |
637 | zhdr->swpentry = swp_entry(type, offset); | 1024 | zhdr->swpentry = swp_entry(type, offset); |
638 | buf = (u8 *)(zhdr + 1); | 1025 | buf = (u8 *)(zhdr + 1); |
639 | memcpy(buf, dst, dlen); | 1026 | memcpy(buf, dst, dlen); |
640 | zpool_unmap_handle(zswap_pool, handle); | 1027 | zpool_unmap_handle(entry->pool->zpool, handle); |
641 | put_cpu_var(zswap_dstmem); | 1028 | put_cpu_var(zswap_dstmem); |
642 | 1029 | ||
643 | /* populate entry */ | 1030 | /* populate entry */ |
@@ -660,12 +1047,14 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
660 | 1047 | ||
661 | /* update stats */ | 1048 | /* update stats */ |
662 | atomic_inc(&zswap_stored_pages); | 1049 | atomic_inc(&zswap_stored_pages); |
663 | zswap_pool_total_size = zpool_get_total_size(zswap_pool); | 1050 | zswap_update_total_size(); |
664 | 1051 | ||
665 | return 0; | 1052 | return 0; |
666 | 1053 | ||
667 | freepage: | 1054 | put_dstmem: |
668 | put_cpu_var(zswap_dstmem); | 1055 | put_cpu_var(zswap_dstmem); |
1056 | zswap_pool_put(entry->pool); | ||
1057 | freepage: | ||
669 | zswap_entry_cache_free(entry); | 1058 | zswap_entry_cache_free(entry); |
670 | reject: | 1059 | reject: |
671 | return ret; | 1060 | return ret; |
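Worth noting about the hunk above: because the store path now takes a pool reference via zswap_pool_current_get(), the error unwind gains a new label and a specific order, releasing the per-CPU dst buffer and the pool reference before freeing the entry. In outline (labels as in the new code above):

    /*
     * error unwind in zswap_frontswap_store():
     *   put_dstmem:  put_cpu_var(zswap_dstmem);      release per-CPU buffer
     *                zswap_pool_put(entry->pool);    drop ref from zswap_pool_current_get()
     *   freepage:    zswap_entry_cache_free(entry);  free the half-built entry
     *   reject:      return ret;
     */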
@@ -680,6 +1069,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, | |||
680 | { | 1069 | { |
681 | struct zswap_tree *tree = zswap_trees[type]; | 1070 | struct zswap_tree *tree = zswap_trees[type]; |
682 | struct zswap_entry *entry; | 1071 | struct zswap_entry *entry; |
1072 | struct crypto_comp *tfm; | ||
683 | u8 *src, *dst; | 1073 | u8 *src, *dst; |
684 | unsigned int dlen; | 1074 | unsigned int dlen; |
685 | int ret; | 1075 | int ret; |
@@ -696,13 +1086,14 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, | |||
696 | 1086 | ||
697 | /* decompress */ | 1087 | /* decompress */ |
698 | dlen = PAGE_SIZE; | 1088 | dlen = PAGE_SIZE; |
699 | src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, | 1089 | src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, |
700 | ZPOOL_MM_RO) + sizeof(struct zswap_header); | 1090 | ZPOOL_MM_RO) + sizeof(struct zswap_header); |
701 | dst = kmap_atomic(page); | 1091 | dst = kmap_atomic(page); |
702 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, | 1092 | tfm = *get_cpu_ptr(entry->pool->tfm); |
703 | dst, &dlen); | 1093 | ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); |
1094 | put_cpu_ptr(entry->pool->tfm); | ||
704 | kunmap_atomic(dst); | 1095 | kunmap_atomic(dst); |
705 | zpool_unmap_handle(zswap_pool, entry->handle); | 1096 | zpool_unmap_handle(entry->pool->zpool, entry->handle); |
706 | BUG_ON(ret); | 1097 | BUG_ON(ret); |
707 | 1098 | ||
708 | spin_lock(&tree->lock); | 1099 | spin_lock(&tree->lock); |
@@ -755,10 +1146,6 @@ static void zswap_frontswap_invalidate_area(unsigned type) | |||
755 | zswap_trees[type] = NULL; | 1146 | zswap_trees[type] = NULL; |
756 | } | 1147 | } |
757 | 1148 | ||
758 | static const struct zpool_ops zswap_zpool_ops = { | ||
759 | .evict = zswap_writeback_entry | ||
760 | }; | ||
761 | |||
762 | static void zswap_frontswap_init(unsigned type) | 1149 | static void zswap_frontswap_init(unsigned type) |
763 | { | 1150 | { |
764 | struct zswap_tree *tree; | 1151 | struct zswap_tree *tree; |
@@ -839,49 +1226,40 @@ static void __exit zswap_debugfs_exit(void) { } | |||
839 | **********************************/ | 1226 | **********************************/ |
840 | static int __init init_zswap(void) | 1227 | static int __init init_zswap(void) |
841 | { | 1228 | { |
842 | gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; | 1229 | struct zswap_pool *pool; |
843 | 1230 | ||
844 | pr_info("loading zswap\n"); | 1231 | zswap_init_started = true; |
845 | |||
846 | zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, | ||
847 | &zswap_zpool_ops); | ||
848 | if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { | ||
849 | pr_info("%s zpool not available\n", zswap_zpool_type); | ||
850 | zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; | ||
851 | zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, | ||
852 | &zswap_zpool_ops); | ||
853 | } | ||
854 | if (!zswap_pool) { | ||
855 | pr_err("%s zpool not available\n", zswap_zpool_type); | ||
856 | pr_err("zpool creation failed\n"); | ||
857 | goto error; | ||
858 | } | ||
859 | pr_info("using %s pool\n", zswap_zpool_type); | ||
860 | 1232 | ||
861 | if (zswap_entry_cache_create()) { | 1233 | if (zswap_entry_cache_create()) { |
862 | pr_err("entry cache creation failed\n"); | 1234 | pr_err("entry cache creation failed\n"); |
863 | goto cachefail; | 1235 | goto cache_fail; |
864 | } | 1236 | } |
865 | if (zswap_comp_init()) { | 1237 | |
866 | pr_err("compressor initialization failed\n"); | 1238 | if (zswap_cpu_dstmem_init()) { |
867 | goto compfail; | 1239 | pr_err("dstmem alloc failed\n"); |
1240 | goto dstmem_fail; | ||
868 | } | 1241 | } |
869 | if (zswap_cpu_init()) { | 1242 | |
870 | pr_err("per-cpu initialization failed\n"); | 1243 | pool = __zswap_pool_create_fallback(); |
871 | goto pcpufail; | 1244 | if (!pool) { |
1245 | pr_err("pool creation failed\n"); | ||
1246 | goto pool_fail; | ||
872 | } | 1247 | } |
1248 | pr_info("loaded using pool %s/%s\n", pool->tfm_name, | ||
1249 | zpool_get_type(pool->zpool)); | ||
1250 | |||
1251 | list_add(&pool->list, &zswap_pools); | ||
873 | 1252 | ||
874 | frontswap_register_ops(&zswap_frontswap_ops); | 1253 | frontswap_register_ops(&zswap_frontswap_ops); |
875 | if (zswap_debugfs_init()) | 1254 | if (zswap_debugfs_init()) |
876 | pr_warn("debugfs initialization failed\n"); | 1255 | pr_warn("debugfs initialization failed\n"); |
877 | return 0; | 1256 | return 0; |
878 | pcpufail: | 1257 | |
879 | zswap_comp_exit(); | 1258 | pool_fail: |
880 | compfail: | 1259 | zswap_cpu_dstmem_destroy(); |
1260 | dstmem_fail: | ||
881 | zswap_entry_cache_destroy(); | 1261 | zswap_entry_cache_destroy(); |
882 | cachefail: | 1262 | cache_fail: |
883 | zpool_destroy_pool(zswap_pool); | ||
884 | error: | ||
885 | return -ENOMEM; | 1263 | return -ENOMEM; |
886 | } | 1264 | } |
887 | /* must be late so crypto has time to come up */ | 1265 | /* must be late so crypto has time to come up */ |