author		Linus Torvalds <torvalds@linux-foundation.org>	2014-12-13 16:00:36 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-12-13 16:00:36 -0500
commit		78a45c6f067824cf5d0a9fedea7339ac2e28603c (patch)
tree		b4f78c8b6b9059ddace0a18c11629b8d2045f793 /mm
parent		f96fe225677b3efb74346ebd56fafe3997b02afa (diff)
parent		29d293b6007b91a4463f05bc8d0b26e0e65c5816 (diff)
Merge branch 'akpm' (second patch-bomb from Andrew)
Merge second patchbomb from Andrew Morton:
 - the rest of MM
 - misc fs fixes
 - add execveat() syscall
 - new ratelimit feature for fault-injection
 - decompressor updates
 - ipc/ updates
 - fallocate feature creep
 - fsnotify cleanups
 - a few other misc things

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (99 commits)
  cgroups: Documentation: fix trivial typos and wrong paragraph numberings
  parisc: percpu: update comments referring to __get_cpu_var
  percpu: update local_ops.txt to reflect this_cpu operations
  percpu: remove __get_cpu_var and __raw_get_cpu_var macros
  fsnotify: remove destroy_list from fsnotify_mark
  fsnotify: unify inode and mount marks handling
  fallocate: create FAN_MODIFY and IN_MODIFY events
  mm/cma: make kmemleak ignore CMA regions
  slub: fix cpuset check in get_any_partial
  slab: fix cpuset check in fallback_alloc
  shmdt: use i_size_read() instead of ->i_size
  ipc/shm.c: fix overly aggressive shmdt() when calls span multiple segments
  ipc/msg: increase MSGMNI, remove scaling
  ipc/sem.c: increase SEMMSL, SEMMNI, SEMOPM
  ipc/sem.c: change memory barrier in sem_lock() to smp_rmb()
  lib/decompress.c: consistency of compress formats for kernel image
  decompress_bunzip2: off by one in get_next_block()
  usr/Kconfig: make initrd compression algorithm selection not expert
  fault-inject: add ratelimit option
  ratelimit: add initialization macro
  ...
Diffstat (limited to 'mm')
-rw-r--r--	mm/Kconfig.debug	10
-rw-r--r--	mm/Makefile	2
-rw-r--r--	mm/cma.c	25
-rw-r--r--	mm/debug-pagealloc.c	45
-rw-r--r--	mm/fadvise.c	6
-rw-r--r--	mm/filemap.c	10
-rw-r--r--	mm/filemap_xip.c	23
-rw-r--r--	mm/fremap.c	4
-rw-r--r--	mm/hugetlb.c	26
-rw-r--r--	mm/memblock.c	43
-rw-r--r--	mm/memcontrol.c	180
-rw-r--r--	mm/memory-failure.c	15
-rw-r--r--	mm/memory.c	9
-rw-r--r--	mm/migrate.c	28
-rw-r--r--	mm/mincore.c	7
-rw-r--r--	mm/mmap.c	24
-rw-r--r--	mm/mremap.c	6
-rw-r--r--	mm/nommu.c	50
-rw-r--r--	mm/oom_kill.c	15
-rw-r--r--	mm/page_alloc.c	137
-rw-r--r--	mm/page_ext.c	403
-rw-r--r--	mm/page_owner.c	311
-rw-r--r--	mm/rmap.c	18
-rw-r--r--	mm/slab.c	4
-rw-r--r--	mm/slub.c	17
-rw-r--r--	mm/vmacache.c	2
-rw-r--r--	mm/vmalloc.c	4
-rw-r--r--	mm/vmscan.c	216
-rw-r--r--	mm/vmstat.c	102
-rw-r--r--	mm/zbud.c	2
-rw-r--r--	mm/zsmalloc.c	180
-rw-r--r--	mm/zswap.c	9
32 files changed, 1409 insertions, 524 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 4b2443254de2..56badfc4810a 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,8 +1,18 @@
+config PAGE_EXTENSION
+	bool "Extend memmap on extra space for more information on page"
+	---help---
+	  Extend memmap on extra space for more information on page. This
+	  could be used for debugging features that need to insert extra
+	  field for every page. This extension enables us to save memory
+	  by not allocating this extra memory according to boottime
+	  configuration.
+
 config DEBUG_PAGEALLOC
 	bool "Debug page memory allocations"
 	depends on DEBUG_KERNEL
 	depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
 	depends on !KMEMCHECK
+	select PAGE_EXTENSION
 	select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	---help---
diff --git a/mm/Makefile b/mm/Makefile
index b3c6ce932c64..4bf586e66378 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
+obj-$(CONFIG_PAGE_OWNER) += page_owner.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
 obj-$(CONFIG_ZPOOL) += zpool.o
@@ -71,3 +72,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
 obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
 obj-$(CONFIG_CMA)	+= cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
+obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
diff --git a/mm/cma.c b/mm/cma.c
index 8e9ec13d31db..f8917629cbdd 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -33,6 +33,7 @@
 #include <linux/log2.h>
 #include <linux/cma.h>
 #include <linux/highmem.h>
+#include <linux/io.h>
 
 struct cma {
 	unsigned long	base_pfn;
@@ -63,6 +64,17 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
 	return (1UL << (align_order - cma->order_per_bit)) - 1;
 }
 
+static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
+{
+	unsigned int alignment;
+
+	if (align_order <= cma->order_per_bit)
+		return 0;
+	alignment = 1UL << (align_order - cma->order_per_bit);
+	return ALIGN(cma->base_pfn, alignment) -
+		(cma->base_pfn >> cma->order_per_bit);
+}
+
 static unsigned long cma_bitmap_maxno(struct cma *cma)
 {
 	return cma->count >> cma->order_per_bit;
@@ -313,6 +325,11 @@ int __init cma_declare_contiguous(phys_addr_t base,
 			}
 		}
 
+		/*
+		 * kmemleak scans/reads tracked objects for pointers to other
+		 * objects but this address isn't mapped and accessible
+		 */
+		kmemleak_ignore(phys_to_virt(addr));
 		base = addr;
 	}
 
@@ -340,7 +357,7 @@ err:
  */
 struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
 {
-	unsigned long mask, pfn, start = 0;
+	unsigned long mask, offset, pfn, start = 0;
 	unsigned long bitmap_maxno, bitmap_no, bitmap_count;
 	struct page *page = NULL;
 	int ret;
@@ -355,13 +372,15 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
 		return NULL;
 
 	mask = cma_bitmap_aligned_mask(cma, align);
+	offset = cma_bitmap_aligned_offset(cma, align);
 	bitmap_maxno = cma_bitmap_maxno(cma);
 	bitmap_count = cma_bitmap_pages_to_bits(cma, count);
 
 	for (;;) {
 		mutex_lock(&cma->lock);
-		bitmap_no = bitmap_find_next_zero_area(cma->bitmap,
-				bitmap_maxno, start, bitmap_count, mask);
+		bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
+				bitmap_maxno, start, bitmap_count, mask,
+				offset);
 		if (bitmap_no >= bitmap_maxno) {
 			mutex_unlock(&cma->lock);
 			break;
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index 789ff70c8a4a..5bf5906ce13b 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -2,23 +2,55 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
-#include <linux/page-debug-flags.h>
+#include <linux/page_ext.h>
 #include <linux/poison.h>
 #include <linux/ratelimit.h>
 
+static bool page_poisoning_enabled __read_mostly;
+
+static bool need_page_poisoning(void)
+{
+	if (!debug_pagealloc_enabled())
+		return false;
+
+	return true;
+}
+
+static void init_page_poisoning(void)
+{
+	if (!debug_pagealloc_enabled())
+		return;
+
+	page_poisoning_enabled = true;
+}
+
+struct page_ext_operations page_poisoning_ops = {
+	.need = need_page_poisoning,
+	.init = init_page_poisoning,
+};
+
 static inline void set_page_poison(struct page *page)
 {
-	__set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+	struct page_ext *page_ext;
+
+	page_ext = lookup_page_ext(page);
+	__set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
 }
 
 static inline void clear_page_poison(struct page *page)
 {
-	__clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+	struct page_ext *page_ext;
+
+	page_ext = lookup_page_ext(page);
+	__clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
 }
 
 static inline bool page_poison(struct page *page)
 {
-	return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+	struct page_ext *page_ext;
+
+	page_ext = lookup_page_ext(page);
+	return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
 }
 
 static void poison_page(struct page *page)
@@ -93,8 +125,11 @@ static void unpoison_pages(struct page *page, int n)
 		unpoison_page(page + i);
 }
 
-void kernel_map_pages(struct page *page, int numpages, int enable)
+void __kernel_map_pages(struct page *page, int numpages, int enable)
 {
+	if (!page_poisoning_enabled)
+		return;
+
 	if (enable)
 		unpoison_pages(page, numpages);
 	else
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 3bcfd81db45e..2ad7adf4f0a4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -117,7 +117,11 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 		__filemap_fdatawrite_range(mapping, offset, endbyte,
 					   WB_SYNC_NONE);
 
-		/* First and last FULL page! */
+		/*
+		 * First and last FULL page! Partial pages are deliberately
+		 * preserved on the expectation that it is better to preserve
+		 * needed memory than to discard unneeded memory.
+		 */
 		start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
 		end_index = (endbyte >> PAGE_CACHE_SHIFT);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 14b4642279f1..e8905bc3cbd7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -62,16 +62,16 @@
 /*
  * Lock ordering:
  *
- *  ->i_mmap_mutex		(truncate_pagecache)
+ *  ->i_mmap_rwsem		(truncate_pagecache)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
  *
  *  ->i_mutex
- *    ->i_mmap_mutex		(truncate->unmap_mapping_range)
+ *    ->i_mmap_rwsem		(truncate->unmap_mapping_range)
  *
  *  ->mmap_sem
- *    ->i_mmap_mutex
+ *    ->i_mmap_rwsem
  *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
  *
@@ -85,7 +85,7 @@
  *    sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
  *
- *  ->i_mmap_mutex
+ *  ->i_mmap_rwsem
  *    ->anon_vma.lock		(vma_adjust)
  *
  *  ->anon_vma.lock
@@ -105,7 +105,7 @@
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
  *
- * ->i_mmap_mutex
+ * ->i_mmap_rwsem
  *   ->tasklist_lock            (memory_failure, collect_procs_ao)
  */
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index d8d9fe3f685c..0d105aeff82f 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -155,22 +155,14 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 EXPORT_SYMBOL_GPL(xip_file_read);
 
 /*
- * __xip_unmap is invoked from xip_unmap and
- * xip_write
+ * __xip_unmap is invoked from xip_unmap and xip_write
  *
  * This function walks all vmas of the address_space and unmaps the
  * __xip_sparse_page when found at pgoff.
  */
-static void
-__xip_unmap (struct address_space * mapping,
-		     unsigned long pgoff)
+static void __xip_unmap(struct address_space * mapping, unsigned long pgoff)
 {
 	struct vm_area_struct *vma;
-	struct mm_struct *mm;
-	unsigned long address;
-	pte_t *pte;
-	pte_t pteval;
-	spinlock_t *ptl;
 	struct page *page;
 	unsigned count;
 	int locked = 0;
@@ -182,11 +174,14 @@ __xip_unmap (struct address_space * mapping,
 		return;
 
 retry:
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_read(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
-		mm = vma->vm_mm;
-		address = vma->vm_start +
+		pte_t *pte, pteval;
+		spinlock_t *ptl;
+		struct mm_struct *mm = vma->vm_mm;
+		unsigned long address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 		pte = page_check_address(page, mm, address, &ptl, 1);
 		if (pte) {
@@ -202,7 +197,7 @@ retry:
 			page_cache_release(page);
 		}
 	}
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_read(mapping);
 
 	if (locked) {
 		mutex_unlock(&xip_sparse_mutex);
diff --git a/mm/fremap.c b/mm/fremap.c
index 72b8fa361433..11ef7ec40d13 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -238,13 +238,13 @@ get_write_lock:
 			}
 			goto out_freed;
 		}
-		mutex_lock(&mapping->i_mmap_mutex);
+		i_mmap_lock_write(mapping);
 		flush_dcache_mmap_lock(mapping);
 		vma->vm_flags |= VM_NONLINEAR;
 		vma_interval_tree_remove(vma, &mapping->i_mmap);
 		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 		flush_dcache_mmap_unlock(mapping);
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 	}
 
 	if (vma->vm_flags & VM_LOCKED) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 919b86a2164d..47f6070d7c46 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1457,7 +1457,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 		return 0;
 
 found:
-	BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
+	BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
 	/* Put them into a private list first because mem_map is not up yet */
 	list_add(&m->list, &huge_boot_pages);
 	m->hstate = h;
@@ -2083,7 +2083,7 @@ static void hugetlb_register_node(struct node *node)
  * devices of nodes that have memory.  All on-line nodes should have
  * registered their associated device by this time.
  */
-static void hugetlb_register_all_nodes(void)
+static void __init hugetlb_register_all_nodes(void)
 {
 	int nid;
 
@@ -2726,9 +2726,9 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 	 * on its way out.  We're lucky that the flag has such an appropriate
 	 * name, and can in fact be safely cleared here. We could clear it
 	 * before the __unmap_hugepage_range above, but all that's necessary
-	 * is to clear it before releasing the i_mmap_mutex. This works
+	 * is to clear it before releasing the i_mmap_rwsem. This works
 	 * because in the context this is called, the VMA is about to be
-	 * destroyed and the i_mmap_mutex is held.
+	 * destroyed and the i_mmap_rwsem is held.
 	 */
 	vma->vm_flags &= ~VM_MAYSHARE;
 }
@@ -2774,7 +2774,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * this mapping should be shared between all the VMAs,
 	 * __unmap_hugepage_range() is called as the lock is already held
 	 */
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
 		/* Do not unmap the current VMA */
 		if (iter_vma == vma)
@@ -2791,7 +2791,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 			unmap_hugepage_range(iter_vma, address,
 					     address + huge_page_size(h), page);
 	}
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_write(mapping);
 }
 
 /*
@@ -3348,7 +3348,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_cache_range(vma, address, end);
 
 	mmu_notifier_invalidate_range_start(mm, start, end);
-	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (; address < end; address += huge_page_size(h)) {
 		spinlock_t *ptl;
 		ptep = huge_pte_offset(mm, address);
@@ -3370,13 +3370,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 	}
 	/*
-	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
 	 * may have cleared our pud entry and done put_page on the page table:
-	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * once we release i_mmap_rwsem, another task can do the final put_page
 	 * and that page table be reused and filled with junk.
 	 */
 	flush_tlb_range(vma, start, end);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	mmu_notifier_invalidate_range_end(mm, start, end);
 
 	return pages << h->order;
@@ -3525,7 +3525,7 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
  * and returns the corresponding pte. While this is not necessary for the
  * !shared pmd case because we can allocate the pmd later as well, it makes the
  * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
  * racing tasks could either miss the sharing (see huge_pte_offset) or select a
  * bad pmd for sharing.
  */
@@ -3544,7 +3544,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!vma_shareable(vma, addr))
 		return (pte_t *)pmd_alloc(mm, pud, addr);
 
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -3572,7 +3572,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_write(mapping);
 	return pte;
 }
 
diff --git a/mm/memblock.c b/mm/memblock.c
index 6ecb0d937fb5..252b77bdf65e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -715,16 +715,13 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
 }
 
 /**
- * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
- * @base: the base phys addr of the region
- * @size: the size of the region
  *
- * This function isolates region [@base, @base + @size), and mark it with flag
- * MEMBLOCK_HOTPLUG.
+ * This function isolates region [@base, @base + @size), and sets/clears flag
  *
  * Return 0 on succees, -errno on failure.
  */
-int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+static int __init_memblock memblock_setclr_flag(phys_addr_t base,
+				phys_addr_t size, int set, int flag)
 {
 	struct memblock_type *type = &memblock.memory;
 	int i, ret, start_rgn, end_rgn;
@@ -734,37 +731,37 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
 		return ret;
 
 	for (i = start_rgn; i < end_rgn; i++)
-		memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG);
+		if (set)
+			memblock_set_region_flags(&type->regions[i], flag);
+		else
+			memblock_clear_region_flags(&type->regions[i], flag);
 
 	memblock_merge_regions(type);
 	return 0;
 }
 
 /**
- * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
  * @base: the base phys addr of the region
  * @size: the size of the region
  *
- * This function isolates region [@base, @base + @size), and clear flag
- * MEMBLOCK_HOTPLUG for the isolated regions.
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+{
+	return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG);
+}
+
+/**
+ * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
  *
  * Return 0 on succees, -errno on failure.
  */
 int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
 {
-	struct memblock_type *type = &memblock.memory;
-	int i, ret, start_rgn, end_rgn;
-
-	ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
-	if (ret)
-		return ret;
-
-	for (i = start_rgn; i < end_rgn; i++)
-		memblock_clear_region_flags(&type->regions[i],
-					    MEMBLOCK_HOTPLUG);
-
-	memblock_merge_regions(type);
-	return 0;
+	return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG);
 }
 
 /**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 85df503ec023..ef91e856c7e4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -296,7 +296,6 @@ struct mem_cgroup {
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
 	bool use_hierarchy;
-	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
 
 	bool	oom_lock;
 	atomic_t	under_oom;
@@ -366,22 +365,11 @@ struct mem_cgroup {
 	/* WARNING: nodeinfo must be the last member here */
 };
 
-/* internal only representation about the status of kmem accounting. */
-enum {
-	KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
-};
-
 #ifdef CONFIG_MEMCG_KMEM
-static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
-{
-	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
-}
-
 static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 {
-	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
+	return memcg->kmemcg_id >= 0;
 }
-
 #endif
 
 /* Stuffs for move charges at task migration. */
@@ -1571,7 +1559,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * select it.  The goal is to allow it to allocate so that it may
 	 * quickly exit and free its memory.
 	 */
-	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
+	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
 		set_thread_flag(TIF_MEMDIE);
 		return;
 	}
@@ -1628,6 +1616,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 			 NULL, "Memory cgroup out of memory");
 }
 
+#if MAX_NUMNODES > 1
+
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1650,7 +1640,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 	return false;
 
 }
-#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -2646,7 +2635,6 @@ static void memcg_register_cache(struct mem_cgroup *memcg,
 	if (!cachep)
 		return;
 
-	css_get(&memcg->css);
 	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
 
 	/*
@@ -2680,40 +2668,6 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
 	list_del(&cachep->memcg_params->list);
 
 	kmem_cache_destroy(cachep);
-
-	/* drop the reference taken in memcg_register_cache */
-	css_put(&memcg->css);
-}
-
-/*
- * During the creation a new cache, we need to disable our accounting mechanism
- * altogether. This is true even if we are not creating, but rather just
- * enqueing new caches to be created.
- *
- * This is because that process will trigger allocations; some visible, like
- * explicit kmallocs to auxiliary data structures, name strings and internal
- * cache structures; some well concealed, like INIT_WORK() that can allocate
- * objects during debug.
- *
- * If any allocation happens during memcg_kmem_get_cache, we will recurse back
- * to it. This may not be a bounded recursion: since the first cache creation
- * failed to complete (waiting on the allocation), we'll just try to create the
- * cache again, failing at the same point.
- *
- * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
- * memcg_kmem_skip_account. So we enclose anything that might allocate memory
- * inside the following two functions.
- */
-static inline void memcg_stop_kmem_account(void)
-{
-	VM_BUG_ON(!current->mm);
-	current->memcg_kmem_skip_account++;
-}
-
-static inline void memcg_resume_kmem_account(void)
-{
-	VM_BUG_ON(!current->mm);
-	current->memcg_kmem_skip_account--;
 }
 
 int __memcg_cleanup_cache_params(struct kmem_cache *s)
@@ -2747,9 +2701,7 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
 	mutex_lock(&memcg_slab_mutex);
 	list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
 		cachep = memcg_params_to_cache(params);
-		kmem_cache_shrink(cachep);
-		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
-			memcg_unregister_cache(cachep);
+		memcg_unregister_cache(cachep);
 	}
 	mutex_unlock(&memcg_slab_mutex);
 }
@@ -2784,10 +2736,10 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
 	struct memcg_register_cache_work *cw;
 
 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
-	if (cw == NULL) {
-		css_put(&memcg->css);
+	if (!cw)
 		return;
-	}
+
+	css_get(&memcg->css);
 
 	cw->memcg = memcg;
 	cw->cachep = cachep;
@@ -2810,20 +2762,16 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
 	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
 	 * the safest choice is to do it like this, wrapping the whole function.
 	 */
-	memcg_stop_kmem_account();
+	current->memcg_kmem_skip_account = 1;
 	__memcg_schedule_register_cache(memcg, cachep);
-	memcg_resume_kmem_account();
+	current->memcg_kmem_skip_account = 0;
 }
 
 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
 {
 	unsigned int nr_pages = 1 << order;
-	int res;
 
-	res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
-	if (!res)
-		atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
-	return res;
+	return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
 }
 
 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
@@ -2831,7 +2779,6 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
 	unsigned int nr_pages = 1 << order;
 
 	memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
-	atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
 }
 
 /*
@@ -2847,8 +2794,7 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
  * Can't be called in interrupt context or from kernel threads.
  * This function needs to be called with rcu_read_lock() held.
  */
-struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
-					  gfp_t gfp)
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
 {
 	struct mem_cgroup *memcg;
 	struct kmem_cache *memcg_cachep;
@@ -2856,25 +2802,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
 	VM_BUG_ON(!cachep->memcg_params);
 	VM_BUG_ON(!cachep->memcg_params->is_root_cache);
 
-	if (!current->mm || current->memcg_kmem_skip_account)
+	if (current->memcg_kmem_skip_account)
 		return cachep;
 
-	rcu_read_lock();
-	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
-
+	memcg = get_mem_cgroup_from_mm(current->mm);
 	if (!memcg_kmem_is_active(memcg))
 		goto out;
 
 	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
-	if (likely(memcg_cachep)) {
-		cachep = memcg_cachep;
-		goto out;
-	}
-
-	/* The corresponding put will be done in the workqueue. */
-	if (!css_tryget_online(&memcg->css))
-		goto out;
-	rcu_read_unlock();
+	if (likely(memcg_cachep))
+		return memcg_cachep;
 
 	/*
 	 * If we are in a safe context (can wait, and not in interrupt
@@ -2889,12 +2826,17 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
 	 * defer everything.
 	 */
 	memcg_schedule_register_cache(memcg, cachep);
-	return cachep;
 out:
-	rcu_read_unlock();
+	css_put(&memcg->css);
 	return cachep;
 }
 
+void __memcg_kmem_put_cache(struct kmem_cache *cachep)
+{
+	if (!is_root_cache(cachep))
+		css_put(&cachep->memcg_params->memcg->css);
+}
+
 /*
  * We need to verify if the allocation against current->mm->owner's memcg is
  * possible for the given order. But the page is not allocated yet, so we'll
@@ -2917,34 +2859,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
 
 	*_memcg = NULL;
 
-	/*
-	 * Disabling accounting is only relevant for some specific memcg
-	 * internal allocations. Therefore we would initially not have such
-	 * check here, since direct calls to the page allocator that are
-	 * accounted to kmemcg (alloc_kmem_pages and friends) only happen
-	 * outside memcg core. We are mostly concerned with cache allocations,
-	 * and by having this test at memcg_kmem_get_cache, we are already able
-	 * to relay the allocation to the root cache and bypass the memcg cache
-	 * altogether.
-	 *
-	 * There is one exception, though: the SLUB allocator does not create
-	 * large order caches, but rather service large kmallocs directly from
-	 * the page allocator. Therefore, the following sequence when backed by
-	 * the SLUB allocator:
-	 *
-	 *	memcg_stop_kmem_account();
-	 *	kmalloc(<large_number>)
-	 *	memcg_resume_kmem_account();
-	 *
-	 * would effectively ignore the fact that we should skip accounting,
-	 * since it will drive us directly to this function without passing
-	 * through the cache selector memcg_kmem_get_cache. Such large
-	 * allocations are extremely rare but can happen, for instance, for the
-	 * cache arrays. We bring this test here.
-	 */
-	if (!current->mm || current->memcg_kmem_skip_account)
-		return true;
-
 	memcg = get_mem_cgroup_from_mm(current->mm);
 
 	if (!memcg_kmem_is_active(memcg)) {
@@ -2985,10 +2899,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
 	memcg_uncharge_kmem(memcg, 1 << order);
 	page->mem_cgroup = NULL;
 }
-#else
-static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
-{
-}
 #endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3539,12 +3449,6 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
 		return 0;
 
 	/*
-	 * We are going to allocate memory for data shared by all memory
-	 * cgroups so let's stop accounting here.
-	 */
-	memcg_stop_kmem_account();
-
-	/*
 	 * For simplicity, we won't allow this to be disabled.  It also can't
 	 * be changed if the cgroup has children already, or if tasks had
 	 * already joined.
@@ -3570,25 +3474,22 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
 		goto out;
 	}
 
-	memcg->kmemcg_id = memcg_id;
-	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
-
 	/*
-	 * We couldn't have accounted to this cgroup, because it hasn't got the
-	 * active bit set yet, so this should succeed.
+	 * We couldn't have accounted to this cgroup, because it hasn't got
+	 * activated yet, so this should succeed.
 	 */
 	err = page_counter_limit(&memcg->kmem, nr_pages);
 	VM_BUG_ON(err);
 
 	static_key_slow_inc(&memcg_kmem_enabled_key);
 	/*
-	 * Setting the active bit after enabling static branching will
+	 * A memory cgroup is considered kmem-active as soon as it gets
+	 * kmemcg_id. Setting the id after enabling static branching will
 	 * guarantee no one starts accounting before all call sites are
 	 * patched.
 	 */
-	memcg_kmem_set_active(memcg);
+	memcg->kmemcg_id = memcg_id;
 out:
-	memcg_resume_kmem_account();
 	return err;
 }
 
@@ -3791,11 +3692,6 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 }
 #endif /* CONFIG_NUMA */
 
-static inline void mem_cgroup_lru_names_not_uptodate(void)
-{
-	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
-}
-
 static int memcg_stat_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -3803,6 +3699,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 	struct mem_cgroup *mi;
 	unsigned int i;
 
+	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
+
 	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 			continue;
@@ -4259,7 +4157,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	int ret;
 
-	memcg->kmemcg_id = -1;
 	ret = memcg_propagate_kmem(memcg);
 	if (ret)
 		return ret;
@@ -4269,6 +4166,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 
 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
+	memcg_unregister_all_caches(memcg);
 	mem_cgroup_sockets_destroy(memcg);
 }
 #else
@@ -4724,17 +4622,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 
 	free_percpu(memcg->stat);
 
-	/*
-	 * We need to make sure that (at least for now), the jump label
-	 * destruction code runs outside of the cgroup lock. This is because
-	 * get_online_cpus(), which is called from the static_branch update,
-	 * can't be called inside the cgroup_lock. cpusets are the ones
-	 * enforcing this dependency, so if they ever change, we might as well.
-	 *
-	 * schedule_work() will guarantee this happens. Be careful if you need
-	 * to move this code around, and make sure it is outside
-	 * the cgroup_lock.
-	 */
 	disarm_static_keys(memcg);
 	kfree(memcg);
 }
@@ -4804,6 +4691,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	vmpressure_init(&memcg->vmpressure);
 	INIT_LIST_HEAD(&memcg->event_list);
 	spin_lock_init(&memcg->event_list_lock);
+#ifdef CONFIG_MEMCG_KMEM
+	memcg->kmemcg_id = -1;
+	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+#endif
 
 	return &memcg->css;
 
@@ -4885,7 +4776,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	}
 	spin_unlock(&memcg->event_list_lock);
 
-	memcg_unregister_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e5ee0ca7ae85..feb803bf3443 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -239,19 +239,14 @@ void shake_page(struct page *p, int access)
 	}
 
 	/*
-	 * Only call shrink_slab here (which would also shrink other caches) if
-	 * access is not potentially fatal.
+	 * Only call shrink_node_slabs here (which would also shrink
+	 * other caches) if access is not potentially fatal.
 	 */
 	if (access) {
 		int nr;
 		int nid = page_to_nid(p);
 		do {
-			struct shrink_control shrink = {
-				.gfp_mask = GFP_KERNEL,
-			};
-			node_set(nid, shrink.nodes_to_scan);
-
-			nr = shrink_slab(&shrink, 1000, 1000);
+			nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
 			if (page_count(p) == 1)
 				break;
 		} while (nr > 10);
@@ -466,7 +461,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 	struct task_struct *tsk;
 	struct address_space *mapping = page->mapping;
 
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_read(mapping);
 	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
 		pgoff_t pgoff = page_to_pgoff(page);
@@ -488,7 +483,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 		}
 	}
 	read_unlock(&tasklist_lock);
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_read(mapping);
 }
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index 4b5a282e1107..fbf74112de5b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1326,9 +1326,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 			 * safe to do nothing in this case.
 			 */
 			if (vma->vm_file) {
-				mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+				i_mmap_lock_write(vma->vm_file->f_mapping);
 				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
-				mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+				i_mmap_unlock_write(vma->vm_file->f_mapping);
 			}
 		} else
 			unmap_page_range(tlb, vma, start, end, details);
@@ -2377,12 +2377,12 @@ void unmap_mapping_range(struct address_space *mapping,
 		details.last_index = ULONG_MAX;
 
 
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_read(mapping);
 	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_read(mapping);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
@@ -3365,6 +3365,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(handle_mm_fault);
 
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
diff --git a/mm/migrate.c b/mm/migrate.c
index 01439953abf5..253474c22239 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -746,7 +746,7 @@ static int fallback_migrate_page(struct address_space *mapping,
  *   MIGRATEPAGE_SUCCESS - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page,
-				int remap_swapcache, enum migrate_mode mode)
+				int page_was_mapped, enum migrate_mode mode)
 {
 	struct address_space *mapping;
 	int rc;
@@ -784,7 +784,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 		newpage->mapping = NULL;
 	} else {
 		mem_cgroup_migrate(page, newpage, false);
-		if (remap_swapcache)
+		if (page_was_mapped)
 			remove_migration_ptes(page, newpage);
 		page->mapping = NULL;
 	}
@@ -798,7 +798,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 				int force, enum migrate_mode mode)
 {
 	int rc = -EAGAIN;
-	int remap_swapcache = 1;
+	int page_was_mapped = 0;
 	struct anon_vma *anon_vma = NULL;
 
 	if (!trylock_page(page)) {
@@ -870,7 +870,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 			 * migrated but are not remapped when migration
 			 * completes
 			 */
-			remap_swapcache = 0;
 		} else {
 			goto out_unlock;
 		}
@@ -910,13 +909,17 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 	}
 
 	/* Establish migration ptes or remove ptes */
-	try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+	if (page_mapped(page)) {
+		try_to_unmap(page,
+			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+		page_was_mapped = 1;
+	}
 
 skip_unmap:
 	if (!page_mapped(page))
-		rc = move_to_new_page(newpage, page, remap_swapcache, mode);
+		rc = move_to_new_page(newpage, page, page_was_mapped, mode);
 
-	if (rc && remap_swapcache)
+	if (rc && page_was_mapped)
 		remove_migration_ptes(page, page);
 
 	/* Drop an anon_vma reference if we took one */
@@ -1017,6 +1020,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 {
 	int rc = 0;
 	int *result = NULL;
+	int page_was_mapped = 0;
 	struct page *new_hpage;
 	struct anon_vma *anon_vma = NULL;
 
@@ -1047,12 +1051,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	if (PageAnon(hpage))
 		anon_vma = page_get_anon_vma(hpage);
 
-	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+	if (page_mapped(hpage)) {
+		try_to_unmap(hpage,
+			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+		page_was_mapped = 1;
+	}
 
 	if (!page_mapped(hpage))
-		rc = move_to_new_page(new_hpage, hpage, 1, mode);
+		rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode);
 
-	if (rc != MIGRATEPAGE_SUCCESS)
+	if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped)
 		remove_migration_ptes(hpage, hpage);
 
 	if (anon_vma)
diff --git a/mm/mincore.c b/mm/mincore.c
index 725c80961048..c8c528b36641 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -137,8 +137,11 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		} else { /* pte is a swap entry */
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
-			if (is_migration_entry(entry)) {
-				/* migration entries are always uptodate */
+			if (non_swap_entry(entry)) {
+				/*
+				 * migration or hwpoison entries are always
+				 * uptodate
+				 */
 				*vec = 1;
 			} else {
 #ifdef CONFIG_SWAP
diff --git a/mm/mmap.c b/mm/mmap.c
index b6c0a77fc1c8..7b36aa7cc89a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -232,7 +232,7 @@ error:
 }
 
 /*
- * Requires inode->i_mapping->i_mmap_mutex
+ * Requires inode->i_mapping->i_mmap_rwsem
  */
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
@@ -260,9 +260,9 @@ void unlink_file_vma(struct vm_area_struct *vma)
 
 	if (file) {
 		struct address_space *mapping = file->f_mapping;
-		mutex_lock(&mapping->i_mmap_mutex);
+		i_mmap_lock_write(mapping);
 		__remove_shared_vm_struct(vma, file, mapping);
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 	}
 }
 
@@ -674,14 +674,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
-		mutex_lock(&mapping->i_mmap_mutex);
+		i_mmap_lock_write(mapping);
 	}
 
 	__vma_link(mm, vma, prev, rb_link, rb_parent);
 	__vma_link_file(vma);
 
 	if (mapping)
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 
 	mm->map_count++;
 	validate_mm(mm);
@@ -796,7 +796,7 @@ again: remove_next = 1 + (end > next->vm_end);
 					next->vm_end);
 		}
 
-		mutex_lock(&mapping->i_mmap_mutex);
+		i_mmap_lock_write(mapping);
 		if (insert) {
 			/*
 			 * Put into interval tree now, so instantiated pages
@@ -883,7 +883,7 @@ again: remove_next = 1 + (end > next->vm_end);
 		anon_vma_unlock_write(anon_vma);
 	}
 	if (mapping)
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 
 	if (root) {
 		uprobe_mmap(vma);
@@ -2362,6 +2362,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
 }
 #endif
 
+EXPORT_SYMBOL_GPL(find_extend_vma);
+
 /*
  * Ok - we have the memory areas we should free on the vma list,
  * so release them, and do the vma updates.
@@ -2791,7 +2793,7 @@ void exit_mmap(struct mm_struct *mm)
 
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap tree.  If vm_file is non-NULL
- * then i_mmap_mutex is taken here.
+ * then i_mmap_rwsem is taken here.
  */
 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 {
@@ -3086,7 +3088,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
3086 */ 3088 */
3087 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 3089 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
3088 BUG(); 3090 BUG();
3089 mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); 3091 down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
3090 } 3092 }
3091} 3093}
3092 3094
@@ -3113,7 +3115,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
3113 * vma in this mm is backed by the same anon_vma or address_space. 3115 * vma in this mm is backed by the same anon_vma or address_space.
3114 * 3116 *
3115 * We can take all the locks in random order because the VM code 3117 * We can take all the locks in random order because the VM code
3116 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never 3118 * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
3117 * takes more than one of them in a row. Secondly we're protected 3119 * takes more than one of them in a row. Secondly we're protected
3118 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 3120 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
3119 * 3121 *
@@ -3182,7 +3184,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
3182 * AS_MM_ALL_LOCKS can't change to 0 from under us 3184 * AS_MM_ALL_LOCKS can't change to 0 from under us
3183 * because we hold the mm_all_locks_mutex. 3185 * because we hold the mm_all_locks_mutex.
3184 */ 3186 */
3185 mutex_unlock(&mapping->i_mmap_mutex); 3187 i_mmap_unlock_write(mapping);
3186 if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 3188 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
3187 &mapping->flags)) 3189 &mapping->flags))
3188 BUG(); 3190 BUG();
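
Note on the recurring change in mm/mmap.c (and in the files that follow): every mutex_lock()/mutex_unlock() on mapping->i_mmap_mutex becomes a call to the i_mmap_lock_write()/i_mmap_unlock_write() wrappers around the new i_mmap_rwsem. A minimal sketch of the resulting pattern, assuming only those wrappers (example_unlink_vma() itself is illustrative and not part of this patch):

#include <linux/fs.h>
#include <linux/mm.h>

/* Illustration of the post-conversion i_mmap locking pattern. */
static void example_unlink_vma(struct vm_area_struct *vma)
{
	struct address_space *mapping = vma->vm_file->f_mapping;

	i_mmap_lock_write(mapping);	/* was: mutex_lock(&mapping->i_mmap_mutex) */
	vma_interval_tree_remove(vma, &mapping->i_mmap);
	i_mmap_unlock_write(mapping);	/* was: mutex_unlock(&mapping->i_mmap_mutex) */
}
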
diff --git a/mm/mremap.c b/mm/mremap.c
index b147f66f4c40..84aa36f9f308 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -99,7 +99,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
99 spinlock_t *old_ptl, *new_ptl; 99 spinlock_t *old_ptl, *new_ptl;
100 100
101 /* 101 /*
102 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma 102 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
103 * locks to ensure that rmap will always observe either the old or the 103 * locks to ensure that rmap will always observe either the old or the
104 * new ptes. This is the easiest way to avoid races with 104 * new ptes. This is the easiest way to avoid races with
105 * truncate_pagecache(), page migration, etc... 105 * truncate_pagecache(), page migration, etc...
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
119 if (need_rmap_locks) { 119 if (need_rmap_locks) {
120 if (vma->vm_file) { 120 if (vma->vm_file) {
121 mapping = vma->vm_file->f_mapping; 121 mapping = vma->vm_file->f_mapping;
122 mutex_lock(&mapping->i_mmap_mutex); 122 i_mmap_lock_write(mapping);
123 } 123 }
124 if (vma->anon_vma) { 124 if (vma->anon_vma) {
125 anon_vma = vma->anon_vma; 125 anon_vma = vma->anon_vma;
@@ -156,7 +156,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
156 if (anon_vma) 156 if (anon_vma)
157 anon_vma_unlock_write(anon_vma); 157 anon_vma_unlock_write(anon_vma);
158 if (mapping) 158 if (mapping)
159 mutex_unlock(&mapping->i_mmap_mutex); 159 i_mmap_unlock_write(mapping);
160} 160}
161 161
162#define LATENCY_LIMIT (64 * PAGE_SIZE) 162#define LATENCY_LIMIT (64 * PAGE_SIZE)
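
The move_ptes() hunks above also show the rmap locks being taken in a fixed order when need_rmap_locks is set: the file mapping's i_mmap_rwsem first, then the anon_vma lock, matching the hierarchy documented at the top of mm/rmap.c further down. A sketch of that ordering only (example_take_rmap_locks() is illustrative, not a kernel helper):

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rmap.h>

/* Take rmap locks in the documented order; release in reverse. */
static void example_take_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->vm_file)
		i_mmap_lock_write(vma->vm_file->f_mapping);
	if (vma->anon_vma)
		anon_vma_lock_write(vma->anon_vma);
}
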
diff --git a/mm/nommu.c b/mm/nommu.c
index bd1808e194a7..b51eadf6d952 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -722,11 +722,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
722 if (vma->vm_file) { 722 if (vma->vm_file) {
723 mapping = vma->vm_file->f_mapping; 723 mapping = vma->vm_file->f_mapping;
724 724
725 mutex_lock(&mapping->i_mmap_mutex); 725 i_mmap_lock_write(mapping);
726 flush_dcache_mmap_lock(mapping); 726 flush_dcache_mmap_lock(mapping);
727 vma_interval_tree_insert(vma, &mapping->i_mmap); 727 vma_interval_tree_insert(vma, &mapping->i_mmap);
728 flush_dcache_mmap_unlock(mapping); 728 flush_dcache_mmap_unlock(mapping);
729 mutex_unlock(&mapping->i_mmap_mutex); 729 i_mmap_unlock_write(mapping);
730 } 730 }
731 731
732 /* add the VMA to the tree */ 732 /* add the VMA to the tree */
@@ -795,11 +795,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
795 if (vma->vm_file) { 795 if (vma->vm_file) {
796 mapping = vma->vm_file->f_mapping; 796 mapping = vma->vm_file->f_mapping;
797 797
798 mutex_lock(&mapping->i_mmap_mutex); 798 i_mmap_lock_write(mapping);
799 flush_dcache_mmap_lock(mapping); 799 flush_dcache_mmap_lock(mapping);
800 vma_interval_tree_remove(vma, &mapping->i_mmap); 800 vma_interval_tree_remove(vma, &mapping->i_mmap);
801 flush_dcache_mmap_unlock(mapping); 801 flush_dcache_mmap_unlock(mapping);
802 mutex_unlock(&mapping->i_mmap_mutex); 802 i_mmap_unlock_write(mapping);
803 } 803 }
804 804
805 /* remove from the MM's tree and list */ 805 /* remove from the MM's tree and list */
@@ -1149,8 +1149,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1149 unsigned long len, 1149 unsigned long len,
1150 unsigned long capabilities) 1150 unsigned long capabilities)
1151{ 1151{
1152 struct page *pages; 1152 unsigned long total, point;
1153 unsigned long total, point, n;
1154 void *base; 1153 void *base;
1155 int ret, order; 1154 int ret, order;
1156 1155
@@ -1182,33 +1181,23 @@ static int do_mmap_private(struct vm_area_struct *vma,
1182 order = get_order(len); 1181 order = get_order(len);
1183 kdebug("alloc order %d for %lx", order, len); 1182 kdebug("alloc order %d for %lx", order, len);
1184 1183
1185 pages = alloc_pages(GFP_KERNEL, order);
1186 if (!pages)
1187 goto enomem;
1188
1189 total = 1 << order; 1184 total = 1 << order;
1190 atomic_long_add(total, &mmap_pages_allocated);
1191
1192 point = len >> PAGE_SHIFT; 1185 point = len >> PAGE_SHIFT;
1193 1186
1194 /* we allocated a power-of-2 sized page set, so we may want to trim off 1187 /* we don't want to allocate a power-of-2 sized page set */
1195 * the excess */
1196 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { 1188 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
1197 while (total > point) { 1189 total = point;
1198 order = ilog2(total - point); 1190 kdebug("try to alloc exact %lu pages", total);
1199 n = 1 << order; 1191 base = alloc_pages_exact(len, GFP_KERNEL);
1200 kdebug("shave %lu/%lu @%lu", n, total - point, total); 1192 } else {
1201 atomic_long_sub(n, &mmap_pages_allocated); 1193 base = (void *)__get_free_pages(GFP_KERNEL, order);
1202 total -= n;
1203 set_page_refcounted(pages + total);
1204 __free_pages(pages + total, order);
1205 }
1206 } 1194 }
1207 1195
1208 for (point = 1; point < total; point++) 1196 if (!base)
1209 set_page_refcounted(&pages[point]); 1197 goto enomem;
1198
1199 atomic_long_add(total, &mmap_pages_allocated);
1210 1200
1211 base = page_address(pages);
1212 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; 1201 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
1213 region->vm_start = (unsigned long) base; 1202 region->vm_start = (unsigned long) base;
1214 region->vm_end = region->vm_start + len; 1203 region->vm_end = region->vm_start + len;
@@ -2094,14 +2083,14 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2094 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 2083 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
2095 2084
2096 down_write(&nommu_region_sem); 2085 down_write(&nommu_region_sem);
2097 mutex_lock(&inode->i_mapping->i_mmap_mutex); 2086 i_mmap_lock_read(inode->i_mapping);
2098 2087
2099 /* search for VMAs that fall within the dead zone */ 2088 /* search for VMAs that fall within the dead zone */
2100 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { 2089 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
2101 /* found one - only interested if it's shared out of the page 2090 /* found one - only interested if it's shared out of the page
2102 * cache */ 2091 * cache */
2103 if (vma->vm_flags & VM_SHARED) { 2092 if (vma->vm_flags & VM_SHARED) {
2104 mutex_unlock(&inode->i_mapping->i_mmap_mutex); 2093 i_mmap_unlock_read(inode->i_mapping);
2105 up_write(&nommu_region_sem); 2094 up_write(&nommu_region_sem);
2106 return -ETXTBSY; /* not quite true, but near enough */ 2095 return -ETXTBSY; /* not quite true, but near enough */
2107 } 2096 }
@@ -2113,8 +2102,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2113 * we don't check for any regions that start beyond the EOF as there 2102 * we don't check for any regions that start beyond the EOF as there
2114 * shouldn't be any 2103 * shouldn't be any
2115 */ 2104 */
2116 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 2105 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) {
2117 0, ULONG_MAX) {
2118 if (!(vma->vm_flags & VM_SHARED)) 2106 if (!(vma->vm_flags & VM_SHARED))
2119 continue; 2107 continue;
2120 2108
@@ -2129,7 +2117,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2129 } 2117 }
2130 } 2118 }
2131 2119
2132 mutex_unlock(&inode->i_mapping->i_mmap_mutex); 2120 i_mmap_unlock_read(inode->i_mapping);
2133 up_write(&nommu_region_sem); 2121 up_write(&nommu_region_sem);
2134 return 0; 2122 return 0;
2135} 2123}
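
The do_mmap_private() rework above drops the allocate-then-trim loop: when trimming would have applied, it now asks for exactly len bytes via alloc_pages_exact(), otherwise it takes a power-of-two region from __get_free_pages(). A sketch of the two paths and their matching free routines; want_exact stands in for the sysctl_nr_trim_pages condition and is an assumption for illustration:

#include <linux/types.h>
#include <linux/gfp.h>

static void *example_mmap_backing_alloc(unsigned long len, int order, bool want_exact)
{
	if (want_exact)
		/* sized exactly; release with free_pages_exact(base, len) */
		return alloc_pages_exact(len, GFP_KERNEL);

	/* power-of-two sized; release with free_pages((unsigned long)base, order) */
	return (void *)__get_free_pages(GFP_KERNEL, order);
}
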
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 864bba992735..d503e9ce1c7b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -281,14 +281,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
281 if (oom_task_origin(task)) 281 if (oom_task_origin(task))
282 return OOM_SCAN_SELECT; 282 return OOM_SCAN_SELECT;
283 283
284 if (task->flags & PF_EXITING && !force_kill) { 284 if (task_will_free_mem(task) && !force_kill)
285 /* 285 return OOM_SCAN_ABORT;
286 * If this task is not being ptraced on exit, then wait for it 286
287 * to finish before killing some other task unnecessarily.
288 */
289 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
290 return OOM_SCAN_ABORT;
291 }
292 return OOM_SCAN_OK; 287 return OOM_SCAN_OK;
293} 288}
294 289
@@ -443,7 +438,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
443 * If the task is already exiting, don't alarm the sysadmin or kill 438 * If the task is already exiting, don't alarm the sysadmin or kill
444 * its children or threads, just set TIF_MEMDIE so it can die quickly 439 * its children or threads, just set TIF_MEMDIE so it can die quickly
445 */ 440 */
446 if (p->flags & PF_EXITING) { 441 if (task_will_free_mem(p)) {
447 set_tsk_thread_flag(p, TIF_MEMDIE); 442 set_tsk_thread_flag(p, TIF_MEMDIE);
448 put_task_struct(p); 443 put_task_struct(p);
449 return; 444 return;
@@ -649,7 +644,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
649 * select it. The goal is to allow it to allocate so that it may 644 * select it. The goal is to allow it to allocate so that it may
650 * quickly exit and free its memory. 645 * quickly exit and free its memory.
651 */ 646 */
652 if (fatal_signal_pending(current) || current->flags & PF_EXITING) { 647 if (fatal_signal_pending(current) || task_will_free_mem(current)) {
653 set_thread_flag(TIF_MEMDIE); 648 set_thread_flag(TIF_MEMDIE);
654 return; 649 return;
655 } 650 }
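
The three hunks above replace open-coded PF_EXITING tests with task_will_free_mem(). The helper itself is defined outside this diff; the sketch below is only an inference from the check removed in oom_scan_process_thread(), not the actual header definition:

#include <linux/sched.h>
#include <linux/ptrace.h>

/* Hypothetical consolidation of the removed open-coded test. */
static inline bool example_task_will_free_mem(struct task_struct *task)
{
	return (task->flags & PF_EXITING) &&
	       !(task->group_leader->ptrace & PT_TRACE_EXIT);
}
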
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df542feaac3b..fa974d87f60d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
48#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
49#include <linux/fault-inject.h> 49#include <linux/fault-inject.h>
50#include <linux/page-isolation.h> 50#include <linux/page-isolation.h>
51#include <linux/page_ext.h>
51#include <linux/debugobjects.h> 52#include <linux/debugobjects.h>
52#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
53#include <linux/compaction.h> 54#include <linux/compaction.h>
@@ -55,9 +56,10 @@
55#include <linux/prefetch.h> 56#include <linux/prefetch.h>
56#include <linux/mm_inline.h> 57#include <linux/mm_inline.h>
57#include <linux/migrate.h> 58#include <linux/migrate.h>
58#include <linux/page-debug-flags.h> 59#include <linux/page_ext.h>
59#include <linux/hugetlb.h> 60#include <linux/hugetlb.h>
60#include <linux/sched/rt.h> 61#include <linux/sched/rt.h>
62#include <linux/page_owner.h>
61 63
62#include <asm/sections.h> 64#include <asm/sections.h>
63#include <asm/tlbflush.h> 65#include <asm/tlbflush.h>
@@ -424,6 +426,42 @@ static inline void prep_zero_page(struct page *page, unsigned int order,
424 426
425#ifdef CONFIG_DEBUG_PAGEALLOC 427#ifdef CONFIG_DEBUG_PAGEALLOC
426unsigned int _debug_guardpage_minorder; 428unsigned int _debug_guardpage_minorder;
429bool _debug_pagealloc_enabled __read_mostly;
430bool _debug_guardpage_enabled __read_mostly;
431
432static int __init early_debug_pagealloc(char *buf)
433{
434 if (!buf)
435 return -EINVAL;
436
437 if (strcmp(buf, "on") == 0)
438 _debug_pagealloc_enabled = true;
439
440 return 0;
441}
442early_param("debug_pagealloc", early_debug_pagealloc);
443
444static bool need_debug_guardpage(void)
445{
446 /* If we don't use debug_pagealloc, we don't need guard page */
447 if (!debug_pagealloc_enabled())
448 return false;
449
450 return true;
451}
452
453static void init_debug_guardpage(void)
454{
455 if (!debug_pagealloc_enabled())
456 return;
457
458 _debug_guardpage_enabled = true;
459}
460
461struct page_ext_operations debug_guardpage_ops = {
462 .need = need_debug_guardpage,
463 .init = init_debug_guardpage,
464};
427 465
428static int __init debug_guardpage_minorder_setup(char *buf) 466static int __init debug_guardpage_minorder_setup(char *buf)
429{ 467{
@@ -439,18 +477,44 @@ static int __init debug_guardpage_minorder_setup(char *buf)
439} 477}
440__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 478__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
441 479
442static inline void set_page_guard_flag(struct page *page) 480static inline void set_page_guard(struct zone *zone, struct page *page,
481 unsigned int order, int migratetype)
443{ 482{
444 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 483 struct page_ext *page_ext;
484
485 if (!debug_guardpage_enabled())
486 return;
487
488 page_ext = lookup_page_ext(page);
489 __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
490
491 INIT_LIST_HEAD(&page->lru);
492 set_page_private(page, order);
493 /* Guard pages are not available for any usage */
494 __mod_zone_freepage_state(zone, -(1 << order), migratetype);
445} 495}
446 496
447static inline void clear_page_guard_flag(struct page *page) 497static inline void clear_page_guard(struct zone *zone, struct page *page,
498 unsigned int order, int migratetype)
448{ 499{
449 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 500 struct page_ext *page_ext;
501
502 if (!debug_guardpage_enabled())
503 return;
504
505 page_ext = lookup_page_ext(page);
506 __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
507
508 set_page_private(page, 0);
509 if (!is_migrate_isolate(migratetype))
510 __mod_zone_freepage_state(zone, (1 << order), migratetype);
450} 511}
451#else 512#else
452static inline void set_page_guard_flag(struct page *page) { } 513struct page_ext_operations debug_guardpage_ops = { NULL, };
453static inline void clear_page_guard_flag(struct page *page) { } 514static inline void set_page_guard(struct zone *zone, struct page *page,
515 unsigned int order, int migratetype) {}
516static inline void clear_page_guard(struct zone *zone, struct page *page,
517 unsigned int order, int migratetype) {}
454#endif 518#endif
455 519
456static inline void set_page_order(struct page *page, unsigned int order) 520static inline void set_page_order(struct page *page, unsigned int order)
@@ -581,12 +645,7 @@ static inline void __free_one_page(struct page *page,
581 * merge with it and move up one order. 645 * merge with it and move up one order.
582 */ 646 */
583 if (page_is_guard(buddy)) { 647 if (page_is_guard(buddy)) {
584 clear_page_guard_flag(buddy); 648 clear_page_guard(zone, buddy, order, migratetype);
585 set_page_private(buddy, 0);
586 if (!is_migrate_isolate(migratetype)) {
587 __mod_zone_freepage_state(zone, 1 << order,
588 migratetype);
589 }
590 } else { 649 } else {
591 list_del(&buddy->lru); 650 list_del(&buddy->lru);
592 zone->free_area[order].nr_free--; 651 zone->free_area[order].nr_free--;
@@ -755,6 +814,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
755 if (bad) 814 if (bad)
756 return false; 815 return false;
757 816
817 reset_page_owner(page, order);
818
758 if (!PageHighMem(page)) { 819 if (!PageHighMem(page)) {
759 debug_check_no_locks_freed(page_address(page), 820 debug_check_no_locks_freed(page_address(page),
760 PAGE_SIZE << order); 821 PAGE_SIZE << order);
@@ -861,23 +922,18 @@ static inline void expand(struct zone *zone, struct page *page,
861 size >>= 1; 922 size >>= 1;
862 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); 923 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
863 924
864#ifdef CONFIG_DEBUG_PAGEALLOC 925 if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
865 if (high < debug_guardpage_minorder()) { 926 debug_guardpage_enabled() &&
927 high < debug_guardpage_minorder()) {
866 /* 928 /*
867 * Mark as guard pages (or page), that will allow to 929 * Mark as guard pages (or page), that will allow to
868 * merge back to allocator when buddy will be freed. 930 * merge back to allocator when buddy will be freed.
869 * Corresponding page table entries will not be touched, 931 * Corresponding page table entries will not be touched,
870 * pages will stay not present in virtual address space 932 * pages will stay not present in virtual address space
871 */ 933 */
872 INIT_LIST_HEAD(&page[size].lru); 934 set_page_guard(zone, &page[size], high, migratetype);
873 set_page_guard_flag(&page[size]);
874 set_page_private(&page[size], high);
875 /* Guard pages are not available for any usage */
876 __mod_zone_freepage_state(zone, -(1 << high),
877 migratetype);
878 continue; 935 continue;
879 } 936 }
880#endif
881 list_add(&page[size].lru, &area->free_list[migratetype]); 937 list_add(&page[size].lru, &area->free_list[migratetype]);
882 area->nr_free++; 938 area->nr_free++;
883 set_page_order(&page[size], high); 939 set_page_order(&page[size], high);
@@ -935,6 +991,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
935 if (order && (gfp_flags & __GFP_COMP)) 991 if (order && (gfp_flags & __GFP_COMP))
936 prep_compound_page(page, order); 992 prep_compound_page(page, order);
937 993
994 set_page_owner(page, order, gfp_flags);
995
938 return 0; 996 return 0;
939} 997}
940 998
@@ -1507,8 +1565,11 @@ void split_page(struct page *page, unsigned int order)
1507 split_page(virt_to_page(page[0].shadow), order); 1565 split_page(virt_to_page(page[0].shadow), order);
1508#endif 1566#endif
1509 1567
1510 for (i = 1; i < (1 << order); i++) 1568 set_page_owner(page, 0, 0);
1569 for (i = 1; i < (1 << order); i++) {
1511 set_page_refcounted(page + i); 1570 set_page_refcounted(page + i);
1571 set_page_owner(page + i, 0, 0);
1572 }
1512} 1573}
1513EXPORT_SYMBOL_GPL(split_page); 1574EXPORT_SYMBOL_GPL(split_page);
1514 1575
@@ -1548,6 +1609,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
1548 } 1609 }
1549 } 1610 }
1550 1611
1612 set_page_owner(page, order, 0);
1551 return 1UL << order; 1613 return 1UL << order;
1552} 1614}
1553 1615
@@ -4856,6 +4918,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4856#endif 4918#endif
4857 init_waitqueue_head(&pgdat->kswapd_wait); 4919 init_waitqueue_head(&pgdat->kswapd_wait);
4858 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4920 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4921 pgdat_page_ext_init(pgdat);
4859 4922
4860 for (j = 0; j < MAX_NR_ZONES; j++) { 4923 for (j = 0; j < MAX_NR_ZONES; j++) {
4861 struct zone *zone = pgdat->node_zones + j; 4924 struct zone *zone = pgdat->node_zones + j;
@@ -4874,16 +4937,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4874 * and per-cpu initialisations 4937 * and per-cpu initialisations
4875 */ 4938 */
4876 memmap_pages = calc_memmap_size(size, realsize); 4939 memmap_pages = calc_memmap_size(size, realsize);
4877 if (freesize >= memmap_pages) { 4940 if (!is_highmem_idx(j)) {
4878 freesize -= memmap_pages; 4941 if (freesize >= memmap_pages) {
4879 if (memmap_pages) 4942 freesize -= memmap_pages;
4880 printk(KERN_DEBUG 4943 if (memmap_pages)
4881 " %s zone: %lu pages used for memmap\n", 4944 printk(KERN_DEBUG
4882 zone_names[j], memmap_pages); 4945 " %s zone: %lu pages used for memmap\n",
4883 } else 4946 zone_names[j], memmap_pages);
4884 printk(KERN_WARNING 4947 } else
4885 " %s zone: %lu pages exceeds freesize %lu\n", 4948 printk(KERN_WARNING
4886 zone_names[j], memmap_pages, freesize); 4949 " %s zone: %lu pages exceeds freesize %lu\n",
4950 zone_names[j], memmap_pages, freesize);
4951 }
4887 4952
4888 /* Account for reserved pages */ 4953 /* Account for reserved pages */
4889 if (j == 0 && freesize > dma_reserve) { 4954 if (j == 0 && freesize > dma_reserve) {
@@ -6221,9 +6286,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6221 if (!PageLRU(page)) 6286 if (!PageLRU(page))
6222 found++; 6287 found++;
6223 /* 6288 /*
6224 * If there are RECLAIMABLE pages, we need to check it. 6289 * If there are RECLAIMABLE pages, we need to check
6225 * But now, memory offline itself doesn't call shrink_slab() 6290 * it. But now, memory offline itself doesn't call
6226 * and it still to be fixed. 6291 * shrink_node_slabs() and it still to be fixed.
6227 */ 6292 */
6228 /* 6293 /*
6229 * If the page is not RAM, page_count()should be 0. 6294 * If the page is not RAM, page_count()should be 0.
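
Both of the new debug facilities wired up above stay disabled by default; their early_param() handlers only act when the parameter value is the literal string "on". A kernel command line enabling both would therefore contain, for example:

	debug_pagealloc=on page_owner=on

(debug_pagealloc= is parsed in mm/page_alloc.c above, page_owner= in mm/page_owner.c below.)
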
diff --git a/mm/page_ext.c b/mm/page_ext.c
new file mode 100644
index 000000000000..d86fd2f5353f
--- /dev/null
+++ b/mm/page_ext.c
@@ -0,0 +1,403 @@
1#include <linux/mm.h>
2#include <linux/mmzone.h>
3#include <linux/bootmem.h>
4#include <linux/page_ext.h>
5#include <linux/memory.h>
6#include <linux/vmalloc.h>
7#include <linux/kmemleak.h>
8#include <linux/page_owner.h>
9
10/*
11 * struct page extension
12 *
13 * This is the feature to manage memory for extended data per page.
14 *
15 * Until now, we must modify struct page itself to store extra data per page.
16 * This requires rebuilding the kernel, which is a really time-consuming process.
17 * And sometimes a rebuild is impossible due to third-party module dependencies.
18 * Lastly, enlarging struct page could cause unwanted changes in system behaviour.
19 *
20 * This feature is intended to overcome the problems mentioned above. It
21 * allocates memory for extended per-page data in a separate place rather than
22 * in struct page itself. This memory can be accessed through the accessor
23 * functions provided by this code. During the boot process, it checks whether
24 * the allocation of this large chunk of memory is needed at all. If not, it
25 * avoids allocating the memory entirely. Thanks to this, the feature can be
26 * included in the kernel by default while avoiding rebuilds and related problems.
27 *
28 * To help these things to work well, there are two callbacks for clients. One
29 * is the need callback which is mandatory if user wants to avoid useless
30 * memory allocation at boot-time. The other is optional, init callback, which
31 * is used to do proper initialization after memory is allocated.
32 *
33 * The need callback is used to decide whether extended memory allocation is
34 * needed or not. Sometimes users want to deactivate some features for a given
35 * boot, in which case the extra memory would be unnecessary. To avoid
36 * allocating a huge chunk of memory in that case, each client reports its need
37 * for extra memory through the need callback. If one of the need callbacks
38 * returns true, it means that someone needs extra memory, so the
39 * page extension core should allocate memory for page extension. If
40 * none of the need callbacks returns true, memory isn't needed at all for this
41 * boot and the page extension core can skip the allocation. As a result,
42 * no memory is wasted.
43 *
44 * The init callback is used to do proper initialization after page extension
45 * is completely initialized. On sparse memory systems, the extra memory is
46 * allocated some time after the memmap is allocated. In other words, the
47 * lifetime of the page extension memory is not the same as that of the memmap.
48 * Therefore, clients can't store extra data until page extension is
49 * initialized, even though pages may already be allocated and in use. This
50 * could leave the per-page extra data in an inadequate state, so, to prevent
51 * that, a client can use this callback to initialize the state correctly.
52 */
53
54static struct page_ext_operations *page_ext_ops[] = {
55 &debug_guardpage_ops,
56#ifdef CONFIG_PAGE_POISONING
57 &page_poisoning_ops,
58#endif
59#ifdef CONFIG_PAGE_OWNER
60 &page_owner_ops,
61#endif
62};
63
64static unsigned long total_usage;
65
66static bool __init invoke_need_callbacks(void)
67{
68 int i;
69 int entries = ARRAY_SIZE(page_ext_ops);
70
71 for (i = 0; i < entries; i++) {
72 if (page_ext_ops[i]->need && page_ext_ops[i]->need())
73 return true;
74 }
75
76 return false;
77}
78
79static void __init invoke_init_callbacks(void)
80{
81 int i;
82 int entries = ARRAY_SIZE(page_ext_ops);
83
84 for (i = 0; i < entries; i++) {
85 if (page_ext_ops[i]->init)
86 page_ext_ops[i]->init();
87 }
88}
89
90#if !defined(CONFIG_SPARSEMEM)
91
92
93void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
94{
95 pgdat->node_page_ext = NULL;
96}
97
98struct page_ext *lookup_page_ext(struct page *page)
99{
100 unsigned long pfn = page_to_pfn(page);
101 unsigned long offset;
102 struct page_ext *base;
103
104 base = NODE_DATA(page_to_nid(page))->node_page_ext;
105#ifdef CONFIG_DEBUG_VM
106 /*
107 * The sanity checks the page allocator does upon freeing a
108 * page can reach here before the page_ext arrays are
109 * allocated when feeding a range of pages to the allocator
110 * for the first time during bootup or memory hotplug.
111 */
112 if (unlikely(!base))
113 return NULL;
114#endif
115 offset = pfn - round_down(node_start_pfn(page_to_nid(page)),
116 MAX_ORDER_NR_PAGES);
117 return base + offset;
118}
119
120static int __init alloc_node_page_ext(int nid)
121{
122 struct page_ext *base;
123 unsigned long table_size;
124 unsigned long nr_pages;
125
126 nr_pages = NODE_DATA(nid)->node_spanned_pages;
127 if (!nr_pages)
128 return 0;
129
130 /*
131 * Need extra space if node range is not aligned with
132 * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm
133 * checks buddy's status, range could be out of exact node range.
134 */
135 if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
136 !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
137 nr_pages += MAX_ORDER_NR_PAGES;
138
139 table_size = sizeof(struct page_ext) * nr_pages;
140
141 base = memblock_virt_alloc_try_nid_nopanic(
142 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
143 BOOTMEM_ALLOC_ACCESSIBLE, nid);
144 if (!base)
145 return -ENOMEM;
146 NODE_DATA(nid)->node_page_ext = base;
147 total_usage += table_size;
148 return 0;
149}
150
151void __init page_ext_init_flatmem(void)
152{
153
154 int nid, fail;
155
156 if (!invoke_need_callbacks())
157 return;
158
159 for_each_online_node(nid) {
160 fail = alloc_node_page_ext(nid);
161 if (fail)
162 goto fail;
163 }
164 pr_info("allocated %ld bytes of page_ext\n", total_usage);
165 invoke_init_callbacks();
166 return;
167
168fail:
169 pr_crit("allocation of page_ext failed.\n");
170 panic("Out of memory");
171}
172
173#else /* CONFIG_FLAT_NODE_MEM_MAP */
174
175struct page_ext *lookup_page_ext(struct page *page)
176{
177 unsigned long pfn = page_to_pfn(page);
178 struct mem_section *section = __pfn_to_section(pfn);
179#ifdef CONFIG_DEBUG_VM
180 /*
181 * The sanity checks the page allocator does upon freeing a
182 * page can reach here before the page_ext arrays are
183 * allocated when feeding a range of pages to the allocator
184 * for the first time during bootup or memory hotplug.
185 */
186 if (!section->page_ext)
187 return NULL;
188#endif
189 return section->page_ext + pfn;
190}
191
192static void *__meminit alloc_page_ext(size_t size, int nid)
193{
194 gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
195 void *addr = NULL;
196
197 addr = alloc_pages_exact_nid(nid, size, flags);
198 if (addr) {
199 kmemleak_alloc(addr, size, 1, flags);
200 return addr;
201 }
202
203 if (node_state(nid, N_HIGH_MEMORY))
204 addr = vzalloc_node(size, nid);
205 else
206 addr = vzalloc(size);
207
208 return addr;
209}
210
211static int __meminit init_section_page_ext(unsigned long pfn, int nid)
212{
213 struct mem_section *section;
214 struct page_ext *base;
215 unsigned long table_size;
216
217 section = __pfn_to_section(pfn);
218
219 if (section->page_ext)
220 return 0;
221
222 table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
223 base = alloc_page_ext(table_size, nid);
224
225 /*
226 * The value stored in section->page_ext is (base - pfn)
227 * and it does not point to the memory block allocated above,
228 * causing kmemleak false positives.
229 */
230 kmemleak_not_leak(base);
231
232 if (!base) {
233 pr_err("page ext allocation failure\n");
234 return -ENOMEM;
235 }
236
237 /*
238 * The passed "pfn" may not be aligned to SECTION. For the calculation
239 * we need to apply a mask.
240 */
241 pfn &= PAGE_SECTION_MASK;
242 section->page_ext = base - pfn;
243 total_usage += table_size;
244 return 0;
245}
246#ifdef CONFIG_MEMORY_HOTPLUG
247static void free_page_ext(void *addr)
248{
249 if (is_vmalloc_addr(addr)) {
250 vfree(addr);
251 } else {
252 struct page *page = virt_to_page(addr);
253 size_t table_size;
254
255 table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
256
257 BUG_ON(PageReserved(page));
258 free_pages_exact(addr, table_size);
259 }
260}
261
262static void __free_page_ext(unsigned long pfn)
263{
264 struct mem_section *ms;
265 struct page_ext *base;
266
267 ms = __pfn_to_section(pfn);
268 if (!ms || !ms->page_ext)
269 return;
270 base = ms->page_ext + pfn;
271 free_page_ext(base);
272 ms->page_ext = NULL;
273}
274
275static int __meminit online_page_ext(unsigned long start_pfn,
276 unsigned long nr_pages,
277 int nid)
278{
279 unsigned long start, end, pfn;
280 int fail = 0;
281
282 start = SECTION_ALIGN_DOWN(start_pfn);
283 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
284
285 if (nid == -1) {
286 /*
287 * In this case, "nid" already exists and contains valid memory.
288 * "start_pfn" passed to us is a pfn which is an arg for
289 * online__pages(), and start_pfn should exist.
290 */
291 nid = pfn_to_nid(start_pfn);
292 VM_BUG_ON(!node_state(nid, N_ONLINE));
293 }
294
295 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
296 if (!pfn_present(pfn))
297 continue;
298 fail = init_section_page_ext(pfn, nid);
299 }
300 if (!fail)
301 return 0;
302
303 /* rollback */
304 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
305 __free_page_ext(pfn);
306
307 return -ENOMEM;
308}
309
310static int __meminit offline_page_ext(unsigned long start_pfn,
311 unsigned long nr_pages, int nid)
312{
313 unsigned long start, end, pfn;
314
315 start = SECTION_ALIGN_DOWN(start_pfn);
316 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
317
318 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
319 __free_page_ext(pfn);
320 return 0;
321
322}
323
324static int __meminit page_ext_callback(struct notifier_block *self,
325 unsigned long action, void *arg)
326{
327 struct memory_notify *mn = arg;
328 int ret = 0;
329
330 switch (action) {
331 case MEM_GOING_ONLINE:
332 ret = online_page_ext(mn->start_pfn,
333 mn->nr_pages, mn->status_change_nid);
334 break;
335 case MEM_OFFLINE:
336 offline_page_ext(mn->start_pfn,
337 mn->nr_pages, mn->status_change_nid);
338 break;
339 case MEM_CANCEL_ONLINE:
340 offline_page_ext(mn->start_pfn,
341 mn->nr_pages, mn->status_change_nid);
342 break;
343 case MEM_GOING_OFFLINE:
344 break;
345 case MEM_ONLINE:
346 case MEM_CANCEL_OFFLINE:
347 break;
348 }
349
350 return notifier_from_errno(ret);
351}
352
353#endif
354
355void __init page_ext_init(void)
356{
357 unsigned long pfn;
358 int nid;
359
360 if (!invoke_need_callbacks())
361 return;
362
363 for_each_node_state(nid, N_MEMORY) {
364 unsigned long start_pfn, end_pfn;
365
366 start_pfn = node_start_pfn(nid);
367 end_pfn = node_end_pfn(nid);
368 /*
369 * start_pfn and end_pfn may not be aligned to SECTION and the
370 * page->flags of out of node pages are not initialized. So we
371 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
372 */
373 for (pfn = start_pfn; pfn < end_pfn;
374 pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
375
376 if (!pfn_valid(pfn))
377 continue;
378 /*
379 * Nodes' pfn ranges can overlap.
380 * We know some architectures can have a node layout such as
381 * -------------pfn-------------->
382 * N0 | N1 | N2 | N0 | N1 | N2|....
383 */
384 if (pfn_to_nid(pfn) != nid)
385 continue;
386 if (init_section_page_ext(pfn, nid))
387 goto oom;
388 }
389 }
390 hotplug_memory_notifier(page_ext_callback, 0);
391 pr_info("allocated %ld bytes of page_ext\n", total_usage);
392 invoke_init_callbacks();
393 return;
394
395oom:
396 panic("Out of memory");
397}
398
399void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
400{
401}
402
403#endif
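
The header comment of mm/page_ext.c describes a need/init contract for clients. A hypothetical client following that contract might look like the sketch below; my_feature_enabled, the helper names and the (required) hookup of my_feature_ops into the page_ext_ops[] array are assumptions for illustration, not part of this patch:

#include <linux/types.h>
#include <linux/page_ext.h>

static bool my_feature_enabled;	/* e.g. set from an early_param() handler */

static bool need_my_feature(void)
{
	/* Returning false lets the core skip the extension allocation. */
	return my_feature_enabled;
}

static void init_my_feature(void)
{
	/* Runs once the page_ext arrays for all nodes/sections exist. */
}

struct page_ext_operations my_feature_ops = {
	.need = need_my_feature,
	.init = init_my_feature,
};
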
diff --git a/mm/page_owner.c b/mm/page_owner.c
new file mode 100644
index 000000000000..9ab4a9b5bc09
--- /dev/null
+++ b/mm/page_owner.c
@@ -0,0 +1,311 @@
1#include <linux/debugfs.h>
2#include <linux/mm.h>
3#include <linux/slab.h>
4#include <linux/uaccess.h>
5#include <linux/bootmem.h>
6#include <linux/stacktrace.h>
7#include <linux/page_owner.h>
8#include "internal.h"
9
10static bool page_owner_disabled = true;
11bool page_owner_inited __read_mostly;
12
13static void init_early_allocated_pages(void);
14
15static int early_page_owner_param(char *buf)
16{
17 if (!buf)
18 return -EINVAL;
19
20 if (strcmp(buf, "on") == 0)
21 page_owner_disabled = false;
22
23 return 0;
24}
25early_param("page_owner", early_page_owner_param);
26
27static bool need_page_owner(void)
28{
29 if (page_owner_disabled)
30 return false;
31
32 return true;
33}
34
35static void init_page_owner(void)
36{
37 if (page_owner_disabled)
38 return;
39
40 page_owner_inited = true;
41 init_early_allocated_pages();
42}
43
44struct page_ext_operations page_owner_ops = {
45 .need = need_page_owner,
46 .init = init_page_owner,
47};
48
49void __reset_page_owner(struct page *page, unsigned int order)
50{
51 int i;
52 struct page_ext *page_ext;
53
54 for (i = 0; i < (1 << order); i++) {
55 page_ext = lookup_page_ext(page + i);
56 __clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
57 }
58}
59
60void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
61{
62 struct page_ext *page_ext;
63 struct stack_trace *trace;
64
65 page_ext = lookup_page_ext(page);
66
67 trace = &page_ext->trace;
68 trace->nr_entries = 0;
69 trace->max_entries = ARRAY_SIZE(page_ext->trace_entries);
70 trace->entries = &page_ext->trace_entries[0];
71 trace->skip = 3;
72 save_stack_trace(&page_ext->trace);
73
74 page_ext->order = order;
75 page_ext->gfp_mask = gfp_mask;
76
77 __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
78}
79
80static ssize_t
81print_page_owner(char __user *buf, size_t count, unsigned long pfn,
82 struct page *page, struct page_ext *page_ext)
83{
84 int ret;
85 int pageblock_mt, page_mt;
86 char *kbuf;
87
88 kbuf = kmalloc(count, GFP_KERNEL);
89 if (!kbuf)
90 return -ENOMEM;
91
92 ret = snprintf(kbuf, count,
93 "Page allocated via order %u, mask 0x%x\n",
94 page_ext->order, page_ext->gfp_mask);
95
96 if (ret >= count)
97 goto err;
98
99 /* Print information relevant to grouping pages by mobility */
100 pageblock_mt = get_pfnblock_migratetype(page, pfn);
101 page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
102 ret += snprintf(kbuf + ret, count - ret,
103 "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n",
104 pfn,
105 pfn >> pageblock_order,
106 pageblock_mt,
107 pageblock_mt != page_mt ? "Fallback" : " ",
108 PageLocked(page) ? "K" : " ",
109 PageError(page) ? "E" : " ",
110 PageReferenced(page) ? "R" : " ",
111 PageUptodate(page) ? "U" : " ",
112 PageDirty(page) ? "D" : " ",
113 PageLRU(page) ? "L" : " ",
114 PageActive(page) ? "A" : " ",
115 PageSlab(page) ? "S" : " ",
116 PageWriteback(page) ? "W" : " ",
117 PageCompound(page) ? "C" : " ",
118 PageSwapCache(page) ? "B" : " ",
119 PageMappedToDisk(page) ? "M" : " ");
120
121 if (ret >= count)
122 goto err;
123
124 ret += snprint_stack_trace(kbuf + ret, count - ret,
125 &page_ext->trace, 0);
126 if (ret >= count)
127 goto err;
128
129 ret += snprintf(kbuf + ret, count - ret, "\n");
130 if (ret >= count)
131 goto err;
132
133 if (copy_to_user(buf, kbuf, ret))
134 ret = -EFAULT;
135
136 kfree(kbuf);
137 return ret;
138
139err:
140 kfree(kbuf);
141 return -ENOMEM;
142}
143
144static ssize_t
145read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
146{
147 unsigned long pfn;
148 struct page *page;
149 struct page_ext *page_ext;
150
151 if (!page_owner_inited)
152 return -EINVAL;
153
154 page = NULL;
155 pfn = min_low_pfn + *ppos;
156
157 /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
158 while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
159 pfn++;
160
161 drain_all_pages(NULL);
162
163 /* Find an allocated page */
164 for (; pfn < max_pfn; pfn++) {
165 /*
166 * If the new page is in a new MAX_ORDER_NR_PAGES area,
167 * validate the area as existing, skip it if not
168 */
169 if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
170 pfn += MAX_ORDER_NR_PAGES - 1;
171 continue;
172 }
173
174 /* Check for holes within a MAX_ORDER area */
175 if (!pfn_valid_within(pfn))
176 continue;
177
178 page = pfn_to_page(pfn);
179 if (PageBuddy(page)) {
180 unsigned long freepage_order = page_order_unsafe(page);
181
182 if (freepage_order < MAX_ORDER)
183 pfn += (1UL << freepage_order) - 1;
184 continue;
185 }
186
187 page_ext = lookup_page_ext(page);
188
189 /*
190 * Some pages could be missed by concurrent allocation or free,
191 * because we don't hold the zone lock.
192 */
193 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
194 continue;
195
196 /* Record the next PFN to read in the file offset */
197 *ppos = (pfn - min_low_pfn) + 1;
198
199 return print_page_owner(buf, count, pfn, page, page_ext);
200 }
201
202 return 0;
203}
204
205static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
206{
207 struct page *page;
208 struct page_ext *page_ext;
209 unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
210 unsigned long end_pfn = pfn + zone->spanned_pages;
211 unsigned long count = 0;
212
213 /* Scan block by block. First and last block may be incomplete */
214 pfn = zone->zone_start_pfn;
215
216 /*
217 * Walk the zone in pageblock_nr_pages steps. If a page block spans
218 * a zone boundary, it will be double counted between zones. This does
219 * not matter as the mixed block count will still be correct
220 */
221 for (; pfn < end_pfn; ) {
222 if (!pfn_valid(pfn)) {
223 pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
224 continue;
225 }
226
227 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
228 block_end_pfn = min(block_end_pfn, end_pfn);
229
230 page = pfn_to_page(pfn);
231
232 for (; pfn < block_end_pfn; pfn++) {
233 if (!pfn_valid_within(pfn))
234 continue;
235
236 page = pfn_to_page(pfn);
237
238 /*
239 * It is safe to check the buddy flag and order here, because
240 * this is the init stage and only a single thread runs.
241 */
242 if (PageBuddy(page)) {
243 pfn += (1UL << page_order(page)) - 1;
244 continue;
245 }
246
247 if (PageReserved(page))
248 continue;
249
250 page_ext = lookup_page_ext(page);
251
252 /* Maybe overlapping zone */
253 if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
254 continue;
255
256 /* Found early allocated page */
257 set_page_owner(page, 0, 0);
258 count++;
259 }
260 }
261
262 pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
263 pgdat->node_id, zone->name, count);
264}
265
266static void init_zones_in_node(pg_data_t *pgdat)
267{
268 struct zone *zone;
269 struct zone *node_zones = pgdat->node_zones;
270 unsigned long flags;
271
272 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
273 if (!populated_zone(zone))
274 continue;
275
276 spin_lock_irqsave(&zone->lock, flags);
277 init_pages_in_zone(pgdat, zone);
278 spin_unlock_irqrestore(&zone->lock, flags);
279 }
280}
281
282static void init_early_allocated_pages(void)
283{
284 pg_data_t *pgdat;
285
286 drain_all_pages(NULL);
287 for_each_online_pgdat(pgdat)
288 init_zones_in_node(pgdat);
289}
290
291static const struct file_operations proc_page_owner_operations = {
292 .read = read_page_owner,
293};
294
295static int __init pageowner_init(void)
296{
297 struct dentry *dentry;
298
299 if (!page_owner_inited) {
300 pr_info("page_owner is disabled\n");
301 return 0;
302 }
303
304 dentry = debugfs_create_file("page_owner", S_IRUSR, NULL,
305 NULL, &proc_page_owner_operations);
306 if (IS_ERR(dentry))
307 return PTR_ERR(dentry);
308
309 return 0;
310}
311module_init(pageowner_init)
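
Usage note: pageowner_init() registers a read-only "page_owner" file at the debugfs root, so with debugfs mounted at its conventional location the accumulated allocation records can be dumped with `cat /sys/kernel/debug/page_owner`, assuming the kernel was booted with page_owner=on so that page_owner_inited is set.
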
diff --git a/mm/rmap.c b/mm/rmap.c
index 45eba36fd673..c52f43a69eea 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,7 +23,7 @@
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * mm->mmap_sem 24 * mm->mmap_sem
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * mapping->i_mmap_mutex 26 * mapping->i_mmap_rwsem
27 * anon_vma->rwsem 27 * anon_vma->rwsem
28 * mm->page_table_lock or pte_lock 28 * mm->page_table_lock or pte_lock
29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -1260,7 +1260,7 @@ out_mlock:
1260 /* 1260 /*
1261 * We need mmap_sem locking, Otherwise VM_LOCKED check makes 1261 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
1262 * unstable result and race. Plus, We can't wait here because 1262 * unstable result and race. Plus, We can't wait here because
1263 * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. 1263 * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem.
1264 * if trylock failed, the page remain in evictable lru and later 1264 * if trylock failed, the page remain in evictable lru and later
1265 * vmscan could retry to move the page to unevictable lru if the 1265 * vmscan could retry to move the page to unevictable lru if the
1266 * page is actually mlocked. 1266 * page is actually mlocked.
@@ -1635,7 +1635,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
1635static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) 1635static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
1636{ 1636{
1637 struct anon_vma *anon_vma; 1637 struct anon_vma *anon_vma;
1638 pgoff_t pgoff = page_to_pgoff(page); 1638 pgoff_t pgoff;
1639 struct anon_vma_chain *avc; 1639 struct anon_vma_chain *avc;
1640 int ret = SWAP_AGAIN; 1640 int ret = SWAP_AGAIN;
1641 1641
@@ -1643,6 +1643,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
1643 if (!anon_vma) 1643 if (!anon_vma)
1644 return ret; 1644 return ret;
1645 1645
1646 pgoff = page_to_pgoff(page);
1646 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1647 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1647 struct vm_area_struct *vma = avc->vma; 1648 struct vm_area_struct *vma = avc->vma;
1648 unsigned long address = vma_address(page, vma); 1649 unsigned long address = vma_address(page, vma);
@@ -1676,7 +1677,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
1676static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) 1677static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1677{ 1678{
1678 struct address_space *mapping = page->mapping; 1679 struct address_space *mapping = page->mapping;
1679 pgoff_t pgoff = page_to_pgoff(page); 1680 pgoff_t pgoff;
1680 struct vm_area_struct *vma; 1681 struct vm_area_struct *vma;
1681 int ret = SWAP_AGAIN; 1682 int ret = SWAP_AGAIN;
1682 1683
@@ -1684,13 +1685,15 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1684 * The page lock not only makes sure that page->mapping cannot 1685 * The page lock not only makes sure that page->mapping cannot
1685 * suddenly be NULLified by truncation, it makes sure that the 1686 * suddenly be NULLified by truncation, it makes sure that the
1686 * structure at mapping cannot be freed and reused yet, 1687 * structure at mapping cannot be freed and reused yet,
1687 * so we can safely take mapping->i_mmap_mutex. 1688 * so we can safely take mapping->i_mmap_rwsem.
1688 */ 1689 */
1689 VM_BUG_ON_PAGE(!PageLocked(page), page); 1690 VM_BUG_ON_PAGE(!PageLocked(page), page);
1690 1691
1691 if (!mapping) 1692 if (!mapping)
1692 return ret; 1693 return ret;
1693 mutex_lock(&mapping->i_mmap_mutex); 1694
1695 pgoff = page_to_pgoff(page);
1696 i_mmap_lock_read(mapping);
1694 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1697 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1695 unsigned long address = vma_address(page, vma); 1698 unsigned long address = vma_address(page, vma);
1696 1699
@@ -1711,9 +1714,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1711 goto done; 1714 goto done;
1712 1715
1713 ret = rwc->file_nonlinear(page, mapping, rwc->arg); 1716 ret = rwc->file_nonlinear(page, mapping, rwc->arg);
1714
1715done: 1717done:
1716 mutex_unlock(&mapping->i_mmap_mutex); 1718 i_mmap_unlock_read(mapping);
1717 return ret; 1719 return ret;
1718} 1720}
1719 1721
diff --git a/mm/slab.c b/mm/slab.c
index fee275b5b6b7..65b5dcb6f671 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3015,7 +3015,7 @@ retry:
3015 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 3015 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
3016 nid = zone_to_nid(zone); 3016 nid = zone_to_nid(zone);
3017 3017
3018 if (cpuset_zone_allowed(zone, flags | __GFP_HARDWALL) && 3018 if (cpuset_zone_allowed(zone, flags) &&
3019 get_node(cache, nid) && 3019 get_node(cache, nid) &&
3020 get_node(cache, nid)->free_objects) { 3020 get_node(cache, nid)->free_objects) {
3021 obj = ____cache_alloc_node(cache, 3021 obj = ____cache_alloc_node(cache,
@@ -3182,6 +3182,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3182 memset(ptr, 0, cachep->object_size); 3182 memset(ptr, 0, cachep->object_size);
3183 } 3183 }
3184 3184
3185 memcg_kmem_put_cache(cachep);
3185 return ptr; 3186 return ptr;
3186} 3187}
3187 3188
@@ -3247,6 +3248,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3247 memset(objp, 0, cachep->object_size); 3248 memset(objp, 0, cachep->object_size);
3248 } 3249 }
3249 3250
3251 memcg_kmem_put_cache(cachep);
3250 return objp; 3252 return objp;
3251} 3253}
3252 3254
diff --git a/mm/slub.c b/mm/slub.c
index 765c5884d03d..fe376fe1f4fe 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1233,13 +1233,17 @@ static inline void kfree_hook(const void *x)
1233 kmemleak_free(x); 1233 kmemleak_free(x);
1234} 1234}
1235 1235
1236static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 1236static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
1237 gfp_t flags)
1237{ 1238{
1238 flags &= gfp_allowed_mask; 1239 flags &= gfp_allowed_mask;
1239 lockdep_trace_alloc(flags); 1240 lockdep_trace_alloc(flags);
1240 might_sleep_if(flags & __GFP_WAIT); 1241 might_sleep_if(flags & __GFP_WAIT);
1241 1242
1242 return should_failslab(s->object_size, flags, s->flags); 1243 if (should_failslab(s->object_size, flags, s->flags))
1244 return NULL;
1245
1246 return memcg_kmem_get_cache(s, flags);
1243} 1247}
1244 1248
1245static inline void slab_post_alloc_hook(struct kmem_cache *s, 1249static inline void slab_post_alloc_hook(struct kmem_cache *s,
@@ -1248,6 +1252,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
1248 flags &= gfp_allowed_mask; 1252 flags &= gfp_allowed_mask;
1249 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 1253 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
1250 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); 1254 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
1255 memcg_kmem_put_cache(s);
1251} 1256}
1252 1257
1253static inline void slab_free_hook(struct kmem_cache *s, void *x) 1258static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -1665,8 +1670,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1665 1670
1666 n = get_node(s, zone_to_nid(zone)); 1671 n = get_node(s, zone_to_nid(zone));
1667 1672
1668 if (n && cpuset_zone_allowed(zone, 1673 if (n && cpuset_zone_allowed(zone, flags) &&
1669 flags | __GFP_HARDWALL) &&
1670 n->nr_partial > s->min_partial) { 1674 n->nr_partial > s->min_partial) {
1671 object = get_partial_node(s, n, c, flags); 1675 object = get_partial_node(s, n, c, flags);
1672 if (object) { 1676 if (object) {
@@ -2384,10 +2388,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2384 struct page *page; 2388 struct page *page;
2385 unsigned long tid; 2389 unsigned long tid;
2386 2390
2387 if (slab_pre_alloc_hook(s, gfpflags)) 2391 s = slab_pre_alloc_hook(s, gfpflags);
2392 if (!s)
2388 return NULL; 2393 return NULL;
2389
2390 s = memcg_kmem_get_cache(s, gfpflags);
2391redo: 2394redo:
2392 /* 2395 /*
2393 * Must read kmem_cache cpu data via this cpu ptr. Preemption is 2396 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
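
The slub.c change above makes slab_pre_alloc_hook() return the cache to allocate from: either the original cache, a per-memcg cache obtained via memcg_kmem_get_cache(), or NULL when failslab triggers; the matching memcg_kmem_put_cache() now lives in slab_post_alloc_hook(). A sketch of the caller pairing that both mm/slab.c and mm/slub.c now follow; this is an illustration, not either allocator's actual fast path, and the hooks themselves are static helpers inside the allocators:

#include <linux/slab.h>
#include <linux/gfp.h>

static void *example_slab_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
	void *object = NULL;

	s = slab_pre_alloc_hook(s, gfpflags);	/* should_failslab() + memcg_kmem_get_cache() */
	if (!s)
		return NULL;

	/* ... allocate 'object' from the (possibly memcg-specific) cache 's' ... */

	slab_post_alloc_hook(s, gfpflags, object);	/* drops the memcg cache reference */
	return object;
}
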
diff --git a/mm/vmacache.c b/mm/vmacache.c
index 9f25af825dec..b6e3662fe339 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -17,6 +17,8 @@ void vmacache_flush_all(struct mm_struct *mm)
17{ 17{
18 struct task_struct *g, *p; 18 struct task_struct *g, *p;
19 19
20 count_vm_vmacache_event(VMACACHE_FULL_FLUSHES);
21
20 /* 22 /*
21 * Single threaded tasks need not iterate the entire 23 * Single threaded tasks need not iterate the entire
22 * list of process. We can avoid the flushing as well 24 * list of process. We can avoid the flushing as well
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8a18196fcdff..39c338896416 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2574,10 +2574,10 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2574 if (!counters) 2574 if (!counters)
2575 return; 2575 return;
2576 2576
2577 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2578 smp_rmb();
2579 if (v->flags & VM_UNINITIALIZED) 2577 if (v->flags & VM_UNINITIALIZED)
2580 return; 2578 return;
2579 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2580 smp_rmb();
2581 2581
2582 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 2582 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
2583 2583
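
The vmalloc.c hunk above moves smp_rmb() so that it sits between reading v->flags and reading the data that flag guards, pairing with the smp_wmb() in clear_vm_uninitialized_flag(). A minimal sketch of that publish/consume pattern; data and ready are illustrative stand-ins, not kernel symbols:

#include <asm/barrier.h>

static int data;
static int ready;

static void example_publish(void)
{
	data = 42;
	smp_wmb();	/* as in clear_vm_uninitialized_flag(): data before flag */
	ready = 1;
}

static int example_consume(void)
{
	if (!ready)
		return -1;	/* not published yet */
	smp_rmb();		/* pairs with the smp_wmb() above */
	return data;		/* guaranteed to see the published value */
}
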
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a384339bf718..bd9a72bc4a1b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -229,9 +229,10 @@ EXPORT_SYMBOL(unregister_shrinker);
229 229
230#define SHRINK_BATCH 128 230#define SHRINK_BATCH 128
231 231
232static unsigned long 232static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
233shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, 233 struct shrinker *shrinker,
234 unsigned long nr_pages_scanned, unsigned long lru_pages) 234 unsigned long nr_scanned,
235 unsigned long nr_eligible)
235{ 236{
236 unsigned long freed = 0; 237 unsigned long freed = 0;
237 unsigned long long delta; 238 unsigned long long delta;
@@ -255,9 +256,9 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
255 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); 256 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
256 257
257 total_scan = nr; 258 total_scan = nr;
258 delta = (4 * nr_pages_scanned) / shrinker->seeks; 259 delta = (4 * nr_scanned) / shrinker->seeks;
259 delta *= freeable; 260 delta *= freeable;
260 do_div(delta, lru_pages + 1); 261 do_div(delta, nr_eligible + 1);
261 total_scan += delta; 262 total_scan += delta;
262 if (total_scan < 0) { 263 if (total_scan < 0) {
263 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", 264 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
@@ -289,8 +290,8 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
289 total_scan = freeable * 2; 290 total_scan = freeable * 2;
290 291
291 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, 292 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
292 nr_pages_scanned, lru_pages, 293 nr_scanned, nr_eligible,
293 freeable, delta, total_scan); 294 freeable, delta, total_scan);
294 295
295 /* 296 /*
296 * Normally, we should not scan less than batch_size objects in one 297 * Normally, we should not scan less than batch_size objects in one
@@ -339,34 +340,37 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
339 return freed; 340 return freed;
340} 341}
341 342
342/* 343/**
343 * Call the shrink functions to age shrinkable caches 344 * shrink_node_slabs - shrink slab caches of a given node
344 * 345 * @gfp_mask: allocation context
345 * Here we assume it costs one seek to replace a lru page and that it also 346 * @nid: node whose slab caches to target
346 * takes a seek to recreate a cache object. With this in mind we age equal 347 * @nr_scanned: pressure numerator
347 * percentages of the lru and ageable caches. This should balance the seeks 348 * @nr_eligible: pressure denominator
348 * generated by these structures.
349 * 349 *
350 * If the vm encountered mapped pages on the LRU it increase the pressure on 350 * Call the shrink functions to age shrinkable caches.
351 * slab to avoid swapping.
352 * 351 *
353 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. 352 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
353 * unaware shrinkers will receive a node id of 0 instead.
354 * 354 *
355 * `lru_pages' represents the number of on-LRU pages in all the zones which 355 * @nr_scanned and @nr_eligible form a ratio that indicates how much of
356 * are eligible for the caller's allocation attempt. It is used for balancing 356 * the available objects should be scanned. Page reclaim for example
357 * slab reclaim versus page reclaim. 357 * passes the number of pages scanned and the number of pages on the
358 * LRU lists that it considered on @nid, plus a bias in @nr_scanned
359 * when it encountered mapped pages. The ratio is further biased by
360 * the ->seeks setting of the shrink function, which indicates the
361 * cost to recreate an object relative to that of an LRU page.
358 * 362 *
359 * Returns the number of slab objects which we shrunk. 363 * Returns the number of reclaimed slab objects.
360 */ 364 */
361unsigned long shrink_slab(struct shrink_control *shrinkctl, 365unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
362 unsigned long nr_pages_scanned, 366 unsigned long nr_scanned,
363 unsigned long lru_pages) 367 unsigned long nr_eligible)
364{ 368{
365 struct shrinker *shrinker; 369 struct shrinker *shrinker;
366 unsigned long freed = 0; 370 unsigned long freed = 0;
367 371
368 if (nr_pages_scanned == 0) 372 if (nr_scanned == 0)
369 nr_pages_scanned = SWAP_CLUSTER_MAX; 373 nr_scanned = SWAP_CLUSTER_MAX;
370 374
371 if (!down_read_trylock(&shrinker_rwsem)) { 375 if (!down_read_trylock(&shrinker_rwsem)) {
372 /* 376 /*
@@ -380,20 +384,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
380 } 384 }
381 385
382 list_for_each_entry(shrinker, &shrinker_list, list) { 386 list_for_each_entry(shrinker, &shrinker_list, list) {
383 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { 387 struct shrink_control sc = {
384 shrinkctl->nid = 0; 388 .gfp_mask = gfp_mask,
385 freed += shrink_slab_node(shrinkctl, shrinker, 389 .nid = nid,
386 nr_pages_scanned, lru_pages); 390 };
387 continue;
388 }
389 391
390 for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { 392 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
391 if (node_online(shrinkctl->nid)) 393 sc.nid = 0;
392 freed += shrink_slab_node(shrinkctl, shrinker,
393 nr_pages_scanned, lru_pages);
394 394
395 } 395 freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible);
396 } 396 }
397
397 up_read(&shrinker_rwsem); 398 up_read(&shrinker_rwsem);
398out: 399out:
399 cond_resched(); 400 cond_resched();
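
To make the nr_scanned/nr_eligible pressure ratio concrete: with the delta computation in shrink_slabs() above, scanning nr_scanned = 1,000 out of nr_eligible = 10,000 eligible LRU pages against a shrinker using DEFAULT_SEEKS (seeks = 2) and reporting freeable = 5,000 objects gives delta = (4 * 1000 / 2) * 5000 / 10001, roughly 1,000 objects. That is, about 20% of the cache is asked to be scanned for a 10% LRU scan; the factor of two comes from 4 / seeks.
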
@@ -1876,7 +1877,8 @@ enum scan_balance {
1876 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan 1877 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
1877 */ 1878 */
1878static void get_scan_count(struct lruvec *lruvec, int swappiness, 1879static void get_scan_count(struct lruvec *lruvec, int swappiness,
1879 struct scan_control *sc, unsigned long *nr) 1880 struct scan_control *sc, unsigned long *nr,
1881 unsigned long *lru_pages)
1880{ 1882{
1881 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1883 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1882 u64 fraction[2]; 1884 u64 fraction[2];
@@ -2022,6 +2024,7 @@ out:
2022 some_scanned = false; 2024 some_scanned = false;
2023 /* Only use force_scan on second pass. */ 2025 /* Only use force_scan on second pass. */
2024 for (pass = 0; !some_scanned && pass < 2; pass++) { 2026 for (pass = 0; !some_scanned && pass < 2; pass++) {
2027 *lru_pages = 0;
2025 for_each_evictable_lru(lru) { 2028 for_each_evictable_lru(lru) {
2026 int file = is_file_lru(lru); 2029 int file = is_file_lru(lru);
2027 unsigned long size; 2030 unsigned long size;
@@ -2048,14 +2051,19 @@ out:
2048 case SCAN_FILE: 2051 case SCAN_FILE:
2049 case SCAN_ANON: 2052 case SCAN_ANON:
2050 /* Scan one type exclusively */ 2053 /* Scan one type exclusively */
2051 if ((scan_balance == SCAN_FILE) != file) 2054 if ((scan_balance == SCAN_FILE) != file) {
2055 size = 0;
2052 scan = 0; 2056 scan = 0;
2057 }
2053 break; 2058 break;
2054 default: 2059 default:
2055 /* Look ma, no brain */ 2060 /* Look ma, no brain */
2056 BUG(); 2061 BUG();
2057 } 2062 }
2063
2064 *lru_pages += size;
2058 nr[lru] = scan; 2065 nr[lru] = scan;
2066
2059 /* 2067 /*
2060 * Skip the second pass and don't force_scan, 2068 * Skip the second pass and don't force_scan,
2061 * if we found something to scan. 2069 * if we found something to scan.
@@ -2069,7 +2077,7 @@ out:
2069 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 2077 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2070 */ 2078 */
2071static void shrink_lruvec(struct lruvec *lruvec, int swappiness, 2079static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
2072 struct scan_control *sc) 2080 struct scan_control *sc, unsigned long *lru_pages)
2073{ 2081{
2074 unsigned long nr[NR_LRU_LISTS]; 2082 unsigned long nr[NR_LRU_LISTS];
2075 unsigned long targets[NR_LRU_LISTS]; 2083 unsigned long targets[NR_LRU_LISTS];
@@ -2080,7 +2088,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
2080 struct blk_plug plug; 2088 struct blk_plug plug;
2081 bool scan_adjusted; 2089 bool scan_adjusted;
2082 2090
2083 get_scan_count(lruvec, swappiness, sc, nr); 2091 get_scan_count(lruvec, swappiness, sc, nr, lru_pages);
2084 2092
2085 /* Record the original scan target for proportional adjustments later */ 2093 /* Record the original scan target for proportional adjustments later */
2086 memcpy(targets, nr, sizeof(nr)); 2094 memcpy(targets, nr, sizeof(nr));
@@ -2258,7 +2266,8 @@ static inline bool should_continue_reclaim(struct zone *zone,
2258 } 2266 }
2259} 2267}
2260 2268
2261static bool shrink_zone(struct zone *zone, struct scan_control *sc) 2269static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2270 bool is_classzone)
2262{ 2271{
2263 unsigned long nr_reclaimed, nr_scanned; 2272 unsigned long nr_reclaimed, nr_scanned;
2264 bool reclaimable = false; 2273 bool reclaimable = false;
@@ -2269,6 +2278,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
2269 .zone = zone, 2278 .zone = zone,
2270 .priority = sc->priority, 2279 .priority = sc->priority,
2271 }; 2280 };
2281 unsigned long zone_lru_pages = 0;
2272 struct mem_cgroup *memcg; 2282 struct mem_cgroup *memcg;
2273 2283
2274 nr_reclaimed = sc->nr_reclaimed; 2284 nr_reclaimed = sc->nr_reclaimed;
@@ -2276,13 +2286,15 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
2276 2286
2277 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2287 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2278 do { 2288 do {
2289 unsigned long lru_pages;
2279 struct lruvec *lruvec; 2290 struct lruvec *lruvec;
2280 int swappiness; 2291 int swappiness;
2281 2292
2282 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2293 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2283 swappiness = mem_cgroup_swappiness(memcg); 2294 swappiness = mem_cgroup_swappiness(memcg);
2284 2295
2285 shrink_lruvec(lruvec, swappiness, sc); 2296 shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
2297 zone_lru_pages += lru_pages;
2286 2298
2287 /* 2299 /*
2288 * Direct reclaim and kswapd have to scan all memory 2300 * Direct reclaim and kswapd have to scan all memory
@@ -2302,6 +2314,25 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
2302 memcg = mem_cgroup_iter(root, memcg, &reclaim); 2314 memcg = mem_cgroup_iter(root, memcg, &reclaim);
2303 } while (memcg); 2315 } while (memcg);
2304 2316
2317 /*
2318 * Shrink the slab caches in the same proportion that
2319 * the eligible LRU pages were scanned.
2320 */
2321 if (global_reclaim(sc) && is_classzone) {
2322 struct reclaim_state *reclaim_state;
2323
2324 shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone),
2325 sc->nr_scanned - nr_scanned,
2326 zone_lru_pages);
2327
2328 reclaim_state = current->reclaim_state;
2329 if (reclaim_state) {
2330 sc->nr_reclaimed +=
2331 reclaim_state->reclaimed_slab;
2332 reclaim_state->reclaimed_slab = 0;
2333 }
2334 }
2335
2305 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, 2336 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
2306 sc->nr_scanned - nr_scanned, 2337 sc->nr_scanned - nr_scanned,
2307 sc->nr_reclaimed - nr_reclaimed); 2338 sc->nr_reclaimed - nr_reclaimed);
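
From a shrinker author's point of view this rework changes nothing: the per-node pressure computed by shrink_node_slabs() above still arrives through struct shrink_control. An uncompiled, kernel-style sketch of a NUMA-aware shrinker driven by that call (the cache bookkeeping is a placeholder, not part of this patch):

#include <linux/shrinker.h>
#include <linux/nodemask.h>

/* Placeholder per-node object counts for some hypothetical cache. */
static unsigned long demo_nr_objects[MAX_NUMNODES];

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
{
        /* sc->nid is the node under reclaim; 0 if !SHRINKER_NUMA_AWARE */
        return demo_nr_objects[sc->nid];
}

static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
{
        unsigned long freed = 0;

        /* free up to sc->nr_to_scan objects from node sc->nid here */
        return freed;
}

static struct shrinker demo_shrinker = {
        .count_objects  = demo_count,
        .scan_objects   = demo_scan,
        .seeks          = DEFAULT_SEEKS,
        .flags          = SHRINKER_NUMA_AWARE,
};

/* register_shrinker(&demo_shrinker) at init, unregister_shrinker() at exit. */
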
@@ -2376,12 +2407,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2376 struct zone *zone; 2407 struct zone *zone;
2377 unsigned long nr_soft_reclaimed; 2408 unsigned long nr_soft_reclaimed;
2378 unsigned long nr_soft_scanned; 2409 unsigned long nr_soft_scanned;
2379 unsigned long lru_pages = 0;
2380 struct reclaim_state *reclaim_state = current->reclaim_state;
2381 gfp_t orig_mask; 2410 gfp_t orig_mask;
2382 struct shrink_control shrink = {
2383 .gfp_mask = sc->gfp_mask,
2384 };
2385 enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); 2411 enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
2386 bool reclaimable = false; 2412 bool reclaimable = false;
2387 2413
@@ -2394,12 +2420,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2394 if (buffer_heads_over_limit) 2420 if (buffer_heads_over_limit)
2395 sc->gfp_mask |= __GFP_HIGHMEM; 2421 sc->gfp_mask |= __GFP_HIGHMEM;
2396 2422
2397 nodes_clear(shrink.nodes_to_scan);
2398
2399 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2423 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2400 gfp_zone(sc->gfp_mask), sc->nodemask) { 2424 requested_highidx, sc->nodemask) {
2425 enum zone_type classzone_idx;
2426
2401 if (!populated_zone(zone)) 2427 if (!populated_zone(zone))
2402 continue; 2428 continue;
2429
2430 classzone_idx = requested_highidx;
2431 while (!populated_zone(zone->zone_pgdat->node_zones +
2432 classzone_idx))
2433 classzone_idx--;
2434
2403 /* 2435 /*
2404 * Take care memory controller reclaiming has small influence 2436 * Take care memory controller reclaiming has small influence
2405 * to global LRU. 2437 * to global LRU.
@@ -2409,9 +2441,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2409 GFP_KERNEL | __GFP_HARDWALL)) 2441 GFP_KERNEL | __GFP_HARDWALL))
2410 continue; 2442 continue;
2411 2443
2412 lru_pages += zone_reclaimable_pages(zone);
2413 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
2414
2415 if (sc->priority != DEF_PRIORITY && 2444 if (sc->priority != DEF_PRIORITY &&
2416 !zone_reclaimable(zone)) 2445 !zone_reclaimable(zone))
2417 continue; /* Let kswapd poll it */ 2446 continue; /* Let kswapd poll it */
@@ -2450,7 +2479,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2450 /* need some check for avoid more shrink_zone() */ 2479 /* need some check for avoid more shrink_zone() */
2451 } 2480 }
2452 2481
2453 if (shrink_zone(zone, sc)) 2482 if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx))
2454 reclaimable = true; 2483 reclaimable = true;
2455 2484
2456 if (global_reclaim(sc) && 2485 if (global_reclaim(sc) &&
@@ -2459,20 +2488,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2459 } 2488 }
2460 2489
2461 /* 2490 /*
2462 * Don't shrink slabs when reclaiming memory from over limit cgroups
2463 * but do shrink slab at least once when aborting reclaim for
2464 * compaction to avoid unevenly scanning file/anon LRU pages over slab
2465 * pages.
2466 */
2467 if (global_reclaim(sc)) {
2468 shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2469 if (reclaim_state) {
2470 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2471 reclaim_state->reclaimed_slab = 0;
2472 }
2473 }
2474
2475 /*
2476 * Restore to original mask to avoid the impact on the caller if we 2491 * Restore to original mask to avoid the impact on the caller if we
2477 * promoted it to __GFP_HIGHMEM. 2492 * promoted it to __GFP_HIGHMEM.
2478 */ 2493 */
@@ -2736,6 +2751,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2736 }; 2751 };
2737 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2752 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2738 int swappiness = mem_cgroup_swappiness(memcg); 2753 int swappiness = mem_cgroup_swappiness(memcg);
2754 unsigned long lru_pages;
2739 2755
2740 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2756 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2741 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2757 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2751,7 +2767,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2751 * will pick up pages from other mem cgroup's as well. We hack 2767 * will pick up pages from other mem cgroup's as well. We hack
2752 * the priority and make it zero. 2768 * the priority and make it zero.
2753 */ 2769 */
2754 shrink_lruvec(lruvec, swappiness, &sc); 2770 shrink_lruvec(lruvec, swappiness, &sc, &lru_pages);
2755 2771
2756 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2772 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2757 2773
@@ -2932,15 +2948,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2932static bool kswapd_shrink_zone(struct zone *zone, 2948static bool kswapd_shrink_zone(struct zone *zone,
2933 int classzone_idx, 2949 int classzone_idx,
2934 struct scan_control *sc, 2950 struct scan_control *sc,
2935 unsigned long lru_pages,
2936 unsigned long *nr_attempted) 2951 unsigned long *nr_attempted)
2937{ 2952{
2938 int testorder = sc->order; 2953 int testorder = sc->order;
2939 unsigned long balance_gap; 2954 unsigned long balance_gap;
2940 struct reclaim_state *reclaim_state = current->reclaim_state;
2941 struct shrink_control shrink = {
2942 .gfp_mask = sc->gfp_mask,
2943 };
2944 bool lowmem_pressure; 2955 bool lowmem_pressure;
2945 2956
2946 /* Reclaim above the high watermark. */ 2957 /* Reclaim above the high watermark. */
@@ -2975,13 +2986,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
2975 balance_gap, classzone_idx)) 2986 balance_gap, classzone_idx))
2976 return true; 2987 return true;
2977 2988
2978 shrink_zone(zone, sc); 2989 shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
2979 nodes_clear(shrink.nodes_to_scan);
2980 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
2981
2982 reclaim_state->reclaimed_slab = 0;
2983 shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2984 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2985 2990
2986 /* Account for the number of pages attempted to reclaim */ 2991 /* Account for the number of pages attempted to reclaim */
2987 *nr_attempted += sc->nr_to_reclaim; 2992 *nr_attempted += sc->nr_to_reclaim;
@@ -3042,7 +3047,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3042 count_vm_event(PAGEOUTRUN); 3047 count_vm_event(PAGEOUTRUN);
3043 3048
3044 do { 3049 do {
3045 unsigned long lru_pages = 0;
3046 unsigned long nr_attempted = 0; 3050 unsigned long nr_attempted = 0;
3047 bool raise_priority = true; 3051 bool raise_priority = true;
3048 bool pgdat_needs_compaction = (order > 0); 3052 bool pgdat_needs_compaction = (order > 0);
@@ -3102,8 +3106,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3102 if (!populated_zone(zone)) 3106 if (!populated_zone(zone))
3103 continue; 3107 continue;
3104 3108
3105 lru_pages += zone_reclaimable_pages(zone);
3106
3107 /* 3109 /*
3108 * If any zone is currently balanced then kswapd will 3110 * If any zone is currently balanced then kswapd will
3109 * not call compaction as it is expected that the 3111 * not call compaction as it is expected that the
@@ -3159,8 +3161,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3159 * that that high watermark would be met at 100% 3161 * that that high watermark would be met at 100%
3160 * efficiency. 3162 * efficiency.
3161 */ 3163 */
3162 if (kswapd_shrink_zone(zone, end_zone, &sc, 3164 if (kswapd_shrink_zone(zone, end_zone,
3163 lru_pages, &nr_attempted)) 3165 &sc, &nr_attempted))
3164 raise_priority = false; 3166 raise_priority = false;
3165 } 3167 }
3166 3168
@@ -3612,10 +3614,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3612 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3614 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3613 .may_swap = 1, 3615 .may_swap = 1,
3614 }; 3616 };
3615 struct shrink_control shrink = {
3616 .gfp_mask = sc.gfp_mask,
3617 };
3618 unsigned long nr_slab_pages0, nr_slab_pages1;
3619 3617
3620 cond_resched(); 3618 cond_resched();
3621 /* 3619 /*
@@ -3634,44 +3632,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3634 * priorities until we have enough memory freed. 3632 * priorities until we have enough memory freed.
3635 */ 3633 */
3636 do { 3634 do {
3637 shrink_zone(zone, &sc); 3635 shrink_zone(zone, &sc, true);
3638 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); 3636 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
3639 } 3637 }
3640 3638
3641 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3642 if (nr_slab_pages0 > zone->min_slab_pages) {
3643 /*
3644 * shrink_slab() does not currently allow us to determine how
3645 * many pages were freed in this zone. So we take the current
3646 * number of slab pages and shake the slab until it is reduced
3647 * by the same nr_pages that we used for reclaiming unmapped
3648 * pages.
3649 */
3650 nodes_clear(shrink.nodes_to_scan);
3651 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
3652 for (;;) {
3653 unsigned long lru_pages = zone_reclaimable_pages(zone);
3654
3655 /* No reclaimable slab or very low memory pressure */
3656 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
3657 break;
3658
3659 /* Freed enough memory */
3660 nr_slab_pages1 = zone_page_state(zone,
3661 NR_SLAB_RECLAIMABLE);
3662 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
3663 break;
3664 }
3665
3666 /*
3667 * Update nr_reclaimed by the number of slab pages we
3668 * reclaimed from this zone.
3669 */
3670 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3671 if (nr_slab_pages1 < nr_slab_pages0)
3672 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
3673 }
3674
3675 p->reclaim_state = NULL; 3639 p->reclaim_state = NULL;
3676 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 3640 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
3677 lockdep_clear_current_reclaim_state(); 3641 lockdep_clear_current_reclaim_state();
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1b12d390dc68..1284f89fca08 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -22,6 +22,8 @@
22#include <linux/writeback.h> 22#include <linux/writeback.h>
23#include <linux/compaction.h> 23#include <linux/compaction.h>
24#include <linux/mm_inline.h> 24#include <linux/mm_inline.h>
25#include <linux/page_ext.h>
26#include <linux/page_owner.h>
25 27
26#include "internal.h" 28#include "internal.h"
27 29
@@ -898,6 +900,7 @@ const char * const vmstat_text[] = {
898#ifdef CONFIG_DEBUG_VM_VMACACHE 900#ifdef CONFIG_DEBUG_VM_VMACACHE
899 "vmacache_find_calls", 901 "vmacache_find_calls",
900 "vmacache_find_hits", 902 "vmacache_find_hits",
903 "vmacache_full_flushes",
901#endif 904#endif
902#endif /* CONFIG_VM_EVENTS_COUNTERS */ 905#endif /* CONFIG_VM_EVENTS_COUNTERS */
903}; 906};
@@ -1017,6 +1020,104 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1017 return 0; 1020 return 0;
1018} 1021}
1019 1022
1023#ifdef CONFIG_PAGE_OWNER
1024static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
1025 pg_data_t *pgdat,
1026 struct zone *zone)
1027{
1028 struct page *page;
1029 struct page_ext *page_ext;
1030 unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
1031 unsigned long end_pfn = pfn + zone->spanned_pages;
1032 unsigned long count[MIGRATE_TYPES] = { 0, };
1033 int pageblock_mt, page_mt;
1034 int i;
1035
1036 /* Scan block by block. First and last block may be incomplete */
1037 pfn = zone->zone_start_pfn;
1038
1039 /*
1040 * Walk the zone in pageblock_nr_pages steps. If a page block spans
1041 * a zone boundary, it will be double counted between zones. This does
1042 * not matter as the mixed block count will still be correct
1043 */
1044 for (; pfn < end_pfn; ) {
1045 if (!pfn_valid(pfn)) {
1046 pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
1047 continue;
1048 }
1049
1050 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
1051 block_end_pfn = min(block_end_pfn, end_pfn);
1052
1053 page = pfn_to_page(pfn);
1054 pageblock_mt = get_pfnblock_migratetype(page, pfn);
1055
1056 for (; pfn < block_end_pfn; pfn++) {
1057 if (!pfn_valid_within(pfn))
1058 continue;
1059
1060 page = pfn_to_page(pfn);
1061 if (PageBuddy(page)) {
1062 pfn += (1UL << page_order(page)) - 1;
1063 continue;
1064 }
1065
1066 if (PageReserved(page))
1067 continue;
1068
1069 page_ext = lookup_page_ext(page);
1070
1071 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
1072 continue;
1073
1074 page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
1075 if (pageblock_mt != page_mt) {
1076 if (is_migrate_cma(pageblock_mt))
1077 count[MIGRATE_MOVABLE]++;
1078 else
1079 count[pageblock_mt]++;
1080
1081 pfn = block_end_pfn;
1082 break;
1083 }
1084 pfn += (1UL << page_ext->order) - 1;
1085 }
1086 }
1087
1088 /* Print counts */
1089 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1090 for (i = 0; i < MIGRATE_TYPES; i++)
1091 seq_printf(m, "%12lu ", count[i]);
1092 seq_putc(m, '\n');
1093}
1094#endif /* CONFIG_PAGE_OWNER */
1095
1096/*
1097 * Print out the number of pageblocks for each migratetype that contain pages
1098 * of other types. This gives an indication of how well fallbacks are being
1099 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
 1100 * to determine what is going on.
1101 */
1102static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1103{
1104#ifdef CONFIG_PAGE_OWNER
1105 int mtype;
1106
1107 if (!page_owner_inited)
1108 return;
1109
1110 drain_all_pages(NULL);
1111
1112 seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1113 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1114 seq_printf(m, "%12s ", migratetype_names[mtype]);
1115 seq_putc(m, '\n');
1116
1117 walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print);
1118#endif /* CONFIG_PAGE_OWNER */
1119}
1120
1020/* 1121/*
1021 * This prints out statistics in relation to grouping pages by mobility. 1122 * This prints out statistics in relation to grouping pages by mobility.
1022 * It is expensive to collect so do not constantly read the file. 1123 * It is expensive to collect so do not constantly read the file.
@@ -1034,6 +1135,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
1034 seq_putc(m, '\n'); 1135 seq_putc(m, '\n');
1035 pagetypeinfo_showfree(m, pgdat); 1136 pagetypeinfo_showfree(m, pgdat);
1036 pagetypeinfo_showblockcount(m, pgdat); 1137 pagetypeinfo_showblockcount(m, pgdat);
1138 pagetypeinfo_showmixedcount(m, pgdat);
1037 1139
1038 return 0; 1140 return 0;
1039} 1141}
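
The new row is only emitted when CONFIG_PAGE_OWNER is built in and page_owner was initialized at boot (page_owner_inited); otherwise /proc/pagetypeinfo looks exactly as before. A trivial userspace check, nothing below comes from the patch itself:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[512];
        int in_mixed = 0;
        FILE *f = fopen("/proc/pagetypeinfo", "r");

        if (!f) {
                perror("/proc/pagetypeinfo");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                if (strstr(line, "Number of mixed blocks"))
                        in_mixed = 1;           /* mixed counts are the last section */
                if (in_mixed)
                        fputs(line, stdout);    /* header plus one row per zone */
        }
        fclose(f);
        return 0;
}
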
diff --git a/mm/zbud.c b/mm/zbud.c
index ec71b37fb06c..4e387bea702e 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -132,7 +132,7 @@ static struct zbud_ops zbud_zpool_ops = {
132 132
133static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) 133static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
134{ 134{
135 return zbud_create_pool(gfp, &zbud_zpool_ops); 135 return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
136} 136}
137 137
138static void zbud_zpool_destroy(void *pool) 138static void zbud_zpool_destroy(void *pool)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 839a48c3ca27..4d0a063145ec 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -155,8 +155,6 @@
155 * (reason above) 155 * (reason above)
156 */ 156 */
157#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) 157#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8)
158#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
159 ZS_SIZE_CLASS_DELTA + 1)
160 158
161/* 159/*
162 * We do not maintain any list for completely empty or full pages 160 * We do not maintain any list for completely empty or full pages
@@ -171,6 +169,11 @@ enum fullness_group {
171}; 169};
172 170
173/* 171/*
172 * number of size_classes
173 */
174static int zs_size_classes;
175
176/*
174 * We assign a page to ZS_ALMOST_EMPTY fullness group when: 177 * We assign a page to ZS_ALMOST_EMPTY fullness group when:
175 * n <= N / f, where 178 * n <= N / f, where
176 * n = number of allocated objects 179 * n = number of allocated objects
@@ -214,7 +217,7 @@ struct link_free {
214}; 217};
215 218
216struct zs_pool { 219struct zs_pool {
217 struct size_class size_class[ZS_SIZE_CLASSES]; 220 struct size_class **size_class;
218 221
219 gfp_t flags; /* allocation flags used when growing pool */ 222 gfp_t flags; /* allocation flags used when growing pool */
220 atomic_long_t pages_allocated; 223 atomic_long_t pages_allocated;
@@ -468,7 +471,7 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
468 if (newfg == currfg) 471 if (newfg == currfg)
469 goto out; 472 goto out;
470 473
471 class = &pool->size_class[class_idx]; 474 class = pool->size_class[class_idx];
472 remove_zspage(page, class, currfg); 475 remove_zspage(page, class, currfg);
473 insert_zspage(page, class, newfg); 476 insert_zspage(page, class, newfg);
474 set_zspage_mapping(page, class_idx, newfg); 477 set_zspage_mapping(page, class_idx, newfg);
@@ -629,6 +632,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
629 struct page *next_page; 632 struct page *next_page;
630 struct link_free *link; 633 struct link_free *link;
631 unsigned int i = 1; 634 unsigned int i = 1;
635 void *vaddr;
632 636
633 /* 637 /*
634 * page->index stores offset of first object starting 638 * page->index stores offset of first object starting
@@ -639,8 +643,8 @@ static void init_zspage(struct page *first_page, struct size_class *class)
639 if (page != first_page) 643 if (page != first_page)
640 page->index = off; 644 page->index = off;
641 645
642 link = (struct link_free *)kmap_atomic(page) + 646 vaddr = kmap_atomic(page);
643 off / sizeof(*link); 647 link = (struct link_free *)vaddr + off / sizeof(*link);
644 648
645 while ((off += class->size) < PAGE_SIZE) { 649 while ((off += class->size) < PAGE_SIZE) {
646 link->next = obj_location_to_handle(page, i++); 650 link->next = obj_location_to_handle(page, i++);
@@ -654,7 +658,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
654 */ 658 */
655 next_page = get_next_page(page); 659 next_page = get_next_page(page);
656 link->next = obj_location_to_handle(next_page, 0); 660 link->next = obj_location_to_handle(next_page, 0);
657 kunmap_atomic(link); 661 kunmap_atomic(vaddr);
658 page = next_page; 662 page = next_page;
659 off %= PAGE_SIZE; 663 off %= PAGE_SIZE;
660 } 664 }
@@ -784,7 +788,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
784 */ 788 */
785 if (area->vm_buf) 789 if (area->vm_buf)
786 return 0; 790 return 0;
787 area->vm_buf = (char *)__get_free_page(GFP_KERNEL); 791 area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
788 if (!area->vm_buf) 792 if (!area->vm_buf)
789 return -ENOMEM; 793 return -ENOMEM;
790 return 0; 794 return 0;
@@ -792,8 +796,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
792 796
793static inline void __zs_cpu_down(struct mapping_area *area) 797static inline void __zs_cpu_down(struct mapping_area *area)
794{ 798{
795 if (area->vm_buf) 799 kfree(area->vm_buf);
796 free_page((unsigned long)area->vm_buf);
797 area->vm_buf = NULL; 800 area->vm_buf = NULL;
798} 801}
799 802
@@ -881,14 +884,10 @@ static struct notifier_block zs_cpu_nb = {
881 .notifier_call = zs_cpu_notifier 884 .notifier_call = zs_cpu_notifier
882}; 885};
883 886
884static void zs_exit(void) 887static void zs_unregister_cpu_notifier(void)
885{ 888{
886 int cpu; 889 int cpu;
887 890
888#ifdef CONFIG_ZPOOL
889 zpool_unregister_driver(&zs_zpool_driver);
890#endif
891
892 cpu_notifier_register_begin(); 891 cpu_notifier_register_begin();
893 892
894 for_each_online_cpu(cpu) 893 for_each_online_cpu(cpu)
@@ -898,31 +897,74 @@ static void zs_exit(void)
898 cpu_notifier_register_done(); 897 cpu_notifier_register_done();
899} 898}
900 899
901static int zs_init(void) 900static int zs_register_cpu_notifier(void)
902{ 901{
903 int cpu, ret; 902 int cpu, uninitialized_var(ret);
904 903
905 cpu_notifier_register_begin(); 904 cpu_notifier_register_begin();
906 905
907 __register_cpu_notifier(&zs_cpu_nb); 906 __register_cpu_notifier(&zs_cpu_nb);
908 for_each_online_cpu(cpu) { 907 for_each_online_cpu(cpu) {
909 ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 908 ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
910 if (notifier_to_errno(ret)) { 909 if (notifier_to_errno(ret))
911 cpu_notifier_register_done(); 910 break;
912 goto fail;
913 }
914 } 911 }
915 912
916 cpu_notifier_register_done(); 913 cpu_notifier_register_done();
914 return notifier_to_errno(ret);
915}
916
917static void init_zs_size_classes(void)
918{
919 int nr;
917 920
921 nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
922 if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
923 nr += 1;
924
925 zs_size_classes = nr;
926}
927
928static void __exit zs_exit(void)
929{
918#ifdef CONFIG_ZPOOL 930#ifdef CONFIG_ZPOOL
919 zpool_register_driver(&zs_zpool_driver); 931 zpool_unregister_driver(&zs_zpool_driver);
920#endif 932#endif
933 zs_unregister_cpu_notifier();
934}
921 935
936static int __init zs_init(void)
937{
938 int ret = zs_register_cpu_notifier();
939
940 if (ret) {
941 zs_unregister_cpu_notifier();
942 return ret;
943 }
944
945 init_zs_size_classes();
946
947#ifdef CONFIG_ZPOOL
948 zpool_register_driver(&zs_zpool_driver);
949#endif
922 return 0; 950 return 0;
923fail: 951}
924 zs_exit(); 952
925 return notifier_to_errno(ret); 953static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
954{
955 return pages_per_zspage * PAGE_SIZE / size;
956}
957
958static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
959{
960 if (prev->pages_per_zspage != pages_per_zspage)
961 return false;
962
963 if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
964 != get_maxobj_per_zspage(size, pages_per_zspage))
965 return false;
966
967 return true;
926} 968}
927 969
928/** 970/**
@@ -937,33 +979,71 @@ fail:
937 */ 979 */
938struct zs_pool *zs_create_pool(gfp_t flags) 980struct zs_pool *zs_create_pool(gfp_t flags)
939{ 981{
940 int i, ovhd_size; 982 int i;
941 struct zs_pool *pool; 983 struct zs_pool *pool;
984 struct size_class *prev_class = NULL;
942 985
943 ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); 986 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
944 pool = kzalloc(ovhd_size, GFP_KERNEL);
945 if (!pool) 987 if (!pool)
946 return NULL; 988 return NULL;
947 989
948 for (i = 0; i < ZS_SIZE_CLASSES; i++) { 990 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
991 GFP_KERNEL);
992 if (!pool->size_class) {
993 kfree(pool);
994 return NULL;
995 }
996
997 /*
 998 * Iterate in reverse, because the size of the size_class that we want to use
 999 * for merging should be larger than or equal to the current size.
1000 */
1001 for (i = zs_size_classes - 1; i >= 0; i--) {
949 int size; 1002 int size;
1003 int pages_per_zspage;
950 struct size_class *class; 1004 struct size_class *class;
951 1005
952 size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; 1006 size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
953 if (size > ZS_MAX_ALLOC_SIZE) 1007 if (size > ZS_MAX_ALLOC_SIZE)
954 size = ZS_MAX_ALLOC_SIZE; 1008 size = ZS_MAX_ALLOC_SIZE;
1009 pages_per_zspage = get_pages_per_zspage(size);
1010
1011 /*
 1012 * size_class is used for normal zsmalloc operations such
 1013 * as alloc/free for that size. Although it is natural that we
 1014 * have one size_class for each size, there is a chance that we
 1015 * can get better memory utilization if we use one size_class
 1016 * for many different sizes whose size_classes have the same
 1017 * characteristics. So, we make size_class point to the
 1018 * previous size_class if possible.
1019 */
1020 if (prev_class) {
1021 if (can_merge(prev_class, size, pages_per_zspage)) {
1022 pool->size_class[i] = prev_class;
1023 continue;
1024 }
1025 }
1026
1027 class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
1028 if (!class)
1029 goto err;
955 1030
956 class = &pool->size_class[i];
957 class->size = size; 1031 class->size = size;
958 class->index = i; 1032 class->index = i;
1033 class->pages_per_zspage = pages_per_zspage;
959 spin_lock_init(&class->lock); 1034 spin_lock_init(&class->lock);
960 class->pages_per_zspage = get_pages_per_zspage(size); 1035 pool->size_class[i] = class;
961 1036
1037 prev_class = class;
962 } 1038 }
963 1039
964 pool->flags = flags; 1040 pool->flags = flags;
965 1041
966 return pool; 1042 return pool;
1043
1044err:
1045 zs_destroy_pool(pool);
1046 return NULL;
967} 1047}
968EXPORT_SYMBOL_GPL(zs_create_pool); 1048EXPORT_SYMBOL_GPL(zs_create_pool);
969 1049
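
The merge decision made by can_merge() above is pure arithmetic: two classes share a size_class when they need the same number of pages per zspage and pack the same maximum number of objects into it. A standalone model of that check, assuming a 4 KB page and a simplified stand-in for get_pages_per_zspage(); both are assumptions for illustration, not copied from zsmalloc:

#include <stdio.h>

#define MODEL_PAGE_SIZE 4096
#define MODEL_MIN_ALLOC 32
#define MODEL_DELTA     (MODEL_PAGE_SIZE >> 8)  /* 16 bytes between classes */
#define MODEL_CLASSES   ((MODEL_PAGE_SIZE - MODEL_MIN_ALLOC) / MODEL_DELTA + 1)

/* Simplified stand-in for get_pages_per_zspage(): 1..4 pages, least waste. */
static int pages_per_zspage(int size)
{
        int i, best = 1, best_waste = MODEL_PAGE_SIZE;

        for (i = 1; i <= 4; i++) {
                int waste = (i * MODEL_PAGE_SIZE) % size;

                /* compare waste fractions: waste/i < best_waste/best */
                if (waste * best < best_waste * i) {
                        best = i;
                        best_waste = waste;
                }
        }
        return best;
}

static int maxobj(int size, int pages)
{
        return pages * MODEL_PAGE_SIZE / size;
}

int main(void)
{
        int i, mergeable = 0;

        for (i = 1; i < MODEL_CLASSES; i++) {
                int prev = MODEL_MIN_ALLOC + (i - 1) * MODEL_DELTA;
                int size = MODEL_MIN_ALLOC + i * MODEL_DELTA;
                int pp = pages_per_zspage(prev), ps = pages_per_zspage(size);

                /* same criteria as can_merge() */
                if (pp == ps && maxobj(prev, pp) == maxobj(size, ps))
                        mergeable++;
        }
        printf("%d of %d adjacent class pairs satisfy the merge criteria\n",
               mergeable, MODEL_CLASSES - 1);
        return 0;
}

Running the model shows that many adjacent classes collapse into one, which is why the pool now allocates size_class structures lazily through kcalloc() instead of embedding a fixed ZS_SIZE_CLASSES array.
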
@@ -971,9 +1051,15 @@ void zs_destroy_pool(struct zs_pool *pool)
971{ 1051{
972 int i; 1052 int i;
973 1053
974 for (i = 0; i < ZS_SIZE_CLASSES; i++) { 1054 for (i = 0; i < zs_size_classes; i++) {
975 int fg; 1055 int fg;
976 struct size_class *class = &pool->size_class[i]; 1056 struct size_class *class = pool->size_class[i];
1057
1058 if (!class)
1059 continue;
1060
1061 if (class->index != i)
1062 continue;
977 1063
978 for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { 1064 for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
979 if (class->fullness_list[fg]) { 1065 if (class->fullness_list[fg]) {
@@ -981,7 +1067,10 @@ void zs_destroy_pool(struct zs_pool *pool)
981 class->size, fg); 1067 class->size, fg);
982 } 1068 }
983 } 1069 }
1070 kfree(class);
984 } 1071 }
1072
1073 kfree(pool->size_class);
985 kfree(pool); 1074 kfree(pool);
986} 1075}
987EXPORT_SYMBOL_GPL(zs_destroy_pool); 1076EXPORT_SYMBOL_GPL(zs_destroy_pool);
@@ -999,8 +1088,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
999{ 1088{
1000 unsigned long obj; 1089 unsigned long obj;
1001 struct link_free *link; 1090 struct link_free *link;
1002 int class_idx;
1003 struct size_class *class; 1091 struct size_class *class;
1092 void *vaddr;
1004 1093
1005 struct page *first_page, *m_page; 1094 struct page *first_page, *m_page;
1006 unsigned long m_objidx, m_offset; 1095 unsigned long m_objidx, m_offset;
@@ -1008,9 +1097,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1008 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) 1097 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
1009 return 0; 1098 return 0;
1010 1099
1011 class_idx = get_size_class_index(size); 1100 class = pool->size_class[get_size_class_index(size)];
1012 class = &pool->size_class[class_idx];
1013 BUG_ON(class_idx != class->index);
1014 1101
1015 spin_lock(&class->lock); 1102 spin_lock(&class->lock);
1016 first_page = find_get_zspage(class); 1103 first_page = find_get_zspage(class);
@@ -1031,11 +1118,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1031 obj_handle_to_location(obj, &m_page, &m_objidx); 1118 obj_handle_to_location(obj, &m_page, &m_objidx);
1032 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); 1119 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
1033 1120
1034 link = (struct link_free *)kmap_atomic(m_page) + 1121 vaddr = kmap_atomic(m_page);
1035 m_offset / sizeof(*link); 1122 link = (struct link_free *)vaddr + m_offset / sizeof(*link);
1036 first_page->freelist = link->next; 1123 first_page->freelist = link->next;
1037 memset(link, POISON_INUSE, sizeof(*link)); 1124 memset(link, POISON_INUSE, sizeof(*link));
1038 kunmap_atomic(link); 1125 kunmap_atomic(vaddr);
1039 1126
1040 first_page->inuse++; 1127 first_page->inuse++;
1041 /* Now move the zspage to another fullness group, if required */ 1128 /* Now move the zspage to another fullness group, if required */
@@ -1051,6 +1138,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
1051 struct link_free *link; 1138 struct link_free *link;
1052 struct page *first_page, *f_page; 1139 struct page *first_page, *f_page;
1053 unsigned long f_objidx, f_offset; 1140 unsigned long f_objidx, f_offset;
1141 void *vaddr;
1054 1142
1055 int class_idx; 1143 int class_idx;
1056 struct size_class *class; 1144 struct size_class *class;
@@ -1063,16 +1151,16 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
1063 first_page = get_first_page(f_page); 1151 first_page = get_first_page(f_page);
1064 1152
1065 get_zspage_mapping(first_page, &class_idx, &fullness); 1153 get_zspage_mapping(first_page, &class_idx, &fullness);
1066 class = &pool->size_class[class_idx]; 1154 class = pool->size_class[class_idx];
1067 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); 1155 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
1068 1156
1069 spin_lock(&class->lock); 1157 spin_lock(&class->lock);
1070 1158
1071 /* Insert this object in containing zspage's freelist */ 1159 /* Insert this object in containing zspage's freelist */
1072 link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) 1160 vaddr = kmap_atomic(f_page);
1073 + f_offset); 1161 link = (struct link_free *)(vaddr + f_offset);
1074 link->next = first_page->freelist; 1162 link->next = first_page->freelist;
1075 kunmap_atomic(link); 1163 kunmap_atomic(vaddr);
1076 first_page->freelist = (void *)obj; 1164 first_page->freelist = (void *)obj;
1077 1165
1078 first_page->inuse--; 1166 first_page->inuse--;
@@ -1124,7 +1212,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1124 1212
1125 obj_handle_to_location(handle, &page, &obj_idx); 1213 obj_handle_to_location(handle, &page, &obj_idx);
1126 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1214 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1127 class = &pool->size_class[class_idx]; 1215 class = pool->size_class[class_idx];
1128 off = obj_idx_to_offset(page, obj_idx, class->size); 1216 off = obj_idx_to_offset(page, obj_idx, class->size);
1129 1217
1130 area = &get_cpu_var(zs_map_area); 1218 area = &get_cpu_var(zs_map_area);
@@ -1158,7 +1246,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1158 1246
1159 obj_handle_to_location(handle, &page, &obj_idx); 1247 obj_handle_to_location(handle, &page, &obj_idx);
1160 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1248 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1161 class = &pool->size_class[class_idx]; 1249 class = pool->size_class[class_idx];
1162 off = obj_idx_to_offset(page, obj_idx, class->size); 1250 off = obj_idx_to_offset(page, obj_idx, class->size);
1163 1251
1164 area = this_cpu_ptr(&zs_map_area); 1252 area = this_cpu_ptr(&zs_map_area);
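
None of the zsmalloc changes above alter its exported interface; the dynamic, possibly merged size classes stay hidden behind the handle API. An uncompiled kernel-style usage sketch (the error values and the 128-byte payload are arbitrary):

#include <linux/zsmalloc.h>
#include <linux/gfp.h>
#include <linux/errno.h>
#include <linux/string.h>

static int zs_demo(void)
{
        struct zs_pool *pool;
        unsigned long handle;
        void *dst;

        pool = zs_create_pool(GFP_KERNEL);      /* size classes built (and merged) here */
        if (!pool)
                return -ENOMEM;

        handle = zs_malloc(pool, 128);          /* served from the matching size class */
        if (!handle) {
                zs_destroy_pool(pool);
                return -ENOMEM;
        }

        dst = zs_map_object(pool, handle, ZS_MM_WO);
        memset(dst, 0xaa, 128);
        zs_unmap_object(pool, handle);

        zs_free(pool, handle);
        zs_destroy_pool(pool);                  /* frees the kcalloc'd size_class array too */
        return 0;
}
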
diff --git a/mm/zswap.c b/mm/zswap.c
index c1543061a192..0cfce9bc51e4 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -149,11 +149,10 @@ static int __init zswap_comp_init(void)
149 return 0; 149 return 0;
150} 150}
151 151
152static void zswap_comp_exit(void) 152static void __init zswap_comp_exit(void)
153{ 153{
154 /* free percpu transforms */ 154 /* free percpu transforms */
155 if (zswap_comp_pcpu_tfms) 155 free_percpu(zswap_comp_pcpu_tfms);
156 free_percpu(zswap_comp_pcpu_tfms);
157} 156}
158 157
159/********************************* 158/*********************************
@@ -206,7 +205,7 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
206**********************************/ 205**********************************/
207static struct kmem_cache *zswap_entry_cache; 206static struct kmem_cache *zswap_entry_cache;
208 207
209static int zswap_entry_cache_create(void) 208static int __init zswap_entry_cache_create(void)
210{ 209{
211 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); 210 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
212 return zswap_entry_cache == NULL; 211 return zswap_entry_cache == NULL;
@@ -389,7 +388,7 @@ static struct notifier_block zswap_cpu_notifier_block = {
389 .notifier_call = zswap_cpu_notifier 388 .notifier_call = zswap_cpu_notifier
390}; 389};
391 390
392static int zswap_cpu_init(void) 391static int __init zswap_cpu_init(void)
393{ 392{
394 unsigned long cpu; 393 unsigned long cpu;
395 394