author     Linus Torvalds <torvalds@linux-foundation.org>	2014-12-13 16:00:36 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>	2014-12-13 16:00:36 -0500
commit     78a45c6f067824cf5d0a9fedea7339ac2e28603c (patch)
tree       b4f78c8b6b9059ddace0a18c11629b8d2045f793 /mm
parent     f96fe225677b3efb74346ebd56fafe3997b02afa (diff)
parent     29d293b6007b91a4463f05bc8d0b26e0e65c5816 (diff)
Merge branch 'akpm' (second patch-bomb from Andrew)
Merge second patchbomb from Andrew Morton:
- the rest of MM
- misc fs fixes
- add execveat() syscall
- new ratelimit feature for fault-injection
- decompressor updates
- ipc/ updates
- fallocate feature creep
- fsnotify cleanups
- a few other misc things
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (99 commits)
cgroups: Documentation: fix trivial typos and wrong paragraph numberings
parisc: percpu: update comments referring to __get_cpu_var
percpu: update local_ops.txt to reflect this_cpu operations
percpu: remove __get_cpu_var and __raw_get_cpu_var macros
fsnotify: remove destroy_list from fsnotify_mark
fsnotify: unify inode and mount marks handling
fallocate: create FAN_MODIFY and IN_MODIFY events
mm/cma: make kmemleak ignore CMA regions
slub: fix cpuset check in get_any_partial
slab: fix cpuset check in fallback_alloc
shmdt: use i_size_read() instead of ->i_size
ipc/shm.c: fix overly aggressive shmdt() when calls span multiple segments
ipc/msg: increase MSGMNI, remove scaling
ipc/sem.c: increase SEMMSL, SEMMNI, SEMOPM
ipc/sem.c: change memory barrier in sem_lock() to smp_rmb()
lib/decompress.c: consistency of compress formats for kernel image
decompress_bunzip2: off by one in get_next_block()
usr/Kconfig: make initrd compression algorithm selection not expert
fault-inject: add ratelimit option
ratelimit: add initialization macro
...
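One of the patches listed above adds the execveat() syscall. As a quick orientation only (this sketch is not part of the series; the file, fd and flag choices are illustrative, and it assumes a kernel and headers new enough to provide __NR_execveat), a userspace caller without a libc wrapper could reach the new syscall via syscall(2) roughly like this:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		/* O_PATH fd plus AT_EMPTY_PATH means "execute the file this fd refers to". */
		int fd = open("/bin/echo", O_PATH | O_CLOEXEC);
		char *argv[] = { "echo", "hello", NULL };
		char *envp[] = { NULL };

		if (fd < 0)
			return 1;
		syscall(__NR_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
		perror("execveat");	/* reached only if the exec failed */
		return 1;
	}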
Diffstat (limited to 'mm')
 mm/Kconfig.debug     |  10
 mm/Makefile          |   2
 mm/cma.c             |  25
 mm/debug-pagealloc.c |  45
 mm/fadvise.c         |   6
 mm/filemap.c         |  10
 mm/filemap_xip.c     |  23
 mm/fremap.c          |   4
 mm/hugetlb.c         |  26
 mm/memblock.c        |  43
 mm/memcontrol.c      | 180
 mm/memory-failure.c  |  15
 mm/memory.c          |   9
 mm/migrate.c         |  28
 mm/mincore.c         |   7
 mm/mmap.c            |  24
 mm/mremap.c          |   6
 mm/nommu.c           |  50
 mm/oom_kill.c        |  15
 mm/page_alloc.c      | 137
 mm/page_ext.c        | 403
 mm/page_owner.c      | 311
 mm/rmap.c            |  18
 mm/slab.c            |   4
 mm/slub.c            |  17
 mm/vmacache.c        |   2
 mm/vmalloc.c         |   4
 mm/vmscan.c          | 216
 mm/vmstat.c          | 102
 mm/zbud.c            |   2
 mm/zsmalloc.c        | 180
 mm/zswap.c           |   9
 32 files changed, 1409 insertions, 524 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 4b2443254de2..56badfc4810a 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,8 +1,18 @@
+config PAGE_EXTENSION
+	bool "Extend memmap on extra space for more information on page"
+	---help---
+	  Extend memmap on extra space for more information on page. This
+	  could be used for debugging features that need to insert extra
+	  field for every page. This extension enables us to save memory
+	  by not allocating this extra memory according to boottime
+	  configuration.
+
 config DEBUG_PAGEALLOC
 	bool "Debug page memory allocations"
 	depends on DEBUG_KERNEL
 	depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
 	depends on !KMEMCHECK
+	select PAGE_EXTENSION
 	select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	---help---
diff --git a/mm/Makefile b/mm/Makefile
index b3c6ce932c64..4bf586e66378 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
+obj-$(CONFIG_PAGE_OWNER) += page_owner.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
 obj-$(CONFIG_ZPOOL) += zpool.o
@@ -71,3 +72,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
 obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
 obj-$(CONFIG_CMA) += cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
+obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
diff --git a/mm/cma.c b/mm/cma.c
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -33,6 +33,7 @@
 #include <linux/log2.h>
 #include <linux/cma.h>
 #include <linux/highmem.h>
+#include <linux/io.h>
 
 struct cma {
 	unsigned long base_pfn;
@@ -63,6 +64,17 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
 	return (1UL << (align_order - cma->order_per_bit)) - 1;
 }
 
+static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
+{
+	unsigned int alignment;
+
+	if (align_order <= cma->order_per_bit)
+		return 0;
+	alignment = 1UL << (align_order - cma->order_per_bit);
+	return ALIGN(cma->base_pfn, alignment) -
+		(cma->base_pfn >> cma->order_per_bit);
+}
+
 static unsigned long cma_bitmap_maxno(struct cma *cma)
 {
 	return cma->count >> cma->order_per_bit;
@@ -313,6 +325,11 @@ int __init cma_declare_contiguous(phys_addr_t base,
 			}
 		}
 
+		/*
+		 * kmemleak scans/reads tracked objects for pointers to other
+		 * objects but this address isn't mapped and accessible
+		 */
+		kmemleak_ignore(phys_to_virt(addr));
 		base = addr;
 	}
 
@@ -340,7 +357,7 @@ err:
  */
 struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
 {
-	unsigned long mask, pfn, start = 0;
+	unsigned long mask, offset, pfn, start = 0;
 	unsigned long bitmap_maxno, bitmap_no, bitmap_count;
 	struct page *page = NULL;
 	int ret;
@@ -355,13 +372,15 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
 		return NULL;
 
 	mask = cma_bitmap_aligned_mask(cma, align);
+	offset = cma_bitmap_aligned_offset(cma, align);
 	bitmap_maxno = cma_bitmap_maxno(cma);
 	bitmap_count = cma_bitmap_pages_to_bits(cma, count);
 
 	for (;;) {
 		mutex_lock(&cma->lock);
-		bitmap_no = bitmap_find_next_zero_area(cma->bitmap,
-				bitmap_maxno, start, bitmap_count, mask);
+		bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
+				bitmap_maxno, start, bitmap_count, mask,
+				offset);
 		if (bitmap_no >= bitmap_maxno) {
 			mutex_unlock(&cma->lock);
 			break;
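The two cma.c hunks above exist so that CMA alignment is honoured relative to the region's physical base rather than to bit 0 of its bitmap. A rough user-space model of the bit-to-pfn mapping that motivates the extra offset argument (the field names mirror struct cma above; the numbers are made up and this is only an illustration, not kernel code):

	#include <stdio.h>

	/* Toy model: each bitmap bit covers (1 << order_per_bit) pages. */
	static unsigned long bit_to_pfn(unsigned long base_pfn, int order_per_bit,
					unsigned long bitmap_no)
	{
		return base_pfn + (bitmap_no << order_per_bit);
	}

	int main(void)
	{
		unsigned long base_pfn = 0x12345;	/* deliberately not 16-page aligned */
		int order_per_bit = 0;

		/*
		 * An order-4 (16-page) request must start on a pfn that is a
		 * multiple of 16.  Because bit 0 corresponds to an odd pfn here,
		 * an "aligned bit index" alone is not enough, which is why
		 * cma_alloc() now passes an offset to
		 * bitmap_find_next_zero_area_off().
		 */
		printf("bit 0  -> pfn %#lx\n", bit_to_pfn(base_pfn, order_per_bit, 0));
		printf("bit 11 -> pfn %#lx (16-page aligned)\n",
		       bit_to_pfn(base_pfn, order_per_bit, 11));
		return 0;
	}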
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index 789ff70c8a4a..5bf5906ce13b 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -2,23 +2,55 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
-#include <linux/page-debug-flags.h>
+#include <linux/page_ext.h>
 #include <linux/poison.h>
 #include <linux/ratelimit.h>
 
+static bool page_poisoning_enabled __read_mostly;
+
+static bool need_page_poisoning(void)
+{
+	if (!debug_pagealloc_enabled())
+		return false;
+
+	return true;
+}
+
+static void init_page_poisoning(void)
+{
+	if (!debug_pagealloc_enabled())
+		return;
+
+	page_poisoning_enabled = true;
+}
+
+struct page_ext_operations page_poisoning_ops = {
+	.need = need_page_poisoning,
+	.init = init_page_poisoning,
+};
+
 static inline void set_page_poison(struct page *page)
 {
-	__set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+	struct page_ext *page_ext;
+
+	page_ext = lookup_page_ext(page);
+	__set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
 }
 
 static inline void clear_page_poison(struct page *page)
 {
-	__clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+	struct page_ext *page_ext;
+
+	page_ext = lookup_page_ext(page);
+	__clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
 }
 
 static inline bool page_poison(struct page *page)
 {
-	return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+	struct page_ext *page_ext;
+
+	page_ext = lookup_page_ext(page);
+	return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
 }
 
 static void poison_page(struct page *page)
@@ -93,8 +125,11 @@ static void unpoison_pages(struct page *page, int n)
 		unpoison_page(page + i);
 }
 
-void kernel_map_pages(struct page *page, int numpages, int enable)
+void __kernel_map_pages(struct page *page, int numpages, int enable)
 {
+	if (!page_poisoning_enabled)
+		return;
+
 	if (enable)
 		unpoison_pages(page, numpages);
 	else
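The debug-pagealloc conversion above is the template for any feature that wants per-page state without enlarging struct page: declare a struct page_ext_operations with need()/init() callbacks and reach the bits through lookup_page_ext(). As a sketch only, a hypothetical debugging feature (every name here is invented for illustration) would follow the same shape:

	/* Hypothetical page_ext client, modelled on page_poisoning_ops above. */
	#include <linux/mm.h>
	#include <linux/page_ext.h>

	static bool my_debug_enabled __read_mostly;

	static bool need_my_debug(void)
	{
		/* Only pay for the extended memmap when the feature is wanted. */
		return my_debug_enabled;
	}

	static void init_my_debug(void)
	{
		pr_info("my_debug: page_ext space available\n");
	}

	struct page_ext_operations my_debug_ops = {
		.need = need_my_debug,
		.init = init_my_debug,
	};

	/*
	 * Per-page state is then reached via lookup_page_ext(page)->flags,
	 * exactly as set_page_poison()/clear_page_poison() do above.
	 */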
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 3bcfd81db45e..2ad7adf4f0a4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -117,7 +117,11 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 			__filemap_fdatawrite_range(mapping, offset, endbyte,
 						   WB_SYNC_NONE);
 
-		/* First and last FULL page! */
+		/*
+		 * First and last FULL page! Partial pages are deliberately
+		 * preserved on the expectation that it is better to preserve
+		 * needed memory than to discard unneeded memory.
+		 */
 		start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
 		end_index = (endbyte >> PAGE_CACHE_SHIFT);
 
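The rounding in the expanded comment is easiest to see with concrete numbers; a small stand-alone illustration with 4 KiB pages (the offsets are made up):

	#include <stdio.h>

	int main(void)
	{
		unsigned long offset = 1000;	/* falls inside page 0  */
		unsigned long endbyte = 81919;	/* last byte of page 19 */
		unsigned long start_index = (offset + (4096 - 1)) >> 12;	/* = 1  */
		unsigned long end_index = endbyte >> 12;			/* = 19 */

		/*
		 * Rounding the start up means the partially-covered first page
		 * (page 0) is left in the page cache, which is the behaviour
		 * the new comment describes.
		 */
		printf("start_index=%lu end_index=%lu\n", start_index, end_index);
		return 0;
	}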
diff --git a/mm/filemap.c b/mm/filemap.c
index 14b4642279f1..e8905bc3cbd7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -62,16 +62,16 @@
 /*
  * Lock ordering:
  *
- *  ->i_mmap_mutex		(truncate_pagecache)
+ *  ->i_mmap_rwsem		(truncate_pagecache)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
  *
  *  ->i_mutex
- *    ->i_mmap_mutex		(truncate->unmap_mapping_range)
+ *    ->i_mmap_rwsem		(truncate->unmap_mapping_range)
  *
  *  ->mmap_sem
- *    ->i_mmap_mutex
+ *    ->i_mmap_rwsem
  *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
  *
@@ -85,7 +85,7 @@
  *    sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
  *
- *  ->i_mmap_mutex
+ *  ->i_mmap_rwsem
  *    ->anon_vma.lock		(vma_adjust)
 *
 *  ->anon_vma.lock
@@ -105,7 +105,7 @@
 *   ->inode->i_lock		(zap_pte_range->set_page_dirty)
 *   ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
 *
- * ->i_mmap_mutex
+ * ->i_mmap_rwsem
 *   ->tasklist_lock		(memory_failure, collect_procs_ao)
 */
 
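Throughout this merge the i_mmap_mutex in struct address_space becomes an rw_semaphore (i_mmap_rwsem) hidden behind small helpers, so interval-tree walkers such as the truncate and memory-failure paths in the lock-ordering comment above can take the lock in read mode and run concurrently. The helpers are, roughly, the following thin wrappers (a paraphrase for orientation; the actual inlines live next to struct address_space):

	static inline void i_mmap_lock_write(struct address_space *mapping)
	{
		down_write(&mapping->i_mmap_rwsem);
	}

	static inline void i_mmap_unlock_write(struct address_space *mapping)
	{
		up_write(&mapping->i_mmap_rwsem);
	}

	static inline void i_mmap_lock_read(struct address_space *mapping)
	{
		down_read(&mapping->i_mmap_rwsem);
	}

	static inline void i_mmap_unlock_read(struct address_space *mapping)
	{
		up_read(&mapping->i_mmap_rwsem);
	}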
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index d8d9fe3f685c..0d105aeff82f 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -155,22 +155,14 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 EXPORT_SYMBOL_GPL(xip_file_read);
 
 /*
- * __xip_unmap is invoked from xip_unmap and
- * xip_write
+ * __xip_unmap is invoked from xip_unmap and xip_write
  *
  * This function walks all vmas of the address_space and unmaps the
  * __xip_sparse_page when found at pgoff.
  */
-static void
-__xip_unmap (struct address_space * mapping,
-		unsigned long pgoff)
+static void __xip_unmap(struct address_space * mapping, unsigned long pgoff)
 {
 	struct vm_area_struct *vma;
-	struct mm_struct *mm;
-	unsigned long address;
-	pte_t *pte;
-	pte_t pteval;
-	spinlock_t *ptl;
 	struct page *page;
 	unsigned count;
 	int locked = 0;
@@ -182,11 +174,14 @@ __xip_unmap (struct address_space * mapping,
 		return;
 
 retry:
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_read(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
-		mm = vma->vm_mm;
-		address = vma->vm_start +
+		pte_t *pte, pteval;
+		spinlock_t *ptl;
+		struct mm_struct *mm = vma->vm_mm;
+		unsigned long address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 		pte = page_check_address(page, mm, address, &ptl, 1);
 		if (pte) {
@@ -202,7 +197,7 @@ retry:
 			page_cache_release(page);
 		}
 	}
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_read(mapping);
 
 	if (locked) {
 		mutex_unlock(&xip_sparse_mutex);
diff --git a/mm/fremap.c b/mm/fremap.c
index 72b8fa361433..11ef7ec40d13 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -238,13 +238,13 @@ get_write_lock:
 			}
 			goto out_freed;
 		}
-		mutex_lock(&mapping->i_mmap_mutex);
+		i_mmap_lock_write(mapping);
 		flush_dcache_mmap_lock(mapping);
 		vma->vm_flags |= VM_NONLINEAR;
 		vma_interval_tree_remove(vma, &mapping->i_mmap);
 		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 		flush_dcache_mmap_unlock(mapping);
-		mutex_unlock(&mapping->i_mmap_mutex);
+		i_mmap_unlock_write(mapping);
 	}
 
 	if (vma->vm_flags & VM_LOCKED) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 919b86a2164d..47f6070d7c46 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1457,7 +1457,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 	return 0;
 
 found:
-	BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
+	BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
 	/* Put them into a private list first because mem_map is not up yet */
 	list_add(&m->list, &huge_boot_pages);
 	m->hstate = h;
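The BUG_ON() rewrite above does not change behaviour: IS_ALIGNED() is the same power-of-two mask test spelled more readably. A simplified model of the macro (the real one also casts the alignment through typeof(x)):

	/* Simplified model of IS_ALIGNED(); 'a' must be a power of two. */
	#define IS_ALIGNED_MODEL(x, a)	(((x) & ((a) - 1)) == 0)

	/*
	 * With a 2 MB huge page size:
	 *   IS_ALIGNED_MODEL(0x200000, 0x200000) -> 1
	 *   IS_ALIGNED_MODEL(0x201000, 0x200000) -> 0
	 */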
@@ -2083,7 +2083,7 @@ static void hugetlb_register_node(struct node *node)
  * devices of nodes that have memory.  All on-line nodes should have
  * registered their associated device by this time.
  */
-static void hugetlb_register_all_nodes(void)
+static void __init hugetlb_register_all_nodes(void)
 {
 	int nid;
 
@@ -2726,9 +2726,9 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 	 * on its way out.  We're lucky that the flag has such an appropriate
 	 * name, and can in fact be safely cleared here. We could clear it
 	 * before the __unmap_hugepage_range above, but all that's necessary
-	 * is to clear it before releasing the i_mmap_mutex. This works
+	 * is to clear it before releasing the i_mmap_rwsem. This works
 	 * because in the context this is called, the VMA is about to be
-	 * destroyed and the i_mmap_mutex is held.
+	 * destroyed and the i_mmap_rwsem is held.
 	 */
 	vma->vm_flags &= ~VM_MAYSHARE;
 }
@@ -2774,7 +2774,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * this mapping should be shared between all the VMAs,
 	 * __unmap_hugepage_range() is called as the lock is already held
 	 */
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
 		/* Do not unmap the current VMA */
 		if (iter_vma == vma)
@@ -2791,7 +2791,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 			unmap_hugepage_range(iter_vma, address,
 					     address + huge_page_size(h), page);
 	}
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_write(mapping);
 }
 
 /*
@@ -3348,7 +3348,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_cache_range(vma, address, end);
 
 	mmu_notifier_invalidate_range_start(mm, start, end);
-	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (; address < end; address += huge_page_size(h)) {
 		spinlock_t *ptl;
 		ptep = huge_pte_offset(mm, address);
@@ -3370,13 +3370,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 	}
 	/*
-	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
 	 * may have cleared our pud entry and done put_page on the page table:
-	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * once we release i_mmap_rwsem, another task can do the final put_page
 	 * and that page table be reused and filled with junk.
 	 */
 	flush_tlb_range(vma, start, end);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	mmu_notifier_invalidate_range_end(mm, start, end);
 
 	return pages << h->order;
@@ -3525,7 +3525,7 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
 * and returns the corresponding pte. While this is not necessary for the
 * !shared pmd case because we can allocate the pmd later as well, it makes the
 * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
 * bad pmd for sharing.
 */
@@ -3544,7 +3544,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!vma_shareable(vma, addr))
 		return (pte_t *)pmd_alloc(mm, pud, addr);
 
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -3572,7 +3572,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 		spin_unlock(ptl);
 	}
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_write(mapping);
 	return pte;
 }
 
diff --git a/mm/memblock.c b/mm/memblock.c
index 6ecb0d937fb5..252b77bdf65e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -715,16 +715,13 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
 }
 
 /**
- * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
- * @base: the base phys addr of the region
- * @size: the size of the region
  *
- * This function isolates region [@base, @base + @size), and mark it with flag
- * MEMBLOCK_HOTPLUG.
+ * This function isolates region [@base, @base + @size), and sets/clears flag
  *
  * Return 0 on succees, -errno on failure.
  */
-int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+static int __init_memblock memblock_setclr_flag(phys_addr_t base,
+				phys_addr_t size, int set, int flag)
 {
 	struct memblock_type *type = &memblock.memory;
 	int i, ret, start_rgn, end_rgn;
@@ -734,37 +731,37 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
 		return ret;
 
 	for (i = start_rgn; i < end_rgn; i++)
-		memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG);
+		if (set)
+			memblock_set_region_flags(&type->regions[i], flag);
+		else
+			memblock_clear_region_flags(&type->regions[i], flag);
 
 	memblock_merge_regions(type);
 	return 0;
 }
 
 /**
- * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
  * @base: the base phys addr of the region
  * @size: the size of the region
  *
- * This function isolates region [@base, @base + @size), and clear flag
- * MEMBLOCK_HOTPLUG for the isolated regions.
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+{
+	return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG);
+}
+
+/**
+ * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
  *
  * Return 0 on succees, -errno on failure.
  */
 int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
 {
-	struct memblock_type *type = &memblock.memory;
-	int i, ret, start_rgn, end_rgn;
-
-	ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
-	if (ret)
-		return ret;
-
-	for (i = start_rgn; i < end_rgn; i++)
-		memblock_clear_region_flags(&type->regions[i],
-					    MEMBLOCK_HOTPLUG);
-
-	memblock_merge_regions(type);
-	return 0;
+	return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG);
 }
 
 /**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 85df503ec023..ef91e856c7e4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -296,7 +296,6 @@ struct mem_cgroup {
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
 	bool use_hierarchy;
-	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
 
 	bool oom_lock;
 	atomic_t under_oom;
@@ -366,22 +365,11 @@ struct mem_cgroup {
 	/* WARNING: nodeinfo must be the last member here */
 };
 
-/* internal only representation about the status of kmem accounting. */
-enum {
-	KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
-};
-
 #ifdef CONFIG_MEMCG_KMEM
-static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
-{
-	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
-}
-
 static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 {
-	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
+	return memcg->kmemcg_id >= 0;
 }
-
 #endif
 
 /* Stuffs for move charges at task migration. */
@@ -1571,7 +1559,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * select it.  The goal is to allow it to allocate so that it may
 	 * quickly exit and free its memory.
 	 */
-	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
+	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
 		set_thread_flag(TIF_MEMDIE);
 		return;
 	}
@@ -1628,6 +1616,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 			 NULL, "Memory cgroup out of memory");
 }
 
+#if MAX_NUMNODES > 1
+
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1650,7 +1640,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 	return false;
 
 }
-#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -2646,7 +2635,6 @@ static void memcg_register_cache(struct mem_cgroup *memcg,
 	if (!cachep)
 		return;
 
-	css_get(&memcg->css);
 	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
 
 	/*
@@ -2680,40 +2668,6 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
 	list_del(&cachep->memcg_params->list);
 
 	kmem_cache_destroy(cachep);
-
-	/* drop the reference taken in memcg_register_cache */
-	css_put(&memcg->css);
-}
-
-/*
- * During the creation a new cache, we need to disable our accounting mechanism
- * altogether. This is true even if we are not creating, but rather just
- * enqueing new caches to be created.
- *
- * This is because that process will trigger allocations; some visible, like
- * explicit kmallocs to auxiliary data structures, name strings and internal
- * cache structures; some well concealed, like INIT_WORK() that can allocate
- * objects during debug.
- *
- * If any allocation happens during memcg_kmem_get_cache, we will recurse back
- * to it. This may not be a bounded recursion: since the first cache creation
- * failed to complete (waiting on the allocation), we'll just try to create the
- * cache again, failing at the same point.
- *
- * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
- * memcg_kmem_skip_account. So we enclose anything that might allocate memory
- * inside the following two functions.
- */
-static inline void memcg_stop_kmem_account(void)
-{
-	VM_BUG_ON(!current->mm);
-	current->memcg_kmem_skip_account++;
-}
-
-static inline void memcg_resume_kmem_account(void)
-{
-	VM_BUG_ON(!current->mm);
-	current->memcg_kmem_skip_account--;
 }
 
 int __memcg_cleanup_cache_params(struct kmem_cache *s)
@@ -2747,9 +2701,7 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
 	mutex_lock(&memcg_slab_mutex);
 	list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
 		cachep = memcg_params_to_cache(params);
-		kmem_cache_shrink(cachep);
-		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
-			memcg_unregister_cache(cachep);
+		memcg_unregister_cache(cachep);
 	}
 	mutex_unlock(&memcg_slab_mutex);
 }
@@ -2784,10 +2736,10 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
 	struct memcg_register_cache_work *cw;
 
 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
-	if (cw == NULL) {
-		css_put(&memcg->css);
+	if (!cw)
 		return;
-	}
+
+	css_get(&memcg->css);
 
 	cw->memcg = memcg;
 	cw->cachep = cachep;
@@ -2810,20 +2762,16 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
 	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
 	 * the safest choice is to do it like this, wrapping the whole function.
 	 */
-	memcg_stop_kmem_account();
+	current->memcg_kmem_skip_account = 1;
 	__memcg_schedule_register_cache(memcg, cachep);
-	memcg_resume_kmem_account();
+	current->memcg_kmem_skip_account = 0;
 }
 
 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
 {
 	unsigned int nr_pages = 1 << order;
-	int res;
 
-	res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
-	if (!res)
-		atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
-	return res;
+	return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
 }
 
 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
@@ -2831,7 +2779,6 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
 	unsigned int nr_pages = 1 << order;
 
 	memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
-	atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
 }
 
 /*
@@ -2847,8 +2794,7 @@ void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
  * Can't be called in interrupt context or from kernel threads.
  * This function needs to be called with rcu_read_lock() held.
  */
-struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
-					  gfp_t gfp)
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
 {
 	struct mem_cgroup *memcg;
 	struct kmem_cache *memcg_cachep;
@@ -2856,25 +2802,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
 	VM_BUG_ON(!cachep->memcg_params);
 	VM_BUG_ON(!cachep->memcg_params->is_root_cache);
 
-	if (!current->mm || current->memcg_kmem_skip_account)
+	if (current->memcg_kmem_skip_account)
 		return cachep;
 
-	rcu_read_lock();
-	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
-
+	memcg = get_mem_cgroup_from_mm(current->mm);
 	if (!memcg_kmem_is_active(memcg))
 		goto out;
 
 	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
-	if (likely(memcg_cachep)) {
-		cachep = memcg_cachep;
-		goto out;
-	}
-
-	/* The corresponding put will be done in the workqueue. */
-	if (!css_tryget_online(&memcg->css))
-		goto out;
-	rcu_read_unlock();
+	if (likely(memcg_cachep))
+		return memcg_cachep;
 
 	/*
 	 * If we are in a safe context (can wait, and not in interrupt
@@ -2889,12 +2826,17 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
 	 * defer everything.
 	 */
 	memcg_schedule_register_cache(memcg, cachep);
-	return cachep;
 out:
-	rcu_read_unlock();
+	css_put(&memcg->css);
 	return cachep;
 }
 
+void __memcg_kmem_put_cache(struct kmem_cache *cachep)
+{
+	if (!is_root_cache(cachep))
+		css_put(&cachep->memcg_params->memcg->css);
+}
+
 /*
  * We need to verify if the allocation against current->mm->owner's memcg is
  * possible for the given order. But the page is not allocated yet, so we'll
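The hunk above changes the lifetime rule for the per-memcg cache lookup: __memcg_kmem_get_cache() now pins the memcg via get_mem_cgroup_from_mm() and either returns a per-memcg cache (keeping the css reference for the new __memcg_kmem_put_cache()) or drops the reference at the out label. A simplified sketch of the intended pairing from a caller's point of view; this is not the actual slab allocator code, and error handling is omitted:

	static void *charged_alloc(struct kmem_cache *cachep, gfp_t gfp)
	{
		void *obj;

		/* May hand back a per-memcg clone of 'cachep' with a css ref held. */
		cachep = __memcg_kmem_get_cache(cachep);
		obj = kmem_cache_alloc(cachep, gfp);
		/* Drops that css ref again if a per-memcg cache was returned. */
		__memcg_kmem_put_cache(cachep);
		return obj;
	}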
@@ -2917,34 +2859,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
 
 	*_memcg = NULL;
 
-	/*
-	 * Disabling accounting is only relevant for some specific memcg
-	 * internal allocations. Therefore we would initially not have such
-	 * check here, since direct calls to the page allocator that are
-	 * accounted to kmemcg (alloc_kmem_pages and friends) only happen
-	 * outside memcg core. We are mostly concerned with cache allocations,
-	 * and by having this test at memcg_kmem_get_cache, we are already able
-	 * to relay the allocation to the root cache and bypass the memcg cache
-	 * altogether.
-	 *
-	 * There is one exception, though: the SLUB allocator does not create
-	 * large order caches, but rather service large kmallocs directly from
-	 * the page allocator. Therefore, the following sequence when backed by
-	 * the SLUB allocator:
-	 *
-	 * memcg_stop_kmem_account();
-	 * kmalloc(<large_number>)
-	 * memcg_resume_kmem_account();
-	 *
-	 * would effectively ignore the fact that we should skip accounting,
-	 * since it will drive us directly to this function without passing
-	 * through the cache selector memcg_kmem_get_cache. Such large
-	 * allocations are extremely rare but can happen, for instance, for the
-	 * cache arrays. We bring this test here.
-	 */
-	if (!current->mm || current->memcg_kmem_skip_account)
-		return true;
-
 	memcg = get_mem_cgroup_from_mm(current->mm);
 
 	if (!memcg_kmem_is_active(memcg)) {
@@ -2985,10 +2899,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
 	memcg_uncharge_kmem(memcg, 1 << order);
 	page->mem_cgroup = NULL;
 }
-#else
-static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
-{
-}
 #endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3539,12 +3449,6 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
 		return 0;
 
 	/*
-	 * We are going to allocate memory for data shared by all memory
-	 * cgroups so let's stop accounting here.
-	 */
-	memcg_stop_kmem_account();
-
-	/*
 	 * For simplicity, we won't allow this to be disabled.  It also can't
 	 * be changed if the cgroup has children already, or if tasks had
 	 * already joined.
@@ -3570,25 +3474,22 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
 		goto out;
 	}
 
-	memcg->kmemcg_id = memcg_id;
-	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
-
 	/*
-	 * We couldn't have accounted to this cgroup, because it hasn't got the
-	 * active bit set yet, so this should succeed.
+	 * We couldn't have accounted to this cgroup, because it hasn't got
+	 * activated yet, so this should succeed.
 	 */
 	err = page_counter_limit(&memcg->kmem, nr_pages);
 	VM_BUG_ON(err);
 
 	static_key_slow_inc(&memcg_kmem_enabled_key);
 	/*
-	 * Setting the active bit after enabling static branching will
+	 * A memory cgroup is considered kmem-active as soon as it gets
+	 * kmemcg_id. Setting the id after enabling static branching will
 	 * guarantee no one starts accounting before all call sites are
 	 * patched.
 	 */
-	memcg_kmem_set_active(memcg);
+	memcg->kmemcg_id = memcg_id;
 out:
-	memcg_resume_kmem_account();
 	return err;
 }
 
@@ -3791,11 +3692,6 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 }
 #endif /* CONFIG_NUMA */
 
-static inline void mem_cgroup_lru_names_not_uptodate(void)
-{
-	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
-}
-
 static int memcg_stat_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -3803,6 +3699,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 	struct mem_cgroup *mi;
 	unsigned int i;
 
+	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
+
 	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 			continue;
@@ -4259,7 +4157,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	int ret;
 
-	memcg->kmemcg_id = -1;
 	ret = memcg_propagate_kmem(memcg);
 	if (ret)
 		return ret;
@@ -4269,6 +4166,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 
 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
+	memcg_unregister_all_caches(memcg);
 	mem_cgroup_sockets_destroy(memcg);
 }
 #else
@@ -4724,17 +4622,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 
 	free_percpu(memcg->stat);
 
-	/*
-	 * We need to make sure that (at least for now), the jump label
-	 * destruction code runs outside of the cgroup lock. This is because
-	 * get_online_cpus(), which is called from the static_branch update,
-	 * can't be called inside the cgroup_lock. cpusets are the ones
-	 * enforcing this dependency, so if they ever change, we might as well.
-	 *
-	 * schedule_work() will guarantee this happens. Be careful if you need
-	 * to move this code around, and make sure it is outside
-	 * the cgroup_lock.
-	 */
 	disarm_static_keys(memcg);
 	kfree(memcg);
 }
@@ -4804,6 +4691,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	vmpressure_init(&memcg->vmpressure);
 	INIT_LIST_HEAD(&memcg->event_list);
 	spin_lock_init(&memcg->event_list_lock);
+#ifdef CONFIG_MEMCG_KMEM
+	memcg->kmemcg_id = -1;
+	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+#endif
 
 	return &memcg->css;
 
@@ -4885,7 +4776,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	}
 	spin_unlock(&memcg->event_list_lock);
 
-	memcg_unregister_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e5ee0ca7ae85..feb803bf3443 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -239,19 +239,14 @@ void shake_page(struct page *p, int access)
 	}
 
 	/*
-	 * Only call shrink_slab here (which would also shrink other caches) if
-	 * access is not potentially fatal.
+	 * Only call shrink_node_slabs here (which would also shrink
+	 * other caches) if access is not potentially fatal.
 	 */
 	if (access) {
 		int nr;
 		int nid = page_to_nid(p);
 		do {
-			struct shrink_control shrink = {
-				.gfp_mask = GFP_KERNEL,
-			};
-			node_set(nid, shrink.nodes_to_scan);
-
-			nr = shrink_slab(&shrink, 1000, 1000);
+			nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
 			if (page_count(p) == 1)
 				break;
 		} while (nr > 10);
@@ -466,7 +461,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 	struct task_struct *tsk;
 	struct address_space *mapping = page->mapping;
 
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_read(mapping);
 	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
 		pgoff_t pgoff = page_to_pgoff(page);
@@ -488,7 +483,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 		}
 	}
 	read_unlock(&tasklist_lock);
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_read(mapping);
 }
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index 4b5a282e1107..fbf74112de5b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1326,9 +1326,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 			 * safe to do nothing in this case.
 			 */
 			if (vma->vm_file) {
-				mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+				i_mmap_lock_write(vma->vm_file->f_mapping);
 				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
-				mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+				i_mmap_unlock_write(vma->vm_file->f_mapping);
 			}
 		} else
 			unmap_page_range(tlb, vma, start, end, details);
@@ -2377,12 +2377,12 @@ void unmap_mapping_range(struct address_space *mapping,
 		details.last_index = ULONG_MAX;
 
 
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_read(mapping);
 	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_read(mapping);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
@@ -3365,6 +3365,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(handle_mm_fault);
 
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
diff --git a/mm/migrate.c b/mm/migrate.c
index 01439953abf5..253474c22239 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -746,7 +746,7 @@ static int fallback_migrate_page(struct address_space *mapping,
  *   MIGRATEPAGE_SUCCESS - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page,
-				int remap_swapcache, enum migrate_mode mode)
+				int page_was_mapped, enum migrate_mode mode)
 {
 	struct address_space *mapping;
 	int rc;
@@ -784,7 +784,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 		newpage->mapping = NULL;
 	} else {
 		mem_cgroup_migrate(page, newpage, false);
-		if (remap_swapcache)
+		if (page_was_mapped)
 			remove_migration_ptes(page, newpage);
 		page->mapping = NULL;
 	}
@@ -798,7 +798,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 				int force, enum migrate_mode mode)
 {
 	int rc = -EAGAIN;
-	int remap_swapcache = 1;
+	int page_was_mapped = 0;
 	struct anon_vma *anon_vma = NULL;
 
 	if (!trylock_page(page)) {
@@ -870,7 +870,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 			 * migrated but are not remapped when migration
 			 * completes
 			 */
-			remap_swapcache = 0;
 		} else {
 			goto out_unlock;
 		}
@@ -910,13 +909,17 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 	}
 
 	/* Establish migration ptes or remove ptes */
-	try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+	if (page_mapped(page)) {
+		try_to_unmap(page,
+			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+		page_was_mapped = 1;
+	}
 
 skip_unmap:
 	if (!page_mapped(page))
-		rc = move_to_new_page(newpage, page, remap_swapcache, mode);
+		rc = move_to_new_page(newpage, page, page_was_mapped, mode);
 
-	if (rc && remap_swapcache)
+	if (rc && page_was_mapped)
 		remove_migration_ptes(page, page);
 
 	/* Drop an anon_vma reference if we took one */
@@ -1017,6 +1020,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 {
 	int rc = 0;
 	int *result = NULL;
+	int page_was_mapped = 0;
 	struct page *new_hpage;
 	struct anon_vma *anon_vma = NULL;
 
@@ -1047,12 +1051,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	if (PageAnon(hpage))
 		anon_vma = page_get_anon_vma(hpage);
 
-	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+	if (page_mapped(hpage)) {
+		try_to_unmap(hpage,
+			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+		page_was_mapped = 1;
+	}
 
 	if (!page_mapped(hpage))
-		rc = move_to_new_page(new_hpage, hpage, 1, mode);
+		rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode);
 
-	if (rc != MIGRATEPAGE_SUCCESS)
+	if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped)
1056 | remove_migration_ptes(hpage, hpage); | 1064 | remove_migration_ptes(hpage, hpage); |
1057 | 1065 | ||
1058 | if (anon_vma) | 1066 | if (anon_vma) |
diff --git a/mm/mincore.c b/mm/mincore.c index 725c80961048..c8c528b36641 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -137,8 +137,11 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
137 | } else { /* pte is a swap entry */ | 137 | } else { /* pte is a swap entry */ |
138 | swp_entry_t entry = pte_to_swp_entry(pte); | 138 | swp_entry_t entry = pte_to_swp_entry(pte); |
139 | 139 | ||
140 | if (is_migration_entry(entry)) { | 140 | if (non_swap_entry(entry)) { |
141 | /* migration entries are always uptodate */ | 141 | /* |
142 | * migration or hwpoison entries are always | ||
143 | * uptodate | ||
144 | */ | ||
142 | *vec = 1; | 145 | *vec = 1; |
143 | } else { | 146 | } else { |
144 | #ifdef CONFIG_SWAP | 147 | #ifdef CONFIG_SWAP |
@@ -232,7 +232,7 @@ error: | |||
232 | } | 232 | } |
233 | 233 | ||
234 | /* | 234 | /* |
235 | * Requires inode->i_mapping->i_mmap_mutex | 235 | * Requires inode->i_mapping->i_mmap_rwsem |
236 | */ | 236 | */ |
237 | static void __remove_shared_vm_struct(struct vm_area_struct *vma, | 237 | static void __remove_shared_vm_struct(struct vm_area_struct *vma, |
238 | struct file *file, struct address_space *mapping) | 238 | struct file *file, struct address_space *mapping) |
@@ -260,9 +260,9 @@ void unlink_file_vma(struct vm_area_struct *vma) | |||
260 | 260 | ||
261 | if (file) { | 261 | if (file) { |
262 | struct address_space *mapping = file->f_mapping; | 262 | struct address_space *mapping = file->f_mapping; |
263 | mutex_lock(&mapping->i_mmap_mutex); | 263 | i_mmap_lock_write(mapping); |
264 | __remove_shared_vm_struct(vma, file, mapping); | 264 | __remove_shared_vm_struct(vma, file, mapping); |
265 | mutex_unlock(&mapping->i_mmap_mutex); | 265 | i_mmap_unlock_write(mapping); |
266 | } | 266 | } |
267 | } | 267 | } |
268 | 268 | ||
@@ -674,14 +674,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
674 | 674 | ||
675 | if (vma->vm_file) { | 675 | if (vma->vm_file) { |
676 | mapping = vma->vm_file->f_mapping; | 676 | mapping = vma->vm_file->f_mapping; |
677 | mutex_lock(&mapping->i_mmap_mutex); | 677 | i_mmap_lock_write(mapping); |
678 | } | 678 | } |
679 | 679 | ||
680 | __vma_link(mm, vma, prev, rb_link, rb_parent); | 680 | __vma_link(mm, vma, prev, rb_link, rb_parent); |
681 | __vma_link_file(vma); | 681 | __vma_link_file(vma); |
682 | 682 | ||
683 | if (mapping) | 683 | if (mapping) |
684 | mutex_unlock(&mapping->i_mmap_mutex); | 684 | i_mmap_unlock_write(mapping); |
685 | 685 | ||
686 | mm->map_count++; | 686 | mm->map_count++; |
687 | validate_mm(mm); | 687 | validate_mm(mm); |
@@ -796,7 +796,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
796 | next->vm_end); | 796 | next->vm_end); |
797 | } | 797 | } |
798 | 798 | ||
799 | mutex_lock(&mapping->i_mmap_mutex); | 799 | i_mmap_lock_write(mapping); |
800 | if (insert) { | 800 | if (insert) { |
801 | /* | 801 | /* |
802 | * Put into interval tree now, so instantiated pages | 802 | * Put into interval tree now, so instantiated pages |
@@ -883,7 +883,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
883 | anon_vma_unlock_write(anon_vma); | 883 | anon_vma_unlock_write(anon_vma); |
884 | } | 884 | } |
885 | if (mapping) | 885 | if (mapping) |
886 | mutex_unlock(&mapping->i_mmap_mutex); | 886 | i_mmap_unlock_write(mapping); |
887 | 887 | ||
888 | if (root) { | 888 | if (root) { |
889 | uprobe_mmap(vma); | 889 | uprobe_mmap(vma); |
@@ -2362,6 +2362,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
2362 | } | 2362 | } |
2363 | #endif | 2363 | #endif |
2364 | 2364 | ||
2365 | EXPORT_SYMBOL_GPL(find_extend_vma); | ||
2366 | |||
2365 | /* | 2367 | /* |
2366 | * Ok - we have the memory areas we should free on the vma list, | 2368 | * Ok - we have the memory areas we should free on the vma list, |
2367 | * so release them, and do the vma updates. | 2369 | * so release them, and do the vma updates. |
@@ -2791,7 +2793,7 @@ void exit_mmap(struct mm_struct *mm) | |||
2791 | 2793 | ||
2792 | /* Insert vm structure into process list sorted by address | 2794 | /* Insert vm structure into process list sorted by address |
2793 | * and into the inode's i_mmap tree. If vm_file is non-NULL | 2795 | * and into the inode's i_mmap tree. If vm_file is non-NULL |
2794 | * then i_mmap_mutex is taken here. | 2796 | * then i_mmap_rwsem is taken here. |
2795 | */ | 2797 | */ |
2796 | int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | 2798 | int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
2797 | { | 2799 | { |
@@ -3086,7 +3088,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
3086 | */ | 3088 | */ |
3087 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) | 3089 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) |
3088 | BUG(); | 3090 | BUG(); |
3089 | mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); | 3091 | down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem); |
3090 | } | 3092 | } |
3091 | } | 3093 | } |
3092 | 3094 | ||
@@ -3113,7 +3115,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
3113 | * vma in this mm is backed by the same anon_vma or address_space. | 3115 | * vma in this mm is backed by the same anon_vma or address_space. |
3114 | * | 3116 | * |
3115 | * We can take all the locks in random order because the VM code | 3117 | * We can take all the locks in random order because the VM code |
3116 | * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never | 3118 | * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never |
3117 | * takes more than one of them in a row. Secondly we're protected | 3119 | * takes more than one of them in a row. Secondly we're protected |
3118 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. | 3120 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. |
3119 | * | 3121 | * |
@@ -3182,7 +3184,7 @@ static void vm_unlock_mapping(struct address_space *mapping) | |||
3182 | * AS_MM_ALL_LOCKS can't change to 0 from under us | 3184 | * AS_MM_ALL_LOCKS can't change to 0 from under us |
3183 | * because we hold the mm_all_locks_mutex. | 3185 | * because we hold the mm_all_locks_mutex. |
3184 | */ | 3186 | */ |
3185 | mutex_unlock(&mapping->i_mmap_mutex); | 3187 | i_mmap_unlock_write(mapping); |
3186 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, | 3188 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, |
3187 | &mapping->flags)) | 3189 | &mapping->flags)) |
3188 | BUG(); | 3190 | BUG(); |
diff --git a/mm/mremap.c b/mm/mremap.c index b147f66f4c40..84aa36f9f308 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -99,7 +99,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
99 | spinlock_t *old_ptl, *new_ptl; | 99 | spinlock_t *old_ptl, *new_ptl; |
100 | 100 | ||
101 | /* | 101 | /* |
102 | * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma | 102 | * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma |
103 | * locks to ensure that rmap will always observe either the old or the | 103 | * locks to ensure that rmap will always observe either the old or the |
104 | * new ptes. This is the easiest way to avoid races with | 104 | * new ptes. This is the easiest way to avoid races with |
105 | * truncate_pagecache(), page migration, etc... | 105 | * truncate_pagecache(), page migration, etc... |
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
119 | if (need_rmap_locks) { | 119 | if (need_rmap_locks) { |
120 | if (vma->vm_file) { | 120 | if (vma->vm_file) { |
121 | mapping = vma->vm_file->f_mapping; | 121 | mapping = vma->vm_file->f_mapping; |
122 | mutex_lock(&mapping->i_mmap_mutex); | 122 | i_mmap_lock_write(mapping); |
123 | } | 123 | } |
124 | if (vma->anon_vma) { | 124 | if (vma->anon_vma) { |
125 | anon_vma = vma->anon_vma; | 125 | anon_vma = vma->anon_vma; |
@@ -156,7 +156,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
156 | if (anon_vma) | 156 | if (anon_vma) |
157 | anon_vma_unlock_write(anon_vma); | 157 | anon_vma_unlock_write(anon_vma); |
158 | if (mapping) | 158 | if (mapping) |
159 | mutex_unlock(&mapping->i_mmap_mutex); | 159 | i_mmap_unlock_write(mapping); |
160 | } | 160 | } |
161 | 161 | ||
162 | #define LATENCY_LIMIT (64 * PAGE_SIZE) | 162 | #define LATENCY_LIMIT (64 * PAGE_SIZE) |
diff --git a/mm/nommu.c b/mm/nommu.c index bd1808e194a7..b51eadf6d952 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -722,11 +722,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
722 | if (vma->vm_file) { | 722 | if (vma->vm_file) { |
723 | mapping = vma->vm_file->f_mapping; | 723 | mapping = vma->vm_file->f_mapping; |
724 | 724 | ||
725 | mutex_lock(&mapping->i_mmap_mutex); | 725 | i_mmap_lock_write(mapping); |
726 | flush_dcache_mmap_lock(mapping); | 726 | flush_dcache_mmap_lock(mapping); |
727 | vma_interval_tree_insert(vma, &mapping->i_mmap); | 727 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
728 | flush_dcache_mmap_unlock(mapping); | 728 | flush_dcache_mmap_unlock(mapping); |
729 | mutex_unlock(&mapping->i_mmap_mutex); | 729 | i_mmap_unlock_write(mapping); |
730 | } | 730 | } |
731 | 731 | ||
732 | /* add the VMA to the tree */ | 732 | /* add the VMA to the tree */ |
@@ -795,11 +795,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
795 | if (vma->vm_file) { | 795 | if (vma->vm_file) { |
796 | mapping = vma->vm_file->f_mapping; | 796 | mapping = vma->vm_file->f_mapping; |
797 | 797 | ||
798 | mutex_lock(&mapping->i_mmap_mutex); | 798 | i_mmap_lock_write(mapping); |
799 | flush_dcache_mmap_lock(mapping); | 799 | flush_dcache_mmap_lock(mapping); |
800 | vma_interval_tree_remove(vma, &mapping->i_mmap); | 800 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
801 | flush_dcache_mmap_unlock(mapping); | 801 | flush_dcache_mmap_unlock(mapping); |
802 | mutex_unlock(&mapping->i_mmap_mutex); | 802 | i_mmap_unlock_write(mapping); |
803 | } | 803 | } |
804 | 804 | ||
805 | /* remove from the MM's tree and list */ | 805 | /* remove from the MM's tree and list */ |
@@ -1149,8 +1149,7 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1149 | unsigned long len, | 1149 | unsigned long len, |
1150 | unsigned long capabilities) | 1150 | unsigned long capabilities) |
1151 | { | 1151 | { |
1152 | struct page *pages; | 1152 | unsigned long total, point; |
1153 | unsigned long total, point, n; | ||
1154 | void *base; | 1153 | void *base; |
1155 | int ret, order; | 1154 | int ret, order; |
1156 | 1155 | ||
@@ -1182,33 +1181,23 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1182 | order = get_order(len); | 1181 | order = get_order(len); |
1183 | kdebug("alloc order %d for %lx", order, len); | 1182 | kdebug("alloc order %d for %lx", order, len); |
1184 | 1183 | ||
1185 | pages = alloc_pages(GFP_KERNEL, order); | ||
1186 | if (!pages) | ||
1187 | goto enomem; | ||
1188 | |||
1189 | total = 1 << order; | 1184 | total = 1 << order; |
1190 | atomic_long_add(total, &mmap_pages_allocated); | ||
1191 | |||
1192 | point = len >> PAGE_SHIFT; | 1185 | point = len >> PAGE_SHIFT; |
1193 | 1186 | ||
1194 | /* we allocated a power-of-2 sized page set, so we may want to trim off | 1187 | /* we don't want to allocate a power-of-2 sized page set */ |
1195 | * the excess */ | ||
1196 | if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { | 1188 | if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { |
1197 | while (total > point) { | 1189 | total = point; |
1198 | order = ilog2(total - point); | 1190 | kdebug("try to alloc exact %lu pages", total); |
1199 | n = 1 << order; | 1191 | base = alloc_pages_exact(len, GFP_KERNEL); |
1200 | kdebug("shave %lu/%lu @%lu", n, total - point, total); | 1192 | } else { |
1201 | atomic_long_sub(n, &mmap_pages_allocated); | 1193 | base = (void *)__get_free_pages(GFP_KERNEL, order); |
1202 | total -= n; | ||
1203 | set_page_refcounted(pages + total); | ||
1204 | __free_pages(pages + total, order); | ||
1205 | } | ||
1206 | } | 1194 | } |
1207 | 1195 | ||
1208 | for (point = 1; point < total; point++) | 1196 | if (!base) |
1209 | set_page_refcounted(&pages[point]); | 1197 | goto enomem; |
1198 | |||
1199 | atomic_long_add(total, &mmap_pages_allocated); | ||
1210 | 1200 | ||
1211 | base = page_address(pages); | ||
1212 | region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; | 1201 | region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; |
1213 | region->vm_start = (unsigned long) base; | 1202 | region->vm_start = (unsigned long) base; |
1214 | region->vm_end = region->vm_start + len; | 1203 | region->vm_end = region->vm_start + len; |
@@ -2094,14 +2083,14 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2094 | high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | 2083 | high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; |
2095 | 2084 | ||
2096 | down_write(&nommu_region_sem); | 2085 | down_write(&nommu_region_sem); |
2097 | mutex_lock(&inode->i_mapping->i_mmap_mutex); | 2086 | i_mmap_lock_read(inode->i_mapping); |
2098 | 2087 | ||
2099 | /* search for VMAs that fall within the dead zone */ | 2088 | /* search for VMAs that fall within the dead zone */ |
2100 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { | 2089 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { |
2101 | /* found one - only interested if it's shared out of the page | 2090 | /* found one - only interested if it's shared out of the page |
2102 | * cache */ | 2091 | * cache */ |
2103 | if (vma->vm_flags & VM_SHARED) { | 2092 | if (vma->vm_flags & VM_SHARED) { |
2104 | mutex_unlock(&inode->i_mapping->i_mmap_mutex); | 2093 | i_mmap_unlock_read(inode->i_mapping); |
2105 | up_write(&nommu_region_sem); | 2094 | up_write(&nommu_region_sem); |
2106 | return -ETXTBSY; /* not quite true, but near enough */ | 2095 | return -ETXTBSY; /* not quite true, but near enough */ |
2107 | } | 2096 | } |
@@ -2113,8 +2102,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2113 | * we don't check for any regions that start beyond the EOF as there | 2102 | * we don't check for any regions that start beyond the EOF as there |
2114 | * shouldn't be any | 2103 | * shouldn't be any |
2115 | */ | 2104 | */ |
2116 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, | 2105 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) { |
2117 | 0, ULONG_MAX) { | ||
2118 | if (!(vma->vm_flags & VM_SHARED)) | 2106 | if (!(vma->vm_flags & VM_SHARED)) |
2119 | continue; | 2107 | continue; |
2120 | 2108 | ||
@@ -2129,7 +2117,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2129 | } | 2117 | } |
2130 | } | 2118 | } |
2131 | 2119 | ||
2132 | mutex_unlock(&inode->i_mapping->i_mmap_mutex); | 2120 | i_mmap_unlock_read(inode->i_mapping); |
2133 | up_write(&nommu_region_sem); | 2121 | up_write(&nommu_region_sem); |
2134 | return 0; | 2122 | return 0; |
2135 | } | 2123 | } |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 864bba992735..d503e9ce1c7b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -281,14 +281,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
281 | if (oom_task_origin(task)) | 281 | if (oom_task_origin(task)) |
282 | return OOM_SCAN_SELECT; | 282 | return OOM_SCAN_SELECT; |
283 | 283 | ||
284 | if (task->flags & PF_EXITING && !force_kill) { | 284 | if (task_will_free_mem(task) && !force_kill) |
285 | /* | 285 | return OOM_SCAN_ABORT; |
286 | * If this task is not being ptraced on exit, then wait for it | 286 | |
287 | * to finish before killing some other task unnecessarily. | ||
288 | */ | ||
289 | if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) | ||
290 | return OOM_SCAN_ABORT; | ||
291 | } | ||
292 | return OOM_SCAN_OK; | 287 | return OOM_SCAN_OK; |
293 | } | 288 | } |
294 | 289 | ||
@@ -443,7 +438,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
443 | * If the task is already exiting, don't alarm the sysadmin or kill | 438 | * If the task is already exiting, don't alarm the sysadmin or kill |
444 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 439 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
445 | */ | 440 | */ |
446 | if (p->flags & PF_EXITING) { | 441 | if (task_will_free_mem(p)) { |
447 | set_tsk_thread_flag(p, TIF_MEMDIE); | 442 | set_tsk_thread_flag(p, TIF_MEMDIE); |
448 | put_task_struct(p); | 443 | put_task_struct(p); |
449 | return; | 444 | return; |
@@ -649,7 +644,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
649 | * select it. The goal is to allow it to allocate so that it may | 644 | * select it. The goal is to allow it to allocate so that it may |
650 | * quickly exit and free its memory. | 645 | * quickly exit and free its memory. |
651 | */ | 646 | */ |
652 | if (fatal_signal_pending(current) || current->flags & PF_EXITING) { | 647 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { |
653 | set_thread_flag(TIF_MEMDIE); | 648 | set_thread_flag(TIF_MEMDIE); |
654 | return; | 649 | return; |
655 | } | 650 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df542feaac3b..fa974d87f60d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -48,6 +48,7 @@ | |||
48 | #include <linux/backing-dev.h> | 48 | #include <linux/backing-dev.h> |
49 | #include <linux/fault-inject.h> | 49 | #include <linux/fault-inject.h> |
50 | #include <linux/page-isolation.h> | 50 | #include <linux/page-isolation.h> |
51 | #include <linux/page_ext.h> | ||
51 | #include <linux/debugobjects.h> | 52 | #include <linux/debugobjects.h> |
52 | #include <linux/kmemleak.h> | 53 | #include <linux/kmemleak.h> |
53 | #include <linux/compaction.h> | 54 | #include <linux/compaction.h> |
@@ -55,9 +56,10 @@ | |||
55 | #include <linux/prefetch.h> | 56 | #include <linux/prefetch.h> |
56 | #include <linux/mm_inline.h> | 57 | #include <linux/mm_inline.h> |
57 | #include <linux/migrate.h> | 58 | #include <linux/migrate.h> |
58 | #include <linux/page-debug-flags.h> | 59 | #include <linux/page_ext.h> |
59 | #include <linux/hugetlb.h> | 60 | #include <linux/hugetlb.h> |
60 | #include <linux/sched/rt.h> | 61 | #include <linux/sched/rt.h> |
62 | #include <linux/page_owner.h> | ||
61 | 63 | ||
62 | #include <asm/sections.h> | 64 | #include <asm/sections.h> |
63 | #include <asm/tlbflush.h> | 65 | #include <asm/tlbflush.h> |
@@ -424,6 +426,42 @@ static inline void prep_zero_page(struct page *page, unsigned int order, | |||
424 | 426 | ||
425 | #ifdef CONFIG_DEBUG_PAGEALLOC | 427 | #ifdef CONFIG_DEBUG_PAGEALLOC |
426 | unsigned int _debug_guardpage_minorder; | 428 | unsigned int _debug_guardpage_minorder; |
429 | bool _debug_pagealloc_enabled __read_mostly; | ||
430 | bool _debug_guardpage_enabled __read_mostly; | ||
431 | |||
432 | static int __init early_debug_pagealloc(char *buf) | ||
433 | { | ||
434 | if (!buf) | ||
435 | return -EINVAL; | ||
436 | |||
437 | if (strcmp(buf, "on") == 0) | ||
438 | _debug_pagealloc_enabled = true; | ||
439 | |||
440 | return 0; | ||
441 | } | ||
442 | early_param("debug_pagealloc", early_debug_pagealloc); | ||
443 | |||
444 | static bool need_debug_guardpage(void) | ||
445 | { | ||
446 | /* If we don't use debug_pagealloc, we don't need guard page */ | ||
447 | if (!debug_pagealloc_enabled()) | ||
448 | return false; | ||
449 | |||
450 | return true; | ||
451 | } | ||
452 | |||
453 | static void init_debug_guardpage(void) | ||
454 | { | ||
455 | if (!debug_pagealloc_enabled()) | ||
456 | return; | ||
457 | |||
458 | _debug_guardpage_enabled = true; | ||
459 | } | ||
460 | |||
461 | struct page_ext_operations debug_guardpage_ops = { | ||
462 | .need = need_debug_guardpage, | ||
463 | .init = init_debug_guardpage, | ||
464 | }; | ||
427 | 465 | ||
428 | static int __init debug_guardpage_minorder_setup(char *buf) | 466 | static int __init debug_guardpage_minorder_setup(char *buf) |
429 | { | 467 | { |
@@ -439,18 +477,44 @@ static int __init debug_guardpage_minorder_setup(char *buf) | |||
439 | } | 477 | } |
440 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); | 478 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); |
441 | 479 | ||
442 | static inline void set_page_guard_flag(struct page *page) | 480 | static inline void set_page_guard(struct zone *zone, struct page *page, |
481 | unsigned int order, int migratetype) | ||
443 | { | 482 | { |
444 | __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | 483 | struct page_ext *page_ext; |
484 | |||
485 | if (!debug_guardpage_enabled()) | ||
486 | return; | ||
487 | |||
488 | page_ext = lookup_page_ext(page); | ||
489 | __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); | ||
490 | |||
491 | INIT_LIST_HEAD(&page->lru); | ||
492 | set_page_private(page, order); | ||
493 | /* Guard pages are not available for any usage */ | ||
494 | __mod_zone_freepage_state(zone, -(1 << order), migratetype); | ||
445 | } | 495 | } |
446 | 496 | ||
447 | static inline void clear_page_guard_flag(struct page *page) | 497 | static inline void clear_page_guard(struct zone *zone, struct page *page, |
498 | unsigned int order, int migratetype) | ||
448 | { | 499 | { |
449 | __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | 500 | struct page_ext *page_ext; |
501 | |||
502 | if (!debug_guardpage_enabled()) | ||
503 | return; | ||
504 | |||
505 | page_ext = lookup_page_ext(page); | ||
506 | __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); | ||
507 | |||
508 | set_page_private(page, 0); | ||
509 | if (!is_migrate_isolate(migratetype)) | ||
510 | __mod_zone_freepage_state(zone, (1 << order), migratetype); | ||
450 | } | 511 | } |
451 | #else | 512 | #else |
452 | static inline void set_page_guard_flag(struct page *page) { } | 513 | struct page_ext_operations debug_guardpage_ops = { NULL, }; |
453 | static inline void clear_page_guard_flag(struct page *page) { } | 514 | static inline void set_page_guard(struct zone *zone, struct page *page, |
515 | unsigned int order, int migratetype) {} | ||
516 | static inline void clear_page_guard(struct zone *zone, struct page *page, | ||
517 | unsigned int order, int migratetype) {} | ||
454 | #endif | 518 | #endif |
455 | 519 | ||
456 | static inline void set_page_order(struct page *page, unsigned int order) | 520 | static inline void set_page_order(struct page *page, unsigned int order) |
@@ -581,12 +645,7 @@ static inline void __free_one_page(struct page *page, | |||
581 | * merge with it and move up one order. | 645 | * merge with it and move up one order. |
582 | */ | 646 | */ |
583 | if (page_is_guard(buddy)) { | 647 | if (page_is_guard(buddy)) { |
584 | clear_page_guard_flag(buddy); | 648 | clear_page_guard(zone, buddy, order, migratetype); |
585 | set_page_private(buddy, 0); | ||
586 | if (!is_migrate_isolate(migratetype)) { | ||
587 | __mod_zone_freepage_state(zone, 1 << order, | ||
588 | migratetype); | ||
589 | } | ||
590 | } else { | 649 | } else { |
591 | list_del(&buddy->lru); | 650 | list_del(&buddy->lru); |
592 | zone->free_area[order].nr_free--; | 651 | zone->free_area[order].nr_free--; |
@@ -755,6 +814,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
755 | if (bad) | 814 | if (bad) |
756 | return false; | 815 | return false; |
757 | 816 | ||
817 | reset_page_owner(page, order); | ||
818 | |||
758 | if (!PageHighMem(page)) { | 819 | if (!PageHighMem(page)) { |
759 | debug_check_no_locks_freed(page_address(page), | 820 | debug_check_no_locks_freed(page_address(page), |
760 | PAGE_SIZE << order); | 821 | PAGE_SIZE << order); |
@@ -861,23 +922,18 @@ static inline void expand(struct zone *zone, struct page *page, | |||
861 | size >>= 1; | 922 | size >>= 1; |
862 | VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); | 923 | VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); |
863 | 924 | ||
864 | #ifdef CONFIG_DEBUG_PAGEALLOC | 925 | if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && |
865 | if (high < debug_guardpage_minorder()) { | 926 | debug_guardpage_enabled() && |
927 | high < debug_guardpage_minorder()) { | ||
866 | /* | 928 | /* |
867 | * Mark as guard pages (or page), that will allow to | 929 | * Mark as guard pages (or page), that will allow to |
868 | * merge back to allocator when buddy will be freed. | 930 | * merge back to allocator when buddy will be freed. |
869 | * Corresponding page table entries will not be touched, | 931 | * Corresponding page table entries will not be touched, |
870 | * pages will stay not present in virtual address space | 932 | * pages will stay not present in virtual address space |
871 | */ | 933 | */ |
872 | INIT_LIST_HEAD(&page[size].lru); | 934 | set_page_guard(zone, &page[size], high, migratetype); |
873 | set_page_guard_flag(&page[size]); | ||
874 | set_page_private(&page[size], high); | ||
875 | /* Guard pages are not available for any usage */ | ||
876 | __mod_zone_freepage_state(zone, -(1 << high), | ||
877 | migratetype); | ||
878 | continue; | 935 | continue; |
879 | } | 936 | } |
880 | #endif | ||
881 | list_add(&page[size].lru, &area->free_list[migratetype]); | 937 | list_add(&page[size].lru, &area->free_list[migratetype]); |
882 | area->nr_free++; | 938 | area->nr_free++; |
883 | set_page_order(&page[size], high); | 939 | set_page_order(&page[size], high); |
@@ -935,6 +991,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | |||
935 | if (order && (gfp_flags & __GFP_COMP)) | 991 | if (order && (gfp_flags & __GFP_COMP)) |
936 | prep_compound_page(page, order); | 992 | prep_compound_page(page, order); |
937 | 993 | ||
994 | set_page_owner(page, order, gfp_flags); | ||
995 | |||
938 | return 0; | 996 | return 0; |
939 | } | 997 | } |
940 | 998 | ||
@@ -1507,8 +1565,11 @@ void split_page(struct page *page, unsigned int order) | |||
1507 | split_page(virt_to_page(page[0].shadow), order); | 1565 | split_page(virt_to_page(page[0].shadow), order); |
1508 | #endif | 1566 | #endif |
1509 | 1567 | ||
1510 | for (i = 1; i < (1 << order); i++) | 1568 | set_page_owner(page, 0, 0); |
1569 | for (i = 1; i < (1 << order); i++) { | ||
1511 | set_page_refcounted(page + i); | 1570 | set_page_refcounted(page + i); |
1571 | set_page_owner(page + i, 0, 0); | ||
1572 | } | ||
1512 | } | 1573 | } |
1513 | EXPORT_SYMBOL_GPL(split_page); | 1574 | EXPORT_SYMBOL_GPL(split_page); |
1514 | 1575 | ||
@@ -1548,6 +1609,7 @@ int __isolate_free_page(struct page *page, unsigned int order) | |||
1548 | } | 1609 | } |
1549 | } | 1610 | } |
1550 | 1611 | ||
1612 | set_page_owner(page, order, 0); | ||
1551 | return 1UL << order; | 1613 | return 1UL << order; |
1552 | } | 1614 | } |
1553 | 1615 | ||
@@ -4856,6 +4918,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4856 | #endif | 4918 | #endif |
4857 | init_waitqueue_head(&pgdat->kswapd_wait); | 4919 | init_waitqueue_head(&pgdat->kswapd_wait); |
4858 | init_waitqueue_head(&pgdat->pfmemalloc_wait); | 4920 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4921 | pgdat_page_ext_init(pgdat); | ||
4859 | 4922 | ||
4860 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4923 | for (j = 0; j < MAX_NR_ZONES; j++) { |
4861 | struct zone *zone = pgdat->node_zones + j; | 4924 | struct zone *zone = pgdat->node_zones + j; |
@@ -4874,16 +4937,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4874 | * and per-cpu initialisations | 4937 | * and per-cpu initialisations |
4875 | */ | 4938 | */ |
4876 | memmap_pages = calc_memmap_size(size, realsize); | 4939 | memmap_pages = calc_memmap_size(size, realsize); |
4877 | if (freesize >= memmap_pages) { | 4940 | if (!is_highmem_idx(j)) { |
4878 | freesize -= memmap_pages; | 4941 | if (freesize >= memmap_pages) { |
4879 | if (memmap_pages) | 4942 | freesize -= memmap_pages; |
4880 | printk(KERN_DEBUG | 4943 | if (memmap_pages) |
4881 | " %s zone: %lu pages used for memmap\n", | 4944 | printk(KERN_DEBUG |
4882 | zone_names[j], memmap_pages); | 4945 | " %s zone: %lu pages used for memmap\n", |
4883 | } else | 4946 | zone_names[j], memmap_pages); |
4884 | printk(KERN_WARNING | 4947 | } else |
4885 | " %s zone: %lu pages exceeds freesize %lu\n", | 4948 | printk(KERN_WARNING |
4886 | zone_names[j], memmap_pages, freesize); | 4949 | " %s zone: %lu pages exceeds freesize %lu\n", |
4950 | zone_names[j], memmap_pages, freesize); | ||
4951 | } | ||
4887 | 4952 | ||
4888 | /* Account for reserved pages */ | 4953 | /* Account for reserved pages */ |
4889 | if (j == 0 && freesize > dma_reserve) { | 4954 | if (j == 0 && freesize > dma_reserve) { |
@@ -6221,9 +6286,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | |||
6221 | if (!PageLRU(page)) | 6286 | if (!PageLRU(page)) |
6222 | found++; | 6287 | found++; |
6223 | /* | 6288 | /* |
6224 | * If there are RECLAIMABLE pages, we need to check it. | 6289 | * If there are RECLAIMABLE pages, we need to check |
6225 | * But now, memory offline itself doesn't call shrink_slab() | 6290 | * it. But now, memory offline itself doesn't call |
6226 | * and it still to be fixed. | 6291 | * shrink_node_slabs() and it still needs to be fixed. |
6227 | */ | 6292 | */ |
6228 | /* | 6293 | /* |
6229 | * If the page is not RAM, page_count() should be 0. | 6294 | * If the page is not RAM, page_count() should be 0. |
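The set_page_guard()/clear_page_guard() rework above moves the guard-page marker out of page->debug_flags and into the page extension. For orientation, a minimal sketch of the matching page_is_guard() test that __free_one_page() relies on; the real definition lives in a header outside this hunk, so this body is illustrative rather than the exact mainline code:

	static inline bool page_is_guard(struct page *page)
	{
		struct page_ext *page_ext;

		/* Guard pages only exist when guard-page debugging is enabled */
		if (!debug_guardpage_enabled())
			return false;

		/* The guard bit now lives in the page extension, not in struct page */
		page_ext = lookup_page_ext(page);
		return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
	}

Because the flag sits in page_ext, struct page itself no longer carries any guard-page state when debug_pagealloc is left off.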
diff --git a/mm/page_ext.c b/mm/page_ext.c new file mode 100644 index 000000000000..d86fd2f5353f --- /dev/null +++ b/mm/page_ext.c | |||
@@ -0,0 +1,403 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <linux/mmzone.h> | ||
3 | #include <linux/bootmem.h> | ||
4 | #include <linux/page_ext.h> | ||
5 | #include <linux/memory.h> | ||
6 | #include <linux/vmalloc.h> | ||
7 | #include <linux/kmemleak.h> | ||
8 | #include <linux/page_owner.h> | ||
9 | |||
10 | /* | ||
11 | * struct page extension | ||
12 | * | ||
13 | * This is the feature to manage memory for extended data per page. | ||
14 | * | ||
15 | * Until now, we had to modify struct page itself to store extra data per page. | ||
16 | * This requires rebuilding the kernel, which is a really time consuming process. | ||
17 | * And, sometimes, a rebuild is impossible due to third party module dependencies. | ||
18 | * Finally, enlarging struct page could cause unwanted system behaviour changes. | ||
19 | * | ||
20 | * This feature is intended to overcome the above mentioned problems. It | ||
21 | * allocates memory for extended data per page in a separate place rather than | ||
22 | * in struct page itself. This memory can be accessed by the accessor | ||
23 | * functions provided by this code. During the boot process, it checks whether | ||
24 | * allocation of this huge chunk of memory is needed or not. If not, it avoids | ||
25 | * allocating memory at all. With this advantage, we can include this feature | ||
26 | * in the kernel by default and can avoid rebuilds and the related problems. | ||
27 | * | ||
28 | * To make this work well, there are two callbacks for clients. One | ||
29 | * is the need callback, which is mandatory if the user wants to avoid useless | ||
30 | * memory allocation at boot-time. The other is the optional init callback, which | ||
31 | * is used to do proper initialization after memory is allocated. | ||
32 | * | ||
33 | * The need callback is used to decide whether extended memory allocation is | ||
34 | * needed or not. Sometimes users want to deactivate some features for this | ||
35 | * boot, making the extra memory unnecessary. In this case, to avoid | ||
36 | * allocating a huge chunk of memory, each client declares its need for | ||
37 | * extra memory through the need callback. If one of the need callbacks | ||
38 | * returns true, it means that someone needs extra memory and the | ||
39 | * page extension core should allocate memory for page extension. If | ||
40 | * none of the need callbacks returns true, memory isn't needed at all for this | ||
41 | * boot and the page extension core can skip the allocation. As a result, | ||
42 | * no memory is wasted. | ||
43 | * | ||
44 | * The init callback is used to do proper initialization after page extension | ||
45 | * is completely initialized. On sparse memory systems, the extra memory is | ||
46 | * allocated some time later than the memmap is allocated. In other words, the | ||
47 | * lifetime of the memory for page extension isn't the same as that of the | ||
48 | * memmap for struct page. Therefore, clients can't store extra data until page | ||
49 | * extension is initialized, even if pages are already allocated and freely | ||
50 | * used. This could leave the extra data per page in an inadequate state, so, | ||
51 | * to prevent that, a client can use this callback to initialize it correctly. | ||
52 | */ | ||
53 | |||
54 | static struct page_ext_operations *page_ext_ops[] = { | ||
55 | &debug_guardpage_ops, | ||
56 | #ifdef CONFIG_PAGE_POISONING | ||
57 | &page_poisoning_ops, | ||
58 | #endif | ||
59 | #ifdef CONFIG_PAGE_OWNER | ||
60 | &page_owner_ops, | ||
61 | #endif | ||
62 | }; | ||
63 | |||
64 | static unsigned long total_usage; | ||
65 | |||
66 | static bool __init invoke_need_callbacks(void) | ||
67 | { | ||
68 | int i; | ||
69 | int entries = ARRAY_SIZE(page_ext_ops); | ||
70 | |||
71 | for (i = 0; i < entries; i++) { | ||
72 | if (page_ext_ops[i]->need && page_ext_ops[i]->need()) | ||
73 | return true; | ||
74 | } | ||
75 | |||
76 | return false; | ||
77 | } | ||
78 | |||
79 | static void __init invoke_init_callbacks(void) | ||
80 | { | ||
81 | int i; | ||
82 | int entries = ARRAY_SIZE(page_ext_ops); | ||
83 | |||
84 | for (i = 0; i < entries; i++) { | ||
85 | if (page_ext_ops[i]->init) | ||
86 | page_ext_ops[i]->init(); | ||
87 | } | ||
88 | } | ||
89 | |||
90 | #if !defined(CONFIG_SPARSEMEM) | ||
91 | |||
92 | |||
93 | void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) | ||
94 | { | ||
95 | pgdat->node_page_ext = NULL; | ||
96 | } | ||
97 | |||
98 | struct page_ext *lookup_page_ext(struct page *page) | ||
99 | { | ||
100 | unsigned long pfn = page_to_pfn(page); | ||
101 | unsigned long offset; | ||
102 | struct page_ext *base; | ||
103 | |||
104 | base = NODE_DATA(page_to_nid(page))->node_page_ext; | ||
105 | #ifdef CONFIG_DEBUG_VM | ||
106 | /* | ||
107 | * The sanity checks the page allocator does upon freeing a | ||
108 | * page can reach here before the page_ext arrays are | ||
109 | * allocated when feeding a range of pages to the allocator | ||
110 | * for the first time during bootup or memory hotplug. | ||
111 | */ | ||
112 | if (unlikely(!base)) | ||
113 | return NULL; | ||
114 | #endif | ||
115 | offset = pfn - round_down(node_start_pfn(page_to_nid(page)), | ||
116 | MAX_ORDER_NR_PAGES); | ||
117 | return base + offset; | ||
118 | } | ||
119 | |||
120 | static int __init alloc_node_page_ext(int nid) | ||
121 | { | ||
122 | struct page_ext *base; | ||
123 | unsigned long table_size; | ||
124 | unsigned long nr_pages; | ||
125 | |||
126 | nr_pages = NODE_DATA(nid)->node_spanned_pages; | ||
127 | if (!nr_pages) | ||
128 | return 0; | ||
129 | |||
130 | /* | ||
131 | * Need extra space if node range is not aligned with | ||
132 | * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm | ||
133 | * checks buddy's status, range could be out of exact node range. | ||
134 | */ | ||
135 | if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) || | ||
136 | !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) | ||
137 | nr_pages += MAX_ORDER_NR_PAGES; | ||
138 | |||
139 | table_size = sizeof(struct page_ext) * nr_pages; | ||
140 | |||
141 | base = memblock_virt_alloc_try_nid_nopanic( | ||
142 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), | ||
143 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
144 | if (!base) | ||
145 | return -ENOMEM; | ||
146 | NODE_DATA(nid)->node_page_ext = base; | ||
147 | total_usage += table_size; | ||
148 | return 0; | ||
149 | } | ||
150 | |||
151 | void __init page_ext_init_flatmem(void) | ||
152 | { | ||
153 | |||
154 | int nid, fail; | ||
155 | |||
156 | if (!invoke_need_callbacks()) | ||
157 | return; | ||
158 | |||
159 | for_each_online_node(nid) { | ||
160 | fail = alloc_node_page_ext(nid); | ||
161 | if (fail) | ||
162 | goto fail; | ||
163 | } | ||
164 | pr_info("allocated %ld bytes of page_ext\n", total_usage); | ||
165 | invoke_init_callbacks(); | ||
166 | return; | ||
167 | |||
168 | fail: | ||
169 | pr_crit("allocation of page_ext failed.\n"); | ||
170 | panic("Out of memory"); | ||
171 | } | ||
172 | |||
173 | #else /* CONFIG_FLAT_NODE_MEM_MAP */ | ||
174 | |||
175 | struct page_ext *lookup_page_ext(struct page *page) | ||
176 | { | ||
177 | unsigned long pfn = page_to_pfn(page); | ||
178 | struct mem_section *section = __pfn_to_section(pfn); | ||
179 | #ifdef CONFIG_DEBUG_VM | ||
180 | /* | ||
181 | * The sanity checks the page allocator does upon freeing a | ||
182 | * page can reach here before the page_ext arrays are | ||
183 | * allocated when feeding a range of pages to the allocator | ||
184 | * for the first time during bootup or memory hotplug. | ||
185 | */ | ||
186 | if (!section->page_ext) | ||
187 | return NULL; | ||
188 | #endif | ||
189 | return section->page_ext + pfn; | ||
190 | } | ||
191 | |||
192 | static void *__meminit alloc_page_ext(size_t size, int nid) | ||
193 | { | ||
194 | gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; | ||
195 | void *addr = NULL; | ||
196 | |||
197 | addr = alloc_pages_exact_nid(nid, size, flags); | ||
198 | if (addr) { | ||
199 | kmemleak_alloc(addr, size, 1, flags); | ||
200 | return addr; | ||
201 | } | ||
202 | |||
203 | if (node_state(nid, N_HIGH_MEMORY)) | ||
204 | addr = vzalloc_node(size, nid); | ||
205 | else | ||
206 | addr = vzalloc(size); | ||
207 | |||
208 | return addr; | ||
209 | } | ||
210 | |||
211 | static int __meminit init_section_page_ext(unsigned long pfn, int nid) | ||
212 | { | ||
213 | struct mem_section *section; | ||
214 | struct page_ext *base; | ||
215 | unsigned long table_size; | ||
216 | |||
217 | section = __pfn_to_section(pfn); | ||
218 | |||
219 | if (section->page_ext) | ||
220 | return 0; | ||
221 | |||
222 | table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; | ||
223 | base = alloc_page_ext(table_size, nid); | ||
224 | |||
225 | /* | ||
226 | * The value stored in section->page_ext is (base - pfn) | ||
227 | * and it does not point to the memory block allocated above, | ||
228 | * causing kmemleak false positives. | ||
229 | */ | ||
230 | kmemleak_not_leak(base); | ||
231 | |||
232 | if (!base) { | ||
233 | pr_err("page ext allocation failure\n"); | ||
234 | return -ENOMEM; | ||
235 | } | ||
236 | |||
237 | /* | ||
238 | * The passed "pfn" may not be aligned to SECTION. For the calculation | ||
239 | * we need to apply a mask. | ||
240 | */ | ||
241 | pfn &= PAGE_SECTION_MASK; | ||
242 | section->page_ext = base - pfn; | ||
243 | total_usage += table_size; | ||
244 | return 0; | ||
245 | } | ||
246 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
247 | static void free_page_ext(void *addr) | ||
248 | { | ||
249 | if (is_vmalloc_addr(addr)) { | ||
250 | vfree(addr); | ||
251 | } else { | ||
252 | struct page *page = virt_to_page(addr); | ||
253 | size_t table_size; | ||
254 | |||
255 | table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; | ||
256 | |||
257 | BUG_ON(PageReserved(page)); | ||
258 | free_pages_exact(addr, table_size); | ||
259 | } | ||
260 | } | ||
261 | |||
262 | static void __free_page_ext(unsigned long pfn) | ||
263 | { | ||
264 | struct mem_section *ms; | ||
265 | struct page_ext *base; | ||
266 | |||
267 | ms = __pfn_to_section(pfn); | ||
268 | if (!ms || !ms->page_ext) | ||
269 | return; | ||
270 | base = ms->page_ext + pfn; | ||
271 | free_page_ext(base); | ||
272 | ms->page_ext = NULL; | ||
273 | } | ||
274 | |||
275 | static int __meminit online_page_ext(unsigned long start_pfn, | ||
276 | unsigned long nr_pages, | ||
277 | int nid) | ||
278 | { | ||
279 | unsigned long start, end, pfn; | ||
280 | int fail = 0; | ||
281 | |||
282 | start = SECTION_ALIGN_DOWN(start_pfn); | ||
283 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); | ||
284 | |||
285 | if (nid == -1) { | ||
286 | /* | ||
287 | * In this case, "nid" already exists and contains valid memory. | ||
288 | * "start_pfn" passed to us is a pfn which is an arg for | ||
289 | * online_pages(), and start_pfn should exist. | ||
290 | */ | ||
291 | nid = pfn_to_nid(start_pfn); | ||
292 | VM_BUG_ON(!node_state(nid, N_ONLINE)); | ||
293 | } | ||
294 | |||
295 | for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { | ||
296 | if (!pfn_present(pfn)) | ||
297 | continue; | ||
298 | fail = init_section_page_ext(pfn, nid); | ||
299 | } | ||
300 | if (!fail) | ||
301 | return 0; | ||
302 | |||
303 | /* rollback */ | ||
304 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
305 | __free_page_ext(pfn); | ||
306 | |||
307 | return -ENOMEM; | ||
308 | } | ||
309 | |||
310 | static int __meminit offline_page_ext(unsigned long start_pfn, | ||
311 | unsigned long nr_pages, int nid) | ||
312 | { | ||
313 | unsigned long start, end, pfn; | ||
314 | |||
315 | start = SECTION_ALIGN_DOWN(start_pfn); | ||
316 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); | ||
317 | |||
318 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
319 | __free_page_ext(pfn); | ||
320 | return 0; | ||
321 | |||
322 | } | ||
323 | |||
324 | static int __meminit page_ext_callback(struct notifier_block *self, | ||
325 | unsigned long action, void *arg) | ||
326 | { | ||
327 | struct memory_notify *mn = arg; | ||
328 | int ret = 0; | ||
329 | |||
330 | switch (action) { | ||
331 | case MEM_GOING_ONLINE: | ||
332 | ret = online_page_ext(mn->start_pfn, | ||
333 | mn->nr_pages, mn->status_change_nid); | ||
334 | break; | ||
335 | case MEM_OFFLINE: | ||
336 | offline_page_ext(mn->start_pfn, | ||
337 | mn->nr_pages, mn->status_change_nid); | ||
338 | break; | ||
339 | case MEM_CANCEL_ONLINE: | ||
340 | offline_page_ext(mn->start_pfn, | ||
341 | mn->nr_pages, mn->status_change_nid); | ||
342 | break; | ||
343 | case MEM_GOING_OFFLINE: | ||
344 | break; | ||
345 | case MEM_ONLINE: | ||
346 | case MEM_CANCEL_OFFLINE: | ||
347 | break; | ||
348 | } | ||
349 | |||
350 | return notifier_from_errno(ret); | ||
351 | } | ||
352 | |||
353 | #endif | ||
354 | |||
355 | void __init page_ext_init(void) | ||
356 | { | ||
357 | unsigned long pfn; | ||
358 | int nid; | ||
359 | |||
360 | if (!invoke_need_callbacks()) | ||
361 | return; | ||
362 | |||
363 | for_each_node_state(nid, N_MEMORY) { | ||
364 | unsigned long start_pfn, end_pfn; | ||
365 | |||
366 | start_pfn = node_start_pfn(nid); | ||
367 | end_pfn = node_end_pfn(nid); | ||
368 | /* | ||
369 | * start_pfn and end_pfn may not be aligned to SECTION and the | ||
370 | * page->flags of out-of-node pages are not initialized. So we | ||
371 | * scan [start_pfn, the biggest section's pfn < end_pfn) here. | ||
372 | */ | ||
373 | for (pfn = start_pfn; pfn < end_pfn; | ||
374 | pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { | ||
375 | |||
376 | if (!pfn_valid(pfn)) | ||
377 | continue; | ||
378 | /* | ||
379 | * Nodes' pfns can overlap. | ||
380 | * We know some arches can have a node layout such as | ||
381 | * -------------pfn--------------> | ||
382 | * N0 | N1 | N2 | N0 | N1 | N2|.... | ||
383 | */ | ||
384 | if (pfn_to_nid(pfn) != nid) | ||
385 | continue; | ||
386 | if (init_section_page_ext(pfn, nid)) | ||
387 | goto oom; | ||
388 | } | ||
389 | } | ||
390 | hotplug_memory_notifier(page_ext_callback, 0); | ||
391 | pr_info("allocated %ld bytes of page_ext\n", total_usage); | ||
392 | invoke_init_callbacks(); | ||
393 | return; | ||
394 | |||
395 | oom: | ||
396 | panic("Out of memory"); | ||
397 | } | ||
398 | |||
399 | void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) | ||
400 | { | ||
401 | } | ||
402 | |||
403 | #endif | ||
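The header comment in page_ext.c above spells out the need/init callback contract. A minimal sketch of a hypothetical client, assuming only the page_ext_operations interface shown above; every my_debug_* name is illustrative and not part of this patch:

	#include <linux/init.h>
	#include <linux/page_ext.h>
	#include <linux/string.h>

	static bool my_debug_enabled __read_mostly;

	static int __init early_my_debug(char *buf)
	{
		/* Opt in with "my_debug=on"; otherwise no page_ext memory is requested */
		if (buf && strcmp(buf, "on") == 0)
			my_debug_enabled = true;
		return 0;
	}
	early_param("my_debug", early_my_debug);

	static bool need_my_debug(void)
	{
		/* Returning false lets the page_ext core skip the allocation entirely */
		return my_debug_enabled;
	}

	static void init_my_debug(void)
	{
		/* Called once page_ext storage exists; per-page flags may be used now */
	}

	struct page_ext_operations my_debug_ops = {
		.need = need_my_debug,
		.init = init_my_debug,
	};

Such a client would also need an entry in the page_ext_ops[] array above; only when at least one need callback returns true does invoke_need_callbacks() let the core allocate the lookup tables.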
diff --git a/mm/page_owner.c b/mm/page_owner.c new file mode 100644 index 000000000000..9ab4a9b5bc09 --- /dev/null +++ b/mm/page_owner.c | |||
@@ -0,0 +1,311 @@ | |||
1 | #include <linux/debugfs.h> | ||
2 | #include <linux/mm.h> | ||
3 | #include <linux/slab.h> | ||
4 | #include <linux/uaccess.h> | ||
5 | #include <linux/bootmem.h> | ||
6 | #include <linux/stacktrace.h> | ||
7 | #include <linux/page_owner.h> | ||
8 | #include "internal.h" | ||
9 | |||
10 | static bool page_owner_disabled = true; | ||
11 | bool page_owner_inited __read_mostly; | ||
12 | |||
13 | static void init_early_allocated_pages(void); | ||
14 | |||
15 | static int early_page_owner_param(char *buf) | ||
16 | { | ||
17 | if (!buf) | ||
18 | return -EINVAL; | ||
19 | |||
20 | if (strcmp(buf, "on") == 0) | ||
21 | page_owner_disabled = false; | ||
22 | |||
23 | return 0; | ||
24 | } | ||
25 | early_param("page_owner", early_page_owner_param); | ||
26 | |||
27 | static bool need_page_owner(void) | ||
28 | { | ||
29 | if (page_owner_disabled) | ||
30 | return false; | ||
31 | |||
32 | return true; | ||
33 | } | ||
34 | |||
35 | static void init_page_owner(void) | ||
36 | { | ||
37 | if (page_owner_disabled) | ||
38 | return; | ||
39 | |||
40 | page_owner_inited = true; | ||
41 | init_early_allocated_pages(); | ||
42 | } | ||
43 | |||
44 | struct page_ext_operations page_owner_ops = { | ||
45 | .need = need_page_owner, | ||
46 | .init = init_page_owner, | ||
47 | }; | ||
48 | |||
49 | void __reset_page_owner(struct page *page, unsigned int order) | ||
50 | { | ||
51 | int i; | ||
52 | struct page_ext *page_ext; | ||
53 | |||
54 | for (i = 0; i < (1 << order); i++) { | ||
55 | page_ext = lookup_page_ext(page + i); | ||
56 | __clear_bit(PAGE_EXT_OWNER, &page_ext->flags); | ||
57 | } | ||
58 | } | ||
59 | |||
60 | void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) | ||
61 | { | ||
62 | struct page_ext *page_ext; | ||
63 | struct stack_trace *trace; | ||
64 | |||
65 | page_ext = lookup_page_ext(page); | ||
66 | |||
67 | trace = &page_ext->trace; | ||
68 | trace->nr_entries = 0; | ||
69 | trace->max_entries = ARRAY_SIZE(page_ext->trace_entries); | ||
70 | trace->entries = &page_ext->trace_entries[0]; | ||
71 | trace->skip = 3; | ||
72 | save_stack_trace(&page_ext->trace); | ||
73 | |||
74 | page_ext->order = order; | ||
75 | page_ext->gfp_mask = gfp_mask; | ||
76 | |||
77 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); | ||
78 | } | ||
79 | |||
80 | static ssize_t | ||
81 | print_page_owner(char __user *buf, size_t count, unsigned long pfn, | ||
82 | struct page *page, struct page_ext *page_ext) | ||
83 | { | ||
84 | int ret; | ||
85 | int pageblock_mt, page_mt; | ||
86 | char *kbuf; | ||
87 | |||
88 | kbuf = kmalloc(count, GFP_KERNEL); | ||
89 | if (!kbuf) | ||
90 | return -ENOMEM; | ||
91 | |||
92 | ret = snprintf(kbuf, count, | ||
93 | "Page allocated via order %u, mask 0x%x\n", | ||
94 | page_ext->order, page_ext->gfp_mask); | ||
95 | |||
96 | if (ret >= count) | ||
97 | goto err; | ||
98 | |||
99 | /* Print information relevant to grouping pages by mobility */ | ||
100 | pageblock_mt = get_pfnblock_migratetype(page, pfn); | ||
101 | page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); | ||
102 | ret += snprintf(kbuf + ret, count - ret, | ||
103 | "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", | ||
104 | pfn, | ||
105 | pfn >> pageblock_order, | ||
106 | pageblock_mt, | ||
107 | pageblock_mt != page_mt ? "Fallback" : " ", | ||
108 | PageLocked(page) ? "K" : " ", | ||
109 | PageError(page) ? "E" : " ", | ||
110 | PageReferenced(page) ? "R" : " ", | ||
111 | PageUptodate(page) ? "U" : " ", | ||
112 | PageDirty(page) ? "D" : " ", | ||
113 | PageLRU(page) ? "L" : " ", | ||
114 | PageActive(page) ? "A" : " ", | ||
115 | PageSlab(page) ? "S" : " ", | ||
116 | PageWriteback(page) ? "W" : " ", | ||
117 | PageCompound(page) ? "C" : " ", | ||
118 | PageSwapCache(page) ? "B" : " ", | ||
119 | PageMappedToDisk(page) ? "M" : " "); | ||
120 | |||
121 | if (ret >= count) | ||
122 | goto err; | ||
123 | |||
124 | ret += snprint_stack_trace(kbuf + ret, count - ret, | ||
125 | &page_ext->trace, 0); | ||
126 | if (ret >= count) | ||
127 | goto err; | ||
128 | |||
129 | ret += snprintf(kbuf + ret, count - ret, "\n"); | ||
130 | if (ret >= count) | ||
131 | goto err; | ||
132 | |||
133 | if (copy_to_user(buf, kbuf, ret)) | ||
134 | ret = -EFAULT; | ||
135 | |||
136 | kfree(kbuf); | ||
137 | return ret; | ||
138 | |||
139 | err: | ||
140 | kfree(kbuf); | ||
141 | return -ENOMEM; | ||
142 | } | ||
143 | |||
144 | static ssize_t | ||
145 | read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) | ||
146 | { | ||
147 | unsigned long pfn; | ||
148 | struct page *page; | ||
149 | struct page_ext *page_ext; | ||
150 | |||
151 | if (!page_owner_inited) | ||
152 | return -EINVAL; | ||
153 | |||
154 | page = NULL; | ||
155 | pfn = min_low_pfn + *ppos; | ||
156 | |||
157 | /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ | ||
158 | while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) | ||
159 | pfn++; | ||
160 | |||
161 | drain_all_pages(NULL); | ||
162 | |||
163 | /* Find an allocated page */ | ||
164 | for (; pfn < max_pfn; pfn++) { | ||
165 | /* | ||
166 | * If the new page is in a new MAX_ORDER_NR_PAGES area, | ||
167 | * validate the area as existing, skip it if not | ||
168 | */ | ||
169 | if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) { | ||
170 | pfn += MAX_ORDER_NR_PAGES - 1; | ||
171 | continue; | ||
172 | } | ||
173 | |||
174 | /* Check for holes within a MAX_ORDER area */ | ||
175 | if (!pfn_valid_within(pfn)) | ||
176 | continue; | ||
177 | |||
178 | page = pfn_to_page(pfn); | ||
179 | if (PageBuddy(page)) { | ||
180 | unsigned long freepage_order = page_order_unsafe(page); | ||
181 | |||
182 | if (freepage_order < MAX_ORDER) | ||
183 | pfn += (1UL << freepage_order) - 1; | ||
184 | continue; | ||
185 | } | ||
186 | |||
187 | page_ext = lookup_page_ext(page); | ||
188 | |||
189 | /* | ||
190 | * Some pages could be missed by concurrent allocation or free, | ||
191 | * because we don't hold the zone lock. | ||
192 | */ | ||
193 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | ||
194 | continue; | ||
195 | |||
196 | /* Record the next PFN to read in the file offset */ | ||
197 | *ppos = (pfn - min_low_pfn) + 1; | ||
198 | |||
199 | return print_page_owner(buf, count, pfn, page, page_ext); | ||
200 | } | ||
201 | |||
202 | return 0; | ||
203 | } | ||
204 | |||
205 | static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) | ||
206 | { | ||
207 | struct page *page; | ||
208 | struct page_ext *page_ext; | ||
209 | unsigned long pfn = zone->zone_start_pfn, block_end_pfn; | ||
210 | unsigned long end_pfn = pfn + zone->spanned_pages; | ||
211 | unsigned long count = 0; | ||
212 | |||
213 | /* Scan block by block. First and last block may be incomplete */ | ||
214 | pfn = zone->zone_start_pfn; | ||
215 | |||
216 | /* | ||
217 | * Walk the zone in pageblock_nr_pages steps. If a page block spans | ||
218 | * a zone boundary, it will be double counted between zones. This does | ||
219 | * not matter as the mixed block count will still be correct | ||
220 | */ | ||
221 | for (; pfn < end_pfn; ) { | ||
222 | if (!pfn_valid(pfn)) { | ||
223 | pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); | ||
224 | continue; | ||
225 | } | ||
226 | |||
227 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | ||
228 | block_end_pfn = min(block_end_pfn, end_pfn); | ||
229 | |||
230 | page = pfn_to_page(pfn); | ||
231 | |||
232 | for (; pfn < block_end_pfn; pfn++) { | ||
233 | if (!pfn_valid_within(pfn)) | ||
234 | continue; | ||
235 | |||
236 | page = pfn_to_page(pfn); | ||
237 | |||
238 | /* | ||
239 | * We are safe to check buddy flag and order, because | ||
240 | * this is the init stage and only a single thread runs. | ||
241 | */ | ||
242 | if (PageBuddy(page)) { | ||
243 | pfn += (1UL << page_order(page)) - 1; | ||
244 | continue; | ||
245 | } | ||
246 | |||
247 | if (PageReserved(page)) | ||
248 | continue; | ||
249 | |||
250 | page_ext = lookup_page_ext(page); | ||
251 | |||
252 | /* Maybe overlapping zone */ | ||
253 | if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | ||
254 | continue; | ||
255 | |||
256 | /* Found early allocated page */ | ||
257 | set_page_owner(page, 0, 0); | ||
258 | count++; | ||
259 | } | ||
260 | } | ||
261 | |||
262 | pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", | ||
263 | pgdat->node_id, zone->name, count); | ||
264 | } | ||
265 | |||
266 | static void init_zones_in_node(pg_data_t *pgdat) | ||
267 | { | ||
268 | struct zone *zone; | ||
269 | struct zone *node_zones = pgdat->node_zones; | ||
270 | unsigned long flags; | ||
271 | |||
272 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
273 | if (!populated_zone(zone)) | ||
274 | continue; | ||
275 | |||
276 | spin_lock_irqsave(&zone->lock, flags); | ||
277 | init_pages_in_zone(pgdat, zone); | ||
278 | spin_unlock_irqrestore(&zone->lock, flags); | ||
279 | } | ||
280 | } | ||
281 | |||
282 | static void init_early_allocated_pages(void) | ||
283 | { | ||
284 | pg_data_t *pgdat; | ||
285 | |||
286 | drain_all_pages(NULL); | ||
287 | for_each_online_pgdat(pgdat) | ||
288 | init_zones_in_node(pgdat); | ||
289 | } | ||
290 | |||
291 | static const struct file_operations proc_page_owner_operations = { | ||
292 | .read = read_page_owner, | ||
293 | }; | ||
294 | |||
295 | static int __init pageowner_init(void) | ||
296 | { | ||
297 | struct dentry *dentry; | ||
298 | |||
299 | if (!page_owner_inited) { | ||
300 | pr_info("page_owner is disabled\n"); | ||
301 | return 0; | ||
302 | } | ||
303 | |||
304 | dentry = debugfs_create_file("page_owner", S_IRUSR, NULL, | ||
305 | NULL, &proc_page_owner_operations); | ||
306 | if (IS_ERR(dentry)) | ||
307 | return PTR_ERR(dentry); | ||
308 | |||
309 | return 0; | ||
310 | } | ||
311 | module_init(pageowner_init) | ||
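
With the debugfs file registered, the records can be pulled from userspace by repeatedly read()ing the same descriptor: as read_page_owner() above shows, each successful read returns one record and stores the next PFN to visit in the file offset, so a plain read loop walks the whole of memory. A minimal consumer, assuming debugfs is mounted at /sys/kernel/debug:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[8192];
        ssize_t n;
        int fd = open("/sys/kernel/debug/page_owner", O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Each read() yields one allocation record; the kernel advances
         * the offset to the next owned page, so loop until EOF. */
        while ((n = read(fd, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, n, stdout);

        close(fd);
        return 0;
}
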
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -23,7 +23,7 @@ | |||
23 | * inode->i_mutex (while writing or truncating, not reading or faulting) | 23 | * inode->i_mutex (while writing or truncating, not reading or faulting) |
24 | * mm->mmap_sem | 24 | * mm->mmap_sem |
25 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
26 | * mapping->i_mmap_mutex | 26 | * mapping->i_mmap_rwsem |
27 | * anon_vma->rwsem | 27 | * anon_vma->rwsem |
28 | * mm->page_table_lock or pte_lock | 28 | * mm->page_table_lock or pte_lock |
29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
@@ -1260,7 +1260,7 @@ out_mlock: | |||
1260 | /* | 1260 | /* |
1261 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes | 1261 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes |
1262 | * unstable result and race. Plus, We can't wait here because | 1262 | * unstable result and race. Plus, We can't wait here because |
1263 | * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. | 1263 | * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem. |
1264 | * if trylock failed, the page remain in evictable lru and later | 1264 | * if trylock failed, the page remain in evictable lru and later |
1265 | * vmscan could retry to move the page to unevictable lru if the | 1265 | * vmscan could retry to move the page to unevictable lru if the |
1266 | * page is actually mlocked. | 1266 | * page is actually mlocked. |
@@ -1635,7 +1635,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, | |||
1635 | static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) | 1635 | static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) |
1636 | { | 1636 | { |
1637 | struct anon_vma *anon_vma; | 1637 | struct anon_vma *anon_vma; |
1638 | pgoff_t pgoff = page_to_pgoff(page); | 1638 | pgoff_t pgoff; |
1639 | struct anon_vma_chain *avc; | 1639 | struct anon_vma_chain *avc; |
1640 | int ret = SWAP_AGAIN; | 1640 | int ret = SWAP_AGAIN; |
1641 | 1641 | ||
@@ -1643,6 +1643,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) | |||
1643 | if (!anon_vma) | 1643 | if (!anon_vma) |
1644 | return ret; | 1644 | return ret; |
1645 | 1645 | ||
1646 | pgoff = page_to_pgoff(page); | ||
1646 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1647 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1647 | struct vm_area_struct *vma = avc->vma; | 1648 | struct vm_area_struct *vma = avc->vma; |
1648 | unsigned long address = vma_address(page, vma); | 1649 | unsigned long address = vma_address(page, vma); |
@@ -1676,7 +1677,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) | |||
1676 | static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | 1677 | static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) |
1677 | { | 1678 | { |
1678 | struct address_space *mapping = page->mapping; | 1679 | struct address_space *mapping = page->mapping; |
1679 | pgoff_t pgoff = page_to_pgoff(page); | 1680 | pgoff_t pgoff; |
1680 | struct vm_area_struct *vma; | 1681 | struct vm_area_struct *vma; |
1681 | int ret = SWAP_AGAIN; | 1682 | int ret = SWAP_AGAIN; |
1682 | 1683 | ||
@@ -1684,13 +1685,15 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | |||
1684 | * The page lock not only makes sure that page->mapping cannot | 1685 | * The page lock not only makes sure that page->mapping cannot |
1685 | * suddenly be NULLified by truncation, it makes sure that the | 1686 | * suddenly be NULLified by truncation, it makes sure that the |
1686 | * structure at mapping cannot be freed and reused yet, | 1687 | * structure at mapping cannot be freed and reused yet, |
1687 | * so we can safely take mapping->i_mmap_mutex. | 1688 | * so we can safely take mapping->i_mmap_rwsem. |
1688 | */ | 1689 | */ |
1689 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 1690 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
1690 | 1691 | ||
1691 | if (!mapping) | 1692 | if (!mapping) |
1692 | return ret; | 1693 | return ret; |
1693 | mutex_lock(&mapping->i_mmap_mutex); | 1694 | |
1695 | pgoff = page_to_pgoff(page); | ||
1696 | i_mmap_lock_read(mapping); | ||
1694 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 1697 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1695 | unsigned long address = vma_address(page, vma); | 1698 | unsigned long address = vma_address(page, vma); |
1696 | 1699 | ||
@@ -1711,9 +1714,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | |||
1711 | goto done; | 1714 | goto done; |
1712 | 1715 | ||
1713 | ret = rwc->file_nonlinear(page, mapping, rwc->arg); | 1716 | ret = rwc->file_nonlinear(page, mapping, rwc->arg); |
1714 | |||
1715 | done: | 1717 | done: |
1716 | mutex_unlock(&mapping->i_mmap_mutex); | 1718 | i_mmap_unlock_read(mapping); |
1717 | return ret; | 1719 | return ret; |
1718 | } | 1720 | } |
1719 | 1721 | ||
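
The switch from i_mmap_mutex to i_mmap_rwsem lets several rmap walkers traverse a mapping's interval tree concurrently, since the walk only reads it; computing pgoff is also deferred until the walk is actually going to happen. The i_mmap_lock_read()/i_mmap_unlock_read() helpers used above are, presumably, thin wrappers over the new semaphore (a sketch; the authoritative definitions live in include/linux/fs.h):

static inline void i_mmap_lock_read(struct address_space *mapping)
{
        down_read(&mapping->i_mmap_rwsem);
}

static inline void i_mmap_unlock_read(struct address_space *mapping)
{
        up_read(&mapping->i_mmap_rwsem);
}
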
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -3015,7 +3015,7 @@ retry: | |||
3015 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 3015 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
3016 | nid = zone_to_nid(zone); | 3016 | nid = zone_to_nid(zone); |
3017 | 3017 | ||
3018 | if (cpuset_zone_allowed(zone, flags | __GFP_HARDWALL) && | 3018 | if (cpuset_zone_allowed(zone, flags) && |
3019 | get_node(cache, nid) && | 3019 | get_node(cache, nid) && |
3020 | get_node(cache, nid)->free_objects) { | 3020 | get_node(cache, nid)->free_objects) { |
3021 | obj = ____cache_alloc_node(cache, | 3021 | obj = ____cache_alloc_node(cache, |
@@ -3182,6 +3182,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3182 | memset(ptr, 0, cachep->object_size); | 3182 | memset(ptr, 0, cachep->object_size); |
3183 | } | 3183 | } |
3184 | 3184 | ||
3185 | memcg_kmem_put_cache(cachep); | ||
3185 | return ptr; | 3186 | return ptr; |
3186 | } | 3187 | } |
3187 | 3188 | ||
@@ -3247,6 +3248,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) | |||
3247 | memset(objp, 0, cachep->object_size); | 3248 | memset(objp, 0, cachep->object_size); |
3248 | } | 3249 | } |
3249 | 3250 | ||
3251 | memcg_kmem_put_cache(cachep); | ||
3250 | return objp; | 3252 | return objp; |
3251 | } | 3253 | } |
3252 | 3254 | ||
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -1233,13 +1233,17 @@ static inline void kfree_hook(const void *x) | |||
1233 | kmemleak_free(x); | 1233 | kmemleak_free(x); |
1234 | } | 1234 | } |
1235 | 1235 | ||
1236 | static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) | 1236 | static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, |
1237 | gfp_t flags) | ||
1237 | { | 1238 | { |
1238 | flags &= gfp_allowed_mask; | 1239 | flags &= gfp_allowed_mask; |
1239 | lockdep_trace_alloc(flags); | 1240 | lockdep_trace_alloc(flags); |
1240 | might_sleep_if(flags & __GFP_WAIT); | 1241 | might_sleep_if(flags & __GFP_WAIT); |
1241 | 1242 | ||
1242 | return should_failslab(s->object_size, flags, s->flags); | 1243 | if (should_failslab(s->object_size, flags, s->flags)) |
1244 | return NULL; | ||
1245 | |||
1246 | return memcg_kmem_get_cache(s, flags); | ||
1243 | } | 1247 | } |
1244 | 1248 | ||
1245 | static inline void slab_post_alloc_hook(struct kmem_cache *s, | 1249 | static inline void slab_post_alloc_hook(struct kmem_cache *s, |
@@ -1248,6 +1252,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, | |||
1248 | flags &= gfp_allowed_mask; | 1252 | flags &= gfp_allowed_mask; |
1249 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); | 1253 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); |
1250 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); | 1254 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); |
1255 | memcg_kmem_put_cache(s); | ||
1251 | } | 1256 | } |
1252 | 1257 | ||
1253 | static inline void slab_free_hook(struct kmem_cache *s, void *x) | 1258 | static inline void slab_free_hook(struct kmem_cache *s, void *x) |
@@ -1665,8 +1670,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1665 | 1670 | ||
1666 | n = get_node(s, zone_to_nid(zone)); | 1671 | n = get_node(s, zone_to_nid(zone)); |
1667 | 1672 | ||
1668 | if (n && cpuset_zone_allowed(zone, | 1673 | if (n && cpuset_zone_allowed(zone, flags) && |
1669 | flags | __GFP_HARDWALL) && | ||
1670 | n->nr_partial > s->min_partial) { | 1674 | n->nr_partial > s->min_partial) { |
1671 | object = get_partial_node(s, n, c, flags); | 1675 | object = get_partial_node(s, n, c, flags); |
1672 | if (object) { | 1676 | if (object) { |
@@ -2384,10 +2388,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, | |||
2384 | struct page *page; | 2388 | struct page *page; |
2385 | unsigned long tid; | 2389 | unsigned long tid; |
2386 | 2390 | ||
2387 | if (slab_pre_alloc_hook(s, gfpflags)) | 2391 | s = slab_pre_alloc_hook(s, gfpflags); |
2392 | if (!s) | ||
2388 | return NULL; | 2393 | return NULL; |
2389 | |||
2390 | s = memcg_kmem_get_cache(s, gfpflags); | ||
2391 | redo: | 2394 | redo: |
2392 | /* | 2395 | /* |
2393 | * Must read kmem_cache cpu data via this cpu ptr. Preemption is | 2396 | * Must read kmem_cache cpu data via this cpu ptr. Preemption is |
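
Both allocators end up with the same pairing: the pre-alloc hook is the one place that may fail the allocation (failslab) or redirect it to a per-memcg cache, and every allocation path drops the reference with memcg_kmem_put_cache() afterwards, either in the post-alloc hook (SLUB) or right before returning the object (SLAB). Schematically, with allocate_object() standing in for the real fast/slow paths (it is a placeholder, not a kernel function):

s = slab_pre_alloc_hook(s, gfpflags);       /* may substitute a memcg cache */
if (!s)
        return NULL;                        /* failslab or no usable cache */

object = allocate_object(s, gfpflags);      /* placeholder for the real paths */

slab_post_alloc_hook(s, gfpflags, object);  /* kmemcheck/kmemleak + memcg_kmem_put_cache(s) */
return object;
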
diff --git a/mm/vmacache.c b/mm/vmacache.c index 9f25af825dec..b6e3662fe339 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c | |||
@@ -17,6 +17,8 @@ void vmacache_flush_all(struct mm_struct *mm) | |||
17 | { | 17 | { |
18 | struct task_struct *g, *p; | 18 | struct task_struct *g, *p; |
19 | 19 | ||
20 | count_vm_vmacache_event(VMACACHE_FULL_FLUSHES); | ||
21 | |||
20 | /* | 22 | /* |
21 | * Single threaded tasks need not iterate the entire | 23 | * Single threaded tasks need not iterate the entire |
22 | * list of process. We can avoid the flushing as well | 24 | * list of process. We can avoid the flushing as well |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8a18196fcdff..39c338896416 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -2574,10 +2574,10 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) | |||
2574 | if (!counters) | 2574 | if (!counters) |
2575 | return; | 2575 | return; |
2576 | 2576 | ||
2577 | /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ | ||
2578 | smp_rmb(); | ||
2579 | if (v->flags & VM_UNINITIALIZED) | 2577 | if (v->flags & VM_UNINITIALIZED) |
2580 | return; | 2578 | return; |
2579 | /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ | ||
2580 | smp_rmb(); | ||
2581 | 2581 | ||
2582 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); | 2582 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); |
2583 | 2583 | ||
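
The barrier is only useful after the flag has been observed as clear: the reader must test VM_UNINITIALIZED first and issue smp_rmb() after the test, so that its later reads of the vm_struct fields are ordered against the writer's smp_wmb(). In schematic form (not the exact vmalloc code; use() is a placeholder for the real reads):

/* writer, roughly clear_vm_uninitialized_flag(): publish, then flip the flag */
vm->caller = caller;
vm->size   = size;
smp_wmb();                              /* order the stores above before the flag change */
vm->flags &= ~VM_UNINITIALIZED;

/* reader, as fixed above: test the flag, then order the dependent reads */
if (vm->flags & VM_UNINITIALIZED)
        return;
smp_rmb();                              /* pairs with the writer's smp_wmb() */
use(vm->caller, vm->size);
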
diff --git a/mm/vmscan.c b/mm/vmscan.c index a384339bf718..bd9a72bc4a1b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -229,9 +229,10 @@ EXPORT_SYMBOL(unregister_shrinker); | |||
229 | 229 | ||
230 | #define SHRINK_BATCH 128 | 230 | #define SHRINK_BATCH 128 |
231 | 231 | ||
232 | static unsigned long | 232 | static unsigned long shrink_slabs(struct shrink_control *shrinkctl, |
233 | shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | 233 | struct shrinker *shrinker, |
234 | unsigned long nr_pages_scanned, unsigned long lru_pages) | 234 | unsigned long nr_scanned, |
235 | unsigned long nr_eligible) | ||
235 | { | 236 | { |
236 | unsigned long freed = 0; | 237 | unsigned long freed = 0; |
237 | unsigned long long delta; | 238 | unsigned long long delta; |
@@ -255,9 +256,9 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
255 | nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); | 256 | nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); |
256 | 257 | ||
257 | total_scan = nr; | 258 | total_scan = nr; |
258 | delta = (4 * nr_pages_scanned) / shrinker->seeks; | 259 | delta = (4 * nr_scanned) / shrinker->seeks; |
259 | delta *= freeable; | 260 | delta *= freeable; |
260 | do_div(delta, lru_pages + 1); | 261 | do_div(delta, nr_eligible + 1); |
261 | total_scan += delta; | 262 | total_scan += delta; |
262 | if (total_scan < 0) { | 263 | if (total_scan < 0) { |
263 | pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", | 264 | pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", |
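
The renamed parameters make the pressure calculation explicit: a shrinker is asked to consider the same fraction of its freeable objects that page reclaim scanned of the eligible LRU pages, scaled by 4/->seeks. For example, with 1,000 of 100,000 eligible pages scanned (1%), seeks = 2 (DEFAULT_SEEKS) and 50,000 freeable objects, delta comes out at 999 objects, roughly 2% of the cache. A standalone check of that arithmetic:

#include <stdio.h>

int main(void)
{
        unsigned long nr_scanned  = 1000;    /* LRU pages scanned by reclaim */
        unsigned long nr_eligible = 100000;  /* eligible LRU pages */
        unsigned long seeks       = 2;       /* shrinker->seeks (DEFAULT_SEEKS) */
        unsigned long freeable    = 50000;   /* objects reported by ->count_objects */
        unsigned long long delta;

        delta  = (4ULL * nr_scanned) / seeks;
        delta *= freeable;
        delta /= nr_eligible + 1;

        printf("scan target: %llu objects (%.2f%% of freeable)\n",
               delta, 100.0 * delta / freeable);
        return 0;
}
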
@@ -289,8 +290,8 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
289 | total_scan = freeable * 2; | 290 | total_scan = freeable * 2; |
290 | 291 | ||
291 | trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, | 292 | trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, |
292 | nr_pages_scanned, lru_pages, | 293 | nr_scanned, nr_eligible, |
293 | freeable, delta, total_scan); | 294 | freeable, delta, total_scan); |
294 | 295 | ||
295 | /* | 296 | /* |
296 | * Normally, we should not scan less than batch_size objects in one | 297 | * Normally, we should not scan less than batch_size objects in one |
@@ -339,34 +340,37 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
339 | return freed; | 340 | return freed; |
340 | } | 341 | } |
341 | 342 | ||
342 | /* | 343 | /** |
343 | * Call the shrink functions to age shrinkable caches | 344 | * shrink_node_slabs - shrink slab caches of a given node |
344 | * | 345 | * @gfp_mask: allocation context |
345 | * Here we assume it costs one seek to replace a lru page and that it also | 346 | * @nid: node whose slab caches to target |
346 | * takes a seek to recreate a cache object. With this in mind we age equal | 347 | * @nr_scanned: pressure numerator |
347 | * percentages of the lru and ageable caches. This should balance the seeks | 348 | * @nr_eligible: pressure denominator |
348 | * generated by these structures. | ||
349 | * | 349 | * |
350 | * If the vm encountered mapped pages on the LRU it increase the pressure on | 350 | * Call the shrink functions to age shrinkable caches. |
351 | * slab to avoid swapping. | ||
352 | * | 351 | * |
353 | * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. | 352 | * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, |
353 | * unaware shrinkers will receive a node id of 0 instead. | ||
354 | * | 354 | * |
355 | * `lru_pages' represents the number of on-LRU pages in all the zones which | 355 | * @nr_scanned and @nr_eligible form a ratio that indicates how many of |
356 | * are eligible for the caller's allocation attempt. It is used for balancing | 356 | * the available objects should be scanned. Page reclaim for example |
357 | * slab reclaim versus page reclaim. | 357 | * passes the number of pages scanned and the number of pages on the |
358 | * LRU lists that it considered on @nid, plus a bias in @nr_scanned | ||
359 | * when it encountered mapped pages. The ratio is further biased by | ||
360 | * the ->seeks setting of the shrink function, which indicates the | ||
361 | * cost to recreate an object relative to that of an LRU page. | ||
358 | * | 362 | * |
359 | * Returns the number of slab objects which we shrunk. | 363 | * Returns the number of reclaimed slab objects. |
360 | */ | 364 | */ |
361 | unsigned long shrink_slab(struct shrink_control *shrinkctl, | 365 | unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, |
362 | unsigned long nr_pages_scanned, | 366 | unsigned long nr_scanned, |
363 | unsigned long lru_pages) | 367 | unsigned long nr_eligible) |
364 | { | 368 | { |
365 | struct shrinker *shrinker; | 369 | struct shrinker *shrinker; |
366 | unsigned long freed = 0; | 370 | unsigned long freed = 0; |
367 | 371 | ||
368 | if (nr_pages_scanned == 0) | 372 | if (nr_scanned == 0) |
369 | nr_pages_scanned = SWAP_CLUSTER_MAX; | 373 | nr_scanned = SWAP_CLUSTER_MAX; |
370 | 374 | ||
371 | if (!down_read_trylock(&shrinker_rwsem)) { | 375 | if (!down_read_trylock(&shrinker_rwsem)) { |
372 | /* | 376 | /* |
@@ -380,20 +384,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl, | |||
380 | } | 384 | } |
381 | 385 | ||
382 | list_for_each_entry(shrinker, &shrinker_list, list) { | 386 | list_for_each_entry(shrinker, &shrinker_list, list) { |
383 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { | 387 | struct shrink_control sc = { |
384 | shrinkctl->nid = 0; | 388 | .gfp_mask = gfp_mask, |
385 | freed += shrink_slab_node(shrinkctl, shrinker, | 389 | .nid = nid, |
386 | nr_pages_scanned, lru_pages); | 390 | }; |
387 | continue; | ||
388 | } | ||
389 | 391 | ||
390 | for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { | 392 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) |
391 | if (node_online(shrinkctl->nid)) | 393 | sc.nid = 0; |
392 | freed += shrink_slab_node(shrinkctl, shrinker, | ||
393 | nr_pages_scanned, lru_pages); | ||
394 | 394 | ||
395 | } | 395 | freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); |
396 | } | 396 | } |
397 | |||
397 | up_read(&shrinker_rwsem); | 398 | up_read(&shrinker_rwsem); |
398 | out: | 399 | out: |
399 | cond_resched(); | 400 | cond_resched(); |
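
On the consumer side, a shrink_control is now built per shrinker with the caller's gfp_mask and node id; only shrinkers that set SHRINKER_NUMA_AWARE see the real nid, everyone else gets node 0. A hedged sketch of such a shrinker (the demo_lru_* helpers are hypothetical, only the structure and flags follow the real API):

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
{
        return demo_lru_count_node(sc->nid);            /* hypothetical helper */
}

static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
{
        /* return the number of objects actually freed, or SHRINK_STOP */
        return demo_lru_walk_node(sc->nid, sc->nr_to_scan);
}

static struct shrinker demo_shrinker = {
        .count_objects = demo_count,
        .scan_objects  = demo_scan,
        .seeks         = DEFAULT_SEEKS,
        .flags         = SHRINKER_NUMA_AWARE,
};

/* registered once at init time: register_shrinker(&demo_shrinker); */
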
@@ -1876,7 +1877,8 @@ enum scan_balance { | |||
1876 | * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan | 1877 | * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan |
1877 | */ | 1878 | */ |
1878 | static void get_scan_count(struct lruvec *lruvec, int swappiness, | 1879 | static void get_scan_count(struct lruvec *lruvec, int swappiness, |
1879 | struct scan_control *sc, unsigned long *nr) | 1880 | struct scan_control *sc, unsigned long *nr, |
1881 | unsigned long *lru_pages) | ||
1880 | { | 1882 | { |
1881 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; | 1883 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
1882 | u64 fraction[2]; | 1884 | u64 fraction[2]; |
@@ -2022,6 +2024,7 @@ out: | |||
2022 | some_scanned = false; | 2024 | some_scanned = false; |
2023 | /* Only use force_scan on second pass. */ | 2025 | /* Only use force_scan on second pass. */ |
2024 | for (pass = 0; !some_scanned && pass < 2; pass++) { | 2026 | for (pass = 0; !some_scanned && pass < 2; pass++) { |
2027 | *lru_pages = 0; | ||
2025 | for_each_evictable_lru(lru) { | 2028 | for_each_evictable_lru(lru) { |
2026 | int file = is_file_lru(lru); | 2029 | int file = is_file_lru(lru); |
2027 | unsigned long size; | 2030 | unsigned long size; |
@@ -2048,14 +2051,19 @@ out: | |||
2048 | case SCAN_FILE: | 2051 | case SCAN_FILE: |
2049 | case SCAN_ANON: | 2052 | case SCAN_ANON: |
2050 | /* Scan one type exclusively */ | 2053 | /* Scan one type exclusively */ |
2051 | if ((scan_balance == SCAN_FILE) != file) | 2054 | if ((scan_balance == SCAN_FILE) != file) { |
2055 | size = 0; | ||
2052 | scan = 0; | 2056 | scan = 0; |
2057 | } | ||
2053 | break; | 2058 | break; |
2054 | default: | 2059 | default: |
2055 | /* Look ma, no brain */ | 2060 | /* Look ma, no brain */ |
2056 | BUG(); | 2061 | BUG(); |
2057 | } | 2062 | } |
2063 | |||
2064 | *lru_pages += size; | ||
2058 | nr[lru] = scan; | 2065 | nr[lru] = scan; |
2066 | |||
2059 | /* | 2067 | /* |
2060 | * Skip the second pass and don't force_scan, | 2068 | * Skip the second pass and don't force_scan, |
2061 | * if we found something to scan. | 2069 | * if we found something to scan. |
@@ -2069,7 +2077,7 @@ out: | |||
2069 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 2077 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
2070 | */ | 2078 | */ |
2071 | static void shrink_lruvec(struct lruvec *lruvec, int swappiness, | 2079 | static void shrink_lruvec(struct lruvec *lruvec, int swappiness, |
2072 | struct scan_control *sc) | 2080 | struct scan_control *sc, unsigned long *lru_pages) |
2073 | { | 2081 | { |
2074 | unsigned long nr[NR_LRU_LISTS]; | 2082 | unsigned long nr[NR_LRU_LISTS]; |
2075 | unsigned long targets[NR_LRU_LISTS]; | 2083 | unsigned long targets[NR_LRU_LISTS]; |
@@ -2080,7 +2088,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, | |||
2080 | struct blk_plug plug; | 2088 | struct blk_plug plug; |
2081 | bool scan_adjusted; | 2089 | bool scan_adjusted; |
2082 | 2090 | ||
2083 | get_scan_count(lruvec, swappiness, sc, nr); | 2091 | get_scan_count(lruvec, swappiness, sc, nr, lru_pages); |
2084 | 2092 | ||
2085 | /* Record the original scan target for proportional adjustments later */ | 2093 | /* Record the original scan target for proportional adjustments later */ |
2086 | memcpy(targets, nr, sizeof(nr)); | 2094 | memcpy(targets, nr, sizeof(nr)); |
@@ -2258,7 +2266,8 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
2258 | } | 2266 | } |
2259 | } | 2267 | } |
2260 | 2268 | ||
2261 | static bool shrink_zone(struct zone *zone, struct scan_control *sc) | 2269 | static bool shrink_zone(struct zone *zone, struct scan_control *sc, |
2270 | bool is_classzone) | ||
2262 | { | 2271 | { |
2263 | unsigned long nr_reclaimed, nr_scanned; | 2272 | unsigned long nr_reclaimed, nr_scanned; |
2264 | bool reclaimable = false; | 2273 | bool reclaimable = false; |
@@ -2269,6 +2278,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2269 | .zone = zone, | 2278 | .zone = zone, |
2270 | .priority = sc->priority, | 2279 | .priority = sc->priority, |
2271 | }; | 2280 | }; |
2281 | unsigned long zone_lru_pages = 0; | ||
2272 | struct mem_cgroup *memcg; | 2282 | struct mem_cgroup *memcg; |
2273 | 2283 | ||
2274 | nr_reclaimed = sc->nr_reclaimed; | 2284 | nr_reclaimed = sc->nr_reclaimed; |
@@ -2276,13 +2286,15 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2276 | 2286 | ||
2277 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 2287 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
2278 | do { | 2288 | do { |
2289 | unsigned long lru_pages; | ||
2279 | struct lruvec *lruvec; | 2290 | struct lruvec *lruvec; |
2280 | int swappiness; | 2291 | int swappiness; |
2281 | 2292 | ||
2282 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2293 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2283 | swappiness = mem_cgroup_swappiness(memcg); | 2294 | swappiness = mem_cgroup_swappiness(memcg); |
2284 | 2295 | ||
2285 | shrink_lruvec(lruvec, swappiness, sc); | 2296 | shrink_lruvec(lruvec, swappiness, sc, &lru_pages); |
2297 | zone_lru_pages += lru_pages; | ||
2286 | 2298 | ||
2287 | /* | 2299 | /* |
2288 | * Direct reclaim and kswapd have to scan all memory | 2300 | * Direct reclaim and kswapd have to scan all memory |
@@ -2302,6 +2314,25 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2302 | memcg = mem_cgroup_iter(root, memcg, &reclaim); | 2314 | memcg = mem_cgroup_iter(root, memcg, &reclaim); |
2303 | } while (memcg); | 2315 | } while (memcg); |
2304 | 2316 | ||
2317 | /* | ||
2318 | * Shrink the slab caches in the same proportion that | ||
2319 | * the eligible LRU pages were scanned. | ||
2320 | */ | ||
2321 | if (global_reclaim(sc) && is_classzone) { | ||
2322 | struct reclaim_state *reclaim_state; | ||
2323 | |||
2324 | shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), | ||
2325 | sc->nr_scanned - nr_scanned, | ||
2326 | zone_lru_pages); | ||
2327 | |||
2328 | reclaim_state = current->reclaim_state; | ||
2329 | if (reclaim_state) { | ||
2330 | sc->nr_reclaimed += | ||
2331 | reclaim_state->reclaimed_slab; | ||
2332 | reclaim_state->reclaimed_slab = 0; | ||
2333 | } | ||
2334 | } | ||
2335 | |||
2305 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, | 2336 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, |
2306 | sc->nr_scanned - nr_scanned, | 2337 | sc->nr_scanned - nr_scanned, |
2307 | sc->nr_reclaimed - nr_reclaimed); | 2338 | sc->nr_reclaimed - nr_reclaimed); |
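
Shrinkers report freed objects, not pages, so the page count is picked up from current->reclaim_state: the slab allocators credit pages they hand back to the page allocator during reclaim, and shrink_zone() folds that into sc->nr_reclaimed above. The producer side looks roughly like this (a sketch of what mm/slab.c and mm/slub.c do when releasing a slab page, not their exact code):

static void demo_free_slab_pages(struct page *page, int order)
{
        if (current->reclaim_state)
                current->reclaim_state->reclaimed_slab += 1 << order;
        __free_pages(page, order);
}
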
@@ -2376,12 +2407,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2376 | struct zone *zone; | 2407 | struct zone *zone; |
2377 | unsigned long nr_soft_reclaimed; | 2408 | unsigned long nr_soft_reclaimed; |
2378 | unsigned long nr_soft_scanned; | 2409 | unsigned long nr_soft_scanned; |
2379 | unsigned long lru_pages = 0; | ||
2380 | struct reclaim_state *reclaim_state = current->reclaim_state; | ||
2381 | gfp_t orig_mask; | 2410 | gfp_t orig_mask; |
2382 | struct shrink_control shrink = { | ||
2383 | .gfp_mask = sc->gfp_mask, | ||
2384 | }; | ||
2385 | enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); | 2411 | enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); |
2386 | bool reclaimable = false; | 2412 | bool reclaimable = false; |
2387 | 2413 | ||
@@ -2394,12 +2420,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2394 | if (buffer_heads_over_limit) | 2420 | if (buffer_heads_over_limit) |
2395 | sc->gfp_mask |= __GFP_HIGHMEM; | 2421 | sc->gfp_mask |= __GFP_HIGHMEM; |
2396 | 2422 | ||
2397 | nodes_clear(shrink.nodes_to_scan); | ||
2398 | |||
2399 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2423 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2400 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2424 | requested_highidx, sc->nodemask) { |
2425 | enum zone_type classzone_idx; | ||
2426 | |||
2401 | if (!populated_zone(zone)) | 2427 | if (!populated_zone(zone)) |
2402 | continue; | 2428 | continue; |
2429 | |||
2430 | classzone_idx = requested_highidx; | ||
2431 | while (!populated_zone(zone->zone_pgdat->node_zones + | ||
2432 | classzone_idx)) | ||
2433 | classzone_idx--; | ||
2434 | |||
2403 | /* | 2435 | /* |
2404 | * Take care memory controller reclaiming has small influence | 2436 | * Take care memory controller reclaiming has small influence |
2405 | * to global LRU. | 2437 | * to global LRU. |
@@ -2409,9 +2441,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2409 | GFP_KERNEL | __GFP_HARDWALL)) | 2441 | GFP_KERNEL | __GFP_HARDWALL)) |
2410 | continue; | 2442 | continue; |
2411 | 2443 | ||
2412 | lru_pages += zone_reclaimable_pages(zone); | ||
2413 | node_set(zone_to_nid(zone), shrink.nodes_to_scan); | ||
2414 | |||
2415 | if (sc->priority != DEF_PRIORITY && | 2444 | if (sc->priority != DEF_PRIORITY && |
2416 | !zone_reclaimable(zone)) | 2445 | !zone_reclaimable(zone)) |
2417 | continue; /* Let kswapd poll it */ | 2446 | continue; /* Let kswapd poll it */ |
@@ -2450,7 +2479,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2450 | /* need some check for avoid more shrink_zone() */ | 2479 | /* need some check for avoid more shrink_zone() */ |
2451 | } | 2480 | } |
2452 | 2481 | ||
2453 | if (shrink_zone(zone, sc)) | 2482 | if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx)) |
2454 | reclaimable = true; | 2483 | reclaimable = true; |
2455 | 2484 | ||
2456 | if (global_reclaim(sc) && | 2485 | if (global_reclaim(sc) && |
@@ -2459,20 +2488,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2459 | } | 2488 | } |
2460 | 2489 | ||
2461 | /* | 2490 | /* |
2462 | * Don't shrink slabs when reclaiming memory from over limit cgroups | ||
2463 | * but do shrink slab at least once when aborting reclaim for | ||
2464 | * compaction to avoid unevenly scanning file/anon LRU pages over slab | ||
2465 | * pages. | ||
2466 | */ | ||
2467 | if (global_reclaim(sc)) { | ||
2468 | shrink_slab(&shrink, sc->nr_scanned, lru_pages); | ||
2469 | if (reclaim_state) { | ||
2470 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | ||
2471 | reclaim_state->reclaimed_slab = 0; | ||
2472 | } | ||
2473 | } | ||
2474 | |||
2475 | /* | ||
2476 | * Restore to original mask to avoid the impact on the caller if we | 2491 | * Restore to original mask to avoid the impact on the caller if we |
2477 | * promoted it to __GFP_HIGHMEM. | 2492 | * promoted it to __GFP_HIGHMEM. |
2478 | */ | 2493 | */ |
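
Because slab caches are per node rather than per zone, shrink_zones() now picks one class zone per node: starting from the highest zone index the allocation may use, it walks down to the highest populated zone, and only that zone's shrink_zone() call (is_classzone == true) drives slab shrinking. A toy model of the walk-down, with an invented node layout:

#include <stdio.h>
#include <stdbool.h>

int main(void)
{
        /* indexes 0..3 stand for DMA, DMA32, NORMAL, HIGHMEM; values invented */
        bool populated[4] = { true, true, true, false };
        int requested_highidx = 3;          /* e.g. a highmem-capable request */
        int classzone_idx = requested_highidx;

        while (!populated[classzone_idx])
                classzone_idx--;

        /* only the zone whose index equals classzone_idx shrinks slabs */
        printf("classzone_idx = %d\n", classzone_idx);
        return 0;
}
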
@@ -2736,6 +2751,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
2736 | }; | 2751 | }; |
2737 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2752 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2738 | int swappiness = mem_cgroup_swappiness(memcg); | 2753 | int swappiness = mem_cgroup_swappiness(memcg); |
2754 | unsigned long lru_pages; | ||
2739 | 2755 | ||
2740 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2756 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2741 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 2757 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
@@ -2751,7 +2767,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
2751 | * will pick up pages from other mem cgroup's as well. We hack | 2767 | * will pick up pages from other mem cgroup's as well. We hack |
2752 | * the priority and make it zero. | 2768 | * the priority and make it zero. |
2753 | */ | 2769 | */ |
2754 | shrink_lruvec(lruvec, swappiness, &sc); | 2770 | shrink_lruvec(lruvec, swappiness, &sc, &lru_pages); |
2755 | 2771 | ||
2756 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); | 2772 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); |
2757 | 2773 | ||
@@ -2932,15 +2948,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |||
2932 | static bool kswapd_shrink_zone(struct zone *zone, | 2948 | static bool kswapd_shrink_zone(struct zone *zone, |
2933 | int classzone_idx, | 2949 | int classzone_idx, |
2934 | struct scan_control *sc, | 2950 | struct scan_control *sc, |
2935 | unsigned long lru_pages, | ||
2936 | unsigned long *nr_attempted) | 2951 | unsigned long *nr_attempted) |
2937 | { | 2952 | { |
2938 | int testorder = sc->order; | 2953 | int testorder = sc->order; |
2939 | unsigned long balance_gap; | 2954 | unsigned long balance_gap; |
2940 | struct reclaim_state *reclaim_state = current->reclaim_state; | ||
2941 | struct shrink_control shrink = { | ||
2942 | .gfp_mask = sc->gfp_mask, | ||
2943 | }; | ||
2944 | bool lowmem_pressure; | 2955 | bool lowmem_pressure; |
2945 | 2956 | ||
2946 | /* Reclaim above the high watermark. */ | 2957 | /* Reclaim above the high watermark. */ |
@@ -2975,13 +2986,7 @@ static bool kswapd_shrink_zone(struct zone *zone, | |||
2975 | balance_gap, classzone_idx)) | 2986 | balance_gap, classzone_idx)) |
2976 | return true; | 2987 | return true; |
2977 | 2988 | ||
2978 | shrink_zone(zone, sc); | 2989 | shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); |
2979 | nodes_clear(shrink.nodes_to_scan); | ||
2980 | node_set(zone_to_nid(zone), shrink.nodes_to_scan); | ||
2981 | |||
2982 | reclaim_state->reclaimed_slab = 0; | ||
2983 | shrink_slab(&shrink, sc->nr_scanned, lru_pages); | ||
2984 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | ||
2985 | 2990 | ||
2986 | /* Account for the number of pages attempted to reclaim */ | 2991 | /* Account for the number of pages attempted to reclaim */ |
2987 | *nr_attempted += sc->nr_to_reclaim; | 2992 | *nr_attempted += sc->nr_to_reclaim; |
@@ -3042,7 +3047,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3042 | count_vm_event(PAGEOUTRUN); | 3047 | count_vm_event(PAGEOUTRUN); |
3043 | 3048 | ||
3044 | do { | 3049 | do { |
3045 | unsigned long lru_pages = 0; | ||
3046 | unsigned long nr_attempted = 0; | 3050 | unsigned long nr_attempted = 0; |
3047 | bool raise_priority = true; | 3051 | bool raise_priority = true; |
3048 | bool pgdat_needs_compaction = (order > 0); | 3052 | bool pgdat_needs_compaction = (order > 0); |
@@ -3102,8 +3106,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3102 | if (!populated_zone(zone)) | 3106 | if (!populated_zone(zone)) |
3103 | continue; | 3107 | continue; |
3104 | 3108 | ||
3105 | lru_pages += zone_reclaimable_pages(zone); | ||
3106 | |||
3107 | /* | 3109 | /* |
3108 | * If any zone is currently balanced then kswapd will | 3110 | * If any zone is currently balanced then kswapd will |
3109 | * not call compaction as it is expected that the | 3111 | * not call compaction as it is expected that the |
@@ -3159,8 +3161,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3159 | * that that high watermark would be met at 100% | 3161 | * that that high watermark would be met at 100% |
3160 | * efficiency. | 3162 | * efficiency. |
3161 | */ | 3163 | */ |
3162 | if (kswapd_shrink_zone(zone, end_zone, &sc, | 3164 | if (kswapd_shrink_zone(zone, end_zone, |
3163 | lru_pages, &nr_attempted)) | 3165 | &sc, &nr_attempted)) |
3164 | raise_priority = false; | 3166 | raise_priority = false; |
3165 | } | 3167 | } |
3166 | 3168 | ||
@@ -3612,10 +3614,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3612 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 3614 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
3613 | .may_swap = 1, | 3615 | .may_swap = 1, |
3614 | }; | 3616 | }; |
3615 | struct shrink_control shrink = { | ||
3616 | .gfp_mask = sc.gfp_mask, | ||
3617 | }; | ||
3618 | unsigned long nr_slab_pages0, nr_slab_pages1; | ||
3619 | 3617 | ||
3620 | cond_resched(); | 3618 | cond_resched(); |
3621 | /* | 3619 | /* |
@@ -3634,44 +3632,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3634 | * priorities until we have enough memory freed. | 3632 | * priorities until we have enough memory freed. |
3635 | */ | 3633 | */ |
3636 | do { | 3634 | do { |
3637 | shrink_zone(zone, &sc); | 3635 | shrink_zone(zone, &sc, true); |
3638 | } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); | 3636 | } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); |
3639 | } | 3637 | } |
3640 | 3638 | ||
3641 | nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); | ||
3642 | if (nr_slab_pages0 > zone->min_slab_pages) { | ||
3643 | /* | ||
3644 | * shrink_slab() does not currently allow us to determine how | ||
3645 | * many pages were freed in this zone. So we take the current | ||
3646 | * number of slab pages and shake the slab until it is reduced | ||
3647 | * by the same nr_pages that we used for reclaiming unmapped | ||
3648 | * pages. | ||
3649 | */ | ||
3650 | nodes_clear(shrink.nodes_to_scan); | ||
3651 | node_set(zone_to_nid(zone), shrink.nodes_to_scan); | ||
3652 | for (;;) { | ||
3653 | unsigned long lru_pages = zone_reclaimable_pages(zone); | ||
3654 | |||
3655 | /* No reclaimable slab or very low memory pressure */ | ||
3656 | if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) | ||
3657 | break; | ||
3658 | |||
3659 | /* Freed enough memory */ | ||
3660 | nr_slab_pages1 = zone_page_state(zone, | ||
3661 | NR_SLAB_RECLAIMABLE); | ||
3662 | if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) | ||
3663 | break; | ||
3664 | } | ||
3665 | |||
3666 | /* | ||
3667 | * Update nr_reclaimed by the number of slab pages we | ||
3668 | * reclaimed from this zone. | ||
3669 | */ | ||
3670 | nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); | ||
3671 | if (nr_slab_pages1 < nr_slab_pages0) | ||
3672 | sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; | ||
3673 | } | ||
3674 | |||
3675 | p->reclaim_state = NULL; | 3639 | p->reclaim_state = NULL; |
3676 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 3640 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
3677 | lockdep_clear_current_reclaim_state(); | 3641 | lockdep_clear_current_reclaim_state(); |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 1b12d390dc68..1284f89fca08 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -22,6 +22,8 @@ | |||
22 | #include <linux/writeback.h> | 22 | #include <linux/writeback.h> |
23 | #include <linux/compaction.h> | 23 | #include <linux/compaction.h> |
24 | #include <linux/mm_inline.h> | 24 | #include <linux/mm_inline.h> |
25 | #include <linux/page_ext.h> | ||
26 | #include <linux/page_owner.h> | ||
25 | 27 | ||
26 | #include "internal.h" | 28 | #include "internal.h" |
27 | 29 | ||
@@ -898,6 +900,7 @@ const char * const vmstat_text[] = { | |||
898 | #ifdef CONFIG_DEBUG_VM_VMACACHE | 900 | #ifdef CONFIG_DEBUG_VM_VMACACHE |
899 | "vmacache_find_calls", | 901 | "vmacache_find_calls", |
900 | "vmacache_find_hits", | 902 | "vmacache_find_hits", |
903 | "vmacache_full_flushes", | ||
901 | #endif | 904 | #endif |
902 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 905 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
903 | }; | 906 | }; |
@@ -1017,6 +1020,104 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) | |||
1017 | return 0; | 1020 | return 0; |
1018 | } | 1021 | } |
1019 | 1022 | ||
1023 | #ifdef CONFIG_PAGE_OWNER | ||
1024 | static void pagetypeinfo_showmixedcount_print(struct seq_file *m, | ||
1025 | pg_data_t *pgdat, | ||
1026 | struct zone *zone) | ||
1027 | { | ||
1028 | struct page *page; | ||
1029 | struct page_ext *page_ext; | ||
1030 | unsigned long pfn = zone->zone_start_pfn, block_end_pfn; | ||
1031 | unsigned long end_pfn = pfn + zone->spanned_pages; | ||
1032 | unsigned long count[MIGRATE_TYPES] = { 0, }; | ||
1033 | int pageblock_mt, page_mt; | ||
1034 | int i; | ||
1035 | |||
1036 | /* Scan block by block. First and last block may be incomplete */ | ||
1037 | pfn = zone->zone_start_pfn; | ||
1038 | |||
1039 | /* | ||
1040 | * Walk the zone in pageblock_nr_pages steps. If a page block spans | ||
1041 | * a zone boundary, it will be double counted between zones. This does | ||
1042 | * not matter as the mixed block count will still be correct | ||
1043 | */ | ||
1044 | for (; pfn < end_pfn; ) { | ||
1045 | if (!pfn_valid(pfn)) { | ||
1046 | pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); | ||
1047 | continue; | ||
1048 | } | ||
1049 | |||
1050 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | ||
1051 | block_end_pfn = min(block_end_pfn, end_pfn); | ||
1052 | |||
1053 | page = pfn_to_page(pfn); | ||
1054 | pageblock_mt = get_pfnblock_migratetype(page, pfn); | ||
1055 | |||
1056 | for (; pfn < block_end_pfn; pfn++) { | ||
1057 | if (!pfn_valid_within(pfn)) | ||
1058 | continue; | ||
1059 | |||
1060 | page = pfn_to_page(pfn); | ||
1061 | if (PageBuddy(page)) { | ||
1062 | pfn += (1UL << page_order(page)) - 1; | ||
1063 | continue; | ||
1064 | } | ||
1065 | |||
1066 | if (PageReserved(page)) | ||
1067 | continue; | ||
1068 | |||
1069 | page_ext = lookup_page_ext(page); | ||
1070 | |||
1071 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | ||
1072 | continue; | ||
1073 | |||
1074 | page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); | ||
1075 | if (pageblock_mt != page_mt) { | ||
1076 | if (is_migrate_cma(pageblock_mt)) | ||
1077 | count[MIGRATE_MOVABLE]++; | ||
1078 | else | ||
1079 | count[pageblock_mt]++; | ||
1080 | |||
1081 | pfn = block_end_pfn; | ||
1082 | break; | ||
1083 | } | ||
1084 | pfn += (1UL << page_ext->order) - 1; | ||
1085 | } | ||
1086 | } | ||
1087 | |||
1088 | /* Print counts */ | ||
1089 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
1090 | for (i = 0; i < MIGRATE_TYPES; i++) | ||
1091 | seq_printf(m, "%12lu ", count[i]); | ||
1092 | seq_putc(m, '\n'); | ||
1093 | } | ||
1094 | #endif /* CONFIG_PAGE_OWNER */ | ||
1095 | |||
1096 | /* | ||
1097 | * Print out the number of pageblocks for each migratetype that contain pages | ||
1098 | * of other types. This gives an indication of how well fallbacks are being | ||
1099 | * contained by rmqueue_fallback(). It requires information from PAGE_OWNER | ||
1100 | * to determine what is going on | ||
1101 | */ | ||
1102 | static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) | ||
1103 | { | ||
1104 | #ifdef CONFIG_PAGE_OWNER | ||
1105 | int mtype; | ||
1106 | |||
1107 | if (!page_owner_inited) | ||
1108 | return; | ||
1109 | |||
1110 | drain_all_pages(NULL); | ||
1111 | |||
1112 | seq_printf(m, "\n%-23s", "Number of mixed blocks "); | ||
1113 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) | ||
1114 | seq_printf(m, "%12s ", migratetype_names[mtype]); | ||
1115 | seq_putc(m, '\n'); | ||
1116 | |||
1117 | walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print); | ||
1118 | #endif /* CONFIG_PAGE_OWNER */ | ||
1119 | } | ||
1120 | |||
1020 | /* | 1121 | /* |
1021 | * This prints out statistics in relation to grouping pages by mobility. | 1122 | * This prints out statistics in relation to grouping pages by mobility. |
1022 | * It is expensive to collect so do not constantly read the file. | 1123 | * It is expensive to collect so do not constantly read the file. |
@@ -1034,6 +1135,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg) | |||
1034 | seq_putc(m, '\n'); | 1135 | seq_putc(m, '\n'); |
1035 | pagetypeinfo_showfree(m, pgdat); | 1136 | pagetypeinfo_showfree(m, pgdat); |
1036 | pagetypeinfo_showblockcount(m, pgdat); | 1137 | pagetypeinfo_showblockcount(m, pgdat); |
1138 | pagetypeinfo_showmixedcount(m, pgdat); | ||
1037 | 1139 | ||
1038 | return 0; | 1140 | return 0; |
1039 | } | 1141 | } |
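
The new "Number of mixed blocks" row counts, for each pageblock migratetype, how many blocks contain at least one allocated page whose own migratetype (recovered from the gfp_mask recorded by page owner) differs from the block's; CMA blocks are charged to MOVABLE. A toy model of the per-block test, with invented data:

#include <stdio.h>

enum { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE, MT_TYPES };

int main(void)
{
        int block_mt = MT_MOVABLE;                                 /* the pageblock's type */
        int page_mt[] = { MT_MOVABLE, MT_MOVABLE, MT_UNMOVABLE };  /* allocated pages in it */
        unsigned long count[MT_TYPES] = { 0 };
        int i;

        for (i = 0; i < 3; i++) {
                if (page_mt[i] != block_mt) {
                        count[block_mt]++;      /* the block counts as mixed, once */
                        break;
                }
        }
        printf("mixed MOVABLE blocks: %lu\n", count[MT_MOVABLE]);
        return 0;
}
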
diff --git a/mm/zbud.c b/mm/zbud.c --- a/mm/zbud.c +++ b/mm/zbud.c | |||
@@ -132,7 +132,7 @@ | |||
132 | 132 | ||
133 | static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) | 133 | static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) |
134 | { | 134 | { |
135 | return zbud_create_pool(gfp, &zbud_zpool_ops); | 135 | return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); |
136 | } | 136 | } |
137 | 137 | ||
138 | static void zbud_zpool_destroy(void *pool) | 138 | static void zbud_zpool_destroy(void *pool) |
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 839a48c3ca27..4d0a063145ec 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -155,8 +155,6 @@ | |||
155 | * (reason above) | 155 | * (reason above) |
156 | */ | 156 | */ |
157 | #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) | 157 | #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) |
158 | #define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \ | ||
159 | ZS_SIZE_CLASS_DELTA + 1) | ||
160 | 158 | ||
161 | /* | 159 | /* |
162 | * We do not maintain any list for completely empty or full pages | 160 | * We do not maintain any list for completely empty or full pages |
@@ -171,6 +169,11 @@ enum fullness_group { | |||
171 | }; | 169 | }; |
172 | 170 | ||
173 | /* | 171 | /* |
172 | * number of size_classes | ||
173 | */ | ||
174 | static int zs_size_classes; | ||
175 | |||
176 | /* | ||
174 | * We assign a page to ZS_ALMOST_EMPTY fullness group when: | 177 | * We assign a page to ZS_ALMOST_EMPTY fullness group when: |
175 | * n <= N / f, where | 178 | * n <= N / f, where |
176 | * n = number of allocated objects | 179 | * n = number of allocated objects |
@@ -214,7 +217,7 @@ struct link_free { | |||
214 | }; | 217 | }; |
215 | 218 | ||
216 | struct zs_pool { | 219 | struct zs_pool { |
217 | struct size_class size_class[ZS_SIZE_CLASSES]; | 220 | struct size_class **size_class; |
218 | 221 | ||
219 | gfp_t flags; /* allocation flags used when growing pool */ | 222 | gfp_t flags; /* allocation flags used when growing pool */ |
220 | atomic_long_t pages_allocated; | 223 | atomic_long_t pages_allocated; |
@@ -468,7 +471,7 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool, | |||
468 | if (newfg == currfg) | 471 | if (newfg == currfg) |
469 | goto out; | 472 | goto out; |
470 | 473 | ||
471 | class = &pool->size_class[class_idx]; | 474 | class = pool->size_class[class_idx]; |
472 | remove_zspage(page, class, currfg); | 475 | remove_zspage(page, class, currfg); |
473 | insert_zspage(page, class, newfg); | 476 | insert_zspage(page, class, newfg); |
474 | set_zspage_mapping(page, class_idx, newfg); | 477 | set_zspage_mapping(page, class_idx, newfg); |
@@ -629,6 +632,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
629 | struct page *next_page; | 632 | struct page *next_page; |
630 | struct link_free *link; | 633 | struct link_free *link; |
631 | unsigned int i = 1; | 634 | unsigned int i = 1; |
635 | void *vaddr; | ||
632 | 636 | ||
633 | /* | 637 | /* |
634 | * page->index stores offset of first object starting | 638 | * page->index stores offset of first object starting |
@@ -639,8 +643,8 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
639 | if (page != first_page) | 643 | if (page != first_page) |
640 | page->index = off; | 644 | page->index = off; |
641 | 645 | ||
642 | link = (struct link_free *)kmap_atomic(page) + | 646 | vaddr = kmap_atomic(page); |
643 | off / sizeof(*link); | 647 | link = (struct link_free *)vaddr + off / sizeof(*link); |
644 | 648 | ||
645 | while ((off += class->size) < PAGE_SIZE) { | 649 | while ((off += class->size) < PAGE_SIZE) { |
646 | link->next = obj_location_to_handle(page, i++); | 650 | link->next = obj_location_to_handle(page, i++); |
@@ -654,7 +658,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
654 | */ | 658 | */ |
655 | next_page = get_next_page(page); | 659 | next_page = get_next_page(page); |
656 | link->next = obj_location_to_handle(next_page, 0); | 660 | link->next = obj_location_to_handle(next_page, 0); |
657 | kunmap_atomic(link); | 661 | kunmap_atomic(vaddr); |
658 | page = next_page; | 662 | page = next_page; |
659 | off %= PAGE_SIZE; | 663 | off %= PAGE_SIZE; |
660 | } | 664 | } |
@@ -784,7 +788,7 @@ static inline int __zs_cpu_up(struct mapping_area *area) | |||
784 | */ | 788 | */ |
785 | if (area->vm_buf) | 789 | if (area->vm_buf) |
786 | return 0; | 790 | return 0; |
787 | area->vm_buf = (char *)__get_free_page(GFP_KERNEL); | 791 | area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL); |
788 | if (!area->vm_buf) | 792 | if (!area->vm_buf) |
789 | return -ENOMEM; | 793 | return -ENOMEM; |
790 | return 0; | 794 | return 0; |
@@ -792,8 +796,7 @@ static inline int __zs_cpu_up(struct mapping_area *area) | |||
792 | 796 | ||
793 | static inline void __zs_cpu_down(struct mapping_area *area) | 797 | static inline void __zs_cpu_down(struct mapping_area *area) |
794 | { | 798 | { |
795 | if (area->vm_buf) | 799 | kfree(area->vm_buf); |
796 | free_page((unsigned long)area->vm_buf); | ||
797 | area->vm_buf = NULL; | 800 | area->vm_buf = NULL; |
798 | } | 801 | } |
799 | 802 | ||
@@ -881,14 +884,10 @@ static struct notifier_block zs_cpu_nb = { | |||
881 | .notifier_call = zs_cpu_notifier | 884 | .notifier_call = zs_cpu_notifier |
882 | }; | 885 | }; |
883 | 886 | ||
884 | static void zs_exit(void) | 887 | static void zs_unregister_cpu_notifier(void) |
885 | { | 888 | { |
886 | int cpu; | 889 | int cpu; |
887 | 890 | ||
888 | #ifdef CONFIG_ZPOOL | ||
889 | zpool_unregister_driver(&zs_zpool_driver); | ||
890 | #endif | ||
891 | |||
892 | cpu_notifier_register_begin(); | 891 | cpu_notifier_register_begin(); |
893 | 892 | ||
894 | for_each_online_cpu(cpu) | 893 | for_each_online_cpu(cpu) |
@@ -898,31 +897,74 @@ static void zs_exit(void) | |||
898 | cpu_notifier_register_done(); | 897 | cpu_notifier_register_done(); |
899 | } | 898 | } |
900 | 899 | ||
901 | static int zs_init(void) | 900 | static int zs_register_cpu_notifier(void) |
902 | { | 901 | { |
903 | int cpu, ret; | 902 | int cpu, uninitialized_var(ret); |
904 | 903 | ||
905 | cpu_notifier_register_begin(); | 904 | cpu_notifier_register_begin(); |
906 | 905 | ||
907 | __register_cpu_notifier(&zs_cpu_nb); | 906 | __register_cpu_notifier(&zs_cpu_nb); |
908 | for_each_online_cpu(cpu) { | 907 | for_each_online_cpu(cpu) { |
909 | ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 908 | ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
910 | if (notifier_to_errno(ret)) { | 909 | if (notifier_to_errno(ret)) |
911 | cpu_notifier_register_done(); | 910 | break; |
912 | goto fail; | ||
913 | } | ||
914 | } | 911 | } |
915 | 912 | ||
916 | cpu_notifier_register_done(); | 913 | cpu_notifier_register_done(); |
914 | return notifier_to_errno(ret); | ||
915 | } | ||
916 | |||
917 | static void init_zs_size_classes(void) | ||
918 | { | ||
919 | int nr; | ||
917 | 920 | ||
921 | nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1; | ||
922 | if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA) | ||
923 | nr += 1; | ||
924 | |||
925 | zs_size_classes = nr; | ||
926 | } | ||
927 | |||
928 | static void __exit zs_exit(void) | ||
929 | { | ||
918 | #ifdef CONFIG_ZPOOL | 930 | #ifdef CONFIG_ZPOOL |
919 | zpool_register_driver(&zs_zpool_driver); | 931 | zpool_unregister_driver(&zs_zpool_driver); |
920 | #endif | 932 | #endif |
933 | zs_unregister_cpu_notifier(); | ||
934 | } | ||
921 | 935 | ||
936 | static int __init zs_init(void) | ||
937 | { | ||
938 | int ret = zs_register_cpu_notifier(); | ||
939 | |||
940 | if (ret) { | ||
941 | zs_unregister_cpu_notifier(); | ||
942 | return ret; | ||
943 | } | ||
944 | |||
945 | init_zs_size_classes(); | ||
946 | |||
947 | #ifdef CONFIG_ZPOOL | ||
948 | zpool_register_driver(&zs_zpool_driver); | ||
949 | #endif | ||
922 | return 0; | 950 | return 0; |
923 | fail: | 951 | } |
924 | zs_exit(); | 952 | |
925 | return notifier_to_errno(ret); | 953 | static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) |
954 | { | ||
955 | return pages_per_zspage * PAGE_SIZE / size; | ||
956 | } | ||
957 | |||
958 | static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) | ||
959 | { | ||
960 | if (prev->pages_per_zspage != pages_per_zspage) | ||
961 | return false; | ||
962 | |||
963 | if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage) | ||
964 | != get_maxobj_per_zspage(size, pages_per_zspage)) | ||
965 | return false; | ||
966 | |||
967 | return true; | ||
926 | } | 968 | } |
927 | 969 | ||
928 | /** | 970 | /** |
@@ -937,33 +979,71 @@ fail: | |||
937 | */ | 979 | */ |
938 | struct zs_pool *zs_create_pool(gfp_t flags) | 980 | struct zs_pool *zs_create_pool(gfp_t flags) |
939 | { | 981 | { |
940 | int i, ovhd_size; | 982 | int i; |
941 | struct zs_pool *pool; | 983 | struct zs_pool *pool; |
984 | struct size_class *prev_class = NULL; | ||
942 | 985 | ||
943 | ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); | 986 | pool = kzalloc(sizeof(*pool), GFP_KERNEL); |
944 | pool = kzalloc(ovhd_size, GFP_KERNEL); | ||
945 | if (!pool) | 987 | if (!pool) |
946 | return NULL; | 988 | return NULL; |
947 | 989 | ||
948 | for (i = 0; i < ZS_SIZE_CLASSES; i++) { | 990 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), |
991 | GFP_KERNEL); | ||
992 | if (!pool->size_class) { | ||
993 | kfree(pool); | ||
994 | return NULL; | ||
995 | } | ||
996 | |||
997 | /* | ||
998 | * Iterate reversly, because, size of size_class that we want to use | ||
999 | * for merging should be larger or equal to current size. | ||
1000 | */ | ||
1001 | for (i = zs_size_classes - 1; i >= 0; i--) { | ||
949 | int size; | 1002 | int size; |
1003 | int pages_per_zspage; | ||
950 | struct size_class *class; | 1004 | struct size_class *class; |
951 | 1005 | ||
952 | size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; | 1006 | size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; |
953 | if (size > ZS_MAX_ALLOC_SIZE) | 1007 | if (size > ZS_MAX_ALLOC_SIZE) |
954 | size = ZS_MAX_ALLOC_SIZE; | 1008 | size = ZS_MAX_ALLOC_SIZE; |
1009 | pages_per_zspage = get_pages_per_zspage(size); | ||
1010 | |||
1011 | /* | ||
1012 | * size_class is used for normal zsmalloc operation such | ||
1013 | * as alloc/free for that size. Although it is natural that we | ||
1014 | * have one size_class for each size, there is a chance that we | ||
1015 | * can get more memory utilization if we use one size_class for | ||
1016 | * many different sizes whose size_classes have the same | ||
1017 | * characteristics. So, we make a size_class point to the | ||
1018 | * previous size_class if possible. | ||
1019 | */ | ||
1020 | if (prev_class) { | ||
1021 | if (can_merge(prev_class, size, pages_per_zspage)) { | ||
1022 | pool->size_class[i] = prev_class; | ||
1023 | continue; | ||
1024 | } | ||
1025 | } | ||
1026 | |||
1027 | class = kzalloc(sizeof(struct size_class), GFP_KERNEL); | ||
1028 | if (!class) | ||
1029 | goto err; | ||
955 | 1030 | ||
956 | class = &pool->size_class[i]; | ||
957 | class->size = size; | 1031 | class->size = size; |
958 | class->index = i; | 1032 | class->index = i; |
1033 | class->pages_per_zspage = pages_per_zspage; | ||
959 | spin_lock_init(&class->lock); | 1034 | spin_lock_init(&class->lock); |
960 | class->pages_per_zspage = get_pages_per_zspage(size); | 1035 | pool->size_class[i] = class; |
961 | 1036 | ||
1037 | prev_class = class; | ||
962 | } | 1038 | } |
963 | 1039 | ||
964 | pool->flags = flags; | 1040 | pool->flags = flags; |
965 | 1041 | ||
966 | return pool; | 1042 | return pool; |
1043 | |||
1044 | err: | ||
1045 | zs_destroy_pool(pool); | ||
1046 | return NULL; | ||
967 | } | 1047 | } |
968 | EXPORT_SYMBOL_GPL(zs_create_pool); | 1048 | EXPORT_SYMBOL_GPL(zs_create_pool); |
969 | 1049 | ||
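
Two size classes are merged when they need the same number of pages per zspage and would pack the same number of objects into it; iterating from the largest size downwards means the surviving class is always the larger of the pair, so any object of the smaller size still fits. A userspace check of that rule, with pages_per_zspage fixed at 1 purely for illustration (the kernel derives it per size via get_pages_per_zspage()):

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE 4096UL

static unsigned int maxobj(int size, int pages_per_zspage)
{
        return pages_per_zspage * PAGE_SIZE / size;
}

static bool merges(int prev_size, int size, int pages_per_zspage)
{
        return maxobj(prev_size, pages_per_zspage) == maxobj(size, pages_per_zspage);
}

int main(void)
{
        /* 4096/4000 == 4096/3264 == 1 object: one class can serve both sizes */
        printf("3264 vs 4000: %s\n", merges(4000, 3264, 1) ? "merge" : "separate");
        /* 4096/2100 == 1 but 4096/2000 == 2: they stay separate */
        printf("2000 vs 2100: %s\n", merges(2100, 2000, 1) ? "merge" : "separate");
        return 0;
}
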
@@ -971,9 +1051,15 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
971 | { | 1051 | { |
972 | int i; | 1052 | int i; |
973 | 1053 | ||
974 | for (i = 0; i < ZS_SIZE_CLASSES; i++) { | 1054 | for (i = 0; i < zs_size_classes; i++) { |
975 | int fg; | 1055 | int fg; |
976 | struct size_class *class = &pool->size_class[i]; | 1056 | struct size_class *class = pool->size_class[i]; |
1057 | |||
1058 | if (!class) | ||
1059 | continue; | ||
1060 | |||
1061 | if (class->index != i) | ||
1062 | continue; | ||
977 | 1063 | ||
978 | for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { | 1064 | for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { |
979 | if (class->fullness_list[fg]) { | 1065 | if (class->fullness_list[fg]) { |
@@ -981,7 +1067,10 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
981 | class->size, fg); | 1067 | class->size, fg); |
982 | } | 1068 | } |
983 | } | 1069 | } |
1070 | kfree(class); | ||
984 | } | 1071 | } |
1072 | |||
1073 | kfree(pool->size_class); | ||
985 | kfree(pool); | 1074 | kfree(pool); |
986 | } | 1075 | } |
987 | EXPORT_SYMBOL_GPL(zs_destroy_pool); | 1076 | EXPORT_SYMBOL_GPL(zs_destroy_pool); |
@@ -999,8 +1088,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
999 | { | 1088 | { |
1000 | unsigned long obj; | 1089 | unsigned long obj; |
1001 | struct link_free *link; | 1090 | struct link_free *link; |
1002 | int class_idx; | ||
1003 | struct size_class *class; | 1091 | struct size_class *class; |
1092 | void *vaddr; | ||
1004 | 1093 | ||
1005 | struct page *first_page, *m_page; | 1094 | struct page *first_page, *m_page; |
1006 | unsigned long m_objidx, m_offset; | 1095 | unsigned long m_objidx, m_offset; |
@@ -1008,9 +1097,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
1008 | if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) | 1097 | if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) |
1009 | return 0; | 1098 | return 0; |
1010 | 1099 | ||
1011 | class_idx = get_size_class_index(size); | 1100 | class = pool->size_class[get_size_class_index(size)]; |
1012 | class = &pool->size_class[class_idx]; | ||
1013 | BUG_ON(class_idx != class->index); | ||
1014 | 1101 | ||
1015 | spin_lock(&class->lock); | 1102 | spin_lock(&class->lock); |
1016 | first_page = find_get_zspage(class); | 1103 | first_page = find_get_zspage(class); |
@@ -1031,11 +1118,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
1031 | obj_handle_to_location(obj, &m_page, &m_objidx); | 1118 | obj_handle_to_location(obj, &m_page, &m_objidx); |
1032 | m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); | 1119 | m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); |
1033 | 1120 | ||
1034 | link = (struct link_free *)kmap_atomic(m_page) + | 1121 | vaddr = kmap_atomic(m_page); |
1035 | m_offset / sizeof(*link); | 1122 | link = (struct link_free *)vaddr + m_offset / sizeof(*link); |
1036 | first_page->freelist = link->next; | 1123 | first_page->freelist = link->next; |
1037 | memset(link, POISON_INUSE, sizeof(*link)); | 1124 | memset(link, POISON_INUSE, sizeof(*link)); |
1038 | kunmap_atomic(link); | 1125 | kunmap_atomic(vaddr); |
1039 | 1126 | ||
1040 | first_page->inuse++; | 1127 | first_page->inuse++; |
1041 | /* Now move the zspage to another fullness group, if required */ | 1128 | /* Now move the zspage to another fullness group, if required */ |
@@ -1051,6 +1138,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj) | |||
1051 | struct link_free *link; | 1138 | struct link_free *link; |
1052 | struct page *first_page, *f_page; | 1139 | struct page *first_page, *f_page; |
1053 | unsigned long f_objidx, f_offset; | 1140 | unsigned long f_objidx, f_offset; |
1141 | void *vaddr; | ||
1054 | 1142 | ||
1055 | int class_idx; | 1143 | int class_idx; |
1056 | struct size_class *class; | 1144 | struct size_class *class; |
@@ -1063,16 +1151,16 @@ void zs_free(struct zs_pool *pool, unsigned long obj) | |||
1063 | first_page = get_first_page(f_page); | 1151 | first_page = get_first_page(f_page); |
1064 | 1152 | ||
1065 | get_zspage_mapping(first_page, &class_idx, &fullness); | 1153 | get_zspage_mapping(first_page, &class_idx, &fullness); |
1066 | class = &pool->size_class[class_idx]; | 1154 | class = pool->size_class[class_idx]; |
1067 | f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); | 1155 | f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); |
1068 | 1156 | ||
1069 | spin_lock(&class->lock); | 1157 | spin_lock(&class->lock); |
1070 | 1158 | ||
1071 | /* Insert this object in containing zspage's freelist */ | 1159 | /* Insert this object in containing zspage's freelist */ |
1072 | link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) | 1160 | vaddr = kmap_atomic(f_page); |
1073 | + f_offset); | 1161 | link = (struct link_free *)(vaddr + f_offset); |
1074 | link->next = first_page->freelist; | 1162 | link->next = first_page->freelist; |
1075 | kunmap_atomic(link); | 1163 | kunmap_atomic(vaddr); |
1076 | first_page->freelist = (void *)obj; | 1164 | first_page->freelist = (void *)obj; |
1077 | 1165 | ||
1078 | first_page->inuse--; | 1166 | first_page->inuse--; |
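zs_free() above follows the same two conventions (pointer-array class lookup, kunmap_atomic() on the saved vaddr) and then pushes the freed slot back onto the zspage's freelist. A minimal sketch of that freelist push/pop, with simplified stand-in structures rather than the kernel's struct page bookkeeping:

#include <stdlib.h>

struct link_free {
        struct link_free *next;         /* link stored inside the free slot itself */
};

struct zspage_model {
        struct link_free *freelist;     /* head of free slots in this "page" */
};

/* push a freed slot back onto the page's freelist, as zs_free() does */
static void freelist_push(struct zspage_model *pg, void *slot)
{
        struct link_free *link = slot;

        link->next = pg->freelist;
        pg->freelist = link;
}

/* pop one free slot for allocation, or NULL if none are free */
static void *freelist_pop(struct zspage_model *pg)
{
        struct link_free *link = pg->freelist;

        if (link)
                pg->freelist = link->next;
        return link;
}

int main(void)
{
        struct zspage_model pg = { .freelist = NULL };
        void *slots[4];
        int i;

        for (i = 0; i < 4; i++) {
                slots[i] = malloc(32);  /* pretend object slots */
                freelist_push(&pg, slots[i]);
        }
        while (freelist_pop(&pg) != NULL)
                ;
        for (i = 0; i < 4; i++)
                free(slots[i]);
        return 0;
}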
@@ -1124,7 +1212,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
1124 | 1212 | ||
1125 | obj_handle_to_location(handle, &page, &obj_idx); | 1213 | obj_handle_to_location(handle, &page, &obj_idx); |
1126 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | 1214 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); |
1127 | class = &pool->size_class[class_idx]; | 1215 | class = pool->size_class[class_idx]; |
1128 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1216 | off = obj_idx_to_offset(page, obj_idx, class->size); |
1129 | 1217 | ||
1130 | area = &get_cpu_var(zs_map_area); | 1218 | area = &get_cpu_var(zs_map_area); |
@@ -1158,7 +1246,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
1158 | 1246 | ||
1159 | obj_handle_to_location(handle, &page, &obj_idx); | 1247 | obj_handle_to_location(handle, &page, &obj_idx); |
1160 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | 1248 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); |
1161 | class = &pool->size_class[class_idx]; | 1249 | class = pool->size_class[class_idx]; |
1162 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1250 | off = obj_idx_to_offset(page, obj_idx, class->size); |
1163 | 1251 | ||
1164 | area = this_cpu_ptr(&zs_map_area); | 1252 | area = this_cpu_ptr(&zs_map_area); |
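zs_map_object()/zs_unmap_object() above only switch the class lookup to the pointer array; the handle decode via obj_handle_to_location() is untouched. As an aside, a hedged sketch of that style of handle packing, where a page identifier and an object index share one unsigned long; the 12-bit split below is an assumption for illustration, not zsmalloc's real OBJ_INDEX_BITS layout:

#include <stdio.h>

#define IDX_BITS 12                             /* assumed width of the index field */
#define IDX_MASK ((1UL << IDX_BITS) - 1)

/* pack a page identifier and an object index into one handle */
static unsigned long make_handle(unsigned long page_id, unsigned long idx)
{
        return (page_id << IDX_BITS) | (idx & IDX_MASK);
}

/* recover both halves, in the spirit of obj_handle_to_location() */
static void handle_to_location(unsigned long handle,
                               unsigned long *page_id, unsigned long *idx)
{
        *page_id = handle >> IDX_BITS;
        *idx = handle & IDX_MASK;
}

int main(void)
{
        unsigned long page_id, idx;

        handle_to_location(make_handle(0x1234, 7), &page_id, &idx);
        printf("page_id=%#lx idx=%lu\n", page_id, idx);
        return 0;
}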
diff --git a/mm/zswap.c b/mm/zswap.c index c1543061a192..0cfce9bc51e4 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
@@ -149,11 +149,10 @@ static int __init zswap_comp_init(void) | |||
149 | return 0; | 149 | return 0; |
150 | } | 150 | } |
151 | 151 | ||
152 | static void zswap_comp_exit(void) | 152 | static void __init zswap_comp_exit(void) |
153 | { | 153 | { |
154 | /* free percpu transforms */ | 154 | /* free percpu transforms */ |
155 | if (zswap_comp_pcpu_tfms) | 155 | free_percpu(zswap_comp_pcpu_tfms); |
156 | free_percpu(zswap_comp_pcpu_tfms); | ||
157 | } | 156 | } |
158 | 157 | ||
159 | /********************************* | 158 | /********************************* |
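On the zswap.c hunk above: free_percpu(), like kfree(), is a no-op on a NULL pointer, so the explicit guard can go, and the helper gains __init because it is only needed during initialization (the following hunks apply the same __init treatment to the other init-path helpers). A small userspace sketch of the NULL-tolerant cleanup pattern, using free(), which standard C already defines to do nothing for NULL:

#include <stdlib.h>

struct comp_state {
        void *tfms;             /* stands in for the per-cpu transforms */
};

static void comp_exit(struct comp_state *s)
{
        /* no "if (s->tfms)" guard needed: free(NULL) is a defined no-op */
        free(s->tfms);
        s->tfms = NULL;
}

int main(void)
{
        struct comp_state s = { .tfms = NULL };

        comp_exit(&s);          /* safe even though nothing was allocated */
        s.tfms = malloc(64);
        comp_exit(&s);          /* and safe after a real allocation */
        return 0;
}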
@@ -206,7 +205,7 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; | |||
206 | **********************************/ | 205 | **********************************/ |
207 | static struct kmem_cache *zswap_entry_cache; | 206 | static struct kmem_cache *zswap_entry_cache; |
208 | 207 | ||
209 | static int zswap_entry_cache_create(void) | 208 | static int __init zswap_entry_cache_create(void) |
210 | { | 209 | { |
211 | zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); | 210 | zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); |
212 | return zswap_entry_cache == NULL; | 211 | return zswap_entry_cache == NULL; |
@@ -389,7 +388,7 @@ static struct notifier_block zswap_cpu_notifier_block = { | |||
389 | .notifier_call = zswap_cpu_notifier | 388 | .notifier_call = zswap_cpu_notifier |
390 | }; | 389 | }; |
391 | 390 | ||
392 | static int zswap_cpu_init(void) | 391 | static int __init zswap_cpu_init(void) |
393 | { | 392 | { |
394 | unsigned long cpu; | 393 | unsigned long cpu; |
395 | 394 | ||