author     Linus Torvalds <torvalds@linux-foundation.org>   2015-02-10 19:45:56 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2015-02-10 19:45:56 -0500
commit     992de5a8eca7cbd3215e3eb2c439b2c11582a58b (patch)
tree       863988f84c1dd57a02fa337ecbce49263a3b9511 /mm
parent     b2718bffb4088faf13092db30c1ebf088ddee52e (diff)
parent     d5b3cf7139b8770af4ed8bb36a1ab9d290ac39e9 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:
 "Bite-sized chunks this time, to avoid the MTA ratelimiting woes.

  - fs/notify updates

  - ocfs2

  - some of MM"

That laconic "some MM" is mainly the removal of remap_file_pages(), which
is a big simplification of the VM, and which gets rid of a *lot* of random
cruft and special cases because we no longer support the non-linear
mappings that it used.

From a user interface perspective, nothing has changed, because the
remap_file_pages() syscall still exists, it's just done by emulating the
old behavior by creating a lot of individual small mappings instead of one
non-linear one.

The emulation is slower than the old "native" non-linear mappings, but
nobody really uses or cares about remap_file_pages(), and simplifying the
VM is a big advantage.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (78 commits)
  memcg: zap memcg_slab_caches and memcg_slab_mutex
  memcg: zap memcg_name argument of memcg_create_kmem_cache
  memcg: zap __memcg_{charge,uncharge}_slab
  mm/page_alloc.c: place zone_id check before VM_BUG_ON_PAGE check
  mm: hugetlb: fix type of hugetlb_treat_as_movable variable
  mm, hugetlb: remove unnecessary lower bound on sysctl handlers"?
  mm: memory: merge shared-writable dirtying branches in do_wp_page()
  mm: memory: remove ->vm_file check on shared writable vmas
  xtensa: drop _PAGE_FILE and pte_file()-related helpers
  x86: drop _PAGE_FILE and pte_file()-related helpers
  unicore32: drop pte_file()-related helpers
  um: drop _PAGE_FILE and pte_file()-related helpers
  tile: drop pte_file()-related helpers
  sparc: drop pte_file()-related helpers
  sh: drop _PAGE_FILE and pte_file()-related helpers
  score: drop _PAGE_FILE and pte_file()-related helpers
  s390: drop pte_file()-related helpers
  parisc: drop _PAGE_FILE and pte_file()-related helpers
  openrisc: drop _PAGE_FILE and pte_file()-related helpers
  nios2: drop _PAGE_FILE and pte_file()-related helpers
  ...
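Since the syscall survives only as an emulation, the user-visible difference is easiest to see from user space. The following is a minimal, hypothetical C sketch, not code from this merge: the backing file name, window size and page offsets are invented for illustration. It contrasts what the old native call did (rewire page tables inside one non-linear vma) with what the emulation now effectively produces (each rewired page gets its own small linear mapping).

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	int fd = open("datafile", O_RDWR);	/* hypothetical backing file */
	if (fd < 0)
		return 1;

	/* One linear VM_SHARED window over the first four file pages. */
	char *win = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);
	if (win == MAP_FAILED)
		return 1;

	/*
	 * Old native behaviour: rewire the first page of the window to file
	 * page 3; the kernel kept a single non-linear vma and only changed
	 * page table entries.
	 */
	remap_file_pages(win, psz, 0, 3, 0);

	/*
	 * Roughly what the emulation amounts to: a small MAP_FIXED mapping
	 * over the same page, so the rewired page now has its own ordinary
	 * linear vma pointing at file offset 3 * psz.
	 */
	if (mmap(win, psz, PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_FIXED, fd, 3 * psz) == MAP_FAILED)
		return 1;

	close(fd);
	return 0;
}

Because every such small mapping needs its own vma, a workload that used to scatter many pages through one non-linear vma now carries many vmas instead, which is the slowdown the commit message above refers to.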
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile            2
-rw-r--r--  mm/debug.c             1
-rw-r--r--  mm/filemap.c           1
-rw-r--r--  mm/filemap_xip.c       1
-rw-r--r--  mm/fremap.c          283
-rw-r--r--  mm/gup.c               2
-rw-r--r--  mm/hugetlb.c           2
-rw-r--r--  mm/interval_tree.c    34
-rw-r--r--  mm/ksm.c               2
-rw-r--r--  mm/madvise.c          13
-rw-r--r--  mm/memcontrol.c      187
-rw-r--r--  mm/memory.c          276
-rw-r--r--  mm/migrate.c          32
-rw-r--r--  mm/mincore.c           9
-rw-r--r--  mm/mmap.c             93
-rw-r--r--  mm/mprotect.c          2
-rw-r--r--  mm/mremap.c            2
-rw-r--r--  mm/msync.c             5
-rw-r--r--  mm/nommu.c             8
-rw-r--r--  mm/page_alloc.c        8
-rw-r--r--  mm/rmap.c            225
-rw-r--r--  mm/shmem.c             1
-rw-r--r--  mm/slab.h              4
-rw-r--r--  mm/slab_common.c     151
-rw-r--r--  mm/slub.c             37
-rw-r--r--  mm/swap.c              4
-rw-r--r--  mm/vmstat.c          124
27 files changed, 411 insertions, 1098 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 4bf586e66378..3548460ab7b6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -3,7 +3,7 @@
 #

 mmu-y			:= nommu.o
-mmu-$(CONFIG_MMU)	:= fremap.o gup.o highmem.o memory.o mincore.o \
+mmu-$(CONFIG_MMU)	:= gup.o highmem.o memory.o mincore.o \
			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
			   vmalloc.o pagewalk.o pgtable-generic.o

diff --git a/mm/debug.c b/mm/debug.c
index 0e58f3211f89..d69cb5a7ba9a 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -130,7 +130,6 @@ static const struct trace_print_flags vmaflags_names[] = {
 	{VM_ACCOUNT,	"account"	},
 	{VM_NORESERVE,	"noreserve"	},
 	{VM_HUGETLB,	"hugetlb"	},
-	{VM_NONLINEAR,	"nonlinear"	},
 #if defined(CONFIG_X86)
 	{VM_PAT,	"pat"		},
 #elif defined(CONFIG_PPC)
diff --git a/mm/filemap.c b/mm/filemap.c
index 673e4581a2e5..bf7a27142704 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2087,7 +2087,6 @@ const struct vm_operations_struct generic_file_vm_ops = {
 	.fault		= filemap_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= filemap_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };

 /* This is used for a general mmap of a disk file */
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 0d105aeff82f..70c09da1a419 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -301,7 +301,6 @@ out:
 static const struct vm_operations_struct xip_file_vm_ops = {
 	.fault	= xip_file_fault,
 	.page_mkwrite	= filemap_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };

 int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
diff --git a/mm/fremap.c b/mm/fremap.c
deleted file mode 100644
index 2805d71cf476..000000000000
--- a/mm/fremap.c
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * linux/mm/fremap.c
- *
- * Explicit pagetable population and nonlinear (random) mappings support.
- *
- * started by Ingo Molnar, Copyright (C) 2002, 2003
- */
-#include <linux/export.h>
-#include <linux/backing-dev.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/file.h>
-#include <linux/mman.h>
-#include <linux/pagemap.h>
-#include <linux/swapops.h>
-#include <linux/rmap.h>
-#include <linux/syscalls.h>
-#include <linux/mmu_notifier.h>
-
-#include <asm/mmu_context.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-
-#include "internal.h"
-
-static int mm_counter(struct page *page)
-{
-	return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES;
-}
-
-static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-	struct page *page;
-	swp_entry_t entry;
-
-	if (pte_present(pte)) {
-		flush_cache_page(vma, addr, pte_pfn(pte));
-		pte = ptep_clear_flush_notify(vma, addr, ptep);
-		page = vm_normal_page(vma, addr, pte);
-		if (page) {
-			if (pte_dirty(pte))
-				set_page_dirty(page);
-			update_hiwater_rss(mm);
-			dec_mm_counter(mm, mm_counter(page));
-			page_remove_rmap(page);
-			page_cache_release(page);
-		}
-	} else {	/* zap_pte() is not called when pte_none() */
-		if (!pte_file(pte)) {
-			update_hiwater_rss(mm);
-			entry = pte_to_swp_entry(pte);
-			if (non_swap_entry(entry)) {
-				if (is_migration_entry(entry)) {
-					page = migration_entry_to_page(entry);
-					dec_mm_counter(mm, mm_counter(page));
-				}
-			} else {
-				free_swap_and_cache(entry);
-				dec_mm_counter(mm, MM_SWAPENTS);
-			}
-		}
-		pte_clear_not_present_full(mm, addr, ptep, 0);
-	}
-}
-
-/*
- * Install a file pte to a given virtual memory address, release any
- * previously existing mapping.
- */
-static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long addr, unsigned long pgoff, pgprot_t prot)
-{
-	int err = -ENOMEM;
-	pte_t *pte, ptfile;
-	spinlock_t *ptl;
-
-	pte = get_locked_pte(mm, addr, &ptl);
-	if (!pte)
-		goto out;
-
-	ptfile = pgoff_to_pte(pgoff);
-
-	if (!pte_none(*pte))
-		zap_pte(mm, vma, addr, pte);
-
-	set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile));
-	/*
-	 * We don't need to run update_mmu_cache() here because the "file pte"
-	 * being installed by install_file_pte() is not a real pte - it's a
-	 * non-present entry (like a swap entry), noting what file offset should
-	 * be mapped there when there's a fault (in a non-linear vma where
-	 * that's not obvious).
-	 */
-	pte_unmap_unlock(pte, ptl);
-	err = 0;
-out:
-	return err;
-}
-
-int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
-			     unsigned long size, pgoff_t pgoff)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	int err;
-
-	do {
-		err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot);
-		if (err)
-			return err;
-
-		size -= PAGE_SIZE;
-		addr += PAGE_SIZE;
-		pgoff++;
-	} while (size);
-
-	return 0;
-}
-EXPORT_SYMBOL(generic_file_remap_pages);
-
-/**
- * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
- * @start: start of the remapped virtual memory range
- * @size: size of the remapped virtual memory range
- * @prot: new protection bits of the range (see NOTE)
- * @pgoff: to-be-mapped page of the backing store file
- * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
- *
- * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma
- * (shared backing store file).
- *
- * This syscall works purely via pagetables, so it's the most efficient
- * way to map the same (large) file into a given virtual window. Unlike
- * mmap()/mremap() it does not create any new vmas. The new mappings are
- * also safe across swapout.
- *
- * NOTE: the @prot parameter right now is ignored (but must be zero),
- * and the vma's default protection is used. Arbitrary protections
- * might be implemented in the future.
- */
-SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
-		unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
-{
-	struct mm_struct *mm = current->mm;
-	struct address_space *mapping;
-	struct vm_area_struct *vma;
-	int err = -EINVAL;
-	int has_write_lock = 0;
-	vm_flags_t vm_flags = 0;
-
-	pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
-			"See Documentation/vm/remap_file_pages.txt.\n",
-			current->comm, current->pid);
-
-	if (prot)
-		return err;
-	/*
-	 * Sanitize the syscall parameters:
-	 */
-	start = start & PAGE_MASK;
-	size = size & PAGE_MASK;
-
-	/* Does the address range wrap, or is the span zero-sized? */
-	if (start + size <= start)
-		return err;
-
-	/* Does pgoff wrap? */
-	if (pgoff + (size >> PAGE_SHIFT) < pgoff)
-		return err;
-
-	/* Can we represent this offset inside this architecture's pte's? */
-#if PTE_FILE_MAX_BITS < BITS_PER_LONG
-	if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
-		return err;
-#endif
-
-	/* We need down_write() to change vma->vm_flags. */
-	down_read(&mm->mmap_sem);
- retry:
-	vma = find_vma(mm, start);
-
-	/*
-	 * Make sure the vma is shared, that it supports prefaulting,
-	 * and that the remapped range is valid and fully within
-	 * the single existing vma.
-	 */
-	if (!vma || !(vma->vm_flags & VM_SHARED))
-		goto out;
-
-	if (!vma->vm_ops || !vma->vm_ops->remap_pages)
-		goto out;
-
-	if (start < vma->vm_start || start + size > vma->vm_end)
-		goto out;
-
-	/* Must set VM_NONLINEAR before any pages are populated. */
-	if (!(vma->vm_flags & VM_NONLINEAR)) {
-		/*
-		 * vm_private_data is used as a swapout cursor
-		 * in a VM_NONLINEAR vma.
-		 */
-		if (vma->vm_private_data)
-			goto out;
-
-		/* Don't need a nonlinear mapping, exit success */
-		if (pgoff == linear_page_index(vma, start)) {
-			err = 0;
-			goto out;
-		}
-
-		if (!has_write_lock) {
-get_write_lock:
-			up_read(&mm->mmap_sem);
-			down_write(&mm->mmap_sem);
-			has_write_lock = 1;
-			goto retry;
-		}
-		mapping = vma->vm_file->f_mapping;
-		/*
-		 * page_mkclean doesn't work on nonlinear vmas, so if
-		 * dirty pages need to be accounted, emulate with linear
-		 * vmas.
-		 */
-		if (mapping_cap_account_dirty(mapping)) {
-			unsigned long addr;
-			struct file *file = get_file(vma->vm_file);
-			/* mmap_region may free vma; grab the info now */
-			vm_flags = vma->vm_flags;
-
-			addr = mmap_region(file, start, size, vm_flags, pgoff);
-			fput(file);
-			if (IS_ERR_VALUE(addr)) {
-				err = addr;
-			} else {
-				BUG_ON(addr != start);
-				err = 0;
-			}
-			goto out_freed;
-		}
-		i_mmap_lock_write(mapping);
-		flush_dcache_mmap_lock(mapping);
-		vma->vm_flags |= VM_NONLINEAR;
-		vma_interval_tree_remove(vma, &mapping->i_mmap);
-		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
-		flush_dcache_mmap_unlock(mapping);
-		i_mmap_unlock_write(mapping);
-	}
-
-	if (vma->vm_flags & VM_LOCKED) {
-		/*
-		 * drop PG_Mlocked flag for over-mapped range
-		 */
-		if (!has_write_lock)
-			goto get_write_lock;
-		vm_flags = vma->vm_flags;
-		munlock_vma_pages_range(vma, start, start + size);
-		vma->vm_flags = vm_flags;
-	}
-
-	mmu_notifier_invalidate_range_start(mm, start, start + size);
-	err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
-	mmu_notifier_invalidate_range_end(mm, start, start + size);
-
-	/*
-	 * We can't clear VM_NONLINEAR because we'd have to do
-	 * it after ->populate completes, and that would prevent
-	 * downgrading the lock. (Locks can't be upgraded).
-	 */
-
-out:
-	if (vma)
-		vm_flags = vma->vm_flags;
-out_freed:
-	if (likely(!has_write_lock))
-		up_read(&mm->mmap_sem);
-	else
-		up_write(&mm->mmap_sem);
-	if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
-		mm_populate(start, size);
-
-	return err;
-}
diff --git a/mm/gup.c b/mm/gup.c
index 8dd50ce6326f..12bc2bc33da7 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -55,7 +55,7 @@ retry:
 	 */
 	if (likely(!(flags & FOLL_MIGRATION)))
 		goto no_page;
-	if (pte_none(pte) || pte_file(pte))
+	if (pte_none(pte))
 		goto no_page;
 	entry = pte_to_swp_entry(pte);
 	if (!is_migration_entry(entry))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 85032de5e20f..be0e5d0db5ec 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -35,7 +35,7 @@
 #include <linux/node.h>
 #include "internal.h"

-unsigned long hugepages_treat_as_movable;
+int hugepages_treat_as_movable;

 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 8da581fa9060..f2c2492681bf 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -21,8 +21,8 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
 	return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
 }

-INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
-		     unsigned long, shared.linear.rb_subtree_last,
+INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
+		     unsigned long, shared.rb_subtree_last,
		     vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)

 /* Insert node immediately after prev in the interval tree */
@@ -36,26 +36,26 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,

 	VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);

-	if (!prev->shared.linear.rb.rb_right) {
+	if (!prev->shared.rb.rb_right) {
 		parent = prev;
-		link = &prev->shared.linear.rb.rb_right;
+		link = &prev->shared.rb.rb_right;
 	} else {
-		parent = rb_entry(prev->shared.linear.rb.rb_right,
-				  struct vm_area_struct, shared.linear.rb);
-		if (parent->shared.linear.rb_subtree_last < last)
-			parent->shared.linear.rb_subtree_last = last;
-		while (parent->shared.linear.rb.rb_left) {
-			parent = rb_entry(parent->shared.linear.rb.rb_left,
-				struct vm_area_struct, shared.linear.rb);
-			if (parent->shared.linear.rb_subtree_last < last)
-				parent->shared.linear.rb_subtree_last = last;
+		parent = rb_entry(prev->shared.rb.rb_right,
+				  struct vm_area_struct, shared.rb);
+		if (parent->shared.rb_subtree_last < last)
+			parent->shared.rb_subtree_last = last;
+		while (parent->shared.rb.rb_left) {
+			parent = rb_entry(parent->shared.rb.rb_left,
+				struct vm_area_struct, shared.rb);
+			if (parent->shared.rb_subtree_last < last)
+				parent->shared.rb_subtree_last = last;
 		}
-		link = &parent->shared.linear.rb.rb_left;
+		link = &parent->shared.rb.rb_left;
 	}

-	node->shared.linear.rb_subtree_last = last;
-	rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
-	rb_insert_augmented(&node->shared.linear.rb, root,
+	node->shared.rb_subtree_last = last;
+	rb_link_node(&node->shared.rb, &parent->shared.rb, link);
+	rb_insert_augmented(&node->shared.rb, root,
			    &vma_interval_tree_augment);
 }

diff --git a/mm/ksm.c b/mm/ksm.c
index 15647fb0394f..4162dce2eb44 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1748,7 +1748,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		 */
 		if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
				 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
-				 VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
+				 VM_HUGETLB | VM_MIXEDMAP))
 			return 0;	/* just ignore the advice */

 #ifdef VM_SAO
diff --git a/mm/madvise.c b/mm/madvise.c
index a271adc93289..d79fb5e8f80a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -155,7 +155,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
 		pte_unmap_unlock(orig_pte, ptl);

-		if (pte_present(pte) || pte_none(pte) || pte_file(pte))
+		if (pte_present(pte) || pte_none(pte))
 			continue;
 		entry = pte_to_swp_entry(pte);
 		if (unlikely(non_swap_entry(entry)))
@@ -278,14 +278,7 @@ static long madvise_dontneed(struct vm_area_struct *vma,
 	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
 		return -EINVAL;

-	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
-		struct zap_details details = {
-			.nonlinear_vma = vma,
-			.last_index = ULONG_MAX,
-		};
-		zap_page_range(vma, start, end - start, &details);
-	} else
-		zap_page_range(vma, start, end - start, NULL);
+	zap_page_range(vma, start, end - start, NULL);
 	return 0;
 }

@@ -303,7 +296,7 @@ static long madvise_remove(struct vm_area_struct *vma,

 	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

-	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
+	if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
 		return -EINVAL;

 	f = vma->vm_file;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2f6893c2f01b..f3f8a4f52a0c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -343,9 +343,6 @@ struct mem_cgroup {
 	struct cg_proto tcp_mem;
 #endif
 #if defined(CONFIG_MEMCG_KMEM)
-	/* analogous to slab_common's slab_caches list, but per-memcg;
-	 * protected by memcg_slab_mutex */
-	struct list_head memcg_slab_caches;
 	/* Index in the kmem_cache->memcg_params->memcg_caches array */
 	int kmemcg_id;
 #endif
@@ -2476,27 +2473,8 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
 }

 #ifdef CONFIG_MEMCG_KMEM
-/*
- * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
- * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
- */
-static DEFINE_MUTEX(memcg_slab_mutex);
-
-/*
- * This is a bit cumbersome, but it is rarely used and avoids a backpointer
- * in the memcg_cache_params struct.
- */
-static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
-{
-	struct kmem_cache *cachep;
-
-	VM_BUG_ON(p->is_root_cache);
-	cachep = p->root_cache;
-	return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
-}
-
-static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
-			     unsigned long nr_pages)
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
+		      unsigned long nr_pages)
 {
 	struct page_counter *counter;
 	int ret = 0;
@@ -2533,8 +2511,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
 	return ret;
 }

-static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
-				unsigned long nr_pages)
+void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
 {
 	page_counter_uncharge(&memcg->memory, nr_pages);
 	if (do_swap_account)
@@ -2579,10 +2556,7 @@ static int memcg_alloc_cache_id(void)
 	else if (size > MEMCG_CACHES_MAX_SIZE)
 		size = MEMCG_CACHES_MAX_SIZE;

-	mutex_lock(&memcg_slab_mutex);
 	err = memcg_update_all_caches(size);
-	mutex_unlock(&memcg_slab_mutex);
-
 	if (err) {
 		ida_simple_remove(&kmem_limited_groups, id);
 		return err;
@@ -2605,123 +2579,20 @@ void memcg_update_array_size(int num)
 	memcg_limited_groups_array_size = num;
 }

-static void memcg_register_cache(struct mem_cgroup *memcg,
-				 struct kmem_cache *root_cache)
-{
-	static char memcg_name_buf[NAME_MAX + 1]; /* protected by
-						     memcg_slab_mutex */
-	struct kmem_cache *cachep;
-	int id;
-
-	lockdep_assert_held(&memcg_slab_mutex);
-
-	id = memcg_cache_id(memcg);
-
-	/*
-	 * Since per-memcg caches are created asynchronously on first
-	 * allocation (see memcg_kmem_get_cache()), several threads can try to
-	 * create the same cache, but only one of them may succeed.
-	 */
-	if (cache_from_memcg_idx(root_cache, id))
-		return;
-
-	cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
-	cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
-	/*
-	 * If we could not create a memcg cache, do not complain, because
-	 * that's not critical at all as we can always proceed with the root
-	 * cache.
-	 */
-	if (!cachep)
-		return;
-
-	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
-
-	/*
-	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
-	 * barrier here to ensure nobody will see the kmem_cache partially
-	 * initialized.
-	 */
-	smp_wmb();
-
-	BUG_ON(root_cache->memcg_params->memcg_caches[id]);
-	root_cache->memcg_params->memcg_caches[id] = cachep;
-}
-
-static void memcg_unregister_cache(struct kmem_cache *cachep)
-{
-	struct kmem_cache *root_cache;
-	struct mem_cgroup *memcg;
-	int id;
-
-	lockdep_assert_held(&memcg_slab_mutex);
-
-	BUG_ON(is_root_cache(cachep));
-
-	root_cache = cachep->memcg_params->root_cache;
-	memcg = cachep->memcg_params->memcg;
-	id = memcg_cache_id(memcg);
-
-	BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
-	root_cache->memcg_params->memcg_caches[id] = NULL;
-
-	list_del(&cachep->memcg_params->list);
-
-	kmem_cache_destroy(cachep);
-}
-
-int __memcg_cleanup_cache_params(struct kmem_cache *s)
-{
-	struct kmem_cache *c;
-	int i, failed = 0;
-
-	mutex_lock(&memcg_slab_mutex);
-	for_each_memcg_cache_index(i) {
-		c = cache_from_memcg_idx(s, i);
-		if (!c)
-			continue;
-
-		memcg_unregister_cache(c);
-
-		if (cache_from_memcg_idx(s, i))
-			failed++;
-	}
-	mutex_unlock(&memcg_slab_mutex);
-	return failed;
-}
-
-static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
-{
-	struct kmem_cache *cachep;
-	struct memcg_cache_params *params, *tmp;
-
-	if (!memcg_kmem_is_active(memcg))
-		return;
-
-	mutex_lock(&memcg_slab_mutex);
-	list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
-		cachep = memcg_params_to_cache(params);
-		memcg_unregister_cache(cachep);
-	}
-	mutex_unlock(&memcg_slab_mutex);
-}
-
-struct memcg_register_cache_work {
+struct memcg_kmem_cache_create_work {
 	struct mem_cgroup *memcg;
 	struct kmem_cache *cachep;
 	struct work_struct work;
 };

-static void memcg_register_cache_func(struct work_struct *w)
+static void memcg_kmem_cache_create_func(struct work_struct *w)
 {
-	struct memcg_register_cache_work *cw =
-		container_of(w, struct memcg_register_cache_work, work);
+	struct memcg_kmem_cache_create_work *cw =
+		container_of(w, struct memcg_kmem_cache_create_work, work);
 	struct mem_cgroup *memcg = cw->memcg;
 	struct kmem_cache *cachep = cw->cachep;

-	mutex_lock(&memcg_slab_mutex);
-	memcg_register_cache(memcg, cachep);
-	mutex_unlock(&memcg_slab_mutex);
+	memcg_create_kmem_cache(memcg, cachep);

 	css_put(&memcg->css);
 	kfree(cw);
@@ -2730,10 +2601,10 @@ static void memcg_register_cache_func(struct work_struct *w)
 /*
  * Enqueue the creation of a per-memcg kmem_cache.
  */
-static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
+static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
					    struct kmem_cache *cachep)
 {
-	struct memcg_register_cache_work *cw;
+	struct memcg_kmem_cache_create_work *cw;

 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
 	if (!cw)
@@ -2743,18 +2614,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,

 	cw->memcg = memcg;
 	cw->cachep = cachep;
+	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);

-	INIT_WORK(&cw->work, memcg_register_cache_func);
 	schedule_work(&cw->work);
 }

-static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
+static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
					  struct kmem_cache *cachep)
 {
 	/*
 	 * We need to stop accounting when we kmalloc, because if the
 	 * corresponding kmalloc cache is not yet created, the first allocation
-	 * in __memcg_schedule_register_cache will recurse.
+	 * in __memcg_schedule_kmem_cache_create will recurse.
 	 *
 	 * However, it is better to enclose the whole function. Depending on
 	 * the debugging options enabled, INIT_WORK(), for instance, can
@@ -2763,24 +2634,10 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
 	 * the safest choice is to do it like this, wrapping the whole function.
 	 */
 	current->memcg_kmem_skip_account = 1;
-	__memcg_schedule_register_cache(memcg, cachep);
+	__memcg_schedule_kmem_cache_create(memcg, cachep);
 	current->memcg_kmem_skip_account = 0;
 }

-int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
-{
-	unsigned int nr_pages = 1 << order;
-
-	return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
-}
-
-void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
-{
-	unsigned int nr_pages = 1 << order;
-
-	memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
-}
-
 /*
  * Return the kmem_cache we're supposed to use for a slab allocation.
  * We try to use the current memcg's version of the cache.
@@ -2825,7 +2682,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
	 * could happen with the slab_mutex held. So it's better to
	 * defer everything.
	 */
-	memcg_schedule_register_cache(memcg, cachep);
+	memcg_schedule_kmem_cache_create(memcg, cachep);
 out:
 	css_put(&memcg->css);
 	return cachep;
@@ -4154,7 +4011,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)

 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
-	memcg_unregister_all_caches(memcg);
+	memcg_destroy_kmem_caches(memcg);
 	mem_cgroup_sockets_destroy(memcg);
 }
 #else
@@ -4682,7 +4539,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	spin_lock_init(&memcg->event_list_lock);
 #ifdef CONFIG_MEMCG_KMEM
 	memcg->kmemcg_id = -1;
-	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
 #endif

 	return &memcg->css;
@@ -4926,10 +4782,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 		return NULL;

 	mapping = vma->vm_file->f_mapping;
-	if (pte_none(ptent))
-		pgoff = linear_page_index(vma, addr);
-	else /* pte_file(ptent) is true */
-		pgoff = pte_to_pgoff(ptent);
+	pgoff = linear_page_index(vma, addr);

 	/* page is moved even if it's not RSS of this task(page-faulted). */
 #ifdef CONFIG_SWAP
@@ -4961,7 +4814,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		page = mc_handle_present_pte(vma, addr, ptent);
 	else if (is_swap_pte(ptent))
 		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
-	else if (pte_none(ptent) || pte_file(ptent))
+	else if (pte_none(ptent))
 		page = mc_handle_file_pte(vma, addr, ptent, &ent);

 	if (!page && !ent.val)
diff --git a/mm/memory.c b/mm/memory.c
index d707c4dfbbb4..d63849b5188f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -813,42 +813,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,

 	/* pte contains position in swap or file, so copy. */
 	if (unlikely(!pte_present(pte))) {
-		if (!pte_file(pte)) {
-			swp_entry_t entry = pte_to_swp_entry(pte);
-
-			if (likely(!non_swap_entry(entry))) {
-				if (swap_duplicate(entry) < 0)
-					return entry.val;
-
-				/* make sure dst_mm is on swapoff's mmlist. */
-				if (unlikely(list_empty(&dst_mm->mmlist))) {
-					spin_lock(&mmlist_lock);
-					if (list_empty(&dst_mm->mmlist))
-						list_add(&dst_mm->mmlist,
-							 &src_mm->mmlist);
-					spin_unlock(&mmlist_lock);
-				}
-				rss[MM_SWAPENTS]++;
-			} else if (is_migration_entry(entry)) {
-				page = migration_entry_to_page(entry);
-
-				if (PageAnon(page))
-					rss[MM_ANONPAGES]++;
-				else
-					rss[MM_FILEPAGES]++;
-
-				if (is_write_migration_entry(entry) &&
-				    is_cow_mapping(vm_flags)) {
-					/*
-					 * COW mappings require pages in both
-					 * parent and child to be set to read.
-					 */
-					make_migration_entry_read(&entry);
-					pte = swp_entry_to_pte(entry);
-					if (pte_swp_soft_dirty(*src_pte))
-						pte = pte_swp_mksoft_dirty(pte);
-					set_pte_at(src_mm, addr, src_pte, pte);
-				}
+		swp_entry_t entry = pte_to_swp_entry(pte);
+
+		if (likely(!non_swap_entry(entry))) {
+			if (swap_duplicate(entry) < 0)
+				return entry.val;
+
+			/* make sure dst_mm is on swapoff's mmlist. */
+			if (unlikely(list_empty(&dst_mm->mmlist))) {
+				spin_lock(&mmlist_lock);
+				if (list_empty(&dst_mm->mmlist))
+					list_add(&dst_mm->mmlist,
+							&src_mm->mmlist);
+				spin_unlock(&mmlist_lock);
+			}
+			rss[MM_SWAPENTS]++;
+		} else if (is_migration_entry(entry)) {
+			page = migration_entry_to_page(entry);
+
+			if (PageAnon(page))
+				rss[MM_ANONPAGES]++;
+			else
+				rss[MM_FILEPAGES]++;
+
+			if (is_write_migration_entry(entry) &&
+					is_cow_mapping(vm_flags)) {
+				/*
+				 * COW mappings require pages in both
+				 * parent and child to be set to read.
+				 */
+				make_migration_entry_read(&entry);
+				pte = swp_entry_to_pte(entry);
+				if (pte_swp_soft_dirty(*src_pte))
+					pte = pte_swp_mksoft_dirty(pte);
+				set_pte_at(src_mm, addr, src_pte, pte);
			}
 		}
 		goto out_set_pte;
@@ -1022,11 +1020,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
-	if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
-			       VM_PFNMAP | VM_MIXEDMAP))) {
-		if (!vma->anon_vma)
-			return 0;
-	}
+	if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
+			!vma->anon_vma)
+		return 0;

 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
@@ -1084,6 +1080,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	spinlock_t *ptl;
 	pte_t *start_pte;
 	pte_t *pte;
+	swp_entry_t entry;

 again:
 	init_rss_vec(rss);
@@ -1109,28 +1106,12 @@ again:
				if (details->check_mapping &&
				    details->check_mapping != page->mapping)
					continue;
-				/*
-				 * Each page->index must be checked when
-				 * invalidating or truncating nonlinear.
-				 */
-				if (details->nonlinear_vma &&
-				    (page->index < details->first_index ||
-				     page->index > details->last_index))
-					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;
-			if (unlikely(details) && details->nonlinear_vma
-			    && linear_page_index(details->nonlinear_vma,
-					addr) != page->index) {
-				pte_t ptfile = pgoff_to_pte(page->index);
-				if (pte_soft_dirty(ptent))
-					ptfile = pte_file_mksoft_dirty(ptfile);
-				set_pte_at(mm, addr, pte, ptfile);
-			}
			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
			else {
@@ -1153,33 +1134,25 @@ again:
			}
			continue;
		}
-		/*
-		 * If details->check_mapping, we leave swap entries;
-		 * if details->nonlinear_vma, we leave file entries.
-		 */
+		/* If details->check_mapping, we leave swap entries. */
		if (unlikely(details))
			continue;
-		if (pte_file(ptent)) {
-			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
-				print_bad_pte(vma, addr, ptent, NULL);
-		} else {
-			swp_entry_t entry = pte_to_swp_entry(ptent);

-			if (!non_swap_entry(entry))
-				rss[MM_SWAPENTS]--;
-			else if (is_migration_entry(entry)) {
-				struct page *page;
+		entry = pte_to_swp_entry(ptent);
+		if (!non_swap_entry(entry))
+			rss[MM_SWAPENTS]--;
+		else if (is_migration_entry(entry)) {
+			struct page *page;

			page = migration_entry_to_page(entry);

			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
			else
				rss[MM_FILEPAGES]--;
-			}
-			if (unlikely(!free_swap_and_cache(entry)))
-				print_bad_pte(vma, addr, ptent, NULL);
		}
+		if (unlikely(!free_swap_and_cache(entry)))
+			print_bad_pte(vma, addr, ptent, NULL);
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

@@ -1279,7 +1252,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
 	pgd_t *pgd;
 	unsigned long next;

-	if (details && !details->check_mapping && !details->nonlinear_vma)
+	if (details && !details->check_mapping)
		details = NULL;

 	BUG_ON(addr >= end);
@@ -1373,7 +1346,7 @@ void unmap_vmas(struct mmu_gather *tlb,
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
- * @details: details of nonlinear truncation or shared cache invalidation
+ * @details: details of shared cache invalidation
 *
 * Caller must protect the VMA list
 */
@@ -1399,7 +1372,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
- * @details: details of nonlinear truncation or shared cache invalidation
+ * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
@@ -1924,12 +1897,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 EXPORT_SYMBOL_GPL(apply_to_page_range);

 /*
- * handle_pte_fault chooses page fault handler according to an entry
- * which was read non-atomically. Before making any commitment, on
- * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
- * must check under lock before unmapping the pte and proceeding
- * (but do_wp_page is only called after already making such a check;
+ * handle_pte_fault chooses page fault handler according to an entry which was
+ * read non-atomically. Before making any commitment, on those architectures
+ * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
+ * parts, do_swap_page must check under lock before unmapping the pte and
+ * proceeding (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page can safely check later on).
 */
 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
@@ -2035,7 +2007,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t entry;
 	int ret = 0;
 	int page_mkwrite = 0;
-	struct page *dirty_page = NULL;
+	bool dirty_shared = false;
 	unsigned long mmun_start = 0;	/* For mmu_notifiers */
 	unsigned long mmun_end = 0;	/* For mmu_notifiers */
 	struct mem_cgroup *memcg;
@@ -2086,6 +2058,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unlock_page(old_page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
+		page_cache_get(old_page);
		/*
		 * Only catch write-faults on shared writable pages,
		 * read-only shared pages can get COWed by
@@ -2093,7 +2066,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		 */
		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
			int tmp;
-			page_cache_get(old_page);
+
			pte_unmap_unlock(page_table, ptl);
			tmp = do_page_mkwrite(vma, old_page, address);
			if (unlikely(!tmp || (tmp &
@@ -2113,11 +2086,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
				unlock_page(old_page);
				goto unlock;
			}
-
			page_mkwrite = 1;
		}
-		dirty_page = old_page;
-		get_page(dirty_page);
+
+		dirty_shared = true;

 reuse:
		/*
@@ -2136,20 +2108,20 @@ reuse:
	pte_unmap_unlock(page_table, ptl);
	ret |= VM_FAULT_WRITE;

-	if (!dirty_page)
-		return ret;
-
-	if (!page_mkwrite) {
+	if (dirty_shared) {
		struct address_space *mapping;
		int dirtied;

-		lock_page(dirty_page);
-		dirtied = set_page_dirty(dirty_page);
-		VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page);
-		mapping = dirty_page->mapping;
-		unlock_page(dirty_page);
+		if (!page_mkwrite)
+			lock_page(old_page);

-		if (dirtied && mapping) {
+		dirtied = set_page_dirty(old_page);
+		VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
+		mapping = old_page->mapping;
+		unlock_page(old_page);
+		page_cache_release(old_page);
+
+		if ((dirtied || page_mkwrite) && mapping) {
			/*
			 * Some device drivers do not set page.mapping
			 * but still dirty their pages
@@ -2157,25 +2129,9 @@ reuse:
			balance_dirty_pages_ratelimited(mapping);
		}

-		/* file_update_time outside page_lock */
-		if (vma->vm_file)
+		if (!page_mkwrite)
			file_update_time(vma->vm_file);
	}
-	put_page(dirty_page);
-	if (page_mkwrite) {
-		struct address_space *mapping = dirty_page->mapping;
-
-		set_page_dirty(dirty_page);
-		unlock_page(dirty_page);
-		page_cache_release(dirty_page);
-		if (mapping) {
-			/*
-			 * Some device drivers do not set page.mapping
-			 * but still dirty their pages
-			 */
-			balance_dirty_pages_ratelimited(mapping);
-		}
-	}

	return ret;
 }
@@ -2333,25 +2289,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
	}
 }

-static inline void unmap_mapping_range_list(struct list_head *head,
-					    struct zap_details *details)
-{
-	struct vm_area_struct *vma;
-
-	/*
-	 * In nonlinear VMAs there is no correspondence between virtual address
-	 * offset and file offset. So we must perform an exhaustive search
-	 * across *all* the pages in each nonlinear VMA, not just the pages
-	 * whose virtual address lies outside the file truncation point.
-	 */
-	list_for_each_entry(vma, head, shared.nonlinear) {
-		details->nonlinear_vma = vma;
-		unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
-	}
-}
-
 /**
- * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
+ * unmap_mapping_range - unmap the portion of all mmaps in the specified
+ * address_space corresponding to the specified page range in the underlying
+ * file.
+ *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file. This will be rounded down to a PAGE_SIZE
@@ -2380,7 +2322,6 @@ void unmap_mapping_range(struct address_space *mapping,
	}

	details.check_mapping = even_cows? NULL: mapping;
-	details.nonlinear_vma = NULL;
	details.first_index = hba;
	details.last_index = hba + hlen - 1;
	if (details.last_index < details.first_index)
@@ -2390,8 +2331,6 @@ void unmap_mapping_range(struct address_space *mapping,
	i_mmap_lock_write(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
		unmap_mapping_range_tree(&mapping->i_mmap, &details);
-	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
-		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
	i_mmap_unlock_write(mapping);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
@@ -2752,8 +2691,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
	entry = mk_pte(page, vma->vm_page_prot);
	if (write)
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-	else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
-		entry = pte_mksoft_dirty(entry);
	if (anon) {
		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
		page_add_new_anon_rmap(page, vma, address);
@@ -2888,8 +2825,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
	 * if page by the offset is not ready to be mapped (cold cache or
	 * something).
	 */
-	if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) &&
-	    fault_around_bytes >> PAGE_SHIFT > 1) {
+	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
		do_fault_around(vma, address, pte, pgoff, flags);
		if (!pte_same(*pte, orig_pte))
@@ -3021,8 +2957,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		balance_dirty_pages_ratelimited(mapping);
	}

-	/* file_update_time outside page_lock */
-	if (vma->vm_file && !vma->vm_ops->page_mkwrite)
+	if (!vma->vm_ops->page_mkwrite)
		file_update_time(vma->vm_file);

	return ret;
@@ -3034,7 +2969,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 * The mmap_sem may have been released depending on flags and our
 * return value. See filemap_fault() and __lock_page_or_retry().
 */
-static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
 {
@@ -3051,46 +2986,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }

-/*
- * Fault of a previously existing named mapping. Repopulate the pte
- * from the encoded file_pte if possible. This enables swappable
- * nonlinear vmas.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with pte unmapped and unlocked.
- * The mmap_sem may have been released depending on flags and our
- * return value. See filemap_fault() and __lock_page_or_retry().
- */
-static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		unsigned int flags, pte_t orig_pte)
-{
-	pgoff_t pgoff;
-
-	flags |= FAULT_FLAG_NONLINEAR;
-
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		return 0;
-
-	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
-		/*
-		 * Page table corrupted: show pte and kill process.
-		 */
-		print_bad_pte(vma, address, orig_pte, NULL);
-		return VM_FAULT_SIGBUS;
-	}
-
-	pgoff = pte_to_pgoff(orig_pte);
-	if (!(flags & FAULT_FLAG_WRITE))
-		return do_read_fault(mm, vma, address, pmd, pgoff, flags,
-				orig_pte);
-	if (!(vma->vm_flags & VM_SHARED))
-		return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
-				orig_pte);
-	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
-}
-
 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
				unsigned long addr, int page_nid,
				int *flags)
@@ -3218,15 +3113,12 @@ static int handle_pte_fault(struct mm_struct *mm,
		if (pte_none(entry)) {
			if (vma->vm_ops) {
				if (likely(vma->vm_ops->fault))
-					return do_linear_fault(mm, vma, address,
-						pte, pmd, flags, entry);
+					return do_fault(mm, vma, address, pte,
+							pmd, flags, entry);
			}
			return do_anonymous_page(mm, vma, address,
						 pte, pmd, flags);
		}
-		if (pte_file(entry))
-			return do_nonlinear_fault(mm, vma, address,
-					pte, pmd, flags, entry);
		return do_swap_page(mm, vma, address,
				    pte, pmd, flags, entry);
	}
diff --git a/mm/migrate.c b/mm/migrate.c
index 344cdf692fc8..6e284bcca8bb 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -179,37 +179,6 @@ out:
 }

 /*
- * Congratulations to trinity for discovering this bug.
- * mm/fremap.c's remap_file_pages() accepts any range within a single vma to
- * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then
- * replace the specified range by file ptes throughout (maybe populated after).
- * If page migration finds a page within that range, while it's still located
- * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem:
- * zap_pte() clears the temporary migration entry before mmap_sem is dropped.
- * But if the migrating page is in a part of the vma outside the range to be
- * remapped, then it will not be cleared, and remove_migration_ptes() needs to
- * deal with it. Fortunately, this part of the vma is of course still linear,
- * so we just need to use linear location on the nonlinear list.
- */
-static int remove_linear_migration_ptes_from_nonlinear(struct page *page,
-		struct address_space *mapping, void *arg)
-{
-	struct vm_area_struct *vma;
-	/* hugetlbfs does not support remap_pages, so no huge pgoff worries */
-	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-	unsigned long addr;
-
-	list_for_each_entry(vma,
-			&mapping->i_mmap_nonlinear, shared.nonlinear) {
-
-		addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-		if (addr >= vma->vm_start && addr < vma->vm_end)
-			remove_migration_pte(page, vma, addr, arg);
-	}
-	return SWAP_AGAIN;
-}
-
-/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
@@ -218,7 +187,6 @@ static void remove_migration_ptes(struct page *old, struct page *new)
 	struct rmap_walk_control rwc = {
		.rmap_one = remove_migration_pte,
		.arg = old,
-		.file_nonlinear = remove_linear_migration_ptes_from_nonlinear,
	};

 	rmap_walk(new, &rwc);
diff --git a/mm/mincore.c b/mm/mincore.c
index c8c528b36641..46527c023e0c 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -124,17 +124,13 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		pte_t pte = *ptep;
-		pgoff_t pgoff;
 
 		next = addr + PAGE_SIZE;
 		if (pte_none(pte))
 			mincore_unmapped_range(vma, addr, next, vec);
 		else if (pte_present(pte))
 			*vec = 1;
-		else if (pte_file(pte)) {
-			pgoff = pte_to_pgoff(pte);
-			*vec = mincore_page(vma->vm_file->f_mapping, pgoff);
-		} else { /* pte is a swap entry */
+		else { /* pte is a swap entry */
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
 			if (non_swap_entry(entry)) {
@@ -145,9 +141,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				*vec = 1;
 			} else {
 #ifdef CONFIG_SWAP
-				pgoff = entry.val;
 				*vec = mincore_page(swap_address_space(entry),
-						    pgoff);
+						    entry.val);
 #else
 				WARN_ON(1);
 				*vec = 1;
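
With pte_file() gone, a non-present pte seen by mincore_pte_range() can only be none or a swap entry, so the old file-pte branch simply disappears; the user-visible mincore(2) behaviour is unchanged. A minimal user-space sketch of probing residency (the file name and mapping size are assumptions for illustration, not part of this patch):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/mman.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        size_t len = 16 * page;                 /* assumed mapping size */
        int fd = open("data.bin", O_RDONLY);    /* hypothetical file */
        if (fd < 0)
            return 1;

        void *addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED)
            return 1;

        unsigned char vec[16];
        /* bit 0 of vec[i] is set iff page i is resident in the page cache */
        if (mincore(addr, len, vec) == 0)
            for (size_t i = 0; i < len / page; i++)
                printf("page %zu: %s\n", i, (vec[i] & 1) ? "resident" : "absent");

        munmap(addr, len);
        close(fd);
        return 0;
    }
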
diff --git a/mm/mmap.c b/mm/mmap.c
index 7f684d5a8087..14d84666e8ba 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -243,10 +243,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 	mapping_unmap_writable(mapping);
 
 	flush_dcache_mmap_lock(mapping);
-	if (unlikely(vma->vm_flags & VM_NONLINEAR))
-		list_del_init(&vma->shared.nonlinear);
-	else
-		vma_interval_tree_remove(vma, &mapping->i_mmap);
+	vma_interval_tree_remove(vma, &mapping->i_mmap);
 	flush_dcache_mmap_unlock(mapping);
 }
 
@@ -649,10 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
 		atomic_inc(&mapping->i_mmap_writable);
 
 		flush_dcache_mmap_lock(mapping);
-		if (unlikely(vma->vm_flags & VM_NONLINEAR))
-			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
-		else
-			vma_interval_tree_insert(vma, &mapping->i_mmap);
+		vma_interval_tree_insert(vma, &mapping->i_mmap);
 		flush_dcache_mmap_unlock(mapping);
 	}
 }
@@ -789,14 +783,11 @@ again: remove_next = 1 + (end > next->vm_end);
 
 	if (file) {
 		mapping = file->f_mapping;
-		if (!(vma->vm_flags & VM_NONLINEAR)) {
-			root = &mapping->i_mmap;
-			uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+		root = &mapping->i_mmap;
+		uprobe_munmap(vma, vma->vm_start, vma->vm_end);
 
-			if (adjust_next)
-				uprobe_munmap(next, next->vm_start,
-					      next->vm_end);
-		}
+		if (adjust_next)
+			uprobe_munmap(next, next->vm_start, next->vm_end);
 
 		i_mmap_lock_write(mapping);
 		if (insert) {
@@ -2634,6 +2625,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
 	return vm_munmap(addr, len);
 }
 
+
+/*
+ * Emulation of deprecated remap_file_pages() syscall.
+ */
+SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
+		unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
+{
+
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long populate = 0;
+	unsigned long ret = -EINVAL;
+	struct file *file;
+
+	pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
+			"See Documentation/vm/remap_file_pages.txt.\n",
+			current->comm, current->pid);
+
+	if (prot)
+		return ret;
+	start = start & PAGE_MASK;
+	size = size & PAGE_MASK;
+
+	if (start + size <= start)
+		return ret;
+
+	/* Does pgoff wrap? */
+	if (pgoff + (size >> PAGE_SHIFT) < pgoff)
+		return ret;
+
+	down_write(&mm->mmap_sem);
+	vma = find_vma(mm, start);
+
+	if (!vma || !(vma->vm_flags & VM_SHARED))
+		goto out;
+
+	if (start < vma->vm_start || start + size > vma->vm_end)
+		goto out;
+
+	if (pgoff == linear_page_index(vma, start)) {
+		ret = 0;
+		goto out;
+	}
+
+	prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
+	prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
+	prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
+
+	flags &= MAP_NONBLOCK;
+	flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
+	if (vma->vm_flags & VM_LOCKED) {
+		flags |= MAP_LOCKED;
+		/* drop PG_Mlocked flag for over-mapped range */
+		munlock_vma_pages_range(vma, start, start + size);
+	}
+
+	file = get_file(vma->vm_file);
+	ret = do_mmap_pgoff(vma->vm_file, start, size,
+			prot, flags, pgoff, &populate);
+	fput(file);
+out:
+	up_write(&mm->mmap_sem);
+	if (populate)
+		mm_populate(ret, populate);
+	if (!IS_ERR_VALUE(ret))
+		ret = 0;
+	return ret;
+}
+
 static inline void verify_mm_writelocked(struct mm_struct *mm)
 {
 #ifdef CONFIG_DEBUG_VM
@@ -3108,8 +3168,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  *
  * mmap_sem in write mode is required in order to block all operations
  * that could modify pagetables and free pages without need of
- * altering the vma layout (for example populate_range() with
- * nonlinear vmas). It's also needed in write mode to avoid new
+ * altering the vma layout. It's also needed in write mode to avoid new
  * anon_vmas to be associated with existing vmas.
  *
  * A single task can't take more than one mm_take_all_locks() in a row
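
For context, the emulation added above turns a nonlinear request into an ordinary MAP_FIXED mapping of the same file at the requested offset. A hedged user-space sketch of the deprecated call and its rough mmap() equivalent (the descriptor fd and the chosen page offsets are assumptions for illustration):

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <unistd.h>

    /* Assumes fd refers to a writable file at least four pages long. */
    static void remap_example(int fd)
    {
        long psz = sysconf(_SC_PAGESIZE);
        char *base = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
                          MAP_SHARED, fd, 0);
        if (base == MAP_FAILED)
            return;

        /* Deprecated: make the first window page show file page 3 ... */
        remap_file_pages(base, psz, 0, 3, 0);

        /* ... which the kernel now emulates as an ordinary fixed mapping: */
        mmap(base, psz, PROT_READ | PROT_WRITE,
             MAP_SHARED | MAP_FIXED | MAP_POPULATE, fd, 3 * psz);

        munmap(base, 4 * psz);
    }
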
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ace93454ce8e..33121662f08b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -105,7 +105,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			}
 			if (updated)
 				pages++;
-		} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
+		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
 
 			if (is_write_migration_entry(entry)) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 17fa018f5f39..57dadc025c64 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -81,8 +81,6 @@ static pte_t move_soft_dirty_pte(pte_t pte)
 		pte = pte_mksoft_dirty(pte);
 	else if (is_swap_pte(pte))
 		pte = pte_swp_mksoft_dirty(pte);
-	else if (pte_file(pte))
-		pte = pte_file_mksoft_dirty(pte);
 #endif
 	return pte;
 }
diff --git a/mm/msync.c b/mm/msync.c
index 992a1673d488..bb04d53ae852 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -86,10 +86,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
 		    (vma->vm_flags & VM_SHARED)) {
 			get_file(file);
 			up_read(&mm->mmap_sem);
-			if (vma->vm_flags & VM_NONLINEAR)
-				error = vfs_fsync(file, 1);
-			else
-				error = vfs_fsync_range(file, fstart, fend, 1);
+			error = vfs_fsync_range(file, fstart, fend, 1);
 			fput(file);
 			if (error || start >= end)
 				goto out;
diff --git a/mm/nommu.c b/mm/nommu.c
index 28bd8c4dff6f..541bed64e348 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1984,14 +1984,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 EXPORT_SYMBOL(filemap_map_pages);
 
-int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
-			     unsigned long size, pgoff_t pgoff)
-{
-	BUG();
-	return 0;
-}
-EXPORT_SYMBOL(generic_file_remap_pages);
-
 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
 			      unsigned long addr, void *buf, int len, int write)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8e20f9c2fa5a..f121050e8530 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -552,17 +552,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
 		return 0;
 
 	if (page_is_guard(buddy) && page_order(buddy) == order) {
-		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
-
 		if (page_zone_id(page) != page_zone_id(buddy))
 			return 0;
 
+		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
+
 		return 1;
 	}
 
 	if (PageBuddy(buddy) && page_order(buddy) == order) {
-		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
-
 		/*
 		 * zone check is done late to avoid uselessly
 		 * calculating zone/node ids for pages that could
@@ -571,6 +569,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
 		if (page_zone_id(page) != page_zone_id(buddy))
 			return 0;
 
+		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
+
 		return 1;
 	}
 	return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index 71cd5bd0c17d..70b32498d4f2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -590,9 +590,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 		if (!vma->anon_vma || !page__anon_vma ||
 		    vma->anon_vma->root != page__anon_vma->root)
 			return -EFAULT;
-	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
-		if (!vma->vm_file ||
-		    vma->vm_file->f_mapping != page->mapping)
+	} else if (page->mapping) {
+		if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
 			return -EFAULT;
 	} else
 		return -EFAULT;
@@ -1274,7 +1273,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		if (pte_soft_dirty(pteval))
 			swp_pte = pte_swp_mksoft_dirty(swp_pte);
 		set_pte_at(mm, address, pte, swp_pte);
-		BUG_ON(pte_file(*pte));
 	} else if (IS_ENABLED(CONFIG_MIGRATION) &&
 		   (flags & TTU_MIGRATION)) {
 		/* Establish migration entry for a file page */
@@ -1316,211 +1314,6 @@ out_mlock:
 	return ret;
 }
 
-/*
- * objrmap doesn't work for nonlinear VMAs because the assumption that
- * offset-into-file correlates with offset-into-virtual-addresses does not hold.
- * Consequently, given a particular page and its ->index, we cannot locate the
- * ptes which are mapping that page without an exhaustive linear search.
- *
- * So what this code does is a mini "virtual scan" of each nonlinear VMA which
- * maps the file to which the target page belongs. The ->vm_private_data field
- * holds the current cursor into that scan. Successive searches will circulate
- * around the vma's virtual address space.
- *
- * So as more replacement pressure is applied to the pages in a nonlinear VMA,
- * more scanning pressure is placed against them as well. Eventually pages
- * will become fully unmapped and are eligible for eviction.
- *
- * For very sparsely populated VMAs this is a little inefficient - chances are
- * there there won't be many ptes located within the scan cluster. In this case
- * maybe we could scan further - to the end of the pte page, perhaps.
- *
- * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
- * acquire it without blocking. If vma locked, mlock the pages in the cluster,
- * rather than unmapping them. If we encounter the "check_page" that vmscan is
- * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
- */
-#define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
-#define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
-
-static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
-		struct vm_area_struct *vma, struct page *check_page)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	pmd_t *pmd;
-	pte_t *pte;
-	pte_t pteval;
-	spinlock_t *ptl;
-	struct page *page;
-	unsigned long address;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
-	unsigned long end;
-	int ret = SWAP_AGAIN;
-	int locked_vma = 0;
-
-	address = (vma->vm_start + cursor) & CLUSTER_MASK;
-	end = address + CLUSTER_SIZE;
-	if (address < vma->vm_start)
-		address = vma->vm_start;
-	if (end > vma->vm_end)
-		end = vma->vm_end;
-
-	pmd = mm_find_pmd(mm, address);
-	if (!pmd)
-		return ret;
-
-	mmun_start = address;
-	mmun_end   = end;
-	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-
-	/*
-	 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
-	 * keep the sem while scanning the cluster for mlocking pages.
-	 */
-	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
-		locked_vma = (vma->vm_flags & VM_LOCKED);
-		if (!locked_vma)
-			up_read(&vma->vm_mm->mmap_sem); /* don't need it */
-	}
-
-	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-
-	/* Update high watermark before we lower rss */
-	update_hiwater_rss(mm);
-
-	for (; address < end; pte++, address += PAGE_SIZE) {
-		if (!pte_present(*pte))
-			continue;
-		page = vm_normal_page(vma, address, *pte);
-		BUG_ON(!page || PageAnon(page));
-
-		if (locked_vma) {
-			if (page == check_page) {
-				/* we know we have check_page locked */
-				mlock_vma_page(page);
-				ret = SWAP_MLOCK;
-			} else if (trylock_page(page)) {
-				/*
-				 * If we can lock the page, perform mlock.
-				 * Otherwise leave the page alone, it will be
-				 * eventually encountered again later.
-				 */
-				mlock_vma_page(page);
-				unlock_page(page);
-			}
-			continue;	/* don't unmap */
-		}
-
-		/*
-		 * No need for _notify because we're within an
-		 * mmu_notifier_invalidate_range_ {start|end} scope.
-		 */
-		if (ptep_clear_flush_young(vma, address, pte))
-			continue;
-
-		/* Nuke the page table entry. */
-		flush_cache_page(vma, address, pte_pfn(*pte));
-		pteval = ptep_clear_flush_notify(vma, address, pte);
-
-		/* If nonlinear, store the file page offset in the pte. */
-		if (page->index != linear_page_index(vma, address)) {
-			pte_t ptfile = pgoff_to_pte(page->index);
-			if (pte_soft_dirty(pteval))
-				ptfile = pte_file_mksoft_dirty(ptfile);
-			set_pte_at(mm, address, pte, ptfile);
-		}
-
-		/* Move the dirty bit to the physical page now the pte is gone. */
-		if (pte_dirty(pteval))
-			set_page_dirty(page);
-
-		page_remove_rmap(page);
-		page_cache_release(page);
-		dec_mm_counter(mm, MM_FILEPAGES);
-		(*mapcount)--;
-	}
-	pte_unmap_unlock(pte - 1, ptl);
-	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-	if (locked_vma)
-		up_read(&vma->vm_mm->mmap_sem);
-	return ret;
-}
-
-static int try_to_unmap_nonlinear(struct page *page,
-		struct address_space *mapping, void *arg)
-{
-	struct vm_area_struct *vma;
-	int ret = SWAP_AGAIN;
-	unsigned long cursor;
-	unsigned long max_nl_cursor = 0;
-	unsigned long max_nl_size = 0;
-	unsigned int mapcount;
-
-	list_for_each_entry(vma,
-		&mapping->i_mmap_nonlinear, shared.nonlinear) {
-
-		cursor = (unsigned long) vma->vm_private_data;
-		if (cursor > max_nl_cursor)
-			max_nl_cursor = cursor;
-		cursor = vma->vm_end - vma->vm_start;
-		if (cursor > max_nl_size)
-			max_nl_size = cursor;
-	}
-
-	if (max_nl_size == 0) {	/* all nonlinears locked or reserved ? */
-		return SWAP_FAIL;
-	}
-
-	/*
-	 * We don't try to search for this page in the nonlinear vmas,
-	 * and page_referenced wouldn't have found it anyway. Instead
-	 * just walk the nonlinear vmas trying to age and unmap some.
-	 * The mapcount of the page we came in with is irrelevant,
-	 * but even so use it as a guide to how hard we should try?
-	 */
-	mapcount = page_mapcount(page);
-	if (!mapcount)
-		return ret;
-
-	cond_resched();
-
-	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
-	if (max_nl_cursor == 0)
-		max_nl_cursor = CLUSTER_SIZE;
-
-	do {
-		list_for_each_entry(vma,
-			&mapping->i_mmap_nonlinear, shared.nonlinear) {
-
-			cursor = (unsigned long) vma->vm_private_data;
-			while (cursor < max_nl_cursor &&
-				cursor < vma->vm_end - vma->vm_start) {
-				if (try_to_unmap_cluster(cursor, &mapcount,
-						vma, page) == SWAP_MLOCK)
-					ret = SWAP_MLOCK;
-				cursor += CLUSTER_SIZE;
-				vma->vm_private_data = (void *) cursor;
-				if ((int)mapcount <= 0)
-					return ret;
-			}
-			vma->vm_private_data = (void *) max_nl_cursor;
-		}
-		cond_resched();
-		max_nl_cursor += CLUSTER_SIZE;
-	} while (max_nl_cursor <= max_nl_size);
-
-	/*
-	 * Don't loop forever (perhaps all the remaining pages are
-	 * in locked vmas). Reset cursor on all unreserved nonlinear
-	 * vmas, now forgetting on which ones it had fallen behind.
-	 */
-	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
-		vma->vm_private_data = NULL;
-
-	return ret;
-}
-
 bool is_vma_temporary_stack(struct vm_area_struct *vma)
 {
 	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -1566,7 +1359,6 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
 		.rmap_one = try_to_unmap_one,
 		.arg = (void *)flags,
 		.done = page_not_mapped,
-		.file_nonlinear = try_to_unmap_nonlinear,
 		.anon_lock = page_lock_anon_vma_read,
 	};
 
@@ -1612,12 +1404,6 @@ int try_to_munlock(struct page *page)
 		.rmap_one = try_to_unmap_one,
 		.arg = (void *)TTU_MUNLOCK,
 		.done = page_not_mapped,
-		/*
-		 * We don't bother to try to find the munlocked page in
-		 * nonlinears. It's costly. Instead, later, page reclaim logic
-		 * may call try_to_unmap() and recover PG_mlocked lazily.
-		 */
-		.file_nonlinear = NULL,
 		.anon_lock = page_lock_anon_vma_read,
 
 	};
@@ -1748,13 +1534,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 		goto done;
 	}
 
-	if (!rwc->file_nonlinear)
-		goto done;
-
-	if (list_empty(&mapping->i_mmap_nonlinear))
-		goto done;
-
-	ret = rwc->file_nonlinear(page, mapping, rwc->arg);
 done:
 	i_mmap_unlock_read(mapping);
 	return ret;
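
After these hunks rmap_walk_control carries only the rmap_one/done/anon_lock hooks; the file_nonlinear slot and its callers are gone. As a generic illustration of the same struct-of-callbacks walker pattern (invented names, not the kernel API):

    #include <stddef.h>

    struct walk_control {
        int  (*visit)(void *item, void *arg);   /* called for each mapping */
        int  (*done)(void *arg);                /* optional early-exit test */
        void *arg;
    };

    static int walk(void **items, size_t n, const struct walk_control *wc)
    {
        int ret = 0;

        for (size_t i = 0; i < n; i++) {
            ret = wc->visit(items[i], wc->arg);
            if (ret)
                break;
            if (wc->done && wc->done(wc->arg))
                break;
        }
        return ret;
    }
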
diff --git a/mm/shmem.c b/mm/shmem.c
index 993e6ba689cc..b3e403181981 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3201,7 +3201,6 @@ static const struct vm_operations_struct shmem_vm_ops = {
 	.set_policy     = shmem_set_policy,
 	.get_policy     = shmem_get_policy,
 #endif
-	.remap_pages	= generic_file_remap_pages,
 };
 
 static struct dentry *shmem_mount(struct file_system_type *fs_type,
diff --git a/mm/slab.h b/mm/slab.h
index 1cf4005482dd..90430d6f665e 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -235,7 +235,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s,
 		return 0;
 	if (is_root_cache(s))
 		return 0;
-	return __memcg_charge_slab(s, gfp, order);
+	return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order);
 }
 
 static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
@@ -244,7 +244,7 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
 		return;
 	if (is_root_cache(s))
 		return;
-	__memcg_uncharge_slab(s, order);
+	memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order);
 }
 #else
 static inline bool is_root_cache(struct kmem_cache *s)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e03dd6f2a272..6e1e4cf65836 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -331,7 +331,7 @@ out:
 
 out_free_cache:
 	memcg_free_cache_params(s);
-	kfree(s);
+	kmem_cache_free(kmem_cache, s);
 	goto out;
 }
 
@@ -425,21 +425,64 @@ out_unlock:
 }
 EXPORT_SYMBOL(kmem_cache_create);
 
+static int do_kmem_cache_shutdown(struct kmem_cache *s,
+		struct list_head *release, bool *need_rcu_barrier)
+{
+	if (__kmem_cache_shutdown(s) != 0) {
+		printk(KERN_ERR "kmem_cache_destroy %s: "
+		       "Slab cache still has objects\n", s->name);
+		dump_stack();
+		return -EBUSY;
+	}
+
+	if (s->flags & SLAB_DESTROY_BY_RCU)
+		*need_rcu_barrier = true;
+
+#ifdef CONFIG_MEMCG_KMEM
+	if (!is_root_cache(s)) {
+		struct kmem_cache *root_cache = s->memcg_params->root_cache;
+		int memcg_id = memcg_cache_id(s->memcg_params->memcg);
+
+		BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s);
+		root_cache->memcg_params->memcg_caches[memcg_id] = NULL;
+	}
+#endif
+	list_move(&s->list, release);
+	return 0;
+}
+
+static void do_kmem_cache_release(struct list_head *release,
+				  bool need_rcu_barrier)
+{
+	struct kmem_cache *s, *s2;
+
+	if (need_rcu_barrier)
+		rcu_barrier();
+
+	list_for_each_entry_safe(s, s2, release, list) {
+#ifdef SLAB_SUPPORTS_SYSFS
+		sysfs_slab_remove(s);
+#else
+		slab_kmem_cache_release(s);
+#endif
+	}
+}
+
 #ifdef CONFIG_MEMCG_KMEM
 /*
  * memcg_create_kmem_cache - Create a cache for a memory cgroup.
  * @memcg: The memory cgroup the new cache is for.
  * @root_cache: The parent of the new cache.
- * @memcg_name: The name of the memory cgroup (used for naming the new cache).
  *
  * This function attempts to create a kmem cache that will serve allocation
  * requests going from @memcg to @root_cache. The new cache inherits properties
  * from its parent.
  */
-struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
-					   struct kmem_cache *root_cache,
-					   const char *memcg_name)
+void memcg_create_kmem_cache(struct mem_cgroup *memcg,
+			     struct kmem_cache *root_cache)
 {
+	static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
+	int memcg_id = memcg_cache_id(memcg);
 	struct kmem_cache *s = NULL;
 	char *cache_name;
 
@@ -448,8 +491,18 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
 
 	mutex_lock(&slab_mutex);
 
+	/*
+	 * Since per-memcg caches are created asynchronously on first
+	 * allocation (see memcg_kmem_get_cache()), several threads can try to
+	 * create the same cache, but only one of them may succeed.
+	 */
+	if (cache_from_memcg_idx(root_cache, memcg_id))
+		goto out_unlock;
+
+	cgroup_name(mem_cgroup_css(memcg)->cgroup,
+		    memcg_name_buf, sizeof(memcg_name_buf));
 	cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
-			       memcg_cache_id(memcg), memcg_name);
+			       memcg_cache_id(memcg), memcg_name_buf);
 	if (!cache_name)
 		goto out_unlock;
 
@@ -457,49 +510,73 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
 				 root_cache->size, root_cache->align,
 				 root_cache->flags, root_cache->ctor,
 				 memcg, root_cache);
+	/*
+	 * If we could not create a memcg cache, do not complain, because
+	 * that's not critical at all as we can always proceed with the root
+	 * cache.
+	 */
 	if (IS_ERR(s)) {
 		kfree(cache_name);
-		s = NULL;
+		goto out_unlock;
 	}
 
+	/*
+	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
+	 * barrier here to ensure nobody will see the kmem_cache partially
+	 * initialized.
+	 */
+	smp_wmb();
+	root_cache->memcg_params->memcg_caches[memcg_id] = s;
+
 out_unlock:
 	mutex_unlock(&slab_mutex);
 
 	put_online_mems();
 	put_online_cpus();
-
-	return s;
 }
 
-static int memcg_cleanup_cache_params(struct kmem_cache *s)
+void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
 {
-	int rc;
+	LIST_HEAD(release);
+	bool need_rcu_barrier = false;
+	struct kmem_cache *s, *s2;
 
-	if (!s->memcg_params ||
-	    !s->memcg_params->is_root_cache)
-		return 0;
+	get_online_cpus();
+	get_online_mems();
 
-	mutex_unlock(&slab_mutex);
-	rc = __memcg_cleanup_cache_params(s);
 	mutex_lock(&slab_mutex);
+	list_for_each_entry_safe(s, s2, &slab_caches, list) {
+		if (is_root_cache(s) || s->memcg_params->memcg != memcg)
+			continue;
+		/*
+		 * The cgroup is about to be freed and therefore has no charges
+		 * left. Hence, all its caches must be empty by now.
+		 */
+		BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier));
+	}
+	mutex_unlock(&slab_mutex);
 
-	return rc;
-}
-#else
-static int memcg_cleanup_cache_params(struct kmem_cache *s)
-{
-	return 0;
+	put_online_mems();
+	put_online_cpus();
+
+	do_kmem_cache_release(&release, need_rcu_barrier);
 }
 #endif /* CONFIG_MEMCG_KMEM */
 
 void slab_kmem_cache_release(struct kmem_cache *s)
 {
+	memcg_free_cache_params(s);
 	kfree(s->name);
 	kmem_cache_free(kmem_cache, s);
 }
 
 void kmem_cache_destroy(struct kmem_cache *s)
 {
+	int i;
+	LIST_HEAD(release);
+	bool need_rcu_barrier = false;
+	bool busy = false;
+
 	get_online_cpus();
 	get_online_mems();
 
@@ -509,35 +586,23 @@ void kmem_cache_destroy(struct kmem_cache *s)
 	if (s->refcount)
 		goto out_unlock;
 
-	if (memcg_cleanup_cache_params(s) != 0)
-		goto out_unlock;
+	for_each_memcg_cache_index(i) {
+		struct kmem_cache *c = cache_from_memcg_idx(s, i);
 
-	if (__kmem_cache_shutdown(s) != 0) {
-		printk(KERN_ERR "kmem_cache_destroy %s: "
-		       "Slab cache still has objects\n", s->name);
-		dump_stack();
-		goto out_unlock;
+		if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
+			busy = true;
 	}
 
-	list_del(&s->list);
-
-	mutex_unlock(&slab_mutex);
-	if (s->flags & SLAB_DESTROY_BY_RCU)
-		rcu_barrier();
-
-	memcg_free_cache_params(s);
-#ifdef SLAB_SUPPORTS_SYSFS
-	sysfs_slab_remove(s);
-#else
-	slab_kmem_cache_release(s);
-#endif
-	goto out;
+	if (!busy)
+		do_kmem_cache_shutdown(s, &release, &need_rcu_barrier);
 
 out_unlock:
 	mutex_unlock(&slab_mutex);
-out:
+
 	put_online_mems();
 	put_online_cpus();
+
+	do_kmem_cache_release(&release, need_rcu_barrier);
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
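
The new do_kmem_cache_shutdown()/do_kmem_cache_release() pair uses a common two-phase teardown: victims are unlinked onto a private list while the registry mutex is held, and the slow work (rcu_barrier(), sysfs removal) happens only after the mutex is dropped. A stand-alone sketch of that pattern with invented names, assuming pthreads:

    #include <pthread.h>
    #include <stdlib.h>

    struct cache {
        struct cache *next;
        void (*teardown)(struct cache *);   /* slow, must run unlocked */
    };

    static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct cache *registry;

    static void destroy_matching(int (*match)(struct cache *))
    {
        struct cache *release = NULL, *c, **pp;

        pthread_mutex_lock(&registry_lock);
        for (pp = &registry; (c = *pp) != NULL; ) {
            if (match(c)) {
                *pp = c->next;          /* unlink under the lock */
                c->next = release;      /* park on the private list */
                release = c;
            } else {
                pp = &c->next;
            }
        }
        pthread_mutex_unlock(&registry_lock);

        while ((c = release) != NULL) {     /* slow teardown, lock dropped */
            release = c->next;
            c->teardown(c);
            free(c);
        }
    }
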
diff --git a/mm/slub.c b/mm/slub.c
index fe376fe1f4fe..8b8508adf9c2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2398,13 +2398,24 @@ redo:
 	 * reading from one cpu area. That does not matter as long
 	 * as we end up on the original cpu again when doing the cmpxchg.
 	 *
-	 * Preemption is disabled for the retrieval of the tid because that
-	 * must occur from the current processor. We cannot allow rescheduling
-	 * on a different processor between the determination of the pointer
-	 * and the retrieval of the tid.
+	 * We should guarantee that tid and kmem_cache are retrieved on
+	 * the same cpu. It could be different if CONFIG_PREEMPT so we need
+	 * to check if it is matched or not.
 	 */
-	preempt_disable();
-	c = this_cpu_ptr(s->cpu_slab);
+	do {
+		tid = this_cpu_read(s->cpu_slab->tid);
+		c = raw_cpu_ptr(s->cpu_slab);
+	} while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
+
+	/*
+	 * Irqless object alloc/free algorithm used here depends on sequence
+	 * of fetching cpu_slab's data. tid should be fetched before anything
+	 * on c to guarantee that object and page associated with previous tid
+	 * won't be used with current tid. If we fetch tid first, object and
+	 * page could be one associated with next tid and our alloc/free
+	 * request will be failed. In this case, we will retry. So, no problem.
+	 */
+	barrier();
 
 	/*
 	 * The transaction ids are globally unique per cpu and per operation on
@@ -2412,8 +2423,6 @@ redo:
 	 * occurs on the right processor and that there was no operation on the
 	 * linked list in between.
 	 */
-	tid = c->tid;
-	preempt_enable();
 
 	object = c->freelist;
 	page = c->page;
@@ -2512,7 +2521,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
 #endif
 
 /*
- * Slow patch handling. This may still be called frequently since objects
+ * Slow path handling. This may still be called frequently since objects
  * have a longer lifetime than the cpu slabs in most processing loads.
  *
  * So we still attempt to reduce cache line usage. Just take the slab
@@ -2659,11 +2668,13 @@ redo:
 	 * data is retrieved via this pointer. If we are on the same cpu
 	 * during the cmpxchg then the free will succedd.
 	 */
-	preempt_disable();
-	c = this_cpu_ptr(s->cpu_slab);
+	do {
+		tid = this_cpu_read(s->cpu_slab->tid);
+		c = raw_cpu_ptr(s->cpu_slab);
+	} while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
 
-	tid = c->tid;
-	preempt_enable();
+	/* Same with comment on barrier() in slab_alloc_node() */
+	barrier();
 
 	if (likely(page == c->page)) {
 		set_freepointer(s, object, c->freelist);
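
The slub hunks replace preempt_disable()/preempt_enable() around the tid and cpu_slab reads with a fetch-and-recheck loop: read the per-cpu tid, read the per-cpu pointer, and retry if a migration between the two reads left them referring to different CPUs; the cmpxchg later in the fast path still validates the snapshot. A loose user-space sketch of the idiom (invented types, fixed-size per-CPU array as an assumption):

    #define _GNU_SOURCE
    #include <sched.h>

    struct cpu_slab {
        unsigned long tid;      /* unique per cpu, bumped on every operation */
        void *freelist;
    };

    static struct cpu_slab cpu_slabs[1024];     /* assumes CPU ids < 1024 */

    static struct cpu_slab *snapshot_cpu_slab(unsigned long *tid)
    {
        struct cpu_slab *c;

        do {
            *tid = cpu_slabs[sched_getcpu()].tid;
            c = &cpu_slabs[sched_getcpu()];
        } while (*tid != c->tid);   /* disagreement => migrated, retry */

        return c;
    }
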
diff --git a/mm/swap.c b/mm/swap.c
index 8a12b33936b4..5b3087228b99 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1140,10 +1140,8 @@ void __init swap_setup(void)
 
 	if (bdi_init(swapper_spaces[0].backing_dev_info))
 		panic("Failed to init swap bdi");
-	for (i = 0; i < MAX_SWAPFILES; i++) {
+	for (i = 0; i < MAX_SWAPFILES; i++)
 		spin_lock_init(&swapper_spaces[i].tree_lock);
-		INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
-	}
 #endif
 
 	/* Use a smaller cluster for small-memory machines */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1284f89fca08..9943e5fd74e6 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -17,6 +17,9 @@
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
 #include <linux/vmstat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
 #include <linux/sched.h>
 #include <linux/math64.h>
 #include <linux/writeback.h>
@@ -670,66 +673,6 @@ int fragmentation_index(struct zone *zone, unsigned int order)
 }
 #endif
 
-#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-
-static char * const migratetype_names[MIGRATE_TYPES] = {
-	"Unmovable",
-	"Reclaimable",
-	"Movable",
-	"Reserve",
-#ifdef CONFIG_CMA
-	"CMA",
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
-	"Isolate",
-#endif
-};
-
-static void *frag_start(struct seq_file *m, loff_t *pos)
-{
-	pg_data_t *pgdat;
-	loff_t node = *pos;
-	for (pgdat = first_online_pgdat();
-	     pgdat && node;
-	     pgdat = next_online_pgdat(pgdat))
-		--node;
-
-	return pgdat;
-}
-
-static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
-{
-	pg_data_t *pgdat = (pg_data_t *)arg;
-
-	(*pos)++;
-	return next_online_pgdat(pgdat);
-}
-
-static void frag_stop(struct seq_file *m, void *arg)
-{
-}
-
-/* Walk all the zones in a node and print using a callback */
-static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
-		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
-{
-	struct zone *zone;
-	struct zone *node_zones = pgdat->node_zones;
-	unsigned long flags;
-
-	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-		if (!populated_zone(zone))
-			continue;
-
-		spin_lock_irqsave(&zone->lock, flags);
-		print(m, pgdat, zone);
-		spin_unlock_irqrestore(&zone->lock, flags);
-	}
-}
-#endif
-
 #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
 #ifdef CONFIG_ZONE_DMA
 #define TEXT_FOR_DMA(xx) xx "_dma",
@@ -907,7 +850,66 @@ const char * const vmstat_text[] = {
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
 
 
+#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
+     defined(CONFIG_PROC_FS)
+static void *frag_start(struct seq_file *m, loff_t *pos)
+{
+	pg_data_t *pgdat;
+	loff_t node = *pos;
+
+	for (pgdat = first_online_pgdat();
+	     pgdat && node;
+	     pgdat = next_online_pgdat(pgdat))
+		--node;
+
+	return pgdat;
+}
+
+static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	pg_data_t *pgdat = (pg_data_t *)arg;
+
+	(*pos)++;
+	return next_online_pgdat(pgdat);
+}
+
+static void frag_stop(struct seq_file *m, void *arg)
+{
+}
+
+/* Walk all the zones in a node and print using a callback */
+static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
+		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
+{
+	struct zone *zone;
+	struct zone *node_zones = pgdat->node_zones;
+	unsigned long flags;
+
+	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+		if (!populated_zone(zone))
+			continue;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		print(m, pgdat, zone);
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+}
+#endif
+
 #ifdef CONFIG_PROC_FS
+static char * const migratetype_names[MIGRATE_TYPES] = {
+	"Unmovable",
+	"Reclaimable",
+	"Movable",
+	"Reserve",
+#ifdef CONFIG_CMA
+	"CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+	"Isolate",
+#endif
+};
+
 static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
 						struct zone *zone)
 {
@@ -1536,8 +1538,6 @@ static int __init setup_vmstat(void)
 module_init(setup_vmstat)
 
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
-#include <linux/debugfs.h>
-
 
 /*
  * Return an index indicating how much of the available free memory is