author    | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-10 19:45:56 -0500
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-10 19:45:56 -0500
commit    | 992de5a8eca7cbd3215e3eb2c439b2c11582a58b (patch)
tree      | 863988f84c1dd57a02fa337ecbce49263a3b9511 /mm
parent    | b2718bffb4088faf13092db30c1ebf088ddee52e (diff)
parent    | d5b3cf7139b8770af4ed8bb36a1ab9d290ac39e9 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:
"Bite-sized chunks this time, to avoid the MTA ratelimiting woes.
- fs/notify updates
- ocfs2
- some of MM"
That laconic "some MM" is mainly the removal of remap_file_pages(),
which is a big simplification of the VM, and which gets rid of a *lot*
of random cruft and special cases because we no longer support the
non-linear mappings that it used.
From a user interface perspective, nothing has changed, because the
remap_file_pages() syscall still exists, it's just done by emulating the
old behavior by creating a lot of individual small mappings instead of
one non-linear one.
The emulation is slower than the old "native" non-linear mappings, but
nobody really uses or cares about remap_file_pages(), and simplifying
the VM is a big advantage.
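To make the user-interface point concrete, below is a minimal userspace sketch of the remap_file_pages(2) call that keeps working across this change. The file path and sizes are invented for the example; prot must be 0, since the syscall has always reused the vma's own protection.

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t len = 4 * psz;

	/* Hypothetical data file, assumed to be at least four pages long. */
	int fd = open("/tmp/data.bin", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Ordinary shared, linear mapping of the file's first four pages. */
	char *win = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (win == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Make page 0 of the window show file page 3 instead.  prot must be
	 * 0 (the vma's protection is reused).  Before this merge the kernel
	 * rewrote ptes inside a single non-linear vma; after it, the same
	 * call is emulated with additional small linear mappings, so
	 * userspace sees identical behaviour, only slower.
	 */
	if (remap_file_pages(win, psz, 0, 3, 0) != 0)
		perror("remap_file_pages");

	printf("window[0] now shows file offset %ld\n", 3 * psz);

	munmap(win, len);
	close(fd);
	return 0;
}

On a kernel containing this merge the program behaves as before; the visible differences are that the call is slower and that it triggers the deprecated-syscall warning seen in the deleted mm/fremap.c further down.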
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (78 commits)
memcg: zap memcg_slab_caches and memcg_slab_mutex
memcg: zap memcg_name argument of memcg_create_kmem_cache
memcg: zap __memcg_{charge,uncharge}_slab
mm/page_alloc.c: place zone_id check before VM_BUG_ON_PAGE check
mm: hugetlb: fix type of hugetlb_treat_as_movable variable
mm, hugetlb: remove unnecessary lower bound on sysctl handlers"?
mm: memory: merge shared-writable dirtying branches in do_wp_page()
mm: memory: remove ->vm_file check on shared writable vmas
xtensa: drop _PAGE_FILE and pte_file()-related helpers
x86: drop _PAGE_FILE and pte_file()-related helpers
unicore32: drop pte_file()-related helpers
um: drop _PAGE_FILE and pte_file()-related helpers
tile: drop pte_file()-related helpers
sparc: drop pte_file()-related helpers
sh: drop _PAGE_FILE and pte_file()-related helpers
score: drop _PAGE_FILE and pte_file()-related helpers
s390: drop pte_file()-related helpers
parisc: drop _PAGE_FILE and pte_file()-related helpers
openrisc: drop _PAGE_FILE and pte_file()-related helpers
nios2: drop _PAGE_FILE and pte_file()-related helpers
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile        |   2
-rw-r--r-- | mm/debug.c         |   1
-rw-r--r-- | mm/filemap.c       |   1
-rw-r--r-- | mm/filemap_xip.c   |   1
-rw-r--r-- | mm/fremap.c        | 283
-rw-r--r-- | mm/gup.c           |   2
-rw-r--r-- | mm/hugetlb.c       |   2
-rw-r--r-- | mm/interval_tree.c |  34
-rw-r--r-- | mm/ksm.c           |   2
-rw-r--r-- | mm/madvise.c       |  13
-rw-r--r-- | mm/memcontrol.c    | 187
-rw-r--r-- | mm/memory.c        | 276
-rw-r--r-- | mm/migrate.c       |  32
-rw-r--r-- | mm/mincore.c       |   9
-rw-r--r-- | mm/mmap.c          |  93
-rw-r--r-- | mm/mprotect.c      |   2
-rw-r--r-- | mm/mremap.c        |   2
-rw-r--r-- | mm/msync.c         |   5
-rw-r--r-- | mm/nommu.c         |   8
-rw-r--r-- | mm/page_alloc.c    |   8
-rw-r--r-- | mm/rmap.c          | 225
-rw-r--r-- | mm/shmem.c         |   1
-rw-r--r-- | mm/slab.h          |   4
-rw-r--r-- | mm/slab_common.c   | 151
-rw-r--r-- | mm/slub.c          |  37
-rw-r--r-- | mm/swap.c          |   4
-rw-r--r-- | mm/vmstat.c        | 124
27 files changed, 411 insertions, 1098 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 4bf586e66378..3548460ab7b6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -3,7 +3,7 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | mmu-y := nommu.o | 5 | mmu-y := nommu.o |
6 | mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \ | 6 | mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ |
7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
8 | vmalloc.o pagewalk.o pgtable-generic.o | 8 | vmalloc.o pagewalk.o pgtable-generic.o |
9 | 9 | ||
diff --git a/mm/debug.c b/mm/debug.c
index 0e58f3211f89..d69cb5a7ba9a 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -130,7 +130,6 @@ static const struct trace_print_flags vmaflags_names[] = { | |||
130 | {VM_ACCOUNT, "account" }, | 130 | {VM_ACCOUNT, "account" }, |
131 | {VM_NORESERVE, "noreserve" }, | 131 | {VM_NORESERVE, "noreserve" }, |
132 | {VM_HUGETLB, "hugetlb" }, | 132 | {VM_HUGETLB, "hugetlb" }, |
133 | {VM_NONLINEAR, "nonlinear" }, | ||
134 | #if defined(CONFIG_X86) | 133 | #if defined(CONFIG_X86) |
135 | {VM_PAT, "pat" }, | 134 | {VM_PAT, "pat" }, |
136 | #elif defined(CONFIG_PPC) | 135 | #elif defined(CONFIG_PPC) |
diff --git a/mm/filemap.c b/mm/filemap.c
index 673e4581a2e5..bf7a27142704 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2087,7 +2087,6 @@ const struct vm_operations_struct generic_file_vm_ops = { | |||
2087 | .fault = filemap_fault, | 2087 | .fault = filemap_fault, |
2088 | .map_pages = filemap_map_pages, | 2088 | .map_pages = filemap_map_pages, |
2089 | .page_mkwrite = filemap_page_mkwrite, | 2089 | .page_mkwrite = filemap_page_mkwrite, |
2090 | .remap_pages = generic_file_remap_pages, | ||
2091 | }; | 2090 | }; |
2092 | 2091 | ||
2093 | /* This is used for a general mmap of a disk file */ | 2092 | /* This is used for a general mmap of a disk file */ |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 0d105aeff82f..70c09da1a419 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -301,7 +301,6 @@ out: | |||
301 | static const struct vm_operations_struct xip_file_vm_ops = { | 301 | static const struct vm_operations_struct xip_file_vm_ops = { |
302 | .fault = xip_file_fault, | 302 | .fault = xip_file_fault, |
303 | .page_mkwrite = filemap_page_mkwrite, | 303 | .page_mkwrite = filemap_page_mkwrite, |
304 | .remap_pages = generic_file_remap_pages, | ||
305 | }; | 304 | }; |
306 | 305 | ||
307 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) | 306 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) |
diff --git a/mm/fremap.c b/mm/fremap.c
deleted file mode 100644
index 2805d71cf476..000000000000
--- a/mm/fremap.c
+++ /dev/null
@@ -1,283 +0,0 @@ | |||
1 | /* | ||
2 | * linux/mm/fremap.c | ||
3 | * | ||
4 | * Explicit pagetable population and nonlinear (random) mappings support. | ||
5 | * | ||
6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 | ||
7 | */ | ||
8 | #include <linux/export.h> | ||
9 | #include <linux/backing-dev.h> | ||
10 | #include <linux/mm.h> | ||
11 | #include <linux/swap.h> | ||
12 | #include <linux/file.h> | ||
13 | #include <linux/mman.h> | ||
14 | #include <linux/pagemap.h> | ||
15 | #include <linux/swapops.h> | ||
16 | #include <linux/rmap.h> | ||
17 | #include <linux/syscalls.h> | ||
18 | #include <linux/mmu_notifier.h> | ||
19 | |||
20 | #include <asm/mmu_context.h> | ||
21 | #include <asm/cacheflush.h> | ||
22 | #include <asm/tlbflush.h> | ||
23 | |||
24 | #include "internal.h" | ||
25 | |||
26 | static int mm_counter(struct page *page) | ||
27 | { | ||
28 | return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; | ||
29 | } | ||
30 | |||
31 | static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | ||
32 | unsigned long addr, pte_t *ptep) | ||
33 | { | ||
34 | pte_t pte = *ptep; | ||
35 | struct page *page; | ||
36 | swp_entry_t entry; | ||
37 | |||
38 | if (pte_present(pte)) { | ||
39 | flush_cache_page(vma, addr, pte_pfn(pte)); | ||
40 | pte = ptep_clear_flush_notify(vma, addr, ptep); | ||
41 | page = vm_normal_page(vma, addr, pte); | ||
42 | if (page) { | ||
43 | if (pte_dirty(pte)) | ||
44 | set_page_dirty(page); | ||
45 | update_hiwater_rss(mm); | ||
46 | dec_mm_counter(mm, mm_counter(page)); | ||
47 | page_remove_rmap(page); | ||
48 | page_cache_release(page); | ||
49 | } | ||
50 | } else { /* zap_pte() is not called when pte_none() */ | ||
51 | if (!pte_file(pte)) { | ||
52 | update_hiwater_rss(mm); | ||
53 | entry = pte_to_swp_entry(pte); | ||
54 | if (non_swap_entry(entry)) { | ||
55 | if (is_migration_entry(entry)) { | ||
56 | page = migration_entry_to_page(entry); | ||
57 | dec_mm_counter(mm, mm_counter(page)); | ||
58 | } | ||
59 | } else { | ||
60 | free_swap_and_cache(entry); | ||
61 | dec_mm_counter(mm, MM_SWAPENTS); | ||
62 | } | ||
63 | } | ||
64 | pte_clear_not_present_full(mm, addr, ptep, 0); | ||
65 | } | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * Install a file pte to a given virtual memory address, release any | ||
70 | * previously existing mapping. | ||
71 | */ | ||
72 | static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | ||
73 | unsigned long addr, unsigned long pgoff, pgprot_t prot) | ||
74 | { | ||
75 | int err = -ENOMEM; | ||
76 | pte_t *pte, ptfile; | ||
77 | spinlock_t *ptl; | ||
78 | |||
79 | pte = get_locked_pte(mm, addr, &ptl); | ||
80 | if (!pte) | ||
81 | goto out; | ||
82 | |||
83 | ptfile = pgoff_to_pte(pgoff); | ||
84 | |||
85 | if (!pte_none(*pte)) | ||
86 | zap_pte(mm, vma, addr, pte); | ||
87 | |||
88 | set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); | ||
89 | /* | ||
90 | * We don't need to run update_mmu_cache() here because the "file pte" | ||
91 | * being installed by install_file_pte() is not a real pte - it's a | ||
92 | * non-present entry (like a swap entry), noting what file offset should | ||
93 | * be mapped there when there's a fault (in a non-linear vma where | ||
94 | * that's not obvious). | ||
95 | */ | ||
96 | pte_unmap_unlock(pte, ptl); | ||
97 | err = 0; | ||
98 | out: | ||
99 | return err; | ||
100 | } | ||
101 | |||
102 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, | ||
103 | unsigned long size, pgoff_t pgoff) | ||
104 | { | ||
105 | struct mm_struct *mm = vma->vm_mm; | ||
106 | int err; | ||
107 | |||
108 | do { | ||
109 | err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot); | ||
110 | if (err) | ||
111 | return err; | ||
112 | |||
113 | size -= PAGE_SIZE; | ||
114 | addr += PAGE_SIZE; | ||
115 | pgoff++; | ||
116 | } while (size); | ||
117 | |||
118 | return 0; | ||
119 | } | ||
120 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
121 | |||
122 | /** | ||
123 | * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma | ||
124 | * @start: start of the remapped virtual memory range | ||
125 | * @size: size of the remapped virtual memory range | ||
126 | * @prot: new protection bits of the range (see NOTE) | ||
127 | * @pgoff: to-be-mapped page of the backing store file | ||
128 | * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. | ||
129 | * | ||
130 | * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma | ||
131 | * (shared backing store file). | ||
132 | * | ||
133 | * This syscall works purely via pagetables, so it's the most efficient | ||
134 | * way to map the same (large) file into a given virtual window. Unlike | ||
135 | * mmap()/mremap() it does not create any new vmas. The new mappings are | ||
136 | * also safe across swapout. | ||
137 | * | ||
138 | * NOTE: the @prot parameter right now is ignored (but must be zero), | ||
139 | * and the vma's default protection is used. Arbitrary protections | ||
140 | * might be implemented in the future. | ||
141 | */ | ||
142 | SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | ||
143 | unsigned long, prot, unsigned long, pgoff, unsigned long, flags) | ||
144 | { | ||
145 | struct mm_struct *mm = current->mm; | ||
146 | struct address_space *mapping; | ||
147 | struct vm_area_struct *vma; | ||
148 | int err = -EINVAL; | ||
149 | int has_write_lock = 0; | ||
150 | vm_flags_t vm_flags = 0; | ||
151 | |||
152 | pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " | ||
153 | "See Documentation/vm/remap_file_pages.txt.\n", | ||
154 | current->comm, current->pid); | ||
155 | |||
156 | if (prot) | ||
157 | return err; | ||
158 | /* | ||
159 | * Sanitize the syscall parameters: | ||
160 | */ | ||
161 | start = start & PAGE_MASK; | ||
162 | size = size & PAGE_MASK; | ||
163 | |||
164 | /* Does the address range wrap, or is the span zero-sized? */ | ||
165 | if (start + size <= start) | ||
166 | return err; | ||
167 | |||
168 | /* Does pgoff wrap? */ | ||
169 | if (pgoff + (size >> PAGE_SHIFT) < pgoff) | ||
170 | return err; | ||
171 | |||
172 | /* Can we represent this offset inside this architecture's pte's? */ | ||
173 | #if PTE_FILE_MAX_BITS < BITS_PER_LONG | ||
174 | if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) | ||
175 | return err; | ||
176 | #endif | ||
177 | |||
178 | /* We need down_write() to change vma->vm_flags. */ | ||
179 | down_read(&mm->mmap_sem); | ||
180 | retry: | ||
181 | vma = find_vma(mm, start); | ||
182 | |||
183 | /* | ||
184 | * Make sure the vma is shared, that it supports prefaulting, | ||
185 | * and that the remapped range is valid and fully within | ||
186 | * the single existing vma. | ||
187 | */ | ||
188 | if (!vma || !(vma->vm_flags & VM_SHARED)) | ||
189 | goto out; | ||
190 | |||
191 | if (!vma->vm_ops || !vma->vm_ops->remap_pages) | ||
192 | goto out; | ||
193 | |||
194 | if (start < vma->vm_start || start + size > vma->vm_end) | ||
195 | goto out; | ||
196 | |||
197 | /* Must set VM_NONLINEAR before any pages are populated. */ | ||
198 | if (!(vma->vm_flags & VM_NONLINEAR)) { | ||
199 | /* | ||
200 | * vm_private_data is used as a swapout cursor | ||
201 | * in a VM_NONLINEAR vma. | ||
202 | */ | ||
203 | if (vma->vm_private_data) | ||
204 | goto out; | ||
205 | |||
206 | /* Don't need a nonlinear mapping, exit success */ | ||
207 | if (pgoff == linear_page_index(vma, start)) { | ||
208 | err = 0; | ||
209 | goto out; | ||
210 | } | ||
211 | |||
212 | if (!has_write_lock) { | ||
213 | get_write_lock: | ||
214 | up_read(&mm->mmap_sem); | ||
215 | down_write(&mm->mmap_sem); | ||
216 | has_write_lock = 1; | ||
217 | goto retry; | ||
218 | } | ||
219 | mapping = vma->vm_file->f_mapping; | ||
220 | /* | ||
221 | * page_mkclean doesn't work on nonlinear vmas, so if | ||
222 | * dirty pages need to be accounted, emulate with linear | ||
223 | * vmas. | ||
224 | */ | ||
225 | if (mapping_cap_account_dirty(mapping)) { | ||
226 | unsigned long addr; | ||
227 | struct file *file = get_file(vma->vm_file); | ||
228 | /* mmap_region may free vma; grab the info now */ | ||
229 | vm_flags = vma->vm_flags; | ||
230 | |||
231 | addr = mmap_region(file, start, size, vm_flags, pgoff); | ||
232 | fput(file); | ||
233 | if (IS_ERR_VALUE(addr)) { | ||
234 | err = addr; | ||
235 | } else { | ||
236 | BUG_ON(addr != start); | ||
237 | err = 0; | ||
238 | } | ||
239 | goto out_freed; | ||
240 | } | ||
241 | i_mmap_lock_write(mapping); | ||
242 | flush_dcache_mmap_lock(mapping); | ||
243 | vma->vm_flags |= VM_NONLINEAR; | ||
244 | vma_interval_tree_remove(vma, &mapping->i_mmap); | ||
245 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | ||
246 | flush_dcache_mmap_unlock(mapping); | ||
247 | i_mmap_unlock_write(mapping); | ||
248 | } | ||
249 | |||
250 | if (vma->vm_flags & VM_LOCKED) { | ||
251 | /* | ||
252 | * drop PG_Mlocked flag for over-mapped range | ||
253 | */ | ||
254 | if (!has_write_lock) | ||
255 | goto get_write_lock; | ||
256 | vm_flags = vma->vm_flags; | ||
257 | munlock_vma_pages_range(vma, start, start + size); | ||
258 | vma->vm_flags = vm_flags; | ||
259 | } | ||
260 | |||
261 | mmu_notifier_invalidate_range_start(mm, start, start + size); | ||
262 | err = vma->vm_ops->remap_pages(vma, start, size, pgoff); | ||
263 | mmu_notifier_invalidate_range_end(mm, start, start + size); | ||
264 | |||
265 | /* | ||
266 | * We can't clear VM_NONLINEAR because we'd have to do | ||
267 | * it after ->populate completes, and that would prevent | ||
268 | * downgrading the lock. (Locks can't be upgraded). | ||
269 | */ | ||
270 | |||
271 | out: | ||
272 | if (vma) | ||
273 | vm_flags = vma->vm_flags; | ||
274 | out_freed: | ||
275 | if (likely(!has_write_lock)) | ||
276 | up_read(&mm->mmap_sem); | ||
277 | else | ||
278 | up_write(&mm->mmap_sem); | ||
279 | if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) | ||
280 | mm_populate(start, size); | ||
281 | |||
282 | return err; | ||
283 | } | ||
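With mm/fremap.c deleted above, the syscall body is re-added as an emulation elsewhere in this series (the mm/mmap.c changes in the diffstat). A rough userspace-level picture of what that emulation amounts to is sketched below, using an invented helper name and passing fd/prot explicitly only because userspace has no handle on the vma.

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>

/*
 * Illustrative stand-in, not the kernel code: the emulated
 * remap_file_pages(addr, size, 0, pgoff, 0) on a MAP_SHARED region of
 * 'fd' is observationally equivalent to replacing that window with a
 * fresh linear mapping at file page 'pgoff'.  The kernel does this
 * internally for each remapped chunk, reusing the vma's own protection
 * and flags.
 */
static void *remap_window_sketch(void *addr, size_t size, int fd,
				 size_t pgoff, int prot)
{
	off_t off = (off_t)pgoff * sysconf(_SC_PAGESIZE);

	return mmap(addr, size, prot, MAP_SHARED | MAP_FIXED, fd, off);
}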
diff --git a/mm/gup.c b/mm/gup.c
@@ -55,7 +55,7 @@ retry: | |||
55 | */ | 55 | */ |
56 | if (likely(!(flags & FOLL_MIGRATION))) | 56 | if (likely(!(flags & FOLL_MIGRATION))) |
57 | goto no_page; | 57 | goto no_page; |
58 | if (pte_none(pte) || pte_file(pte)) | 58 | if (pte_none(pte)) |
59 | goto no_page; | 59 | goto no_page; |
60 | entry = pte_to_swp_entry(pte); | 60 | entry = pte_to_swp_entry(pte); |
61 | if (!is_migration_entry(entry)) | 61 | if (!is_migration_entry(entry)) |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 85032de5e20f..be0e5d0db5ec 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -35,7 +35,7 @@ | |||
35 | #include <linux/node.h> | 35 | #include <linux/node.h> |
36 | #include "internal.h" | 36 | #include "internal.h" |
37 | 37 | ||
38 | unsigned long hugepages_treat_as_movable; | 38 | int hugepages_treat_as_movable; |
39 | 39 | ||
40 | int hugetlb_max_hstate __read_mostly; | 40 | int hugetlb_max_hstate __read_mostly; |
41 | unsigned int default_hstate_idx; | 41 | unsigned int default_hstate_idx; |
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 8da581fa9060..f2c2492681bf 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -21,8 +21,8 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) | |||
21 | return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; | 21 | return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; |
22 | } | 22 | } |
23 | 23 | ||
24 | INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, | 24 | INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, |
25 | unsigned long, shared.linear.rb_subtree_last, | 25 | unsigned long, shared.rb_subtree_last, |
26 | vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) | 26 | vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) |
27 | 27 | ||
28 | /* Insert node immediately after prev in the interval tree */ | 28 | /* Insert node immediately after prev in the interval tree */ |
@@ -36,26 +36,26 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, | |||
36 | 36 | ||
37 | VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); | 37 | VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); |
38 | 38 | ||
39 | if (!prev->shared.linear.rb.rb_right) { | 39 | if (!prev->shared.rb.rb_right) { |
40 | parent = prev; | 40 | parent = prev; |
41 | link = &prev->shared.linear.rb.rb_right; | 41 | link = &prev->shared.rb.rb_right; |
42 | } else { | 42 | } else { |
43 | parent = rb_entry(prev->shared.linear.rb.rb_right, | 43 | parent = rb_entry(prev->shared.rb.rb_right, |
44 | struct vm_area_struct, shared.linear.rb); | 44 | struct vm_area_struct, shared.rb); |
45 | if (parent->shared.linear.rb_subtree_last < last) | 45 | if (parent->shared.rb_subtree_last < last) |
46 | parent->shared.linear.rb_subtree_last = last; | 46 | parent->shared.rb_subtree_last = last; |
47 | while (parent->shared.linear.rb.rb_left) { | 47 | while (parent->shared.rb.rb_left) { |
48 | parent = rb_entry(parent->shared.linear.rb.rb_left, | 48 | parent = rb_entry(parent->shared.rb.rb_left, |
49 | struct vm_area_struct, shared.linear.rb); | 49 | struct vm_area_struct, shared.rb); |
50 | if (parent->shared.linear.rb_subtree_last < last) | 50 | if (parent->shared.rb_subtree_last < last) |
51 | parent->shared.linear.rb_subtree_last = last; | 51 | parent->shared.rb_subtree_last = last; |
52 | } | 52 | } |
53 | link = &parent->shared.linear.rb.rb_left; | 53 | link = &parent->shared.rb.rb_left; |
54 | } | 54 | } |
55 | 55 | ||
56 | node->shared.linear.rb_subtree_last = last; | 56 | node->shared.rb_subtree_last = last; |
57 | rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); | 57 | rb_link_node(&node->shared.rb, &parent->shared.rb, link); |
58 | rb_insert_augmented(&node->shared.linear.rb, root, | 58 | rb_insert_augmented(&node->shared.rb, root, |
59 | &vma_interval_tree_augment); | 59 | &vma_interval_tree_augment); |
60 | } | 60 | } |
61 | 61 | ||
diff --git a/mm/ksm.c b/mm/ksm.c
@@ -1748,7 +1748,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
1748 | */ | 1748 | */ |
1749 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | | 1749 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | |
1750 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | 1750 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | |
1751 | VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) | 1751 | VM_HUGETLB | VM_MIXEDMAP)) |
1752 | return 0; /* just ignore the advice */ | 1752 | return 0; /* just ignore the advice */ |
1753 | 1753 | ||
1754 | #ifdef VM_SAO | 1754 | #ifdef VM_SAO |
diff --git a/mm/madvise.c b/mm/madvise.c
index a271adc93289..d79fb5e8f80a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -155,7 +155,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, | |||
155 | pte = *(orig_pte + ((index - start) / PAGE_SIZE)); | 155 | pte = *(orig_pte + ((index - start) / PAGE_SIZE)); |
156 | pte_unmap_unlock(orig_pte, ptl); | 156 | pte_unmap_unlock(orig_pte, ptl); |
157 | 157 | ||
158 | if (pte_present(pte) || pte_none(pte) || pte_file(pte)) | 158 | if (pte_present(pte) || pte_none(pte)) |
159 | continue; | 159 | continue; |
160 | entry = pte_to_swp_entry(pte); | 160 | entry = pte_to_swp_entry(pte); |
161 | if (unlikely(non_swap_entry(entry))) | 161 | if (unlikely(non_swap_entry(entry))) |
@@ -278,14 +278,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, | |||
278 | if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) | 278 | if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) |
279 | return -EINVAL; | 279 | return -EINVAL; |
280 | 280 | ||
281 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) { | 281 | zap_page_range(vma, start, end - start, NULL); |
282 | struct zap_details details = { | ||
283 | .nonlinear_vma = vma, | ||
284 | .last_index = ULONG_MAX, | ||
285 | }; | ||
286 | zap_page_range(vma, start, end - start, &details); | ||
287 | } else | ||
288 | zap_page_range(vma, start, end - start, NULL); | ||
289 | return 0; | 282 | return 0; |
290 | } | 283 | } |
291 | 284 | ||
@@ -303,7 +296,7 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
303 | 296 | ||
304 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ | 297 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ |
305 | 298 | ||
306 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) | 299 | if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB)) |
307 | return -EINVAL; | 300 | return -EINVAL; |
308 | 301 | ||
309 | f = vma->vm_file; | 302 | f = vma->vm_file; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2f6893c2f01b..f3f8a4f52a0c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -343,9 +343,6 @@ struct mem_cgroup { | |||
343 | struct cg_proto tcp_mem; | 343 | struct cg_proto tcp_mem; |
344 | #endif | 344 | #endif |
345 | #if defined(CONFIG_MEMCG_KMEM) | 345 | #if defined(CONFIG_MEMCG_KMEM) |
346 | /* analogous to slab_common's slab_caches list, but per-memcg; | ||
347 | * protected by memcg_slab_mutex */ | ||
348 | struct list_head memcg_slab_caches; | ||
349 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ | 346 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ |
350 | int kmemcg_id; | 347 | int kmemcg_id; |
351 | #endif | 348 | #endif |
@@ -2476,27 +2473,8 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, | |||
2476 | } | 2473 | } |
2477 | 2474 | ||
2478 | #ifdef CONFIG_MEMCG_KMEM | 2475 | #ifdef CONFIG_MEMCG_KMEM |
2479 | /* | 2476 | int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, |
2480 | * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or | 2477 | unsigned long nr_pages) |
2481 | * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. | ||
2482 | */ | ||
2483 | static DEFINE_MUTEX(memcg_slab_mutex); | ||
2484 | |||
2485 | /* | ||
2486 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer | ||
2487 | * in the memcg_cache_params struct. | ||
2488 | */ | ||
2489 | static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) | ||
2490 | { | ||
2491 | struct kmem_cache *cachep; | ||
2492 | |||
2493 | VM_BUG_ON(p->is_root_cache); | ||
2494 | cachep = p->root_cache; | ||
2495 | return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); | ||
2496 | } | ||
2497 | |||
2498 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, | ||
2499 | unsigned long nr_pages) | ||
2500 | { | 2478 | { |
2501 | struct page_counter *counter; | 2479 | struct page_counter *counter; |
2502 | int ret = 0; | 2480 | int ret = 0; |
@@ -2533,8 +2511,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, | |||
2533 | return ret; | 2511 | return ret; |
2534 | } | 2512 | } |
2535 | 2513 | ||
2536 | static void memcg_uncharge_kmem(struct mem_cgroup *memcg, | 2514 | void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) |
2537 | unsigned long nr_pages) | ||
2538 | { | 2515 | { |
2539 | page_counter_uncharge(&memcg->memory, nr_pages); | 2516 | page_counter_uncharge(&memcg->memory, nr_pages); |
2540 | if (do_swap_account) | 2517 | if (do_swap_account) |
@@ -2579,10 +2556,7 @@ static int memcg_alloc_cache_id(void) | |||
2579 | else if (size > MEMCG_CACHES_MAX_SIZE) | 2556 | else if (size > MEMCG_CACHES_MAX_SIZE) |
2580 | size = MEMCG_CACHES_MAX_SIZE; | 2557 | size = MEMCG_CACHES_MAX_SIZE; |
2581 | 2558 | ||
2582 | mutex_lock(&memcg_slab_mutex); | ||
2583 | err = memcg_update_all_caches(size); | 2559 | err = memcg_update_all_caches(size); |
2584 | mutex_unlock(&memcg_slab_mutex); | ||
2585 | |||
2586 | if (err) { | 2560 | if (err) { |
2587 | ida_simple_remove(&kmem_limited_groups, id); | 2561 | ida_simple_remove(&kmem_limited_groups, id); |
2588 | return err; | 2562 | return err; |
@@ -2605,123 +2579,20 @@ void memcg_update_array_size(int num) | |||
2605 | memcg_limited_groups_array_size = num; | 2579 | memcg_limited_groups_array_size = num; |
2606 | } | 2580 | } |
2607 | 2581 | ||
2608 | static void memcg_register_cache(struct mem_cgroup *memcg, | 2582 | struct memcg_kmem_cache_create_work { |
2609 | struct kmem_cache *root_cache) | ||
2610 | { | ||
2611 | static char memcg_name_buf[NAME_MAX + 1]; /* protected by | ||
2612 | memcg_slab_mutex */ | ||
2613 | struct kmem_cache *cachep; | ||
2614 | int id; | ||
2615 | |||
2616 | lockdep_assert_held(&memcg_slab_mutex); | ||
2617 | |||
2618 | id = memcg_cache_id(memcg); | ||
2619 | |||
2620 | /* | ||
2621 | * Since per-memcg caches are created asynchronously on first | ||
2622 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
2623 | * create the same cache, but only one of them may succeed. | ||
2624 | */ | ||
2625 | if (cache_from_memcg_idx(root_cache, id)) | ||
2626 | return; | ||
2627 | |||
2628 | cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); | ||
2629 | cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); | ||
2630 | /* | ||
2631 | * If we could not create a memcg cache, do not complain, because | ||
2632 | * that's not critical at all as we can always proceed with the root | ||
2633 | * cache. | ||
2634 | */ | ||
2635 | if (!cachep) | ||
2636 | return; | ||
2637 | |||
2638 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); | ||
2639 | |||
2640 | /* | ||
2641 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | ||
2642 | * barrier here to ensure nobody will see the kmem_cache partially | ||
2643 | * initialized. | ||
2644 | */ | ||
2645 | smp_wmb(); | ||
2646 | |||
2647 | BUG_ON(root_cache->memcg_params->memcg_caches[id]); | ||
2648 | root_cache->memcg_params->memcg_caches[id] = cachep; | ||
2649 | } | ||
2650 | |||
2651 | static void memcg_unregister_cache(struct kmem_cache *cachep) | ||
2652 | { | ||
2653 | struct kmem_cache *root_cache; | ||
2654 | struct mem_cgroup *memcg; | ||
2655 | int id; | ||
2656 | |||
2657 | lockdep_assert_held(&memcg_slab_mutex); | ||
2658 | |||
2659 | BUG_ON(is_root_cache(cachep)); | ||
2660 | |||
2661 | root_cache = cachep->memcg_params->root_cache; | ||
2662 | memcg = cachep->memcg_params->memcg; | ||
2663 | id = memcg_cache_id(memcg); | ||
2664 | |||
2665 | BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); | ||
2666 | root_cache->memcg_params->memcg_caches[id] = NULL; | ||
2667 | |||
2668 | list_del(&cachep->memcg_params->list); | ||
2669 | |||
2670 | kmem_cache_destroy(cachep); | ||
2671 | } | ||
2672 | |||
2673 | int __memcg_cleanup_cache_params(struct kmem_cache *s) | ||
2674 | { | ||
2675 | struct kmem_cache *c; | ||
2676 | int i, failed = 0; | ||
2677 | |||
2678 | mutex_lock(&memcg_slab_mutex); | ||
2679 | for_each_memcg_cache_index(i) { | ||
2680 | c = cache_from_memcg_idx(s, i); | ||
2681 | if (!c) | ||
2682 | continue; | ||
2683 | |||
2684 | memcg_unregister_cache(c); | ||
2685 | |||
2686 | if (cache_from_memcg_idx(s, i)) | ||
2687 | failed++; | ||
2688 | } | ||
2689 | mutex_unlock(&memcg_slab_mutex); | ||
2690 | return failed; | ||
2691 | } | ||
2692 | |||
2693 | static void memcg_unregister_all_caches(struct mem_cgroup *memcg) | ||
2694 | { | ||
2695 | struct kmem_cache *cachep; | ||
2696 | struct memcg_cache_params *params, *tmp; | ||
2697 | |||
2698 | if (!memcg_kmem_is_active(memcg)) | ||
2699 | return; | ||
2700 | |||
2701 | mutex_lock(&memcg_slab_mutex); | ||
2702 | list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { | ||
2703 | cachep = memcg_params_to_cache(params); | ||
2704 | memcg_unregister_cache(cachep); | ||
2705 | } | ||
2706 | mutex_unlock(&memcg_slab_mutex); | ||
2707 | } | ||
2708 | |||
2709 | struct memcg_register_cache_work { | ||
2710 | struct mem_cgroup *memcg; | 2583 | struct mem_cgroup *memcg; |
2711 | struct kmem_cache *cachep; | 2584 | struct kmem_cache *cachep; |
2712 | struct work_struct work; | 2585 | struct work_struct work; |
2713 | }; | 2586 | }; |
2714 | 2587 | ||
2715 | static void memcg_register_cache_func(struct work_struct *w) | 2588 | static void memcg_kmem_cache_create_func(struct work_struct *w) |
2716 | { | 2589 | { |
2717 | struct memcg_register_cache_work *cw = | 2590 | struct memcg_kmem_cache_create_work *cw = |
2718 | container_of(w, struct memcg_register_cache_work, work); | 2591 | container_of(w, struct memcg_kmem_cache_create_work, work); |
2719 | struct mem_cgroup *memcg = cw->memcg; | 2592 | struct mem_cgroup *memcg = cw->memcg; |
2720 | struct kmem_cache *cachep = cw->cachep; | 2593 | struct kmem_cache *cachep = cw->cachep; |
2721 | 2594 | ||
2722 | mutex_lock(&memcg_slab_mutex); | 2595 | memcg_create_kmem_cache(memcg, cachep); |
2723 | memcg_register_cache(memcg, cachep); | ||
2724 | mutex_unlock(&memcg_slab_mutex); | ||
2725 | 2596 | ||
2726 | css_put(&memcg->css); | 2597 | css_put(&memcg->css); |
2727 | kfree(cw); | 2598 | kfree(cw); |
@@ -2730,10 +2601,10 @@ static void memcg_register_cache_func(struct work_struct *w) | |||
2730 | /* | 2601 | /* |
2731 | * Enqueue the creation of a per-memcg kmem_cache. | 2602 | * Enqueue the creation of a per-memcg kmem_cache. |
2732 | */ | 2603 | */ |
2733 | static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, | 2604 | static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, |
2734 | struct kmem_cache *cachep) | 2605 | struct kmem_cache *cachep) |
2735 | { | 2606 | { |
2736 | struct memcg_register_cache_work *cw; | 2607 | struct memcg_kmem_cache_create_work *cw; |
2737 | 2608 | ||
2738 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); | 2609 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); |
2739 | if (!cw) | 2610 | if (!cw) |
@@ -2743,18 +2614,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, | |||
2743 | 2614 | ||
2744 | cw->memcg = memcg; | 2615 | cw->memcg = memcg; |
2745 | cw->cachep = cachep; | 2616 | cw->cachep = cachep; |
2617 | INIT_WORK(&cw->work, memcg_kmem_cache_create_func); | ||
2746 | 2618 | ||
2747 | INIT_WORK(&cw->work, memcg_register_cache_func); | ||
2748 | schedule_work(&cw->work); | 2619 | schedule_work(&cw->work); |
2749 | } | 2620 | } |
2750 | 2621 | ||
2751 | static void memcg_schedule_register_cache(struct mem_cgroup *memcg, | 2622 | static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, |
2752 | struct kmem_cache *cachep) | 2623 | struct kmem_cache *cachep) |
2753 | { | 2624 | { |
2754 | /* | 2625 | /* |
2755 | * We need to stop accounting when we kmalloc, because if the | 2626 | * We need to stop accounting when we kmalloc, because if the |
2756 | * corresponding kmalloc cache is not yet created, the first allocation | 2627 | * corresponding kmalloc cache is not yet created, the first allocation |
2757 | * in __memcg_schedule_register_cache will recurse. | 2628 | * in __memcg_schedule_kmem_cache_create will recurse. |
2758 | * | 2629 | * |
2759 | * However, it is better to enclose the whole function. Depending on | 2630 | * However, it is better to enclose the whole function. Depending on |
2760 | * the debugging options enabled, INIT_WORK(), for instance, can | 2631 | * the debugging options enabled, INIT_WORK(), for instance, can |
@@ -2763,24 +2634,10 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, | |||
2763 | * the safest choice is to do it like this, wrapping the whole function. | 2634 | * the safest choice is to do it like this, wrapping the whole function. |
2764 | */ | 2635 | */ |
2765 | current->memcg_kmem_skip_account = 1; | 2636 | current->memcg_kmem_skip_account = 1; |
2766 | __memcg_schedule_register_cache(memcg, cachep); | 2637 | __memcg_schedule_kmem_cache_create(memcg, cachep); |
2767 | current->memcg_kmem_skip_account = 0; | 2638 | current->memcg_kmem_skip_account = 0; |
2768 | } | 2639 | } |
2769 | 2640 | ||
2770 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) | ||
2771 | { | ||
2772 | unsigned int nr_pages = 1 << order; | ||
2773 | |||
2774 | return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); | ||
2775 | } | ||
2776 | |||
2777 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | ||
2778 | { | ||
2779 | unsigned int nr_pages = 1 << order; | ||
2780 | |||
2781 | memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); | ||
2782 | } | ||
2783 | |||
2784 | /* | 2641 | /* |
2785 | * Return the kmem_cache we're supposed to use for a slab allocation. | 2642 | * Return the kmem_cache we're supposed to use for a slab allocation. |
2786 | * We try to use the current memcg's version of the cache. | 2643 | * We try to use the current memcg's version of the cache. |
@@ -2825,7 +2682,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) | |||
2825 | * could happen with the slab_mutex held. So it's better to | 2682 | * could happen with the slab_mutex held. So it's better to |
2826 | * defer everything. | 2683 | * defer everything. |
2827 | */ | 2684 | */ |
2828 | memcg_schedule_register_cache(memcg, cachep); | 2685 | memcg_schedule_kmem_cache_create(memcg, cachep); |
2829 | out: | 2686 | out: |
2830 | css_put(&memcg->css); | 2687 | css_put(&memcg->css); |
2831 | return cachep; | 2688 | return cachep; |
@@ -4154,7 +4011,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
4154 | 4011 | ||
4155 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) | 4012 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) |
4156 | { | 4013 | { |
4157 | memcg_unregister_all_caches(memcg); | 4014 | memcg_destroy_kmem_caches(memcg); |
4158 | mem_cgroup_sockets_destroy(memcg); | 4015 | mem_cgroup_sockets_destroy(memcg); |
4159 | } | 4016 | } |
4160 | #else | 4017 | #else |
@@ -4682,7 +4539,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
4682 | spin_lock_init(&memcg->event_list_lock); | 4539 | spin_lock_init(&memcg->event_list_lock); |
4683 | #ifdef CONFIG_MEMCG_KMEM | 4540 | #ifdef CONFIG_MEMCG_KMEM |
4684 | memcg->kmemcg_id = -1; | 4541 | memcg->kmemcg_id = -1; |
4685 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | ||
4686 | #endif | 4542 | #endif |
4687 | 4543 | ||
4688 | return &memcg->css; | 4544 | return &memcg->css; |
@@ -4926,10 +4782,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
4926 | return NULL; | 4782 | return NULL; |
4927 | 4783 | ||
4928 | mapping = vma->vm_file->f_mapping; | 4784 | mapping = vma->vm_file->f_mapping; |
4929 | if (pte_none(ptent)) | 4785 | pgoff = linear_page_index(vma, addr); |
4930 | pgoff = linear_page_index(vma, addr); | ||
4931 | else /* pte_file(ptent) is true */ | ||
4932 | pgoff = pte_to_pgoff(ptent); | ||
4933 | 4786 | ||
4934 | /* page is moved even if it's not RSS of this task(page-faulted). */ | 4787 | /* page is moved even if it's not RSS of this task(page-faulted). */ |
4935 | #ifdef CONFIG_SWAP | 4788 | #ifdef CONFIG_SWAP |
@@ -4961,7 +4814,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | |||
4961 | page = mc_handle_present_pte(vma, addr, ptent); | 4814 | page = mc_handle_present_pte(vma, addr, ptent); |
4962 | else if (is_swap_pte(ptent)) | 4815 | else if (is_swap_pte(ptent)) |
4963 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); | 4816 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); |
4964 | else if (pte_none(ptent) || pte_file(ptent)) | 4817 | else if (pte_none(ptent)) |
4965 | page = mc_handle_file_pte(vma, addr, ptent, &ent); | 4818 | page = mc_handle_file_pte(vma, addr, ptent, &ent); |
4966 | 4819 | ||
4967 | if (!page && !ent.val) | 4820 | if (!page && !ent.val) |
diff --git a/mm/memory.c b/mm/memory.c
index d707c4dfbbb4..d63849b5188f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -813,42 +813,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
813 | 813 | ||
814 | /* pte contains position in swap or file, so copy. */ | 814 | /* pte contains position in swap or file, so copy. */ |
815 | if (unlikely(!pte_present(pte))) { | 815 | if (unlikely(!pte_present(pte))) { |
816 | if (!pte_file(pte)) { | 816 | swp_entry_t entry = pte_to_swp_entry(pte); |
817 | swp_entry_t entry = pte_to_swp_entry(pte); | 817 | |
818 | 818 | if (likely(!non_swap_entry(entry))) { | |
819 | if (likely(!non_swap_entry(entry))) { | 819 | if (swap_duplicate(entry) < 0) |
820 | if (swap_duplicate(entry) < 0) | 820 | return entry.val; |
821 | return entry.val; | 821 | |
822 | 822 | /* make sure dst_mm is on swapoff's mmlist. */ | |
823 | /* make sure dst_mm is on swapoff's mmlist. */ | 823 | if (unlikely(list_empty(&dst_mm->mmlist))) { |
824 | if (unlikely(list_empty(&dst_mm->mmlist))) { | 824 | spin_lock(&mmlist_lock); |
825 | spin_lock(&mmlist_lock); | 825 | if (list_empty(&dst_mm->mmlist)) |
826 | if (list_empty(&dst_mm->mmlist)) | 826 | list_add(&dst_mm->mmlist, |
827 | list_add(&dst_mm->mmlist, | 827 | &src_mm->mmlist); |
828 | &src_mm->mmlist); | 828 | spin_unlock(&mmlist_lock); |
829 | spin_unlock(&mmlist_lock); | 829 | } |
830 | } | 830 | rss[MM_SWAPENTS]++; |
831 | rss[MM_SWAPENTS]++; | 831 | } else if (is_migration_entry(entry)) { |
832 | } else if (is_migration_entry(entry)) { | 832 | page = migration_entry_to_page(entry); |
833 | page = migration_entry_to_page(entry); | 833 | |
834 | 834 | if (PageAnon(page)) | |
835 | if (PageAnon(page)) | 835 | rss[MM_ANONPAGES]++; |
836 | rss[MM_ANONPAGES]++; | 836 | else |
837 | else | 837 | rss[MM_FILEPAGES]++; |
838 | rss[MM_FILEPAGES]++; | 838 | |
839 | 839 | if (is_write_migration_entry(entry) && | |
840 | if (is_write_migration_entry(entry) && | 840 | is_cow_mapping(vm_flags)) { |
841 | is_cow_mapping(vm_flags)) { | 841 | /* |
842 | /* | 842 | * COW mappings require pages in both |
843 | * COW mappings require pages in both | 843 | * parent and child to be set to read. |
844 | * parent and child to be set to read. | 844 | */ |
845 | */ | 845 | make_migration_entry_read(&entry); |
846 | make_migration_entry_read(&entry); | 846 | pte = swp_entry_to_pte(entry); |
847 | pte = swp_entry_to_pte(entry); | 847 | if (pte_swp_soft_dirty(*src_pte)) |
848 | if (pte_swp_soft_dirty(*src_pte)) | 848 | pte = pte_swp_mksoft_dirty(pte); |
849 | pte = pte_swp_mksoft_dirty(pte); | 849 | set_pte_at(src_mm, addr, src_pte, pte); |
850 | set_pte_at(src_mm, addr, src_pte, pte); | ||
851 | } | ||
852 | } | 850 | } |
853 | } | 851 | } |
854 | goto out_set_pte; | 852 | goto out_set_pte; |
@@ -1022,11 +1020,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1022 | * readonly mappings. The tradeoff is that copy_page_range is more | 1020 | * readonly mappings. The tradeoff is that copy_page_range is more |
1023 | * efficient than faulting. | 1021 | * efficient than faulting. |
1024 | */ | 1022 | */ |
1025 | if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | | 1023 | if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && |
1026 | VM_PFNMAP | VM_MIXEDMAP))) { | 1024 | !vma->anon_vma) |
1027 | if (!vma->anon_vma) | 1025 | return 0; |
1028 | return 0; | ||
1029 | } | ||
1030 | 1026 | ||
1031 | if (is_vm_hugetlb_page(vma)) | 1027 | if (is_vm_hugetlb_page(vma)) |
1032 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); | 1028 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); |
@@ -1084,6 +1080,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
1084 | spinlock_t *ptl; | 1080 | spinlock_t *ptl; |
1085 | pte_t *start_pte; | 1081 | pte_t *start_pte; |
1086 | pte_t *pte; | 1082 | pte_t *pte; |
1083 | swp_entry_t entry; | ||
1087 | 1084 | ||
1088 | again: | 1085 | again: |
1089 | init_rss_vec(rss); | 1086 | init_rss_vec(rss); |
@@ -1109,28 +1106,12 @@ again: | |||
1109 | if (details->check_mapping && | 1106 | if (details->check_mapping && |
1110 | details->check_mapping != page->mapping) | 1107 | details->check_mapping != page->mapping) |
1111 | continue; | 1108 | continue; |
1112 | /* | ||
1113 | * Each page->index must be checked when | ||
1114 | * invalidating or truncating nonlinear. | ||
1115 | */ | ||
1116 | if (details->nonlinear_vma && | ||
1117 | (page->index < details->first_index || | ||
1118 | page->index > details->last_index)) | ||
1119 | continue; | ||
1120 | } | 1109 | } |
1121 | ptent = ptep_get_and_clear_full(mm, addr, pte, | 1110 | ptent = ptep_get_and_clear_full(mm, addr, pte, |
1122 | tlb->fullmm); | 1111 | tlb->fullmm); |
1123 | tlb_remove_tlb_entry(tlb, pte, addr); | 1112 | tlb_remove_tlb_entry(tlb, pte, addr); |
1124 | if (unlikely(!page)) | 1113 | if (unlikely(!page)) |
1125 | continue; | 1114 | continue; |
1126 | if (unlikely(details) && details->nonlinear_vma | ||
1127 | && linear_page_index(details->nonlinear_vma, | ||
1128 | addr) != page->index) { | ||
1129 | pte_t ptfile = pgoff_to_pte(page->index); | ||
1130 | if (pte_soft_dirty(ptent)) | ||
1131 | ptfile = pte_file_mksoft_dirty(ptfile); | ||
1132 | set_pte_at(mm, addr, pte, ptfile); | ||
1133 | } | ||
1134 | if (PageAnon(page)) | 1115 | if (PageAnon(page)) |
1135 | rss[MM_ANONPAGES]--; | 1116 | rss[MM_ANONPAGES]--; |
1136 | else { | 1117 | else { |
@@ -1153,33 +1134,25 @@ again: | |||
1153 | } | 1134 | } |
1154 | continue; | 1135 | continue; |
1155 | } | 1136 | } |
1156 | /* | 1137 | /* If details->check_mapping, we leave swap entries. */ |
1157 | * If details->check_mapping, we leave swap entries; | ||
1158 | * if details->nonlinear_vma, we leave file entries. | ||
1159 | */ | ||
1160 | if (unlikely(details)) | 1138 | if (unlikely(details)) |
1161 | continue; | 1139 | continue; |
1162 | if (pte_file(ptent)) { | ||
1163 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) | ||
1164 | print_bad_pte(vma, addr, ptent, NULL); | ||
1165 | } else { | ||
1166 | swp_entry_t entry = pte_to_swp_entry(ptent); | ||
1167 | 1140 | ||
1168 | if (!non_swap_entry(entry)) | 1141 | entry = pte_to_swp_entry(ptent); |
1169 | rss[MM_SWAPENTS]--; | 1142 | if (!non_swap_entry(entry)) |
1170 | else if (is_migration_entry(entry)) { | 1143 | rss[MM_SWAPENTS]--; |
1171 | struct page *page; | 1144 | else if (is_migration_entry(entry)) { |
1145 | struct page *page; | ||
1172 | 1146 | ||
1173 | page = migration_entry_to_page(entry); | 1147 | page = migration_entry_to_page(entry); |
1174 | 1148 | ||
1175 | if (PageAnon(page)) | 1149 | if (PageAnon(page)) |
1176 | rss[MM_ANONPAGES]--; | 1150 | rss[MM_ANONPAGES]--; |
1177 | else | 1151 | else |
1178 | rss[MM_FILEPAGES]--; | 1152 | rss[MM_FILEPAGES]--; |
1179 | } | ||
1180 | if (unlikely(!free_swap_and_cache(entry))) | ||
1181 | print_bad_pte(vma, addr, ptent, NULL); | ||
1182 | } | 1153 | } |
1154 | if (unlikely(!free_swap_and_cache(entry))) | ||
1155 | print_bad_pte(vma, addr, ptent, NULL); | ||
1183 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 1156 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
1184 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1157 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1185 | 1158 | ||
@@ -1279,7 +1252,7 @@ static void unmap_page_range(struct mmu_gather *tlb, | |||
1279 | pgd_t *pgd; | 1252 | pgd_t *pgd; |
1280 | unsigned long next; | 1253 | unsigned long next; |
1281 | 1254 | ||
1282 | if (details && !details->check_mapping && !details->nonlinear_vma) | 1255 | if (details && !details->check_mapping) |
1283 | details = NULL; | 1256 | details = NULL; |
1284 | 1257 | ||
1285 | BUG_ON(addr >= end); | 1258 | BUG_ON(addr >= end); |
@@ -1373,7 +1346,7 @@ void unmap_vmas(struct mmu_gather *tlb, | |||
1373 | * @vma: vm_area_struct holding the applicable pages | 1346 | * @vma: vm_area_struct holding the applicable pages |
1374 | * @start: starting address of pages to zap | 1347 | * @start: starting address of pages to zap |
1375 | * @size: number of bytes to zap | 1348 | * @size: number of bytes to zap |
1376 | * @details: details of nonlinear truncation or shared cache invalidation | 1349 | * @details: details of shared cache invalidation |
1377 | * | 1350 | * |
1378 | * Caller must protect the VMA list | 1351 | * Caller must protect the VMA list |
1379 | */ | 1352 | */ |
@@ -1399,7 +1372,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, | |||
1399 | * @vma: vm_area_struct holding the applicable pages | 1372 | * @vma: vm_area_struct holding the applicable pages |
1400 | * @address: starting address of pages to zap | 1373 | * @address: starting address of pages to zap |
1401 | * @size: number of bytes to zap | 1374 | * @size: number of bytes to zap |
1402 | * @details: details of nonlinear truncation or shared cache invalidation | 1375 | * @details: details of shared cache invalidation |
1403 | * | 1376 | * |
1404 | * The range must fit into one VMA. | 1377 | * The range must fit into one VMA. |
1405 | */ | 1378 | */ |
@@ -1924,12 +1897,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, | |||
1924 | EXPORT_SYMBOL_GPL(apply_to_page_range); | 1897 | EXPORT_SYMBOL_GPL(apply_to_page_range); |
1925 | 1898 | ||
1926 | /* | 1899 | /* |
1927 | * handle_pte_fault chooses page fault handler according to an entry | 1900 | * handle_pte_fault chooses page fault handler according to an entry which was |
1928 | * which was read non-atomically. Before making any commitment, on | 1901 | * read non-atomically. Before making any commitment, on those architectures |
1929 | * those architectures or configurations (e.g. i386 with PAE) which | 1902 | * or configurations (e.g. i386 with PAE) which might give a mix of unmatched |
1930 | * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault | 1903 | * parts, do_swap_page must check under lock before unmapping the pte and |
1931 | * must check under lock before unmapping the pte and proceeding | 1904 | * proceeding (but do_wp_page is only called after already making such a check; |
1932 | * (but do_wp_page is only called after already making such a check; | ||
1933 | * and do_anonymous_page can safely check later on). | 1905 | * and do_anonymous_page can safely check later on). |
1934 | */ | 1906 | */ |
1935 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | 1907 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, |
@@ -2035,7 +2007,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2035 | pte_t entry; | 2007 | pte_t entry; |
2036 | int ret = 0; | 2008 | int ret = 0; |
2037 | int page_mkwrite = 0; | 2009 | int page_mkwrite = 0; |
2038 | struct page *dirty_page = NULL; | 2010 | bool dirty_shared = false; |
2039 | unsigned long mmun_start = 0; /* For mmu_notifiers */ | 2011 | unsigned long mmun_start = 0; /* For mmu_notifiers */ |
2040 | unsigned long mmun_end = 0; /* For mmu_notifiers */ | 2012 | unsigned long mmun_end = 0; /* For mmu_notifiers */ |
2041 | struct mem_cgroup *memcg; | 2013 | struct mem_cgroup *memcg; |
@@ -2086,6 +2058,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2086 | unlock_page(old_page); | 2058 | unlock_page(old_page); |
2087 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2059 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2088 | (VM_WRITE|VM_SHARED))) { | 2060 | (VM_WRITE|VM_SHARED))) { |
2061 | page_cache_get(old_page); | ||
2089 | /* | 2062 | /* |
2090 | * Only catch write-faults on shared writable pages, | 2063 | * Only catch write-faults on shared writable pages, |
2091 | * read-only shared pages can get COWed by | 2064 | * read-only shared pages can get COWed by |
@@ -2093,7 +2066,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2093 | */ | 2066 | */ |
2094 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | 2067 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
2095 | int tmp; | 2068 | int tmp; |
2096 | page_cache_get(old_page); | 2069 | |
2097 | pte_unmap_unlock(page_table, ptl); | 2070 | pte_unmap_unlock(page_table, ptl); |
2098 | tmp = do_page_mkwrite(vma, old_page, address); | 2071 | tmp = do_page_mkwrite(vma, old_page, address); |
2099 | if (unlikely(!tmp || (tmp & | 2072 | if (unlikely(!tmp || (tmp & |
@@ -2113,11 +2086,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2113 | unlock_page(old_page); | 2086 | unlock_page(old_page); |
2114 | goto unlock; | 2087 | goto unlock; |
2115 | } | 2088 | } |
2116 | |||
2117 | page_mkwrite = 1; | 2089 | page_mkwrite = 1; |
2118 | } | 2090 | } |
2119 | dirty_page = old_page; | 2091 | |
2120 | get_page(dirty_page); | 2092 | dirty_shared = true; |
2121 | 2093 | ||
2122 | reuse: | 2094 | reuse: |
2123 | /* | 2095 | /* |
@@ -2136,20 +2108,20 @@ reuse: | |||
2136 | pte_unmap_unlock(page_table, ptl); | 2108 | pte_unmap_unlock(page_table, ptl); |
2137 | ret |= VM_FAULT_WRITE; | 2109 | ret |= VM_FAULT_WRITE; |
2138 | 2110 | ||
2139 | if (!dirty_page) | 2111 | if (dirty_shared) { |
2140 | return ret; | ||
2141 | |||
2142 | if (!page_mkwrite) { | ||
2143 | struct address_space *mapping; | 2112 | struct address_space *mapping; |
2144 | int dirtied; | 2113 | int dirtied; |
2145 | 2114 | ||
2146 | lock_page(dirty_page); | 2115 | if (!page_mkwrite) |
2147 | dirtied = set_page_dirty(dirty_page); | 2116 | lock_page(old_page); |
2148 | VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page); | ||
2149 | mapping = dirty_page->mapping; | ||
2150 | unlock_page(dirty_page); | ||
2151 | 2117 | ||
2152 | if (dirtied && mapping) { | 2118 | dirtied = set_page_dirty(old_page); |
2119 | VM_BUG_ON_PAGE(PageAnon(old_page), old_page); | ||
2120 | mapping = old_page->mapping; | ||
2121 | unlock_page(old_page); | ||
2122 | page_cache_release(old_page); | ||
2123 | |||
2124 | if ((dirtied || page_mkwrite) && mapping) { | ||
2153 | /* | 2125 | /* |
2154 | * Some device drivers do not set page.mapping | 2126 | * Some device drivers do not set page.mapping |
2155 | * but still dirty their pages | 2127 | * but still dirty their pages |
@@ -2157,25 +2129,9 @@ reuse: | |||
2157 | balance_dirty_pages_ratelimited(mapping); | 2129 | balance_dirty_pages_ratelimited(mapping); |
2158 | } | 2130 | } |
2159 | 2131 | ||
2160 | /* file_update_time outside page_lock */ | 2132 | if (!page_mkwrite) |
2161 | if (vma->vm_file) | ||
2162 | file_update_time(vma->vm_file); | 2133 | file_update_time(vma->vm_file); |
2163 | } | 2134 | } |
2164 | put_page(dirty_page); | ||
2165 | if (page_mkwrite) { | ||
2166 | struct address_space *mapping = dirty_page->mapping; | ||
2167 | |||
2168 | set_page_dirty(dirty_page); | ||
2169 | unlock_page(dirty_page); | ||
2170 | page_cache_release(dirty_page); | ||
2171 | if (mapping) { | ||
2172 | /* | ||
2173 | * Some device drivers do not set page.mapping | ||
2174 | * but still dirty their pages | ||
2175 | */ | ||
2176 | balance_dirty_pages_ratelimited(mapping); | ||
2177 | } | ||
2178 | } | ||
2179 | 2135 | ||
2180 | return ret; | 2136 | return ret; |
2181 | } | 2137 | } |
@@ -2333,25 +2289,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, | |||
2333 | } | 2289 | } |
2334 | } | 2290 | } |
2335 | 2291 | ||
2336 | static inline void unmap_mapping_range_list(struct list_head *head, | ||
2337 | struct zap_details *details) | ||
2338 | { | ||
2339 | struct vm_area_struct *vma; | ||
2340 | |||
2341 | /* | ||
2342 | * In nonlinear VMAs there is no correspondence between virtual address | ||
2343 | * offset and file offset. So we must perform an exhaustive search | ||
2344 | * across *all* the pages in each nonlinear VMA, not just the pages | ||
2345 | * whose virtual address lies outside the file truncation point. | ||
2346 | */ | ||
2347 | list_for_each_entry(vma, head, shared.nonlinear) { | ||
2348 | details->nonlinear_vma = vma; | ||
2349 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); | ||
2350 | } | ||
2351 | } | ||
2352 | |||
2353 | /** | 2292 | /** |
2354 | * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. | 2293 | * unmap_mapping_range - unmap the portion of all mmaps in the specified |
2294 | * address_space corresponding to the specified page range in the underlying | ||
2295 | * file. | ||
2296 | * | ||
2355 | * @mapping: the address space containing mmaps to be unmapped. | 2297 | * @mapping: the address space containing mmaps to be unmapped. |
2356 | * @holebegin: byte in first page to unmap, relative to the start of | 2298 | * @holebegin: byte in first page to unmap, relative to the start of |
2357 | * the underlying file. This will be rounded down to a PAGE_SIZE | 2299 | * the underlying file. This will be rounded down to a PAGE_SIZE |
@@ -2380,7 +2322,6 @@ void unmap_mapping_range(struct address_space *mapping, | |||
2380 | } | 2322 | } |
2381 | 2323 | ||
2382 | details.check_mapping = even_cows? NULL: mapping; | 2324 | details.check_mapping = even_cows? NULL: mapping; |
2383 | details.nonlinear_vma = NULL; | ||
2384 | details.first_index = hba; | 2325 | details.first_index = hba; |
2385 | details.last_index = hba + hlen - 1; | 2326 | details.last_index = hba + hlen - 1; |
2386 | if (details.last_index < details.first_index) | 2327 | if (details.last_index < details.first_index) |
@@ -2390,8 +2331,6 @@ void unmap_mapping_range(struct address_space *mapping, | |||
2390 | i_mmap_lock_write(mapping); | 2331 | i_mmap_lock_write(mapping); |
2391 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) | 2332 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) |
2392 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | 2333 | unmap_mapping_range_tree(&mapping->i_mmap, &details); |
2393 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) | ||
2394 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); | ||
2395 | i_mmap_unlock_write(mapping); | 2334 | i_mmap_unlock_write(mapping); |
2396 | } | 2335 | } |
2397 | EXPORT_SYMBOL(unmap_mapping_range); | 2336 | EXPORT_SYMBOL(unmap_mapping_range); |
@@ -2752,8 +2691,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, | |||
2752 | entry = mk_pte(page, vma->vm_page_prot); | 2691 | entry = mk_pte(page, vma->vm_page_prot); |
2753 | if (write) | 2692 | if (write) |
2754 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2693 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2755 | else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) | ||
2756 | entry = pte_mksoft_dirty(entry); | ||
2757 | if (anon) { | 2694 | if (anon) { |
2758 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 2695 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2759 | page_add_new_anon_rmap(page, vma, address); | 2696 | page_add_new_anon_rmap(page, vma, address); |
@@ -2888,8 +2825,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2888 | * if page by the offset is not ready to be mapped (cold cache or | 2825 | * if page by the offset is not ready to be mapped (cold cache or |
2889 | * something). | 2826 | * something). |
2890 | */ | 2827 | */ |
2891 | if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && | 2828 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { |
2892 | fault_around_bytes >> PAGE_SHIFT > 1) { | ||
2893 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2829 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
2894 | do_fault_around(vma, address, pte, pgoff, flags); | 2830 | do_fault_around(vma, address, pte, pgoff, flags); |
2895 | if (!pte_same(*pte, orig_pte)) | 2831 | if (!pte_same(*pte, orig_pte)) |
@@ -3021,8 +2957,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3021 | balance_dirty_pages_ratelimited(mapping); | 2957 | balance_dirty_pages_ratelimited(mapping); |
3022 | } | 2958 | } |
3023 | 2959 | ||
3024 | /* file_update_time outside page_lock */ | 2960 | if (!vma->vm_ops->page_mkwrite) |
3025 | if (vma->vm_file && !vma->vm_ops->page_mkwrite) | ||
3026 | file_update_time(vma->vm_file); | 2961 | file_update_time(vma->vm_file); |
3027 | 2962 | ||
3028 | return ret; | 2963 | return ret; |
@@ -3034,7 +2969,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3034 | * The mmap_sem may have been released depending on flags and our | 2969 | * The mmap_sem may have been released depending on flags and our |
3035 | * return value. See filemap_fault() and __lock_page_or_retry(). | 2970 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3036 | */ | 2971 | */ |
3037 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2972 | static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
3038 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2973 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
3039 | unsigned int flags, pte_t orig_pte) | 2974 | unsigned int flags, pte_t orig_pte) |
3040 | { | 2975 | { |
@@ -3051,46 +2986,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3051 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 2986 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
3052 | } | 2987 | } |
3053 | 2988 | ||
3054 | /* | ||
3055 | * Fault of a previously existing named mapping. Repopulate the pte | ||
3056 | * from the encoded file_pte if possible. This enables swappable | ||
3057 | * nonlinear vmas. | ||
3058 | * | ||
3059 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
3060 | * but allow concurrent faults), and pte mapped but not yet locked. | ||
3061 | * We return with pte unmapped and unlocked. | ||
3062 | * The mmap_sem may have been released depending on flags and our | ||
3063 | * return value. See filemap_fault() and __lock_page_or_retry(). | ||
3064 | */ | ||
3065 | static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3066 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
3067 | unsigned int flags, pte_t orig_pte) | ||
3068 | { | ||
3069 | pgoff_t pgoff; | ||
3070 | |||
3071 | flags |= FAULT_FLAG_NONLINEAR; | ||
3072 | |||
3073 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | ||
3074 | return 0; | ||
3075 | |||
3076 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { | ||
3077 | /* | ||
3078 | * Page table corrupted: show pte and kill process. | ||
3079 | */ | ||
3080 | print_bad_pte(vma, address, orig_pte, NULL); | ||
3081 | return VM_FAULT_SIGBUS; | ||
3082 | } | ||
3083 | |||
3084 | pgoff = pte_to_pgoff(orig_pte); | ||
3085 | if (!(flags & FAULT_FLAG_WRITE)) | ||
3086 | return do_read_fault(mm, vma, address, pmd, pgoff, flags, | ||
3087 | orig_pte); | ||
3088 | if (!(vma->vm_flags & VM_SHARED)) | ||
3089 | return do_cow_fault(mm, vma, address, pmd, pgoff, flags, | ||
3090 | orig_pte); | ||
3091 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | ||
3092 | } | ||
3093 | |||
3094 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | 2989 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, |
3095 | unsigned long addr, int page_nid, | 2990 | unsigned long addr, int page_nid, |
3096 | int *flags) | 2991 | int *flags) |
@@ -3218,15 +3113,12 @@ static int handle_pte_fault(struct mm_struct *mm, | |||
3218 | if (pte_none(entry)) { | 3113 | if (pte_none(entry)) { |
3219 | if (vma->vm_ops) { | 3114 | if (vma->vm_ops) { |
3220 | if (likely(vma->vm_ops->fault)) | 3115 | if (likely(vma->vm_ops->fault)) |
3221 | return do_linear_fault(mm, vma, address, | 3116 | return do_fault(mm, vma, address, pte, |
3222 | pte, pmd, flags, entry); | 3117 | pmd, flags, entry); |
3223 | } | 3118 | } |
3224 | return do_anonymous_page(mm, vma, address, | 3119 | return do_anonymous_page(mm, vma, address, |
3225 | pte, pmd, flags); | 3120 | pte, pmd, flags); |
3226 | } | 3121 | } |
3227 | if (pte_file(entry)) | ||
3228 | return do_nonlinear_fault(mm, vma, address, | ||
3229 | pte, pmd, flags, entry); | ||
3230 | return do_swap_page(mm, vma, address, | 3122 | return do_swap_page(mm, vma, address, |
3231 | pte, pmd, flags, entry); | 3123 | pte, pmd, flags, entry); |
3232 | } | 3124 | } |
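With pte_file() gone, handle_pte_fault() is left with only two non-present cases. A condensed paraphrase of the dispatch in the hunk above, for orientation only (not literal kernel source):

	if (pte_none(entry)) {
		if (vma->vm_ops && vma->vm_ops->fault)
			return do_fault(mm, vma, address, pte, pmd, flags, entry);
		return do_anonymous_page(mm, vma, address, pte, pmd, flags);
	}
	/* Anything else that is not present must now be a swap/migration entry. */
	return do_swap_page(mm, vma, address, pte, pmd, flags, entry);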
diff --git a/mm/migrate.c b/mm/migrate.c index 344cdf692fc8..6e284bcca8bb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -179,37 +179,6 @@ out: | |||
179 | } | 179 | } |
180 | 180 | ||
181 | /* | 181 | /* |
182 | * Congratulations to trinity for discovering this bug. | ||
183 | * mm/fremap.c's remap_file_pages() accepts any range within a single vma to | ||
184 | * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then | ||
185 | * replace the specified range by file ptes throughout (maybe populated after). | ||
186 | * If page migration finds a page within that range, while it's still located | ||
187 | * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: | ||
188 | * zap_pte() clears the temporary migration entry before mmap_sem is dropped. | ||
189 | * But if the migrating page is in a part of the vma outside the range to be | ||
190 | * remapped, then it will not be cleared, and remove_migration_ptes() needs to | ||
191 | * deal with it. Fortunately, this part of the vma is of course still linear, | ||
192 | * so we just need to use linear location on the nonlinear list. | ||
193 | */ | ||
194 | static int remove_linear_migration_ptes_from_nonlinear(struct page *page, | ||
195 | struct address_space *mapping, void *arg) | ||
196 | { | ||
197 | struct vm_area_struct *vma; | ||
198 | /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ | ||
199 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
200 | unsigned long addr; | ||
201 | |||
202 | list_for_each_entry(vma, | ||
203 | &mapping->i_mmap_nonlinear, shared.nonlinear) { | ||
204 | |||
205 | addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | ||
206 | if (addr >= vma->vm_start && addr < vma->vm_end) | ||
207 | remove_migration_pte(page, vma, addr, arg); | ||
208 | } | ||
209 | return SWAP_AGAIN; | ||
210 | } | ||
211 | |||
212 | /* | ||
213 | * Get rid of all migration entries and replace them by | 182 | * Get rid of all migration entries and replace them by |
214 | * references to the indicated page. | 183 | * references to the indicated page. |
215 | */ | 184 | */ |
@@ -218,7 +187,6 @@ static void remove_migration_ptes(struct page *old, struct page *new) | |||
218 | struct rmap_walk_control rwc = { | 187 | struct rmap_walk_control rwc = { |
219 | .rmap_one = remove_migration_pte, | 188 | .rmap_one = remove_migration_pte, |
220 | .arg = old, | 189 | .arg = old, |
221 | .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, | ||
222 | }; | 190 | }; |
223 | 191 | ||
224 | rmap_walk(new, &rwc); | 192 | rmap_walk(new, &rwc); |
diff --git a/mm/mincore.c b/mm/mincore.c index c8c528b36641..46527c023e0c 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -124,17 +124,13 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
124 | ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 124 | ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
125 | do { | 125 | do { |
126 | pte_t pte = *ptep; | 126 | pte_t pte = *ptep; |
127 | pgoff_t pgoff; | ||
128 | 127 | ||
129 | next = addr + PAGE_SIZE; | 128 | next = addr + PAGE_SIZE; |
130 | if (pte_none(pte)) | 129 | if (pte_none(pte)) |
131 | mincore_unmapped_range(vma, addr, next, vec); | 130 | mincore_unmapped_range(vma, addr, next, vec); |
132 | else if (pte_present(pte)) | 131 | else if (pte_present(pte)) |
133 | *vec = 1; | 132 | *vec = 1; |
134 | else if (pte_file(pte)) { | 133 | else { /* pte is a swap entry */ |
135 | pgoff = pte_to_pgoff(pte); | ||
136 | *vec = mincore_page(vma->vm_file->f_mapping, pgoff); | ||
137 | } else { /* pte is a swap entry */ | ||
138 | swp_entry_t entry = pte_to_swp_entry(pte); | 134 | swp_entry_t entry = pte_to_swp_entry(pte); |
139 | 135 | ||
140 | if (non_swap_entry(entry)) { | 136 | if (non_swap_entry(entry)) { |
@@ -145,9 +141,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
145 | *vec = 1; | 141 | *vec = 1; |
146 | } else { | 142 | } else { |
147 | #ifdef CONFIG_SWAP | 143 | #ifdef CONFIG_SWAP |
148 | pgoff = entry.val; | ||
149 | *vec = mincore_page(swap_address_space(entry), | 144 | *vec = mincore_page(swap_address_space(entry), |
150 | pgoff); | 145 | entry.val); |
151 | #else | 146 | #else |
152 | WARN_ON(1); | 147 | WARN_ON(1); |
153 | *vec = 1; | 148 | *vec = 1; |
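The interface whose kernel side is simplified above is mincore(2). A small, self-contained userspace example (assuming Linux with glibc) that maps an anonymous region, touches one page, and reports per-page residency:

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		long psz = sysconf(_SC_PAGESIZE);
		size_t len = 16 * (size_t)psz;
		size_t pages = len / (size_t)psz;
		unsigned char *vec = malloc(pages);
		unsigned char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
					  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (!vec || buf == MAP_FAILED)
			return 1;
		buf[0] = 1;				/* fault in page 0 only */
		if (mincore(buf, len, vec))		/* one status byte per page */
			return 1;
		for (size_t i = 0; i < pages; i++)
			printf("page %zu: %s\n", i,
			       (vec[i] & 1) ? "resident" : "not resident");
		return 0;
	}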
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -243,10 +243,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, | |||
243 | mapping_unmap_writable(mapping); | 243 | mapping_unmap_writable(mapping); |
244 | 244 | ||
245 | flush_dcache_mmap_lock(mapping); | 245 | flush_dcache_mmap_lock(mapping); |
246 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 246 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
247 | list_del_init(&vma->shared.nonlinear); | ||
248 | else | ||
249 | vma_interval_tree_remove(vma, &mapping->i_mmap); | ||
250 | flush_dcache_mmap_unlock(mapping); | 247 | flush_dcache_mmap_unlock(mapping); |
251 | } | 248 | } |
252 | 249 | ||
@@ -649,10 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma) | |||
649 | atomic_inc(&mapping->i_mmap_writable); | 646 | atomic_inc(&mapping->i_mmap_writable); |
650 | 647 | ||
651 | flush_dcache_mmap_lock(mapping); | 648 | flush_dcache_mmap_lock(mapping); |
652 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 649 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
653 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | ||
654 | else | ||
655 | vma_interval_tree_insert(vma, &mapping->i_mmap); | ||
656 | flush_dcache_mmap_unlock(mapping); | 650 | flush_dcache_mmap_unlock(mapping); |
657 | } | 651 | } |
658 | } | 652 | } |
@@ -789,14 +783,11 @@ again: remove_next = 1 + (end > next->vm_end); | |||
789 | 783 | ||
790 | if (file) { | 784 | if (file) { |
791 | mapping = file->f_mapping; | 785 | mapping = file->f_mapping; |
792 | if (!(vma->vm_flags & VM_NONLINEAR)) { | 786 | root = &mapping->i_mmap; |
793 | root = &mapping->i_mmap; | 787 | uprobe_munmap(vma, vma->vm_start, vma->vm_end); |
794 | uprobe_munmap(vma, vma->vm_start, vma->vm_end); | ||
795 | 788 | ||
796 | if (adjust_next) | 789 | if (adjust_next) |
797 | uprobe_munmap(next, next->vm_start, | 790 | uprobe_munmap(next, next->vm_start, next->vm_end); |
798 | next->vm_end); | ||
799 | } | ||
800 | 791 | ||
801 | i_mmap_lock_write(mapping); | 792 | i_mmap_lock_write(mapping); |
802 | if (insert) { | 793 | if (insert) { |
@@ -2634,6 +2625,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) | |||
2634 | return vm_munmap(addr, len); | 2625 | return vm_munmap(addr, len); |
2635 | } | 2626 | } |
2636 | 2627 | ||
2628 | |||
2629 | /* | ||
2630 | * Emulation of deprecated remap_file_pages() syscall. | ||
2631 | */ | ||
2632 | SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | ||
2633 | unsigned long, prot, unsigned long, pgoff, unsigned long, flags) | ||
2634 | { | ||
2635 | |||
2636 | struct mm_struct *mm = current->mm; | ||
2637 | struct vm_area_struct *vma; | ||
2638 | unsigned long populate = 0; | ||
2639 | unsigned long ret = -EINVAL; | ||
2640 | struct file *file; | ||
2641 | |||
2642 | pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " | ||
2643 | "See Documentation/vm/remap_file_pages.txt.\n", | ||
2644 | current->comm, current->pid); | ||
2645 | |||
2646 | if (prot) | ||
2647 | return ret; | ||
2648 | start = start & PAGE_MASK; | ||
2649 | size = size & PAGE_MASK; | ||
2650 | |||
2651 | if (start + size <= start) | ||
2652 | return ret; | ||
2653 | |||
2654 | /* Does pgoff wrap? */ | ||
2655 | if (pgoff + (size >> PAGE_SHIFT) < pgoff) | ||
2656 | return ret; | ||
2657 | |||
2658 | down_write(&mm->mmap_sem); | ||
2659 | vma = find_vma(mm, start); | ||
2660 | |||
2661 | if (!vma || !(vma->vm_flags & VM_SHARED)) | ||
2662 | goto out; | ||
2663 | |||
2664 | if (start < vma->vm_start || start + size > vma->vm_end) | ||
2665 | goto out; | ||
2666 | |||
2667 | if (pgoff == linear_page_index(vma, start)) { | ||
2668 | ret = 0; | ||
2669 | goto out; | ||
2670 | } | ||
2671 | |||
2672 | prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; | ||
2673 | prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; | ||
2674 | prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; | ||
2675 | |||
2676 | flags &= MAP_NONBLOCK; | ||
2677 | flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; | ||
2678 | if (vma->vm_flags & VM_LOCKED) { | ||
2679 | flags |= MAP_LOCKED; | ||
2680 | /* drop PG_Mlocked flag for over-mapped range */ | ||
2681 | munlock_vma_pages_range(vma, start, start + size); | ||
2682 | } | ||
2683 | |||
2684 | file = get_file(vma->vm_file); | ||
2685 | ret = do_mmap_pgoff(vma->vm_file, start, size, | ||
2686 | prot, flags, pgoff, &populate); | ||
2687 | fput(file); | ||
2688 | out: | ||
2689 | up_write(&mm->mmap_sem); | ||
2690 | if (populate) | ||
2691 | mm_populate(ret, populate); | ||
2692 | if (!IS_ERR_VALUE(ret)) | ||
2693 | ret = 0; | ||
2694 | return ret; | ||
2695 | } | ||
2696 | |||
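From userspace the deprecated interface keeps working; the kernel simply builds the ordinary mapping shown above instead of a nonlinear one. A hedged sketch (the file descriptor, protections, and page choice are hypothetical) of the old call and the mmap() the emulation boils down to:

	#define _GNU_SOURCE
	#include <sys/mman.h>
	#include <unistd.h>

	/* 'addr' is one page inside an existing MAP_SHARED mapping of 'fd'. */
	static void *rebind_to_file_page_two(void *addr, size_t pagesz, int fd)
	{
		/* Deprecated call: make this page show file page 2 (prot must be 0). */
		if (remap_file_pages(addr, pagesz, 0, 2, 0) == 0)
			return addr;

		/* Roughly what the emulation above performs on the caller's behalf:
		 * a fixed, shared, populated mapping at the same address. */
		return mmap(addr, pagesz, PROT_READ | PROT_WRITE,
			    MAP_SHARED | MAP_FIXED | MAP_POPULATE, fd, 2 * pagesz);
	}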
2637 | static inline void verify_mm_writelocked(struct mm_struct *mm) | 2697 | static inline void verify_mm_writelocked(struct mm_struct *mm) |
2638 | { | 2698 | { |
2639 | #ifdef CONFIG_DEBUG_VM | 2699 | #ifdef CONFIG_DEBUG_VM |
@@ -3108,8 +3168,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
3108 | * | 3168 | * |
3109 | * mmap_sem in write mode is required in order to block all operations | 3169 | * mmap_sem in write mode is required in order to block all operations |
3110 | * that could modify pagetables and free pages without need of | 3170 | * that could modify pagetables and free pages without need of |
3111 | * altering the vma layout (for example populate_range() with | 3171 | * altering the vma layout. It's also needed in write mode to avoid new |
3112 | * nonlinear vmas). It's also needed in write mode to avoid new | ||
3113 | * anon_vmas to be associated with existing vmas. | 3172 | * anon_vmas to be associated with existing vmas. |
3114 | * | 3173 | * |
3115 | * A single task can't take more than one mm_take_all_locks() in a row | 3174 | * A single task can't take more than one mm_take_all_locks() in a row |
diff --git a/mm/mprotect.c b/mm/mprotect.c index ace93454ce8e..33121662f08b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -105,7 +105,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
105 | } | 105 | } |
106 | if (updated) | 106 | if (updated) |
107 | pages++; | 107 | pages++; |
108 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { | 108 | } else if (IS_ENABLED(CONFIG_MIGRATION)) { |
109 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 109 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
110 | 110 | ||
111 | if (is_write_migration_entry(entry)) { | 111 | if (is_write_migration_entry(entry)) { |
diff --git a/mm/mremap.c b/mm/mremap.c index 17fa018f5f39..57dadc025c64 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -81,8 +81,6 @@ static pte_t move_soft_dirty_pte(pte_t pte) | |||
81 | pte = pte_mksoft_dirty(pte); | 81 | pte = pte_mksoft_dirty(pte); |
82 | else if (is_swap_pte(pte)) | 82 | else if (is_swap_pte(pte)) |
83 | pte = pte_swp_mksoft_dirty(pte); | 83 | pte = pte_swp_mksoft_dirty(pte); |
84 | else if (pte_file(pte)) | ||
85 | pte = pte_file_mksoft_dirty(pte); | ||
86 | #endif | 84 | #endif |
87 | return pte; | 85 | return pte; |
88 | } | 86 | } |
diff --git a/mm/msync.c b/mm/msync.c index 992a1673d488..bb04d53ae852 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
@@ -86,10 +86,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) | |||
86 | (vma->vm_flags & VM_SHARED)) { | 86 | (vma->vm_flags & VM_SHARED)) { |
87 | get_file(file); | 87 | get_file(file); |
88 | up_read(&mm->mmap_sem); | 88 | up_read(&mm->mmap_sem); |
89 | if (vma->vm_flags & VM_NONLINEAR) | 89 | error = vfs_fsync_range(file, fstart, fend, 1); |
90 | error = vfs_fsync(file, 1); | ||
91 | else | ||
92 | error = vfs_fsync_range(file, fstart, fend, 1); | ||
93 | fput(file); | 90 | fput(file); |
94 | if (error || start >= end) | 91 | if (error || start >= end) |
95 | goto out; | 92 | goto out; |
diff --git a/mm/nommu.c b/mm/nommu.c index 28bd8c4dff6f..541bed64e348 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1984,14 +1984,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1984 | } | 1984 | } |
1985 | EXPORT_SYMBOL(filemap_map_pages); | 1985 | EXPORT_SYMBOL(filemap_map_pages); |
1986 | 1986 | ||
1987 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, | ||
1988 | unsigned long size, pgoff_t pgoff) | ||
1989 | { | ||
1990 | BUG(); | ||
1991 | return 0; | ||
1992 | } | ||
1993 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
1994 | |||
1995 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | 1987 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, |
1996 | unsigned long addr, void *buf, int len, int write) | 1988 | unsigned long addr, void *buf, int len, int write) |
1997 | { | 1989 | { |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e20f9c2fa5a..f121050e8530 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -552,17 +552,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
552 | return 0; | 552 | return 0; |
553 | 553 | ||
554 | if (page_is_guard(buddy) && page_order(buddy) == order) { | 554 | if (page_is_guard(buddy) && page_order(buddy) == order) { |
555 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
556 | |||
557 | if (page_zone_id(page) != page_zone_id(buddy)) | 555 | if (page_zone_id(page) != page_zone_id(buddy)) |
558 | return 0; | 556 | return 0; |
559 | 557 | ||
558 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
559 | |||
560 | return 1; | 560 | return 1; |
561 | } | 561 | } |
562 | 562 | ||
563 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 563 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
564 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
565 | |||
566 | /* | 564 | /* |
567 | * zone check is done late to avoid uselessly | 565 | * zone check is done late to avoid uselessly |
568 | * calculating zone/node ids for pages that could | 566 | * calculating zone/node ids for pages that could |
@@ -571,6 +569,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
571 | if (page_zone_id(page) != page_zone_id(buddy)) | 569 | if (page_zone_id(page) != page_zone_id(buddy)) |
572 | return 0; | 570 | return 0; |
573 | 571 | ||
572 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
573 | |||
574 | return 1; | 574 | return 1; |
575 | } | 575 | } |
576 | return 0; | 576 | return 0; |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -590,9 +590,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 590 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | ||
590 | if (!vma->anon_vma || !page__anon_vma || | 590 | if (!vma->anon_vma || !page__anon_vma || |
591 | vma->anon_vma->root != page__anon_vma->root) | 591 | vma->anon_vma->root != page__anon_vma->root) |
592 | return -EFAULT; | 592 | return -EFAULT; |
593 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { | 593 | } else if (page->mapping) { |
594 | if (!vma->vm_file || | 594 | if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) |
595 | vma->vm_file->f_mapping != page->mapping) | ||
596 | return -EFAULT; | 595 | return -EFAULT; |
597 | } else | 596 | } else |
598 | return -EFAULT; | 597 | return -EFAULT; |
@@ -1274,7 +1273,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1274 | if (pte_soft_dirty(pteval)) | 1273 | if (pte_soft_dirty(pteval)) |
1275 | swp_pte = pte_swp_mksoft_dirty(swp_pte); | 1274 | swp_pte = pte_swp_mksoft_dirty(swp_pte); |
1276 | set_pte_at(mm, address, pte, swp_pte); | 1275 | set_pte_at(mm, address, pte, swp_pte); |
1277 | BUG_ON(pte_file(*pte)); | ||
1278 | } else if (IS_ENABLED(CONFIG_MIGRATION) && | 1276 | } else if (IS_ENABLED(CONFIG_MIGRATION) && |
1279 | (flags & TTU_MIGRATION)) { | 1277 | (flags & TTU_MIGRATION)) { |
1280 | /* Establish migration entry for a file page */ | 1278 | /* Establish migration entry for a file page */ |
@@ -1316,211 +1314,6 @@ out_mlock: | |||
1316 | return ret; | 1314 | return ret; |
1317 | } | 1315 | } |
1318 | 1316 | ||
1319 | /* | ||
1320 | * objrmap doesn't work for nonlinear VMAs because the assumption that | ||
1321 | * offset-into-file correlates with offset-into-virtual-addresses does not hold. | ||
1322 | * Consequently, given a particular page and its ->index, we cannot locate the | ||
1323 | * ptes which are mapping that page without an exhaustive linear search. | ||
1324 | * | ||
1325 | * So what this code does is a mini "virtual scan" of each nonlinear VMA which | ||
1326 | * maps the file to which the target page belongs. The ->vm_private_data field | ||
1327 | * holds the current cursor into that scan. Successive searches will circulate | ||
1328 | * around the vma's virtual address space. | ||
1329 | * | ||
1330 | * So as more replacement pressure is applied to the pages in a nonlinear VMA, | ||
1331 | * more scanning pressure is placed against them as well. Eventually pages | ||
1332 | * will become fully unmapped and are eligible for eviction. | ||
1333 | * | ||
1334 | * For very sparsely populated VMAs this is a little inefficient - chances are | ||
1335 | * there won't be many ptes located within the scan cluster. In this case | ||
1336 | * maybe we could scan further - to the end of the pte page, perhaps. | ||
1337 | * | ||
1338 | * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can | ||
1339 | * acquire it without blocking. If vma locked, mlock the pages in the cluster, | ||
1340 | * rather than unmapping them. If we encounter the "check_page" that vmscan is | ||
1341 | * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. | ||
1342 | */ | ||
1343 | #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) | ||
1344 | #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) | ||
1345 | |||
1346 | static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | ||
1347 | struct vm_area_struct *vma, struct page *check_page) | ||
1348 | { | ||
1349 | struct mm_struct *mm = vma->vm_mm; | ||
1350 | pmd_t *pmd; | ||
1351 | pte_t *pte; | ||
1352 | pte_t pteval; | ||
1353 | spinlock_t *ptl; | ||
1354 | struct page *page; | ||
1355 | unsigned long address; | ||
1356 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
1357 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1358 | unsigned long end; | ||
1359 | int ret = SWAP_AGAIN; | ||
1360 | int locked_vma = 0; | ||
1361 | |||
1362 | address = (vma->vm_start + cursor) & CLUSTER_MASK; | ||
1363 | end = address + CLUSTER_SIZE; | ||
1364 | if (address < vma->vm_start) | ||
1365 | address = vma->vm_start; | ||
1366 | if (end > vma->vm_end) | ||
1367 | end = vma->vm_end; | ||
1368 | |||
1369 | pmd = mm_find_pmd(mm, address); | ||
1370 | if (!pmd) | ||
1371 | return ret; | ||
1372 | |||
1373 | mmun_start = address; | ||
1374 | mmun_end = end; | ||
1375 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1376 | |||
1377 | /* | ||
1378 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, | ||
1379 | * keep the sem while scanning the cluster for mlocking pages. | ||
1380 | */ | ||
1381 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
1382 | locked_vma = (vma->vm_flags & VM_LOCKED); | ||
1383 | if (!locked_vma) | ||
1384 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ | ||
1385 | } | ||
1386 | |||
1387 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
1388 | |||
1389 | /* Update high watermark before we lower rss */ | ||
1390 | update_hiwater_rss(mm); | ||
1391 | |||
1392 | for (; address < end; pte++, address += PAGE_SIZE) { | ||
1393 | if (!pte_present(*pte)) | ||
1394 | continue; | ||
1395 | page = vm_normal_page(vma, address, *pte); | ||
1396 | BUG_ON(!page || PageAnon(page)); | ||
1397 | |||
1398 | if (locked_vma) { | ||
1399 | if (page == check_page) { | ||
1400 | /* we know we have check_page locked */ | ||
1401 | mlock_vma_page(page); | ||
1402 | ret = SWAP_MLOCK; | ||
1403 | } else if (trylock_page(page)) { | ||
1404 | /* | ||
1405 | * If we can lock the page, perform mlock. | ||
1406 | * Otherwise leave the page alone, it will be | ||
1407 | * eventually encountered again later. | ||
1408 | */ | ||
1409 | mlock_vma_page(page); | ||
1410 | unlock_page(page); | ||
1411 | } | ||
1412 | continue; /* don't unmap */ | ||
1413 | } | ||
1414 | |||
1415 | /* | ||
1416 | * No need for _notify because we're within an | ||
1417 | * mmu_notifier_invalidate_range_ {start|end} scope. | ||
1418 | */ | ||
1419 | if (ptep_clear_flush_young(vma, address, pte)) | ||
1420 | continue; | ||
1421 | |||
1422 | /* Nuke the page table entry. */ | ||
1423 | flush_cache_page(vma, address, pte_pfn(*pte)); | ||
1424 | pteval = ptep_clear_flush_notify(vma, address, pte); | ||
1425 | |||
1426 | /* If nonlinear, store the file page offset in the pte. */ | ||
1427 | if (page->index != linear_page_index(vma, address)) { | ||
1428 | pte_t ptfile = pgoff_to_pte(page->index); | ||
1429 | if (pte_soft_dirty(pteval)) | ||
1430 | ptfile = pte_file_mksoft_dirty(ptfile); | ||
1431 | set_pte_at(mm, address, pte, ptfile); | ||
1432 | } | ||
1433 | |||
1434 | /* Move the dirty bit to the physical page now the pte is gone. */ | ||
1435 | if (pte_dirty(pteval)) | ||
1436 | set_page_dirty(page); | ||
1437 | |||
1438 | page_remove_rmap(page); | ||
1439 | page_cache_release(page); | ||
1440 | dec_mm_counter(mm, MM_FILEPAGES); | ||
1441 | (*mapcount)--; | ||
1442 | } | ||
1443 | pte_unmap_unlock(pte - 1, ptl); | ||
1444 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1445 | if (locked_vma) | ||
1446 | up_read(&vma->vm_mm->mmap_sem); | ||
1447 | return ret; | ||
1448 | } | ||
1449 | |||
1450 | static int try_to_unmap_nonlinear(struct page *page, | ||
1451 | struct address_space *mapping, void *arg) | ||
1452 | { | ||
1453 | struct vm_area_struct *vma; | ||
1454 | int ret = SWAP_AGAIN; | ||
1455 | unsigned long cursor; | ||
1456 | unsigned long max_nl_cursor = 0; | ||
1457 | unsigned long max_nl_size = 0; | ||
1458 | unsigned int mapcount; | ||
1459 | |||
1460 | list_for_each_entry(vma, | ||
1461 | &mapping->i_mmap_nonlinear, shared.nonlinear) { | ||
1462 | |||
1463 | cursor = (unsigned long) vma->vm_private_data; | ||
1464 | if (cursor > max_nl_cursor) | ||
1465 | max_nl_cursor = cursor; | ||
1466 | cursor = vma->vm_end - vma->vm_start; | ||
1467 | if (cursor > max_nl_size) | ||
1468 | max_nl_size = cursor; | ||
1469 | } | ||
1470 | |||
1471 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ | ||
1472 | return SWAP_FAIL; | ||
1473 | } | ||
1474 | |||
1475 | /* | ||
1476 | * We don't try to search for this page in the nonlinear vmas, | ||
1477 | * and page_referenced wouldn't have found it anyway. Instead | ||
1478 | * just walk the nonlinear vmas trying to age and unmap some. | ||
1479 | * The mapcount of the page we came in with is irrelevant, | ||
1480 | * but even so use it as a guide to how hard we should try? | ||
1481 | */ | ||
1482 | mapcount = page_mapcount(page); | ||
1483 | if (!mapcount) | ||
1484 | return ret; | ||
1485 | |||
1486 | cond_resched(); | ||
1487 | |||
1488 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; | ||
1489 | if (max_nl_cursor == 0) | ||
1490 | max_nl_cursor = CLUSTER_SIZE; | ||
1491 | |||
1492 | do { | ||
1493 | list_for_each_entry(vma, | ||
1494 | &mapping->i_mmap_nonlinear, shared.nonlinear) { | ||
1495 | |||
1496 | cursor = (unsigned long) vma->vm_private_data; | ||
1497 | while (cursor < max_nl_cursor && | ||
1498 | cursor < vma->vm_end - vma->vm_start) { | ||
1499 | if (try_to_unmap_cluster(cursor, &mapcount, | ||
1500 | vma, page) == SWAP_MLOCK) | ||
1501 | ret = SWAP_MLOCK; | ||
1502 | cursor += CLUSTER_SIZE; | ||
1503 | vma->vm_private_data = (void *) cursor; | ||
1504 | if ((int)mapcount <= 0) | ||
1505 | return ret; | ||
1506 | } | ||
1507 | vma->vm_private_data = (void *) max_nl_cursor; | ||
1508 | } | ||
1509 | cond_resched(); | ||
1510 | max_nl_cursor += CLUSTER_SIZE; | ||
1511 | } while (max_nl_cursor <= max_nl_size); | ||
1512 | |||
1513 | /* | ||
1514 | * Don't loop forever (perhaps all the remaining pages are | ||
1515 | * in locked vmas). Reset cursor on all unreserved nonlinear | ||
1516 | * vmas, now forgetting on which ones it had fallen behind. | ||
1517 | */ | ||
1518 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) | ||
1519 | vma->vm_private_data = NULL; | ||
1520 | |||
1521 | return ret; | ||
1522 | } | ||
1523 | |||
1524 | bool is_vma_temporary_stack(struct vm_area_struct *vma) | 1317 | bool is_vma_temporary_stack(struct vm_area_struct *vma) |
1525 | { | 1318 | { |
1526 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | 1319 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); |
@@ -1566,7 +1359,6 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1566 | .rmap_one = try_to_unmap_one, | 1359 | .rmap_one = try_to_unmap_one, |
1567 | .arg = (void *)flags, | 1360 | .arg = (void *)flags, |
1568 | .done = page_not_mapped, | 1361 | .done = page_not_mapped, |
1569 | .file_nonlinear = try_to_unmap_nonlinear, | ||
1570 | .anon_lock = page_lock_anon_vma_read, | 1362 | .anon_lock = page_lock_anon_vma_read, |
1571 | }; | 1363 | }; |
1572 | 1364 | ||
@@ -1612,12 +1404,6 @@ int try_to_munlock(struct page *page) | |||
1612 | .rmap_one = try_to_unmap_one, | 1404 | .rmap_one = try_to_unmap_one, |
1613 | .arg = (void *)TTU_MUNLOCK, | 1405 | .arg = (void *)TTU_MUNLOCK, |
1614 | .done = page_not_mapped, | 1406 | .done = page_not_mapped, |
1615 | /* | ||
1616 | * We don't bother to try to find the munlocked page in | ||
1617 | * nonlinears. It's costly. Instead, later, page reclaim logic | ||
1618 | * may call try_to_unmap() and recover PG_mlocked lazily. | ||
1619 | */ | ||
1620 | .file_nonlinear = NULL, | ||
1621 | .anon_lock = page_lock_anon_vma_read, | 1407 | .anon_lock = page_lock_anon_vma_read, |
1622 | 1408 | ||
1623 | }; | 1409 | }; |
@@ -1748,13 +1534,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | |||
1748 | goto done; | 1534 | goto done; |
1749 | } | 1535 | } |
1750 | 1536 | ||
1751 | if (!rwc->file_nonlinear) | ||
1752 | goto done; | ||
1753 | |||
1754 | if (list_empty(&mapping->i_mmap_nonlinear)) | ||
1755 | goto done; | ||
1756 | |||
1757 | ret = rwc->file_nonlinear(page, mapping, rwc->arg); | ||
1758 | done: | 1537 | done: |
1759 | i_mmap_unlock_read(mapping); | 1538 | i_mmap_unlock_read(mapping); |
1760 | return ret; | 1539 | return ret; |
diff --git a/mm/shmem.c b/mm/shmem.c index 993e6ba689cc..b3e403181981 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -3201,7 +3201,6 @@ static const struct vm_operations_struct shmem_vm_ops = { | |||
3201 | .set_policy = shmem_set_policy, | 3201 | .set_policy = shmem_set_policy, |
3202 | .get_policy = shmem_get_policy, | 3202 | .get_policy = shmem_get_policy, |
3203 | #endif | 3203 | #endif |
3204 | .remap_pages = generic_file_remap_pages, | ||
3205 | }; | 3204 | }; |
3206 | 3205 | ||
3207 | static struct dentry *shmem_mount(struct file_system_type *fs_type, | 3206 | static struct dentry *shmem_mount(struct file_system_type *fs_type, |
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
@@ -235,7 +235,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, | |||
235 | return 0; | 235 | return 0; |
236 | if (is_root_cache(s)) | 236 | if (is_root_cache(s)) |
237 | return 0; | 237 | return 0; |
238 | return __memcg_charge_slab(s, gfp, order); | 238 | return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order); |
239 | } | 239 | } |
240 | 240 | ||
241 | static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | 241 | static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) |
@@ -244,7 +244,7 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | |||
244 | return; | 244 | return; |
245 | if (is_root_cache(s)) | 245 | if (is_root_cache(s)) |
246 | return; | 246 | return; |
247 | __memcg_uncharge_slab(s, order); | 247 | memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order); |
248 | } | 248 | } |
249 | #else | 249 | #else |
250 | static inline bool is_root_cache(struct kmem_cache *s) | 250 | static inline bool is_root_cache(struct kmem_cache *s) |
diff --git a/mm/slab_common.c b/mm/slab_common.c index e03dd6f2a272..6e1e4cf65836 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -331,7 +331,7 @@ out: | |||
331 | 331 | ||
332 | out_free_cache: | 332 | out_free_cache: |
333 | memcg_free_cache_params(s); | 333 | memcg_free_cache_params(s); |
334 | kfree(s); | 334 | kmem_cache_free(kmem_cache, s); |
335 | goto out; | 335 | goto out; |
336 | } | 336 | } |
337 | 337 | ||
@@ -425,21 +425,64 @@ out_unlock: | |||
425 | } | 425 | } |
426 | EXPORT_SYMBOL(kmem_cache_create); | 426 | EXPORT_SYMBOL(kmem_cache_create); |
427 | 427 | ||
428 | static int do_kmem_cache_shutdown(struct kmem_cache *s, | ||
429 | struct list_head *release, bool *need_rcu_barrier) | ||
430 | { | ||
431 | if (__kmem_cache_shutdown(s) != 0) { | ||
432 | printk(KERN_ERR "kmem_cache_destroy %s: " | ||
433 | "Slab cache still has objects\n", s->name); | ||
434 | dump_stack(); | ||
435 | return -EBUSY; | ||
436 | } | ||
437 | |||
438 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
439 | *need_rcu_barrier = true; | ||
440 | |||
441 | #ifdef CONFIG_MEMCG_KMEM | ||
442 | if (!is_root_cache(s)) { | ||
443 | struct kmem_cache *root_cache = s->memcg_params->root_cache; | ||
444 | int memcg_id = memcg_cache_id(s->memcg_params->memcg); | ||
445 | |||
446 | BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s); | ||
447 | root_cache->memcg_params->memcg_caches[memcg_id] = NULL; | ||
448 | } | ||
449 | #endif | ||
450 | list_move(&s->list, release); | ||
451 | return 0; | ||
452 | } | ||
453 | |||
454 | static void do_kmem_cache_release(struct list_head *release, | ||
455 | bool need_rcu_barrier) | ||
456 | { | ||
457 | struct kmem_cache *s, *s2; | ||
458 | |||
459 | if (need_rcu_barrier) | ||
460 | rcu_barrier(); | ||
461 | |||
462 | list_for_each_entry_safe(s, s2, release, list) { | ||
463 | #ifdef SLAB_SUPPORTS_SYSFS | ||
464 | sysfs_slab_remove(s); | ||
465 | #else | ||
466 | slab_kmem_cache_release(s); | ||
467 | #endif | ||
468 | } | ||
469 | } | ||
470 | |||
428 | #ifdef CONFIG_MEMCG_KMEM | 471 | #ifdef CONFIG_MEMCG_KMEM |
429 | /* | 472 | /* |
430 | * memcg_create_kmem_cache - Create a cache for a memory cgroup. | 473 | * memcg_create_kmem_cache - Create a cache for a memory cgroup. |
431 | * @memcg: The memory cgroup the new cache is for. | 474 | * @memcg: The memory cgroup the new cache is for. |
432 | * @root_cache: The parent of the new cache. | 475 | * @root_cache: The parent of the new cache. |
433 | * @memcg_name: The name of the memory cgroup (used for naming the new cache). | ||
434 | * | 476 | * |
435 | * This function attempts to create a kmem cache that will serve allocation | 477 | * This function attempts to create a kmem cache that will serve allocation |
436 | * requests going from @memcg to @root_cache. The new cache inherits properties | 478 | * requests going from @memcg to @root_cache. The new cache inherits properties |
437 | * from its parent. | 479 | * from its parent. |
438 | */ | 480 | */ |
439 | struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | 481 | void memcg_create_kmem_cache(struct mem_cgroup *memcg, |
440 | struct kmem_cache *root_cache, | 482 | struct kmem_cache *root_cache) |
441 | const char *memcg_name) | ||
442 | { | 483 | { |
484 | static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ | ||
485 | int memcg_id = memcg_cache_id(memcg); | ||
443 | struct kmem_cache *s = NULL; | 486 | struct kmem_cache *s = NULL; |
444 | char *cache_name; | 487 | char *cache_name; |
445 | 488 | ||
@@ -448,8 +491,18 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | |||
448 | 491 | ||
449 | mutex_lock(&slab_mutex); | 492 | mutex_lock(&slab_mutex); |
450 | 493 | ||
494 | /* | ||
495 | * Since per-memcg caches are created asynchronously on first | ||
496 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
497 | * create the same cache, but only one of them may succeed. | ||
498 | */ | ||
499 | if (cache_from_memcg_idx(root_cache, memcg_id)) | ||
500 | goto out_unlock; | ||
501 | |||
502 | cgroup_name(mem_cgroup_css(memcg)->cgroup, | ||
503 | memcg_name_buf, sizeof(memcg_name_buf)); | ||
451 | cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, | 504 | cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, |
452 | memcg_cache_id(memcg), memcg_name); | 505 | memcg_cache_id(memcg), memcg_name_buf); |
453 | if (!cache_name) | 506 | if (!cache_name) |
454 | goto out_unlock; | 507 | goto out_unlock; |
455 | 508 | ||
@@ -457,49 +510,73 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | |||
457 | root_cache->size, root_cache->align, | 510 | root_cache->size, root_cache->align, |
458 | root_cache->flags, root_cache->ctor, | 511 | root_cache->flags, root_cache->ctor, |
459 | memcg, root_cache); | 512 | memcg, root_cache); |
513 | /* | ||
514 | * If we could not create a memcg cache, do not complain, because | ||
515 | * that's not critical at all as we can always proceed with the root | ||
516 | * cache. | ||
517 | */ | ||
460 | if (IS_ERR(s)) { | 518 | if (IS_ERR(s)) { |
461 | kfree(cache_name); | 519 | kfree(cache_name); |
462 | s = NULL; | 520 | goto out_unlock; |
463 | } | 521 | } |
464 | 522 | ||
523 | /* | ||
524 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | ||
525 | * barrier here to ensure nobody will see the kmem_cache partially | ||
526 | * initialized. | ||
527 | */ | ||
528 | smp_wmb(); | ||
529 | root_cache->memcg_params->memcg_caches[memcg_id] = s; | ||
530 | |||
465 | out_unlock: | 531 | out_unlock: |
466 | mutex_unlock(&slab_mutex); | 532 | mutex_unlock(&slab_mutex); |
467 | 533 | ||
468 | put_online_mems(); | 534 | put_online_mems(); |
469 | put_online_cpus(); | 535 | put_online_cpus(); |
470 | |||
471 | return s; | ||
472 | } | 536 | } |
473 | 537 | ||
474 | static int memcg_cleanup_cache_params(struct kmem_cache *s) | 538 | void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) |
475 | { | 539 | { |
476 | int rc; | 540 | LIST_HEAD(release); |
541 | bool need_rcu_barrier = false; | ||
542 | struct kmem_cache *s, *s2; | ||
477 | 543 | ||
478 | if (!s->memcg_params || | 544 | get_online_cpus(); |
479 | !s->memcg_params->is_root_cache) | 545 | get_online_mems(); |
480 | return 0; | ||
481 | 546 | ||
482 | mutex_unlock(&slab_mutex); | ||
483 | rc = __memcg_cleanup_cache_params(s); | ||
484 | mutex_lock(&slab_mutex); | 547 | mutex_lock(&slab_mutex); |
548 | list_for_each_entry_safe(s, s2, &slab_caches, list) { | ||
549 | if (is_root_cache(s) || s->memcg_params->memcg != memcg) | ||
550 | continue; | ||
551 | /* | ||
552 | * The cgroup is about to be freed and therefore has no charges | ||
553 | * left. Hence, all its caches must be empty by now. | ||
554 | */ | ||
555 | BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); | ||
556 | } | ||
557 | mutex_unlock(&slab_mutex); | ||
485 | 558 | ||
486 | return rc; | 559 | put_online_mems(); |
487 | } | 560 | put_online_cpus(); |
488 | #else | 561 | |
489 | static int memcg_cleanup_cache_params(struct kmem_cache *s) | 562 | do_kmem_cache_release(&release, need_rcu_barrier); |
490 | { | ||
491 | return 0; | ||
492 | } | 563 | } |
493 | #endif /* CONFIG_MEMCG_KMEM */ | 564 | #endif /* CONFIG_MEMCG_KMEM */ |
494 | 565 | ||
495 | void slab_kmem_cache_release(struct kmem_cache *s) | 566 | void slab_kmem_cache_release(struct kmem_cache *s) |
496 | { | 567 | { |
568 | memcg_free_cache_params(s); | ||
497 | kfree(s->name); | 569 | kfree(s->name); |
498 | kmem_cache_free(kmem_cache, s); | 570 | kmem_cache_free(kmem_cache, s); |
499 | } | 571 | } |
500 | 572 | ||
501 | void kmem_cache_destroy(struct kmem_cache *s) | 573 | void kmem_cache_destroy(struct kmem_cache *s) |
502 | { | 574 | { |
575 | int i; | ||
576 | LIST_HEAD(release); | ||
577 | bool need_rcu_barrier = false; | ||
578 | bool busy = false; | ||
579 | |||
503 | get_online_cpus(); | 580 | get_online_cpus(); |
504 | get_online_mems(); | 581 | get_online_mems(); |
505 | 582 | ||
@@ -509,35 +586,23 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
509 | if (s->refcount) | 586 | if (s->refcount) |
510 | goto out_unlock; | 587 | goto out_unlock; |
511 | 588 | ||
512 | if (memcg_cleanup_cache_params(s) != 0) | 589 | for_each_memcg_cache_index(i) { |
513 | goto out_unlock; | 590 | struct kmem_cache *c = cache_from_memcg_idx(s, i); |
514 | 591 | ||
515 | if (__kmem_cache_shutdown(s) != 0) { | 592 | if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) |
516 | printk(KERN_ERR "kmem_cache_destroy %s: " | 593 | busy = true; |
517 | "Slab cache still has objects\n", s->name); | ||
518 | dump_stack(); | ||
519 | goto out_unlock; | ||
520 | } | 594 | } |
521 | 595 | ||
522 | list_del(&s->list); | 596 | if (!busy) |
523 | 597 | do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); | |
524 | mutex_unlock(&slab_mutex); | ||
525 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
526 | rcu_barrier(); | ||
527 | |||
528 | memcg_free_cache_params(s); | ||
529 | #ifdef SLAB_SUPPORTS_SYSFS | ||
530 | sysfs_slab_remove(s); | ||
531 | #else | ||
532 | slab_kmem_cache_release(s); | ||
533 | #endif | ||
534 | goto out; | ||
535 | 598 | ||
536 | out_unlock: | 599 | out_unlock: |
537 | mutex_unlock(&slab_mutex); | 600 | mutex_unlock(&slab_mutex); |
538 | out: | 601 | |
539 | put_online_mems(); | 602 | put_online_mems(); |
540 | put_online_cpus(); | 603 | put_online_cpus(); |
604 | |||
605 | do_kmem_cache_release(&release, need_rcu_barrier); | ||
541 | } | 606 | } |
542 | EXPORT_SYMBOL(kmem_cache_destroy); | 607 | EXPORT_SYMBOL(kmem_cache_destroy); |
543 | 608 | ||
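The refactor above splits cache destruction into two phases. A minimal sketch of the pattern, using the helper names introduced in this hunk: collect victims on a local list while slab_mutex is held, then do the RCU barrier and sysfs/freeing work after the lock is dropped.

	/* Sketch only; the refcount check and memcg child caches are omitted. */
	static void example_destroy_one(struct kmem_cache *s)
	{
		LIST_HEAD(release);
		bool need_rcu_barrier = false;

		mutex_lock(&slab_mutex);
		do_kmem_cache_shutdown(s, &release, &need_rcu_barrier);	/* phase 1 */
		mutex_unlock(&slab_mutex);

		do_kmem_cache_release(&release, need_rcu_barrier);		/* phase 2 */
	}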
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -2398,13 +2398,24 @@ redo: | |||
2398 | * reading from one cpu area. That does not matter as long | 2398 | * reading from one cpu area. That does not matter as long |
2399 | * as we end up on the original cpu again when doing the cmpxchg. | 2399 | * as we end up on the original cpu again when doing the cmpxchg. |
2400 | * | 2400 | * |
2401 | * Preemption is disabled for the retrieval of the tid because that | 2401 | * We should guarantee that tid and kmem_cache are retrieved on |
2402 | * must occur from the current processor. We cannot allow rescheduling | 2402 | * the same cpu. It could be different if CONFIG_PREEMPT so we need |
2403 | * on a different processor between the determination of the pointer | 2403 | * to check if it is matched or not. |
2404 | * and the retrieval of the tid. | ||
2405 | */ | 2404 | */ |
2406 | preempt_disable(); | 2405 | do { |
2407 | c = this_cpu_ptr(s->cpu_slab); | 2406 | tid = this_cpu_read(s->cpu_slab->tid); |
2407 | c = raw_cpu_ptr(s->cpu_slab); | ||
2408 | } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); | ||
2409 | |||
2410 | /* | ||
2411 | * Irqless object alloc/free algorithm used here depends on sequence | ||
2412 | * of fetching cpu_slab's data. tid should be fetched before anything | ||
2413 | * on c to guarantee that object and page associated with previous tid | ||
2414 | * won't be used with current tid. If we fetch tid first, object and | ||
2415 | * page could be one associated with next tid and our alloc/free | ||
2416 | * request will be failed. In this case, we will retry. So, no problem. | ||
2417 | */ | ||
2418 | barrier(); | ||
2408 | 2419 | ||
2409 | /* | 2420 | /* |
2410 | * The transaction ids are globally unique per cpu and per operation on | 2421 | * The transaction ids are globally unique per cpu and per operation on |
@@ -2412,8 +2423,6 @@ redo: | |||
2412 | * occurs on the right processor and that there was no operation on the | 2423 | * occurs on the right processor and that there was no operation on the |
2413 | * linked list in between. | 2424 | * linked list in between. |
2414 | */ | 2425 | */ |
2415 | tid = c->tid; | ||
2416 | preempt_enable(); | ||
2417 | 2426 | ||
2418 | object = c->freelist; | 2427 | object = c->freelist; |
2419 | page = c->page; | 2428 | page = c->page; |
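The same lockless fetch idiom reappears in the free path further down. A generic illustration of the technique with a hypothetical per-cpu structure; only the retry-on-tid-mismatch idea is taken from the patch.

	struct example_pcpu {
		unsigned long tid;	/* bumped on every alloc/free on this cpu */
		void *freelist;
	};

	static struct example_pcpu *example_fetch(struct example_pcpu __percpu *pcp,
						  unsigned long *tid)
	{
		struct example_pcpu *c;

		do {
			*tid = this_cpu_read(pcp->tid);	/* may execute on cpu A ... */
			c = raw_cpu_ptr(pcp);		/* ... and this on cpu B */
		} while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(*tid != c->tid));

		barrier();	/* order the tid load before later loads through 'c' */
		return c;
	}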
@@ -2512,7 +2521,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); | |||
2512 | #endif | 2521 | #endif |
2513 | 2522 | ||
2514 | /* | 2523 | /* |
2515 | * Slow patch handling. This may still be called frequently since objects | 2524 | * Slow path handling. This may still be called frequently since objects |
2516 | * have a longer lifetime than the cpu slabs in most processing loads. | 2525 | * have a longer lifetime than the cpu slabs in most processing loads. |
2517 | * | 2526 | * |
2518 | * So we still attempt to reduce cache line usage. Just take the slab | 2527 | * So we still attempt to reduce cache line usage. Just take the slab |
@@ -2659,11 +2668,13 @@ redo: | |||
2659 | * data is retrieved via this pointer. If we are on the same cpu | 2668 | * data is retrieved via this pointer. If we are on the same cpu |
2660 | * during the cmpxchg then the free will succeed. | 2669 | during the cmpxchg then the free will succeed. |
2661 | */ | 2670 | */ |
2662 | preempt_disable(); | 2671 | do { |
2663 | c = this_cpu_ptr(s->cpu_slab); | 2672 | tid = this_cpu_read(s->cpu_slab->tid); |
2673 | c = raw_cpu_ptr(s->cpu_slab); | ||
2674 | } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); | ||
2664 | 2675 | ||
2665 | tid = c->tid; | 2676 | /* Same with comment on barrier() in slab_alloc_node() */ |
2666 | preempt_enable(); | 2677 | barrier(); |
2667 | 2678 | ||
2668 | if (likely(page == c->page)) { | 2679 | if (likely(page == c->page)) { |
2669 | set_freepointer(s, object, c->freelist); | 2680 | set_freepointer(s, object, c->freelist); |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -1140,10 +1140,8 @@ void __init swap_setup(void) | |||
1140 | 1140 | ||
1141 | if (bdi_init(swapper_spaces[0].backing_dev_info)) | 1141 | if (bdi_init(swapper_spaces[0].backing_dev_info)) |
1142 | panic("Failed to init swap bdi"); | 1142 | panic("Failed to init swap bdi"); |
1143 | for (i = 0; i < MAX_SWAPFILES; i++) { | 1143 | for (i = 0; i < MAX_SWAPFILES; i++) |
1144 | spin_lock_init(&swapper_spaces[i].tree_lock); | 1144 | spin_lock_init(&swapper_spaces[i].tree_lock); |
1145 | INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); | ||
1146 | } | ||
1147 | #endif | 1145 | #endif |
1148 | 1146 | ||
1149 | /* Use a smaller cluster for small-memory machines */ | 1147 | /* Use a smaller cluster for small-memory machines */ |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 1284f89fca08..9943e5fd74e6 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -17,6 +17,9 @@ | |||
17 | #include <linux/cpu.h> | 17 | #include <linux/cpu.h> |
18 | #include <linux/cpumask.h> | 18 | #include <linux/cpumask.h> |
19 | #include <linux/vmstat.h> | 19 | #include <linux/vmstat.h> |
20 | #include <linux/proc_fs.h> | ||
21 | #include <linux/seq_file.h> | ||
22 | #include <linux/debugfs.h> | ||
20 | #include <linux/sched.h> | 23 | #include <linux/sched.h> |
21 | #include <linux/math64.h> | 24 | #include <linux/math64.h> |
22 | #include <linux/writeback.h> | 25 | #include <linux/writeback.h> |
@@ -670,66 +673,6 @@ int fragmentation_index(struct zone *zone, unsigned int order) | |||
670 | } | 673 | } |
671 | #endif | 674 | #endif |
672 | 675 | ||
673 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION) | ||
674 | #include <linux/proc_fs.h> | ||
675 | #include <linux/seq_file.h> | ||
676 | |||
677 | static char * const migratetype_names[MIGRATE_TYPES] = { | ||
678 | "Unmovable", | ||
679 | "Reclaimable", | ||
680 | "Movable", | ||
681 | "Reserve", | ||
682 | #ifdef CONFIG_CMA | ||
683 | "CMA", | ||
684 | #endif | ||
685 | #ifdef CONFIG_MEMORY_ISOLATION | ||
686 | "Isolate", | ||
687 | #endif | ||
688 | }; | ||
689 | |||
690 | static void *frag_start(struct seq_file *m, loff_t *pos) | ||
691 | { | ||
692 | pg_data_t *pgdat; | ||
693 | loff_t node = *pos; | ||
694 | for (pgdat = first_online_pgdat(); | ||
695 | pgdat && node; | ||
696 | pgdat = next_online_pgdat(pgdat)) | ||
697 | --node; | ||
698 | |||
699 | return pgdat; | ||
700 | } | ||
701 | |||
702 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | ||
703 | { | ||
704 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
705 | |||
706 | (*pos)++; | ||
707 | return next_online_pgdat(pgdat); | ||
708 | } | ||
709 | |||
710 | static void frag_stop(struct seq_file *m, void *arg) | ||
711 | { | ||
712 | } | ||
713 | |||
714 | /* Walk all the zones in a node and print using a callback */ | ||
715 | static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | ||
716 | void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) | ||
717 | { | ||
718 | struct zone *zone; | ||
719 | struct zone *node_zones = pgdat->node_zones; | ||
720 | unsigned long flags; | ||
721 | |||
722 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
723 | if (!populated_zone(zone)) | ||
724 | continue; | ||
725 | |||
726 | spin_lock_irqsave(&zone->lock, flags); | ||
727 | print(m, pgdat, zone); | ||
728 | spin_unlock_irqrestore(&zone->lock, flags); | ||
729 | } | ||
730 | } | ||
731 | #endif | ||
732 | |||
733 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) | 676 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) |
734 | #ifdef CONFIG_ZONE_DMA | 677 | #ifdef CONFIG_ZONE_DMA |
735 | #define TEXT_FOR_DMA(xx) xx "_dma", | 678 | #define TEXT_FOR_DMA(xx) xx "_dma", |
@@ -907,7 +850,66 @@ const char * const vmstat_text[] = { | |||
907 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ | 850 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ |
908 | 851 | ||
909 | 852 | ||
853 | #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \ | ||
854 | defined(CONFIG_PROC_FS) | ||
855 | static void *frag_start(struct seq_file *m, loff_t *pos) | ||
856 | { | ||
857 | pg_data_t *pgdat; | ||
858 | loff_t node = *pos; | ||
859 | |||
860 | for (pgdat = first_online_pgdat(); | ||
861 | pgdat && node; | ||
862 | pgdat = next_online_pgdat(pgdat)) | ||
863 | --node; | ||
864 | |||
865 | return pgdat; | ||
866 | } | ||
867 | |||
868 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | ||
869 | { | ||
870 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
871 | |||
872 | (*pos)++; | ||
873 | return next_online_pgdat(pgdat); | ||
874 | } | ||
875 | |||
876 | static void frag_stop(struct seq_file *m, void *arg) | ||
877 | { | ||
878 | } | ||
879 | |||
880 | /* Walk all the zones in a node and print using a callback */ | ||
881 | static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | ||
882 | void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) | ||
883 | { | ||
884 | struct zone *zone; | ||
885 | struct zone *node_zones = pgdat->node_zones; | ||
886 | unsigned long flags; | ||
887 | |||
888 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
889 | if (!populated_zone(zone)) | ||
890 | continue; | ||
891 | |||
892 | spin_lock_irqsave(&zone->lock, flags); | ||
893 | print(m, pgdat, zone); | ||
894 | spin_unlock_irqrestore(&zone->lock, flags); | ||
895 | } | ||
896 | } | ||
897 | #endif | ||
898 | |||
910 | #ifdef CONFIG_PROC_FS | 899 | #ifdef CONFIG_PROC_FS |
900 | static char * const migratetype_names[MIGRATE_TYPES] = { | ||
901 | "Unmovable", | ||
902 | "Reclaimable", | ||
903 | "Movable", | ||
904 | "Reserve", | ||
905 | #ifdef CONFIG_CMA | ||
906 | "CMA", | ||
907 | #endif | ||
908 | #ifdef CONFIG_MEMORY_ISOLATION | ||
909 | "Isolate", | ||
910 | #endif | ||
911 | }; | ||
912 | |||
911 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, | 913 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, |
912 | struct zone *zone) | 914 | struct zone *zone) |
913 | { | 915 | { |
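For reference, the helper moved above is used by handing it a per-zone callback from a seq_file show routine; a hedged sketch with a hypothetical callback name:

	static void example_zone_print(struct seq_file *m, pg_data_t *pgdat,
				       struct zone *zone)
	{
		seq_printf(m, "Node %d, zone %8s: %lu free pages\n",
			   pgdat->node_id, zone->name,
			   zone_page_state(zone, NR_FREE_PAGES));
	}

	static int example_show(struct seq_file *m, void *arg)
	{
		walk_zones_in_node(m, (pg_data_t *)arg, example_zone_print);
		return 0;
	}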
@@ -1536,8 +1538,6 @@ static int __init setup_vmstat(void) | |||
1536 | module_init(setup_vmstat) | 1538 | module_init(setup_vmstat) |
1537 | 1539 | ||
1538 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) | 1540 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) |
1539 | #include <linux/debugfs.h> | ||
1540 | |||
1541 | 1541 | ||
1542 | /* | 1542 | /* |
1543 | * Return an index indicating how much of the available free memory is | 1543 | * Return an index indicating how much of the available free memory is |