-rw-r--r--  arch/x86/kvm/Kconfig          |   1
-rw-r--r--  include/linux/mm_types.h      |   4
-rw-r--r--  include/linux/mmu_notifier.h  | 279
-rw-r--r--  kernel/fork.c                 |   3
-rw-r--r--  mm/Kconfig                    |   3
-rw-r--r--  mm/Makefile                   |   1
-rw-r--r--  mm/filemap_xip.c              |   3
-rw-r--r--  mm/fremap.c                   |   3
-rw-r--r--  mm/hugetlb.c                  |   3
-rw-r--r--  mm/memory.c                   |  35
-rw-r--r--  mm/mmap.c                     |   2
-rw-r--r--  mm/mmu_notifier.c             | 277
-rw-r--r--  mm/mprotect.c                 |   3
-rw-r--r--  mm/mremap.c                   |   6
-rw-r--r--  mm/rmap.c                     |  13
15 files changed, 623 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8d45fabc5f3b..ce3251ce5504 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM | |||
21 | tristate "Kernel-based Virtual Machine (KVM) support" | 21 | tristate "Kernel-based Virtual Machine (KVM) support" |
22 | depends on HAVE_KVM | 22 | depends on HAVE_KVM |
23 | select PREEMPT_NOTIFIERS | 23 | select PREEMPT_NOTIFIERS |
24 | select MMU_NOTIFIER | ||
24 | select ANON_INODES | 25 | select ANON_INODES |
25 | ---help--- | 26 | ---help--- |
26 | Support hosting fully virtualized guest machines using hardware | 27 | Support hosting fully virtualized guest machines using hardware |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 746f975b58ef..386edbe2cb4e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/rbtree.h> | 10 | #include <linux/rbtree.h> |
11 | #include <linux/rwsem.h> | 11 | #include <linux/rwsem.h> |
12 | #include <linux/completion.h> | 12 | #include <linux/completion.h> |
13 | #include <linux/cpumask.h> | ||
13 | #include <asm/page.h> | 14 | #include <asm/page.h> |
14 | #include <asm/mmu.h> | 15 | #include <asm/mmu.h> |
15 | 16 | ||
@@ -253,6 +254,9 @@ struct mm_struct { | |||
253 | struct file *exe_file; | 254 | struct file *exe_file; |
254 | unsigned long num_exe_file_vmas; | 255 | unsigned long num_exe_file_vmas; |
255 | #endif | 256 | #endif |
257 | #ifdef CONFIG_MMU_NOTIFIER | ||
258 | struct mmu_notifier_mm *mmu_notifier_mm; | ||
259 | #endif | ||
256 | }; | 260 | }; |
257 | 261 | ||
258 | #endif /* _LINUX_MM_TYPES_H */ | 262 | #endif /* _LINUX_MM_TYPES_H */ |
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
new file mode 100644
index 000000000000..b77486d152cd
--- /dev/null
+++ b/include/linux/mmu_notifier.h
@@ -0,0 +1,279 @@ | |||
1 | #ifndef _LINUX_MMU_NOTIFIER_H | ||
2 | #define _LINUX_MMU_NOTIFIER_H | ||
3 | |||
4 | #include <linux/list.h> | ||
5 | #include <linux/spinlock.h> | ||
6 | #include <linux/mm_types.h> | ||
7 | |||
8 | struct mmu_notifier; | ||
9 | struct mmu_notifier_ops; | ||
10 | |||
11 | #ifdef CONFIG_MMU_NOTIFIER | ||
12 | |||
13 | /* | ||
14 | * The mmu notifier_mm structure is allocated and installed in | ||
15 | * mm->mmu_notifier_mm inside the mm_take_all_locks() protected | ||
16 | * critical section and it's released only when mm_count reaches zero | ||
17 | * in mmdrop(). | ||
18 | */ | ||
19 | struct mmu_notifier_mm { | ||
20 | /* all mmu notifiers registered in this mm are queued in this list */ ||
21 | struct hlist_head list; | ||
22 | /* to serialize the list modifications and hlist_unhashed */ | ||
23 | spinlock_t lock; | ||
24 | }; | ||
25 | |||
26 | struct mmu_notifier_ops { | ||
27 | /* | ||
28 | * Called either by mmu_notifier_unregister or when the mm is | ||
29 | * being destroyed by exit_mmap, always before all pages are | ||
30 | * freed. This can run concurrently with other mmu notifier | ||
31 | * methods (the ones invoked outside the mm context) and it | ||
32 | * should tear down all secondary mmu mappings and freeze the | ||
33 | * secondary mmu. If this method isn't implemented you have to | ||
34 | * be sure that nothing could possibly write to the pages | ||
35 | * through the secondary mmu by the time the last thread with | ||
36 | * tsk->mm == mm exits. | ||
37 | * | ||
38 | * As a side note: the pages freed after ->release returns could | ||
39 | * be immediately reallocated by the gart at an alias physical | ||
40 | * address with a different cache model, so if ->release isn't | ||
41 | * implemented because all _software_ driven memory accesses | ||
42 | * through the secondary mmu are terminated by the time the | ||
43 | * last thread of this mm quits, you also have to be sure that | ||
44 | * speculative _hardware_ operations can't allocate dirty | ||
45 | * cachelines in the cpu that could not be snooped and made | ||
46 | * coherent with the other read and write operations happening | ||
47 | * through the gart alias address, so leading to memory | ||
48 | * corruption. | ||
49 | */ | ||
50 | void (*release)(struct mmu_notifier *mn, | ||
51 | struct mm_struct *mm); | ||
52 | |||
53 | /* | ||
54 | * clear_flush_young is called after the VM has | ||
55 | * test-and-cleared the young/accessed bitflag in the | ||
56 | * pte. This way the VM will provide proper aging to the | ||
57 | * accesses to the page through the secondary MMUs and not | ||
58 | * only to the ones through the Linux pte. | ||
59 | */ | ||
60 | int (*clear_flush_young)(struct mmu_notifier *mn, | ||
61 | struct mm_struct *mm, | ||
62 | unsigned long address); | ||
63 | |||
64 | /* | ||
65 | * Before this is invoked any secondary MMU is still ok to | ||
66 | * read/write to the page previously pointed to by the Linux | ||
67 | * pte because the page hasn't been freed yet and it won't be | ||
68 | * freed until this returns. If required set_page_dirty has to | ||
69 | * be called internally to this method. | ||
70 | */ | ||
71 | void (*invalidate_page)(struct mmu_notifier *mn, | ||
72 | struct mm_struct *mm, | ||
73 | unsigned long address); | ||
74 | |||
75 | /* | ||
76 | * invalidate_range_start() and invalidate_range_end() must be | ||
77 | * paired and are called only when the mmap_sem and/or the | ||
78 | * locks protecting the reverse maps are held. The subsystem | ||
79 | * must guarantee that no additional references are taken to | ||
80 | * the pages in the range established between the call to | ||
81 | * invalidate_range_start() and the matching call to | ||
82 | * invalidate_range_end(). | ||
83 | * | ||
84 | * Invalidation of multiple concurrent ranges may be | ||
85 | * optionally permitted by the driver. Either way the | ||
86 | * establishment of sptes is forbidden in the range passed to | ||
87 | * invalidate_range_start/end for the whole duration of the | ||
88 | * invalidate_range_start/end critical section. | ||
89 | * | ||
90 | * invalidate_range_start() is called when all pages in the | ||
91 | * range are still mapped and have at least a refcount of one. | ||
92 | * | ||
93 | * invalidate_range_end() is called when all pages in the | ||
94 | * range have been unmapped and the pages have been freed by | ||
95 | * the VM. | ||
96 | * | ||
97 | * The VM will remove the page table entries and potentially | ||
98 | * the page between invalidate_range_start() and | ||
99 | * invalidate_range_end(). If the page must not be freed | ||
100 | * because of pending I/O or other circumstances then the | ||
101 | * invalidate_range_start() callback (or the initial mapping | ||
102 | * by the driver) must make sure that the refcount is kept | ||
103 | * elevated. | ||
104 | * | ||
105 | * If the driver increases the refcount when the pages are | ||
106 | * initially mapped into an address space then either | ||
107 | * invalidate_range_start() or invalidate_range_end() may | ||
108 | * decrease the refcount. If the refcount is decreased on | ||
109 | * invalidate_range_start() then the VM can free pages as page | ||
110 | * table entries are removed. If the refcount is only | ||
111 | * dropped on invalidate_range_end() then the driver itself | ||
112 | * will drop the last refcount but it must take care to flush | ||
113 | * any secondary tlb before doing the final free on the | ||
114 | * page. Pages will no longer be referenced by the linux | ||
115 | * address space but may still be referenced by sptes until | ||
116 | * the last refcount is dropped. | ||
117 | */ | ||
118 | void (*invalidate_range_start)(struct mmu_notifier *mn, | ||
119 | struct mm_struct *mm, | ||
120 | unsigned long start, unsigned long end); | ||
121 | void (*invalidate_range_end)(struct mmu_notifier *mn, | ||
122 | struct mm_struct *mm, | ||
123 | unsigned long start, unsigned long end); | ||
124 | }; | ||
125 | |||
126 | /* | ||
127 | * The notifier chains are protected by mmap_sem and/or the reverse map | ||
128 | * semaphores. Notifier chains are only changed when all reverse maps and | ||
129 | * the mmap_sem locks are taken. | ||
130 | * | ||
131 | * Therefore notifier chains can only be traversed when either | ||
132 | * | ||
133 | * 1. mmap_sem is held. | ||
134 | * 2. One of the reverse map locks is held (i_mmap_lock or anon_vma->lock). | ||
135 | * 3. No other concurrent thread can access the list (release) | ||
136 | */ | ||
137 | struct mmu_notifier { | ||
138 | struct hlist_node hlist; | ||
139 | const struct mmu_notifier_ops *ops; | ||
140 | }; | ||
141 | |||
142 | static inline int mm_has_notifiers(struct mm_struct *mm) | ||
143 | { | ||
144 | return unlikely(mm->mmu_notifier_mm); | ||
145 | } | ||
146 | |||
147 | extern int mmu_notifier_register(struct mmu_notifier *mn, | ||
148 | struct mm_struct *mm); | ||
149 | extern int __mmu_notifier_register(struct mmu_notifier *mn, | ||
150 | struct mm_struct *mm); | ||
151 | extern void mmu_notifier_unregister(struct mmu_notifier *mn, | ||
152 | struct mm_struct *mm); | ||
153 | extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); | ||
154 | extern void __mmu_notifier_release(struct mm_struct *mm); | ||
155 | extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | ||
156 | unsigned long address); | ||
157 | extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, | ||
158 | unsigned long address); | ||
159 | extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | ||
160 | unsigned long start, unsigned long end); | ||
161 | extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | ||
162 | unsigned long start, unsigned long end); | ||
163 | |||
164 | static inline void mmu_notifier_release(struct mm_struct *mm) | ||
165 | { | ||
166 | if (mm_has_notifiers(mm)) | ||
167 | __mmu_notifier_release(mm); | ||
168 | } | ||
169 | |||
170 | static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, | ||
171 | unsigned long address) | ||
172 | { | ||
173 | if (mm_has_notifiers(mm)) | ||
174 | return __mmu_notifier_clear_flush_young(mm, address); | ||
175 | return 0; | ||
176 | } | ||
177 | |||
178 | static inline void mmu_notifier_invalidate_page(struct mm_struct *mm, | ||
179 | unsigned long address) | ||
180 | { | ||
181 | if (mm_has_notifiers(mm)) | ||
182 | __mmu_notifier_invalidate_page(mm, address); | ||
183 | } | ||
184 | |||
185 | static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, | ||
186 | unsigned long start, unsigned long end) | ||
187 | { | ||
188 | if (mm_has_notifiers(mm)) | ||
189 | __mmu_notifier_invalidate_range_start(mm, start, end); | ||
190 | } | ||
191 | |||
192 | static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, | ||
193 | unsigned long start, unsigned long end) | ||
194 | { | ||
195 | if (mm_has_notifiers(mm)) | ||
196 | __mmu_notifier_invalidate_range_end(mm, start, end); | ||
197 | } | ||
198 | |||
199 | static inline void mmu_notifier_mm_init(struct mm_struct *mm) | ||
200 | { | ||
201 | mm->mmu_notifier_mm = NULL; | ||
202 | } | ||
203 | |||
204 | static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) | ||
205 | { | ||
206 | if (mm_has_notifiers(mm)) | ||
207 | __mmu_notifier_mm_destroy(mm); | ||
208 | } | ||
209 | |||
210 | /* | ||
211 | * These two macros will eventually replace ptep_clear_flush. | ||
212 | * ptep_clear_flush is implemented as a macro itself, so this also is | ||
213 | * implemented as a macro until ptep_clear_flush is converted to an | ||
214 | * inline function, to diminish the risk of compilation failure. The | ||
215 | * invalidate_page method over time can be moved outside the PT lock | ||
216 | * and these two macros can be later removed. | ||
217 | */ | ||
218 | #define ptep_clear_flush_notify(__vma, __address, __ptep) \ | ||
219 | ({ \ | ||
220 | pte_t __pte; \ | ||
221 | struct vm_area_struct *___vma = __vma; \ | ||
222 | unsigned long ___address = __address; \ | ||
223 | __pte = ptep_clear_flush(___vma, ___address, __ptep); \ | ||
224 | mmu_notifier_invalidate_page(___vma->vm_mm, ___address); \ | ||
225 | __pte; \ | ||
226 | }) | ||
227 | |||
228 | #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ | ||
229 | ({ \ | ||
230 | int __young; \ | ||
231 | struct vm_area_struct *___vma = __vma; \ | ||
232 | unsigned long ___address = __address; \ | ||
233 | __young = ptep_clear_flush_young(___vma, ___address, __ptep); \ | ||
234 | __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ | ||
235 | ___address); \ | ||
236 | __young; \ | ||
237 | }) | ||
238 | |||
239 | #else /* CONFIG_MMU_NOTIFIER */ | ||
240 | |||
241 | static inline void mmu_notifier_release(struct mm_struct *mm) | ||
242 | { | ||
243 | } | ||
244 | |||
245 | static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, | ||
246 | unsigned long address) | ||
247 | { | ||
248 | return 0; | ||
249 | } | ||
250 | |||
251 | static inline void mmu_notifier_invalidate_page(struct mm_struct *mm, | ||
252 | unsigned long address) | ||
253 | { | ||
254 | } | ||
255 | |||
256 | static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, | ||
257 | unsigned long start, unsigned long end) | ||
258 | { | ||
259 | } | ||
260 | |||
261 | static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, | ||
262 | unsigned long start, unsigned long end) | ||
263 | { | ||
264 | } | ||
265 | |||
266 | static inline void mmu_notifier_mm_init(struct mm_struct *mm) | ||
267 | { | ||
268 | } | ||
269 | |||
270 | static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) | ||
271 | { | ||
272 | } | ||
273 | |||
274 | #define ptep_clear_flush_young_notify ptep_clear_flush_young | ||
275 | #define ptep_clear_flush_notify ptep_clear_flush | ||
276 | |||
277 | #endif /* CONFIG_MMU_NOTIFIER */ | ||
278 | |||
279 | #endif /* _LINUX_MMU_NOTIFIER_H */ | ||
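For orientation only (this code is not part of the patch), a minimal driver-side sketch of how the API declared above could be used: a driver embeds a struct mmu_notifier, fills in the ops it cares about, and registers against current->mm. The names my_dev, my_zap_range() and my_attach() are hypothetical placeholders for driver-specific code.

#include <linux/kernel.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>

struct my_dev {
	struct mmu_notifier mn;
	/* driver-private secondary-MMU state (sptes, device TLB handle, ...) */
};

/* hypothetical helper: tear down all sptes in [start, end) and flush the
 * secondary TLB; the real work is driver specific */
static void my_zap_range(struct my_dev *dev, unsigned long start,
			 unsigned long end)
{
}

static void my_invalidate_range_start(struct mmu_notifier *mn,
				      struct mm_struct *mm,
				      unsigned long start, unsigned long end)
{
	/* no new sptes may be established until ->invalidate_range_end */
	my_zap_range(container_of(mn, struct my_dev, mn), start, end);
}

static void my_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* freeze the secondary MMU before the last pages of the mm are freed */
	my_zap_range(container_of(mn, struct my_dev, mn), 0, ~0UL);
}

static const struct mmu_notifier_ops my_mmu_notifier_ops = {
	.release		= my_release,
	.invalidate_range_start	= my_invalidate_range_start,
};

static int my_attach(struct my_dev *dev)
{
	dev->mn.ops = &my_mmu_notifier_ops;
	/* must not hold mmap_sem here; current->mm keeps mm_users elevated */
	return mmu_notifier_register(&dev->mn, current->mm);
}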
diff --git a/kernel/fork.c b/kernel/fork.c
index 8214ba7c8bb1..7ce2ebe84796 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/key.h> | 27 | #include <linux/key.h> |
28 | #include <linux/binfmts.h> | 28 | #include <linux/binfmts.h> |
29 | #include <linux/mman.h> | 29 | #include <linux/mman.h> |
30 | #include <linux/mmu_notifier.h> | ||
30 | #include <linux/fs.h> | 31 | #include <linux/fs.h> |
31 | #include <linux/nsproxy.h> | 32 | #include <linux/nsproxy.h> |
32 | #include <linux/capability.h> | 33 | #include <linux/capability.h> |
@@ -414,6 +415,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | |||
414 | 415 | ||
415 | if (likely(!mm_alloc_pgd(mm))) { | 416 | if (likely(!mm_alloc_pgd(mm))) { |
416 | mm->def_flags = 0; | 417 | mm->def_flags = 0; |
418 | mmu_notifier_mm_init(mm); | ||
417 | return mm; | 419 | return mm; |
418 | } | 420 | } |
419 | 421 | ||
@@ -446,6 +448,7 @@ void __mmdrop(struct mm_struct *mm) | |||
446 | BUG_ON(mm == &init_mm); | 448 | BUG_ON(mm == &init_mm); |
447 | mm_free_pgd(mm); | 449 | mm_free_pgd(mm); |
448 | destroy_context(mm); | 450 | destroy_context(mm); |
451 | mmu_notifier_mm_destroy(mm); | ||
449 | free_mm(mm); | 452 | free_mm(mm); |
450 | } | 453 | } |
451 | EXPORT_SYMBOL_GPL(__mmdrop); | 454 | EXPORT_SYMBOL_GPL(__mmdrop); |
diff --git a/mm/Kconfig b/mm/Kconfig
index efee5d379df4..446c6588c753 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -208,3 +208,6 @@ config NR_QUICK | |||
208 | config VIRT_TO_BUS | 208 | config VIRT_TO_BUS |
209 | def_bool y | 209 | def_bool y |
210 | depends on !ARCH_NO_VIRT_TO_BUS | 210 | depends on !ARCH_NO_VIRT_TO_BUS |
211 | |||
212 | config MMU_NOTIFIER | ||
213 | bool | ||
diff --git a/mm/Makefile b/mm/Makefile
index 06ca2381fef1..da4ccf015aea 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o | |||
25 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | 25 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o |
26 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | 26 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o |
27 | obj-$(CONFIG_SLOB) += slob.o | 27 | obj-$(CONFIG_SLOB) += slob.o |
28 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | ||
28 | obj-$(CONFIG_SLAB) += slab.o | 29 | obj-$(CONFIG_SLAB) += slab.o |
29 | obj-$(CONFIG_SLUB) += slub.o | 30 | obj-$(CONFIG_SLUB) += slub.o |
30 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 31 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 98a3f31ccd6a..380ab402d711 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/uio.h> | 14 | #include <linux/uio.h> |
15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
16 | #include <linux/mmu_notifier.h> | ||
16 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
17 | #include <asm/tlbflush.h> | 18 | #include <asm/tlbflush.h> |
18 | #include <asm/io.h> | 19 | #include <asm/io.h> |
@@ -188,7 +189,7 @@ __xip_unmap (struct address_space * mapping, | |||
188 | if (pte) { | 189 | if (pte) { |
189 | /* Nuke the page table entry. */ | 190 | /* Nuke the page table entry. */ |
190 | flush_cache_page(vma, address, pte_pfn(*pte)); | 191 | flush_cache_page(vma, address, pte_pfn(*pte)); |
191 | pteval = ptep_clear_flush(vma, address, pte); | 192 | pteval = ptep_clear_flush_notify(vma, address, pte); |
192 | page_remove_rmap(page, vma); | 193 | page_remove_rmap(page, vma); |
193 | dec_mm_counter(mm, file_rss); | 194 | dec_mm_counter(mm, file_rss); |
194 | BUG_ON(pte_dirty(pteval)); | 195 | BUG_ON(pte_dirty(pteval)); |
diff --git a/mm/fremap.c b/mm/fremap.c
index 07a9c82ce1a3..7881638e4a12 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/syscalls.h> | 17 | #include <linux/syscalls.h> |
18 | #include <linux/mmu_notifier.h> | ||
18 | 19 | ||
19 | #include <asm/mmu_context.h> | 20 | #include <asm/mmu_context.h> |
20 | #include <asm/cacheflush.h> | 21 | #include <asm/cacheflush.h> |
@@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, | |||
214 | spin_unlock(&mapping->i_mmap_lock); | 215 | spin_unlock(&mapping->i_mmap_lock); |
215 | } | 216 | } |
216 | 217 | ||
218 | mmu_notifier_invalidate_range_start(mm, start, start + size); | ||
217 | err = populate_range(mm, vma, start, size, pgoff); | 219 | err = populate_range(mm, vma, start, size, pgoff); |
220 | mmu_notifier_invalidate_range_end(mm, start, start + size); | ||
218 | if (!err && !(flags & MAP_NONBLOCK)) { | 221 | if (!err && !(flags & MAP_NONBLOCK)) { |
219 | if (unlikely(has_write_lock)) { | 222 | if (unlikely(has_write_lock)) { |
220 | downgrade_write(&mm->mmap_sem); | 223 | downgrade_write(&mm->mmap_sem); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3be79dc18c5c..80eb0d31d0d3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/sysctl.h> | 10 | #include <linux/sysctl.h> |
11 | #include <linux/highmem.h> | 11 | #include <linux/highmem.h> |
12 | #include <linux/mmu_notifier.h> | ||
12 | #include <linux/nodemask.h> | 13 | #include <linux/nodemask.h> |
13 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
14 | #include <linux/mempolicy.h> | 15 | #include <linux/mempolicy.h> |
@@ -1672,6 +1673,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
1672 | BUG_ON(start & ~huge_page_mask(h)); | 1673 | BUG_ON(start & ~huge_page_mask(h)); |
1673 | BUG_ON(end & ~huge_page_mask(h)); | 1674 | BUG_ON(end & ~huge_page_mask(h)); |
1674 | 1675 | ||
1676 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
1675 | spin_lock(&mm->page_table_lock); | 1677 | spin_lock(&mm->page_table_lock); |
1676 | for (address = start; address < end; address += sz) { | 1678 | for (address = start; address < end; address += sz) { |
1677 | ptep = huge_pte_offset(mm, address); | 1679 | ptep = huge_pte_offset(mm, address); |
@@ -1713,6 +1715,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
1713 | } | 1715 | } |
1714 | spin_unlock(&mm->page_table_lock); | 1716 | spin_unlock(&mm->page_table_lock); |
1715 | flush_tlb_range(vma, start, end); | 1717 | flush_tlb_range(vma, start, end); |
1718 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
1716 | list_for_each_entry_safe(page, tmp, &page_list, lru) { | 1719 | list_for_each_entry_safe(page, tmp, &page_list, lru) { |
1717 | list_del(&page->lru); | 1720 | list_del(&page->lru); |
1718 | put_page(page); | 1721 | put_page(page); |
diff --git a/mm/memory.c b/mm/memory.c
index a8ca04faaea6..67f0ab9077d9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/init.h> | 51 | #include <linux/init.h> |
52 | #include <linux/writeback.h> | 52 | #include <linux/writeback.h> |
53 | #include <linux/memcontrol.h> | 53 | #include <linux/memcontrol.h> |
54 | #include <linux/mmu_notifier.h> | ||
54 | 55 | ||
55 | #include <asm/pgalloc.h> | 56 | #include <asm/pgalloc.h> |
56 | #include <asm/uaccess.h> | 57 | #include <asm/uaccess.h> |
@@ -652,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
652 | unsigned long next; | 653 | unsigned long next; |
653 | unsigned long addr = vma->vm_start; | 654 | unsigned long addr = vma->vm_start; |
654 | unsigned long end = vma->vm_end; | 655 | unsigned long end = vma->vm_end; |
656 | int ret; | ||
655 | 657 | ||
656 | /* | 658 | /* |
657 | * Don't copy ptes where a page fault will fill them correctly. | 659 | * Don't copy ptes where a page fault will fill them correctly. |
@@ -667,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
667 | if (is_vm_hugetlb_page(vma)) | 669 | if (is_vm_hugetlb_page(vma)) |
668 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); | 670 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); |
669 | 671 | ||
672 | /* | ||
673 | * We need to invalidate the secondary MMU mappings only when | ||
674 | * there could be a permission downgrade on the ptes of the | ||
675 | * parent mm. And a permission downgrade will only happen if | ||
676 | * is_cow_mapping() returns true. | ||
677 | */ | ||
678 | if (is_cow_mapping(vma->vm_flags)) | ||
679 | mmu_notifier_invalidate_range_start(src_mm, addr, end); | ||
680 | |||
681 | ret = 0; | ||
670 | dst_pgd = pgd_offset(dst_mm, addr); | 682 | dst_pgd = pgd_offset(dst_mm, addr); |
671 | src_pgd = pgd_offset(src_mm, addr); | 683 | src_pgd = pgd_offset(src_mm, addr); |
672 | do { | 684 | do { |
673 | next = pgd_addr_end(addr, end); | 685 | next = pgd_addr_end(addr, end); |
674 | if (pgd_none_or_clear_bad(src_pgd)) | 686 | if (pgd_none_or_clear_bad(src_pgd)) |
675 | continue; | 687 | continue; |
676 | if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, | 688 | if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, |
677 | vma, addr, next)) | 689 | vma, addr, next))) { |
678 | return -ENOMEM; | 690 | ret = -ENOMEM; |
691 | break; | ||
692 | } | ||
679 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); | 693 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); |
680 | return 0; | 694 | |
695 | if (is_cow_mapping(vma->vm_flags)) | ||
696 | mmu_notifier_invalidate_range_end(src_mm, | ||
697 | vma->vm_start, end); | ||
698 | return ret; | ||
681 | } | 699 | } |
682 | 700 | ||
683 | static unsigned long zap_pte_range(struct mmu_gather *tlb, | 701 | static unsigned long zap_pte_range(struct mmu_gather *tlb, |
@@ -881,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, | |||
881 | unsigned long start = start_addr; | 899 | unsigned long start = start_addr; |
882 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; | 900 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; |
883 | int fullmm = (*tlbp)->fullmm; | 901 | int fullmm = (*tlbp)->fullmm; |
902 | struct mm_struct *mm = vma->vm_mm; | ||
884 | 903 | ||
904 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); | ||
885 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { | 905 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { |
886 | unsigned long end; | 906 | unsigned long end; |
887 | 907 | ||
@@ -946,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, | |||
946 | } | 966 | } |
947 | } | 967 | } |
948 | out: | 968 | out: |
969 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); | ||
949 | return start; /* which is now the end (or restart) address */ | 970 | return start; /* which is now the end (or restart) address */ |
950 | } | 971 | } |
951 | 972 | ||
@@ -1616,10 +1637,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, | |||
1616 | { | 1637 | { |
1617 | pgd_t *pgd; | 1638 | pgd_t *pgd; |
1618 | unsigned long next; | 1639 | unsigned long next; |
1619 | unsigned long end = addr + size; | 1640 | unsigned long start = addr, end = addr + size; |
1620 | int err; | 1641 | int err; |
1621 | 1642 | ||
1622 | BUG_ON(addr >= end); | 1643 | BUG_ON(addr >= end); |
1644 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
1623 | pgd = pgd_offset(mm, addr); | 1645 | pgd = pgd_offset(mm, addr); |
1624 | do { | 1646 | do { |
1625 | next = pgd_addr_end(addr, end); | 1647 | next = pgd_addr_end(addr, end); |
@@ -1627,6 +1649,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, | |||
1627 | if (err) | 1649 | if (err) |
1628 | break; | 1650 | break; |
1629 | } while (pgd++, addr = next, addr != end); | 1651 | } while (pgd++, addr = next, addr != end); |
1652 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
1630 | return err; | 1653 | return err; |
1631 | } | 1654 | } |
1632 | EXPORT_SYMBOL_GPL(apply_to_page_range); | 1655 | EXPORT_SYMBOL_GPL(apply_to_page_range); |
@@ -1839,7 +1862,7 @@ gotten: | |||
1839 | * seen in the presence of one thread doing SMC and another | 1862 | * seen in the presence of one thread doing SMC and another |
1840 | * thread doing COW. | 1863 | * thread doing COW. |
1841 | */ | 1864 | */ |
1842 | ptep_clear_flush(vma, address, page_table); | 1865 | ptep_clear_flush_notify(vma, address, page_table); |
1843 | set_pte_at(mm, address, page_table, entry); | 1866 | set_pte_at(mm, address, page_table, entry); |
1844 | update_mmu_cache(vma, address, entry); | 1867 | update_mmu_cache(vma, address, entry); |
1845 | lru_cache_add_active(new_page); | 1868 | lru_cache_add_active(new_page); |
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -26,6 +26,7 @@
26 | #include <linux/mount.h> | 26 | #include <linux/mount.h> |
27 | #include <linux/mempolicy.h> | 27 | #include <linux/mempolicy.h> |
28 | #include <linux/rmap.h> | 28 | #include <linux/rmap.h> |
29 | #include <linux/mmu_notifier.h> | ||
29 | 30 | ||
30 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
31 | #include <asm/cacheflush.h> | 32 | #include <asm/cacheflush.h> |
@@ -2061,6 +2062,7 @@ void exit_mmap(struct mm_struct *mm) | |||
2061 | 2062 | ||
2062 | /* mm's last user has gone, and its about to be pulled down */ | 2063 | /* mm's last user has gone, and its about to be pulled down */ |
2063 | arch_exit_mmap(mm); | 2064 | arch_exit_mmap(mm); |
2065 | mmu_notifier_release(mm); | ||
2064 | 2066 | ||
2065 | lru_add_drain(); | 2067 | lru_add_drain(); |
2066 | flush_cache_mm(mm); | 2068 | flush_cache_mm(mm); |
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
index 000000000000..5f4ef0250bee
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,277 @@ | |||
1 | /* | ||
2 | * linux/mm/mmu_notifier.c | ||
3 | * | ||
4 | * Copyright (C) 2008 Qumranet, Inc. | ||
5 | * Copyright (C) 2008 SGI | ||
6 | * Christoph Lameter <clameter@sgi.com> | ||
7 | * | ||
8 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
9 | * the COPYING file in the top-level directory. | ||
10 | */ | ||
11 | |||
12 | #include <linux/rculist.h> | ||
13 | #include <linux/mmu_notifier.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/mm.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <linux/rcupdate.h> | ||
18 | #include <linux/sched.h> | ||
19 | |||
20 | /* | ||
21 | * This function can't run concurrently with mmu_notifier_register | ||
22 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap | ||
23 | * runs with mm_users == 0. Other tasks may still invoke mmu notifiers | ||
24 | * in parallel despite there being no task using this mm any more, | ||
25 | * through the vmas outside of the exit_mmap context, such as with | ||
26 | * vmtruncate. This serializes against mmu_notifier_unregister with | ||
27 | * the mmu_notifier_mm->lock in addition to RCU and it serializes | ||
28 | * against the other mmu notifiers with RCU. struct mmu_notifier_mm | ||
29 | * can't go away from under us as exit_mmap holds an mm_count pin | ||
30 | * itself. | ||
31 | */ | ||
32 | void __mmu_notifier_release(struct mm_struct *mm) | ||
33 | { | ||
34 | struct mmu_notifier *mn; | ||
35 | |||
36 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
37 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | ||
38 | mn = hlist_entry(mm->mmu_notifier_mm->list.first, | ||
39 | struct mmu_notifier, | ||
40 | hlist); | ||
41 | /* | ||
42 | * We arrived before mmu_notifier_unregister so | ||
43 | * mmu_notifier_unregister will do nothing other than | ||
44 | * to wait for ->release to finish and then | ||
45 | * return. | ||
46 | */ | ||
47 | hlist_del_init_rcu(&mn->hlist); | ||
48 | /* | ||
49 | * RCU here will block mmu_notifier_unregister until | ||
50 | * ->release returns. | ||
51 | */ | ||
52 | rcu_read_lock(); | ||
53 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
54 | /* | ||
55 | * if ->release runs before mmu_notifier_unregister it | ||
56 | * must be handled as it's the only way for the driver | ||
57 | * to flush all existing sptes and stop the driver | ||
58 | * from establishing any more sptes before all the | ||
59 | * pages in the mm are freed. | ||
60 | */ | ||
61 | if (mn->ops->release) | ||
62 | mn->ops->release(mn, mm); | ||
63 | rcu_read_unlock(); | ||
64 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
65 | } | ||
66 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
67 | |||
68 | /* | ||
69 | * synchronize_rcu here prevents mmu_notifier_release from | ||
70 | * returning to exit_mmap (which would proceed to free all pages | ||
71 | * in the mm) until the ->release method returns, if it was | ||
72 | * invoked by mmu_notifier_unregister. | ||
73 | * | ||
74 | * The mmu_notifier_mm can't go away from under us because one | ||
75 | * mm_count reference is held by exit_mmap. | ||
76 | */ | ||
77 | synchronize_rcu(); | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * If no young bitflag is supported by the hardware, ->clear_flush_young can | ||
82 | * unmap the address and return 1 or 0 depending on whether the mapping previously | ||
83 | * existed or not. | ||
84 | */ | ||
85 | int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | ||
86 | unsigned long address) | ||
87 | { | ||
88 | struct mmu_notifier *mn; | ||
89 | struct hlist_node *n; | ||
90 | int young = 0; | ||
91 | |||
92 | rcu_read_lock(); | ||
93 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
94 | if (mn->ops->clear_flush_young) | ||
95 | young |= mn->ops->clear_flush_young(mn, mm, address); | ||
96 | } | ||
97 | rcu_read_unlock(); | ||
98 | |||
99 | return young; | ||
100 | } | ||
101 | |||
102 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, | ||
103 | unsigned long address) | ||
104 | { | ||
105 | struct mmu_notifier *mn; | ||
106 | struct hlist_node *n; | ||
107 | |||
108 | rcu_read_lock(); | ||
109 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
110 | if (mn->ops->invalidate_page) | ||
111 | mn->ops->invalidate_page(mn, mm, address); | ||
112 | } | ||
113 | rcu_read_unlock(); | ||
114 | } | ||
115 | |||
116 | void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | ||
117 | unsigned long start, unsigned long end) | ||
118 | { | ||
119 | struct mmu_notifier *mn; | ||
120 | struct hlist_node *n; | ||
121 | |||
122 | rcu_read_lock(); | ||
123 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
124 | if (mn->ops->invalidate_range_start) | ||
125 | mn->ops->invalidate_range_start(mn, mm, start, end); | ||
126 | } | ||
127 | rcu_read_unlock(); | ||
128 | } | ||
129 | |||
130 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | ||
131 | unsigned long start, unsigned long end) | ||
132 | { | ||
133 | struct mmu_notifier *mn; | ||
134 | struct hlist_node *n; | ||
135 | |||
136 | rcu_read_lock(); | ||
137 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
138 | if (mn->ops->invalidate_range_end) | ||
139 | mn->ops->invalidate_range_end(mn, mm, start, end); | ||
140 | } | ||
141 | rcu_read_unlock(); | ||
142 | } | ||
143 | |||
144 | static int do_mmu_notifier_register(struct mmu_notifier *mn, | ||
145 | struct mm_struct *mm, | ||
146 | int take_mmap_sem) | ||
147 | { | ||
148 | struct mmu_notifier_mm *mmu_notifier_mm; | ||
149 | int ret; | ||
150 | |||
151 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | ||
152 | |||
153 | ret = -ENOMEM; | ||
154 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); | ||
155 | if (unlikely(!mmu_notifier_mm)) | ||
156 | goto out; | ||
157 | |||
158 | if (take_mmap_sem) | ||
159 | down_write(&mm->mmap_sem); | ||
160 | ret = mm_take_all_locks(mm); | ||
161 | if (unlikely(ret)) | ||
162 | goto out_cleanup; | ||
163 | |||
164 | if (!mm_has_notifiers(mm)) { | ||
165 | INIT_HLIST_HEAD(&mmu_notifier_mm->list); | ||
166 | spin_lock_init(&mmu_notifier_mm->lock); | ||
167 | mm->mmu_notifier_mm = mmu_notifier_mm; | ||
168 | mmu_notifier_mm = NULL; | ||
169 | } | ||
170 | atomic_inc(&mm->mm_count); | ||
171 | |||
172 | /* | ||
173 | * Serialize the update against mmu_notifier_unregister. A | ||
174 | * side note: mmu_notifier_release can't run concurrently with | ||
175 | * us because we hold the mm_users pin (either implicitly as | ||
176 | * current->mm or explicitly with get_task_mm() or similar). | ||
177 | * We can't race against any other mmu notifier method either | ||
178 | * thanks to mm_take_all_locks(). | ||
179 | */ | ||
180 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
181 | hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); | ||
182 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
183 | |||
184 | mm_drop_all_locks(mm); | ||
185 | out_cleanup: | ||
186 | if (take_mmap_sem) | ||
187 | up_write(&mm->mmap_sem); | ||
188 | /* kfree() does nothing if mmu_notifier_mm is NULL */ | ||
189 | kfree(mmu_notifier_mm); | ||
190 | out: | ||
191 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | ||
192 | return ret; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * Must not hold mmap_sem nor any other VM related lock when calling | ||
197 | * this registration function. Must also ensure mm_users can't go down | ||
198 | * to zero while this runs to avoid races with mmu_notifier_release, | ||
199 | * so mm has to be current->mm or the mm should be pinned safely such | ||
200 | * as with get_task_mm(). If the mm is not current->mm, the mm_users | ||
201 | * pin should be released by calling mmput after mmu_notifier_register | ||
202 | * returns. mmu_notifier_unregister must always be called to | ||
203 | * unregister the notifier. mm_count is automatically pinned to allow | ||
204 | * mmu_notifier_unregister to safely run at any time later, before or | ||
205 | * after exit_mmap. ->release will always be called before exit_mmap | ||
206 | * frees the pages. | ||
207 | */ | ||
208 | int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) | ||
209 | { | ||
210 | return do_mmu_notifier_register(mn, mm, 1); | ||
211 | } | ||
212 | EXPORT_SYMBOL_GPL(mmu_notifier_register); | ||
213 | |||
214 | /* | ||
215 | * Same as mmu_notifier_register but here the caller must hold the | ||
216 | * mmap_sem in write mode. | ||
217 | */ | ||
218 | int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) | ||
219 | { | ||
220 | return do_mmu_notifier_register(mn, mm, 0); | ||
221 | } | ||
222 | EXPORT_SYMBOL_GPL(__mmu_notifier_register); | ||
223 | |||
224 | /* this is called after the last mmu_notifier_unregister() returned */ | ||
225 | void __mmu_notifier_mm_destroy(struct mm_struct *mm) | ||
226 | { | ||
227 | BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list)); | ||
228 | kfree(mm->mmu_notifier_mm); | ||
229 | mm->mmu_notifier_mm = LIST_POISON1; /* debug */ | ||
230 | } | ||
231 | |||
232 | /* | ||
233 | * This releases the mm_count pin automatically and frees the mm | ||
234 | * structure if it was the last user of it. It serializes against | ||
235 | * running mmu notifiers with RCU and against mmu_notifier_unregister | ||
236 | * with the unregister lock + RCU. All sptes must be dropped before | ||
237 | * calling mmu_notifier_unregister. ->release or any other notifier | ||
238 | * method may be invoked concurrently with mmu_notifier_unregister, | ||
239 | * and only after mmu_notifier_unregister returned we're guaranteed | ||
240 | * that ->release or any other method can't run anymore. | ||
241 | */ | ||
242 | void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | ||
243 | { | ||
244 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | ||
245 | |||
246 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
247 | if (!hlist_unhashed(&mn->hlist)) { | ||
248 | hlist_del_rcu(&mn->hlist); | ||
249 | |||
250 | /* | ||
251 | * RCU here will force exit_mmap to wait for ->release to finish | ||
252 | * before freeing the pages. | ||
253 | */ | ||
254 | rcu_read_lock(); | ||
255 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
256 | /* | ||
257 | * exit_mmap will block in mmu_notifier_release to | ||
258 | * guarantee ->release is called before freeing the | ||
259 | * pages. | ||
260 | */ | ||
261 | if (mn->ops->release) | ||
262 | mn->ops->release(mn, mm); | ||
263 | rcu_read_unlock(); | ||
264 | } else | ||
265 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
266 | |||
267 | /* | ||
268 | * Wait for any running method to finish, including | ||
269 | * ->release if it was run by mmu_notifier_release instead of us. | ||
270 | */ | ||
271 | synchronize_rcu(); | ||
272 | |||
273 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | ||
274 | |||
275 | mmdrop(mm); | ||
276 | } | ||
277 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); | ||
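Continuing the hypothetical driver sketch shown after the header file (again, not part of this patch): because mmu_notifier_register() pins mm_count, the matching teardown may run before or after exit_mmap, provided the driver has already flushed all of its own sptes for this mm.

static void my_detach(struct my_dev *dev, struct mm_struct *mm)
{
	/* all driver sptes for this mm must already be gone */
	mmu_notifier_unregister(&dev->mn, mm);
	/* mmu_notifier_unregister() did the mmdrop() that balances the
	 * mm_count pin taken by mmu_notifier_register() */
}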
diff --git a/mm/mprotect.c b/mm/mprotect.c
index abd645a3b0a0..fded06f923f4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/syscalls.h> | 21 | #include <linux/syscalls.h> |
22 | #include <linux/swap.h> | 22 | #include <linux/swap.h> |
23 | #include <linux/swapops.h> | 23 | #include <linux/swapops.h> |
24 | #include <linux/mmu_notifier.h> | ||
24 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
25 | #include <asm/pgtable.h> | 26 | #include <asm/pgtable.h> |
26 | #include <asm/cacheflush.h> | 27 | #include <asm/cacheflush.h> |
@@ -203,10 +204,12 @@ success: | |||
203 | dirty_accountable = 1; | 204 | dirty_accountable = 1; |
204 | } | 205 | } |
205 | 206 | ||
207 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
206 | if (is_vm_hugetlb_page(vma)) | 208 | if (is_vm_hugetlb_page(vma)) |
207 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); | 209 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); |
208 | else | 210 | else |
209 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); | 211 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); |
212 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
210 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 213 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
211 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 214 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
212 | return 0; | 215 | return 0; |
diff --git a/mm/mremap.c b/mm/mremap.c
index 08e3c7f2bd15..1a7743923c8c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/highmem.h> | 18 | #include <linux/highmem.h> |
19 | #include <linux/security.h> | 19 | #include <linux/security.h> |
20 | #include <linux/syscalls.h> | 20 | #include <linux/syscalls.h> |
21 | #include <linux/mmu_notifier.h> | ||
21 | 22 | ||
22 | #include <asm/uaccess.h> | 23 | #include <asm/uaccess.h> |
23 | #include <asm/cacheflush.h> | 24 | #include <asm/cacheflush.h> |
@@ -74,7 +75,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
74 | struct mm_struct *mm = vma->vm_mm; | 75 | struct mm_struct *mm = vma->vm_mm; |
75 | pte_t *old_pte, *new_pte, pte; | 76 | pte_t *old_pte, *new_pte, pte; |
76 | spinlock_t *old_ptl, *new_ptl; | 77 | spinlock_t *old_ptl, *new_ptl; |
78 | unsigned long old_start; | ||
77 | 79 | ||
80 | old_start = old_addr; | ||
81 | mmu_notifier_invalidate_range_start(vma->vm_mm, | ||
82 | old_start, old_end); | ||
78 | if (vma->vm_file) { | 83 | if (vma->vm_file) { |
79 | /* | 84 | /* |
80 | * Subtle point from Rajesh Venkatasubramanian: before | 85 | * Subtle point from Rajesh Venkatasubramanian: before |
@@ -116,6 +121,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
116 | pte_unmap_unlock(old_pte - 1, old_ptl); | 121 | pte_unmap_unlock(old_pte - 1, old_ptl); |
117 | if (mapping) | 122 | if (mapping) |
118 | spin_unlock(&mapping->i_mmap_lock); | 123 | spin_unlock(&mapping->i_mmap_lock); |
124 | mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); | ||
119 | } | 125 | } |
120 | 126 | ||
121 | #define LATENCY_LIMIT (64 * PAGE_SIZE) | 127 | #define LATENCY_LIMIT (64 * PAGE_SIZE) |
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
50 | #include <linux/kallsyms.h> | 50 | #include <linux/kallsyms.h> |
51 | #include <linux/memcontrol.h> | 51 | #include <linux/memcontrol.h> |
52 | #include <linux/mmu_notifier.h> | ||
52 | 53 | ||
53 | #include <asm/tlbflush.h> | 54 | #include <asm/tlbflush.h> |
54 | 55 | ||
@@ -287,7 +288,7 @@ static int page_referenced_one(struct page *page, | |||
287 | if (vma->vm_flags & VM_LOCKED) { | 288 | if (vma->vm_flags & VM_LOCKED) { |
288 | referenced++; | 289 | referenced++; |
289 | *mapcount = 1; /* break early from loop */ | 290 | *mapcount = 1; /* break early from loop */ |
290 | } else if (ptep_clear_flush_young(vma, address, pte)) | 291 | } else if (ptep_clear_flush_young_notify(vma, address, pte)) |
291 | referenced++; | 292 | referenced++; |
292 | 293 | ||
293 | /* Pretend the page is referenced if the task has the | 294 | /* Pretend the page is referenced if the task has the |
@@ -457,7 +458,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | |||
457 | pte_t entry; | 458 | pte_t entry; |
458 | 459 | ||
459 | flush_cache_page(vma, address, pte_pfn(*pte)); | 460 | flush_cache_page(vma, address, pte_pfn(*pte)); |
460 | entry = ptep_clear_flush(vma, address, pte); | 461 | entry = ptep_clear_flush_notify(vma, address, pte); |
461 | entry = pte_wrprotect(entry); | 462 | entry = pte_wrprotect(entry); |
462 | entry = pte_mkclean(entry); | 463 | entry = pte_mkclean(entry); |
463 | set_pte_at(mm, address, pte, entry); | 464 | set_pte_at(mm, address, pte, entry); |
@@ -705,14 +706,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
705 | * skipped over this mm) then we should reactivate it. | 706 | * skipped over this mm) then we should reactivate it. |
706 | */ | 707 | */ |
707 | if (!migration && ((vma->vm_flags & VM_LOCKED) || | 708 | if (!migration && ((vma->vm_flags & VM_LOCKED) || |
708 | (ptep_clear_flush_young(vma, address, pte)))) { | 709 | (ptep_clear_flush_young_notify(vma, address, pte)))) { |
709 | ret = SWAP_FAIL; | 710 | ret = SWAP_FAIL; |
710 | goto out_unmap; | 711 | goto out_unmap; |
711 | } | 712 | } |
712 | 713 | ||
713 | /* Nuke the page table entry. */ | 714 | /* Nuke the page table entry. */ |
714 | flush_cache_page(vma, address, page_to_pfn(page)); | 715 | flush_cache_page(vma, address, page_to_pfn(page)); |
715 | pteval = ptep_clear_flush(vma, address, pte); | 716 | pteval = ptep_clear_flush_notify(vma, address, pte); |
716 | 717 | ||
717 | /* Move the dirty bit to the physical page now the pte is gone. */ | 718 | /* Move the dirty bit to the physical page now the pte is gone. */ |
718 | if (pte_dirty(pteval)) | 719 | if (pte_dirty(pteval)) |
@@ -837,12 +838,12 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
837 | page = vm_normal_page(vma, address, *pte); | 838 | page = vm_normal_page(vma, address, *pte); |
838 | BUG_ON(!page || PageAnon(page)); | 839 | BUG_ON(!page || PageAnon(page)); |
839 | 840 | ||
840 | if (ptep_clear_flush_young(vma, address, pte)) | 841 | if (ptep_clear_flush_young_notify(vma, address, pte)) |
841 | continue; | 842 | continue; |
842 | 843 | ||
843 | /* Nuke the page table entry. */ | 844 | /* Nuke the page table entry. */ |
844 | flush_cache_page(vma, address, pte_pfn(*pte)); | 845 | flush_cache_page(vma, address, pte_pfn(*pte)); |
845 | pteval = ptep_clear_flush(vma, address, pte); | 846 | pteval = ptep_clear_flush_notify(vma, address, pte); |
846 | 847 | ||
847 | /* If nonlinear, store the file page offset in the pte. */ | 848 | /* If nonlinear, store the file page offset in the pte. */ |
848 | if (page->index != linear_page_index(vma, address)) | 849 | if (page->index != linear_page_index(vma, address)) |