path: root/mm/mmu_notifier.c
author		Andrea Arcangeli <andrea@qumranet.com>	2008-07-28 18:46:29 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-07-28 19:30:21 -0400
commit		cddb8a5c14aa89810b40495d94d3d2a0faee6619 (patch)
tree		d0b47b071f7d2dd1d6f9c36084aa8cfcef90d1da /mm/mmu_notifier.c
parent		7906d00cd1f687268f0a3599442d113767795ae6 (diff)
mmu-notifiers: core
With KVM/GRU/XPMEM there isn't just the primary CPU MMU pointing to pages. There are secondary MMUs (with secondary sptes and secondary tlbs) too. sptes in the KVM case are shadow pagetables, but when I say spte in mmu-notifier context, I mean "secondary pte". In the GRU case there's no actual secondary pte and there's only a secondary tlb, because the GRU secondary MMU has no knowledge of sptes and every secondary tlb miss event in the MMU always generates a page fault that has to be resolved by the CPU (this is not the case with KVM, where a secondary tlb miss will walk sptes in hardware and refill the secondary tlb transparently to software if the corresponding spte is present). The same way zap_page_range has to invalidate the pte before freeing the page, the spte (and secondary tlb) must also be invalidated before any page is freed and reused.

Currently we take a page_count pin on every page mapped by sptes, but that means the pages can't be swapped whenever they're mapped by any spte, because they're part of the guest working set. Furthermore an spte unmap event can immediately lead to a page being freed when the pin is released (so requiring the same complex and relatively slow tlb_gather SMP-safe logic we have in zap_page_range, which can be avoided completely if the spte unmap event doesn't require an unpin of the page previously mapped in the secondary MMU).

The mmu notifiers allow KVM/GRU/XPMEM to attach to the tsk->mm and know when the VM is swapping or freeing or doing anything on the primary MMU, so that the secondary MMU code can drop sptes before the pages are freed, avoiding all page pinning and allowing 100% reliable swapping of guest physical address space. Furthermore it spares the code that tears down the secondary MMU mappings from implementing tlb_gather-like logic as in zap_page_range, which would require many IPIs to flush the other cpus' tlbs for each fixed number of sptes unmapped.

To make an example: if what happens on the primary MMU is a protection downgrade (from writeable to wrprotect), the secondary MMU mappings will be invalidated, and the next secondary-mmu page fault will call get_user_pages; if it calls get_user_pages with write=1 that will trigger a do_wp_page and re-establish an updated spte or secondary-tlb mapping on the copied page. Or it will set up a readonly spte or readonly tlb mapping if it's a guest read, i.e. if it calls get_user_pages with write=0. This is just an example.

This allows any page pointed to by any pte (and in turn visible in the primary CPU MMU) to be mapped into a secondary MMU (be it a pure tlb like GRU, or a full MMU with both sptes and a secondary tlb like the shadow-pagetable layer with KVM), or into a remote DMA in software like XPMEM (hence the need to schedule in the XPMEM code to send the invalidate to the remote node, while there's no need to schedule in kvm/gru as it's an immediate event, like invalidating a primary-mmu pte).

At least for KVM, without this patch it's impossible to swap guests reliably. And having this feature and removing the page pin allows several other optimizations that simplify life considerably.

Dependencies:

1) mm_take_all_locks() to register the mmu notifier when the whole VM isn't doing anything with "mm". This allows mmu notifier users to keep track of whether the VM is in the middle of the invalidate_range_start/end critical section with an atomic counter increased in range_start and decreased in range_end.
No secondary MMU page fault is allowed to map any spte or secondary tlb reference while the VM is in the middle of range_start/end, as any page returned by get_user_pages in that critical section could later immediately be freed without any further ->invalidate_page notification (invalidate_range_start/end works on ranges and ->invalidate_page isn't called immediately before freeing the page). To stop all page freeing and pagetable overwrites, the mmap_sem must be taken in write mode and all other anon_vma/i_mmap locks must be taken too.

2) It'd be a waste to add branches in the VM if nobody could possibly run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only be enabled if CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of mmu notifiers, but this already allows compiling a KVM external module against a kernel with mmu notifiers enabled, and from the next pull from kvm.git we'll start using them. And GRU/XPMEM will also be able to continue development by enabling KVM=m in their config, until they submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can also enable MMU_NOTIFIER in the same way KVM does it (even if KVM=n). This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM are all =n.

The mmu_notifier_register call can fail because mm_take_all_locks may be interrupted by a signal and return -EINTR. Because mmu_notifier_register is used at driver startup, a failure can be gracefully handled. Here is an example of the change applied to kvm to register the mmu notifiers. Usually when a driver starts up, other allocations are required anyway and -ENOMEM failure paths exist already.

 struct kvm *kvm_arch_create_vm(void)
 {
 	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+	int err;

 	if (!kvm)
 		return ERR_PTR(-ENOMEM);

 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);

+	kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+	err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+	if (err) {
+		kfree(kvm);
+		return ERR_PTR(err);
+	}
+
 	return kvm;
 }

mmu_notifier_unregister returns void and it's reliable.

The patch also adds a few needed but missing includes that would have prevented the kernel from compiling after these changes on non-x86 archs (x86 didn't need them by luck).

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
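To make the driver-side contract above concrete, here is a minimal sketch of a secondary-MMU driver registering against current->mm. Everything prefixed with "my_" is hypothetical; only mmu_notifier_register(), mmu_notifier_unregister() and the mmu_notifier_ops callbacks come from this patch. The atomic counter implements the range_start/end tracking described in dependency 1: the driver's secondary page fault handler must refuse to establish new sptes while it is non-zero.

#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <asm/atomic.h>

/* Non-zero while the VM is inside an invalidate_range_start/end
 * critical section; secondary faults must not map new sptes then. */
static atomic_t my_range_depth = ATOMIC_INIT(0);

static void my_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* Last chance to flush all sptes/secondary tlb entries and stop
	 * establishing new ones: exit_mmap is about to free the pages. */
}

static void my_invalidate_page(struct mmu_notifier *mn,
			       struct mm_struct *mm, unsigned long address)
{
	/* Drop the spte/secondary tlb entry for this single address. */
}

static void my_invalidate_range_start(struct mmu_notifier *mn,
				      struct mm_struct *mm,
				      unsigned long start, unsigned long end)
{
	atomic_inc(&my_range_depth);
	/* Shoot down every secondary mapping covering [start, end). */
}

static void my_invalidate_range_end(struct mmu_notifier *mn,
				    struct mm_struct *mm,
				    unsigned long start, unsigned long end)
{
	/* The primary ptes are consistent again; secondary faults may
	 * resume mapping pages returned by get_user_pages. */
	atomic_dec(&my_range_depth);
}

static const struct mmu_notifier_ops my_mmu_notifier_ops = {
	.release		= my_release,
	.invalidate_page	= my_invalidate_page,
	.invalidate_range_start	= my_invalidate_range_start,
	.invalidate_range_end	= my_invalidate_range_end,
};

static struct mmu_notifier my_notifier = {
	.ops = &my_mmu_notifier_ops,
};

static int my_driver_attach(void)
{
	/* current->mm keeps mm_users elevated while we register; the
	 * call can fail with -EINTR (mm_take_all_locks interrupted by a
	 * signal) or -ENOMEM, both easy to handle at driver startup. */
	return mmu_notifier_register(&my_notifier, current->mm);
}

The driver's secondary fault path would then check my_range_depth (under its own locking) before inserting an spte built from a get_user_pages result, and retry once the corresponding range_end has run.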
Diffstat (limited to 'mm/mmu_notifier.c')
-rw-r--r--	mm/mmu_notifier.c	277
1 files changed, 277 insertions, 0 deletions
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
index 000000000000..5f4ef0250bee
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,277 @@
/*
 * linux/mm/mmu_notifier.c
 *
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright (C) 2008 SGI
 *             Christoph Lameter <clameter@sgi.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include <linux/rculist.h>
#include <linux/mmu_notifier.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/*
 * This function can't run concurrently against mmu_notifier_register
 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
 * in parallel despite there being no task using this mm any more,
 * through the vmas outside of the exit_mmap context, such as with
 * vmtruncate. This serializes against mmu_notifier_unregister with
 * the mmu_notifier_mm->lock in addition to RCU and it serializes
 * against the other mmu notifiers with RCU. struct mmu_notifier_mm
 * can't go away from under us as exit_mmap holds an mm_count pin
 * itself.
 */
void __mmu_notifier_release(struct mm_struct *mm)
{
	struct mmu_notifier *mn;

	spin_lock(&mm->mmu_notifier_mm->lock);
	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
		mn = hlist_entry(mm->mmu_notifier_mm->list.first,
				 struct mmu_notifier,
				 hlist);
		/*
		 * We arrived before mmu_notifier_unregister so
		 * mmu_notifier_unregister will do nothing other than
		 * to wait ->release to finish and
		 * mmu_notifier_unregister to return.
		 */
		hlist_del_init_rcu(&mn->hlist);
		/*
		 * RCU here will block mmu_notifier_unregister until
		 * ->release returns.
		 */
		rcu_read_lock();
		spin_unlock(&mm->mmu_notifier_mm->lock);
		/*
		 * if ->release runs before mmu_notifier_unregister it
		 * must be handled as it's the only way for the driver
		 * to flush all existing sptes and stop the driver
		 * from establishing any more sptes before all the
		 * pages in the mm are freed.
		 */
		if (mn->ops->release)
			mn->ops->release(mn, mm);
		rcu_read_unlock();
		spin_lock(&mm->mmu_notifier_mm->lock);
	}
	spin_unlock(&mm->mmu_notifier_mm->lock);

	/*
	 * synchronize_rcu here prevents mmu_notifier_release to
	 * return to exit_mmap (which would proceed freeing all pages
	 * in the mm) until the ->release method returns, if it was
	 * invoked by mmu_notifier_unregister.
	 *
	 * The mmu_notifier_mm can't go away from under us because one
	 * mm_count is hold by exit_mmap.
	 */
	synchronize_rcu();
}

/*
 * If no young bitflag is supported by the hardware, ->clear_flush_young can
 * unmap the address and return 1 or 0 depending if the mapping previously
 * existed or not.
 */
int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
					unsigned long address)
{
	struct mmu_notifier *mn;
	struct hlist_node *n;
	int young = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
		if (mn->ops->clear_flush_young)
			young |= mn->ops->clear_flush_young(mn, mm, address);
	}
	rcu_read_unlock();

	return young;
}

void __mmu_notifier_invalidate_page(struct mm_struct *mm,
					unsigned long address)
{
	struct mmu_notifier *mn;
	struct hlist_node *n;

	rcu_read_lock();
	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
		if (mn->ops->invalidate_page)
			mn->ops->invalidate_page(mn, mm, address);
	}
	rcu_read_unlock();
}

void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
				unsigned long start, unsigned long end)
{
	struct mmu_notifier *mn;
	struct hlist_node *n;

	rcu_read_lock();
	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
		if (mn->ops->invalidate_range_start)
			mn->ops->invalidate_range_start(mn, mm, start, end);
	}
	rcu_read_unlock();
}

void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
				unsigned long start, unsigned long end)
{
	struct mmu_notifier *mn;
	struct hlist_node *n;

	rcu_read_lock();
	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
		if (mn->ops->invalidate_range_end)
			mn->ops->invalidate_range_end(mn, mm, start, end);
	}
	rcu_read_unlock();
}

static int do_mmu_notifier_register(struct mmu_notifier *mn,
				    struct mm_struct *mm,
				    int take_mmap_sem)
{
	struct mmu_notifier_mm *mmu_notifier_mm;
	int ret;

	BUG_ON(atomic_read(&mm->mm_users) <= 0);

	ret = -ENOMEM;
	mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
	if (unlikely(!mmu_notifier_mm))
		goto out;

	if (take_mmap_sem)
		down_write(&mm->mmap_sem);
	ret = mm_take_all_locks(mm);
	if (unlikely(ret))
		goto out_cleanup;

	if (!mm_has_notifiers(mm)) {
		INIT_HLIST_HEAD(&mmu_notifier_mm->list);
		spin_lock_init(&mmu_notifier_mm->lock);
		mm->mmu_notifier_mm = mmu_notifier_mm;
		mmu_notifier_mm = NULL;
	}
	atomic_inc(&mm->mm_count);

	/*
	 * Serialize the update against mmu_notifier_unregister. A
	 * side note: mmu_notifier_release can't run concurrently with
	 * us because we hold the mm_users pin (either implicitly as
	 * current->mm or explicitly with get_task_mm() or similar).
	 * We can't race against any other mmu notifier method either
	 * thanks to mm_take_all_locks().
	 */
	spin_lock(&mm->mmu_notifier_mm->lock);
	hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
	spin_unlock(&mm->mmu_notifier_mm->lock);

	mm_drop_all_locks(mm);
out_cleanup:
	if (take_mmap_sem)
		up_write(&mm->mmap_sem);
	/* kfree() does nothing if mmu_notifier_mm is NULL */
	kfree(mmu_notifier_mm);
out:
	BUG_ON(atomic_read(&mm->mm_users) <= 0);
	return ret;
}

/*
 * Must not hold mmap_sem nor any other VM related lock when calling
 * this registration function. Must also ensure mm_users can't go down
 * to zero while this runs to avoid races with mmu_notifier_release,
 * so mm has to be current->mm or the mm should be pinned safely such
 * as with get_task_mm(). If the mm is not current->mm, the mm_users
 * pin should be released by calling mmput after mmu_notifier_register
 * returns. mmu_notifier_unregister must be always called to
 * unregister the notifier. mm_count is automatically pinned to allow
 * mmu_notifier_unregister to safely run at any time later, before or
 * after exit_mmap. ->release will always be called before exit_mmap
 * frees the pages.
 */
int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
{
	return do_mmu_notifier_register(mn, mm, 1);
}
EXPORT_SYMBOL_GPL(mmu_notifier_register);

/*
 * Same as mmu_notifier_register but here the caller must hold the
 * mmap_sem in write mode.
 */
int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
{
	return do_mmu_notifier_register(mn, mm, 0);
}
EXPORT_SYMBOL_GPL(__mmu_notifier_register);

/* this is called after the last mmu_notifier_unregister() returned */
void __mmu_notifier_mm_destroy(struct mm_struct *mm)
{
	BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
	kfree(mm->mmu_notifier_mm);
	mm->mmu_notifier_mm = LIST_POISON1; /* debug */
}

/*
 * This releases the mm_count pin automatically and frees the mm
 * structure if it was the last user of it. It serializes against
 * running mmu notifiers with RCU and against mmu_notifier_unregister
 * with the unregister lock + RCU. All sptes must be dropped before
 * calling mmu_notifier_unregister. ->release or any other notifier
 * method may be invoked concurrently with mmu_notifier_unregister,
 * and only after mmu_notifier_unregister returned we're guaranteed
 * that ->release or any other method can't run anymore.
 */
void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
{
	BUG_ON(atomic_read(&mm->mm_count) <= 0);

	spin_lock(&mm->mmu_notifier_mm->lock);
	if (!hlist_unhashed(&mn->hlist)) {
		hlist_del_rcu(&mn->hlist);

		/*
		 * RCU here will force exit_mmap to wait ->release to finish
		 * before freeing the pages.
		 */
		rcu_read_lock();
		spin_unlock(&mm->mmu_notifier_mm->lock);
		/*
		 * exit_mmap will block in mmu_notifier_release to
		 * guarantee ->release is called before freeing the
		 * pages.
		 */
		if (mn->ops->release)
			mn->ops->release(mn, mm);
		rcu_read_unlock();
	} else
		spin_unlock(&mm->mmu_notifier_mm->lock);

	/*
	 * Wait for any running method to finish, including ->release
	 * if it was run by mmu_notifier_release instead of us.
	 */
	synchronize_rcu();

	BUG_ON(atomic_read(&mm->mm_count) <= 0);

	mmdrop(mm);
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
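For context on how the hooks above get invoked, the call sites added elsewhere in this patch series bracket primary-MMU teardown with the range notifiers from include/linux/mmu_notifier.h. A hedged sketch of the calling pattern follows; the function name and the elided unmap logic are illustrative, not part of this file:

static void example_unmap_range(struct mm_struct *mm,
				unsigned long start, unsigned long end)
{
	/* Ask every registered secondary MMU to drop its sptes/tlb
	 * entries for [start, end); no new secondary mapping may be
	 * established until the matching ..._range_end below. */
	mmu_notifier_invalidate_range_start(mm, start, end);

	/* ... existing primary pte teardown (e.g. the zap logic) runs
	 * here; pages may only be freed once the secondary MMUs have
	 * dropped their references ... */

	/* Primary ptes are stable again: secondary faults may resume. */
	mmu_notifier_invalidate_range_end(mm, start, end);
}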