mmu-notifiers: add mm_take_all_locks() operation

mm_take_all_locks() holds off reclaim from an entire mm_struct. This allows mmu notifiers to register into the mm at any time with the guarantee that no mmu operation is in progress on the mm. This operation locks against the VM for all pte/vma/mm related operations that could ever happen on a certain mm. This includes vmtruncate, try_to_unmap, and all page faults. The caller must take the mmap_sem in write mode before calling mm_take_all_locks(). The caller isn't allowed to release the mmap_sem until mm_drop_all_locks() returns. mmap_sem in write mode is required in order to block all operations that could modify pagetables and free pages without needing to alter the vma layout (for example populate_range() with nonlinear vmas). It's also needed in write mode to prevent new anon_vmas from being associated with existing vmas. A single task can't take more than one mm_take_all_locks() in a row or it would deadlock. mm_take_all_locks() and mm_drop_all_locks() are expensive operations that may have to take thousands of locks. mm_take_all_locks() can fail if it's interrupted by signals. When mmu_notifier_register() returns, we must be sure that the driver is notified if some task is in the middle of a vmtruncate for the 'mm' where the mmu notifier was registered (mmu_notifier_invalidate_range_start/end is run around the vmtruncation but mmu_notifier_register() can run after mmu_notifier_invalidate_range_start and before mmu_notifier_invalidate_range_end). The same problem exists for the rmap paths. And we have to remove page pinning to avoid replicating the tlb_gather logic inside KVM (and GRU doesn't work well with page pinning regardless of needing tlb_gather), so without mm_take_all_locks(), when vmtruncate frees the page, kvm would have no way to notice that it mapped into sptes a page that is going into the freelist without a chance of any further mmu_notifier notification.
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrea Arcangeli <andrea@qumranet.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Jack Steiner <steiner@sgi.com> Cc: Robin Holt <holt@sgi.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Kanoj Sarcar <kanojsarcar@yahoo.com> Cc: Roland Dreier <rdreier@cisco.com> Cc: Steve Wise <swise@opengridcomputing.com> Cc: Avi Kivity <avi@qumranet.com> Cc: Hugh Dickins <hugh@veritas.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Anthony Liguori <aliguori@us.ibm.com> Cc: Chris Wright <chrisw@redhat.com> Cc: Marcelo Tosatti <marcelo@kvack.org> Cc: Eric Dumazet <dada1@cosmosbay.com> Cc: "Paul E. McKenney" <paulmck@us.ibm.com> Cc: Izik Eidus <izike@qumranet.com> Cc: Anthony Liguori <aliguori@us.ibm.com> Cc: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Andrea Arcangeli <andrea@qumranet.com> 2008-07-28 18:46:26 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2008-07-28 19:30:21 -0400
commit: 7906d00cd1f687268f0a3599442d113767795ae6 (patch)
tree: 63609454d164a088d7f535f826764579c0f297f6 /mm/mmap.c
parent: 6beeac76f5f96590fb751af5e138fbc3f62e8460 (diff)
1 files changed, 158 insertions, 0 deletions
diff --git a/mm/mmap.c b/mm/mmap.c
index 5e0cc99e9cd5..e5f9cb83d6d4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2268,3 +2268,161 @@ int install_special_mapping(struct mm_struct *mm,
        return 0;
 }
+static DEFINE_MUTEX(mm_all_locks_mutex);        /* taken in mm_take_all_locks(), released in mm_drop_all_locks() */
+/*
+ * Take anon_vma->lock once on behalf of mm_take_all_locks().
+ *
+ * Bit 0 of anon_vma->head.next is used as an "already locked by us"
+ * marker, so an anon_vma shared by several vmas of the same mm is
+ * only locked the first time it is seen.
+ */
+static void vm_lock_anon_vma(struct anon_vma *anon_vma)
+{
+        unsigned long *marker = (unsigned long *) &anon_vma->head.next;
+
+        /*
+         * The marker bit is stable here: only holders of
+         * mm_all_locks_mutex ever flip it, and we are the holder.
+         */
+        if (test_bit(0, marker))
+                return;
+        spin_lock(&anon_vma->lock);
+        /*
+         * With anon_vma->lock held nobody else can touch head.next,
+         * so the non-atomic bitop is sufficient. Finding the bit
+         * already set at this point would mean the marker protocol
+         * was violated.
+         */
+        if (__test_and_set_bit(0, marker))
+                BUG();
+}
+/*
+ * Take mapping->i_mmap_lock once on behalf of mm_take_all_locks().
+ *
+ * AS_MM_ALL_LOCKS in mapping->flags records that the lock is already
+ * held by us, so an address_space backing several vmas of the same mm
+ * is only locked the first time it is seen.
+ */
+static void vm_lock_mapping(struct address_space *mapping)
+{
+        /*
+         * AS_MM_ALL_LOCKS is only changed under mm_all_locks_mutex,
+         * which we hold, so this check cannot race with another
+         * setter of the same bit.
+         */
+        if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags))
+                return;
+        /*
+         * The atomic bitop is still required: other cpus may be
+         * changing different bits of ->flags in parallel to us.
+         */
+        if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
+                BUG();
+        spin_lock(&mapping->i_mmap_lock);
+}
+/*
+ * This operation locks against the VM for all pte/vma/mm related
+ * operations that could ever happen on a certain mm. This includes
+ * vmtruncate, try_to_unmap, and all page faults.
+ *
+ * The caller must take the mmap_sem in write mode before calling
+ * mm_take_all_locks(). The caller isn't allowed to release the
+ * mmap_sem until mm_drop_all_locks() returns.
+ *
+ * mmap_sem in write mode is required in order to block all operations
+ * that could modify pagetables and free pages without needing to
+ * alter the vma layout (for example populate_range() with
+ * nonlinear vmas). It's also needed in write mode to prevent new
+ * anon_vmas from being associated with existing vmas.
+ *
+ * A single task can't take more than one mm_take_all_locks() in a row
+ * or it would deadlock.
+ *
+ * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
+ * mapping->flags prevent taking the same lock twice, if more than one
+ * vma in this mm is backed by the same anon_vma or address_space.
+ *
+ * Locks are taken in two passes: every i_mmap_lock first, then every
+ * anon_vma->lock. This matches the i_mmap_lock -> anon_vma->lock
+ * nesting used by the rest of the VM (see the lock ordering notes at
+ * the top of mm/rmap.c). Interleaving the two classes in vma order
+ * could take an anon_vma->lock before an i_mmap_lock and deadlock
+ * against a task of another mm that shares the same anon_vma and
+ * address_space and nests them the usual way.
+ *
+ * Within each class the locks can be taken in random order because
+ * the VM code never nests two locks of the same class, and we're
+ * protected against a concurrent mm_take_all_locks() by the
+ * mm_all_locks_mutex.
+ *
+ * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
+ * that may have to take thousands of locks.
+ *
+ * Returns 0 on success, with mm_all_locks_mutex and all the locks
+ * held. Returns -EINTR if a signal is pending for the current task;
+ * in that case every lock already taken, and mm_all_locks_mutex, have
+ * been released again through mm_drop_all_locks().
+ */
+int mm_take_all_locks(struct mm_struct *mm)
+{
+        struct vm_area_struct *vma;
+        int ret = -EINTR;
+
+        /* A successful read-trylock means mmap_sem isn't write-held. */
+        BUG_ON(down_read_trylock(&mm->mmap_sem));
+
+        mutex_lock(&mm_all_locks_mutex);
+
+        /* Pass 1: the outer lock class, i_mmap_lock of every mapping. */
+        for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                if (signal_pending(current))
+                        goto out_unlock;
+                if (vma->vm_file && vma->vm_file->f_mapping)
+                        vm_lock_mapping(vma->vm_file->f_mapping);
+        }
+
+        /* Pass 2: the inner lock class, every anon_vma->lock. */
+        for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                if (signal_pending(current))
+                        goto out_unlock;
+                if (vma->anon_vma)
+                        vm_lock_anon_vma(vma->anon_vma);
+        }
+
+        ret = 0;
+
+out_unlock:
+        /*
+         * On failure undo whatever was taken so far. The unlock
+         * helpers only release locks whose marker bit is set, so a
+         * partially locked mm is handled correctly.
+         */
+        if (ret)
+                mm_drop_all_locks(mm);
+
+        return ret;
+}
+/*
+ * Release an anon_vma->lock that was taken by vm_lock_anon_vma().
+ *
+ * Does nothing if bit 0 of anon_vma->head.next is clear, i.e. the
+ * lock was never taken (or was already dropped for another vma
+ * sharing this anon_vma).
+ */
+static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
+{
+        unsigned long *marker = (unsigned long *) &anon_vma->head.next;
+
+        /*
+         * Nobody but us can clear the marker: that only happens
+         * under mm_all_locks_mutex, which we hold.
+         */
+        if (!test_bit(0, marker))
+                return;
+        /*
+         * Clear the marker while still holding anon_vma->lock so
+         * that other users of anon_vma->head never observe it. The
+         * lock also keeps head.next stable, which is why the
+         * non-atomic bitop is enough.
+         */
+        if (!__test_and_clear_bit(0, marker))
+                BUG();
+        spin_unlock(&anon_vma->lock);
+}
+/*
+ * Release a mapping->i_mmap_lock that was taken by vm_lock_mapping().
+ *
+ * Does nothing if AS_MM_ALL_LOCKS is clear in mapping->flags, i.e.
+ * the lock was never taken (or was already dropped for another vma
+ * backed by this address_space).
+ */
+static void vm_unlock_mapping(struct address_space *mapping)
+{
+        /*
+         * AS_MM_ALL_LOCKS can't be cleared from under us because
+         * that only happens with mm_all_locks_mutex held.
+         */
+        if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags))
+                return;
+        spin_unlock(&mapping->i_mmap_lock);
+        /* Atomic bitop: other bits of ->flags may change in parallel. */
+        if (!test_and_clear_bit(AS_MM_ALL_LOCKS, &mapping->flags))
+                BUG();
+}
+/*
+ * Undo mm_take_all_locks(): walk every vma of the mm, release the
+ * anon_vma->lock and i_mmap_lock that were marked as taken, then
+ * release mm_all_locks_mutex.
+ *
+ * The mmap_sem cannot be released by the caller until
+ * mm_drop_all_locks() returns.
+ */
+void mm_drop_all_locks(struct mm_struct *mm)
+{
+        struct vm_area_struct *cur;
+        struct file *backing;
+
+        /* Sanity checks: mmap_sem must be held and the mutex locked. */
+        BUG_ON(down_read_trylock(&mm->mmap_sem));
+        BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
+
+        cur = mm->mmap;
+        while (cur) {
+                if (cur->anon_vma)
+                        vm_unlock_anon_vma(cur->anon_vma);
+                backing = cur->vm_file;
+                if (backing && backing->f_mapping)
+                        vm_unlock_mapping(backing->f_mapping);
+                cur = cur->vm_next;
+        }
+
+        mutex_unlock(&mm_all_locks_mutex);
+}
author	Andrea Arcangeli <andrea@qumranet.com>	2008-07-28 18:46:26 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-07-28 19:30:21 -0400
commit	7906d00cd1f687268f0a3599442d113767795ae6 (patch)
tree	63609454d164a088d7f535f826764579c0f297f6 /mm/mmap.c
parent	6beeac76f5f96590fb751af5e138fbc3f62e8460 (diff)

diff --git a/mm/mmap.c b/mm/mmap.c index 5e0cc99e9cd5..e5f9cb83d6d4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c
@@ -2268,3 +2268,161 @@ int install_special_mapping(struct mm_struct *mm,
2268		2268
2269	return 0;	2269	return 0;
2270	}	2270	}
		2271
		2272	static DEFINE_MUTEX(mm_all_locks_mutex);
		2273
		2274	static void vm_lock_anon_vma(struct anon_vma *anon_vma)
		2275	{
		2276	if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
		2277	/*
		2278	* The LSB of head.next can't change from under us
		2279	* because we hold the mm_all_locks_mutex.
		2280	*/
		2281	spin_lock(&anon_vma->lock);
		2282	/*
		2283	* We can safely modify head.next after taking the
		2284	* anon_vma->lock. If some other vma in this mm shares
		2285	* the same anon_vma we won't take it again.
		2286	*
		2287	* No need of atomic instructions here, head.next
		2288	* can't change from under us thanks to the
		2289	* anon_vma->lock.
		2290	*/
		2291	if (__test_and_set_bit(0, (unsigned long *)
		2292	&anon_vma->head.next))
		2293	BUG();
		2294	}
		2295	}
		2296
		2297	static void vm_lock_mapping(struct address_space *mapping)
		2298	{
		2299	if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
		2300	/*
		2301	* AS_MM_ALL_LOCKS can't change from under us because
		2302	* we hold the mm_all_locks_mutex.
		2303	*
		2304	* Operations on ->flags have to be atomic because
		2305	* even if AS_MM_ALL_LOCKS is stable thanks to the
		2306	* mm_all_locks_mutex, there may be other cpus
		2307	* changing other bitflags in parallel to us.
		2308	*/
		2309	if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
		2310	BUG();
		2311	spin_lock(&mapping->i_mmap_lock);
		2312	}
		2313	}
		2314
		2315	/*
		2316	* This operation locks against the VM for all pte/vma/mm related
		2317	* operations that could ever happen on a certain mm. This includes
		2318	* vmtruncate, try_to_unmap, and all page faults.
		2319	*
		2320	* The caller must take the mmap_sem in write mode before calling
		2321	* mm_take_all_locks(). The caller isn't allowed to release the
		2322	* mmap_sem until mm_drop_all_locks() returns.
		2323	*
		2324	* mmap_sem in write mode is required in order to block all operations
		2325	* that could modify pagetables and free pages without need of
		2326	* altering the vma layout (for example populate_range() with
		2327	* nonlinear vmas). It's also needed in write mode to avoid new
		2328	* anon_vmas to be associated with existing vmas.
		2329	*
		2330	* A single task can't take more than one mm_take_all_locks() in a row
		2331	* or it would deadlock.
		2332	*
		2333	* The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
		2334	* mapping->flags avoid to take the same lock twice, if more than one
		2335	* vma in this mm is backed by the same anon_vma or address_space.
		2336	*
		2337	* We can take all the locks in random order because the VM code
		2338	* taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
		2339	* takes more than one of them in a row. Secondly we're protected
		2340	* against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
		2341	*
		2342	* mm_take_all_locks() and mm_drop_all_locks are expensive operations
		2343	* that may have to take thousand of locks.
		2344	*
		2345	* mm_take_all_locks() can fail if it's interrupted by signals.
		2346	*/
		2347	int mm_take_all_locks(struct mm_struct *mm)
		2348	{
		2349	struct vm_area_struct *vma;
		2350	int ret = -EINTR;
		2351
		2352	BUG_ON(down_read_trylock(&mm->mmap_sem));
		2353
		2354	mutex_lock(&mm_all_locks_mutex);
		2355
		2356	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		2357	if (signal_pending(current))
		2358	goto out_unlock;
		2359	if (vma->anon_vma)
		2360	vm_lock_anon_vma(vma->anon_vma);
		2361	if (vma->vm_file && vma->vm_file->f_mapping)
		2362	vm_lock_mapping(vma->vm_file->f_mapping);
		2363	}
		2364	ret = 0;
		2365
		2366	out_unlock:
		2367	if (ret)
		2368	mm_drop_all_locks(mm);
		2369
		2370	return ret;
		2371	}
		2372
		2373	static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
		2374	{
		2375	if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
		2376	/*
		2377	* The LSB of head.next can't change to 0 from under
		2378	* us because we hold the mm_all_locks_mutex.
		2379	*
		2380	* We must however clear the bitflag before unlocking
		2381	* the vma so the users using the anon_vma->head will
		2382	* never see our bitflag.
		2383	*
		2384	* No need of atomic instructions here, head.next
		2385	* can't change from under us until we release the
		2386	* anon_vma->lock.
		2387	*/
		2388	if (!__test_and_clear_bit(0, (unsigned long *)
		2389	&anon_vma->head.next))
		2390	BUG();
		2391	spin_unlock(&anon_vma->lock);
		2392	}
		2393	}
		2394
		2395	static void vm_unlock_mapping(struct address_space *mapping)
		2396	{
		2397	if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
		2398	/*
		2399	* AS_MM_ALL_LOCKS can't change to 0 from under us
		2400	* because we hold the mm_all_locks_mutex.
		2401	*/
		2402	spin_unlock(&mapping->i_mmap_lock);
		2403	if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
		2404	&mapping->flags))
		2405	BUG();
		2406	}
		2407	}
		2408
		2409	/*
		2410	* The mmap_sem cannot be released by the caller until
		2411	* mm_drop_all_locks() returns.
		2412	*/
		2413	void mm_drop_all_locks(struct mm_struct *mm)
		2414	{
		2415	struct vm_area_struct *vma;
		2416
		2417	BUG_ON(down_read_trylock(&mm->mmap_sem));
		2418	BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
		2419
		2420	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		2421	if (vma->anon_vma)
		2422	vm_unlock_anon_vma(vma->anon_vma);
		2423	if (vma->vm_file && vma->vm_file->f_mapping)
		2424	vm_unlock_mapping(vma->vm_file->f_mapping);
		2425	}
		2426
		2427	mutex_unlock(&mm_all_locks_mutex);
		2428	}