author		Andrea Arcangeli <andrea@qumranet.com>	2008-07-28 18:46:26 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-07-28 19:30:21 -0400
commit		7906d00cd1f687268f0a3599442d113767795ae6 (patch)
tree		63609454d164a088d7f535f826764579c0f297f6
parent		6beeac76f5f96590fb751af5e138fbc3f62e8460 (diff)
mmu-notifiers: add mm_take_all_locks() operation
mm_take_all_locks() holds off reclaim from an entire mm_struct. This allows mmu notifiers to register into the mm at any time with the guarantee that no mmu operation is in progress on the mm.

This operation locks against the VM for all pte/vma/mm related operations that could ever happen on a certain mm. This includes vmtruncate, try_to_unmap, and all page faults.

The caller must take the mmap_sem in write mode before calling mm_take_all_locks(). The caller isn't allowed to release the mmap_sem until mm_drop_all_locks() returns.

mmap_sem in write mode is required in order to block all operations that could modify pagetables and free pages without the need to alter the vma layout (for example populate_range() with nonlinear vmas). It's also needed in write mode to avoid new anon_vmas being associated with existing vmas.

A single task can't take more than one mm_take_all_locks() in a row or it would deadlock.

mm_take_all_locks() and mm_drop_all_locks() are expensive operations that may have to take thousands of locks. mm_take_all_locks() can fail if it's interrupted by signals.

When mmu_notifier_register() returns, we must be sure that the driver is notified if some task is in the middle of a vmtruncate for the 'mm' where the mmu notifier was registered (mmu_notifier_invalidate_range_start/end is run around the vmtruncation, but mmu_notifier_register() can run after mmu_notifier_invalidate_range_start and before mmu_notifier_invalidate_range_end). The same problem exists for the rmap paths.

We also have to remove page pinning to avoid replicating the tlb_gather logic inside KVM (and GRU doesn't work well with page pinning regardless of needing tlb_gather), so without mm_take_all_locks(), when vmtruncate frees a page, KVM would have no way to notice that it mapped into sptes a page that is going into the freelist, without a chance of any further mmu_notifier notification.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
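For clarity, here is a minimal sketch of the calling convention described above, as it might look in a hypothetical notifier-registration path. my_register_notifier() and my_attach() are invented names for illustration; the real mmu_notifier_register() consumer is added by a separate patch.

	/*
	 * Hypothetical caller, following the contract above: mmap_sem is
	 * held for writing across the whole take/drop sequence, and an
	 * -EINTR return from mm_take_all_locks() is propagated.
	 */
	static int my_register_notifier(struct mm_struct *mm)
	{
		int ret;

		down_write(&mm->mmap_sem);
		ret = mm_take_all_locks(mm);	/* may fail with -EINTR */
		if (ret)
			goto out;

		/* no pte/vma/mm operation can run on "mm" here */
		my_attach(mm);			/* invented registration hook */

		mm_drop_all_locks(mm);
	out:
		up_write(&mm->mmap_sem);
		return ret;
	}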
-rw-r--r--	include/linux/mm.h	3
-rw-r--r--	include/linux/pagemap.h	1
-rw-r--r--	include/linux/rmap.h	8
-rw-r--r--	mm/mmap.c		158
4 files changed, 170 insertions, 0 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6e695eaab4ce..866a3dbe5c75 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1104,6 +1104,9 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff);
 extern void exit_mmap(struct mm_struct *);
 
+extern int mm_take_all_locks(struct mm_struct *mm);
+extern void mm_drop_all_locks(struct mm_struct *mm);
+
 #ifdef CONFIG_PROC_FS
 /* From fs/proc/base.c. callers must _not_ hold the mm's exe_file_lock */
 extern void added_exe_file_vma(struct mm_struct *mm);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a81d81890422..a39b38ccdc97 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -20,6 +20,7 @@
  */
 #define AS_EIO		(__GFP_BITS_SHIFT + 0)	/* IO error on async write */
 #define AS_ENOSPC	(__GFP_BITS_SHIFT + 1)	/* ENOSPC on async write */
+#define AS_MM_ALL_LOCKS	(__GFP_BITS_SHIFT + 2)	/* under mm_take_all_locks() */
 
 static inline void mapping_set_error(struct address_space *mapping, int error)
 {
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 1383692ac5bd..69407f85e10b 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -26,6 +26,14 @@
  */
 struct anon_vma {
 	spinlock_t lock;	/* Serialize access to vma list */
+	/*
+	 * NOTE: the LSB of the head.next is set by
+	 * mm_take_all_locks() _after_ taking the above lock. So the
+	 * head must only be read/written after taking the above lock
+	 * to be sure to see a valid next pointer. The LSB bit itself
+	 * is serialized by a system wide lock only visible to
+	 * mm_take_all_locks() (mm_all_locks_mutex).
+	 */
 	struct list_head head;	/* List of private "related" vmas */
 };
 
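The NOTE above relies on a generic trick: list_head pointers are word aligned, so bit 0 of head.next is always clear and can carry an "already locked by mm_take_all_locks()" marker without growing the structure. A minimal standalone sketch of that tagging idea (plain C with invented names, not the kernel code):

	#include <assert.h>

	struct node { struct node *next; };	/* stand-in for struct list_head */

	static int node_is_marked(struct node *n)
	{
		return (int)((unsigned long)n->next & 1UL);
	}

	static void node_mark(struct node *n)
	{
		assert(!node_is_marked(n));
		n->next = (struct node *)((unsigned long)n->next | 1UL);
	}

	static void node_unmark(struct node *n)
	{
		assert(node_is_marked(n));
		n->next = (struct node *)((unsigned long)n->next & ~1UL);
	}

	int main(void)
	{
		struct node a = { &a };		/* empty circular list: next points to itself */

		node_mark(&a);			/* first vma sharing this anon_vma: lock and mark */
		assert(node_is_marked(&a));	/* another vma, same anon_vma: already marked, skip */
		node_unmark(&a);		/* drop path: clear the mark, then unlock */
		assert(a.next == &a);		/* the real next pointer is intact */
		return 0;
	}

In the patch the mark is set only while anon_vma->lock is held, and the mark itself is serialized by mm_all_locks_mutex, which is why no atomic read-modify-write is needed on head.next.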
diff --git a/mm/mmap.c b/mm/mmap.c
index 5e0cc99e9cd5..e5f9cb83d6d4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2268,3 +2268,161 @@ int install_special_mapping(struct mm_struct *mm,
 
 	return 0;
 }
+
+static DEFINE_MUTEX(mm_all_locks_mutex);
+
+static void vm_lock_anon_vma(struct anon_vma *anon_vma)
+{
+	if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+		/*
+		 * The LSB of head.next can't change from under us
+		 * because we hold the mm_all_locks_mutex.
+		 */
+		spin_lock(&anon_vma->lock);
+		/*
+		 * We can safely modify head.next after taking the
+		 * anon_vma->lock. If some other vma in this mm shares
+		 * the same anon_vma we won't take it again.
+		 *
+		 * No need of atomic instructions here, head.next
+		 * can't change from under us thanks to the
+		 * anon_vma->lock.
+		 */
+		if (__test_and_set_bit(0, (unsigned long *)
+				       &anon_vma->head.next))
+			BUG();
+	}
+}
+
+static void vm_lock_mapping(struct address_space *mapping)
+{
+	if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+		/*
+		 * AS_MM_ALL_LOCKS can't change from under us because
+		 * we hold the mm_all_locks_mutex.
+		 *
+		 * Operations on ->flags have to be atomic because
+		 * even if AS_MM_ALL_LOCKS is stable thanks to the
+		 * mm_all_locks_mutex, there may be other cpus
+		 * changing other bitflags in parallel to us.
+		 */
+		if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
+			BUG();
+		spin_lock(&mapping->i_mmap_lock);
+	}
+}
+
+/*
+ * This operation locks against the VM for all pte/vma/mm related
+ * operations that could ever happen on a certain mm. This includes
+ * vmtruncate, try_to_unmap, and all page faults.
+ *
+ * The caller must take the mmap_sem in write mode before calling
+ * mm_take_all_locks(). The caller isn't allowed to release the
+ * mmap_sem until mm_drop_all_locks() returns.
+ *
+ * mmap_sem in write mode is required in order to block all operations
+ * that could modify pagetables and free pages without need of
+ * altering the vma layout (for example populate_range() with
+ * nonlinear vmas). It's also needed in write mode to avoid new
+ * anon_vmas to be associated with existing vmas.
+ *
+ * A single task can't take more than one mm_take_all_locks() in a row
+ * or it would deadlock.
+ *
+ * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
+ * mapping->flags avoid to take the same lock twice, if more than one
+ * vma in this mm is backed by the same anon_vma or address_space.
+ *
+ * We can take all the locks in random order because the VM code
+ * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
+ * takes more than one of them in a row. Secondly we're protected
+ * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
+ *
+ * mm_take_all_locks() and mm_drop_all_locks are expensive operations
+ * that may have to take thousand of locks.
+ *
+ * mm_take_all_locks() can fail if it's interrupted by signals.
+ */
+int mm_take_all_locks(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	int ret = -EINTR;
+
+	BUG_ON(down_read_trylock(&mm->mmap_sem));
+
+	mutex_lock(&mm_all_locks_mutex);
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (signal_pending(current))
+			goto out_unlock;
+		if (vma->anon_vma)
+			vm_lock_anon_vma(vma->anon_vma);
+		if (vma->vm_file && vma->vm_file->f_mapping)
+			vm_lock_mapping(vma->vm_file->f_mapping);
+	}
+	ret = 0;
+
+out_unlock:
+	if (ret)
+		mm_drop_all_locks(mm);
+
+	return ret;
+}
+
+static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
+{
+	if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+		/*
+		 * The LSB of head.next can't change to 0 from under
+		 * us because we hold the mm_all_locks_mutex.
+		 *
+		 * We must however clear the bitflag before unlocking
+		 * the vma so the users using the anon_vma->head will
+		 * never see our bitflag.
+		 *
+		 * No need of atomic instructions here, head.next
+		 * can't change from under us until we release the
+		 * anon_vma->lock.
+		 */
+		if (!__test_and_clear_bit(0, (unsigned long *)
+					  &anon_vma->head.next))
+			BUG();
+		spin_unlock(&anon_vma->lock);
+	}
+}
+
+static void vm_unlock_mapping(struct address_space *mapping)
+{
+	if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+		/*
+		 * AS_MM_ALL_LOCKS can't change to 0 from under us
+		 * because we hold the mm_all_locks_mutex.
+		 */
+		spin_unlock(&mapping->i_mmap_lock);
+		if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
+					&mapping->flags))
+			BUG();
+	}
+}
+
+/*
+ * The mmap_sem cannot be released by the caller until
+ * mm_drop_all_locks() returns.
+ */
+void mm_drop_all_locks(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	BUG_ON(down_read_trylock(&mm->mmap_sem));
+	BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->anon_vma)
+			vm_unlock_anon_vma(vma->anon_vma);
+		if (vma->vm_file && vma->vm_file->f_mapping)
+			vm_unlock_mapping(vma->vm_file->f_mapping);
+	}
+
+	mutex_unlock(&mm_all_locks_mutex);
+}
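One detail worth calling out in the code above: when mm_take_all_locks() is interrupted, it jumps to out_unlock with only part of the vma list locked, yet mm_drop_all_locks() is still safe to call because vm_unlock_anon_vma()/vm_unlock_mapping() only touch objects whose marker bit is set. A minimal sketch of that flag-guarded cleanup idiom in plain C (invented names, with a boolean flag standing in for the real lock plus the head.next LSB / AS_MM_ALL_LOCKS marker):

	#include <stdio.h>
	#include <stdbool.h>
	#include <stddef.h>

	/* "taken" records which objects the forward pass actually locked. */
	struct obj { bool taken; };

	static int take_all(struct obj *objs, size_t n, size_t fail_at)
	{
		size_t i;

		for (i = 0; i < n; i++) {
			if (i == fail_at)		/* simulated signal_pending() */
				goto out_unlock;
			objs[i].taken = true;		/* lock the object and mark it */
		}
		return 0;

	out_unlock:
		/* Cleanup is safe on partial acquisition: only marked objects
		 * are unlocked, exactly like mm_drop_all_locks(). */
		for (i = 0; i < n; i++)
			if (objs[i].taken)
				objs[i].taken = false;	/* unlock and clear the mark */
		return -1;				/* stands in for -EINTR */
	}

	int main(void)
	{
		struct obj objs[4] = { {false} };

		printf("take_all -> %d\n", take_all(objs, 4, 2));	/* prints -1 */
		return 0;
	}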