author     Ingo Molnar <mingo@kernel.org>              2012-12-02 14:56:50 -0500
committer  Mel Gorman <mgorman@suse.de>                2012-12-11 09:43:00 -0500
commit     4fc3f1d66b1ef0d7b8dc11f4ff1cc510f78b37d6
tree       90baaa56f6b5244525c7637a14550f830486112a    /mm/rmap.c
parent     5a505085f043e8380f83610f79642853c051e2f1
mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable
rmap_walk_anon() and try_to_unmap_anon() appear to be too
careful about locking the anon vma: while they need protection
against anon vma list modifications, they do not need exclusive
access to the list itself.

Transforming this exclusive lock into a read-locked rwsem removes
a global lock from the hot path of page-migration-intensive
threaded workloads, where it can cause pathological performance
like this:
96.43% process 0 [kernel.kallsyms] [k] perf_trace_sched_switch
|
--- perf_trace_sched_switch
__schedule
schedule
schedule_preempt_disabled
__mutex_lock_common.isra.6
__mutex_lock_slowpath
mutex_lock
|
|--50.61%-- rmap_walk
| move_to_new_page
| migrate_pages
| migrate_misplaced_page
| __do_numa_page.isra.69
| handle_pte_fault
| handle_mm_fault
| __do_page_fault
| do_page_fault
| page_fault
| __memset_sse2
| |
| --100.00%-- worker_thread
| |
| --100.00%-- start_thread
|
--49.39%-- page_lock_anon_vma
try_to_unmap_anon
try_to_unmap
migrate_pages
migrate_misplaced_page
__do_numa_page.isra.69
handle_pte_fault
handle_mm_fault
__do_page_fault
do_page_fault
page_fault
__memset_sse2
|
--100.00%-- worker_thread
start_thread
With this change applied, the profile is now nicely flat
and there's no anon-vma related scheduling/blocking.

Rename anon_vma_[un]lock() => anon_vma_[un]lock_write(), to make
it clearer that it takes an exclusive write lock in that case, as
suggested by Rik van Riel.
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Turner <pjt@google.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
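
For context before the diff: the locking helpers named in the commit message are
thin wrappers around the rwsem that this series put into the anon_vma root (the
old code below already takes anon_vma->root->rwsem). Their definitions live in
include/linux/rmap.h and are therefore not part of the mm/rmap.c diff shown here;
the following is only a minimal sketch, reconstructed for illustration, of what
the write and read variants amount to:

	/* Sketch only - see include/linux/rmap.h for the real definitions. */
	static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
	{
		down_write(&anon_vma->root->rwsem);	/* exclusive: anon_vma chain/tree modification */
	}

	static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
	{
		down_read(&anon_vma->root->rwsem);	/* shared: rmap walkers may run in parallel */
	}

	static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
	{
		up_read(&anon_vma->root->rwsem);
	}

The scalability win comes from the read side: concurrent migration faults walking
the same root anon_vma no longer serialize against each other, only against
writers that actually modify the anon_vma chains.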
Diffstat (limited to 'mm/rmap.c')
-rw-r--r--   mm/rmap.c   48
1 file changed, 24 insertions(+), 24 deletions(-)
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
 	VM_BUG_ON(atomic_read(&anon_vma->refcount));
 
 	/*
-	 * Synchronize against page_lock_anon_vma() such that
+	 * Synchronize against page_lock_anon_vma_read() such that
 	 * we can safely hold the lock without the anon_vma getting
 	 * freed.
 	 *
 	 * Relies on the full mb implied by the atomic_dec_and_test() from
 	 * put_anon_vma() against the acquire barrier implied by
-	 * mutex_trylock() from page_lock_anon_vma(). This orders:
+	 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
 	 *
-	 * page_lock_anon_vma()		VS	put_anon_vma()
-	 *   mutex_trylock()			  atomic_dec_and_test()
+	 * page_lock_anon_vma_read()	VS	put_anon_vma()
+	 *   down_read_trylock()		  atomic_dec_and_test()
 	 *   LOCK				  MB
-	 *   atomic_read()			  mutex_is_locked()
+	 *   atomic_read()			  rwsem_is_locked()
 	 *
 	 * LOCK should suffice since the actual taking of the lock must
 	 * happen _before_ what follows.
 	 */
 	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
-		anon_vma_lock(anon_vma);
+		anon_vma_lock_write(anon_vma);
 		anon_vma_unlock(anon_vma);
 	}
 
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
  * allocate a new one.
  *
  * Anon-vma allocations are very subtle, because we may have
- * optimistically looked up an anon_vma in page_lock_anon_vma()
+ * optimistically looked up an anon_vma in page_lock_anon_vma_read()
  * and that may actually touch the spinlock even in the newly
  * allocated vma (it depends on RCU to make sure that the
  * anon_vma isn't actually destroyed).
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 			allocated = anon_vma;
 		}
 
-		anon_vma_lock(anon_vma);
+		anon_vma_lock_write(anon_vma);
 		/* page_table_lock to protect against threads */
 		spin_lock(&mm->page_table_lock);
 		if (likely(!vma->anon_vma)) {
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	get_anon_vma(anon_vma->root);
 	/* Mark this anon_vma as the one where our new (COWed) pages go. */
 	vma->anon_vma = anon_vma;
-	anon_vma_lock(anon_vma);
+	anon_vma_lock_write(anon_vma);
 	anon_vma_chain_link(vma, avc, anon_vma);
 	anon_vma_unlock(anon_vma);
 
@@ -442,7 +442,7 @@ out:
  * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
  * reference like with page_get_anon_vma() and then block on the mutex.
  */
-struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma_read(struct page *page)
 {
 	struct anon_vma *anon_vma = NULL;
 	struct anon_vma *root_anon_vma;
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
 
 	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
 	root_anon_vma = ACCESS_ONCE(anon_vma->root);
-	if (down_write_trylock(&root_anon_vma->rwsem)) {
+	if (down_read_trylock(&root_anon_vma->rwsem)) {
 		/*
 		 * If the page is still mapped, then this anon_vma is still
 		 * its anon_vma, and holding the mutex ensures that it will
 		 * not go away, see anon_vma_free().
 		 */
 		if (!page_mapped(page)) {
-			up_write(&root_anon_vma->rwsem);
+			up_read(&root_anon_vma->rwsem);
 			anon_vma = NULL;
 		}
 		goto out;
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
 
 	/* we pinned the anon_vma, its safe to sleep */
 	rcu_read_unlock();
-	anon_vma_lock(anon_vma);
+	anon_vma_lock_read(anon_vma);
 
 	if (atomic_dec_and_test(&anon_vma->refcount)) {
 		/*
 		 * Oops, we held the last refcount, release the lock
 		 * and bail -- can't simply use put_anon_vma() because
-		 * we'll deadlock on the anon_vma_lock() recursion.
+		 * we'll deadlock on the anon_vma_lock_write() recursion.
 		 */
-		anon_vma_unlock(anon_vma);
+		anon_vma_unlock_read(anon_vma);
 		__put_anon_vma(anon_vma);
 		anon_vma = NULL;
 	}
@@ -504,9 +504,9 @@ out:
 	return anon_vma;
 }
 
-void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
 {
-	anon_vma_unlock(anon_vma);
+	anon_vma_unlock_read(anon_vma);
 }
 
 /*
@@ -732,7 +732,7 @@ static int page_referenced_anon(struct page *page,
 	struct anon_vma_chain *avc;
 	int referenced = 0;
 
-	anon_vma = page_lock_anon_vma(page);
+	anon_vma = page_lock_anon_vma_read(page);
 	if (!anon_vma)
 		return referenced;
 
@@ -754,7 +754,7 @@ static int page_referenced_anon(struct page *page,
 			break;
 	}
 
-	page_unlock_anon_vma(anon_vma);
+	page_unlock_anon_vma_read(anon_vma);
 	return referenced;
 }
 
@@ -1474,7 +1474,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 	struct anon_vma_chain *avc;
 	int ret = SWAP_AGAIN;
 
-	anon_vma = page_lock_anon_vma(page);
+	anon_vma = page_lock_anon_vma_read(page);
 	if (!anon_vma)
 		return ret;
 
@@ -1501,7 +1501,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 			break;
 	}
 
-	page_unlock_anon_vma(anon_vma);
+	page_unlock_anon_vma_read(anon_vma);
 	return ret;
 }
 
@@ -1696,7 +1696,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 	int ret = SWAP_AGAIN;
 
 	/*
-	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
+	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
 	 * because that depends on page_mapped(); but not all its usages
 	 * are holding mmap_sem. Users without mmap_sem are required to
 	 * take a reference count to prevent the anon_vma disappearing
@@ -1704,7 +1704,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 	anon_vma = page_anon_vma(page);
 	if (!anon_vma)
 		return ret;
-	anon_vma_lock(anon_vma);
+	anon_vma_lock_read(anon_vma);
 	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
@@ -1712,7 +1712,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 		if (ret != SWAP_AGAIN)
 			break;
 	}
-	anon_vma_unlock(anon_vma);
+	anon_vma_unlock_read(anon_vma);
 	return ret;
 }
 
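
Taken together, page_referenced_anon(), try_to_unmap_anon() and (via
anon_vma_lock_read()) rmap_walk_anon() now follow the same read-side pattern.
A condensed sketch of that caller pattern, distilled from the hunks above
rather than being an additional hunk of the patch:

	anon_vma = page_lock_anon_vma_read(page);	/* shared lock on the root rwsem */
	if (!anon_vma)
		return ret;

	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
		/* per-VMA work: page_referenced_one(), try_to_unmap_one(), ... */
	}

	page_unlock_anon_vma_read(anon_vma);		/* drops the read lock */
	return ret;

Multiple threads faulting and migrating pages that share one root anon_vma can
execute this walk concurrently; only paths that modify the anon_vma chains
(e.g. anon_vma_prepare()/anon_vma_fork() above) still take the rwsem for write.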