author     Linus Torvalds <torvalds@linux-foundation.org>   2012-12-16 17:33:25 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-12-16 18:18:08 -0500
commit     3d59eebc5e137bd89c6351e4c70e90ba1d0dc234 (patch)
tree       b4ddfd0b057454a7437a3b4e3074a3b8b4b03817 /mm/rmap.c
parent     11520e5e7c1855fc3bf202bb3be35a39d9efa034 (diff)
parent     4fc3f1d66b1ef0d7b8dc11f4ff1cc510f78b37d6 (diff)
Merge tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma
Pull Automatic NUMA Balancing bare-bones from Mel Gorman:
 "There are three implementations for NUMA balancing: this tree
  (balancenuma), numacore which has been developed in tip/master, and
  autonuma which is in aa.git. In almost all respects balancenuma is the
  dumbest of the three because its main impact is on the VM side with no
  attempt to be smart about scheduling. In the interest of getting the
  ball rolling, it would be desirable to see this much merged for 3.8
  with the view to building scheduler smarts on top and adapting the VM
  where required for 3.9.

  The most recent set of comparisons available from different people are

    mel:    https://lkml.org/lkml/2012/12/9/108
    mingo:  https://lkml.org/lkml/2012/12/7/331
    tglx:   https://lkml.org/lkml/2012/12/10/437
    srikar: https://lkml.org/lkml/2012/12/10/397

  The results are a mixed bag. In my own tests, balancenuma does
  reasonably well. It's dumb as rocks and does not regress against
  mainline. On the other hand, Ingo's tests show that balancenuma is
  incapable of converging for these workloads driven by perf, which is
  bad but is potentially explained by the lack of scheduler smarts.
  Thomas' results show balancenuma improves on mainline but falls far
  short of numacore or autonuma. Srikar's results indicate we all suffer
  on a large machine with imbalanced node sizes.

  My own testing showed that recent numacore results have improved
  dramatically, particularly in the last week, but not universally.
  We've butted heads heavily on system CPU usage and high levels of
  migration even when it shows that overall performance is better.
  There are also cases where it regresses. Of interest is that for
  specjbb in some configurations it will regress for lower numbers of
  warehouses and show gains for higher numbers, which is not reported by
  the tool by default and sometimes missed in reports. Recently I
  reported for numacore that the JVM was crashing with
  NullPointerExceptions, but currently it's unclear what the source of
  this problem is. Initially I thought it was in how numacore
  batch-handles PTEs, but I no longer think this is the case. It's
  possible numacore is just able to trigger it due to higher rates of
  migration.

  These reports were quite late in the cycle so I/we would like to start
  with this tree as it contains much of the code we can agree on and has
  not changed significantly over the last 2-3 weeks."

* tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma: (50 commits)
  mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable
  mm/rmap: Convert the struct anon_vma::mutex to an rwsem
  mm: migrate: Account a transhuge page properly when rate limiting
  mm: numa: Account for failed allocations and isolations as migration failures
  mm: numa: Add THP migration for the NUMA working set scanning fault case build fix
  mm: numa: Add THP migration for the NUMA working set scanning fault case.
  mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node
  mm: sched: numa: Control enabling and disabling of NUMA balancing if !SCHED_DEBUG
  mm: sched: numa: Control enabling and disabling of NUMA balancing
  mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate
  mm: numa: Use a two-stage filter to restrict pages being migrated for unlikely task<->node relationships
  mm: numa: migrate: Set last_nid on newly allocated page
  mm: numa: split_huge_page: Transfer last_nid on tail page
  mm: numa: Introduce last_nid to the page frame
  sched: numa: Slowly increase the scanning period as NUMA faults are handled
  mm: numa: Rate limit setting of pte_numa if node is saturated
  mm: numa: Rate limit the amount of memory that is migrated between nodes
  mm: numa: Structures for Migrate On Fault per NUMA migration rate limiting
  mm: numa: Migrate pages handled during a pmd_numa hinting fault
  mm: numa: Migrate on reference policy
  ...
Diffstat (limited to 'mm/rmap.c')
-rw-r--r--  mm/rmap.c  66
1 files changed, 33 insertions, 33 deletions
diff --git a/mm/rmap.c b/mm/rmap.c
index face808a489e..2c78f8cadc95 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
  *   mm->mmap_sem
  *     page->flags PG_locked (lock_page)
  *       mapping->i_mmap_mutex
- *         anon_vma->mutex
+ *         anon_vma->rwsem
  *           mm->page_table_lock or pte_lock
  *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
  *             swap_lock (in swap_duplicate, swap_info_get)
@@ -37,7 +37,7 @@
  *                           in arch-dependent flush_dcache_mmap_lock,
  *                           within bdi.wb->list_lock in __sync_single_inode)
  *
- * anon_vma->mutex,mapping->i_mutex      (memory_failure, collect_procs_anon)
+ * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
  *   ->tasklist_lock
  *     pte map lock
  */
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
         VM_BUG_ON(atomic_read(&anon_vma->refcount));

         /*
-         * Synchronize against page_lock_anon_vma() such that
+         * Synchronize against page_lock_anon_vma_read() such that
          * we can safely hold the lock without the anon_vma getting
          * freed.
          *
          * Relies on the full mb implied by the atomic_dec_and_test() from
          * put_anon_vma() against the acquire barrier implied by
-         * mutex_trylock() from page_lock_anon_vma(). This orders:
+         * down_read_trylock() from page_lock_anon_vma_read(). This orders:
          *
-         * page_lock_anon_vma()       VS      put_anon_vma()
-         *   mutex_trylock()                    atomic_dec_and_test()
+         * page_lock_anon_vma_read()  VS      put_anon_vma()
+         *   down_read_trylock()                atomic_dec_and_test()
          *   LOCK                               MB
-         *   atomic_read()                      mutex_is_locked()
+         *   atomic_read()                      rwsem_is_locked()
          *
          * LOCK should suffice since the actual taking of the lock must
          * happen _before_ what follows.
          */
-        if (mutex_is_locked(&anon_vma->root->mutex)) {
-                anon_vma_lock(anon_vma);
+        if (rwsem_is_locked(&anon_vma->root->rwsem)) {
+                anon_vma_lock_write(anon_vma);
                 anon_vma_unlock(anon_vma);
         }

@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
  * allocate a new one.
  *
  * Anon-vma allocations are very subtle, because we may have
- * optimistically looked up an anon_vma in page_lock_anon_vma()
+ * optimistically looked up an anon_vma in page_lock_anon_vma_read()
  * and that may actually touch the spinlock even in the newly
  * allocated vma (it depends on RCU to make sure that the
  * anon_vma isn't actually destroyed).
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
                 allocated = anon_vma;
         }

-        anon_vma_lock(anon_vma);
+        anon_vma_lock_write(anon_vma);
         /* page_table_lock to protect against threads */
         spin_lock(&mm->page_table_lock);
         if (likely(!vma->anon_vma)) {
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
         struct anon_vma *new_root = anon_vma->root;
         if (new_root != root) {
                 if (WARN_ON_ONCE(root))
-                        mutex_unlock(&root->mutex);
+                        up_write(&root->rwsem);
                 root = new_root;
-                mutex_lock(&root->mutex);
+                down_write(&root->rwsem);
         }
         return root;
 }
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
 static inline void unlock_anon_vma_root(struct anon_vma *root)
 {
         if (root)
-                mutex_unlock(&root->mutex);
+                up_write(&root->rwsem);
 }

 /*
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
         get_anon_vma(anon_vma->root);
         /* Mark this anon_vma as the one where our new (COWed) pages go. */
         vma->anon_vma = anon_vma;
-        anon_vma_lock(anon_vma);
+        anon_vma_lock_write(anon_vma);
         anon_vma_chain_link(vma, avc, anon_vma);
         anon_vma_unlock(anon_vma);

@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
         /*
          * Iterate the list once more, it now only contains empty and unlinked
          * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
-         * needing to acquire the anon_vma->root->mutex.
+         * needing to write-acquire the anon_vma->root->rwsem.
          */
         list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
                 struct anon_vma *anon_vma = avc->anon_vma;
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data)
 {
         struct anon_vma *anon_vma = data;

-        mutex_init(&anon_vma->mutex);
+        init_rwsem(&anon_vma->rwsem);
         atomic_set(&anon_vma->refcount, 0);
         anon_vma->rb_root = RB_ROOT;
 }
@@ -442,7 +442,7 @@ out:
  * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
  * reference like with page_get_anon_vma() and then block on the mutex.
  */
-struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma_read(struct page *page)
 {
         struct anon_vma *anon_vma = NULL;
         struct anon_vma *root_anon_vma;
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)

         anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
         root_anon_vma = ACCESS_ONCE(anon_vma->root);
-        if (mutex_trylock(&root_anon_vma->mutex)) {
+        if (down_read_trylock(&root_anon_vma->rwsem)) {
                 /*
                  * If the page is still mapped, then this anon_vma is still
                  * its anon_vma, and holding the mutex ensures that it will
                  * not go away, see anon_vma_free().
                  */
                 if (!page_mapped(page)) {
-                        mutex_unlock(&root_anon_vma->mutex);
+                        up_read(&root_anon_vma->rwsem);
                         anon_vma = NULL;
                 }
                 goto out;
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)

         /* we pinned the anon_vma, its safe to sleep */
         rcu_read_unlock();
-        anon_vma_lock(anon_vma);
+        anon_vma_lock_read(anon_vma);

         if (atomic_dec_and_test(&anon_vma->refcount)) {
                 /*
                  * Oops, we held the last refcount, release the lock
                  * and bail -- can't simply use put_anon_vma() because
-                 * we'll deadlock on the anon_vma_lock() recursion.
+                 * we'll deadlock on the anon_vma_lock_write() recursion.
                  */
-                anon_vma_unlock(anon_vma);
+                anon_vma_unlock_read(anon_vma);
                 __put_anon_vma(anon_vma);
                 anon_vma = NULL;
         }
@@ -504,9 +504,9 @@ out:
         return anon_vma;
 }

-void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
 {
-        anon_vma_unlock(anon_vma);
+        anon_vma_unlock_read(anon_vma);
 }

 /*
@@ -744,7 +744,7 @@ static int page_referenced_anon(struct page *page,
         struct anon_vma_chain *avc;
         int referenced = 0;

-        anon_vma = page_lock_anon_vma(page);
+        anon_vma = page_lock_anon_vma_read(page);
         if (!anon_vma)
                 return referenced;

@@ -766,7 +766,7 @@ static int page_referenced_anon(struct page *page,
                         break;
         }

-        page_unlock_anon_vma(anon_vma);
+        page_unlock_anon_vma_read(anon_vma);
         return referenced;
 }

@@ -1315,7 +1315,7 @@ out_mlock:
         /*
          * We need mmap_sem locking, Otherwise VM_LOCKED check makes
          * unstable result and race. Plus, We can't wait here because
-         * we now hold anon_vma->mutex or mapping->i_mmap_mutex.
+         * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
          * if trylock failed, the page remain in evictable lru and later
          * vmscan could retry to move the page to unevictable lru if the
          * page is actually mlocked.
@@ -1480,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
         struct anon_vma_chain *avc;
         int ret = SWAP_AGAIN;

-        anon_vma = page_lock_anon_vma(page);
+        anon_vma = page_lock_anon_vma_read(page);
         if (!anon_vma)
                 return ret;

@@ -1507,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
                         break;
         }

-        page_unlock_anon_vma(anon_vma);
+        page_unlock_anon_vma_read(anon_vma);
         return ret;
 }

@@ -1702,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
         int ret = SWAP_AGAIN;

         /*
-         * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
+         * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
          * because that depends on page_mapped(); but not all its usages
          * are holding mmap_sem. Users without mmap_sem are required to
          * take a reference count to prevent the anon_vma disappearing
@@ -1710,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
         anon_vma = page_anon_vma(page);
         if (!anon_vma)
                 return ret;
-        anon_vma_lock(anon_vma);
+        anon_vma_lock_read(anon_vma);
         anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
                 struct vm_area_struct *vma = avc->vma;
                 unsigned long address = vma_address(page, vma);
@@ -1718,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
                 if (ret != SWAP_AGAIN)
                         break;
         }
-        anon_vma_unlock(anon_vma);
+        anon_vma_unlock_read(anon_vma);
         return ret;
 }
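Taken together, the rmap.c hunks above swap the per-anon_vma mutex for a reader/writer semaphore: walkers such as page_referenced_anon(), try_to_unmap_anon() and rmap_walk_anon() now take the lock for reading, so several walkers can traverse the same anon_vma interval tree concurrently, while paths that modify the tree (anon_vma_prepare, anon_vma_fork, unlink_anon_vmas) still take it for writing. The snippet below is a minimal userspace sketch of that pattern, not kernel code: pthread_rwlock_t stands in for the kernel's rw_semaphore, and the anon_vma_like/walk/link_vma names are invented purely for illustration.

/* Sketch of the read-mostly locking pattern introduced by this merge.
 * Userspace analogue only: pthread_rwlock_t stands in for the kernel's
 * struct rw_semaphore; the types and helpers here are hypothetical.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>

struct anon_vma_like {
        pthread_rwlock_t rwsem;   /* was a plain mutex before this series */
        int nr_vmas;              /* stand-in for the rb_root interval tree */
};

/* Reverse-map walk: many of these may now run concurrently. */
static void walk(struct anon_vma_like *av)
{
        pthread_rwlock_rdlock(&av->rwsem);      /* ~ anon_vma_lock_read() */
        printf("walking %d vmas\n", av->nr_vmas);
        pthread_rwlock_unlock(&av->rwsem);      /* ~ anon_vma_unlock_read() */
}

/* Tree modification: still fully exclusive. */
static void link_vma(struct anon_vma_like *av)
{
        pthread_rwlock_wrlock(&av->rwsem);      /* ~ anon_vma_lock_write() */
        av->nr_vmas++;
        pthread_rwlock_unlock(&av->rwsem);
}

int main(void)
{
        struct anon_vma_like av = { .nr_vmas = 0 };

        pthread_rwlock_init(&av.rwsem, NULL);
        link_vma(&av);   /* writer excludes both readers and writers */
        walk(&av);       /* readers only exclude writers, not each other */
        walk(&av);
        pthread_rwlock_destroy(&av.rwsem);
        return 0;
}

The trylock fast path in page_lock_anon_vma_read() follows the same idea: down_read_trylock() replaces mutex_trylock(), and only if that fails does the walker take a reference and sleep on the semaphore.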