author     Michel Lespinasse <walken@google.com>           2012-10-08 19:31:39 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-10-09 03:22:41 -0400
commit     bf181b9f9d8dfbba58b23441ad60d0bc33806d64 (patch)
tree       7ad0caaf8998f31c5d910dcbb768f5a1d381b5f4 /mm/mmap.c
parent     108d6642ad81bb1d62b401490a334d2c12397517 (diff)
mm anon rmap: replace same_anon_vma linked list with an interval tree.
When a large VMA (anon or private file mapping) is first touched, which will populate its anon_vma field, and then split into many regions through the use of mprotect(), the original anon_vma ends up linking all of the vmas on a linked list.  This can cause rmap to become inefficient, as we have to walk potentially thousands of irrelevant vmas before finding the one a given anon page might fall into.

By replacing the same_anon_vma linked list with an interval tree (where each avc's interval is determined by its vma's start and last pgoffs), we can make rmap efficient for this use case again.

While the change is large, all of its pieces are fairly simple.

Most places that were walking the same_anon_vma list were looking for a known pgoff, so they can just use the anon_vma_interval_tree_foreach() interval tree iterator instead.  The exception here is ksm, where the page's index is not known.  It would probably be possible to rework ksm so that the index would be known, but for now I have decided to keep things simple and just walk the entirety of the interval tree there.

When updating vmas that already have an anon_vma assigned, we must take care to re-index the corresponding avcs in their interval tree.  This is done through the use of anon_vma_interval_tree_pre_update_vma() and anon_vma_interval_tree_post_update_vma(), which remove the avcs from their interval tree before the update and re-insert them after the update.  The anon_vma stays locked during the update, so there is no chance that rmap would miss the vmas that are being updated.

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
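As a rough illustration of the rmap-side pattern the message describes: where a walker used to scan every vma on the anon_vma's same_anon_vma list, it can now visit only the avcs whose interval covers the page's pgoff.  The sketch below is hypothetical, not code from this patch: example_rmap_walk() is an invented caller, and it assumes the iterator takes the tree root plus an inclusive [start, last] pgoff range, matching the insert/remove helpers added in the diff below.

/*
 * Hypothetical sketch (not from this patch): walk only the vmas whose
 * interval overlaps the page's pgoff, instead of every vma linked to
 * the anon_vma.  Assumes the appropriate anon_vma lock is already held.
 */
static void example_rmap_walk(struct anon_vma *anon_vma, pgoff_t pgoff)
{
        struct anon_vma_chain *avc;

        anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
                struct vm_area_struct *vma = avc->vma;

                /* per-vma rmap work (e.g. computing the page's address
                 * within this vma) would go here */
        }
}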
Diffstat (limited to 'mm/mmap.c')
-rw-r--r--  mm/mmap.c  73
1 file changed, 55 insertions, 18 deletions
diff --git a/mm/mmap.c b/mm/mmap.c
index 66984aab7915..2e580ed79211 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -353,6 +353,38 @@ void validate_mm(struct mm_struct *mm)
 #define validate_mm(mm) do { } while (0)
 #endif
 
+/*
+ * vma has some anon_vma assigned, and is already inserted on that
+ * anon_vma's interval trees.
+ *
+ * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
+ * vma must be removed from the anon_vma's interval trees using
+ * anon_vma_interval_tree_pre_update_vma().
+ *
+ * After the update, the vma will be reinserted using
+ * anon_vma_interval_tree_post_update_vma().
+ *
+ * The entire update must be protected by exclusive mmap_sem and by
+ * the root anon_vma's mutex.
+ */
+static inline void
+anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
+{
+        struct anon_vma_chain *avc;
+
+        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+                anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
+}
+
+static inline void
+anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
+{
+        struct anon_vma_chain *avc;
+
+        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+                anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
+}
+
 static int find_vma_links(struct mm_struct *mm, unsigned long addr,
                           unsigned long end, struct vm_area_struct **pprev,
                           struct rb_node ***rb_link, struct rb_node **rb_parent)
@@ -565,20 +597,17 @@ again: remove_next = 1 + (end > next->vm_end);
 
         vma_adjust_trans_huge(vma, start, end, adjust_next);
 
-        /*
-         * When changing only vma->vm_end, we don't really need anon_vma
-         * lock. This is a fairly rare case by itself, but the anon_vma
-         * lock may be shared between many sibling processes. Skipping
-         * the lock for brk adjustments makes a difference sometimes.
-         */
-        if (vma->anon_vma && (importer || start != vma->vm_start)) {
-                anon_vma = vma->anon_vma;
+        anon_vma = vma->anon_vma;
+        if (!anon_vma && adjust_next)
+                anon_vma = next->anon_vma;
+        if (anon_vma) {
                 VM_BUG_ON(adjust_next && next->anon_vma &&
                           anon_vma != next->anon_vma);
-        } else if (adjust_next && next->anon_vma)
-                anon_vma = next->anon_vma;
-        if (anon_vma)
                 anon_vma_lock(anon_vma);
+                anon_vma_interval_tree_pre_update_vma(vma);
+                if (adjust_next)
+                        anon_vma_interval_tree_pre_update_vma(next);
+        }
 
         if (root) {
                 flush_dcache_mmap_lock(mapping);
@@ -619,8 +648,12 @@ again: remove_next = 1 + (end > next->vm_end);
                 __insert_vm_struct(mm, insert);
         }
 
-        if (anon_vma)
+        if (anon_vma) {
+                anon_vma_interval_tree_post_update_vma(vma);
+                if (adjust_next)
+                        anon_vma_interval_tree_post_update_vma(next);
                 anon_vma_unlock(anon_vma);
+        }
         if (mapping)
                 mutex_unlock(&mapping->i_mmap_mutex);
 
@@ -1748,7 +1781,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
         if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
                 error = acct_stack_growth(vma, size, grow);
                 if (!error) {
+                        anon_vma_interval_tree_pre_update_vma(vma);
                         vma->vm_end = address;
+                        anon_vma_interval_tree_post_update_vma(vma);
                         perf_event_mmap(vma);
                 }
         }
@@ -1798,8 +1833,10 @@ int expand_downwards(struct vm_area_struct *vma,
         if (grow <= vma->vm_pgoff) {
                 error = acct_stack_growth(vma, size, grow);
                 if (!error) {
+                        anon_vma_interval_tree_pre_update_vma(vma);
                         vma->vm_start = address;
                         vma->vm_pgoff -= grow;
+                        anon_vma_interval_tree_post_update_vma(vma);
                         perf_event_mmap(vma);
                 }
         }
@@ -2515,7 +2552,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
 
 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
 {
-        if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
+        if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
                 /*
                  * The LSB of head.next can't change from under us
                  * because we hold the mm_all_locks_mutex.
@@ -2531,7 +2568,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
                  * anon_vma->root->mutex.
                  */
                 if (__test_and_set_bit(0, (unsigned long *)
-                                       &anon_vma->root->head.next))
+                                       &anon_vma->root->rb_root.rb_node))
                         BUG();
         }
 }
@@ -2572,7 +2609,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  * A single task can't take more than one mm_take_all_locks() in a row
  * or it would deadlock.
  *
- * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
+ * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
  * mapping->flags avoid to take the same lock twice, if more than one
  * vma in this mm is backed by the same anon_vma or address_space.
  *
@@ -2619,13 +2656,13 @@ out_unlock:
 
 static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
 {
-        if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
+        if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
                 /*
                  * The LSB of head.next can't change to 0 from under
                  * us because we hold the mm_all_locks_mutex.
                  *
                  * We must however clear the bitflag before unlocking
-                 * the vma so the users using the anon_vma->head will
+                 * the vma so the users using the anon_vma->rb_root will
                  * never see our bitflag.
                  *
                  * No need of atomic instructions here, head.next
@@ -2633,7 +2670,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
                  * anon_vma->root->mutex.
                  */
                 if (!__test_and_clear_bit(0, (unsigned long *)
-                                          &anon_vma->root->head.next))
+                                          &anon_vma->root->rb_root.rb_node))
                         BUG();
                 anon_vma_unlock(anon_vma);
         }