diff options
author | Michel Lespinasse <walken@google.com> | 2012-10-08 19:31:39 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-09 03:22:41 -0400 |
commit | bf181b9f9d8dfbba58b23441ad60d0bc33806d64 (patch) | |
tree | 7ad0caaf8998f31c5d910dcbb768f5a1d381b5f4 /mm/rmap.c | |
parent | 108d6642ad81bb1d62b401490a334d2c12397517 (diff) |
mm anon rmap: replace same_anon_vma linked list with an interval tree.
When a large VMA (anon or private file mapping) is first touched, which
will populate its anon_vma field, and then split into many regions through
the use of mprotect(), the original anon_vma ends up linking all of the
vmas on a linked list. This can cause rmap to become inefficient, as we
have to walk potentially thousands of irrelevent vmas before finding the
one a given anon page might fall into.
By replacing the same_anon_vma linked list with an interval tree (where
each avc's interval is determined by its vma's start and last pgoffs), we
can make rmap efficient for this use case again.
While the change is large, all of its pieces are fairly simple.
Most places that were walking the same_anon_vma list were looking for a
known pgoff, so they can just use the anon_vma_interval_tree_foreach()
interval tree iterator instead. The exception here is ksm, where the
page's index is not known. It would probably be possible to rework ksm so
that the index would be known, but for now I have decided to keep things
simple and just walk the entirety of the interval tree there.
When updating vma's that already have an anon_vma assigned, we must take
care to re-index the corresponding avc's on their interval tree. This is
done through the use of anon_vma_interval_tree_pre_update_vma() and
anon_vma_interval_tree_post_update_vma(), which remove the avc's from
their interval tree before the update and re-insert them after the update.
The anon_vma stays locked during the update, so there is no chance that
rmap would miss the vmas that are being updated.
Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/rmap.c')
-rw-r--r-- | mm/rmap.c | 24 |
1 files changed, 12 insertions, 12 deletions
@@ -127,12 +127,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
127 | avc->vma = vma; | 127 | avc->vma = vma; |
128 | avc->anon_vma = anon_vma; | 128 | avc->anon_vma = anon_vma; |
129 | list_add(&avc->same_vma, &vma->anon_vma_chain); | 129 | list_add(&avc->same_vma, &vma->anon_vma_chain); |
130 | 130 | anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); | |
131 | /* | ||
132 | * It's critical to add new vmas to the tail of the anon_vma, | ||
133 | * see comment in huge_memory.c:__split_huge_page(). | ||
134 | */ | ||
135 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
136 | } | 131 | } |
137 | 132 | ||
138 | /** | 133 | /** |
@@ -336,13 +331,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma) | |||
336 | struct anon_vma *anon_vma = avc->anon_vma; | 331 | struct anon_vma *anon_vma = avc->anon_vma; |
337 | 332 | ||
338 | root = lock_anon_vma_root(root, anon_vma); | 333 | root = lock_anon_vma_root(root, anon_vma); |
339 | list_del(&avc->same_anon_vma); | 334 | anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); |
340 | 335 | ||
341 | /* | 336 | /* |
342 | * Leave empty anon_vmas on the list - we'll need | 337 | * Leave empty anon_vmas on the list - we'll need |
343 | * to free them outside the lock. | 338 | * to free them outside the lock. |
344 | */ | 339 | */ |
345 | if (list_empty(&anon_vma->head)) | 340 | if (RB_EMPTY_ROOT(&anon_vma->rb_root)) |
346 | continue; | 341 | continue; |
347 | 342 | ||
348 | list_del(&avc->same_vma); | 343 | list_del(&avc->same_vma); |
@@ -371,7 +366,7 @@ static void anon_vma_ctor(void *data) | |||
371 | 366 | ||
372 | mutex_init(&anon_vma->mutex); | 367 | mutex_init(&anon_vma->mutex); |
373 | atomic_set(&anon_vma->refcount, 0); | 368 | atomic_set(&anon_vma->refcount, 0); |
374 | INIT_LIST_HEAD(&anon_vma->head); | 369 | anon_vma->rb_root = RB_ROOT; |
375 | } | 370 | } |
376 | 371 | ||
377 | void __init anon_vma_init(void) | 372 | void __init anon_vma_init(void) |
@@ -724,6 +719,7 @@ static int page_referenced_anon(struct page *page, | |||
724 | { | 719 | { |
725 | unsigned int mapcount; | 720 | unsigned int mapcount; |
726 | struct anon_vma *anon_vma; | 721 | struct anon_vma *anon_vma; |
722 | pgoff_t pgoff; | ||
727 | struct anon_vma_chain *avc; | 723 | struct anon_vma_chain *avc; |
728 | int referenced = 0; | 724 | int referenced = 0; |
729 | 725 | ||
@@ -732,7 +728,8 @@ static int page_referenced_anon(struct page *page, | |||
732 | return referenced; | 728 | return referenced; |
733 | 729 | ||
734 | mapcount = page_mapcount(page); | 730 | mapcount = page_mapcount(page); |
735 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 731 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
732 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
736 | struct vm_area_struct *vma = avc->vma; | 733 | struct vm_area_struct *vma = avc->vma; |
737 | unsigned long address = vma_address(page, vma); | 734 | unsigned long address = vma_address(page, vma); |
738 | if (address == -EFAULT) | 735 | if (address == -EFAULT) |
@@ -1445,6 +1442,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma) | |||
1445 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | 1442 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) |
1446 | { | 1443 | { |
1447 | struct anon_vma *anon_vma; | 1444 | struct anon_vma *anon_vma; |
1445 | pgoff_t pgoff; | ||
1448 | struct anon_vma_chain *avc; | 1446 | struct anon_vma_chain *avc; |
1449 | int ret = SWAP_AGAIN; | 1447 | int ret = SWAP_AGAIN; |
1450 | 1448 | ||
@@ -1452,7 +1450,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1452 | if (!anon_vma) | 1450 | if (!anon_vma) |
1453 | return ret; | 1451 | return ret; |
1454 | 1452 | ||
1455 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1453 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
1454 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
1456 | struct vm_area_struct *vma = avc->vma; | 1455 | struct vm_area_struct *vma = avc->vma; |
1457 | unsigned long address; | 1456 | unsigned long address; |
1458 | 1457 | ||
@@ -1668,6 +1667,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1668 | struct vm_area_struct *, unsigned long, void *), void *arg) | 1667 | struct vm_area_struct *, unsigned long, void *), void *arg) |
1669 | { | 1668 | { |
1670 | struct anon_vma *anon_vma; | 1669 | struct anon_vma *anon_vma; |
1670 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1671 | struct anon_vma_chain *avc; | 1671 | struct anon_vma_chain *avc; |
1672 | int ret = SWAP_AGAIN; | 1672 | int ret = SWAP_AGAIN; |
1673 | 1673 | ||
@@ -1681,7 +1681,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1681 | if (!anon_vma) | 1681 | if (!anon_vma) |
1682 | return ret; | 1682 | return ret; |
1683 | anon_vma_lock(anon_vma); | 1683 | anon_vma_lock(anon_vma); |
1684 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1684 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1685 | struct vm_area_struct *vma = avc->vma; | 1685 | struct vm_area_struct *vma = avc->vma; |
1686 | unsigned long address = vma_address(page, vma); | 1686 | unsigned long address = vma_address(page, vma); |
1687 | if (address == -EFAULT) | 1687 | if (address == -EFAULT) |