diff options
author | Michel Lespinasse <walken@google.com> | 2012-10-08 19:31:50 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-09 03:22:42 -0400 |
commit | 38a76013ad809beb0b52f60d365c960d035bd83c (patch) | |
tree | c63ba707ab17dd1ff1e90650faf74570daa3cf9f /mm | |
parent | 523d4e2008fd4a68b1a164e63e8c75b7b20f07e0 (diff) |
mm: avoid taking rmap locks in move_ptes()
During mremap(), the destination vma is generally placed after the
original vma in rmap traversal order: in move_vma(), we always have
new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >=
vma->vm_pgoff unless vma_merge() merged the new vma with an adjacent one.
When the destination vma is placed after the original in rmap traversal
order, we can avoid taking the rmap locks in move_ptes().
Essentially, this reintroduces the optimization that had been disabled in
"mm anon rmap: remove anon_vma_moveto_tail". The difference is that we
don't try to impose the rmap traversal order; instead we just rely on
things being in the desired order in the common case and fall back to
taking locks in the uncommon case. Also, we skip taking the i_mmap_mutex
in addition to the anon_vma lock: in both cases, the vmas are traversed in
increasing vm_pgoff order, with ties resolved in tree insertion order.
Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/mmap.c | 7 | ||||
-rw-r--r-- | mm/mremap.c | 57 |
2 files changed, 44 insertions, 20 deletions
@@ -2371,7 +2371,8 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | |||
2371 | * prior to moving page table entries, to effect an mremap move. | 2371 | * prior to moving page table entries, to effect an mremap move. |
2372 | */ | 2372 | */ |
2373 | struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | 2373 | struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, |
2374 | unsigned long addr, unsigned long len, pgoff_t pgoff) | 2374 | unsigned long addr, unsigned long len, pgoff_t pgoff, |
2375 | bool *need_rmap_locks) | ||
2375 | { | 2376 | { |
2376 | struct vm_area_struct *vma = *vmap; | 2377 | struct vm_area_struct *vma = *vmap; |
2377 | unsigned long vma_start = vma->vm_start; | 2378 | unsigned long vma_start = vma->vm_start; |
@@ -2413,8 +2414,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2413 | * linear if there are no pages mapped yet. | 2414 | * linear if there are no pages mapped yet. |
2414 | */ | 2415 | */ |
2415 | VM_BUG_ON(faulted_in_anon_vma); | 2416 | VM_BUG_ON(faulted_in_anon_vma); |
2416 | *vmap = new_vma; | 2417 | *vmap = vma = new_vma; |
2417 | } | 2418 | } |
2419 | *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); | ||
2418 | } else { | 2420 | } else { |
2419 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 2421 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2420 | if (new_vma) { | 2422 | if (new_vma) { |
@@ -2434,6 +2436,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2434 | if (new_vma->vm_ops && new_vma->vm_ops->open) | 2436 | if (new_vma->vm_ops && new_vma->vm_ops->open) |
2435 | new_vma->vm_ops->open(new_vma); | 2437 | new_vma->vm_ops->open(new_vma); |
2436 | vma_link(mm, new_vma, prev, rb_link, rb_parent); | 2438 | vma_link(mm, new_vma, prev, rb_link, rb_parent); |
2439 | *need_rmap_locks = false; | ||
2437 | } | 2440 | } |
2438 | } | 2441 | } |
2439 | return new_vma; | 2442 | return new_vma; |
diff --git a/mm/mremap.c b/mm/mremap.c index 5588bb6e9295..3b639a4b26bd 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -71,26 +71,42 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | |||
71 | static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | 71 | static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, |
72 | unsigned long old_addr, unsigned long old_end, | 72 | unsigned long old_addr, unsigned long old_end, |
73 | struct vm_area_struct *new_vma, pmd_t *new_pmd, | 73 | struct vm_area_struct *new_vma, pmd_t *new_pmd, |
74 | unsigned long new_addr) | 74 | unsigned long new_addr, bool need_rmap_locks) |
75 | { | 75 | { |
76 | struct address_space *mapping = NULL; | 76 | struct address_space *mapping = NULL; |
77 | struct anon_vma *anon_vma = vma->anon_vma; | 77 | struct anon_vma *anon_vma = NULL; |
78 | struct mm_struct *mm = vma->vm_mm; | 78 | struct mm_struct *mm = vma->vm_mm; |
79 | pte_t *old_pte, *new_pte, pte; | 79 | pte_t *old_pte, *new_pte, pte; |
80 | spinlock_t *old_ptl, *new_ptl; | 80 | spinlock_t *old_ptl, *new_ptl; |
81 | 81 | ||
82 | if (vma->vm_file) { | 82 | /* |
83 | /* | 83 | * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma |
84 | * Subtle point from Rajesh Venkatasubramanian: before | 84 | * locks to ensure that rmap will always observe either the old or the |
85 | * moving file-based ptes, we must lock truncate_pagecache | 85 | * new ptes. This is the easiest way to avoid races with |
86 | * out, since it might clean the dst vma before the src vma, | 86 | * truncate_pagecache(), page migration, etc... |
87 | * and we propagate stale pages into the dst afterward. | 87 | * |
88 | */ | 88 | * When need_rmap_locks is false, we use other ways to avoid |
89 | mapping = vma->vm_file->f_mapping; | 89 | * such races: |
90 | mutex_lock(&mapping->i_mmap_mutex); | 90 | * |
91 | * - During exec() shift_arg_pages(), we use a specially tagged vma | ||
92 | * which rmap call sites look for using is_vma_temporary_stack(). | ||
93 | * | ||
94 | * - During mremap(), new_vma is often known to be placed after vma | ||
95 | * in rmap traversal order. This ensures rmap will always observe | ||
96 | * either the old pte, or the new pte, or both (the page table locks | ||
97 | * serialize access to individual ptes, but only rmap traversal | ||
98 | * order guarantees that we won't miss both the old and new ptes). | ||
99 | */ | ||
100 | if (need_rmap_locks) { | ||
101 | if (vma->vm_file) { | ||
102 | mapping = vma->vm_file->f_mapping; | ||
103 | mutex_lock(&mapping->i_mmap_mutex); | ||
104 | } | ||
105 | if (vma->anon_vma) { | ||
106 | anon_vma = vma->anon_vma; | ||
107 | anon_vma_lock(anon_vma); | ||
108 | } | ||
91 | } | 109 | } |
92 | if (anon_vma) | ||
93 | anon_vma_lock(anon_vma); | ||
94 | 110 | ||
95 | /* | 111 | /* |
96 | * We don't have to worry about the ordering of src and dst | 112 | * We don't have to worry about the ordering of src and dst |
@@ -127,7 +143,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
127 | 143 | ||
128 | unsigned long move_page_tables(struct vm_area_struct *vma, | 144 | unsigned long move_page_tables(struct vm_area_struct *vma, |
129 | unsigned long old_addr, struct vm_area_struct *new_vma, | 145 | unsigned long old_addr, struct vm_area_struct *new_vma, |
130 | unsigned long new_addr, unsigned long len) | 146 | unsigned long new_addr, unsigned long len, |
147 | bool need_rmap_locks) | ||
131 | { | 148 | { |
132 | unsigned long extent, next, old_end; | 149 | unsigned long extent, next, old_end; |
133 | pmd_t *old_pmd, *new_pmd; | 150 | pmd_t *old_pmd, *new_pmd; |
@@ -174,7 +191,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
174 | if (extent > LATENCY_LIMIT) | 191 | if (extent > LATENCY_LIMIT) |
175 | extent = LATENCY_LIMIT; | 192 | extent = LATENCY_LIMIT; |
176 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, | 193 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, |
177 | new_vma, new_pmd, new_addr); | 194 | new_vma, new_pmd, new_addr, need_rmap_locks); |
178 | need_flush = true; | 195 | need_flush = true; |
179 | } | 196 | } |
180 | if (likely(need_flush)) | 197 | if (likely(need_flush)) |
@@ -198,6 +215,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
198 | unsigned long hiwater_vm; | 215 | unsigned long hiwater_vm; |
199 | int split = 0; | 216 | int split = 0; |
200 | int err; | 217 | int err; |
218 | bool need_rmap_locks; | ||
201 | 219 | ||
202 | /* | 220 | /* |
203 | * We'd prefer to avoid failure later on in do_munmap: | 221 | * We'd prefer to avoid failure later on in do_munmap: |
@@ -219,18 +237,21 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
219 | return err; | 237 | return err; |
220 | 238 | ||
221 | new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); | 239 | new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); |
222 | new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); | 240 | new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, |
241 | &need_rmap_locks); | ||
223 | if (!new_vma) | 242 | if (!new_vma) |
224 | return -ENOMEM; | 243 | return -ENOMEM; |
225 | 244 | ||
226 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); | 245 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, |
246 | need_rmap_locks); | ||
227 | if (moved_len < old_len) { | 247 | if (moved_len < old_len) { |
228 | /* | 248 | /* |
229 | * On error, move entries back from new area to old, | 249 | * On error, move entries back from new area to old, |
230 | * which will succeed since page tables still there, | 250 | * which will succeed since page tables still there, |
231 | * and then proceed to unmap new area instead of old. | 251 | * and then proceed to unmap new area instead of old. |
232 | */ | 252 | */ |
233 | move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); | 253 | move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, |
254 | true); | ||
234 | vma = new_vma; | 255 | vma = new_vma; |
235 | old_len = new_len; | 256 | old_len = new_len; |
236 | old_addr = new_addr; | 257 | old_addr = new_addr; |