aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
author Michel Lespinasse <walken@google.com> 2012-10-08 19:31:50 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-10-09 03:22:42 -0400
commit 38a76013ad809beb0b52f60d365c960d035bd83c (patch)
tree c63ba707ab17dd1ff1e90650faf74570daa3cf9f /mm
parent 523d4e2008fd4a68b1a164e63e8c75b7b20f07e0 (diff)
mm: avoid taking rmap locks in move_ptes()
During mremap(), the destination VMA is generally placed after the original vma in rmap traversal order: in move_vma(), we always have new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >= vma->vm_pgoff unless vma_merge() merged the new vma with an adjacent one.

When the destination VMA is placed after the original in rmap traversal order, we can avoid taking the rmap locks in move_ptes().

Essentially, this reintroduces the optimization that had been disabled in "mm anon rmap: remove anon_vma_moveto_tail". The difference is that we don't try to impose the rmap traversal order; instead we just rely on things being in the desired order in the common case and fall back to taking locks in the uncommon case. Also we skip the i_mmap_mutex in addition to the anon_vma lock: in both cases, the vmas are traversed in increasing vm_pgoff order with ties resolved in tree insertion order.

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- mm/mmap.c 7
-rw-r--r-- mm/mremap.c 57
2 files changed, 44 insertions, 20 deletions
diff --git a/mm/mmap.c b/mm/mmap.c
index 81248992120d..2d942353d681 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2371,7 +2371,8 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2371 * prior to moving page table entries, to effect an mremap move. 2371 * prior to moving page table entries, to effect an mremap move.
2372 */ 2372 */
2373struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 2373struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2374 unsigned long addr, unsigned long len, pgoff_t pgoff) 2374 unsigned long addr, unsigned long len, pgoff_t pgoff,
2375 bool *need_rmap_locks)
2375{ 2376{
2376 struct vm_area_struct *vma = *vmap; 2377 struct vm_area_struct *vma = *vmap;
2377 unsigned long vma_start = vma->vm_start; 2378 unsigned long vma_start = vma->vm_start;
@@ -2413,8 +2414,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2413 * linear if there are no pages mapped yet. 2414 * linear if there are no pages mapped yet.
2414 */ 2415 */
2415 VM_BUG_ON(faulted_in_anon_vma); 2416 VM_BUG_ON(faulted_in_anon_vma);
2416 *vmap = new_vma; 2417 *vmap = vma = new_vma;
2417 } 2418 }
2419 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
2418 } else { 2420 } else {
2419 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2421 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2420 if (new_vma) { 2422 if (new_vma) {
@@ -2434,6 +2436,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2434 if (new_vma->vm_ops && new_vma->vm_ops->open) 2436 if (new_vma->vm_ops && new_vma->vm_ops->open)
2435 new_vma->vm_ops->open(new_vma); 2437 new_vma->vm_ops->open(new_vma);
2436 vma_link(mm, new_vma, prev, rb_link, rb_parent); 2438 vma_link(mm, new_vma, prev, rb_link, rb_parent);
2439 *need_rmap_locks = false;
2437 } 2440 }
2438 } 2441 }
2439 return new_vma; 2442 return new_vma;
diff --git a/mm/mremap.c b/mm/mremap.c
index 5588bb6e9295..3b639a4b26bd 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -71,26 +71,42 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
71static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, 71static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
72 unsigned long old_addr, unsigned long old_end, 72 unsigned long old_addr, unsigned long old_end,
73 struct vm_area_struct *new_vma, pmd_t *new_pmd, 73 struct vm_area_struct *new_vma, pmd_t *new_pmd,
74 unsigned long new_addr) 74 unsigned long new_addr, bool need_rmap_locks)
75{ 75{
76 struct address_space *mapping = NULL; 76 struct address_space *mapping = NULL;
77 struct anon_vma *anon_vma = vma->anon_vma; 77 struct anon_vma *anon_vma = NULL;
78 struct mm_struct *mm = vma->vm_mm; 78 struct mm_struct *mm = vma->vm_mm;
79 pte_t *old_pte, *new_pte, pte; 79 pte_t *old_pte, *new_pte, pte;
80 spinlock_t *old_ptl, *new_ptl; 80 spinlock_t *old_ptl, *new_ptl;
81 81
82 if (vma->vm_file) { 82 /*
83 /* 83 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
84 * Subtle point from Rajesh Venkatasubramanian: before 84 * locks to ensure that rmap will always observe either the old or the
85 * moving file-based ptes, we must lock truncate_pagecache 85 * new ptes. This is the easiest way to avoid races with
86 * out, since it might clean the dst vma before the src vma, 86 * truncate_pagecache(), page migration, etc...
87 * and we propagate stale pages into the dst afterward. 87 *
88 */ 88 * When need_rmap_locks is false, we use other ways to avoid
89 mapping = vma->vm_file->f_mapping; 89 * such races:
90 mutex_lock(&mapping->i_mmap_mutex); 90 *
91 * - During exec() shift_arg_pages(), we use a specially tagged vma
92 * which rmap call sites look for using is_vma_temporary_stack().
93 *
94 * - During mremap(), new_vma is often known to be placed after vma
95 * in rmap traversal order. This ensures rmap will always observe
96 * either the old pte, or the new pte, or both (the page table locks
97 * serialize access to individual ptes, but only rmap traversal
98 * order guarantees that we won't miss both the old and new ptes).
99 */
100 if (need_rmap_locks) {
101 if (vma->vm_file) {
102 mapping = vma->vm_file->f_mapping;
103 mutex_lock(&mapping->i_mmap_mutex);
104 }
105 if (vma->anon_vma) {
106 anon_vma = vma->anon_vma;
107 anon_vma_lock(anon_vma);
108 }
91 } 109 }
92 if (anon_vma)
93 anon_vma_lock(anon_vma);
94 110
95 /* 111 /*
96 * We don't have to worry about the ordering of src and dst 112 * We don't have to worry about the ordering of src and dst
@@ -127,7 +143,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
127 143
128unsigned long move_page_tables(struct vm_area_struct *vma, 144unsigned long move_page_tables(struct vm_area_struct *vma,
129 unsigned long old_addr, struct vm_area_struct *new_vma, 145 unsigned long old_addr, struct vm_area_struct *new_vma,
130 unsigned long new_addr, unsigned long len) 146 unsigned long new_addr, unsigned long len,
147 bool need_rmap_locks)
131{ 148{
132 unsigned long extent, next, old_end; 149 unsigned long extent, next, old_end;
133 pmd_t *old_pmd, *new_pmd; 150 pmd_t *old_pmd, *new_pmd;
@@ -174,7 +191,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
174 if (extent > LATENCY_LIMIT) 191 if (extent > LATENCY_LIMIT)
175 extent = LATENCY_LIMIT; 192 extent = LATENCY_LIMIT;
176 move_ptes(vma, old_pmd, old_addr, old_addr + extent, 193 move_ptes(vma, old_pmd, old_addr, old_addr + extent,
177 new_vma, new_pmd, new_addr); 194 new_vma, new_pmd, new_addr, need_rmap_locks);
178 need_flush = true; 195 need_flush = true;
179 } 196 }
180 if (likely(need_flush)) 197 if (likely(need_flush))
@@ -198,6 +215,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
198 unsigned long hiwater_vm; 215 unsigned long hiwater_vm;
199 int split = 0; 216 int split = 0;
200 int err; 217 int err;
218 bool need_rmap_locks;
201 219
202 /* 220 /*
203 * We'd prefer to avoid failure later on in do_munmap: 221 * We'd prefer to avoid failure later on in do_munmap:
@@ -219,18 +237,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
219 return err; 237 return err;
220 238
221 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); 239 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
222 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); 240 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
241 &need_rmap_locks);
223 if (!new_vma) 242 if (!new_vma)
224 return -ENOMEM; 243 return -ENOMEM;
225 244
226 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); 245 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
246 need_rmap_locks);
227 if (moved_len < old_len) { 247 if (moved_len < old_len) {
228 /* 248 /*
229 * On error, move entries back from new area to old, 249 * On error, move entries back from new area to old,
230 * which will succeed since page tables still there, 250 * which will succeed since page tables still there,
231 * and then proceed to unmap new area instead of old. 251 * and then proceed to unmap new area instead of old.
232 */ 252 */
233 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); 253 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
254 true);
234 vma = new_vma; 255 vma = new_vma;
235 old_len = new_len; 256 old_len = new_len;
236 old_addr = new_addr; 257 old_addr = new_addr;