author		Andrea Arcangeli <aarcange@redhat.com>		2012-01-10 18:08:05 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-01-10 19:30:44 -0500
commit		948f017b093a9baac23855fcd920d3a970b71bb6 (patch)
tree		4536a9cbe1077133e600da2072998f5e60c696c6 /mm/mmap.c
parent		df0a6daa01fa3856c08f4274d4f21a8092caa480 (diff)
mremap: enforce rmap src/dst vma ordering in case of vma_merge() succeeding in copy_vma()
migrate was doing an rmap_walk with speculative lock-less access on
pagetables.  That could lead it to not serialize properly against mremap
PT locks.  But a second problem remains in the order of vmas in the
same_anon_vma list used by the rmap_walk.

If vma_merge succeeds in copy_vma, the src vma could be placed after the
dst vma in the same_anon_vma list.  That could still lead to migrate
missing some ptes.

This patch adds an anon_vma_moveto_tail() function to force the dst vma
at the end of the list before mremap starts, to solve the problem.

If the mremap is very large and there are lots of parents or children
sharing the anon_vma root lock, this should still scale better than
taking the anon_vma root lock around every pte copy for practically the
whole duration of mremap.

Update: Hugh noticed that special care is needed in the error path, where
move_page_tables goes in the reverse direction: a second
anon_vma_moveto_tail() call is needed there.

This program exercises anon_vma_moveto_tail():

===
int main()
{
	static struct timeval oldstamp, newstamp;
	long diffsec;
	char *p, *p2, *p3, *p4;

	if (posix_memalign((void **)&p, 2*1024*1024, SIZE))
		perror("memalign"), exit(1);
	if (posix_memalign((void **)&p2, 2*1024*1024, SIZE))
		perror("memalign"), exit(1);
	if (posix_memalign((void **)&p3, 2*1024*1024, SIZE))
		perror("memalign"), exit(1);

	memset(p, 0xff, SIZE);
	printf("%p\n", p);
	memset(p2, 0xff, SIZE);
	memset(p3, 0x77, 4096);

	if (memcmp(p, p2, SIZE))
		printf("error\n");

	p4 = mremap(p+SIZE/2, SIZE/2, SIZE/2, MREMAP_FIXED|MREMAP_MAYMOVE, p3);
	if (p4 != p3)
		perror("mremap"), exit(1);
	p4 = mremap(p4, SIZE/2, SIZE/2, MREMAP_FIXED|MREMAP_MAYMOVE, p+SIZE/2);
	if (p4 != p+SIZE/2)
		perror("mremap"), exit(1);

	if (memcmp(p, p2, SIZE))
		printf("error\n");
	printf("ok\n");

	return 0;
}
===

$ perf probe -a anon_vma_moveto_tail
Add new event:
  probe:anon_vma_moveto_tail (on anon_vma_moveto_tail)

You can now use it on all perf tools, such as:

	perf record -e probe:anon_vma_moveto_tail -aR sleep 1

$ perf record -e probe:anon_vma_moveto_tail -aR ./anon_vma_moveto_tail
0x7f2ca2800000
ok
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.043 MB perf.data (~1860 samples) ]

$ perf report --stdio
   100.00%  anon_vma_moveto  [kernel.kallsyms]  [k] anon_vma_moveto_tail

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: Nai Xia <nai.xia@gmail.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Pawel Sikora <pluto@agmk.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
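The anon_vma_moveto_tail() helper itself lands in mm/rmap.c and is therefore
not part of the mm/mmap.c diff shown below.  As a rough illustration of the
idea described above -- requeueing the destination vma's anon_vma_chain
entries at the tail of each same_anon_vma list, so rmap_walk visits the
source vma before the destination vma -- a simplified sketch against the
3.2-era anon_vma data structures could look like the following; the locking
and field names here are assumptions for illustration, not the merged code:

/*
 * Illustrative sketch only: the real implementation lives in mm/rmap.c
 * and batches the anon_vma root lock across the walk.  The fields used
 * here (anon_vma_chain, same_vma, same_anon_vma, anon_vma->head) follow
 * the pre-interval-tree layout of that era.
 */
void anon_vma_moveto_tail(struct vm_area_struct *dst)
{
	struct anon_vma_chain *avc;

	/*
	 * Walk the chains in reverse so that list_move_tail() preserves
	 * their relative order while pushing every chain of the dst vma
	 * to the tail of its anon_vma list.  Afterwards rmap_walk()
	 * reaches the src vma before the dst vma during mremap.
	 */
	list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;

		anon_vma_lock(anon_vma);
		list_move_tail(&avc->same_anon_vma, &anon_vma->head);
		anon_vma_unlock(anon_vma);
	}
}

Per Hugh's note above, the same requeueing would be needed once more on the
source vma in move_vma()'s error path, before move_page_tables() copies the
ptes back from the new area to the old one.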
Diffstat (limited to 'mm/mmap.c')
-rw-r--r--	mm/mmap.c	24
1 file changed, 21 insertions(+), 3 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index eae90af60ea6..adea3b8880e3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2322,13 +2322,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	struct vm_area_struct *new_vma, *prev;
 	struct rb_node **rb_link, *rb_parent;
 	struct mempolicy *pol;
+	bool faulted_in_anon_vma = true;
 
 	/*
 	 * If anonymous vma has not yet been faulted, update new pgoff
 	 * to match new location, to increase its chance of merging.
 	 */
-	if (!vma->vm_file && !vma->anon_vma)
+	if (unlikely(!vma->vm_file && !vma->anon_vma)) {
 		pgoff = addr >> PAGE_SHIFT;
+		faulted_in_anon_vma = false;
+	}
 
 	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
@@ -2337,9 +2340,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		/*
 		 * Source vma may have been merged into new_vma
 		 */
-		if (vma_start >= new_vma->vm_start &&
-		    vma_start < new_vma->vm_end)
+		if (unlikely(vma_start >= new_vma->vm_start &&
+			     vma_start < new_vma->vm_end)) {
+			/*
+			 * The only way we can get a vma_merge with
+			 * self during an mremap is if the vma hasn't
+			 * been faulted in yet and we were allowed to
+			 * reset the dst vma->vm_pgoff to the
+			 * destination address of the mremap to allow
+			 * the merge to happen. mremap must change the
+			 * vm_pgoff linearity between src and dst vmas
+			 * (in turn preventing a vma_merge) to be
+			 * safe. It is only safe to keep the vm_pgoff
+			 * linear if there are no pages mapped yet.
+			 */
+			VM_BUG_ON(faulted_in_anon_vma);
 			*vmap = new_vma;
+		} else
+			anon_vma_moveto_tail(new_vma);
 	} else {
 		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {