diff options
author | Andrea Arcangeli <aarcange@redhat.com> | 2012-01-10 18:08:05 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-01-10 19:30:44 -0500 |
commit | 948f017b093a9baac23855fcd920d3a970b71bb6 (patch) | |
tree | 4536a9cbe1077133e600da2072998f5e60c696c6 /mm/mmap.c | |
parent | df0a6daa01fa3856c08f4274d4f21a8092caa480 (diff) |
mremap: enforce rmap src/dst vma ordering in case of vma_merge() succeeding in copy_vma()
migrate was doing an rmap_walk with speculative lock-less access on
pagetables. That could lead to it not serializing properly against mremap
PT locks. But a second problem remains in the order of vmas in the
same_anon_vma list used by the rmap_walk.
If vma_merge succeeds in copy_vma, the src vma could be placed after the
dst vma in the same_anon_vma list. That could still lead to migrate
missing some pte.
This patch adds an anon_vma_moveto_tail() function to force the dst vma at
the end of the list before mremap starts to solve the problem.
If the mremap is very large and there are lots of parents or children
sharing the anon_vma root lock, this should still scale better than taking
the anon_vma root lock around every pte copy practically for the whole
duration of mremap.
Update: Hugh noticed that special care is needed in the error path, where
move_page_tables goes in the reverse direction: a second
anon_vma_moveto_tail() call is needed there.
This program exercises the anon_vma_moveto_tail:
===
/*
 * Allocate SIZE bytes aligned to 2 MiB, or exit with a diagnostic.
 *
 * posix_memalign() reports failure through its return value and does not
 * set errno, so the message is built with strerror() instead of perror().
 */
static char *alloc_aligned(void)
{
	void *buf = NULL;
	int err = posix_memalign(&buf, 2*1024*1024, SIZE);

	if (err) {
		fprintf(stderr, "memalign: %s\n", strerror(err));
		exit(1);
	}
	return buf;
}

/*
 * Reproducer that exercises anon_vma_moveto_tail().
 *
 * Three 2 MiB-aligned buffers of SIZE bytes are allocated. p and p2 are
 * filled with the same pattern; the first 4096 bytes of p3 are touched.
 * The upper half of p is then mremap()ed onto p3 and moved back to its
 * original address, after which p must still compare equal to p2.
 *
 * SIZE and the required headers are expected to be provided by the
 * surrounding file (NOTE: mremap() with MREMAP_FIXED presumably needs
 * _GNU_SOURCE and <sys/mman.h> -- confirm against the full program).
 *
 * Prints the address of p, "error" on any content mismatch, and "ok" at
 * the end. Returns 0 on completion; exits with status 1 if an allocation
 * or a remap fails.
 */
int main(void)
{
	char *p, *p2, *p3, *p4;

	p = alloc_aligned();
	p2 = alloc_aligned();
	p3 = alloc_aligned();

	memset(p, 0xff, SIZE);
	printf("%p\n", (void *)p);
	memset(p2, 0xff, SIZE);
	/* Touch only the first page of the remap target. */
	memset(p3, 0x77, 4096);
	if (memcmp(p, p2, SIZE))
		printf("error\n");

	/* Move the upper half of p onto p3 ... */
	p4 = mremap(p + SIZE/2, SIZE/2, SIZE/2, MREMAP_FIXED|MREMAP_MAYMOVE, p3);
	if (p4 != p3) {
		perror("mremap");
		exit(1);
	}

	/* ... and back to where it came from. */
	p4 = mremap(p4, SIZE/2, SIZE/2, MREMAP_FIXED|MREMAP_MAYMOVE, p + SIZE/2);
	if (p4 != p + SIZE/2) {
		perror("mremap");
		exit(1);
	}

	/* The round trip must not have lost or altered any page of p. */
	if (memcmp(p, p2, SIZE))
		printf("error\n");
	printf("ok\n");
	return 0;
}
===
$ perf probe -a anon_vma_moveto_tail
Add new event:
probe:anon_vma_moveto_tail (on anon_vma_moveto_tail)
You can now use it on all perf tools, such as:
perf record -e probe:anon_vma_moveto_tail -aR sleep 1
$ perf record -e probe:anon_vma_moveto_tail -aR ./anon_vma_moveto_tail
0x7f2ca2800000
ok
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.043 MB perf.data (~1860 samples) ]
$ perf report --stdio
100.00% anon_vma_moveto [kernel.kallsyms] [k] anon_vma_moveto_tail
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: Nai Xia <nai.xia@gmail.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Pawel Sikora <pluto@agmk.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/mmap.c')
-rw-r--r-- | mm/mmap.c | 24 |
1 files changed, 21 insertions, 3 deletions
@@ -2322,13 +2322,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2322 | struct vm_area_struct *new_vma, *prev; | 2322 | struct vm_area_struct *new_vma, *prev; |
2323 | struct rb_node **rb_link, *rb_parent; | 2323 | struct rb_node **rb_link, *rb_parent; |
2324 | struct mempolicy *pol; | 2324 | struct mempolicy *pol; |
2325 | bool faulted_in_anon_vma = true; | ||
2325 | 2326 | ||
2326 | /* | 2327 | /* |
2327 | * If anonymous vma has not yet been faulted, update new pgoff | 2328 | * If anonymous vma has not yet been faulted, update new pgoff |
2328 | * to match new location, to increase its chance of merging. | 2329 | * to match new location, to increase its chance of merging. |
2329 | */ | 2330 | */ |
2330 | if (!vma->vm_file && !vma->anon_vma) | 2331 | if (unlikely(!vma->vm_file && !vma->anon_vma)) { |
2331 | pgoff = addr >> PAGE_SHIFT; | 2332 | pgoff = addr >> PAGE_SHIFT; |
2333 | faulted_in_anon_vma = false; | ||
2334 | } | ||
2332 | 2335 | ||
2333 | find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | 2336 | find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); |
2334 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, | 2337 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, |
@@ -2337,9 +2340,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2337 | /* | 2340 | /* |
2338 | * Source vma may have been merged into new_vma | 2341 | * Source vma may have been merged into new_vma |
2339 | */ | 2342 | */ |
2340 | if (vma_start >= new_vma->vm_start && | 2343 | if (unlikely(vma_start >= new_vma->vm_start && |
2341 | vma_start < new_vma->vm_end) | 2344 | vma_start < new_vma->vm_end)) { |
2345 | /* | ||
2346 | * The only way we can get a vma_merge with | ||
2347 | * self during an mremap is if the vma hasn't | ||
2348 | * been faulted in yet and we were allowed to | ||
2349 | * reset the dst vma->vm_pgoff to the | ||
2350 | * destination address of the mremap to allow | ||
2351 | * the merge to happen. mremap must change the | ||
2352 | * vm_pgoff linearity between src and dst vmas | ||
2353 | * (in turn preventing a vma_merge) to be | ||
2354 | * safe. It is only safe to keep the vm_pgoff | ||
2355 | * linear if there are no pages mapped yet. | ||
2356 | */ | ||
2357 | VM_BUG_ON(faulted_in_anon_vma); | ||
2342 | *vmap = new_vma; | 2358 | *vmap = new_vma; |
2359 | } else | ||
2360 | anon_vma_moveto_tail(new_vma); | ||
2343 | } else { | 2361 | } else { |
2344 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 2362 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2345 | if (new_vma) { | 2363 | if (new_vma) { |