author	Mel Gorman <mel@csn.ul.ie>	2010-05-24 17:32:24 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-05-25 11:06:59 -0400
commit	a8bef8ff6ea15fa4c67433cab0f5f3484574ef7c (patch)
tree	3ea9c122e02f523379d4560ee2134124d097895c
parent	e9e96b39f932a065e14f5d5bab0797ae261d03b5 (diff)
mm: migration: avoid race between shift_arg_pages() and rmap_walk() during migration by not migrating temporary stacks
Page migration requires rmap to be able to find all ptes mapping a page
at all times, otherwise the migration entry can be instantiated, but it
is possible to leave one behind if the second rmap_walk fails to find
the page.  If this page is later faulted, migration_entry_to_page() will
call BUG because the page is not locked, indicating that the page was
migrated but the migration PTE was not cleaned up.  For example

  kernel BUG at include/linux/swapops.h:105!
  invalid opcode: 0000 [#1] PREEMPT SMP
  ...
  Call Trace:
   [<ffffffff810e951a>] handle_mm_fault+0x3f8/0x76a
   [<ffffffff8130c7a2>] do_page_fault+0x44a/0x46e
   [<ffffffff813099b5>] page_fault+0x25/0x30
   [<ffffffff8114de33>] load_elf_binary+0x152a/0x192b
   [<ffffffff8111329b>] search_binary_handler+0x173/0x313
   [<ffffffff81114896>] do_execve+0x219/0x30a
   [<ffffffff8100a5c6>] sys_execve+0x43/0x5e
   [<ffffffff8100320a>] stub_execve+0x6a/0xc0
  RIP  [<ffffffff811094ff>] migration_entry_wait+0xc1/0x129

There is a race between shift_arg_pages() and migration that triggers
this bug.  A temporary stack is set up during exec and later moved.  If
migration moves a page in the temporary stack and the VMA is then
removed before migration completes, the migration PTE may not be found,
leading to a BUG when the stack is faulted.

This patch causes pages within the temporary stack during exec to be
skipped by migration.  It does this by marking the VMA covering the
temporary stack with an otherwise impossible combination of VMA flags.
These flags are cleared when the temporary stack is moved to its final
location.

[kamezawa.hiroyu@jp.fujitsu.com: idea for having migration skip temporary stacks]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
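To make the mark-then-clear lifecycle concrete before reading the diff, it can be modelled in a few lines of self-contained userspace C. This is only a sketch: the flag values and the vma struct below are placeholders standing in for the kernel's definitions, not kernel code.

  #include <stdio.h>

  /* Placeholder bit values; the kernel reuses VM_RAND_READ | VM_SEQ_READ */
  #define VM_RAND_READ              0x0010UL
  #define VM_SEQ_READ               0x0080UL
  #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
  #define VM_STACK_FLAGS            0x0100UL  /* schematic stand-in */

  struct vma { unsigned long vm_flags; };

  static int marked(const struct vma *v)
  {
      return (v->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
              VM_STACK_INCOMPLETE_SETUP;
  }

  int main(void)
  {
      struct vma stack;

      /* __bprm_mm_init(): the temporary stack is created already marked */
      stack.vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
      printf("during exec: marked=%d (migration skips it)\n", marked(&stack));

      /* setup_arg_pages(): once the stack is in its final place, unmark */
      stack.vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
      printf("after exec:  marked=%d (migration sees it again)\n", marked(&stack));
      return 0;
  }

The diff below implements exactly these two halves in fs/exec.c, with the flag combination defined in include/linux/mm.h and the skip in mm/rmap.c.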
-rw-r--r--	fs/exec.c	7
-rw-r--r--	include/linux/mm.h	3
-rw-r--r--	mm/rmap.c	30
3 files changed, 38 insertions, 2 deletions
diff --git a/fs/exec.c b/fs/exec.c
index e6e94c626c2c..9badbc0bfb1d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	 * use STACK_TOP because that can depend on attributes which aren't
 	 * configured yet.
 	 */
+	BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
 	vma->vm_end = STACK_TOP_MAX;
 	vma->vm_start = vma->vm_end - PAGE_SIZE;
-	vma->vm_flags = VM_STACK_FLAGS;
+	vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	err = insert_vm_struct(mm, vma);
@@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	else if (executable_stack == EXSTACK_DISABLE_X)
 		vm_flags &= ~VM_EXEC;
 	vm_flags |= mm->def_flags;
+	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
 
 	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
 			vm_flags);
@@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
 		goto out_unlock;
 	}
 
+	/* mprotect_fixup is overkill to remove the temporary stack flags */
+	vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
+
 	stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
 	stack_size = vma->vm_end - vma->vm_start;
 	/*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fb19bb92b809..98ea5bab963e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -106,6 +106,9 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_PFN_AT_MMAP	0x40000000	/* PFNMAP vma that is fully mapped at mmap time */
 #define VM_MERGEABLE	0x80000000	/* KSM may merge identical pages */
 
+/* Bits set in the VMA until the stack is in its final location */
+#define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)
+
 #ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
 #endif
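The combination chosen above is "otherwise impossible" because the only code that normally sets VM_RAND_READ or VM_SEQ_READ is madvise(), and each hint clears the other, so no ordinary VMA ever carries both bits at once. A small userspace illustration of that mutual exclusion (the mapping size is arbitrary):

  #include <sys/mman.h>

  int main(void)
  {
      size_t len = 4096;
      void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (p == MAP_FAILED)
          return 1;

      /* madvise_behavior(): sets VM_RAND_READ, clears VM_SEQ_READ */
      madvise(p, len, MADV_RANDOM);

      /* ...and vice versa: sets VM_SEQ_READ, clears VM_RAND_READ.
       * The two hints replace each other, so both bits together can
       * only mean a temporary exec stack. */
      madvise(p, len, MADV_SEQUENTIAL);

      munmap(p, len);
      return 0;
  }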
diff --git a/mm/rmap.c b/mm/rmap.c
index b5c320f7d0a5..38a336e2eea1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1131,6 +1131,20 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
 	return ret;
 }
 
+static bool is_vma_temporary_stack(struct vm_area_struct *vma)
+{
+	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
+
+	if (!maybe_stack)
+		return false;
+
+	if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
+						VM_STACK_INCOMPLETE_SETUP)
+		return true;
+
+	return false;
+}
+
 /**
  * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
  * rmap method
@@ -1159,7 +1173,21 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 
 	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
 		struct vm_area_struct *vma = avc->vma;
-		unsigned long address = vma_address(page, vma);
+		unsigned long address;
+
+		/*
+		 * During exec, a temporary VMA is setup and later moved.
+		 * The VMA is moved under the anon_vma lock but not the
+		 * page tables leading to a race where migration cannot
+		 * find the migration ptes. Rather than increasing the
+		 * locking requirements of exec(), migration skips
+		 * temporary VMAs until after exec() completes.
+		 */
+		if (PAGE_MIGRATION && (flags & TTU_MIGRATION) &&
+				is_vma_temporary_stack(vma))
+			continue;
+
+		address = vma_address(page, vma);
 		if (address == -EFAULT)
 			continue;
 		ret = try_to_unmap_one(page, vma, address, flags);
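Note that the skip applies only to migration walks: ordinary unmapping, e.g. for reclaim, still visits the temporary stack. The following self-contained model captures just that control flow; the flag values and structures are simplified stand-ins (the real helper also accepts VM_GROWSUP stacks), not the kernel's types.

  #include <stdbool.h>
  #include <stdio.h>

  /* Stand-in flag values, not the kernel's definitions */
  #define VM_GROWSDOWN              0x0001UL
  #define VM_STACK_INCOMPLETE_SETUP 0x0090UL
  #define TTU_MIGRATION             0x0002

  struct vma {
      unsigned long vm_flags;
      const char *name;
  };

  static bool is_vma_temporary_stack(const struct vma *vma)
  {
      if (!(vma->vm_flags & VM_GROWSDOWN))
          return false;
      return (vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
              VM_STACK_INCOMPLETE_SETUP;
  }

  /* Models the loop in try_to_unmap_anon(): only a migration walk
   * skips a stack that exec has not finished placing. */
  static void walk(const struct vma *vmas, int n, int flags)
  {
      for (int i = 0; i < n; i++) {
          if ((flags & TTU_MIGRATION) && is_vma_temporary_stack(&vmas[i])) {
              printf("skip  %s\n", vmas[i].name);
              continue;
          }
          printf("visit %s\n", vmas[i].name);
      }
  }

  int main(void)
  {
      struct vma vmas[] = {
          { VM_GROWSDOWN | VM_STACK_INCOMPLETE_SETUP, "temporary stack" },
          { VM_GROWSDOWN, "settled stack" },
      };

      walk(vmas, 2, TTU_MIGRATION); /* migration: temporary stack skipped */
      walk(vmas, 2, 0);             /* plain unmap: both VMAs visited */
      return 0;
  }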