aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
author Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> 2014-01-23 18:53:14 -0500
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2014-02-13 16:48:00 -0500
commit 9fa1577a45d333d37b0dd7e56524c351bab6a21b (patch)
tree a5ba192883783ef0669828ae1663e27a06c10d46 /mm
parent 186b643ae0c6db1816c93b9b98daf583ef62c372 (diff)
mm/memory-failure.c: shift page lock from head page to tail page after thp split
commit 54b9dd14d09f24927285359a227aa363ce46089e upstream. After the thp split in hwpoison_user_mappings(), we hold the page lock on the raw error page only around try_to_unmap(), hence we are in danger of a race condition. I found in RHEL7 MCE-relay testing that we get a "bad page" error when a memory error happens on a thp tail page used by qemu-kvm: Triggering MCE exception on CPU 10 mce: [Hardware Error]: Machine check events logged MCE exception done on CPU 10 MCE 0x38c535: Killing qemu-kvm:8418 due to hardware memory corruption MCE 0x38c535: dirty LRU page recovery: Recovered qemu-kvm[8418]: segfault at 20 ip 00007ffb0f0f229a sp 00007fffd6bc5240 error 4 in qemu-kvm[7ffb0ef14000+420000] BUG: Bad page state in process qemu-kvm pfn:38c400 page:ffffea000e310000 count:0 mapcount:0 mapping: (null) index:0x7ffae3c00 page flags: 0x2fffff0008001d(locked|referenced|uptodate|dirty|swapbacked) Modules linked in: hwpoison_inject mce_inject vhost_net macvtap macvlan ... CPU: 0 PID: 8418 Comm: qemu-kvm Tainted: G M -------------- 3.10.0-54.0.1.el7.mce_test_fixed.x86_64 #1 Hardware name: NEC NEC Express5800/R120b-1 [N8100-1719F]/MS-91E7-001, BIOS 4.6.3C19 02/10/2011 Call Trace: dump_stack+0x19/0x1b bad_page.part.59+0xcf/0xe8 free_pages_prepare+0x148/0x160 free_hot_cold_page+0x31/0x140 free_hot_cold_page_list+0x46/0xa0 release_pages+0x1c1/0x200 free_pages_and_swap_cache+0xad/0xd0 tlb_flush_mmu.part.46+0x4c/0x90 tlb_finish_mmu+0x55/0x60 exit_mmap+0xcb/0x170 mmput+0x67/0xf0 vhost_dev_cleanup+0x231/0x260 [vhost_net] vhost_net_release+0x3f/0x90 [vhost_net] __fput+0xe9/0x270 ____fput+0xe/0x10 task_work_run+0xc4/0xe0 do_exit+0x2bb/0xa40 do_group_exit+0x3f/0xa0 get_signal_to_deliver+0x1d0/0x6e0 do_signal+0x48/0x5e0 do_notify_resume+0x71/0xc0 retint_signal+0x48/0x8c The reason for this bug is that a page fault happens before the head page is unlocked at the end of memory_failure().
This strange page fault is trying to access address 0x20 and I'm not sure why qemu-kvm does this, but anyway as a result the SIGSEGV makes qemu-kvm exit, and on the way we hit the bad page bug/warning because we try to free a locked page (which was the former head page). To fix this, this patch shifts the page lock from the head page to the tail page just after the thp split. SIGSEGV still happens, but it affects only the error-affected VMs, not the whole system. Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- mm/memory-failure.c 21
1 files changed, 11 insertions, 10 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7e3601ce51c6..3b4120e38d48 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -854,14 +854,14 @@ static int page_action(struct page_state *ps, struct page *p,
854 * the pages and send SIGBUS to the processes if the data was dirty. 854 * the pages and send SIGBUS to the processes if the data was dirty.
855 */ 855 */
856static int hwpoison_user_mappings(struct page *p, unsigned long pfn, 856static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
857 int trapno, int flags) 857 int trapno, int flags, struct page **hpagep)
858{ 858{
859 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 859 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
860 struct address_space *mapping; 860 struct address_space *mapping;
861 LIST_HEAD(tokill); 861 LIST_HEAD(tokill);
862 int ret; 862 int ret;
863 int kill = 1, forcekill; 863 int kill = 1, forcekill;
864 struct page *hpage = compound_head(p); 864 struct page *hpage = *hpagep;
865 struct page *ppage; 865 struct page *ppage;
866 866
867 if (PageReserved(p) || PageSlab(p)) 867 if (PageReserved(p) || PageSlab(p))
@@ -940,11 +940,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
940 * We pinned the head page for hwpoison handling, 940 * We pinned the head page for hwpoison handling,
941 * now we split the thp and we are interested in 941 * now we split the thp and we are interested in
942 * the hwpoisoned raw page, so move the refcount 942 * the hwpoisoned raw page, so move the refcount
943 * to it. 943 * to it. Similarly, page lock is shifted.
944 */ 944 */
945 if (hpage != p) { 945 if (hpage != p) {
946 put_page(hpage); 946 put_page(hpage);
947 get_page(p); 947 get_page(p);
948 lock_page(p);
949 unlock_page(hpage);
950 *hpagep = p;
948 } 951 }
949 /* THP is split, so ppage should be the real poisoned page. */ 952 /* THP is split, so ppage should be the real poisoned page. */
950 ppage = p; 953 ppage = p;
@@ -962,17 +965,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
962 if (kill) 965 if (kill)
963 collect_procs(ppage, &tokill); 966 collect_procs(ppage, &tokill);
964 967
965 if (hpage != ppage)
966 lock_page(ppage);
967
968 ret = try_to_unmap(ppage, ttu); 968 ret = try_to_unmap(ppage, ttu);
969 if (ret != SWAP_SUCCESS) 969 if (ret != SWAP_SUCCESS)
970 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 970 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
971 pfn, page_mapcount(ppage)); 971 pfn, page_mapcount(ppage));
972 972
973 if (hpage != ppage)
974 unlock_page(ppage);
975
976 /* 973 /*
977 * Now that the dirty bit has been propagated to the 974 * Now that the dirty bit has been propagated to the
978 * struct page and all unmaps done we can decide if 975 * struct page and all unmaps done we can decide if
@@ -1189,8 +1186,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1189 /* 1186 /*
1190 * Now take care of user space mappings. 1187 * Now take care of user space mappings.
1191 * Abort on fail: __delete_from_page_cache() assumes unmapped page. 1188 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1189 *
1190 * When the raw error page is thp tail page, hpage points to the raw
1191 * page after thp split.
1192 */ 1192 */
1193 if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) { 1193 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
1194 != SWAP_SUCCESS) {
1194 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); 1195 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1195 res = -EBUSY; 1196 res = -EBUSY;
1196 goto out; 1197 goto out;