aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memory-failure.c
diff options
context:
space:
mode:
authorChen Yucong <slaoub@gmail.com>2014-07-02 18:22:37 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-07-03 12:21:54 -0400
commit0bc1f8b0682caa39f45ce1e0228ebf43acb46111 (patch)
treeb70c12a8c2a6ec6b72dcad7bbce556a67becc1d7 /mm/memory-failure.c
parentb27ebf77919fdc4e7f76b1972307c30c4a3c8859 (diff)
hwpoison: fix the handling path of the victimized page frame that belong to non-LRU
Until now, the kernel has the same policy to handle victimized page frames that belong to kernel-space(reserved/slab-subsystem) or non-LRU(unknown page state). In other word, the result of handling either of these victimized page frames is (IGNORED | FAILED), and the return value of memory_failure() is -EBUSY. This patch is to avoid that memory_failure() returns very soon due to the "true" value of (!PageLRU(p)), and it also ensures that action_result() can report more precise information("reserved kernel", "kernel slab", and "unknown page state") instead of "non LRU", especially for memory errors which are detected by memory-scrubbing. Andi said: : While running the mcelog test suite on 3.14 I hit the following VM_BUG_ON: : : soft_offline: 0x56d4: unknown non LRU page type 3ffff800008000 : page:ffffea000015b400 count:3 mapcount:2097169 mapping: (null) index:0xffff8800056d7000 : page flags: 0x3ffff800004081(locked|slab|head) : ------------[ cut here ]------------ : kernel BUG at mm/rmap.c:1495! : : I think what happened is that a LRU page turned into a slab page in : parallel with offlining. memory_failure initially tests for this case, : but doesn't retest later after the page has been locked. : : ... : : I ran this patch in a loop over night with some stress plus : the mcelog test suite running in a loop. I cannot guarantee it hit it, : but it should have given it a good beating. : : The kernel survived with no messages, although the mcelog test suite : got killed at some point because it couldn't fork anymore. Probably : some unrelated problem. : : So the patch is ok for me for .16. Signed-off-by: Chen Yucong <slaoub@gmail.com> Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Reported-by: Andi Kleen <andi@firstfloor.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r--mm/memory-failure.c9
1 files changed, 5 insertions, 4 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index cd8989c1027e..c6399e328931 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -895,7 +895,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
895 struct page *hpage = *hpagep; 895 struct page *hpage = *hpagep;
896 struct page *ppage; 896 struct page *ppage;
897 897
898 if (PageReserved(p) || PageSlab(p)) 898 if (PageReserved(p) || PageSlab(p) || !PageLRU(p))
899 return SWAP_SUCCESS; 899 return SWAP_SUCCESS;
900 900
901 /* 901 /*
@@ -1159,9 +1159,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1159 action_result(pfn, "free buddy, 2nd try", DELAYED); 1159 action_result(pfn, "free buddy, 2nd try", DELAYED);
1160 return 0; 1160 return 0;
1161 } 1161 }
1162 action_result(pfn, "non LRU", IGNORED);
1163 put_page(p);
1164 return -EBUSY;
1165 } 1162 }
1166 } 1163 }
1167 1164
@@ -1194,6 +1191,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1194 return 0; 1191 return 0;
1195 } 1192 }
1196 1193
1194 if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
1195 goto identify_page_state;
1196
1197 /* 1197 /*
1198 * For error on the tail page, we should set PG_hwpoison 1198 * For error on the tail page, we should set PG_hwpoison
1199 * on the head page to show that the hugepage is hwpoisoned 1199 * on the head page to show that the hugepage is hwpoisoned
@@ -1243,6 +1243,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1243 goto out; 1243 goto out;
1244 } 1244 }
1245 1245
1246identify_page_state:
1246 res = -EBUSY; 1247 res = -EBUSY;
1247 /* 1248 /*
1248 * The first check uses the current page flags which may not have any 1249 * The first check uses the current page flags which may not have any