diff options
author | Chen Yucong <slaoub@gmail.com> | 2014-07-02 18:22:37 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-07-03 12:21:54 -0400 |
commit | 0bc1f8b0682caa39f45ce1e0228ebf43acb46111 (patch) | |
tree | b70c12a8c2a6ec6b72dcad7bbce556a67becc1d7 /mm/memory-failure.c | |
parent | b27ebf77919fdc4e7f76b1972307c30c4a3c8859 (diff) |
hwpoison: fix the handling path of the victimized page frame that belongs to non-LRU
Until now, the kernel has had the same policy for handling victimized page
frames that belong to kernel-space (reserved/slab-subsystem) or
non-LRU (unknown page state). In other words, the result of handling
either of these victimized page frames is (IGNORED | FAILED), and the
return value of memory_failure() is -EBUSY.
This patch prevents memory_failure() from returning early merely because
(!PageLRU(p)) evaluates to true, and it also ensures that
action_result() can report more precise information ("reserved kernel",
"kernel slab", and "unknown page state") instead of "non LRU",
especially for memory errors which are detected by memory-scrubbing.
Andi said:
: While running the mcelog test suite on 3.14 I hit the following VM_BUG_ON:
:
: soft_offline: 0x56d4: unknown non LRU page type 3ffff800008000
: page:ffffea000015b400 count:3 mapcount:2097169 mapping: (null) index:0xffff8800056d7000
: page flags: 0x3ffff800004081(locked|slab|head)
: ------------[ cut here ]------------
: kernel BUG at mm/rmap.c:1495!
:
: I think what happened is that a LRU page turned into a slab page in
: parallel with offlining. memory_failure initially tests for this case,
: but doesn't retest later after the page has been locked.
:
: ...
:
: I ran this patch in a loop over night with some stress plus
: the mcelog test suite running in a loop. I cannot guarantee it hit it,
: but it should have given it a good beating.
:
: The kernel survived with no messages, although the mcelog test suite
: got killed at some point because it couldn't fork anymore. Probably
: some unrelated problem.
:
: So the patch is ok for me for .16.
Signed-off-by: Chen Yucong <slaoub@gmail.com>
Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reported-by: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r-- | mm/memory-failure.c | 9 |
1 files changed, 5 insertions, 4 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index cd8989c1027e..c6399e328931 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -895,7 +895,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
895 | struct page *hpage = *hpagep; | 895 | struct page *hpage = *hpagep; |
896 | struct page *ppage; | 896 | struct page *ppage; |
897 | 897 | ||
898 | if (PageReserved(p) || PageSlab(p)) | 898 | if (PageReserved(p) || PageSlab(p) || !PageLRU(p)) |
899 | return SWAP_SUCCESS; | 899 | return SWAP_SUCCESS; |
900 | 900 | ||
901 | /* | 901 | /* |
@@ -1159,9 +1159,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1159 | action_result(pfn, "free buddy, 2nd try", DELAYED); | 1159 | action_result(pfn, "free buddy, 2nd try", DELAYED); |
1160 | return 0; | 1160 | return 0; |
1161 | } | 1161 | } |
1162 | action_result(pfn, "non LRU", IGNORED); | ||
1163 | put_page(p); | ||
1164 | return -EBUSY; | ||
1165 | } | 1162 | } |
1166 | } | 1163 | } |
1167 | 1164 | ||
@@ -1194,6 +1191,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1194 | return 0; | 1191 | return 0; |
1195 | } | 1192 | } |
1196 | 1193 | ||
1194 | if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) | ||
1195 | goto identify_page_state; | ||
1196 | |||
1197 | /* | 1197 | /* |
1198 | * For error on the tail page, we should set PG_hwpoison | 1198 | * For error on the tail page, we should set PG_hwpoison |
1199 | * on the head page to show that the hugepage is hwpoisoned | 1199 | * on the head page to show that the hugepage is hwpoisoned |
@@ -1243,6 +1243,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1243 | goto out; | 1243 | goto out; |
1244 | } | 1244 | } |
1245 | 1245 | ||
1246 | identify_page_state: | ||
1246 | res = -EBUSY; | 1247 | res = -EBUSY; |
1247 | /* | 1248 | /* |
1248 | * The first check uses the current page flags which may not have any | 1249 | * The first check uses the current page flags which may not have any |