Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r--  mm/memory-failure.c  370
1 file changed, 233 insertions, 137 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 757f6b0accfe..740c4f52059c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
  * Free Software Foundation.
  *
  * High level machine check handler. Handles pages reported by the
- * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * hardware as being corrupted usually due to a multi-bit ECC memory or cache
  * failure.
+ *
+ * In addition there is a "soft offline" entry point that allows to stop
+ * using not-yet-corrupted but suspicious pages without killing anything.
  *
  * Handles page cache pages in various states. The tricky part
- * here is that we can access any page asynchronous to other VM
- * users, because memory failures could happen anytime and anywhere,
- * possibly violating some of their assumptions. This is why this code
- * has to be extremely careful. Generally it tries to use normal locking
- * rules, as in get the standard locks, even if that means the
- * error handling takes potentially a long time.
+ * here is that we can access any page asynchronously with respect to
+ * other VM users, because memory failures could happen anytime and
+ * anywhere. This could violate some of their assumptions. This is why
+ * this code has to be extremely careful. Generally it tries to use
+ * normal locking rules, as in get the standard locks, even if that means
+ * the error handling takes potentially a long time.
  *
- * The operation to map back from RMAP chains to processes has to walk
- * the complete process list and has non linear complexity with the number
- * mappings. In short it can be quite slow. But since memory corruptions
- * are rare we hope to get away with this.
+ * There are several operations here with exponential complexity because
+ * of unsuitable VM data structures. For example the operation to map back
+ * from RMAP chains to processes has to walk the complete process list and
+ * has non linear complexity with the number of mappings. But since memory
+ * corruptions are rare we hope to get away with this. This avoids
+ * impacting the core VM.
  */
 
 /*
@@ -30,7 +35,6 @@
  * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
  * - pass bad pages to kdump next kernel
  */
-#define DEBUG 1 /* remove me in 2.6.34 */
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/page-flags.h>
@@ -47,6 +51,8 @@
 #include <linux/slab.h>
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm_inline.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -78,7 +84,7 @@ static int hwpoison_filter_dev(struct page *p)
		return 0;
 
	/*
-	 * page_mapping() does not accept slab page
+	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;
@@ -198,12 +204,12 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
 #ifdef __ARCH_SI_TRAPNO
	si.si_trapno = trapno;
 #endif
-	si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
+	si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
	/*
	 * Don't use force here, it's convenient if the signal
	 * can be temporarily blocked.
	 * This could cause a loop when the user sets SIGBUS
-	 * to SIG_IGN, but hopefully noone will do that?
+	 * to SIG_IGN, but hopefully no one will do that?
	 */
	ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
	if (ret < 0)
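The si_addr_lsb set above tells the signalled process how large the poisoned region is; kill_proc_ao() delivers it as an action-optional SIGBUS (BUS_MCEERR_AO). A minimal user-space sketch of the receiving side, assuming a libc that exposes BUS_MCEERR_AO and si_addr_lsb (build with -D_GNU_SOURCE on glibc); the recovery policy in the handler is illustrative only, not prescribed by the patch:

#include <signal.h>
#include <string.h>
#include <unistd.h>

static void sigbus_handler(int sig, siginfo_t *si, void *uctx)
{
	/*
	 * BUS_MCEERR_AO: the poisoned data has not been consumed yet.
	 * si->si_addr is the start of the affected range and
	 * 1UL << si->si_addr_lsb its length.
	 */
	if (si->si_code == BUS_MCEERR_AO) {
		static const char msg[] = "SIGBUS AO: page lost, continuing\n";
		write(STDERR_FILENO, msg, sizeof(msg) - 1);
		return;		/* drop or re-fetch the affected data */
	}
	_exit(1);		/* BUS_MCEERR_AR etc.: corrupted data was consumed */
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = sigbus_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);

	pause();	/* wait for a (possibly injected) memory failure */
	return 0;
}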
@@ -228,13 +234,17 @@ void shake_page(struct page *p, int access)
	}
 
	/*
-	 * Only all shrink_slab here (which would also
-	 * shrink other caches) if access is not potentially fatal.
+	 * Only call shrink_slab here (which would also shrink other caches) if
+	 * access is not potentially fatal.
	 */
	if (access) {
		int nr;
		do {
-			nr = shrink_slab(1000, GFP_KERNEL, 1000);
+			struct shrink_control shrink = {
+				.gfp_mask = GFP_KERNEL,
+			};
+
+			nr = shrink_slab(&shrink, 1000, 1000);
			if (page_count(p) == 1)
				break;
		} while (nr > 10);
@@ -268,7 +278,7 @@ struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
-	unsigned addr_valid:1;
+	char addr_valid;
 };
 
 /*
@@ -309,7 +319,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
	 * a SIGKILL because the error is not contained anymore.
	 */
	if (tk->addr == -EFAULT) {
-		pr_debug("MCE: Unable to find user space address %lx in %s\n",
+		pr_info("MCE: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
@@ -381,10 +391,11 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
	struct task_struct *tsk;
	struct anon_vma *av;
 
-	read_lock(&tasklist_lock);
	av = page_lock_anon_vma(page);
	if (av == NULL)	/* Not actually mapped anymore */
-		goto out;
+		return;
+
+	read_lock(&tasklist_lock);
	for_each_process (tsk) {
		struct anon_vma_chain *vmac;
 
@@ -398,9 +409,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
				add_to_kill(tsk, page, vma, to_kill, tkc);
		}
	}
-	page_unlock_anon_vma(av);
-out:
	read_unlock(&tasklist_lock);
+	page_unlock_anon_vma(av);
 }
 
 /*
@@ -414,17 +424,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
	struct prio_tree_iter iter;
	struct address_space *mapping = page->mapping;
 
-	/*
-	 * A note on the locking order between the two locks.
-	 * We don't rely on this particular order.
-	 * If you have some other code that needs a different order
-	 * feel free to switch them around. Or add a reverse link
-	 * from mm_struct to task_struct, then this could be all
-	 * done without taking tasklist_lock and looping over all tasks.
-	 */
-
+	mutex_lock(&mapping->i_mmap_mutex);
	read_lock(&tasklist_lock);
-	spin_lock(&mapping->i_mmap_lock);
	for_each_process(tsk) {
		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 
@@ -444,8 +445,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
				add_to_kill(tsk, page, vma, to_kill, tkc);
		}
	}
-	spin_unlock(&mapping->i_mmap_lock);
	read_unlock(&tasklist_lock);
+	mutex_unlock(&mapping->i_mmap_mutex);
 }
 
 /*
@@ -577,7 +578,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
				pfn, err);
	} else if (page_has_private(p) &&
		!try_to_release_page(p, GFP_NOIO)) {
-		pr_debug("MCE %#lx: failed to release buffers\n", pfn);
+		pr_info("MCE %#lx: failed to release buffers\n", pfn);
	} else {
		ret = RECOVERED;
	}
@@ -629,7 +630,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
	 * when the page is reread or dropped. If an
	 * application assumes it will always get error on
	 * fsync, but does other operations on the fd before
-	 * and the page is dropped inbetween then the error
+	 * and the page is dropped between then the error
	 * will not be properly reported.
	 *
	 * This can already happen even without hwpoisoned
@@ -693,11 +694,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
  * Issues:
  * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
  *   To narrow down kill region to one page, we need to break up pmd.
- * - To support soft-offlining for hugepage, we need to support hugepage
- *   migration.
  */
 static int me_huge_page(struct page *p, unsigned long pfn)
 {
+	int res = 0;
	struct page *hpage = compound_head(p);
	/*
	 * We can safely recover from error on free or reserved (i.e.
@@ -710,8 +710,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
	 * so there is no race between isolation and mapping/unmapping.
	 */
	if (!(page_mapping(hpage) || PageAnon(hpage))) {
-		__isolate_hwpoisoned_huge_page(hpage);
-		return RECOVERED;
+		res = dequeue_hwpoisoned_huge_page(hpage);
+		if (!res)
+			return RECOVERED;
	}
	return DELAYED;
 }
@@ -723,7 +724,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
  * The table matches them in order and calls the right handler.
  *
  * This is quite tricky because we can access page at any time
- * in its live cycle, so all accesses have to be extremly careful.
+ * in its live cycle, so all accesses have to be extremely careful.
  *
  * This is not complete. More states could be added.
  * For any missing state don't attempt recovery.
@@ -836,8 +837,6 @@ static int page_action(struct page_state *ps, struct page *p,
	return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
 }
 
-#define N_UNMAP_TRIES 5
-
 /*
  * Do all that is necessary to remove user space mappings. Unmap
  * the pages and send SIGBUS to the processes if the data was dirty.
@@ -849,9 +848,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
	struct address_space *mapping;
	LIST_HEAD(tokill);
	int ret;
-	int i;
	int kill = 1;
	struct page *hpage = compound_head(p);
+	struct page *ppage;
 
	if (PageReserved(p) || PageSlab(p))
		return SWAP_SUCCESS;
@@ -893,6 +892,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
	}
 
	/*
+	 * ppage: poisoned page
+	 *   if p is a regular (4k) page,
+	 *        ppage == real poisoned page;
+	 *   else p is hugetlb or THP, ppage == head page.
+	 */
+	ppage = hpage;
+
+	if (PageTransHuge(hpage)) {
+		/*
+		 * Verify that this isn't a hugetlbfs head page, the check for
+		 * PageAnon is just to avoid tripping a split_huge_page
+		 * internal debug check, as split_huge_page refuses to deal with
+		 * anything that isn't an anon page. PageAnon can't go away from
+		 * under us because we hold a refcount on the hpage; without a
+		 * refcount on the hpage, split_huge_page can't be safely called
+		 * in the first place, having a refcount on the tail isn't
+		 * enough to be safe.
+		 */
+		if (!PageHuge(hpage) && PageAnon(hpage)) {
+			if (unlikely(split_huge_page(hpage))) {
+				/*
+				 * FIXME: if splitting THP fails, it is
+				 * better to stop the following operation rather
+				 * than causing panic by unmapping. System might
+				 * survive if the page is freed later.
+				 */
+				printk(KERN_INFO
+					"MCE %#lx: failed to split THP\n", pfn);
+
+				BUG_ON(!PageHWPoison(p));
+				return SWAP_FAIL;
+			}
+			/* THP is split, so ppage should be the real poisoned page. */
+			ppage = p;
+		}
+	}
+
+	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form. This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
@@ -901,22 +938,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
	 * there's nothing that can be done.
	 */
	if (kill)
-		collect_procs(hpage, &tokill);
+		collect_procs(ppage, &tokill);
 
-	/*
-	 * try_to_unmap can fail temporarily due to races.
-	 * Try a few times (RED-PEN better strategy?)
-	 */
-	for (i = 0; i < N_UNMAP_TRIES; i++) {
-		ret = try_to_unmap(hpage, ttu);
-		if (ret == SWAP_SUCCESS)
-			break;
-		pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
-	}
+	if (hpage != ppage)
+		lock_page(ppage);
 
+	ret = try_to_unmap(ppage, ttu);
	if (ret != SWAP_SUCCESS)
		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-			pfn, page_mapcount(hpage));
+			pfn, page_mapcount(ppage));
+
+	if (hpage != ppage)
+		unlock_page(ppage);
 
	/*
	 * Now that the dirty bit has been propagated to the
@@ -927,7 +960,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
	 * use a more force-full uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
-	kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
+	kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
			ret != SWAP_SUCCESS, p, pfn);
 
	return ret;
@@ -936,7 +969,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 static void set_page_hwpoison_huge_page(struct page *hpage)
 {
	int i;
-	int nr_pages = 1 << compound_order(hpage);
+	int nr_pages = 1 << compound_trans_order(hpage);
	for (i = 0; i < nr_pages; i++)
		SetPageHWPoison(hpage + i);
 }
@@ -944,7 +977,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
 static void clear_page_hwpoison_huge_page(struct page *hpage)
 {
	int i;
-	int nr_pages = 1 << compound_order(hpage);
+	int nr_pages = 1 << compound_trans_order(hpage);
	for (i = 0; i < nr_pages; i++)
		ClearPageHWPoison(hpage + i);
 }
@@ -974,14 +1007,17 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
		return 0;
	}
 
-	nr_pages = 1 << compound_order(hpage);
+	nr_pages = 1 << compound_trans_order(hpage);
	atomic_long_add(nr_pages, &mce_bad_pages);
 
	/*
	 * We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hand:
	 *    prep_new_page() will be the gate keeper.
-	 * 2) it's part of a non-compound high order page.
+	 * 2) it's a free hugepage, which is also safe:
+	 *    an affected hugepage will be dequeued from hugepage freelist,
+	 *    so there's no concern about reusing it ever after.
+	 * 3) it's part of a non-compound high order page.
	 *    Implies some kernel user: cannot stop them from
	 *    R/W the page; let's pray that the page has been
	 *    used and will be freed some time later.
@@ -993,6 +1029,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
		if (is_free_buddy_page(p)) {
			action_result(pfn, "free buddy", DELAYED);
			return 0;
+		} else if (PageHuge(hpage)) {
+			/*
+			 * Check "just unpoisoned", "filter hit", and
+			 * "race with other subpage."
+			 */
+			lock_page(hpage);
+			if (!PageHWPoison(hpage)
+			    || (hwpoison_filter(p) && TestClearPageHWPoison(p))
+			    || (p != hpage && TestSetPageHWPoison(hpage))) {
+				atomic_long_sub(nr_pages, &mce_bad_pages);
+				return 0;
+			}
+			set_page_hwpoison_huge_page(hpage);
+			res = dequeue_hwpoisoned_huge_page(hpage);
+			action_result(pfn, "free huge",
+				      res ? IGNORED : DELAYED);
+			unlock_page(hpage);
+			return res;
		} else {
			action_result(pfn, "high order kernel", IGNORED);
			return -EBUSY;
@@ -1007,19 +1061,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
	 * The check (unnecessarily) ignores LRU pages being isolated and
	 * walked by the page reclaim code, however that's not a big loss.
	 */
-	if (!PageLRU(p) && !PageHuge(p))
-		shake_page(p, 0);
-	if (!PageLRU(p) && !PageHuge(p)) {
-		/*
-		 * shake_page could have turned it free.
-		 */
-		if (is_free_buddy_page(p)) {
-			action_result(pfn, "free buddy, 2nd try", DELAYED);
-			return 0;
+	if (!PageHuge(p) && !PageTransCompound(p)) {
+		if (!PageLRU(p))
+			shake_page(p, 0);
+		if (!PageLRU(p)) {
+			/*
+			 * shake_page could have turned it free.
+			 */
+			if (is_free_buddy_page(p)) {
+				action_result(pfn, "free buddy, 2nd try",
						DELAYED);
+				return 0;
+			}
+			action_result(pfn, "non LRU", IGNORED);
+			put_page(p);
+			return -EBUSY;
		}
-		action_result(pfn, "non LRU", IGNORED);
-		put_page(p);
-		return -EBUSY;
	}
 
	/*
@@ -1027,7 +1084,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, so we just avoid it here.
	 */
-	lock_page_nosync(hpage);
+	lock_page(hpage);
 
	/*
	 * unpoison always clear PG_hwpoison inside page lock
@@ -1049,7 +1106,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
	 * For error on the tail page, we should set PG_hwpoison
	 * on the head page to show that the hugepage is hwpoisoned
	 */
-	if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
		action_result(pfn, "hugepage already hardware poisoned",
			IGNORED);
		unlock_page(hpage);
@@ -1069,7 +1126,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 
	/*
	 * Now take care of user space mappings.
-	 * Abort on fail: __remove_from_page_cache() assumes unmapped page.
+	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
	 */
	if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
		printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
@@ -1147,20 +1204,30 @@ int unpoison_memory(unsigned long pfn)
	page = compound_head(p);
 
	if (!PageHWPoison(p)) {
-		pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
+		pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
		return 0;
	}
 
-	nr_pages = 1 << compound_order(page);
+	nr_pages = 1 << compound_trans_order(page);
 
	if (!get_page_unless_zero(page)) {
+		/*
+		 * Since HWPoisoned hugepage should have non-zero refcount,
+		 * race between memory failure and unpoison seems to happen.
+		 * In such case unpoison fails and memory failure runs
+		 * to the end.
+		 */
+		if (PageHuge(page)) {
+			pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+			return 0;
+		}
		if (TestClearPageHWPoison(p))
			atomic_long_sub(nr_pages, &mce_bad_pages);
-		pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
+		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
		return 0;
	}
 
-	lock_page_nosync(page);
+	lock_page(page);
	/*
	 * This test is racy because PG_hwpoison is set outside of page lock.
	 * That's acceptable because that won't trigger kernel panic. Instead,
@@ -1168,12 +1235,12 @@ int unpoison_memory(unsigned long pfn)
	 * the free buddy page pool.
	 */
	if (TestClearPageHWPoison(page)) {
-		pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
+		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
		atomic_long_sub(nr_pages, &mce_bad_pages);
		freeit = 1;
+		if (PageHuge(page))
+			clear_page_hwpoison_huge_page(page);
	}
-	if (PageHuge(p))
-		clear_page_hwpoison_huge_page(page);
	unlock_page(page);
 
	put_page(page);
@@ -1187,7 +1254,11 @@ EXPORT_SYMBOL(unpoison_memory);
 static struct page *new_page(struct page *p, unsigned long private, int **x)
 {
	int nid = page_to_nid(p);
-	return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+	if (PageHuge(p))
+		return alloc_huge_page_node(page_hstate(compound_head(p)),
+						   nid);
+	else
+		return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
 }
 
 /*
@@ -1204,25 +1275,31 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
		return 1;
 
	/*
-	 * The lock_system_sleep prevents a race with memory hotplug,
-	 * because the isolation assumes there's only a single user.
+	 * The lock_memory_hotplug prevents a race with memory hotplug.
	 * This is a big hammer, a better would be nicer.
	 */
-	lock_system_sleep();
+	lock_memory_hotplug();
 
	/*
	 * Isolate the page, so that it doesn't get reallocated if it
	 * was free.
	 */
	set_migratetype_isolate(p);
+	/*
+	 * When the target page is a free hugepage, just remove it
+	 * from free hugepage list.
+	 */
	if (!get_page_unless_zero(compound_head(p))) {
-		if (is_free_buddy_page(p)) {
-			pr_debug("get_any_page: %#lx free buddy page\n", pfn);
+		if (PageHuge(p)) {
+			pr_info("get_any_page: %#lx free huge page\n", pfn);
+			ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+		} else if (is_free_buddy_page(p)) {
+			pr_info("get_any_page: %#lx free buddy page\n", pfn);
			/* Set hwpoison bit while page is still isolated */
			SetPageHWPoison(p);
			ret = 0;
		} else {
-			pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
+			pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
				pfn, p->flags);
			ret = -EIO;
		}
@@ -1231,7 +1308,51 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
		ret = 1;
	}
	unset_migratetype_isolate(p);
-	unlock_system_sleep();
+	unlock_memory_hotplug();
+	return ret;
+}
+
+static int soft_offline_huge_page(struct page *page, int flags)
+{
+	int ret;
+	unsigned long pfn = page_to_pfn(page);
+	struct page *hpage = compound_head(page);
+	LIST_HEAD(pagelist);
+
+	ret = get_any_page(page, pfn, flags);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		goto done;
+
+	if (PageHWPoison(hpage)) {
+		put_page(hpage);
+		pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
+		return -EBUSY;
+	}
+
+	/* Keep page count to indicate a given hugepage is isolated. */
+
+	list_add(&hpage->lru, &pagelist);
+	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
+				 true);
+	if (ret) {
+		struct page *page1, *page2;
+		list_for_each_entry_safe(page1, page2, &pagelist, lru)
+			put_page(page1);
+
+		pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+			 pfn, ret, page->flags);
+		if (ret > 0)
+			ret = -EIO;
+		return ret;
+	}
+done:
+	if (!PageHWPoison(hpage))
+		atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
+	set_page_hwpoison_huge_page(hpage);
+	dequeue_hwpoisoned_huge_page(hpage);
+	/* keep elevated page count for bad page */
	return ret;
 }
 
@@ -1262,6 +1383,9 @@ int soft_offline_page(struct page *page, int flags)
	int ret;
	unsigned long pfn = page_to_pfn(page);
 
+	if (PageHuge(page))
+		return soft_offline_huge_page(page, flags);
+
	ret = get_any_page(page, pfn, flags);
	if (ret < 0)
		return ret;
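soft_offline_page() (and, with the hunk above, the new hugepage path) can be exercised from user space through madvise(MADV_SOFT_OFFLINE) or the memory sysfs trigger. A minimal sketch, assuming a kernel built with CONFIG_MEMORY_FAILURE, CAP_SYS_ADMIN, and headers that define MADV_SOFT_OFFLINE; the fallback value below is the asm-generic one and is an assumption for older headers:

#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* asm-generic value, assumed if headers are old */
#endif

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	/* Back the test page with real memory so it has a struct page. */
	char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;
	memset(p, 0xaa, page);

	/*
	 * Ask the kernel to migrate the data away and retire the backing
	 * page; the mapping stays valid, only the physical page is replaced.
	 */
	if (madvise(p, page, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");
	else
		printf("page soft-offlined, first byte still %#x\n",
		       (unsigned)(unsigned char)p[0]);
	return 0;
}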
@@ -1288,7 +1412,7 @@ int soft_offline_page(struct page *page, int flags)
		goto done;
	}
	if (!PageLRU(page)) {
-		pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
+		pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
			pfn, page->flags);
		return -EIO;
	}
@@ -1302,7 +1426,7 @@ int soft_offline_page(struct page *page, int flags)
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_page(page);
-		pr_debug("soft offline: %#lx page already poisoned\n", pfn);
+		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}
 
@@ -1312,18 +1436,14 @@ int soft_offline_page(struct page *page, int flags)
	 */
	ret = invalidate_inode_page(page);
	unlock_page(page);
-
	/*
-	 * Drop count because page migration doesn't like raised
-	 * counts. The page could get re-allocated, but if it becomes
-	 * LRU the isolation will just fail.
	 * RED-PEN would be better to keep it isolated here, but we
	 * would need to fix isolation locking first.
	 */
-	put_page(page);
	if (ret == 1) {
+		put_page(page);
		ret = 0;
-		pr_debug("soft_offline: %#lx: invalidated\n", pfn);
+		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		goto done;
	}
 
@@ -1333,19 +1453,27 @@ int soft_offline_page(struct page *page, int flags)
	 * handles a large number of cases for us.
	 */
	ret = isolate_lru_page(page);
+	/*
+	 * Drop the page reference which came from get_any_page();
+	 * a successful isolate_lru_page() already took another one.
+	 */
+	put_page(page);
	if (!ret) {
		LIST_HEAD(pagelist);
-
+		inc_zone_page_state(page, NR_ISOLATED_ANON +
+					page_is_file_cache(page));
		list_add(&page->lru, &pagelist);
-		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+								0, true);
		if (ret) {
-			pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+			putback_lru_pages(&pagelist);
+			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
				pfn, ret, page->flags);
			if (ret > 0)
				ret = -EIO;
		}
	} else {
-		pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
+		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
			pfn, ret, page_count(page), page->flags);
	}
	if (ret)
@@ -1357,35 +1485,3 @@ done:
	/* keep elevated page count for bad page */
	return ret;
 }
-
-/*
- * The caller must hold current->mm->mmap_sem in read mode.
- */
-int is_hwpoison_address(unsigned long addr)
-{
-	pgd_t *pgdp;
-	pud_t pud, *pudp;
-	pmd_t pmd, *pmdp;
-	pte_t pte, *ptep;
-	swp_entry_t entry;
-
-	pgdp = pgd_offset(current->mm, addr);
-	if (!pgd_present(*pgdp))
-		return 0;
-	pudp = pud_offset(pgdp, addr);
-	pud = *pudp;
-	if (!pud_present(pud) || pud_large(pud))
-		return 0;
-	pmdp = pmd_offset(pudp, addr);
-	pmd = *pmdp;
-	if (!pmd_present(pmd) || pmd_large(pmd))
-		return 0;
-	ptep = pte_offset_map(pmdp, addr);
-	pte = *ptep;
-	pte_unmap(ptep);
-	if (!is_swap_pte(pte))
-		return 0;
-	entry = pte_to_swp_entry(pte);
-	return is_hwpoison_entry(entry);
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_address);