Diffstat (limited to 'mm/memory-failure.c')
 mm/memory-failure.c | 153
 1 file changed, 123 insertions(+), 30 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 620b0b461593..9c26eeca1342 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -45,6 +45,8 @@
 #include <linux/page-isolation.h>
 #include <linux/suspend.h>
 #include <linux/slab.h>
+#include <linux/swapops.h>
+#include <linux/hugetlb.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -689,17 +691,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
 /*
  * Huge pages. Needs work.
  * Issues:
- * No rmap support so we cannot find the original mapper. In theory could walk
- * all MMs and look for the mappings, but that would be non atomic and racy.
- * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
- * like just walking the current process and hoping it has it mapped (that
- * should be usually true for the common "shared database cache" case)
- * Should handle free huge pages and dequeue them too, but this needs to
- * handle huge page accounting correctly.
+ * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
+ *   To narrow down kill region to one page, we need to break up pmd.
+ * - To support soft-offlining for hugepage, we need to support hugepage
+ *   migration.
  */
 static int me_huge_page(struct page *p, unsigned long pfn)
 {
-	return FAILED;
+	struct page *hpage = compound_head(p);
+	/*
+	 * We can safely recover from error on free or reserved (i.e.
+	 * not in-use) hugepage by dequeuing it from freelist.
+	 * To check whether a hugepage is in-use or not, we can't use
+	 * page->lru because it can be used in other hugepage operations,
+	 * such as __unmap_hugepage_range() and gather_surplus_pages().
+	 * So instead we use page_mapping() and PageAnon().
+	 * We assume that this function is called with page lock held,
+	 * so there is no race between isolation and mapping/unmapping.
+	 */
+	if (!(page_mapping(hpage) || PageAnon(hpage))) {
+		__isolate_hwpoisoned_huge_page(hpage);
+		return RECOVERED;
+	}
+	return DELAYED;
 }
 
 /*
@@ -837,6 +851,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	int ret;
 	int i;
 	int kill = 1;
+	struct page *hpage = compound_head(p);
 
 	if (PageReserved(p) || PageSlab(p))
 		return SWAP_SUCCESS;
@@ -845,10 +860,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * This check implies we don't kill processes if their pages
 	 * are in the swap cache early. Those are always late kills.
 	 */
-	if (!page_mapped(p))
+	if (!page_mapped(hpage))
 		return SWAP_SUCCESS;
 
-	if (PageCompound(p) || PageKsm(p))
+	if (PageKsm(p))
 		return SWAP_FAIL;
 
 	if (PageSwapCache(p)) {
@@ -863,10 +878,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * XXX: the dirty test could be racy: set_page_dirty() may not always
 	 * be called inside page lock (it's recommended but not enforced).
 	 */
-	mapping = page_mapping(p);
-	if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
-		if (page_mkclean(p)) {
-			SetPageDirty(p);
+	mapping = page_mapping(hpage);
+	if (!PageDirty(hpage) && mapping &&
+	    mapping_cap_writeback_dirty(mapping)) {
+		if (page_mkclean(hpage)) {
+			SetPageDirty(hpage);
 		} else {
 			kill = 0;
 			ttu |= TTU_IGNORE_HWPOISON;
@@ -885,14 +901,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * there's nothing that can be done.
 	 */
 	if (kill)
-		collect_procs(p, &tokill);
+		collect_procs(hpage, &tokill);
 
 	/*
 	 * try_to_unmap can fail temporarily due to races.
 	 * Try a few times (RED-PEN better strategy?)
 	 */
 	for (i = 0; i < N_UNMAP_TRIES; i++) {
-		ret = try_to_unmap(p, ttu);
+		ret = try_to_unmap(hpage, ttu);
 		if (ret == SWAP_SUCCESS)
 			break;
 		pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
@@ -900,7 +916,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 
 	if (ret != SWAP_SUCCESS)
 		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-				pfn, page_mapcount(p));
+				pfn, page_mapcount(hpage));
 
 	/*
 	 * Now that the dirty bit has been propagated to the
@@ -911,17 +927,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * use a more force-full uncatchable kill to prevent
 	 * any accesses to the poisoned memory.
 	 */
-	kill_procs_ao(&tokill, !!PageDirty(p), trapno,
+	kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
 		      ret != SWAP_SUCCESS, pfn);
 
 	return ret;
 }
 
+static void set_page_hwpoison_huge_page(struct page *hpage)
+{
+	int i;
+	int nr_pages = 1 << compound_order(hpage);
+	for (i = 0; i < nr_pages; i++)
+		SetPageHWPoison(hpage + i);
+}
+
+static void clear_page_hwpoison_huge_page(struct page *hpage)
+{
+	int i;
+	int nr_pages = 1 << compound_order(hpage);
+	for (i = 0; i < nr_pages; i++)
+		ClearPageHWPoison(hpage + i);
+}
+
 int __memory_failure(unsigned long pfn, int trapno, int flags)
 {
 	struct page_state *ps;
 	struct page *p;
+	struct page *hpage;
 	int res;
+	unsigned int nr_pages;
 
 	if (!sysctl_memory_failure_recovery)
 		panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -934,12 +968,14 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	}
 
 	p = pfn_to_page(pfn);
+	hpage = compound_head(p);
 	if (TestSetPageHWPoison(p)) {
 		printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
 		return 0;
 	}
 
-	atomic_long_add(1, &mce_bad_pages);
+	nr_pages = 1 << compound_order(hpage);
+	atomic_long_add(nr_pages, &mce_bad_pages);
 
 	/*
 	 * We need/can do nothing about count=0 pages.
@@ -953,7 +989,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
 	 */
 	if (!(flags & MF_COUNT_INCREASED) &&
-		!get_page_unless_zero(compound_head(p))) {
+		!get_page_unless_zero(hpage)) {
 		if (is_free_buddy_page(p)) {
 			action_result(pfn, "free buddy", DELAYED);
 			return 0;
@@ -971,9 +1007,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * The check (unnecessarily) ignores LRU pages being isolated and
 	 * walked by the page reclaim code, however that's not a big loss.
 	 */
-	if (!PageLRU(p))
+	if (!PageLRU(p) && !PageHuge(p))
 		shake_page(p, 0);
-	if (!PageLRU(p)) {
+	if (!PageLRU(p) && !PageHuge(p)) {
 		/*
 		 * shake_page could have turned it free.
 		 */
@@ -991,7 +1027,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * It's very difficult to mess with pages currently under IO
 	 * and in many cases impossible, so we just avoid it here.
 	 */
-	lock_page_nosync(p);
+	lock_page_nosync(hpage);
 
 	/*
 	 * unpoison always clear PG_hwpoison inside page lock
@@ -1003,12 +1039,32 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	}
 	if (hwpoison_filter(p)) {
 		if (TestClearPageHWPoison(p))
-			atomic_long_dec(&mce_bad_pages);
-		unlock_page(p);
-		put_page(p);
+			atomic_long_sub(nr_pages, &mce_bad_pages);
+		unlock_page(hpage);
+		put_page(hpage);
 		return 0;
 	}
 
+	/*
+	 * For error on the tail page, we should set PG_hwpoison
+	 * on the head page to show that the hugepage is hwpoisoned
+	 */
+	if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+		action_result(pfn, "hugepage already hardware poisoned",
+				IGNORED);
+		unlock_page(hpage);
+		put_page(hpage);
+		return 0;
+	}
+	/*
+	 * Set PG_hwpoison on all pages in an error hugepage,
+	 * because containment is done in hugepage unit for now.
+	 * Since we have done TestSetPageHWPoison() for the head page with
+	 * page lock held, we can safely set PG_hwpoison bits on tail pages.
+	 */
+	if (PageHuge(p))
+		set_page_hwpoison_huge_page(hpage);
+
 	wait_on_page_writeback(p);
 
 	/*
@@ -1038,7 +1094,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 		}
 	}
 out:
-	unlock_page(p);
+	unlock_page(hpage);
 	return res;
 }
 EXPORT_SYMBOL_GPL(__memory_failure);
@@ -1082,6 +1138,7 @@ int unpoison_memory(unsigned long pfn)
 	struct page *page;
 	struct page *p;
 	int freeit = 0;
+	unsigned int nr_pages;
 
 	if (!pfn_valid(pfn))
 		return -ENXIO;
@@ -1094,9 +1151,11 @@ int unpoison_memory(unsigned long pfn)
 		return 0;
 	}
 
+	nr_pages = 1 << compound_order(page);
+
 	if (!get_page_unless_zero(page)) {
 		if (TestClearPageHWPoison(p))
-			atomic_long_dec(&mce_bad_pages);
+			atomic_long_sub(nr_pages, &mce_bad_pages);
 		pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
 		return 0;
 	}
@@ -1108,11 +1167,13 @@ int unpoison_memory(unsigned long pfn)
 	 * the PG_hwpoison page will be caught and isolated on the entrance to
 	 * the free buddy page pool.
 	 */
-	if (TestClearPageHWPoison(p)) {
+	if (TestClearPageHWPoison(page)) {
 		pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
-		atomic_long_dec(&mce_bad_pages);
+		atomic_long_sub(nr_pages, &mce_bad_pages);
 		freeit = 1;
 	}
+	if (PageHuge(p))
+		clear_page_hwpoison_huge_page(page);
 	unlock_page(page);
 
 	put_page(page);
@@ -1296,3 +1357,35 @@ done:
 	/* keep elevated page count for bad page */
 	return ret;
 }
+
+/*
+ * The caller must hold current->mm->mmap_sem in read mode.
+ */
+int is_hwpoison_address(unsigned long addr)
+{
+	pgd_t *pgdp;
+	pud_t pud, *pudp;
+	pmd_t pmd, *pmdp;
+	pte_t pte, *ptep;
+	swp_entry_t entry;
+
+	pgdp = pgd_offset(current->mm, addr);
+	if (!pgd_present(*pgdp))
+		return 0;
+	pudp = pud_offset(pgdp, addr);
+	pud = *pudp;
+	if (!pud_present(pud) || pud_large(pud))
+		return 0;
+	pmdp = pmd_offset(pudp, addr);
+	pmd = *pmdp;
+	if (!pmd_present(pmd) || pmd_large(pmd))
+		return 0;
+	ptep = pte_offset_map(pmdp, addr);
+	pte = *ptep;
+	pte_unmap(ptep);
+	if (!is_swap_pte(pte))
+		return 0;
+	entry = pte_to_swp_entry(pte);
+	return is_hwpoison_entry(entry);
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_address);
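
For reference, one way to exercise the hugepage error-handling path added above from userspace is to inject poison into a hugetlb mapping with madvise(MADV_HWPOISON), which ends up in __memory_failure(). The following is a minimal sketch and not part of this patch; it assumes an x86_64 system with CAP_SYS_ADMIN, CONFIG_MEMORY_FAILURE enabled, 2MB hugepages reserved via /proc/sys/vm/nr_hugepages, and whether the injection succeeds on a hugetlb mapping depends on the rest of this series.

/*
 * Illustrative sketch only -- not part of the patch above.
 * Maps an anonymous hugetlb region and asks the kernel to treat its
 * first base page as hardware-poisoned via madvise(MADV_HWPOISON).
 */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB	0x40000		/* x86 value; assumption */
#endif
#ifndef MADV_HWPOISON
#define MADV_HWPOISON	100		/* from asm-generic/mman-common.h */
#endif

int main(void)
{
	size_t len = 2UL << 20;		/* one 2MB hugepage (x86_64 default) */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	p[0] = 1;			/* fault the hugepage in */

	/* Inject a software-simulated hardware error on the first base page. */
	if (madvise(p, getpagesize(), MADV_HWPOISON) != 0) {
		perror("madvise(MADV_HWPOISON)");
		return 1;
	}
	printf("poison injected at %p\n", p);
	return 0;
}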