aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memory-failure.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r--mm/memory-failure.c120
1 files changed, 90 insertions, 30 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6b44e52cacaa..9c26eeca1342 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -46,6 +46,7 @@
46#include <linux/suspend.h> 46#include <linux/suspend.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/swapops.h> 48#include <linux/swapops.h>
49#include <linux/hugetlb.h>
49#include "internal.h" 50#include "internal.h"
50 51
51int sysctl_memory_failure_early_kill __read_mostly = 0; 52int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -690,17 +691,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
690/* 691/*
691 * Huge pages. Needs work. 692 * Huge pages. Needs work.
692 * Issues: 693 * Issues:
693 * No rmap support so we cannot find the original mapper. In theory could walk 694 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
694 * all MMs and look for the mappings, but that would be non atomic and racy. 695 * To narrow down kill region to one page, we need to break up pmd.
695 * Need rmap for hugepages for this. Alternatively we could employ a heuristic, 696 * - To support soft-offlining for hugepage, we need to support hugepage
696 * like just walking the current process and hoping it has it mapped (that 697 * migration.
697 * should be usually true for the common "shared database cache" case)
698 * Should handle free huge pages and dequeue them too, but this needs to
699 * handle huge page accounting correctly.
700 */ 698 */
701static int me_huge_page(struct page *p, unsigned long pfn) 699static int me_huge_page(struct page *p, unsigned long pfn)
702{ 700{
703 return FAILED; 701 struct page *hpage = compound_head(p);
702 /*
703 * We can safely recover from error on free or reserved (i.e.
704 * not in-use) hugepage by dequeuing it from freelist.
705 * To check whether a hugepage is in-use or not, we can't use
706 * page->lru because it can be used in other hugepage operations,
707 * such as __unmap_hugepage_range() and gather_surplus_pages().
708 * So instead we use page_mapping() and PageAnon().
709 * We assume that this function is called with page lock held,
710 * so there is no race between isolation and mapping/unmapping.
711 */
712 if (!(page_mapping(hpage) || PageAnon(hpage))) {
713 __isolate_hwpoisoned_huge_page(hpage);
714 return RECOVERED;
715 }
716 return DELAYED;
704} 717}
705 718
706/* 719/*
@@ -838,6 +851,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
838 int ret; 851 int ret;
839 int i; 852 int i;
840 int kill = 1; 853 int kill = 1;
854 struct page *hpage = compound_head(p);
841 855
842 if (PageReserved(p) || PageSlab(p)) 856 if (PageReserved(p) || PageSlab(p))
843 return SWAP_SUCCESS; 857 return SWAP_SUCCESS;
@@ -846,10 +860,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
846 * This check implies we don't kill processes if their pages 860 * This check implies we don't kill processes if their pages
847 * are in the swap cache early. Those are always late kills. 861 * are in the swap cache early. Those are always late kills.
848 */ 862 */
849 if (!page_mapped(p)) 863 if (!page_mapped(hpage))
850 return SWAP_SUCCESS; 864 return SWAP_SUCCESS;
851 865
852 if (PageCompound(p) || PageKsm(p)) 866 if (PageKsm(p))
853 return SWAP_FAIL; 867 return SWAP_FAIL;
854 868
855 if (PageSwapCache(p)) { 869 if (PageSwapCache(p)) {
@@ -864,10 +878,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
864 * XXX: the dirty test could be racy: set_page_dirty() may not always 878 * XXX: the dirty test could be racy: set_page_dirty() may not always
865 * be called inside page lock (it's recommended but not enforced). 879 * be called inside page lock (it's recommended but not enforced).
866 */ 880 */
867 mapping = page_mapping(p); 881 mapping = page_mapping(hpage);
868 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { 882 if (!PageDirty(hpage) && mapping &&
869 if (page_mkclean(p)) { 883 mapping_cap_writeback_dirty(mapping)) {
870 SetPageDirty(p); 884 if (page_mkclean(hpage)) {
885 SetPageDirty(hpage);
871 } else { 886 } else {
872 kill = 0; 887 kill = 0;
873 ttu |= TTU_IGNORE_HWPOISON; 888 ttu |= TTU_IGNORE_HWPOISON;
@@ -886,14 +901,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
886 * there's nothing that can be done. 901 * there's nothing that can be done.
887 */ 902 */
888 if (kill) 903 if (kill)
889 collect_procs(p, &tokill); 904 collect_procs(hpage, &tokill);
890 905
891 /* 906 /*
892 * try_to_unmap can fail temporarily due to races. 907 * try_to_unmap can fail temporarily due to races.
893 * Try a few times (RED-PEN better strategy?) 908 * Try a few times (RED-PEN better strategy?)
894 */ 909 */
895 for (i = 0; i < N_UNMAP_TRIES; i++) { 910 for (i = 0; i < N_UNMAP_TRIES; i++) {
896 ret = try_to_unmap(p, ttu); 911 ret = try_to_unmap(hpage, ttu);
897 if (ret == SWAP_SUCCESS) 912 if (ret == SWAP_SUCCESS)
898 break; 913 break;
899 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); 914 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
@@ -901,7 +916,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
901 916
902 if (ret != SWAP_SUCCESS) 917 if (ret != SWAP_SUCCESS)
903 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 918 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
904 pfn, page_mapcount(p)); 919 pfn, page_mapcount(hpage));
905 920
906 /* 921 /*
907 * Now that the dirty bit has been propagated to the 922 * Now that the dirty bit has been propagated to the
@@ -912,17 +927,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
912 * use a more force-full uncatchable kill to prevent 927 * use a more force-full uncatchable kill to prevent
913 * any accesses to the poisoned memory. 928 * any accesses to the poisoned memory.
914 */ 929 */
915 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 930 kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
916 ret != SWAP_SUCCESS, pfn); 931 ret != SWAP_SUCCESS, pfn);
917 932
918 return ret; 933 return ret;
919} 934}
920 935
936static void set_page_hwpoison_huge_page(struct page *hpage)
937{
938 int i;
939 int nr_pages = 1 << compound_order(hpage);
940 for (i = 0; i < nr_pages; i++)
941 SetPageHWPoison(hpage + i);
942}
943
944static void clear_page_hwpoison_huge_page(struct page *hpage)
945{
946 int i;
947 int nr_pages = 1 << compound_order(hpage);
948 for (i = 0; i < nr_pages; i++)
949 ClearPageHWPoison(hpage + i);
950}
951
921int __memory_failure(unsigned long pfn, int trapno, int flags) 952int __memory_failure(unsigned long pfn, int trapno, int flags)
922{ 953{
923 struct page_state *ps; 954 struct page_state *ps;
924 struct page *p; 955 struct page *p;
956 struct page *hpage;
925 int res; 957 int res;
958 unsigned int nr_pages;
926 959
927 if (!sysctl_memory_failure_recovery) 960 if (!sysctl_memory_failure_recovery)
928 panic("Memory failure from trap %d on page %lx", trapno, pfn); 961 panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -935,12 +968,14 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
935 } 968 }
936 969
937 p = pfn_to_page(pfn); 970 p = pfn_to_page(pfn);
971 hpage = compound_head(p);
938 if (TestSetPageHWPoison(p)) { 972 if (TestSetPageHWPoison(p)) {
939 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); 973 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
940 return 0; 974 return 0;
941 } 975 }
942 976
943 atomic_long_add(1, &mce_bad_pages); 977 nr_pages = 1 << compound_order(hpage);
978 atomic_long_add(nr_pages, &mce_bad_pages);
944 979
945 /* 980 /*
946 * We need/can do nothing about count=0 pages. 981 * We need/can do nothing about count=0 pages.
@@ -954,7 +989,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
954 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 989 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
955 */ 990 */
956 if (!(flags & MF_COUNT_INCREASED) && 991 if (!(flags & MF_COUNT_INCREASED) &&
957 !get_page_unless_zero(compound_head(p))) { 992 !get_page_unless_zero(hpage)) {
958 if (is_free_buddy_page(p)) { 993 if (is_free_buddy_page(p)) {
959 action_result(pfn, "free buddy", DELAYED); 994 action_result(pfn, "free buddy", DELAYED);
960 return 0; 995 return 0;
@@ -972,9 +1007,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
972 * The check (unnecessarily) ignores LRU pages being isolated and 1007 * The check (unnecessarily) ignores LRU pages being isolated and
973 * walked by the page reclaim code, however that's not a big loss. 1008 * walked by the page reclaim code, however that's not a big loss.
974 */ 1009 */
975 if (!PageLRU(p)) 1010 if (!PageLRU(p) && !PageHuge(p))
976 shake_page(p, 0); 1011 shake_page(p, 0);
977 if (!PageLRU(p)) { 1012 if (!PageLRU(p) && !PageHuge(p)) {
978 /* 1013 /*
979 * shake_page could have turned it free. 1014 * shake_page could have turned it free.
980 */ 1015 */
@@ -992,7 +1027,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
992 * It's very difficult to mess with pages currently under IO 1027 * It's very difficult to mess with pages currently under IO
993 * and in many cases impossible, so we just avoid it here. 1028 * and in many cases impossible, so we just avoid it here.
994 */ 1029 */
995 lock_page_nosync(p); 1030 lock_page_nosync(hpage);
996 1031
997 /* 1032 /*
998 * unpoison always clear PG_hwpoison inside page lock 1033 * unpoison always clear PG_hwpoison inside page lock
@@ -1004,11 +1039,31 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1004 } 1039 }
1005 if (hwpoison_filter(p)) { 1040 if (hwpoison_filter(p)) {
1006 if (TestClearPageHWPoison(p)) 1041 if (TestClearPageHWPoison(p))
1007 atomic_long_dec(&mce_bad_pages); 1042 atomic_long_sub(nr_pages, &mce_bad_pages);
1008 unlock_page(p); 1043 unlock_page(hpage);
1009 put_page(p); 1044 put_page(hpage);
1045 return 0;
1046 }
1047
1048 /*
1049 * For error on the tail page, we should set PG_hwpoison
1050 * on the head page to show that the hugepage is hwpoisoned
1051 */
1052 if (PageTail(p) && TestSetPageHWPoison(hpage)) {
1053 action_result(pfn, "hugepage already hardware poisoned",
1054 IGNORED);
1055 unlock_page(hpage);
1056 put_page(hpage);
1010 return 0; 1057 return 0;
1011 } 1058 }
1059 /*
1060 * Set PG_hwpoison on all pages in an error hugepage,
1061 * because containment is done in hugepage unit for now.
1062 * Since we have done TestSetPageHWPoison() for the head page with
1063 * page lock held, we can safely set PG_hwpoison bits on tail pages.
1064 */
1065 if (PageHuge(p))
1066 set_page_hwpoison_huge_page(hpage);
1012 1067
1013 wait_on_page_writeback(p); 1068 wait_on_page_writeback(p);
1014 1069
@@ -1039,7 +1094,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1039 } 1094 }
1040 } 1095 }
1041out: 1096out:
1042 unlock_page(p); 1097 unlock_page(hpage);
1043 return res; 1098 return res;
1044} 1099}
1045EXPORT_SYMBOL_GPL(__memory_failure); 1100EXPORT_SYMBOL_GPL(__memory_failure);
@@ -1083,6 +1138,7 @@ int unpoison_memory(unsigned long pfn)
1083 struct page *page; 1138 struct page *page;
1084 struct page *p; 1139 struct page *p;
1085 int freeit = 0; 1140 int freeit = 0;
1141 unsigned int nr_pages;
1086 1142
1087 if (!pfn_valid(pfn)) 1143 if (!pfn_valid(pfn))
1088 return -ENXIO; 1144 return -ENXIO;
@@ -1095,9 +1151,11 @@ int unpoison_memory(unsigned long pfn)
1095 return 0; 1151 return 0;
1096 } 1152 }
1097 1153
1154 nr_pages = 1 << compound_order(page);
1155
1098 if (!get_page_unless_zero(page)) { 1156 if (!get_page_unless_zero(page)) {
1099 if (TestClearPageHWPoison(p)) 1157 if (TestClearPageHWPoison(p))
1100 atomic_long_dec(&mce_bad_pages); 1158 atomic_long_sub(nr_pages, &mce_bad_pages);
1101 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 1159 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
1102 return 0; 1160 return 0;
1103 } 1161 }
@@ -1109,11 +1167,13 @@ int unpoison_memory(unsigned long pfn)
1109 * the PG_hwpoison page will be caught and isolated on the entrance to 1167 * the PG_hwpoison page will be caught and isolated on the entrance to
1110 * the free buddy page pool. 1168 * the free buddy page pool.
1111 */ 1169 */
1112 if (TestClearPageHWPoison(p)) { 1170 if (TestClearPageHWPoison(page)) {
1113 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 1171 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
1114 atomic_long_dec(&mce_bad_pages); 1172 atomic_long_sub(nr_pages, &mce_bad_pages);
1115 freeit = 1; 1173 freeit = 1;
1116 } 1174 }
1175 if (PageHuge(p))
1176 clear_page_hwpoison_huge_page(page);
1117 unlock_page(page); 1177 unlock_page(page);
1118 1178
1119 put_page(page); 1179 put_page(page);