diff options
Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r-- | mm/memory-failure.c | 120 |
1 files changed, 90 insertions, 30 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6b44e52cacaa..9c26eeca1342 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -46,6 +46,7 @@ | |||
46 | #include <linux/suspend.h> | 46 | #include <linux/suspend.h> |
47 | #include <linux/slab.h> | 47 | #include <linux/slab.h> |
48 | #include <linux/swapops.h> | 48 | #include <linux/swapops.h> |
49 | #include <linux/hugetlb.h> | ||
49 | #include "internal.h" | 50 | #include "internal.h" |
50 | 51 | ||
51 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 52 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
@@ -690,17 +691,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) | |||
690 | /* | 691 | /* |
691 | * Huge pages. Needs work. | 692 | * Huge pages. Needs work. |
692 | * Issues: | 693 | * Issues: |
693 | * No rmap support so we cannot find the original mapper. In theory could walk | 694 | * - Error on hugepage is contained in hugepage unit (not in raw page unit.) |
694 | * all MMs and look for the mappings, but that would be non atomic and racy. | 695 | * To narrow down kill region to one page, we need to break up pmd. |
695 | * Need rmap for hugepages for this. Alternatively we could employ a heuristic, | 696 | * - To support soft-offlining for hugepage, we need to support hugepage |
696 | * like just walking the current process and hoping it has it mapped (that | 697 | * migration. |
697 | * should be usually true for the common "shared database cache" case) | ||
698 | * Should handle free huge pages and dequeue them too, but this needs to | ||
699 | * handle huge page accounting correctly. | ||
700 | */ | 698 | */ |
/*
 * me_huge_page - handler for a memory error that hit a huge page.
 *
 * Reading this side-by-side hunk: the left column is the old body, the
 * right column (and the single-column rows) is the new body.
 *
 * Old body: returned FAILED unconditionally.
 * New body: looks only at hpage = compound_head(p). When hpage has no
 * page_mapping() and is not PageAnon(), it is handed to
 * __isolate_hwpoisoned_huge_page() and RECOVERED is returned; in every
 * other case DELAYED is returned.
 *
 * p:   page reported with the error; only its compound head is examined.
 * pfn: not referenced by the new body.
 *
 * NOTE(review): __isolate_hwpoisoned_huge_page() is not defined anywhere
 * in this diff; presumably it is declared by the newly included
 * <linux/hugetlb.h> -- confirm.
 */
701 | static int me_huge_page(struct page *p, unsigned long pfn) | 699 | static int me_huge_page(struct page *p, unsigned long pfn) |
702 | { | 700 | { |
703 | return FAILED; | 701 | struct page *hpage = compound_head(p); |
702 | /* | ||
703 | * We can safely recover from error on free or reserved (i.e. | ||
704 | * not in-use) hugepage by dequeuing it from freelist. | ||
705 | * To check whether a hugepage is in-use or not, we can't use | ||
706 | * page->lru because it can be used in other hugepage operations, | ||
707 | * such as __unmap_hugepage_range() and gather_surplus_pages(). | ||
708 | * So instead we use page_mapping() and PageAnon(). | ||
709 | * We assume that this function is called with page lock held, | ||
710 | * so there is no race between isolation and mapping/unmapping. | ||
711 | */ | ||
712 | if (!(page_mapping(hpage) || PageAnon(hpage))) { | ||
713 | __isolate_hwpoisoned_huge_page(hpage); | ||
714 | return RECOVERED; | ||
715 | } | ||
716 | return DELAYED; | ||
704 | } | 717 | } |
705 | 718 | ||
706 | /* | 719 | /* |
@@ -838,6 +851,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
838 | int ret; | 851 | int ret; |
839 | int i; | 852 | int i; |
840 | int kill = 1; | 853 | int kill = 1; |
854 | struct page *hpage = compound_head(p); | ||
841 | 855 | ||
842 | if (PageReserved(p) || PageSlab(p)) | 856 | if (PageReserved(p) || PageSlab(p)) |
843 | return SWAP_SUCCESS; | 857 | return SWAP_SUCCESS; |
@@ -846,10 +860,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
846 | * This check implies we don't kill processes if their pages | 860 | * This check implies we don't kill processes if their pages |
847 | * are in the swap cache early. Those are always late kills. | 861 | * are in the swap cache early. Those are always late kills. |
848 | */ | 862 | */ |
849 | if (!page_mapped(p)) | 863 | if (!page_mapped(hpage)) |
850 | return SWAP_SUCCESS; | 864 | return SWAP_SUCCESS; |
851 | 865 | ||
852 | if (PageCompound(p) || PageKsm(p)) | 866 | if (PageKsm(p)) |
853 | return SWAP_FAIL; | 867 | return SWAP_FAIL; |
854 | 868 | ||
855 | if (PageSwapCache(p)) { | 869 | if (PageSwapCache(p)) { |
@@ -864,10 +878,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
864 | * XXX: the dirty test could be racy: set_page_dirty() may not always | 878 | * XXX: the dirty test could be racy: set_page_dirty() may not always |
865 | * be called inside page lock (it's recommended but not enforced). | 879 | * be called inside page lock (it's recommended but not enforced). |
866 | */ | 880 | */ |
867 | mapping = page_mapping(p); | 881 | mapping = page_mapping(hpage); |
868 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { | 882 | if (!PageDirty(hpage) && mapping && |
869 | if (page_mkclean(p)) { | 883 | mapping_cap_writeback_dirty(mapping)) { |
870 | SetPageDirty(p); | 884 | if (page_mkclean(hpage)) { |
885 | SetPageDirty(hpage); | ||
871 | } else { | 886 | } else { |
872 | kill = 0; | 887 | kill = 0; |
873 | ttu |= TTU_IGNORE_HWPOISON; | 888 | ttu |= TTU_IGNORE_HWPOISON; |
@@ -886,14 +901,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
886 | * there's nothing that can be done. | 901 | * there's nothing that can be done. |
887 | */ | 902 | */ |
888 | if (kill) | 903 | if (kill) |
889 | collect_procs(p, &tokill); | 904 | collect_procs(hpage, &tokill); |
890 | 905 | ||
891 | /* | 906 | /* |
892 | * try_to_unmap can fail temporarily due to races. | 907 | * try_to_unmap can fail temporarily due to races. |
893 | * Try a few times (RED-PEN better strategy?) | 908 | * Try a few times (RED-PEN better strategy?) |
894 | */ | 909 | */ |
895 | for (i = 0; i < N_UNMAP_TRIES; i++) { | 910 | for (i = 0; i < N_UNMAP_TRIES; i++) { |
896 | ret = try_to_unmap(p, ttu); | 911 | ret = try_to_unmap(hpage, ttu); |
897 | if (ret == SWAP_SUCCESS) | 912 | if (ret == SWAP_SUCCESS) |
898 | break; | 913 | break; |
899 | pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); | 914 | pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); |
@@ -901,7 +916,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
901 | 916 | ||
902 | if (ret != SWAP_SUCCESS) | 917 | if (ret != SWAP_SUCCESS) |
903 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", | 918 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", |
904 | pfn, page_mapcount(p)); | 919 | pfn, page_mapcount(hpage)); |
905 | 920 | ||
906 | /* | 921 | /* |
907 | * Now that the dirty bit has been propagated to the | 922 | * Now that the dirty bit has been propagated to the |
@@ -912,17 +927,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
912 | * use a more force-full uncatchable kill to prevent | 927 | * use a more force-full uncatchable kill to prevent |
913 | * any accesses to the poisoned memory. | 928 | * any accesses to the poisoned memory. |
914 | */ | 929 | */ |
915 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, | 930 | kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, |
916 | ret != SWAP_SUCCESS, pfn); | 931 | ret != SWAP_SUCCESS, pfn); |
917 | 932 | ||
918 | return ret; | 933 | return ret; |
919 | } | 934 | } |
920 | 935 | ||
/*
 * set_page_hwpoison_huge_page - mark every page of a huge page poisoned.
 *
 * hpage: first page of the range. The loop calls SetPageHWPoison() on
 *        hpage + 0 through hpage + (1 << compound_order(hpage)) - 1, so
 *        the page at hpage itself is included.
 *
 * NOTE(review): presumably hpage must be the compound head. The only
 * call site visible in this diff (__memory_failure) passes
 * compound_head(p) and runs after lock_page_nosync(hpage) -- confirm
 * there are no other callers.
 */
936 | static void set_page_hwpoison_huge_page(struct page *hpage) | ||
937 | { | ||
938 | int i; | ||
939 | int nr_pages = 1 << compound_order(hpage); | ||
940 | for (i = 0; i < nr_pages; i++) | ||
941 | SetPageHWPoison(hpage + i); | ||
942 | } | ||
943 | |||
/*
 * clear_page_hwpoison_huge_page - undo set_page_hwpoison_huge_page().
 *
 * hpage: first page of the range. The loop calls ClearPageHWPoison() on
 *        hpage + 0 through hpage + (1 << compound_order(hpage)) - 1, so
 *        the page at hpage itself is included.
 *
 * NOTE(review): the only call site visible in this diff is in
 * unpoison_memory(), guarded by PageHuge(p) and placed just before
 * unlock_page(page). The assignment of `page` lies outside the visible
 * hunks -- confirm that it is the compound head.
 */
944 | static void clear_page_hwpoison_huge_page(struct page *hpage) | ||
945 | { | ||
946 | int i; | ||
947 | int nr_pages = 1 << compound_order(hpage); | ||
948 | for (i = 0; i < nr_pages; i++) | ||
949 | ClearPageHWPoison(hpage + i); | ||
950 | } | ||
951 | |||
921 | int __memory_failure(unsigned long pfn, int trapno, int flags) | 952 | int __memory_failure(unsigned long pfn, int trapno, int flags) |
922 | { | 953 | { |
923 | struct page_state *ps; | 954 | struct page_state *ps; |
924 | struct page *p; | 955 | struct page *p; |
956 | struct page *hpage; | ||
925 | int res; | 957 | int res; |
958 | unsigned int nr_pages; | ||
926 | 959 | ||
927 | if (!sysctl_memory_failure_recovery) | 960 | if (!sysctl_memory_failure_recovery) |
928 | panic("Memory failure from trap %d on page %lx", trapno, pfn); | 961 | panic("Memory failure from trap %d on page %lx", trapno, pfn); |
@@ -935,12 +968,14 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
935 | } | 968 | } |
936 | 969 | ||
937 | p = pfn_to_page(pfn); | 970 | p = pfn_to_page(pfn); |
971 | hpage = compound_head(p); | ||
938 | if (TestSetPageHWPoison(p)) { | 972 | if (TestSetPageHWPoison(p)) { |
939 | printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); | 973 | printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); |
940 | return 0; | 974 | return 0; |
941 | } | 975 | } |
942 | 976 | ||
943 | atomic_long_add(1, &mce_bad_pages); | 977 | nr_pages = 1 << compound_order(hpage); |
978 | atomic_long_add(nr_pages, &mce_bad_pages); | ||
944 | 979 | ||
945 | /* | 980 | /* |
946 | * We need/can do nothing about count=0 pages. | 981 | * We need/can do nothing about count=0 pages. |
@@ -954,7 +989,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
954 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. | 989 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. |
955 | */ | 990 | */ |
956 | if (!(flags & MF_COUNT_INCREASED) && | 991 | if (!(flags & MF_COUNT_INCREASED) && |
957 | !get_page_unless_zero(compound_head(p))) { | 992 | !get_page_unless_zero(hpage)) { |
958 | if (is_free_buddy_page(p)) { | 993 | if (is_free_buddy_page(p)) { |
959 | action_result(pfn, "free buddy", DELAYED); | 994 | action_result(pfn, "free buddy", DELAYED); |
960 | return 0; | 995 | return 0; |
@@ -972,9 +1007,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
972 | * The check (unnecessarily) ignores LRU pages being isolated and | 1007 | * The check (unnecessarily) ignores LRU pages being isolated and |
973 | * walked by the page reclaim code, however that's not a big loss. | 1008 | * walked by the page reclaim code, however that's not a big loss. |
974 | */ | 1009 | */ |
975 | if (!PageLRU(p)) | 1010 | if (!PageLRU(p) && !PageHuge(p)) |
976 | shake_page(p, 0); | 1011 | shake_page(p, 0); |
977 | if (!PageLRU(p)) { | 1012 | if (!PageLRU(p) && !PageHuge(p)) { |
978 | /* | 1013 | /* |
979 | * shake_page could have turned it free. | 1014 | * shake_page could have turned it free. |
980 | */ | 1015 | */ |
@@ -992,7 +1027,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
992 | * It's very difficult to mess with pages currently under IO | 1027 | * It's very difficult to mess with pages currently under IO |
993 | * and in many cases impossible, so we just avoid it here. | 1028 | * and in many cases impossible, so we just avoid it here. |
994 | */ | 1029 | */ |
995 | lock_page_nosync(p); | 1030 | lock_page_nosync(hpage); |
996 | 1031 | ||
997 | /* | 1032 | /* |
998 | * unpoison always clear PG_hwpoison inside page lock | 1033 | * unpoison always clear PG_hwpoison inside page lock |
@@ -1004,11 +1039,31 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
1004 | } | 1039 | } |
1005 | if (hwpoison_filter(p)) { | 1040 | if (hwpoison_filter(p)) { |
1006 | if (TestClearPageHWPoison(p)) | 1041 | if (TestClearPageHWPoison(p)) |
1007 | atomic_long_dec(&mce_bad_pages); | 1042 | atomic_long_sub(nr_pages, &mce_bad_pages); |
1008 | unlock_page(p); | 1043 | unlock_page(hpage); |
1009 | put_page(p); | 1044 | put_page(hpage); |
1045 | return 0; | ||
1046 | } | ||
1047 | |||
1048 | /* | ||
1049 | * For error on the tail page, we should set PG_hwpoison | ||
1050 | * on the head page to show that the hugepage is hwpoisoned | ||
1051 | */ | ||
1052 | if (PageTail(p) && TestSetPageHWPoison(hpage)) { | ||
1053 | action_result(pfn, "hugepage already hardware poisoned", | ||
1054 | IGNORED); | ||
1055 | unlock_page(hpage); | ||
1056 | put_page(hpage); | ||
1010 | return 0; | 1057 | return 0; |
1011 | } | 1058 | } |
1059 | /* | ||
1060 | * Set PG_hwpoison on all pages in an error hugepage, | ||
1061 | * because containment is done in hugepage unit for now. | ||
1062 | * Since we have done TestSetPageHWPoison() for the head page with | ||
1063 | * page lock held, we can safely set PG_hwpoison bits on tail pages. | ||
1064 | */ | ||
1065 | if (PageHuge(p)) | ||
1066 | set_page_hwpoison_huge_page(hpage); | ||
1012 | 1067 | ||
1013 | wait_on_page_writeback(p); | 1068 | wait_on_page_writeback(p); |
1014 | 1069 | ||
@@ -1039,7 +1094,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
1039 | } | 1094 | } |
1040 | } | 1095 | } |
1041 | out: | 1096 | out: |
1042 | unlock_page(p); | 1097 | unlock_page(hpage); |
1043 | return res; | 1098 | return res; |
1044 | } | 1099 | } |
1045 | EXPORT_SYMBOL_GPL(__memory_failure); | 1100 | EXPORT_SYMBOL_GPL(__memory_failure); |
@@ -1083,6 +1138,7 @@ int unpoison_memory(unsigned long pfn) | |||
1083 | struct page *page; | 1138 | struct page *page; |
1084 | struct page *p; | 1139 | struct page *p; |
1085 | int freeit = 0; | 1140 | int freeit = 0; |
1141 | unsigned int nr_pages; | ||
1086 | 1142 | ||
1087 | if (!pfn_valid(pfn)) | 1143 | if (!pfn_valid(pfn)) |
1088 | return -ENXIO; | 1144 | return -ENXIO; |
@@ -1095,9 +1151,11 @@ int unpoison_memory(unsigned long pfn) | |||
1095 | return 0; | 1151 | return 0; |
1096 | } | 1152 | } |
1097 | 1153 | ||
1154 | nr_pages = 1 << compound_order(page); | ||
1155 | |||
1098 | if (!get_page_unless_zero(page)) { | 1156 | if (!get_page_unless_zero(page)) { |
1099 | if (TestClearPageHWPoison(p)) | 1157 | if (TestClearPageHWPoison(p)) |
1100 | atomic_long_dec(&mce_bad_pages); | 1158 | atomic_long_sub(nr_pages, &mce_bad_pages); |
1101 | pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); | 1159 | pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); |
1102 | return 0; | 1160 | return 0; |
1103 | } | 1161 | } |
@@ -1109,11 +1167,13 @@ int unpoison_memory(unsigned long pfn) | |||
1109 | * the PG_hwpoison page will be caught and isolated on the entrance to | 1167 | * the PG_hwpoison page will be caught and isolated on the entrance to |
1110 | * the free buddy page pool. | 1168 | * the free buddy page pool. |
1111 | */ | 1169 | */ |
1112 | if (TestClearPageHWPoison(p)) { | 1170 | if (TestClearPageHWPoison(page)) { |
1113 | pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); | 1171 | pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); |
1114 | atomic_long_dec(&mce_bad_pages); | 1172 | atomic_long_sub(nr_pages, &mce_bad_pages); |
1115 | freeit = 1; | 1173 | freeit = 1; |
1116 | } | 1174 | } |
1175 | if (PageHuge(p)) | ||
1176 | clear_page_hwpoison_huge_page(page); | ||
1117 | unlock_page(page); | 1177 | unlock_page(page); |
1118 | 1178 | ||
1119 | put_page(page); | 1179 | put_page(page); |