Diffstat (limited to 'mm/memory-failure.c'):
 mm/memory-failure.c | 153
 1 file changed, 123 insertions(+), 30 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 620b0b461593..9c26eeca1342 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -45,6 +45,8 @@
 #include <linux/page-isolation.h>
 #include <linux/suspend.h>
 #include <linux/slab.h>
+#include <linux/swapops.h>
+#include <linux/hugetlb.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -689,17 +691,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
 /*
  * Huge pages. Needs work.
  * Issues:
- * No rmap support so we cannot find the original mapper. In theory could walk
- * all MMs and look for the mappings, but that would be non atomic and racy.
- * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
- * like just walking the current process and hoping it has it mapped (that
- * should be usually true for the common "shared database cache" case)
- * Should handle free huge pages and dequeue them too, but this needs to
- * handle huge page accounting correctly.
+ * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
+ *   To narrow down kill region to one page, we need to break up pmd.
+ * - To support soft-offlining for hugepage, we need to support hugepage
+ *   migration.
  */
 static int me_huge_page(struct page *p, unsigned long pfn)
 {
-	return FAILED;
+	struct page *hpage = compound_head(p);
+	/*
+	 * We can safely recover from error on free or reserved (i.e.
+	 * not in-use) hugepage by dequeuing it from freelist.
+	 * To check whether a hugepage is in-use or not, we can't use
+	 * page->lru because it can be used in other hugepage operations,
+	 * such as __unmap_hugepage_range() and gather_surplus_pages().
+	 * So instead we use page_mapping() and PageAnon().
+	 * We assume that this function is called with page lock held,
+	 * so there is no race between isolation and mapping/unmapping.
+	 */
+	if (!(page_mapping(hpage) || PageAnon(hpage))) {
+		__isolate_hwpoisoned_huge_page(hpage);
+		return RECOVERED;
+	}
+	return DELAYED;
 }
 
 /*
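The new me_huge_page() reduces the recovery decision to one predicate: a hugepage with neither a file mapping nor anonymous backing is free or reserved and can be pulled off the freelist, while an in-use hugepage is left for delayed handling. A minimal userspace sketch of that decision table, with booleans standing in for page_mapping() and PageAnon() (names here are illustrative, not kernel API):

#include <stdbool.h>
#include <stdio.h>

enum outcome { RECOVERED, DELAYED };

/* Models the me_huge_page() check: a free/reserved hugepage has
 * neither a file mapping nor anonymous backing. */
static enum outcome huge_page_outcome(bool has_mapping, bool is_anon)
{
	if (!(has_mapping || is_anon))
		return RECOVERED;	/* dequeue from the freelist */
	return DELAYED;			/* in use: kept for later handling */
}

int main(void)
{
	printf("free hugepage: %s\n",
	       huge_page_outcome(false, false) == RECOVERED ? "RECOVERED" : "DELAYED");
	printf("file-backed:   %s\n",
	       huge_page_outcome(true, false) == RECOVERED ? "RECOVERED" : "DELAYED");
	printf("anonymous:     %s\n",
	       huge_page_outcome(false, true) == RECOVERED ? "RECOVERED" : "DELAYED");
	return 0;
}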
@@ -837,6 +851,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	int ret;
 	int i;
 	int kill = 1;
+	struct page *hpage = compound_head(p);
 
 	if (PageReserved(p) || PageSlab(p))
 		return SWAP_SUCCESS;
@@ -845,10 +860,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * This check implies we don't kill processes if their pages
 	 * are in the swap cache early. Those are always late kills.
 	 */
-	if (!page_mapped(p))
+	if (!page_mapped(hpage))
 		return SWAP_SUCCESS;
 
-	if (PageCompound(p) || PageKsm(p))
+	if (PageKsm(p))
 		return SWAP_FAIL;
 
 	if (PageSwapCache(p)) {
@@ -863,10 +878,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * XXX: the dirty test could be racy: set_page_dirty() may not always
 	 * be called inside page lock (it's recommended but not enforced).
 	 */
-	mapping = page_mapping(p);
-	if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
-		if (page_mkclean(p)) {
-			SetPageDirty(p);
+	mapping = page_mapping(hpage);
+	if (!PageDirty(hpage) && mapping &&
+	    mapping_cap_writeback_dirty(mapping)) {
+		if (page_mkclean(hpage)) {
+			SetPageDirty(hpage);
 		} else {
 			kill = 0;
 			ttu |= TTU_IGNORE_HWPOISON;
@@ -885,14 +901,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * there's nothing that can be done.
 	 */
 	if (kill)
-		collect_procs(p, &tokill);
+		collect_procs(hpage, &tokill);
 
 	/*
 	 * try_to_unmap can fail temporarily due to races.
 	 * Try a few times (RED-PEN better strategy?)
 	 */
 	for (i = 0; i < N_UNMAP_TRIES; i++) {
-		ret = try_to_unmap(p, ttu);
+		ret = try_to_unmap(hpage, ttu);
 		if (ret == SWAP_SUCCESS)
 			break;
 		pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
@@ -900,7 +916,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 
 	if (ret != SWAP_SUCCESS)
 		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-				pfn, page_mapcount(p));
+				pfn, page_mapcount(hpage));
 
 	/*
 	 * Now that the dirty bit has been propagated to the
@@ -911,17 +927,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * use a more force-full uncatchable kill to prevent
 	 * any accesses to the poisoned memory.
 	 */
-	kill_procs_ao(&tokill, !!PageDirty(p), trapno,
+	kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
 		      ret != SWAP_SUCCESS, pfn);
 
 	return ret;
 }
 
+static void set_page_hwpoison_huge_page(struct page *hpage)
+{
+	int i;
+	int nr_pages = 1 << compound_order(hpage);
+	for (i = 0; i < nr_pages; i++)
+		SetPageHWPoison(hpage + i);
+}
+
+static void clear_page_hwpoison_huge_page(struct page *hpage)
+{
+	int i;
+	int nr_pages = 1 << compound_order(hpage);
+	for (i = 0; i < nr_pages; i++)
+		ClearPageHWPoison(hpage + i);
+}
+
 int __memory_failure(unsigned long pfn, int trapno, int flags)
 {
 	struct page_state *ps;
 	struct page *p;
+	struct page *hpage;
 	int res;
+	unsigned int nr_pages;
 
 	if (!sysctl_memory_failure_recovery)
 		panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -934,12 +968,14 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	}
 
 	p = pfn_to_page(pfn);
+	hpage = compound_head(p);
 	if (TestSetPageHWPoison(p)) {
 		printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
 		return 0;
 	}
 
-	atomic_long_add(1, &mce_bad_pages);
+	nr_pages = 1 << compound_order(hpage);
+	atomic_long_add(nr_pages, &mce_bad_pages);
 
 	/*
 	 * We need/can do nothing about count=0 pages.
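Note the accounting change: a poisoned hugepage now charges all of its base pages to mce_bad_pages in one atomic add, and the hwpoison_filter() and unpoison_memory() paths subtract the same nr_pages so the counter stays balanced. A small C11 sketch of that symmetry (counter and function names are illustrative):

#include <stdatomic.h>
#include <stdio.h>

static atomic_long bad_pages;	/* models mce_bad_pages */

static void poison_hugepage(int order)
{
	/* charge the whole hugepage, not just the faulting base page */
	atomic_fetch_add(&bad_pages, 1L << order);
}

static void unpoison_hugepage(int order)
{
	/* the unpoison path subtracts the identical amount */
	atomic_fetch_sub(&bad_pages, 1L << order);
}

int main(void)
{
	poison_hugepage(9);	/* order-9 hugepage: +512 */
	printf("after poison:   %ld\n", atomic_load(&bad_pages));
	unpoison_hugepage(9);
	printf("after unpoison: %ld\n", atomic_load(&bad_pages));
	return 0;
}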
@@ -953,7 +989,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
 	 */
 	if (!(flags & MF_COUNT_INCREASED) &&
-		!get_page_unless_zero(compound_head(p))) {
+		!get_page_unless_zero(hpage)) {
 		if (is_free_buddy_page(p)) {
 			action_result(pfn, "free buddy", DELAYED);
 			return 0;
@@ -971,9 +1007,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * The check (unnecessarily) ignores LRU pages being isolated and
 	 * walked by the page reclaim code, however that's not a big loss.
 	 */
-	if (!PageLRU(p))
+	if (!PageLRU(p) && !PageHuge(p))
 		shake_page(p, 0);
-	if (!PageLRU(p)) {
+	if (!PageLRU(p) && !PageHuge(p)) {
 		/*
 		 * shake_page could have turned it free.
 		 */
@@ -991,7 +1027,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * It's very difficult to mess with pages currently under IO
 	 * and in many cases impossible, so we just avoid it here.
 	 */
-	lock_page_nosync(p);
+	lock_page_nosync(hpage);
 
 	/*
 	 * unpoison always clear PG_hwpoison inside page lock
@@ -1003,12 +1039,32 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	}
 	if (hwpoison_filter(p)) {
 		if (TestClearPageHWPoison(p))
-			atomic_long_dec(&mce_bad_pages);
-		unlock_page(p);
-		put_page(p);
+			atomic_long_sub(nr_pages, &mce_bad_pages);
+		unlock_page(hpage);
+		put_page(hpage);
 		return 0;
 	}
 
+	/*
+	 * For error on the tail page, we should set PG_hwpoison
+	 * on the head page to show that the hugepage is hwpoisoned
+	 */
+	if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+		action_result(pfn, "hugepage already hardware poisoned",
+				IGNORED);
+		unlock_page(hpage);
+		put_page(hpage);
+		return 0;
+	}
+	/*
+	 * Set PG_hwpoison on all pages in an error hugepage,
+	 * because containment is done in hugepage unit for now.
+	 * Since we have done TestSetPageHWPoison() for the head page with
+	 * page lock held, we can safely set PG_hwpoison bits on tail pages.
+	 */
+	if (PageHuge(p))
+		set_page_hwpoison_huge_page(hpage);
+
 	wait_on_page_writeback(p);
 
 	/*
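The PageTail() branch above depends on test-and-set semantics on the head page: the first error event to win TestSetPageHWPoison(hpage) goes on to poison the whole hugepage, and any later error in the same hugepage sees the bit already set and is reported as already poisoned. A userspace sketch of that gating with a C11 atomic flag (illustrative; the kernel uses its own atomic bitops under the page lock):

#include <stdatomic.h>
#include <stdio.h>

/* Models PG_hwpoison on the head page of one hugepage. */
static atomic_flag head_poisoned = ATOMIC_FLAG_INIT;

static const char *report_error(void)
{
	/* test_and_set returns the previous value: true means someone
	 * already poisoned this hugepage. */
	if (atomic_flag_test_and_set(&head_poisoned))
		return "hugepage already hardware poisoned (IGNORED)";
	return "first error: poisoning whole hugepage";
}

int main(void)
{
	printf("%s\n", report_error());	/* error on one tail page */
	printf("%s\n", report_error());	/* later error, same hugepage */
	return 0;
}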
@@ -1038,7 +1094,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 		}
 	}
 out:
-	unlock_page(p);
+	unlock_page(hpage);
 	return res;
 }
 EXPORT_SYMBOL_GPL(__memory_failure);
@@ -1082,6 +1138,7 @@ int unpoison_memory(unsigned long pfn)
 	struct page *page;
 	struct page *p;
 	int freeit = 0;
+	unsigned int nr_pages;
 
 	if (!pfn_valid(pfn))
 		return -ENXIO;
@@ -1094,9 +1151,11 @@ int unpoison_memory(unsigned long pfn)
 		return 0;
 	}
 
+	nr_pages = 1 << compound_order(page);
+
 	if (!get_page_unless_zero(page)) {
 		if (TestClearPageHWPoison(p))
-			atomic_long_dec(&mce_bad_pages);
+			atomic_long_sub(nr_pages, &mce_bad_pages);
 		pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
 		return 0;
 	}
@@ -1108,11 +1167,13 @@ int unpoison_memory(unsigned long pfn)
 	 * the PG_hwpoison page will be caught and isolated on the entrance to
 	 * the free buddy page pool.
 	 */
-	if (TestClearPageHWPoison(p)) {
+	if (TestClearPageHWPoison(page)) {
 		pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
-		atomic_long_dec(&mce_bad_pages);
+		atomic_long_sub(nr_pages, &mce_bad_pages);
 		freeit = 1;
 	}
+	if (PageHuge(p))
+		clear_page_hwpoison_huge_page(page);
 	unlock_page(page);
 
 	put_page(page);
@@ -1296,3 +1357,35 @@ done:
 	/* keep elevated page count for bad page */
 	return ret;
 }
+
+/*
+ * The caller must hold current->mm->mmap_sem in read mode.
+ */
+int is_hwpoison_address(unsigned long addr)
+{
+	pgd_t *pgdp;
+	pud_t pud, *pudp;
+	pmd_t pmd, *pmdp;
+	pte_t pte, *ptep;
+	swp_entry_t entry;
+
+	pgdp = pgd_offset(current->mm, addr);
+	if (!pgd_present(*pgdp))
+		return 0;
+	pudp = pud_offset(pgdp, addr);
+	pud = *pudp;
+	if (!pud_present(pud) || pud_large(pud))
+		return 0;
+	pmdp = pmd_offset(pudp, addr);
+	pmd = *pmdp;
+	if (!pmd_present(pmd) || pmd_large(pmd))
+		return 0;
+	ptep = pte_offset_map(pmdp, addr);
+	pte = *ptep;
+	pte_unmap(ptep);
+	if (!is_swap_pte(pte))
+		return 0;
+	entry = pte_to_swp_entry(pte);
+	return is_hwpoison_entry(entry);
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_address);
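is_hwpoison_address() is a software walk of the four-level page tables: each level bails out if the entry is not present or (for pud/pmd) maps a huge region, and only the leaf pte is decoded as a swap entry and tested for hwpoison. The control flow can be modeled in userspace roughly as follows; the entry struct and its fields are stand-ins, not the kernel's page-table types:

#include <stdbool.h>
#include <stdio.h>

/* One stand-in entry per level: pgd -> pud -> pmd -> pte. */
struct entry {
	bool present;		/* !p?d_present(): stop, not poisoned */
	bool large;		/* pud_large()/pmd_large(): stop as well */
	bool hwpoison;		/* leaf only: swap entry marks hwpoison */
	struct entry *next;	/* entry at the next lower level */
};

static int walk_is_hwpoison(struct entry *e)
{
	for (int level = 0; level < 4; level++, e = e->next) {
		if (e == NULL || !e->present)
			return 0;
		if ((level == 1 || level == 2) && e->large)
			return 0;	/* huge mapping: can't descend */
		if (level == 3)
			return e->hwpoison;
	}
	return 0;
}

int main(void)
{
	struct entry pte = { .present = true, .hwpoison = true };
	struct entry pmd = { .present = true, .next = &pte };
	struct entry pud = { .present = true, .next = &pmd };
	struct entry pgd = { .present = true, .next = &pud };

	printf("poisoned: %d\n", walk_is_hwpoison(&pgd));	/* prints 1 */
	return 0;
}

In the real function a hwpoison pte is one the kernel has already replaced with a special swap entry while unmapping the poisoned page, which is why only non-present (swap) ptes are decoded with pte_to_swp_entry() and tested with is_hwpoison_entry().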