Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r--   mm/memory-failure.c | 370
1 file changed, 233 insertions(+), 137 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 757f6b0accfe..740c4f52059c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
  * Free Software Foundation.
  *
  * High level machine check handler. Handles pages reported by the
- * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * hardware as being corrupted usually due to a multi-bit ECC memory or cache
  * failure.
+ *
+ * In addition there is a "soft offline" entry point that allows stop using
+ * not-yet-corrupted-by-suspicious pages without killing anything.
  *
  * Handles page cache pages in various states. The tricky part
- * here is that we can access any page asynchronous to other VM
- * users, because memory failures could happen anytime and anywhere,
- * possibly violating some of their assumptions. This is why this code
- * has to be extremely careful. Generally it tries to use normal locking
- * rules, as in get the standard locks, even if that means the
- * error handling takes potentially a long time.
+ * here is that we can access any page asynchronously in respect to
+ * other VM users, because memory failures could happen anytime and
+ * anywhere. This could violate some of their assumptions. This is why
+ * this code has to be extremely careful. Generally it tries to use
+ * normal locking rules, as in get the standard locks, even if that means
+ * the error handling takes potentially a long time.
  *
- * The operation to map back from RMAP chains to processes has to walk
- * the complete process list and has non linear complexity with the number
- * mappings. In short it can be quite slow. But since memory corruptions
- * are rare we hope to get away with this.
+ * There are several operations here with exponential complexity because
+ * of unsuitable VM data structures. For example the operation to map back
+ * from RMAP chains to processes has to walk the complete process list and
+ * has non linear complexity with the number. But since memory corruptions
+ * are rare we hope to get away with this. This avoids impacting the core
+ * VM.
  */
 
 /*
@@ -30,7 +35,6 @@
  * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
  * - pass bad pages to kdump next kernel
  */
-#define DEBUG 1		/* remove me in 2.6.34 */
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/page-flags.h>
@@ -47,6 +51,8 @@
 #include <linux/slab.h>
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm_inline.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -78,7 +84,7 @@ static int hwpoison_filter_dev(struct page *p)
                return 0;
 
        /*
-        * page_mapping() does not accept slab page
+        * page_mapping() does not accept slab pages.
         */
        if (PageSlab(p))
                return -EINVAL;
@@ -198,12 +204,12 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
 #ifdef __ARCH_SI_TRAPNO
        si.si_trapno = trapno;
 #endif
-       si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
+       si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
        /*
         * Don't use force here, it's convenient if the signal
         * can be temporarily blocked.
         * This could cause a loop when the user sets SIGBUS
-        * to SIG_IGN, but hopefully noone will do that?
+        * to SIG_IGN, but hopefully no one will do that?
         */
        ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */
        if (ret < 0)
@@ -228,13 +234,17 @@ void shake_page(struct page *p, int access)
        }
 
        /*
-        * Only all shrink_slab here (which would also
-        * shrink other caches) if access is not potentially fatal.
+        * Only call shrink_slab here (which would also shrink other caches) if
+        * access is not potentially fatal.
         */
        if (access) {
                int nr;
                do {
-                       nr = shrink_slab(1000, GFP_KERNEL, 1000);
+                       struct shrink_control shrink = {
+                               .gfp_mask = GFP_KERNEL,
+                       };
+
+                       nr = shrink_slab(&shrink, 1000, 1000);
                        if (page_count(p) == 1)
                                break;
                } while (nr > 10);
@@ -268,7 +278,7 @@ struct to_kill {
        struct list_head nd;
        struct task_struct *tsk;
        unsigned long addr;
-       unsigned addr_valid:1;
+       char addr_valid;
 };
 
 /*
@@ -309,7 +319,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
         * a SIGKILL because the error is not contained anymore.
         */
        if (tk->addr == -EFAULT) {
-               pr_debug("MCE: Unable to find user space address %lx in %s\n",
+               pr_info("MCE: Unable to find user space address %lx in %s\n",
                        page_to_pfn(p), tsk->comm);
                tk->addr_valid = 0;
        }
@@ -381,10 +391,11 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
        struct task_struct *tsk;
        struct anon_vma *av;
 
-       read_lock(&tasklist_lock);
        av = page_lock_anon_vma(page);
        if (av == NULL) /* Not actually mapped anymore */
-               goto out;
+               return;
+
+       read_lock(&tasklist_lock);
        for_each_process (tsk) {
                struct anon_vma_chain *vmac;
 
@@ -398,9 +409,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
                                add_to_kill(tsk, page, vma, to_kill, tkc);
                }
        }
-       page_unlock_anon_vma(av);
-out:
        read_unlock(&tasklist_lock);
+       page_unlock_anon_vma(av);
 }
 
 /*
@@ -414,17 +424,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
        struct prio_tree_iter iter;
        struct address_space *mapping = page->mapping;
 
-       /*
-        * A note on the locking order between the two locks.
-        * We don't rely on this particular order.
-        * If you have some other code that needs a different order
-        * feel free to switch them around. Or add a reverse link
-        * from mm_struct to task_struct, then this could be all
-        * done without taking tasklist_lock and looping over all tasks.
-        */
-
+       mutex_lock(&mapping->i_mmap_mutex);
        read_lock(&tasklist_lock);
-       spin_lock(&mapping->i_mmap_lock);
        for_each_process(tsk) {
                pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 
@@ -444,8 +445,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
                                add_to_kill(tsk, page, vma, to_kill, tkc);
                }
        }
-       spin_unlock(&mapping->i_mmap_lock);
        read_unlock(&tasklist_lock);
+       mutex_unlock(&mapping->i_mmap_mutex);
 }
 
 /*
@@ -577,7 +578,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
                                pfn, err);
                } else if (page_has_private(p) &&
                                !try_to_release_page(p, GFP_NOIO)) {
-                       pr_debug("MCE %#lx: failed to release buffers\n", pfn);
+                       pr_info("MCE %#lx: failed to release buffers\n", pfn);
                } else {
                        ret = RECOVERED;
                }
@@ -629,7 +630,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
                 * when the page is reread or dropped. If an
                 * application assumes it will always get error on
                 * fsync, but does other operations on the fd before
-                * and the page is dropped inbetween then the error
+                * and the page is dropped between then the error
                 * will not be properly reported.
                 *
                 * This can already happen even without hwpoisoned
@@ -693,11 +694,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
  * Issues:
  * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
  *   To narrow down kill region to one page, we need to break up pmd.
- * - To support soft-offlining for hugepage, we need to support hugepage
- *   migration.
  */
 static int me_huge_page(struct page *p, unsigned long pfn)
 {
+       int res = 0;
        struct page *hpage = compound_head(p);
        /*
         * We can safely recover from error on free or reserved (i.e.
@@ -710,8 +710,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
         * so there is no race between isolation and mapping/unmapping.
         */
        if (!(page_mapping(hpage) || PageAnon(hpage))) {
-               __isolate_hwpoisoned_huge_page(hpage);
-               return RECOVERED;
+               res = dequeue_hwpoisoned_huge_page(hpage);
+               if (!res)
+                       return RECOVERED;
        }
        return DELAYED;
 }
@@ -723,7 +724,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
  * The table matches them in order and calls the right handler.
  *
  * This is quite tricky because we can access page at any time
- * in its live cycle, so all accesses have to be extremly careful.
+ * in its live cycle, so all accesses have to be extremely careful.
  *
  * This is not complete. More states could be added.
  * For any missing state don't attempt recovery.
@@ -836,8 +837,6 @@ static int page_action(struct page_state *ps, struct page *p,
        return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
 }
 
-#define N_UNMAP_TRIES 5
-
 /*
  * Do all that is necessary to remove user space mappings. Unmap
  * the pages and send SIGBUS to the processes if the data was dirty.
@@ -849,9 +848,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
        struct address_space *mapping;
        LIST_HEAD(tokill);
        int ret;
-       int i;
        int kill = 1;
        struct page *hpage = compound_head(p);
+       struct page *ppage;
 
        if (PageReserved(p) || PageSlab(p))
                return SWAP_SUCCESS;
@@ -893,6 +892,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
        }
 
        /*
+        * ppage: poisoned page
+        *   if p is regular page(4k page)
+        *        ppage == real poisoned page;
+        *   else p is hugetlb or THP, ppage == head page.
+        */
+       ppage = hpage;
+
+       if (PageTransHuge(hpage)) {
+               /*
+                * Verify that this isn't a hugetlbfs head page, the check for
+                * PageAnon is just for avoid tripping a split_huge_page
+                * internal debug check, as split_huge_page refuses to deal with
+                * anything that isn't an anon page. PageAnon can't go away fro
+                * under us because we hold a refcount on the hpage, without a
+                * refcount on the hpage. split_huge_page can't be safely called
+                * in the first place, having a refcount on the tail isn't
+                * enough * to be safe.
+                */
+               if (!PageHuge(hpage) && PageAnon(hpage)) {
+                       if (unlikely(split_huge_page(hpage))) {
+                               /*
+                                * FIXME: if splitting THP is failed, it is
+                                * better to stop the following operation rather
+                                * than causing panic by unmapping. System might
+                                * survive if the page is freed later.
+                                */
+                               printk(KERN_INFO
+                                       "MCE %#lx: failed to split THP\n", pfn);
+
+                               BUG_ON(!PageHWPoison(p));
+                               return SWAP_FAIL;
+                       }
+                       /* THP is split, so ppage should be the real poisoned page. */
+                       ppage = p;
+               }
+       }
+
+       /*
         * First collect all the processes that have the page
         * mapped in dirty form. This has to be done before try_to_unmap,
         * because ttu takes the rmap data structures down.
@@ -901,22 +938,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
         * there's nothing that can be done.
         */
        if (kill)
-               collect_procs(hpage, &tokill);
+               collect_procs(ppage, &tokill);
 
-       /*
-        * try_to_unmap can fail temporarily due to races.
-        * Try a few times (RED-PEN better strategy?)
-        */
-       for (i = 0; i < N_UNMAP_TRIES; i++) {
-               ret = try_to_unmap(hpage, ttu);
-               if (ret == SWAP_SUCCESS)
-                       break;
-               pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
-       }
+       if (hpage != ppage)
+               lock_page(ppage);
 
+       ret = try_to_unmap(ppage, ttu);
        if (ret != SWAP_SUCCESS)
                printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-                               pfn, page_mapcount(hpage));
+                               pfn, page_mapcount(ppage));
+
+       if (hpage != ppage)
+               unlock_page(ppage);
 
        /*
         * Now that the dirty bit has been propagated to the
@@ -927,7 +960,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
         * use a more force-full uncatchable kill to prevent
         * any accesses to the poisoned memory.
         */
-       kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
+       kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
                      ret != SWAP_SUCCESS, p, pfn);
 
        return ret;
@@ -936,7 +969,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 static void set_page_hwpoison_huge_page(struct page *hpage)
 {
        int i;
-       int nr_pages = 1 << compound_order(hpage);
+       int nr_pages = 1 << compound_trans_order(hpage);
        for (i = 0; i < nr_pages; i++)
                SetPageHWPoison(hpage + i);
 }
@@ -944,7 +977,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
 static void clear_page_hwpoison_huge_page(struct page *hpage)
 {
        int i;
-       int nr_pages = 1 << compound_order(hpage);
+       int nr_pages = 1 << compound_trans_order(hpage);
        for (i = 0; i < nr_pages; i++)
                ClearPageHWPoison(hpage + i);
 }
@@ -974,14 +1007,17 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
                return 0;
        }
 
-       nr_pages = 1 << compound_order(hpage);
+       nr_pages = 1 << compound_trans_order(hpage);
        atomic_long_add(nr_pages, &mce_bad_pages);
 
        /*
         * We need/can do nothing about count=0 pages.
         * 1) it's a free page, and therefore in safe hand:
         *    prep_new_page() will be the gate keeper.
-        * 2) it's part of a non-compound high order page.
+        * 2) it's a free hugepage, which is also safe:
+        *    an affected hugepage will be dequeued from hugepage freelist,
+        *    so there's no concern about reusing it ever after.
+        * 3) it's part of a non-compound high order page.
         *    Implies some kernel user: cannot stop them from
         *    R/W the page; let's pray that the page has been
         *    used and will be freed some time later.
@@ -993,6 +1029,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
                if (is_free_buddy_page(p)) {
                        action_result(pfn, "free buddy", DELAYED);
                        return 0;
+               } else if (PageHuge(hpage)) {
+                       /*
+                        * Check "just unpoisoned", "filter hit", and
+                        * "race with other subpage."
+                        */
+                       lock_page(hpage);
+                       if (!PageHWPoison(hpage)
+                           || (hwpoison_filter(p) && TestClearPageHWPoison(p))
+                           || (p != hpage && TestSetPageHWPoison(hpage))) {
+                               atomic_long_sub(nr_pages, &mce_bad_pages);
+                               return 0;
+                       }
+                       set_page_hwpoison_huge_page(hpage);
+                       res = dequeue_hwpoisoned_huge_page(hpage);
+                       action_result(pfn, "free huge",
+                                     res ? IGNORED : DELAYED);
+                       unlock_page(hpage);
+                       return res;
                } else {
                        action_result(pfn, "high order kernel", IGNORED);
                        return -EBUSY;
@@ -1007,19 +1061,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
         * The check (unnecessarily) ignores LRU pages being isolated and
         * walked by the page reclaim code, however that's not a big loss.
         */
-       if (!PageLRU(p) && !PageHuge(p))
-               shake_page(p, 0);
-       if (!PageLRU(p) && !PageHuge(p)) {
-               /*
-                * shake_page could have turned it free.
-                */
-               if (is_free_buddy_page(p)) {
-                       action_result(pfn, "free buddy, 2nd try", DELAYED);
-                       return 0;
+       if (!PageHuge(p) && !PageTransCompound(p)) {
+               if (!PageLRU(p))
+                       shake_page(p, 0);
+               if (!PageLRU(p)) {
+                       /*
+                        * shake_page could have turned it free.
+                        */
+                       if (is_free_buddy_page(p)) {
+                               action_result(pfn, "free buddy, 2nd try",
+                                               DELAYED);
+                               return 0;
+                       }
+                       action_result(pfn, "non LRU", IGNORED);
+                       put_page(p);
+                       return -EBUSY;
                }
-               action_result(pfn, "non LRU", IGNORED);
-               put_page(p);
-               return -EBUSY;
        }
 
        /*
@@ -1027,7 +1084,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
         * It's very difficult to mess with pages currently under IO
         * and in many cases impossible, so we just avoid it here.
         */
-       lock_page_nosync(hpage);
+       lock_page(hpage);
 
        /*
         * unpoison always clear PG_hwpoison inside page lock
@@ -1049,7 +1106,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
         * For error on the tail page, we should set PG_hwpoison
         * on the head page to show that the hugepage is hwpoisoned
         */
-       if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+       if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
                action_result(pfn, "hugepage already hardware poisoned",
                                IGNORED);
                unlock_page(hpage);
@@ -1069,7 +1126,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 
        /*
         * Now take care of user space mappings.
-        * Abort on fail: __remove_from_page_cache() assumes unmapped page.
+        * Abort on fail: __delete_from_page_cache() assumes unmapped page.
         */
        if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
                printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
@@ -1147,20 +1204,30 @@ int unpoison_memory(unsigned long pfn)
        page = compound_head(p);
 
        if (!PageHWPoison(p)) {
-               pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
+               pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
                return 0;
        }
 
-       nr_pages = 1 << compound_order(page);
+       nr_pages = 1 << compound_trans_order(page);
 
        if (!get_page_unless_zero(page)) {
+               /*
+                * Since HWPoisoned hugepage should have non-zero refcount,
+                * race between memory failure and unpoison seems to happen.
+                * In such case unpoison fails and memory failure runs
+                * to the end.
+                */
+               if (PageHuge(page)) {
+                       pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+                       return 0;
+               }
                if (TestClearPageHWPoison(p))
                        atomic_long_sub(nr_pages, &mce_bad_pages);
-               pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
+               pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
                return 0;
        }
 
-       lock_page_nosync(page);
+       lock_page(page);
        /*
         * This test is racy because PG_hwpoison is set outside of page lock.
         * That's acceptable because that won't trigger kernel panic. Instead,
@@ -1168,12 +1235,12 @@ int unpoison_memory(unsigned long pfn)
         * the free buddy page pool.
         */
        if (TestClearPageHWPoison(page)) {
-               pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
+               pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
                atomic_long_sub(nr_pages, &mce_bad_pages);
                freeit = 1;
+               if (PageHuge(page))
+                       clear_page_hwpoison_huge_page(page);
        }
-       if (PageHuge(p))
-               clear_page_hwpoison_huge_page(page);
        unlock_page(page);
 
        put_page(page);
@@ -1187,7 +1254,11 @@ EXPORT_SYMBOL(unpoison_memory);
 static struct page *new_page(struct page *p, unsigned long private, int **x)
 {
        int nid = page_to_nid(p);
-       return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+       if (PageHuge(p))
+               return alloc_huge_page_node(page_hstate(compound_head(p)),
+                                           nid);
+       else
+               return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
 }
 
 /*
@@ -1204,25 +1275,31 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
                return 1;
 
        /*
-        * The lock_system_sleep prevents a race with memory hotplug,
-        * because the isolation assumes there's only a single user.
+        * The lock_memory_hotplug prevents a race with memory hotplug.
         * This is a big hammer, a better would be nicer.
         */
-       lock_system_sleep();
+       lock_memory_hotplug();
 
        /*
         * Isolate the page, so that it doesn't get reallocated if it
         * was free.
         */
        set_migratetype_isolate(p);
+       /*
+        * When the target page is a free hugepage, just remove it
+        * from free hugepage list.
+        */
        if (!get_page_unless_zero(compound_head(p))) {
-               if (is_free_buddy_page(p)) {
-                       pr_debug("get_any_page: %#lx free buddy page\n", pfn);
+               if (PageHuge(p)) {
+                       pr_info("get_any_page: %#lx free huge page\n", pfn);
+                       ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+               } else if (is_free_buddy_page(p)) {
+                       pr_info("get_any_page: %#lx free buddy page\n", pfn);
                        /* Set hwpoison bit while page is still isolated */
                        SetPageHWPoison(p);
                        ret = 0;
                } else {
-                       pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
+                       pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
                                pfn, p->flags);
                        ret = -EIO;
                }
@@ -1231,7 +1308,51 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
                ret = 1;
        }
        unset_migratetype_isolate(p);
-       unlock_system_sleep();
+       unlock_memory_hotplug();
+       return ret;
+}
+
+static int soft_offline_huge_page(struct page *page, int flags)
+{
+       int ret;
+       unsigned long pfn = page_to_pfn(page);
+       struct page *hpage = compound_head(page);
+       LIST_HEAD(pagelist);
+
+       ret = get_any_page(page, pfn, flags);
+       if (ret < 0)
+               return ret;
+       if (ret == 0)
+               goto done;
+
+       if (PageHWPoison(hpage)) {
+               put_page(hpage);
+               pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
+               return -EBUSY;
+       }
+
+       /* Keep page count to indicate a given hugepage is isolated. */
+
+       list_add(&hpage->lru, &pagelist);
+       ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
+                               true);
+       if (ret) {
+               struct page *page1, *page2;
+               list_for_each_entry_safe(page1, page2, &pagelist, lru)
+                       put_page(page1);
+
+               pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+                        pfn, ret, page->flags);
+               if (ret > 0)
+                       ret = -EIO;
+               return ret;
+       }
+done:
+       if (!PageHWPoison(hpage))
+               atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
+       set_page_hwpoison_huge_page(hpage);
+       dequeue_hwpoisoned_huge_page(hpage);
+       /* keep elevated page count for bad page */
        return ret;
 }
 
@@ -1262,6 +1383,9 @@ int soft_offline_page(struct page *page, int flags)
        int ret;
        unsigned long pfn = page_to_pfn(page);
 
+       if (PageHuge(page))
+               return soft_offline_huge_page(page, flags);
+
        ret = get_any_page(page, pfn, flags);
        if (ret < 0)
                return ret;
@@ -1288,7 +1412,7 @@ int soft_offline_page(struct page *page, int flags)
                goto done;
        }
        if (!PageLRU(page)) {
-               pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
+               pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
                        pfn, page->flags);
                return -EIO;
        }
@@ -1302,7 +1426,7 @@ int soft_offline_page(struct page *page, int flags)
        if (PageHWPoison(page)) {
                unlock_page(page);
                put_page(page);
-               pr_debug("soft offline: %#lx page already poisoned\n", pfn);
+               pr_info("soft offline: %#lx page already poisoned\n", pfn);
                return -EBUSY;
        }
 
@@ -1312,18 +1436,14 @@ int soft_offline_page(struct page *page, int flags)
         */
        ret = invalidate_inode_page(page);
        unlock_page(page);
-
        /*
-        * Drop count because page migration doesn't like raised
-        * counts. The page could get re-allocated, but if it becomes
-        * LRU the isolation will just fail.
         * RED-PEN would be better to keep it isolated here, but we
         * would need to fix isolation locking first.
         */
-       put_page(page);
        if (ret == 1) {
+               put_page(page);
                ret = 0;
-               pr_debug("soft_offline: %#lx: invalidated\n", pfn);
+               pr_info("soft_offline: %#lx: invalidated\n", pfn);
                goto done;
        }
 
@@ -1333,19 +1453,27 @@ int soft_offline_page(struct page *page, int flags)
         * handles a large number of cases for us.
         */
        ret = isolate_lru_page(page);
+       /*
+        * Drop page reference which is came from get_any_page()
+        * successful isolate_lru_page() already took another one.
+        */
+       put_page(page);
        if (!ret) {
                LIST_HEAD(pagelist);
-
+               inc_zone_page_state(page, NR_ISOLATED_ANON +
+                                       page_is_file_cache(page));
                list_add(&page->lru, &pagelist);
-               ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+               ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+                                                               0, true);
                if (ret) {
-                       pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+                       putback_lru_pages(&pagelist);
+                       pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
                                pfn, ret, page->flags);
                        if (ret > 0)
                                ret = -EIO;
                }
        } else {
-               pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
+               pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
                        pfn, ret, page_count(page), page->flags);
        }
        if (ret)
@@ -1357,35 +1485,3 @@ done:
        /* keep elevated page count for bad page */
        return ret;
 }
-
-/*
- * The caller must hold current->mm->mmap_sem in read mode.
- */
-int is_hwpoison_address(unsigned long addr)
-{
-       pgd_t *pgdp;
-       pud_t pud, *pudp;
-       pmd_t pmd, *pmdp;
-       pte_t pte, *ptep;
-       swp_entry_t entry;
-
-       pgdp = pgd_offset(current->mm, addr);
-       if (!pgd_present(*pgdp))
-               return 0;
-       pudp = pud_offset(pgdp, addr);
-       pud = *pudp;
-       if (!pud_present(pud) || pud_large(pud))
-               return 0;
-       pmdp = pmd_offset(pudp, addr);
-       pmd = *pmdp;
-       if (!pmd_present(pmd) || pmd_large(pmd))
-               return 0;
-       ptep = pte_offset_map(pmdp, addr);
-       pte = *ptep;
-       pte_unmap(ptep);
-       if (!is_swap_pte(pte))
-               return 0;
-       entry = pte_to_swp_entry(pte);
-       return is_hwpoison_entry(entry);
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_address);