 Documentation/sysctl/vm.txt |  41
 fs/proc/meminfo.c           |   9
 include/linux/mm.h          |   7
 include/linux/rmap.h        |   1
 kernel/sysctl.c             |  25
 mm/Kconfig                  |  10
 mm/Makefile                 |   1
 mm/filemap.c                |   4
 mm/memory-failure.c         | 832
 mm/rmap.c                   |   7
 10 files changed, 934 insertions(+), 3 deletions(-)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index c4de6359d440..faf62740aa2c 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
| @@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/vm: | |||
| 32 | - legacy_va_layout | 32 | - legacy_va_layout |
| 33 | - lowmem_reserve_ratio | 33 | - lowmem_reserve_ratio |
| 34 | - max_map_count | 34 | - max_map_count |
| 35 | - memory_failure_early_kill | ||
| 36 | - memory_failure_recovery | ||
| 35 | - min_free_kbytes | 37 | - min_free_kbytes |
| 36 | - min_slab_ratio | 38 | - min_slab_ratio |
| 37 | - min_unmapped_ratio | 39 | - min_unmapped_ratio |
| @@ -53,7 +55,6 @@ Currently, these files are in /proc/sys/vm: | |||
| 53 | - vfs_cache_pressure | 55 | - vfs_cache_pressure |
| 54 | - zone_reclaim_mode | 56 | - zone_reclaim_mode |
| 55 | 57 | ||
| 56 | |||
| 57 | ============================================================== | 58 | ============================================================== |
| 58 | 59 | ||
| 59 | block_dump | 60 | block_dump |
| @@ -275,6 +276,44 @@ e.g., up to one or two maps per allocation. | |||
| 275 | 276 | ||
| 276 | The default value is 65536. | 277 | The default value is 65536. |
| 277 | 278 | ||
| 279 | ============================================================= | ||
| 280 | |||
| 281 | memory_failure_early_kill: | ||
| 282 | |||
| 283 | Control how to kill processes when an uncorrected memory error (typically | ||
| 284 | a 2-bit error in a memory module) that cannot be handled by the kernel is | ||
| 285 | detected in the background by hardware. In some cases (such as the page | ||
| 286 | still having a valid copy on disk) the kernel will handle the failure | ||
| 287 | transparently without affecting any applications. But if there is | ||
| 288 | no other up-to-date copy of the data, it will kill processes to prevent | ||
| 289 | any data corruption from propagating. | ||
| 290 | |||
| 291 | 1: Kill all processes that have the corrupted, non-reloadable page mapped | ||
| 292 | as soon as the corruption is detected. Note this is not supported | ||
| 293 | for a few types of pages, such as kernel-internally allocated data or | ||
| 294 | the swap cache, but it works for the majority of user pages. | ||
| 295 | |||
| 296 | 0: Only unmap the corrupted page from all processes and only kill a process | ||
| 297 | that tries to access it. | ||
| 298 | |||
| 299 | The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can | ||
| 300 | handle this if they want to. | ||
| 301 | |||
| 302 | This is only active on architectures/platforms with advanced machine | ||
| 303 | check handling and depends on the hardware capabilities. | ||
| 304 | |||
| 305 | Applications can override this setting individually with the PR_MCE_KILL prctl. | ||
| 306 | |||
| 307 | ============================================================== | ||
| 308 | |||
| 309 | memory_failure_recovery: | ||
| 310 | |||
| 311 | Enable memory failure recovery (when supported by the platform). | ||
| 312 | |||
| 313 | 1: Attempt recovery. | ||
| 314 | |||
| 315 | 0: Always panic on a memory failure. | ||
| 316 | |||
| 278 | ============================================================== | 317 | ============================================================== |
| 279 | 318 | ||
| 280 | min_free_kbytes: | 319 | min_free_kbytes: |
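
The two entries above define a userspace-visible contract: an advisory, catchable SIGBUS carrying si_code BUS_MCEERR_AO, plus a per-process override via the PR_MCE_KILL prctl. Below is a minimal userspace sketch (not part of this patch) that opts in to early kill and handles the advisory signal; the PR_MCE_KILL_* and BUS_MCEERR_AO fallback values are assumptions taken from the companion prctl/siginfo changes and may already be supplied by newer headers.

/*
 * Sketch: opt in to "early kill" and handle the advisory SIGBUS
 * (BUS_MCEERR_AO) described in memory_failure_early_kill above.
 */
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/prctl.h>
#include <unistd.h>

#ifndef PR_MCE_KILL                     /* assumed values from the prctl patch */
#define PR_MCE_KILL             33
#define PR_MCE_KILL_SET         1
#define PR_MCE_KILL_EARLY       1
#endif
#ifndef BUS_MCEERR_AO                   /* assumed si_code value */
#define BUS_MCEERR_AO           5
#endif

static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
    (void)sig;
    (void)ctx;
    if (si->si_code == BUS_MCEERR_AO) {
        /*
         * Advisory report: the page containing si->si_addr is poisoned
         * but has not been consumed yet.  A real application would
         * discard or rebuild the affected data here.
         */
        static const char msg[] = "advisory memory error reported\n";
        write(STDERR_FILENO, msg, sizeof(msg) - 1);
        return;
    }
    _exit(EXIT_FAILURE);        /* poisoned data was actually touched */
}

int main(void)
{
    struct sigaction sa;

    memset(&sa, 0, sizeof(sa));
    sa.sa_sigaction = sigbus_handler;
    sa.sa_flags = SA_SIGINFO;
    sigemptyset(&sa.sa_mask);
    if (sigaction(SIGBUS, &sa, NULL) < 0) {
        perror("sigaction");
        return 1;
    }

    /* Ask for early kill for this process, overriding the sysctl. */
    if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0) < 0)
        perror("prctl(PR_MCE_KILL)");

    pause();    /* wait so an advisory signal can be delivered */
    return 0;
}
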
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d5c410d47fae..78faedcb0a8d 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
| @@ -95,7 +95,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
| 95 | "Committed_AS: %8lu kB\n" | 95 | "Committed_AS: %8lu kB\n" |
| 96 | "VmallocTotal: %8lu kB\n" | 96 | "VmallocTotal: %8lu kB\n" |
| 97 | "VmallocUsed: %8lu kB\n" | 97 | "VmallocUsed: %8lu kB\n" |
| 98 | "VmallocChunk: %8lu kB\n", | 98 | "VmallocChunk: %8lu kB\n" |
| 99 | #ifdef CONFIG_MEMORY_FAILURE | ||
| 100 | "HardwareCorrupted: %8lu kB\n" | ||
| 101 | #endif | ||
| 102 | , | ||
| 99 | K(i.totalram), | 103 | K(i.totalram), |
| 100 | K(i.freeram), | 104 | K(i.freeram), |
| 101 | K(i.bufferram), | 105 | K(i.bufferram), |
| @@ -140,6 +144,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
| 140 | (unsigned long)VMALLOC_TOTAL >> 10, | 144 | (unsigned long)VMALLOC_TOTAL >> 10, |
| 141 | vmi.used >> 10, | 145 | vmi.used >> 10, |
| 142 | vmi.largest_chunk >> 10 | 146 | vmi.largest_chunk >> 10 |
| 147 | #ifdef CONFIG_MEMORY_FAILURE | ||
| 148 | ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) | ||
| 149 | #endif | ||
| 143 | ); | 150 | ); |
| 144 | 151 | ||
| 145 | hugetlb_report_meminfo(m); | 152 | hugetlb_report_meminfo(m); |
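
The hunk above adds a HardwareCorrupted line to /proc/meminfo, reporting mce_bad_pages scaled from pages to kB. A small sketch of how a monitoring tool might read the new field back (field name taken from the hunk; error handling kept minimal):

/* Sketch: read the HardwareCorrupted counter added to /proc/meminfo. */
#include <stdio.h>

int main(void)
{
    char line[256];
    unsigned long kb = 0;
    FILE *f = fopen("/proc/meminfo", "r");

    if (!f) {
        perror("/proc/meminfo");
        return 1;
    }
    while (fgets(line, sizeof(line), f))
        if (sscanf(line, "HardwareCorrupted: %lu kB", &kb) == 1)
            break;
    fclose(f);
    printf("HardwareCorrupted: %lu kB\n", kb);
    return 0;
}
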
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a16018f7d61c..1ffca03f34b7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
| @@ -1309,5 +1309,12 @@ void vmemmap_populate_print_last(void); | |||
| 1309 | extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, | 1309 | extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, |
| 1310 | size_t size); | 1310 | size_t size); |
| 1311 | extern void refund_locked_memory(struct mm_struct *mm, size_t size); | 1311 | extern void refund_locked_memory(struct mm_struct *mm, size_t size); |
| 1312 | |||
| 1313 | extern void memory_failure(unsigned long pfn, int trapno); | ||
| 1314 | extern int __memory_failure(unsigned long pfn, int trapno, int ref); | ||
| 1315 | extern int sysctl_memory_failure_early_kill; | ||
| 1316 | extern int sysctl_memory_failure_recovery; | ||
| 1317 | extern atomic_long_t mce_bad_pages; | ||
| 1318 | |||
| 1312 | #endif /* __KERNEL__ */ | 1319 | #endif /* __KERNEL__ */ |
| 1313 | #endif /* _LINUX_MM_H */ | 1320 | #endif /* _LINUX_MM_H */ |
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index ce989f1fc2ed..3c1004e50747 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
| @@ -129,6 +129,7 @@ int try_to_munlock(struct page *); | |||
| 129 | */ | 129 | */ |
| 130 | struct anon_vma *page_lock_anon_vma(struct page *page); | 130 | struct anon_vma *page_lock_anon_vma(struct page *page); |
| 131 | void page_unlock_anon_vma(struct anon_vma *anon_vma); | 131 | void page_unlock_anon_vma(struct anon_vma *anon_vma); |
| 132 | int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); | ||
| 132 | 133 | ||
| 133 | #else /* !CONFIG_MMU */ | 134 | #else /* !CONFIG_MMU */ |
| 134 | 135 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6bb59f707402..eacae77ac9fc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
| @@ -1372,6 +1372,31 @@ static struct ctl_table vm_table[] = { | |||
| 1372 | .mode = 0644, | 1372 | .mode = 0644, |
| 1373 | .proc_handler = &scan_unevictable_handler, | 1373 | .proc_handler = &scan_unevictable_handler, |
| 1374 | }, | 1374 | }, |
| 1375 | #ifdef CONFIG_MEMORY_FAILURE | ||
| 1376 | { | ||
| 1377 | .ctl_name = CTL_UNNUMBERED, | ||
| 1378 | .procname = "memory_failure_early_kill", | ||
| 1379 | .data = &sysctl_memory_failure_early_kill, | ||
| 1380 | .maxlen = sizeof(sysctl_memory_failure_early_kill), | ||
| 1381 | .mode = 0644, | ||
| 1382 | .proc_handler = &proc_dointvec_minmax, | ||
| 1383 | .strategy = &sysctl_intvec, | ||
| 1384 | .extra1 = &zero, | ||
| 1385 | .extra2 = &one, | ||
| 1386 | }, | ||
| 1387 | { | ||
| 1388 | .ctl_name = CTL_UNNUMBERED, | ||
| 1389 | .procname = "memory_failure_recovery", | ||
| 1390 | .data = &sysctl_memory_failure_recovery, | ||
| 1391 | .maxlen = sizeof(sysctl_memory_failure_recovery), | ||
| 1392 | .mode = 0644, | ||
| 1393 | .proc_handler = &proc_dointvec_minmax, | ||
| 1394 | .strategy = &sysctl_intvec, | ||
| 1395 | .extra1 = &zero, | ||
| 1396 | .extra2 = &one, | ||
| 1397 | }, | ||
| 1398 | #endif | ||
| 1399 | |||
| 1375 | /* | 1400 | /* |
| 1376 | * NOTE: do not add new entries to this table unless you have read | 1401 | * NOTE: do not add new entries to this table unless you have read |
| 1377 | * Documentation/sysctl/ctl_unnumbered.txt | 1402 | * Documentation/sysctl/ctl_unnumbered.txt |
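
Both entries registered above use proc_dointvec_minmax with the zero/one bounds, so they accept only 0 or 1 and surface as /proc/sys/vm/memory_failure_early_kill and /proc/sys/vm/memory_failure_recovery. A sketch of flipping the early-kill knob from C, equivalent to writing "1" to the file as root:

/* Sketch: enable system-wide early kill (needs root / CAP_SYS_ADMIN). */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/proc/sys/vm/memory_failure_early_kill", O_WRONLY);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (write(fd, "1\n", 2) != 2) {
        perror("write");
        close(fd);
        return 1;
    }
    close(fd);
    return 0;
}
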
diff --git a/mm/Kconfig b/mm/Kconfig
index 3aa519f52e18..ea2d8b61c631 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -233,6 +233,16 @@ config DEFAULT_MMAP_MIN_ADDR | |||
| 233 | /proc/sys/vm/mmap_min_addr tunable. | 233 | /proc/sys/vm/mmap_min_addr tunable. |
| 234 | 234 | ||
| 235 | 235 | ||
| 236 | config MEMORY_FAILURE | ||
| 237 | depends on MMU | ||
| 238 | depends on X86_MCE | ||
| 239 | bool "Enable recovery from hardware memory errors" | ||
| 240 | help | ||
| 241 | Enables code to recover from some memory failures on systems | ||
| 242 | with MCA recovery. This allows a system to continue running | ||
| 243 | even when some of its memory has uncorrected errors. This requires | ||
| 244 | special hardware support and typically ECC memory. | ||
| 245 | |||
| 236 | config NOMMU_INITIAL_TRIM_EXCESS | 246 | config NOMMU_INITIAL_TRIM_EXCESS |
| 237 | int "Turn on mmap() excess space trimming before booting" | 247 | int "Turn on mmap() excess space trimming before booting" |
| 238 | depends on !MMU | 248 | depends on !MMU |
diff --git a/mm/Makefile b/mm/Makefile
index ea4b18bd3960..dc2551e7d006 100644
--- a/mm/Makefile
+++ b/mm/Makefile
| @@ -40,5 +40,6 @@ obj-$(CONFIG_SMP) += allocpercpu.o | |||
| 40 | endif | 40 | endif |
| 41 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 41 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
| 42 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 42 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o |
| 43 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | ||
| 43 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o | 44 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o |
| 44 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | 45 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o |
diff --git a/mm/filemap.c b/mm/filemap.c
index dd51c68e2b86..75575c392167 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -104,6 +104,10 @@ | |||
| 104 | * | 104 | * |
| 105 | * ->task->proc_lock | 105 | * ->task->proc_lock |
| 106 | * ->dcache_lock (proc_pid_lookup) | 106 | * ->dcache_lock (proc_pid_lookup) |
| 107 | * | ||
| 108 | * (code doesn't rely on that order, so you could switch it around) | ||
| 109 | * ->tasklist_lock (memory_failure, collect_procs_ao) | ||
| 110 | * ->i_mmap_lock | ||
| 107 | */ | 111 | */ |
| 108 | 112 | ||
| 109 | /* | 113 | /* |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
new file mode 100644
index 000000000000..729d4b15b645
--- /dev/null
+++ b/mm/memory-failure.c
| @@ -0,0 +1,832 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2008, 2009 Intel Corporation | ||
| 3 | * Authors: Andi Kleen, Fengguang Wu | ||
| 4 | * | ||
| 5 | * This software may be redistributed and/or modified under the terms of | ||
| 6 | * the GNU General Public License ("GPL") version 2 only as published by the | ||
| 7 | * Free Software Foundation. | ||
| 8 | * | ||
| 9 | * High level machine check handler. Handles pages reported by the | ||
| 10 | * hardware as being corrupted, usually due to a 2-bit ECC memory or cache | ||
| 11 | * failure. | ||
| 12 | * | ||
| 13 | * Handles page cache pages in various states. The tricky part | ||
| 14 | * here is that we can access any page asynchronously to other VM | ||
| 15 | * users, because memory failures could happen anytime and anywhere, | ||
| 16 | * possibly violating some of their assumptions. This is why this code | ||
| 17 | * has to be extremely careful. Generally it tries to use normal locking | ||
| 18 | * rules, as in get the standard locks, even if that means the | ||
| 19 | * error handling takes potentially a long time. | ||
| 20 | * | ||
| 21 | * The operation to map back from RMAP chains to processes has to walk | ||
| 22 | * the complete process list and has non-linear complexity in the number of | ||
| 23 | * mappings. In short, it can be quite slow. But since memory corruptions | ||
| 24 | * are rare, we hope to get away with this. | ||
| 25 | */ | ||
| 26 | |||
| 27 | /* | ||
| 28 | * Notebook: | ||
| 29 | * - hugetlb needs more code | ||
| 30 | * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages | ||
| 31 | * - pass bad pages to kdump next kernel | ||
| 32 | */ | ||
| 33 | #define DEBUG 1 /* remove me in 2.6.34 */ | ||
| 34 | #include <linux/kernel.h> | ||
| 35 | #include <linux/mm.h> | ||
| 36 | #include <linux/page-flags.h> | ||
| 37 | #include <linux/sched.h> | ||
| 38 | #include <linux/rmap.h> | ||
| 39 | #include <linux/pagemap.h> | ||
| 40 | #include <linux/swap.h> | ||
| 41 | #include <linux/backing-dev.h> | ||
| 42 | #include "internal.h" | ||
| 43 | |||
| 44 | int sysctl_memory_failure_early_kill __read_mostly = 0; | ||
| 45 | |||
| 46 | int sysctl_memory_failure_recovery __read_mostly = 1; | ||
| 47 | |||
| 48 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); | ||
| 49 | |||
| 50 | /* | ||
| 51 | * Send all the processes that have the page mapped an ``action optional'' | ||
| 52 | * signal. | ||
| 53 | */ | ||
| 54 | static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, | ||
| 55 | unsigned long pfn) | ||
| 56 | { | ||
| 57 | struct siginfo si; | ||
| 58 | int ret; | ||
| 59 | |||
| 60 | printk(KERN_ERR | ||
| 61 | "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", | ||
| 62 | pfn, t->comm, t->pid); | ||
| 63 | si.si_signo = SIGBUS; | ||
| 64 | si.si_errno = 0; | ||
| 65 | si.si_code = BUS_MCEERR_AO; | ||
| 66 | si.si_addr = (void *)addr; | ||
| 67 | #ifdef __ARCH_SI_TRAPNO | ||
| 68 | si.si_trapno = trapno; | ||
| 69 | #endif | ||
| 70 | si.si_addr_lsb = PAGE_SHIFT; | ||
| 71 | /* | ||
| 72 | * Don't use force here, it's convenient if the signal | ||
| 73 | * can be temporarily blocked. | ||
| 74 | * This could cause a loop when the user sets SIGBUS | ||
| 75 | * to SIG_IGN, but hopefully no one will do that? | ||
| 76 | */ | ||
| 77 | ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ | ||
| 78 | if (ret < 0) | ||
| 79 | printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", | ||
| 80 | t->comm, t->pid, ret); | ||
| 81 | return ret; | ||
| 82 | } | ||
| 83 | |||
| 84 | /* | ||
| 85 | * Kill all processes that have a poisoned page mapped and then isolate | ||
| 86 | * the page. | ||
| 87 | * | ||
| 88 | * General strategy: | ||
| 89 | * Find all processes having the page mapped and kill them. | ||
| 90 | * But we keep a page reference around so that the page is not | ||
| 91 | * actually freed yet. | ||
| 92 | * Then stash the page away | ||
| 93 | * | ||
| 94 | * There's no convenient way to get back to mapped processes | ||
| 95 | * from the VMAs. So do a brute-force search over all | ||
| 96 | * running processes. | ||
| 97 | * | ||
| 98 | * Remember that machine checks are not common (or rather | ||
| 99 | * if they are common you have other problems), so this shouldn't | ||
| 100 | * be a performance issue. | ||
| 101 | * | ||
| 102 | * Also there are some races possible while we get from the | ||
| 103 | * error detection to actually handle it. | ||
| 104 | */ | ||
| 105 | |||
| 106 | struct to_kill { | ||
| 107 | struct list_head nd; | ||
| 108 | struct task_struct *tsk; | ||
| 109 | unsigned long addr; | ||
| 110 | unsigned addr_valid:1; | ||
| 111 | }; | ||
| 112 | |||
| 113 | /* | ||
| 114 | * Failure handling: if we can't find or can't kill a process there's | ||
| 115 | * not much we can do. We just print a message and ignore otherwise. | ||
| 116 | */ | ||
| 117 | |||
| 118 | /* | ||
| 119 | * Schedule a process for later kill. | ||
| 120 | * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. | ||
| 121 | * TBD would GFP_NOIO be enough? | ||
| 122 | */ | ||
| 123 | static void add_to_kill(struct task_struct *tsk, struct page *p, | ||
| 124 | struct vm_area_struct *vma, | ||
| 125 | struct list_head *to_kill, | ||
| 126 | struct to_kill **tkc) | ||
| 127 | { | ||
| 128 | struct to_kill *tk; | ||
| 129 | |||
| 130 | if (*tkc) { | ||
| 131 | tk = *tkc; | ||
| 132 | *tkc = NULL; | ||
| 133 | } else { | ||
| 134 | tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); | ||
| 135 | if (!tk) { | ||
| 136 | printk(KERN_ERR | ||
| 137 | "MCE: Out of memory while machine check handling\n"); | ||
| 138 | return; | ||
| 139 | } | ||
| 140 | } | ||
| 141 | tk->addr = page_address_in_vma(p, vma); | ||
| 142 | tk->addr_valid = 1; | ||
| 143 | |||
| 144 | /* | ||
| 145 | * In theory we don't have to kill when the page was | ||
| 146 | * munmapped. But it could also be an mremap. Since that's | ||
| 147 | * likely very rare, kill anyway just out of paranoia, but use | ||
| 148 | * a SIGKILL because the error is not contained anymore. | ||
| 149 | */ | ||
| 150 | if (tk->addr == -EFAULT) { | ||
| 151 | pr_debug("MCE: Unable to find user space address %lx in %s\n", | ||
| 152 | page_to_pfn(p), tsk->comm); | ||
| 153 | tk->addr_valid = 0; | ||
| 154 | } | ||
| 155 | get_task_struct(tsk); | ||
| 156 | tk->tsk = tsk; | ||
| 157 | list_add_tail(&tk->nd, to_kill); | ||
| 158 | } | ||
| 159 | |||
| 160 | /* | ||
| 161 | * Kill the processes that have been collected earlier. | ||
| 162 | * | ||
| 163 | * Only do anything when DOIT is set, otherwise just free the list | ||
| 164 | * (this is used for clean pages which do not need killing) | ||
| 165 | * Also when FAIL is set do a force kill because something went | ||
| 166 | * wrong earlier. | ||
| 167 | */ | ||
| 168 | static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, | ||
| 169 | int fail, unsigned long pfn) | ||
| 170 | { | ||
| 171 | struct to_kill *tk, *next; | ||
| 172 | |||
| 173 | list_for_each_entry_safe (tk, next, to_kill, nd) { | ||
| 174 | if (doit) { | ||
| 175 | /* | ||
| 176 | * In case something went wrong with munmapping, | ||
| 177 | * make sure the process doesn't catch the | ||
| 178 | * signal in a handler and then go on accessing | ||
| 179 | * the poisoned memory. Just kill it. | ||
| 180 | */ | ||
| 181 | if (fail || tk->addr_valid == 0) { | ||
| 182 | printk(KERN_ERR | ||
| 183 | "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", | ||
| 184 | pfn, tk->tsk->comm, tk->tsk->pid); | ||
| 185 | force_sig(SIGKILL, tk->tsk); | ||
| 186 | } | ||
| 187 | |||
| 188 | /* | ||
| 189 | * In theory the process could have mapped | ||
| 190 | * something else on the address in-between. We could | ||
| 191 | * check for that, but we need to tell the | ||
| 192 | * process anyway. | ||
| 193 | */ | ||
| 194 | else if (kill_proc_ao(tk->tsk, tk->addr, trapno, | ||
| 195 | pfn) < 0) | ||
| 196 | printk(KERN_ERR | ||
| 197 | "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", | ||
| 198 | pfn, tk->tsk->comm, tk->tsk->pid); | ||
| 199 | } | ||
| 200 | put_task_struct(tk->tsk); | ||
| 201 | kfree(tk); | ||
| 202 | } | ||
| 203 | } | ||
| 204 | |||
| 205 | static int task_early_kill(struct task_struct *tsk) | ||
| 206 | { | ||
| 207 | if (!tsk->mm) | ||
| 208 | return 0; | ||
| 209 | if (tsk->flags & PF_MCE_PROCESS) | ||
| 210 | return !!(tsk->flags & PF_MCE_EARLY); | ||
| 211 | return sysctl_memory_failure_early_kill; | ||
| 212 | } | ||
| 213 | |||
| 214 | /* | ||
| 215 | * Collect processes when the error hit an anonymous page. | ||
| 216 | */ | ||
| 217 | static void collect_procs_anon(struct page *page, struct list_head *to_kill, | ||
| 218 | struct to_kill **tkc) | ||
| 219 | { | ||
| 220 | struct vm_area_struct *vma; | ||
| 221 | struct task_struct *tsk; | ||
| 222 | struct anon_vma *av; | ||
| 223 | |||
| 224 | read_lock(&tasklist_lock); | ||
| 225 | av = page_lock_anon_vma(page); | ||
| 226 | if (av == NULL) /* Not actually mapped anymore */ | ||
| 227 | goto out; | ||
| 228 | for_each_process (tsk) { | ||
| 229 | if (!task_early_kill(tsk)) | ||
| 230 | continue; | ||
| 231 | list_for_each_entry (vma, &av->head, anon_vma_node) { | ||
| 232 | if (!page_mapped_in_vma(page, vma)) | ||
| 233 | continue; | ||
| 234 | if (vma->vm_mm == tsk->mm) | ||
| 235 | add_to_kill(tsk, page, vma, to_kill, tkc); | ||
| 236 | } | ||
| 237 | } | ||
| 238 | page_unlock_anon_vma(av); | ||
| 239 | out: | ||
| 240 | read_unlock(&tasklist_lock); | ||
| 241 | } | ||
| 242 | |||
| 243 | /* | ||
| 244 | * Collect processes when the error hit a file mapped page. | ||
| 245 | */ | ||
| 246 | static void collect_procs_file(struct page *page, struct list_head *to_kill, | ||
| 247 | struct to_kill **tkc) | ||
| 248 | { | ||
| 249 | struct vm_area_struct *vma; | ||
| 250 | struct task_struct *tsk; | ||
| 251 | struct prio_tree_iter iter; | ||
| 252 | struct address_space *mapping = page->mapping; | ||
| 253 | |||
| 254 | /* | ||
| 255 | * A note on the locking order between the two locks. | ||
| 256 | * We don't rely on this particular order. | ||
| 257 | * If you have some other code that needs a different order | ||
| 258 | * feel free to switch them around. Or add a reverse link | ||
| 259 | * from mm_struct to task_struct, then this could be all | ||
| 260 | * done without taking tasklist_lock and looping over all tasks. | ||
| 261 | */ | ||
| 262 | |||
| 263 | read_lock(&tasklist_lock); | ||
| 264 | spin_lock(&mapping->i_mmap_lock); | ||
| 265 | for_each_process(tsk) { | ||
| 266 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
| 267 | |||
| 268 | if (!task_early_kill(tsk)) | ||
| 269 | continue; | ||
| 270 | |||
| 271 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, | ||
| 272 | pgoff) { | ||
| 273 | /* | ||
| 274 | * Send early kill signal to tasks where a vma covers | ||
| 275 | * the page but the corrupted page is not necessarily | ||
| 276 | * mapped in its pte. | ||
| 277 | * Assume applications that requested early kill want | ||
| 278 | * to be informed of all such data corruptions. | ||
| 279 | */ | ||
| 280 | if (vma->vm_mm == tsk->mm) | ||
| 281 | add_to_kill(tsk, page, vma, to_kill, tkc); | ||
| 282 | } | ||
| 283 | } | ||
| 284 | spin_unlock(&mapping->i_mmap_lock); | ||
| 285 | read_unlock(&tasklist_lock); | ||
| 286 | } | ||
| 287 | |||
| 288 | /* | ||
| 289 | * Collect the processes that have the corrupted page mapped, to kill them. | ||
| 290 | * This is done in two steps for locking reasons. | ||
| 291 | * First preallocate one tokill structure outside the spin locks, | ||
| 292 | * so that we can kill at least one process reasonably reliably. | ||
| 293 | */ | ||
| 294 | static void collect_procs(struct page *page, struct list_head *tokill) | ||
| 295 | { | ||
| 296 | struct to_kill *tk; | ||
| 297 | |||
| 298 | if (!page->mapping) | ||
| 299 | return; | ||
| 300 | |||
| 301 | tk = kmalloc(sizeof(struct to_kill), GFP_NOIO); | ||
| 302 | if (!tk) | ||
| 303 | return; | ||
| 304 | if (PageAnon(page)) | ||
| 305 | collect_procs_anon(page, tokill, &tk); | ||
| 306 | else | ||
| 307 | collect_procs_file(page, tokill, &tk); | ||
| 308 | kfree(tk); | ||
| 309 | } | ||
| 310 | |||
| 311 | /* | ||
| 312 | * Error handlers for various types of pages. | ||
| 313 | */ | ||
| 314 | |||
| 315 | enum outcome { | ||
| 316 | FAILED, /* Error handling failed */ | ||
| 317 | DELAYED, /* Will be handled later */ | ||
| 318 | IGNORED, /* Error safely ignored */ | ||
| 319 | RECOVERED, /* Successfully recovered */ | ||
| 320 | }; | ||
| 321 | |||
| 322 | static const char *action_name[] = { | ||
| 323 | [FAILED] = "Failed", | ||
| 324 | [DELAYED] = "Delayed", | ||
| 325 | [IGNORED] = "Ignored", | ||
| 326 | [RECOVERED] = "Recovered", | ||
| 327 | }; | ||
| 328 | |||
| 329 | /* | ||
| 330 | * Error hit kernel page. | ||
| 331 | * Do nothing, try to be lucky and not touch this instead. For a few cases we | ||
| 332 | * could be more sophisticated. | ||
| 333 | */ | ||
| 334 | static int me_kernel(struct page *p, unsigned long pfn) | ||
| 335 | { | ||
| 336 | return DELAYED; | ||
| 337 | } | ||
| 338 | |||
| 339 | /* | ||
| 340 | * Already poisoned page. | ||
| 341 | */ | ||
| 342 | static int me_ignore(struct page *p, unsigned long pfn) | ||
| 343 | { | ||
| 344 | return IGNORED; | ||
| 345 | } | ||
| 346 | |||
| 347 | /* | ||
| 348 | * Page in unknown state. Do nothing. | ||
| 349 | */ | ||
| 350 | static int me_unknown(struct page *p, unsigned long pfn) | ||
| 351 | { | ||
| 352 | printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); | ||
| 353 | return FAILED; | ||
| 354 | } | ||
| 355 | |||
| 356 | /* | ||
| 357 | * Free memory | ||
| 358 | */ | ||
| 359 | static int me_free(struct page *p, unsigned long pfn) | ||
| 360 | { | ||
| 361 | return DELAYED; | ||
| 362 | } | ||
| 363 | |||
| 364 | /* | ||
| 365 | * Clean (or cleaned) page cache page. | ||
| 366 | */ | ||
| 367 | static int me_pagecache_clean(struct page *p, unsigned long pfn) | ||
| 368 | { | ||
| 369 | int err; | ||
| 370 | int ret = FAILED; | ||
| 371 | struct address_space *mapping; | ||
| 372 | |||
| 373 | if (!isolate_lru_page(p)) | ||
| 374 | page_cache_release(p); | ||
| 375 | |||
| 376 | /* | ||
| 377 | * For anonymous pages we're done; the only reference left | ||
| 378 | * should be the one memory_failure() holds. | ||
| 379 | */ | ||
| 380 | if (PageAnon(p)) | ||
| 381 | return RECOVERED; | ||
| 382 | |||
| 383 | /* | ||
| 384 | * Now truncate the page in the page cache. This is really | ||
| 385 | * more like a "temporary hole punch". | ||
| 386 | * Don't do this for block devices when someone else | ||
| 387 | * has a reference, because it could be file system metadata | ||
| 388 | * and that's not safe to truncate. | ||
| 389 | */ | ||
| 390 | mapping = page_mapping(p); | ||
| 391 | if (!mapping) { | ||
| 392 | /* | ||
| 393 | * Page has been torn down in the meantime. | ||
| 394 | */ | ||
| 395 | return FAILED; | ||
| 396 | } | ||
| 397 | |||
| 398 | /* | ||
| 399 | * Truncation is a bit tricky. Enable it per file system for now. | ||
| 400 | * | ||
| 401 | * Open: to take i_mutex or not for this? Right now we don't. | ||
| 402 | */ | ||
| 403 | if (mapping->a_ops->error_remove_page) { | ||
| 404 | err = mapping->a_ops->error_remove_page(mapping, p); | ||
| 405 | if (err != 0) { | ||
| 406 | printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", | ||
| 407 | pfn, err); | ||
| 408 | } else if (page_has_private(p) && | ||
| 409 | !try_to_release_page(p, GFP_NOIO)) { | ||
| 410 | pr_debug("MCE %#lx: failed to release buffers\n", pfn); | ||
| 411 | } else { | ||
| 412 | ret = RECOVERED; | ||
| 413 | } | ||
| 414 | } else { | ||
| 415 | /* | ||
| 416 | * If the file system doesn't support it, just invalidate. | ||
| 417 | * This fails on dirty pages or anything with private pages. | ||
| 418 | */ | ||
| 419 | if (invalidate_inode_page(p)) | ||
| 420 | ret = RECOVERED; | ||
| 421 | else | ||
| 422 | printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", | ||
| 423 | pfn); | ||
| 424 | } | ||
| 425 | return ret; | ||
| 426 | } | ||
| 427 | |||
| 428 | /* | ||
| 429 | * Dirty pagecache page. | ||
| 430 | * Issues: when the error hits a hole page the error is not properly | ||
| 431 | * propagated. | ||
| 432 | */ | ||
| 433 | static int me_pagecache_dirty(struct page *p, unsigned long pfn) | ||
| 434 | { | ||
| 435 | struct address_space *mapping = page_mapping(p); | ||
| 436 | |||
| 437 | SetPageError(p); | ||
| 438 | /* TBD: print more information about the file. */ | ||
| 439 | if (mapping) { | ||
| 440 | /* | ||
| 441 | * IO error will be reported by write(), fsync(), etc. | ||
| 442 | * who check the mapping. | ||
| 443 | * This way the application knows that something went | ||
| 444 | * wrong with its dirty file data. | ||
| 445 | * | ||
| 446 | * There's one open issue: | ||
| 447 | * | ||
| 448 | * The EIO will be only reported on the next IO | ||
| 449 | * operation and then cleared through the IO map. | ||
| 450 | * Normally Linux has two mechanisms to pass IO error | ||
| 451 | * first through the AS_EIO flag in the address space | ||
| 452 | * and then through the PageError flag in the page. | ||
| 453 | * Since we drop pages on memory failure handling the | ||
| 454 | * only mechanism open to use is through AS_EIO. | ||
| 455 | * | ||
| 456 | * This has the disadvantage that it gets cleared on | ||
| 457 | * the first operation that returns an error, while | ||
| 458 | * the PageError bit is more sticky and only cleared | ||
| 459 | * when the page is reread or dropped. If an | ||
| 460 | * application assumes it will always get error on | ||
| 461 | * fsync, but does other operations on the fd before | ||
| 462 | * and the page is dropped in between, then the error | ||
| 463 | * will not be properly reported. | ||
| 464 | * | ||
| 465 | * This can already happen even without hwpoisoned | ||
| 466 | * pages: first on metadata IO errors (which only | ||
| 467 | * report through AS_EIO) or when the page is dropped | ||
| 468 | * at the wrong time. | ||
| 469 | * | ||
| 470 | * So right now we assume that the application DTRT on | ||
| 471 | * the first EIO, but we're not worse than other parts | ||
| 472 | * of the kernel. | ||
| 473 | */ | ||
| 474 | mapping_set_error(mapping, EIO); | ||
| 475 | } | ||
| 476 | |||
| 477 | return me_pagecache_clean(p, pfn); | ||
| 478 | } | ||
| 479 | |||
| 480 | /* | ||
| 481 | * Clean and dirty swap cache. | ||
| 482 | * | ||
| 483 | * Dirty swap cache page is tricky to handle. The page could live both in page | ||
| 484 | * cache and swap cache (i.e. the page is freshly swapped in). So it could be | ||
| 485 | * referenced concurrently by 2 types of PTEs: | ||
| 486 | * normal PTEs and swap PTEs. We try to handle them consistently by calling | ||
| 487 | * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs, | ||
| 488 | * and then | ||
| 489 | * - clear dirty bit to prevent IO | ||
| 490 | * - remove from LRU | ||
| 491 | * - but keep in the swap cache, so that when we return to it on | ||
| 492 | * a later page fault, we know the application is accessing | ||
| 493 | * corrupted data and shall be killed (we installed simple | ||
| 494 | * interception code in do_swap_page to catch it). | ||
| 495 | * | ||
| 496 | * Clean swap cache pages can be directly isolated. A later page fault will | ||
| 497 | * bring in the known good data from disk. | ||
| 498 | */ | ||
| 499 | static int me_swapcache_dirty(struct page *p, unsigned long pfn) | ||
| 500 | { | ||
| 501 | int ret = FAILED; | ||
| 502 | |||
| 503 | ClearPageDirty(p); | ||
| 504 | /* Trigger EIO in shmem: */ | ||
| 505 | ClearPageUptodate(p); | ||
| 506 | |||
| 507 | if (!isolate_lru_page(p)) { | ||
| 508 | page_cache_release(p); | ||
| 509 | ret = DELAYED; | ||
| 510 | } | ||
| 511 | |||
| 512 | return ret; | ||
| 513 | } | ||
| 514 | |||
| 515 | static int me_swapcache_clean(struct page *p, unsigned long pfn) | ||
| 516 | { | ||
| 517 | int ret = FAILED; | ||
| 518 | |||
| 519 | if (!isolate_lru_page(p)) { | ||
| 520 | page_cache_release(p); | ||
| 521 | ret = RECOVERED; | ||
| 522 | } | ||
| 523 | delete_from_swap_cache(p); | ||
| 524 | return ret; | ||
| 525 | } | ||
| 526 | |||
| 527 | /* | ||
| 528 | * Huge pages. Needs work. | ||
| 529 | * Issues: | ||
| 530 | * No rmap support so we cannot find the original mapper. In theory could walk | ||
| 531 | * all MMs and look for the mappings, but that would be non atomic and racy. | ||
| 532 | * Need rmap for hugepages for this. Alternatively we could employ a heuristic, | ||
| 533 | * like just walking the current process and hoping it has it mapped (that | ||
| 534 | * should usually be true for the common "shared database cache" case). | ||
| 535 | * Should handle free huge pages and dequeue them too, but this needs to | ||
| 536 | * handle huge page accounting correctly. | ||
| 537 | */ | ||
| 538 | static int me_huge_page(struct page *p, unsigned long pfn) | ||
| 539 | { | ||
| 540 | return FAILED; | ||
| 541 | } | ||
| 542 | |||
| 543 | /* | ||
| 544 | * Various page states we can handle. | ||
| 545 | * | ||
| 546 | * A page state is defined by its current page->flags bits. | ||
| 547 | * The table matches them in order and calls the right handler. | ||
| 548 | * | ||
| 549 | * This is quite tricky because we can access page at any time | ||
| 550 | * in its life cycle, so all accesses have to be extremely careful. | ||
| 551 | * | ||
| 552 | * This is not complete. More states could be added. | ||
| 553 | * For any missing state don't attempt recovery. | ||
| 554 | */ | ||
| 555 | |||
| 556 | #define dirty (1UL << PG_dirty) | ||
| 557 | #define sc (1UL << PG_swapcache) | ||
| 558 | #define unevict (1UL << PG_unevictable) | ||
| 559 | #define mlock (1UL << PG_mlocked) | ||
| 560 | #define writeback (1UL << PG_writeback) | ||
| 561 | #define lru (1UL << PG_lru) | ||
| 562 | #define swapbacked (1UL << PG_swapbacked) | ||
| 563 | #define head (1UL << PG_head) | ||
| 564 | #define tail (1UL << PG_tail) | ||
| 565 | #define compound (1UL << PG_compound) | ||
| 566 | #define slab (1UL << PG_slab) | ||
| 567 | #define buddy (1UL << PG_buddy) | ||
| 568 | #define reserved (1UL << PG_reserved) | ||
| 569 | |||
| 570 | static struct page_state { | ||
| 571 | unsigned long mask; | ||
| 572 | unsigned long res; | ||
| 573 | char *msg; | ||
| 574 | int (*action)(struct page *p, unsigned long pfn); | ||
| 575 | } error_states[] = { | ||
| 576 | { reserved, reserved, "reserved kernel", me_ignore }, | ||
| 577 | { buddy, buddy, "free kernel", me_free }, | ||
| 578 | |||
| 579 | /* | ||
| 580 | * Could in theory check if slab page is free or if we can drop | ||
| 581 | * currently unused objects without touching them. But just | ||
| 582 | * treat it as standard kernel for now. | ||
| 583 | */ | ||
| 584 | { slab, slab, "kernel slab", me_kernel }, | ||
| 585 | |||
| 586 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | ||
| 587 | { head, head, "huge", me_huge_page }, | ||
| 588 | { tail, tail, "huge", me_huge_page }, | ||
| 589 | #else | ||
| 590 | { compound, compound, "huge", me_huge_page }, | ||
| 591 | #endif | ||
| 592 | |||
| 593 | { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, | ||
| 594 | { sc|dirty, sc, "swapcache", me_swapcache_clean }, | ||
| 595 | |||
| 596 | { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, | ||
| 597 | { unevict, unevict, "unevictable LRU", me_pagecache_clean}, | ||
| 598 | |||
| 599 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
| 600 | { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, | ||
| 601 | { mlock, mlock, "mlocked LRU", me_pagecache_clean }, | ||
| 602 | #endif | ||
| 603 | |||
| 604 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, | ||
| 605 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | ||
| 606 | { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, | ||
| 607 | |||
| 608 | /* | ||
| 609 | * Catchall entry: must be at end. | ||
| 610 | */ | ||
| 611 | { 0, 0, "unknown page state", me_unknown }, | ||
| 612 | }; | ||
| 613 | |||
| 614 | #undef lru | ||
| 615 | |||
| 616 | static void action_result(unsigned long pfn, char *msg, int result) | ||
| 617 | { | ||
| 618 | struct page *page = NULL; | ||
| 619 | if (pfn_valid(pfn)) | ||
| 620 | page = pfn_to_page(pfn); | ||
| 621 | |||
| 622 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", | ||
| 623 | pfn, | ||
| 624 | page && PageDirty(page) ? "dirty " : "", | ||
| 625 | msg, action_name[result]); | ||
| 626 | } | ||
| 627 | |||
| 628 | static int page_action(struct page_state *ps, struct page *p, | ||
| 629 | unsigned long pfn, int ref) | ||
| 630 | { | ||
| 631 | int result; | ||
| 632 | |||
| 633 | result = ps->action(p, pfn); | ||
| 634 | action_result(pfn, ps->msg, result); | ||
| 635 | if (page_count(p) != 1 + ref) | ||
| 636 | printk(KERN_ERR | ||
| 637 | "MCE %#lx: %s page still referenced by %d users\n", | ||
| 638 | pfn, ps->msg, page_count(p) - 1); | ||
| 639 | |||
| 640 | /* Could do more checks here if page looks ok */ | ||
| 641 | /* | ||
| 642 | * Could adjust zone counters here to correct for the missing page. | ||
| 643 | */ | ||
| 644 | |||
| 645 | return result == RECOVERED ? 0 : -EBUSY; | ||
| 646 | } | ||
| 647 | |||
| 648 | #define N_UNMAP_TRIES 5 | ||
| 649 | |||
| 650 | /* | ||
| 651 | * Do all that is necessary to remove user space mappings. Unmap | ||
| 652 | * the pages and send SIGBUS to the processes if the data was dirty. | ||
| 653 | */ | ||
| 654 | static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | ||
| 655 | int trapno) | ||
| 656 | { | ||
| 657 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | ||
| 658 | struct address_space *mapping; | ||
| 659 | LIST_HEAD(tokill); | ||
| 660 | int ret; | ||
| 661 | int i; | ||
| 662 | int kill = 1; | ||
| 663 | |||
| 664 | if (PageReserved(p) || PageCompound(p) || PageSlab(p)) | ||
| 665 | return; | ||
| 666 | |||
| 667 | if (!PageLRU(p)) | ||
| 668 | lru_add_drain_all(); | ||
| 669 | |||
| 670 | /* | ||
| 671 | * This check implies we don't kill processes if their pages | ||
| 672 | * are in the swap cache early. Those are always late kills. | ||
| 673 | */ | ||
| 674 | if (!page_mapped(p)) | ||
| 675 | return; | ||
| 676 | |||
| 677 | if (PageSwapCache(p)) { | ||
| 678 | printk(KERN_ERR | ||
| 679 | "MCE %#lx: keeping poisoned page in swap cache\n", pfn); | ||
| 680 | ttu |= TTU_IGNORE_HWPOISON; | ||
| 681 | } | ||
| 682 | |||
| 683 | /* | ||
| 684 | * Propagate the dirty bit from PTEs to struct page first, because we | ||
| 685 | * need this to decide if we should kill or just drop the page. | ||
| 686 | */ | ||
| 687 | mapping = page_mapping(p); | ||
| 688 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { | ||
| 689 | if (page_mkclean(p)) { | ||
| 690 | SetPageDirty(p); | ||
| 691 | } else { | ||
| 692 | kill = 0; | ||
| 693 | ttu |= TTU_IGNORE_HWPOISON; | ||
| 694 | printk(KERN_INFO | ||
| 695 | "MCE %#lx: corrupted page was clean: dropped without side effects\n", | ||
| 696 | pfn); | ||
| 697 | } | ||
| 698 | } | ||
| 699 | |||
| 700 | /* | ||
| 701 | * First collect all the processes that have the page | ||
| 702 | * mapped in dirty form. This has to be done before try_to_unmap, | ||
| 703 | * because ttu takes the rmap data structures down. | ||
| 704 | * | ||
| 705 | * Error handling: We ignore errors here because | ||
| 706 | * there's nothing that can be done. | ||
| 707 | */ | ||
| 708 | if (kill) | ||
| 709 | collect_procs(p, &tokill); | ||
| 710 | |||
| 711 | /* | ||
| 712 | * try_to_unmap can fail temporarily due to races. | ||
| 713 | * Try a few times (RED-PEN better strategy?) | ||
| 714 | */ | ||
| 715 | for (i = 0; i < N_UNMAP_TRIES; i++) { | ||
| 716 | ret = try_to_unmap(p, ttu); | ||
| 717 | if (ret == SWAP_SUCCESS) | ||
| 718 | break; | ||
| 719 | pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); | ||
| 720 | } | ||
| 721 | |||
| 722 | if (ret != SWAP_SUCCESS) | ||
| 723 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", | ||
| 724 | pfn, page_mapcount(p)); | ||
| 725 | |||
| 726 | /* | ||
| 727 | * Now that the dirty bit has been propagated to the | ||
| 728 | * struct page and all unmaps done we can decide if | ||
| 729 | * killing is needed or not. Only kill when the page | ||
| 730 | * was dirty, otherwise the tokill list is merely | ||
| 731 | * freed. When there was a problem unmapping earlier | ||
| 732 | * use a more forceful, uncatchable kill to prevent | ||
| 733 | * any accesses to the poisoned memory. | ||
| 734 | */ | ||
| 735 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, | ||
| 736 | ret != SWAP_SUCCESS, pfn); | ||
| 737 | } | ||
| 738 | |||
| 739 | int __memory_failure(unsigned long pfn, int trapno, int ref) | ||
| 740 | { | ||
| 741 | struct page_state *ps; | ||
| 742 | struct page *p; | ||
| 743 | int res; | ||
| 744 | |||
| 745 | if (!sysctl_memory_failure_recovery) | ||
| 746 | panic("Memory failure from trap %d on page %lx", trapno, pfn); | ||
| 747 | |||
| 748 | if (!pfn_valid(pfn)) { | ||
| 749 | action_result(pfn, "memory outside kernel control", IGNORED); | ||
| 750 | return -EIO; | ||
| 751 | } | ||
| 752 | |||
| 753 | p = pfn_to_page(pfn); | ||
| 754 | if (TestSetPageHWPoison(p)) { | ||
| 755 | action_result(pfn, "already hardware poisoned", IGNORED); | ||
| 756 | return 0; | ||
| 757 | } | ||
| 758 | |||
| 759 | atomic_long_add(1, &mce_bad_pages); | ||
| 760 | |||
| 761 | /* | ||
| 762 | * We need/can do nothing about count=0 pages. | ||
| 763 | * 1) it's a free page, and therefore in safe hand: | ||
| 764 | * prep_new_page() will be the gate keeper. | ||
| 765 | * 2) it's part of a non-compound high order page. | ||
| 766 | * Implies some kernel user: cannot stop them from | ||
| 767 | * R/W the page; let's pray that the page has been | ||
| 768 | * used and will be freed some time later. | ||
| 769 | * In fact it's dangerous to directly bump up page count from 0, | ||
| 770 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. | ||
| 771 | */ | ||
| 772 | if (!get_page_unless_zero(compound_head(p))) { | ||
| 773 | action_result(pfn, "free or high order kernel", IGNORED); | ||
| 774 | return PageBuddy(compound_head(p)) ? 0 : -EBUSY; | ||
| 775 | } | ||
| 776 | |||
| 777 | /* | ||
| 778 | * Lock the page and wait for writeback to finish. | ||
| 779 | * It's very difficult to mess with pages currently under IO | ||
| 780 | * and in many cases impossible, so we just avoid it here. | ||
| 781 | */ | ||
| 782 | lock_page_nosync(p); | ||
| 783 | wait_on_page_writeback(p); | ||
| 784 | |||
| 785 | /* | ||
| 786 | * Now take care of user space mappings. | ||
| 787 | */ | ||
| 788 | hwpoison_user_mappings(p, pfn, trapno); | ||
| 789 | |||
| 790 | /* | ||
| 791 | * Torn down by someone else? | ||
| 792 | */ | ||
| 793 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { | ||
| 794 | action_result(pfn, "already truncated LRU", IGNORED); | ||
| 795 | res = 0; | ||
| 796 | goto out; | ||
| 797 | } | ||
| 798 | |||
| 799 | res = -EBUSY; | ||
| 800 | for (ps = error_states;; ps++) { | ||
| 801 | if ((p->flags & ps->mask) == ps->res) { | ||
| 802 | res = page_action(ps, p, pfn, ref); | ||
| 803 | break; | ||
| 804 | } | ||
| 805 | } | ||
| 806 | out: | ||
| 807 | unlock_page(p); | ||
| 808 | return res; | ||
| 809 | } | ||
| 810 | EXPORT_SYMBOL_GPL(__memory_failure); | ||
| 811 | |||
| 812 | /** | ||
| 813 | * memory_failure - Handle memory failure of a page. | ||
| 814 | * @pfn: Page Number of the corrupted page | ||
| 815 | * @trapno: Trap number reported in the signal to user space. | ||
| 816 | * | ||
| 817 | * This function is called by the low level machine check code | ||
| 818 | * of an architecture when it detects hardware memory corruption | ||
| 819 | * of a page. It tries its best to recover, which includes | ||
| 820 | * dropping pages, killing processes etc. | ||
| 821 | * | ||
| 822 | * The function is primarily of use for corruptions that | ||
| 823 | * happen outside the current execution context (e.g. when | ||
| 824 | * detected by a background scrubber) | ||
| 825 | * | ||
| 826 | * Must run in process context (e.g. a work queue) with interrupts | ||
| 827 | * enabled and no spinlocks held. | ||
| 828 | */ | ||
| 829 | void memory_failure(unsigned long pfn, int trapno) | ||
| 830 | { | ||
| 831 | __memory_failure(pfn, trapno, 0); | ||
| 832 | } | ||
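
The error_states[] lookup at the bottom of __memory_failure() is a first-match table walk: an entry applies when (p->flags & ps->mask) == ps->res, which is why more specific states are listed before the catch-all. The standalone toy below mimics only that matching logic; the flag bits are invented for illustration and are not the kernel's real PG_* layout.

/* Toy model of the error_states[] first-match scan in __memory_failure().
 * Bit values below are illustrative only, not the kernel's PG_* layout. */
#include <stdio.h>

#define F_DIRTY     (1UL << 0)
#define F_SWAPCACHE (1UL << 1)
#define F_LRU       (1UL << 2)

struct state {
    unsigned long mask;     /* which bits to look at */
    unsigned long res;      /* required values of those bits */
    const char *msg;
};

static const struct state states[] = {
    { F_SWAPCACHE | F_DIRTY, F_SWAPCACHE | F_DIRTY, "dirty swapcache" },
    { F_SWAPCACHE | F_DIRTY, F_SWAPCACHE,           "clean swapcache" },
    { F_LRU | F_DIRTY,       F_LRU | F_DIRTY,       "dirty LRU"       },
    { F_LRU | F_DIRTY,       F_LRU,                 "clean LRU"       },
    { 0,                     0,                     "unknown"         }, /* catch-all */
};

static const char *classify(unsigned long flags)
{
    const struct state *s;

    /* First entry whose masked bits match wins, as in __memory_failure(). */
    for (s = states; ; s++)
        if ((flags & s->mask) == s->res)
            return s->msg;
}

int main(void)
{
    printf("%s\n", classify(F_LRU));                 /* clean LRU */
    printf("%s\n", classify(F_SWAPCACHE | F_DIRTY)); /* dirty swapcache */
    printf("%s\n", classify(F_DIRTY));               /* unknown */
    return 0;
}
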
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
| @@ -36,6 +36,11 @@ | |||
| 36 | * mapping->tree_lock (widely used, in set_page_dirty, | 36 | * mapping->tree_lock (widely used, in set_page_dirty, |
| 37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
| 38 | * within inode_lock in __sync_single_inode) | 38 | * within inode_lock in __sync_single_inode) |
| 39 | * | ||
| 40 | * (code doesn't rely on that order so it could be switched around) | ||
| 41 | * ->tasklist_lock | ||
| 42 | * anon_vma->lock (memory_failure, collect_procs_anon) | ||
| 43 | * pte map lock | ||
| 39 | */ | 44 | */ |
| 40 | 45 | ||
| 41 | #include <linux/mm.h> | 46 | #include <linux/mm.h> |
| @@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, | |||
| 311 | * if the page is not mapped into the page tables of this VMA. Only | 316 | * if the page is not mapped into the page tables of this VMA. Only |
| 312 | * valid for normal file or anonymous VMAs. | 317 | * valid for normal file or anonymous VMAs. |
| 313 | */ | 318 | */ |
| 314 | static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | 319 | int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) |
| 315 | { | 320 | { |
| 316 | unsigned long address; | 321 | unsigned long address; |
| 317 | pte_t *pte; | 322 | pte_t *pte; |
