aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2018-08-25 21:43:59 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2018-08-25 21:43:59 -0400
commit 2923b27e54242acf27fd16b299e102117c82f52f (patch)
tree 86b3e27575814dab74307a7928bf579455b70e24
parent 828bf6e904eb8fc8969333568802689fbbf07a40 (diff)
parent c953cc987ab87d180e1d5de2f1c217abe33aac77 (diff)
Merge tag 'libnvdimm-for-4.19_dax-memory-failure' of gitolite.kernel.org:pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull libnvdimm memory-failure update from Dave Jiang:

"As it stands, memory_failure() gets thoroughly confused by dev_pagemap backed mappings. The recovery code has specific enabling for several possible page states and needs new enabling to handle poison in dax mappings.

In order to support reliable reverse mapping of user space addresses:

1/ Add new locking in the memory_failure() rmap path to prevent races that would typically be handled by the page lock.

2/ Since dev_pagemap pages are hidden from the page allocator and the "compound page" accounting machinery, add a mechanism to determine the size of the mapping that encompasses a given poisoned pfn.

3/ Given pmem errors can be repaired, change the speculatively accessed poison protection, mce_unmap_kpfn(), to be reversible and otherwise allow ongoing access from the kernel.

A side effect of this enabling is that MADV_HWPOISON becomes usable for dax mappings, however the primary motivation is to allow the system to survive userspace consumption of hardware-poison via dax.

Specifically the current behavior is:

    mce: Uncorrected hardware memory error in user-access at af34214200
    {1}[Hardware Error]: It has been corrected by h/w and requires no further action
    mce: [Hardware Error]: Machine check events logged
    {1}[Hardware Error]: event severity: corrected
    Memory failure: 0xaf34214: reserved kernel page still referenced by 1 users
    [..]
    Memory failure: 0xaf34214: recovery action for reserved kernel page: Failed
    mce: Memory error not recovered
    <reboot>

...and with these changes:

    Injecting memory failure for pfn 0x20cb00 at process virtual address 0x7f763dd00000
    Memory failure: 0x20cb00: Killing dax-pmd:5421 due to hardware memory corruption
    Memory failure: 0x20cb00: recovery action for dax page: Recovered

Given all the cross dependencies I propose taking this through nvdimm.git with acks from Naoya, x86/core, x86/RAS, and of course dax folks"

* tag 'libnvdimm-for-4.19_dax-memory-failure' of gitolite.kernel.org:pub/scm/linux/kernel/git/nvdimm/nvdimm:
  libnvdimm, pmem: Restore page attributes when clearing errors
  x86/memory_failure: Introduce {set, clear}_mce_nospec()
  x86/mm/pat: Prepare {reserve, free}_memtype() for "decoy" addresses
  mm, memory_failure: Teach memory_failure() about dev_pagemap pages
  filesystem-dax: Introduce dax_lock_mapping_entry()
  mm, memory_failure: Collect mapping size in collect_procs()
  mm, madvise_inject_error: Let memory_failure() optionally take a page reference
  mm, dev_pagemap: Do not clear ->mapping on final put
  mm, madvise_inject_error: Disable MADV_SOFT_OFFLINE for ZONE_DEVICE pages
  filesystem-dax: Set page->index
  device-dax: Set page->index
  device-dax: Enable page_mapping()
  device-dax: Convert to vmf_insert_mixed and vm_fault_t
-rw-r--r-- arch/x86/include/asm/set_memory.h 42
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-internal.h 15
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c 38
-rw-r--r-- arch/x86/mm/pat.c 16
-rw-r--r-- drivers/dax/device.c 75
-rw-r--r-- drivers/nvdimm/pmem.c 26
-rw-r--r-- drivers/nvdimm/pmem.h 13
-rw-r--r-- fs/dax.c 125
-rw-r--r-- include/linux/dax.h 13
-rw-r--r-- include/linux/huge_mm.h 5
-rw-r--r-- include/linux/mm.h 1
-rw-r--r-- include/linux/set_memory.h 14
-rw-r--r-- kernel/memremap.c 1
-rw-r--r-- mm/hmm.c 2
-rw-r--r-- mm/huge_memory.c 4
-rw-r--r-- mm/madvise.c 16
-rw-r--r-- mm/memory-failure.c 210
17 files changed, 481 insertions, 135 deletions
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 34cffcef7375..07a25753e85c 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -89,4 +89,46 @@ extern int kernel_set_to_readonly;
89void set_kernel_text_rw(void); 89void set_kernel_text_rw(void);
90void set_kernel_text_ro(void); 90void set_kernel_text_ro(void);
91 91
92#ifdef CONFIG_X86_64
93static inline int set_mce_nospec(unsigned long pfn)
94{
95 unsigned long decoy_addr;
96 int rc;
97
98 /*
99 * Mark the linear address as UC to make sure we don't log more
100 * errors because of speculative access to the page.
101 * We would like to just call:
102 * set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1);
103 * but doing that would radically increase the odds of a
104 * speculative access to the poison page because we'd have
105 * the virtual address of the kernel 1:1 mapping sitting
106 * around in registers.
107 * Instead we get tricky. We create a non-canonical address
108 * that looks just like the one we want, but has bit 63 flipped.
109 * This relies on set_memory_uc() properly sanitizing any __pa()
110 * results with __PHYSICAL_MASK or PTE_PFN_MASK.
111 */
112 decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
113
114 rc = set_memory_uc(decoy_addr, 1);
115 if (rc)
116 pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
117 return rc;
118}
119#define set_mce_nospec set_mce_nospec
120
121/* Restore full speculative operation to the pfn. */
122static inline int clear_mce_nospec(unsigned long pfn)
123{
124 return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1);
125}
126#define clear_mce_nospec clear_mce_nospec
127#else
128/*
129 * Few people would run a 32-bit kernel on a machine that supports
130 * recoverable errors because they have too much memory to boot 32-bit.
131 */
132#endif
133
92#endif /* _ASM_X86_SET_MEMORY_H */ 134#endif /* _ASM_X86_SET_MEMORY_H */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 374d1aa66952..ceb67cd5918f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -113,21 +113,6 @@ static inline void mce_register_injector_chain(struct notifier_block *nb) { }
113static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } 113static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
114#endif 114#endif
115 115
116#ifndef CONFIG_X86_64
117/*
118 * On 32-bit systems it would be difficult to safely unmap a poison page
119 * from the kernel 1:1 map because there are no non-canonical addresses that
120 * we can use to refer to the address without risking a speculative access.
121 * However, this isn't much of an issue because:
122 * 1) Few unmappable pages are in the 1:1 map. Most are in HIGHMEM which
123 * are only mapped into the kernel as needed
124 * 2) Few people would run a 32-bit kernel on a machine that supports
125 * recoverable errors because they have too much memory to boot 32-bit.
126 */
127static inline void mce_unmap_kpfn(unsigned long pfn) {}
128#define mce_unmap_kpfn mce_unmap_kpfn
129#endif
130
131struct mca_config { 116struct mca_config {
132 bool dont_log_ce; 117 bool dont_log_ce;
133 bool cmci_disabled; 118 bool cmci_disabled;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 4b767284b7f5..953b3ce92dcc 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -42,6 +42,7 @@
42#include <linux/irq_work.h> 42#include <linux/irq_work.h>
43#include <linux/export.h> 43#include <linux/export.h>
44#include <linux/jump_label.h> 44#include <linux/jump_label.h>
45#include <linux/set_memory.h>
45 46
46#include <asm/intel-family.h> 47#include <asm/intel-family.h>
47#include <asm/processor.h> 48#include <asm/processor.h>
@@ -50,7 +51,6 @@
50#include <asm/mce.h> 51#include <asm/mce.h>
51#include <asm/msr.h> 52#include <asm/msr.h>
52#include <asm/reboot.h> 53#include <asm/reboot.h>
53#include <asm/set_memory.h>
54 54
55#include "mce-internal.h" 55#include "mce-internal.h"
56 56
@@ -108,10 +108,6 @@ static struct irq_work mce_irq_work;
108 108
109static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); 109static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
110 110
111#ifndef mce_unmap_kpfn
112static void mce_unmap_kpfn(unsigned long pfn);
113#endif
114
115/* 111/*
116 * CPU/chipset specific EDAC code can register a notifier call here to print 112 * CPU/chipset specific EDAC code can register a notifier call here to print
117 * MCE errors in a human-readable form. 113 * MCE errors in a human-readable form.
@@ -602,7 +598,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
602 if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) { 598 if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
603 pfn = mce->addr >> PAGE_SHIFT; 599 pfn = mce->addr >> PAGE_SHIFT;
604 if (!memory_failure(pfn, 0)) 600 if (!memory_failure(pfn, 0))
605 mce_unmap_kpfn(pfn); 601 set_mce_nospec(pfn);
606 } 602 }
607 603
608 return NOTIFY_OK; 604 return NOTIFY_OK;
@@ -1072,38 +1068,10 @@ static int do_memory_failure(struct mce *m)
1072 if (ret) 1068 if (ret)
1073 pr_err("Memory error not recovered"); 1069 pr_err("Memory error not recovered");
1074 else 1070 else
1075 mce_unmap_kpfn(m->addr >> PAGE_SHIFT); 1071 set_mce_nospec(m->addr >> PAGE_SHIFT);
1076 return ret; 1072 return ret;
1077} 1073}
1078 1074
1079#ifndef mce_unmap_kpfn
1080static void mce_unmap_kpfn(unsigned long pfn)
1081{
1082 unsigned long decoy_addr;
1083
1084 /*
1085 * Unmap this page from the kernel 1:1 mappings to make sure
1086 * we don't log more errors because of speculative access to
1087 * the page.
1088 * We would like to just call:
1089 * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
1090 * but doing that would radically increase the odds of a
1091 * speculative access to the poison page because we'd have
1092 * the virtual address of the kernel 1:1 mapping sitting
1093 * around in registers.
1094 * Instead we get tricky. We create a non-canonical address
1095 * that looks just like the one we want, but has bit 63 flipped.
1096 * This relies on set_memory_np() not checking whether we passed
1097 * a legal address.
1098 */
1099
1100 decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
1101
1102 if (set_memory_np(decoy_addr, 1))
1103 pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
1104}
1105#endif
1106
1107 1075
1108/* 1076/*
1109 * Cases where we avoid rendezvous handler timeout: 1077 * Cases where we avoid rendezvous handler timeout:
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 1555bd7d3449..3d0c83ef6aab 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -512,6 +512,17 @@ static int free_ram_pages_type(u64 start, u64 end)
512 return 0; 512 return 0;
513} 513}
514 514
515static u64 sanitize_phys(u64 address)
516{
517 /*
518 * When changing the memtype for pages containing poison allow
519 * for a "decoy" virtual address (bit 63 clear) passed to
520 * set_memory_X(). __pa() on a "decoy" address results in a
521 * physical address with bit 63 set.
522 */
523 return address & __PHYSICAL_MASK;
524}
525
515/* 526/*
516 * req_type typically has one of the: 527 * req_type typically has one of the:
517 * - _PAGE_CACHE_MODE_WB 528 * - _PAGE_CACHE_MODE_WB
@@ -533,6 +544,8 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
533 int is_range_ram; 544 int is_range_ram;
534 int err = 0; 545 int err = 0;
535 546
547 start = sanitize_phys(start);
548 end = sanitize_phys(end);
536 BUG_ON(start >= end); /* end is exclusive */ 549 BUG_ON(start >= end); /* end is exclusive */
537 550
538 if (!pat_enabled()) { 551 if (!pat_enabled()) {
@@ -609,6 +622,9 @@ int free_memtype(u64 start, u64 end)
609 if (!pat_enabled()) 622 if (!pat_enabled())
610 return 0; 623 return 0;
611 624
625 start = sanitize_phys(start);
626 end = sanitize_phys(end);
627
612 /* Low ISA region is always mapped WB. No need to track */ 628 /* Low ISA region is always mapped WB. No need to track */
613 if (x86_platform.is_untracked_pat_range(start, end)) 629 if (x86_platform.is_untracked_pat_range(start, end))
614 return 0; 630 return 0;
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 0a2acd7993f0..6fd46083e629 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -248,13 +248,12 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
248 return -1; 248 return -1;
249} 249}
250 250
251static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) 251static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
252 struct vm_fault *vmf, pfn_t *pfn)
252{ 253{
253 struct device *dev = &dev_dax->dev; 254 struct device *dev = &dev_dax->dev;
254 struct dax_region *dax_region; 255 struct dax_region *dax_region;
255 int rc = VM_FAULT_SIGBUS;
256 phys_addr_t phys; 256 phys_addr_t phys;
257 pfn_t pfn;
258 unsigned int fault_size = PAGE_SIZE; 257 unsigned int fault_size = PAGE_SIZE;
259 258
260 if (check_vma(dev_dax, vmf->vma, __func__)) 259 if (check_vma(dev_dax, vmf->vma, __func__))
@@ -276,26 +275,19 @@ static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
276 return VM_FAULT_SIGBUS; 275 return VM_FAULT_SIGBUS;
277 } 276 }
278 277
279 pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); 278 *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
280
281 rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);
282
283 if (rc == -ENOMEM)
284 return VM_FAULT_OOM;
285 if (rc < 0 && rc != -EBUSY)
286 return VM_FAULT_SIGBUS;
287 279
288 return VM_FAULT_NOPAGE; 280 return vmf_insert_mixed(vmf->vma, vmf->address, *pfn);
289} 281}
290 282
291static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) 283static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
284 struct vm_fault *vmf, pfn_t *pfn)
292{ 285{
293 unsigned long pmd_addr = vmf->address & PMD_MASK; 286 unsigned long pmd_addr = vmf->address & PMD_MASK;
294 struct device *dev = &dev_dax->dev; 287 struct device *dev = &dev_dax->dev;
295 struct dax_region *dax_region; 288 struct dax_region *dax_region;
296 phys_addr_t phys; 289 phys_addr_t phys;
297 pgoff_t pgoff; 290 pgoff_t pgoff;
298 pfn_t pfn;
299 unsigned int fault_size = PMD_SIZE; 291 unsigned int fault_size = PMD_SIZE;
300 292
301 if (check_vma(dev_dax, vmf->vma, __func__)) 293 if (check_vma(dev_dax, vmf->vma, __func__))
@@ -331,21 +323,21 @@ static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
331 return VM_FAULT_SIGBUS; 323 return VM_FAULT_SIGBUS;
332 } 324 }
333 325
334 pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); 326 *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
335 327
336 return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, 328 return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, *pfn,
337 vmf->flags & FAULT_FLAG_WRITE); 329 vmf->flags & FAULT_FLAG_WRITE);
338} 330}
339 331
340#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 332#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
341static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) 333static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
334 struct vm_fault *vmf, pfn_t *pfn)
342{ 335{
343 unsigned long pud_addr = vmf->address & PUD_MASK; 336 unsigned long pud_addr = vmf->address & PUD_MASK;
344 struct device *dev = &dev_dax->dev; 337 struct device *dev = &dev_dax->dev;
345 struct dax_region *dax_region; 338 struct dax_region *dax_region;
346 phys_addr_t phys; 339 phys_addr_t phys;
347 pgoff_t pgoff; 340 pgoff_t pgoff;
348 pfn_t pfn;
349 unsigned int fault_size = PUD_SIZE; 341 unsigned int fault_size = PUD_SIZE;
350 342
351 343
@@ -382,23 +374,26 @@ static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
382 return VM_FAULT_SIGBUS; 374 return VM_FAULT_SIGBUS;
383 } 375 }
384 376
385 pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); 377 *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
386 378
387 return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn, 379 return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, *pfn,
388 vmf->flags & FAULT_FLAG_WRITE); 380 vmf->flags & FAULT_FLAG_WRITE);
389} 381}
390#else 382#else
391static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) 383static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
384 struct vm_fault *vmf, pfn_t *pfn)
392{ 385{
393 return VM_FAULT_FALLBACK; 386 return VM_FAULT_FALLBACK;
394} 387}
395#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 388#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
396 389
397static int dev_dax_huge_fault(struct vm_fault *vmf, 390static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
398 enum page_entry_size pe_size) 391 enum page_entry_size pe_size)
399{ 392{
400 int rc, id;
401 struct file *filp = vmf->vma->vm_file; 393 struct file *filp = vmf->vma->vm_file;
394 unsigned long fault_size;
395 int rc, id;
396 pfn_t pfn;
402 struct dev_dax *dev_dax = filp->private_data; 397 struct dev_dax *dev_dax = filp->private_data;
403 398
404 dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm, 399 dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm,
@@ -408,23 +403,49 @@ static int dev_dax_huge_fault(struct vm_fault *vmf,
408 id = dax_read_lock(); 403 id = dax_read_lock();
409 switch (pe_size) { 404 switch (pe_size) {
410 case PE_SIZE_PTE: 405 case PE_SIZE_PTE:
411 rc = __dev_dax_pte_fault(dev_dax, vmf); 406 fault_size = PAGE_SIZE;
407 rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn);
412 break; 408 break;
413 case PE_SIZE_PMD: 409 case PE_SIZE_PMD:
414 rc = __dev_dax_pmd_fault(dev_dax, vmf); 410 fault_size = PMD_SIZE;
411 rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn);
415 break; 412 break;
416 case PE_SIZE_PUD: 413 case PE_SIZE_PUD:
417 rc = __dev_dax_pud_fault(dev_dax, vmf); 414 fault_size = PUD_SIZE;
415 rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn);
418 break; 416 break;
419 default: 417 default:
420 rc = VM_FAULT_SIGBUS; 418 rc = VM_FAULT_SIGBUS;
421 } 419 }
420
421 if (rc == VM_FAULT_NOPAGE) {
422 unsigned long i;
423 pgoff_t pgoff;
424
425 /*
426 * In the device-dax case the only possibility for a
427 * VM_FAULT_NOPAGE result is when device-dax capacity is
428 * mapped. No need to consider the zero page, or racing
429 * conflicting mappings.
430 */
431 pgoff = linear_page_index(vmf->vma, vmf->address
432 & ~(fault_size - 1));
433 for (i = 0; i < fault_size / PAGE_SIZE; i++) {
434 struct page *page;
435
436 page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
437 if (page->mapping)
438 continue;
439 page->mapping = filp->f_mapping;
440 page->index = pgoff + i;
441 }
442 }
422 dax_read_unlock(id); 443 dax_read_unlock(id);
423 444
424 return rc; 445 return rc;
425} 446}
426 447
427static int dev_dax_fault(struct vm_fault *vmf) 448static vm_fault_t dev_dax_fault(struct vm_fault *vmf)
428{ 449{
429 return dev_dax_huge_fault(vmf, PE_SIZE_PTE); 450 return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
430} 451}
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index c23649867696..6071e2942053 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -20,6 +20,7 @@
20#include <linux/hdreg.h> 20#include <linux/hdreg.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/platform_device.h> 22#include <linux/platform_device.h>
23#include <linux/set_memory.h>
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/moduleparam.h> 25#include <linux/moduleparam.h>
25#include <linux/badblocks.h> 26#include <linux/badblocks.h>
@@ -51,6 +52,30 @@ static struct nd_region *to_region(struct pmem_device *pmem)
51 return to_nd_region(to_dev(pmem)->parent); 52 return to_nd_region(to_dev(pmem)->parent);
52} 53}
53 54
55static void hwpoison_clear(struct pmem_device *pmem,
56 phys_addr_t phys, unsigned int len)
57{
58 unsigned long pfn_start, pfn_end, pfn;
59
60 /* only pmem in the linear map supports HWPoison */
61 if (is_vmalloc_addr(pmem->virt_addr))
62 return;
63
64 pfn_start = PHYS_PFN(phys);
65 pfn_end = pfn_start + PHYS_PFN(len);
66 for (pfn = pfn_start; pfn < pfn_end; pfn++) {
67 struct page *page = pfn_to_page(pfn);
68
69 /*
70 * Note, no need to hold a get_dev_pagemap() reference
71 * here since we're in the driver I/O path and
72 * outstanding I/O requests pin the dev_pagemap.
73 */
74 if (test_and_clear_pmem_poison(page))
75 clear_mce_nospec(pfn);
76 }
77}
78
54static blk_status_t pmem_clear_poison(struct pmem_device *pmem, 79static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
55 phys_addr_t offset, unsigned int len) 80 phys_addr_t offset, unsigned int len)
56{ 81{
@@ -65,6 +90,7 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
65 if (cleared < len) 90 if (cleared < len)
66 rc = BLK_STS_IOERR; 91 rc = BLK_STS_IOERR;
67 if (cleared > 0 && cleared / 512) { 92 if (cleared > 0 && cleared / 512) {
93 hwpoison_clear(pmem, pmem->phys_addr + offset, cleared);
68 cleared /= 512; 94 cleared /= 512;
69 dev_dbg(dev, "%#llx clear %ld sector%s\n", 95 dev_dbg(dev, "%#llx clear %ld sector%s\n",
70 (unsigned long long) sector, cleared, 96 (unsigned long long) sector, cleared,
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index a64ebc78b5df..59cfe13ea8a8 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -1,6 +1,7 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __NVDIMM_PMEM_H__ 2#ifndef __NVDIMM_PMEM_H__
3#define __NVDIMM_PMEM_H__ 3#define __NVDIMM_PMEM_H__
4#include <linux/page-flags.h>
4#include <linux/badblocks.h> 5#include <linux/badblocks.h>
5#include <linux/types.h> 6#include <linux/types.h>
6#include <linux/pfn_t.h> 7#include <linux/pfn_t.h>
@@ -27,4 +28,16 @@ struct pmem_device {
27 28
28long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, 29long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
29 long nr_pages, void **kaddr, pfn_t *pfn); 30 long nr_pages, void **kaddr, pfn_t *pfn);
31
32#ifdef CONFIG_MEMORY_FAILURE
33static inline bool test_and_clear_pmem_poison(struct page *page)
34{
35 return TestClearPageHWPoison(page);
36}
37#else
38static inline bool test_and_clear_pmem_poison(struct page *page)
39{
40 return false;
41}
42#endif
30#endif /* __NVDIMM_PMEM_H__ */ 43#endif /* __NVDIMM_PMEM_H__ */
diff --git a/fs/dax.c b/fs/dax.c
index f76724139f80..f32d7125ad0f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -226,8 +226,8 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
226 * 226 *
227 * Must be called with the i_pages lock held. 227 * Must be called with the i_pages lock held.
228 */ 228 */
229static void *get_unlocked_mapping_entry(struct address_space *mapping, 229static void *__get_unlocked_mapping_entry(struct address_space *mapping,
230 pgoff_t index, void ***slotp) 230 pgoff_t index, void ***slotp, bool (*wait_fn)(void))
231{ 231{
232 void *entry, **slot; 232 void *entry, **slot;
233 struct wait_exceptional_entry_queue ewait; 233 struct wait_exceptional_entry_queue ewait;
@@ -237,6 +237,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
237 ewait.wait.func = wake_exceptional_entry_func; 237 ewait.wait.func = wake_exceptional_entry_func;
238 238
239 for (;;) { 239 for (;;) {
240 bool revalidate;
241
240 entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, 242 entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
241 &slot); 243 &slot);
242 if (!entry || 244 if (!entry ||
@@ -251,14 +253,31 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
251 prepare_to_wait_exclusive(wq, &ewait.wait, 253 prepare_to_wait_exclusive(wq, &ewait.wait,
252 TASK_UNINTERRUPTIBLE); 254 TASK_UNINTERRUPTIBLE);
253 xa_unlock_irq(&mapping->i_pages); 255 xa_unlock_irq(&mapping->i_pages);
254 schedule(); 256 revalidate = wait_fn();
255 finish_wait(wq, &ewait.wait); 257 finish_wait(wq, &ewait.wait);
256 xa_lock_irq(&mapping->i_pages); 258 xa_lock_irq(&mapping->i_pages);
259 if (revalidate)
260 return ERR_PTR(-EAGAIN);
257 } 261 }
258} 262}
259 263
260static void dax_unlock_mapping_entry(struct address_space *mapping, 264static bool entry_wait(void)
261 pgoff_t index) 265{
266 schedule();
267 /*
268 * Never return an ERR_PTR() from
269 * __get_unlocked_mapping_entry(), just keep looping.
270 */
271 return false;
272}
273
274static void *get_unlocked_mapping_entry(struct address_space *mapping,
275 pgoff_t index, void ***slotp)
276{
277 return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait);
278}
279
280static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
262{ 281{
263 void *entry, **slot; 282 void *entry, **slot;
264 283
@@ -277,7 +296,7 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
277static void put_locked_mapping_entry(struct address_space *mapping, 296static void put_locked_mapping_entry(struct address_space *mapping,
278 pgoff_t index) 297 pgoff_t index)
279{ 298{
280 dax_unlock_mapping_entry(mapping, index); 299 unlock_mapping_entry(mapping, index);
281} 300}
282 301
283/* 302/*
@@ -319,18 +338,27 @@ static unsigned long dax_radix_end_pfn(void *entry)
319 for (pfn = dax_radix_pfn(entry); \ 338 for (pfn = dax_radix_pfn(entry); \
320 pfn < dax_radix_end_pfn(entry); pfn++) 339 pfn < dax_radix_end_pfn(entry); pfn++)
321 340
322static void dax_associate_entry(void *entry, struct address_space *mapping) 341/*
342 * TODO: for reflink+dax we need a way to associate a single page with
343 * multiple address_space instances at different linear_page_index()
344 * offsets.
345 */
346static void dax_associate_entry(void *entry, struct address_space *mapping,
347 struct vm_area_struct *vma, unsigned long address)
323{ 348{
324 unsigned long pfn; 349 unsigned long size = dax_entry_size(entry), pfn, index;
350 int i = 0;
325 351
326 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) 352 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
327 return; 353 return;
328 354
355 index = linear_page_index(vma, address & ~(size - 1));
329 for_each_mapped_pfn(entry, pfn) { 356 for_each_mapped_pfn(entry, pfn) {
330 struct page *page = pfn_to_page(pfn); 357 struct page *page = pfn_to_page(pfn);
331 358
332 WARN_ON_ONCE(page->mapping); 359 WARN_ON_ONCE(page->mapping);
333 page->mapping = mapping; 360 page->mapping = mapping;
361 page->index = index + i++;
334 } 362 }
335} 363}
336 364
@@ -348,6 +376,7 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
348 WARN_ON_ONCE(trunc && page_ref_count(page) > 1); 376 WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
349 WARN_ON_ONCE(page->mapping && page->mapping != mapping); 377 WARN_ON_ONCE(page->mapping && page->mapping != mapping);
350 page->mapping = NULL; 378 page->mapping = NULL;
379 page->index = 0;
351 } 380 }
352} 381}
353 382
@@ -364,6 +393,84 @@ static struct page *dax_busy_page(void *entry)
364 return NULL; 393 return NULL;
365} 394}
366 395
396static bool entry_wait_revalidate(void)
397{
398 rcu_read_unlock();
399 schedule();
400 rcu_read_lock();
401
402 /*
403 * Tell __get_unlocked_mapping_entry() to take a break, we need
404 * to revalidate page->mapping after dropping locks
405 */
406 return true;
407}
408
409bool dax_lock_mapping_entry(struct page *page)
410{
411 pgoff_t index;
412 struct inode *inode;
413 bool did_lock = false;
414 void *entry = NULL, **slot;
415 struct address_space *mapping;
416
417 rcu_read_lock();
418 for (;;) {
419 mapping = READ_ONCE(page->mapping);
420
421 if (!dax_mapping(mapping))
422 break;
423
424 /*
425 * In the device-dax case there's no need to lock, a
426 * struct dev_pagemap pin is sufficient to keep the
427 * inode alive, and we assume we have dev_pagemap pin
428 * otherwise we would not have a valid pfn_to_page()
429 * translation.
430 */
431 inode = mapping->host;
432 if (S_ISCHR(inode->i_mode)) {
433 did_lock = true;
434 break;
435 }
436
437 xa_lock_irq(&mapping->i_pages);
438 if (mapping != page->mapping) {
439 xa_unlock_irq(&mapping->i_pages);
440 continue;
441 }
442 index = page->index;
443
444 entry = __get_unlocked_mapping_entry(mapping, index, &slot,
445 entry_wait_revalidate);
446 if (!entry) {
447 xa_unlock_irq(&mapping->i_pages);
448 break;
449 } else if (IS_ERR(entry)) {
450 WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN);
451 continue;
452 }
453 lock_slot(mapping, slot);
454 did_lock = true;
455 xa_unlock_irq(&mapping->i_pages);
456 break;
457 }
458 rcu_read_unlock();
459
460 return did_lock;
461}
462
463void dax_unlock_mapping_entry(struct page *page)
464{
465 struct address_space *mapping = page->mapping;
466 struct inode *inode = mapping->host;
467
468 if (S_ISCHR(inode->i_mode))
469 return;
470
471 unlock_mapping_entry(mapping, page->index);
472}
473
367/* 474/*
368 * Find radix tree entry at given index. If it points to an exceptional entry, 475 * Find radix tree entry at given index. If it points to an exceptional entry,
369 * return it with the radix tree entry locked. If the radix tree doesn't 476 * return it with the radix tree entry locked. If the radix tree doesn't
@@ -708,7 +815,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
708 new_entry = dax_radix_locked_entry(pfn, flags); 815 new_entry = dax_radix_locked_entry(pfn, flags);
709 if (dax_entry_size(entry) != dax_entry_size(new_entry)) { 816 if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
710 dax_disassociate_entry(entry, mapping, false); 817 dax_disassociate_entry(entry, mapping, false);
711 dax_associate_entry(new_entry, mapping); 818 dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
712 } 819 }
713 820
714 if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { 821 if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
diff --git a/include/linux/dax.h b/include/linux/dax.h
index deb0f663252f..450b28db9533 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -88,6 +88,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
88 struct block_device *bdev, struct writeback_control *wbc); 88 struct block_device *bdev, struct writeback_control *wbc);
89 89
90struct page *dax_layout_busy_page(struct address_space *mapping); 90struct page *dax_layout_busy_page(struct address_space *mapping);
91bool dax_lock_mapping_entry(struct page *page);
92void dax_unlock_mapping_entry(struct page *page);
91#else 93#else
92static inline bool bdev_dax_supported(struct block_device *bdev, 94static inline bool bdev_dax_supported(struct block_device *bdev,
93 int blocksize) 95 int blocksize)
@@ -119,6 +121,17 @@ static inline int dax_writeback_mapping_range(struct address_space *mapping,
119{ 121{
120 return -EOPNOTSUPP; 122 return -EOPNOTSUPP;
121} 123}
124
125static inline bool dax_lock_mapping_entry(struct page *page)
126{
127 if (IS_DAX(page->mapping->host))
128 return true;
129 return false;
130}
131
132static inline void dax_unlock_mapping_entry(struct page *page)
133{
134}
122#endif 135#endif
123 136
124int dax_read_lock(void); 137int dax_read_lock(void);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 27e3e32135a8..99c19b06d9a4 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -3,6 +3,7 @@
3#define _LINUX_HUGE_MM_H 3#define _LINUX_HUGE_MM_H
4 4
5#include <linux/sched/coredump.h> 5#include <linux/sched/coredump.h>
6#include <linux/mm_types.h>
6 7
7#include <linux/fs.h> /* only for vma_is_dax() */ 8#include <linux/fs.h> /* only for vma_is_dax() */
8 9
@@ -46,9 +47,9 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
46extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 47extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
47 unsigned long addr, pgprot_t newprot, 48 unsigned long addr, pgprot_t newprot,
48 int prot_numa); 49 int prot_numa);
49int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 50vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
50 pmd_t *pmd, pfn_t pfn, bool write); 51 pmd_t *pmd, pfn_t pfn, bool write);
51int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, 52vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
52 pud_t *pud, pfn_t pfn, bool write); 53 pud_t *pud, pfn_t pfn, bool write);
53enum transparent_hugepage_flag { 54enum transparent_hugepage_flag {
54 TRANSPARENT_HUGEPAGE_FLAG, 55 TRANSPARENT_HUGEPAGE_FLAG,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8fcc36660de6..a61ebe8ad4ca 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2731,6 +2731,7 @@ enum mf_action_page_type {
2731 MF_MSG_TRUNCATED_LRU, 2731 MF_MSG_TRUNCATED_LRU,
2732 MF_MSG_BUDDY, 2732 MF_MSG_BUDDY,
2733 MF_MSG_BUDDY_2ND, 2733 MF_MSG_BUDDY_2ND,
2734 MF_MSG_DAX,
2734 MF_MSG_UNKNOWN, 2735 MF_MSG_UNKNOWN,
2735}; 2736};
2736 2737
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
index da5178216da5..2a986d282a97 100644
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -17,6 +17,20 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
17static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } 17static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
18#endif 18#endif
19 19
20#ifndef set_mce_nospec
21static inline int set_mce_nospec(unsigned long pfn)
22{
23 return 0;
24}
25#endif
26
27#ifndef clear_mce_nospec
28static inline int clear_mce_nospec(unsigned long pfn)
29{
30 return 0;
31}
32#endif
33
20#ifndef CONFIG_ARCH_HAS_MEM_ENCRYPT 34#ifndef CONFIG_ARCH_HAS_MEM_ENCRYPT
21static inline int set_memory_encrypted(unsigned long addr, int numpages) 35static inline int set_memory_encrypted(unsigned long addr, int numpages)
22{ 36{
diff --git a/kernel/memremap.c b/kernel/memremap.c
index d57d58f77409..5b8600d39931 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -365,7 +365,6 @@ void __put_devmap_managed_page(struct page *page)
365 __ClearPageActive(page); 365 __ClearPageActive(page);
366 __ClearPageWaiters(page); 366 __ClearPageWaiters(page);
367 367
368 page->mapping = NULL;
369 mem_cgroup_uncharge(page); 368 mem_cgroup_uncharge(page);
370 369
371 page->pgmap->page_free(page, page->pgmap->data); 370 page->pgmap->page_free(page, page->pgmap->data);
diff --git a/mm/hmm.c b/mm/hmm.c
index 0b0554591610..c968e49f7a0c 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -968,6 +968,8 @@ static void hmm_devmem_free(struct page *page, void *data)
968{ 968{
969 struct hmm_devmem *devmem = data; 969 struct hmm_devmem *devmem = data;
970 970
971 page->mapping = NULL;
972
971 devmem->ops->free(devmem, page); 973 devmem->ops->free(devmem, page);
972} 974}
973 975
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 08b544383d74..c3bc7e9c9a2a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -752,7 +752,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
752 spin_unlock(ptl); 752 spin_unlock(ptl);
753} 753}
754 754
755int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 755vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
756 pmd_t *pmd, pfn_t pfn, bool write) 756 pmd_t *pmd, pfn_t pfn, bool write)
757{ 757{
758 pgprot_t pgprot = vma->vm_page_prot; 758 pgprot_t pgprot = vma->vm_page_prot;
@@ -812,7 +812,7 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
812 spin_unlock(ptl); 812 spin_unlock(ptl);
813} 813}
814 814
815int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, 815vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
816 pud_t *pud, pfn_t pfn, bool write) 816 pud_t *pud, pfn_t pfn, bool write)
817{ 817{
818 pgprot_t pgprot = vma->vm_page_prot; 818 pgprot_t pgprot = vma->vm_page_prot;
diff --git a/mm/madvise.c b/mm/madvise.c
index 4d3c922ea1a1..972a9eaa898b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -631,11 +631,13 @@ static int madvise_inject_error(int behavior,
631 631
632 632
633 for (; start < end; start += PAGE_SIZE << order) { 633 for (; start < end; start += PAGE_SIZE << order) {
634 unsigned long pfn;
634 int ret; 635 int ret;
635 636
636 ret = get_user_pages_fast(start, 1, 0, &page); 637 ret = get_user_pages_fast(start, 1, 0, &page);
637 if (ret != 1) 638 if (ret != 1)
638 return ret; 639 return ret;
640 pfn = page_to_pfn(page);
639 641
640 /* 642 /*
641 * When soft offlining hugepages, after migrating the page 643 * When soft offlining hugepages, after migrating the page
@@ -651,17 +653,25 @@ static int madvise_inject_error(int behavior,
651 653
652 if (behavior == MADV_SOFT_OFFLINE) { 654 if (behavior == MADV_SOFT_OFFLINE) {
653 pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", 655 pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
654 page_to_pfn(page), start); 656 pfn, start);
655 657
656 ret = soft_offline_page(page, MF_COUNT_INCREASED); 658 ret = soft_offline_page(page, MF_COUNT_INCREASED);
657 if (ret) 659 if (ret)
658 return ret; 660 return ret;
659 continue; 661 continue;
660 } 662 }
663
661 pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", 664 pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
662 page_to_pfn(page), start); 665 pfn, start);
663 666
664 ret = memory_failure(page_to_pfn(page), MF_COUNT_INCREASED); 667 /*
668 * Drop the page reference taken by get_user_pages_fast(). In
669 * the absence of MF_COUNT_INCREASED the memory_failure()
670 * routine is responsible for pinning the page to prevent it
671 * from being released back to the page allocator.
672 */
673 put_page(page);
674 ret = memory_failure(pfn, 0);
665 if (ret) 675 if (ret)
666 return ret; 676 return ret;
667 } 677 }
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 192d0bbfc9ea..0cd3de3550f0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -55,6 +55,7 @@
55#include <linux/hugetlb.h> 55#include <linux/hugetlb.h>
56#include <linux/memory_hotplug.h> 56#include <linux/memory_hotplug.h>
57#include <linux/mm_inline.h> 57#include <linux/mm_inline.h>
58#include <linux/memremap.h>
58#include <linux/kfifo.h> 59#include <linux/kfifo.h>
59#include <linux/ratelimit.h> 60#include <linux/ratelimit.h>
60#include <linux/page-isolation.h> 61#include <linux/page-isolation.h>
@@ -175,22 +176,51 @@ int hwpoison_filter(struct page *p)
175EXPORT_SYMBOL_GPL(hwpoison_filter); 176EXPORT_SYMBOL_GPL(hwpoison_filter);
176 177
177/* 178/*
179 * Kill all processes that have a poisoned page mapped and then isolate
180 * the page.
181 *
182 * General strategy:
183 * Find all processes having the page mapped and kill them.
184 * But we keep a page reference around so that the page is not
185 * actually freed yet.
186 * Then stash the page away
187 *
188 * There's no convenient way to get back to mapped processes
189 * from the VMAs. So do a brute-force search over all
190 * running processes.
191 *
192 * Remember that machine checks are not common (or rather
193 * if they are common you have other problems), so this shouldn't
194 * be a performance issue.
195 *
196 * Also there are some races possible while we get from the
197 * error detection to actually handle it.
198 */
199
200struct to_kill {
201 struct list_head nd;
202 struct task_struct *tsk;
203 unsigned long addr;
204 short size_shift;
205 char addr_valid;
206};
207
208/*
178 * Send all the processes who have the page mapped a signal. 209 * Send all the processes who have the page mapped a signal.
179 * ``action optional'' if they are not immediately affected by the error 210 * ``action optional'' if they are not immediately affected by the error
180 * ``action required'' if error happened in current execution context 211 * ``action required'' if error happened in current execution context
181 */ 212 */
182static int kill_proc(struct task_struct *t, unsigned long addr, 213static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
183 unsigned long pfn, struct page *page, int flags)
184{ 214{
185 short addr_lsb; 215 struct task_struct *t = tk->tsk;
216 short addr_lsb = tk->size_shift;
186 int ret; 217 int ret;
187 218
188 pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n", 219 pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
189 pfn, t->comm, t->pid); 220 pfn, t->comm, t->pid);
190 addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
191 221
192 if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { 222 if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
193 ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr, 223 ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
194 addr_lsb, current); 224 addr_lsb, current);
195 } else { 225 } else {
196 /* 226 /*
@@ -199,7 +229,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr,
199 * This could cause a loop when the user sets SIGBUS 229 * This could cause a loop when the user sets SIGBUS
200 * to SIG_IGN, but hopefully no one will do that? 230 * to SIG_IGN, but hopefully no one will do that?
201 */ 231 */
202 ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)addr, 232 ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
203 addr_lsb, t); /* synchronous? */ 233 addr_lsb, t); /* synchronous? */
204 } 234 }
205 if (ret < 0) 235 if (ret < 0)
@@ -235,34 +265,39 @@ void shake_page(struct page *p, int access)
235} 265}
236EXPORT_SYMBOL_GPL(shake_page); 266EXPORT_SYMBOL_GPL(shake_page);
237 267
238/* 268static unsigned long dev_pagemap_mapping_shift(struct page *page,
239 * Kill all processes that have a poisoned page mapped and then isolate 269 struct vm_area_struct *vma)
240 * the page. 270{
241 * 271 unsigned long address = vma_address(page, vma);
242 * General strategy: 272 pgd_t *pgd;
243 * Find all processes having the page mapped and kill them. 273 p4d_t *p4d;
244 * But we keep a page reference around so that the page is not 274 pud_t *pud;
245 * actually freed yet. 275 pmd_t *pmd;
246 * Then stash the page away 276 pte_t *pte;
247 * 277
248 * There's no convenient way to get back to mapped processes 278 pgd = pgd_offset(vma->vm_mm, address);
249 * from the VMAs. So do a brute-force search over all 279 if (!pgd_present(*pgd))
250 * running processes. 280 return 0;
251 * 281 p4d = p4d_offset(pgd, address);
252 * Remember that machine checks are not common (or rather 282 if (!p4d_present(*p4d))
253 * if they are common you have other problems), so this shouldn't 283 return 0;
254 * be a performance issue. 284 pud = pud_offset(p4d, address);
255 * 285 if (!pud_present(*pud))
256 * Also there are some races possible while we get from the 286 return 0;
257 * error detection to actually handle it. 287 if (pud_devmap(*pud))
258 */ 288 return PUD_SHIFT;
259 289 pmd = pmd_offset(pud, address);
260struct to_kill { 290 if (!pmd_present(*pmd))
261 struct list_head nd; 291 return 0;
262 struct task_struct *tsk; 292 if (pmd_devmap(*pmd))
263 unsigned long addr; 293 return PMD_SHIFT;
264 char addr_valid; 294 pte = pte_offset_map(pmd, address);
265}; 295 if (!pte_present(*pte))
296 return 0;
297 if (pte_devmap(*pte))
298 return PAGE_SHIFT;
299 return 0;
300}
266 301
267/* 302/*
268 * Failure handling: if we can't find or can't kill a process there's 303 * Failure handling: if we can't find or can't kill a process there's
@@ -293,6 +328,10 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
293 } 328 }
294 tk->addr = page_address_in_vma(p, vma); 329 tk->addr = page_address_in_vma(p, vma);
295 tk->addr_valid = 1; 330 tk->addr_valid = 1;
331 if (is_zone_device_page(p))
332 tk->size_shift = dev_pagemap_mapping_shift(p, vma);
333 else
334 tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
296 335
297 /* 336 /*
298 * In theory we don't have to kill when the page was 337 * In theory we don't have to kill when the page was
@@ -300,7 +339,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
300 * likely very rare kill anyways just out of paranoia, but use 339 * likely very rare kill anyways just out of paranoia, but use
301 * a SIGKILL because the error is not contained anymore. 340 * a SIGKILL because the error is not contained anymore.
302 */ 341 */
303 if (tk->addr == -EFAULT) { 342 if (tk->addr == -EFAULT || tk->size_shift == 0) {
304 pr_info("Memory failure: Unable to find user space address %lx in %s\n", 343 pr_info("Memory failure: Unable to find user space address %lx in %s\n",
305 page_to_pfn(p), tsk->comm); 344 page_to_pfn(p), tsk->comm);
306 tk->addr_valid = 0; 345 tk->addr_valid = 0;
@@ -318,9 +357,8 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
318 * Also when FAIL is set do a force kill because something went 357 * Also when FAIL is set do a force kill because something went
319 * wrong earlier. 358 * wrong earlier.
320 */ 359 */
321static void kill_procs(struct list_head *to_kill, int forcekill, 360static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
322 bool fail, struct page *page, unsigned long pfn, 361 unsigned long pfn, int flags)
323 int flags)
324{ 362{
325 struct to_kill *tk, *next; 363 struct to_kill *tk, *next;
326 364
@@ -343,8 +381,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill,
343 * check for that, but we need to tell the 381 * check for that, but we need to tell the
344 * process anyways. 382 * process anyways.
345 */ 383 */
346 else if (kill_proc(tk->tsk, tk->addr, 384 else if (kill_proc(tk, pfn, flags) < 0)
347 pfn, page, flags) < 0)
348 pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n", 385 pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
349 pfn, tk->tsk->comm, tk->tsk->pid); 386 pfn, tk->tsk->comm, tk->tsk->pid);
350 } 387 }
@@ -516,6 +553,7 @@ static const char * const action_page_types[] = {
516 [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", 553 [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
517 [MF_MSG_BUDDY] = "free buddy page", 554 [MF_MSG_BUDDY] = "free buddy page",
518 [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", 555 [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
556 [MF_MSG_DAX] = "dax page",
519 [MF_MSG_UNKNOWN] = "unknown page", 557 [MF_MSG_UNKNOWN] = "unknown page",
520}; 558};
521 559
@@ -1013,7 +1051,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
1013 * any accesses to the poisoned memory. 1051 * any accesses to the poisoned memory.
1014 */ 1052 */
1015 forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); 1053 forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
1016 kill_procs(&tokill, forcekill, !unmap_success, p, pfn, flags); 1054 kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
1017 1055
1018 return unmap_success; 1056 return unmap_success;
1019} 1057}
@@ -1113,6 +1151,83 @@ out:
1113 return res; 1151 return res;
1114} 1152}
1115 1153
1154static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
1155 struct dev_pagemap *pgmap)
1156{
1157 struct page *page = pfn_to_page(pfn);
1158 const bool unmap_success = true;
1159 unsigned long size = 0;
1160 struct to_kill *tk;
1161 LIST_HEAD(tokill);
1162 int rc = -EBUSY;
1163 loff_t start;
1164
1165 /*
1166 * Prevent the inode from being freed while we are interrogating
1167 * the address_space, typically this would be handled by
1168 * lock_page(), but dax pages do not use the page lock. This
1169 * also prevents changes to the mapping of this pfn until
1170 * poison signaling is complete.
1171 */
1172 if (!dax_lock_mapping_entry(page))
1173 goto out;
1174
1175 if (hwpoison_filter(page)) {
1176 rc = 0;
1177 goto unlock;
1178 }
1179
1180 switch (pgmap->type) {
1181 case MEMORY_DEVICE_PRIVATE:
1182 case MEMORY_DEVICE_PUBLIC:
1183 /*
1184 * TODO: Handle HMM pages which may need coordination
1185 * with device-side memory.
1186 */
1187 goto unlock;
1188 default:
1189 break;
1190 }
1191
1192 /*
1193 * Use this flag as an indication that the dax page has been
1194 * remapped UC to prevent speculative consumption of poison.
1195 */
1196 SetPageHWPoison(page);
1197
1198 /*
1199 * Unlike System-RAM there is no possibility to swap in a
1200 * different physical page at a given virtual address, so all
1201 * userspace consumption of ZONE_DEVICE memory necessitates
1202 * SIGBUS (i.e. MF_MUST_KILL)
1203 */
1204 flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
1205 collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
1206
1207 list_for_each_entry(tk, &tokill, nd)
1208 if (tk->size_shift)
1209 size = max(size, 1UL << tk->size_shift);
1210 if (size) {
1211 /*
1212 * Unmap the largest mapping to avoid breaking up
1213 * device-dax mappings which are constant size. The
1214 * actual size of the mapping being torn down is
1215 * communicated in siginfo, see kill_proc()
1216 */
1217 start = (page->index << PAGE_SHIFT) & ~(size - 1);
1218 unmap_mapping_range(page->mapping, start, start + size, 0);
1219 }
1220 kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
1221 rc = 0;
1222unlock:
1223 dax_unlock_mapping_entry(page);
1224out:
1225 /* drop pgmap ref acquired in caller */
1226 put_dev_pagemap(pgmap);
1227 action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
1228 return rc;
1229}
1230
1116/** 1231/**
1117 * memory_failure - Handle memory failure of a page. 1232 * memory_failure - Handle memory failure of a page.
1118 * @pfn: Page Number of the corrupted page 1233 * @pfn: Page Number of the corrupted page
@@ -1135,6 +1250,7 @@ int memory_failure(unsigned long pfn, int flags)
1135 struct page *p; 1250 struct page *p;
1136 struct page *hpage; 1251 struct page *hpage;
1137 struct page *orig_head; 1252 struct page *orig_head;
1253 struct dev_pagemap *pgmap;
1138 int res; 1254 int res;
1139 unsigned long page_flags; 1255 unsigned long page_flags;
1140 1256
@@ -1147,6 +1263,10 @@ int memory_failure(unsigned long pfn, int flags)
1147 return -ENXIO; 1263 return -ENXIO;
1148 } 1264 }
1149 1265
1266 pgmap = get_dev_pagemap(pfn, NULL);
1267 if (pgmap)
1268 return memory_failure_dev_pagemap(pfn, flags, pgmap);
1269
1150 p = pfn_to_page(pfn); 1270 p = pfn_to_page(pfn);
1151 if (PageHuge(p)) 1271 if (PageHuge(p))
1152 return memory_failure_hugetlb(pfn, flags); 1272 return memory_failure_hugetlb(pfn, flags);
@@ -1777,6 +1897,14 @@ int soft_offline_page(struct page *page, int flags)
1777 int ret; 1897 int ret;
1778 unsigned long pfn = page_to_pfn(page); 1898 unsigned long pfn = page_to_pfn(page);
1779 1899
1900 if (is_zone_device_page(page)) {
1901 pr_debug_ratelimited("soft_offline: %#lx page is device page\n",
1902 pfn);
1903 if (flags & MF_COUNT_INCREASED)
1904 put_page(page);
1905 return -EIO;
1906 }
1907
1780 if (PageHWPoison(page)) { 1908 if (PageHWPoison(page)) {
1781 pr_info("soft offline: %#lx page already poisoned\n", pfn); 1909 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1782 if (flags & MF_COUNT_INCREASED) 1910 if (flags & MF_COUNT_INCREASED)