author    Linus Torvalds <torvalds@linux-foundation.org>  2018-08-25 21:43:59 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2018-08-25 21:43:59 -0400
commit    2923b27e54242acf27fd16b299e102117c82f52f (patch)
tree      86b3e27575814dab74307a7928bf579455b70e24 /drivers/dax
parent    828bf6e904eb8fc8969333568802689fbbf07a40 (diff)
parent    c953cc987ab87d180e1d5de2f1c217abe33aac77 (diff)
Merge tag 'libnvdimm-for-4.19_dax-memory-failure' of gitolite.kernel.org:pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull libnvdimm memory-failure update from Dave Jiang:
 "As it stands, memory_failure() gets thoroughly confused by dev_pagemap
  backed mappings. The recovery code has specific enabling for several
  possible page states and needs new enabling to handle poison in dax
  mappings.

  In order to support reliable reverse mapping of user space addresses:

   1/ Add new locking in the memory_failure() rmap path to prevent races
      that would typically be handled by the page lock.

   2/ Since dev_pagemap pages are hidden from the page allocator and the
      "compound page" accounting machinery, add a mechanism to determine
      the size of the mapping that encompasses a given poisoned pfn.

   3/ Given pmem errors can be repaired, change the speculatively accessed
      poison protection, mce_unmap_kpfn(), to be reversible and otherwise
      allow ongoing access from the kernel.

  A side effect of this enabling is that MADV_HWPOISON becomes usable for
  dax mappings, however the primary motivation is to allow the system to
  survive userspace consumption of hardware-poison via dax. Specifically
  the current behavior is:

     mce: Uncorrected hardware memory error in user-access at af34214200
     {1}[Hardware Error]: It has been corrected by h/w and requires no further action
     mce: [Hardware Error]: Machine check events logged
     {1}[Hardware Error]: event severity: corrected
     Memory failure: 0xaf34214: reserved kernel page still referenced by 1 users
     [..]
     Memory failure: 0xaf34214: recovery action for reserved kernel page: Failed
     mce: Memory error not recovered
     <reboot>

  ...and with these changes:

     Injecting memory failure for pfn 0x20cb00 at process virtual address 0x7f763dd00000
     Memory failure: 0x20cb00: Killing dax-pmd:5421 due to hardware memory corruption
     Memory failure: 0x20cb00: recovery action for dax page: Recovered

  Given all the cross dependencies I propose taking this through
  nvdimm.git with acks from Naoya, x86/core, x86/RAS, and of course dax
  folks"

* tag 'libnvdimm-for-4.19_dax-memory-failure' of gitolite.kernel.org:pub/scm/linux/kernel/git/nvdimm/nvdimm:
  libnvdimm, pmem: Restore page attributes when clearing errors
  x86/memory_failure: Introduce {set, clear}_mce_nospec()
  x86/mm/pat: Prepare {reserve, free}_memtype() for "decoy" addresses
  mm, memory_failure: Teach memory_failure() about dev_pagemap pages
  filesystem-dax: Introduce dax_lock_mapping_entry()
  mm, memory_failure: Collect mapping size in collect_procs()
  mm, madvise_inject_error: Let memory_failure() optionally take a page reference
  mm, dev_pagemap: Do not clear ->mapping on final put
  mm, madvise_inject_error: Disable MADV_SOFT_OFFLINE for ZONE_DEVICE pages
  filesystem-dax: Set page->index
  device-dax: Set page->index
  device-dax: Enable page_mapping()
  device-dax: Convert to vmf_insert_mixed and vm_fault_t
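The "Injecting memory failure ..." log excerpt above comes from the madvise-based injection path. Below is a minimal userspace sketch of that scenario; it is an illustration rather than part of the patch set, the /dev/dax0.0 path and 2 MiB mapping size are assumptions, and MADV_HWPOISON needs CAP_SYS_ADMIN plus CONFIG_MEMORY_FAILURE=y. With this merge the consuming process is expected to receive SIGBUS instead of the machine rebooting.

/*
 * Hedged illustration, not from this merge: consume poison injected into a
 * device-dax mapping. Assumes a device-dax node at /dev/dax0.0 with a 2 MiB
 * alignment; MADV_HWPOISON requires CAP_SYS_ADMIN and CONFIG_MEMORY_FAILURE=y.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

#define MAP_LEN (2UL << 20)     /* one PMD-sized device-dax fault */

static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
        /* with this merge the kernel recovers and signals, rather than rebooting */
        fprintf(stderr, "SIGBUS at %p\n", si->si_addr);
        _exit(1);
}

int main(void)
{
        struct sigaction sa = {
                .sa_sigaction = sigbus_handler,
                .sa_flags = SA_SIGINFO,
        };
        int fd = open("/dev/dax0.0", O_RDWR);
        char *buf;

        if (fd < 0 || sigaction(SIGBUS, &sa, NULL))
                return 1;

        buf = mmap(NULL, MAP_LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (buf == MAP_FAILED)
                return 1;

        buf[0] = 1;                             /* fault the dax mapping in */
        if (madvise(buf, 4096, MADV_HWPOISON))  /* inject "hardware" poison */
                perror("madvise(MADV_HWPOISON)");

        buf[0] = 1;     /* consume the poison: expect SIGBUS, not a reboot */
        return 0;
}

Run as root against an otherwise unused device-dax instance; the dmesg lines quoted in the pull message are the expected output for the recovered case.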
Diffstat (limited to 'drivers/dax')
-rw-r--r--  drivers/dax/device.c  75
1 file changed, 48 insertions, 27 deletions
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 0a2acd7993f0..6fd46083e629 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -248,13 +248,12 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
         return -1;
 }
 
-static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
+                struct vm_fault *vmf, pfn_t *pfn)
 {
         struct device *dev = &dev_dax->dev;
         struct dax_region *dax_region;
-        int rc = VM_FAULT_SIGBUS;
         phys_addr_t phys;
-        pfn_t pfn;
         unsigned int fault_size = PAGE_SIZE;
 
         if (check_vma(dev_dax, vmf->vma, __func__))
@@ -276,26 +275,19 @@ static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
                 return VM_FAULT_SIGBUS;
         }
 
-        pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
-
-        rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);
-
-        if (rc == -ENOMEM)
-                return VM_FAULT_OOM;
-        if (rc < 0 && rc != -EBUSY)
-                return VM_FAULT_SIGBUS;
+        *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-        return VM_FAULT_NOPAGE;
+        return vmf_insert_mixed(vmf->vma, vmf->address, *pfn);
 }
 
-static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
+                struct vm_fault *vmf, pfn_t *pfn)
 {
         unsigned long pmd_addr = vmf->address & PMD_MASK;
         struct device *dev = &dev_dax->dev;
         struct dax_region *dax_region;
         phys_addr_t phys;
         pgoff_t pgoff;
-        pfn_t pfn;
         unsigned int fault_size = PMD_SIZE;
 
         if (check_vma(dev_dax, vmf->vma, __func__))
@@ -331,21 +323,21 @@ static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
                 return VM_FAULT_SIGBUS;
         }
 
-        pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+        *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-        return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
+        return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, *pfn,
                         vmf->flags & FAULT_FLAG_WRITE);
 }
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
+                struct vm_fault *vmf, pfn_t *pfn)
 {
         unsigned long pud_addr = vmf->address & PUD_MASK;
         struct device *dev = &dev_dax->dev;
         struct dax_region *dax_region;
         phys_addr_t phys;
         pgoff_t pgoff;
-        pfn_t pfn;
         unsigned int fault_size = PUD_SIZE;
 
 
@@ -382,23 +374,26 @@ static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
                 return VM_FAULT_SIGBUS;
         }
 
-        pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+        *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-        return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn,
+        return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, *pfn,
                         vmf->flags & FAULT_FLAG_WRITE);
 }
 #else
-static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
+                struct vm_fault *vmf, pfn_t *pfn)
 {
         return VM_FAULT_FALLBACK;
 }
 #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-static int dev_dax_huge_fault(struct vm_fault *vmf,
+static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
                 enum page_entry_size pe_size)
 {
-        int rc, id;
         struct file *filp = vmf->vma->vm_file;
+        unsigned long fault_size;
+        int rc, id;
+        pfn_t pfn;
         struct dev_dax *dev_dax = filp->private_data;
 
         dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm,
@@ -408,23 +403,49 @@ static int dev_dax_huge_fault(struct vm_fault *vmf,
         id = dax_read_lock();
         switch (pe_size) {
         case PE_SIZE_PTE:
-                rc = __dev_dax_pte_fault(dev_dax, vmf);
+                fault_size = PAGE_SIZE;
+                rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn);
                 break;
         case PE_SIZE_PMD:
-                rc = __dev_dax_pmd_fault(dev_dax, vmf);
+                fault_size = PMD_SIZE;
+                rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn);
                 break;
         case PE_SIZE_PUD:
-                rc = __dev_dax_pud_fault(dev_dax, vmf);
+                fault_size = PUD_SIZE;
+                rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn);
                 break;
         default:
                 rc = VM_FAULT_SIGBUS;
         }
+
+        if (rc == VM_FAULT_NOPAGE) {
+                unsigned long i;
+                pgoff_t pgoff;
+
+                /*
+                 * In the device-dax case the only possibility for a
+                 * VM_FAULT_NOPAGE result is when device-dax capacity is
+                 * mapped. No need to consider the zero page, or racing
+                 * conflicting mappings.
+                 */
+                pgoff = linear_page_index(vmf->vma, vmf->address
+                                & ~(fault_size - 1));
+                for (i = 0; i < fault_size / PAGE_SIZE; i++) {
+                        struct page *page;
+
+                        page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
+                        if (page->mapping)
+                                continue;
+                        page->mapping = filp->f_mapping;
+                        page->index = pgoff + i;
+                }
+        }
         dax_read_unlock(id);
 
         return rc;
 }
 
-static int dev_dax_fault(struct vm_fault *vmf)
+static vm_fault_t dev_dax_fault(struct vm_fault *vmf)
 {
         return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
 }