author    Ross Zwisler <ross.zwisler@linux.intel.com>    2015-10-15 18:28:32 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2015-10-16 14:42:28 -0400
commit    0f90cc6609c72b0bdf2aad0cb0456194dd896e19 (patch)
tree      1e9c2d12331cdc92576d46cd1f6d681323c4080b
parent    424cdc14138088ada1b0e407a2195b2783c6e5ef (diff)
mm, dax: fix DAX deadlocks
The following two locking commits in the DAX code:

commit 843172978bb9 ("dax: fix race between simultaneous faults")
commit 46c043ede471 ("mm: take i_mmap_lock in unmap_mapping_range() for DAX")

introduced a number of deadlocks and other issues which need to be fixed for the v4.3 kernel.  The list of issues in DAX after these commits (some newly introduced by the commits, some preexisting) can be found here:

https://lkml.org/lkml/2015/9/25/602
(Subject: "Re: [PATCH] dax: fix deadlock in __dax_fault")

This undoes most of the changes introduced by those two commits, essentially returning us to the DAX locking scheme that was used in v4.2.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dave Chinner <dchinner@redhat.com>
Cc: Jan Kara <jack@suse.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
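For context, the fault-side locking scheme this patch restores works as follows: the DAX fault paths take i_mmap_lock_read() around the i_size re-check and the page-table insertion, while unmap_mapping_range() (see the mm/memory.c hunk below) keeps taking i_mmap_lock_write(), so truncate and page faults serialize on i_mmap_lock. The code below is a condensed sketch of the restored dax_insert_mapping() assembled from the fs/dax.c hunks in this patch; the pmem block lookup and some error handling are elided, so it is illustrative rather than a drop-in copy of the full function.

/*
 * Condensed sketch of the locking restored in dax_insert_mapping()
 * (fs/dax.c).  The details of mapping the pmem block are elided.
 */
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
                        struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct address_space *mapping = inode->i_mapping;
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
        unsigned long pfn;
        pgoff_t size;
        int error;

        /* Faults hold i_mmap_lock for read; truncate holds it for write. */
        i_mmap_lock_read(mapping);

        /* Re-check i_size: a truncate may have raced with block allocation. */
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (vmf->pgoff >= size) {
                error = -EIO;
                goto out;
        }

        /* ... look up the pmem address and pfn for this block (elided) ... */

        error = vm_insert_mixed(vma, vaddr, pfn);

 out:
        i_mmap_unlock_read(mapping);

        return error;
}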
-rw-r--r--  fs/dax.c      70
-rw-r--r--  mm/memory.c    2
2 files changed, 31 insertions(+), 41 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index bcfb14bfc1e4..a86d3cc2b389 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -285,6 +285,7 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
                         struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+        struct address_space *mapping = inode->i_mapping;
         sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
         unsigned long vaddr = (unsigned long)vmf->virtual_address;
         void __pmem *addr;
@@ -292,6 +293,8 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
         pgoff_t size;
         int error;
 
+        i_mmap_lock_read(mapping);
+
         /*
          * Check truncate didn't happen while we were allocating a block.
          * If it did, this block may or may not be still allocated to the
@@ -321,6 +324,8 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
         error = vm_insert_mixed(vma, vaddr, pfn);
 
  out:
+        i_mmap_unlock_read(mapping);
+
         return error;
 }
 
@@ -382,17 +387,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                         * from a read fault and we've raced with a truncate
                         */
                        error = -EIO;
-                       goto unlock;
+                       goto unlock_page;
                }
-       } else {
-               i_mmap_lock_write(mapping);
        }
 
        error = get_block(inode, block, &bh, 0);
        if (!error && (bh.b_size < PAGE_SIZE))
                error = -EIO;           /* fs corruption? */
        if (error)
-               goto unlock;
+               goto unlock_page;
 
        if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
                if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -403,9 +406,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                        if (!error && (bh.b_size < PAGE_SIZE))
                                error = -EIO;
                        if (error)
-                               goto unlock;
+                               goto unlock_page;
                } else {
-                       i_mmap_unlock_write(mapping);
                        return dax_load_hole(mapping, page, vmf);
                }
        }
@@ -417,15 +419,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                else
                        clear_user_highpage(new_page, vaddr);
                if (error)
-                       goto unlock;
+                       goto unlock_page;
                vmf->page = page;
                if (!page) {
+                       i_mmap_lock_read(mapping);
                        /* Check we didn't race with truncate */
                        size = (i_size_read(inode) + PAGE_SIZE - 1) >>
                                                                PAGE_SHIFT;
                        if (vmf->pgoff >= size) {
+                               i_mmap_unlock_read(mapping);
                                error = -EIO;
-                               goto unlock;
+                               goto out;
                        }
                }
                return VM_FAULT_LOCKED;
@@ -461,8 +465,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
        }
 
-       if (!page)
-               i_mmap_unlock_write(mapping);
  out:
        if (error == -ENOMEM)
                return VM_FAULT_OOM | major;
@@ -471,14 +473,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                return VM_FAULT_SIGBUS | major;
        return VM_FAULT_NOPAGE | major;
 
- unlock:
+ unlock_page:
        if (page) {
                unlock_page(page);
                page_cache_release(page);
-       } else {
-               i_mmap_unlock_write(mapping);
        }
-
        goto out;
 }
 EXPORT_SYMBOL(__dax_fault);
@@ -556,10 +555,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
        bh.b_size = PMD_SIZE;
-       i_mmap_lock_write(mapping);
        length = get_block(inode, block, &bh, write);
        if (length)
                return VM_FAULT_SIGBUS;
+       i_mmap_lock_read(mapping);
 
        /*
         * If the filesystem isn't willing to tell us the length of a hole,
@@ -569,36 +568,14 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
                goto fallback;
 
-       sector = bh.b_blocknr << (blkbits - 9);
-
-       if (buffer_unwritten(&bh) || buffer_new(&bh)) {
-               int i;
-
-               length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
-                                               bh.b_size);
-               if (length < 0) {
-                       result = VM_FAULT_SIGBUS;
-                       goto out;
-               }
-               if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
-                       goto fallback;
-
-               for (i = 0; i < PTRS_PER_PMD; i++)
-                       clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
-               wmb_pmem();
-               count_vm_event(PGMAJFAULT);
-               mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-               result |= VM_FAULT_MAJOR;
-       }
-
        /*
         * If we allocated new storage, make sure no process has any
         * zero pages covering this hole
         */
        if (buffer_new(&bh)) {
-               i_mmap_unlock_write(mapping);
+               i_mmap_unlock_read(mapping);
                unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
-               i_mmap_lock_write(mapping);
+               i_mmap_lock_read(mapping);
        }
 
        /*
@@ -635,6 +612,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                result = VM_FAULT_NOPAGE;
                spin_unlock(ptl);
        } else {
+               sector = bh.b_blocknr << (blkbits - 9);
                length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
                                                bh.b_size);
                if (length < 0) {
@@ -644,15 +622,25 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
                        goto fallback;
 
+               if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+                       int i;
+                       for (i = 0; i < PTRS_PER_PMD; i++)
+                               clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
+                       wmb_pmem();
+                       count_vm_event(PGMAJFAULT);
+                       mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+                       result |= VM_FAULT_MAJOR;
+               }
+
                result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
        }
 
  out:
+       i_mmap_unlock_read(mapping);
+
        if (buffer_unwritten(&bh))
                complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
 
-       i_mmap_unlock_write(mapping);
-
        return result;
 
  fallback:
diff --git a/mm/memory.c b/mm/memory.c
index 9cb27470fee9..deb679c31f2a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2426,6 +2426,8 @@ void unmap_mapping_range(struct address_space *mapping,
        if (details.last_index < details.first_index)
                details.last_index = ULONG_MAX;
 
+
+       /* DAX uses i_mmap_lock to serialise file truncate vs page fault */
        i_mmap_lock_write(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
                unmap_mapping_range_tree(&mapping->i_mmap, &details);