Diffstat (limited to 'fs/dax.c')
 fs/dax.c | 208 ++++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 148 insertions(+), 60 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 5ae8e11ad786..a8732fbed381 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,6 +31,7 @@
 #include <linux/vmstat.h>
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
+#include <linux/mmu_notifier.h>
 #include <linux/iomap.h>
 #include "internal.h"
 
@@ -240,6 +241,23 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
 	}
 }
 
+static void dax_unlock_mapping_entry(struct address_space *mapping,
+				     pgoff_t index)
+{
+	void *entry, **slot;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
+			 !slot_locked(mapping, slot))) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return;
+	}
+	unlock_slot(mapping, slot);
+	spin_unlock_irq(&mapping->tree_lock);
+	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+}
+
 static void put_locked_mapping_entry(struct address_space *mapping,
 				     pgoff_t index, void *entry)
 {
@@ -433,22 +451,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 	__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 }
 
-void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
-	void *entry, **slot;
-
-	spin_lock_irq(&mapping->tree_lock);
-	entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
-	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
-			 !slot_locked(mapping, slot))) {
-		spin_unlock_irq(&mapping->tree_lock);
-		return;
-	}
-	unlock_slot(mapping, slot);
-	spin_unlock_irq(&mapping->tree_lock);
-	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
-}
-
 /*
  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
  * entry to get unlocked before deleting it.
@@ -500,10 +502,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	/* This will replace locked radix tree entry with a hole page */
 	page = find_or_create_page(mapping, vmf->pgoff,
 				   vmf->gfp_mask | __GFP_ZERO);
-	if (!page) {
-		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+	if (!page)
 		return VM_FAULT_OOM;
-	}
 	vmf->page = page;
 	return VM_FAULT_LOCKED;
 }
@@ -615,36 +615,107 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 	return new_entry;
 }
 
+static inline unsigned long
+pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
+{
+	unsigned long address;
+
+	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+	return address;
+}
+
+/* Walk all mappings of a given index of a file and writeprotect them */
+static void dax_mapping_entry_mkclean(struct address_space *mapping,
+				      pgoff_t index, unsigned long pfn)
+{
+	struct vm_area_struct *vma;
+	pte_t *ptep;
+	pte_t pte;
+	spinlock_t *ptl;
+	bool changed;
+
+	i_mmap_lock_read(mapping);
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
+		unsigned long address;
+
+		cond_resched();
+
+		if (!(vma->vm_flags & VM_SHARED))
+			continue;
+
+		address = pgoff_address(index, vma);
+		changed = false;
+		if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+			continue;
+		if (pfn != pte_pfn(*ptep))
+			goto unlock;
+		if (!pte_dirty(*ptep) && !pte_write(*ptep))
+			goto unlock;
+
+		flush_cache_page(vma, address, pfn);
+		pte = ptep_clear_flush(vma, address, ptep);
+		pte = pte_wrprotect(pte);
+		pte = pte_mkclean(pte);
+		set_pte_at(vma->vm_mm, address, ptep, pte);
+		changed = true;
+unlock:
+		pte_unmap_unlock(ptep, ptl);
+
+		if (changed)
+			mmu_notifier_invalidate_page(vma->vm_mm, address);
+	}
+	i_mmap_unlock_read(mapping);
+}
+
 static int dax_writeback_one(struct block_device *bdev,
 		struct address_space *mapping, pgoff_t index, void *entry)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	struct radix_tree_node *node;
 	struct blk_dax_ctl dax;
-	void **slot;
+	void *entry2, **slot;
 	int ret = 0;
 
-	spin_lock_irq(&mapping->tree_lock);
 	/*
-	 * Regular page slots are stabilized by the page lock even
-	 * without the tree itself locked. These unlocked entries
-	 * need verification under the tree lock.
+	 * A page got tagged dirty in DAX mapping? Something is seriously
+	 * wrong.
 	 */
-	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
-		goto unlock;
-	if (*slot != entry)
-		goto unlock;
-
-	/* another fsync thread may have already written back this entry */
-	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
-		goto unlock;
+	if (WARN_ON(!radix_tree_exceptional_entry(entry)))
+		return -EIO;
 
+	spin_lock_irq(&mapping->tree_lock);
+	entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
+	/* Entry got punched out / reallocated? */
+	if (!entry2 || !radix_tree_exceptional_entry(entry2))
+		goto put_unlocked;
+	/*
+	 * Entry got reallocated elsewhere? No need to writeback. We have to
+	 * compare sectors as we must not bail out due to difference in lockbit
+	 * or entry type.
+	 */
+	if (dax_radix_sector(entry2) != dax_radix_sector(entry))
+		goto put_unlocked;
 	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
 				dax_is_zero_entry(entry))) {
 		ret = -EIO;
-		goto unlock;
+		goto put_unlocked;
 	}
 
+	/* Another fsync thread may have already written back this entry */
+	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+		goto put_unlocked;
+	/* Lock the entry to serialize with page faults */
+	entry = lock_slot(mapping, slot);
+	/*
+	 * We can clear the tag now but we have to be careful so that concurrent
+	 * dax_writeback_one() calls for the same index cannot finish before we
+	 * actually flush the caches. This is achieved as the calls will look
+	 * at the entry only under tree_lock and once they do that they will
+	 * see the entry locked and wait for it to unlock.
+	 */
+	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+	spin_unlock_irq(&mapping->tree_lock);
+
 	/*
 	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
 	 * in the middle of a PMD, the 'index' we are given will be aligned to
@@ -654,31 +725,40 @@ static int dax_writeback_one(struct block_device *bdev,
 	 */
 	dax.sector = dax_radix_sector(entry);
 	dax.size = PAGE_SIZE << dax_radix_order(entry);
-	spin_unlock_irq(&mapping->tree_lock);
 
 	/*
 	 * We cannot hold tree_lock while calling dax_map_atomic() because it
 	 * eventually calls cond_resched().
 	 */
 	ret = dax_map_atomic(bdev, &dax);
-	if (ret < 0)
+	if (ret < 0) {
+		put_locked_mapping_entry(mapping, index, entry);
 		return ret;
+	}
 
 	if (WARN_ON_ONCE(ret < dax.size)) {
 		ret = -EIO;
 		goto unmap;
 	}
 
+	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
 	wb_cache_pmem(dax.addr, dax.size);
-
+	/*
+	 * After we have flushed the cache, we can clear the dirty tag. There
+	 * cannot be new dirty data in the pfn after the flush has completed as
+	 * the pfn mappings are writeprotected and fault waits for mapping
+	 * entry lock.
+	 */
 	spin_lock_irq(&mapping->tree_lock);
-	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
 	spin_unlock_irq(&mapping->tree_lock);
  unmap:
 	dax_unmap_atomic(bdev, &dax);
+	put_locked_mapping_entry(mapping, index, entry);
 	return ret;
 
- unlock:
+ put_unlocked:
+	put_unlocked_mapping_entry(mapping, index, entry2);
 	spin_unlock_irq(&mapping->tree_lock);
 	return ret;
 }
@@ -738,7 +818,7 @@ static int dax_insert_mapping(struct address_space *mapping,
 		struct block_device *bdev, sector_t sector, size_t size,
 		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	unsigned long vaddr = vmf->address;
 	struct blk_dax_ctl dax = {
 		.sector = sector,
 		.size = size,
@@ -767,17 +847,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
-	void *entry;
+	void *entry, **slot;
 	pgoff_t index = vmf->pgoff;
 
 	spin_lock_irq(&mapping->tree_lock);
-	entry = get_unlocked_mapping_entry(mapping, index, NULL);
-	if (!entry || !radix_tree_exceptional_entry(entry))
-		goto out;
+	entry = get_unlocked_mapping_entry(mapping, index, &slot);
+	if (!entry || !radix_tree_exceptional_entry(entry)) {
+		if (entry)
+			put_unlocked_mapping_entry(mapping, index, entry);
+		spin_unlock_irq(&mapping->tree_lock);
+		return VM_FAULT_NOPAGE;
+	}
 	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
-	put_unlocked_mapping_entry(mapping, index, entry);
-out:
+	entry = lock_slot(mapping, slot);
 	spin_unlock_irq(&mapping->tree_lock);
+	/*
+	 * If we race with somebody updating the PTE and finish_mkwrite_fault()
+	 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
+	 * the fault in either case.
+	 */
+	finish_mkwrite_fault(vmf);
+	put_locked_mapping_entry(mapping, index, entry);
 	return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -948,13 +1038,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
-	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	unsigned long vaddr = vmf->address;
 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
 	sector_t sector;
 	struct iomap iomap = { 0 };
 	unsigned flags = IOMAP_FAULT;
 	int error, major = 0;
-	int locked_status = 0;
+	int vmf_ret = 0;
 	void *entry;
 
 	/*
@@ -1007,13 +1097,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 		if (error)
 			goto finish_iomap;
-		if (!radix_tree_exceptional_entry(entry)) {
-			vmf->page = entry;
-			locked_status = VM_FAULT_LOCKED;
-		} else {
-			vmf->entry = entry;
-			locked_status = VM_FAULT_DAX_LOCKED;
-		}
+
+		__SetPageUptodate(vmf->cow_page);
+		vmf_ret = finish_fault(vmf);
+		if (!vmf_ret)
+			vmf_ret = VM_FAULT_DONE_COW;
 		goto finish_iomap;
 	}
 
@@ -1030,7 +1118,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-			locked_status = dax_load_hole(mapping, entry, vmf);
+			vmf_ret = dax_load_hole(mapping, entry, vmf);
 			break;
 		}
 		/*FALLTHRU*/
@@ -1042,7 +1130,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
  finish_iomap:
 	if (ops->iomap_end) {
-		if (error) {
+		if (error || (vmf_ret & VM_FAULT_ERROR)) {
 			/* keep previous error */
 			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
 					&iomap);
@@ -1052,7 +1140,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 	}
  unlock_entry:
-	if (!locked_status || error)
+	if (vmf_ret != VM_FAULT_LOCKED || error)
 		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  out:
 	if (error == -ENOMEM)
@@ -1060,9 +1148,9 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	/* -EBUSY is fine, somebody else faulted on the same PTE */
 	if (error < 0 && error != -EBUSY)
 		return VM_FAULT_SIGBUS | major;
-	if (locked_status) {
+	if (vmf_ret) {
 		WARN_ON_ONCE(error);	/* -EBUSY from ops->iomap_end? */
-		return locked_status;
+		return vmf_ret;
 	}
 	return VM_FAULT_NOPAGE | major;
 }