Diffstat (limited to 'fs/dax.c')
-rw-r--r--  fs/dax.c  208
1 file changed, 148 insertions, 60 deletions
@@ -31,6 +31,7 @@
 #include <linux/vmstat.h>
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
+#include <linux/mmu_notifier.h>
 #include <linux/iomap.h>
 #include "internal.h"
 
@@ -240,6 +241,23 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
         }
 }
 
+static void dax_unlock_mapping_entry(struct address_space *mapping,
+                                     pgoff_t index)
+{
+        void *entry, **slot;
+
+        spin_lock_irq(&mapping->tree_lock);
+        entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+        if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
+                         !slot_locked(mapping, slot))) {
+                spin_unlock_irq(&mapping->tree_lock);
+                return;
+        }
+        unlock_slot(mapping, slot);
+        spin_unlock_irq(&mapping->tree_lock);
+        dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+}
+
 static void put_locked_mapping_entry(struct address_space *mapping,
                                      pgoff_t index, void *entry)
 {
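The unlock helper is moved above put_locked_mapping_entry() and made static; with this patch the entry lock is only taken and released inside fs/dax.c. A minimal sketch of the locking pattern, using only helpers that appear elsewhere in this patch (error handling and the page-backed case omitted; put_locked_mapping_entry() is presumably what ends up calling dax_unlock_mapping_entry() for exceptional entries, which is why the definition now has to precede it):

        spin_lock_irq(&mapping->tree_lock);
        entry = get_unlocked_mapping_entry(mapping, index, &slot);  /* waits if the entry is locked */
        entry = lock_slot(mapping, slot);                           /* take the entry lock */
        spin_unlock_irq(&mapping->tree_lock);

        /* ... fault or writeback work on 'entry' ... */

        put_locked_mapping_entry(mapping, index, entry);            /* unlock + wake waiters */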
@@ -433,22 +451,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
         __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 }
 
-void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
-        void *entry, **slot;
-
-        spin_lock_irq(&mapping->tree_lock);
-        entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
-        if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
-                         !slot_locked(mapping, slot))) {
-                spin_unlock_irq(&mapping->tree_lock);
-                return;
-        }
-        unlock_slot(mapping, slot);
-        spin_unlock_irq(&mapping->tree_lock);
-        dax_wake_mapping_entry_waiter(mapping, index, entry, false);
-}
-
 /*
  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
  * entry to get unlocked before deleting it.
@@ -500,10 +502,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
         /* This will replace locked radix tree entry with a hole page */
         page = find_or_create_page(mapping, vmf->pgoff,
                                    vmf->gfp_mask | __GFP_ZERO);
-        if (!page) {
-                put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+        if (!page)
                 return VM_FAULT_OOM;
-        }
         vmf->page = page;
         return VM_FAULT_LOCKED;
 }
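dax_load_hole() no longer drops the locked entry on allocation failure; that cleanup is now the caller's job. Sketched from the dax_iomap_fault() hunks later in this patch (no new code, just the two ends of the change side by side): VM_FAULT_OOM is not VM_FAULT_LOCKED, so the entry is released at the unlock_entry label.

        vmf_ret = dax_load_hole(mapping, entry, vmf);   /* may return VM_FAULT_OOM */
        ...
unlock_entry:
        if (vmf_ret != VM_FAULT_LOCKED || error)
                put_locked_mapping_entry(mapping, vmf->pgoff, entry);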
@@ -615,36 +615,107 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
         return new_entry;
 }
 
+static inline unsigned long
+pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
+{
+        unsigned long address;
+
+        address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+        VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+        return address;
+}
+
+/* Walk all mappings of a given index of a file and writeprotect them */
+static void dax_mapping_entry_mkclean(struct address_space *mapping,
+                                      pgoff_t index, unsigned long pfn)
+{
+        struct vm_area_struct *vma;
+        pte_t *ptep;
+        pte_t pte;
+        spinlock_t *ptl;
+        bool changed;
+
+        i_mmap_lock_read(mapping);
+        vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
+                unsigned long address;
+
+                cond_resched();
+
+                if (!(vma->vm_flags & VM_SHARED))
+                        continue;
+
+                address = pgoff_address(index, vma);
+                changed = false;
+                if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+                        continue;
+                if (pfn != pte_pfn(*ptep))
+                        goto unlock;
+                if (!pte_dirty(*ptep) && !pte_write(*ptep))
+                        goto unlock;
+
+                flush_cache_page(vma, address, pfn);
+                pte = ptep_clear_flush(vma, address, ptep);
+                pte = pte_wrprotect(pte);
+                pte = pte_mkclean(pte);
+                set_pte_at(vma->vm_mm, address, ptep, pte);
+                changed = true;
+unlock:
+                pte_unmap_unlock(ptep, ptl);
+
+                if (changed)
+                        mmu_notifier_invalidate_page(vma->vm_mm, address);
+        }
+        i_mmap_unlock_read(mapping);
+}
+
 static int dax_writeback_one(struct block_device *bdev,
                 struct address_space *mapping, pgoff_t index, void *entry)
 {
         struct radix_tree_root *page_tree = &mapping->page_tree;
-        struct radix_tree_node *node;
         struct blk_dax_ctl dax;
-        void **slot;
+        void *entry2, **slot;
         int ret = 0;
 
-        spin_lock_irq(&mapping->tree_lock);
         /*
-         * Regular page slots are stabilized by the page lock even
-         * without the tree itself locked. These unlocked entries
-         * need verification under the tree lock.
+         * A page got tagged dirty in DAX mapping? Something is seriously
+         * wrong.
          */
-        if (!__radix_tree_lookup(page_tree, index, &node, &slot))
-                goto unlock;
-        if (*slot != entry)
-                goto unlock;
-
-        /* another fsync thread may have already written back this entry */
-        if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
-                goto unlock;
+        if (WARN_ON(!radix_tree_exceptional_entry(entry)))
+                return -EIO;
 
+        spin_lock_irq(&mapping->tree_lock);
+        entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
+        /* Entry got punched out / reallocated? */
+        if (!entry2 || !radix_tree_exceptional_entry(entry2))
+                goto put_unlocked;
+        /*
+         * Entry got reallocated elsewhere? No need to writeback. We have to
+         * compare sectors as we must not bail out due to difference in lockbit
+         * or entry type.
+         */
+        if (dax_radix_sector(entry2) != dax_radix_sector(entry))
+                goto put_unlocked;
         if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
                                 dax_is_zero_entry(entry))) {
                 ret = -EIO;
-                goto unlock;
+                goto put_unlocked;
         }
 
+        /* Another fsync thread may have already written back this entry */
+        if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+                goto put_unlocked;
+        /* Lock the entry to serialize with page faults */
+        entry = lock_slot(mapping, slot);
+        /*
+         * We can clear the tag now but we have to be careful so that concurrent
+         * dax_writeback_one() calls for the same index cannot finish before we
+         * actually flush the caches. This is achieved as the calls will look
+         * at the entry only under tree_lock and once they do that they will
+         * see the entry locked and wait for it to unlock.
+         */
+        radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+        spin_unlock_irq(&mapping->tree_lock);
+
         /*
          * Even if dax_writeback_mapping_range() was given a wbc->range_start
          * in the middle of a PMD, the 'index' we are given will be aligned to
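pgoff_address() converts a file page offset back into the user virtual address it occupies in a given VMA. A quick worked example with made-up numbers (4K pages, PAGE_SHIFT == 12):

        /* Hypothetical VMA mapping the file from offset 0 at 0x7f0000000000: */
        vma->vm_start = 0x7f0000000000UL;
        vma->vm_pgoff = 0;

        /* index 5: vm_start + ((5 - 0) << 12) = 0x7f0000000000 + 0x5000 = 0x7f0000005000 */
        address = pgoff_address(5, vma);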
@@ -654,31 +725,40 @@ static int dax_writeback_one(struct block_device *bdev,
          */
         dax.sector = dax_radix_sector(entry);
         dax.size = PAGE_SIZE << dax_radix_order(entry);
-        spin_unlock_irq(&mapping->tree_lock);
 
         /*
          * We cannot hold tree_lock while calling dax_map_atomic() because it
          * eventually calls cond_resched().
          */
         ret = dax_map_atomic(bdev, &dax);
-        if (ret < 0)
+        if (ret < 0) {
+                put_locked_mapping_entry(mapping, index, entry);
                 return ret;
+        }
 
         if (WARN_ON_ONCE(ret < dax.size)) {
                 ret = -EIO;
                 goto unmap;
         }
 
+        dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
         wb_cache_pmem(dax.addr, dax.size);
-
+        /*
+         * After we have flushed the cache, we can clear the dirty tag. There
+         * cannot be new dirty data in the pfn after the flush has completed as
+         * the pfn mappings are writeprotected and fault waits for mapping
+         * entry lock.
+         */
         spin_lock_irq(&mapping->tree_lock);
-        radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+        radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
         spin_unlock_irq(&mapping->tree_lock);
 unmap:
         dax_unmap_atomic(bdev, &dax);
+        put_locked_mapping_entry(mapping, index, entry);
         return ret;
 
-unlock:
+put_unlocked:
+        put_unlocked_mapping_entry(mapping, index, entry2);
         spin_unlock_irq(&mapping->tree_lock);
         return ret;
 }
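Taken together, the dax_writeback_one() changes alter what fsync()/msync() does for a DAX mapping: besides flushing CPU caches, it now write-protects every PTE that maps the flushed page (dax_mapping_entry_mkclean()) and clears the radix tree DIRTY tag, so the next store refaults and re-dirties the entry. A hypothetical userspace sequence that drives this path (the file name is invented; any file on a DAX-mounted filesystem would do, and error handling is omitted):

        #include <fcntl.h>
        #include <sys/mman.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/mnt/pmem/file", O_RDWR);
                char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

                p[0] = 'x';              /* write fault: writable PTE, entry tagged DIRTY */
                msync(p, 4096, MS_SYNC); /* dax_writeback_one(): flush, write-protect, clear DIRTY */

                munmap(p, 4096);
                close(fd);
                return 0;
        }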
@@ -738,7 +818,7 @@ static int dax_insert_mapping(struct address_space *mapping,
                 struct block_device *bdev, sector_t sector, size_t size,
                 void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-        unsigned long vaddr = (unsigned long)vmf->virtual_address;
+        unsigned long vaddr = vmf->address;
         struct blk_dax_ctl dax = {
                 .sector = sector,
                 .size = size,
@@ -767,17 +847,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
         struct file *file = vma->vm_file;
         struct address_space *mapping = file->f_mapping;
-        void *entry;
+        void *entry, **slot;
         pgoff_t index = vmf->pgoff;
 
         spin_lock_irq(&mapping->tree_lock);
-        entry = get_unlocked_mapping_entry(mapping, index, NULL);
-        if (!entry || !radix_tree_exceptional_entry(entry))
-                goto out;
+        entry = get_unlocked_mapping_entry(mapping, index, &slot);
+        if (!entry || !radix_tree_exceptional_entry(entry)) {
+                if (entry)
+                        put_unlocked_mapping_entry(mapping, index, entry);
+                spin_unlock_irq(&mapping->tree_lock);
+                return VM_FAULT_NOPAGE;
+        }
         radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
-        put_unlocked_mapping_entry(mapping, index, entry);
-out:
+        entry = lock_slot(mapping, slot);
         spin_unlock_irq(&mapping->tree_lock);
+        /*
+         * If we race with somebody updating the PTE and finish_mkwrite_fault()
+         * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
+         * the fault in either case.
+         */
+        finish_mkwrite_fault(vmf);
+        put_locked_mapping_entry(mapping, index, entry);
         return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
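A second store to the same page after the msync() above arrives as a write-protect fault and lands here: dax_pfn_mkwrite() re-sets the DIRTY tag and now holds the entry lock across finish_mkwrite_fault(), so it cannot race with dax_writeback_one() working on the same index. Continuing the hypothetical sequence from the previous example:

        p[0] = 'y';              /* wp fault -> dax_pfn_mkwrite(): tag DIRTY, lock the entry,
                                  * make the PTE writable again via finish_mkwrite_fault() */
        msync(p, 4096, MS_SYNC); /* flushes and write-protects again, as above */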
@@ -948,13 +1038,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 {
         struct address_space *mapping = vma->vm_file->f_mapping;
         struct inode *inode = mapping->host;
-        unsigned long vaddr = (unsigned long)vmf->virtual_address;
+        unsigned long vaddr = vmf->address;
         loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
         sector_t sector;
         struct iomap iomap = { 0 };
         unsigned flags = IOMAP_FAULT;
         int error, major = 0;
-        int locked_status = 0;
+        int vmf_ret = 0;
         void *entry;
 
         /*
@@ -1007,13 +1097,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
                 if (error)
                         goto finish_iomap;
-                if (!radix_tree_exceptional_entry(entry)) {
-                        vmf->page = entry;
-                        locked_status = VM_FAULT_LOCKED;
-                } else {
-                        vmf->entry = entry;
-                        locked_status = VM_FAULT_DAX_LOCKED;
-                }
+
+                __SetPageUptodate(vmf->cow_page);
+                vmf_ret = finish_fault(vmf);
+                if (!vmf_ret)
+                        vmf_ret = VM_FAULT_DONE_COW;
                 goto finish_iomap;
         }
 
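The cow_page branch is the MAP_PRIVATE case: the faulted data is copied into an ordinary anonymous page instead of being mapped from pmem, and the result is now handed to the generic finish_fault() / VM_FAULT_DONE_COW protocol rather than the old VM_FAULT_DAX_LOCKED one. A hypothetical trigger, reusing fd from the example above:

        char *q = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);

        q[0] = 'z';     /* write fault with vmf->cow_page set: the data is copied into an
                         * anonymous page, finish_fault() maps the copy, and the store never
                         * reaches the DAX-backed file */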
@@ -1030,7 +1118,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
         case IOMAP_UNWRITTEN:
         case IOMAP_HOLE:
                 if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-                        locked_status = dax_load_hole(mapping, entry, vmf);
+                        vmf_ret = dax_load_hole(mapping, entry, vmf);
                         break;
                 }
                 /*FALLTHRU*/
@@ -1042,7 +1130,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 finish_iomap:
         if (ops->iomap_end) {
-                if (error) {
+                if (error || (vmf_ret & VM_FAULT_ERROR)) {
                         /* keep previous error */
                         ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
                                         &iomap);
@@ -1052,7 +1140,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                 }
         }
 unlock_entry:
-        if (!locked_status || error)
+        if (vmf_ret != VM_FAULT_LOCKED || error)
                 put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 out:
         if (error == -ENOMEM)
@@ -1060,9 +1148,9 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
         /* -EBUSY is fine, somebody else faulted on the same PTE */
         if (error < 0 && error != -EBUSY)
                 return VM_FAULT_SIGBUS | major;
-        if (locked_status) {
+        if (vmf_ret) {
                 WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
-                return locked_status;
+                return vmf_ret;
         }
         return VM_FAULT_NOPAGE | major;
 }