Diffstat (limited to 'fs/dax.c')
-rw-r--r--	fs/dax.c	297
1 file changed, 156 insertions, 141 deletions
diff --git a/fs/dax.c b/fs/dax.c
index 6433650be833..43bbd6d1037d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -55,32 +55,6 @@ static int __init init_dax_wait_table(void)
 }
 fs_initcall(init_dax_wait_table);
 
-static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
-{
-	struct request_queue *q = bdev->bd_queue;
-	long rc = -EIO;
-
-	dax->addr = ERR_PTR(-EIO);
-	if (blk_queue_enter(q, true) != 0)
-		return rc;
-
-	rc = bdev_direct_access(bdev, dax);
-	if (rc < 0) {
-		dax->addr = ERR_PTR(rc);
-		blk_queue_exit(q);
-		return rc;
-	}
-	return rc;
-}
-
-static void dax_unmap_atomic(struct block_device *bdev,
-		const struct blk_dax_ctl *dax)
-{
-	if (IS_ERR(dax->addr))
-		return;
-	blk_queue_exit(bdev->bd_queue);
-}
-
 static int dax_is_pmd_entry(void *entry)
 {
 	return (unsigned long)entry & RADIX_DAX_PMD;
@@ -101,26 +75,6 @@ static int dax_is_empty_entry(void *entry)
 	return (unsigned long)entry & RADIX_DAX_EMPTY;
 }
 
-struct page *read_dax_sector(struct block_device *bdev, sector_t n)
-{
-	struct page *page = alloc_pages(GFP_KERNEL, 0);
-	struct blk_dax_ctl dax = {
-		.size = PAGE_SIZE,
-		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
-	};
-	long rc;
-
-	if (!page)
-		return ERR_PTR(-ENOMEM);
-
-	rc = dax_map_atomic(bdev, &dax);
-	if (rc < 0)
-		return ERR_PTR(rc);
-	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
-	dax_unmap_atomic(bdev, &dax);
-	return page;
-}
-
 /*
  * DAX radix tree locking
  */
@@ -582,21 +536,30 @@ static int dax_load_hole(struct address_space *mapping, void **entry,
 	return ret;
 }
 
-static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
-		struct page *to, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
+		sector_t sector, size_t size, struct page *to,
+		unsigned long vaddr)
 {
-	struct blk_dax_ctl dax = {
-		.sector = sector,
-		.size = size,
-	};
-	void *vto;
+	void *vto, *kaddr;
+	pgoff_t pgoff;
+	pfn_t pfn;
+	long rc;
+	int id;
 
-	if (dax_map_atomic(bdev, &dax) < 0)
-		return PTR_ERR(dax.addr);
+	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+	if (rc)
+		return rc;
+
+	id = dax_read_lock();
+	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+	if (rc < 0) {
+		dax_read_unlock(id);
+		return rc;
+	}
 	vto = kmap_atomic(to);
-	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
+	copy_user_page(vto, (void __force *)kaddr, vaddr, to);
 	kunmap_atomic(vto);
-	dax_unmap_atomic(bdev, &dax);
+	dax_read_unlock(id);
 	return 0;
 }
 
@@ -764,12 +727,16 @@ unlock_pte:
 }
 
 static int dax_writeback_one(struct block_device *bdev,
-		struct address_space *mapping, pgoff_t index, void *entry)
+		struct dax_device *dax_dev, struct address_space *mapping,
+		pgoff_t index, void *entry)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	struct blk_dax_ctl dax;
-	void *entry2, **slot;
-	int ret = 0;
+	void *entry2, **slot, *kaddr;
+	long ret = 0, id;
+	sector_t sector;
+	pgoff_t pgoff;
+	size_t size;
+	pfn_t pfn;
 
 	/*
 	 * A page got tagged dirty in DAX mapping? Something is seriously
@@ -818,26 +785,29 @@ static int dax_writeback_one(struct block_device *bdev,
 	 * 'entry'. This allows us to flush for PMD_SIZE and not have to
 	 * worry about partial PMD writebacks.
 	 */
-	dax.sector = dax_radix_sector(entry);
-	dax.size = PAGE_SIZE << dax_radix_order(entry);
+	sector = dax_radix_sector(entry);
+	size = PAGE_SIZE << dax_radix_order(entry);
+
+	id = dax_read_lock();
+	ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+	if (ret)
+		goto dax_unlock;
 
 	/*
-	 * We cannot hold tree_lock while calling dax_map_atomic() because it
-	 * eventually calls cond_resched().
+	 * dax_direct_access() may sleep, so cannot hold tree_lock over
+	 * its invocation.
 	 */
-	ret = dax_map_atomic(bdev, &dax);
-	if (ret < 0) {
-		put_locked_mapping_entry(mapping, index, entry);
-		return ret;
-	}
+	ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn);
+	if (ret < 0)
+		goto dax_unlock;
 
-	if (WARN_ON_ONCE(ret < dax.size)) {
+	if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) {
 		ret = -EIO;
-		goto unmap;
+		goto dax_unlock;
 	}
 
-	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
-	wb_cache_pmem(dax.addr, dax.size);
+	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
+	wb_cache_pmem(kaddr, size);
 	/*
 	 * After we have flushed the cache, we can clear the dirty tag. There
 	 * cannot be new dirty data in the pfn after the flush has completed as
@@ -847,8 +817,8 @@ static int dax_writeback_one(struct block_device *bdev,
 	spin_lock_irq(&mapping->tree_lock);
 	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
 	spin_unlock_irq(&mapping->tree_lock);
- unmap:
-	dax_unmap_atomic(bdev, &dax);
+ dax_unlock:
+	dax_read_unlock(id);
 	put_locked_mapping_entry(mapping, index, entry);
 	return ret;
 
@@ -869,6 +839,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	pgoff_t start_index, end_index;
 	pgoff_t indices[PAGEVEC_SIZE];
+	struct dax_device *dax_dev;
 	struct pagevec pvec;
 	bool done = false;
 	int i, ret = 0;
@@ -879,6 +850,10 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
 		return 0;
 
+	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+	if (!dax_dev)
+		return -EIO;
+
 	start_index = wbc->range_start >> PAGE_SHIFT;
 	end_index = wbc->range_end >> PAGE_SHIFT;
 
@@ -899,38 +874,49 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 			break;
 		}
 
-			ret = dax_writeback_one(bdev, mapping, indices[i],
-					pvec.pages[i]);
-			if (ret < 0)
+			ret = dax_writeback_one(bdev, dax_dev, mapping,
+					indices[i], pvec.pages[i]);
+			if (ret < 0) {
+				put_dax(dax_dev);
 				return ret;
+			}
 		}
 	}
+	put_dax(dax_dev);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
-		struct block_device *bdev, sector_t sector, size_t size,
-		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
+		struct block_device *bdev, struct dax_device *dax_dev,
+		sector_t sector, size_t size, void **entryp,
+		struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	unsigned long vaddr = vmf->address;
-	struct blk_dax_ctl dax = {
-		.sector = sector,
-		.size = size,
-	};
-	void *ret;
 	void *entry = *entryp;
+	void *ret, *kaddr;
+	pgoff_t pgoff;
+	int id, rc;
+	pfn_t pfn;
 
-	if (dax_map_atomic(bdev, &dax) < 0)
-		return PTR_ERR(dax.addr);
-	dax_unmap_atomic(bdev, &dax);
+	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+	if (rc)
+		return rc;
 
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
+	id = dax_read_lock();
+	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+	if (rc < 0) {
+		dax_read_unlock(id);
+		return rc;
+	}
+	dax_read_unlock(id);
+
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
 	if (IS_ERR(ret))
 		return PTR_ERR(ret);
 	*entryp = ret;
 
-	return vm_insert_mixed(vma, vaddr, dax.pfn);
+	return vm_insert_mixed(vma, vaddr, pfn);
 }
 
 /**
@@ -979,24 +965,34 @@ static bool dax_range_is_aligned(struct block_device *bdev,
 	return true;
 }
 
-int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
-		unsigned int offset, unsigned int length)
+int __dax_zero_page_range(struct block_device *bdev,
+		struct dax_device *dax_dev, sector_t sector,
+		unsigned int offset, unsigned int size)
 {
-	struct blk_dax_ctl dax = {
-		.sector = sector,
-		.size = PAGE_SIZE,
-	};
-
-	if (dax_range_is_aligned(bdev, offset, length)) {
-		sector_t start_sector = dax.sector + (offset >> 9);
+	if (dax_range_is_aligned(bdev, offset, size)) {
+		sector_t start_sector = sector + (offset >> 9);
 
 		return blkdev_issue_zeroout(bdev, start_sector,
-				length >> 9, GFP_NOFS, 0);
+				size >> 9, GFP_NOFS, 0);
 	} else {
-		if (dax_map_atomic(bdev, &dax) < 0)
-			return PTR_ERR(dax.addr);
-		clear_pmem(dax.addr + offset, length);
-		dax_unmap_atomic(bdev, &dax);
+		pgoff_t pgoff;
+		long rc, id;
+		void *kaddr;
+		pfn_t pfn;
+
+		rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+		if (rc)
+			return rc;
+
+		id = dax_read_lock();
+		rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr,
+				&pfn);
+		if (rc < 0) {
+			dax_read_unlock(id);
+			return rc;
+		}
+		clear_pmem(kaddr + offset, size);
+		dax_read_unlock(id);
 	}
 	return 0;
 }
@@ -1011,9 +1007,12 @@ static loff_t
 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct iomap *iomap)
 {
+	struct block_device *bdev = iomap->bdev;
+	struct dax_device *dax_dev = iomap->dax_dev;
 	struct iov_iter *iter = data;
 	loff_t end = pos + length, done = 0;
 	ssize_t ret = 0;
+	int id;
 
 	if (iov_iter_rw(iter) == READ) {
 		end = min(end, i_size_read(inode));
@@ -1038,34 +1037,42 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 				(end - 1) >> PAGE_SHIFT);
 	}
 
+	id = dax_read_lock();
 	while (pos < end) {
 		unsigned offset = pos & (PAGE_SIZE - 1);
-		struct blk_dax_ctl dax = { 0 };
+		const size_t size = ALIGN(length + offset, PAGE_SIZE);
+		const sector_t sector = dax_iomap_sector(iomap, pos);
 		ssize_t map_len;
+		pgoff_t pgoff;
+		void *kaddr;
+		pfn_t pfn;
 
 		if (fatal_signal_pending(current)) {
 			ret = -EINTR;
 			break;
 		}
 
-		dax.sector = dax_iomap_sector(iomap, pos);
-		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
-		map_len = dax_map_atomic(iomap->bdev, &dax);
+		ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+		if (ret)
+			break;
+
+		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
+				&kaddr, &pfn);
 		if (map_len < 0) {
 			ret = map_len;
 			break;
 		}
 
-		dax.addr += offset;
+		map_len = PFN_PHYS(map_len);
+		kaddr += offset;
 		map_len -= offset;
 		if (map_len > end - pos)
 			map_len = end - pos;
 
 		if (iov_iter_rw(iter) == WRITE)
-			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+			map_len = copy_from_iter_pmem(kaddr, map_len, iter);
 		else
-			map_len = copy_to_iter(dax.addr, map_len, iter);
-		dax_unmap_atomic(iomap->bdev, &dax);
+			map_len = copy_to_iter(kaddr, map_len, iter);
 		if (map_len <= 0) {
 			ret = map_len ? map_len : -EFAULT;
 			break;
@@ -1075,6 +1082,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		length -= map_len;
 		done += map_len;
 	}
+	dax_read_unlock(id);
 
 	return done ? done : ret;
 }
@@ -1181,8 +1189,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		clear_user_highpage(vmf->cow_page, vaddr);
 		break;
 	case IOMAP_MAPPED:
-		error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
-				vmf->cow_page, vaddr);
+		error = copy_user_dax(iomap.bdev, iomap.dax_dev,
+				sector, PAGE_SIZE, vmf->cow_page, vaddr);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -1207,8 +1215,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
 		major = VM_FAULT_MAJOR;
 	}
-	error = dax_insert_mapping(mapping, iomap.bdev, sector,
-			PAGE_SIZE, &entry, vmf->vma, vmf);
+	error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
+			sector, PAGE_SIZE, &entry, vmf->vma, vmf);
 	/* -EBUSY is fine, somebody else faulted on the same PTE */
 	if (error == -EBUSY)
 		error = 0;
@@ -1258,41 +1266,48 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
 		loff_t pos, void **entryp)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	const sector_t sector = dax_iomap_sector(iomap, pos);
+	struct dax_device *dax_dev = iomap->dax_dev;
 	struct block_device *bdev = iomap->bdev;
 	struct inode *inode = mapping->host;
-	struct blk_dax_ctl dax = {
-		.sector = dax_iomap_sector(iomap, pos),
-		.size = PMD_SIZE,
-	};
-	long length = dax_map_atomic(bdev, &dax);
-	void *ret = NULL;
+	const size_t size = PMD_SIZE;
+	void *ret = NULL, *kaddr;
+	long length = 0;
+	pgoff_t pgoff;
+	pfn_t pfn;
+	int id;
 
-	if (length < 0) /* dax_map_atomic() failed */
+	if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
 		goto fallback;
-	if (length < PMD_SIZE)
-		goto unmap_fallback;
-	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
-		goto unmap_fallback;
-	if (!pfn_t_devmap(dax.pfn))
-		goto unmap_fallback;
-
-	dax_unmap_atomic(bdev, &dax);
 
-	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
+	id = dax_read_lock();
+	length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+	if (length < 0)
+		goto unlock_fallback;
+	length = PFN_PHYS(length);
+
+	if (length < size)
+		goto unlock_fallback;
+	if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
+		goto unlock_fallback;
+	if (!pfn_t_devmap(pfn))
+		goto unlock_fallback;
+	dax_read_unlock(id);
+
+	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
 			RADIX_DAX_PMD);
 	if (IS_ERR(ret))
 		goto fallback;
 	*entryp = ret;
 
-	trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
+	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
 	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
-			dax.pfn, vmf->flags & FAULT_FLAG_WRITE);
+			pfn, vmf->flags & FAULT_FLAG_WRITE);
 
- unmap_fallback:
-	dax_unmap_atomic(bdev, &dax);
+unlock_fallback:
+	dax_read_unlock(id);
 fallback:
-	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
-			dax.pfn, ret);
+	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
 	return VM_FAULT_FALLBACK;
 }
 
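Note: every converted call site above follows the same sequence: translate the sector to a pgoff with bdev_dax_pgoff(), take dax_read_lock(), call dax_direct_access() to obtain a kernel address and pfn (it returns a page count, or a negative errno), use them, then drop the lock with dax_read_unlock(). The sketch below is an illustration of that sequence only; the helper name is hypothetical and the body is assembled from the calls shown in the hunks, it is not part of the patch.

/*
 * Hypothetical example (not in this patch): the dax_direct_access()
 * calling sequence that fs/dax.c is converted to above.
 */
static int dax_access_example(struct block_device *bdev,
		struct dax_device *dax_dev, sector_t sector, size_t size)
{
	pgoff_t pgoff;
	void *kaddr;
	pfn_t pfn;
	long rc;
	int id;

	/* Map the block-device sector range to a pgoff on the dax device. */
	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
	if (rc)
		return rc;

	id = dax_read_lock();
	/* Returns the number of pages available at kaddr/pfn, or -errno. */
	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
	if (rc < 0) {
		dax_read_unlock(id);
		return rc;
	}

	/* ... use kaddr / pfn while the read lock is held ... */

	dax_read_unlock(id);
	return 0;
}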