author	Linus Torvalds <torvalds@linux-foundation.org>	2017-11-17 12:51:57 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-11-17 12:51:57 -0500
commit	a3841f94c7ecb3ede0f888d3fcfe8fb6368ddd7a
tree	6625eedf10d0672068ee218bb893a5a0e1803df2	/fs/dax.c
parent	adeba81ac2a6451f44545874da3d181081f0ab04
parent	4247f24c23589bcc3bc3490515ef8c9497e9ae55
Merge tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull libnvdimm and dax updates from Dan Williams:
 "Save for a few late fixes, all of these commits have shipped in -next
  releases since before the merge window opened, and 0day has given a
  build success notification.

  The ext4 touches came from Jan, and the xfs touches have Darrick's
  reviewed-by. An xfstest for the MAP_SYNC feature has been through a
  few rounds of reviews and is on track to be merged.

   - Introduce MAP_SYNC and MAP_SHARED_VALIDATE, a mechanism to enable
     'userspace flush' of persistent memory updates via filesystem-dax
     mappings. It arranges for any filesystem metadata updates that may
     be required to satisfy a write fault to also be flushed ("on disk")
     before the kernel returns to userspace from the fault handler.
     Effectively every write-fault that dirties metadata completes an
     fsync() before returning from the fault handler. The new
     MAP_SHARED_VALIDATE mapping type guarantees that the MAP_SYNC flag
     is validated as supported by the filesystem's ->mmap() file
     operation.

   - Add support for the standard ACPI 6.2 label access methods that
     replace the NVDIMM_FAMILY_INTEL (vendor specific) label methods.
     This enables interoperability with environments that only
     implement the standardized methods.

   - Add support for the ACPI 6.2 NVDIMM media error injection methods.

   - Add support for the NVDIMM_FAMILY_INTEL v1.6 DIMM commands for
     latch last shutdown status, firmware update, SMART error injection,
     and SMART alarm threshold control.

   - Cleanup physical address information disclosures to be root-only.

   - Fix revalidation of the DIMM "locked label area" status to support
     dynamic unlock of the label area.

   - Expand unit test infrastructure to mock the ACPI 6.2 Translate SPA
     (system-physical-address) command and error injection commands.

  Acknowledgements that came after the commits were pushed to -next:

   - 957ac8c421ad ("dax: fix PMD faults on zero-length files"):
     Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>

   - a39e596baa07 ("xfs: support for synchronous DAX faults") and
     7b565c9f965b ("xfs: Implement xfs_filemap_pfn_mkwrite() using
     __xfs_filemap_fault()"):
     Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>"

* tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (49 commits)
  acpi, nfit: add 'Enable Latch System Shutdown Status' command support
  dax: fix general protection fault in dax_alloc_inode
  dax: fix PMD faults on zero-length files
  dax: stop requiring a live device for dax_flush()
  brd: remove dax support
  dax: quiet bdev_dax_supported()
  fs, dax: unify IOMAP_F_DIRTY read vs write handling policy in the dax core
  tools/testing/nvdimm: unit test clear-error commands
  acpi, nfit: validate commands against the device type
  tools/testing/nvdimm: stricter bounds checking for error injection commands
  xfs: support for synchronous DAX faults
  xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()
  ext4: Support for synchronous DAX faults
  ext4: Simplify error handling in ext4_dax_huge_fault()
  dax: Implement dax_finish_sync_fault()
  dax, iomap: Add support for synchronous faults
  mm: Define MAP_SYNC and VM_SYNC flags
  dax: Allow tuning whether dax_insert_mapping_entry() dirties entry
  dax: Allow dax_iomap_fault() to return pfn
  dax: Fix comment describing dax_iomap_fault()
  ...
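As an illustration of the MAP_SYNC semantics summarized above (not part of this pull), a minimal userspace sketch might look like the following. The file path is a placeholder, and the fallback #defines mirror the asm-generic uapi values for systems whose installed headers predate this series:

/*
 * Illustrative only: map a file on a DAX filesystem with MAP_SYNC so that
 * write faults flush any required metadata before returning to userspace.
 * "/mnt/pmem/file" is a placeholder path.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MAP_SHARED_VALIDATE
#define MAP_SHARED_VALIDATE	0x03		/* value from asm-generic/mman-common.h */
#endif
#ifndef MAP_SYNC
#define MAP_SYNC		0x080000	/* value from asm-generic/mman.h */
#endif

int main(void)
{
	int fd = open("/mnt/pmem/file", O_CREAT | O_RDWR, 0644);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	/*
	 * MAP_SHARED_VALIDATE makes the kernel reject flags it does not
	 * recognize, so a filesystem that cannot honour MAP_SYNC fails
	 * here with EOPNOTSUPP instead of silently ignoring the flag.
	 */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap(MAP_SYNC)");
		close(fd);
		return 1;
	}

	/*
	 * The write fault triggered by this store completes any needed
	 * metadata fsync before returning, so persistence of the data only
	 * requires flushing CPU caches (e.g. with libpmem's pmem_persist());
	 * no fsync()/msync() call is needed afterwards.
	 */
	strcpy(p, "hello, persistent world");

	munmap(p, 4096);
	close(fd);
	return 0;
}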
Diffstat (limited to 'fs/dax.c')
-rw-r--r--	fs/dax.c	319
1 file changed, 219 insertions, 100 deletions
diff --git a/fs/dax.c b/fs/dax.c
index 3652b26a0048..95981591977a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
 static void *dax_insert_mapping_entry(struct address_space *mapping,
 				      struct vm_fault *vmf,
 				      void *entry, sector_t sector,
-				      unsigned long flags)
+				      unsigned long flags, bool dirty)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
 	void *new_entry;
 	pgoff_t index = vmf->pgoff;
 
-	if (vmf->flags & FAULT_FLAG_WRITE)
+	if (dirty)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
 	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
@@ -569,7 +569,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 		entry = new_entry;
 	}
 
-	if (vmf->flags & FAULT_FLAG_WRITE)
+	if (dirty)
 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 
 	spin_unlock_irq(&mapping->tree_lock);
@@ -825,38 +825,42 @@ out:
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
-static int dax_insert_mapping(struct address_space *mapping,
-		struct block_device *bdev, struct dax_device *dax_dev,
-		sector_t sector, size_t size, void *entry,
-		struct vm_area_struct *vma, struct vm_fault *vmf)
+static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
 {
-	unsigned long vaddr = vmf->address;
-	void *ret, *kaddr;
+	return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
+}
+
+static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
+			 pfn_t *pfnp)
+{
+	const sector_t sector = dax_iomap_sector(iomap, pos);
 	pgoff_t pgoff;
+	void *kaddr;
 	int id, rc;
-	pfn_t pfn;
+	long length;
 
-	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+	rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
 	if (rc)
 		return rc;
-
 	id = dax_read_lock();
-	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
-	if (rc < 0) {
-		dax_read_unlock(id);
-		return rc;
+	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
+				   &kaddr, pfnp);
+	if (length < 0) {
+		rc = length;
+		goto out;
 	}
+	rc = -EINVAL;
+	if (PFN_PHYS(length) < size)
+		goto out;
+	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
+		goto out;
+	/* For larger pages we need devmap */
+	if (length > 1 && !pfn_t_devmap(*pfnp))
+		goto out;
+	rc = 0;
+out:
 	dax_read_unlock(id);
-
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
-	if (IS_ERR(ret))
-		return PTR_ERR(ret);
-
-	trace_dax_insert_mapping(mapping->host, vmf, ret);
-	if (vmf->flags & FAULT_FLAG_WRITE)
-		return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
-	else
-		return vm_insert_mixed(vma, vaddr, pfn);
+	return rc;
 }
 
 /*
@@ -882,7 +886,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	}
 
 	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
-			RADIX_DAX_ZERO_PAGE);
+			RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(entry2)) {
 		ret = VM_FAULT_SIGBUS;
 		goto out;
@@ -941,11 +945,6 @@ int __dax_zero_page_range(struct block_device *bdev,
 }
 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
-static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
-{
-	return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
-}
-
 static loff_t
 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct iomap *iomap)
@@ -1085,19 +1084,33 @@ static int dax_fault_return(int error)
 	return VM_FAULT_SIGBUS;
 }
 
-static int dax_iomap_pte_fault(struct vm_fault *vmf,
+/*
+ * MAP_SYNC on a dax mapping guarantees dirty metadata is
+ * flushed on write-faults (non-cow), but not read-faults.
+ */
+static bool dax_fault_is_synchronous(unsigned long flags,
+		struct vm_area_struct *vma, struct iomap *iomap)
+{
+	return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
+		&& (iomap->flags & IOMAP_F_DIRTY);
+}
+
+static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		const struct iomap_ops *ops)
 {
-	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	struct vm_area_struct *vma = vmf->vma;
+	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
-	sector_t sector;
 	struct iomap iomap = { 0 };
 	unsigned flags = IOMAP_FAULT;
 	int error, major = 0;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	bool sync;
 	int vmf_ret = 0;
 	void *entry;
+	pfn_t pfn;
 
 	trace_dax_pte_fault(inode, vmf, vmf_ret);
 	/*
@@ -1110,7 +1123,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		goto out;
 	}
 
-	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+	if (write && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
 	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
@@ -1145,9 +1158,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		goto error_finish_iomap;
 	}
 
-	sector = dax_iomap_sector(&iomap, pos);
-
 	if (vmf->cow_page) {
+		sector_t sector = dax_iomap_sector(&iomap, pos);
+
 		switch (iomap.type) {
 		case IOMAP_HOLE:
 		case IOMAP_UNWRITTEN:
@@ -1173,22 +1186,55 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		goto finish_iomap;
 	}
 
+	sync = dax_fault_is_synchronous(flags, vma, &iomap);
+
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
 		if (iomap.flags & IOMAP_F_NEW) {
 			count_vm_event(PGMAJFAULT);
-			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
+			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
 			major = VM_FAULT_MAJOR;
 		}
-		error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
-				sector, PAGE_SIZE, entry, vmf->vma, vmf);
+		error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
+		if (error < 0)
+			goto error_finish_iomap;
+
+		entry = dax_insert_mapping_entry(mapping, vmf, entry,
+						 dax_iomap_sector(&iomap, pos),
+						 0, write && !sync);
+		if (IS_ERR(entry)) {
+			error = PTR_ERR(entry);
+			goto error_finish_iomap;
+		}
+
+		/*
+		 * If we are doing synchronous page fault and inode needs fsync,
+		 * we can insert PTE into page tables only after that happens.
+		 * Skip insertion for now and return the pfn so that caller can
+		 * insert it after fsync is done.
+		 */
+		if (sync) {
+			if (WARN_ON_ONCE(!pfnp)) {
+				error = -EIO;
+				goto error_finish_iomap;
+			}
+			*pfnp = pfn;
+			vmf_ret = VM_FAULT_NEEDDSYNC | major;
+			goto finish_iomap;
+		}
+		trace_dax_insert_mapping(inode, vmf, entry);
+		if (write)
+			error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+		else
+			error = vm_insert_mixed(vma, vaddr, pfn);
+
 		/* -EBUSY is fine, somebody else faulted on the same PTE */
 		if (error == -EBUSY)
 			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
-		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
+		if (!write) {
 			vmf_ret = dax_load_hole(mapping, entry, vmf);
 			goto finish_iomap;
 		}
@@ -1223,53 +1269,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-		loff_t pos, void *entry)
-{
-	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
-	const sector_t sector = dax_iomap_sector(iomap, pos);
-	struct dax_device *dax_dev = iomap->dax_dev;
-	struct block_device *bdev = iomap->bdev;
-	struct inode *inode = mapping->host;
-	const size_t size = PMD_SIZE;
-	void *ret = NULL, *kaddr;
-	long length = 0;
-	pgoff_t pgoff;
-	pfn_t pfn = {};
-	int id;
-
-	if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
-		goto fallback;
-
-	id = dax_read_lock();
-	length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
-	if (length < 0)
-		goto unlock_fallback;
-	length = PFN_PHYS(length);
-
-	if (length < size)
-		goto unlock_fallback;
-	if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
-		goto unlock_fallback;
-	if (!pfn_t_devmap(pfn))
-		goto unlock_fallback;
-	dax_read_unlock(id);
-
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
-			RADIX_DAX_PMD);
-	if (IS_ERR(ret))
-		goto fallback;
-
-	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
-	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
-			pfn, vmf->flags & FAULT_FLAG_WRITE);
-
-unlock_fallback:
-	dax_read_unlock(id);
-fallback:
-	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
-	return VM_FAULT_FALLBACK;
-}
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
+ * more often than one might expect in the below functions.
+ */
+#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
 
 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 		void *entry)
@@ -1288,7 +1292,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 		goto fallback;
 
 	ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
-			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
+			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(ret))
 		goto fallback;
 
@@ -1310,13 +1314,14 @@ fallback:
 	return VM_FAULT_FALLBACK;
 }
 
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		const struct iomap_ops *ops)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	bool sync;
 	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
 	struct inode *inode = mapping->host;
 	int result = VM_FAULT_FALLBACK;
@@ -1325,6 +1330,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	void *entry;
 	loff_t pos;
 	int error;
+	pfn_t pfn;
 
 	/*
 	 * Check whether offset isn't beyond end of file now. Caller is
@@ -1332,7 +1338,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	 * this is a reliable test.
 	 */
 	pgoff = linear_page_index(vma, pmd_addr);
-	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 
 	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
 
@@ -1356,13 +1362,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
 		goto fallback;
 
-	if (pgoff > max_pgoff) {
+	if (pgoff >= max_pgoff) {
 		result = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
 	/* If the PMD would extend beyond the file size */
-	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
+	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
 		goto fallback;
 
 	/*
@@ -1400,9 +1406,37 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
+
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
+		error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
+		if (error < 0)
+			goto finish_iomap;
+
+		entry = dax_insert_mapping_entry(mapping, vmf, entry,
+						dax_iomap_sector(&iomap, pos),
+						RADIX_DAX_PMD, write && !sync);
+		if (IS_ERR(entry))
+			goto finish_iomap;
+
+		/*
+		 * If we are doing synchronous page fault and inode needs fsync,
+		 * we can insert PMD into page tables only after that happens.
+		 * Skip insertion for now and return the pfn so that caller can
+		 * insert it after fsync is done.
+		 */
+		if (sync) {
+			if (WARN_ON_ONCE(!pfnp))
+				goto finish_iomap;
+			*pfnp = pfn;
+			result = VM_FAULT_NEEDDSYNC;
+			goto finish_iomap;
+		}
+
+		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
+		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+					    write);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
@@ -1442,7 +1476,7 @@ out:
 	return result;
 }
 #else
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		const struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
@@ -1452,7 +1486,9 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vmf: The description of the fault
- * @ops: iomap ops passed from the file system
+ * @pe_size: Size of the page to fault in
+ * @pfnp: PFN to insert for synchronous faults if fsync is required
+ * @ops: Iomap ops passed from the file system
  *
  * When a page fault occurs, filesystems may call this helper in
  * their fault handler for DAX files. dax_iomap_fault() assumes the caller
@@ -1460,15 +1496,98 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
  * successfully.
  */
 int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
-		const struct iomap_ops *ops)
+		    pfn_t *pfnp, const struct iomap_ops *ops)
 {
 	switch (pe_size) {
 	case PE_SIZE_PTE:
-		return dax_iomap_pte_fault(vmf, ops);
+		return dax_iomap_pte_fault(vmf, pfnp, ops);
 	case PE_SIZE_PMD:
-		return dax_iomap_pmd_fault(vmf, ops);
+		return dax_iomap_pmd_fault(vmf, pfnp, ops);
 	default:
 		return VM_FAULT_FALLBACK;
 	}
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
+
+/**
+ * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function inserts writeable PTE or PMD entry into page tables for mmaped
+ * DAX file.  It takes care of marking corresponding radix tree entry as dirty
+ * as well.
+ */
+static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+				  enum page_entry_size pe_size,
+				  pfn_t pfn)
+{
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	void *entry, **slot;
+	pgoff_t index = vmf->pgoff;
+	int vmf_ret, error;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, &slot);
+	/* Did we race with someone splitting entry or so? */
+	if (!entry ||
+	    (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
+	    (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
+		put_unlocked_mapping_entry(mapping, index, entry);
+		spin_unlock_irq(&mapping->tree_lock);
+		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
+						      VM_FAULT_NOPAGE);
+		return VM_FAULT_NOPAGE;
+	}
+	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+	entry = lock_slot(mapping, slot);
+	spin_unlock_irq(&mapping->tree_lock);
+	switch (pe_size) {
+	case PE_SIZE_PTE:
+		error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+		vmf_ret = dax_fault_return(error);
+		break;
+#ifdef CONFIG_FS_DAX_PMD
+	case PE_SIZE_PMD:
+		vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+			pfn, true);
+		break;
+#endif
+	default:
+		vmf_ret = VM_FAULT_FALLBACK;
+	}
+	put_locked_mapping_entry(mapping, index);
+	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
+	return vmf_ret;
+}
+
+/**
+ * dax_finish_sync_fault - finish synchronous page fault
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function ensures that the file range touched by the page fault is
+ * stored persistently on the media and handles inserting of appropriate page
+ * table entry.
+ */
+int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+			  pfn_t pfn)
+{
+	int err;
+	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
+	size_t len = 0;
+
+	if (pe_size == PE_SIZE_PTE)
+		len = PAGE_SIZE;
+	else if (pe_size == PE_SIZE_PMD)
+		len = PMD_SIZE;
+	else
+		WARN_ON_ONCE(1);
+	err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
+	if (err)
+		return VM_FAULT_SIGBUS;
+	return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
+}
+EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
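
For context on how a filesystem consumes the reworked API above, the sketch below is loosely modeled on the ext4/xfs commits in this pull. It is simplified (journalling, timestamp updates and filesystem locking are omitted), and example_iomap_ops, example_dax_fault and example_dax_huge_fault are placeholder names rather than code from this series:

#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mm.h>

/* Placeholder: the filesystem's real iomap operations go here. */
static const struct iomap_ops example_iomap_ops;

static int example_dax_huge_fault(struct vm_fault *vmf,
				  enum page_entry_size pe_size)
{
	struct super_block *sb = file_inode(vmf->vma->vm_file)->i_sb;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	pfn_t pfn;
	int result;

	if (write)
		sb_start_pagefault(sb);

	/*
	 * On a MAP_SYNC vma whose iomap is flagged IOMAP_F_DIRTY, this
	 * returns VM_FAULT_NEEDDSYNC and hands back the pfn instead of
	 * installing the PTE/PMD.
	 */
	result = dax_iomap_fault(vmf, pe_size, &pfn, &example_iomap_ops);

	/* fsync the faulted range, then install a writeable entry. */
	if (write && (result & VM_FAULT_NEEDDSYNC))
		result = dax_finish_sync_fault(vmf, pe_size, pfn);

	if (write)
		sb_end_pagefault(sb);
	return result;
}

static int example_dax_fault(struct vm_fault *vmf)
{
	return example_dax_huge_fault(vmf, PE_SIZE_PTE);
}

static const struct vm_operations_struct example_dax_vm_ops = {
	.fault		= example_dax_fault,
	.huge_fault	= example_dax_huge_fault,
};

The filesystem's ->mmap() is additionally expected to advertise MAP_SYNC support so that MAP_SHARED_VALIDATE can validate the flag; that wiring is not shown here.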