author     Linus Torvalds <torvalds@linux-foundation.org>   2017-11-17 12:51:57 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2017-11-17 12:51:57 -0500
commit     a3841f94c7ecb3ede0f888d3fcfe8fb6368ddd7a
tree       6625eedf10d0672068ee218bb893a5a0e1803df2  /fs/dax.c
parent     adeba81ac2a6451f44545874da3d181081f0ab04
parent     4247f24c23589bcc3bc3490515ef8c9497e9ae55
Merge tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull libnvdimm and dax updates from Dan Williams:
"Save for a few late fixes, all of these commits have shipped in -next
releases since before the merge window opened, and 0day has given a
build success notification.
The ext4 touches came from Jan, and the xfs touches have Darrick's
reviewed-by. An xfstest for the MAP_SYNC feature has been through
a few rounds of reviews and is on track to be merged.
- Introduce MAP_SYNC and MAP_SHARED_VALIDATE, a mechanism to enable
'userspace flush' of persistent memory updates via filesystem-dax
mappings. It arranges for any filesystem metadata updates that may
be required to satisfy a write fault to also be flushed ("on disk")
before the kernel returns to userspace from the fault handler.
Effectively every write-fault that dirties metadata completes an
fsync() before returning from the fault handler. The new
MAP_SHARED_VALIDATE mapping type guarantees that the MAP_SYNC flag
is validated as supported by the filesystem's ->mmap() file
operation.
- Add support for the standard ACPI 6.2 label access methods that
replace the NVDIMM_FAMILY_INTEL (vendor specific) label methods.
This enables interoperability with environments that only implement
the standardized methods.
- Add support for the ACPI 6.2 NVDIMM media error injection methods.
- Add support for the NVDIMM_FAMILY_INTEL v1.6 DIMM commands for
latch last shutdown status, firmware update, SMART error injection,
and SMART alarm threshold control.
- Cleanup physical address information disclosures to be root-only.
- Fix revalidation of the DIMM "locked label area" status to support
dynamic unlock of the label area.
- Expand unit test infrastructure to mock the ACPI 6.2 Translate SPA
(system-physical-address) command and error injection commands.
Acknowledgements that came after the commits were pushed to -next:
- 957ac8c421ad ("dax: fix PMD faults on zero-length files"):
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
- a39e596baa07 ("xfs: support for synchronous DAX faults") and
7b565c9f965b ("xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()")
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>"
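The MAP_SYNC / MAP_SHARED_VALIDATE semantics described in the first bullet above are consumed from userspace via mmap(). The following is an illustrative sketch, not part of this pull: the file path is hypothetical, and the fallback flag values are copied from the 4.15 uapi headers for the case where <sys/mman.h> does not yet expose them.

/*
 * Sketch only: request a synchronous DAX mapping.  MAP_SHARED_VALIDATE
 * makes mmap() fail with EOPNOTSUPP instead of silently dropping the
 * unknown MAP_SYNC flag.  The path is hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MAP_SHARED_VALIDATE
#define MAP_SHARED_VALIDATE	0x03	/* matches the 4.15 uapi value */
#endif
#ifndef MAP_SYNC
#define MAP_SYNC		0x080000	/* matches the 4.15 uapi value */
#endif

int main(void)
{
	int fd = open("/mnt/pmem/log", O_RDWR);	/* hypothetical file on a DAX mount */
	if (fd < 0)
		return 1;

	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");	/* e.g. EOPNOTSUPP: no MAP_SYNC support here */
		close(fd);
		return 1;
	}

	/*
	 * The write fault below also flushes any filesystem metadata needed
	 * to reach this block; after flushing CPU caches (clwb or a libpmem
	 * helper) the store is durable without calling fsync().
	 */
	strcpy(p, "persistent record");

	munmap(p, 4096);
	close(fd);
	return 0;
}

On a filesystem or device without MAP_SYNC support the mmap() fails cleanly, which is exactly the behaviour MAP_SHARED_VALIDATE exists to provide.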
* tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (49 commits)
acpi, nfit: add 'Enable Latch System Shutdown Status' command support
dax: fix general protection fault in dax_alloc_inode
dax: fix PMD faults on zero-length files
dax: stop requiring a live device for dax_flush()
brd: remove dax support
dax: quiet bdev_dax_supported()
fs, dax: unify IOMAP_F_DIRTY read vs write handling policy in the dax core
tools/testing/nvdimm: unit test clear-error commands
acpi, nfit: validate commands against the device type
tools/testing/nvdimm: stricter bounds checking for error injection commands
xfs: support for synchronous DAX faults
xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()
ext4: Support for synchronous DAX faults
ext4: Simplify error handling in ext4_dax_huge_fault()
dax: Implement dax_finish_sync_fault()
dax, iomap: Add support for synchronous faults
mm: Define MAP_SYNC and VM_SYNC flags
dax: Allow tuning whether dax_insert_mapping_entry() dirties entry
dax: Allow dax_iomap_fault() to return pfn
dax: Fix comment describing dax_iomap_fault()
...
Diffstat (limited to 'fs/dax.c')
-rw-r--r--   fs/dax.c   319
1 file changed, 219 insertions(+), 100 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
 static void *dax_insert_mapping_entry(struct address_space *mapping,
 				      struct vm_fault *vmf,
 				      void *entry, sector_t sector,
-				      unsigned long flags)
+				      unsigned long flags, bool dirty)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
 	void *new_entry;
 	pgoff_t index = vmf->pgoff;
 
-	if (vmf->flags & FAULT_FLAG_WRITE)
+	if (dirty)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
 	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
@@ -569,7 +569,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 		entry = new_entry;
 	}
 
-	if (vmf->flags & FAULT_FLAG_WRITE)
+	if (dirty)
 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 
 	spin_unlock_irq(&mapping->tree_lock);
@@ -825,38 +825,42 @@ out:
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
-static int dax_insert_mapping(struct address_space *mapping,
-		struct block_device *bdev, struct dax_device *dax_dev,
-		sector_t sector, size_t size, void *entry,
-		struct vm_area_struct *vma, struct vm_fault *vmf)
+static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
 {
-	unsigned long vaddr = vmf->address;
-	void *ret, *kaddr;
+	return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
+}
+
+static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
+			 pfn_t *pfnp)
+{
+	const sector_t sector = dax_iomap_sector(iomap, pos);
 	pgoff_t pgoff;
+	void *kaddr;
 	int id, rc;
-	pfn_t pfn;
+	long length;
 
-	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+	rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
 	if (rc)
 		return rc;
-
 	id = dax_read_lock();
-	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
-	if (rc < 0) {
-		dax_read_unlock(id);
-		return rc;
+	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
+				   &kaddr, pfnp);
+	if (length < 0) {
+		rc = length;
+		goto out;
 	}
+	rc = -EINVAL;
+	if (PFN_PHYS(length) < size)
+		goto out;
+	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
+		goto out;
+	/* For larger pages we need devmap */
+	if (length > 1 && !pfn_t_devmap(*pfnp))
+		goto out;
+	rc = 0;
+out:
 	dax_read_unlock(id);
-
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
-	if (IS_ERR(ret))
-		return PTR_ERR(ret);
-
-	trace_dax_insert_mapping(mapping->host, vmf, ret);
-	if (vmf->flags & FAULT_FLAG_WRITE)
-		return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
-	else
-		return vm_insert_mixed(vma, vaddr, pfn);
+	return rc;
 }
 
 /*
@@ -882,7 +886,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	}
 
 	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
-			RADIX_DAX_ZERO_PAGE);
+			RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(entry2)) {
 		ret = VM_FAULT_SIGBUS;
 		goto out;
@@ -941,11 +945,6 @@ int __dax_zero_page_range(struct block_device *bdev,
 }
 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
-static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
-{
-	return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
-}
-
 static loff_t
 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct iomap *iomap)
@@ -1085,19 +1084,33 @@ static int dax_fault_return(int error)
 	return VM_FAULT_SIGBUS;
 }
 
-static int dax_iomap_pte_fault(struct vm_fault *vmf,
+/*
+ * MAP_SYNC on a dax mapping guarantees dirty metadata is
+ * flushed on write-faults (non-cow), but not read-faults.
+ */
+static bool dax_fault_is_synchronous(unsigned long flags,
+		struct vm_area_struct *vma, struct iomap *iomap)
+{
+	return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
+		&& (iomap->flags & IOMAP_F_DIRTY);
+}
+
+static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
-	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	struct vm_area_struct *vma = vmf->vma;
+	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
-	sector_t sector;
 	struct iomap iomap = { 0 };
 	unsigned flags = IOMAP_FAULT;
 	int error, major = 0;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	bool sync;
 	int vmf_ret = 0;
 	void *entry;
+	pfn_t pfn;
 
 	trace_dax_pte_fault(inode, vmf, vmf_ret);
 	/*
@@ -1110,7 +1123,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		goto out;
 	}
 
-	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+	if (write && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
 	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
@@ -1145,9 +1158,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		goto error_finish_iomap;
 	}
 
-	sector = dax_iomap_sector(&iomap, pos);
-
 	if (vmf->cow_page) {
+		sector_t sector = dax_iomap_sector(&iomap, pos);
+
 		switch (iomap.type) {
 		case IOMAP_HOLE:
 		case IOMAP_UNWRITTEN:
@@ -1173,22 +1186,55 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		goto finish_iomap;
 	}
 
+	sync = dax_fault_is_synchronous(flags, vma, &iomap);
+
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
 		if (iomap.flags & IOMAP_F_NEW) {
 			count_vm_event(PGMAJFAULT);
-			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
+			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
 			major = VM_FAULT_MAJOR;
 		}
-		error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
-				sector, PAGE_SIZE, entry, vmf->vma, vmf);
+		error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
+		if (error < 0)
+			goto error_finish_iomap;
+
+		entry = dax_insert_mapping_entry(mapping, vmf, entry,
+						 dax_iomap_sector(&iomap, pos),
+						 0, write && !sync);
+		if (IS_ERR(entry)) {
+			error = PTR_ERR(entry);
+			goto error_finish_iomap;
+		}
+
+		/*
+		 * If we are doing synchronous page fault and inode needs fsync,
+		 * we can insert PTE into page tables only after that happens.
+		 * Skip insertion for now and return the pfn so that caller can
+		 * insert it after fsync is done.
+		 */
+		if (sync) {
+			if (WARN_ON_ONCE(!pfnp)) {
+				error = -EIO;
+				goto error_finish_iomap;
+			}
+			*pfnp = pfn;
+			vmf_ret = VM_FAULT_NEEDDSYNC | major;
+			goto finish_iomap;
+		}
+		trace_dax_insert_mapping(inode, vmf, entry);
+		if (write)
+			error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+		else
+			error = vm_insert_mixed(vma, vaddr, pfn);
+
 		/* -EBUSY is fine, somebody else faulted on the same PTE */
 		if (error == -EBUSY)
 			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
-		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
+		if (!write) {
 			vmf_ret = dax_load_hole(mapping, entry, vmf);
 			goto finish_iomap;
 		}
@@ -1223,53 +1269,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-		loff_t pos, void *entry)
-{
-	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
-	const sector_t sector = dax_iomap_sector(iomap, pos);
-	struct dax_device *dax_dev = iomap->dax_dev;
-	struct block_device *bdev = iomap->bdev;
-	struct inode *inode = mapping->host;
-	const size_t size = PMD_SIZE;
-	void *ret = NULL, *kaddr;
-	long length = 0;
-	pgoff_t pgoff;
-	pfn_t pfn = {};
-	int id;
-
-	if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
-		goto fallback;
-
-	id = dax_read_lock();
-	length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
-	if (length < 0)
-		goto unlock_fallback;
-	length = PFN_PHYS(length);
-
-	if (length < size)
-		goto unlock_fallback;
-	if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
-		goto unlock_fallback;
-	if (!pfn_t_devmap(pfn))
-		goto unlock_fallback;
-	dax_read_unlock(id);
-
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
-			RADIX_DAX_PMD);
-	if (IS_ERR(ret))
-		goto fallback;
-
-	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
-	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
-			pfn, vmf->flags & FAULT_FLAG_WRITE);
-
-unlock_fallback:
-	dax_read_unlock(id);
-fallback:
-	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
-	return VM_FAULT_FALLBACK;
-}
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
+ * more often than one might expect in the below functions.
+ */
+#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
 
 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 		void *entry)
@@ -1288,7 +1292,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 		goto fallback;
 
 	ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
-			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
+			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(ret))
 		goto fallback;
 
@@ -1310,13 +1314,14 @@ fallback:
 	return VM_FAULT_FALLBACK;
 }
 
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	bool sync;
 	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
 	struct inode *inode = mapping->host;
 	int result = VM_FAULT_FALLBACK;
@@ -1325,6 +1330,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	void *entry;
 	loff_t pos;
 	int error;
+	pfn_t pfn;
 
 	/*
 	 * Check whether offset isn't beyond end of file now. Caller is
@@ -1332,7 +1338,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	 * this is a reliable test.
 	 */
 	pgoff = linear_page_index(vma, pmd_addr);
-	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 
 	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
 
@@ -1356,13 +1362,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
 		goto fallback;
 
-	if (pgoff > max_pgoff) {
+	if (pgoff >= max_pgoff) {
 		result = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
 	/* If the PMD would extend beyond the file size */
-	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
+	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
 		goto fallback;
 
 	/*
@@ -1400,9 +1406,37 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
+
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
+		error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
+		if (error < 0)
+			goto finish_iomap;
+
+		entry = dax_insert_mapping_entry(mapping, vmf, entry,
+						dax_iomap_sector(&iomap, pos),
+						RADIX_DAX_PMD, write && !sync);
+		if (IS_ERR(entry))
+			goto finish_iomap;
+
+		/*
+		 * If we are doing synchronous page fault and inode needs fsync,
+		 * we can insert PMD into page tables only after that happens.
+		 * Skip insertion for now and return the pfn so that caller can
+		 * insert it after fsync is done.
+		 */
+		if (sync) {
+			if (WARN_ON_ONCE(!pfnp))
+				goto finish_iomap;
+			*pfnp = pfn;
+			result = VM_FAULT_NEEDDSYNC;
+			goto finish_iomap;
+		}
+
+		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
+		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+					    write);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
@@ -1442,7 +1476,7 @@ out:
 	return result;
 }
 #else
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
@@ -1452,7 +1486,9 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vmf: The description of the fault
- * @ops: iomap ops passed from the file system
+ * @pe_size: Size of the page to fault in
+ * @pfnp: PFN to insert for synchronous faults if fsync is required
+ * @ops: Iomap ops passed from the file system
  *
  * When a page fault occurs, filesystems may call this helper in
  * their fault handler for DAX files. dax_iomap_fault() assumes the caller
@@ -1460,15 +1496,98 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
  * successfully.
  */
 int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
-		    const struct iomap_ops *ops)
+		    pfn_t *pfnp, const struct iomap_ops *ops)
 {
 	switch (pe_size) {
 	case PE_SIZE_PTE:
-		return dax_iomap_pte_fault(vmf, ops);
+		return dax_iomap_pte_fault(vmf, pfnp, ops);
 	case PE_SIZE_PMD:
-		return dax_iomap_pmd_fault(vmf, ops);
+		return dax_iomap_pmd_fault(vmf, pfnp, ops);
 	default:
 		return VM_FAULT_FALLBACK;
 	}
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
+
+/**
+ * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function inserts writeable PTE or PMD entry into page tables for mmaped
+ * DAX file.  It takes care of marking corresponding radix tree entry as dirty
+ * as well.
+ */
+static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+				  enum page_entry_size pe_size,
+				  pfn_t pfn)
+{
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	void *entry, **slot;
+	pgoff_t index = vmf->pgoff;
+	int vmf_ret, error;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, &slot);
+	/* Did we race with someone splitting entry or so? */
+	if (!entry ||
+	    (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
+	    (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
+		put_unlocked_mapping_entry(mapping, index, entry);
+		spin_unlock_irq(&mapping->tree_lock);
+		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
+						      VM_FAULT_NOPAGE);
+		return VM_FAULT_NOPAGE;
+	}
+	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+	entry = lock_slot(mapping, slot);
+	spin_unlock_irq(&mapping->tree_lock);
+	switch (pe_size) {
+	case PE_SIZE_PTE:
+		error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+		vmf_ret = dax_fault_return(error);
+		break;
+#ifdef CONFIG_FS_DAX_PMD
+	case PE_SIZE_PMD:
+		vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+			pfn, true);
+		break;
+#endif
+	default:
+		vmf_ret = VM_FAULT_FALLBACK;
+	}
+	put_locked_mapping_entry(mapping, index);
+	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
+	return vmf_ret;
+}
+
+/**
+ * dax_finish_sync_fault - finish synchronous page fault
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function ensures that the file range touched by the page fault is
+ * stored persistently on the media and handles inserting of appropriate page
+ * table entry.
+ */
+int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+			  pfn_t pfn)
+{
+	int err;
+	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
+	size_t len = 0;
+
+	if (pe_size == PE_SIZE_PTE)
+		len = PAGE_SIZE;
+	else if (pe_size == PE_SIZE_PMD)
+		len = PMD_SIZE;
+	else
+		WARN_ON_ONCE(1);
+	err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
+	if (err)
+		return VM_FAULT_SIGBUS;
+	return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
+}
+EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
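
Taken together, the new dax_iomap_fault() signature, the VM_FAULT_NEEDDSYNC return code, and dax_finish_sync_fault() are meant to be driven from a filesystem's fault handler: pass in a pfn_t slot, and when the DAX core reports VM_FAULT_NEEDDSYNC, persist the metadata and only then insert the page table entry. The sketch below is a simplified illustration of that calling pattern, not code from this merge; foo_dax_huge_fault() and foo_iomap_ops are hypothetical stand-ins, while the real handlers added in this series live in xfs and ext4.

#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mm.h>
#include <linux/pfn_t.h>

/* Hypothetical iomap ops; a real filesystem supplies its own. */
extern const struct iomap_ops foo_iomap_ops;

static int foo_dax_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	pfn_t pfn;
	int result;

	if (write) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
	}

	/* Hand the DAX core a pfn slot so it can defer the page table insert. */
	result = dax_iomap_fault(vmf, pe_size, &pfn, &foo_iomap_ops);

	/*
	 * For a MAP_SYNC write fault on an inode with dirty metadata the core
	 * skipped the insert and asked us to persist the metadata first.
	 */
	if (result & VM_FAULT_NEEDDSYNC)
		result = dax_finish_sync_fault(vmf, pe_size, pfn);

	if (write)
		sb_end_pagefault(inode->i_sb);
	return result;
}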