-rw-r--r--  fs/dax.c            | 243
-rw-r--r--  fs/ext2/inode.c     |   3
-rw-r--r--  fs/ext4/file.c      |  48
-rw-r--r--  include/linux/dax.h |   3
-rw-r--r--  mm/truncate.c       |  75
5 files changed, 229 insertions(+), 143 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index a8732fbed381..5c74f60d0a50 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -451,16 +451,37 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 	__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 }
 
+static int __dax_invalidate_mapping_entry(struct address_space *mapping,
+					  pgoff_t index, bool trunc)
+{
+	int ret = 0;
+	void *entry;
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, NULL);
+	if (!entry || !radix_tree_exceptional_entry(entry))
+		goto out;
+	if (!trunc &&
+	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
+		goto out;
+	radix_tree_delete(page_tree, index);
+	mapping->nrexceptional--;
+	ret = 1;
+out:
+	put_unlocked_mapping_entry(mapping, index, entry);
+	spin_unlock_irq(&mapping->tree_lock);
+	return ret;
+}
 /*
  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
  * entry to get unlocked before deleting it.
  */
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 {
-	void *entry;
+	int ret = __dax_invalidate_mapping_entry(mapping, index, true);
 
-	spin_lock_irq(&mapping->tree_lock);
-	entry = get_unlocked_mapping_entry(mapping, index, NULL);
 	/*
 	 * This gets called from truncate / punch_hole path. As such, the caller
 	 * must hold locks protecting against concurrent modifications of the
@@ -468,16 +489,46 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 	 * caller has seen exceptional entry for this index, we better find it
 	 * at that index as well...
 	 */
-	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
-		spin_unlock_irq(&mapping->tree_lock);
-		return 0;
-	}
-	radix_tree_delete(&mapping->page_tree, index);
+	WARN_ON_ONCE(!ret);
+	return ret;
+}
+
+/*
+ * Invalidate exceptional DAX entry if easily possible. This handles DAX
+ * entries for invalidate_inode_pages() so we evict the entry only if we can
+ * do so without blocking.
+ */
+int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+	int ret = 0;
+	void *entry, **slot;
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
+	if (!entry || !radix_tree_exceptional_entry(entry) ||
+	    slot_locked(mapping, slot))
+		goto out;
+	if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+	    radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+		goto out;
+	radix_tree_delete(page_tree, index);
 	mapping->nrexceptional--;
+	ret = 1;
+out:
 	spin_unlock_irq(&mapping->tree_lock);
-	dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+	if (ret)
+		dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+	return ret;
+}
 
-	return 1;
+/*
+ * Invalidate exceptional DAX entry if it is clean.
+ */
+int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
+				      pgoff_t index)
+{
+	return __dax_invalidate_mapping_entry(mapping, index, false);
 }
 
 /*
@@ -488,15 +539,16 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
  * otherwise it will simply fall out of the page cache under memory
  * pressure without ever having been dirtied.
  */
-static int dax_load_hole(struct address_space *mapping, void *entry,
+static int dax_load_hole(struct address_space *mapping, void **entry,
 			 struct vm_fault *vmf)
 {
 	struct page *page;
+	int ret;
 
 	/* Hole page already exists? Return it... */
-	if (!radix_tree_exceptional_entry(entry)) {
-		vmf->page = entry;
-		return VM_FAULT_LOCKED;
+	if (!radix_tree_exceptional_entry(*entry)) {
+		page = *entry;
+		goto out;
 	}
 
 	/* This will replace locked radix tree entry with a hole page */
@@ -504,8 +556,17 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 				   vmf->gfp_mask | __GFP_ZERO);
 	if (!page)
 		return VM_FAULT_OOM;
+ out:
 	vmf->page = page;
-	return VM_FAULT_LOCKED;
+	ret = finish_fault(vmf);
+	vmf->page = NULL;
+	*entry = page;
+	if (!ret) {
+		/* Grab reference for PTE that is now referencing the page */
+		get_page(page);
+		return VM_FAULT_NOPAGE;
+	}
+	return ret;
 }
 
 static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
@@ -934,6 +995,17 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
 		return -EIO;
 
+	/*
+	 * Write can allocate block for an area which has a hole page mapped
+	 * into page tables. We have to tear down these mappings so that data
+	 * written by write(2) is visible in mmap.
+	 */
+	if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
+		invalidate_inode_pages2_range(inode->i_mapping,
+					      pos >> PAGE_SHIFT,
+					      (end - 1) >> PAGE_SHIFT);
+	}
+
 	while (pos < end) {
 		unsigned offset = pos & (PAGE_SIZE - 1);
 		struct blk_dax_ctl dax = { 0 };
@@ -992,23 +1064,6 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (iov_iter_rw(iter) == WRITE)
 		flags |= IOMAP_WRITE;
 
-	/*
-	 * Yes, even DAX files can have page cache attached to them: A zeroed
-	 * page is inserted into the pagecache when we have to serve a write
-	 * fault on a hole. It should never be dirtied and can simply be
-	 * dropped from the pagecache once we get real data for the page.
-	 *
-	 * XXX: This is racy against mmap, and there's nothing we can do about
-	 * it. We'll eventually need to shift this down even further so that
-	 * we can check if we allocated blocks over a hole first.
-	 */
-	if (mapping->nrpages) {
-		ret = invalidate_inode_pages2_range(mapping,
-				pos >> PAGE_SHIFT,
-				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
-		WARN_ON_ONCE(ret);
-	}
-
 	while (iov_iter_count(iter)) {
 		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
 				iter, dax_iomap_actor);
@@ -1023,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
+static int dax_fault_return(int error)
+{
+	if (error == 0)
+		return VM_FAULT_NOPAGE;
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM;
+	return VM_FAULT_SIGBUS;
+}
+
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
@@ -1055,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (pos >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		error = PTR_ERR(entry);
-		goto out;
-	}
-
 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
@@ -1071,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 */
 	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		return dax_fault_return(error);
 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-		error = -EIO;	/* fs corruption? */
+		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
+		goto finish_iomap;
+	}
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+	if (IS_ERR(entry)) {
+		vmf_ret = dax_fault_return(PTR_ERR(entry));
 		goto finish_iomap;
 	}
 
@@ -1096,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 
 		if (error)
-			goto finish_iomap;
+			goto error_unlock_entry;
 
 		__SetPageUptodate(vmf->cow_page);
 		vmf_ret = finish_fault(vmf);
 		if (!vmf_ret)
 			vmf_ret = VM_FAULT_DONE_COW;
-		goto finish_iomap;
+		goto unlock_entry;
 	}
 
 	switch (iomap.type) {
@@ -1114,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
 				PAGE_SIZE, &entry, vma, vmf);
+		/* -EBUSY is fine, somebody else faulted on the same PTE */
+		if (error == -EBUSY)
+			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-			vmf_ret = dax_load_hole(mapping, entry, vmf);
-			break;
+			vmf_ret = dax_load_hole(mapping, &entry, vmf);
+			goto unlock_entry;
 		}
 		/*FALLTHRU*/
 	default:
@@ -1128,31 +1195,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		break;
 	}
 
+ error_unlock_entry:
+	vmf_ret = dax_fault_return(error) | major;
+ unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
-		if (error || (vmf_ret & VM_FAULT_ERROR)) {
-			/* keep previous error */
-			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PAGE_SIZE,
-					PAGE_SIZE, flags, &iomap);
-		}
-	}
- unlock_entry:
-	if (vmf_ret != VM_FAULT_LOCKED || error)
-		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
-	/* -EBUSY is fine, somebody else faulted on the same PTE */
-	if (error < 0 && error != -EBUSY)
-		return VM_FAULT_SIGBUS | major;
-	if (vmf_ret) {
-		WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
-		return vmf_ret;
+		int copied = PAGE_SIZE;
+
+		if (vmf_ret & VM_FAULT_ERROR)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PTE we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
-	return VM_FAULT_NOPAGE | major;
+	return vmf_ret;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
@@ -1277,16 +1338,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		goto fallback;
 
 	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry. If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
-	 */
-	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-	if (IS_ERR(entry))
-		goto fallback;
-
-	/*
 	 * Note that we don't use iomap_apply here. We aren't doing I/O, only
 	 * setting up a mapping, so really we're using iomap_begin() as a way
 	 * to look up our filesystem block.
@@ -1294,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	pos = (loff_t)pgoff << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		goto fallback;
+
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	/*
+	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+	 * PMD or a HZP entry. If it can't (because a 4k page is already in
+	 * the tree, for instance), it will return -EEXIST and we just fall
+	 * back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		goto finish_iomap;
+
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
 	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@@ -1310,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
-			goto finish_iomap;
+			goto unlock_entry;
 		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
 				&entry);
 		break;
@@ -1319,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		break;
 	}
 
+ unlock_entry:
+	put_locked_mapping_entry(mapping, pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
-		if (result == VM_FAULT_FALLBACK) {
-			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
-					iomap_flags, &iomap);
-			if (error)
-				result = VM_FAULT_FALLBACK;
-		}
+		int copied = PMD_SIZE;
+
+		if (result == VM_FAULT_FALLBACK)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PMD we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
+				&iomap);
 	}
- unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, pmd, address);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 0093ea2512a8..f073bfca694b 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -751,9 +751,8 @@ static int ext2_get_blocks(struct inode *inode,
 			mutex_unlock(&ei->truncate_mutex);
 			goto cleanup;
 		}
-	} else {
-		*new = true;
 	}
+	*new = true;
 
 	ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
 	mutex_unlock(&ei->truncate_mutex);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b5f184493c57..d663d3d7c81c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -258,7 +258,6 @@ out:
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	int result;
-	handle_t *handle = NULL;
 	struct inode *inode = file_inode(vma->vm_file);
 	struct super_block *sb = inode->i_sb;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -266,24 +265,12 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
-						EXT4_DATA_TRANS_BLOCKS(sb));
-	} else
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-
-	if (IS_ERR(handle))
-		result = VM_FAULT_SIGBUS;
-	else
-		result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
-
-	if (write) {
-		if (!IS_ERR(handle))
-			ext4_journal_stop(handle);
-		up_read(&EXT4_I(inode)->i_mmap_sem);
+	}
+	down_read(&EXT4_I(inode)->i_mmap_sem);
+	result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
+	up_read(&EXT4_I(inode)->i_mmap_sem);
+	if (write)
 		sb_end_pagefault(sb);
-	} else
-		up_read(&EXT4_I(inode)->i_mmap_sem);
 
 	return result;
 }
@@ -292,7 +279,6 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 						pmd_t *pmd, unsigned int flags)
 {
 	int result;
-	handle_t *handle = NULL;
 	struct inode *inode = file_inode(vma->vm_file);
 	struct super_block *sb = inode->i_sb;
 	bool write = flags & FAULT_FLAG_WRITE;
@@ -300,27 +286,13 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
-				ext4_chunk_trans_blocks(inode,
-							PMD_SIZE / PAGE_SIZE));
-	} else
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-
-	if (IS_ERR(handle))
-		result = VM_FAULT_SIGBUS;
-	else {
-		result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
-					     &ext4_iomap_ops);
 	}
-
-	if (write) {
-		if (!IS_ERR(handle))
-			ext4_journal_stop(handle);
-		up_read(&EXT4_I(inode)->i_mmap_sem);
+	down_read(&EXT4_I(inode)->i_mmap_sem);
+	result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
+				     &ext4_iomap_ops);
+	up_read(&EXT4_I(inode)->i_mmap_sem);
+	if (write)
 		sb_end_pagefault(sb);
-	} else
-		up_read(&EXT4_I(inode)->i_mmap_sem);
 
 	return result;
 }
diff --git a/include/linux/dax.h b/include/linux/dax.h
index f97bcfe79472..24ad71173995 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -41,6 +41,9 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		struct iomap_ops *ops);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
+int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index);
+int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
+				      pgoff_t index);
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 		pgoff_t index, void *entry, bool wake_all);
 
diff --git a/mm/truncate.c b/mm/truncate.c
index fd97f1dbce29..dd7b24e083c5 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -24,20 +24,12 @@
 #include <linux/rmap.h>
 #include "internal.h"
 
-static void clear_exceptional_entry(struct address_space *mapping,
-				    pgoff_t index, void *entry)
+static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
+			       void *entry)
 {
 	struct radix_tree_node *node;
 	void **slot;
 
-	/* Handled by shmem itself */
-	if (shmem_mapping(mapping))
-		return;
-
-	if (dax_mapping(mapping)) {
-		dax_delete_mapping_entry(mapping, index);
-		return;
-	}
 	spin_lock_irq(&mapping->tree_lock);
 	/*
 	 * Regular page slots are stabilized by the page lock even
@@ -55,6 +47,56 @@ unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 }
 
+/*
+ * Unconditionally remove exceptional entry. Usually called from truncate path.
+ */
+static void truncate_exceptional_entry(struct address_space *mapping,
+				       pgoff_t index, void *entry)
+{
+	/* Handled by shmem itself */
+	if (shmem_mapping(mapping))
+		return;
+
+	if (dax_mapping(mapping)) {
+		dax_delete_mapping_entry(mapping, index);
+		return;
+	}
+	clear_shadow_entry(mapping, index, entry);
+}
+
+/*
+ * Invalidate exceptional entry if easily possible. This handles exceptional
+ * entries for invalidate_inode_pages() so for DAX it evicts only unlocked and
+ * clean entries.
+ */
+static int invalidate_exceptional_entry(struct address_space *mapping,
+					pgoff_t index, void *entry)
+{
+	/* Handled by shmem itself */
+	if (shmem_mapping(mapping))
+		return 1;
+	if (dax_mapping(mapping))
+		return dax_invalidate_mapping_entry(mapping, index);
+	clear_shadow_entry(mapping, index, entry);
+	return 1;
+}
+
+/*
+ * Invalidate exceptional entry if clean. This handles exceptional entries for
+ * invalidate_inode_pages2() so for DAX it evicts only clean entries.
+ */
+static int invalidate_exceptional_entry2(struct address_space *mapping,
+					 pgoff_t index, void *entry)
+{
+	/* Handled by shmem itself */
+	if (shmem_mapping(mapping))
+		return 1;
+	if (dax_mapping(mapping))
+		return dax_invalidate_mapping_entry_sync(mapping, index);
+	clear_shadow_entry(mapping, index, entry);
+	return 1;
+}
+
 /**
  * do_invalidatepage - invalidate part or all of a page
  * @page: the page which is affected
@@ -262,7 +304,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
-				clear_exceptional_entry(mapping, index, page);
+				truncate_exceptional_entry(mapping, index,
+							   page);
 				continue;
 			}
 
@@ -351,7 +394,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			}
 
 			if (radix_tree_exceptional_entry(page)) {
-				clear_exceptional_entry(mapping, index, page);
+				truncate_exceptional_entry(mapping, index,
+							   page);
 				continue;
 			}
 
@@ -470,7 +514,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
-				clear_exceptional_entry(mapping, index, page);
+				invalidate_exceptional_entry(mapping, index,
+							     page);
 				continue;
 			}
 
@@ -592,7 +637,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
-				clear_exceptional_entry(mapping, index, page);
+				if (!invalidate_exceptional_entry2(mapping,
+								   index, page))
+					ret = -EBUSY;
 				continue;
 			}
 