 fs/dax.c            | 243
 fs/ext2/inode.c     |   3
 fs/ext4/file.c      |  48
 include/linux/dax.h |   3
 mm/truncate.c       |  75
 5 files changed, 229 insertions(+), 143 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -451,16 +451,37 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 	__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 }
 
+static int __dax_invalidate_mapping_entry(struct address_space *mapping,
+					  pgoff_t index, bool trunc)
+{
+	int ret = 0;
+	void *entry;
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, NULL);
+	if (!entry || !radix_tree_exceptional_entry(entry))
+		goto out;
+	if (!trunc &&
+	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
+		goto out;
+	radix_tree_delete(page_tree, index);
+	mapping->nrexceptional--;
+	ret = 1;
+out:
+	put_unlocked_mapping_entry(mapping, index, entry);
+	spin_unlock_irq(&mapping->tree_lock);
+	return ret;
+}
 /*
  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
  * entry to get unlocked before deleting it.
  */
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 {
-	void *entry;
+	int ret = __dax_invalidate_mapping_entry(mapping, index, true);
 
-	spin_lock_irq(&mapping->tree_lock);
-	entry = get_unlocked_mapping_entry(mapping, index, NULL);
 	/*
 	 * This gets called from truncate / punch_hole path. As such, the caller
 	 * must hold locks protecting against concurrent modifications of the
@@ -468,16 +489,46 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 	 * caller has seen exceptional entry for this index, we better find it
 	 * at that index as well...
 	 */
-	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
-		spin_unlock_irq(&mapping->tree_lock);
-		return 0;
-	}
-	radix_tree_delete(&mapping->page_tree, index);
+	WARN_ON_ONCE(!ret);
+	return ret;
+}
+
+/*
+ * Invalidate exceptional DAX entry if easily possible. This handles DAX
+ * entries for invalidate_inode_pages() so we evict the entry only if we can
+ * do so without blocking.
+ */
+int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+	int ret = 0;
+	void *entry, **slot;
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
+	if (!entry || !radix_tree_exceptional_entry(entry) ||
+	    slot_locked(mapping, slot))
+		goto out;
+	if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+	    radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+		goto out;
+	radix_tree_delete(page_tree, index);
 	mapping->nrexceptional--;
+	ret = 1;
+out:
 	spin_unlock_irq(&mapping->tree_lock);
-	dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+	if (ret)
+		dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+	return ret;
+}
 
-	return 1;
+/*
+ * Invalidate exceptional DAX entry if it is clean.
+ */
+int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
+				      pgoff_t index)
+{
+	return __dax_invalidate_mapping_entry(mapping, index, false);
 }
 
 /*
@@ -488,15 +539,16 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
  * otherwise it will simply fall out of the page cache under memory
  * pressure without ever having been dirtied.
  */
-static int dax_load_hole(struct address_space *mapping, void *entry,
+static int dax_load_hole(struct address_space *mapping, void **entry,
 			 struct vm_fault *vmf)
 {
 	struct page *page;
+	int ret;
 
 	/* Hole page already exists? Return it... */
-	if (!radix_tree_exceptional_entry(entry)) {
-		vmf->page = entry;
-		return VM_FAULT_LOCKED;
+	if (!radix_tree_exceptional_entry(*entry)) {
+		page = *entry;
+		goto out;
 	}
 
 	/* This will replace locked radix tree entry with a hole page */
@@ -504,8 +556,17 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 			vmf->gfp_mask | __GFP_ZERO);
 	if (!page)
 		return VM_FAULT_OOM;
+out:
 	vmf->page = page;
-	return VM_FAULT_LOCKED;
+	ret = finish_fault(vmf);
+	vmf->page = NULL;
+	*entry = page;
+	if (!ret) {
+		/* Grab reference for PTE that is now referencing the page */
+		get_page(page);
+		return VM_FAULT_NOPAGE;
+	}
+	return ret;
 }
 
 static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
@@ -934,6 +995,17 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
 		return -EIO;
 
+	/*
+	 * Write can allocate block for an area which has a hole page mapped
+	 * into page tables. We have to tear down these mappings so that data
+	 * written by write(2) is visible in mmap.
+	 */
+	if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
+		invalidate_inode_pages2_range(inode->i_mapping,
+					      pos >> PAGE_SHIFT,
+					      (end - 1) >> PAGE_SHIFT);
+	}
+
 	while (pos < end) {
 		unsigned offset = pos & (PAGE_SIZE - 1);
 		struct blk_dax_ctl dax = { 0 };
@@ -992,23 +1064,6 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (iov_iter_rw(iter) == WRITE)
 		flags |= IOMAP_WRITE;
 
-	/*
-	 * Yes, even DAX files can have page cache attached to them: A zeroed
-	 * page is inserted into the pagecache when we have to serve a write
-	 * fault on a hole. It should never be dirtied and can simply be
-	 * dropped from the pagecache once we get real data for the page.
-	 *
-	 * XXX: This is racy against mmap, and there's nothing we can do about
-	 * it. We'll eventually need to shift this down even further so that
-	 * we can check if we allocated blocks over a hole first.
-	 */
-	if (mapping->nrpages) {
-		ret = invalidate_inode_pages2_range(mapping,
-				pos >> PAGE_SHIFT,
-				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
-		WARN_ON_ONCE(ret);
-	}
-
 	while (iov_iter_count(iter)) {
 		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
 				iter, dax_iomap_actor);
@@ -1023,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
+static int dax_fault_return(int error)
+{
+	if (error == 0)
+		return VM_FAULT_NOPAGE;
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM;
+	return VM_FAULT_SIGBUS;
+}
+
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
@@ -1055,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (pos >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		error = PTR_ERR(entry);
-		goto out;
-	}
-
 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
@@ -1071,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 */
 	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		return dax_fault_return(error);
 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-		error = -EIO;		/* fs corruption? */
+		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
+		goto finish_iomap;
+	}
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+	if (IS_ERR(entry)) {
+		vmf_ret = dax_fault_return(PTR_ERR(entry));
 		goto finish_iomap;
 	}
 
@@ -1096,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 
 		if (error)
-			goto finish_iomap;
+			goto error_unlock_entry;
 
 		__SetPageUptodate(vmf->cow_page);
 		vmf_ret = finish_fault(vmf);
 		if (!vmf_ret)
 			vmf_ret = VM_FAULT_DONE_COW;
-		goto finish_iomap;
+		goto unlock_entry;
 	}
 
 	switch (iomap.type) {
@@ -1114,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
 				PAGE_SIZE, &entry, vma, vmf);
+		/* -EBUSY is fine, somebody else faulted on the same PTE */
+		if (error == -EBUSY)
+			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-			vmf_ret = dax_load_hole(mapping, entry, vmf);
-			break;
+			vmf_ret = dax_load_hole(mapping, &entry, vmf);
+			goto unlock_entry;
 		}
 		/*FALLTHRU*/
 	default:
@@ -1128,31 +1195,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		break;
 	}
 
+error_unlock_entry:
+	vmf_ret = dax_fault_return(error) | major;
+unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 finish_iomap:
 	if (ops->iomap_end) {
-		if (error || (vmf_ret & VM_FAULT_ERROR)) {
-			/* keep previous error */
-			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PAGE_SIZE,
-					PAGE_SIZE, flags, &iomap);
-		}
-	}
-unlock_entry:
-	if (vmf_ret != VM_FAULT_LOCKED || error)
-		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
-out:
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
-	/* -EBUSY is fine, somebody else faulted on the same PTE */
-	if (error < 0 && error != -EBUSY)
-		return VM_FAULT_SIGBUS | major;
-	if (vmf_ret) {
-		WARN_ON_ONCE(error);	/* -EBUSY from ops->iomap_end? */
-		return vmf_ret;
+		int copied = PAGE_SIZE;
+
+		if (vmf_ret & VM_FAULT_ERROR)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PTE we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
-	return VM_FAULT_NOPAGE | major;
+	return vmf_ret;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
@@ -1277,16 +1338,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		goto fallback;
 
 	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry. If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
-	 */
-	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-	if (IS_ERR(entry))
-		goto fallback;
-
-	/*
 	 * Note that we don't use iomap_apply here. We aren't doing I/O, only
 	 * setting up a mapping, so really we're using iomap_begin() as a way
 	 * to look up our filesystem block.
@@ -1294,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	pos = (loff_t)pgoff << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		goto fallback;
+
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	/*
+	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+	 * PMD or a HZP entry. If it can't (because a 4k page is already in
+	 * the tree, for instance), it will return -EEXIST and we just fall
+	 * back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		goto finish_iomap;
+
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
 	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@@ -1310,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
-			goto finish_iomap;
+			goto unlock_entry;
 		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
 				&entry);
 		break;
@@ -1319,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		break;
 	}
 
+unlock_entry:
+	put_locked_mapping_entry(mapping, pgoff, entry);
 finish_iomap:
 	if (ops->iomap_end) {
-		if (result == VM_FAULT_FALLBACK) {
-			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
-					iomap_flags, &iomap);
-			if (error)
-				result = VM_FAULT_FALLBACK;
-		}
+		int copied = PMD_SIZE;
+
+		if (result == VM_FAULT_FALLBACK)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PMD we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
+				&iomap);
 	}
-unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
 fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, pmd, address);
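The hole-page invalidation that dax_iomap_actor() now performs is keyed off IOMAP_F_NEW, i.e. it only happens when the filesystem reports that the write allocated blocks over what used to be a hole. A rough sketch of what an allocating ->iomap_begin() is expected to do is shown below; fs_lookup_block() and fs_alloc_block() are hypothetical placeholders, not functions from this patch or from any particular filesystem, and the struct iomap field names assume the 4.10-era layout.

/*
 * Sketch only: an ->iomap_begin() that marks freshly allocated blocks with
 * IOMAP_F_NEW so that dax_iomap_actor() knows to tear down hole pages
 * mapped over the written range.
 */
static int example_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
                               unsigned flags, struct iomap *iomap)
{
        sector_t sector;
        bool new = false;
        int ret;

        ret = fs_lookup_block(inode, pos, &sector);     /* hypothetical */
        if (ret == -ENOENT && (flags & IOMAP_WRITE)) {
                ret = fs_alloc_block(inode, pos, &sector);      /* hypothetical */
                new = true;     /* block did not exist before this write */
        }
        if (ret)
                return ret;

        iomap->type = IOMAP_MAPPED;
        iomap->blkno = sector;
        iomap->bdev = inode->i_sb->s_bdev;
        iomap->offset = pos & PAGE_MASK;
        iomap->length = PAGE_SIZE;
        iomap->flags = new ? IOMAP_F_NEW : 0;
        return 0;
}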
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 0093ea2512a8..f073bfca694b 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -751,9 +751,8 @@ static int ext2_get_blocks(struct inode *inode,
 			mutex_unlock(&ei->truncate_mutex);
 			goto cleanup;
 		}
-	} else {
-		*new = true;
 	}
+	*new = true;
 
 	ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
 	mutex_unlock(&ei->truncate_mutex);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b5f184493c57..d663d3d7c81c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -258,7 +258,6 @@ out:
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	int result;
-	handle_t *handle = NULL;
 	struct inode *inode = file_inode(vma->vm_file);
 	struct super_block *sb = inode->i_sb;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -266,24 +265,12 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
-						EXT4_DATA_TRANS_BLOCKS(sb));
-	} else
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-
-	if (IS_ERR(handle))
-		result = VM_FAULT_SIGBUS;
-	else
-		result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
-
-	if (write) {
-		if (!IS_ERR(handle))
-			ext4_journal_stop(handle);
-		up_read(&EXT4_I(inode)->i_mmap_sem);
+	}
+	down_read(&EXT4_I(inode)->i_mmap_sem);
+	result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
+	up_read(&EXT4_I(inode)->i_mmap_sem);
+	if (write)
 		sb_end_pagefault(sb);
-	} else
-		up_read(&EXT4_I(inode)->i_mmap_sem);
 
 	return result;
 }
@@ -292,7 +279,6 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 						pmd_t *pmd, unsigned int flags)
 {
 	int result;
-	handle_t *handle = NULL;
 	struct inode *inode = file_inode(vma->vm_file);
 	struct super_block *sb = inode->i_sb;
 	bool write = flags & FAULT_FLAG_WRITE;
@@ -300,27 +286,13 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
-				ext4_chunk_trans_blocks(inode,
-						PMD_SIZE / PAGE_SIZE));
-	} else
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-
-	if (IS_ERR(handle))
-		result = VM_FAULT_SIGBUS;
-	else {
-		result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
-				&ext4_iomap_ops);
 	}
-
-	if (write) {
-		if (!IS_ERR(handle))
-			ext4_journal_stop(handle);
-		up_read(&EXT4_I(inode)->i_mmap_sem);
+	down_read(&EXT4_I(inode)->i_mmap_sem);
+	result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
+			&ext4_iomap_ops);
+	up_read(&EXT4_I(inode)->i_mmap_sem);
+	if (write)
 		sb_end_pagefault(sb);
-	} else
-		up_read(&EXT4_I(inode)->i_mmap_sem);
 
 	return result;
 }
diff --git a/include/linux/dax.h b/include/linux/dax.h
index f97bcfe79472..24ad71173995 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -41,6 +41,9 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			struct iomap_ops *ops);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
+int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index);
+int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
+				      pgoff_t index);
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 		pgoff_t index, void *entry, bool wake_all);
 
diff --git a/mm/truncate.c b/mm/truncate.c
index fd97f1dbce29..dd7b24e083c5 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -24,20 +24,12 @@
 #include <linux/rmap.h>
 #include "internal.h"
 
-static void clear_exceptional_entry(struct address_space *mapping,
-				    pgoff_t index, void *entry)
+static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
+			       void *entry)
 {
 	struct radix_tree_node *node;
 	void **slot;
 
-	/* Handled by shmem itself */
-	if (shmem_mapping(mapping))
-		return;
-
-	if (dax_mapping(mapping)) {
-		dax_delete_mapping_entry(mapping, index);
-		return;
-	}
 	spin_lock_irq(&mapping->tree_lock);
 	/*
 	 * Regular page slots are stabilized by the page lock even
@@ -55,6 +47,56 @@ unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 }
 
+/*
+ * Unconditionally remove exceptional entry. Usually called from truncate path.
+ */
+static void truncate_exceptional_entry(struct address_space *mapping,
+				       pgoff_t index, void *entry)
+{
+	/* Handled by shmem itself */
+	if (shmem_mapping(mapping))
+		return;
+
+	if (dax_mapping(mapping)) {
+		dax_delete_mapping_entry(mapping, index);
+		return;
+	}
+	clear_shadow_entry(mapping, index, entry);
+}
+
+/*
+ * Invalidate exceptional entry if easily possible. This handles exceptional
+ * entries for invalidate_inode_pages() so for DAX it evicts only unlocked and
+ * clean entries.
+ */
+static int invalidate_exceptional_entry(struct address_space *mapping,
+					pgoff_t index, void *entry)
+{
+	/* Handled by shmem itself */
+	if (shmem_mapping(mapping))
+		return 1;
+	if (dax_mapping(mapping))
+		return dax_invalidate_mapping_entry(mapping, index);
+	clear_shadow_entry(mapping, index, entry);
+	return 1;
+}
+
+/*
+ * Invalidate exceptional entry if clean. This handles exceptional entries for
+ * invalidate_inode_pages2() so for DAX it evicts only clean entries.
+ */
+static int invalidate_exceptional_entry2(struct address_space *mapping,
+					 pgoff_t index, void *entry)
+{
+	/* Handled by shmem itself */
+	if (shmem_mapping(mapping))
+		return 1;
+	if (dax_mapping(mapping))
+		return dax_invalidate_mapping_entry_sync(mapping, index);
+	clear_shadow_entry(mapping, index, entry);
+	return 1;
+}
+
 /**
  * do_invalidatepage - invalidate part or all of a page
  * @page: the page which is affected
@@ -262,7 +304,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
-				clear_exceptional_entry(mapping, index, page);
+				truncate_exceptional_entry(mapping, index,
+							   page);
 				continue;
 			}
 
@@ -351,7 +394,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			}
 
 			if (radix_tree_exceptional_entry(page)) {
-				clear_exceptional_entry(mapping, index, page);
+				truncate_exceptional_entry(mapping, index,
+							   page);
 				continue;
 			}
 
@@ -470,7 +514,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
-				clear_exceptional_entry(mapping, index, page);
+				invalidate_exceptional_entry(mapping, index,
+							     page);
 				continue;
 			}
 
@@ -592,7 +637,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
-				clear_exceptional_entry(mapping, index, page);
+				if (!invalidate_exceptional_entry2(mapping,
+								   index, page))
+					ret = -EBUSY;
 				continue;
 			}
 
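Taken together with the mm/truncate.c helpers above, the three DAX entry points differ only in how hard they try to evict an entry. The sketch below summarizes the caller-visible contract; example_drop_dax_entry() is illustrative only and not part of the patch.

/*
 * Illustrative only: which DAX call each eviction path maps onto and what
 * the return value means (1 = entry gone, 0 = entry left in place).
 */
static int example_drop_dax_entry(struct address_space *mapping, pgoff_t index,
                                  bool truncating, bool may_block)
{
        if (truncating)
                /* truncate/punch hole: waits for a locked entry, drops it even if dirty */
                return dax_delete_mapping_entry(mapping, index);
        if (!may_block)
                /* invalidate_inode_pages(): skips locked or dirty entries */
                return dax_invalidate_mapping_entry(mapping, index);
        /* invalidate_inode_pages2(): waits for the entry lock but keeps dirty entries */
        return dax_invalidate_mapping_entry_sync(mapping, index);
}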
