author	Ross Zwisler <ross.zwisler@linux.intel.com>	2017-05-12 18:47:00 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-05-12 18:57:16 -0400
commit	876f29460cbd4086b43475890c1bf2488fa11d40 (patch)
tree	fc4fdd4ca0fd5d391ccab5b6aa7323800c60555f
parent	13e451fdc1af05568ea379d71c02a126295d2244 (diff)
dax: fix PMD data corruption when fault races with write
This is based on a patch from Jan Kara that fixed the equivalent race in
the DAX PTE fault path.

Currently DAX PMD read fault can race with write(2) in the following way:

CPU1 - write(2)                 CPU2 - read fault
                                dax_iomap_pmd_fault()
                                  ->iomap_begin() - sees hole

dax_iomap_rw()
  iomap_apply()
    ->iomap_begin - allocates blocks
    dax_iomap_actor()
      invalidate_inode_pages2_range()
        - there's nothing to invalidate

                                  grab_mapping_entry()
                                  - we add huge zero page to the radix tree
                                    and map it to page tables

The result is that hole page is mapped into page tables (and thus zeros
are seen in mmap) while file has data written in that place.

Fix the problem by locking exception entry before mapping blocks for the
fault.  That way we are sure invalidate_inode_pages2_range() call for
racing write will either block on entry lock waiting for the fault to
finish (and unmap stale page tables after that) or read fault will see
already allocated blocks by write(2).

Fixes: 9f141d6ef6258 ("dax: Call ->iomap_begin without entry lock during dax fault")
Link: http://lkml.kernel.org/r/20170510172700.18991-1-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
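As an illustration only, here is a minimal user-space sketch of the ordering argument above, using a pthread mutex as a stand-in for the exceptional-entry lock taken by grab_mapping_entry(). The flags blocks_allocated and zero_page_mapped are invented for this sketch and merely model the state in the race diagram; the real fix is the reordering in fs/dax.c shown below. Because the fault side now checks for a hole only while holding the lock, the writer's invalidation either waits for the fault to finish (and then tears down the stale zero-page mapping) or the fault already sees the allocated blocks.

/* build: cc -pthread race_sketch.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t entry_lock = PTHREAD_MUTEX_INITIALIZER; /* models the entry lock */
static atomic_bool blocks_allocated = false;  /* models blocks allocated by write(2)     */
static bool zero_page_mapped = false;         /* models a zero page installed by a fault */

/* Read-fault side: with the fix, the entry is locked before the block lookup. */
static void *read_fault(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&entry_lock);         /* grab_mapping_entry() analogue */
	if (!atomic_load(&blocks_allocated))     /* ->iomap_begin() sees a hole   */
		zero_page_mapped = true;         /* dax_pmd_load_hole() analogue  */
	pthread_mutex_unlock(&entry_lock);
	return NULL;
}

/* write(2) side: allocate blocks, then invalidate stale mappings under the lock. */
static void *writer(void *arg)
{
	(void)arg;
	atomic_store(&blocks_allocated, true);   /* write path allocates blocks */
	pthread_mutex_lock(&entry_lock);         /* invalidation now waits for a racing fault */
	zero_page_mapped = false;                /* stale zero-page mapping torn down */
	pthread_mutex_unlock(&entry_lock);
	return NULL;
}

int main(void)
{
	pthread_t fault_thread, write_thread;

	pthread_create(&fault_thread, NULL, read_fault, NULL);
	pthread_create(&write_thread, NULL, writer, NULL);
	pthread_join(fault_thread, NULL);
	pthread_join(write_thread, NULL);

	/*
	 * With the fixed ordering, the final state is never both
	 * blocks_allocated and zero_page_mapped at once, i.e. mmap can no
	 * longer observe zeros over data that write(2) already wrote.
	 */
	printf("blocks_allocated=%d zero_page_mapped=%d\n",
	       atomic_load(&blocks_allocated), zero_page_mapped);
	return 0;
}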
-rw-r--r--	fs/dax.c | 28 ++++++++++++++--------------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 32f020c9cedf..93ae87297ffa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1388,6 +1388,16 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 		goto fallback;
 
 	/*
+	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
+	 * the tree, for instance), it will return -EEXIST and we just fall
+	 * back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		goto fallback;
+
+	/*
 	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
 	 * setting up a mapping, so really we're using iomap_begin() as a way
 	 * to look up our filesystem block.
@@ -1395,21 +1405,11 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	pos = (loff_t)pgoff << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
-		goto fallback;
+		goto unlock_entry;
 
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
-	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
-	 */
-	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-	if (IS_ERR(entry))
-		goto finish_iomap;
-
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
 		result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
@@ -1417,7 +1417,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
-			goto unlock_entry;
+			break;
 		result = dax_pmd_load_hole(vmf, &iomap, &entry);
 		break;
 	default:
@@ -1425,8 +1425,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 		break;
 	}
 
- unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
 		int copied = PMD_SIZE;
@@ -1442,6 +1440,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
 				&iomap);
 	}
+ unlock_entry:
+	put_locked_mapping_entry(mapping, pgoff, entry);
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, vmf->pmd, vmf->address);