author		Matthew Wilcox <willy@linux.intel.com>	2015-09-08 17:59:25 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-09-08 18:35:28 -0400
commit		843172978bb92997310d2f7fbc172ece423cfc02 (patch)
tree		3dd6214e78238293b1ac4612e7fc35775b567da0 /fs
parent		01a33b4ace68bc35679a347f21d5ed6e222e30dc (diff)
dax: fix race between simultaneous faults
If two threads write-fault on the same hole at the same time, the winner
of the race will return to userspace and complete their store, only to
have the loser overwrite their store with zeroes. Fix this for now by
taking the i_mmap_sem for write instead of read, and do so outside the
call to get_block(). Now the loser of the race will see the block has
already been zeroed, and will not zero it again.

This severely limits our scalability. I have ideas for improving it, but
those can wait for a later patch.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
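The ordering described above is easiest to picture with a small standalone analogue. The following is a minimal userspace sketch, not the kernel code: the names race_sketch.c, write_fault_then_store, block_allocated and i_mmap_lock are hypothetical stand-ins for the DAX fault path, the get_block() allocation state and the i_mmap lock. It shows the patched ordering, in which the writer lock is taken before the allocation check, so the losing thread finds the block already zeroed and never wipes out the winner's store.

/*
 * Hypothetical userspace analogue of the fix (illustrative only, not
 * kernel code): block allocation/zeroing is serialized under a writer
 * lock, so the losing thread sees the block already zeroed and leaves
 * the winner's store intact.
 *
 * Build with: cc -pthread race_sketch.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static pthread_rwlock_t i_mmap_lock = PTHREAD_RWLOCK_INITIALIZER;
static bool block_allocated;		/* stands in for get_block() state */
static char block[4096];		/* stands in for the backing block */

static void *write_fault_then_store(void *arg)
{
	long idx = (long)arg;

	/* Fault path, patched ordering: take the lock for write *before*
	 * the allocation check, so only one thread ever zeroes the block. */
	pthread_rwlock_wrlock(&i_mmap_lock);
	if (!block_allocated) {
		memset(block, 0, sizeof(block));	/* "allocate" and zero */
		block_allocated = true;
	}
	pthread_rwlock_unlock(&i_mmap_lock);

	/* Userspace completes its store after the fault returns; with the
	 * old read-lock ordering, the other thread's zeroing could land
	 * here and wipe it out. */
	block[idx] = (char)('A' + idx);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, write_fault_then_store, (void *)0L);
	pthread_create(&t2, NULL, write_fault_then_store, (void *)1L);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);

	/* Both stores survive: prints block[0]=A block[1]=B. */
	printf("block[0]=%c block[1]=%c\n", block[0], block[1]);
	return 0;
}

The diff below applies the same ordering to __dax_fault() and __dax_pmd_fault(): i_mmap_lock_write() is taken before the call to get_block() and dropped again on the way out, which closes the race at the cost of serializing faults on the mapping, the scalability limit the description acknowledges.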
Diffstat (limited to 'fs')
-rw-r--r--	fs/dax.c	33
1 file changed, 17 insertions(+), 16 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index c694117a7062..9593f4bee327 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -272,7 +272,6 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 			struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	struct address_space *mapping = inode->i_mapping;
 	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
 	void *addr;
@@ -280,8 +279,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	pgoff_t size;
 	int error;
 
-	i_mmap_lock_read(mapping);
-
 	/*
 	 * Check truncate didn't happen while we were allocating a block.
 	 * If it did, this block may or may not be still allocated to the
@@ -309,8 +306,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	error = vm_insert_mixed(vma, vaddr, pfn);
 
  out:
-	i_mmap_unlock_read(mapping);
-
 	return error;
 }
 
@@ -372,15 +367,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			 * from a read fault and we've raced with a truncate
 			 */
 			error = -EIO;
-			goto unlock_page;
+			goto unlock;
 		}
+	} else {
+		i_mmap_lock_write(mapping);
 	}
 
 	error = get_block(inode, block, &bh, 0);
 	if (!error && (bh.b_size < PAGE_SIZE))
 		error = -EIO;		/* fs corruption? */
 	if (error)
-		goto unlock_page;
+		goto unlock;
 
 	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
 		if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -391,8 +388,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			if (!error && (bh.b_size < PAGE_SIZE))
 				error = -EIO;
 			if (error)
-				goto unlock_page;
+				goto unlock;
 		} else {
+			i_mmap_unlock_write(mapping);
 			return dax_load_hole(mapping, page, vmf);
 		}
 	}
@@ -404,17 +402,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		else
 			clear_user_highpage(new_page, vaddr);
 		if (error)
-			goto unlock_page;
+			goto unlock;
 		vmf->page = page;
 		if (!page) {
-			i_mmap_lock_read(mapping);
 			/* Check we didn't race with truncate */
 			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
 								PAGE_SHIFT;
 			if (vmf->pgoff >= size) {
-				i_mmap_unlock_read(mapping);
 				error = -EIO;
-				goto out;
+				goto unlock;
 			}
 		}
 		return VM_FAULT_LOCKED;
@@ -450,6 +446,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
 	}
 
+	if (!page)
+		i_mmap_unlock_write(mapping);
  out:
 	if (error == -ENOMEM)
 		return VM_FAULT_OOM | major;
@@ -458,11 +456,14 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		return VM_FAULT_SIGBUS | major;
 	return VM_FAULT_NOPAGE | major;
 
- unlock_page:
+ unlock:
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
+	} else {
+		i_mmap_unlock_write(mapping);
 	}
+
 	goto out;
 }
 EXPORT_SYMBOL(__dax_fault);
@@ -540,10 +541,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
 	bh.b_size = PMD_SIZE;
+	i_mmap_lock_write(mapping);
 	length = get_block(inode, block, &bh, write);
 	if (length)
 		return VM_FAULT_SIGBUS;
-	i_mmap_lock_read(mapping);
 
 	/*
 	 * If the filesystem isn't willing to tell us the length of a hole,
@@ -607,11 +608,11 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	}
 
  out:
-	i_mmap_unlock_read(mapping);
-
 	if (buffer_unwritten(&bh))
 		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
 
+	i_mmap_unlock_write(mapping);
+
 	return result;
 
  fallback: