author		Jan Kara <jack@suse.cz>				2016-05-12 12:29:18 -0400
committer	Ross Zwisler <ross.zwisler@linux.intel.com>	2016-05-19 17:20:54 -0400
commit		ac401cc782429cc8560ce4840b1405d603740917
tree		44deea39b147b4f2e75286943e2ec1c838e7a2fa
parent		4f622938a5e2b7f1374ffb1e5fc212744898f513
dax: New fault locking
Currently DAX page fault locking is racy.

CPU0 (write fault)                          CPU1 (read fault)

__dax_fault()                               __dax_fault()
  get_block(inode, block, &bh, 0) -> not mapped
                                              get_block(inode, block, &bh, 0) -> not mapped
  if (!buffer_mapped(&bh))
    if (vmf->flags & FAULT_FLAG_WRITE)
      get_block(inode, block, &bh, 1) -> allocates blocks
  if (page) -> no
                                              if (!buffer_mapped(&bh))
                                                if (vmf->flags & FAULT_FLAG_WRITE) {
                                                } else {
                                                  dax_load_hole();
                                                }
  dax_insert_mapping()

And we are in a situation where we fail in dax_radix_entry() with -EIO.

Another problem with the current DAX page fault locking is that there is no
race-free way to clear the dirty tag in the radix tree. We can always end up
with a clean radix tree and dirty data in the CPU cache.

We fix the first problem by introducing locking of exceptional radix tree
entries in DAX mappings, acting very similarly to the page lock and thus
properly synchronizing faults against the same mapping index. The same lock
can later be used to avoid races when clearing the radix tree dirty tag.

Reviewed-by: NeilBrown <neilb@suse.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
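To make the scheme concrete before the patch itself: each exceptional radix
tree entry carries a lock bit, and a task that finds the entry locked sleeps
on one of a small number of hashed wait queues, so two faults on the same
mapping index serialize instead of racing through get_block(). The following
is a minimal user-space model of that protocol, for illustration only: the
pthread mutex and condition variables stand in for mapping->tree_lock and the
DAX wait table, and the names (grab_entry, put_entry, entry_waitqueue) are
invented for this sketch, not kernel API.

/*
 * Minimal user-space model of the per-index entry lock: a lock bit stored
 * in the entry itself, plus a small hashed table of wait queues shared by
 * all indices. Illustrative only; not the kernel implementation.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define ENTRY_LOCK	1UL		/* models RADIX_DAX_ENTRY_LOCK */
#define WAIT_TABLE_BITS	4
#define WAIT_TABLE_SIZE	(1 << WAIT_TABLE_BITS)
#define NR_ENTRIES	16

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER; /* models mapping->tree_lock */
static unsigned long entries[NR_ENTRIES];		      /* models radix tree slots */
static pthread_cond_t wait_table[WAIT_TABLE_SIZE];	      /* models the DAX wait table */

static pthread_cond_t *entry_waitqueue(unsigned long index)
{
	return &wait_table[index % WAIT_TABLE_SIZE];	/* hash index -> wait queue */
}

/* Lock the entry at @index, sleeping until the current holder drops it. */
static void grab_entry(unsigned long index)
{
	pthread_mutex_lock(&tree_lock);
	while (entries[index] & ENTRY_LOCK)
		pthread_cond_wait(entry_waitqueue(index), &tree_lock);
	entries[index] |= ENTRY_LOCK;
	pthread_mutex_unlock(&tree_lock);
}

/* Unlock the entry and wake anyone parked on the same hash bucket. */
static void put_entry(unsigned long index)
{
	pthread_mutex_lock(&tree_lock);
	entries[index] &= ~ENTRY_LOCK;
	pthread_mutex_unlock(&tree_lock);
	pthread_cond_broadcast(entry_waitqueue(index));
}

static void *fault(void *arg)
{
	const char *who = arg;

	grab_entry(3);			/* both threads fault on index 3 */
	printf("%s: entry locked, doing block lookup/allocation\n", who);
	usleep(100 * 1000);		/* pretend to call get_block() etc. */
	printf("%s: done, unlocking entry\n", who);
	put_entry(3);
	return NULL;
}

int main(void)
{
	pthread_t a, b;
	int i;

	for (i = 0; i < WAIT_TABLE_SIZE; i++)
		pthread_cond_init(&wait_table[i], NULL);

	pthread_create(&a, NULL, fault, "write fault");
	pthread_create(&b, NULL, fault, "read fault");
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

Built with cc -pthread, the two simulated faults never overlap inside the
locked section even though they share a hashed wait queue; waiters recheck
their own entry's lock bit after each wakeup, which mirrors how the patch
below handles wakeups delivered for other indices in the same bucket.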
-rw-r--r--	fs/dax.c		553
-rw-r--r--	include/linux/dax.h	3
-rw-r--r--	mm/filemap.c		9
-rw-r--r--	mm/truncate.c		62
4 files changed, 447 insertions, 180 deletions
diff --git a/fs/dax.c b/fs/dax.c
index 351afd3cf8be..f43c3d806fb6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -46,6 +46,30 @@
46 RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \ 46 RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
47 RADIX_TREE_EXCEPTIONAL_ENTRY)) 47 RADIX_TREE_EXCEPTIONAL_ENTRY))
48 48
49/* We choose 4096 entries - same as per-zone page wait tables */
50#define DAX_WAIT_TABLE_BITS 12
51#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
52
53wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
54
55static int __init init_dax_wait_table(void)
56{
57 int i;
58
59 for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
60 init_waitqueue_head(wait_table + i);
61 return 0;
62}
63fs_initcall(init_dax_wait_table);
64
65static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
66 pgoff_t index)
67{
68 unsigned long hash = hash_long((unsigned long)mapping ^ index,
69 DAX_WAIT_TABLE_BITS);
70 return wait_table + hash;
71}
72
49static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) 73static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
50{ 74{
51 struct request_queue *q = bdev->bd_queue; 75 struct request_queue *q = bdev->bd_queue;
@@ -268,6 +292,263 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
268EXPORT_SYMBOL_GPL(dax_do_io); 292EXPORT_SYMBOL_GPL(dax_do_io);
269 293
270/* 294/*
295 * DAX radix tree locking
296 */
297struct exceptional_entry_key {
298 struct address_space *mapping;
299 unsigned long index;
300};
301
302struct wait_exceptional_entry_queue {
303 wait_queue_t wait;
304 struct exceptional_entry_key key;
305};
306
307static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
308 int sync, void *keyp)
309{
310 struct exceptional_entry_key *key = keyp;
311 struct wait_exceptional_entry_queue *ewait =
312 container_of(wait, struct wait_exceptional_entry_queue, wait);
313
314 if (key->mapping != ewait->key.mapping ||
315 key->index != ewait->key.index)
316 return 0;
317 return autoremove_wake_function(wait, mode, sync, NULL);
318}
319
320/*
321 * Check whether the given slot is locked. The function must be called with
322 * mapping->tree_lock held
323 */
324static inline int slot_locked(struct address_space *mapping, void **slot)
325{
326 unsigned long entry = (unsigned long)
327 radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
328 return entry & RADIX_DAX_ENTRY_LOCK;
329}
330
331/*
332 * Mark the given slot is locked. The function must be called with
333 * mapping->tree_lock held
334 */
335static inline void *lock_slot(struct address_space *mapping, void **slot)
336{
337 unsigned long entry = (unsigned long)
338 radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
339
340 entry |= RADIX_DAX_ENTRY_LOCK;
341 radix_tree_replace_slot(slot, (void *)entry);
342 return (void *)entry;
343}
344
345/*
346 * Mark the given slot is unlocked. The function must be called with
347 * mapping->tree_lock held
348 */
349static inline void *unlock_slot(struct address_space *mapping, void **slot)
350{
351 unsigned long entry = (unsigned long)
352 radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
353
354 entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
355 radix_tree_replace_slot(slot, (void *)entry);
356 return (void *)entry;
357}
358
359/*
360 * Lookup entry in radix tree, wait for it to become unlocked if it is
361 * exceptional entry and return it. The caller must call
362 * put_unlocked_mapping_entry() when he decided not to lock the entry or
363 * put_locked_mapping_entry() when he locked the entry and now wants to
364 * unlock it.
365 *
366 * The function must be called with mapping->tree_lock held.
367 */
368static void *get_unlocked_mapping_entry(struct address_space *mapping,
369 pgoff_t index, void ***slotp)
370{
371 void *ret, **slot;
372 struct wait_exceptional_entry_queue ewait;
373 wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
374
375 init_wait(&ewait.wait);
376 ewait.wait.func = wake_exceptional_entry_func;
377 ewait.key.mapping = mapping;
378 ewait.key.index = index;
379
380 for (;;) {
381 ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
382 &slot);
383 if (!ret || !radix_tree_exceptional_entry(ret) ||
384 !slot_locked(mapping, slot)) {
385 if (slotp)
386 *slotp = slot;
387 return ret;
388 }
389 prepare_to_wait_exclusive(wq, &ewait.wait,
390 TASK_UNINTERRUPTIBLE);
391 spin_unlock_irq(&mapping->tree_lock);
392 schedule();
393 finish_wait(wq, &ewait.wait);
394 spin_lock_irq(&mapping->tree_lock);
395 }
396}
397
398/*
399 * Find radix tree entry at given index. If it points to a page, return with
400 * the page locked. If it points to the exceptional entry, return with the
401 * radix tree entry locked. If the radix tree doesn't contain given index,
402 * create empty exceptional entry for the index and return with it locked.
403 *
404 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
405 * persistent memory the benefit is doubtful. We can add that later if we can
406 * show it helps.
407 */
408static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
409{
410 void *ret, **slot;
411
412restart:
413 spin_lock_irq(&mapping->tree_lock);
414 ret = get_unlocked_mapping_entry(mapping, index, &slot);
415 /* No entry for given index? Make sure radix tree is big enough. */
416 if (!ret) {
417 int err;
418
419 spin_unlock_irq(&mapping->tree_lock);
420 err = radix_tree_preload(
421 mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
422 if (err)
423 return ERR_PTR(err);
424 ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
425 RADIX_DAX_ENTRY_LOCK);
426 spin_lock_irq(&mapping->tree_lock);
427 err = radix_tree_insert(&mapping->page_tree, index, ret);
428 radix_tree_preload_end();
429 if (err) {
430 spin_unlock_irq(&mapping->tree_lock);
431 /* Someone already created the entry? */
432 if (err == -EEXIST)
433 goto restart;
434 return ERR_PTR(err);
435 }
436 /* Good, we have inserted empty locked entry into the tree. */
437 mapping->nrexceptional++;
438 spin_unlock_irq(&mapping->tree_lock);
439 return ret;
440 }
441 /* Normal page in radix tree? */
442 if (!radix_tree_exceptional_entry(ret)) {
443 struct page *page = ret;
444
445 get_page(page);
446 spin_unlock_irq(&mapping->tree_lock);
447 lock_page(page);
448 /* Page got truncated? Retry... */
449 if (unlikely(page->mapping != mapping)) {
450 unlock_page(page);
451 put_page(page);
452 goto restart;
453 }
454 return page;
455 }
456 ret = lock_slot(mapping, slot);
457 spin_unlock_irq(&mapping->tree_lock);
458 return ret;
459}
460
461void dax_wake_mapping_entry_waiter(struct address_space *mapping,
462 pgoff_t index, bool wake_all)
463{
464 wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
465
466 /*
467 * Checking for locked entry and prepare_to_wait_exclusive() happens
468 * under mapping->tree_lock, ditto for entry handling in our callers.
469 * So at this point all tasks that could have seen our entry locked
470 * must be in the waitqueue and the following check will see them.
471 */
472 if (waitqueue_active(wq)) {
473 struct exceptional_entry_key key;
474
475 key.mapping = mapping;
476 key.index = index;
477 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
478 }
479}
480
481static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
482{
483 void *ret, **slot;
484
485 spin_lock_irq(&mapping->tree_lock);
486 ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
487 if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
488 !slot_locked(mapping, slot))) {
489 spin_unlock_irq(&mapping->tree_lock);
490 return;
491 }
492 unlock_slot(mapping, slot);
493 spin_unlock_irq(&mapping->tree_lock);
494 dax_wake_mapping_entry_waiter(mapping, index, false);
495}
496
497static void put_locked_mapping_entry(struct address_space *mapping,
498 pgoff_t index, void *entry)
499{
500 if (!radix_tree_exceptional_entry(entry)) {
501 unlock_page(entry);
502 put_page(entry);
503 } else {
504 unlock_mapping_entry(mapping, index);
505 }
506}
507
508/*
509 * Called when we are done with radix tree entry we looked up via
510 * get_unlocked_mapping_entry() and which we didn't lock in the end.
511 */
512static void put_unlocked_mapping_entry(struct address_space *mapping,
513 pgoff_t index, void *entry)
514{
515 if (!radix_tree_exceptional_entry(entry))
516 return;
517
518 /* We have to wake up next waiter for the radix tree entry lock */
519 dax_wake_mapping_entry_waiter(mapping, index, false);
520}
521
522/*
523 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
524 * entry to get unlocked before deleting it.
525 */
526int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
527{
528 void *entry;
529
530 spin_lock_irq(&mapping->tree_lock);
531 entry = get_unlocked_mapping_entry(mapping, index, NULL);
532 /*
533 * This gets called from truncate / punch_hole path. As such, the caller
534 * must hold locks protecting against concurrent modifications of the
535 * radix tree (usually fs-private i_mmap_sem for writing). Since the
536 * caller has seen exceptional entry for this index, we better find it
537 * at that index as well...
538 */
539 if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
540 spin_unlock_irq(&mapping->tree_lock);
541 return 0;
542 }
543 radix_tree_delete(&mapping->page_tree, index);
544 mapping->nrexceptional--;
545 spin_unlock_irq(&mapping->tree_lock);
546 dax_wake_mapping_entry_waiter(mapping, index, true);
547
548 return 1;
549}
550
551/*
271 * The user has performed a load from a hole in the file. Allocating 552 * The user has performed a load from a hole in the file. Allocating
272 * a new page in the file would cause excessive storage usage for 553 * a new page in the file would cause excessive storage usage for
273 * workloads with sparse files. We allocate a page cache page instead. 554 * workloads with sparse files. We allocate a page cache page instead.
@@ -275,15 +556,24 @@ EXPORT_SYMBOL_GPL(dax_do_io);
275 * otherwise it will simply fall out of the page cache under memory 556 * otherwise it will simply fall out of the page cache under memory
276 * pressure without ever having been dirtied. 557 * pressure without ever having been dirtied.
277 */ 558 */
278static int dax_load_hole(struct address_space *mapping, struct page *page, 559static int dax_load_hole(struct address_space *mapping, void *entry,
279 struct vm_fault *vmf) 560 struct vm_fault *vmf)
280{ 561{
281 if (!page) 562 struct page *page;
282 page = find_or_create_page(mapping, vmf->pgoff, 563
283 GFP_KERNEL | __GFP_ZERO); 564 /* Hole page already exists? Return it... */
284 if (!page) 565 if (!radix_tree_exceptional_entry(entry)) {
285 return VM_FAULT_OOM; 566 vmf->page = entry;
567 return VM_FAULT_LOCKED;
568 }
286 569
570 /* This will replace locked radix tree entry with a hole page */
571 page = find_or_create_page(mapping, vmf->pgoff,
572 vmf->gfp_mask | __GFP_ZERO);
573 if (!page) {
574 put_locked_mapping_entry(mapping, vmf->pgoff, entry);
575 return VM_FAULT_OOM;
576 }
287 vmf->page = page; 577 vmf->page = page;
288 return VM_FAULT_LOCKED; 578 return VM_FAULT_LOCKED;
289} 579}
@@ -307,77 +597,72 @@ static int copy_user_bh(struct page *to, struct inode *inode,
307 return 0; 597 return 0;
308} 598}
309 599
310#define NO_SECTOR -1
311#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) 600#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
312 601
313static int dax_radix_entry(struct address_space *mapping, pgoff_t index, 602static void *dax_insert_mapping_entry(struct address_space *mapping,
314 sector_t sector, bool pmd_entry, bool dirty) 603 struct vm_fault *vmf,
604 void *entry, sector_t sector)
315{ 605{
316 struct radix_tree_root *page_tree = &mapping->page_tree; 606 struct radix_tree_root *page_tree = &mapping->page_tree;
317 pgoff_t pmd_index = DAX_PMD_INDEX(index); 607 int error = 0;
318 int type, error = 0; 608 bool hole_fill = false;
319 void *entry; 609 void *new_entry;
610 pgoff_t index = vmf->pgoff;
320 611
321 WARN_ON_ONCE(pmd_entry && !dirty); 612 if (vmf->flags & FAULT_FLAG_WRITE)
322 if (dirty)
323 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 613 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
324 614
325 spin_lock_irq(&mapping->tree_lock); 615 /* Replacing hole page with block mapping? */
326 616 if (!radix_tree_exceptional_entry(entry)) {
327 entry = radix_tree_lookup(page_tree, pmd_index); 617 hole_fill = true;
328 if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { 618 /*
329 index = pmd_index; 619 * Unmap the page now before we remove it from page cache below.
330 goto dirty; 620 * The page is locked so it cannot be faulted in again.
621 */
622 unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
623 PAGE_SIZE, 0);
624 error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
625 if (error)
626 return ERR_PTR(error);
331 } 627 }
332 628
333 entry = radix_tree_lookup(page_tree, index); 629 spin_lock_irq(&mapping->tree_lock);
334 if (entry) { 630 new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
335 type = RADIX_DAX_TYPE(entry); 631 RADIX_DAX_ENTRY_LOCK);
336 if (WARN_ON_ONCE(type != RADIX_DAX_PTE && 632 if (hole_fill) {
337 type != RADIX_DAX_PMD)) { 633 __delete_from_page_cache(entry, NULL);
338 error = -EIO; 634 /* Drop pagecache reference */
635 put_page(entry);
636 error = radix_tree_insert(page_tree, index, new_entry);
637 if (error) {
638 new_entry = ERR_PTR(error);
339 goto unlock; 639 goto unlock;
340 } 640 }
641 mapping->nrexceptional++;
642 } else {
643 void **slot;
644 void *ret;
341 645
342 if (!pmd_entry || type == RADIX_DAX_PMD) 646 ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
343 goto dirty; 647 WARN_ON_ONCE(ret != entry);
344 648 radix_tree_replace_slot(slot, new_entry);
345 /*
346 * We only insert dirty PMD entries into the radix tree. This
347 * means we don't need to worry about removing a dirty PTE
348 * entry and inserting a clean PMD entry, thus reducing the
349 * range we would flush with a follow-up fsync/msync call.
350 */
351 radix_tree_delete(&mapping->page_tree, index);
352 mapping->nrexceptional--;
353 }
354
355 if (sector == NO_SECTOR) {
356 /*
357 * This can happen during correct operation if our pfn_mkwrite
358 * fault raced against a hole punch operation. If this
359 * happens the pte that was hole punched will have been
360 * unmapped and the radix tree entry will have been removed by
361 * the time we are called, but the call will still happen. We
362 * will return all the way up to wp_pfn_shared(), where the
363 * pte_same() check will fail, eventually causing page fault
364 * to be retried by the CPU.
365 */
366 goto unlock;
367 } 649 }
368 650 if (vmf->flags & FAULT_FLAG_WRITE)
369 error = radix_tree_insert(page_tree, index,
370 RADIX_DAX_ENTRY(sector, pmd_entry));
371 if (error)
372 goto unlock;
373
374 mapping->nrexceptional++;
375 dirty:
376 if (dirty)
377 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); 651 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
378 unlock: 652 unlock:
379 spin_unlock_irq(&mapping->tree_lock); 653 spin_unlock_irq(&mapping->tree_lock);
380 return error; 654 if (hole_fill) {
655 radix_tree_preload_end();
656 /*
657 * We don't need hole page anymore, it has been replaced with
658 * locked radix tree entry now.
659 */
660 if (mapping->a_ops->freepage)
661 mapping->a_ops->freepage(entry);
662 unlock_page(entry);
663 put_page(entry);
664 }
665 return new_entry;
381} 666}
382 667
383static int dax_writeback_one(struct block_device *bdev, 668static int dax_writeback_one(struct block_device *bdev,
@@ -503,17 +788,19 @@ int dax_writeback_mapping_range(struct address_space *mapping,
503} 788}
504EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); 789EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
505 790
506static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, 791static int dax_insert_mapping(struct address_space *mapping,
792 struct buffer_head *bh, void **entryp,
507 struct vm_area_struct *vma, struct vm_fault *vmf) 793 struct vm_area_struct *vma, struct vm_fault *vmf)
508{ 794{
509 unsigned long vaddr = (unsigned long)vmf->virtual_address; 795 unsigned long vaddr = (unsigned long)vmf->virtual_address;
510 struct address_space *mapping = inode->i_mapping;
511 struct block_device *bdev = bh->b_bdev; 796 struct block_device *bdev = bh->b_bdev;
512 struct blk_dax_ctl dax = { 797 struct blk_dax_ctl dax = {
513 .sector = to_sector(bh, inode), 798 .sector = to_sector(bh, mapping->host),
514 .size = bh->b_size, 799 .size = bh->b_size,
515 }; 800 };
516 int error; 801 int error;
802 void *ret;
803 void *entry = *entryp;
517 804
518 i_mmap_lock_read(mapping); 805 i_mmap_lock_read(mapping);
519 806
@@ -523,16 +810,16 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
523 } 810 }
524 dax_unmap_atomic(bdev, &dax); 811 dax_unmap_atomic(bdev, &dax);
525 812
526 error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, 813 ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
527 vmf->flags & FAULT_FLAG_WRITE); 814 if (IS_ERR(ret)) {
528 if (error) 815 error = PTR_ERR(ret);
529 goto out; 816 goto out;
817 }
818 *entryp = ret;
530 819
531 error = vm_insert_mixed(vma, vaddr, dax.pfn); 820 error = vm_insert_mixed(vma, vaddr, dax.pfn);
532
533 out: 821 out:
534 i_mmap_unlock_read(mapping); 822 i_mmap_unlock_read(mapping);
535
536 return error; 823 return error;
537} 824}
538 825
@@ -552,7 +839,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
552 struct file *file = vma->vm_file; 839 struct file *file = vma->vm_file;
553 struct address_space *mapping = file->f_mapping; 840 struct address_space *mapping = file->f_mapping;
554 struct inode *inode = mapping->host; 841 struct inode *inode = mapping->host;
555 struct page *page; 842 void *entry;
556 struct buffer_head bh; 843 struct buffer_head bh;
557 unsigned long vaddr = (unsigned long)vmf->virtual_address; 844 unsigned long vaddr = (unsigned long)vmf->virtual_address;
558 unsigned blkbits = inode->i_blkbits; 845 unsigned blkbits = inode->i_blkbits;
@@ -561,6 +848,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
561 int error; 848 int error;
562 int major = 0; 849 int major = 0;
563 850
851 /*
852 * Check whether offset isn't beyond end of file now. Caller is supposed
853 * to hold locks serializing us with truncate / punch hole so this is
854 * a reliable test.
855 */
564 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 856 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
565 if (vmf->pgoff >= size) 857 if (vmf->pgoff >= size)
566 return VM_FAULT_SIGBUS; 858 return VM_FAULT_SIGBUS;
@@ -570,40 +862,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
570 bh.b_bdev = inode->i_sb->s_bdev; 862 bh.b_bdev = inode->i_sb->s_bdev;
571 bh.b_size = PAGE_SIZE; 863 bh.b_size = PAGE_SIZE;
572 864
573 repeat: 865 entry = grab_mapping_entry(mapping, vmf->pgoff);
574 page = find_get_page(mapping, vmf->pgoff); 866 if (IS_ERR(entry)) {
575 if (page) { 867 error = PTR_ERR(entry);
576 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { 868 goto out;
577 put_page(page);
578 return VM_FAULT_RETRY;
579 }
580 if (unlikely(page->mapping != mapping)) {
581 unlock_page(page);
582 put_page(page);
583 goto repeat;
584 }
585 } 869 }
586 870
587 error = get_block(inode, block, &bh, 0); 871 error = get_block(inode, block, &bh, 0);
588 if (!error && (bh.b_size < PAGE_SIZE)) 872 if (!error && (bh.b_size < PAGE_SIZE))
589 error = -EIO; /* fs corruption? */ 873 error = -EIO; /* fs corruption? */
590 if (error) 874 if (error)
591 goto unlock_page; 875 goto unlock_entry;
592
593 if (!buffer_mapped(&bh) && !vmf->cow_page) {
594 if (vmf->flags & FAULT_FLAG_WRITE) {
595 error = get_block(inode, block, &bh, 1);
596 count_vm_event(PGMAJFAULT);
597 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
598 major = VM_FAULT_MAJOR;
599 if (!error && (bh.b_size < PAGE_SIZE))
600 error = -EIO;
601 if (error)
602 goto unlock_page;
603 } else {
604 return dax_load_hole(mapping, page, vmf);
605 }
606 }
607 876
608 if (vmf->cow_page) { 877 if (vmf->cow_page) {
609 struct page *new_page = vmf->cow_page; 878 struct page *new_page = vmf->cow_page;
@@ -612,30 +881,37 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
612 else 881 else
613 clear_user_highpage(new_page, vaddr); 882 clear_user_highpage(new_page, vaddr);
614 if (error) 883 if (error)
615 goto unlock_page; 884 goto unlock_entry;
616 vmf->page = page; 885 if (!radix_tree_exceptional_entry(entry)) {
617 if (!page) 886 vmf->page = entry;
887 } else {
888 unlock_mapping_entry(mapping, vmf->pgoff);
618 i_mmap_lock_read(mapping); 889 i_mmap_lock_read(mapping);
890 vmf->page = NULL;
891 }
619 return VM_FAULT_LOCKED; 892 return VM_FAULT_LOCKED;
620 } 893 }
621 894
622 /* Check we didn't race with a read fault installing a new page */ 895 if (!buffer_mapped(&bh)) {
623 if (!page && major) 896 if (vmf->flags & FAULT_FLAG_WRITE) {
624 page = find_lock_page(mapping, vmf->pgoff); 897 error = get_block(inode, block, &bh, 1);
625 898 count_vm_event(PGMAJFAULT);
626 if (page) { 899 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
627 unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, 900 major = VM_FAULT_MAJOR;
628 PAGE_SIZE, 0); 901 if (!error && (bh.b_size < PAGE_SIZE))
629 delete_from_page_cache(page); 902 error = -EIO;
630 unlock_page(page); 903 if (error)
631 put_page(page); 904 goto unlock_entry;
632 page = NULL; 905 } else {
906 return dax_load_hole(mapping, entry, vmf);
907 }
633 } 908 }
634 909
635 /* Filesystem should not return unwritten buffers to us! */ 910 /* Filesystem should not return unwritten buffers to us! */
636 WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); 911 WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
637 error = dax_insert_mapping(inode, &bh, vma, vmf); 912 error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
638 913 unlock_entry:
914 put_locked_mapping_entry(mapping, vmf->pgoff, entry);
639 out: 915 out:
640 if (error == -ENOMEM) 916 if (error == -ENOMEM)
641 return VM_FAULT_OOM | major; 917 return VM_FAULT_OOM | major;
@@ -643,13 +919,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
643 if ((error < 0) && (error != -EBUSY)) 919 if ((error < 0) && (error != -EBUSY))
644 return VM_FAULT_SIGBUS | major; 920 return VM_FAULT_SIGBUS | major;
645 return VM_FAULT_NOPAGE | major; 921 return VM_FAULT_NOPAGE | major;
646
647 unlock_page:
648 if (page) {
649 unlock_page(page);
650 put_page(page);
651 }
652 goto out;
653} 922}
654EXPORT_SYMBOL(__dax_fault); 923EXPORT_SYMBOL(__dax_fault);
655 924
@@ -718,7 +987,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
718 struct block_device *bdev; 987 struct block_device *bdev;
719 pgoff_t size, pgoff; 988 pgoff_t size, pgoff;
720 sector_t block; 989 sector_t block;
721 int error, result = 0; 990 int result = 0;
722 bool alloc = false; 991 bool alloc = false;
723 992
724 /* dax pmd mappings require pfn_t_devmap() */ 993 /* dax pmd mappings require pfn_t_devmap() */
@@ -865,13 +1134,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
865 * the write to insert a dirty entry. 1134 * the write to insert a dirty entry.
866 */ 1135 */
867 if (write) { 1136 if (write) {
868 error = dax_radix_entry(mapping, pgoff, dax.sector, 1137 /*
869 true, true); 1138 * We should insert radix-tree entry and dirty it here.
870 if (error) { 1139 * For now this is broken...
871 dax_pmd_dbg(&bh, address, 1140 */
872 "PMD radix insertion failed");
873 goto fallback;
874 }
875 } 1141 }
876 1142
877 dev_dbg(part_to_dev(bdev->bd_part), 1143 dev_dbg(part_to_dev(bdev->bd_part),
@@ -931,23 +1197,18 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
931int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1197int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
932{ 1198{
933 struct file *file = vma->vm_file; 1199 struct file *file = vma->vm_file;
934 int error; 1200 struct address_space *mapping = file->f_mapping;
935 1201 void *entry;
936 /* 1202 pgoff_t index = vmf->pgoff;
937 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
938 * RADIX_DAX_PTE entry already exists in the radix tree from a
939 * previous call to __dax_fault(). We just want to look up that PTE
940 * entry using vmf->pgoff and make sure the dirty tag is set. This
941 * saves us from having to make a call to get_block() here to look
942 * up the sector.
943 */
944 error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
945 true);
946 1203
947 if (error == -ENOMEM) 1204 spin_lock_irq(&mapping->tree_lock);
948 return VM_FAULT_OOM; 1205 entry = get_unlocked_mapping_entry(mapping, index, NULL);
949 if (error) 1206 if (!entry || !radix_tree_exceptional_entry(entry))
950 return VM_FAULT_SIGBUS; 1207 goto out;
1208 radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
1209 put_unlocked_mapping_entry(mapping, index, entry);
1210out:
1211 spin_unlock_irq(&mapping->tree_lock);
951 return VM_FAULT_NOPAGE; 1212 return VM_FAULT_NOPAGE;
952} 1213}
953EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); 1214EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index aa148937bb3f..756625c6d0dd 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -15,6 +15,9 @@ int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
15int dax_truncate_page(struct inode *, loff_t from, get_block_t); 15int dax_truncate_page(struct inode *, loff_t from, get_block_t);
16int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); 16int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
17int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); 17int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
18int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
19void dax_wake_mapping_entry_waiter(struct address_space *mapping,
20 pgoff_t index, bool wake_all);
18 21
19#ifdef CONFIG_FS_DAX 22#ifdef CONFIG_FS_DAX
20struct page *read_dax_sector(struct block_device *bdev, sector_t n); 23struct page *read_dax_sector(struct block_device *bdev, sector_t n);
diff --git a/mm/filemap.c b/mm/filemap.c
index dfe55c2cfb34..7b9a4b180cae 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -160,13 +160,15 @@ static void page_cache_tree_delete(struct address_space *mapping,
160 return; 160 return;
161 161
162 /* 162 /*
163 * Track node that only contains shadow entries. 163 * Track node that only contains shadow entries. DAX mappings contain
164 * no shadow entries and may contain other exceptional entries so skip
165 * those.
164 * 166 *
165 * Avoid acquiring the list_lru lock if already tracked. The 167 * Avoid acquiring the list_lru lock if already tracked. The
166 * list_empty() test is safe as node->private_list is 168 * list_empty() test is safe as node->private_list is
167 * protected by mapping->tree_lock. 169 * protected by mapping->tree_lock.
168 */ 170 */
169 if (!workingset_node_pages(node) && 171 if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
170 list_empty(&node->private_list)) { 172 list_empty(&node->private_list)) {
171 node->private_data = mapping; 173 node->private_data = mapping;
172 list_lru_add(&workingset_shadow_nodes, &node->private_list); 174 list_lru_add(&workingset_shadow_nodes, &node->private_list);
@@ -611,6 +613,9 @@ static int page_cache_tree_insert(struct address_space *mapping,
611 /* DAX accounts exceptional entries as normal pages */ 613 /* DAX accounts exceptional entries as normal pages */
612 if (node) 614 if (node)
613 workingset_node_pages_dec(node); 615 workingset_node_pages_dec(node);
616 /* Wakeup waiters for exceptional entry lock */
617 dax_wake_mapping_entry_waiter(mapping, page->index,
618 false);
614 } 619 }
615 } 620 }
616 radix_tree_replace_slot(slot, page); 621 radix_tree_replace_slot(slot, page);
diff --git a/mm/truncate.c b/mm/truncate.c
index b00272810871..4064f8f53daa 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -34,40 +34,38 @@ static void clear_exceptional_entry(struct address_space *mapping,
34 if (shmem_mapping(mapping)) 34 if (shmem_mapping(mapping))
35 return; 35 return;
36 36
37 spin_lock_irq(&mapping->tree_lock);
38
39 if (dax_mapping(mapping)) { 37 if (dax_mapping(mapping)) {
40 if (radix_tree_delete_item(&mapping->page_tree, index, entry)) 38 dax_delete_mapping_entry(mapping, index);
41 mapping->nrexceptional--; 39 return;
42 } else {
43 /*
44 * Regular page slots are stabilized by the page lock even
45 * without the tree itself locked. These unlocked entries
46 * need verification under the tree lock.
47 */
48 if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
49 &slot))
50 goto unlock;
51 if (*slot != entry)
52 goto unlock;
53 radix_tree_replace_slot(slot, NULL);
54 mapping->nrexceptional--;
55 if (!node)
56 goto unlock;
57 workingset_node_shadows_dec(node);
58 /*
59 * Don't track node without shadow entries.
60 *
61 * Avoid acquiring the list_lru lock if already untracked.
62 * The list_empty() test is safe as node->private_list is
63 * protected by mapping->tree_lock.
64 */
65 if (!workingset_node_shadows(node) &&
66 !list_empty(&node->private_list))
67 list_lru_del(&workingset_shadow_nodes,
68 &node->private_list);
69 __radix_tree_delete_node(&mapping->page_tree, node);
70 } 40 }
41 spin_lock_irq(&mapping->tree_lock);
42 /*
43 * Regular page slots are stabilized by the page lock even
44 * without the tree itself locked. These unlocked entries
45 * need verification under the tree lock.
46 */
47 if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
48 &slot))
49 goto unlock;
50 if (*slot != entry)
51 goto unlock;
52 radix_tree_replace_slot(slot, NULL);
53 mapping->nrexceptional--;
54 if (!node)
55 goto unlock;
56 workingset_node_shadows_dec(node);
57 /*
58 * Don't track node without shadow entries.
59 *
60 * Avoid acquiring the list_lru lock if already untracked.
61 * The list_empty() test is safe as node->private_list is
62 * protected by mapping->tree_lock.
63 */
64 if (!workingset_node_shadows(node) &&
65 !list_empty(&node->private_list))
66 list_lru_del(&workingset_shadow_nodes,
67 &node->private_list);
68 __radix_tree_delete_node(&mapping->page_tree, node);
71unlock: 69unlock:
72 spin_unlock_irq(&mapping->tree_lock); 70 spin_unlock_irq(&mapping->tree_lock);
73} 71}