 fs/dax.c            | 378
 include/linux/dax.h |  55
 mm/filemap.c        |   3
 3 files changed, 386 insertions(+), 50 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 0582c7c2ae40..281e91a63367 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -76,6 +76,26 @@ static void dax_unmap_atomic(struct block_device *bdev,
 	blk_queue_exit(bdev->bd_queue);
 }
 
+static int dax_is_pmd_entry(void *entry)
+{
+	return (unsigned long)entry & RADIX_DAX_PMD;
+}
+
+static int dax_is_pte_entry(void *entry)
+{
+	return !((unsigned long)entry & RADIX_DAX_PMD);
+}
+
+static int dax_is_zero_entry(void *entry)
+{
+	return (unsigned long)entry & RADIX_DAX_HZP;
+}
+
+static int dax_is_empty_entry(void *entry)
+{
+	return (unsigned long)entry & RADIX_DAX_EMPTY;
+}
+
 struct page *read_dax_sector(struct block_device *bdev, sector_t n)
 {
 	struct page *page = alloc_pages(GFP_KERNEL, 0);
@@ -281,7 +301,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
 	 * queue to the start of that PMD. This ensures that all offsets in
 	 * the range covered by the PMD map to the same bit lock.
 	 */
-	if (RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
+	if (dax_is_pmd_entry(entry))
 		index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
 
 	key->mapping = mapping;
@@ -413,36 +433,116 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
  * radix tree entry locked. If the radix tree doesn't contain given index,
  * create empty exceptional entry for the index and return with it locked.
  *
+ * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
+ * either return that locked entry or will return an error. This error will
+ * happen if there are any 4k entries (either zero pages or DAX entries)
+ * within the 2MiB range that we are requesting.
+ *
+ * We always favor 4k entries over 2MiB entries. There isn't a flow where we
+ * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
+ * insertion will fail if it finds any 4k entries already in the tree, and a
+ * 4k insertion will cause an existing 2MiB entry to be unmapped and
+ * downgraded to 4k entries. This happens for both 2MiB huge zero pages as
+ * well as 2MiB empty entries.
+ *
+ * The exception to this downgrade path is for 2MiB DAX PMD entries that have
+ * real storage backing them. We will leave these real 2MiB DAX entries in
+ * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
+ *
  * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
  * persistent memory the benefit is doubtful. We can add that later if we can
  * show it helps.
  */
-static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
+static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
+		unsigned long size_flag)
 {
+	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
 	void *entry, **slot;
 
 restart:
 	spin_lock_irq(&mapping->tree_lock);
 	entry = get_unlocked_mapping_entry(mapping, index, &slot);
+
+	if (entry) {
+		if (size_flag & RADIX_DAX_PMD) {
+			if (!radix_tree_exceptional_entry(entry) ||
+					dax_is_pte_entry(entry)) {
+				put_unlocked_mapping_entry(mapping, index,
+						entry);
+				entry = ERR_PTR(-EEXIST);
+				goto out_unlock;
+			}
+		} else { /* trying to grab a PTE entry */
+			if (radix_tree_exceptional_entry(entry) &&
+					dax_is_pmd_entry(entry) &&
+					(dax_is_zero_entry(entry) ||
+					 dax_is_empty_entry(entry))) {
+				pmd_downgrade = true;
+			}
+		}
+	}
+
 	/* No entry for given index? Make sure radix tree is big enough. */
-	if (!entry) {
+	if (!entry || pmd_downgrade) {
 		int err;
 
+		if (pmd_downgrade) {
+			/*
+			 * Make sure 'entry' remains valid while we drop
+			 * mapping->tree_lock.
+			 */
+			entry = lock_slot(mapping, slot);
+		}
+
 		spin_unlock_irq(&mapping->tree_lock);
 		err = radix_tree_preload(
 				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
-		if (err)
+		if (err) {
+			if (pmd_downgrade)
+				put_locked_mapping_entry(mapping, index, entry);
 			return ERR_PTR(err);
-		entry = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
-			       RADIX_DAX_ENTRY_LOCK);
+		}
+
+		/*
+		 * Besides huge zero pages the only other thing that gets
+		 * downgraded are empty entries which don't need to be
+		 * unmapped.
+		 */
+		if (pmd_downgrade && dax_is_zero_entry(entry))
+			unmap_mapping_range(mapping,
+				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+
 		spin_lock_irq(&mapping->tree_lock);
-		err = radix_tree_insert(&mapping->page_tree, index, entry);
+
+		if (pmd_downgrade) {
+			radix_tree_delete(&mapping->page_tree, index);
+			mapping->nrexceptional--;
+			dax_wake_mapping_entry_waiter(mapping, index, entry,
+					true);
+		}
+
+		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
+
+		err = __radix_tree_insert(&mapping->page_tree, index,
+				dax_radix_order(entry), entry);
 		radix_tree_preload_end();
 		if (err) {
 			spin_unlock_irq(&mapping->tree_lock);
-			/* Someone already created the entry? */
-			if (err == -EEXIST)
+			/*
+			 * Someone already created the entry? This is a
+			 * normal failure when inserting PMDs in a range
+			 * that already contains PTEs. In that case we want
+			 * to return -EEXIST immediately.
+			 */
+			if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD))
 				goto restart;
+			/*
+			 * Our insertion of a DAX PMD entry failed, most
+			 * likely because it collided with a PTE sized entry
+			 * at a different index in the PMD range. We haven't
+			 * inserted anything into the radix tree and have no
+			 * waiters to wake.
+			 */
 			return ERR_PTR(err);
 		}
 		/* Good, we have inserted empty locked entry into the tree. */
@@ -466,6 +566,7 @@ restart:
 		return page;
 	}
 	entry = lock_slot(mapping, slot);
+ out_unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 	return entry;
 }
@@ -473,9 +574,9 @@ restart:
 /*
  * We do not necessarily hold the mapping->tree_lock when we call this
  * function so it is possible that 'entry' is no longer a valid item in the
- * radix tree. This is okay, though, because all we really need to do is to
- * find the correct waitqueue where tasks might be sleeping waiting for that
- * old 'entry' and wake them.
+ * radix tree. This is okay because all we really need to do is to find the
+ * correct waitqueue where tasks might be waiting for that old 'entry' and
+ * wake them.
  */
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 		pgoff_t index, void *entry, bool wake_all)
@@ -588,11 +689,17 @@ static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size
 	return 0;
 }
 
-#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
-
+/*
+ * By this point grab_mapping_entry() has ensured that we have a locked entry
+ * of the appropriate size so we don't have to worry about downgrading PMDs to
+ * PTEs. If we happen to be trying to insert a PTE and there is a PMD
+ * already in the tree, we will skip the insertion and just dirty the PMD as
+ * appropriate.
+ */
 static void *dax_insert_mapping_entry(struct address_space *mapping,
 				      struct vm_fault *vmf,
-				      void *entry, sector_t sector)
+				      void *entry, sector_t sector,
+				      unsigned long flags)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
 	int error = 0;
@@ -615,22 +722,35 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
 		if (error)
 			return ERR_PTR(error);
+	} else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
+		/* replacing huge zero page with PMD block mapping */
+		unmap_mapping_range(mapping,
+			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
 	}
 
 	spin_lock_irq(&mapping->tree_lock);
-	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
-		       RADIX_DAX_ENTRY_LOCK);
+	new_entry = dax_radix_locked_entry(sector, flags);
+
 	if (hole_fill) {
 		__delete_from_page_cache(entry, NULL);
 		/* Drop pagecache reference */
 		put_page(entry);
-		error = radix_tree_insert(page_tree, index, new_entry);
+		error = __radix_tree_insert(page_tree, index,
+				dax_radix_order(new_entry), new_entry);
 		if (error) {
 			new_entry = ERR_PTR(error);
 			goto unlock;
 		}
 		mapping->nrexceptional++;
-	} else {
+	} else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+		/*
+		 * Only swap our new entry into the radix tree if the current
+		 * entry is a zero page or an empty entry. If a normal PTE or
+		 * PMD entry is already in the tree, we leave it alone. This
+		 * means that if we are trying to insert a PTE and the
+		 * existing entry is a PMD, we will just leave the PMD in the
+		 * tree and dirty it if necessary.
+		 */
 		void **slot;
 		void *ret;
 
@@ -660,7 +780,6 @@ static int dax_writeback_one(struct block_device *bdev,
 		struct address_space *mapping, pgoff_t index, void *entry)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	int type = RADIX_DAX_TYPE(entry);
 	struct radix_tree_node *node;
 	struct blk_dax_ctl dax;
 	void **slot;
@@ -681,13 +800,21 @@ static int dax_writeback_one(struct block_device *bdev,
 	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
 		goto unlock;
 
-	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
+	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
+				dax_is_zero_entry(entry))) {
 		ret = -EIO;
 		goto unlock;
 	}
 
-	dax.sector = RADIX_DAX_SECTOR(entry);
-	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
+	/*
+	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
+	 * in the middle of a PMD, the 'index' we are given will be aligned to
+	 * the start index of the PMD, as will the sector we pull from
+	 * 'entry'. This allows us to flush for PMD_SIZE and not have to
+	 * worry about partial PMD writebacks.
+	 */
+	dax.sector = dax_radix_sector(entry);
+	dax.size = PAGE_SIZE << dax_radix_order(entry);
 	spin_unlock_irq(&mapping->tree_lock);
 
 	/*
@@ -726,12 +853,11 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 		struct block_device *bdev, struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
-	pgoff_t start_index, end_index, pmd_index;
+	pgoff_t start_index, end_index;
 	pgoff_t indices[PAGEVEC_SIZE];
 	struct pagevec pvec;
 	bool done = false;
 	int i, ret = 0;
-	void *entry;
 
 	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
 		return -EIO;
@@ -741,15 +867,6 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 
 	start_index = wbc->range_start >> PAGE_SHIFT;
 	end_index = wbc->range_end >> PAGE_SHIFT;
-	pmd_index = DAX_PMD_INDEX(start_index);
-
-	rcu_read_lock();
-	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
-	rcu_read_unlock();
-
-	/* see if the start of our range is covered by a PMD entry */
-	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
-		start_index = pmd_index;
 
 	tag_pages_for_writeback(mapping, start_index, end_index);
 
@@ -794,7 +911,7 @@ static int dax_insert_mapping(struct address_space *mapping,
 		return PTR_ERR(dax.addr);
 	dax_unmap_atomic(bdev, &dax);
 
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
 	if (IS_ERR(ret))
 		return PTR_ERR(ret);
 	*entryp = ret;
@@ -841,7 +958,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	bh.b_bdev = inode->i_sb->s_bdev;
 	bh.b_size = PAGE_SIZE;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff);
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
 	if (IS_ERR(entry)) {
 		error = PTR_ERR(entry);
 		goto out;
@@ -1162,7 +1279,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (pos >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff);
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
 	if (IS_ERR(entry)) {
 		error = PTR_ERR(entry);
 		goto out;
@@ -1264,4 +1381,191 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	return VM_FAULT_NOPAGE | major;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
+
+#ifdef CONFIG_FS_DAX_PMD
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
+ * more often than one might expect in the below functions.
+ */
+#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
+
+static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
+		struct vm_fault *vmf, unsigned long address,
+		struct iomap *iomap, loff_t pos, bool write, void **entryp)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct block_device *bdev = iomap->bdev;
+	struct blk_dax_ctl dax = {
+		.sector = dax_iomap_sector(iomap, pos),
+		.size = PMD_SIZE,
+	};
+	long length = dax_map_atomic(bdev, &dax);
+	void *ret;
+
+	if (length < 0) /* dax_map_atomic() failed */
+		return VM_FAULT_FALLBACK;
+	if (length < PMD_SIZE)
+		goto unmap_fallback;
+	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
+		goto unmap_fallback;
+	if (!pfn_t_devmap(dax.pfn))
+		goto unmap_fallback;
+
+	dax_unmap_atomic(bdev, &dax);
+
+	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
+			RADIX_DAX_PMD);
+	if (IS_ERR(ret))
+		return VM_FAULT_FALLBACK;
+	*entryp = ret;
+
+	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
+
+ unmap_fallback:
+	dax_unmap_atomic(bdev, &dax);
+	return VM_FAULT_FALLBACK;
+}
+
+static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
+		struct vm_fault *vmf, unsigned long address,
+		struct iomap *iomap, void **entryp)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	unsigned long pmd_addr = address & PMD_MASK;
+	struct page *zero_page;
+	spinlock_t *ptl;
+	pmd_t pmd_entry;
+	void *ret;
+
+	zero_page = mm_get_huge_zero_page(vma->vm_mm);
+
+	if (unlikely(!zero_page))
+		return VM_FAULT_FALLBACK;
+
+	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
+			RADIX_DAX_PMD | RADIX_DAX_HZP);
+	if (IS_ERR(ret))
+		return VM_FAULT_FALLBACK;
+	*entryp = ret;
+
+	ptl = pmd_lock(vma->vm_mm, pmd);
+	if (!pmd_none(*pmd)) {
+		spin_unlock(ptl);
+		return VM_FAULT_FALLBACK;
+	}
+
+	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
+	pmd_entry = pmd_mkhuge(pmd_entry);
+	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
+	spin_unlock(ptl);
+	return VM_FAULT_NOPAGE;
+}
+
+int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	unsigned long pmd_addr = address & PMD_MASK;
+	bool write = flags & FAULT_FLAG_WRITE;
+	unsigned int iomap_flags = write ? IOMAP_WRITE : 0;
+	struct inode *inode = mapping->host;
+	int result = VM_FAULT_FALLBACK;
+	struct iomap iomap = { 0 };
+	pgoff_t max_pgoff, pgoff;
+	struct vm_fault vmf;
+	void *entry;
+	loff_t pos;
+	int error;
+
+	/* Fall back to PTEs if we're going to COW */
+	if (write && !(vma->vm_flags & VM_SHARED))
+		goto fallback;
+
+	/* If the PMD would extend outside the VMA */
+	if (pmd_addr < vma->vm_start)
+		goto fallback;
+	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+		goto fallback;
+
+	/*
+	 * Check whether offset isn't beyond end of file now. Caller is
+	 * supposed to hold locks serializing us with truncate / punch hole so
+	 * this is a reliable test.
+	 */
+	pgoff = linear_page_index(vma, pmd_addr);
+	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+
+	if (pgoff > max_pgoff)
+		return VM_FAULT_SIGBUS;
+
+	/* If the PMD would extend beyond the file size */
+	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
+		goto fallback;
+
+	/*
+	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+	 * PMD or a HZP entry. If it can't (because a 4k page is already in
+	 * the tree, for instance), it will return -EEXIST and we just fall
+	 * back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		goto fallback;
+
+	/*
+	 * Note that we don't use iomap_apply here. We aren't doing I/O, only
+	 * setting up a mapping, so really we're using iomap_begin() as a way
+	 * to look up our filesystem block.
+	 */
+	pos = (loff_t)pgoff << PAGE_SHIFT;
+	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
+	if (error)
+		goto unlock_entry;
+	if (iomap.offset + iomap.length < pos + PMD_SIZE)
+		goto finish_iomap;
+
+	vmf.pgoff = pgoff;
+	vmf.flags = flags;
+	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
+
+	switch (iomap.type) {
+	case IOMAP_MAPPED:
+		result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
+				&iomap, pos, write, &entry);
+		break;
+	case IOMAP_UNWRITTEN:
+	case IOMAP_HOLE:
+		if (WARN_ON_ONCE(write))
+			goto finish_iomap;
+		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
+				&entry);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
+ finish_iomap:
+	if (ops->iomap_end) {
+		if (result == VM_FAULT_FALLBACK) {
+			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
+					&iomap);
+		} else {
+			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
+					iomap_flags, &iomap);
+			if (error)
+				result = VM_FAULT_FALLBACK;
+		}
+	}
+ unlock_entry:
+	put_locked_mapping_entry(mapping, pgoff, entry);
+ fallback:
+	if (result == VM_FAULT_FALLBACK) {
+		split_huge_pmd(vma, pmd, address);
+		count_vm_event(THP_FAULT_FALLBACK);
+	}
+	return result;
+}
+EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
+#endif /* CONFIG_FS_DAX_PMD */
 #endif /* CONFIG_FS_IOMAP */
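
The PMD fault path above leans on PG_PMD_COLOUR for several distinct checks: that the block mapping's PFN starts on a 2MiB boundary, and that a whole PMD's worth of pages fits inside the file. Below is a minimal userspace sketch of those checks, assuming the usual x86_64 geometry (4k pages, 2MiB PMDs); the constants and sample values in main() are illustrative only and are not part of the patch.

/* Standalone sketch of the alignment checks, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12UL			/* assumed: 4k pages */
#define PMD_SHIFT	21UL			/* assumed: 2MiB PMDs */
#define PMD_SIZE	(1UL << PMD_SHIFT)
/* low bits of a page offset within one PMD, as in the patch */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

/* Mirrors the "PMD would extend beyond the file size" test. */
static bool pmd_fits_in_file(unsigned long pgoff, unsigned long max_pgoff)
{
	return (pgoff | PG_PMD_COLOUR) <= max_pgoff;
}

/* Mirrors the PFN colour test: the mapping must start on a 2MiB boundary. */
static bool pfn_is_pmd_aligned(unsigned long pfn)
{
	return (pfn & PG_PMD_COLOUR) == 0;
}

int main(void)
{
	unsigned long max_pgoff = 1000;	/* file of 1001 pages, ~3.9MiB */
	unsigned long pgoff = 512;	/* start of the second 2MiB range */

	printf("PG_PMD_COLOUR = 0x%lx\n", PG_PMD_COLOUR);
	printf("pgoff %lu: PMD fits in file? %s\n", pgoff,
	       pmd_fits_in_file(pgoff, max_pgoff) ? "yes" : "no (fallback)");
	printf("pfn 0x200 aligned? %s, pfn 0x201 aligned? %s\n",
	       pfn_is_pmd_aligned(0x200) ? "yes" : "no",
	       pfn_is_pmd_aligned(0x201) ? "yes" : "no");
	return 0;
}

With these sample values the file covers pages 0-1000, so pgoff 512 cannot be served by a PMD (the range would need pages up to 1023) and the fault would fall back to PTEs, which is exactly the "(pgoff | PG_PMD_COLOUR) > max_pgoff" branch above.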
diff --git a/include/linux/dax.h b/include/linux/dax.h
index e9ea78c1cf98..8d1a5c47945f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -9,20 +9,32 @@
 struct iomap_ops;
 
 /*
- * We use lowest available bit in exceptional entry for locking, other two
- * bits to determine entry type. In total 3 special bits.
+ * We use lowest available bit in exceptional entry for locking, one bit for
+ * the entry size (PMD) and two more to tell us if the entry is a huge zero
+ * page (HZP) or an empty entry that is just used for locking. In total four
+ * special bits.
+ *
+ * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
+ * EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
+ * block allocation.
  */
-#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
+#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
 #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
-#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
-#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
-#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
-#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
-		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
-		RADIX_TREE_EXCEPTIONAL_ENTRY))
+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
 
+static inline unsigned long dax_radix_sector(void *entry)
+{
+	return (unsigned long)entry >> RADIX_DAX_SHIFT;
+}
+
+static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
+{
+	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
+			((unsigned long)sector << RADIX_DAX_SHIFT) |
+			RADIX_DAX_ENTRY_LOCK);
+}
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		struct iomap_ops *ops);
@@ -67,6 +79,27 @@ static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 	return VM_FAULT_FALLBACK;
 }
 
+#ifdef CONFIG_FS_DAX_PMD
+static inline unsigned int dax_radix_order(void *entry)
+{
+	if ((unsigned long)entry & RADIX_DAX_PMD)
+		return PMD_SHIFT - PAGE_SHIFT;
+	return 0;
+}
+int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops);
+#else
+static inline unsigned int dax_radix_order(void *entry)
+{
+	return 0;
+}
+static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
+		unsigned long address, pmd_t *pmd, unsigned int flags,
+		struct iomap_ops *ops)
+{
+	return VM_FAULT_FALLBACK;
+}
+#endif
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
 #define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
 
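
The new entry encoding packs the sector into the bits above RADIX_DAX_SHIFT and keeps the lock, PMD, HZP and EMPTY flags in the low bits. Here is a small userspace sketch of that packing, assuming RADIX_TREE_EXCEPTIONAL_ENTRY and RADIX_TREE_EXCEPTIONAL_SHIFT are both 2 (their usual values in radix-tree.h of this era) and x86_64 page geometry; it only mirrors the helpers above for illustration and is not a drop-in for the kernel versions.

/* Standalone sketch of the DAX radix tree entry layout, not kernel code. */
#include <stdio.h>

typedef unsigned long long sector_t;

#define RADIX_TREE_EXCEPTIONAL_ENTRY	2UL	/* assumed value */
#define RADIX_TREE_EXCEPTIONAL_SHIFT	2UL	/* assumed value */

#define RADIX_DAX_SHIFT		(RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
#define RADIX_DAX_ENTRY_LOCK	(1UL << RADIX_TREE_EXCEPTIONAL_SHIFT)
#define RADIX_DAX_PMD		(1UL << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_HZP		(1UL << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_EMPTY		(1UL << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))

#define PMD_SHIFT	21	/* assumed: 2MiB PMDs */
#define PAGE_SHIFT	12	/* assumed: 4k pages */

/* same packing as dax_radix_locked_entry() above */
static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
{
	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
			((unsigned long)sector << RADIX_DAX_SHIFT) |
			RADIX_DAX_ENTRY_LOCK);
}

static unsigned long dax_radix_sector(void *entry)
{
	return (unsigned long)entry >> RADIX_DAX_SHIFT;
}

static unsigned int dax_radix_order(void *entry)
{
	if ((unsigned long)entry & RADIX_DAX_PMD)
		return PMD_SHIFT - PAGE_SHIFT;
	return 0;
}

int main(void)
{
	/* encode a locked PMD entry backed by sector 0x1000 */
	void *entry = dax_radix_locked_entry(0x1000, RADIX_DAX_PMD);

	printf("entry   = %#lx\n", (unsigned long)entry);
	printf("sector  = %#lx\n", dax_radix_sector(entry));
	printf("order   = %u\n", dax_radix_order(entry));
	printf("locked? = %s\n",
	       ((unsigned long)entry & RADIX_DAX_ENTRY_LOCK) ? "yes" : "no");
	return 0;
}

Decoding recovers sector 0x1000 and order 9 (512 pages), which is how dax_writeback_one() above computes a PMD-sized flush with "PAGE_SIZE << dax_radix_order(entry)".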
diff --git a/mm/filemap.c b/mm/filemap.c
index 1ffb7dcd1b5d..00ab94a882de 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -137,8 +137,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
 	} else {
 		/* DAX can replace empty locked entry with a hole */
 		WARN_ON_ONCE(p !=
-				(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
-					 RADIX_DAX_ENTRY_LOCK));
+				dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
 		/* DAX accounts exceptional entries as normal pages */
 		if (node)
 			workingset_node_pages_dec(node);