author     Ross Zwisler <ross.zwisler@linux.intel.com>  2017-09-06 19:18:43 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-09-06 20:27:24 -0400
commit     91d25ba8a6b0d810dc844cebeedc53029118ce3e
tree       c8cb66c28c7603cafab060bc57b55c0cef98c36e  /fs/dax.c
parent     e30331ff05f689f8f2faeb51664299c4d7841f15
dax: use common 4k zero page for dax mmap reads

When servicing mmap() reads from file holes the current DAX code allocates
a page cache page of all zeroes and places the struct page pointer in the
mapping->page_tree radix tree.  This has three major drawbacks:

1) It consumes memory unnecessarily.  For every 4k page that is read via a
   DAX mmap() over a hole, we allocate a new page cache page.  This means
   that if you read 1GiB worth of pages, you end up using 1GiB of zeroed
   memory.  This is easily visible by looking at the overall memory
   consumption of the system or by looking at /proc/[pid]/smaps:

   7f62e72b3000-7f63272b3000 rw-s 00000000 103:00 12  /root/dax/data
   Size:            1048576 kB
   Rss:             1048576 kB
   Pss:             1048576 kB
   Shared_Clean:          0 kB
   Shared_Dirty:          0 kB
   Private_Clean:   1048576 kB
   Private_Dirty:         0 kB
   Referenced:      1048576 kB
   Anonymous:             0 kB
   LazyFree:              0 kB
   AnonHugePages:         0 kB
   ShmemPmdMapped:        0 kB
   Shared_Hugetlb:        0 kB
   Private_Hugetlb:       0 kB
   Swap:                  0 kB
   SwapPss:               0 kB
   KernelPageSize:        4 kB
   MMUPageSize:           4 kB
   Locked:                0 kB

2) It is slower than using a common zero page because each page fault has
   more work to do.  Instead of just inserting a common zero page we have
   to allocate a page cache page, zero it, and then insert it.  Here are
   the average latencies of dax_load_hole() as measured by ftrace on a
   random test box:

   Old method, using zeroed page cache pages:  3.4 us
   New method, using the common 4k zero page:  0.8 us

   This was the average latency over 1 GiB of sequential reads done by
   this simple fio script:

   [global]
   size=1G
   filename=/root/dax/data
   fallocate=none
   [io]
   rw=read
   ioengine=mmap

3) The fact that we had to check for both DAX exceptional entries and for
   page cache pages in the radix tree made the DAX code more complex.

Solve these issues by following the lead of the DAX PMD code and using a
common 4k zero page instead.  As with the PMD code we will now insert a
DAX exceptional entry into the radix tree instead of a struct page
pointer, which allows us to remove all the special casing in the DAX code.

Note that we do still pretty aggressively check for regular pages in the
DAX radix tree, especially where we take action based on the bits set in
the page.  If we ever find a regular page in our radix tree now, that most
likely means that someone besides DAX is inserting pages (which has
happened lots of times in the past), and we want to find that out early
and fail loudly.

This solution also removes the extra memory consumption.  Here is that
same /proc/[pid]/smaps after 1GiB of reading from a hole with the new
code:

   7f2054a74000-7f2094a74000 rw-s 00000000 103:00 12  /root/dax/data
   Size:            1048576 kB
   Rss:                   0 kB
   Pss:                   0 kB
   Shared_Clean:          0 kB
   Shared_Dirty:          0 kB
   Private_Clean:         0 kB
   Private_Dirty:         0 kB
   Referenced:            0 kB
   Anonymous:             0 kB
   LazyFree:              0 kB
   AnonHugePages:         0 kB
   ShmemPmdMapped:        0 kB
   Shared_Hugetlb:        0 kB
   Private_Hugetlb:       0 kB
   Swap:                  0 kB
   SwapPss:               0 kB
   KernelPageSize:        4 kB
   MMUPageSize:           4 kB
   Locked:                0 kB

Overall system memory consumption is similarly improved.

Another major change is that we remove dax_pfn_mkwrite() from our fault
flow, and instead rely on the page fault itself to make the PTE dirty and
writeable.  The following description from the patch adding the
vm_insert_mixed_mkwrite() call explains this a little more:

   "To be able to use the common 4k zero page in DAX we need to have our
   PTE fault path look more like our PMD fault path, where a PTE entry can
   be marked as dirty and writeable as it is first inserted rather than
   waiting for a follow-up dax_pfn_mkwrite() => finish_mkwrite_fault()
   call.

   Right now we can rely on having a dax_pfn_mkwrite() call because we can
   distinguish between these two cases in do_wp_page():

      case 1: 4k zero page => writable DAX storage
      case 2: read-only DAX storage => writeable DAX storage

   This distinction is made via vm_normal_page().  vm_normal_page()
   returns false for the common 4k zero page, though, just as it does for
   DAX ptes.  Instead of special casing the DAX + 4k zero page case, we
   will simplify our DAX PTE page fault sequence so that it matches our
   DAX PMD sequence, and get rid of the dax_pfn_mkwrite() helper.  We will
   instead use dax_iomap_fault() to handle write-protection faults.

   This means that insert_pfn() needs to follow the lead of
   insert_pfn_pmd() and allow us to pass in a 'mkwrite' flag.  If
   'mkwrite' is set, insert_pfn() will do the work that was previously
   done by wp_page_reuse() as part of the dax_pfn_mkwrite() call path"

Link: http://lkml.kernel.org/r/20170724170616.25810-4-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
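[Editor's illustration, not part of the patch] A minimal user-space sketch of the workload the smaps and fio numbers above describe. It mmap()s a sparse file (assumed to live on a DAX-capable filesystem; the path below is hypothetical), reads every 4k page of a 1 GiB hole (which, with this change, should map the shared 4k zero page read-only and leave Rss near zero in /proc/[pid]/smaps), then performs a single write, which re-faults the page onto real DAX storage as described in the quoted text above.

   /* dax-hole-demo.c: build with "cc -o dax-hole-demo dax-hole-demo.c" */
   #include <fcntl.h>
   #include <stdio.h>
   #include <stdlib.h>
   #include <sys/mman.h>
   #include <unistd.h>

   int main(void)
   {
   	const char *path = "/root/dax/data";	/* hypothetical DAX file */
   	size_t len = 1UL << 30;			/* 1 GiB, matches the fio job */
   	int fd = open(path, O_RDWR | O_CREAT, 0600);

   	if (fd < 0 || ftruncate(fd, len) < 0) {	/* sparse file: all hole */
   		perror("open/ftruncate");
   		return EXIT_FAILURE;
   	}

   	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
   	if (p == MAP_FAILED) {
   		perror("mmap");
   		return EXIT_FAILURE;
   	}

   	/* Read faults over the hole: with this patch they map the zero page. */
   	unsigned long sum = 0;
   	for (size_t off = 0; off < len; off += 4096)
   		sum += p[off];
   	printf("read 1 GiB of hole, sum=%lu (check Rss in /proc/self/smaps)\n", sum);

   	/* First write re-faults and maps writeable DAX storage instead. */
   	p[0] = 1;

   	munmap(p, len);
   	close(fd);
   	return EXIT_SUCCESS;
   }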
Diffstat (limited to 'fs/dax.c')
-rw-r--r--    fs/dax.c    243
1 file changed, 76 insertions(+), 167 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index b8882b5ce6ed..ab67ae30ccbf 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -66,7 +66,7 @@ static int dax_is_pte_entry(void *entry)
 
 static int dax_is_zero_entry(void *entry)
 {
-	return (unsigned long)entry & RADIX_DAX_HZP;
+	return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
 }
 
 static int dax_is_empty_entry(void *entry)
@@ -206,7 +206,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
 	for (;;) {
 		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
 					  &slot);
-		if (!entry || !radix_tree_exceptional_entry(entry) ||
+		if (!entry ||
+		    WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
 		    !slot_locked(mapping, slot)) {
 			if (slotp)
 				*slotp = slot;
@@ -241,14 +242,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
 }
 
 static void put_locked_mapping_entry(struct address_space *mapping,
-				     pgoff_t index, void *entry)
+				     pgoff_t index)
 {
-	if (!radix_tree_exceptional_entry(entry)) {
-		unlock_page(entry);
-		put_page(entry);
-	} else {
-		dax_unlock_mapping_entry(mapping, index);
-	}
+	dax_unlock_mapping_entry(mapping, index);
 }
 
 /*
@@ -258,7 +254,7 @@ static void put_locked_mapping_entry(struct address_space *mapping,
 static void put_unlocked_mapping_entry(struct address_space *mapping,
 				       pgoff_t index, void *entry)
 {
-	if (!radix_tree_exceptional_entry(entry))
+	if (!entry)
 		return;
 
 	/* We have to wake up next waiter for the radix tree entry lock */
@@ -266,15 +262,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
 }
 
 /*
- * Find radix tree entry at given index. If it points to a page, return with
- * the page locked. If it points to the exceptional entry, return with the
- * radix tree entry locked. If the radix tree doesn't contain given index,
- * create empty exceptional entry for the index and return with it locked.
+ * Find radix tree entry at given index. If it points to an exceptional entry,
+ * return it with the radix tree entry locked. If the radix tree doesn't
+ * contain given index, create an empty exceptional entry for the index and
+ * return with it locked.
  *
  * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
  * either return that locked entry or will return an error. This error will
- * happen if there are any 4k entries (either zero pages or DAX entries)
- * within the 2MiB range that we are requesting.
+ * happen if there are any 4k entries within the 2MiB range that we are
+ * requesting.
  *
  * We always favor 4k entries over 2MiB entries. There isn't a flow where we
  * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
@@ -301,18 +297,21 @@ restart:
 	spin_lock_irq(&mapping->tree_lock);
 	entry = get_unlocked_mapping_entry(mapping, index, &slot);
 
+	if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
+		entry = ERR_PTR(-EIO);
+		goto out_unlock;
+	}
+
 	if (entry) {
 		if (size_flag & RADIX_DAX_PMD) {
-			if (!radix_tree_exceptional_entry(entry) ||
-					dax_is_pte_entry(entry)) {
+			if (dax_is_pte_entry(entry)) {
 				put_unlocked_mapping_entry(mapping, index,
 						entry);
 				entry = ERR_PTR(-EEXIST);
 				goto out_unlock;
 			}
 		} else { /* trying to grab a PTE entry */
-			if (radix_tree_exceptional_entry(entry) &&
-					dax_is_pmd_entry(entry) &&
+			if (dax_is_pmd_entry(entry) &&
 			    (dax_is_zero_entry(entry) ||
 			     dax_is_empty_entry(entry))) {
 				pmd_downgrade = true;
@@ -346,7 +345,7 @@ restart:
 				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
 		if (err) {
 			if (pmd_downgrade)
-				put_locked_mapping_entry(mapping, index, entry);
+				put_locked_mapping_entry(mapping, index);
 			return ERR_PTR(err);
 		}
 		spin_lock_irq(&mapping->tree_lock);
@@ -396,21 +395,6 @@ restart:
 		spin_unlock_irq(&mapping->tree_lock);
 		return entry;
 	}
-	/* Normal page in radix tree? */
-	if (!radix_tree_exceptional_entry(entry)) {
-		struct page *page = entry;
-
-		get_page(page);
-		spin_unlock_irq(&mapping->tree_lock);
-		lock_page(page);
-		/* Page got truncated? Retry... */
-		if (unlikely(page->mapping != mapping)) {
-			unlock_page(page);
-			put_page(page);
-			goto restart;
-		}
-		return page;
-	}
 	entry = lock_slot(mapping, slot);
 out_unlock:
 	spin_unlock_irq(&mapping->tree_lock);
@@ -426,7 +410,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 
 	spin_lock_irq(&mapping->tree_lock);
 	entry = get_unlocked_mapping_entry(mapping, index, NULL);
-	if (!entry || !radix_tree_exceptional_entry(entry))
+	if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
 		goto out;
 	if (!trunc &&
 	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
@@ -508,47 +492,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 				      unsigned long flags)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	int error = 0;
-	bool hole_fill = false;
 	void *new_entry;
 	pgoff_t index = vmf->pgoff;
 
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
-	/* Replacing hole page with block mapping? */
-	if (!radix_tree_exceptional_entry(entry)) {
-		hole_fill = true;
-		/*
-		 * Unmap the page now before we remove it from page cache below.
-		 * The page is locked so it cannot be faulted in again.
-		 */
-		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
-				    PAGE_SIZE, 0);
-		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
-		if (error)
-			return ERR_PTR(error);
-	} else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
-		/* replacing huge zero page with PMD block mapping */
-		unmap_mapping_range(mapping,
-			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+		/* we are replacing a zero page with block mapping */
+		if (dax_is_pmd_entry(entry))
+			unmap_mapping_range(mapping,
+					(vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
+					PMD_SIZE, 0);
+		else /* pte entry */
+			unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+					PAGE_SIZE, 0);
 	}
 
 	spin_lock_irq(&mapping->tree_lock);
 	new_entry = dax_radix_locked_entry(sector, flags);
 
-	if (hole_fill) {
-		__delete_from_page_cache(entry, NULL);
-		/* Drop pagecache reference */
-		put_page(entry);
-		error = __radix_tree_insert(page_tree, index,
-				dax_radix_order(new_entry), new_entry);
-		if (error) {
-			new_entry = ERR_PTR(error);
-			goto unlock;
-		}
-		mapping->nrexceptional++;
-	} else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
 		/*
 		 * Only swap our new entry into the radix tree if the current
 		 * entry is a zero page or an empty entry. If a normal PTE or
@@ -565,23 +529,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 		WARN_ON_ONCE(ret != entry);
 		__radix_tree_replace(page_tree, node, slot,
 				     new_entry, NULL, NULL);
+		entry = new_entry;
 	}
+
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
-unlock:
+
 	spin_unlock_irq(&mapping->tree_lock);
-	if (hole_fill) {
-		radix_tree_preload_end();
-		/*
-		 * We don't need hole page anymore, it has been replaced with
-		 * locked radix tree entry now.
-		 */
-		if (mapping->a_ops->freepage)
-			mapping->a_ops->freepage(entry);
-		unlock_page(entry);
-		put_page(entry);
-	}
-	return new_entry;
+	return entry;
 }
 
 static inline unsigned long
@@ -683,7 +638,7 @@ static int dax_writeback_one(struct block_device *bdev,
 	spin_lock_irq(&mapping->tree_lock);
 	entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
 	/* Entry got punched out / reallocated? */
-	if (!entry2 || !radix_tree_exceptional_entry(entry2))
+	if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
 		goto put_unlocked;
 	/*
 	 * Entry got reallocated elsewhere? No need to writeback. We have to
@@ -755,7 +710,7 @@ static int dax_writeback_one(struct block_device *bdev,
 	trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
  dax_unlock:
 	dax_read_unlock(id);
-	put_locked_mapping_entry(mapping, index, entry);
+	put_locked_mapping_entry(mapping, index);
 	return ret;
 
  put_unlocked:
@@ -830,11 +785,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
 		struct block_device *bdev, struct dax_device *dax_dev,
-		sector_t sector, size_t size, void **entryp,
+		sector_t sector, size_t size, void *entry,
 		struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	unsigned long vaddr = vmf->address;
-	void *entry = *entryp;
 	void *ret, *kaddr;
 	pgoff_t pgoff;
 	int id, rc;
@@ -855,87 +809,44 @@ static int dax_insert_mapping(struct address_space *mapping,
 	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
 	if (IS_ERR(ret))
 		return PTR_ERR(ret);
-	*entryp = ret;
 
 	trace_dax_insert_mapping(mapping->host, vmf, ret);
-	return vm_insert_mixed(vma, vaddr, pfn);
-}
-
-/**
- * dax_pfn_mkwrite - handle first write to DAX page
- * @vmf: The description of the fault
- */
-int dax_pfn_mkwrite(struct vm_fault *vmf)
-{
-	struct file *file = vmf->vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	void *entry, **slot;
-	pgoff_t index = vmf->pgoff;
-
-	spin_lock_irq(&mapping->tree_lock);
-	entry = get_unlocked_mapping_entry(mapping, index, &slot);
-	if (!entry || !radix_tree_exceptional_entry(entry)) {
-		if (entry)
-			put_unlocked_mapping_entry(mapping, index, entry);
-		spin_unlock_irq(&mapping->tree_lock);
-		trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
-		return VM_FAULT_NOPAGE;
-	}
-	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
-	entry = lock_slot(mapping, slot);
-	spin_unlock_irq(&mapping->tree_lock);
-	/*
-	 * If we race with somebody updating the PTE and finish_mkwrite_fault()
-	 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
-	 * the fault in either case.
-	 */
-	finish_mkwrite_fault(vmf);
-	put_locked_mapping_entry(mapping, index, entry);
-	trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
-	return VM_FAULT_NOPAGE;
+	if (vmf->flags & FAULT_FLAG_WRITE)
+		return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+	else
+		return vm_insert_mixed(vma, vaddr, pfn);
 }
-EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
 
 /*
- * The user has performed a load from a hole in the file. Allocating
- * a new page in the file would cause excessive storage usage for
- * workloads with sparse files. We allocate a page cache page instead.
- * We'll kick it out of the page cache if it's ever written to,
- * otherwise it will simply fall out of the page cache under memory
- * pressure without ever having been dirtied.
+ * The user has performed a load from a hole in the file. Allocating a new
+ * page in the file would cause excessive storage usage for workloads with
+ * sparse files. Instead we insert a read-only mapping of the 4k zero page.
+ * If this page is ever written to we will re-fault and change the mapping to
+ * point to real DAX storage instead.
  */
-static int dax_load_hole(struct address_space *mapping, void **entry,
+static int dax_load_hole(struct address_space *mapping, void *entry,
 			 struct vm_fault *vmf)
 {
 	struct inode *inode = mapping->host;
-	struct page *page;
-	int ret;
-
-	/* Hole page already exists? Return it... */
-	if (!radix_tree_exceptional_entry(*entry)) {
-		page = *entry;
-		goto finish_fault;
-	}
+	unsigned long vaddr = vmf->address;
+	int ret = VM_FAULT_NOPAGE;
+	struct page *zero_page;
+	void *entry2;
 
-	/* This will replace locked radix tree entry with a hole page */
-	page = find_or_create_page(mapping, vmf->pgoff,
-				   vmf->gfp_mask | __GFP_ZERO);
-	if (!page) {
+	zero_page = ZERO_PAGE(0);
+	if (unlikely(!zero_page)) {
 		ret = VM_FAULT_OOM;
 		goto out;
 	}
 
-finish_fault:
-	vmf->page = page;
-	ret = finish_fault(vmf);
-	vmf->page = NULL;
-	*entry = page;
-	if (!ret) {
-		/* Grab reference for PTE that is now referencing the page */
-		get_page(page);
-		ret = VM_FAULT_NOPAGE;
+	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+			RADIX_DAX_ZERO_PAGE);
+	if (IS_ERR(entry2)) {
+		ret = VM_FAULT_SIGBUS;
+		goto out;
 	}
+
+	vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
 out:
 	trace_dax_load_hole(inode, vmf, ret);
 	return ret;
@@ -1223,7 +1134,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 			major = VM_FAULT_MAJOR;
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
-				sector, PAGE_SIZE, &entry, vmf->vma, vmf);
+				sector, PAGE_SIZE, entry, vmf->vma, vmf);
 		/* -EBUSY is fine, somebody else faulted on the same PTE */
 		if (error == -EBUSY)
 			error = 0;
@@ -1231,7 +1142,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-			vmf_ret = dax_load_hole(mapping, &entry, vmf);
+			vmf_ret = dax_load_hole(mapping, entry, vmf);
 			goto finish_iomap;
 		}
 		/*FALLTHRU*/
@@ -1258,7 +1169,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
  unlock_entry:
-	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+	put_locked_mapping_entry(mapping, vmf->pgoff);
  out:
 	trace_dax_pte_fault_done(inode, vmf, vmf_ret);
 	return vmf_ret;
@@ -1272,7 +1183,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
 
 static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-		loff_t pos, void **entryp)
+		loff_t pos, void *entry)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	const sector_t sector = dax_iomap_sector(iomap, pos);
@@ -1303,11 +1214,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
 		goto unlock_fallback;
 	dax_read_unlock(id);
 
-	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
 			RADIX_DAX_PMD);
 	if (IS_ERR(ret))
 		goto fallback;
-	*entryp = ret;
 
 	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
 	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
@@ -1321,7 +1231,7 @@ fallback:
 }
 
 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
-		void **entryp)
+		void *entry)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -1336,11 +1246,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 	if (unlikely(!zero_page))
 		goto fallback;
 
-	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
-			RADIX_DAX_PMD | RADIX_DAX_HZP);
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
 	if (IS_ERR(ret))
 		goto fallback;
-	*entryp = ret;
 
 	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
 	if (!pmd_none(*(vmf->pmd))) {
@@ -1416,10 +1325,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 		goto fallback;
 
 	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
+	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
+	 * 2MiB zero page entry or a DAX PMD.  If it can't (because a 4k page
+	 * is already in the tree, for instance), it will return -EEXIST and
+	 * we just fall back to 4k entries.
 	 */
 	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
 	if (IS_ERR(entry))
@@ -1452,13 +1361,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
+		result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
 			break;
-		result = dax_pmd_load_hole(vmf, &iomap, &entry);
+		result = dax_pmd_load_hole(vmf, &iomap, entry);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -1481,7 +1390,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 				&iomap);
 	}
  unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
+	put_locked_mapping_entry(mapping, pgoff);
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, vmf->pmd, vmf->address);