Diffstat (limited to 'fs/dax.c')
-rw-r--r--  fs/dax.c  392
1 file changed, 175 insertions(+), 217 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 306c2b603fb8..6afcacb3a87b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -42,6 +42,9 @@
 #define DAX_WAIT_TABLE_BITS 12
 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
 
+/* The 'colour' (ie low bits) within a PMD of a page offset. */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+
 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
 
 static int __init init_dax_wait_table(void)
@@ -54,6 +57,40 @@ static int __init init_dax_wait_table(void)
 }
 fs_initcall(init_dax_wait_table);
 
+/*
+ * We use lowest available bit in exceptional entry for locking, one bit for
+ * the entry size (PMD) and two more to tell us if the entry is a zero page or
+ * an empty entry that is just used for locking. In total four special bits.
+ *
+ * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
+ * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
+ * block allocation.
+ */
+#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
+#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
+
+static unsigned long dax_radix_sector(void *entry)
+{
+        return (unsigned long)entry >> RADIX_DAX_SHIFT;
+}
+
+static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
+{
+        return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
+                        ((unsigned long)sector << RADIX_DAX_SHIFT) |
+                        RADIX_DAX_ENTRY_LOCK);
+}
+
+static unsigned int dax_radix_order(void *entry)
+{
+        if ((unsigned long)entry & RADIX_DAX_PMD)
+                return PMD_SHIFT - PAGE_SHIFT;
+        return 0;
+}
+
 static int dax_is_pmd_entry(void *entry)
 {
         return (unsigned long)entry & RADIX_DAX_PMD;
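The helpers added above pack a sector number into the pointer-sized entry, above four flag bits and the radix tree's exceptional-entry marker. A standalone sketch of that round trip, using assumed values for the kernel constants (RADIX_TREE_EXCEPTIONAL_*, 2MiB PMDs, 4k pages); the numbers are illustrative and not taken from this patch:

#include <stdio.h>

/* Assumed constants for illustration only; the real ones live in kernel headers. */
#define RADIX_TREE_EXCEPTIONAL_ENTRY    2
#define RADIX_TREE_EXCEPTIONAL_SHIFT    2
#define PMD_SHIFT                       21
#define PAGE_SHIFT                      12

#define RADIX_DAX_SHIFT      (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
#define RADIX_DAX_PMD        (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))

int main(void)
{
        unsigned long sector = 0x1234;
        /* Pack a locked PMD entry the same way dax_radix_locked_entry() does. */
        unsigned long entry = RADIX_TREE_EXCEPTIONAL_ENTRY | RADIX_DAX_PMD |
                              (sector << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK;

        /* Unpack it again: the sector sits above the flag bits, order is 9 for a PMD. */
        printf("sector = %#lx\n", entry >> RADIX_DAX_SHIFT);
        printf("order  = %d\n", (entry & RADIX_DAX_PMD) ? PMD_SHIFT - PAGE_SHIFT : 0);
        printf("locked = %d\n", !!(entry & RADIX_DAX_ENTRY_LOCK));
        return 0;
}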
@@ -66,7 +103,7 @@ static int dax_is_pte_entry(void *entry)
 
 static int dax_is_zero_entry(void *entry)
 {
-        return (unsigned long)entry & RADIX_DAX_HZP;
+        return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
 }
 
 static int dax_is_empty_entry(void *entry)
@@ -98,7 +135,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
          * the range covered by the PMD map to the same bit lock.
          */
         if (dax_is_pmd_entry(entry))
-                index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
+                index &= ~PG_PMD_COLOUR;
 
         key->mapping = mapping;
         key->entry_start = index;
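Put differently, the waitqueue key for a PMD entry uses the page offset rounded down to its 2MiB boundary, so every offset inside that range waits on (and is woken through) the same key. A small sketch with assumed x86-64 constants (4k pages, 2MiB PMDs):

#include <stdio.h>

/* Assumed for illustration: 4k pages, 2MiB PMDs, so PG_PMD_COLOUR == 511. */
#define PAGE_SHIFT      12
#define PMD_SIZE        (1UL << 21)
#define PG_PMD_COLOUR   ((PMD_SIZE >> PAGE_SHIFT) - 1)

int main(void)
{
        unsigned long a = 1000, b = 1023;       /* two page offsets in the same 2MiB range */

        /* Same masking as dax_entry_waitqueue() applies to a PMD entry's index. */
        printf("%lu -> %lu\n", a, a & ~PG_PMD_COLOUR);  /* 1000 -> 512 */
        printf("%lu -> %lu\n", b, b & ~PG_PMD_COLOUR);  /* 1023 -> 512 */
        return 0;
}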
@@ -121,6 +158,31 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
 }
 
 /*
+ * We do not necessarily hold the mapping->tree_lock when we call this
+ * function so it is possible that 'entry' is no longer a valid item in the
+ * radix tree. This is okay because all we really need to do is to find the
+ * correct waitqueue where tasks might be waiting for that old 'entry' and
+ * wake them.
+ */
+static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+                pgoff_t index, void *entry, bool wake_all)
+{
+        struct exceptional_entry_key key;
+        wait_queue_head_t *wq;
+
+        wq = dax_entry_waitqueue(mapping, index, entry, &key);
+
+        /*
+         * Checking for locked entry and prepare_to_wait_exclusive() happens
+         * under mapping->tree_lock, ditto for entry handling in our callers.
+         * So at this point all tasks that could have seen our entry locked
+         * must be in the waitqueue and the following check will see them.
+         */
+        if (waitqueue_active(wq))
+                __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
+}
+
+/*
  * Check whether the given slot is locked. The function must be called with
  * mapping->tree_lock held
  */
@@ -181,7 +243,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
         for (;;) {
                 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
                                           &slot);
-                if (!entry || !radix_tree_exceptional_entry(entry) ||
+                if (!entry ||
+                    WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
                     !slot_locked(mapping, slot)) {
                         if (slotp)
                                 *slotp = slot;
@@ -216,14 +279,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
 }
 
 static void put_locked_mapping_entry(struct address_space *mapping,
-                pgoff_t index, void *entry)
+                pgoff_t index)
 {
-        if (!radix_tree_exceptional_entry(entry)) {
-                unlock_page(entry);
-                put_page(entry);
-        } else {
-                dax_unlock_mapping_entry(mapping, index);
-        }
+        dax_unlock_mapping_entry(mapping, index);
 }
 
 /*
@@ -233,7 +291,7 @@ static void put_locked_mapping_entry(struct address_space *mapping,
 static void put_unlocked_mapping_entry(struct address_space *mapping,
                 pgoff_t index, void *entry)
 {
-        if (!radix_tree_exceptional_entry(entry))
+        if (!entry)
                 return;
 
         /* We have to wake up next waiter for the radix tree entry lock */
@@ -241,15 +299,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
 }
 
 /*
- * Find radix tree entry at given index. If it points to a page, return with
- * the page locked. If it points to the exceptional entry, return with the
- * radix tree entry locked. If the radix tree doesn't contain given index,
- * create empty exceptional entry for the index and return with it locked.
+ * Find radix tree entry at given index. If it points to an exceptional entry,
+ * return it with the radix tree entry locked. If the radix tree doesn't
+ * contain given index, create an empty exceptional entry for the index and
+ * return with it locked.
  *
  * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
  * either return that locked entry or will return an error. This error will
- * happen if there are any 4k entries (either zero pages or DAX entries)
- * within the 2MiB range that we are requesting.
+ * happen if there are any 4k entries within the 2MiB range that we are
+ * requesting.
  *
  * We always favor 4k entries over 2MiB entries. There isn't a flow where we
  * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
@@ -276,18 +334,21 @@ restart:
         spin_lock_irq(&mapping->tree_lock);
         entry = get_unlocked_mapping_entry(mapping, index, &slot);
 
+        if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
+                entry = ERR_PTR(-EIO);
+                goto out_unlock;
+        }
+
         if (entry) {
                 if (size_flag & RADIX_DAX_PMD) {
-                        if (!radix_tree_exceptional_entry(entry) ||
-                            dax_is_pte_entry(entry)) {
+                        if (dax_is_pte_entry(entry)) {
                                 put_unlocked_mapping_entry(mapping, index,
                                                 entry);
                                 entry = ERR_PTR(-EEXIST);
                                 goto out_unlock;
                         }
                 } else { /* trying to grab a PTE entry */
-                        if (radix_tree_exceptional_entry(entry) &&
-                            dax_is_pmd_entry(entry) &&
+                        if (dax_is_pmd_entry(entry) &&
                             (dax_is_zero_entry(entry) ||
                              dax_is_empty_entry(entry))) {
                                 pmd_downgrade = true;
@@ -321,7 +382,7 @@ restart:
                                 mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
                 if (err) {
                         if (pmd_downgrade)
-                                put_locked_mapping_entry(mapping, index, entry);
+                                put_locked_mapping_entry(mapping, index);
                         return ERR_PTR(err);
                 }
                 spin_lock_irq(&mapping->tree_lock);
@@ -371,52 +432,12 @@ restart:
                 spin_unlock_irq(&mapping->tree_lock);
                 return entry;
         }
-        /* Normal page in radix tree? */
-        if (!radix_tree_exceptional_entry(entry)) {
-                struct page *page = entry;
-
-                get_page(page);
-                spin_unlock_irq(&mapping->tree_lock);
-                lock_page(page);
-                /* Page got truncated? Retry... */
-                if (unlikely(page->mapping != mapping)) {
-                        unlock_page(page);
-                        put_page(page);
-                        goto restart;
-                }
-                return page;
-        }
         entry = lock_slot(mapping, slot);
  out_unlock:
         spin_unlock_irq(&mapping->tree_lock);
         return entry;
 }
 
-/*
- * We do not necessarily hold the mapping->tree_lock when we call this
- * function so it is possible that 'entry' is no longer a valid item in the
- * radix tree. This is okay because all we really need to do is to find the
- * correct waitqueue where tasks might be waiting for that old 'entry' and
- * wake them.
- */
-void dax_wake_mapping_entry_waiter(struct address_space *mapping,
-                pgoff_t index, void *entry, bool wake_all)
-{
-        struct exceptional_entry_key key;
-        wait_queue_head_t *wq;
-
-        wq = dax_entry_waitqueue(mapping, index, entry, &key);
-
-        /*
-         * Checking for locked entry and prepare_to_wait_exclusive() happens
-         * under mapping->tree_lock, ditto for entry handling in our callers.
-         * So at this point all tasks that could have seen our entry locked
-         * must be in the waitqueue and the following check will see them.
-         */
-        if (waitqueue_active(wq))
-                __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
-}
-
 static int __dax_invalidate_mapping_entry(struct address_space *mapping,
                 pgoff_t index, bool trunc)
 {
@@ -426,7 +447,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 
         spin_lock_irq(&mapping->tree_lock);
         entry = get_unlocked_mapping_entry(mapping, index, NULL);
-        if (!entry || !radix_tree_exceptional_entry(entry))
+        if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
                 goto out;
         if (!trunc &&
             (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
@@ -468,50 +489,6 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
         return __dax_invalidate_mapping_entry(mapping, index, false);
 }
 
-/*
- * The user has performed a load from a hole in the file. Allocating
- * a new page in the file would cause excessive storage usage for
- * workloads with sparse files. We allocate a page cache page instead.
- * We'll kick it out of the page cache if it's ever written to,
- * otherwise it will simply fall out of the page cache under memory
- * pressure without ever having been dirtied.
- */
-static int dax_load_hole(struct address_space *mapping, void **entry,
-                         struct vm_fault *vmf)
-{
-        struct inode *inode = mapping->host;
-        struct page *page;
-        int ret;
-
-        /* Hole page already exists? Return it... */
-        if (!radix_tree_exceptional_entry(*entry)) {
-                page = *entry;
-                goto finish_fault;
-        }
-
-        /* This will replace locked radix tree entry with a hole page */
-        page = find_or_create_page(mapping, vmf->pgoff,
-                                   vmf->gfp_mask | __GFP_ZERO);
-        if (!page) {
-                ret = VM_FAULT_OOM;
-                goto out;
-        }
-
-finish_fault:
-        vmf->page = page;
-        ret = finish_fault(vmf);
-        vmf->page = NULL;
-        *entry = page;
-        if (!ret) {
-                /* Grab reference for PTE that is now referencing the page */
-                get_page(page);
-                ret = VM_FAULT_NOPAGE;
-        }
-out:
-        trace_dax_load_hole(inode, vmf, ret);
-        return ret;
-}
-
 static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
                 sector_t sector, size_t size, struct page *to,
                 unsigned long vaddr)
@@ -552,47 +529,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
                 unsigned long flags)
 {
         struct radix_tree_root *page_tree = &mapping->page_tree;
-        int error = 0;
-        bool hole_fill = false;
         void *new_entry;
         pgoff_t index = vmf->pgoff;
 
         if (vmf->flags & FAULT_FLAG_WRITE)
                 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
-        /* Replacing hole page with block mapping? */
-        if (!radix_tree_exceptional_entry(entry)) {
-                hole_fill = true;
-                /*
-                 * Unmap the page now before we remove it from page cache below.
-                 * The page is locked so it cannot be faulted in again.
-                 */
-                unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
-                                    PAGE_SIZE, 0);
-                error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
-                if (error)
-                        return ERR_PTR(error);
-        } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
-                /* replacing huge zero page with PMD block mapping */
-                unmap_mapping_range(mapping,
-                        (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+        if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+                /* we are replacing a zero page with block mapping */
+                if (dax_is_pmd_entry(entry))
+                        unmap_mapping_range(mapping,
+                                        (vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
+                                        PMD_SIZE, 0);
+                else /* pte entry */
+                        unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+                                        PAGE_SIZE, 0);
         }
 
         spin_lock_irq(&mapping->tree_lock);
         new_entry = dax_radix_locked_entry(sector, flags);
 
-        if (hole_fill) {
-                __delete_from_page_cache(entry, NULL);
-                /* Drop pagecache reference */
-                put_page(entry);
-                error = __radix_tree_insert(page_tree, index,
-                                dax_radix_order(new_entry), new_entry);
-                if (error) {
-                        new_entry = ERR_PTR(error);
-                        goto unlock;
-                }
-                mapping->nrexceptional++;
-        } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+        if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
                 /*
                  * Only swap our new entry into the radix tree if the current
                  * entry is a zero page or an empty entry. If a normal PTE or
@@ -609,23 +566,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
                 WARN_ON_ONCE(ret != entry);
                 __radix_tree_replace(page_tree, node, slot,
                                      new_entry, NULL, NULL);
+                entry = new_entry;
         }
+
         if (vmf->flags & FAULT_FLAG_WRITE)
                 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
- unlock:
+
         spin_unlock_irq(&mapping->tree_lock);
-        if (hole_fill) {
-                radix_tree_preload_end();
-                /*
-                 * We don't need hole page anymore, it has been replaced with
-                 * locked radix tree entry now.
-                 */
-                if (mapping->a_ops->freepage)
-                        mapping->a_ops->freepage(entry);
-                unlock_page(entry);
-                put_page(entry);
-        }
-        return new_entry;
+        return entry;
 }
 
 static inline unsigned long
@@ -646,11 +594,10 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
         pte_t pte, *ptep = NULL;
         pmd_t *pmdp = NULL;
         spinlock_t *ptl;
-        bool changed;
 
         i_mmap_lock_read(mapping);
         vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
-                unsigned long address;
+                unsigned long address, start, end;
 
                 cond_resched();
 
@@ -658,8 +605,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
                         continue;
 
                 address = pgoff_address(index, vma);
-                changed = false;
-                if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl))
+
+                /*
+                 * Note because we provide start/end to follow_pte_pmd it will
+                 * call mmu_notifier_invalidate_range_start() on our behalf
+                 * before taking any lock.
+                 */
+                if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
                         continue;
 
                 if (pmdp) {
@@ -676,7 +628,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
                         pmd = pmd_wrprotect(pmd);
                         pmd = pmd_mkclean(pmd);
                         set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-                        changed = true;
+                        mmu_notifier_invalidate_range(vma->vm_mm, start, end);
 unlock_pmd:
                         spin_unlock(ptl);
 #endif
@@ -691,13 +643,12 @@ unlock_pmd:
                         pte = pte_wrprotect(pte);
                         pte = pte_mkclean(pte);
                         set_pte_at(vma->vm_mm, address, ptep, pte);
-                        changed = true;
+                        mmu_notifier_invalidate_range(vma->vm_mm, start, end);
 unlock_pte:
                         pte_unmap_unlock(ptep, ptl);
                 }
 
-                if (changed)
-                        mmu_notifier_invalidate_page(vma->vm_mm, address);
+                mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
         }
         i_mmap_unlock_read(mapping);
 }
@@ -724,7 +675,7 @@ static int dax_writeback_one(struct block_device *bdev,
         spin_lock_irq(&mapping->tree_lock);
         entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
         /* Entry got punched out / reallocated? */
-        if (!entry2 || !radix_tree_exceptional_entry(entry2))
+        if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
                 goto put_unlocked;
         /*
          * Entry got reallocated elsewhere? No need to writeback. We have to
@@ -796,7 +747,7 @@ static int dax_writeback_one(struct block_device *bdev,
         trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
  dax_unlock:
         dax_read_unlock(id);
-        put_locked_mapping_entry(mapping, index, entry);
+        put_locked_mapping_entry(mapping, index);
         return ret;
 
  put_unlocked:
@@ -871,11 +822,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
                 struct block_device *bdev, struct dax_device *dax_dev,
-                sector_t sector, size_t size, void **entryp,
+                sector_t sector, size_t size, void *entry,
                 struct vm_area_struct *vma, struct vm_fault *vmf)
 {
         unsigned long vaddr = vmf->address;
-        void *entry = *entryp;
         void *ret, *kaddr;
         pgoff_t pgoff;
         int id, rc;
@@ -896,47 +846,48 @@ static int dax_insert_mapping(struct address_space *mapping,
         ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
         if (IS_ERR(ret))
                 return PTR_ERR(ret);
-        *entryp = ret;
 
         trace_dax_insert_mapping(mapping->host, vmf, ret);
-        return vm_insert_mixed(vma, vaddr, pfn);
+        if (vmf->flags & FAULT_FLAG_WRITE)
+                return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+        else
+                return vm_insert_mixed(vma, vaddr, pfn);
 }
 
-/**
- * dax_pfn_mkwrite - handle first write to DAX page
- * @vmf: The description of the fault
+/*
+ * The user has performed a load from a hole in the file. Allocating a new
+ * page in the file would cause excessive storage usage for workloads with
+ * sparse files. Instead we insert a read-only mapping of the 4k zero page.
+ * If this page is ever written to we will re-fault and change the mapping to
+ * point to real DAX storage instead.
  */
-int dax_pfn_mkwrite(struct vm_fault *vmf)
+static int dax_load_hole(struct address_space *mapping, void *entry,
+                struct vm_fault *vmf)
 {
-        struct file *file = vmf->vma->vm_file;
-        struct address_space *mapping = file->f_mapping;
         struct inode *inode = mapping->host;
-        void *entry, **slot;
-        pgoff_t index = vmf->pgoff;
+        unsigned long vaddr = vmf->address;
+        int ret = VM_FAULT_NOPAGE;
+        struct page *zero_page;
+        void *entry2;
 
-        spin_lock_irq(&mapping->tree_lock);
-        entry = get_unlocked_mapping_entry(mapping, index, &slot);
-        if (!entry || !radix_tree_exceptional_entry(entry)) {
-                if (entry)
-                        put_unlocked_mapping_entry(mapping, index, entry);
-                spin_unlock_irq(&mapping->tree_lock);
-                trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
-                return VM_FAULT_NOPAGE;
+        zero_page = ZERO_PAGE(0);
+        if (unlikely(!zero_page)) {
+                ret = VM_FAULT_OOM;
+                goto out;
         }
-        radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
-        entry = lock_slot(mapping, slot);
-        spin_unlock_irq(&mapping->tree_lock);
-        /*
-         * If we race with somebody updating the PTE and finish_mkwrite_fault()
-         * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
-         * the fault in either case.
-         */
-        finish_mkwrite_fault(vmf);
-        put_locked_mapping_entry(mapping, index, entry);
-        trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
-        return VM_FAULT_NOPAGE;
+
+        entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+                        RADIX_DAX_ZERO_PAGE);
+        if (IS_ERR(entry2)) {
+                ret = VM_FAULT_SIGBUS;
+                goto out;
+        }
+
+        vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
+out:
+        trace_dax_load_hole(inode, vmf, ret);
+        return ret;
 }
-EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
 
 static bool dax_range_is_aligned(struct block_device *bdev,
                                  unsigned int offset, unsigned int length)
@@ -1056,6 +1007,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
                 if (map_len > end - pos)
                         map_len = end - pos;
 
+                /*
+                 * The userspace address for the memory copy has already been
+                 * validated via access_ok() in either vfs_read() or
+                 * vfs_write(), depending on which operation we are doing.
+                 */
                 if (iov_iter_rw(iter) == WRITE)
                         map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
                                         map_len, iter);
@@ -1220,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
                         major = VM_FAULT_MAJOR;
                 }
                 error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
-                                sector, PAGE_SIZE, &entry, vmf->vma, vmf);
+                                sector, PAGE_SIZE, entry, vmf->vma, vmf);
                 /* -EBUSY is fine, somebody else faulted on the same PTE */
                 if (error == -EBUSY)
                         error = 0;
@@ -1228,7 +1184,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
         case IOMAP_UNWRITTEN:
         case IOMAP_HOLE:
                 if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-                        vmf_ret = dax_load_hole(mapping, &entry, vmf);
+                        vmf_ret = dax_load_hole(mapping, entry, vmf);
                         goto finish_iomap;
                 }
                 /*FALLTHRU*/
@@ -1255,21 +1211,15 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
                 ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
         }
  unlock_entry:
-        put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+        put_locked_mapping_entry(mapping, vmf->pgoff);
  out:
         trace_dax_pte_fault_done(inode, vmf, vmf_ret);
         return vmf_ret;
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-/*
- * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
- * more often than one might expect in the below functions.
- */
-#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
-
 static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-                loff_t pos, void **entryp)
+                loff_t pos, void *entry)
 {
         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
         const sector_t sector = dax_iomap_sector(iomap, pos);
@@ -1280,7 +1230,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
         void *ret = NULL, *kaddr;
         long length = 0;
         pgoff_t pgoff;
-        pfn_t pfn;
+        pfn_t pfn = {};
         int id;
 
         if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
@@ -1300,11 +1250,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
                 goto unlock_fallback;
         dax_read_unlock(id);
 
-        ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
+        ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
                         RADIX_DAX_PMD);
         if (IS_ERR(ret))
                 goto fallback;
-        *entryp = ret;
 
         trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
         return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
@@ -1318,7 +1267,7 @@ fallback:
 }
 
 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
-                void **entryp)
+                void *entry)
 {
         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
         unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -1333,11 +1282,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
         if (unlikely(!zero_page))
                 goto fallback;
 
-        ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
-                        RADIX_DAX_PMD | RADIX_DAX_HZP);
+        ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+                        RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
         if (IS_ERR(ret))
                 goto fallback;
-        *entryp = ret;
 
         ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
         if (!pmd_none(*(vmf->pmd))) {
@@ -1383,6 +1331,16 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 
         trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
 
+        /*
+         * Make sure that the faulting address's PMD offset (color) matches
+         * the PMD offset from the start of the file. This is necessary so
+         * that a PMD range in the page table overlaps exactly with a PMD
+         * range in the radix tree.
+         */
+        if ((vmf->pgoff & PG_PMD_COLOUR) !=
+            ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
+                goto fallback;
+
         /* Fall back to PTEs if we're going to COW */
         if (write && !(vma->vm_flags & VM_SHARED))
                 goto fallback;
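The check added in this hunk only allows a PMD fault when the faulting address and the file offset sit at the same position within their respective 2MiB regions; otherwise a single PMD could not cover a naturally aligned range in both the page table and the radix tree. A userspace sketch of the same test, again with assumed x86-64 constants:

#include <stdio.h>

/* Assumed for illustration: 4k pages, 2MiB PMDs. */
#define PAGE_SHIFT      12
#define PMD_SIZE        (1UL << 21)
#define PG_PMD_COLOUR   ((PMD_SIZE >> PAGE_SHIFT) - 1)

/* Mirrors the colour comparison added above. */
static int pmd_colour_matches(unsigned long pgoff, unsigned long address)
{
        return (pgoff & PG_PMD_COLOUR) ==
               ((address >> PAGE_SHIFT) & PG_PMD_COLOUR);
}

int main(void)
{
        /* File offset 0 mapped at a 2MiB-aligned address: colours match, a PMD fits. */
        printf("%d\n", pmd_colour_matches(0, 0x40000000UL));
        /* File offset 1 mapped at the same address: colours differ, fall back to PTEs. */
        printf("%d\n", pmd_colour_matches(1, 0x40000000UL));
        return 0;
}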
@@ -1403,10 +1361,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
                 goto fallback;
 
         /*
-         * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-         * PMD or a HZP entry. If it can't (because a 4k page is already in
-         * the tree, for instance), it will return -EEXIST and we just fall
-         * back to 4k entries.
+         * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
+         * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page
+         * is already in the tree, for instance), it will return -EEXIST and
+         * we just fall back to 4k entries.
          */
         entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
         if (IS_ERR(entry))
@@ -1439,13 +1397,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 
         switch (iomap.type) {
         case IOMAP_MAPPED:
-                result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
+                result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
                 break;
         case IOMAP_UNWRITTEN:
         case IOMAP_HOLE:
                 if (WARN_ON_ONCE(write))
                         break;
-                result = dax_pmd_load_hole(vmf, &iomap, &entry);
+                result = dax_pmd_load_hole(vmf, &iomap, entry);
                 break;
         default:
                 WARN_ON_ONCE(1);
@@ -1468,7 +1426,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
                                 &iomap);
         }
  unlock_entry:
-        put_locked_mapping_entry(mapping, pgoff, entry);
+        put_locked_mapping_entry(mapping, pgoff);
  fallback:
         if (result == VM_FAULT_FALLBACK) {
                 split_huge_pmd(vma, vmf->pmd, vmf->address);