author     Dave Chinner <david@fromorbit.com>  2016-11-09 18:29:06 -0500
committer  Dave Chinner <david@fromorbit.com>  2016-11-09 18:29:06 -0500
commit     b649c42e25b7d1be86d9dade8674dec3d64138fb (patch)
tree       993b298c5de3d5adda6558f484769fe6ccf13400
parent     b77428b12b55437b28deae738d9ce8b2e0663b55 (diff)
parent     9484ab1bf4464faae695321dd4fa66365beda74e (diff)
Merge branch 'dax-4.10-iomap-pmd' into for-next
-rw-r--r--  fs/Kconfig             |   1
-rw-r--r--  fs/dax.c               | 828
-rw-r--r--  fs/ext2/file.c         |  35
-rw-r--r--  fs/ext4/inode.c        |   3
-rw-r--r--  fs/iomap.c             |   5
-rw-r--r--  fs/xfs/xfs_aops.c      |  26
-rw-r--r--  fs/xfs/xfs_aops.h      |   3
-rw-r--r--  fs/xfs/xfs_file.c      |  10
-rw-r--r--  include/linux/dax.h    |  58
-rw-r--r--  include/linux/iomap.h  |   1
-rw-r--r--  mm/filemap.c           |   5
11 files changed, 542 insertions(+), 433 deletions(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index 4bd03a2b0518..8e9e5f4104f4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -55,7 +55,6 @@ config FS_DAX_PMD
55 depends on FS_DAX 55 depends on FS_DAX
56 depends on ZONE_DEVICE 56 depends on ZONE_DEVICE
57 depends on TRANSPARENT_HUGEPAGE 57 depends on TRANSPARENT_HUGEPAGE
58 depends on BROKEN
59 58
60endif # BLOCK 59endif # BLOCK
61 60
diff --git a/fs/dax.c b/fs/dax.c
index 014defd2e744..28af41b9da3a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -34,25 +34,11 @@
34#include <linux/iomap.h> 34#include <linux/iomap.h>
35#include "internal.h" 35#include "internal.h"
36 36
37/*
38 * We use lowest available bit in exceptional entry for locking, other two
39 * bits to determine entry type. In total 3 special bits.
40 */
41#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
42#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
43#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
44#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
45#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
46#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
47#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
48 RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
49 RADIX_TREE_EXCEPTIONAL_ENTRY))
50
51/* We choose 4096 entries - same as per-zone page wait tables */ 37/* We choose 4096 entries - same as per-zone page wait tables */
52#define DAX_WAIT_TABLE_BITS 12 38#define DAX_WAIT_TABLE_BITS 12
53#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) 39#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
54 40
55wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; 41static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
56 42
57static int __init init_dax_wait_table(void) 43static int __init init_dax_wait_table(void)
58{ 44{
@@ -64,14 +50,6 @@ static int __init init_dax_wait_table(void)
64} 50}
65fs_initcall(init_dax_wait_table); 51fs_initcall(init_dax_wait_table);
66 52
67static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
68 pgoff_t index)
69{
70 unsigned long hash = hash_long((unsigned long)mapping ^ index,
71 DAX_WAIT_TABLE_BITS);
72 return wait_table + hash;
73}
74
75static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) 53static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
76{ 54{
77 struct request_queue *q = bdev->bd_queue; 55 struct request_queue *q = bdev->bd_queue;
@@ -98,6 +76,26 @@ static void dax_unmap_atomic(struct block_device *bdev,
98 blk_queue_exit(bdev->bd_queue); 76 blk_queue_exit(bdev->bd_queue);
99} 77}
100 78
79static int dax_is_pmd_entry(void *entry)
80{
81 return (unsigned long)entry & RADIX_DAX_PMD;
82}
83
84static int dax_is_pte_entry(void *entry)
85{
86 return !((unsigned long)entry & RADIX_DAX_PMD);
87}
88
89static int dax_is_zero_entry(void *entry)
90{
91 return (unsigned long)entry & RADIX_DAX_HZP;
92}
93
94static int dax_is_empty_entry(void *entry)
95{
96 return (unsigned long)entry & RADIX_DAX_EMPTY;
97}
98
101struct page *read_dax_sector(struct block_device *bdev, sector_t n) 99struct page *read_dax_sector(struct block_device *bdev, sector_t n)
102{ 100{
103 struct page *page = alloc_pages(GFP_KERNEL, 0); 101 struct page *page = alloc_pages(GFP_KERNEL, 0);
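For reference, the new dax_is_*() helpers above test flag bits held in the low bits of the exceptional radix tree entry. The authoritative definitions live in the include/linux/dax.h hunk of this series (truncated at the end of this page), so the values in this minimal sketch are assumptions for illustration only, not part of the patch:

        /* Illustrative layout (assumed): flag bits sit just above the
         * exceptional-entry bit, the sector is packed into the high bits.
         */
        #define RADIX_DAX_SHIFT        (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
        #define RADIX_DAX_ENTRY_LOCK   (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
        #define RADIX_DAX_PMD          (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
        #define RADIX_DAX_HZP          (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
        #define RADIX_DAX_EMPTY        (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))

        static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
        {
                return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
                                ((unsigned long)sector << RADIX_DAX_SHIFT) |
                                RADIX_DAX_ENTRY_LOCK);
        }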
@@ -123,19 +121,6 @@ static bool buffer_written(struct buffer_head *bh)
123 return buffer_mapped(bh) && !buffer_unwritten(bh); 121 return buffer_mapped(bh) && !buffer_unwritten(bh);
124} 122}
125 123
126/*
127 * When ext4 encounters a hole, it returns without modifying the buffer_head
128 * which means that we can't trust b_size. To cope with this, we set b_state
129 * to 0 before calling get_block and, if any bit is set, we know we can trust
130 * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
131 * and would save us time calling get_block repeatedly.
132 */
133static bool buffer_size_valid(struct buffer_head *bh)
134{
135 return bh->b_state != 0;
136}
137
138
139static sector_t to_sector(const struct buffer_head *bh, 124static sector_t to_sector(const struct buffer_head *bh,
140 const struct inode *inode) 125 const struct inode *inode)
141{ 126{
@@ -177,8 +162,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
177 rc = get_block(inode, block, bh, rw == WRITE); 162 rc = get_block(inode, block, bh, rw == WRITE);
178 if (rc) 163 if (rc)
179 break; 164 break;
180 if (!buffer_size_valid(bh))
181 bh->b_size = 1 << blkbits;
182 bh_max = pos - first + bh->b_size; 165 bh_max = pos - first + bh->b_size;
183 bdev = bh->b_bdev; 166 bdev = bh->b_bdev;
184 /* 167 /*
@@ -300,7 +283,7 @@ EXPORT_SYMBOL_GPL(dax_do_io);
300 */ 283 */
301struct exceptional_entry_key { 284struct exceptional_entry_key {
302 struct address_space *mapping; 285 struct address_space *mapping;
303 unsigned long index; 286 pgoff_t entry_start;
304}; 287};
305 288
306struct wait_exceptional_entry_queue { 289struct wait_exceptional_entry_queue {
@@ -308,6 +291,26 @@ struct wait_exceptional_entry_queue {
308 struct exceptional_entry_key key; 291 struct exceptional_entry_key key;
309}; 292};
310 293
294static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
295 pgoff_t index, void *entry, struct exceptional_entry_key *key)
296{
297 unsigned long hash;
298
299 /*
300 * If 'entry' is a PMD, align the 'index' that we use for the wait
301 * queue to the start of that PMD. This ensures that all offsets in
302 * the range covered by the PMD map to the same bit lock.
303 */
304 if (dax_is_pmd_entry(entry))
305 index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
306
307 key->mapping = mapping;
308 key->entry_start = index;
309
310 hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
311 return wait_table + hash;
312}
313
311static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, 314static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
312 int sync, void *keyp) 315 int sync, void *keyp)
313{ 316{
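The index masking in the new dax_entry_waitqueue() is what ties a whole 2MiB range to a single wait queue. A worked example, with shifts assumed for a common configuration (x86-64, 4k pages), not part of the patch:

        /* PAGE_SHIFT = 12, PMD_SHIFT = 21 assumed: the mask clears the low
         * 512-page "colour" bits, so every index inside one PMD hashes alike.
         */
        unsigned long index = 0x12345;                          /* a page inside some PMD */
        unsigned long mask  = ~((1UL << (21 - 12)) - 1);        /* == ~0x1ff              */
        unsigned long entry_start = index & mask;               /* 0x12200, PMD start     */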
@@ -316,7 +319,7 @@ static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
316 container_of(wait, struct wait_exceptional_entry_queue, wait); 319 container_of(wait, struct wait_exceptional_entry_queue, wait);
317 320
318 if (key->mapping != ewait->key.mapping || 321 if (key->mapping != ewait->key.mapping ||
319 key->index != ewait->key.index) 322 key->entry_start != ewait->key.entry_start)
320 return 0; 323 return 0;
321 return autoremove_wake_function(wait, mode, sync, NULL); 324 return autoremove_wake_function(wait, mode, sync, NULL);
322} 325}
@@ -372,24 +375,24 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
372static void *get_unlocked_mapping_entry(struct address_space *mapping, 375static void *get_unlocked_mapping_entry(struct address_space *mapping,
373 pgoff_t index, void ***slotp) 376 pgoff_t index, void ***slotp)
374{ 377{
375 void *ret, **slot; 378 void *entry, **slot;
376 struct wait_exceptional_entry_queue ewait; 379 struct wait_exceptional_entry_queue ewait;
377 wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); 380 wait_queue_head_t *wq;
378 381
379 init_wait(&ewait.wait); 382 init_wait(&ewait.wait);
380 ewait.wait.func = wake_exceptional_entry_func; 383 ewait.wait.func = wake_exceptional_entry_func;
381 ewait.key.mapping = mapping;
382 ewait.key.index = index;
383 384
384 for (;;) { 385 for (;;) {
385 ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, 386 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
386 &slot); 387 &slot);
387 if (!ret || !radix_tree_exceptional_entry(ret) || 388 if (!entry || !radix_tree_exceptional_entry(entry) ||
388 !slot_locked(mapping, slot)) { 389 !slot_locked(mapping, slot)) {
389 if (slotp) 390 if (slotp)
390 *slotp = slot; 391 *slotp = slot;
391 return ret; 392 return entry;
392 } 393 }
394
395 wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
393 prepare_to_wait_exclusive(wq, &ewait.wait, 396 prepare_to_wait_exclusive(wq, &ewait.wait,
394 TASK_UNINTERRUPTIBLE); 397 TASK_UNINTERRUPTIBLE);
395 spin_unlock_irq(&mapping->tree_lock); 398 spin_unlock_irq(&mapping->tree_lock);
@@ -399,52 +402,157 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
399 } 402 }
400} 403}
401 404
405static void put_locked_mapping_entry(struct address_space *mapping,
406 pgoff_t index, void *entry)
407{
408 if (!radix_tree_exceptional_entry(entry)) {
409 unlock_page(entry);
410 put_page(entry);
411 } else {
412 dax_unlock_mapping_entry(mapping, index);
413 }
414}
415
416/*
417 * Called when we are done with radix tree entry we looked up via
418 * get_unlocked_mapping_entry() and which we didn't lock in the end.
419 */
420static void put_unlocked_mapping_entry(struct address_space *mapping,
421 pgoff_t index, void *entry)
422{
423 if (!radix_tree_exceptional_entry(entry))
424 return;
425
426 /* We have to wake up next waiter for the radix tree entry lock */
427 dax_wake_mapping_entry_waiter(mapping, index, entry, false);
428}
429
402/* 430/*
403 * Find radix tree entry at given index. If it points to a page, return with 431 * Find radix tree entry at given index. If it points to a page, return with
404 * the page locked. If it points to the exceptional entry, return with the 432 * the page locked. If it points to the exceptional entry, return with the
405 * radix tree entry locked. If the radix tree doesn't contain given index, 433 * radix tree entry locked. If the radix tree doesn't contain given index,
406 * create empty exceptional entry for the index and return with it locked. 434 * create empty exceptional entry for the index and return with it locked.
407 * 435 *
436 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
437 * either return that locked entry or will return an error. This error will
438 * happen if there are any 4k entries (either zero pages or DAX entries)
439 * within the 2MiB range that we are requesting.
440 *
441 * We always favor 4k entries over 2MiB entries. There isn't a flow where we
442 * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
443 * insertion will fail if it finds any 4k entries already in the tree, and a
444 * 4k insertion will cause an existing 2MiB entry to be unmapped and
445 * downgraded to 4k entries. This happens for both 2MiB huge zero pages as
446 * well as 2MiB empty entries.
447 *
448 * The exception to this downgrade path is for 2MiB DAX PMD entries that have
449 * real storage backing them. We will leave these real 2MiB DAX entries in
450 * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
451 *
408 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For 452 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
409 * persistent memory the benefit is doubtful. We can add that later if we can 453 * persistent memory the benefit is doubtful. We can add that later if we can
410 * show it helps. 454 * show it helps.
411 */ 455 */
412static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index) 456static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
457 unsigned long size_flag)
413{ 458{
414 void *ret, **slot; 459 bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
460 void *entry, **slot;
415 461
416restart: 462restart:
417 spin_lock_irq(&mapping->tree_lock); 463 spin_lock_irq(&mapping->tree_lock);
418 ret = get_unlocked_mapping_entry(mapping, index, &slot); 464 entry = get_unlocked_mapping_entry(mapping, index, &slot);
465
466 if (entry) {
467 if (size_flag & RADIX_DAX_PMD) {
468 if (!radix_tree_exceptional_entry(entry) ||
469 dax_is_pte_entry(entry)) {
470 put_unlocked_mapping_entry(mapping, index,
471 entry);
472 entry = ERR_PTR(-EEXIST);
473 goto out_unlock;
474 }
475 } else { /* trying to grab a PTE entry */
476 if (radix_tree_exceptional_entry(entry) &&
477 dax_is_pmd_entry(entry) &&
478 (dax_is_zero_entry(entry) ||
479 dax_is_empty_entry(entry))) {
480 pmd_downgrade = true;
481 }
482 }
483 }
484
419 /* No entry for given index? Make sure radix tree is big enough. */ 485 /* No entry for given index? Make sure radix tree is big enough. */
420 if (!ret) { 486 if (!entry || pmd_downgrade) {
421 int err; 487 int err;
422 488
489 if (pmd_downgrade) {
490 /*
491 * Make sure 'entry' remains valid while we drop
492 * mapping->tree_lock.
493 */
494 entry = lock_slot(mapping, slot);
495 }
496
423 spin_unlock_irq(&mapping->tree_lock); 497 spin_unlock_irq(&mapping->tree_lock);
424 err = radix_tree_preload( 498 err = radix_tree_preload(
425 mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); 499 mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
426 if (err) 500 if (err) {
501 if (pmd_downgrade)
502 put_locked_mapping_entry(mapping, index, entry);
427 return ERR_PTR(err); 503 return ERR_PTR(err);
428 ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | 504 }
429 RADIX_DAX_ENTRY_LOCK); 505
506 /*
507 * Besides huge zero pages the only other thing that gets
508 * downgraded are empty entries which don't need to be
509 * unmapped.
510 */
511 if (pmd_downgrade && dax_is_zero_entry(entry))
512 unmap_mapping_range(mapping,
513 (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
514
430 spin_lock_irq(&mapping->tree_lock); 515 spin_lock_irq(&mapping->tree_lock);
431 err = radix_tree_insert(&mapping->page_tree, index, ret); 516
517 if (pmd_downgrade) {
518 radix_tree_delete(&mapping->page_tree, index);
519 mapping->nrexceptional--;
520 dax_wake_mapping_entry_waiter(mapping, index, entry,
521 true);
522 }
523
524 entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
525
526 err = __radix_tree_insert(&mapping->page_tree, index,
527 dax_radix_order(entry), entry);
432 radix_tree_preload_end(); 528 radix_tree_preload_end();
433 if (err) { 529 if (err) {
434 spin_unlock_irq(&mapping->tree_lock); 530 spin_unlock_irq(&mapping->tree_lock);
435 /* Someone already created the entry? */ 531 /*
436 if (err == -EEXIST) 532 * Someone already created the entry? This is a
533 * normal failure when inserting PMDs in a range
534 * that already contains PTEs. In that case we want
535 * to return -EEXIST immediately.
536 */
537 if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD))
437 goto restart; 538 goto restart;
539 /*
540 * Our insertion of a DAX PMD entry failed, most
541 * likely because it collided with a PTE sized entry
542 * at a different index in the PMD range. We haven't
543 * inserted anything into the radix tree and have no
544 * waiters to wake.
545 */
438 return ERR_PTR(err); 546 return ERR_PTR(err);
439 } 547 }
440 /* Good, we have inserted empty locked entry into the tree. */ 548 /* Good, we have inserted empty locked entry into the tree. */
441 mapping->nrexceptional++; 549 mapping->nrexceptional++;
442 spin_unlock_irq(&mapping->tree_lock); 550 spin_unlock_irq(&mapping->tree_lock);
443 return ret; 551 return entry;
444 } 552 }
445 /* Normal page in radix tree? */ 553 /* Normal page in radix tree? */
446 if (!radix_tree_exceptional_entry(ret)) { 554 if (!radix_tree_exceptional_entry(entry)) {
447 struct page *page = ret; 555 struct page *page = entry;
448 556
449 get_page(page); 557 get_page(page);
450 spin_unlock_irq(&mapping->tree_lock); 558 spin_unlock_irq(&mapping->tree_lock);
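As the new comment block above spells out, grab_mapping_entry() now takes a size flag and refuses to build a PMD entry over existing 4k entries. A short sketch of the caller pattern this enables, mirroring the dax_iomap_pmd_fault() hunk later in this diff (not part of the patch):

        entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
        if (IS_ERR(entry))                      /* e.g. -EEXIST: 4k entries occupy the range */
                return VM_FAULT_FALLBACK;       /* retry the fault with PTEs instead         */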
@@ -457,15 +565,26 @@ restart:
457 } 565 }
458 return page; 566 return page;
459 } 567 }
460 ret = lock_slot(mapping, slot); 568 entry = lock_slot(mapping, slot);
569 out_unlock:
461 spin_unlock_irq(&mapping->tree_lock); 570 spin_unlock_irq(&mapping->tree_lock);
462 return ret; 571 return entry;
463} 572}
464 573
574/*
575 * We do not necessarily hold the mapping->tree_lock when we call this
576 * function so it is possible that 'entry' is no longer a valid item in the
577 * radix tree. This is okay because all we really need to do is to find the
578 * correct waitqueue where tasks might be waiting for that old 'entry' and
579 * wake them.
580 */
465void dax_wake_mapping_entry_waiter(struct address_space *mapping, 581void dax_wake_mapping_entry_waiter(struct address_space *mapping,
466 pgoff_t index, bool wake_all) 582 pgoff_t index, void *entry, bool wake_all)
467{ 583{
468 wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); 584 struct exceptional_entry_key key;
585 wait_queue_head_t *wq;
586
587 wq = dax_entry_waitqueue(mapping, index, entry, &key);
469 588
470 /* 589 /*
471 * Checking for locked entry and prepare_to_wait_exclusive() happens 590 * Checking for locked entry and prepare_to_wait_exclusive() happens
@@ -473,54 +592,24 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
473 * So at this point all tasks that could have seen our entry locked 592 * So at this point all tasks that could have seen our entry locked
474 * must be in the waitqueue and the following check will see them. 593 * must be in the waitqueue and the following check will see them.
475 */ 594 */
476 if (waitqueue_active(wq)) { 595 if (waitqueue_active(wq))
477 struct exceptional_entry_key key;
478
479 key.mapping = mapping;
480 key.index = index;
481 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); 596 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
482 }
483} 597}
484 598
485void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) 599void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
486{ 600{
487 void *ret, **slot; 601 void *entry, **slot;
488 602
489 spin_lock_irq(&mapping->tree_lock); 603 spin_lock_irq(&mapping->tree_lock);
490 ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); 604 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
491 if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) || 605 if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
492 !slot_locked(mapping, slot))) { 606 !slot_locked(mapping, slot))) {
493 spin_unlock_irq(&mapping->tree_lock); 607 spin_unlock_irq(&mapping->tree_lock);
494 return; 608 return;
495 } 609 }
496 unlock_slot(mapping, slot); 610 unlock_slot(mapping, slot);
497 spin_unlock_irq(&mapping->tree_lock); 611 spin_unlock_irq(&mapping->tree_lock);
498 dax_wake_mapping_entry_waiter(mapping, index, false); 612 dax_wake_mapping_entry_waiter(mapping, index, entry, false);
499}
500
501static void put_locked_mapping_entry(struct address_space *mapping,
502 pgoff_t index, void *entry)
503{
504 if (!radix_tree_exceptional_entry(entry)) {
505 unlock_page(entry);
506 put_page(entry);
507 } else {
508 dax_unlock_mapping_entry(mapping, index);
509 }
510}
511
512/*
513 * Called when we are done with radix tree entry we looked up via
514 * get_unlocked_mapping_entry() and which we didn't lock in the end.
515 */
516static void put_unlocked_mapping_entry(struct address_space *mapping,
517 pgoff_t index, void *entry)
518{
519 if (!radix_tree_exceptional_entry(entry))
520 return;
521
522 /* We have to wake up next waiter for the radix tree entry lock */
523 dax_wake_mapping_entry_waiter(mapping, index, false);
524} 613}
525 614
526/* 615/*
@@ -547,7 +636,7 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
547 radix_tree_delete(&mapping->page_tree, index); 636 radix_tree_delete(&mapping->page_tree, index);
548 mapping->nrexceptional--; 637 mapping->nrexceptional--;
549 spin_unlock_irq(&mapping->tree_lock); 638 spin_unlock_irq(&mapping->tree_lock);
550 dax_wake_mapping_entry_waiter(mapping, index, true); 639 dax_wake_mapping_entry_waiter(mapping, index, entry, true);
551 640
552 return 1; 641 return 1;
553} 642}
@@ -600,11 +689,17 @@ static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size
600 return 0; 689 return 0;
601} 690}
602 691
603#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) 692/*
604 693 * By this point grab_mapping_entry() has ensured that we have a locked entry
694 * of the appropriate size so we don't have to worry about downgrading PMDs to
695 * PTEs. If we happen to be trying to insert a PTE and there is a PMD
696 * already in the tree, we will skip the insertion and just dirty the PMD as
697 * appropriate.
698 */
605static void *dax_insert_mapping_entry(struct address_space *mapping, 699static void *dax_insert_mapping_entry(struct address_space *mapping,
606 struct vm_fault *vmf, 700 struct vm_fault *vmf,
607 void *entry, sector_t sector) 701 void *entry, sector_t sector,
702 unsigned long flags)
608{ 703{
609 struct radix_tree_root *page_tree = &mapping->page_tree; 704 struct radix_tree_root *page_tree = &mapping->page_tree;
610 int error = 0; 705 int error = 0;
@@ -627,22 +722,35 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
627 error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); 722 error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
628 if (error) 723 if (error)
629 return ERR_PTR(error); 724 return ERR_PTR(error);
725 } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
726 /* replacing huge zero page with PMD block mapping */
727 unmap_mapping_range(mapping,
728 (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
630 } 729 }
631 730
632 spin_lock_irq(&mapping->tree_lock); 731 spin_lock_irq(&mapping->tree_lock);
633 new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) | 732 new_entry = dax_radix_locked_entry(sector, flags);
634 RADIX_DAX_ENTRY_LOCK); 733
635 if (hole_fill) { 734 if (hole_fill) {
636 __delete_from_page_cache(entry, NULL); 735 __delete_from_page_cache(entry, NULL);
637 /* Drop pagecache reference */ 736 /* Drop pagecache reference */
638 put_page(entry); 737 put_page(entry);
639 error = radix_tree_insert(page_tree, index, new_entry); 738 error = __radix_tree_insert(page_tree, index,
739 dax_radix_order(new_entry), new_entry);
640 if (error) { 740 if (error) {
641 new_entry = ERR_PTR(error); 741 new_entry = ERR_PTR(error);
642 goto unlock; 742 goto unlock;
643 } 743 }
644 mapping->nrexceptional++; 744 mapping->nrexceptional++;
645 } else { 745 } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
746 /*
747 * Only swap our new entry into the radix tree if the current
748 * entry is a zero page or an empty entry. If a normal PTE or
749 * PMD entry is already in the tree, we leave it alone. This
750 * means that if we are trying to insert a PTE and the
751 * existing entry is a PMD, we will just leave the PMD in the
752 * tree and dirty it if necessary.
753 */
646 void **slot; 754 void **slot;
647 void *ret; 755 void *ret;
648 756
@@ -672,7 +780,6 @@ static int dax_writeback_one(struct block_device *bdev,
672 struct address_space *mapping, pgoff_t index, void *entry) 780 struct address_space *mapping, pgoff_t index, void *entry)
673{ 781{
674 struct radix_tree_root *page_tree = &mapping->page_tree; 782 struct radix_tree_root *page_tree = &mapping->page_tree;
675 int type = RADIX_DAX_TYPE(entry);
676 struct radix_tree_node *node; 783 struct radix_tree_node *node;
677 struct blk_dax_ctl dax; 784 struct blk_dax_ctl dax;
678 void **slot; 785 void **slot;
@@ -693,13 +800,21 @@ static int dax_writeback_one(struct block_device *bdev,
693 if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) 800 if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
694 goto unlock; 801 goto unlock;
695 802
696 if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { 803 if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
804 dax_is_zero_entry(entry))) {
697 ret = -EIO; 805 ret = -EIO;
698 goto unlock; 806 goto unlock;
699 } 807 }
700 808
701 dax.sector = RADIX_DAX_SECTOR(entry); 809 /*
702 dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); 810 * Even if dax_writeback_mapping_range() was given a wbc->range_start
811 * in the middle of a PMD, the 'index' we are given will be aligned to
812 * the start index of the PMD, as will the sector we pull from
813 * 'entry'. This allows us to flush for PMD_SIZE and not have to
814 * worry about partial PMD writebacks.
815 */
816 dax.sector = dax_radix_sector(entry);
817 dax.size = PAGE_SIZE << dax_radix_order(entry);
703 spin_unlock_irq(&mapping->tree_lock); 818 spin_unlock_irq(&mapping->tree_lock);
704 819
705 /* 820 /*
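With the sector and size now derived from the entry itself, the flush size in dax_writeback_one() scales with the entry order. Worked numbers, assuming 4k pages and 2MiB PMDs (illustration only):

        /* dax.size = PAGE_SIZE << dax_radix_order(entry), with assumed shifts: */
        size_t pte_flush = 4096UL << 0;         /* PTE entry: order 0 -> 4KiB */
        size_t pmd_flush = 4096UL << 9;         /* PMD entry: order 9 -> 2MiB */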
@@ -738,12 +853,11 @@ int dax_writeback_mapping_range(struct address_space *mapping,
738 struct block_device *bdev, struct writeback_control *wbc) 853 struct block_device *bdev, struct writeback_control *wbc)
739{ 854{
740 struct inode *inode = mapping->host; 855 struct inode *inode = mapping->host;
741 pgoff_t start_index, end_index, pmd_index; 856 pgoff_t start_index, end_index;
742 pgoff_t indices[PAGEVEC_SIZE]; 857 pgoff_t indices[PAGEVEC_SIZE];
743 struct pagevec pvec; 858 struct pagevec pvec;
744 bool done = false; 859 bool done = false;
745 int i, ret = 0; 860 int i, ret = 0;
746 void *entry;
747 861
748 if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) 862 if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
749 return -EIO; 863 return -EIO;
@@ -753,15 +867,6 @@ int dax_writeback_mapping_range(struct address_space *mapping,
753 867
754 start_index = wbc->range_start >> PAGE_SHIFT; 868 start_index = wbc->range_start >> PAGE_SHIFT;
755 end_index = wbc->range_end >> PAGE_SHIFT; 869 end_index = wbc->range_end >> PAGE_SHIFT;
756 pmd_index = DAX_PMD_INDEX(start_index);
757
758 rcu_read_lock();
759 entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
760 rcu_read_unlock();
761
762 /* see if the start of our range is covered by a PMD entry */
763 if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
764 start_index = pmd_index;
765 870
766 tag_pages_for_writeback(mapping, start_index, end_index); 871 tag_pages_for_writeback(mapping, start_index, end_index);
767 872
@@ -806,7 +911,7 @@ static int dax_insert_mapping(struct address_space *mapping,
806 return PTR_ERR(dax.addr); 911 return PTR_ERR(dax.addr);
807 dax_unmap_atomic(bdev, &dax); 912 dax_unmap_atomic(bdev, &dax);
808 913
809 ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector); 914 ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
810 if (IS_ERR(ret)) 915 if (IS_ERR(ret))
811 return PTR_ERR(ret); 916 return PTR_ERR(ret);
812 *entryp = ret; 917 *entryp = ret;
@@ -853,7 +958,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
853 bh.b_bdev = inode->i_sb->s_bdev; 958 bh.b_bdev = inode->i_sb->s_bdev;
854 bh.b_size = PAGE_SIZE; 959 bh.b_size = PAGE_SIZE;
855 960
856 entry = grab_mapping_entry(mapping, vmf->pgoff); 961 entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
857 if (IS_ERR(entry)) { 962 if (IS_ERR(entry)) {
858 error = PTR_ERR(entry); 963 error = PTR_ERR(entry);
859 goto out; 964 goto out;
@@ -913,224 +1018,6 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
913} 1018}
914EXPORT_SYMBOL_GPL(dax_fault); 1019EXPORT_SYMBOL_GPL(dax_fault);
915 1020
916#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
917/*
918 * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
919 * more often than one might expect in the below function.
920 */
921#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
922
923static void __dax_dbg(struct buffer_head *bh, unsigned long address,
924 const char *reason, const char *fn)
925{
926 if (bh) {
927 char bname[BDEVNAME_SIZE];
928 bdevname(bh->b_bdev, bname);
929 pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
930 "length %zd fallback: %s\n", fn, current->comm,
931 address, bname, bh->b_state, (u64)bh->b_blocknr,
932 bh->b_size, reason);
933 } else {
934 pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
935 current->comm, address, reason);
936 }
937}
938
939#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
940
941/**
942 * dax_pmd_fault - handle a PMD fault on a DAX file
943 * @vma: The virtual memory area where the fault occurred
944 * @vmf: The description of the fault
945 * @get_block: The filesystem method used to translate file offsets to blocks
946 *
947 * When a page fault occurs, filesystems may call this helper in their
948 * pmd_fault handler for DAX files.
949 */
950int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
951 pmd_t *pmd, unsigned int flags, get_block_t get_block)
952{
953 struct file *file = vma->vm_file;
954 struct address_space *mapping = file->f_mapping;
955 struct inode *inode = mapping->host;
956 struct buffer_head bh;
957 unsigned blkbits = inode->i_blkbits;
958 unsigned long pmd_addr = address & PMD_MASK;
959 bool write = flags & FAULT_FLAG_WRITE;
960 struct block_device *bdev;
961 pgoff_t size, pgoff;
962 sector_t block;
963 int result = 0;
964 bool alloc = false;
965
966 /* dax pmd mappings require pfn_t_devmap() */
967 if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
968 return VM_FAULT_FALLBACK;
969
970 /* Fall back to PTEs if we're going to COW */
971 if (write && !(vma->vm_flags & VM_SHARED)) {
972 split_huge_pmd(vma, pmd, address);
973 dax_pmd_dbg(NULL, address, "cow write");
974 return VM_FAULT_FALLBACK;
975 }
976 /* If the PMD would extend outside the VMA */
977 if (pmd_addr < vma->vm_start) {
978 dax_pmd_dbg(NULL, address, "vma start unaligned");
979 return VM_FAULT_FALLBACK;
980 }
981 if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
982 dax_pmd_dbg(NULL, address, "vma end unaligned");
983 return VM_FAULT_FALLBACK;
984 }
985
986 pgoff = linear_page_index(vma, pmd_addr);
987 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
988 if (pgoff >= size)
989 return VM_FAULT_SIGBUS;
990 /* If the PMD would cover blocks out of the file */
991 if ((pgoff | PG_PMD_COLOUR) >= size) {
992 dax_pmd_dbg(NULL, address,
993 "offset + huge page size > file size");
994 return VM_FAULT_FALLBACK;
995 }
996
997 memset(&bh, 0, sizeof(bh));
998 bh.b_bdev = inode->i_sb->s_bdev;
999 block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
1000
1001 bh.b_size = PMD_SIZE;
1002
1003 if (get_block(inode, block, &bh, 0) != 0)
1004 return VM_FAULT_SIGBUS;
1005
1006 if (!buffer_mapped(&bh) && write) {
1007 if (get_block(inode, block, &bh, 1) != 0)
1008 return VM_FAULT_SIGBUS;
1009 alloc = true;
1010 WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
1011 }
1012
1013 bdev = bh.b_bdev;
1014
1015 /*
1016 * If the filesystem isn't willing to tell us the length of a hole,
1017 * just fall back to PTEs. Calling get_block 512 times in a loop
1018 * would be silly.
1019 */
1020 if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
1021 dax_pmd_dbg(&bh, address, "allocated block too small");
1022 return VM_FAULT_FALLBACK;
1023 }
1024
1025 /*
1026 * If we allocated new storage, make sure no process has any
1027 * zero pages covering this hole
1028 */
1029 if (alloc) {
1030 loff_t lstart = pgoff << PAGE_SHIFT;
1031 loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
1032
1033 truncate_pagecache_range(inode, lstart, lend);
1034 }
1035
1036 if (!write && !buffer_mapped(&bh)) {
1037 spinlock_t *ptl;
1038 pmd_t entry;
1039 struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);
1040
1041 if (unlikely(!zero_page)) {
1042 dax_pmd_dbg(&bh, address, "no zero page");
1043 goto fallback;
1044 }
1045
1046 ptl = pmd_lock(vma->vm_mm, pmd);
1047 if (!pmd_none(*pmd)) {
1048 spin_unlock(ptl);
1049 dax_pmd_dbg(&bh, address, "pmd already present");
1050 goto fallback;
1051 }
1052
1053 dev_dbg(part_to_dev(bdev->bd_part),
1054 "%s: %s addr: %lx pfn: <zero> sect: %llx\n",
1055 __func__, current->comm, address,
1056 (unsigned long long) to_sector(&bh, inode));
1057
1058 entry = mk_pmd(zero_page, vma->vm_page_prot);
1059 entry = pmd_mkhuge(entry);
1060 set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
1061 result = VM_FAULT_NOPAGE;
1062 spin_unlock(ptl);
1063 } else {
1064 struct blk_dax_ctl dax = {
1065 .sector = to_sector(&bh, inode),
1066 .size = PMD_SIZE,
1067 };
1068 long length = dax_map_atomic(bdev, &dax);
1069
1070 if (length < 0) {
1071 dax_pmd_dbg(&bh, address, "dax-error fallback");
1072 goto fallback;
1073 }
1074 if (length < PMD_SIZE) {
1075 dax_pmd_dbg(&bh, address, "dax-length too small");
1076 dax_unmap_atomic(bdev, &dax);
1077 goto fallback;
1078 }
1079 if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
1080 dax_pmd_dbg(&bh, address, "pfn unaligned");
1081 dax_unmap_atomic(bdev, &dax);
1082 goto fallback;
1083 }
1084
1085 if (!pfn_t_devmap(dax.pfn)) {
1086 dax_unmap_atomic(bdev, &dax);
1087 dax_pmd_dbg(&bh, address, "pfn not in memmap");
1088 goto fallback;
1089 }
1090 dax_unmap_atomic(bdev, &dax);
1091
1092 /*
1093 * For PTE faults we insert a radix tree entry for reads, and
1094 * leave it clean. Then on the first write we dirty the radix
1095 * tree entry via the dax_pfn_mkwrite() path. This sequence
1096 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
1097 * call into get_block() to translate the pgoff to a sector in
1098 * order to be able to create a new radix tree entry.
1099 *
1100 * The PMD path doesn't have an equivalent to
1101 * dax_pfn_mkwrite(), though, so for a read followed by a
1102 * write we traverse all the way through dax_pmd_fault()
1103 * twice. This means we can just skip inserting a radix tree
1104 * entry completely on the initial read and just wait until
1105 * the write to insert a dirty entry.
1106 */
1107 if (write) {
1108 /*
1109 * We should insert radix-tree entry and dirty it here.
1110 * For now this is broken...
1111 */
1112 }
1113
1114 dev_dbg(part_to_dev(bdev->bd_part),
1115 "%s: %s addr: %lx pfn: %lx sect: %llx\n",
1116 __func__, current->comm, address,
1117 pfn_t_to_pfn(dax.pfn),
1118 (unsigned long long) dax.sector);
1119 result |= vmf_insert_pfn_pmd(vma, address, pmd,
1120 dax.pfn, write);
1121 }
1122
1123 out:
1124 return result;
1125
1126 fallback:
1127 count_vm_event(THP_FAULT_FALLBACK);
1128 result = VM_FAULT_FALLBACK;
1129 goto out;
1130}
1131EXPORT_SYMBOL_GPL(dax_pmd_fault);
1132#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1133
1134/** 1021/**
1135 * dax_pfn_mkwrite - handle first write to DAX page 1022 * dax_pfn_mkwrite - handle first write to DAX page
1136 * @vma: The virtual memory area where the fault occurred 1023 * @vma: The virtual memory area where the fault occurred
@@ -1214,7 +1101,8 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
1214 /* Block boundary? Nothing to do */ 1101 /* Block boundary? Nothing to do */
1215 if (!length) 1102 if (!length)
1216 return 0; 1103 return 0;
1217 BUG_ON((offset + length) > PAGE_SIZE); 1104 if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
1105 return -EINVAL;
1218 1106
1219 memset(&bh, 0, sizeof(bh)); 1107 memset(&bh, 0, sizeof(bh));
1220 bh.b_bdev = inode->i_sb->s_bdev; 1108 bh.b_bdev = inode->i_sb->s_bdev;
@@ -1245,8 +1133,13 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
1245EXPORT_SYMBOL_GPL(dax_truncate_page); 1133EXPORT_SYMBOL_GPL(dax_truncate_page);
1246 1134
1247#ifdef CONFIG_FS_IOMAP 1135#ifdef CONFIG_FS_IOMAP
1136static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
1137{
1138 return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
1139}
1140
1248static loff_t 1141static loff_t
1249iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 1142dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1250 struct iomap *iomap) 1143 struct iomap *iomap)
1251{ 1144{
1252 struct iov_iter *iter = data; 1145 struct iov_iter *iter = data;
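The new dax_iomap_sector() helper is now the one place the pos-to-sector arithmetic happens. A worked example with values assumed purely for illustration:

        /* Assume: iomap->offset = 0x200000, iomap->blkno = 8192, pos = 0x201234 */
        loff_t aligned = pos & PAGE_MASK;               /* 0x201000                     */
        loff_t into    = aligned - iomap->offset;       /* 0x1000 bytes into the extent */
        sector_t sect  = iomap->blkno + (into >> 9);    /* 8192 + 8 = sector 8200       */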
@@ -1270,8 +1163,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1270 struct blk_dax_ctl dax = { 0 }; 1163 struct blk_dax_ctl dax = { 0 };
1271 ssize_t map_len; 1164 ssize_t map_len;
1272 1165
1273 dax.sector = iomap->blkno + 1166 dax.sector = dax_iomap_sector(iomap, pos);
1274 (((pos & PAGE_MASK) - iomap->offset) >> 9);
1275 dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; 1167 dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
1276 map_len = dax_map_atomic(iomap->bdev, &dax); 1168 map_len = dax_map_atomic(iomap->bdev, &dax);
1277 if (map_len < 0) { 1169 if (map_len < 0) {
@@ -1303,7 +1195,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1303} 1195}
1304 1196
1305/** 1197/**
1306 * iomap_dax_rw - Perform I/O to a DAX file 1198 * dax_iomap_rw - Perform I/O to a DAX file
1307 * @iocb: The control block for this I/O 1199 * @iocb: The control block for this I/O
1308 * @iter: The addresses to do I/O from or to 1200 * @iter: The addresses to do I/O from or to
1309 * @ops: iomap ops passed from the file system 1201 * @ops: iomap ops passed from the file system
@@ -1313,7 +1205,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1313 * and evicting any page cache pages in the region under I/O. 1205 * and evicting any page cache pages in the region under I/O.
1314 */ 1206 */
1315ssize_t 1207ssize_t
1316iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, 1208dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
1317 struct iomap_ops *ops) 1209 struct iomap_ops *ops)
1318{ 1210{
1319 struct address_space *mapping = iocb->ki_filp->f_mapping; 1211 struct address_space *mapping = iocb->ki_filp->f_mapping;
@@ -1343,7 +1235,7 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
1343 1235
1344 while (iov_iter_count(iter)) { 1236 while (iov_iter_count(iter)) {
1345 ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, 1237 ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
1346 iter, iomap_dax_actor); 1238 iter, dax_iomap_actor);
1347 if (ret <= 0) 1239 if (ret <= 0)
1348 break; 1240 break;
1349 pos += ret; 1241 pos += ret;
@@ -1353,10 +1245,10 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
1353 iocb->ki_pos += done; 1245 iocb->ki_pos += done;
1354 return done ? done : ret; 1246 return done ? done : ret;
1355} 1247}
1356EXPORT_SYMBOL_GPL(iomap_dax_rw); 1248EXPORT_SYMBOL_GPL(dax_iomap_rw);
1357 1249
1358/** 1250/**
1359 * iomap_dax_fault - handle a page fault on a DAX file 1251 * dax_iomap_fault - handle a page fault on a DAX file
1360 * @vma: The virtual memory area where the fault occurred 1252 * @vma: The virtual memory area where the fault occurred
1361 * @vmf: The description of the fault 1253 * @vmf: The description of the fault
1362 * @ops: iomap ops passed from the file system 1254 * @ops: iomap ops passed from the file system
@@ -1365,7 +1257,7 @@ EXPORT_SYMBOL_GPL(iomap_dax_rw);
1365 * or mkwrite handler for DAX files. Assumes the caller has done all the 1257 * or mkwrite handler for DAX files. Assumes the caller has done all the
1366 * necessary locking for the page fault to proceed successfully. 1258 * necessary locking for the page fault to proceed successfully.
1367 */ 1259 */
1368int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 1260int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1369 struct iomap_ops *ops) 1261 struct iomap_ops *ops)
1370{ 1262{
1371 struct address_space *mapping = vma->vm_file->f_mapping; 1263 struct address_space *mapping = vma->vm_file->f_mapping;
@@ -1374,8 +1266,9 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1374 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; 1266 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
1375 sector_t sector; 1267 sector_t sector;
1376 struct iomap iomap = { 0 }; 1268 struct iomap iomap = { 0 };
1377 unsigned flags = 0; 1269 unsigned flags = IOMAP_FAULT;
1378 int error, major = 0; 1270 int error, major = 0;
1271 int locked_status = 0;
1379 void *entry; 1272 void *entry;
1380 1273
1381 /* 1274 /*
@@ -1386,7 +1279,7 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1386 if (pos >= i_size_read(inode)) 1279 if (pos >= i_size_read(inode))
1387 return VM_FAULT_SIGBUS; 1280 return VM_FAULT_SIGBUS;
1388 1281
1389 entry = grab_mapping_entry(mapping, vmf->pgoff); 1282 entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
1390 if (IS_ERR(entry)) { 1283 if (IS_ERR(entry)) {
1391 error = PTR_ERR(entry); 1284 error = PTR_ERR(entry);
1392 goto out; 1285 goto out;
@@ -1405,10 +1298,10 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1405 goto unlock_entry; 1298 goto unlock_entry;
1406 if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { 1299 if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
1407 error = -EIO; /* fs corruption? */ 1300 error = -EIO; /* fs corruption? */
1408 goto unlock_entry; 1301 goto finish_iomap;
1409 } 1302 }
1410 1303
1411 sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9); 1304 sector = dax_iomap_sector(&iomap, pos);
1412 1305
1413 if (vmf->cow_page) { 1306 if (vmf->cow_page) {
1414 switch (iomap.type) { 1307 switch (iomap.type) {
@@ -1427,13 +1320,15 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1427 } 1320 }
1428 1321
1429 if (error) 1322 if (error)
1430 goto unlock_entry; 1323 goto finish_iomap;
1431 if (!radix_tree_exceptional_entry(entry)) { 1324 if (!radix_tree_exceptional_entry(entry)) {
1432 vmf->page = entry; 1325 vmf->page = entry;
1433 return VM_FAULT_LOCKED; 1326 locked_status = VM_FAULT_LOCKED;
1327 } else {
1328 vmf->entry = entry;
1329 locked_status = VM_FAULT_DAX_LOCKED;
1434 } 1330 }
1435 vmf->entry = entry; 1331 goto finish_iomap;
1436 return VM_FAULT_DAX_LOCKED;
1437 } 1332 }
1438 1333
1439 switch (iomap.type) { 1334 switch (iomap.type) {
@@ -1448,8 +1343,10 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1448 break; 1343 break;
1449 case IOMAP_UNWRITTEN: 1344 case IOMAP_UNWRITTEN:
1450 case IOMAP_HOLE: 1345 case IOMAP_HOLE:
1451 if (!(vmf->flags & FAULT_FLAG_WRITE)) 1346 if (!(vmf->flags & FAULT_FLAG_WRITE)) {
1452 return dax_load_hole(mapping, entry, vmf); 1347 locked_status = dax_load_hole(mapping, entry, vmf);
1348 break;
1349 }
1453 /*FALLTHRU*/ 1350 /*FALLTHRU*/
1454 default: 1351 default:
1455 WARN_ON_ONCE(1); 1352 WARN_ON_ONCE(1);
@@ -1457,15 +1354,218 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1457 break; 1354 break;
1458 } 1355 }
1459 1356
1357 finish_iomap:
1358 if (ops->iomap_end) {
1359 if (error) {
1360 /* keep previous error */
1361 ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
1362 &iomap);
1363 } else {
1364 error = ops->iomap_end(inode, pos, PAGE_SIZE,
1365 PAGE_SIZE, flags, &iomap);
1366 }
1367 }
1460 unlock_entry: 1368 unlock_entry:
1461 put_locked_mapping_entry(mapping, vmf->pgoff, entry); 1369 if (!locked_status || error)
1370 put_locked_mapping_entry(mapping, vmf->pgoff, entry);
1462 out: 1371 out:
1463 if (error == -ENOMEM) 1372 if (error == -ENOMEM)
1464 return VM_FAULT_OOM | major; 1373 return VM_FAULT_OOM | major;
1465 /* -EBUSY is fine, somebody else faulted on the same PTE */ 1374 /* -EBUSY is fine, somebody else faulted on the same PTE */
1466 if (error < 0 && error != -EBUSY) 1375 if (error < 0 && error != -EBUSY)
1467 return VM_FAULT_SIGBUS | major; 1376 return VM_FAULT_SIGBUS | major;
1377 if (locked_status) {
1378 WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
1379 return locked_status;
1380 }
1468 return VM_FAULT_NOPAGE | major; 1381 return VM_FAULT_NOPAGE | major;
1469} 1382}
1470EXPORT_SYMBOL_GPL(iomap_dax_fault); 1383EXPORT_SYMBOL_GPL(dax_iomap_fault);
1384
1385#ifdef CONFIG_FS_DAX_PMD
1386/*
1387 * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
1388 * more often than one might expect in the below functions.
1389 */
1390#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
1391
1392static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
1393 struct vm_fault *vmf, unsigned long address,
1394 struct iomap *iomap, loff_t pos, bool write, void **entryp)
1395{
1396 struct address_space *mapping = vma->vm_file->f_mapping;
1397 struct block_device *bdev = iomap->bdev;
1398 struct blk_dax_ctl dax = {
1399 .sector = dax_iomap_sector(iomap, pos),
1400 .size = PMD_SIZE,
1401 };
1402 long length = dax_map_atomic(bdev, &dax);
1403 void *ret;
1404
1405 if (length < 0) /* dax_map_atomic() failed */
1406 return VM_FAULT_FALLBACK;
1407 if (length < PMD_SIZE)
1408 goto unmap_fallback;
1409 if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
1410 goto unmap_fallback;
1411 if (!pfn_t_devmap(dax.pfn))
1412 goto unmap_fallback;
1413
1414 dax_unmap_atomic(bdev, &dax);
1415
1416 ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
1417 RADIX_DAX_PMD);
1418 if (IS_ERR(ret))
1419 return VM_FAULT_FALLBACK;
1420 *entryp = ret;
1421
1422 return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
1423
1424 unmap_fallback:
1425 dax_unmap_atomic(bdev, &dax);
1426 return VM_FAULT_FALLBACK;
1427}
1428
1429static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
1430 struct vm_fault *vmf, unsigned long address,
1431 struct iomap *iomap, void **entryp)
1432{
1433 struct address_space *mapping = vma->vm_file->f_mapping;
1434 unsigned long pmd_addr = address & PMD_MASK;
1435 struct page *zero_page;
1436 spinlock_t *ptl;
1437 pmd_t pmd_entry;
1438 void *ret;
1439
1440 zero_page = mm_get_huge_zero_page(vma->vm_mm);
1441
1442 if (unlikely(!zero_page))
1443 return VM_FAULT_FALLBACK;
1444
1445 ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
1446 RADIX_DAX_PMD | RADIX_DAX_HZP);
1447 if (IS_ERR(ret))
1448 return VM_FAULT_FALLBACK;
1449 *entryp = ret;
1450
1451 ptl = pmd_lock(vma->vm_mm, pmd);
1452 if (!pmd_none(*pmd)) {
1453 spin_unlock(ptl);
1454 return VM_FAULT_FALLBACK;
1455 }
1456
1457 pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
1458 pmd_entry = pmd_mkhuge(pmd_entry);
1459 set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
1460 spin_unlock(ptl);
1461 return VM_FAULT_NOPAGE;
1462}
1463
1464int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
1465 pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
1466{
1467 struct address_space *mapping = vma->vm_file->f_mapping;
1468 unsigned long pmd_addr = address & PMD_MASK;
1469 bool write = flags & FAULT_FLAG_WRITE;
1470 unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
1471 struct inode *inode = mapping->host;
1472 int result = VM_FAULT_FALLBACK;
1473 struct iomap iomap = { 0 };
1474 pgoff_t max_pgoff, pgoff;
1475 struct vm_fault vmf;
1476 void *entry;
1477 loff_t pos;
1478 int error;
1479
1480 /* Fall back to PTEs if we're going to COW */
1481 if (write && !(vma->vm_flags & VM_SHARED))
1482 goto fallback;
1483
1484 /* If the PMD would extend outside the VMA */
1485 if (pmd_addr < vma->vm_start)
1486 goto fallback;
1487 if ((pmd_addr + PMD_SIZE) > vma->vm_end)
1488 goto fallback;
1489
1490 /*
1491 * Check whether offset isn't beyond end of file now. Caller is
1492 * supposed to hold locks serializing us with truncate / punch hole so
1493 * this is a reliable test.
1494 */
1495 pgoff = linear_page_index(vma, pmd_addr);
1496 max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
1497
1498 if (pgoff > max_pgoff)
1499 return VM_FAULT_SIGBUS;
1500
1501 /* If the PMD would extend beyond the file size */
1502 if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
1503 goto fallback;
1504
1505 /*
1506 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
1507 * PMD or a HZP entry. If it can't (because a 4k page is already in
1508 * the tree, for instance), it will return -EEXIST and we just fall
1509 * back to 4k entries.
1510 */
1511 entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
1512 if (IS_ERR(entry))
1513 goto fallback;
1514
1515 /*
1516 * Note that we don't use iomap_apply here. We aren't doing I/O, only
1517 * setting up a mapping, so really we're using iomap_begin() as a way
1518 * to look up our filesystem block.
1519 */
1520 pos = (loff_t)pgoff << PAGE_SHIFT;
1521 error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
1522 if (error)
1523 goto unlock_entry;
1524 if (iomap.offset + iomap.length < pos + PMD_SIZE)
1525 goto finish_iomap;
1526
1527 vmf.pgoff = pgoff;
1528 vmf.flags = flags;
1529 vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
1530
1531 switch (iomap.type) {
1532 case IOMAP_MAPPED:
1533 result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
1534 &iomap, pos, write, &entry);
1535 break;
1536 case IOMAP_UNWRITTEN:
1537 case IOMAP_HOLE:
1538 if (WARN_ON_ONCE(write))
1539 goto finish_iomap;
1540 result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
1541 &entry);
1542 break;
1543 default:
1544 WARN_ON_ONCE(1);
1545 break;
1546 }
1547
1548 finish_iomap:
1549 if (ops->iomap_end) {
1550 if (result == VM_FAULT_FALLBACK) {
1551 ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
1552 &iomap);
1553 } else {
1554 error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
1555 iomap_flags, &iomap);
1556 if (error)
1557 result = VM_FAULT_FALLBACK;
1558 }
1559 }
1560 unlock_entry:
1561 put_locked_mapping_entry(mapping, pgoff, entry);
1562 fallback:
1563 if (result == VM_FAULT_FALLBACK) {
1564 split_huge_pmd(vma, pmd, address);
1565 count_vm_event(THP_FAULT_FALLBACK);
1566 }
1567 return result;
1568}
1569EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
1570#endif /* CONFIG_FS_DAX_PMD */
1471#endif /* CONFIG_FS_IOMAP */ 1571#endif /* CONFIG_FS_IOMAP */
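The size checks in dax_iomap_pmd_fault() above decide between a 2MiB and a 4k fault. A worked example, assuming 4k pages so PG_PMD_COLOUR == 511 (illustration, not part of the patch):

        pgoff_t max_pgoff = (3 * 1024 * 1024 - 1) >> 12;        /* 767: last page of a 3MiB file   */
        pgoff_t pgoff     = 600;                                /* fault lands in pages 512..1023  */
        bool fallback     = (pgoff | 511) > max_pgoff;          /* 1023 > 767 -> fall back to PTEs */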
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index a0e1478dfd04..b0f241528a30 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -38,7 +38,7 @@ static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
38 return 0; /* skip atime */ 38 return 0; /* skip atime */
39 39
40 inode_lock_shared(inode); 40 inode_lock_shared(inode);
41 ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops); 41 ret = dax_iomap_rw(iocb, to, &ext2_iomap_ops);
42 inode_unlock_shared(inode); 42 inode_unlock_shared(inode);
43 43
44 file_accessed(iocb->ki_filp); 44 file_accessed(iocb->ki_filp);
@@ -62,7 +62,7 @@ static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
62 if (ret) 62 if (ret)
63 goto out_unlock; 63 goto out_unlock;
64 64
65 ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops); 65 ret = dax_iomap_rw(iocb, from, &ext2_iomap_ops);
66 if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { 66 if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
67 i_size_write(inode, iocb->ki_pos); 67 i_size_write(inode, iocb->ki_pos);
68 mark_inode_dirty(inode); 68 mark_inode_dirty(inode);
@@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
99 } 99 }
100 down_read(&ei->dax_sem); 100 down_read(&ei->dax_sem);
101 101
102 ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops); 102 ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops);
103 103
104 up_read(&ei->dax_sem); 104 up_read(&ei->dax_sem);
105 if (vmf->flags & FAULT_FLAG_WRITE) 105 if (vmf->flags & FAULT_FLAG_WRITE)
@@ -107,27 +107,6 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
107 return ret; 107 return ret;
108} 108}
109 109
110static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
111 pmd_t *pmd, unsigned int flags)
112{
113 struct inode *inode = file_inode(vma->vm_file);
114 struct ext2_inode_info *ei = EXT2_I(inode);
115 int ret;
116
117 if (flags & FAULT_FLAG_WRITE) {
118 sb_start_pagefault(inode->i_sb);
119 file_update_time(vma->vm_file);
120 }
121 down_read(&ei->dax_sem);
122
123 ret = dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block);
124
125 up_read(&ei->dax_sem);
126 if (flags & FAULT_FLAG_WRITE)
127 sb_end_pagefault(inode->i_sb);
128 return ret;
129}
130
131static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, 110static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
132 struct vm_fault *vmf) 111 struct vm_fault *vmf)
133{ 112{
@@ -154,7 +133,11 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
154 133
155static const struct vm_operations_struct ext2_dax_vm_ops = { 134static const struct vm_operations_struct ext2_dax_vm_ops = {
156 .fault = ext2_dax_fault, 135 .fault = ext2_dax_fault,
157 .pmd_fault = ext2_dax_pmd_fault, 136 /*
137 * .pmd_fault is not supported for DAX because allocation in ext2
138 * cannot be reliably aligned to huge page sizes and so pmd faults
139 * will always fail and fail back to regular faults.
140 */
158 .page_mkwrite = ext2_dax_fault, 141 .page_mkwrite = ext2_dax_fault,
159 .pfn_mkwrite = ext2_dax_pfn_mkwrite, 142 .pfn_mkwrite = ext2_dax_pfn_mkwrite,
160}; 143};
@@ -166,7 +149,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
166 149
167 file_accessed(file); 150 file_accessed(file);
168 vma->vm_ops = &ext2_dax_vm_ops; 151 vma->vm_ops = &ext2_dax_vm_ops;
169 vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; 152 vma->vm_flags |= VM_MIXEDMAP;
170 return 0; 153 return 0;
171} 154}
172#else 155#else
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9c064727ed62..3d58b2b477e8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -767,6 +767,9 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
767 ext4_update_bh_state(bh, map.m_flags); 767 ext4_update_bh_state(bh, map.m_flags);
768 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 768 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
769 ret = 0; 769 ret = 0;
770 } else if (ret == 0) {
771 /* hole case, need to fill in bh->b_size */
772 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
770 } 773 }
771 return ret; 774 return ret;
772} 775}
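The ext4 hunk above fills bh->b_size for holes, which is what allowed the old buffer_size_valid() workaround in fs/dax.c to be deleted. A hedged sketch of what a get_block-style caller can now rely on (the callback name here is hypothetical):

        memset(&bh, 0, sizeof(bh));
        bh.b_size = PMD_SIZE;                   /* ask about up to 2MiB          */
        get_dax_block(inode, block, &bh, 0);    /* any get_block-style callback  */
        if (!buffer_mapped(&bh))
                hole_len = bh.b_size;           /* now meaningful for holes too  */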
diff --git a/fs/iomap.c b/fs/iomap.c
index a8ee8c33ca78..13dd413b2b9c 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -467,8 +467,9 @@ int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
467 467
468 offset = page_offset(page); 468 offset = page_offset(page);
469 while (length > 0) { 469 while (length > 0) {
470 ret = iomap_apply(inode, offset, length, IOMAP_WRITE, 470 ret = iomap_apply(inode, offset, length,
471 ops, page, iomap_page_mkwrite_actor); 471 IOMAP_WRITE | IOMAP_FAULT, ops, page,
472 iomap_page_mkwrite_actor);
472 if (unlikely(ret <= 0)) 473 if (unlikely(ret <= 0))
473 goto out_unlock; 474 goto out_unlock;
474 offset += ret; 475 offset += ret;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3e57a56cf829..561cf1456c6c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1298,8 +1298,7 @@ __xfs_get_blocks(
1298 sector_t iblock, 1298 sector_t iblock,
1299 struct buffer_head *bh_result, 1299 struct buffer_head *bh_result,
1300 int create, 1300 int create,
1301 bool direct, 1301 bool direct)
1302 bool dax_fault)
1303{ 1302{
1304 struct xfs_inode *ip = XFS_I(inode); 1303 struct xfs_inode *ip = XFS_I(inode);
1305 struct xfs_mount *mp = ip->i_mount; 1304 struct xfs_mount *mp = ip->i_mount;
@@ -1420,13 +1419,8 @@ __xfs_get_blocks(
1420 if (ISUNWRITTEN(&imap)) 1419 if (ISUNWRITTEN(&imap))
1421 set_buffer_unwritten(bh_result); 1420 set_buffer_unwritten(bh_result);
1422 /* direct IO needs special help */ 1421 /* direct IO needs special help */
1423 if (create) { 1422 if (create)
1424 if (dax_fault) 1423 xfs_map_direct(inode, bh_result, &imap, offset, is_cow);
1425 ASSERT(!ISUNWRITTEN(&imap));
1426 else
1427 xfs_map_direct(inode, bh_result, &imap, offset,
1428 is_cow);
1429 }
1430 } 1424 }
1431 1425
1432 /* 1426 /*
@@ -1466,7 +1460,7 @@ xfs_get_blocks(
1466 struct buffer_head *bh_result, 1460 struct buffer_head *bh_result,
1467 int create) 1461 int create)
1468{ 1462{
1469 return __xfs_get_blocks(inode, iblock, bh_result, create, false, false); 1463 return __xfs_get_blocks(inode, iblock, bh_result, create, false);
1470} 1464}
1471 1465
1472int 1466int
@@ -1476,17 +1470,7 @@ xfs_get_blocks_direct(
1476 struct buffer_head *bh_result, 1470 struct buffer_head *bh_result,
1477 int create) 1471 int create)
1478{ 1472{
1479 return __xfs_get_blocks(inode, iblock, bh_result, create, true, false); 1473 return __xfs_get_blocks(inode, iblock, bh_result, create, true);
1480}
1481
1482int
1483xfs_get_blocks_dax_fault(
1484 struct inode *inode,
1485 sector_t iblock,
1486 struct buffer_head *bh_result,
1487 int create)
1488{
1489 return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
1490} 1474}
1491 1475
1492/* 1476/*
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index b3c6634f9518..34dc00dfb91d 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -59,9 +59,6 @@ int xfs_get_blocks(struct inode *inode, sector_t offset,
59 struct buffer_head *map_bh, int create); 59 struct buffer_head *map_bh, int create);
60int xfs_get_blocks_direct(struct inode *inode, sector_t offset, 60int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
61 struct buffer_head *map_bh, int create); 61 struct buffer_head *map_bh, int create);
62int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
63 struct buffer_head *map_bh, int create);
64
65int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, 62int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
66 ssize_t size, void *private); 63 ssize_t size, void *private);
67int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); 64int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 6e4f7f900fea..d818c160451f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -318,7 +318,7 @@ xfs_file_dax_read(
318 return 0; /* skip atime */ 318 return 0; /* skip atime */
319 319
320 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); 320 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
321 ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops); 321 ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
322 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); 322 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
323 323
324 file_accessed(iocb->ki_filp); 324 file_accessed(iocb->ki_filp);
@@ -653,7 +653,7 @@ xfs_file_dax_write(
653 653
654 trace_xfs_file_dax_write(ip, count, pos); 654 trace_xfs_file_dax_write(ip, count, pos);
655 655
656 ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops); 656 ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
657 if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { 657 if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
658 i_size_write(inode, iocb->ki_pos); 658 i_size_write(inode, iocb->ki_pos);
659 error = xfs_setfilesize(ip, pos, ret); 659 error = xfs_setfilesize(ip, pos, ret);
@@ -1474,7 +1474,7 @@ xfs_filemap_page_mkwrite(
1474 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1474 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1475 1475
1476 if (IS_DAX(inode)) { 1476 if (IS_DAX(inode)) {
1477 ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); 1477 ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
1478 } else { 1478 } else {
1479 ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); 1479 ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
1480 ret = block_page_mkwrite_return(ret); 1480 ret = block_page_mkwrite_return(ret);
@@ -1508,7 +1508,7 @@ xfs_filemap_fault(
1508 * changes to xfs_get_blocks_direct() to map unwritten extent 1508 * changes to xfs_get_blocks_direct() to map unwritten extent
1509 * ioend for conversion on read-only mappings. 1509 * ioend for conversion on read-only mappings.
1510 */ 1510 */
1511 ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); 1511 ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
1512 } else 1512 } else
1513 ret = filemap_fault(vma, vmf); 1513 ret = filemap_fault(vma, vmf);
1514 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1514 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1545,7 +1545,7 @@ xfs_filemap_pmd_fault(
1545 } 1545 }
1546 1546
1547 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1547 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1548 ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); 1548 ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops);
1549 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1549 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1550 1550
1551 if (flags & FAULT_FLAG_WRITE) 1551 if (flags & FAULT_FLAG_WRITE)
diff --git a/include/linux/dax.h b/include/linux/dax.h
index add6c4bc568f..8d1a5c47945f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -8,21 +8,46 @@
8 8
9struct iomap_ops; 9struct iomap_ops;
10 10
11/* We use lowest available exceptional entry bit for locking */ 11/*
12 * We use lowest available bit in exceptional entry for locking, one bit for
13 * the entry size (PMD) and two more to tell us if the entry is a huge zero
14 * page (HZP) or an empty entry that is just used for locking. In total four
15 * special bits.
16 *
17 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
18 * EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
19 * block allocation.
20 */
21#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
12#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) 22#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
23#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
24#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
25#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
13 26
14ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, 27static inline unsigned long dax_radix_sector(void *entry)
28{
29 return (unsigned long)entry >> RADIX_DAX_SHIFT;
30}
31
32static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
33{
34 return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
35 ((unsigned long)sector << RADIX_DAX_SHIFT) |
36 RADIX_DAX_ENTRY_LOCK);
37}
38
39ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
15 struct iomap_ops *ops); 40 struct iomap_ops *ops);
16ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, 41ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
17 get_block_t, dio_iodone_t, int flags); 42 get_block_t, dio_iodone_t, int flags);
18int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); 43int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
19int dax_truncate_page(struct inode *, loff_t from, get_block_t); 44int dax_truncate_page(struct inode *, loff_t from, get_block_t);
20int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 45int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
21 struct iomap_ops *ops); 46 struct iomap_ops *ops);
22int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); 47int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
23int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); 48int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
24void dax_wake_mapping_entry_waiter(struct address_space *mapping, 49void dax_wake_mapping_entry_waiter(struct address_space *mapping,
25 pgoff_t index, bool wake_all); 50 pgoff_t index, void *entry, bool wake_all);
26 51
27#ifdef CONFIG_FS_DAX 52#ifdef CONFIG_FS_DAX
28struct page *read_dax_sector(struct block_device *bdev, sector_t n); 53struct page *read_dax_sector(struct block_device *bdev, sector_t n);
@@ -48,15 +73,32 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
48} 73}
49#endif 74#endif
50 75
51#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
52int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
53 unsigned int flags, get_block_t);
54#else
55static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, 76static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
56 pmd_t *pmd, unsigned int flags, get_block_t gb) 77 pmd_t *pmd, unsigned int flags, get_block_t gb)
57{ 78{
58 return VM_FAULT_FALLBACK; 79 return VM_FAULT_FALLBACK;
59} 80}
81
82#ifdef CONFIG_FS_DAX_PMD
83static inline unsigned int dax_radix_order(void *entry)
84{
85 if ((unsigned long)entry & RADIX_DAX_PMD)
86 return PMD_SHIFT - PAGE_SHIFT;
87 return 0;
88}
89int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
90 pmd_t *pmd, unsigned int flags, struct iomap_ops *ops);
91#else
92static inline unsigned int dax_radix_order(void *entry)
93{
94 return 0;
95}
96static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
97 unsigned long address, pmd_t *pmd, unsigned int flags,
98 struct iomap_ops *ops)
99{
100 return VM_FAULT_FALLBACK;
101}
60#endif 102#endif
61int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); 103int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
62#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) 104#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb)
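
The comment block and helpers added to dax.h above define how a DAX radix tree entry packs a sector number, the lock bit, and the PMD/HZP/EMPTY type bits into one exceptional entry. The standalone sketch below mirrors those macros to show the round trip; it assumes RADIX_TREE_EXCEPTIONAL_ENTRY and RADIX_TREE_EXCEPTIONAL_SHIFT are both 2, as in radix-tree.h of this era, and models sector_t as unsigned long, so treat it as an illustration rather than kernel code.

/*
 * Standalone sketch of the DAX radix entry encoding documented above.
 * The RADIX_TREE_EXCEPTIONAL_* values (2) are an assumption hard-coded
 * here; everything else is copied from the dax.h hunk.
 */
#include <assert.h>
#include <stdio.h>

#define RADIX_TREE_EXCEPTIONAL_ENTRY	2	/* assumed value */
#define RADIX_TREE_EXCEPTIONAL_SHIFT	2	/* assumed value */

#define RADIX_DAX_SHIFT		(RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
#define RADIX_DAX_ENTRY_LOCK	(1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
#define RADIX_DAX_PMD		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_HZP		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_EMPTY		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))

static void *dax_radix_locked_entry(unsigned long sector, unsigned long flags)
{
	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
			(sector << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
}

static unsigned long dax_radix_sector(void *entry)
{
	return (unsigned long)entry >> RADIX_DAX_SHIFT;
}

int main(void)
{
	/* a locked PMD entry for sector 0x1234 round-trips its sector ... */
	void *pmd = dax_radix_locked_entry(0x1234, RADIX_DAX_PMD);

	assert(dax_radix_sector(pmd) == 0x1234);
	assert((unsigned long)pmd & RADIX_DAX_PMD);
	assert((unsigned long)pmd & RADIX_DAX_ENTRY_LOCK);

	/* ... and a locked empty entry (sector 0 plus the EMPTY flag) is
	 * what the mm/filemap.c hunk further down compares against. */
	printf("locked empty entry = %#lx\n",
	       (unsigned long)dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
	return 0;
}
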
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 7892f55a1866..f185156de74d 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -49,6 +49,7 @@ struct iomap {
49#define IOMAP_WRITE (1 << 0) /* writing, must allocate blocks */ 49#define IOMAP_WRITE (1 << 0) /* writing, must allocate blocks */
50#define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ 50#define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */
51#define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ 51#define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */
52#define IOMAP_FAULT (1 << 3) /* mapping for page fault */
52 53
53struct iomap_ops { 54struct iomap_ops {
54 /* 55 /*
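
The new IOMAP_FAULT flag defined above is what iomap_page_mkwrite() now ORs into IOMAP_WRITE in the fs/iomap.c hunk earlier in this diff, letting an ->iomap_begin implementation tell a page-fault-driven allocation apart from an ordinary write. The sketch below copies the flag values from the hunk and uses a hypothetical user-space stand-in for the callback purely to show the flag test.

/*
 * Flag values copied from include/linux/iomap.h above; the callback is
 * a hypothetical stand-in, not an actual ->iomap_begin implementation.
 */
#include <stdio.h>

#define IOMAP_WRITE	(1 << 0)	/* writing, must allocate blocks */
#define IOMAP_ZERO	(1 << 1)	/* zeroing operation, may skip holes */
#define IOMAP_REPORT	(1 << 2)	/* report extent status, e.g. FIEMAP */
#define IOMAP_FAULT	(1 << 3)	/* mapping for page fault */

static void example_iomap_begin(unsigned int flags)
{
	if ((flags & IOMAP_WRITE) && (flags & IOMAP_FAULT))
		printf("fault-time write mapping\n");
	else if (flags & IOMAP_WRITE)
		printf("regular write mapping\n");
	else
		printf("read-only mapping request\n");
}

int main(void)
{
	example_iomap_begin(IOMAP_WRITE | IOMAP_FAULT);	/* page_mkwrite path */
	example_iomap_begin(IOMAP_WRITE);		/* buffered write path */
	return 0;
}
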
diff --git a/mm/filemap.c b/mm/filemap.c
index 849f459ad078..00ab94a882de 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -137,13 +137,12 @@ static int page_cache_tree_insert(struct address_space *mapping,
137 } else { 137 } else {
138 /* DAX can replace empty locked entry with a hole */ 138 /* DAX can replace empty locked entry with a hole */
139 WARN_ON_ONCE(p != 139 WARN_ON_ONCE(p !=
140 (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | 140 dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
141 RADIX_DAX_ENTRY_LOCK));
142 /* DAX accounts exceptional entries as normal pages */ 141 /* DAX accounts exceptional entries as normal pages */
143 if (node) 142 if (node)
144 workingset_node_pages_dec(node); 143 workingset_node_pages_dec(node);
145 /* Wakeup waiters for exceptional entry lock */ 144 /* Wakeup waiters for exceptional entry lock */
146 dax_wake_mapping_entry_waiter(mapping, page->index, 145 dax_wake_mapping_entry_waiter(mapping, page->index, p,
147 false); 146 false);
148 } 147 }
149 } 148 }