author		Ross Zwisler <ross.zwisler@linux.intel.com>	2016-01-22 18:10:47 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-22 20:02:18 -0500
commit		9973c98ecfda3a1dfcab981665b5f1e39bcde64a (patch)
tree		c1bfc72b857e6fc915e6a2ef86481e5c472da7f6
parent		7e7f774984cd88c45c18e7ffaf0256c3e9118043 (diff)
dax: add support for fsync/sync
To properly handle fsync/msync in an efficient way DAX needs to track dirty
pages so it is able to flush them durably to media on demand.

The tracking of dirty pages is done via the radix tree in struct
address_space.  This radix tree is already used by the page writeback
infrastructure for tracking dirty pages associated with an open file, and it
already has support for exceptional (non struct page*) entries.  We build
upon these features to add exceptional entries to the radix tree for DAX
dirty PMD or PTE pages at fault time.

[dan.j.williams@intel.com: fix dax_pmd_dbg build warning]
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jan Kara <jack@suse.com>
Cc: Jeff Layton <jlayton@poochiereds.net>
Cc: Matthew Wilcox <willy@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	fs/dax.c		274
-rw-r--r--	include/linux/dax.h	2
-rw-r--r--	mm/filemap.c		6
3 files changed, 266 insertions, 16 deletions
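
Not part of the patch, but a minimal userspace sketch of the behavior this commit enables: the program stores through a DAX mmap() and then calls msync(), and it is this msync()/fsync() path that now finds the dirty radix tree entries and flushes them to media. The path /mnt/pmem/file, the 4 KiB length, and the assumption that the file is at least one page long are illustrative only.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* A file on a DAX-mounted filesystem; path and size are assumptions. */
	int fd = open("/mnt/pmem/file", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Assumes the file is at least one 4 KiB page long. */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}

	/* Store through the DAX mapping; the write fault records a dirty
	 * radix tree entry for this page. */
	strcpy(p, "hello, pmem");

	/* With this patch, msync()/fsync() reaches
	 * dax_writeback_mapping_range(), which walks the tagged entries
	 * and writes the corresponding cache lines back to media. */
	if (msync(p, 4096, MS_SYNC) != 0) {
		perror("msync");
		return 1;
	}

	munmap(p, 4096);
	close(fd);
	return 0;
}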
diff --git a/fs/dax.c b/fs/dax.c
index 5b84a46201c2..d5f6aca5a4d7 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,6 +24,7 @@
 #include <linux/memcontrol.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
+#include <linux/pagevec.h>
 #include <linux/pmem.h>
 #include <linux/sched.h>
 #include <linux/uio.h>
@@ -324,6 +325,199 @@ static int copy_user_bh(struct page *to, struct inode *inode,
 	return 0;
 }
 
+#define NO_SECTOR -1
+#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT))
+
+static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
+		sector_t sector, bool pmd_entry, bool dirty)
+{
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+	pgoff_t pmd_index = DAX_PMD_INDEX(index);
+	int type, error = 0;
+	void *entry;
+
+	WARN_ON_ONCE(pmd_entry && !dirty);
+	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+	spin_lock_irq(&mapping->tree_lock);
+
+	entry = radix_tree_lookup(page_tree, pmd_index);
+	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
+		index = pmd_index;
+		goto dirty;
+	}
+
+	entry = radix_tree_lookup(page_tree, index);
+	if (entry) {
+		type = RADIX_DAX_TYPE(entry);
+		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
+					type != RADIX_DAX_PMD)) {
+			error = -EIO;
+			goto unlock;
+		}
+
+		if (!pmd_entry || type == RADIX_DAX_PMD)
+			goto dirty;
+
+		/*
+		 * We only insert dirty PMD entries into the radix tree.  This
+		 * means we don't need to worry about removing a dirty PTE
+		 * entry and inserting a clean PMD entry, thus reducing the
+		 * range we would flush with a follow-up fsync/msync call.
+		 */
+		radix_tree_delete(&mapping->page_tree, index);
+		mapping->nrexceptional--;
+	}
+
+	if (sector == NO_SECTOR) {
+		/*
+		 * This can happen during correct operation if our pfn_mkwrite
+		 * fault raced against a hole punch operation.  If this
+		 * happens the pte that was hole punched will have been
+		 * unmapped and the radix tree entry will have been removed by
+		 * the time we are called, but the call will still happen.  We
+		 * will return all the way up to wp_pfn_shared(), where the
+		 * pte_same() check will fail, eventually causing page fault
+		 * to be retried by the CPU.
+		 */
+		goto unlock;
+	}
+
+	error = radix_tree_insert(page_tree, index,
+			RADIX_DAX_ENTRY(sector, pmd_entry));
+	if (error)
+		goto unlock;
+
+	mapping->nrexceptional++;
+ dirty:
+	if (dirty)
+		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
+ unlock:
+	spin_unlock_irq(&mapping->tree_lock);
+	return error;
+}
+
+static int dax_writeback_one(struct block_device *bdev,
+		struct address_space *mapping, pgoff_t index, void *entry)
+{
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+	int type = RADIX_DAX_TYPE(entry);
+	struct radix_tree_node *node;
+	struct blk_dax_ctl dax;
+	void **slot;
+	int ret = 0;
+
+	spin_lock_irq(&mapping->tree_lock);
+	/*
+	 * Regular page slots are stabilized by the page lock even
+	 * without the tree itself locked.  These unlocked entries
+	 * need verification under the tree lock.
+	 */
+	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
+		goto unlock;
+	if (*slot != entry)
+		goto unlock;
+
+	/* another fsync thread may have already written back this entry */
+	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+		goto unlock;
+
+	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
+		ret = -EIO;
+		goto unlock;
+	}
+
+	dax.sector = RADIX_DAX_SECTOR(entry);
+	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
+	spin_unlock_irq(&mapping->tree_lock);
+
+	/*
+	 * We cannot hold tree_lock while calling dax_map_atomic() because it
+	 * eventually calls cond_resched().
+	 */
+	ret = dax_map_atomic(bdev, &dax);
+	if (ret < 0)
+		return ret;
+
+	if (WARN_ON_ONCE(ret < dax.size)) {
+		ret = -EIO;
+		goto unmap;
+	}
+
+	wb_cache_pmem(dax.addr, dax.size);
+
+	spin_lock_irq(&mapping->tree_lock);
+	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+	spin_unlock_irq(&mapping->tree_lock);
+ unmap:
+	dax_unmap_atomic(bdev, &dax);
+	return ret;
+
+ unlock:
+	spin_unlock_irq(&mapping->tree_lock);
+	return ret;
+}
+
+/*
+ * Flush the mapping to the persistent domain within the byte range of [start,
+ * end]. This is required by data integrity operations to ensure file data is
+ * on persistent storage prior to completion of the operation.
+ */
+int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
+		loff_t end)
+{
+	struct inode *inode = mapping->host;
+	struct block_device *bdev = inode->i_sb->s_bdev;
+	pgoff_t start_index, end_index, pmd_index;
+	pgoff_t indices[PAGEVEC_SIZE];
+	struct pagevec pvec;
+	bool done = false;
+	int i, ret = 0;
+	void *entry;
+
+	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
+		return -EIO;
+
+	start_index = start >> PAGE_CACHE_SHIFT;
+	end_index = end >> PAGE_CACHE_SHIFT;
+	pmd_index = DAX_PMD_INDEX(start_index);
+
+	rcu_read_lock();
+	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
+	rcu_read_unlock();
+
+	/* see if the start of our range is covered by a PMD entry */
+	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
+		start_index = pmd_index;
+
+	tag_pages_for_writeback(mapping, start_index, end_index);
+
+	pagevec_init(&pvec, 0);
+	while (!done) {
+		pvec.nr = find_get_entries_tag(mapping, start_index,
+				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
+				pvec.pages, indices);
+
+		if (pvec.nr == 0)
+			break;
+
+		for (i = 0; i < pvec.nr; i++) {
+			if (indices[i] > end_index) {
+				done = true;
+				break;
+			}
+
+			ret = dax_writeback_one(bdev, mapping, indices[i],
+					pvec.pages[i]);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	wmb_pmem();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
+
 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 		struct vm_area_struct *vma, struct vm_fault *vmf)
 {
@@ -363,6 +557,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	}
 	dax_unmap_atomic(bdev, &dax);
 
+	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
+			vmf->flags & FAULT_FLAG_WRITE);
+	if (error)
+		goto out;
+
 	error = vm_insert_mixed(vma, vaddr, dax.pfn);
 
  out:
@@ -487,6 +686,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		delete_from_page_cache(page);
 		unlock_page(page);
 		page_cache_release(page);
+		page = NULL;
 	}
 
 	/*
@@ -589,9 +789,9 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	bool write = flags & FAULT_FLAG_WRITE;
 	struct block_device *bdev;
 	pgoff_t size, pgoff;
-	loff_t lstart, lend;
 	sector_t block;
-	int result = 0;
+	int error, result = 0;
+	bool alloc = false;
 
 	/* dax pmd mappings require pfn_t_devmap() */
 	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
@@ -629,10 +829,17 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
 	bh.b_size = PMD_SIZE;
-	if (get_block(inode, block, &bh, write) != 0)
+
+	if (get_block(inode, block, &bh, 0) != 0)
 		return VM_FAULT_SIGBUS;
+
+	if (!buffer_mapped(&bh) && write) {
+		if (get_block(inode, block, &bh, 1) != 0)
+			return VM_FAULT_SIGBUS;
+		alloc = true;
+	}
+
 	bdev = bh.b_bdev;
-	i_mmap_lock_read(mapping);
 
 	/*
 	 * If the filesystem isn't willing to tell us the length of a hole,
@@ -641,15 +848,20 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	 */
 	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
 		dax_pmd_dbg(&bh, address, "allocated block too small");
-		goto fallback;
+		return VM_FAULT_FALLBACK;
+	}
+
+	/*
+	 * If we allocated new storage, make sure no process has any
+	 * zero pages covering this hole
+	 */
+	if (alloc) {
+		loff_t lstart = pgoff << PAGE_SHIFT;
+		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
+
+		truncate_pagecache_range(inode, lstart, lend);
 	}
 
-	/* make sure no process has any zero pages covering this hole */
-	lstart = pgoff << PAGE_SHIFT;
-	lend = lstart + PMD_SIZE - 1; /* inclusive */
-	i_mmap_unlock_read(mapping);
-	unmap_mapping_range(mapping, lstart, PMD_SIZE, 0);
-	truncate_inode_pages_range(mapping, lstart, lend);
 	i_mmap_lock_read(mapping);
 
 	/*
@@ -733,6 +945,31 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	}
 	dax_unmap_atomic(bdev, &dax);
 
+	/*
+	 * For PTE faults we insert a radix tree entry for reads, and
+	 * leave it clean.  Then on the first write we dirty the radix
+	 * tree entry via the dax_pfn_mkwrite() path.  This sequence
+	 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
+	 * call into get_block() to translate the pgoff to a sector in
+	 * order to be able to create a new radix tree entry.
+	 *
+	 * The PMD path doesn't have an equivalent to
+	 * dax_pfn_mkwrite(), though, so for a read followed by a
+	 * write we traverse all the way through __dax_pmd_fault()
+	 * twice.  This means we can just skip inserting a radix tree
+	 * entry completely on the initial read and just wait until
+	 * the write to insert a dirty entry.
+	 */
+	if (write) {
+		error = dax_radix_entry(mapping, pgoff, dax.sector,
+				true, true);
+		if (error) {
+			dax_pmd_dbg(&bh, address,
+					"PMD radix insertion failed");
+			goto fallback;
+		}
+	}
+
 	dev_dbg(part_to_dev(bdev->bd_part),
 		"%s: %s addr: %lx pfn: %lx sect: %llx\n",
 		__func__, current->comm, address,
@@ -791,15 +1028,20 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
  * dax_pfn_mkwrite - handle first write to DAX page
  * @vma: The virtual memory area where the fault occurred
  * @vmf: The description of the fault
- *
  */
 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+	struct file *file = vma->vm_file;
 
-	sb_start_pagefault(sb);
-	file_update_time(vma->vm_file);
-	sb_end_pagefault(sb);
+	/*
+	 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
+	 * RADIX_DAX_PTE entry already exists in the radix tree from a
+	 * previous call to __dax_fault().  We just want to look up that PTE
+	 * entry using vmf->pgoff and make sure the dirty tag is set.  This
+	 * saves us from having to make a call to get_block() here to look
+	 * up the sector.
+	 */
+	dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
 	return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index e9d57f680f50..8204c3dc3800 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -41,4 +41,6 @@ static inline bool dax_mapping(struct address_space *mapping)
 {
 	return mapping->host && IS_DAX(mapping->host);
 }
+int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
+		loff_t end);
 #endif
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e215fc36c83..2e7c8d980d5e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -482,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 {
 	int err = 0;
 
+	if (dax_mapping(mapping) && mapping->nrexceptional) {
+		err = dax_writeback_mapping_range(mapping, lstart, lend);
+		if (err)
+			return err;
+	}
+
 	if (mapping->nrpages) {
 		err = __filemap_fdatawrite_range(mapping, lstart, lend,
 						 WB_SYNC_ALL);