author		Ross Zwisler <ross.zwisler@linux.intel.com>	2016-01-22 18:10:47 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-22 20:02:18 -0500
commit		9973c98ecfda3a1dfcab981665b5f1e39bcde64a (patch)
tree		c1bfc72b857e6fc915e6a2ef86481e5c472da7f6
parent		7e7f774984cd88c45c18e7ffaf0256c3e9118043 (diff)
dax: add support for fsync/sync
To properly handle fsync/msync in an efficient way DAX needs to track dirty
pages so it is able to flush them durably to media on demand.

The tracking of dirty pages is done via the radix tree in struct
address_space.  This radix tree is already used by the page writeback
infrastructure for tracking dirty pages associated with an open file, and it
already has support for exceptional (non struct page*) entries.  We build
upon these features to add exceptional entries to the radix tree for DAX
dirty PMD or PTE pages at fault time.

[dan.j.williams@intel.com: fix dax_pmd_dbg build warning]
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jan Kara <jack@suse.com>
Cc: Jeff Layton <jlayton@poochiereds.net>
Cc: Matthew Wilcox <willy@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	fs/dax.c		274
-rw-r--r--	include/linux/dax.h	2
-rw-r--r--	mm/filemap.c		6
3 files changed, 266 insertions, 16 deletions
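
Not part of the patch, but a minimal userspace sketch of the behavior this commit enables: the program stores through a DAX mmap() and then calls msync(), and it is this msync()/fsync() path that now finds the dirty radix tree entries and flushes them to media. The path /mnt/pmem/file, the 4 KiB length, and the assumption that the file is at least one page long are illustrative only.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* A file on a DAX-mounted filesystem; path and size are assumptions. */
	int fd = open("/mnt/pmem/file", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Assumes the file is at least one 4 KiB page long. */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}

	/* Store through the DAX mapping; the write fault records a dirty
	 * radix tree entry for this page. */
	strcpy(p, "hello, pmem");

	/* With this patch, msync()/fsync() reaches
	 * dax_writeback_mapping_range(), which walks the tagged entries
	 * and writes the corresponding cache lines back to media. */
	if (msync(p, 4096, MS_SYNC) != 0) {
		perror("msync");
		return 1;
	}

	munmap(p, 4096);
	close(fd);
	return 0;
}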
diff --git a/fs/dax.c b/fs/dax.c
index 5b84a46201c2..d5f6aca5a4d7 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,6 +24,7 @@
 #include <linux/memcontrol.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
+#include <linux/pagevec.h>
 #include <linux/pmem.h>
 #include <linux/sched.h>
 #include <linux/uio.h>
@@ -324,6 +325,199 @@ static int copy_user_bh(struct page *to, struct inode *inode,
 	return 0;
 }
 
+#define NO_SECTOR -1
+#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT))
+
+static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
+		sector_t sector, bool pmd_entry, bool dirty)
+{
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+	pgoff_t pmd_index = DAX_PMD_INDEX(index);
+	int type, error = 0;
+	void *entry;
+
+	WARN_ON_ONCE(pmd_entry && !dirty);
+	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+	spin_lock_irq(&mapping->tree_lock);
+
+	entry = radix_tree_lookup(page_tree, pmd_index);
+	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
+		index = pmd_index;
+		goto dirty;
+	}
+
+	entry = radix_tree_lookup(page_tree, index);
+	if (entry) {
+		type = RADIX_DAX_TYPE(entry);
+		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
+					type != RADIX_DAX_PMD)) {
+			error = -EIO;
+			goto unlock;
+		}
+
+		if (!pmd_entry || type == RADIX_DAX_PMD)
+			goto dirty;
+
+		/*
+		 * We only insert dirty PMD entries into the radix tree.  This
+		 * means we don't need to worry about removing a dirty PTE
+		 * entry and inserting a clean PMD entry, thus reducing the
+		 * range we would flush with a follow-up fsync/msync call.
+		 */
+		radix_tree_delete(&mapping->page_tree, index);
+		mapping->nrexceptional--;
+	}
+
+	if (sector == NO_SECTOR) {
+		/*
+		 * This can happen during correct operation if our pfn_mkwrite
+		 * fault raced against a hole punch operation.  If this
+		 * happens the pte that was hole punched will have been
+		 * unmapped and the radix tree entry will have been removed by
+		 * the time we are called, but the call will still happen.  We
+		 * will return all the way up to wp_pfn_shared(), where the
+		 * pte_same() check will fail, eventually causing page fault
+		 * to be retried by the CPU.
+		 */
+		goto unlock;
+	}
+
+	error = radix_tree_insert(page_tree, index,
+			RADIX_DAX_ENTRY(sector, pmd_entry));
+	if (error)
+		goto unlock;
+
+	mapping->nrexceptional++;
+ dirty:
+	if (dirty)
+		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
+ unlock:
+	spin_unlock_irq(&mapping->tree_lock);
+	return error;
+}
+
+static int dax_writeback_one(struct block_device *bdev,
+		struct address_space *mapping, pgoff_t index, void *entry)
+{
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+	int type = RADIX_DAX_TYPE(entry);
+	struct radix_tree_node *node;
+	struct blk_dax_ctl dax;
+	void **slot;
+	int ret = 0;
+
+	spin_lock_irq(&mapping->tree_lock);
+	/*
+	 * Regular page slots are stabilized by the page lock even
+	 * without the tree itself locked.  These unlocked entries
+	 * need verification under the tree lock.
+	 */
+	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
+		goto unlock;
+	if (*slot != entry)
+		goto unlock;
+
+	/* another fsync thread may have already written back this entry */
+	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+		goto unlock;
+
+	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
+		ret = -EIO;
+		goto unlock;
+	}
+
+	dax.sector = RADIX_DAX_SECTOR(entry);
+	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
+	spin_unlock_irq(&mapping->tree_lock);
+
+	/*
+	 * We cannot hold tree_lock while calling dax_map_atomic() because it
+	 * eventually calls cond_resched().
+	 */
+	ret = dax_map_atomic(bdev, &dax);
+	if (ret < 0)
+		return ret;
+
+	if (WARN_ON_ONCE(ret < dax.size)) {
+		ret = -EIO;
+		goto unmap;
+	}
+
+	wb_cache_pmem(dax.addr, dax.size);
+
+	spin_lock_irq(&mapping->tree_lock);
+	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+	spin_unlock_irq(&mapping->tree_lock);
+ unmap:
+	dax_unmap_atomic(bdev, &dax);
+	return ret;
+
+ unlock:
+	spin_unlock_irq(&mapping->tree_lock);
+	return ret;
+}
+
+/*
+ * Flush the mapping to the persistent domain within the byte range of [start,
+ * end]. This is required by data integrity operations to ensure file data is
+ * on persistent storage prior to completion of the operation.
+ */
+int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
+		loff_t end)
+{
+	struct inode *inode = mapping->host;
+	struct block_device *bdev = inode->i_sb->s_bdev;
+	pgoff_t start_index, end_index, pmd_index;
+	pgoff_t indices[PAGEVEC_SIZE];
+	struct pagevec pvec;
+	bool done = false;
+	int i, ret = 0;
+	void *entry;
+
+	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
+		return -EIO;
+
+	start_index = start >> PAGE_CACHE_SHIFT;
+	end_index = end >> PAGE_CACHE_SHIFT;
+	pmd_index = DAX_PMD_INDEX(start_index);
+
+	rcu_read_lock();
+	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
+	rcu_read_unlock();
+
+	/* see if the start of our range is covered by a PMD entry */
+	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
+		start_index = pmd_index;
+
+	tag_pages_for_writeback(mapping, start_index, end_index);
+
+	pagevec_init(&pvec, 0);
+	while (!done) {
+		pvec.nr = find_get_entries_tag(mapping, start_index,
+				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
+				pvec.pages, indices);
+
+		if (pvec.nr == 0)
+			break;
+
+		for (i = 0; i < pvec.nr; i++) {
+			if (indices[i] > end_index) {
+				done = true;
+				break;
+			}
+
+			ret = dax_writeback_one(bdev, mapping, indices[i],
+					pvec.pages[i]);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	wmb_pmem();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
+
 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 		struct vm_area_struct *vma, struct vm_fault *vmf)
 {
@@ -363,6 +557,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	}
 	dax_unmap_atomic(bdev, &dax);
 
+	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
+			vmf->flags & FAULT_FLAG_WRITE);
+	if (error)
+		goto out;
+
 	error = vm_insert_mixed(vma, vaddr, dax.pfn);
 
  out:
@@ -487,6 +686,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		delete_from_page_cache(page);
 		unlock_page(page);
 		page_cache_release(page);
+		page = NULL;
 	}
 
 	/*
@@ -589,9 +789,9 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	bool write = flags & FAULT_FLAG_WRITE;
 	struct block_device *bdev;
 	pgoff_t size, pgoff;
-	loff_t lstart, lend;
 	sector_t block;
-	int result = 0;
+	int error, result = 0;
+	bool alloc = false;
 
 	/* dax pmd mappings require pfn_t_devmap() */
 	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
@@ -629,10 +829,17 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
 	bh.b_size = PMD_SIZE;
-	if (get_block(inode, block, &bh, write) != 0)
+
+	if (get_block(inode, block, &bh, 0) != 0)
 		return VM_FAULT_SIGBUS;
+
+	if (!buffer_mapped(&bh) && write) {
+		if (get_block(inode, block, &bh, 1) != 0)
+			return VM_FAULT_SIGBUS;
+		alloc = true;
+	}
+
 	bdev = bh.b_bdev;
-	i_mmap_lock_read(mapping);
 
 	/*
 	 * If the filesystem isn't willing to tell us the length of a hole,
@@ -641,15 +848,20 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	 */
 	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
 		dax_pmd_dbg(&bh, address, "allocated block too small");
-		goto fallback;
+		return VM_FAULT_FALLBACK;
+	}
+
+	/*
+	 * If we allocated new storage, make sure no process has any
+	 * zero pages covering this hole
+	 */
+	if (alloc) {
+		loff_t lstart = pgoff << PAGE_SHIFT;
+		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
+
+		truncate_pagecache_range(inode, lstart, lend);
 	}
 
-	/* make sure no process has any zero pages covering this hole */
-	lstart = pgoff << PAGE_SHIFT;
-	lend = lstart + PMD_SIZE - 1; /* inclusive */
-	i_mmap_unlock_read(mapping);
-	unmap_mapping_range(mapping, lstart, PMD_SIZE, 0);
-	truncate_inode_pages_range(mapping, lstart, lend);
 	i_mmap_lock_read(mapping);
 
 	/*
@@ -733,6 +945,31 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	}
 	dax_unmap_atomic(bdev, &dax);
 
+	/*
+	 * For PTE faults we insert a radix tree entry for reads, and
+	 * leave it clean.  Then on the first write we dirty the radix
+	 * tree entry via the dax_pfn_mkwrite() path.  This sequence
+	 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
+	 * call into get_block() to translate the pgoff to a sector in
+	 * order to be able to create a new radix tree entry.
+	 *
+	 * The PMD path doesn't have an equivalent to
+	 * dax_pfn_mkwrite(), though, so for a read followed by a
+	 * write we traverse all the way through __dax_pmd_fault()
+	 * twice.  This means we can just skip inserting a radix tree
+	 * entry completely on the initial read and just wait until
+	 * the write to insert a dirty entry.
+	 */
+	if (write) {
+		error = dax_radix_entry(mapping, pgoff, dax.sector,
+				true, true);
+		if (error) {
+			dax_pmd_dbg(&bh, address,
+					"PMD radix insertion failed");
+			goto fallback;
+		}
+	}
+
 	dev_dbg(part_to_dev(bdev->bd_part),
 		"%s: %s addr: %lx pfn: %lx sect: %llx\n",
 		__func__, current->comm, address,
@@ -791,15 +1028,20 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
  * dax_pfn_mkwrite - handle first write to DAX page
  * @vma: The virtual memory area where the fault occurred
  * @vmf: The description of the fault
- *
  */
 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+	struct file *file = vma->vm_file;
 
-	sb_start_pagefault(sb);
-	file_update_time(vma->vm_file);
-	sb_end_pagefault(sb);
+	/*
+	 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
+	 * RADIX_DAX_PTE entry already exists in the radix tree from a
+	 * previous call to __dax_fault().  We just want to look up that PTE
+	 * entry using vmf->pgoff and make sure the dirty tag is set.  This
+	 * saves us from having to make a call to get_block() here to look
+	 * up the sector.
+	 */
+	dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
 	return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index e9d57f680f50..8204c3dc3800 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -41,4 +41,6 @@ static inline bool dax_mapping(struct address_space *mapping)
 {
 	return mapping->host && IS_DAX(mapping->host);
 }
+int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
+		loff_t end);
 #endif
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e215fc36c83..2e7c8d980d5e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -482,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 {
 	int err = 0;
 
+	if (dax_mapping(mapping) && mapping->nrexceptional) {
+		err = dax_writeback_mapping_range(mapping, lstart, lend);
+		if (err)
+			return err;
+	}
+
 	if (mapping->nrpages) {
 		err = __filemap_fdatawrite_range(mapping, lstart, lend,
 						 WB_SYNC_ALL);