author     Dave Chinner <david@fromorbit.com>	2016-10-02 18:53:59 -0400
committer  Dave Chinner <david@fromorbit.com>	2016-10-02 18:53:59 -0400
commit     a1f45e668e14c26b4700b1936c5a41b58cc4ac74
tree       94c1d2b34c15fff8ff39baf7357673978b5a3b2f
parent     a89b3f97bb7c248aea155a90f31d3dfb93b75971
parent     d5bfccdf38d094f2b15fae8b361d7bd47f2509d6

Merge branch 'iomap-4.9-dax' into for-next
Diffstat:

 fs/dax.c              | 252
 fs/ext2/Kconfig       |   1
 fs/ext2/ext2.h        |   1
 fs/ext2/file.c        |  76
 fs/ext2/inode.c       | 100
 fs/internal.h         |  11
 fs/iomap.c            |   5
 fs/xfs/xfs_aops.c     |  31
 fs/xfs/xfs_aops.h     |   1
 fs/xfs/xfs_file.c     |  79
 fs/xfs/xfs_iomap.c    |  22
 include/linux/dax.h   |   6
 include/linux/iomap.h |   1

 13 files changed, 464 insertions(+), 122 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,6 +31,8 @@
 #include <linux/vmstat.h>
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
+#include <linux/iomap.h>
+#include "internal.h"
 
 /*
  * We use lowest available bit in exceptional entry for locking, other two
@@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	return VM_FAULT_LOCKED;
 }
 
-static int copy_user_bh(struct page *to, struct inode *inode,
-		struct buffer_head *bh, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
+		struct page *to, unsigned long vaddr)
 {
 	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
-		.size = bh->b_size,
+		.sector = sector,
+		.size = size,
 	};
-	struct block_device *bdev = bh->b_bdev;
 	void *vto;
 
 	if (dax_map_atomic(bdev, &dax) < 0)
@@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
-		struct buffer_head *bh, void **entryp,
-		struct vm_area_struct *vma, struct vm_fault *vmf)
+		struct block_device *bdev, sector_t sector, size_t size,
+		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	struct block_device *bdev = bh->b_bdev;
 	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, mapping->host),
-		.size = bh->b_size,
+		.sector = sector,
+		.size = size,
 	};
 	void *ret;
 	void *entry = *entryp;
@@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (vmf->cow_page) {
 		struct page *new_page = vmf->cow_page;
 		if (buffer_written(&bh))
-			error = copy_user_bh(new_page, inode, &bh, vaddr);
+			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
+					bh.b_size, new_page, vaddr);
 		else
 			clear_user_highpage(new_page, vaddr);
 		if (error)
@@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	/* Filesystem should not return unwritten buffers to us! */
 	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
+			bh.b_size, &entry, vma, vmf);
 unlock_entry:
 	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 out:
@@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
 	return dax_zero_page_range(inode, from, length, get_block);
 }
 EXPORT_SYMBOL_GPL(dax_truncate_page);
+
+#ifdef CONFIG_FS_IOMAP
+static loff_t
+iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct iov_iter *iter = data;
+	loff_t end = pos + length, done = 0;
+	ssize_t ret = 0;
+
+	if (iov_iter_rw(iter) == READ) {
+		end = min(end, i_size_read(inode));
+		if (pos >= end)
+			return 0;
+
+		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+			return iov_iter_zero(min(length, end - pos), iter);
+	}
+
+	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+		return -EIO;
+
+	while (pos < end) {
+		unsigned offset = pos & (PAGE_SIZE - 1);
+		struct blk_dax_ctl dax = { 0 };
+		ssize_t map_len;
+
+		dax.sector = iomap->blkno +
+			(((pos & PAGE_MASK) - iomap->offset) >> 9);
+		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
+		map_len = dax_map_atomic(iomap->bdev, &dax);
+		if (map_len < 0) {
+			ret = map_len;
+			break;
+		}
+
+		dax.addr += offset;
+		map_len -= offset;
+		if (map_len > end - pos)
+			map_len = end - pos;
+
+		if (iov_iter_rw(iter) == WRITE)
+			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+		else
+			map_len = copy_to_iter(dax.addr, map_len, iter);
+		dax_unmap_atomic(iomap->bdev, &dax);
+		if (map_len <= 0) {
+			ret = map_len ? map_len : -EFAULT;
+			break;
+		}
+
+		pos += map_len;
+		length -= map_len;
+		done += map_len;
+	}
+
+	return done ? done : ret;
+}
+
+/**
+ * iomap_dax_rw - Perform I/O to a DAX file
+ * @iocb:	The control block for this I/O
+ * @iter:	The addresses to do I/O from or to
+ * @ops:	iomap ops passed from the file system
+ *
+ * This function performs read and write operations to directly mapped
+ * persistent memory.  The caller needs to take care of read/write exclusion
+ * and evicting any page cache pages in the region under I/O.
+ */
+ssize_t
+iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+		struct iomap_ops *ops)
+{
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+	struct inode *inode = mapping->host;
+	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
+	unsigned flags = 0;
+
+	if (iov_iter_rw(iter) == WRITE)
+		flags |= IOMAP_WRITE;
+
+	/*
+	 * Yes, even DAX files can have page cache attached to them:  A zeroed
+	 * page is inserted into the pagecache when we have to serve a write
+	 * fault on a hole.  It should never be dirtied and can simply be
+	 * dropped from the pagecache once we get real data for the page.
+	 *
+	 * XXX: This is racy against mmap, and there's nothing we can do about
+	 * it. We'll eventually need to shift this down even further so that
+	 * we can check if we allocated blocks over a hole first.
+	 */
+	if (mapping->nrpages) {
+		ret = invalidate_inode_pages2_range(mapping,
+				pos >> PAGE_SHIFT,
+				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
+		WARN_ON_ONCE(ret);
+	}
+
+	while (iov_iter_count(iter)) {
+		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
+				iter, iomap_dax_actor);
+		if (ret <= 0)
+			break;
+		pos += ret;
+		done += ret;
+	}
+
+	iocb->ki_pos += done;
+	return done ? done : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_rw);
+
+/**
+ * iomap_dax_fault - handle a page fault on a DAX file
+ * @vma:	The virtual memory area where the fault occurred
+ * @vmf:	The description of the fault
+ * @ops:	iomap ops passed from the file system
+ *
+ * When a page fault occurs, filesystems may call this helper in their fault
+ * or mkwrite handler for DAX files.  Assumes the caller has done all the
+ * necessary locking for the page fault to proceed successfully.
+ */
+int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+		struct iomap_ops *ops)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct inode *inode = mapping->host;
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
+	sector_t sector;
+	struct iomap iomap = { 0 };
+	unsigned flags = 0;
+	int error, major = 0;
+	void *entry;
+
+	/*
+	 * Check whether offset isn't beyond end of file now. Caller is supposed
+	 * to hold locks serializing us with truncate / punch hole so this is
+	 * a reliable test.
+	 */
+	if (pos >= i_size_read(inode))
+		return VM_FAULT_SIGBUS;
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff);
+	if (IS_ERR(entry)) {
+		error = PTR_ERR(entry);
+		goto out;
+	}
+
+	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+		flags |= IOMAP_WRITE;
+
+	/*
+	 * Note that we don't bother to use iomap_apply here: DAX requires
+	 * the file system block size to be equal to the page size, which means
+	 * that we never have to deal with more than a single extent here.
+	 */
+	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
+	if (error)
+		goto unlock_entry;
+	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
+		error = -EIO;	/* fs corruption? */
+		goto unlock_entry;
+	}
+
+	sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
+
+	if (vmf->cow_page) {
+		switch (iomap.type) {
+		case IOMAP_HOLE:
+		case IOMAP_UNWRITTEN:
+			clear_user_highpage(vmf->cow_page, vaddr);
+			break;
+		case IOMAP_MAPPED:
+			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
+					vmf->cow_page, vaddr);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			error = -EIO;
+			break;
+		}
+
+		if (error)
+			goto unlock_entry;
+		if (!radix_tree_exceptional_entry(entry)) {
+			vmf->page = entry;
+			return VM_FAULT_LOCKED;
+		}
+		vmf->entry = entry;
+		return VM_FAULT_DAX_LOCKED;
+	}
+
+	switch (iomap.type) {
+	case IOMAP_MAPPED:
+		if (iomap.flags & IOMAP_F_NEW) {
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			major = VM_FAULT_MAJOR;
+		}
+		error = dax_insert_mapping(mapping, iomap.bdev, sector,
+				PAGE_SIZE, &entry, vma, vmf);
+		break;
+	case IOMAP_UNWRITTEN:
+	case IOMAP_HOLE:
+		if (!(vmf->flags & FAULT_FLAG_WRITE))
+			return dax_load_hole(mapping, entry, vmf);
+		/*FALLTHRU*/
+	default:
+		WARN_ON_ONCE(1);
+		error = -EIO;
+		break;
+	}
+
+unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+out:
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM | major;
+	/* -EBUSY is fine, somebody else faulted on the same PTE */
+	if (error < 0 && error != -EBUSY)
+		return VM_FAULT_SIGBUS | major;
+	return VM_FAULT_NOPAGE | major;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_fault);
+#endif /* CONFIG_FS_IOMAP */
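
The actor's sector computation above page-aligns the I/O position, makes it relative to the extent's starting file offset, and shifts right by 9 to convert bytes into 512-byte sectors. A standalone sketch of that arithmetic (ordinary userspace C; every number is invented for illustration, not taken from the patch):

/*
 * Sketch of the sector math in iomap_dax_actor; all values hypothetical.
 * blkno is the extent's first 512-byte sector, ext_offset the file offset
 * at which the extent starts.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096ULL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	uint64_t blkno = 8192;		/* extent starts at sector 8192 */
	uint64_t ext_offset = 0x200000;	/* extent starts at file offset 2 MiB */
	uint64_t pos = 0x201234;	/* current I/O position */

	uint64_t sector = blkno + (((pos & PAGE_MASK) - ext_offset) >> 9);

	/* 0x201234 aligns down to 0x201000; 0x1000 bytes = 8 sectors -> 8200 */
	printf("sector = %llu\n", (unsigned long long)sector);
	return 0;
}
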
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index c634874e12d9..36bea5adcaba 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,5 +1,6 @@
 config EXT2_FS
 	tristate "Second extended fs support"
+	select FS_IOMAP if FS_DAX
 	help
 	  Ext2 is a standard Linux file system for hard disks.
 
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 06af2f92226c..37e2be784ac7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -814,6 +814,7 @@ extern const struct file_operations ext2_file_operations;
 /* inode.c */
 extern const struct address_space_operations ext2_aops;
 extern const struct address_space_operations ext2_nobh_aops;
+extern struct iomap_ops ext2_iomap_ops;
 
 /* namei.c */
 extern const struct inode_operations ext2_dir_inode_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5efeefe17abb..423cc01c9d41 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -22,11 +22,59 @@
 #include <linux/pagemap.h>
 #include <linux/dax.h>
 #include <linux/quotaops.h>
+#include <linux/iomap.h>
+#include <linux/uio.h>
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
 
 #ifdef CONFIG_FS_DAX
+static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	ssize_t ret;
+
+	if (!iov_iter_count(to))
+		return 0; /* skip atime */
+
+	inode_lock_shared(inode);
+	ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops);
+	inode_unlock_shared(inode);
+
+	file_accessed(iocb->ki_filp);
+	return ret;
+}
+
+static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+
+	inode_lock(inode);
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out_unlock;
+	ret = file_remove_privs(file);
+	if (ret)
+		goto out_unlock;
+	ret = file_update_time(file);
+	if (ret)
+		goto out_unlock;
+
+	ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops);
+	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+		i_size_write(inode, iocb->ki_pos);
+		mark_inode_dirty(inode);
+	}
+
+out_unlock:
+	inode_unlock(inode);
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+	return ret;
+}
+
 /*
  * The lock ordering for ext2 DAX fault paths is:
 *
@@ -51,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 	down_read(&ei->dax_sem);
 
-	ret = dax_fault(vma, vmf, ext2_get_block);
+	ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops);
 
 	up_read(&ei->dax_sem);
 	if (vmf->flags & FAULT_FLAG_WRITE)
@@ -156,14 +204,28 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	return ret;
 }
 
-/*
- * We have mostly NULL's here: the current defaults are ok for
- * the ext2 filesystem.
- */
+static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(iocb->ki_filp->f_mapping->host))
+		return ext2_dax_read_iter(iocb, to);
+#endif
+	return generic_file_read_iter(iocb, to);
+}
+
+static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(iocb->ki_filp->f_mapping->host))
+		return ext2_dax_write_iter(iocb, from);
+#endif
+	return generic_file_write_iter(iocb, from);
+}
+
 const struct file_operations ext2_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read_iter	= generic_file_read_iter,
-	.write_iter	= generic_file_write_iter,
+	.read_iter	= ext2_file_read_iter,
+	.write_iter	= ext2_file_write_iter,
 	.unlocked_ioctl = ext2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
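
The dispatch above is the pattern any filesystem adopting the new helpers would follow: route DAX inodes to iomap_dax_rw under the appropriate inode lock, and fall back to the generic paths otherwise. A condensed sketch with hypothetical myfs_* names (kernel-style C, illustrative only, not compilable on its own):

/* Hypothetical filesystem wiring; myfs_dax_read_iter would mirror
 * ext2_dax_read_iter above. */
static ssize_t myfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
#ifdef CONFIG_FS_DAX
	/* DAX inodes bypass the page cache entirely. */
	if (IS_DAX(iocb->ki_filp->f_mapping->host))
		return myfs_dax_read_iter(iocb, to);
#endif
	return generic_file_read_iter(iocb, to);
}
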
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index d5c7d09919f3..c7dbb4661119 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -32,6 +32,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/fiemap.h>
+#include <linux/iomap.h>
 #include <linux/namei.h>
 #include <linux/uio.h>
 #include "ext2.h"
@@ -618,7 +619,7 @@ static void ext2_splice_branch(struct inode *inode,
  */
 static int ext2_get_blocks(struct inode *inode,
 			   sector_t iblock, unsigned long maxblocks,
-			   struct buffer_head *bh_result,
+			   u32 *bno, bool *new, bool *boundary,
 			   int create)
 {
 	int err = -EIO;
@@ -644,7 +645,6 @@ static int ext2_get_blocks(struct inode *inode,
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
 		first_block = le32_to_cpu(chain[depth - 1].key);
-		clear_buffer_new(bh_result); /* What's this do? */
 		count++;
 		/*map more blocks*/
 		while (count < maxblocks && count <= blocks_to_boundary) {
@@ -699,7 +699,6 @@ static int ext2_get_blocks(struct inode *inode,
 			mutex_unlock(&ei->truncate_mutex);
 			if (err)
 				goto cleanup;
-			clear_buffer_new(bh_result);
 			goto got_it;
 		}
 	}
@@ -745,15 +744,16 @@ static int ext2_get_blocks(struct inode *inode,
 			mutex_unlock(&ei->truncate_mutex);
 			goto cleanup;
 		}
-	} else
-		set_buffer_new(bh_result);
+	} else {
+		*new = true;
+	}
 
 	ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
 	mutex_unlock(&ei->truncate_mutex);
 got_it:
-	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+	*bno = le32_to_cpu(chain[depth-1].key);
 	if (count > blocks_to_boundary)
-		set_buffer_boundary(bh_result);
+		*boundary = true;
 	err = count;
 	/* Clean up and exit */
 	partial = chain + depth - 1;	/* the whole chain */
@@ -765,19 +765,82 @@ cleanup:
 	return err;
 }
 
-int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
+int ext2_get_block(struct inode *inode, sector_t iblock,
+		struct buffer_head *bh_result, int create)
 {
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-	int ret = ext2_get_blocks(inode, iblock, max_blocks,
-			      bh_result, create);
-	if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
-		ret = 0;
+	bool new = false, boundary = false;
+	u32 bno;
+	int ret;
+
+	ret = ext2_get_blocks(inode, iblock, max_blocks, &bno, &new, &boundary,
+			create);
+	if (ret <= 0)
+		return ret;
+
+	map_bh(bh_result, inode->i_sb, bno);
+	bh_result->b_size = (ret << inode->i_blkbits);
+	if (new)
+		set_buffer_new(bh_result);
+	if (boundary)
+		set_buffer_boundary(bh_result);
+	return 0;
+
+}
+
+#ifdef CONFIG_FS_DAX
+static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+		unsigned flags, struct iomap *iomap)
+{
+	unsigned int blkbits = inode->i_blkbits;
+	unsigned long first_block = offset >> blkbits;
+	unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
+	bool new = false, boundary = false;
+	u32 bno;
+	int ret;
+
+	ret = ext2_get_blocks(inode, first_block, max_blocks,
+			&bno, &new, &boundary, flags & IOMAP_WRITE);
+	if (ret < 0)
+		return ret;
+
+	iomap->flags = 0;
+	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->offset = (u64)first_block << blkbits;
+
+	if (ret == 0) {
+		iomap->type = IOMAP_HOLE;
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->length = 1 << blkbits;
+	} else {
+		iomap->type = IOMAP_MAPPED;
+		iomap->blkno = (sector_t)bno << (blkbits - 9);
+		iomap->length = (u64)ret << blkbits;
+		iomap->flags |= IOMAP_F_MERGED;
 	}
-	return ret;
 
+	if (new)
+		iomap->flags |= IOMAP_F_NEW;
+	return 0;
 }
 
+static int
+ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+		ssize_t written, unsigned flags, struct iomap *iomap)
+{
+	if (iomap->type == IOMAP_MAPPED &&
+	    written < length &&
+	    (flags & IOMAP_WRITE))
+		ext2_write_failed(inode->i_mapping, offset + length);
+	return 0;
+}
+
+struct iomap_ops ext2_iomap_ops = {
+	.iomap_begin		= ext2_iomap_begin,
+	.iomap_end		= ext2_iomap_end,
+};
+#endif /* CONFIG_FS_DAX */
+
 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		u64 start, u64 len)
 {
@@ -863,11 +926,10 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
-	if (IS_DAX(inode))
-		ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL,
-				DIO_LOCKING);
-	else
-		ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
+	if (WARN_ON_ONCE(IS_DAX(inode)))
+		return -EIO;
+
+	ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
 	if (ret < 0 && iov_iter_rw(iter) == WRITE)
 		ext2_write_failed(mapping, offset + count);
 	return ret;
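
To see how ext2_iomap_begin translates the result of ext2_get_blocks into iomap fields, consider a worked example (standalone userspace C; every value below is invented, not taken from the patch):

/*
 * ext2_iomap_begin field translation with sample numbers: 4 KiB blocks
 * (blkbits = 12), 4 blocks mapped starting at filesystem block 1000.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned blkbits = 12;		/* 4096-byte filesystem blocks */
	uint64_t first_block = 520;	/* offset >> blkbits */
	uint32_t bno = 1000;		/* first mapped block number */
	int ret = 4;			/* blocks mapped by ext2_get_blocks */

	uint64_t offset = (uint64_t)first_block << blkbits;
	uint64_t blkno  = (uint64_t)bno << (blkbits - 9);	/* sectors */
	uint64_t length = (uint64_t)ret << blkbits;		/* bytes */

	/* prints: offset=2129920 blkno=8000 length=16384 */
	printf("offset=%llu blkno=%llu length=%llu\n",
	       (unsigned long long)offset, (unsigned long long)blkno,
	       (unsigned long long)length);
	return 0;
}
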
diff --git a/fs/internal.h b/fs/internal.h
index ba0737649d4a..859178692ce4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -12,6 +12,7 @@
 struct super_block;
 struct file_system_type;
 struct iomap;
+struct iomap_ops;
 struct linux_binprm;
 struct path;
 struct mount;
@@ -164,3 +165,13 @@ extern struct dentry_operations ns_dentry_operations;
 extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
 		    unsigned long arg);
 extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/*
+ * iomap support:
+ */
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+		void *data, struct iomap *iomap);
+
+loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
+		unsigned flags, struct iomap_ops *ops, void *data,
+		iomap_actor_t actor);
diff --git a/fs/iomap.c b/fs/iomap.c
index ec411a6b9edc..013d1d36fbbf 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -27,9 +27,6 @@
 #include <linux/dax.h>
 #include "internal.h"
 
-typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
-		void *data, struct iomap *iomap);
-
 /*
  * Execute a iomap write on a segment of the mapping that spans a
  * contiguous range of pages that have identical block mapping state.
@@ -41,7 +38,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
  * resources they require in the iomap_begin call, and release them in the
  * iomap_end call.
  */
-static loff_t
+loff_t
 iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
 	struct iomap_ops *ops, void *data, iomap_actor_t actor)
 {
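
Exporting iomap_apply gives fs/dax.c access to the same begin/actor/end contract the buffered-I/O helpers use: ->iomap_begin returns one extent, the actor consumes as much of it as it can and returns the byte count, and ->iomap_end releases whatever the filesystem reserved. A kernel-style sketch of a caller (illustrative only; demo_actor and demo_walk are invented names):

/* Walk an arbitrary range one extent at a time, as iomap_dax_rw does. */
static loff_t demo_actor(struct inode *inode, loff_t pos, loff_t len,
		void *data, struct iomap *iomap)
{
	/* Process up to len bytes of this extent; return bytes handled. */
	return len;
}

static loff_t demo_walk(struct inode *inode, loff_t pos, loff_t length,
		struct iomap_ops *ops)
{
	loff_t done = 0, ret;

	while (length > 0) {
		ret = iomap_apply(inode, pos, length, 0, ops, NULL,
				demo_actor);
		if (ret <= 0)
			return done ? done : ret;
		pos += ret;
		length -= ret;
		done += ret;
	}
	return done;
}
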
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 7575cfc3ad15..4a28fa91e3b1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -200,7 +200,7 @@ xfs_setfilesize_trans_alloc(
  * Update on-disk file size now that data has been written to disk.
  */
 STATIC int
-xfs_setfilesize(
+__xfs_setfilesize(
 	struct xfs_inode	*ip,
 	struct xfs_trans	*tp,
 	xfs_off_t		offset,
@@ -225,6 +225,23 @@ xfs_setfilesize(
 	return xfs_trans_commit(tp);
 }
 
+int
+xfs_setfilesize(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	size_t			size)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+	if (error)
+		return error;
+
+	return __xfs_setfilesize(ip, tp, offset, size);
+}
+
 STATIC int
 xfs_setfilesize_ioend(
 	struct xfs_ioend	*ioend,
@@ -247,7 +264,7 @@ xfs_setfilesize_ioend(
 		return error;
 	}
 
-	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
+	return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 }
 
 /*
@@ -1336,13 +1353,12 @@ xfs_end_io_direct_write(
 {
 	struct inode		*inode = file_inode(iocb->ki_filp);
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
 	uintptr_t		flags = (uintptr_t)private;
 	int			error = 0;
 
 	trace_xfs_end_io_direct_write(ip, offset, size);
 
-	if (XFS_FORCED_SHUTDOWN(mp))
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
 	if (size <= 0)
@@ -1380,14 +1396,9 @@ xfs_end_io_direct_write(
 
 		error = xfs_iomap_write_unwritten(ip, offset, size);
 	} else if (flags & XFS_DIO_FLAG_APPEND) {
-		struct xfs_trans *tp;
-
 		trace_xfs_end_io_direct_write_append(ip, offset, size);
 
-		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0,
-				&tp);
-		if (!error)
-			error = xfs_setfilesize(ip, tp, offset, size);
+		error = xfs_setfilesize(ip, offset, size);
 	}
 
 	return error;
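
The refactor splits size updates in two: __xfs_setfilesize keeps the old calling convention for paths that already own a transaction (the ioend path), while the exported xfs_setfilesize allocates and commits its own. A hypothetical caller (illustrative only; demo_extend_ondisk_size is an invented name):

/* After a size-extending synchronous write, update the on-disk size
 * without open-coding xfs_trans_alloc(). */
static int demo_extend_ondisk_size(struct xfs_inode *ip, xfs_off_t offset,
		size_t size)
{
	return xfs_setfilesize(ip, offset, size);
}
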
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index bf2d9a141a73..1950e3bca2ac 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -62,6 +62,7 @@ int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
 
 int	xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
 		ssize_t size, void *private);
+int	xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
 extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b927ea9abe33..c68517b0f248 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -333,10 +333,7 @@ xfs_file_dax_read(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
-	struct address_space	*mapping = iocb->ki_filp->f_mapping;
-	struct inode		*inode = mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct iov_iter		data = *to;
+	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
 	size_t			count = iov_iter_count(to);
 	ssize_t			ret = 0;
 
@@ -346,11 +343,7 @@ xfs_file_dax_read(
 		return 0; /* skip atime */
 
 	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
-	if (ret > 0) {
-		iocb->ki_pos += ret;
-		iov_iter_advance(to, ret);
-	}
+	ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
 	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	file_accessed(iocb->ki_filp);
@@ -712,70 +705,32 @@ xfs_file_dax_write(
 	struct kiocb		*iocb,
 	struct iov_iter		*from)
 {
-	struct address_space	*mapping = iocb->ki_filp->f_mapping;
-	struct inode		*inode = mapping->host;
+	struct inode		*inode = iocb->ki_filp->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	ssize_t			ret = 0;
-	int			unaligned_io = 0;
-	int			iolock;
-	struct iov_iter		data;
+	int			iolock = XFS_IOLOCK_EXCL;
+	ssize_t			ret, error = 0;
+	size_t			count;
+	loff_t			pos;
 
-	/* "unaligned" here means not aligned to a filesystem block */
-	if ((iocb->ki_pos & mp->m_blockmask) ||
-	    ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
-		unaligned_io = 1;
-		iolock = XFS_IOLOCK_EXCL;
-	} else if (mapping->nrpages) {
-		iolock = XFS_IOLOCK_EXCL;
-	} else {
-		iolock = XFS_IOLOCK_SHARED;
-	}
 	xfs_rw_ilock(ip, iolock);
-
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
 		goto out;
 
-	/*
-	 * Yes, even DAX files can have page cache attached to them:  A zeroed
-	 * page is inserted into the pagecache when we have to serve a write
-	 * fault on a hole.  It should never be dirtied and can simply be
-	 * dropped from the pagecache once we get real data for the page.
-	 *
-	 * XXX: This is racy against mmap, and there's nothing we can do about
-	 * it. dax_do_io() should really do this invalidation internally as
-	 * it will know if we've allocated over a holei for this specific IO and
-	 * if so it needs to update the mapping tree and invalidate existing
-	 * PTEs over the newly allocated range. Remove this invalidation when
-	 * dax_do_io() is fixed up.
-	 */
-	if (mapping->nrpages) {
-		loff_t end = iocb->ki_pos + iov_iter_count(from) - 1;
+	pos = iocb->ki_pos;
+	count = iov_iter_count(from);
 
-		ret = invalidate_inode_pages2_range(mapping,
-						    iocb->ki_pos >> PAGE_SHIFT,
-						    end >> PAGE_SHIFT);
-		WARN_ON_ONCE(ret);
-	}
+	trace_xfs_file_dax_write(ip, count, pos);
 
-	if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
-		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-		iolock = XFS_IOLOCK_SHARED;
+	ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
+	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+		i_size_write(inode, iocb->ki_pos);
+		error = xfs_setfilesize(ip, pos, ret);
 	}
 
-	trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
-
-	data = *from;
-	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
-			xfs_end_io_direct_write, 0);
-	if (ret > 0) {
-		iocb->ki_pos += ret;
-		iov_iter_advance(from, ret);
-	}
 out:
 	xfs_rw_iunlock(ip, iolock);
-	return ret;
+	return error ? error : ret;
 }
 
 STATIC ssize_t
@@ -1514,7 +1469,7 @@ xfs_filemap_page_mkwrite(
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
-		ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
+		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
 	} else {
 		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
@@ -1548,7 +1503,7 @@ xfs_filemap_fault(
 		 * changes to xfs_get_blocks_direct() to map unwritten extent
 		 * ioend for conversion on read-only mappings.
 		 */
-		ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
+		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
 	} else
 		ret = filemap_fault(vma, vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
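
Note what the write path lost: the old code chose a shared or exclusive iolock based on whether the write was filesystem-block aligned, while the new path simply takes XFS_IOLOCK_EXCL and lets iomap_dax_rw handle page-cache invalidation. For reference, a standalone demo of the alignment test that was deleted (userspace C; sample numbers invented):

/* m_blockmask is blocksize - 1; a write is "unaligned" if either end
 * falls inside a filesystem block. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t blockmask = 4096 - 1;	/* 4 KiB filesystem blocks */
	uint64_t pos = 8192, count = 512;

	int unaligned = (pos & blockmask) ||
			((pos + count) & blockmask);

	printf("unaligned = %d\n", unaligned);	/* 1: the end is mid-block */
	return 0;
}
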
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index f96c8ffce5f4..c08253e11545 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -934,11 +934,13 @@ error_on_bmapi_transaction:
 	return error;
 }
 
-static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+static inline bool imap_needs_alloc(struct inode *inode,
+		struct xfs_bmbt_irec *imap, int nimaps)
 {
 	return !nimaps ||
 		imap->br_startblock == HOLESTARTBLOCK ||
-		imap->br_startblock == DELAYSTARTBLOCK;
+		imap->br_startblock == DELAYSTARTBLOCK ||
+		(IS_DAX(inode) && ISUNWRITTEN(imap));
 }
 
 static int
@@ -954,16 +956,18 @@ xfs_file_iomap_begin(
 	struct xfs_bmbt_irec	imap;
 	xfs_fileoff_t		offset_fsb, end_fsb;
 	int			nimaps = 1, error = 0;
+	unsigned		lockmode;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if ((flags & IOMAP_WRITE) && !xfs_get_extsz_hint(ip)) {
+	if ((flags & IOMAP_WRITE) &&
+	    !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
 		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
 				iomap);
 	}
 
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	lockmode = xfs_ilock_data_map_shared(ip);
 
 	ASSERT(offset <= mp->m_super->s_maxbytes);
 	if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
@@ -974,11 +978,11 @@ xfs_file_iomap_begin(
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
 			       &nimaps, XFS_BMAPI_ENTIRE);
 	if (error) {
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_iunlock(ip, lockmode);
 		return error;
 	}
 
-	if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
+	if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {
 		/*
 		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
 		 * pages to keep the chunks of work done where somewhat symmetric
@@ -994,17 +998,19 @@ xfs_file_iomap_begin(
 		 * xfs_iomap_write_direct() expects the shared lock. It
 		 * is unlocked on return.
 		 */
-		xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+		if (lockmode == XFS_ILOCK_EXCL)
+			xfs_ilock_demote(ip, lockmode);
 		error = xfs_iomap_write_direct(ip, offset, length, &imap,
 				nimaps);
 		if (error)
 			return error;
 
+		iomap->flags = IOMAP_F_NEW;
 		trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
 	} else {
 		ASSERT(nimaps);
 
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_iunlock(ip, lockmode);
 		trace_xfs_iomap_found(ip, offset, length, 0, &imap);
 	}
 
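
The extra imap_needs_alloc condition reflects a DAX constraint: with no I/O completion callback, the DAX path cannot convert an unwritten extent to written after the data lands, so it must treat unwritten extents like holes and allocate up front. A standalone sketch of the predicate (userspace C; the function name is invented):

#include <stdbool.h>
#include <stdio.h>

/* Mirror of the extended imap_needs_alloc logic. */
static bool needs_alloc(bool have_map, bool hole, bool delalloc,
			bool is_dax, bool unwritten)
{
	return !have_map || hole || delalloc || (is_dax && unwritten);
}

int main(void)
{
	/* DAX over an unwritten extent: allocate (prints 1). */
	printf("%d\n", needs_alloc(true, false, false, true, true));
	/* Non-DAX I/O over the same extent: no allocation needed (prints 0). */
	printf("%d\n", needs_alloc(true, false, false, false, true));
	return 0;
}
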
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 9c6dc7704043..add6c4bc568f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -6,13 +6,19 @@
 #include <linux/radix-tree.h>
 #include <asm/pgtable.h>
 
+struct iomap_ops;
+
 /* We use lowest available exceptional entry bit for locking */
 #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
 
+ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+		struct iomap_ops *ops);
 ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
 		  get_block_t, dio_iodone_t, int flags);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
+int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+		struct iomap_ops *ops);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index c74226a738a3..e63e288dee83 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -23,6 +23,7 @@ struct vm_fault;
  */
 #define IOMAP_F_MERGED	0x01	/* contains multiple blocks/extents */
 #define IOMAP_F_SHARED	0x02	/* block shared with another file */
+#define IOMAP_F_NEW	0x04	/* blocks have been newly allocated */
 
 /*
  * Magic value for blkno:
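
IOMAP_F_NEW closes the loop with the fault handler in fs/dax.c: a filesystem sets it in ->iomap_begin when it allocated blocks, and iomap_dax_fault uses it to account the fault as major. A standalone demo of testing the flag bits (userspace C; the flag values are copied from the header above):

#include <stdio.h>

#define IOMAP_F_MERGED	0x01
#define IOMAP_F_SHARED	0x02
#define IOMAP_F_NEW	0x04

int main(void)
{
	unsigned flags = IOMAP_F_MERGED | IOMAP_F_NEW;

	if (flags & IOMAP_F_NEW)
		printf("newly allocated: account a major fault\n");
	return 0;
}
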