dax,ext2: replace XIP read and write with DAX I/O

Use the generic AIO infrastructure instead of custom read and write methods. In addition to giving us support for AIO, this adds the missing locking between read() and truncate(). Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com> Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com> Reviewed-by: Jan Kara <jack@suse.cz> Cc: Andreas Dilger <andreas.dilger@intel.com> Cc: Boaz Harrosh <boaz@plexistor.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Dave Chinner <david@fromorbit.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Matthew Wilcox <matthew.r.wilcox@intel.com> 2015-02-16 18:58:56 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2015-02-16 20:56:03 -0500
commit: d475c6346a38aef3058eba96867bfa726a3cc940 (patch)
tree: 4d69d0f50a4a8e649a751dca8f710485848c0249
parent: fbbbad4bc2101e452b24e6e65d3d5e11314a0b5f (diff)
8 files changed, 214 insertions, 245 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 348f5c16ef50..8670c224c833 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3151,6 +3151,12 @@ L:	linux-i2c@vger.kernel.org
 S:      Maintained
 F:      drivers/i2c/busses/i2c-diolan-u2c.c
+DIRECT ACCESS (DAX)
+M:      Matthew Wilcox <willy@linux.intel.com>
+L:      linux-fsdevel@vger.kernel.org
+S:      Supported
+F:      fs/dax.c
 DIRECTORY NOTIFICATION (DNOTIFY)
 M:      Eric Paris <eparis@parisplace.org>
 S:      Maintained
diff --git a/fs/Makefile b/fs/Makefile
index bedff48e8fdc..0534444e257c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD)		+= signalfd.o
 obj-$(CONFIG_TIMERFD)           += timerfd.o
 obj-$(CONFIG_EVENTFD)           += eventfd.o
 obj-$(CONFIG_AIO)               += aio.o
+obj-$(CONFIG_FS_XIP)            += dax.o
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
 obj-$(CONFIG_COMPAT)            += compat.o compat_ioctl.o
 obj-$(CONFIG_BINFMT_AOUT)       += binfmt_aout.o
diff --git a/fs/dax.c b/fs/dax.c
new file mode 100644
index 000000000000..1a2bdbfa3ea9
--- /dev/null
+++ b/fs/dax.c
@@ -0,0 +1,186 @@
+/*
+ * fs/dax.c - Direct Access filesystem code
+ * Copyright (c) 2013-2014 Intel Corporation
+ * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
+ * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/atomic.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/mutex.h>
+#include <linux/uio.h>
+static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
+{
+        unsigned long pfn;
+        sector_t sector = bh->b_blocknr << (blkbits - 9);
+        return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
+}
+static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
+                        loff_t end)
+{
+        loff_t final = end - pos + first; /* The final byte of the buffer */
+        if (first > 0)
+                memset(addr, 0, first);
+        if (final < size)
+                memset(addr + final, 0, size - final);
+}
+static bool buffer_written(struct buffer_head *bh)
+{
+        return buffer_mapped(bh) && !buffer_unwritten(bh);
+}
+/*
+ * When ext4 encounters a hole, it returns without modifying the buffer_head
+ * which means that we can't trust b_size.  To cope with this, we set b_state
+ * to 0 before calling get_block and, if any bit is set, we know we can trust
+ * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
+ * and would save us time calling get_block repeatedly.
+ */
+static bool buffer_size_valid(struct buffer_head *bh)
+{
+        return bh->b_state != 0;
+}
+static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
+                        loff_t start, loff_t end, get_block_t get_block,
+                        struct buffer_head *bh)
+{
+        ssize_t retval = 0;
+        loff_t pos = start;
+        loff_t max = start;
+        loff_t bh_max = start;
+        void *addr;
+        bool hole = false;
+        if (rw != WRITE)
+                end = min(end, i_size_read(inode));
+        while (pos < end) {
+                unsigned len;
+                if (pos == max) {
+                        unsigned blkbits = inode->i_blkbits;
+                        sector_t block = pos >> blkbits;
+                        unsigned first = pos - (block << blkbits);
+                        long size;
+                        if (pos == bh_max) {
+                                bh->b_size = PAGE_ALIGN(end - pos);
+                                bh->b_state = 0;
+                                retval = get_block(inode, block, bh,
+                                                                rw == WRITE);
+                                if (retval)
+                                        break;
+                                if (!buffer_size_valid(bh))
+                                        bh->b_size = 1 << blkbits;
+                                bh_max = pos - first + bh->b_size;
+                        } else {
+                                unsigned done = bh->b_size -
+                                                (bh_max - (pos - first));
+                                bh->b_blocknr += done >> blkbits;
+                                bh->b_size -= done;
+                        }
+                        hole = (rw != WRITE) && !buffer_written(bh);
+                        if (hole) {
+                                addr = NULL;
+                                size = bh->b_size - first;
+                        } else {
+                                retval = dax_get_addr(bh, &addr, blkbits);
+                                if (retval < 0)
+                                        break;
+                                if (buffer_unwritten(bh) || buffer_new(bh))
+                                        dax_new_buf(addr, retval, first, pos,
+                                                                        end);
+                                addr += first;
+                                size = retval - first;
+                        }
+                        max = min(pos + size, end);
+                }
+                if (rw == WRITE)
+                        len = copy_from_iter(addr, max - pos, iter);
+                else if (!hole)
+                        len = copy_to_iter(addr, max - pos, iter);
+                else
+                        len = iov_iter_zero(max - pos, iter);
+                if (!len)
+                        break;
+                pos += len;
+                addr += len;
+        }
+        return (pos == start) ? retval : pos - start;
+}
+/**
+ * dax_do_io - Perform I/O to a DAX file
+ * @rw: READ to read or WRITE to write
+ * @iocb: The control block for this I/O
+ * @inode: The file which the I/O is directed at
+ * @iter: The addresses to do I/O from or to
+ * @pos: The file offset where the I/O starts
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ * @end_io: A filesystem callback for I/O completion
+ * @flags: See below
+ *
+ * This function uses the same locking scheme as do_blockdev_direct_IO:
+ * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
+ * caller for writes.  For reads, we take and release the i_mutex ourselves.
+ * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
+ * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
+ * is in progress.
+ */
+ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
+                        struct iov_iter *iter, loff_t pos,
+                        get_block_t get_block, dio_iodone_t end_io, int flags)
+{
+        struct buffer_head bh;
+        ssize_t retval = -EINVAL;
+        loff_t end = pos + iov_iter_count(iter);
+        memset(&bh, 0, sizeof(bh));
+        if ((flags & DIO_LOCKING) && (rw == READ)) {
+                struct address_space *mapping = inode->i_mapping;
+                mutex_lock(&inode->i_mutex);
+                retval = filemap_write_and_wait_range(mapping, pos, end - 1);
+                if (retval) {
+                        mutex_unlock(&inode->i_mutex);
+                        goto out;
+                }
+        }
+        /* Protects against truncate */
+        atomic_inc(&inode->i_dio_count);
+        retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
+        if ((flags & DIO_LOCKING) && (rw == READ))
+                mutex_unlock(&inode->i_mutex);
+        if ((retval > 0) && end_io)
+                end_io(iocb, pos, retval, bh.b_private);
+        inode_dio_done(inode);
+ out:
+        return retval;
+}
+EXPORT_SYMBOL_GPL(dax_do_io);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 7c87b22a7228..a247123fd798 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -81,8 +81,10 @@ const struct file_operations ext2_file_operations = {
 #ifdef CONFIG_EXT2_FS_XIP
 const struct file_operations ext2_xip_file_operations = {
        .llseek         = generic_file_llseek,
-        .read           = xip_file_read,
+        .read           = new_sync_read,
-        .write          = xip_file_write,
+        .write          = new_sync_write,
+        .read_iter      = generic_file_read_iter,
+        .write_iter     = generic_file_write_iter,
        .unlocked_ioctl = ext2_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ext2_compat_ioctl,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 0cb04486577d..3ccd5fd47d66 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -859,7 +859,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
        size_t count = iov_iter_count(iter);
        ssize_t ret;
-        ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block);
+        if (IS_DAX(inode))
+                ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
+                                NULL, DIO_LOCKING);
+        else
+                ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
+                                         ext2_get_block);
        if (ret < 0 && (rw & WRITE))
                ext2_write_failed(mapping, offset + count);
        return ret;
@@ -888,6 +893,7 @@ const struct address_space_operations ext2_aops = {
 const struct address_space_operations ext2_aops_xip = {
        .bmap                   = ext2_bmap,
        .get_xip_mem            = ext2_get_xip_mem,
+        .direct_IO              = ext2_direct_IO,
 };
 const struct address_space_operations ext2_nobh_aops = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fb373bb5cf03..241c3c030fb5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2587,12 +2587,11 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
 extern int generic_file_open(struct inode * inode, struct file * filp);
 extern int nonseekable_open(struct inode * inode, struct file * filp);
+ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *,
+                loff_t, get_block_t, dio_iodone_t, int flags);
 #ifdef CONFIG_FS_XIP
-extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len,
-                             loff_t *ppos);
 extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
-extern ssize_t xip_file_write(struct file *filp, const char __user *buf,
-                              size_t len, loff_t *ppos);
 extern int xip_truncate_page(struct address_space *mapping, loff_t from);
 #else
 static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
@@ -2756,6 +2755,11 @@ extern int generic_show_options(struct seq_file *m, struct dentry *root);
 extern void save_mount_options(struct super_block *sb, char *options);
 extern void replace_mount_options(struct super_block *sb, char *options);
+static inline bool io_is_direct(struct file *filp)
+{
+        return (filp->f_flags & O_DIRECT) || IS_DAX(file_inode(filp));
+}
 static inline ino_t parent_ino(struct dentry *dentry)
 {
        ino_t res;
diff --git a/mm/filemap.c b/mm/filemap.c
index 1578c224285e..ad7242043bdb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
        loff_t *ppos = &iocb->ki_pos;
        loff_t pos = *ppos;
-        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
+        if (io_is_direct(file)) {
-        if (file->f_flags & O_DIRECT) {
                struct address_space *mapping = file->f_mapping;
                struct inode *inode = mapping->host;
                size_t count = iov_iter_count(iter);
@@ -2584,8 +2583,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        if (err)
                goto out;
-        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
+        if (io_is_direct(file)) {
-        if (unlikely(file->f_flags & O_DIRECT)) {
                loff_t endbyte;
                written = generic_file_direct_write(iocb, from, pos);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 59e1c5585748..9c869f402c07 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -43,119 +43,6 @@ static struct page *xip_sparse_page(void)
 }
 /*
- * This is a file read routine for execute in place files, and uses
- * the mapping->a_ops->get_xip_mem() function for the actual low-level
- * stuff.
- *
- * Note the struct file* is not used at all.  It may be NULL.
- */
-static ssize_t
-do_xip_mapping_read(struct address_space *mapping,
-                    struct file_ra_state *_ra,
-                    struct file *filp,
-                    char __user *buf,
-                    size_t len,
-                    loff_t *ppos)
-{
-        struct inode *inode = mapping->host;
-        pgoff_t index, end_index;
-        unsigned long offset;
-        loff_t isize, pos;
-        size_t copied = 0, error = 0;
-        BUG_ON(!mapping->a_ops->get_xip_mem);
-        pos = *ppos;
-        index = pos >> PAGE_CACHE_SHIFT;
-        offset = pos & ~PAGE_CACHE_MASK;
-        isize = i_size_read(inode);
-        if (!isize)
-                goto out;
-        end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
-        do {
-                unsigned long nr, left;
-                void *xip_mem;
-                unsigned long xip_pfn;
-                int zero = 0;
-                /* nr is the maximum number of bytes to copy from this page */
-                nr = PAGE_CACHE_SIZE;
-                if (index >= end_index) {
-                        if (index > end_index)
-                                goto out;
-                        nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
-                        if (nr <= offset) {
-                                goto out;
-                        }
-                }
-                nr = nr - offset;
-                if (nr > len - copied)
-                        nr = len - copied;
-                error = mapping->a_ops->get_xip_mem(mapping, index, 0,
-                                                        &xip_mem, &xip_pfn);
-                if (unlikely(error)) {
-                        if (error == -ENODATA) {
-                                /* sparse */
-                                zero = 1;
-                        } else
-                                goto out;
-                }
-                /* If users can be writing to this page using arbitrary
-                 * virtual addresses, take care about potential aliasing
-                 * before reading the page on the kernel side.
-                 */
-                if (mapping_writably_mapped(mapping))
-                        /* address based flush */ ;
-                /*
-                 * Ok, we have the mem, so now we can copy it to user space...
-                 *
-                 * The actor routine returns how many bytes were actually used..
-                 * NOTE! This may not be the same as how much of a user buffer
-                 * we filled up (we may be padding etc), so we can only update
-                 * "pos" here (the actor routine has to update the user buffer
-                 * pointers and the remaining count).
-                 */
-                if (!zero)
-                        left = __copy_to_user(buf+copied, xip_mem+offset, nr);
-                else
-                        left = __clear_user(buf + copied, nr);
-                if (left) {
-                        error = -EFAULT;
-                        goto out;
-                }
-                copied += (nr - left);
-                offset += (nr - left);
-                index += offset >> PAGE_CACHE_SHIFT;
-                offset &= ~PAGE_CACHE_MASK;
-        } while (copied < len);
-out:
-        *ppos = pos + copied;
-        if (filp)
-                file_accessed(filp);
-        return (copied ? copied : error);
-}
-ssize_t
-xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
-{
-        if (!access_ok(VERIFY_WRITE, buf, len))
-                return -EFAULT;
-        return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
-                            buf, len, ppos);
-}
-EXPORT_SYMBOL_GPL(xip_file_read);
-/*
 * __xip_unmap is invoked from xip_unmap and xip_write
 *
 * This function walks all vmas of the address_space and unmaps the
@@ -341,127 +228,6 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
 }
 EXPORT_SYMBOL_GPL(xip_file_mmap);
-static ssize_t
-__xip_file_write(struct file *filp, const char __user *buf,
-                  size_t count, loff_t pos, loff_t *ppos)
-{
-        struct address_space * mapping = filp->f_mapping;
-        const struct address_space_operations *a_ops = mapping->a_ops;
-        struct inode    *inode = mapping->host;
-        long            status = 0;
-        size_t          bytes;
-        ssize_t         written = 0;
-        BUG_ON(!mapping->a_ops->get_xip_mem);
-        do {
-                unsigned long index;
-                unsigned long offset;
-                size_t copied;
-                void *xip_mem;
-                unsigned long xip_pfn;
-                offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
-                index = pos >> PAGE_CACHE_SHIFT;
-                bytes = PAGE_CACHE_SIZE - offset;
-                if (bytes > count)
-                        bytes = count;
-                status = a_ops->get_xip_mem(mapping, index, 0,
-                                                &xip_mem, &xip_pfn);
-                if (status == -ENODATA) {
-                        /* we allocate a new page unmap it */
-                        mutex_lock(&xip_sparse_mutex);
-                        status = a_ops->get_xip_mem(mapping, index, 1,
-                                                        &xip_mem, &xip_pfn);
-                        mutex_unlock(&xip_sparse_mutex);
-                        if (!status)
-                                /* unmap page at pgoff from all other vmas */
-                                __xip_unmap(mapping, index);
-                }
-                if (status)
-                        break;
-                copied = bytes -
-                        __copy_from_user_nocache(xip_mem + offset, buf, bytes);
-                if (likely(copied > 0)) {
-                        status = copied;
-                        if (status >= 0) {
-                                written += status;
-                                count -= status;
-                                pos += status;
-                                buf += status;
-                        }
-                }
-                if (unlikely(copied != bytes))
-                        if (status >= 0)
-                                status = -EFAULT;
-                if (status < 0)
-                        break;
-        } while (count);
-        *ppos = pos;
-        /*
-         * No need to use i_size_read() here, the i_size
-         * cannot change under us because we hold i_mutex.
-         */
-        if (pos > inode->i_size) {
-                i_size_write(inode, pos);
-                mark_inode_dirty(inode);
-        }
-        return written ? written : status;
-}
-ssize_t
-xip_file_write(struct file *filp, const char __user *buf, size_t len,
-               loff_t *ppos)
-{
-        struct address_space *mapping = filp->f_mapping;
-        struct inode *inode = mapping->host;
-        size_t count;
-        loff_t pos;
-        ssize_t ret;
-        mutex_lock(&inode->i_mutex);
-        if (!access_ok(VERIFY_READ, buf, len)) {
-                ret=-EFAULT;
-                goto out_up;
-        }
-        pos = *ppos;
-        count = len;
-        /* We can write back this queue in page reclaim */
-        current->backing_dev_info = inode_to_bdi(inode);
-        ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
-        if (ret)
-                goto out_backing;
-        if (count == 0)
-                goto out_backing;
-        ret = file_remove_suid(filp);
-        if (ret)
-                goto out_backing;
-        ret = file_update_time(filp);
-        if (ret)
-                goto out_backing;
-        ret = __xip_file_write (filp, buf, count, pos, ppos);
- out_backing:
-        current->backing_dev_info = NULL;
- out_up:
-        mutex_unlock(&inode->i_mutex);
-        return ret;
-}
-EXPORT_SYMBOL_GPL(xip_file_write);
 /*
 * truncate a page used for execute in place
 * functionality is analog to block_truncate_page but does use get_xip_mem
author	Matthew Wilcox <matthew.r.wilcox@intel.com>	2015-02-16 18:58:56 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-16 20:56:03 -0500
commit	d475c6346a38aef3058eba96867bfa726a3cc940 (patch)
tree	4d69d0f50a4a8e649a751dca8f710485848c0249
parent	fbbbad4bc2101e452b24e6e65d3d5e11314a0b5f (diff)