fs: introduce iomap infrastructure

Add infrastructure for multipage buffered writes. This is implemented using a main iterator that applies an actor function to a range that can be written. This infrastructure is used to implement a buffered write helper, one to zero file ranges and one to implement the ->page_mkwrite VM operations. All of them borrow a fair amount of code from fs/buffer.c for now by using an internal version of __block_write_begin that gets passed an iomap and builds the corresponding buffer head. The file system gets a set of paired ->iomap_begin and ->iomap_end calls which allow it to map/reserve a range and get a notification once the write code is finished with it. Based on earlier code from Dave Chinner. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
author: Christoph Hellwig <hch@lst.de> 2016-06-20 19:23:11 -0400
committer: Dave Chinner <david@fromorbit.com> 2016-06-20 19:23:11 -0400
commit: ae259a9c8593f98aa60d045df978a5482a67c53f (patch)
tree: a3c07fa9fb8c61475ff85f4d8812d83c287258ff
parent: 199a31c6d93ba9dc6f831fa1e77d9926f34f4e8a (diff)
6 files changed, 523 insertions, 10 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index b8fcb416be72..4524916fa200 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS
 if BLOCK
+config FS_IOMAP
+        bool
 source "fs/ext2/Kconfig"
 source "fs/ext4/Kconfig"
 source "fs/jbd2/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 85b6e13b62d3..ed2b63257ba9 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP)		+= coredump.o
 obj-$(CONFIG_SYSCTL)            += drop_caches.o
 obj-$(CONFIG_FHANDLE)           += fhandle.o
+obj-$(CONFIG_FS_IOMAP)          += iomap.o
 obj-y                           += quota/
diff --git a/fs/buffer.c b/fs/buffer.c
index 754813a6962b..228288a7de38 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -21,6 +21,7 @@
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
+#include <linux/iomap.h>
 #include <linux/mm.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
@@ -1891,8 +1892,62 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 }
 EXPORT_SYMBOL(page_zero_new_buffers);
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+/*
+ * Fill in buffer head @bh for file block @block of @inode from the extent
+ * described by @iomap.  The buffer's block device is always taken from the
+ * iomap; the buffer state flags (new, uptodate, mapped, delay, unwritten) and,
+ * for mapped/unwritten extents, the on-disk block number are derived from the
+ * mapping type.  A type other than the four handled below leaves the buffer
+ * with only b_bdev set, as the switch has no default case.
+ */
+static void
-                get_block_t *get_block)
+iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
+                struct iomap *iomap)
+{
+        loff_t offset = block << inode->i_blkbits;
+        bh->b_bdev = iomap->bdev;
+        /*
+         * Block points to offset in file we need to map, iomap contains
+         * the offset at which the map starts.  The caller must only ask for
+         * blocks that start before the end of the mapping: a block at or
+         * beyond iomap->offset + iomap->length is treated as a caller bug
+         * and trips the BUG_ON below rather than being left unmapped.
+         */
+        BUG_ON(offset >= iomap->offset + iomap->length);
+        switch (iomap->type) {
+        case IOMAP_HOLE:
+                /*
+                 * If the buffer is not up to date or beyond the current EOF,
+                 * we need to mark it as new to ensure sub-block zeroing is
+                 * executed if necessary.  The buffer is not marked mapped.
+                 */
+                if (!buffer_uptodate(bh) ||
+                    (offset >= i_size_read(inode)))
+                        set_buffer_new(bh);
+                break;
+        case IOMAP_DELALLOC:
+                if (!buffer_uptodate(bh) ||
+                    (offset >= i_size_read(inode)))
+                        set_buffer_new(bh);
+                set_buffer_uptodate(bh);
+                set_buffer_mapped(bh);
+                set_buffer_delay(bh);
+                break;
+        case IOMAP_UNWRITTEN:
+                /*
+                 * For unwritten regions, we always need to ensure that
+                 * sub-block writes cause the regions in the block we are not
+                 * writing to to be zeroed. Set the buffer as new to ensure
+                 * this, then share the block number setup with IOMAP_MAPPED.
+                 */
+                set_buffer_new(bh);
+                set_buffer_unwritten(bh);
+                /* FALLTHRU */
+        case IOMAP_MAPPED:
+                if (offset >= i_size_read(inode))
+                        set_buffer_new(bh);
+                bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
+                                ((offset - iomap->offset) >> inode->i_blkbits);
+                set_buffer_mapped(bh);
+                break;
+        }
+}
+int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+                get_block_t *get_block, struct iomap *iomap)
 {
        unsigned from = pos & (PAGE_SIZE - 1);
        unsigned to = from + len;
@@ -1928,9 +1983,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                        clear_buffer_new(bh);
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
-                        err = get_block(inode, block, bh, 1);
+                        if (get_block) {
-                        if (err)
+                                err = get_block(inode, block, bh, 1);
-                                break;
+                                if (err)
+                                        break;
+                        } else {
+                                iomap_to_bh(inode, block, bh, iomap);
+                        }
                        if (buffer_new(bh)) {
                                unmap_underlying_metadata(bh->b_bdev,
                                                        bh->b_blocknr);
@@ -1971,6 +2031,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                page_zero_new_buffers(page, from, to);
        return err;
 }
+/*
+ * get_block based entry point, kept for existing callers: forwards to
+ * __block_write_begin_int() with no iomap, so block mapping is done through
+ * @get_block.
+ */
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+                get_block_t *get_block)
+{
+        return __block_write_begin_int(page, pos, len, get_block, NULL);
+}
 EXPORT_SYMBOL(__block_write_begin);
 static int __block_commit_write(struct inode *inode, struct page *page,
diff --git a/fs/internal.h b/fs/internal.h
index b71deeecea17..c0c6f493ab8a 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
 struct super_block;
 struct file_system_type;
+struct iomap;
 struct linux_binprm;
 struct path;
 struct mount;
@@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
 * buffer.c
 */
 extern void guard_bio_eod(int rw, struct bio *bio);
+extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+                get_block_t *get_block, struct iomap *iomap);
 /*
 * char_dev.c
diff --git a/fs/iomap.c b/fs/iomap.c
new file mode 100644
index 000000000000..8e2fc17c266f
--- /dev/null
+++ b/fs/iomap.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (c) 2016 Christoph Hellwig.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/uaccess.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include "internal.h"
+/*
+ * Callback that iomap_apply() runs on one mapped range.  It is handed the
+ * inode, the start position and (possibly trimmed) length of the range, the
+ * caller's opaque @data pointer and the mapping that covers the range.
+ * iomap_apply() reports a positive return value to ->iomap_end as the amount
+ * written and hands any non-zero return value back to its own caller.
+ */
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+                void *data, struct iomap *iomap);
+/*
+ * Execute an iomap write on a segment of the mapping that spans a
+ * contiguous range of pages that have identical block mapping state.
+ *
+ * This avoids the need to map pages individually, do individual allocations
+ * for each page and most importantly avoid the need for filesystem specific
+ * locking per page. Instead, all the operations are amortised over the entire
+ * range of pages. It is assumed that the filesystems will lock whatever
+ * resources they require in the iomap_begin call, and release them in the
+ * iomap_end call.
+ *
+ * Returns the actor's result if it is non-zero (bytes processed, or a
+ * negative error), otherwise the result of ->iomap_end.  Errors from
+ * ->iomap_begin are returned directly.  Only one mapping is processed per
+ * call, so callers loop until their whole range has been handled.
+ */
+static loff_t
+iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
+                struct iomap_ops *ops, void *data, iomap_actor_t actor)
+{
+        struct iomap iomap = { 0 };
+        loff_t written = 0, ret;
+        /*
+         * Need to map a range from start position for length bytes. This can
+         * span multiple pages - it is only guaranteed to return a range of a
+         * single type of pages (e.g. all into a hole, all mapped or all
+         * unwritten). Failure at this point has nothing to undo.
+         *
+         * If allocation is required for this range, reserve the space now so
+         * that the allocation is guaranteed to succeed later on. Once we copy
+         * the data into the page cache pages, then we cannot fail otherwise we
+         * expose transient stale data. If the reserve fails, we can safely
+         * back out at this point as there is nothing to undo.
+         */
+        ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
+        if (ret)
+                return ret;
+        /*
+         * The mapping must not start after the requested position.
+         * NOTE(review): this early return does not call ->iomap_end even
+         * though ->iomap_begin succeeded.
+         */
+        if (WARN_ON(iomap.offset > pos))
+                return -EIO;
+        /*
+         * Cut down the length to the one actually provided by the filesystem,
+         * as it might not be able to give us the whole size that we requested.
+         */
+        if (iomap.offset + iomap.length < pos + length)
+                length = iomap.offset + iomap.length - pos;
+        /*
+         * Now that we have guaranteed that the space allocation will succeed,
+         * we can do the copy-in page by page without having to worry about
+         * failures exposing transient data.
+         */
+        written = actor(inode, pos, length, data, &iomap);
+        /*
+         * Now the data has been copied, commit the range we've copied.  This
+         * should not fail unless the filesystem has had a fatal error.  A
+         * negative actor result is reported to the filesystem as zero bytes
+         * written.
+         */
+        ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0,
+                        flags, &iomap);
+        return written ? written : ret;
+}
+/*
+ * Clean up after a failed or short write of @len bytes at @pos: drop any
+ * page cache that the attempt left beyond the current end of file.
+ */
+static void
+iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
+{
+        loff_t eof = i_size_read(inode);
+        loff_t end = pos + len;
+        /* Nothing extends past EOF, so there is nothing to throw away. */
+        if (end <= eof)
+                return;
+        /*
+         * Only truncate newly allocated pages beyond EOF, even if the
+         * write started inside the existing inode size.
+         */
+        truncate_pagecache_range(inode, max(pos, eof), end);
+}
+/*
+ * Get the page cache page covering @pos and prepare @len bytes of it for
+ * writing, mapping its buffers from @iomap.  On success the page is handed
+ * back through @pagep and 0 is returned.  If preparing the page fails, the
+ * page is unlocked and released, anything instantiated beyond EOF is
+ * trimmed, @pagep is set to NULL and the error is returned.  If no page could
+ * be obtained, -ENOMEM is returned and @pagep is left untouched.
+ */
+static int
+iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
+                struct page **pagep, struct iomap *iomap)
+{
+        struct page *pg;
+        int err;
+        /* The caller must stay inside the mapping it was given. */
+        BUG_ON(pos + len > iomap->offset + iomap->length);
+        pg = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT,
+                        flags);
+        if (!pg)
+                return -ENOMEM;
+        err = __block_write_begin_int(pg, pos, len, NULL, iomap);
+        if (likely(!err)) {
+                *pagep = pg;
+                return 0;
+        }
+        /* Preparation failed: give the page back and undo the attempt. */
+        unlock_page(pg);
+        put_page(pg);
+        iomap_write_failed(inode, pos, len);
+        *pagep = NULL;
+        return err;
+}
+/*
+ * Finish a write of @len bytes at @pos of which @copied bytes were actually
+ * copied into @page.  The work is delegated to generic_write_end(); if that
+ * reports less than @len, the failed-write cleanup is run for the range.
+ * Returns whatever generic_write_end() returned.
+ */
+static int
+iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+                unsigned copied, struct page *page)
+{
+        int done = generic_write_end(NULL, inode->i_mapping, pos, len,
+                        copied, page, NULL);
+        if (done < len)
+                iomap_write_failed(inode, pos, len);
+        return done;
+}
+/*
+ * iomap_apply() actor for buffered writes: copy data from the iov_iter passed
+ * in @data into the page cache, one page at a time, for at most @length bytes
+ * starting at @pos, all within the single mapping @iomap.  Returns the number
+ * of bytes written if any were, otherwise the last status (0 or a negative
+ * error).
+ */
+static loff_t
+iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+                struct iomap *iomap)
+{
+        struct iov_iter *i = data;
+        long status = 0;
+        ssize_t written = 0;
+        unsigned int flags = AOP_FLAG_NOFS;
+        /*
+         * Copies from kernel address space cannot fail (NFSD is a big user),
+         * so any iterator that is not a user iovec is written
+         * uninterruptibly.
+         */
+        if (!iter_is_iovec(i))
+                flags |= AOP_FLAG_UNINTERRUPTIBLE;
+        do {
+                struct page *page;
+                unsigned long offset;   /* Offset into pagecache page */
+                unsigned long bytes;    /* Bytes to write to page */
+                size_t copied;          /* Bytes copied from user */
+                offset = (pos & (PAGE_SIZE - 1));
+                bytes = min_t(unsigned long, PAGE_SIZE - offset,
+                                                iov_iter_count(i));
+again:
+                /* Never write past the end of the range we were given. */
+                if (bytes > length)
+                        bytes = length;
+                /*
+                 * Bring in the user page that we will copy from _first_.
+                 * Otherwise there's a nasty deadlock on copying from the
+                 * same page as we're writing to, without it being marked
+                 * up-to-date.
+                 *
+                 * Not only is this an optimisation, but it is also required
+                 * to check that the address is actually valid, when atomic
+                 * usercopies are used, below.
+                 */
+                if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+                        status = -EFAULT;
+                        break;
+                }
+                status = iomap_write_begin(inode, pos, bytes, flags, &page,
+                                iomap);
+                if (unlikely(status))
+                        break;
+                if (mapping_writably_mapped(inode->i_mapping))
+                        flush_dcache_page(page);
+                /* The copy runs with page faults disabled and may be short. */
+                pagefault_disable();
+                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+                pagefault_enable();
+                flush_dcache_page(page);
+                mark_page_accessed(page);
+                status = iomap_write_end(inode, pos, bytes, copied, page);
+                if (unlikely(status < 0))
+                        break;
+                /* From here on only count what write_end actually accepted. */
+                copied = status;
+                cond_resched();
+                iov_iter_advance(i, copied);
+                if (unlikely(copied == 0)) {
+                        /*
+                         * If we were unable to copy any data at all, we must
+                         * fall back to a single segment length write.
+                         *
+                         * If we didn't fallback here, we could livelock
+                         * because not all segments in the iov can be copied at
+                         * once without a pagefault.
+                         */
+                        bytes = min_t(unsigned long, PAGE_SIZE - offset,
+                                                iov_iter_single_seg_count(i));
+                        goto again;
+                }
+                pos += copied;
+                written += copied;
+                length -= copied;
+                balance_dirty_pages_ratelimited(inode->i_mapping);
+        } while (iov_iter_count(i) && length);
+        return written ? written : status;
+}
+/*
+ * Buffered write helper: write the contents of @iter to the file behind
+ * @iocb, starting at iocb->ki_pos, processing one mapping per iomap_apply()
+ * call until the iterator is drained or a call makes no progress.  Returns
+ * the total number of bytes written if any were, otherwise the result of the
+ * last iomap_apply() call.  iocb->ki_pos is not updated here.
+ */
+ssize_t
+iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
+                struct iomap_ops *ops)
+{
+        struct inode *inode = iocb->ki_filp->f_mapping->host;
+        loff_t start = iocb->ki_pos, done = 0, ret = 0;
+        for (; iov_iter_count(iter); done += ret) {
+                ret = iomap_apply(inode, start + done, iov_iter_count(iter),
+                                IOMAP_WRITE, ops, iter, iomap_write_actor);
+                /* Stop on error or when no progress was made. */
+                if (ret <= 0)
+                        break;
+        }
+        return done ? done : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+/*
+ * Zero @bytes bytes at in-page offset @offset of the page cache page that
+ * covers file position @pos, going through the regular write begin/end
+ * helpers.  Returns an error from iomap_write_begin(), or otherwise the
+ * result of iomap_write_end().
+ */
+static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
+                unsigned bytes, struct iomap *iomap)
+{
+        const unsigned aop_flags = AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS;
+        struct page *pg;
+        int err = iomap_write_begin(inode, pos, bytes, aop_flags, &pg, iomap);
+        if (err)
+                return err;
+        zero_user(pg, offset, bytes);
+        mark_page_accessed(pg);
+        return iomap_write_end(inode, pos, bytes, bytes, pg);
+}
+/*
+ * iomap_apply() actor for zeroing: zero @count bytes starting at @pos, one
+ * page at a time.  Holes and unwritten extents are skipped and reported as
+ * fully handled.  @data optionally points to a bool that is set to true once
+ * at least one page has been zeroed.  Returns the number of bytes handled, or
+ * a negative error from iomap_zero().
+ */
+static loff_t
+iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
+                void *data, struct iomap *iomap)
+{
+        bool *did_zero = data;
+        loff_t written = 0;
+        int status;
+        /* already zeroed?  we're done. */
+        if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+                return count;
+        do {
+                unsigned offset, bytes;
+                offset = pos & (PAGE_SIZE - 1); /* Within page */
+                /*
+                 * Clamp in loff_t, not unsigned: truncating @count to 32 bits
+                 * would turn a count that is a multiple of 4GiB into zero
+                 * bytes and the loop would never make progress.  The result
+                 * is at most PAGE_SIZE - offset, so it fits in @bytes.
+                 */
+                bytes = min_t(loff_t, PAGE_SIZE - offset, count);
+                status = iomap_zero(inode, pos, offset, bytes, iomap);
+                if (status < 0)
+                        return status;
+                pos += bytes;
+                count -= bytes;
+                written += bytes;
+                if (did_zero)
+                        *did_zero = true;
+        } while (count > 0);
+        return written;
+}
+/*
+ * Zero @len bytes of the file starting at @pos, one mapping at a time.
+ * @did_zero is passed through to the actor, which sets it when it actually
+ * zeroes something.  Returns 0 once the whole range has been handled, or
+ * the first iomap_apply() result that is zero or negative.
+ */
+int
+iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+                struct iomap_ops *ops)
+{
+        loff_t done;
+        for (; len > 0; pos += done, len -= done) {
+                done = iomap_apply(inode, pos, len, IOMAP_ZERO,
+                                ops, did_zero, iomap_zero_range_actor);
+                if (done <= 0)
+                        return done;
+        }
+        return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_zero_range);
+/*
+ * Zero from @pos up to the end of the filesystem block that contains it.
+ * If @pos already sits on a block boundary there is nothing to zero and 0 is
+ * returned; otherwise the result of iomap_zero_range() is returned.
+ */
+int
+iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+                struct iomap_ops *ops)
+{
+        unsigned bsize = (1 << inode->i_blkbits);
+        unsigned partial = pos & (bsize - 1);
+        return partial ?
+                iomap_zero_range(inode, pos, bsize - partial, did_zero, ops) :
+                0;
+}
+EXPORT_SYMBOL_GPL(iomap_truncate_page);
+/*
+ * iomap_apply() actor for ->page_mkwrite: prepare the buffers of the page
+ * passed in @data for writing using @iomap, then commit them.  Returns
+ * @length on success or the error from __block_write_begin_int().
+ */
+static loff_t
+iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
+                void *data, struct iomap *iomap)
+{
+        struct page *page = data;
+        int ret;
+        /*
+         * NOTE(review): pos & ~PAGE_MASK presumably reduces @pos to its
+         * offset within the page - confirm against the PAGE_MASK definition.
+         */
+        ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
+                        NULL, iomap);
+        if (ret)
+                return ret;
+        /* The commit always starts at offset 0 of the page, whatever @pos. */
+        block_commit_write(page, 0, length);
+        return length;
+}
+/*
+ * ->page_mkwrite helper: make the faulting page (vmf->page) writable by
+ * mapping, through @ops, the part of it that lies inside i_size.
+ *
+ * Returns 0 on success, with the page dirtied and left locked.  Returns
+ * -EFAULT if the page no longer belongs to this inode's mapping or starts
+ * beyond i_size, or the first iomap_apply() result that is zero or negative;
+ * in those cases the page is unlocked before returning.
+ */
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+                struct iomap_ops *ops)
+{
+        struct page *page = vmf->page;
+        struct inode *inode = file_inode(vma->vm_file);
+        unsigned long length;
+        loff_t offset, size;
+        ssize_t ret;
+        lock_page(page);
+        size = i_size_read(inode);
+        offset = page_offset(page);
+        if ((page->mapping != inode->i_mapping) || (offset > size)) {
+                /* We overload EFAULT to mean page got truncated */
+                ret = -EFAULT;
+                goto out_unlock;
+        }
+        /*
+         * Page is wholly or partially inside EOF.  The end of the page is
+         * computed from the loff_t page offset: shifting page->index
+         * directly is done in the width of the index type and can wrap
+         * once the byte offset no longer fits in it.
+         */
+        if (offset + PAGE_SIZE > size)
+                length = size & ~PAGE_MASK;
+        else
+                length = PAGE_SIZE;
+        /* Each iomap_apply() call handles one mapping; loop over the page. */
+        while (length > 0) {
+                ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
+                                ops, page, iomap_page_mkwrite_actor);
+                if (unlikely(ret <= 0))
+                        goto out_unlock;
+                offset += ret;
+                length -= ret;
+        }
+        set_page_dirty(page);
+        wait_for_stable_page(page);
+        return 0;
+out_unlock:
+        unlock_page(page);
+        return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 1b22197bcf01..d2f469ae899a 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -3,19 +3,65 @@
 #include <linux/types.h>
-/* types of block ranges for multipage write mappings. */
+struct inode;
+struct iov_iter;
+struct kiocb;
+struct vm_area_struct;
+struct vm_fault;
+/*
+ * Types of block ranges for iomap mappings:
+ */
 #define IOMAP_HOLE      0x01    /* no blocks allocated, need allocation */
 #define IOMAP_DELALLOC  0x02    /* delayed allocation blocks */
 #define IOMAP_MAPPED    0x03    /* blocks allocated @blkno */
 #define IOMAP_UNWRITTEN 0x04    /* blocks allocated @blkno in unwritten state */
+/*
+ * Magic value for blkno:
+ */
 #define IOMAP_NULL_BLOCK -1LL   /* blkno is not valid */
 struct iomap {
-        sector_t        blkno;  /* first sector of mapping */
+        sector_t                blkno;  /* 1st sector of mapping, 512b units */
-        loff_t          offset; /* file offset of mapping, bytes */
+        loff_t                  offset; /* file offset of mapping, bytes */
-        u64             length; /* length of mapping, bytes */
+        u64                     length; /* length of mapping, bytes */
-        int             type;   /* type of mapping */
+        int                     type;   /* type of mapping */
+        struct block_device     *bdev;  /* block device for I/O */
+};
+/*
+ * Flags for iomap_begin / iomap_end.  No flag implies a read.
+ */
+#define IOMAP_WRITE             (1 << 0)
+#define IOMAP_ZERO              (1 << 1)
+struct iomap_ops {
+        /*
+         * Return the existing mapping at pos, or reserve space starting at
+         * pos for up to length, as long as we can do it as a single mapping.
+         * The actual length is returned in iomap->length.
+         */
+        int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
+                        unsigned flags, struct iomap *iomap);
+        /*
+         * Commit and/or unreserve space previously allocated using
+         * iomap_begin.  Written indicates the length of the successful write
+         * operation which needs to be committed, while the rest needs to be
+         * unreserved.  Written might be zero if no data was written.
+         */
+        int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length,
+                        ssize_t written, unsigned flags, struct iomap *iomap);
 };
+ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
+                struct iomap_ops *ops);
+int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
+                bool *did_zero, struct iomap_ops *ops);
+int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+                struct iomap_ops *ops);
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+                struct iomap_ops *ops);
 #endif /* LINUX_IOMAP_H */
author	Christoph Hellwig <hch@lst.de>	2016-06-20 19:23:11 -0400
committer	Dave Chinner <david@fromorbit.com>	2016-06-20 19:23:11 -0400
commit	ae259a9c8593f98aa60d045df978a5482a67c53f (patch)
tree	a3c07fa9fb8c61475ff85f4d8812d83c287258ff
parent	199a31c6d93ba9dc6f831fa1e77d9926f34f4e8a (diff)

diff --git a/fs/Kconfig b/fs/Kconfig index b8fcb416be72..4524916fa200 100644 --- a/fs/Kconfig +++ b/fs/Kconfig
@@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS
10		10
11	if BLOCK	11	if BLOCK
12		12
		13	config FS_IOMAP
		14	bool
		15
13	source "fs/ext2/Kconfig"	16	source "fs/ext2/Kconfig"
14	source "fs/ext4/Kconfig"	17	source "fs/ext4/Kconfig"
15	source "fs/jbd2/Kconfig"	18	source "fs/jbd2/Kconfig"


diff --git a/fs/Makefile b/fs/Makefile index 85b6e13b62d3..ed2b63257ba9 100644 --- a/fs/Makefile +++ b/fs/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP) += coredump.o
49	obj-$(CONFIG_SYSCTL) += drop_caches.o	49	obj-$(CONFIG_SYSCTL) += drop_caches.o
50		50
51	obj-$(CONFIG_FHANDLE) += fhandle.o	51	obj-$(CONFIG_FHANDLE) += fhandle.o
		52	obj-$(CONFIG_FS_IOMAP) += iomap.o
52		53
53	obj-y += quota/	54	obj-y += quota/
54		55


diff --git a/fs/buffer.c b/fs/buffer.c index 754813a6962b..228288a7de38 100644 --- a/fs/buffer.c +++ b/fs/buffer.c
@@ -21,6 +21,7 @@
21	#include <linux/kernel.h>	21	#include <linux/kernel.h>
22	#include <linux/syscalls.h>	22	#include <linux/syscalls.h>
23	#include <linux/fs.h>	23	#include <linux/fs.h>
		24	#include <linux/iomap.h>
24	#include <linux/mm.h>	25	#include <linux/mm.h>
25	#include <linux/percpu.h>	26	#include <linux/percpu.h>
26	#include <linux/slab.h>	27	#include <linux/slab.h>
@@ -1891,8 +1892,62 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1891	}	1892	}
1892	EXPORT_SYMBOL(page_zero_new_buffers);	1893	EXPORT_SYMBOL(page_zero_new_buffers);
1893		1894
1894	int __block_write_begin(struct page *page, loff_t pos, unsigned len,	1895	static void
1895	get_block_t *get_block)	1896	iomap_to_bh(struct inode inode, sector_t block, struct buffer_head bh,
		1897	struct iomap *iomap)
		1898	{
		1899	loff_t offset = block << inode->i_blkbits;
		1900
		1901	bh->b_bdev = iomap->bdev;
		1902
		1903	/*
		1904	* Block points to offset in file we need to map, iomap contains
		1905	* the offset at which the map starts. If the map ends before the
		1906	* current block, then do not map the buffer and let the caller
		1907	* handle it.
		1908	*/
		1909	BUG_ON(offset >= iomap->offset + iomap->length);
		1910
		1911	switch (iomap->type) {
		1912	case IOMAP_HOLE:
		1913	/*
		1914	* If the buffer is not up to date or beyond the current EOF,
		1915	* we need to mark it as new to ensure sub-block zeroing is
		1916	* executed if necessary.
		1917	*/
		1918	if (!buffer_uptodate(bh) \|\|
		1919	(offset >= i_size_read(inode)))
		1920	set_buffer_new(bh);
		1921	break;
		1922	case IOMAP_DELALLOC:
		1923	if (!buffer_uptodate(bh) \|\|
		1924	(offset >= i_size_read(inode)))
		1925	set_buffer_new(bh);
		1926	set_buffer_uptodate(bh);
		1927	set_buffer_mapped(bh);
		1928	set_buffer_delay(bh);
		1929	break;
		1930	case IOMAP_UNWRITTEN:
		1931	/*
		1932	* For unwritten regions, we always need to ensure that
		1933	* sub-block writes cause the regions in the block we are not
		1934	* writing to are zeroed. Set the buffer as new to ensure this.
		1935	*/
		1936	set_buffer_new(bh);
		1937	set_buffer_unwritten(bh);
		1938	/* FALLTHRU */
		1939	case IOMAP_MAPPED:
		1940	if (offset >= i_size_read(inode))
		1941	set_buffer_new(bh);
		1942	bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
		1943	((offset - iomap->offset) >> inode->i_blkbits);
		1944	set_buffer_mapped(bh);
		1945	break;
		1946	}
		1947	}
		1948
		1949	int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
		1950	get_block_t get_block, struct iomap iomap)
1896	{	1951	{
1897	unsigned from = pos & (PAGE_SIZE - 1);	1952	unsigned from = pos & (PAGE_SIZE - 1);
1898	unsigned to = from + len;	1953	unsigned to = from + len;
@@ -1928,9 +1983,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1928	clear_buffer_new(bh);	1983	clear_buffer_new(bh);
1929	if (!buffer_mapped(bh)) {	1984	if (!buffer_mapped(bh)) {
1930	WARN_ON(bh->b_size != blocksize);	1985	WARN_ON(bh->b_size != blocksize);
1931	err = get_block(inode, block, bh, 1);	1986	if (get_block) {
1932	if (err)	1987	err = get_block(inode, block, bh, 1);
1933	break;	1988	if (err)
		1989	break;
		1990	} else {
		1991	iomap_to_bh(inode, block, bh, iomap);
		1992	}
		1993
1934	if (buffer_new(bh)) {	1994	if (buffer_new(bh)) {
1935	unmap_underlying_metadata(bh->b_bdev,	1995	unmap_underlying_metadata(bh->b_bdev,
1936	bh->b_blocknr);	1996	bh->b_blocknr);
@@ -1971,6 +2031,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1971	page_zero_new_buffers(page, from, to);	2031	page_zero_new_buffers(page, from, to);
1972	return err;	2032	return err;
1973	}	2033	}
		2034
		2035	int __block_write_begin(struct page *page, loff_t pos, unsigned len,
		2036	get_block_t *get_block)
		2037	{
		2038	return __block_write_begin_int(page, pos, len, get_block, NULL);
		2039	}
1974	EXPORT_SYMBOL(__block_write_begin);	2040	EXPORT_SYMBOL(__block_write_begin);
1975		2041
1976	static int __block_commit_write(struct inode inode, struct page page,	2042	static int __block_commit_write(struct inode inode, struct page page,


diff --git a/fs/internal.h b/fs/internal.h index b71deeecea17..c0c6f493ab8a 100644 --- a/fs/internal.h +++ b/fs/internal.h
@@ -11,6 +11,7 @@
11		11
12	struct super_block;	12	struct super_block;
13	struct file_system_type;	13	struct file_system_type;
		14	struct iomap;
14	struct linux_binprm;	15	struct linux_binprm;
15	struct path;	16	struct path;
16	struct mount;	17	struct mount;
@@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
39	* buffer.c	40	* buffer.c
40	*/	41	*/
41	extern void guard_bio_eod(int rw, struct bio *bio);	42	extern void guard_bio_eod(int rw, struct bio *bio);
		43	extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
		44	get_block_t get_block, struct iomap iomap);
42		45
43	/*	46	/*
44	* char_dev.c	47	* char_dev.c


diff --git a/fs/iomap.c b/fs/iomap.c new file mode 100644 index 000000000000..8e2fc17c266f --- /dev/null +++ b/fs/iomap.c
@@ -0,0 +1,394 @@
		1	/*
		2	* Copyright (C) 2010 Red Hat, Inc.
		3	* Copyright (c) 2016 Christoph Hellwig.
		4	*
		5	* This program is free software; you can redistribute it and/or modify it
		6	* under the terms and conditions of the GNU General Public License,
		7	* version 2, as published by the Free Software Foundation.
		8	*
		9	* This program is distributed in the hope it will be useful, but WITHOUT
		10	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
		11	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
		12	* more details.
		13	*/
		14	#include <linux/module.h>
		15	#include <linux/compiler.h>
		16	#include <linux/fs.h>
		17	#include <linux/iomap.h>
		18	#include <linux/uaccess.h>
		19	#include <linux/gfp.h>
		20	#include <linux/mm.h>
		21	#include <linux/swap.h>
		22	#include <linux/pagemap.h>
		23	#include <linux/file.h>
		24	#include <linux/uio.h>
		25	#include <linux/backing-dev.h>
		26	#include <linux/buffer_head.h>
		27	#include "internal.h"
		28
		29	typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
		30	void *data, struct iomap *iomap);
		31
		32	/*
		33	* Execute a iomap write on a segment of the mapping that spans a
		34	* contiguous range of pages that have identical block mapping state.
		35	*
		36	* This avoids the need to map pages individually, do individual allocations
		37	* for each page and most importantly avoid the need for filesystem specific
		38	* locking per page. Instead, all the operations are amortised over the entire
		39	* range of pages. It is assumed that the filesystems will lock whatever
		40	* resources they require in the iomap_begin call, and release them in the
		41	* iomap_end call.
		42	*/
		43	static loff_t
		44	iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
		45	struct iomap_ops *ops, void *data, iomap_actor_t actor)
		46	{
		47	struct iomap iomap = { 0 };
		48	loff_t written = 0, ret;
		49
		50	/*
		51	* Need to map a range from start position for length bytes. This can
		52	* span multiple pages - it is only guaranteed to return a range of a
		53	* single type of pages (e.g. all into a hole, all mapped or all
		54	* unwritten). Failure at this point has nothing to undo.
		55	*
		56	* If allocation is required for this range, reserve the space now so
		57	* that the allocation is guaranteed to succeed later on. Once we copy
		58	* the data into the page cache pages, then we cannot fail otherwise we
		59	* expose transient stale data. If the reserve fails, we can safely
		60	* back out at this point as there is nothing to undo.
		61	*/
		62	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
		63	if (ret)
		64	return ret;
		65	if (WARN_ON(iomap.offset > pos))
		66	return -EIO;
		67
		68	/*
		69	* Cut down the length to the one actually provided by the filesystem,
		70	* as it might not be able to give us the whole size that we requested.
		71	*/
		72	if (iomap.offset + iomap.length < pos + length)
		73	length = iomap.offset + iomap.length - pos;
		74
		75	/*
		76	* Now that we have guaranteed that the space allocation will succeed.
		77	* we can do the copy-in page by page without having to worry about
		78	* failures exposing transient data.
		79	*/
		80	written = actor(inode, pos, length, data, &iomap);
		81
		82	/*
		83	* Now the data has been copied, commit the range we've copied. This
		84	* should not fail unless the filesystem has had a fatal error.
		85	*/
		86	ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0,
		87	flags, &iomap);
		88
		89	return written ? written : ret;
		90	}
		91
		92	static void
		93	iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
		94	{
		95	loff_t i_size = i_size_read(inode);
		96
		97	/*
		98	* Only truncate newly allocated pages beyoned EOF, even if the
		99	* write started inside the existing inode size.
		100	*/
		101	if (pos + len > i_size)
		102	truncate_pagecache_range(inode, max(pos, i_size), pos + len);
		103	}
		104
		105	static int
		106	iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
		107	struct page **pagep, struct iomap *iomap)
		108	{
		109	pgoff_t index = pos >> PAGE_SHIFT;
		110	struct page *page;
		111	int status = 0;
		112
		113	BUG_ON(pos + len > iomap->offset + iomap->length);
		114
		115	page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
		116	if (!page)
		117	return -ENOMEM;
		118
		119	status = __block_write_begin_int(page, pos, len, NULL, iomap);
		120	if (unlikely(status)) {
		121	unlock_page(page);
		122	put_page(page);
		123	page = NULL;
		124
		125	iomap_write_failed(inode, pos, len);
		126	}
		127
		128	*pagep = page;
		129	return status;
		130	}
		131
		132	static int
		133	iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
		134	unsigned copied, struct page *page)
		135	{
		136	int ret;
		137
		138	ret = generic_write_end(NULL, inode->i_mapping, pos, len,
		139	copied, page, NULL);
		140	if (ret < len)
		141	iomap_write_failed(inode, pos, len);
		142	return ret;
		143	}
		144
		145	static loff_t
		146	iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		147	struct iomap *iomap)
		148	{
		149	struct iov_iter *i = data;
		150	long status = 0;
		151	ssize_t written = 0;
		152	unsigned int flags = AOP_FLAG_NOFS;
		153
		154	/*
		155	* Copies from kernel address space cannot fail (NFSD is a big user).
		156	*/
		157	if (!iter_is_iovec(i))
		158	flags |= AOP_FLAG_UNINTERRUPTIBLE;
		159
		160	do {
		161	struct page *page;
		162	unsigned long offset; /* Offset into pagecache page */
		163	unsigned long bytes; /* Bytes to write to page */
		164	size_t copied; /* Bytes copied from user */
		165
		166	offset = (pos & (PAGE_SIZE - 1));
		167	bytes = min_t(unsigned long, PAGE_SIZE - offset,
		168	iov_iter_count(i));
		169	again:
		170	if (bytes > length)
		171	bytes = length;
		172
		173	/*
		174	* Bring in the user page that we will copy from _first_.
		175	* Otherwise there's a nasty deadlock on copying from the
		176	* same page as we're writing to, without it being marked
		177	* up-to-date.
		178	*
		179	* Not only is this an optimisation, but it is also required
		180	* to check that the address is actually valid, when atomic
		181	* usercopies are used, below.
		182	*/
		183	if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
		184	status = -EFAULT;
		185	break;
		186	}
		187
		188	status = iomap_write_begin(inode, pos, bytes, flags, &page,
		189	iomap);
		190	if (unlikely(status))
		191	break;
		192
		193	if (mapping_writably_mapped(inode->i_mapping))
		194	flush_dcache_page(page);
		195
		196	pagefault_disable();
		197	copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		198	pagefault_enable();
		199
		200	flush_dcache_page(page);
		201	mark_page_accessed(page);
		202
		203	status = iomap_write_end(inode, pos, bytes, copied, page);
		204	if (unlikely(status < 0))
		205	break;
		206	copied = status;
		207
		208	cond_resched();
		209
		210	iov_iter_advance(i, copied);
		211	if (unlikely(copied == 0)) {
		212	/*
		213	* If we were unable to copy any data at all, we must
		214	* fall back to a single segment length write.
		215	*
		216	* If we didn't fallback here, we could livelock
		217	* because not all segments in the iov can be copied at
		218	* once without a pagefault.
		219	*/
		220	bytes = min_t(unsigned long, PAGE_SIZE - offset,
		221	iov_iter_single_seg_count(i));
		222	goto again;
		223	}
		224	pos += copied;
		225	written += copied;
		226	length -= copied;
		227
		228	balance_dirty_pages_ratelimited(inode->i_mapping);
		229	} while (iov_iter_count(i) && length);
		230
		231	return written ? written : status;
		232	}
		233
		234	ssize_t
		235	iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
		236	struct iomap_ops *ops)
		237	{
		238	struct inode *inode = iocb->ki_filp->f_mapping->host;
		239	loff_t pos = iocb->ki_pos, ret = 0, written = 0;
		240
		241	while (iov_iter_count(iter)) {
		242	ret = iomap_apply(inode, pos, iov_iter_count(iter),
		243	IOMAP_WRITE, ops, iter, iomap_write_actor);
		244	if (ret <= 0)
		245	break;
		246	pos += ret;
		247	written += ret;
		248	}
		249
		250	return written ? written : ret;
		251	}
		252	EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
		253
		254	static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
		255	unsigned bytes, struct iomap *iomap)
		256	{
		257	struct page *page;
		258	int status;
		259
		260	status = iomap_write_begin(inode, pos, bytes,
		261	AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap);
		262	if (status)
		263	return status;
		264
		265	zero_user(page, offset, bytes);
		266	mark_page_accessed(page);
		267
		268	return iomap_write_end(inode, pos, bytes, bytes, page);
		269	}
		270
		271	static loff_t
		272	iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
		273	void *data, struct iomap *iomap)
		274	{
		275	bool *did_zero = data;
		276	loff_t written = 0;
		277	int status;
		278
		279	/* already zeroed? we're done. */
		280	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
		281	return count;
		282
		283	do {
		284	unsigned offset, bytes;
		285
		286	offset = pos & (PAGE_SIZE - 1); /* Within page */
		287	bytes = min_t(unsigned, PAGE_SIZE - offset, count);
		288
		289	status = iomap_zero(inode, pos, offset, bytes, iomap);
		290	if (status < 0)
		291	return status;
		292
		293	pos += bytes;
		294	count -= bytes;
		295	written += bytes;
		296	if (did_zero)
		297	*did_zero = true;
		298	} while (count > 0);
		299
		300	return written;
		301	}
		302
		303	int
		304	iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		305	struct iomap_ops *ops)
		306	{
		307	loff_t ret;
		308
		309	while (len > 0) {
		310	ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
		311	ops, did_zero, iomap_zero_range_actor);
		312	if (ret <= 0)
		313	return ret;
		314
		315	pos += ret;
		316	len -= ret;
		317	}
		318
		319	return 0;
		320	}
		321	EXPORT_SYMBOL_GPL(iomap_zero_range);
		322
		323	int
		324	iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		325	struct iomap_ops *ops)
		326	{
		327	unsigned blocksize = (1 << inode->i_blkbits);
		328	unsigned off = pos & (blocksize - 1);
		329
		330	/* Block boundary? Nothing to do */
		331	if (!off)
		332	return 0;
		333	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
		334	}
		335	EXPORT_SYMBOL_GPL(iomap_truncate_page);
		336
		337	static loff_t
		338	iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
		339	void *data, struct iomap *iomap)
		340	{
		341	struct page *page = data;
		342	int ret;
		343
		344	ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
		345	NULL, iomap);
		346	if (ret)
		347	return ret;
		348
		349	block_commit_write(page, 0, length);
		350	return length;
		351	}
		352
		353	int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
		354	struct iomap_ops *ops)
		355	{
		356	struct page *page = vmf->page;
		357	struct inode *inode = file_inode(vma->vm_file);
		358	unsigned long length;
		359	loff_t offset, size;
		360	ssize_t ret;
		361
		362	lock_page(page);
		363	size = i_size_read(inode);
		364	if ((page->mapping != inode->i_mapping) ||
		365	(page_offset(page) > size)) {
		366	/* We overload EFAULT to mean page got truncated */
		367	ret = -EFAULT;
		368	goto out_unlock;
		369	}
		370
		371	/* page is wholly or partially inside EOF */
		372	if (((page->index + 1) << PAGE_SHIFT) > size)
		373	length = size & ~PAGE_MASK;
		374	else
		375	length = PAGE_SIZE;
		376
		377	offset = page_offset(page);
		378	while (length > 0) {
		379	ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
		380	ops, page, iomap_page_mkwrite_actor);
		381	if (unlikely(ret <= 0))
		382	goto out_unlock;
		383	offset += ret;
		384	length -= ret;
		385	}
		386
		387	set_page_dirty(page);
		388	wait_for_stable_page(page);
		389	return 0;
		390	out_unlock:
		391	unlock_page(page);
		392	return ret;
		393	}
		394	EXPORT_SYMBOL_GPL(iomap_page_mkwrite);


diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 1b22197bcf01..d2f469ae899a 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h
@@ -3,19 +3,65 @@
3		3
4	#include <linux/types.h>	4	#include <linux/types.h>
5		5
6	/* types of block ranges for multipage write mappings. */	6	struct inode;
		7	struct iov_iter;
		8	struct kiocb;
		9	struct vm_area_struct;
		10	struct vm_fault;
		11
		12	/*
		13	* Types of block ranges for iomap mappings:
		14	*/
7	#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */	15	#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
8	#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */	16	#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
9	#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */	17	#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
10	#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */	18	#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
11		19
		20	/*
		21	* Magic value for blkno:
		22	*/
12	#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */	23	#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
13		24
14	struct iomap {	25	struct iomap {
15	sector_t blkno; /* first sector of mapping */	26	sector_t blkno; /* 1st sector of mapping, 512b units */
16	loff_t offset; /* file offset of mapping, bytes */	27	loff_t offset; /* file offset of mapping, bytes */
17	u64 length; /* length of mapping, bytes */	28	u64 length; /* length of mapping, bytes */
18	int type; /* type of mapping */	29	int type; /* type of mapping */
		30	struct block_device *bdev; /* block device for I/O */
		31	};
		32
		33	/*
		34	* Flags for iomap_begin / iomap_end. No flag implies a read.
		35	*/
		36	#define IOMAP_WRITE (1 << 0)
		37	#define IOMAP_ZERO (1 << 1)
		38
		39	struct iomap_ops {
		40	/*
		41	* Return the existing mapping at pos, or reserve space starting at
		42	* pos for up to length, as long as we can do it as a single mapping.
		43	* The actual length is returned in iomap->length.
		44	*/
		45	int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
		46	unsigned flags, struct iomap *iomap);
		47
		48	/*
		49	* Commit and/or unreserve space previous allocated using iomap_begin.
		50	* Written indicates the length of the successful write operation which
		51	* needs to be commited, while the rest needs to be unreserved.
		52	* Written might be zero if no data was written.
		53	*/
		54	int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length,
		55	ssize_t written, unsigned flags, struct iomap *iomap);
19	};	56	};
20		57
		58	ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
		59	struct iomap_ops *ops);
		60	int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
		61	bool *did_zero, struct iomap_ops *ops);
		62	int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		63	struct iomap_ops *ops);
		64	int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
		65	struct iomap_ops *ops);
		66
21	#endif /* LINUX_IOMAP_H */	67	#endif /* LINUX_IOMAP_H */