author     Nick Piggin <npiggin@suse.de>    2007-10-16 04:25:01 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>    2007-10-16 12:42:55 -0400
commit     afddba49d18f346e5cc2938b6ed7c512db18ca68 (patch)
tree       4726e3d3b0e9e8e5b5d3b2b0cccb36446bbdf3ca
parent     637aff46f94a754207c80c8c64bf1b74f24b967d (diff)
fs: introduce write_begin, write_end, and perform_write aops
These are intended to replace prepare_write and commit_write with more
flexible alternatives that are also able to avoid the buffered write
deadlock problems efficiently (which prepare_write is unable to do).

[mark.fasheh@oracle.com: API design contributions, code review and fixes]
[akpm@linux-foundation.org: various fixes]
[dmonakhov@sw.ru: new aop block_write_begin fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Dmitriy Monakhov <dmonakhov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  Documentation/filesystems/Locking     9
-rw-r--r--  Documentation/filesystems/vfs.txt    45
-rw-r--r--  drivers/block/loop.c                 75
-rw-r--r--  fs/buffer.c                         201
-rw-r--r--  fs/libfs.c                           44
-rw-r--r--  fs/namei.c                           46
-rw-r--r--  fs/splice.c                          69
-rw-r--r--  include/linux/buffer_head.h          10
-rw-r--r--  include/linux/fs.h                   30
-rw-r--r--  include/linux/pagemap.h               2
-rw-r--r--  mm/filemap.c                        250
11 files changed, 575 insertions(+), 206 deletions(-)
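
The calling convention at the heart of the patch, condensed from the
generic_perform_write() added to mm/filemap.c below: the usercopy between the
two hooks runs with pagefaults disabled, so the locked pagecache page is never
held while waiting on a fault against the source buffer (the deadlock that
prepare_write could not avoid), and a short copy is legal; it is simply
reported to write_end via 'copied' and retried by the caller.

	status = a_ops->write_begin(file, mapping, pos, bytes, 0,
					&page, &fsdata);
	if (status == 0) {
		pagefault_disable();
		/* may legally copy fewer than 'bytes' bytes */
		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		pagefault_enable();
		flush_dcache_page(page);
		/* returns the byte count committed (<= copied), or < 0 */
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
	}
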
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index f0f825808ca4..fe26cc978523 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -178,15 +178,18 @@ prototypes:
 locking rules:
 	All except set_page_dirty may block
 
-			BKL	PageLocked(page)
+			BKL	PageLocked(page)	i_sem
 writepage:		no	yes, unlocks (see below)
 readpage:		no	yes, unlocks
 sync_page:		no	maybe
 writepages:		no
 set_page_dirty		no	no
 readpages:		no
-prepare_write:		no	yes
-commit_write:		no	yes
+prepare_write:		no	yes			yes
+commit_write:		no	yes			yes
+write_begin:		no	locks the page		yes
+write_end:		no	yes, unlocks		yes
+perform_write:		no	n/a			yes
 bmap:			yes
 invalidatepage:	no	yes
 releasepage:		no	yes
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 045f3e055a28..281c19ff7f45 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -537,6 +537,12 @@ struct address_space_operations {
 			struct list_head *pages, unsigned nr_pages);
 	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
 	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+	int (*write_begin)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata);
+	int (*write_end)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata);
 	sector_t (*bmap)(struct address_space *, sector_t);
 	int (*invalidatepage) (struct page *, unsigned long);
 	int (*releasepage) (struct page *, int);
@@ -633,6 +639,45 @@ struct address_space_operations {
 	operations.  It should avoid returning an error if possible -
 	errors should have been handled by prepare_write.
 
+  write_begin: This is intended as a replacement for prepare_write. The
+	key differences are that:
+		- it returns a locked page (in *pagep) rather than being
+		  given a pre-locked page;
+		- it must be able to cope with short writes (where the
+		  length passed to write_begin is greater than the number
+		  of bytes copied into the page).
+
+	Called by the generic buffered write code to ask the filesystem to
+	prepare to write len bytes at the given offset in the file. The
+	address_space should check that the write will be able to complete,
+	by allocating space if necessary and doing any other internal
+	housekeeping.  If the write will update parts of any basic-blocks on
+	storage, then those blocks should be pre-read (if they haven't been
+	read already) so that the updated blocks can be written out properly.
+
+	The filesystem must return the locked pagecache page for the specified
+	offset, in *pagep, for the caller to write into.
+
+	flags is a field for AOP_FLAG_xxx flags, described in
+	include/linux/fs.h.
+
+	A void * may be returned in fsdata, which then gets passed into
+	write_end.
+
+	Returns 0 on success; < 0 on failure (which is the error code), in
+	which case write_end is not called.
+
+  write_end: After a successful write_begin, and data copy, write_end must
+	be called. len is the original len passed to write_begin, and copied
+	is the amount that was able to be copied (copied == len is always true
+	if write_begin was called with the AOP_FLAG_UNINTERRUPTIBLE flag).
+
+	The filesystem must take care of unlocking the page, releasing its
+	refcount, and updating i_size.
+
+	Returns < 0 on failure, otherwise the number of bytes (<= 'copied')
+	that were able to be copied into pagecache.
+
   bmap: called by the VFS to map a logical block offset within object to
 	physical block number. This method is used by the FIBMAP
 	ioctl and for working with swap-files.  To be able to swap to
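
To make the fsdata cookie concrete: a filesystem can use it to carry private
per-write state from write_begin to write_end without re-deriving it. A
minimal sketch follows; struct myfs_reservation, myfs_reserve() and
myfs_commit() are hypothetical names used for illustration, not part of this
patch.

	static int myfs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
	{
		/* hypothetical: reserve blocks/quota for the write up front */
		struct myfs_reservation *res = myfs_reserve(mapping->host, pos, len);

		if (IS_ERR(res))
			return PTR_ERR(res);

		*pagep = __grab_cache_page(mapping, pos >> PAGE_CACHE_SHIFT);
		if (!*pagep) {
			myfs_commit(res, 0);	/* hypothetical: undo the reservation */
			return -ENOMEM;
		}
		*fsdata = res;			/* handed back to us in write_end */
		return 0;
	}

	static int myfs_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
	{
		struct myfs_reservation *res = fsdata;

		/* ... mark the copied range uptodate/dirty as appropriate ... */
		if (pos + copied > mapping->host->i_size)
			i_size_write(mapping->host, pos + copied);
		set_page_dirty(page);

		unlock_page(page);
		page_cache_release(page);	/* drop write_begin's reference */

		myfs_commit(res, copied);	/* hypothetical */
		return copied;
	}
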
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b9233a06934c..a5f993ac28dd 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -204,14 +204,13 @@ lo_do_transfer(struct loop_device *lo, int cmd,
  * do_lo_send_aops - helper for writing data to a loop device
  *
  * This is the fast version for backing filesystems which implement the address
- * space operations prepare_write and commit_write.
+ * space operations write_begin and write_end.
  */
 static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
-		int bsize, loff_t pos, struct page *page)
+		int bsize, loff_t pos, struct page *unused)
 {
 	struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
 	struct address_space *mapping = file->f_mapping;
-	const struct address_space_operations *aops = mapping->a_ops;
 	pgoff_t index;
 	unsigned offset, bv_offs;
 	int len, ret;
@@ -223,63 +222,47 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 	len = bvec->bv_len;
 	while (len > 0) {
 		sector_t IV;
-		unsigned size;
+		unsigned size, copied;
 		int transfer_result;
+		struct page *page;
+		void *fsdata;
 
 		IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
 		size = PAGE_CACHE_SIZE - offset;
 		if (size > len)
 			size = len;
-		page = grab_cache_page(mapping, index);
-		if (unlikely(!page))
+
+		ret = pagecache_write_begin(file, mapping, pos, size, 0,
+							&page, &fsdata);
+		if (ret)
 			goto fail;
-		ret = aops->prepare_write(file, page, offset,
-					  offset + size);
-		if (unlikely(ret)) {
-			if (ret == AOP_TRUNCATED_PAGE) {
-				page_cache_release(page);
-				continue;
-			}
-			goto unlock;
-		}
+
 		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
 				bvec->bv_page, bv_offs, size, IV);
-		if (unlikely(transfer_result)) {
-			/*
-			 * The transfer failed, but we still write the data to
-			 * keep prepare/commit calls balanced.
-			 */
-			printk(KERN_ERR "loop: transfer error block %llu\n",
-			       (unsigned long long)index);
-			zero_user_page(page, offset, size, KM_USER0);
-		}
-		flush_dcache_page(page);
-		ret = aops->commit_write(file, page, offset,
-					 offset + size);
-		if (unlikely(ret)) {
-			if (ret == AOP_TRUNCATED_PAGE) {
-				page_cache_release(page);
-				continue;
-			}
-			goto unlock;
-		}
+		copied = size;
 		if (unlikely(transfer_result))
-			goto unlock;
-		bv_offs += size;
-		len -= size;
+			copied = 0;
+
+		ret = pagecache_write_end(file, mapping, pos, size, copied,
+							page, fsdata);
+		if (ret < 0)
+			goto fail;
+		if (ret < copied)
+			copied = ret;
+
+		if (unlikely(transfer_result))
+			goto fail;
+
+		bv_offs += copied;
+		len -= copied;
 		offset = 0;
 		index++;
-		pos += size;
-		unlock_page(page);
-		page_cache_release(page);
+		pos += copied;
 	}
 	ret = 0;
 out:
 	mutex_unlock(&mapping->host->i_mutex);
 	return ret;
-unlock:
-	unlock_page(page);
-	page_cache_release(page);
 fail:
 	ret = -1;
 	goto out;
@@ -313,7 +296,7 @@ static int __do_lo_send_write(struct file *file,
  * do_lo_send_direct_write - helper for writing data to a loop device
  *
  * This is the fast, non-transforming version for backing filesystems which do
- * not implement the address space operations prepare_write and commit_write.
+ * not implement the address space operations write_begin and write_end.
  * It uses the write file operation which should be present on all writeable
  * filesystems.
  */
@@ -332,7 +315,7 @@ static int do_lo_send_direct_write(struct loop_device *lo,
  * do_lo_send_write - helper for writing data to a loop device
  *
  * This is the slow, transforming version for filesystems which do not
- * implement the address space operations prepare_write and commit_write.  It
+ * implement the address space operations write_begin and write_end.  It
  * uses the write file operation which should be present on all writeable
  * filesystems.
  *
@@ -780,7 +763,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 	 */
 	if (!file->f_op->splice_read)
 		goto out_putf;
-	if (aops->prepare_write && aops->commit_write)
+	if (aops->prepare_write || aops->write_begin)
 		lo_flags |= LO_FLAGS_USE_AOPS;
 	if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
 		lo_flags |= LO_FLAGS_READ_ONLY;
diff --git a/fs/buffer.c b/fs/buffer.c
index 9ece6c2086d0..68b8fbdc1b28 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1770,6 +1770,48 @@ recover:
 	goto done;
 }
 
+/*
+ * If a page has any new buffers, zero them out here, and mark them uptodate
+ * and dirty so they'll be written out (in order to prevent uninitialised
+ * block data from leaking). And clear the new bit.
+ */
+void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+	unsigned int block_start, block_end;
+	struct buffer_head *head, *bh;
+
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		return;
+
+	bh = head = page_buffers(page);
+	block_start = 0;
+	do {
+		block_end = block_start + bh->b_size;
+
+		if (buffer_new(bh)) {
+			if (block_end > from && block_start < to) {
+				if (!PageUptodate(page)) {
+					unsigned start, size;
+
+					start = max(from, block_start);
+					size = min(to, block_end) - start;
+
+					zero_user_page(page, start, size, KM_USER0);
+					set_buffer_uptodate(bh);
+				}
+
+				clear_buffer_new(bh);
+				mark_buffer_dirty(bh);
+			}
+		}
+
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+EXPORT_SYMBOL(page_zero_new_buffers);
+
 static int __block_prepare_write(struct inode *inode, struct page *page,
 		unsigned from, unsigned to, get_block_t *get_block)
 {
@@ -1854,38 +1896,8 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
 		if (!buffer_uptodate(*wait_bh))
 			err = -EIO;
 	}
-	if (!err) {
-		bh = head;
-		do {
-			if (buffer_new(bh))
-				clear_buffer_new(bh);
-		} while ((bh = bh->b_this_page) != head);
-		return 0;
-	}
-	/* Error case: */
-	/*
-	 * Zero out any newly allocated blocks to avoid exposing stale
-	 * data. If BH_New is set, we know that the block was newly
-	 * allocated in the above loop.
-	 */
-	bh = head;
-	block_start = 0;
-	do {
-		block_end = block_start+blocksize;
-		if (block_end <= from)
-			goto next_bh;
-		if (block_start >= to)
-			break;
-		if (buffer_new(bh)) {
-			clear_buffer_new(bh);
-			zero_user_page(page, block_start, bh->b_size, KM_USER0);
-			set_buffer_uptodate(bh);
-			mark_buffer_dirty(bh);
-		}
-next_bh:
-		block_start = block_end;
-		bh = bh->b_this_page;
-	} while (bh != head);
+	if (unlikely(err))
+		page_zero_new_buffers(page, from, to);
 	return err;
 }
 
@@ -1910,6 +1922,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 			set_buffer_uptodate(bh);
 			mark_buffer_dirty(bh);
 		}
+		clear_buffer_new(bh);
 	}
 
 	/*
@@ -1924,6 +1937,130 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 }
 
 /*
+ * block_write_begin takes care of the basic task of block allocation and
+ * bringing partial write blocks uptodate first.
+ *
+ * If *pagep is not NULL, then block_write_begin uses the locked page
+ * at *pagep rather than allocating its own. In this case, the page will
+ * not be unlocked or deallocated on failure.
+ */
+int block_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata,
+			get_block_t *get_block)
+{
+	struct inode *inode = mapping->host;
+	int status = 0;
+	struct page *page;
+	pgoff_t index;
+	unsigned start, end;
+	int ownpage = 0;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	start = pos & (PAGE_CACHE_SIZE - 1);
+	end = start + len;
+
+	page = *pagep;
+	if (page == NULL) {
+		ownpage = 1;
+		page = __grab_cache_page(mapping, index);
+		if (!page) {
+			status = -ENOMEM;
+			goto out;
+		}
+		*pagep = page;
+	} else
+		BUG_ON(!PageLocked(page));
+
+	status = __block_prepare_write(inode, page, start, end, get_block);
+	if (unlikely(status)) {
+		ClearPageUptodate(page);
+
+		if (ownpage) {
+			unlock_page(page);
+			page_cache_release(page);
+			*pagep = NULL;
+
+			/*
+			 * prepare_write() may have instantiated a few blocks
+			 * outside i_size. Trim these off again. Don't need
+			 * i_size_read because we hold i_mutex.
+			 */
+			if (pos + len > inode->i_size)
+				vmtruncate(inode, inode->i_size);
+		}
+		goto out;
+	}
+
+out:
+	return status;
+}
+EXPORT_SYMBOL(block_write_begin);
+
+int block_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	unsigned start;
+
+	start = pos & (PAGE_CACHE_SIZE - 1);
+
+	if (unlikely(copied < len)) {
+		/*
+		 * The buffers that were written will now be uptodate, so we
+		 * don't have to worry about a readpage reading them and
+		 * overwriting a partial write. However if we have encountered
+		 * a short write and only partially written into a buffer, it
+		 * will not be marked uptodate, so a readpage might come in and
+		 * destroy our partial write.
+		 *
+		 * Do the simplest thing, and just treat any short write to a
+		 * non uptodate page as a zero-length write, and force the
+		 * caller to redo the whole thing.
+		 */
+		if (!PageUptodate(page))
+			copied = 0;
+
+		page_zero_new_buffers(page, start+copied, start+len);
+	}
+	flush_dcache_page(page);
+
+	/* This could be a short (even 0-length) commit */
+	__block_commit_write(inode, page, start, start+copied);
+
+	return copied;
+}
+EXPORT_SYMBOL(block_write_end);
+
+int generic_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_mutex.
+	 *
+	 * But it's important to update i_size while still holding page lock:
+	 * page writeout could otherwise come in and zero beyond i_size.
+	 */
+	if (pos+copied > inode->i_size) {
+		i_size_write(inode, pos+copied);
+		mark_inode_dirty(inode);
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+EXPORT_SYMBOL(generic_write_end);
+
+/*
  * Generic "read page" function for block devices that have the normal
  * get_block functionality. This is most of the block device filesystems.
  * Reads the page asynchronously --- the unlock_buffer() and
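
With these helpers, converting a conventional block-based filesystem becomes
nearly mechanical: its write_begin reduces to block_write_begin() with the
filesystem's get_block callback, and generic_write_end() handles the commit,
the i_size update and the page release. A sketch, assuming a filesystem whose
block-mapping routine is called myfs_get_block (a hypothetical name):

	static int myfs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
	{
		*pagep = NULL;	/* let block_write_begin allocate the page */
		return block_write_begin(file, mapping, pos, len, flags,
						pagep, fsdata, myfs_get_block);
	}

	static const struct address_space_operations myfs_aops = {
		/* ... readpage, writepage, bmap etc. unchanged ... */
		.write_begin	= myfs_write_begin,
		.write_end	= generic_write_end,
	};
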
diff --git a/fs/libfs.c b/fs/libfs.c
index 5294de1f40c4..f2b32d3a9093 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -351,6 +351,26 @@ int simple_prepare_write(struct file *file, struct page *page,
 	return 0;
 }
 
+int simple_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	struct page *page;
+	pgoff_t index;
+	unsigned from;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	from = pos & (PAGE_CACHE_SIZE - 1);
+
+	page = __grab_cache_page(mapping, index);
+	if (!page)
+		return -ENOMEM;
+
+	*pagep = page;
+
+	return simple_prepare_write(file, page, from, from+len);
+}
+
 int simple_commit_write(struct file *file, struct page *page,
 			unsigned from, unsigned to)
 {
@@ -369,6 +389,28 @@ int simple_commit_write(struct file *file, struct page *page,
 	return 0;
 }
 
+int simple_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+	/* zero the stale part of the page if we did a short copy */
+	if (copied < len) {
+		void *kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + from + copied, 0, len - copied);
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
+	}
+
+	simple_commit_write(file, page, from, from+copied);
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+
 /*
  * the inodes created here are not hashed. If you use iunique to generate
  * unique inode values later for this filesystem, then you must take care
@@ -642,6 +684,8 @@ EXPORT_SYMBOL(dcache_dir_open);
 EXPORT_SYMBOL(dcache_readdir);
 EXPORT_SYMBOL(generic_read_dir);
 EXPORT_SYMBOL(get_sb_pseudo);
+EXPORT_SYMBOL(simple_write_begin);
+EXPORT_SYMBOL(simple_write_end);
 EXPORT_SYMBOL(simple_commit_write);
 EXPORT_SYMBOL(simple_dir_inode_operations);
 EXPORT_SYMBOL(simple_dir_operations);
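
For a simple in-memory filesystem, the conversion is just a matter of pointing
its address_space_operations at the new library routines; a sketch (myfs_aops
is a hypothetical name; simple_readpage is the existing libfs helper):

	static const struct address_space_operations myfs_aops = {
		.readpage	= simple_readpage,
		.write_begin	= simple_write_begin,
		.write_end	= simple_write_end,
	};
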
diff --git a/fs/namei.c b/fs/namei.c
index a83160acd748..b40b8084eefc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2729,53 +2729,29 @@ int __page_symlink(struct inode *inode, const char *symname, int len,
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
+	void *fsdata;
 	int err;
 	char *kaddr;
 
 retry:
-	err = -ENOMEM;
-	page = find_or_create_page(mapping, 0, gfp_mask);
-	if (!page)
-		goto fail;
-	err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
-	if (err == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry;
-	}
+	err = pagecache_write_begin(NULL, mapping, 0, len-1,
+				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 	if (err)
-		goto fail_map;
+		goto fail;
+
 	kaddr = kmap_atomic(page, KM_USER0);
 	memcpy(kaddr, symname, len-1);
 	kunmap_atomic(kaddr, KM_USER0);
-	err = mapping->a_ops->commit_write(NULL, page, 0, len-1);
-	if (err == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry;
-	}
-	if (err)
-		goto fail_map;
-	/*
-	 * Notice that we are _not_ going to block here - end of page is
-	 * unmapped, so this will only try to map the rest of page, see
-	 * that it is unmapped (typically even will not look into inode -
-	 * ->i_size will be enough for everything) and zero it out.
-	 * OTOH it's obviously correct and should make the page up-to-date.
-	 */
-	if (!PageUptodate(page)) {
-		err = mapping->a_ops->readpage(NULL, page);
-		if (err != AOP_TRUNCATED_PAGE)
-			wait_on_page_locked(page);
-	} else {
-		unlock_page(page);
-	}
-	page_cache_release(page);
+
+	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
+							page, fsdata);
 	if (err < 0)
 		goto fail;
+	if (err < len-1)
+		goto retry;
+
 	mark_inode_dirty(inode);
 	return 0;
-fail_map:
-	unlock_page(page);
-	page_cache_release(page);
 fail:
 	return err;
 }
diff --git a/fs/splice.c b/fs/splice.c
index 2df6be43c667..a7568bcc0f99 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -563,7 +563,7 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	struct address_space *mapping = file->f_mapping;
 	unsigned int offset, this_len;
 	struct page *page;
-	pgoff_t index;
+	void *fsdata;
 	int ret;
 
 	/*
@@ -573,49 +573,16 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	if (unlikely(ret))
 		return ret;
 
-	index = sd->pos >> PAGE_CACHE_SHIFT;
 	offset = sd->pos & ~PAGE_CACHE_MASK;
 
 	this_len = sd->len;
 	if (this_len + offset > PAGE_CACHE_SIZE)
 		this_len = PAGE_CACHE_SIZE - offset;
 
-find_page:
-	page = find_lock_page(mapping, index);
-	if (!page) {
-		ret = -ENOMEM;
-		page = page_cache_alloc_cold(mapping);
-		if (unlikely(!page))
-			goto out_ret;
-
-		/*
-		 * This will also lock the page
-		 */
-		ret = add_to_page_cache_lru(page, mapping, index,
-					GFP_KERNEL);
-		if (unlikely(ret))
-			goto out_release;
-	}
-
-	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
-	if (unlikely(ret)) {
-		loff_t isize = i_size_read(mapping->host);
-
-		if (ret != AOP_TRUNCATED_PAGE)
-			unlock_page(page);
-		page_cache_release(page);
-		if (ret == AOP_TRUNCATED_PAGE)
-			goto find_page;
-
-		/*
-		 * prepare_write() may have instantiated a few blocks
-		 * outside i_size.  Trim these off again.
-		 */
-		if (sd->pos + this_len > isize)
-			vmtruncate(mapping->host, isize);
-
-		goto out_ret;
-	}
+	ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
+				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+	if (unlikely(ret))
+		goto out;
 
 	if (buf->page != page) {
 		/*
@@ -629,31 +596,9 @@ find_page:
 		kunmap_atomic(dst, KM_USER1);
 		buf->ops->unmap(pipe, buf, src);
 	}
-
-	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
-	if (ret) {
-		if (ret == AOP_TRUNCATED_PAGE) {
-			page_cache_release(page);
-			goto find_page;
-		}
-		if (ret < 0)
-			goto out;
-		/*
-		 * Partial write has happened, so 'ret' already initialized by
-		 * number of bytes written, Where is nothing we have to do here.
-		 */
-	} else
-		ret = this_len;
-	/*
-	 * Return the number of bytes written and mark page as
-	 * accessed, we are now done!
-	 */
-	mark_page_accessed(page);
+	ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
+				page, fsdata);
 out:
-	unlock_page(page);
-out_release:
-	page_cache_release(page);
-out_ret:
 	return ret;
 }
 
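
splice can pass AOP_FLAG_UNINTERRUPTIBLE here because its copy source is a
pipe page already in kernel memory, not user memory: the memcpy between the
two calls cannot fault, so the write can never be short and copied == this_len
always holds. The copy itself is the unchanged context in this hunk, roughly:

	if (buf->page != page) {
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}
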
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 35cadad84b14..a562ecfb1a14 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -203,6 +203,16 @@ void block_invalidatepage(struct page *page, unsigned long offset);
 int block_write_full_page(struct page *page, get_block_t *get_block,
 				struct writeback_control *wbc);
 int block_read_full_page(struct page*, get_block_t*);
+int block_write_begin(struct file *, struct address_space *,
+				loff_t, unsigned, unsigned,
+				struct page **, void **, get_block_t*);
+int block_write_end(struct file *, struct address_space *,
+				loff_t, unsigned, unsigned,
+				struct page *, void *);
+int generic_write_end(struct file *, struct address_space *,
+				loff_t, unsigned, unsigned,
+				struct page *, void *);
+void page_zero_new_buffers(struct page *page, unsigned from, unsigned to);
 int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
 int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
 				loff_t *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 86ce27c72554..e9344e6f877d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -394,6 +394,8 @@ enum positive_aop_returns {
 	AOP_TRUNCATED_PAGE	= 0x80001,
 };
 
+#define AOP_FLAG_UNINTERRUPTIBLE	0x0001 /* will not do a short write */
+
 /*
  * oh the beauties of C type declarations.
  */
@@ -413,7 +415,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
 size_t iov_iter_copy_from_user(struct page *page,
 		struct iov_iter *i, unsigned long offset, size_t bytes);
 void iov_iter_advance(struct iov_iter *i, size_t bytes);
-int iov_iter_fault_in_readable(struct iov_iter *i);
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
 size_t iov_iter_single_seg_count(struct iov_iter *i);
 
 static inline void iov_iter_init(struct iov_iter *i,
@@ -454,6 +456,14 @@ struct address_space_operations {
 	 */
 	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
 	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+
+	int (*write_begin)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata);
+	int (*write_end)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata);
+
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	sector_t (*bmap)(struct address_space *, sector_t);
 	void (*invalidatepage) (struct page *, unsigned long);
@@ -468,6 +478,18 @@ struct address_space_operations {
 	int (*launder_page) (struct page *);
 };
 
+/*
+ * pagecache_write_begin/pagecache_write_end must be used by general code
+ * to write into the pagecache.
+ */
+int pagecache_write_begin(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata);
+
+int pagecache_write_end(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata);
+
 struct backing_dev_info;
 struct address_space {
 	struct inode		*host;	/* owner: inode, block_device */
@@ -1866,6 +1888,12 @@ extern int simple_prepare_write(struct file *file, struct page *page,
 		unsigned offset, unsigned to);
 extern int simple_commit_write(struct file *file, struct page *page,
 		unsigned offset, unsigned to);
+extern int simple_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata);
+extern int simple_write_end(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned copied,
+		struct page *page, void *fsdata);
 
 extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *);
 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 8f1e390fd71b..db8a410ae9e1 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -96,6 +96,8 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 			int tag, unsigned int nr_pages, struct page **pages);
 
+struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index);
+
 /*
  * Returns locked page at given index in given cache, creating it if needed.
  */
diff --git a/mm/filemap.c b/mm/filemap.c
index 67a03a0a9aee..ec25ba1aef5f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1742,11 +1742,20 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
 	i->count -= bytes;
 }
 
-int iov_iter_fault_in_readable(struct iov_iter *i)
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (ie. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
 {
-	size_t seglen = min(i->iov->iov_len - i->iov_offset, i->count);
 	char __user *buf = i->iov->iov_base + i->iov_offset;
-	return fault_in_pages_readable(buf, seglen);
+	bytes = min(bytes, i->iov->iov_len - i->iov_offset);
+	return fault_in_pages_readable(buf, bytes);
 }
 
 /*
@@ -1843,6 +1852,95 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 }
 EXPORT_SYMBOL(generic_write_checks);
 
+int pagecache_write_begin(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata)
+{
+	const struct address_space_operations *aops = mapping->a_ops;
+
+	if (aops->write_begin) {
+		return aops->write_begin(file, mapping, pos, len, flags,
+							pagep, fsdata);
+	} else {
+		int ret;
+		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+		struct inode *inode = mapping->host;
+		struct page *page;
+again:
+		page = __grab_cache_page(mapping, index);
+		*pagep = page;
+		if (!page)
+			return -ENOMEM;
+
+		if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
+			/*
+			 * There is no way to resolve a short write situation
+			 * for a !Uptodate page (except by double copying in
+			 * the caller done by generic_perform_write_2copy).
+			 *
+			 * Instead, we have to bring it uptodate here.
+			 */
+			ret = aops->readpage(file, page);
+			page_cache_release(page);
+			if (ret) {
+				if (ret == AOP_TRUNCATED_PAGE)
+					goto again;
+				return ret;
+			}
+			goto again;
+		}
+
+		ret = aops->prepare_write(file, page, offset, offset+len);
+		if (ret) {
+			if (ret != AOP_TRUNCATED_PAGE)
+				unlock_page(page);
+			page_cache_release(page);
+			if (pos + len > inode->i_size)
+				vmtruncate(inode, inode->i_size);
+			if (ret == AOP_TRUNCATED_PAGE)
+				goto again;
+		}
+		return ret;
+	}
+}
+EXPORT_SYMBOL(pagecache_write_begin);
+
+int pagecache_write_end(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	const struct address_space_operations *aops = mapping->a_ops;
+	int ret;
+
+	if (aops->write_end) {
+		mark_page_accessed(page);
+		ret = aops->write_end(file, mapping, pos, len, copied,
+							page, fsdata);
+	} else {
+		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+		struct inode *inode = mapping->host;
+
+		flush_dcache_page(page);
+		ret = aops->commit_write(file, page, offset, offset+len);
+		unlock_page(page);
+		mark_page_accessed(page);
+		page_cache_release(page);
+		BUG_ON(ret == AOP_TRUNCATED_PAGE); /* can't deal with */
+
+		if (ret < 0) {
+			if (pos + len > inode->i_size)
+				vmtruncate(inode, inode->i_size);
+		} else if (ret > 0)
+			ret = min_t(size_t, copied, ret);
+		else
+			ret = copied;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(pagecache_write_end);
+
 ssize_t
 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
@@ -1886,8 +1984,7 @@ EXPORT_SYMBOL(generic_file_direct_write);
  * Find or create a page at the given pagecache position. Return the locked
  * page. This function is specifically for buffered writes.
  */
-static struct page *__grab_cache_page(struct address_space *mapping,
-							pgoff_t index)
+struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
 {
 	int status;
 	struct page *page;
@@ -1908,20 +2005,16 @@ repeat:
 	}
 	return page;
 }
+EXPORT_SYMBOL(__grab_cache_page);
 
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos, loff_t *ppos,
-		size_t count, ssize_t written)
+static ssize_t generic_perform_write_2copy(struct file *file,
+				struct iov_iter *i, loff_t pos)
 {
-	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	const struct address_space_operations *a_ops = mapping->a_ops;
 	struct inode *inode = mapping->host;
 	long status = 0;
-	struct iov_iter i;
-
-	iov_iter_init(&i, iov, nr_segs, count, written);
+	ssize_t written = 0;
 
 	do {
 		struct page *src_page;
@@ -1934,7 +2027,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		offset = (pos & (PAGE_CACHE_SIZE - 1));
 		index = pos >> PAGE_CACHE_SHIFT;
 		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
-						iov_iter_count(&i));
+						iov_iter_count(i));
 
 		/*
 		 * a non-NULL src_page indicates that we're doing the
@@ -1952,7 +2045,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		 * to check that the address is actually valid, when atomic
 		 * usercopies are used, below.
 		 */
-		if (unlikely(iov_iter_fault_in_readable(&i))) {
+		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
 			status = -EFAULT;
 			break;
 		}
@@ -1983,7 +2076,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			 * same reason as we can't take a page fault with a
 			 * page locked (as explained below).
 			 */
-			copied = iov_iter_copy_from_user(src_page, &i,
+			copied = iov_iter_copy_from_user(src_page, i,
 							offset, bytes);
 			if (unlikely(copied == 0)) {
 				status = -EFAULT;
@@ -2008,7 +2101,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 				page_cache_release(src_page);
 				continue;
 			}
-
 		}
 
 		status = a_ops->prepare_write(file, page, offset, offset+bytes);
@@ -2030,7 +2122,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			 * really matter.
 			 */
 			pagefault_disable();
-			copied = iov_iter_copy_from_user_atomic(page, &i,
+			copied = iov_iter_copy_from_user_atomic(page, i,
 							offset, bytes);
 			pagefault_enable();
 		} else {
@@ -2056,9 +2148,9 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		if (src_page)
 			page_cache_release(src_page);
 
-		iov_iter_advance(&i, copied);
-		written += copied;
+		iov_iter_advance(i, copied);
 		pos += copied;
+		written += copied;
 
 		balance_dirty_pages_ratelimited(mapping);
 		cond_resched();
@@ -2082,13 +2174,117 @@ fs_write_aop_error:
 			continue;
 		else
 			break;
-	} while (iov_iter_count(&i));
-	*ppos = pos;
+	} while (iov_iter_count(i));
+
+	return written ? written : status;
+}
+
+static ssize_t generic_perform_write(struct file *file,
+				struct iov_iter *i, loff_t pos)
+{
+	struct address_space *mapping = file->f_mapping;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	long status = 0;
+	ssize_t written = 0;
+
+	do {
+		struct page *page;
+		pgoff_t index;		/* Pagecache index for current page */
+		unsigned long offset;	/* Offset into pagecache page */
+		unsigned long bytes;	/* Bytes to write to page */
+		size_t copied;		/* Bytes copied from user */
+		void *fsdata;
+
+		offset = (pos & (PAGE_CACHE_SIZE - 1));
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+						iov_iter_count(i));
+
+again:
+
+		/*
+		 * Bring in the user page that we will copy from _first_.
+		 * Otherwise there's a nasty deadlock on copying from the
+		 * same page as we're writing to, without it being marked
+		 * up-to-date.
+		 *
+		 * Not only is this an optimisation, but it is also required
+		 * to check that the address is actually valid, when atomic
+		 * usercopies are used, below.
+		 */
+		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+			status = -EFAULT;
+			break;
+		}
+
+		status = a_ops->write_begin(file, mapping, pos, bytes, 0,
+						&page, &fsdata);
+		if (unlikely(status))
+			break;
+
+		pagefault_disable();
+		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+		pagefault_enable();
+		flush_dcache_page(page);
+
+		status = a_ops->write_end(file, mapping, pos, bytes, copied,
+						page, fsdata);
+		if (unlikely(status < 0))
+			break;
+		copied = status;
+
+		cond_resched();
+
+		if (unlikely(copied == 0)) {
+			/*
+			 * If we were unable to copy any data at all, we must
+			 * fall back to a single segment length write.
+			 *
+			 * If we didn't fallback here, we could livelock
+			 * because not all segments in the iov can be copied at
+			 * once without a pagefault.
+			 */
+			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+						iov_iter_single_seg_count(i));
+			goto again;
+		}
+		iov_iter_advance(i, copied);
+		pos += copied;
+		written += copied;
+
+		balance_dirty_pages_ratelimited(mapping);
+
+	} while (iov_iter_count(i));
+
+	return written ? written : status;
+}
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos, loff_t *ppos,
+		size_t count, ssize_t written)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	struct inode *inode = mapping->host;
+	ssize_t status;
+	struct iov_iter i;
+
+	iov_iter_init(&i, iov, nr_segs, count, written);
+	if (a_ops->write_begin)
+		status = generic_perform_write(file, &i, pos);
+	else
+		status = generic_perform_write_2copy(file, &i, pos);
 
-	/*
-	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
-	 */
 	if (likely(status >= 0)) {
+		written += status;
+		*ppos = pos + status;
+
+		/*
+		 * For now, when the user asks for O_SYNC, we'll actually give
+		 * O_DSYNC
+		 */
 		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 			if (!a_ops->writepage || !is_sync_kiocb(iocb))
 				status = generic_osync_inode(inode, mapping,