4 files changed, 254 insertions, 56 deletions
diff --git a/fs/attr.c b/fs/attr.c
index 0815e93bb487..b4fa3b0aa596 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok);
 * @offset:     the new size to assign to the inode
 * @Returns:    0 on success, -ve errno on failure
 *
+ * inode_newsize_ok must be called with i_mutex held.
+ *
 * inode_newsize_ok will check filesystem limits and ulimits to check that the
 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
 * when necessary. Caller must not proceed with inode size change if failure is
 * returned. @inode must be a file (not directory), with appropriate
 * permissions to allow truncate (inode_newsize_ok does NOT check these
 * conditions).
- *
- * inode_newsize_ok must be called with i_mutex held.
 */
 int inode_newsize_ok(const struct inode *inode, loff_t offset)
 {
@@ -104,17 +104,25 @@ out_big:
 }
 EXPORT_SYMBOL(inode_newsize_ok);
-int inode_setattr(struct inode * inode, struct iattr * attr)
+/**
+ * generic_setattr - copy simple metadata updates into the generic inode
+ * @inode:      the inode to be updated
+ * @attr:       the new attributes
+ *
+ * generic_setattr must be called with i_mutex held.
+ *
+ * generic_setattr updates the inode's metadata with that specified
+ * in attr. Noticeably missing is inode size update, which is more complex
+ * as it requires pagecache updates. See simple_setsize.
+ *
+ * The inode is not marked as dirty after this operation. The rationale is
+ * that for "simple" filesystems, the struct inode is the inode storage.
+ * The caller is free to mark the inode dirty afterwards if needed.
+ */
+void generic_setattr(struct inode *inode, const struct iattr *attr)
 {
        unsigned int ia_valid = attr->ia_valid;
-        if (ia_valid & ATTR_SIZE &&
-            attr->ia_size != i_size_read(inode)) {
-                int error = vmtruncate(inode, attr->ia_size);
-                if (error)
-                        return error;
-        }
        if (ia_valid & ATTR_UID)
                inode->i_uid = attr->ia_uid;
        if (ia_valid & ATTR_GID)
@@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
                        mode &= ~S_ISGID;
                inode->i_mode = mode;
        }
+}
+EXPORT_SYMBOL(generic_setattr);
+/*
+ * note this function is deprecated, the new truncate sequence should be
+ * used instead -- see eg. simple_setsize, generic_setattr.
+ */
+int inode_setattr(struct inode *inode, const struct iattr *attr)
+{
+        unsigned int ia_valid = attr->ia_valid;
+        if (ia_valid & ATTR_SIZE &&
+            attr->ia_size != i_size_read(inode)) {
+                int error;
+                error = vmtruncate(inode, attr->ia_size);
+                if (error)
+                        return error;
+        }
+        generic_setattr(inode, attr);
        mark_inode_dirty(inode);
        return 0;
diff --git a/fs/buffer.c b/fs/buffer.c
index e8aa7081d25c..d54812b198e9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 }
 /*
- * block_write_begin takes care of the basic task of block allocation and
+ * Filesystems implementing the new truncate sequence should use the
- * bringing partial write blocks uptodate first.
+ * _newtrunc postfix variant which won't incorrectly call vmtruncate.
- *
+ * The filesystem needs to handle block truncation upon failure.
- * If *pagep is not NULL, then block_write_begin uses the locked page
- * at *pagep rather than allocating its own. In this case, the page will
- * not be unlocked or deallocated on failure.
 */
-int block_write_begin(struct file *file, struct address_space *mapping,
+int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata,
                        get_block_t *get_block)
@@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping,
                        unlock_page(page);
                        page_cache_release(page);
                        *pagep = NULL;
-                        /*
-                         * prepare_write() may have instantiated a few blocks
-                         * outside i_size.  Trim these off again. Don't need
-                         * i_size_read because we hold i_mutex.
-                         */
-                        if (pos + len > inode->i_size)
-                                vmtruncate(inode, inode->i_size);
                }
        }
 out:
        return status;
 }
+EXPORT_SYMBOL(block_write_begin_newtrunc);
+/*
+ * block_write_begin takes care of the basic task of block allocation and
+ * bringing partial write blocks uptodate first.
+ *
+ * If *pagep is not NULL, then block_write_begin uses the locked page
+ * at *pagep rather than allocating its own. In this case, the page will
+ * not be unlocked or deallocated on failure.
+ */
+int block_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata,
+                        get_block_t *get_block)
+{
+        int ret;
+        ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
+                                        pagep, fsdata, get_block);
+        /*
+         * prepare_write() may have instantiated a few blocks
+         * outside i_size.  Trim these off again. Don't need
+         * i_size_read because we hold i_mutex.
+         *
+         * Filesystems which pass down their own page also cannot
+         * call into vmtruncate here because it would lead to lock
+         * inversion problems (*pagep is locked). This is a further
+         * example of where the old truncate sequence is inadequate.
+         */
+        if (unlikely(ret) && *pagep == NULL) {
+                loff_t isize = mapping->host->i_size;
+                if (pos + len > isize)
+                        vmtruncate(mapping->host, isize);
+        }
+        return ret;
+}
 EXPORT_SYMBOL(block_write_begin);
 int block_write_end(struct file *file, struct address_space *mapping,
@@ -2324,7 +2351,7 @@ out:
 * For moronic filesystems that do not allow holes in file.
 * We may have to extend the file.
 */
-int cont_write_begin(struct file *file, struct address_space *mapping,
+int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata,
                        get_block_t *get_block, loff_t *bytes)
@@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
        }
        *pagep = NULL;
-        err = block_write_begin(file, mapping, pos, len,
+        err = block_write_begin_newtrunc(file, mapping, pos, len,
                                flags, pagep, fsdata, get_block);
 out:
        return err;
 }
+EXPORT_SYMBOL(cont_write_begin_newtrunc);
+int cont_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata,
+                        get_block_t *get_block, loff_t *bytes)
+{
+        int ret;
+        ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
+                                        pagep, fsdata, get_block, bytes);
+        if (unlikely(ret)) {
+                loff_t isize = mapping->host->i_size;
+                if (pos + len > isize)
+                        vmtruncate(mapping->host, isize);
+        }
+        return ret;
+}
 EXPORT_SYMBOL(cont_write_begin);
 int block_prepare_write(struct page *page, unsigned from, unsigned to,
@@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write);
 *
 * We are not allowed to take the i_mutex here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.  Because
- * vmtruncate() writes the inode size before removing pages, once we have the
+ * truncate writes the inode size before removing pages, once we have the
 * page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
@@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
 }
 /*
- * On entry, the page is fully not uptodate.
+ * Filesystems implementing the new truncate sequence should use the
- * On exit the page is fully uptodate in the areas outside (from,to)
+ * _newtrunc postfix variant which won't incorrectly call vmtruncate.
+ * The filesystem needs to handle block truncation upon failure.
 */
-int nobh_write_begin(struct file *file, struct address_space *mapping,
+int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata,
                        get_block_t *get_block)
@@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
                unlock_page(page);
                page_cache_release(page);
                *pagep = NULL;
-                return block_write_begin(file, mapping, pos, len, flags, pagep,
+                return block_write_begin_newtrunc(file, mapping, pos, len,
-                                        fsdata, get_block);
+                                        flags, pagep, fsdata, get_block);
        }
        if (PageMappedToDisk(page))
@@ -2605,8 +2652,34 @@ out_release:
        page_cache_release(page);
        *pagep = NULL;
-        if (pos + len > inode->i_size)
+        return ret;
-                vmtruncate(inode, inode->i_size);
+}
+EXPORT_SYMBOL(nobh_write_begin_newtrunc);
+/*
+ * On entry, the page is fully not uptodate.
+ * On exit the page is fully uptodate in the areas outside (from,to)
+ */
+int nobh_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata,
+                        get_block_t *get_block)
+{
+        int ret;
+        ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags,
+                                        pagep, fsdata, get_block);
+        /*
+         * prepare_write() may have instantiated a few blocks
+         * outside i_size.  Trim these off again. Don't need
+         * i_size_read because we hold i_mutex.
+         */
+        if (unlikely(ret)) {
+                loff_t isize = mapping->host->i_size;
+                if (pos + len > isize)
+                        vmtruncate(mapping->host, isize);
+        }
        return ret;
 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index da111aacb46e..7600aacf531d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1134,27 +1134,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
        return ret;
 }
-/*
- * This is a library function for use by filesystem drivers.
- *
- * The locking rules are governed by the flags parameter:
- *  - if the flags value contains DIO_LOCKING we use a fancy locking
- *    scheme for dumb filesystems.
- *    For writes this function is called under i_mutex and returns with
- *    i_mutex held, for reads, i_mutex is not held on entry, but it is
- *    taken and dropped again before returning.
- *    For reads and writes i_alloc_sem is taken in shared mode and released
- *    on I/O completion (which may happen asynchronously after returning to
- *    the caller).
- *
- *  - if the flags value does NOT contain DIO_LOCKING we don't use any
- *    internal locking but rather rely on the filesystem to synchronize
- *    direct I/O reads/writes versus each other and truncate.
- *    For reads and writes both i_mutex and i_alloc_sem are not held on
- *    entry and are never taken.
- */
 ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
        struct block_device *bdev, const struct iovec *iov, loff_t offset, 
        unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
        dio_submit_t submit_io, int flags)
@@ -1247,9 +1228,46 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
                                nr_segs, blkbits, get_block, end_io,
                                submit_io, dio);
+out:
+        return retval;
+}
+EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc);
+/*
+ * This is a library function for use by filesystem drivers.
+ *
+ * The locking rules are governed by the flags parameter:
+ *  - if the flags value contains DIO_LOCKING we use a fancy locking
+ *    scheme for dumb filesystems.
+ *    For writes this function is called under i_mutex and returns with
+ *    i_mutex held, for reads, i_mutex is not held on entry, but it is
+ *    taken and dropped again before returning.
+ *    For reads and writes i_alloc_sem is taken in shared mode and released
+ *    on I/O completion (which may happen asynchronously after returning to
+ *    the caller).
+ *
+ *  - if the flags value does NOT contain DIO_LOCKING we don't use any
+ *    internal locking but rather rely on the filesystem to synchronize
+ *    direct I/O reads/writes versus each other and truncate.
+ *    For reads and writes both i_mutex and i_alloc_sem are not held on
+ *    entry and are never taken.
+ */
+ssize_t
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+        struct block_device *bdev, const struct iovec *iov, loff_t offset,
+        unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+        dio_submit_t submit_io, int flags)
+{
+        ssize_t retval;
+        retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov,
+                        offset, nr_segs, get_block, end_io, submit_io, flags);
        /*
         * In case of error extending write may have instantiated a few
         * blocks outside i_size. Trim these off again for DIO_LOCKING.
+         * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in
+         * their own manner. This is a further example of where the old
+         * truncate sequence is inadequate.
         *
         * NOTE: filesystems with their own locking have to handle this
         * on their own.
@@ -1257,12 +1275,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        if (flags & DIO_LOCKING) {
                if (unlikely((rw & WRITE) && retval < 0)) {
                        loff_t isize = i_size_read(inode);
+                        loff_t end = offset + iov_length(iov, nr_segs);
                        if (end > isize)
                                vmtruncate(inode, isize);
                }
        }
-out:
        return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/libfs.c b/fs/libfs.c
index b84d0a7a2204..09e1016eb774 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -8,6 +8,7 @@
 #include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/vfs.h>
+#include <linux/quotaops.h>
 #include <linux/mutex.h>
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
@@ -325,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
        return 0;
 }
+/**
+ * simple_setsize - handle core mm and vfs requirements for file size change
+ * @inode: inode
+ * @newsize: new file size
+ *
+ * Returns 0 on success, -error on failure.
+ *
+ * simple_setsize must be called with i_mutex held.
+ *
+ * simple_setsize will check that the requested new size is OK (see
+ * inode_newsize_ok), and then will perform the necessary i_size update
+ * and pagecache truncation (if necessary). It will typically be called
+ * from the filesystem's setattr function when ATTR_SIZE is passed in.
+ *
+ * The inode itself must have correct permissions and attributes to allow
+ * i_size to be changed, this function then just checks that the new size
+ * requested is valid.
+ *
+ * In the case of simple in-memory filesystems with inodes stored solely
+ * in the inode cache, and file data in the pagecache, nothing more needs
+ * to be done to satisfy a truncate request. Filesystems with on-disk
+ * blocks for example will need to free them in the case of truncate, in
+ * that case it may be easier not to use simple_setsize (but each of its
+ * components will likely be required at some point to update pagecache
+ * and inode etc).
+ */
+int simple_setsize(struct inode *inode, loff_t newsize)
+{
+        loff_t oldsize;
+        int error;
+        error = inode_newsize_ok(inode, newsize);
+        if (error)
+                return error;
+        oldsize = inode->i_size;
+        i_size_write(inode, newsize);
+        truncate_pagecache(inode, oldsize, newsize);
+        return error;
+}
+EXPORT_SYMBOL(simple_setsize);
+/**
+ * simple_setattr - setattr for simple in-memory filesystem
+ * @dentry: dentry
+ * @iattr: iattr structure
+ *
+ * Returns 0 on success, -error on failure.
+ *
+ * simple_setattr implements setattr for an in-memory filesystem which
+ * does not store its own file data or metadata (eg. uses the page cache
+ * and inode cache as its data store).
+ */
+int simple_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+        struct inode *inode = dentry->d_inode;
+        int error;
+        error = inode_change_ok(inode, iattr);
+        if (error)
+                return error;
+        if (iattr->ia_valid & ATTR_SIZE) {
+                error = simple_setsize(inode, iattr->ia_size);
+                if (error)
+                        return error;
+        }
+        generic_setattr(inode, iattr);
+        return error;
+}
+EXPORT_SYMBOL(simple_setattr);
 int simple_readpage(struct file *file, struct page *page)
 {
        clear_highpage(page);

diff --git a/fs/attr.c b/fs/attr.c index 0815e93bb487..b4fa3b0aa596 100644 --- a/fs/attr.c +++ b/fs/attr.c
@@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok);
67	* @offset: the new size to assign to the inode	67	* @offset: the new size to assign to the inode
68	* @Returns: 0 on success, -ve errno on failure	68	* @Returns: 0 on success, -ve errno on failure
69	*	69	*
		70	* inode_newsize_ok must be called with i_mutex held.
		71	*
70	* inode_newsize_ok will check filesystem limits and ulimits to check that the	72	* inode_newsize_ok will check filesystem limits and ulimits to check that the
71	* new inode size is within limits. inode_newsize_ok will also send SIGXFSZ	73	* new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
72	* when necessary. Caller must not proceed with inode size change if failure is	74	* when necessary. Caller must not proceed with inode size change if failure is
73	* returned. @inode must be a file (not directory), with appropriate	75	* returned. @inode must be a file (not directory), with appropriate
74	* permissions to allow truncate (inode_newsize_ok does NOT check these	76	* permissions to allow truncate (inode_newsize_ok does NOT check these
75	* conditions).	77	* conditions).
76	*
77	* inode_newsize_ok must be called with i_mutex held.
78	*/	78	*/
79	int inode_newsize_ok(const struct inode *inode, loff_t offset)	79	int inode_newsize_ok(const struct inode *inode, loff_t offset)
80	{	80	{
@@ -104,17 +104,25 @@ out_big:
104	}	104	}
105	EXPORT_SYMBOL(inode_newsize_ok);	105	EXPORT_SYMBOL(inode_newsize_ok);
106		106
107	int inode_setattr(struct inode * inode, struct iattr * attr)	107	/**
		108	* generic_setattr - copy simple metadata updates into the generic inode
		109	* @inode: the inode to be updated
		110	* @attr: the new attributes
		111	*
		112	* generic_setattr must be called with i_mutex held.
		113	*
		114	* generic_setattr updates the inode's metadata with that specified
		115	* in attr. Noticeably missing is inode size update, which is more complex
		116	* as it requires pagecache updates. See simple_setsize.
		117	*
		118	* The inode is not marked as dirty after this operation. The rationale is
		119	* that for "simple" filesystems, the struct inode is the inode storage.
		120	* The caller is free to mark the inode dirty afterwards if needed.
		121	*/
		122	void generic_setattr(struct inode *inode, const struct iattr *attr)
108	{	123	{
109	unsigned int ia_valid = attr->ia_valid;	124	unsigned int ia_valid = attr->ia_valid;
110		125
111	if (ia_valid & ATTR_SIZE &&
112	attr->ia_size != i_size_read(inode)) {
113	int error = vmtruncate(inode, attr->ia_size);
114	if (error)
115	return error;
116	}
117
118	if (ia_valid & ATTR_UID)	126	if (ia_valid & ATTR_UID)
119	inode->i_uid = attr->ia_uid;	127	inode->i_uid = attr->ia_uid;
120	if (ia_valid & ATTR_GID)	128	if (ia_valid & ATTR_GID)
@@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
135	mode &= ~S_ISGID;	143	mode &= ~S_ISGID;
136	inode->i_mode = mode;	144	inode->i_mode = mode;
137	}	145	}
		146	}
		147	EXPORT_SYMBOL(generic_setattr);
		148
		149	/*
		150	* note this function is deprecated, the new truncate sequence should be
		151	* used instead -- see eg. simple_setsize, generic_setattr.
		152	*/
		153	int inode_setattr(struct inode *inode, const struct iattr *attr)
		154	{
		155	unsigned int ia_valid = attr->ia_valid;
		156
		157	if (ia_valid & ATTR_SIZE &&
		158	attr->ia_size != i_size_read(inode)) {
		159	int error;
		160
		161	error = vmtruncate(inode, attr->ia_size);
		162	if (error)
		163	return error;
		164	}
		165
		166	generic_setattr(inode, attr);
		167
138	mark_inode_dirty(inode);	168	mark_inode_dirty(inode);
139		169
140	return 0;	170	return 0;


diff --git a/fs/buffer.c b/fs/buffer.c index e8aa7081d25c..d54812b198e9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c
@@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1949	}	1949	}
1950		1950
1951	/*	1951	/*
1952	* block_write_begin takes care of the basic task of block allocation and	1952	* Filesystems implementing the new truncate sequence should use the
1953	* bringing partial write blocks uptodate first.	1953	* _newtrunc postfix variant which won't incorrectly call vmtruncate.
1954	*	1954	* The filesystem needs to handle block truncation upon failure.
1955	* If *pagep is not NULL, then block_write_begin uses the locked page
1956	* at *pagep rather than allocating its own. In this case, the page will
1957	* not be unlocked or deallocated on failure.
1958	*/	1955	*/
1959	int block_write_begin(struct file *file, struct address_space *mapping,	1956	int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
1960	loff_t pos, unsigned len, unsigned flags,	1957	loff_t pos, unsigned len, unsigned flags,
1961	struct page **pagep, void **fsdata,	1958	struct page **pagep, void **fsdata,
1962	get_block_t *get_block)	1959	get_block_t *get_block)
@@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping,
1992	unlock_page(page);	1989	unlock_page(page);
1993	page_cache_release(page);	1990	page_cache_release(page);
1994	*pagep = NULL;	1991	*pagep = NULL;
1995
1996	/*
1997	* prepare_write() may have instantiated a few blocks
1998	* outside i_size. Trim these off again. Don't need
1999	* i_size_read because we hold i_mutex.
2000	*/
2001	if (pos + len > inode->i_size)
2002	vmtruncate(inode, inode->i_size);
2003	}	1992	}
2004	}	1993	}
2005		1994
2006	out:	1995	out:
2007	return status;	1996	return status;
2008	}	1997	}
		1998	EXPORT_SYMBOL(block_write_begin_newtrunc);
		1999
		2000	/*
		2001	* block_write_begin takes care of the basic task of block allocation and
		2002	* bringing partial write blocks uptodate first.
		2003	*
		2004	* If *pagep is not NULL, then block_write_begin uses the locked page
		2005	* at *pagep rather than allocating its own. In this case, the page will
		2006	* not be unlocked or deallocated on failure.
		2007	*/
		2008	int block_write_begin(struct file *file, struct address_space *mapping,
		2009	loff_t pos, unsigned len, unsigned flags,
		2010	struct page **pagep, void **fsdata,
		2011	get_block_t *get_block)
		2012	{
		2013	int ret;
		2014
		2015	ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
		2016	pagep, fsdata, get_block);
		2017
		2018	/*
		2019	* prepare_write() may have instantiated a few blocks
		2020	* outside i_size. Trim these off again. Don't need
		2021	* i_size_read because we hold i_mutex.
		2022	*
		2023	* Filesystems which pass down their own page also cannot
		2024	* call into vmtruncate here because it would lead to lock
		2025	* inversion problems (*pagep is locked). This is a further
		2026	* example of where the old truncate sequence is inadequate.
		2027	*/
		2028	if (unlikely(ret) && *pagep == NULL) {
		2029	loff_t isize = mapping->host->i_size;
		2030	if (pos + len > isize)
		2031	vmtruncate(mapping->host, isize);
		2032	}
		2033
		2034	return ret;
		2035	}
2009	EXPORT_SYMBOL(block_write_begin);	2036	EXPORT_SYMBOL(block_write_begin);
2010		2037
2011	int block_write_end(struct file *file, struct address_space *mapping,	2038	int block_write_end(struct file *file, struct address_space *mapping,
@@ -2324,7 +2351,7 @@ out:
2324	* For moronic filesystems that do not allow holes in file.	2351	* For moronic filesystems that do not allow holes in file.
2325	* We may have to extend the file.	2352	* We may have to extend the file.
2326	*/	2353	*/
2327	int cont_write_begin(struct file *file, struct address_space *mapping,	2354	int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2328	loff_t pos, unsigned len, unsigned flags,	2355	loff_t pos, unsigned len, unsigned flags,
2329	struct page **pagep, void **fsdata,	2356	struct page **pagep, void **fsdata,
2330	get_block_t *get_block, loff_t *bytes)	2357	get_block_t *get_block, loff_t *bytes)
@@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
2345	}	2372	}
2346		2373
2347	*pagep = NULL;	2374	*pagep = NULL;
2348	err = block_write_begin(file, mapping, pos, len,	2375	err = block_write_begin_newtrunc(file, mapping, pos, len,
2349	flags, pagep, fsdata, get_block);	2376	flags, pagep, fsdata, get_block);
2350	out:	2377	out:
2351	return err;	2378	return err;
2352	}	2379	}
		2380	EXPORT_SYMBOL(cont_write_begin_newtrunc);
		2381
		2382	int cont_write_begin(struct file *file, struct address_space *mapping,
		2383	loff_t pos, unsigned len, unsigned flags,
		2384	struct page **pagep, void **fsdata,
		2385	get_block_t *get_block, loff_t *bytes)
		2386	{
		2387	int ret;
		2388
		2389	ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
		2390	pagep, fsdata, get_block, bytes);
		2391	if (unlikely(ret)) {
		2392	loff_t isize = mapping->host->i_size;
		2393	if (pos + len > isize)
		2394	vmtruncate(mapping->host, isize);
		2395	}
		2396
		2397	return ret;
		2398	}
2353	EXPORT_SYMBOL(cont_write_begin);	2399	EXPORT_SYMBOL(cont_write_begin);
2354		2400
2355	int block_prepare_write(struct page *page, unsigned from, unsigned to,	2401	int block_prepare_write(struct page *page, unsigned from, unsigned to,
@@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write);
2381	*	2427	*
2382	* We are not allowed to take the i_mutex here so we have to play games to	2428	* We are not allowed to take the i_mutex here so we have to play games to
2383	* protect against truncate races as the page could now be beyond EOF. Because	2429	* protect against truncate races as the page could now be beyond EOF. Because
2384	* vmtruncate() writes the inode size before removing pages, once we have the	2430	* truncate writes the inode size before removing pages, once we have the
2385	* page lock we can determine safely if the page is beyond EOF. If it is not	2431	* page lock we can determine safely if the page is beyond EOF. If it is not
2386	* beyond EOF, then the page is guaranteed safe against truncation until we	2432	* beyond EOF, then the page is guaranteed safe against truncation until we
2387	* unlock the page.	2433	* unlock the page.
@@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2464	}	2510	}
2465		2511
2466	/*	2512	/*
2467	* On entry, the page is fully not uptodate.	2513	* Filesystems implementing the new truncate sequence should use the
2468	* On exit the page is fully uptodate in the areas outside (from,to)	2514	* _newtrunc postfix variant which won't incorrectly call vmtruncate.
		2515	* The filesystem needs to handle block truncation upon failure.
2469	*/	2516	*/
2470	int nobh_write_begin(struct file *file, struct address_space *mapping,	2517	int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2471	loff_t pos, unsigned len, unsigned flags,	2518	loff_t pos, unsigned len, unsigned flags,
2472	struct page **pagep, void **fsdata,	2519	struct page **pagep, void **fsdata,
2473	get_block_t *get_block)	2520	get_block_t *get_block)
@@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
2500	unlock_page(page);	2547	unlock_page(page);
2501	page_cache_release(page);	2548	page_cache_release(page);
2502	*pagep = NULL;	2549	*pagep = NULL;
2503	return block_write_begin(file, mapping, pos, len, flags, pagep,	2550	return block_write_begin_newtrunc(file, mapping, pos, len,
2504	fsdata, get_block);	2551	flags, pagep, fsdata, get_block);
2505	}	2552	}
2506		2553
2507	if (PageMappedToDisk(page))	2554	if (PageMappedToDisk(page))
@@ -2605,8 +2652,34 @@ out_release:
2605	page_cache_release(page);	2652	page_cache_release(page);
2606	*pagep = NULL;	2653	*pagep = NULL;
2607		2654
2608	if (pos + len > inode->i_size)	2655	return ret;
2609	vmtruncate(inode, inode->i_size);	2656	}
		2657	EXPORT_SYMBOL(nobh_write_begin_newtrunc);
		2658
		2659	/*
		2660	* On entry, the page is fully not uptodate.
		2661	* On exit the page is fully uptodate in the areas outside (from,to)
		2662	*/
		2663	int nobh_write_begin(struct file *file, struct address_space *mapping,
		2664	loff_t pos, unsigned len, unsigned flags,
		2665	struct page **pagep, void **fsdata,
		2666	get_block_t *get_block)
		2667	{
		2668	int ret;
		2669
		2670	ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags,
		2671	pagep, fsdata, get_block);
		2672
		2673	/*
		2674	* prepare_write() may have instantiated a few blocks
		2675	* outside i_size. Trim these off again. Don't need
		2676	* i_size_read because we hold i_mutex.
		2677	*/
		2678	if (unlikely(ret)) {
		2679	loff_t isize = mapping->host->i_size;
		2680	if (pos + len > isize)
		2681	vmtruncate(mapping->host, isize);
		2682	}
2610		2683
2611	return ret;	2684	return ret;
2612	}	2685	}


diff --git a/fs/direct-io.c b/fs/direct-io.c index da111aacb46e..7600aacf531d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c
@@ -1134,27 +1134,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1134	return ret;	1134	return ret;
1135	}	1135	}
1136		1136
1137	/*
1138	* This is a library function for use by filesystem drivers.
1139	*
1140	* The locking rules are governed by the flags parameter:
1141	* - if the flags value contains DIO_LOCKING we use a fancy locking
1142	* scheme for dumb filesystems.
1143	* For writes this function is called under i_mutex and returns with
1144	* i_mutex held, for reads, i_mutex is not held on entry, but it is
1145	* taken and dropped again before returning.
1146	* For reads and writes i_alloc_sem is taken in shared mode and released
1147	* on I/O completion (which may happen asynchronously after returning to
1148	* the caller).
1149	*
1150	* - if the flags value does NOT contain DIO_LOCKING we don't use any
1151	* internal locking but rather rely on the filesystem to synchronize
1152	* direct I/O reads/writes versus each other and truncate.
1153	* For reads and writes both i_mutex and i_alloc_sem are not held on
1154	* entry and are never taken.
1155	*/
1156	ssize_t	1137	ssize_t
1157	__blockdev_direct_IO(int rw, struct kiocb iocb, struct inode inode,	1138	__blockdev_direct_IO_newtrunc(int rw, struct kiocb iocb, struct inode inode,
1158	struct block_device bdev, const struct iovec iov, loff_t offset,	1139	struct block_device bdev, const struct iovec iov, loff_t offset,
1159	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,	1140	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1160	dio_submit_t submit_io, int flags)	1141	dio_submit_t submit_io, int flags)
@@ -1247,9 +1228,46 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1247	nr_segs, blkbits, get_block, end_io,	1228	nr_segs, blkbits, get_block, end_io,
1248	submit_io, dio);	1229	submit_io, dio);
1249		1230
		1231	out:
		1232	return retval;
		1233	}
		1234	EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc);
		1235
		1236	/*
		1237	* This is a library function for use by filesystem drivers.
		1238	*
		1239	* The locking rules are governed by the flags parameter:
		1240	* - if the flags value contains DIO_LOCKING we use a fancy locking
		1241	* scheme for dumb filesystems.
		1242	* For writes this function is called under i_mutex and returns with
		1243	* i_mutex held, for reads, i_mutex is not held on entry, but it is
		1244	* taken and dropped again before returning.
		1245	* For reads and writes i_alloc_sem is taken in shared mode and released
		1246	* on I/O completion (which may happen asynchronously after returning to
		1247	* the caller).
		1248	*
		1249	* - if the flags value does NOT contain DIO_LOCKING we don't use any
		1250	* internal locking but rather rely on the filesystem to synchronize
		1251	* direct I/O reads/writes versus each other and truncate.
		1252	* For reads and writes both i_mutex and i_alloc_sem are not held on
		1253	* entry and are never taken.
		1254	*/
		1255	ssize_t
		1256	__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
		1257	struct block_device *bdev, const struct iovec *iov, loff_t offset,
		1258	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
		1259	dio_submit_t submit_io, int flags)
		1260	{
		1261	ssize_t retval;
		1262
		1263	retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov,
		1264	offset, nr_segs, get_block, end_io, submit_io, flags);
1250	/*	1265	/*
1251	* In case of error extending write may have instantiated a few	1266	* In case of error extending write may have instantiated a few
1252	* blocks outside i_size. Trim these off again for DIO_LOCKING.	1267	* blocks outside i_size. Trim these off again for DIO_LOCKING.
		1268	* NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in
		1269	* their own manner. This is a further example of where the old
		1270	* truncate sequence is inadequate.
1253	*	1271	*
1254	* NOTE: filesystems with their own locking have to handle this	1272	* NOTE: filesystems with their own locking have to handle this
1255	* on their own.	1273	* on their own.
@@ -1257,12 +1275,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1257	if (flags & DIO_LOCKING) {	1275	if (flags & DIO_LOCKING) {
1258	if (unlikely((rw & WRITE) && retval < 0)) {	1276	if (unlikely((rw & WRITE) && retval < 0)) {
1259	loff_t isize = i_size_read(inode);	1277	loff_t isize = i_size_read(inode);
		1278	loff_t end = offset + iov_length(iov, nr_segs);
		1279
1260	if (end > isize)	1280	if (end > isize)
1261	vmtruncate(inode, isize);	1281	vmtruncate(inode, isize);
1262	}	1282	}
1263	}	1283	}
1264		1284
1265	out:
1266	return retval;	1285	return retval;
1267	}	1286	}
1268	EXPORT_SYMBOL(__blockdev_direct_IO);	1287	EXPORT_SYMBOL(__blockdev_direct_IO);


diff --git a/fs/libfs.c b/fs/libfs.c index b84d0a7a2204..09e1016eb774 100644 --- a/fs/libfs.c +++ b/fs/libfs.c
@@ -8,6 +8,7 @@
8	#include <linux/slab.h>	8	#include <linux/slab.h>
9	#include <linux/mount.h>	9	#include <linux/mount.h>
10	#include <linux/vfs.h>	10	#include <linux/vfs.h>
		11	#include <linux/quotaops.h>
11	#include <linux/mutex.h>	12	#include <linux/mutex.h>
12	#include <linux/exportfs.h>	13	#include <linux/exportfs.h>
13	#include <linux/writeback.h>	14	#include <linux/writeback.h>
@@ -325,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
325	return 0;	326	return 0;
326	}	327	}
327		328
		329	/**
		330	* simple_setsize - handle core mm and vfs requirements for file size change
		331	* @inode: inode
		332	* @newsize: new file size
		333	*
		334	* Returns 0 on success, -error on failure.
		335	*
		336	* simple_setsize must be called with i_mutex held.
		337	*
		338	* simple_setsize will check that the requested new size is OK (see
		339	* inode_newsize_ok), and then will perform the necessary i_size update
		340	* and pagecache truncation (if necessary). It will typically be called
		341	* from the filesystem's setattr function when ATTR_SIZE is passed in.
		342	*
		343	* The inode itself must have correct permissions and attributes to allow
		344	* i_size to be changed, this function then just checks that the new size
		345	* requested is valid.
		346	*
		347	* In the case of simple in-memory filesystems with inodes stored solely
		348	* in the inode cache, and file data in the pagecache, nothing more needs
		349	* to be done to satisfy a truncate request. Filesystems with on-disk
		350	* blocks for example will need to free them in the case of truncate, in
		351	* that case it may be easier not to use simple_setsize (but each of its
		352	* components will likely be required at some point to update pagecache
		353	* and inode etc).
		354	*/
		355	int simple_setsize(struct inode *inode, loff_t newsize)
		356	{
		357	loff_t oldsize;
		358	int error;
		359
		360	error = inode_newsize_ok(inode, newsize);
		361	if (error)
		362	return error;
		363
		364	oldsize = inode->i_size;
		365	i_size_write(inode, newsize);
		366	truncate_pagecache(inode, oldsize, newsize);
		367
		368	return error;
		369	}
		370	EXPORT_SYMBOL(simple_setsize);
		371
		372	/**
		373	* simple_setattr - setattr for simple in-memory filesystem
		374	* @dentry: dentry
		375	* @iattr: iattr structure
		376	*
		377	* Returns 0 on success, -error on failure.
		378	*
		379	* simple_setattr implements setattr for an in-memory filesystem which
		380	* does not store its own file data or metadata (eg. uses the page cache
		381	* and inode cache as its data store).
		382	*/
		383	int simple_setattr(struct dentry *dentry, struct iattr *iattr)
		384	{
		385	struct inode *inode = dentry->d_inode;
		386	int error;
		387
		388	error = inode_change_ok(inode, iattr);
		389	if (error)
		390	return error;
		391
		392	if (iattr->ia_valid & ATTR_SIZE) {
		393	error = simple_setsize(inode, iattr->ia_size);
		394	if (error)
		395	return error;
		396	}
		397
		398	generic_setattr(inode, iattr);
		399
		400	return error;
		401	}
		402	EXPORT_SYMBOL(simple_setattr);
		403
328	int simple_readpage(struct file *file, struct page *page)	404	int simple_readpage(struct file *file, struct page *page)
329	{	405	{
330	clear_highpage(page);	406	clear_highpage(page);