fs: introduce new truncate sequence

Introduce a new truncate calling sequence into fs/mm subsystems. Rather than
setattr > vmtruncate > truncate, have filesystems call their truncate sequence
from ->setattr if filesystem specific operations are required. vmtruncate is
deprecated, and truncate_pagecache and inode_newsize_ok helpers introduced
previously should be used.

simple_setattr is introduced for simple in-ram filesystems to implement the
new truncate sequence. Eventually all filesystems should be converted to
implement a setattr, and the default code in notify_change should go away.

simple_setsize is also introduced to perform just the ATTR_SIZE portion of
simple_setattr (i.e. changing i_size and trimming pagecache).

To implement the new truncate sequence:
- filesystem specific manipulations (e.g. freeing blocks) must be done in
  the setattr method rather than ->truncate.
- vmtruncate cannot be used by core code to trim blocks past i_size in the
  event of write failure after allocation, so this must be performed in the
  fs code.
- convert usage of helpers block_write_begin, nobh_write_begin,
  cont_write_begin, and *blockdev_direct_IO* to use _newtrunc postfixed
  variants. These avoid calling vmtruncate to trim blocks (see previous).
- inode_setattr should not be used. generic_setattr is a new function to be
  used to copy simple attributes into the generic inode.
- make use of the better opportunity to handle errors with the new sequence.

Big problem with the previous calling sequence: the filesystem is not called
until i_size has already changed. This means it is not allowed to fail the
call, and also it does not know what the previous i_size was. Also, generic
code calling vmtruncate to truncate allocated blocks in case of error had no
good way to return a meaningful error (or, for example, atomically handle
block deallocation).

Cc: Christoph Hellwig <hch@lst.de>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
author: npiggin@suse.de <npiggin@suse.de> 2010-05-26 11:05:33 -0400
committer: Al Viro <viro@zeniv.linux.org.uk> 2010-05-27 22:15:33 -0400
commit: 7bb46a6734a7e1ad4beaecc11cae7ed3ff81d30f (patch)
tree: e575d9c55e2a6ccc645dcb3ae2564de458b428f2 /fs/buffer.c
parent: 7000d3c424e5bb350e502a477fb0e1ed42f8b10e (diff)
1 files changed, 98 insertions, 25 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index e8aa7081d25c..d54812b198e9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 }
 /*
- * block_write_begin takes care of the basic task of block allocation and
+ * Filesystems implementing the new truncate sequence should use the
- * bringing partial write blocks uptodate first.
+ * _newtrunc postfix variant which won't incorrectly call vmtruncate.
- *
+ * The filesystem needs to handle block truncation upon failure.
- * If *pagep is not NULL, then block_write_begin uses the locked page
- * at *pagep rather than allocating its own. In this case, the page will
- * not be unlocked or deallocated on failure.
 */
-int block_write_begin(struct file *file, struct address_space *mapping,
+int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata,
                        get_block_t *get_block)
@@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping,
                        unlock_page(page);
                        page_cache_release(page);
                        *pagep = NULL;
-                        /*
-                         * prepare_write() may have instantiated a few blocks
-                         * outside i_size.  Trim these off again. Don't need
-                         * i_size_read because we hold i_mutex.
-                         */
-                        if (pos + len > inode->i_size)
-                                vmtruncate(inode, inode->i_size);
                }
        }
 out:
        return status;
 }
+EXPORT_SYMBOL(block_write_begin_newtrunc);
+/*
+ * block_write_begin takes care of the basic task of block allocation and
+ * bringing partial write blocks uptodate first.
+ *
+ * If *pagep is not NULL, then block_write_begin uses the locked page
+ * at *pagep rather than allocating its own. In this case, the page will
+ * not be unlocked or deallocated on failure.
+ */
+int block_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata,
+                        get_block_t *get_block)
+{
+        int ret;
+        ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
+                                        pagep, fsdata, get_block);
+        /*
+         * prepare_write() may have instantiated a few blocks
+         * outside i_size.  Trim these off again. Don't need
+         * i_size_read because we hold i_mutex.
+         *
+         * Filesystems which pass down their own page also cannot
+         * call into vmtruncate here because it would lead to lock
+         * inversion problems (*pagep is locked). This is a further
+         * example of where the old truncate sequence is inadequate.
+         */
+        if (unlikely(ret) && *pagep == NULL) {
+                loff_t isize = mapping->host->i_size;
+                if (pos + len > isize)
+                        vmtruncate(mapping->host, isize);
+        }
+        return ret;
+}
 EXPORT_SYMBOL(block_write_begin);
 int block_write_end(struct file *file, struct address_space *mapping,
@@ -2324,7 +2351,7 @@ out:
 * For moronic filesystems that do not allow holes in file.
 * We may have to extend the file.
 */
-int cont_write_begin(struct file *file, struct address_space *mapping,
+int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata,
                        get_block_t *get_block, loff_t *bytes)
@@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
        }
        *pagep = NULL;
-        err = block_write_begin(file, mapping, pos, len,
+        err = block_write_begin_newtrunc(file, mapping, pos, len,
                                flags, pagep, fsdata, get_block);
 out:
        return err;
 }
+EXPORT_SYMBOL(cont_write_begin_newtrunc);
+int cont_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata,
+                        get_block_t *get_block, loff_t *bytes)
+{
+        int ret;
+        ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
+                                        pagep, fsdata, get_block, bytes);
+        if (unlikely(ret)) {
+                loff_t isize = mapping->host->i_size;
+                if (pos + len > isize)
+                        vmtruncate(mapping->host, isize);
+        }
+        return ret;
+}
 EXPORT_SYMBOL(cont_write_begin);
 int block_prepare_write(struct page *page, unsigned from, unsigned to,
@@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write);
 *
 * We are not allowed to take the i_mutex here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.  Because
- * vmtruncate() writes the inode size before removing pages, once we have the
+ * truncate writes the inode size before removing pages, once we have the
 * page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
@@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
 }
 /*
- * On entry, the page is fully not uptodate.
+ * Filesystems implementing the new truncate sequence should use the
- * On exit the page is fully uptodate in the areas outside (from,to)
+ * _newtrunc postfix variant which won't incorrectly call vmtruncate.
+ * The filesystem needs to handle block truncation upon failure.
 */
-int nobh_write_begin(struct file *file, struct address_space *mapping,
+int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata,
                        get_block_t *get_block)
@@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
                unlock_page(page);
                page_cache_release(page);
                *pagep = NULL;
-                return block_write_begin(file, mapping, pos, len, flags, pagep,
+                return block_write_begin_newtrunc(file, mapping, pos, len,
-                                        fsdata, get_block);
+                                        flags, pagep, fsdata, get_block);
        }
        if (PageMappedToDisk(page))
@@ -2605,8 +2652,34 @@ out_release:
        page_cache_release(page);
        *pagep = NULL;
-        if (pos + len > inode->i_size)
+        return ret;
-                vmtruncate(inode, inode->i_size);
+}
+EXPORT_SYMBOL(nobh_write_begin_newtrunc);
+/*
+ * On entry, the page is fully not uptodate.
+ * On exit the page is fully uptodate in the areas outside (from,to)
+ */
+int nobh_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata,
+                        get_block_t *get_block)
+{
+        int ret;
+        ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags,
+                                        pagep, fsdata, get_block);
+        /*
+         * prepare_write() may have instantiated a few blocks
+         * outside i_size.  Trim these off again. Don't need
+         * i_size_read because we hold i_mutex.
+         */
+        if (unlikely(ret)) {
+                loff_t isize = mapping->host->i_size;
+                if (pos + len > isize)
+                        vmtruncate(mapping->host, isize);
+        }
        return ret;
 }
author	npiggin@suse.de <npiggin@suse.de>	2010-05-26 11:05:33 -0400
committer	Al Viro <viro@zeniv.linux.org.uk>	2010-05-27 22:15:33 -0400
commit	7bb46a6734a7e1ad4beaecc11cae7ed3ff81d30f (patch)
tree	e575d9c55e2a6ccc645dcb3ae2564de458b428f2 /fs/buffer.c
parent	7000d3c424e5bb350e502a477fb0e1ed42f8b10e (diff)