btrfs: disable strict file flushes for renames and truncates

Truncates and renames are often used to replace old versions of a file with new versions. Applications often expect this to be an atomic replacement, even if they haven't done anything to make sure the new version is fully on disk. Btrfs has strict flushing in place to make sure that renaming over an old file with a new file will fully flush out the new file before allowing the transaction commit with the rename to complete. This ordering means the commit code needs to be able to lock file pages, and there are a few paths in the filesystem where we will try to end a transaction with the page lock held. It's rare, but these things can deadlock. This patch removes the ordered flushes and switches to a best effort filemap_flush like ext4 uses. It's not perfect, but it should fix the deadlocks. Signed-off-by: Chris Mason <clm@fb.com>
author: Chris Mason <clm@fb.com> 2014-08-12 13:47:42 -0400
committer: Chris Mason <clm@fb.com> 2014-08-15 10:43:42 -0400
commit: 8d875f95da43c6a8f18f77869f2ef26e9594fecc (patch)
tree: 601416f676c0e2291bdbed359092eb284f1c32dc /fs/btrfs/inode.c
parent: 27b9a8122ff71a8cadfbffb9c4f0694300464f3b (diff)
1 files changed, 3 insertions, 44 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8ea7610fbaf3..73098328d040 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7951,27 +7951,6 @@ static int btrfs_truncate(struct inode *inode)
        BUG_ON(ret);
        /*
-         * setattr is responsible for setting the ordered_data_close flag,
-         * but that is only tested during the last file release.  That
-         * could happen well after the next commit, leaving a great big
-         * window where new writes may get lost if someone chooses to write
-         * to this file after truncating to zero
-         *
-         * The inode doesn't have any dirty data here, and so if we commit
-         * this is a noop.  If someone immediately starts writing to the inode
-         * it is very likely we'll catch some of their writes in this
-         * transaction, and the commit will find this file on the ordered
-         * data list with good things to send down.
-         *
-         * This is a best effort solution, there is still a window where
-         * using truncate to replace the contents of the file will
-         * end up with a zero length file after a crash.
-         */
-        if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
-                                           &BTRFS_I(inode)->runtime_flags))
-                btrfs_add_ordered_operation(trans, root, inode);
-        /*
         * So if we truncate and then write and fsync we normally would just
         * write the extents that changed, which is a problem if we need to
         * first truncate that entire inode.  So set this flag so we write out
@@ -8118,7 +8097,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        mutex_init(&ei->delalloc_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
        INIT_LIST_HEAD(&ei->delalloc_inodes);
-        INIT_LIST_HEAD(&ei->ordered_operations);
        RB_CLEAR_NODE(&ei->rb_node);
        return inode;
@@ -8158,17 +8136,6 @@ void btrfs_destroy_inode(struct inode *inode)
        if (!root)
                goto free;
-        /*
-         * Make sure we're properly removed from the ordered operation
-         * lists.
-         */
-        smp_mb();
-        if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
-                spin_lock(&root->fs_info->ordered_root_lock);
-                list_del_init(&BTRFS_I(inode)->ordered_operations);
-                spin_unlock(&root->fs_info->ordered_root_lock);
-        }
        if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
                     &BTRFS_I(inode)->runtime_flags)) {
                btrfs_info(root->fs_info, "inode %llu still on the orphan list",
@@ -8350,12 +8317,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        ret = 0;
        /*
-         * we're using rename to replace one file with another.
+         * we're using rename to replace one file with another.  Start IO on it
-         * and the replacement file is large.  Start IO on it now so
+         * now so  we don't add too much work to the end of the transaction
-         * we don't add too much work to the end of the transaction
         */
-        if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
+        if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
-            old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
                filemap_flush(old_inode->i_mapping);
        /* close the racy window with snapshot create/destroy ioctl */
@@ -8403,12 +8368,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                 */
                btrfs_pin_log_trans(root);
        }
-        /*
-         * make sure the inode gets flushed if it is replacing
-         * something.
-         */
-        if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
-                btrfs_add_ordered_operation(trans, root, old_inode);
        inode_inc_iversion(old_dir);
        inode_inc_iversion(new_dir);
author	Chris Mason <clm@fb.com>	2014-08-12 13:47:42 -0400
committer	Chris Mason <clm@fb.com>	2014-08-15 10:43:42 -0400
commit	8d875f95da43c6a8f18f77869f2ef26e9594fecc (patch)
tree	601416f676c0e2291bdbed359092eb284f1c32dc /fs/btrfs/inode.c
parent	27b9a8122ff71a8cadfbffb9c4f0694300464f3b (diff)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8ea7610fbaf3..73098328d040 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7951,27 +7951,6 @@ static int btrfs_truncate(struct inode *inode)
7951	BUG_ON(ret);	7951	BUG_ON(ret);
7952		7952
7953	/*	7953	/*
7954	* setattr is responsible for setting the ordered_data_close flag,
7955	* but that is only tested during the last file release. That
7956	* could happen well after the next commit, leaving a great big
7957	* window where new writes may get lost if someone chooses to write
7958	* to this file after truncating to zero
7959	*
7960	* The inode doesn't have any dirty data here, and so if we commit
7961	* this is a noop. If someone immediately starts writing to the inode
7962	* it is very likely we'll catch some of their writes in this
7963	* transaction, and the commit will find this file on the ordered
7964	* data list with good things to send down.
7965	*
7966	* This is a best effort solution, there is still a window where
7967	* using truncate to replace the contents of the file will
7968	* end up with a zero length file after a crash.
7969	*/
7970	if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
7971	&BTRFS_I(inode)->runtime_flags))
7972	btrfs_add_ordered_operation(trans, root, inode);
7973
7974	/*
7975	* So if we truncate and then write and fsync we normally would just	7954	* So if we truncate and then write and fsync we normally would just
7976	* write the extents that changed, which is a problem if we need to	7955	* write the extents that changed, which is a problem if we need to
7977	* first truncate that entire inode. So set this flag so we write out	7956	* first truncate that entire inode. So set this flag so we write out
@@ -8118,7 +8097,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
8118	mutex_init(&ei->delalloc_mutex);	8097	mutex_init(&ei->delalloc_mutex);
8119	btrfs_ordered_inode_tree_init(&ei->ordered_tree);	8098	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8120	INIT_LIST_HEAD(&ei->delalloc_inodes);	8099	INIT_LIST_HEAD(&ei->delalloc_inodes);
8121	INIT_LIST_HEAD(&ei->ordered_operations);
8122	RB_CLEAR_NODE(&ei->rb_node);	8100	RB_CLEAR_NODE(&ei->rb_node);
8123		8101
8124	return inode;	8102	return inode;
@@ -8158,17 +8136,6 @@ void btrfs_destroy_inode(struct inode *inode)
8158	if (!root)	8136	if (!root)
8159	goto free;	8137	goto free;
8160		8138
8161	/*
8162	* Make sure we're properly removed from the ordered operation
8163	* lists.
8164	*/
8165	smp_mb();
8166	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
8167	spin_lock(&root->fs_info->ordered_root_lock);
8168	list_del_init(&BTRFS_I(inode)->ordered_operations);
8169	spin_unlock(&root->fs_info->ordered_root_lock);
8170	}
8171
8172	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,	8139	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
8173	&BTRFS_I(inode)->runtime_flags)) {	8140	&BTRFS_I(inode)->runtime_flags)) {
8174	btrfs_info(root->fs_info, "inode %llu still on the orphan list",	8141	btrfs_info(root->fs_info, "inode %llu still on the orphan list",
@@ -8350,12 +8317,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8350	ret = 0;	8317	ret = 0;
8351		8318
8352	/*	8319	/*
8353	* we're using rename to replace one file with another.	8320	* we're using rename to replace one file with another. Start IO on it
8354	* and the replacement file is large. Start IO on it now so	8321	* now so we don't add too much work to the end of the transaction
8355	* we don't add too much work to the end of the transaction
8356	*/	8322	*/
8357	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&	8323	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
8358	old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
8359	filemap_flush(old_inode->i_mapping);	8324	filemap_flush(old_inode->i_mapping);
8360		8325
8361	/* close the racy window with snapshot create/destroy ioctl */	8326	/* close the racy window with snapshot create/destroy ioctl */
@@ -8403,12 +8368,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8403	*/	8368	*/
8404	btrfs_pin_log_trans(root);	8369	btrfs_pin_log_trans(root);
8405	}	8370	}
8406	/*
8407	* make sure the inode gets flushed if it is replacing
8408	* something.
8409	*/
8410	if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
8411	btrfs_add_ordered_operation(trans, root, old_inode);
8412		8371
8413	inode_inc_iversion(old_dir);	8372	inode_inc_iversion(old_dir);
8414	inode_inc_iversion(new_dir);	8373	inode_inc_iversion(new_dir);