aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
author: Chris Mason <clm@fb.com>, 2014-08-12 13:47:42 -0400
committer: Chris Mason <clm@fb.com>, 2014-08-15 10:43:42 -0400
commit: 8d875f95da43c6a8f18f77869f2ef26e9594fecc (patch)
tree: 601416f676c0e2291bdbed359092eb284f1c32dc /fs/btrfs/inode.c
parent: 27b9a8122ff71a8cadfbffb9c4f0694300464f3b (diff)
btrfs: disable strict file flushes for renames and truncates
Truncates and renames are often used to replace old versions of a file with new versions. Applications often expect this to be an atomic replacement, even if they haven't done anything to make sure the new version is fully on disk. Btrfs has strict flushing in place to make sure that renaming over an old file with a new file will fully flush out the new file before allowing the transaction commit with the rename to complete. This ordering means the commit code needs to be able to lock file pages, and there are a few paths in the filesystem where we will try to end a transaction with the page lock held. It's rare, but these things can deadlock. This patch removes the ordered flushes and switches to a best-effort filemap_flush(), as ext4 does. It's not perfect, but it should fix the deadlocks.
Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- fs/btrfs/inode.c | 47
1 files changed, 3 insertions, 44 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8ea7610fbaf3..73098328d040 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7951,27 +7951,6 @@ static int btrfs_truncate(struct inode *inode)
7951 BUG_ON(ret); 7951 BUG_ON(ret);
7952 7952
7953 /* 7953 /*
7954 * setattr is responsible for setting the ordered_data_close flag,
7955 * but that is only tested during the last file release. That
7956 * could happen well after the next commit, leaving a great big
7957 * window where new writes may get lost if someone chooses to write
7958 * to this file after truncating to zero
7959 *
7960 * The inode doesn't have any dirty data here, and so if we commit
7961 * this is a noop. If someone immediately starts writing to the inode
7962 * it is very likely we'll catch some of their writes in this
7963 * transaction, and the commit will find this file on the ordered
7964 * data list with good things to send down.
7965 *
7966 * This is a best effort solution, there is still a window where
7967 * using truncate to replace the contents of the file will
7968 * end up with a zero length file after a crash.
7969 */
7970 if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
7971 &BTRFS_I(inode)->runtime_flags))
7972 btrfs_add_ordered_operation(trans, root, inode);
7973
7974 /*
7975 * So if we truncate and then write and fsync we normally would just 7954 * So if we truncate and then write and fsync we normally would just
7976 * write the extents that changed, which is a problem if we need to 7955 * write the extents that changed, which is a problem if we need to
7977 * first truncate that entire inode. So set this flag so we write out 7956 * first truncate that entire inode. So set this flag so we write out
@@ -8118,7 +8097,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
8118 mutex_init(&ei->delalloc_mutex); 8097 mutex_init(&ei->delalloc_mutex);
8119 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 8098 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8120 INIT_LIST_HEAD(&ei->delalloc_inodes); 8099 INIT_LIST_HEAD(&ei->delalloc_inodes);
8121 INIT_LIST_HEAD(&ei->ordered_operations);
8122 RB_CLEAR_NODE(&ei->rb_node); 8100 RB_CLEAR_NODE(&ei->rb_node);
8123 8101
8124 return inode; 8102 return inode;
@@ -8158,17 +8136,6 @@ void btrfs_destroy_inode(struct inode *inode)
8158 if (!root) 8136 if (!root)
8159 goto free; 8137 goto free;
8160 8138
8161 /*
8162 * Make sure we're properly removed from the ordered operation
8163 * lists.
8164 */
8165 smp_mb();
8166 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
8167 spin_lock(&root->fs_info->ordered_root_lock);
8168 list_del_init(&BTRFS_I(inode)->ordered_operations);
8169 spin_unlock(&root->fs_info->ordered_root_lock);
8170 }
8171
8172 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 8139 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
8173 &BTRFS_I(inode)->runtime_flags)) { 8140 &BTRFS_I(inode)->runtime_flags)) {
8174 btrfs_info(root->fs_info, "inode %llu still on the orphan list", 8141 btrfs_info(root->fs_info, "inode %llu still on the orphan list",
@@ -8350,12 +8317,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8350 ret = 0; 8317 ret = 0;
8351 8318
8352 /* 8319 /*
8353 * we're using rename to replace one file with another. 8320 * we're using rename to replace one file with another. Start IO on it
8354 * and the replacement file is large. Start IO on it now so 8321 * now so we don't add too much work to the end of the transaction
8355 * we don't add too much work to the end of the transaction
8356 */ 8322 */
8357 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && 8323 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
8358 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
8359 filemap_flush(old_inode->i_mapping); 8324 filemap_flush(old_inode->i_mapping);
8360 8325
8361 /* close the racy window with snapshot create/destroy ioctl */ 8326 /* close the racy window with snapshot create/destroy ioctl */
@@ -8403,12 +8368,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8403 */ 8368 */
8404 btrfs_pin_log_trans(root); 8369 btrfs_pin_log_trans(root);
8405 } 8370 }
8406 /*
8407 * make sure the inode gets flushed if it is replacing
8408 * something.
8409 */
8410 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
8411 btrfs_add_ordered_operation(trans, root, old_inode);
8412 8371
8413 inode_inc_iversion(old_dir); 8372 inode_inc_iversion(old_dir);
8414 inode_inc_iversion(new_dir); 8373 inode_inc_iversion(new_dir);