Btrfs: add extra flushing for renames and truncates

Renames and truncates are both common ways to replace old data with new data. The filesystem can make an effort to make sure the new data is on disk before actually replacing the old data. This is especially important for rename, which many applications use as though it were atomic for both the data and the metadata involved. The current btrfs code will happily replace a file that is fully on disk with one that was just created and still has pending IO. If we crash after transaction commit but before the IO is done, we'll end up replacing a good file with a zero length file. The solution used here is to create a list of inodes that need special ordering and force them to disk before the commit is done. This is similar to the ext3-style data=ordered mode, except it is only done on selected files. Btrfs is able to get away with this because it does not wait on commits very often, even for fsync (which uses a sub-commit). For renames, we order the file when it wasn't already on disk and when it is replacing an existing file. Larger files are sent to filemap_flush right away (before the transaction handle is opened). For truncates, we order if the file goes from non-zero size down to zero size. This is a little different, because at the time of the truncate the file has no dirty bytes to order. But, we flag the inode so that it is added to the ordered list on close (via the release method). We also immediately add it to the ordered list of the current transaction so that we can try to flush down any writes the application sneaks in before commit. Signed-off-by: Chris Mason <chris.mason@oracle.com>
author: Chris Mason <chris.mason@oracle.com> 2009-03-31 13:27:11 -0400
committer: Chris Mason <chris.mason@oracle.com> 2009-03-31 14:27:58 -0400
commit: 5a3f23d515a2ebf0c750db80579ca57b28cbce6d (patch)
tree: e0ffb43dd35f1c3def9a74ec7a6f4470902c9761 /fs/btrfs/inode.c
parent: 1a81af4d1d9c60d4313309f937a1fc5567205a87 (diff)
1 files changed, 74 insertions, 7 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bffd79faffb5..1cff528d5b51 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2907,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
        if (err)
                return err;
-        if (S_ISREG(inode->i_mode) &&
+        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
-            attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
+                if (attr->ia_size > inode->i_size) {
-                err = btrfs_cont_expand(inode, attr->ia_size);
+                        err = btrfs_cont_expand(inode, attr->ia_size);
-                if (err)
+                        if (err)
-                        return err;
+                                return err;
+                } else if (inode->i_size > 0 &&
+                           attr->ia_size == 0) {
+                        /* we're truncating a file that used to have good
+                         * data down to zero.  Make sure it gets into
+                         * the ordered flush list so that any new writes
+                         * get down to disk quickly.
+                         */
+                        BTRFS_I(inode)->ordered_data_close = 1;
+                }
        }
        err = inode_setattr(inode, attr);
@@ -3050,6 +3060,7 @@ static noinline void init_btrfs_i(struct inode *inode)
        extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
                             inode->i_mapping, GFP_NOFS);
        INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
+        INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
        btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
        mutex_init(&BTRFS_I(inode)->extent_mutex);
        mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -4419,6 +4430,8 @@ again:
        }
        ClearPageChecked(page);
        set_page_dirty(page);
+        BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 out_unlock:
@@ -4444,6 +4457,27 @@ static void btrfs_truncate(struct inode *inode)
        btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
        trans = btrfs_start_transaction(root, 1);
+        /*
+         * setattr is responsible for setting the ordered_data_close flag,
+         * but that is only tested during the last file release.  That
+         * could happen well after the next commit, leaving a great big
+         * window where new writes may get lost if someone chooses to write
+         * to this file after truncating to zero
+         *
+         * The inode doesn't have any dirty data here, and so if we commit
+         * this is a noop.  If someone immediately starts writing to the inode
+         * it is very likely we'll catch some of their writes in this
+         * transaction, and the commit will find this file on the ordered
+         * data list with good things to send down.
+         *
+         * This is a best effort solution, there is still a window where
+         * using truncate to replace the contents of the file will
+         * end up with a zero length file after a crash.
+         */
+        if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+                btrfs_add_ordered_operation(trans, root, inode);
        btrfs_set_trans_block_group(trans, inode);
        btrfs_i_size_write(inode, inode->i_size);
@@ -4520,12 +4554,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->i_acl = BTRFS_ACL_NOT_CACHED;
        ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
        INIT_LIST_HEAD(&ei->i_orphan);
+        INIT_LIST_HEAD(&ei->ordered_operations);
        return &ei->vfs_inode;
 }
 void btrfs_destroy_inode(struct inode *inode)
 {
        struct btrfs_ordered_extent *ordered;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
        WARN_ON(!list_empty(&inode->i_dentry));
        WARN_ON(inode->i_data.nrpages);
@@ -4536,13 +4573,24 @@ void btrfs_destroy_inode(struct inode *inode)
            BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
                posix_acl_release(BTRFS_I(inode)->i_default_acl);
-        spin_lock(&BTRFS_I(inode)->root->list_lock);
+        /*
+         * Make sure we're properly removed from the ordered operation
+         * lists.
+         */
+        smp_mb();
+        if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
+                spin_lock(&root->fs_info->ordered_extent_lock);
+                list_del_init(&BTRFS_I(inode)->ordered_operations);
+                spin_unlock(&root->fs_info->ordered_extent_lock);
+        }
+        spin_lock(&root->list_lock);
        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
                printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
                       " list\n", inode->i_ino);
                dump_stack();
        }
-        spin_unlock(&BTRFS_I(inode)->root->list_lock);
+        spin_unlock(&root->list_lock);
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4667,9 +4715,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (ret)
                goto out_unlock;
+        /*
+         * we're using rename to replace one file with another.
+         * and the replacement file is large.  Start IO on it now so
+         * we don't add too much work to the end of the transaction
+         */
+        if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
+            new_inode->i_size &&
+            old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+                filemap_flush(old_inode->i_mapping);
        trans = btrfs_start_transaction(root, 1);
        /*
+         * make sure the inode gets flushed if it is replacing
+         * something.
+         */
+        if (new_inode && new_inode->i_size &&
+            old_inode && S_ISREG(old_inode->i_mode)) {
+                btrfs_add_ordered_operation(trans, root, old_inode);
+        }
+        /*
         * this is an ugly little race, but the rename is required to make
         * sure that if we crash, the inode is either at the old name
         * or the new one.  pinning the log transaction lets us make sure
author	Chris Mason <chris.mason@oracle.com>	2009-03-31 13:27:11 -0400
committer	Chris Mason <chris.mason@oracle.com>	2009-03-31 14:27:58 -0400
commit	5a3f23d515a2ebf0c750db80579ca57b28cbce6d (patch)
tree	e0ffb43dd35f1c3def9a74ec7a6f4470902c9761 /fs/btrfs/inode.c
parent	1a81af4d1d9c60d4313309f937a1fc5567205a87 (diff)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bffd79faffb5..1cff528d5b51 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2907,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2907	if (err)	2907	if (err)
2908	return err;	2908	return err;
2909		2909
2910	if (S_ISREG(inode->i_mode) &&	2910	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
2911	attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {	2911	if (attr->ia_size > inode->i_size) {
2912	err = btrfs_cont_expand(inode, attr->ia_size);	2912	err = btrfs_cont_expand(inode, attr->ia_size);
2913	if (err)	2913	if (err)
2914	return err;	2914	return err;
		2915	} else if (inode->i_size > 0 &&
		2916	attr->ia_size == 0) {
		2917
		2918	/* we're truncating a file that used to have good
		2919	* data down to zero. Make sure it gets into
		2920	* the ordered flush list so that any new writes
		2921	* get down to disk quickly.
		2922	*/
		2923	BTRFS_I(inode)->ordered_data_close = 1;
		2924	}
2915	}	2925	}
2916		2926
2917	err = inode_setattr(inode, attr);	2927	err = inode_setattr(inode, attr);
@@ -3050,6 +3060,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3050	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,	3060	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3051	inode->i_mapping, GFP_NOFS);	3061	inode->i_mapping, GFP_NOFS);
3052	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);	3062	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
		3063	INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3053	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);	3064	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3054	mutex_init(&BTRFS_I(inode)->extent_mutex);	3065	mutex_init(&BTRFS_I(inode)->extent_mutex);
3055	mutex_init(&BTRFS_I(inode)->log_mutex);	3066	mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -4419,6 +4430,8 @@ again:
4419	}	4430	}
4420	ClearPageChecked(page);	4431	ClearPageChecked(page);
4421	set_page_dirty(page);	4432	set_page_dirty(page);
		4433
		4434	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
4422	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);	4435	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4423		4436
4424	out_unlock:	4437	out_unlock:
@@ -4444,6 +4457,27 @@ static void btrfs_truncate(struct inode *inode)
4444	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);	4457	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4445		4458
4446	trans = btrfs_start_transaction(root, 1);	4459	trans = btrfs_start_transaction(root, 1);
		4460
		4461	/*
		4462	* setattr is responsible for setting the ordered_data_close flag,
		4463	* but that is only tested during the last file release. That
		4464	* could happen well after the next commit, leaving a great big
		4465	* window where new writes may get lost if someone chooses to write
		4466	* to this file after truncating to zero
		4467	*
		4468	* The inode doesn't have any dirty data here, and so if we commit
		4469	* this is a noop. If someone immediately starts writing to the inode
		4470	* it is very likely we'll catch some of their writes in this
		4471	* transaction, and the commit will find this file on the ordered
		4472	* data list with good things to send down.
		4473	*
		4474	* This is a best effort solution, there is still a window where
		4475	* using truncate to replace the contents of the file will
		4476	* end up with a zero length file after a crash.
		4477	*/
		4478	if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
		4479	btrfs_add_ordered_operation(trans, root, inode);
		4480
4447	btrfs_set_trans_block_group(trans, inode);	4481	btrfs_set_trans_block_group(trans, inode);
4448	btrfs_i_size_write(inode, inode->i_size);	4482	btrfs_i_size_write(inode, inode->i_size);
4449		4483
@@ -4520,12 +4554,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4520	ei->i_acl = BTRFS_ACL_NOT_CACHED;	4554	ei->i_acl = BTRFS_ACL_NOT_CACHED;
4521	ei->i_default_acl = BTRFS_ACL_NOT_CACHED;	4555	ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4522	INIT_LIST_HEAD(&ei->i_orphan);	4556	INIT_LIST_HEAD(&ei->i_orphan);
		4557	INIT_LIST_HEAD(&ei->ordered_operations);
4523	return &ei->vfs_inode;	4558	return &ei->vfs_inode;
4524	}	4559	}
4525		4560
4526	void btrfs_destroy_inode(struct inode *inode)	4561	void btrfs_destroy_inode(struct inode *inode)
4527	{	4562	{
4528	struct btrfs_ordered_extent *ordered;	4563	struct btrfs_ordered_extent *ordered;
		4564	struct btrfs_root *root = BTRFS_I(inode)->root;
		4565
4529	WARN_ON(!list_empty(&inode->i_dentry));	4566	WARN_ON(!list_empty(&inode->i_dentry));
4530	WARN_ON(inode->i_data.nrpages);	4567	WARN_ON(inode->i_data.nrpages);
4531		4568
@@ -4536,13 +4573,24 @@ void btrfs_destroy_inode(struct inode *inode)
4536	BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)	4573	BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4537	posix_acl_release(BTRFS_I(inode)->i_default_acl);	4574	posix_acl_release(BTRFS_I(inode)->i_default_acl);
4538		4575
4539	spin_lock(&BTRFS_I(inode)->root->list_lock);	4576	/*
		4577	* Make sure we're properly removed from the ordered operation
		4578	* lists.
		4579	*/
		4580	smp_mb();
		4581	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
		4582	spin_lock(&root->fs_info->ordered_extent_lock);
		4583	list_del_init(&BTRFS_I(inode)->ordered_operations);
		4584	spin_unlock(&root->fs_info->ordered_extent_lock);
		4585	}
		4586
		4587	spin_lock(&root->list_lock);
4540	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {	4588	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4541	printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"	4589	printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4542	" list\n", inode->i_ino);	4590	" list\n", inode->i_ino);
4543	dump_stack();	4591	dump_stack();
4544	}	4592	}
4545	spin_unlock(&BTRFS_I(inode)->root->list_lock);	4593	spin_unlock(&root->list_lock);
4546		4594
4547	while (1) {	4595	while (1) {
4548	ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);	4596	ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4667,9 +4715,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4667	if (ret)	4715	if (ret)
4668	goto out_unlock;	4716	goto out_unlock;
4669		4717
		4718	/*
		4719	* we're using rename to replace one file with another.
		4720	* and the replacement file is large. Start IO on it now so
		4721	* we don't add too much work to the end of the transaction
		4722	*/
		4723	if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
		4724	new_inode->i_size &&
		4725	old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
		4726	filemap_flush(old_inode->i_mapping);
		4727
4670	trans = btrfs_start_transaction(root, 1);	4728	trans = btrfs_start_transaction(root, 1);
4671		4729
4672	/*	4730	/*
		4731	* make sure the inode gets flushed if it is replacing
		4732	* something.
		4733	*/
		4734	if (new_inode && new_inode->i_size &&
		4735	old_inode && S_ISREG(old_inode->i_mode)) {
		4736	btrfs_add_ordered_operation(trans, root, old_inode);
		4737	}
		4738
		4739	/*
4673	* this is an ugly little race, but the rename is required to make	4740	* this is an ugly little race, but the rename is required to make
4674	* sure that if we crash, the inode is either at the old name	4741	* sure that if we crash, the inode is either at the old name
4675	* or the new one. pinning the log transaction lets us make sure	4742	* or the new one. pinning the log transaction lets us make sure