aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
author Chris Mason <chris.mason@oracle.com> 2009-03-31 13:27:11 -0400
committer Chris Mason <chris.mason@oracle.com> 2009-03-31 14:27:58 -0400
commit 5a3f23d515a2ebf0c750db80579ca57b28cbce6d (patch)
tree e0ffb43dd35f1c3def9a74ec7a6f4470902c9761 /fs/btrfs/inode.c
parent 1a81af4d1d9c60d4313309f937a1fc5567205a87 (diff)
Btrfs: add extra flushing for renames and truncates
Renames and truncates are both common ways to replace old data with new data. The filesystem can make an effort to make sure the new data is on disk before actually replacing the old data. This is especially important for rename, which many applications use as though it were atomic for both the data and the metadata involved. The current btrfs code will happily replace a file that is fully on disk with one that was just created and still has pending IO. If we crash after transaction commit but before the IO is done, we'll end up replacing a good file with a zero length file. The solution used here is to create a list of inodes that need special ordering and force them to disk before the commit is done. This is similar to the ext3 style data=ordered, except it is only done on selected files. Btrfs is able to get away with this because it does not wait on commits very often, even for fsync (which uses a sub-commit). For renames, we order the file when it wasn't already on disk and when it is replacing an existing file. Larger files are sent to filemap_flush right away (before the transaction handle is opened). For truncates, we order if the file goes from non-zero size down to zero size. This is a little different, because at the time of the truncate the file has no dirty bytes to order. But we flag the inode so that it is added to the ordered list on close (via the release method). We also immediately add it to the ordered list of the current transaction so that we can try to flush down any writes the application sneaks in before commit. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c81
1 files changed, 74 insertions, 7 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bffd79faffb5..1cff528d5b51 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2907,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2907 if (err) 2907 if (err)
2908 return err; 2908 return err;
2909 2909
2910 if (S_ISREG(inode->i_mode) && 2910 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
2911 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { 2911 if (attr->ia_size > inode->i_size) {
2912 err = btrfs_cont_expand(inode, attr->ia_size); 2912 err = btrfs_cont_expand(inode, attr->ia_size);
2913 if (err) 2913 if (err)
2914 return err; 2914 return err;
2915 } else if (inode->i_size > 0 &&
2916 attr->ia_size == 0) {
2917
2918 /* we're truncating a file that used to have good
2919 * data down to zero. Make sure it gets into
2920 * the ordered flush list so that any new writes
2921 * get down to disk quickly.
2922 */
2923 BTRFS_I(inode)->ordered_data_close = 1;
2924 }
2915 } 2925 }
2916 2926
2917 err = inode_setattr(inode, attr); 2927 err = inode_setattr(inode, attr);
@@ -3050,6 +3060,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3050 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, 3060 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3051 inode->i_mapping, GFP_NOFS); 3061 inode->i_mapping, GFP_NOFS);
3052 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); 3062 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3063 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3053 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3064 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3054 mutex_init(&BTRFS_I(inode)->extent_mutex); 3065 mutex_init(&BTRFS_I(inode)->extent_mutex);
3055 mutex_init(&BTRFS_I(inode)->log_mutex); 3066 mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -4419,6 +4430,8 @@ again:
4419 } 4430 }
4420 ClearPageChecked(page); 4431 ClearPageChecked(page);
4421 set_page_dirty(page); 4432 set_page_dirty(page);
4433
4434 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
4422 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4435 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4423 4436
4424out_unlock: 4437out_unlock:
@@ -4444,6 +4457,27 @@ static void btrfs_truncate(struct inode *inode)
4444 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 4457 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4445 4458
4446 trans = btrfs_start_transaction(root, 1); 4459 trans = btrfs_start_transaction(root, 1);
4460
4461 /*
4462 * setattr is responsible for setting the ordered_data_close flag,
4463 * but that is only tested during the last file release. That
4464 * could happen well after the next commit, leaving a great big
4465 * window where new writes may get lost if someone chooses to write
4466 * to this file after truncating to zero
4467 *
4468 * The inode doesn't have any dirty data here, and so if we commit
4469 * this is a noop. If someone immediately starts writing to the inode
4470 * it is very likely we'll catch some of their writes in this
4471 * transaction, and the commit will find this file on the ordered
4472 * data list with good things to send down.
4473 *
4474 * This is a best effort solution, there is still a window where
4475 * using truncate to replace the contents of the file will
4476 * end up with a zero length file after a crash.
4477 */
4478 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
4479 btrfs_add_ordered_operation(trans, root, inode);
4480
4447 btrfs_set_trans_block_group(trans, inode); 4481 btrfs_set_trans_block_group(trans, inode);
4448 btrfs_i_size_write(inode, inode->i_size); 4482 btrfs_i_size_write(inode, inode->i_size);
4449 4483
@@ -4520,12 +4554,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4520 ei->i_acl = BTRFS_ACL_NOT_CACHED; 4554 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4521 ei->i_default_acl = BTRFS_ACL_NOT_CACHED; 4555 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4522 INIT_LIST_HEAD(&ei->i_orphan); 4556 INIT_LIST_HEAD(&ei->i_orphan);
4557 INIT_LIST_HEAD(&ei->ordered_operations);
4523 return &ei->vfs_inode; 4558 return &ei->vfs_inode;
4524} 4559}
4525 4560
4526void btrfs_destroy_inode(struct inode *inode) 4561void btrfs_destroy_inode(struct inode *inode)
4527{ 4562{
4528 struct btrfs_ordered_extent *ordered; 4563 struct btrfs_ordered_extent *ordered;
4564 struct btrfs_root *root = BTRFS_I(inode)->root;
4565
4529 WARN_ON(!list_empty(&inode->i_dentry)); 4566 WARN_ON(!list_empty(&inode->i_dentry));
4530 WARN_ON(inode->i_data.nrpages); 4567 WARN_ON(inode->i_data.nrpages);
4531 4568
@@ -4536,13 +4573,24 @@ void btrfs_destroy_inode(struct inode *inode)
4536 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) 4573 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4537 posix_acl_release(BTRFS_I(inode)->i_default_acl); 4574 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4538 4575
4539 spin_lock(&BTRFS_I(inode)->root->list_lock); 4576 /*
4577 * Make sure we're properly removed from the ordered operation
4578 * lists.
4579 */
4580 smp_mb();
4581 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
4582 spin_lock(&root->fs_info->ordered_extent_lock);
4583 list_del_init(&BTRFS_I(inode)->ordered_operations);
4584 spin_unlock(&root->fs_info->ordered_extent_lock);
4585 }
4586
4587 spin_lock(&root->list_lock);
4540 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 4588 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4541 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" 4589 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4542 " list\n", inode->i_ino); 4590 " list\n", inode->i_ino);
4543 dump_stack(); 4591 dump_stack();
4544 } 4592 }
4545 spin_unlock(&BTRFS_I(inode)->root->list_lock); 4593 spin_unlock(&root->list_lock);
4546 4594
4547 while (1) { 4595 while (1) {
4548 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 4596 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4667,9 +4715,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4667 if (ret) 4715 if (ret)
4668 goto out_unlock; 4716 goto out_unlock;
4669 4717
4718 /*
4719 * we're using rename to replace one file with another.
4720 * and the replacement file is large. Start IO on it now so
4721 * we don't add too much work to the end of the transaction
4722 */
4723 if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
4724 new_inode->i_size &&
4725 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
4726 filemap_flush(old_inode->i_mapping);
4727
4670 trans = btrfs_start_transaction(root, 1); 4728 trans = btrfs_start_transaction(root, 1);
4671 4729
4672 /* 4730 /*
4731 * make sure the inode gets flushed if it is replacing
4732 * something.
4733 */
4734 if (new_inode && new_inode->i_size &&
4735 old_inode && S_ISREG(old_inode->i_mode)) {
4736 btrfs_add_ordered_operation(trans, root, old_inode);
4737 }
4738
4739 /*
4673 * this is an ugly little race, but the rename is required to make 4740 * this is an ugly little race, but the rename is required to make
4674 * sure that if we crash, the inode is either at the old name 4741 * sure that if we crash, the inode is either at the old name
4675 * or the new one. pinning the log transaction lets us make sure 4742 * or the new one. pinning the log transaction lets us make sure