aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
author Chris Mason <chris.mason@oracle.com> 2009-03-31 13:27:11 -0400
committer Chris Mason <chris.mason@oracle.com> 2009-03-31 14:27:58 -0400
commit 5a3f23d515a2ebf0c750db80579ca57b28cbce6d (patch)
tree e0ffb43dd35f1c3def9a74ec7a6f4470902c9761 /fs
parent 1a81af4d1d9c60d4313309f937a1fc5567205a87 (diff)
Btrfs: add extra flushing for renames and truncates
Renames and truncates are both common ways to replace old data with new data. The filesystem can make an effort to make sure the new data is on disk before actually replacing the old data. This is especially important for rename, which many applications use as though it were atomic for both the data and the metadata involved. The current btrfs code will happily replace a file that is fully on disk with one that was just created and still has pending IO. If we crash after transaction commit but before the IO is done, we'll end up replacing a good file with a zero length file. The solution used here is to create a list of inodes that need special ordering and force them to disk before the commit is done. This is similar to the ext3 style data=ordered, except it is only done on selected files. Btrfs is able to get away with this because it does not wait on commits very often, even for fsync (which uses a sub-commit). For renames, we order the file when it wasn't already on disk and when it is replacing an existing file. Larger files are sent to filemap_flush right away (before the transaction handle is opened). For truncates, we order if the file goes from non-zero size down to zero size. This is a little different, because at the time of the truncate the file has no dirty bytes to order. But, we flag the inode so that it is added to the ordered list on close (via the release method). We also immediately add it to the ordered list of the current transaction so that we can try to flush down any writes the application sneaks in before commit. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs')
-rw-r--r-- fs/btrfs/btrfs_inode.h 18
-rw-r--r-- fs/btrfs/ctree.h 35
-rw-r--r-- fs/btrfs/disk-io.c 2
-rw-r--r-- fs/btrfs/file.c 26
-rw-r--r-- fs/btrfs/inode.c 81
-rw-r--r-- fs/btrfs/ordered-data.c 118
-rw-r--r-- fs/btrfs/ordered-data.h 4
-rw-r--r-- fs/btrfs/transaction.c 11
8 files changed, 288 insertions, 7 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3af4cfb5654c..b30986f00b9d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
66 */ 66 */
67 struct list_head delalloc_inodes; 67 struct list_head delalloc_inodes;
68 68
69 /*
70 * list for tracking inodes that must be sent to disk before a
71 * rename or truncate commit
72 */
73 struct list_head ordered_operations;
74
69 /* the space_info for where this inode's data allocations are done */ 75 /* the space_info for where this inode's data allocations are done */
70 struct btrfs_space_info *space_info; 76 struct btrfs_space_info *space_info;
71 77
@@ -122,6 +128,18 @@ struct btrfs_inode {
122 */ 128 */
123 u64 last_unlink_trans; 129 u64 last_unlink_trans;
124 130
131 /*
132 * ordered_data_close is set by truncate when a file that used
133 * to have good data has been truncated to zero. When it is set
134 * the btrfs file release call will add this inode to the
135 * ordered operations list so that we make sure to flush out any
136 * new data the application may have written before commit.
137 *
138 * yes, it's silly to have a single bitflag, but we might grow more
139 * of these.
140 */
141 unsigned ordered_data_close:1;
142
125 struct inode vfs_inode; 143 struct inode vfs_inode;
126}; 144};
127 145
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2737facbd341..f48905ee5240 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
45 45
46#define BTRFS_MAX_LEVEL 8 46#define BTRFS_MAX_LEVEL 8
47 47
48/*
49 * files bigger than this get some pre-flushing when they are added
50 * to the ordered operations list. That way we limit the total
51 * work done by the commit
52 */
53#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
54
48/* holds pointers to all of the tree roots */ 55/* holds pointers to all of the tree roots */
49#define BTRFS_ROOT_TREE_OBJECTID 1ULL 56#define BTRFS_ROOT_TREE_OBJECTID 1ULL
50 57
@@ -727,6 +734,15 @@ struct btrfs_fs_info {
727 struct mutex volume_mutex; 734 struct mutex volume_mutex;
728 struct mutex tree_reloc_mutex; 735 struct mutex tree_reloc_mutex;
729 736
737 /*
738 * this protects the ordered operations list only while we are
739 * processing all of the entries on it. This way we make
740 * sure the commit code doesn't find the list temporarily empty
741 * because another function happens to be doing non-waiting preflush
742 * before jumping into the main commit.
743 */
744 struct mutex ordered_operations_mutex;
745
730 struct list_head trans_list; 746 struct list_head trans_list;
731 struct list_head hashers; 747 struct list_head hashers;
732 struct list_head dead_roots; 748 struct list_head dead_roots;
@@ -741,10 +757,29 @@ struct btrfs_fs_info {
741 * ordered extents 757 * ordered extents
742 */ 758 */
743 spinlock_t ordered_extent_lock; 759 spinlock_t ordered_extent_lock;
760
761 /*
762 * all of the data=ordered extents pending writeback
763 * these can span multiple transactions and basically include
764 * every dirty data page that isn't from nodatacow
765 */
744 struct list_head ordered_extents; 766 struct list_head ordered_extents;
767
768 /*
769 * all of the inodes that have delalloc bytes. It is possible for
770 * this list to be empty even when there is still dirty data=ordered
771 * extents waiting to finish IO.
772 */
745 struct list_head delalloc_inodes; 773 struct list_head delalloc_inodes;
746 774
747 /* 775 /*
776 * special rename and truncate targets that must be on disk before
777 * we're allowed to commit. This is basically the ext3 style
778 * data=ordered list.
779 */
780 struct list_head ordered_operations;
781
782 /*
748 * there is a pool of worker threads for checksumming during writes 783 * there is a pool of worker threads for checksumming during writes
749 * and a pool for checksumming after reads. This is because readers 784 * and a pool for checksumming after reads. This is because readers
750 * can run with FS locks held, and the writers may be waiting for 785 * can run with FS locks held, and the writers may be waiting for
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9244cd7313d4..1747dfd18654 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1572,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1572 INIT_LIST_HEAD(&fs_info->dead_roots); 1572 INIT_LIST_HEAD(&fs_info->dead_roots);
1573 INIT_LIST_HEAD(&fs_info->hashers); 1573 INIT_LIST_HEAD(&fs_info->hashers);
1574 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 1574 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1575 INIT_LIST_HEAD(&fs_info->ordered_operations);
1575 spin_lock_init(&fs_info->delalloc_lock); 1576 spin_lock_init(&fs_info->delalloc_lock);
1576 spin_lock_init(&fs_info->new_trans_lock); 1577 spin_lock_init(&fs_info->new_trans_lock);
1577 spin_lock_init(&fs_info->ref_cache_lock); 1578 spin_lock_init(&fs_info->ref_cache_lock);
@@ -1643,6 +1644,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1643 insert_inode_hash(fs_info->btree_inode); 1644 insert_inode_hash(fs_info->btree_inode);
1644 1645
1645 mutex_init(&fs_info->trans_mutex); 1646 mutex_init(&fs_info->trans_mutex);
1647 mutex_init(&fs_info->ordered_operations_mutex);
1646 mutex_init(&fs_info->tree_log_mutex); 1648 mutex_init(&fs_info->tree_log_mutex);
1647 mutex_init(&fs_info->drop_mutex); 1649 mutex_init(&fs_info->drop_mutex);
1648 mutex_init(&fs_info->pinned_mutex); 1650 mutex_init(&fs_info->pinned_mutex);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 32d10a617613..9c9fb46ccd08 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1161,6 +1161,20 @@ out_nolock:
1161 page_cache_release(pinned[1]); 1161 page_cache_release(pinned[1]);
1162 *ppos = pos; 1162 *ppos = pos;
1163 1163
1164 /*
1165 * we want to make sure fsync finds this change
1166 * but we haven't joined a transaction running right now.
1167 *
1168 * Later on, someone is sure to update the inode and get the
1169 * real transid recorded.
1170 *
1171 * We set last_trans now to the fs_info generation + 1,
1172 * this will either be one more than the running transaction
1173 * or the generation used for the next transaction if there isn't
1174 * one running right now.
1175 */
1176 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1177
1164 if (num_written > 0 && will_write) { 1178 if (num_written > 0 && will_write) {
1165 struct btrfs_trans_handle *trans; 1179 struct btrfs_trans_handle *trans;
1166 1180
@@ -1194,6 +1208,18 @@ out_nolock:
1194 1208
1195int btrfs_release_file(struct inode *inode, struct file *filp) 1209int btrfs_release_file(struct inode *inode, struct file *filp)
1196{ 1210{
1211 /*
1212 * ordered_data_close is set by setattr when we are about to truncate
1213 * a file from a non-zero size to a zero size. This tries to
1214 * flush down new bytes that may have been written if the
1215 * application were using truncate to replace a file in place.
1216 */
1217 if (BTRFS_I(inode)->ordered_data_close) {
1218 BTRFS_I(inode)->ordered_data_close = 0;
1219 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
1220 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1221 filemap_flush(inode->i_mapping);
1222 }
1197 if (filp->private_data) 1223 if (filp->private_data)
1198 btrfs_ioctl_trans_end(filp); 1224 btrfs_ioctl_trans_end(filp);
1199 return 0; 1225 return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bffd79faffb5..1cff528d5b51 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2907,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2907 if (err) 2907 if (err)
2908 return err; 2908 return err;
2909 2909
2910 if (S_ISREG(inode->i_mode) && 2910 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
2911 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { 2911 if (attr->ia_size > inode->i_size) {
2912 err = btrfs_cont_expand(inode, attr->ia_size); 2912 err = btrfs_cont_expand(inode, attr->ia_size);
2913 if (err) 2913 if (err)
2914 return err; 2914 return err;
2915 } else if (inode->i_size > 0 &&
2916 attr->ia_size == 0) {
2917
2918 /* we're truncating a file that used to have good
2919 * data down to zero. Make sure it gets into
2920 * the ordered flush list so that any new writes
2921 * get down to disk quickly.
2922 */
2923 BTRFS_I(inode)->ordered_data_close = 1;
2924 }
2915 } 2925 }
2916 2926
2917 err = inode_setattr(inode, attr); 2927 err = inode_setattr(inode, attr);
@@ -3050,6 +3060,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3050 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, 3060 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3051 inode->i_mapping, GFP_NOFS); 3061 inode->i_mapping, GFP_NOFS);
3052 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); 3062 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3063 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3053 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3064 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3054 mutex_init(&BTRFS_I(inode)->extent_mutex); 3065 mutex_init(&BTRFS_I(inode)->extent_mutex);
3055 mutex_init(&BTRFS_I(inode)->log_mutex); 3066 mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -4419,6 +4430,8 @@ again:
4419 } 4430 }
4420 ClearPageChecked(page); 4431 ClearPageChecked(page);
4421 set_page_dirty(page); 4432 set_page_dirty(page);
4433
4434 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
4422 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4435 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4423 4436
4424out_unlock: 4437out_unlock:
@@ -4444,6 +4457,27 @@ static void btrfs_truncate(struct inode *inode)
4444 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 4457 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4445 4458
4446 trans = btrfs_start_transaction(root, 1); 4459 trans = btrfs_start_transaction(root, 1);
4460
4461 /*
4462 * setattr is responsible for setting the ordered_data_close flag,
4463 * but that is only tested during the last file release. That
4464 * could happen well after the next commit, leaving a great big
4465 * window where new writes may get lost if someone chooses to write
4466 * to this file after truncating to zero
4467 *
4468 * The inode doesn't have any dirty data here, and so if we commit
4469 * this is a noop. If someone immediately starts writing to the inode
4470 * it is very likely we'll catch some of their writes in this
4471 * transaction, and the commit will find this file on the ordered
4472 * data list with good things to send down.
4473 *
4474 * This is a best effort solution, there is still a window where
4475 * using truncate to replace the contents of the file will
4476 * end up with a zero length file after a crash.
4477 */
4478 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
4479 btrfs_add_ordered_operation(trans, root, inode);
4480
4447 btrfs_set_trans_block_group(trans, inode); 4481 btrfs_set_trans_block_group(trans, inode);
4448 btrfs_i_size_write(inode, inode->i_size); 4482 btrfs_i_size_write(inode, inode->i_size);
4449 4483
@@ -4520,12 +4554,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4520 ei->i_acl = BTRFS_ACL_NOT_CACHED; 4554 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4521 ei->i_default_acl = BTRFS_ACL_NOT_CACHED; 4555 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4522 INIT_LIST_HEAD(&ei->i_orphan); 4556 INIT_LIST_HEAD(&ei->i_orphan);
4557 INIT_LIST_HEAD(&ei->ordered_operations);
4523 return &ei->vfs_inode; 4558 return &ei->vfs_inode;
4524} 4559}
4525 4560
4526void btrfs_destroy_inode(struct inode *inode) 4561void btrfs_destroy_inode(struct inode *inode)
4527{ 4562{
4528 struct btrfs_ordered_extent *ordered; 4563 struct btrfs_ordered_extent *ordered;
4564 struct btrfs_root *root = BTRFS_I(inode)->root;
4565
4529 WARN_ON(!list_empty(&inode->i_dentry)); 4566 WARN_ON(!list_empty(&inode->i_dentry));
4530 WARN_ON(inode->i_data.nrpages); 4567 WARN_ON(inode->i_data.nrpages);
4531 4568
@@ -4536,13 +4573,24 @@ void btrfs_destroy_inode(struct inode *inode)
4536 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) 4573 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4537 posix_acl_release(BTRFS_I(inode)->i_default_acl); 4574 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4538 4575
4539 spin_lock(&BTRFS_I(inode)->root->list_lock); 4576 /*
4577 * Make sure we're properly removed from the ordered operation
4578 * lists.
4579 */
4580 smp_mb();
4581 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
4582 spin_lock(&root->fs_info->ordered_extent_lock);
4583 list_del_init(&BTRFS_I(inode)->ordered_operations);
4584 spin_unlock(&root->fs_info->ordered_extent_lock);
4585 }
4586
4587 spin_lock(&root->list_lock);
4540 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 4588 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4541 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" 4589 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4542 " list\n", inode->i_ino); 4590 " list\n", inode->i_ino);
4543 dump_stack(); 4591 dump_stack();
4544 } 4592 }
4545 spin_unlock(&BTRFS_I(inode)->root->list_lock); 4593 spin_unlock(&root->list_lock);
4546 4594
4547 while (1) { 4595 while (1) {
4548 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 4596 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4667,9 +4715,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4667 if (ret) 4715 if (ret)
4668 goto out_unlock; 4716 goto out_unlock;
4669 4717
4718 /*
4719 * we're using rename to replace one file with another,
4720 * and the replacement file is large. Start IO on it now so
4721 * we don't add too much work to the end of the transaction
4722 */
4723 if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
4724 new_inode->i_size &&
4725 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
4726 filemap_flush(old_inode->i_mapping);
4727
4670 trans = btrfs_start_transaction(root, 1); 4728 trans = btrfs_start_transaction(root, 1);
4671 4729
4672 /* 4730 /*
4731 * make sure the inode gets flushed if it is replacing
4732 * something.
4733 */
4734 if (new_inode && new_inode->i_size &&
4735 old_inode && S_ISREG(old_inode->i_mode)) {
4736 btrfs_add_ordered_operation(trans, root, old_inode);
4737 }
4738
4739 /*
4673 * this is an ugly little race, but the rename is required to make 4740 * this is an ugly little race, but the rename is required to make
4674 * sure that if we crash, the inode is either at the old name 4741 * sure that if we crash, the inode is either at the old name
4675 * or the new one. pinning the log transaction lets us make sure 4742 * or the new one. pinning the log transaction lets us make sure
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0f..53c87b197d70 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
310 310
311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
312 list_del_init(&entry->root_extent_list); 312 list_del_init(&entry->root_extent_list);
313
314 /*
315 * we have no more ordered extents for this inode and
316 * no dirty pages. We can safely remove it from the
317 * list of ordered operations
318 */
319 if (RB_EMPTY_ROOT(&tree->tree) &&
320 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
321 list_del_init(&BTRFS_I(inode)->ordered_operations);
322 }
313 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 323 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
314 324
315 mutex_unlock(&tree->mutex); 325 mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
370} 380}
371 381
372/* 382/*
383 * this is used during transaction commit to write all the inodes
384 * added to the ordered operation list. These files must be fully on
385 * disk before the transaction commits.
386 *
387 * we have two modes here, one is to just start the IO via filemap_flush
388 * and the other is to wait for all the io. When we wait, we have an
389 * extra check to make sure the ordered operation list really is empty
390 * before we return
391 */
392int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
393{
394 struct btrfs_inode *btrfs_inode;
395 struct inode *inode;
396 struct list_head splice;
397
398 INIT_LIST_HEAD(&splice);
399
400 mutex_lock(&root->fs_info->ordered_operations_mutex);
401 spin_lock(&root->fs_info->ordered_extent_lock);
402again:
403 list_splice_init(&root->fs_info->ordered_operations, &splice);
404
405 while (!list_empty(&splice)) {
406 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
407 ordered_operations);
408
409 inode = &btrfs_inode->vfs_inode;
410
411 list_del_init(&btrfs_inode->ordered_operations);
412
413 /*
414 * the inode may be getting freed (in sys_unlink path).
415 */
416 inode = igrab(inode);
417
418 if (!wait && inode) {
419 list_add_tail(&BTRFS_I(inode)->ordered_operations,
420 &root->fs_info->ordered_operations);
421 }
422 spin_unlock(&root->fs_info->ordered_extent_lock);
423
424 if (inode) {
425 if (wait)
426 btrfs_wait_ordered_range(inode, 0, (u64)-1);
427 else
428 filemap_flush(inode->i_mapping);
429 iput(inode);
430 }
431
432 cond_resched();
433 spin_lock(&root->fs_info->ordered_extent_lock);
434 }
435 if (wait && !list_empty(&root->fs_info->ordered_operations))
436 goto again;
437
438 spin_unlock(&root->fs_info->ordered_extent_lock);
439 mutex_unlock(&root->fs_info->ordered_operations_mutex);
440
441 return 0;
442}
443
444/*
373 * Used to start IO or wait for a given ordered extent to finish. 445 * Used to start IO or wait for a given ordered extent to finish.
374 * 446 *
375 * If wait is one, this effectively waits on page writeback for all the pages 447 * If wait is one, this effectively waits on page writeback for all the pages
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
726 798
727 return ret; 799 return ret;
728} 800}
801
802/*
803 * add a given inode to the list of inodes that must be fully on
804 * disk before a transaction commit finishes.
805 *
806 * This basically gives us the ext3 style data=ordered mode, and it is mostly
807 * used to make sure renamed files are fully on disk.
808 *
809 * It is a noop if the inode is already fully on disk.
810 *
811 * If trans is not null, we'll do a friendly check for a transaction that
812 * is already flushing things and force the IO down ourselves.
813 */
814int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
815 struct btrfs_root *root,
816 struct inode *inode)
817{
818 u64 last_mod;
819
820 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
821
822 /*
823 * if this file hasn't been changed since the last transaction
824 * commit, we can safely return without doing anything
825 */
826 if (last_mod < root->fs_info->last_trans_committed)
827 return 0;
828
829 /*
830 * the transaction is already committing. Just start the IO and
831 * don't bother with all of this list nonsense
832 */
833 if (trans && root->fs_info->running_transaction->blocked) {
834 btrfs_wait_ordered_range(inode, 0, (u64)-1);
835 return 0;
836 }
837
838 spin_lock(&root->fs_info->ordered_extent_lock);
839 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
840 list_add_tail(&BTRFS_I(inode)->ordered_operations,
841 &root->fs_info->ordered_operations);
842 }
843 spin_unlock(&root->fs_info->ordered_extent_lock);
844
845 return 0;
846}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d6..3d31c8827b01 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, 155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode); 156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); 157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
159int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
160 struct btrfs_root *root,
161 struct inode *inode);
158#endif 162#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9c8f158dd2db..664782c6a2df 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -975,6 +975,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
975 int should_grow = 0; 975 int should_grow = 0;
976 unsigned long now = get_seconds(); 976 unsigned long now = get_seconds();
977 977
978 btrfs_run_ordered_operations(root, 0);
979
978 /* make a pass through all the delayed refs we have so far 980 /* make a pass through all the delayed refs we have so far
979 * any runnings procs may add more while we are here 981 * any runnings procs may add more while we are here
980 */ 982 */
@@ -1056,6 +1058,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1056 BUG_ON(ret); 1058 BUG_ON(ret);
1057 } 1059 }
1058 1060
1061 /*
1062 * renames don't use btrfs_join_transaction, so, once we
1063 * set the transaction to blocked above, we aren't going
1064 * to get any new ordered operations. We can safely run
1065 * it here and know for sure that nothing new will be added
1066 * to the list
1067 */
1068 btrfs_run_ordered_operations(root, 1);
1069
1059 smp_mb(); 1070 smp_mb();
1060 if (cur_trans->num_writers > 1 || should_grow) 1071 if (cur_trans->num_writers > 1 || should_grow)
1061 schedule_timeout(timeout); 1072 schedule_timeout(timeout);