diff options
-rw-r--r-- | fs/btrfs/btrfs_inode.h | 18 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 35 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 2 | ||||
-rw-r--r-- | fs/btrfs/file.c | 26 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 81 | ||||
-rw-r--r-- | fs/btrfs/ordered-data.c | 118 | ||||
-rw-r--r-- | fs/btrfs/ordered-data.h | 4 | ||||
-rw-r--r-- | fs/btrfs/transaction.c | 11 |
8 files changed, 288 insertions, 7 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 3af4cfb5654c..b30986f00b9d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h | |||
@@ -66,6 +66,12 @@ struct btrfs_inode { | |||
66 | */ | 66 | */ |
67 | struct list_head delalloc_inodes; | 67 | struct list_head delalloc_inodes; |
68 | 68 | ||
69 | /* | ||
70 | * list for tracking inodes that must be sent to disk before a | ||
71 | * rename or truncate commit | ||
72 | */ | ||
73 | struct list_head ordered_operations; | ||
74 | |||
69 | /* the space_info for where this inode's data allocations are done */ | 75 | /* the space_info for where this inode's data allocations are done */ |
70 | struct btrfs_space_info *space_info; | 76 | struct btrfs_space_info *space_info; |
71 | 77 | ||
@@ -122,6 +128,18 @@ struct btrfs_inode { | |||
122 | */ | 128 | */ |
123 | u64 last_unlink_trans; | 129 | u64 last_unlink_trans; |
124 | 130 | ||
131 | /* | ||
132 | * ordered_data_close is set by truncate when a file that used | ||
133 | * to have good data has been truncated to zero. When it is set | ||
134 | * the btrfs file release call will add this inode to the | ||
135 | * ordered operations list so that we make sure to flush out any | ||
136 | * new data the application may have written before commit. | ||
137 | * | ||
138 | * yes, it's silly to have a single bitflag, but we might grow more | ||
139 | * of these. | ||
140 | */ | ||
141 | unsigned ordered_data_close:1; | ||
142 | |||
125 | struct inode vfs_inode; | 143 | struct inode vfs_inode; |
126 | }; | 144 | }; |
127 | 145 | ||
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2737facbd341..f48905ee5240 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h | |||
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum; | |||
45 | 45 | ||
46 | #define BTRFS_MAX_LEVEL 8 | 46 | #define BTRFS_MAX_LEVEL 8 |
47 | 47 | ||
48 | /* | ||
49 | * files bigger than this get some pre-flushing when they are added | ||
50 | * to the ordered operations list. That way we limit the total | ||
51 | * work done by the commit | ||
52 | */ | ||
53 | #define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024) | ||
54 | |||
48 | /* holds pointers to all of the tree roots */ | 55 | /* holds pointers to all of the tree roots */ |
49 | #define BTRFS_ROOT_TREE_OBJECTID 1ULL | 56 | #define BTRFS_ROOT_TREE_OBJECTID 1ULL |
50 | 57 | ||
@@ -727,6 +734,15 @@ struct btrfs_fs_info { | |||
727 | struct mutex volume_mutex; | 734 | struct mutex volume_mutex; |
728 | struct mutex tree_reloc_mutex; | 735 | struct mutex tree_reloc_mutex; |
729 | 736 | ||
737 | /* | ||
738 | * this protects the ordered operations list only while we are | ||
739 | * processing all of the entries on it. This way we make | ||
740 | * sure the commit code doesn't find the list temporarily empty | ||
741 | * because another function happens to be doing non-waiting preflush | ||
742 | * before jumping into the main commit. | ||
743 | */ | ||
744 | struct mutex ordered_operations_mutex; | ||
745 | |||
730 | struct list_head trans_list; | 746 | struct list_head trans_list; |
731 | struct list_head hashers; | 747 | struct list_head hashers; |
732 | struct list_head dead_roots; | 748 | struct list_head dead_roots; |
@@ -741,10 +757,29 @@ struct btrfs_fs_info { | |||
741 | * ordered extents | 757 | * ordered extents |
742 | */ | 758 | */ |
743 | spinlock_t ordered_extent_lock; | 759 | spinlock_t ordered_extent_lock; |
760 | |||
761 | /* | ||
762 | * all of the data=ordered extents pending writeback | ||
763 | * these can span multiple transactions and basically include | ||
764 | * every dirty data page that isn't from nodatacow | ||
765 | */ | ||
744 | struct list_head ordered_extents; | 766 | struct list_head ordered_extents; |
767 | |||
768 | /* | ||
769 | * all of the inodes that have delalloc bytes. It is possible for | ||
770 | * this list to be empty even when there is still dirty data=ordered | ||
771 | * extents waiting to finish IO. | ||
772 | */ | ||
745 | struct list_head delalloc_inodes; | 773 | struct list_head delalloc_inodes; |
746 | 774 | ||
747 | /* | 775 | /* |
776 | * special rename and truncate targets that must be on disk before | ||
777 | * we're allowed to commit. This is basically the ext3 style | ||
778 | * data=ordered list. | ||
779 | */ | ||
780 | struct list_head ordered_operations; | ||
781 | |||
782 | /* | ||
748 | * there is a pool of worker threads for checksumming during writes | 783 | * there is a pool of worker threads for checksumming during writes |
749 | * and a pool for checksumming after reads. This is because readers | 784 | * and a pool for checksumming after reads. This is because readers |
750 | * can run with FS locks held, and the writers may be waiting for | 785 | * can run with FS locks held, and the writers may be waiting for |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9244cd7313d4..1747dfd18654 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
@@ -1572,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1572 | INIT_LIST_HEAD(&fs_info->dead_roots); | 1572 | INIT_LIST_HEAD(&fs_info->dead_roots); |
1573 | INIT_LIST_HEAD(&fs_info->hashers); | 1573 | INIT_LIST_HEAD(&fs_info->hashers); |
1574 | INIT_LIST_HEAD(&fs_info->delalloc_inodes); | 1574 | INIT_LIST_HEAD(&fs_info->delalloc_inodes); |
1575 | INIT_LIST_HEAD(&fs_info->ordered_operations); | ||
1575 | spin_lock_init(&fs_info->delalloc_lock); | 1576 | spin_lock_init(&fs_info->delalloc_lock); |
1576 | spin_lock_init(&fs_info->new_trans_lock); | 1577 | spin_lock_init(&fs_info->new_trans_lock); |
1577 | spin_lock_init(&fs_info->ref_cache_lock); | 1578 | spin_lock_init(&fs_info->ref_cache_lock); |
@@ -1643,6 +1644,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1643 | insert_inode_hash(fs_info->btree_inode); | 1644 | insert_inode_hash(fs_info->btree_inode); |
1644 | 1645 | ||
1645 | mutex_init(&fs_info->trans_mutex); | 1646 | mutex_init(&fs_info->trans_mutex); |
1647 | mutex_init(&fs_info->ordered_operations_mutex); | ||
1646 | mutex_init(&fs_info->tree_log_mutex); | 1648 | mutex_init(&fs_info->tree_log_mutex); |
1647 | mutex_init(&fs_info->drop_mutex); | 1649 | mutex_init(&fs_info->drop_mutex); |
1648 | mutex_init(&fs_info->pinned_mutex); | 1650 | mutex_init(&fs_info->pinned_mutex); |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 32d10a617613..9c9fb46ccd08 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -1161,6 +1161,20 @@ out_nolock: | |||
1161 | page_cache_release(pinned[1]); | 1161 | page_cache_release(pinned[1]); |
1162 | *ppos = pos; | 1162 | *ppos = pos; |
1163 | 1163 | ||
1164 | /* | ||
1165 | * we want to make sure fsync finds this change | ||
1166 | * but we haven't joined a transaction running right now. | ||
1167 | * | ||
1168 | * Later on, someone is sure to update the inode and get the | ||
1169 | * real transid recorded. | ||
1170 | * | ||
1171 | * We set last_trans now to the fs_info generation + 1, | ||
1172 | * this will either be one more than the running transaction | ||
1173 | * or the generation used for the next transaction if there isn't | ||
1174 | * one running right now. | ||
1175 | */ | ||
1176 | BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; | ||
1177 | |||
1164 | if (num_written > 0 && will_write) { | 1178 | if (num_written > 0 && will_write) { |
1165 | struct btrfs_trans_handle *trans; | 1179 | struct btrfs_trans_handle *trans; |
1166 | 1180 | ||
@@ -1194,6 +1208,18 @@ out_nolock: | |||
1194 | 1208 | ||
1195 | int btrfs_release_file(struct inode *inode, struct file *filp) | 1209 | int btrfs_release_file(struct inode *inode, struct file *filp) |
1196 | { | 1210 | { |
1211 | /* | ||
1212 | * ordered_data_close is set by setattr when we are about to truncate | ||
1213 | * a file from a non-zero size to a zero size. This tries to | ||
1214 | * flush down new bytes that may have been written if the | ||
1215 | * application were using truncate to replace a file in place. | ||
1216 | */ | ||
1217 | if (BTRFS_I(inode)->ordered_data_close) { | ||
1218 | BTRFS_I(inode)->ordered_data_close = 0; | ||
1219 | btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); | ||
1220 | if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) | ||
1221 | filemap_flush(inode->i_mapping); | ||
1222 | } | ||
1197 | if (filp->private_data) | 1223 | if (filp->private_data) |
1198 | btrfs_ioctl_trans_end(filp); | 1224 | btrfs_ioctl_trans_end(filp); |
1199 | return 0; | 1225 | return 0; |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bffd79faffb5..1cff528d5b51 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -2907,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) | |||
2907 | if (err) | 2907 | if (err) |
2908 | return err; | 2908 | return err; |
2909 | 2909 | ||
2910 | if (S_ISREG(inode->i_mode) && | 2910 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { |
2911 | attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { | 2911 | if (attr->ia_size > inode->i_size) { |
2912 | err = btrfs_cont_expand(inode, attr->ia_size); | 2912 | err = btrfs_cont_expand(inode, attr->ia_size); |
2913 | if (err) | 2913 | if (err) |
2914 | return err; | 2914 | return err; |
2915 | } else if (inode->i_size > 0 && | ||
2916 | attr->ia_size == 0) { | ||
2917 | |||
2918 | /* we're truncating a file that used to have good | ||
2919 | * data down to zero. Make sure it gets into | ||
2920 | * the ordered flush list so that any new writes | ||
2921 | * get down to disk quickly. | ||
2922 | */ | ||
2923 | BTRFS_I(inode)->ordered_data_close = 1; | ||
2924 | } | ||
2915 | } | 2925 | } |
2916 | 2926 | ||
2917 | err = inode_setattr(inode, attr); | 2927 | err = inode_setattr(inode, attr); |
@@ -3050,6 +3060,7 @@ static noinline void init_btrfs_i(struct inode *inode) | |||
3050 | extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, | 3060 | extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, |
3051 | inode->i_mapping, GFP_NOFS); | 3061 | inode->i_mapping, GFP_NOFS); |
3052 | INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); | 3062 | INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); |
3063 | INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); | ||
3053 | btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); | 3064 | btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); |
3054 | mutex_init(&BTRFS_I(inode)->extent_mutex); | 3065 | mutex_init(&BTRFS_I(inode)->extent_mutex); |
3055 | mutex_init(&BTRFS_I(inode)->log_mutex); | 3066 | mutex_init(&BTRFS_I(inode)->log_mutex); |
@@ -4419,6 +4430,8 @@ again: | |||
4419 | } | 4430 | } |
4420 | ClearPageChecked(page); | 4431 | ClearPageChecked(page); |
4421 | set_page_dirty(page); | 4432 | set_page_dirty(page); |
4433 | |||
4434 | BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; | ||
4422 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | 4435 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); |
4423 | 4436 | ||
4424 | out_unlock: | 4437 | out_unlock: |
@@ -4444,6 +4457,27 @@ static void btrfs_truncate(struct inode *inode) | |||
4444 | btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); | 4457 | btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); |
4445 | 4458 | ||
4446 | trans = btrfs_start_transaction(root, 1); | 4459 | trans = btrfs_start_transaction(root, 1); |
4460 | |||
4461 | /* | ||
4462 | * setattr is responsible for setting the ordered_data_close flag, | ||
4463 | * but that is only tested during the last file release. That | ||
4464 | * could happen well after the next commit, leaving a great big | ||
4465 | * window where new writes may get lost if someone chooses to write | ||
4466 | * to this file after truncating to zero | ||
4467 | * | ||
4468 | * The inode doesn't have any dirty data here, and so if we commit | ||
4469 | * this is a noop. If someone immediately starts writing to the inode | ||
4470 | * it is very likely we'll catch some of their writes in this | ||
4471 | * transaction, and the commit will find this file on the ordered | ||
4472 | * data list with good things to send down. | ||
4473 | * | ||
4474 | * This is a best effort solution, there is still a window where | ||
4475 | * using truncate to replace the contents of the file will | ||
4476 | * end up with a zero length file after a crash. | ||
4477 | */ | ||
4478 | if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) | ||
4479 | btrfs_add_ordered_operation(trans, root, inode); | ||
4480 | |||
4447 | btrfs_set_trans_block_group(trans, inode); | 4481 | btrfs_set_trans_block_group(trans, inode); |
4448 | btrfs_i_size_write(inode, inode->i_size); | 4482 | btrfs_i_size_write(inode, inode->i_size); |
4449 | 4483 | ||
@@ -4520,12 +4554,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) | |||
4520 | ei->i_acl = BTRFS_ACL_NOT_CACHED; | 4554 | ei->i_acl = BTRFS_ACL_NOT_CACHED; |
4521 | ei->i_default_acl = BTRFS_ACL_NOT_CACHED; | 4555 | ei->i_default_acl = BTRFS_ACL_NOT_CACHED; |
4522 | INIT_LIST_HEAD(&ei->i_orphan); | 4556 | INIT_LIST_HEAD(&ei->i_orphan); |
4557 | INIT_LIST_HEAD(&ei->ordered_operations); | ||
4523 | return &ei->vfs_inode; | 4558 | return &ei->vfs_inode; |
4524 | } | 4559 | } |
4525 | 4560 | ||
4526 | void btrfs_destroy_inode(struct inode *inode) | 4561 | void btrfs_destroy_inode(struct inode *inode) |
4527 | { | 4562 | { |
4528 | struct btrfs_ordered_extent *ordered; | 4563 | struct btrfs_ordered_extent *ordered; |
4564 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
4565 | |||
4529 | WARN_ON(!list_empty(&inode->i_dentry)); | 4566 | WARN_ON(!list_empty(&inode->i_dentry)); |
4530 | WARN_ON(inode->i_data.nrpages); | 4567 | WARN_ON(inode->i_data.nrpages); |
4531 | 4568 | ||
@@ -4536,13 +4573,24 @@ void btrfs_destroy_inode(struct inode *inode) | |||
4536 | BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) | 4573 | BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) |
4537 | posix_acl_release(BTRFS_I(inode)->i_default_acl); | 4574 | posix_acl_release(BTRFS_I(inode)->i_default_acl); |
4538 | 4575 | ||
4539 | spin_lock(&BTRFS_I(inode)->root->list_lock); | 4576 | /* |
4577 | * Make sure we're properly removed from the ordered operation | ||
4578 | * lists. | ||
4579 | */ | ||
4580 | smp_mb(); | ||
4581 | if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { | ||
4582 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
4583 | list_del_init(&BTRFS_I(inode)->ordered_operations); | ||
4584 | spin_unlock(&root->fs_info->ordered_extent_lock); | ||
4585 | } | ||
4586 | |||
4587 | spin_lock(&root->list_lock); | ||
4540 | if (!list_empty(&BTRFS_I(inode)->i_orphan)) { | 4588 | if (!list_empty(&BTRFS_I(inode)->i_orphan)) { |
4541 | printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" | 4589 | printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" |
4542 | " list\n", inode->i_ino); | 4590 | " list\n", inode->i_ino); |
4543 | dump_stack(); | 4591 | dump_stack(); |
4544 | } | 4592 | } |
4545 | spin_unlock(&BTRFS_I(inode)->root->list_lock); | 4593 | spin_unlock(&root->list_lock); |
4546 | 4594 | ||
4547 | while (1) { | 4595 | while (1) { |
4548 | ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); | 4596 | ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); |
@@ -4667,9 +4715,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
4667 | if (ret) | 4715 | if (ret) |
4668 | goto out_unlock; | 4716 | goto out_unlock; |
4669 | 4717 | ||
4718 | /* | ||
4719 | * we're using rename to replace one file with another, | ||
4720 | * and the replacement file is large. Start IO on it now so | ||
4721 | * we don't add too much work to the end of the transaction | ||
4722 | */ | ||
4723 | if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && | ||
4724 | new_inode->i_size && | ||
4725 | old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) | ||
4726 | filemap_flush(old_inode->i_mapping); | ||
4727 | |||
4670 | trans = btrfs_start_transaction(root, 1); | 4728 | trans = btrfs_start_transaction(root, 1); |
4671 | 4729 | ||
4672 | /* | 4730 | /* |
4731 | * make sure the inode gets flushed if it is replacing | ||
4732 | * something. | ||
4733 | */ | ||
4734 | if (new_inode && new_inode->i_size && | ||
4735 | old_inode && S_ISREG(old_inode->i_mode)) { | ||
4736 | btrfs_add_ordered_operation(trans, root, old_inode); | ||
4737 | } | ||
4738 | |||
4739 | /* | ||
4673 | * this is an ugly little race, but the rename is required to make | 4740 | * this is an ugly little race, but the rename is required to make |
4674 | * sure that if we crash, the inode is either at the old name | 4741 | * sure that if we crash, the inode is either at the old name |
4675 | * or the new one. pinning the log transaction lets us make sure | 4742 | * or the new one. pinning the log transaction lets us make sure |
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 77c2411a5f0f..53c87b197d70 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c | |||
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode, | |||
310 | 310 | ||
311 | spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); | 311 | spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); |
312 | list_del_init(&entry->root_extent_list); | 312 | list_del_init(&entry->root_extent_list); |
313 | |||
314 | /* | ||
315 | * we have no more ordered extents for this inode and | ||
316 | * no dirty pages. We can safely remove it from the | ||
317 | * list of ordered operations | ||
318 | */ | ||
319 | if (RB_EMPTY_ROOT(&tree->tree) && | ||
320 | !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { | ||
321 | list_del_init(&BTRFS_I(inode)->ordered_operations); | ||
322 | } | ||
313 | spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); | 323 | spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); |
314 | 324 | ||
315 | mutex_unlock(&tree->mutex); | 325 | mutex_unlock(&tree->mutex); |
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) | |||
370 | } | 380 | } |
371 | 381 | ||
372 | /* | 382 | /* |
383 | * this is used during transaction commit to write all the inodes | ||
384 | * added to the ordered operation list. These files must be fully on | ||
385 | * disk before the transaction commits. | ||
386 | * | ||
387 | * we have two modes here, one is to just start the IO via filemap_flush | ||
388 | * and the other is to wait for all the io. When we wait, we have an | ||
389 | * extra check to make sure the ordered operation list really is empty | ||
390 | * before we return | ||
391 | */ | ||
392 | int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) | ||
393 | { | ||
394 | struct btrfs_inode *btrfs_inode; | ||
395 | struct inode *inode; | ||
396 | struct list_head splice; | ||
397 | |||
398 | INIT_LIST_HEAD(&splice); | ||
399 | |||
400 | mutex_lock(&root->fs_info->ordered_operations_mutex); | ||
401 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
402 | again: | ||
403 | list_splice_init(&root->fs_info->ordered_operations, &splice); | ||
404 | |||
405 | while (!list_empty(&splice)) { | ||
406 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, | ||
407 | ordered_operations); | ||
408 | |||
409 | inode = &btrfs_inode->vfs_inode; | ||
410 | |||
411 | list_del_init(&btrfs_inode->ordered_operations); | ||
412 | |||
413 | /* | ||
414 | * the inode may be getting freed (in sys_unlink path). | ||
415 | */ | ||
416 | inode = igrab(inode); | ||
417 | |||
418 | if (!wait && inode) { | ||
419 | list_add_tail(&BTRFS_I(inode)->ordered_operations, | ||
420 | &root->fs_info->ordered_operations); | ||
421 | } | ||
422 | spin_unlock(&root->fs_info->ordered_extent_lock); | ||
423 | |||
424 | if (inode) { | ||
425 | if (wait) | ||
426 | btrfs_wait_ordered_range(inode, 0, (u64)-1); | ||
427 | else | ||
428 | filemap_flush(inode->i_mapping); | ||
429 | iput(inode); | ||
430 | } | ||
431 | |||
432 | cond_resched(); | ||
433 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
434 | } | ||
435 | if (wait && !list_empty(&root->fs_info->ordered_operations)) | ||
436 | goto again; | ||
437 | |||
438 | spin_unlock(&root->fs_info->ordered_extent_lock); | ||
439 | mutex_unlock(&root->fs_info->ordered_operations_mutex); | ||
440 | |||
441 | return 0; | ||
442 | } | ||
443 | |||
444 | /* | ||
373 | * Used to start IO or wait for a given ordered extent to finish. | 445 | * Used to start IO or wait for a given ordered extent to finish. |
374 | * | 446 | * |
375 | * If wait is one, this effectively waits on page writeback for all the pages | 447 | * If wait is one, this effectively waits on page writeback for all the pages |
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping, | |||
726 | 798 | ||
727 | return ret; | 799 | return ret; |
728 | } | 800 | } |
801 | |||
802 | /* | ||
803 | * add a given inode to the list of inodes that must be fully on | ||
804 | * disk before a transaction commit finishes. | ||
805 | * | ||
806 | * This basically gives us the ext3 style data=ordered mode, and it is mostly | ||
807 | * used to make sure renamed files are fully on disk. | ||
808 | * | ||
809 | * It is a noop if the inode is already fully on disk. | ||
810 | * | ||
811 | * If trans is not null, we'll do a friendly check for a transaction that | ||
812 | * is already flushing things and force the IO down ourselves. | ||
813 | */ | ||
814 | int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, | ||
815 | struct btrfs_root *root, | ||
816 | struct inode *inode) | ||
817 | { | ||
818 | u64 last_mod; | ||
819 | |||
820 | last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); | ||
821 | |||
822 | /* | ||
823 | * if this file hasn't been changed since the last transaction | ||
824 | * commit, we can safely return without doing anything | ||
825 | */ | ||
826 | if (last_mod < root->fs_info->last_trans_committed) | ||
827 | return 0; | ||
828 | |||
829 | /* | ||
830 | * the transaction is already committing. Just start the IO and | ||
831 | * don't bother with all of this list nonsense | ||
832 | */ | ||
833 | if (trans && root->fs_info->running_transaction->blocked) { | ||
834 | btrfs_wait_ordered_range(inode, 0, (u64)-1); | ||
835 | return 0; | ||
836 | } | ||
837 | |||
838 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
839 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) { | ||
840 | list_add_tail(&BTRFS_I(inode)->ordered_operations, | ||
841 | &root->fs_info->ordered_operations); | ||
842 | } | ||
843 | spin_unlock(&root->fs_info->ordered_extent_lock); | ||
844 | |||
845 | return 0; | ||
846 | } | ||
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index ab66d5e8d6d6..3d31c8827b01 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h | |||
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping, | |||
155 | int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, | 155 | int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, |
156 | loff_t end, int sync_mode); | 156 | loff_t end, int sync_mode); |
157 | int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); | 157 | int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); |
158 | int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); | ||
159 | int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, | ||
160 | struct btrfs_root *root, | ||
161 | struct inode *inode); | ||
158 | #endif | 162 | #endif |
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9c8f158dd2db..664782c6a2df 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -975,6 +975,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
975 | int should_grow = 0; | 975 | int should_grow = 0; |
976 | unsigned long now = get_seconds(); | 976 | unsigned long now = get_seconds(); |
977 | 977 | ||
978 | btrfs_run_ordered_operations(root, 0); | ||
979 | |||
978 | /* make a pass through all the delayed refs we have so far | 980 | /* make a pass through all the delayed refs we have so far |
979 | * any runnings procs may add more while we are here | 981 | * any runnings procs may add more while we are here |
980 | */ | 982 | */ |
@@ -1056,6 +1058,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1056 | BUG_ON(ret); | 1058 | BUG_ON(ret); |
1057 | } | 1059 | } |
1058 | 1060 | ||
1061 | /* | ||
1062 | * rename doesn't use btrfs_join_transaction, so, once we | ||
1063 | * set the transaction to blocked above, we aren't going | ||
1064 | * to get any new ordered operations. We can safely run | ||
1065 | * it here and know for sure that nothing new will be added | ||
1066 | * to the list | ||
1067 | */ | ||
1068 | btrfs_run_ordered_operations(root, 1); | ||
1069 | |||
1059 | smp_mb(); | 1070 | smp_mb(); |
1060 | if (cur_trans->num_writers > 1 || should_grow) | 1071 | if (cur_trans->num_writers > 1 || should_grow) |
1061 | schedule_timeout(timeout); | 1072 | schedule_timeout(timeout); |