diff options
| -rw-r--r-- | fs/btrfs/btrfs_inode.h | 18 | ||||
| -rw-r--r-- | fs/btrfs/ctree.h | 35 | ||||
| -rw-r--r-- | fs/btrfs/disk-io.c | 2 | ||||
| -rw-r--r-- | fs/btrfs/file.c | 26 | ||||
| -rw-r--r-- | fs/btrfs/inode.c | 81 | ||||
| -rw-r--r-- | fs/btrfs/ordered-data.c | 118 | ||||
| -rw-r--r-- | fs/btrfs/ordered-data.h | 4 | ||||
| -rw-r--r-- | fs/btrfs/transaction.c | 11 |
8 files changed, 288 insertions, 7 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 3af4cfb5654c..b30986f00b9d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h | |||
| @@ -66,6 +66,12 @@ struct btrfs_inode { | |||
| 66 | */ | 66 | */ |
| 67 | struct list_head delalloc_inodes; | 67 | struct list_head delalloc_inodes; |
| 68 | 68 | ||
| 69 | /* | ||
| 70 | * list for tracking inodes that must be sent to disk before a | ||
| 71 | * rename or truncate commit | ||
| 72 | */ | ||
| 73 | struct list_head ordered_operations; | ||
| 74 | |||
| 69 | /* the space_info for where this inode's data allocations are done */ | 75 | /* the space_info for where this inode's data allocations are done */ |
| 70 | struct btrfs_space_info *space_info; | 76 | struct btrfs_space_info *space_info; |
| 71 | 77 | ||
| @@ -122,6 +128,18 @@ struct btrfs_inode { | |||
| 122 | */ | 128 | */ |
| 123 | u64 last_unlink_trans; | 129 | u64 last_unlink_trans; |
| 124 | 130 | ||
| 131 | /* | ||
| 132 | * ordered_data_close is set by truncate when a file that used | ||
| 133 | * to have good data has been truncated to zero. When it is set | ||
| 134 | * the btrfs file release call will add this inode to the | ||
| 135 | * ordered operations list so that we make sure to flush out any | ||
| 136 | * new data the application may have written before commit. | ||
| 137 | * | ||
| 138 | * yes, it's silly to have a single bitflag, but we might grow more | ||
| 139 | * of these. | ||
| 140 | */ | ||
| 141 | unsigned ordered_data_close:1; | ||
| 142 | |||
| 125 | struct inode vfs_inode; | 143 | struct inode vfs_inode; |
| 126 | }; | 144 | }; |
| 127 | 145 | ||
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2737facbd341..f48905ee5240 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h | |||
| @@ -45,6 +45,13 @@ struct btrfs_ordered_sum; | |||
| 45 | 45 | ||
| 46 | #define BTRFS_MAX_LEVEL 8 | 46 | #define BTRFS_MAX_LEVEL 8 |
| 47 | 47 | ||
| 48 | /* | ||
| 49 | * files bigger than this get some pre-flushing when they are added | ||
| 50 | * to the ordered operations list. That way we limit the total | ||
| 51 | * work done by the commit | ||
| 52 | */ | ||
| 53 | #define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024) | ||
| 54 | |||
| 48 | /* holds pointers to all of the tree roots */ | 55 | /* holds pointers to all of the tree roots */ |
| 49 | #define BTRFS_ROOT_TREE_OBJECTID 1ULL | 56 | #define BTRFS_ROOT_TREE_OBJECTID 1ULL |
| 50 | 57 | ||
| @@ -727,6 +734,15 @@ struct btrfs_fs_info { | |||
| 727 | struct mutex volume_mutex; | 734 | struct mutex volume_mutex; |
| 728 | struct mutex tree_reloc_mutex; | 735 | struct mutex tree_reloc_mutex; |
| 729 | 736 | ||
| 737 | /* | ||
| 738 | * this protects the ordered operations list only while we are | ||
| 739 | * processing all of the entries on it. This way we make | ||
| 740 | * sure the commit code doesn't find the list temporarily empty | ||
| 741 | * because another function happens to be doing non-waiting preflush | ||
| 742 | * before jumping into the main commit. | ||
| 743 | */ | ||
| 744 | struct mutex ordered_operations_mutex; | ||
| 745 | |||
| 730 | struct list_head trans_list; | 746 | struct list_head trans_list; |
| 731 | struct list_head hashers; | 747 | struct list_head hashers; |
| 732 | struct list_head dead_roots; | 748 | struct list_head dead_roots; |
| @@ -741,10 +757,29 @@ struct btrfs_fs_info { | |||
| 741 | * ordered extents | 757 | * ordered extents |
| 742 | */ | 758 | */ |
| 743 | spinlock_t ordered_extent_lock; | 759 | spinlock_t ordered_extent_lock; |
| 760 | |||
| 761 | /* | ||
| 762 | * all of the data=ordered extents pending writeback | ||
| 763 | * these can span multiple transactions and basically include | ||
| 764 | * every dirty data page that isn't from nodatacow | ||
| 765 | */ | ||
| 744 | struct list_head ordered_extents; | 766 | struct list_head ordered_extents; |
| 767 | |||
| 768 | /* | ||
| 769 | * all of the inodes that have delalloc bytes. It is possible for | ||
| 770 | * this list to be empty even when there is still dirty data=ordered | ||
| 771 | * extents waiting to finish IO. | ||
| 772 | */ | ||
| 745 | struct list_head delalloc_inodes; | 773 | struct list_head delalloc_inodes; |
| 746 | 774 | ||
| 747 | /* | 775 | /* |
| 776 | * special rename and truncate targets that must be on disk before | ||
| 777 | * we're allowed to commit. This is basically the ext3 style | ||
| 778 | * data=ordered list. | ||
| 779 | */ | ||
| 780 | struct list_head ordered_operations; | ||
| 781 | |||
| 782 | /* | ||
| 748 | * there is a pool of worker threads for checksumming during writes | 783 | * there is a pool of worker threads for checksumming during writes |
| 749 | * and a pool for checksumming after reads. This is because readers | 784 | * and a pool for checksumming after reads. This is because readers |
| 750 | * can run with FS locks held, and the writers may be waiting for | 785 | * can run with FS locks held, and the writers may be waiting for |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9244cd7313d4..1747dfd18654 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
| @@ -1572,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
| 1572 | INIT_LIST_HEAD(&fs_info->dead_roots); | 1572 | INIT_LIST_HEAD(&fs_info->dead_roots); |
| 1573 | INIT_LIST_HEAD(&fs_info->hashers); | 1573 | INIT_LIST_HEAD(&fs_info->hashers); |
| 1574 | INIT_LIST_HEAD(&fs_info->delalloc_inodes); | 1574 | INIT_LIST_HEAD(&fs_info->delalloc_inodes); |
| 1575 | INIT_LIST_HEAD(&fs_info->ordered_operations); | ||
| 1575 | spin_lock_init(&fs_info->delalloc_lock); | 1576 | spin_lock_init(&fs_info->delalloc_lock); |
| 1576 | spin_lock_init(&fs_info->new_trans_lock); | 1577 | spin_lock_init(&fs_info->new_trans_lock); |
| 1577 | spin_lock_init(&fs_info->ref_cache_lock); | 1578 | spin_lock_init(&fs_info->ref_cache_lock); |
| @@ -1643,6 +1644,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
| 1643 | insert_inode_hash(fs_info->btree_inode); | 1644 | insert_inode_hash(fs_info->btree_inode); |
| 1644 | 1645 | ||
| 1645 | mutex_init(&fs_info->trans_mutex); | 1646 | mutex_init(&fs_info->trans_mutex); |
| 1647 | mutex_init(&fs_info->ordered_operations_mutex); | ||
| 1646 | mutex_init(&fs_info->tree_log_mutex); | 1648 | mutex_init(&fs_info->tree_log_mutex); |
| 1647 | mutex_init(&fs_info->drop_mutex); | 1649 | mutex_init(&fs_info->drop_mutex); |
| 1648 | mutex_init(&fs_info->pinned_mutex); | 1650 | mutex_init(&fs_info->pinned_mutex); |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 32d10a617613..9c9fb46ccd08 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
| @@ -1161,6 +1161,20 @@ out_nolock: | |||
| 1161 | page_cache_release(pinned[1]); | 1161 | page_cache_release(pinned[1]); |
| 1162 | *ppos = pos; | 1162 | *ppos = pos; |
| 1163 | 1163 | ||
| 1164 | /* | ||
| 1165 | * we want to make sure fsync finds this change | ||
| 1166 | * but we haven't joined a transaction running right now. | ||
| 1167 | * | ||
| 1168 | * Later on, someone is sure to update the inode and get the | ||
| 1169 | * real transid recorded. | ||
| 1170 | * | ||
| 1171 | * We set last_trans now to the fs_info generation + 1, | ||
| 1172 | * this will either be one more than the running transaction | ||
| 1173 | * or the generation used for the next transaction if there isn't | ||
| 1174 | * one running right now. | ||
| 1175 | */ | ||
| 1176 | BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; | ||
| 1177 | |||
| 1164 | if (num_written > 0 && will_write) { | 1178 | if (num_written > 0 && will_write) { |
| 1165 | struct btrfs_trans_handle *trans; | 1179 | struct btrfs_trans_handle *trans; |
| 1166 | 1180 | ||
| @@ -1194,6 +1208,18 @@ out_nolock: | |||
| 1194 | 1208 | ||
| 1195 | int btrfs_release_file(struct inode *inode, struct file *filp) | 1209 | int btrfs_release_file(struct inode *inode, struct file *filp) |
| 1196 | { | 1210 | { |
| 1211 | /* | ||
| 1212 | * ordered_data_close is set by setattr when we are about to truncate | ||
| 1213 | * a file from a non-zero size to a zero size. This tries to | ||
| 1214 | * flush down new bytes that may have been written if the | ||
| 1215 | * application were using truncate to replace a file in place. | ||
| 1216 | */ | ||
| 1217 | if (BTRFS_I(inode)->ordered_data_close) { | ||
| 1218 | BTRFS_I(inode)->ordered_data_close = 0; | ||
| 1219 | btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); | ||
| 1220 | if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) | ||
| 1221 | filemap_flush(inode->i_mapping); | ||
| 1222 | } | ||
| 1197 | if (filp->private_data) | 1223 | if (filp->private_data) |
| 1198 | btrfs_ioctl_trans_end(filp); | 1224 | btrfs_ioctl_trans_end(filp); |
| 1199 | return 0; | 1225 | return 0; |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bffd79faffb5..1cff528d5b51 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
| @@ -2907,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 2907 | if (err) | 2907 | if (err) |
| 2908 | return err; | 2908 | return err; |
| 2909 | 2909 | ||
| 2910 | if (S_ISREG(inode->i_mode) && | 2910 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { |
| 2911 | attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { | 2911 | if (attr->ia_size > inode->i_size) { |
| 2912 | err = btrfs_cont_expand(inode, attr->ia_size); | 2912 | err = btrfs_cont_expand(inode, attr->ia_size); |
| 2913 | if (err) | 2913 | if (err) |
| 2914 | return err; | 2914 | return err; |
| 2915 | } else if (inode->i_size > 0 && | ||
| 2916 | attr->ia_size == 0) { | ||
| 2917 | |||
| 2918 | /* we're truncating a file that used to have good | ||
| 2919 | * data down to zero. Make sure it gets into | ||
| 2920 | * the ordered flush list so that any new writes | ||
| 2921 | * get down to disk quickly. | ||
| 2922 | */ | ||
| 2923 | BTRFS_I(inode)->ordered_data_close = 1; | ||
| 2924 | } | ||
| 2915 | } | 2925 | } |
| 2916 | 2926 | ||
| 2917 | err = inode_setattr(inode, attr); | 2927 | err = inode_setattr(inode, attr); |
| @@ -3050,6 +3060,7 @@ static noinline void init_btrfs_i(struct inode *inode) | |||
| 3050 | extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, | 3060 | extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, |
| 3051 | inode->i_mapping, GFP_NOFS); | 3061 | inode->i_mapping, GFP_NOFS); |
| 3052 | INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); | 3062 | INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); |
| 3063 | INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); | ||
| 3053 | btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); | 3064 | btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); |
| 3054 | mutex_init(&BTRFS_I(inode)->extent_mutex); | 3065 | mutex_init(&BTRFS_I(inode)->extent_mutex); |
| 3055 | mutex_init(&BTRFS_I(inode)->log_mutex); | 3066 | mutex_init(&BTRFS_I(inode)->log_mutex); |
| @@ -4419,6 +4430,8 @@ again: | |||
| 4419 | } | 4430 | } |
| 4420 | ClearPageChecked(page); | 4431 | ClearPageChecked(page); |
| 4421 | set_page_dirty(page); | 4432 | set_page_dirty(page); |
| 4433 | |||
| 4434 | BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; | ||
| 4422 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | 4435 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); |
| 4423 | 4436 | ||
| 4424 | out_unlock: | 4437 | out_unlock: |
| @@ -4444,6 +4457,27 @@ static void btrfs_truncate(struct inode *inode) | |||
| 4444 | btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); | 4457 | btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); |
| 4445 | 4458 | ||
| 4446 | trans = btrfs_start_transaction(root, 1); | 4459 | trans = btrfs_start_transaction(root, 1); |
| 4460 | |||
| 4461 | /* | ||
| 4462 | * setattr is responsible for setting the ordered_data_close flag, | ||
| 4463 | * but that is only tested during the last file release. That | ||
| 4464 | * could happen well after the next commit, leaving a great big | ||
| 4465 | * window where new writes may get lost if someone chooses to write | ||
| 4466 | * to this file after truncating to zero | ||
| 4467 | * | ||
| 4468 | * The inode doesn't have any dirty data here, and so if we commit | ||
| 4469 | * this is a noop. If someone immediately starts writing to the inode | ||
| 4470 | * it is very likely we'll catch some of their writes in this | ||
| 4471 | * transaction, and the commit will find this file on the ordered | ||
| 4472 | * data list with good things to send down. | ||
| 4473 | * | ||
| 4474 | * This is a best effort solution, there is still a window where | ||
| 4475 | * using truncate to replace the contents of the file will | ||
| 4476 | * end up with a zero length file after a crash. | ||
| 4477 | */ | ||
| 4478 | if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) | ||
| 4479 | btrfs_add_ordered_operation(trans, root, inode); | ||
| 4480 | |||
| 4447 | btrfs_set_trans_block_group(trans, inode); | 4481 | btrfs_set_trans_block_group(trans, inode); |
| 4448 | btrfs_i_size_write(inode, inode->i_size); | 4482 | btrfs_i_size_write(inode, inode->i_size); |
| 4449 | 4483 | ||
| @@ -4520,12 +4554,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) | |||
| 4520 | ei->i_acl = BTRFS_ACL_NOT_CACHED; | 4554 | ei->i_acl = BTRFS_ACL_NOT_CACHED; |
| 4521 | ei->i_default_acl = BTRFS_ACL_NOT_CACHED; | 4555 | ei->i_default_acl = BTRFS_ACL_NOT_CACHED; |
| 4522 | INIT_LIST_HEAD(&ei->i_orphan); | 4556 | INIT_LIST_HEAD(&ei->i_orphan); |
| 4557 | INIT_LIST_HEAD(&ei->ordered_operations); | ||
| 4523 | return &ei->vfs_inode; | 4558 | return &ei->vfs_inode; |
| 4524 | } | 4559 | } |
| 4525 | 4560 | ||
| 4526 | void btrfs_destroy_inode(struct inode *inode) | 4561 | void btrfs_destroy_inode(struct inode *inode) |
| 4527 | { | 4562 | { |
| 4528 | struct btrfs_ordered_extent *ordered; | 4563 | struct btrfs_ordered_extent *ordered; |
| 4564 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
| 4565 | |||
| 4529 | WARN_ON(!list_empty(&inode->i_dentry)); | 4566 | WARN_ON(!list_empty(&inode->i_dentry)); |
| 4530 | WARN_ON(inode->i_data.nrpages); | 4567 | WARN_ON(inode->i_data.nrpages); |
| 4531 | 4568 | ||
| @@ -4536,13 +4573,24 @@ void btrfs_destroy_inode(struct inode *inode) | |||
| 4536 | BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) | 4573 | BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) |
| 4537 | posix_acl_release(BTRFS_I(inode)->i_default_acl); | 4574 | posix_acl_release(BTRFS_I(inode)->i_default_acl); |
| 4538 | 4575 | ||
| 4539 | spin_lock(&BTRFS_I(inode)->root->list_lock); | 4576 | /* |
| 4577 | * Make sure we're properly removed from the ordered operation | ||
| 4578 | * lists. | ||
| 4579 | */ | ||
| 4580 | smp_mb(); | ||
| 4581 | if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { | ||
| 4582 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
| 4583 | list_del_init(&BTRFS_I(inode)->ordered_operations); | ||
| 4584 | spin_unlock(&root->fs_info->ordered_extent_lock); | ||
| 4585 | } | ||
| 4586 | |||
| 4587 | spin_lock(&root->list_lock); | ||
| 4540 | if (!list_empty(&BTRFS_I(inode)->i_orphan)) { | 4588 | if (!list_empty(&BTRFS_I(inode)->i_orphan)) { |
| 4541 | printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" | 4589 | printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" |
| 4542 | " list\n", inode->i_ino); | 4590 | " list\n", inode->i_ino); |
| 4543 | dump_stack(); | 4591 | dump_stack(); |
| 4544 | } | 4592 | } |
| 4545 | spin_unlock(&BTRFS_I(inode)->root->list_lock); | 4593 | spin_unlock(&root->list_lock); |
| 4546 | 4594 | ||
| 4547 | while (1) { | 4595 | while (1) { |
| 4548 | ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); | 4596 | ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); |
| @@ -4667,9 +4715,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 4667 | if (ret) | 4715 | if (ret) |
| 4668 | goto out_unlock; | 4716 | goto out_unlock; |
| 4669 | 4717 | ||
| 4718 | /* | ||
| 4719 | * we're using rename to replace one file with another, | ||
| 4720 | * and the replacement file is large. Start IO on it now so | ||
| 4721 | * we don't add too much work to the end of the transaction | ||
| 4722 | */ | ||
| 4723 | if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && | ||
| 4724 | new_inode->i_size && | ||
| 4725 | old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) | ||
| 4726 | filemap_flush(old_inode->i_mapping); | ||
| 4727 | |||
| 4670 | trans = btrfs_start_transaction(root, 1); | 4728 | trans = btrfs_start_transaction(root, 1); |
| 4671 | 4729 | ||
| 4672 | /* | 4730 | /* |
| 4731 | * make sure the inode gets flushed if it is replacing | ||
| 4732 | * something. | ||
| 4733 | */ | ||
| 4734 | if (new_inode && new_inode->i_size && | ||
| 4735 | old_inode && S_ISREG(old_inode->i_mode)) { | ||
| 4736 | btrfs_add_ordered_operation(trans, root, old_inode); | ||
| 4737 | } | ||
| 4738 | |||
| 4739 | /* | ||
| 4673 | * this is an ugly little race, but the rename is required to make | 4740 | * this is an ugly little race, but the rename is required to make |
| 4674 | * sure that if we crash, the inode is either at the old name | 4741 | * sure that if we crash, the inode is either at the old name |
| 4675 | * or the new one. pinning the log transaction lets us make sure | 4742 | * or the new one. pinning the log transaction lets us make sure |
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 77c2411a5f0f..53c87b197d70 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c | |||
| @@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode, | |||
| 310 | 310 | ||
| 311 | spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); | 311 | spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); |
| 312 | list_del_init(&entry->root_extent_list); | 312 | list_del_init(&entry->root_extent_list); |
| 313 | |||
| 314 | /* | ||
| 315 | * we have no more ordered extents for this inode and | ||
| 316 | * no dirty pages. We can safely remove it from the | ||
| 317 | * list of ordered operations | ||
| 318 | */ | ||
| 319 | if (RB_EMPTY_ROOT(&tree->tree) && | ||
| 320 | !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { | ||
| 321 | list_del_init(&BTRFS_I(inode)->ordered_operations); | ||
| 322 | } | ||
| 313 | spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); | 323 | spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); |
| 314 | 324 | ||
| 315 | mutex_unlock(&tree->mutex); | 325 | mutex_unlock(&tree->mutex); |
| @@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) | |||
| 370 | } | 380 | } |
| 371 | 381 | ||
| 372 | /* | 382 | /* |
| 383 | * this is used during transaction commit to write all the inodes | ||
| 384 | * added to the ordered operation list. These files must be fully on | ||
| 385 | * disk before the transaction commits. | ||
| 386 | * | ||
| 387 | * we have two modes here, one is to just start the IO via filemap_flush | ||
| 388 | * and the other is to wait for all the io. When we wait, we have an | ||
| 389 | * extra check to make sure the ordered operation list really is empty | ||
| 390 | * before we return | ||
| 391 | */ | ||
| 392 | int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) | ||
| 393 | { | ||
| 394 | struct btrfs_inode *btrfs_inode; | ||
| 395 | struct inode *inode; | ||
| 396 | struct list_head splice; | ||
| 397 | |||
| 398 | INIT_LIST_HEAD(&splice); | ||
| 399 | |||
| 400 | mutex_lock(&root->fs_info->ordered_operations_mutex); | ||
| 401 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
| 402 | again: | ||
| 403 | list_splice_init(&root->fs_info->ordered_operations, &splice); | ||
| 404 | |||
| 405 | while (!list_empty(&splice)) { | ||
| 406 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, | ||
| 407 | ordered_operations); | ||
| 408 | |||
| 409 | inode = &btrfs_inode->vfs_inode; | ||
| 410 | |||
| 411 | list_del_init(&btrfs_inode->ordered_operations); | ||
| 412 | |||
| 413 | /* | ||
| 414 | * the inode may be getting freed (in sys_unlink path). | ||
| 415 | */ | ||
| 416 | inode = igrab(inode); | ||
| 417 | |||
| 418 | if (!wait && inode) { | ||
| 419 | list_add_tail(&BTRFS_I(inode)->ordered_operations, | ||
| 420 | &root->fs_info->ordered_operations); | ||
| 421 | } | ||
| 422 | spin_unlock(&root->fs_info->ordered_extent_lock); | ||
| 423 | |||
| 424 | if (inode) { | ||
| 425 | if (wait) | ||
| 426 | btrfs_wait_ordered_range(inode, 0, (u64)-1); | ||
| 427 | else | ||
| 428 | filemap_flush(inode->i_mapping); | ||
| 429 | iput(inode); | ||
| 430 | } | ||
| 431 | |||
| 432 | cond_resched(); | ||
| 433 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
| 434 | } | ||
| 435 | if (wait && !list_empty(&root->fs_info->ordered_operations)) | ||
| 436 | goto again; | ||
| 437 | |||
| 438 | spin_unlock(&root->fs_info->ordered_extent_lock); | ||
| 439 | mutex_unlock(&root->fs_info->ordered_operations_mutex); | ||
| 440 | |||
| 441 | return 0; | ||
| 442 | } | ||
| 443 | |||
| 444 | /* | ||
| 373 | * Used to start IO or wait for a given ordered extent to finish. | 445 | * Used to start IO or wait for a given ordered extent to finish. |
| 374 | * | 446 | * |
| 375 | * If wait is one, this effectively waits on page writeback for all the pages | 447 | * If wait is one, this effectively waits on page writeback for all the pages |
| @@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping, | |||
| 726 | 798 | ||
| 727 | return ret; | 799 | return ret; |
| 728 | } | 800 | } |
| 801 | |||
| 802 | /* | ||
| 803 | * add a given inode to the list of inodes that must be fully on | ||
| 804 | * disk before a transaction commit finishes. | ||
| 805 | * | ||
| 806 | * This basically gives us the ext3 style data=ordered mode, and it is mostly | ||
| 807 | * used to make sure renamed files are fully on disk. | ||
| 808 | * | ||
| 809 | * It is a noop if the inode is already fully on disk. | ||
| 810 | * | ||
| 811 | * If trans is not null, we'll do a friendly check for a transaction that | ||
| 812 | * is already flushing things and force the IO down ourselves. | ||
| 813 | */ | ||
| 814 | int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, | ||
| 815 | struct btrfs_root *root, | ||
| 816 | struct inode *inode) | ||
| 817 | { | ||
| 818 | u64 last_mod; | ||
| 819 | |||
| 820 | last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); | ||
| 821 | |||
| 822 | /* | ||
| 823 | * if this file hasn't been changed since the last transaction | ||
| 824 | * commit, we can safely return without doing anything | ||
| 825 | */ | ||
| 826 | if (last_mod < root->fs_info->last_trans_committed) | ||
| 827 | return 0; | ||
| 828 | |||
| 829 | /* | ||
| 830 | * the transaction is already committing. Just start the IO and | ||
| 831 | * don't bother with all of this list nonsense | ||
| 832 | */ | ||
| 833 | if (trans && root->fs_info->running_transaction->blocked) { | ||
| 834 | btrfs_wait_ordered_range(inode, 0, (u64)-1); | ||
| 835 | return 0; | ||
| 836 | } | ||
| 837 | |||
| 838 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
| 839 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) { | ||
| 840 | list_add_tail(&BTRFS_I(inode)->ordered_operations, | ||
| 841 | &root->fs_info->ordered_operations); | ||
| 842 | } | ||
| 843 | spin_unlock(&root->fs_info->ordered_extent_lock); | ||
| 844 | |||
| 845 | return 0; | ||
| 846 | } | ||
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index ab66d5e8d6d6..3d31c8827b01 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h | |||
| @@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping, | |||
| 155 | int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, | 155 | int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, |
| 156 | loff_t end, int sync_mode); | 156 | loff_t end, int sync_mode); |
| 157 | int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); | 157 | int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); |
| 158 | int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); | ||
| 159 | int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, | ||
| 160 | struct btrfs_root *root, | ||
| 161 | struct inode *inode); | ||
| 158 | #endif | 162 | #endif |
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9c8f158dd2db..664782c6a2df 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
| @@ -975,6 +975,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 975 | int should_grow = 0; | 975 | int should_grow = 0; |
| 976 | unsigned long now = get_seconds(); | 976 | unsigned long now = get_seconds(); |
| 977 | 977 | ||
| 978 | btrfs_run_ordered_operations(root, 0); | ||
| 979 | |||
| 978 | /* make a pass through all the delayed refs we have so far | 980 | /* make a pass through all the delayed refs we have so far |
| 979 | * any runnings procs may add more while we are here | 981 | * any runnings procs may add more while we are here |
| 980 | */ | 982 | */ |
| @@ -1056,6 +1058,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 1056 | BUG_ON(ret); | 1058 | BUG_ON(ret); |
| 1057 | } | 1059 | } |
| 1058 | 1060 | ||
| 1061 | /* | ||
| 1062 | * rename doesn't use btrfs_join_transaction, so, once we | ||
| 1063 | * set the transaction to blocked above, we aren't going | ||
| 1064 | * to get any new ordered operations. We can safely run | ||
| 1065 | * it here and no for sure that nothing new will be added | ||
| 1066 | * to the list | ||
| 1067 | */ | ||
| 1068 | btrfs_run_ordered_operations(root, 1); | ||
| 1069 | |||
| 1059 | smp_mb(); | 1070 | smp_mb(); |
| 1060 | if (cur_trans->num_writers > 1 || should_grow) | 1071 | if (cur_trans->num_writers > 1 || should_grow) |
| 1061 | schedule_timeout(timeout); | 1072 | schedule_timeout(timeout); |
