diff options
author | Chris Mason <chris.mason@oracle.com> | 2009-03-12 20:12:45 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@oracle.com> | 2009-03-24 16:14:26 -0400 |
commit | b7ec40d7845bffca8bb3af2ea3f192d6257bbe21 (patch) | |
tree | 65b833b979417d36f0fd26d647573de1df0646b9 /fs/btrfs | |
parent | c3e69d58e86c3917ae4e9e31b4acf490a7cafe60 (diff) |
Btrfs: reduce stalls during transaction commit
To avoid deadlocks and reduce latencies during some critical operations, some
transaction writers are allowed to jump into the running transaction and make
it run a little longer, while others sit around and wait for the commit to
finish.
This is a bit unfair, especially when the callers that jump in do a bunch
of IO that makes all the other procs on the box wait. This commit
reduces the stalls this produces by pre-reading file extent pointers
during btrfs_finish_ordered_io before the transaction is joined.
It also tunes the drop_snapshot code to politely wait for transactions
that have started writing out their delayed refs to finish. This avoids
new delayed refs being flooded into the queue while we're trying to
close off the transaction.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/extent-tree.c | 3 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 18 | ||||
-rw-r--r-- | fs/btrfs/transaction.c | 62 | ||||
-rw-r--r-- | fs/btrfs/transaction.h | 5 |
4 files changed, 80 insertions, 8 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3b8b6c212701..a421c32c6cfe 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -3797,7 +3797,8 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
3797 | break; | 3797 | break; |
3798 | if (wret < 0) | 3798 | if (wret < 0) |
3799 | ret = wret; | 3799 | ret = wret; |
3800 | if (trans->transaction->in_commit) { | 3800 | if (trans->transaction->in_commit || |
3801 | trans->transaction->delayed_refs.flushing) { | ||
3801 | ret = -EAGAIN; | 3802 | ret = -EAGAIN; |
3802 | break; | 3803 | break; |
3803 | } | 3804 | } |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d4f948bc22a..13a17477c4f4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -1502,6 +1502,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) | |||
1502 | struct btrfs_trans_handle *trans; | 1502 | struct btrfs_trans_handle *trans; |
1503 | struct btrfs_ordered_extent *ordered_extent; | 1503 | struct btrfs_ordered_extent *ordered_extent; |
1504 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | 1504 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; |
1505 | struct btrfs_path *path; | ||
1505 | int compressed = 0; | 1506 | int compressed = 0; |
1506 | int ret; | 1507 | int ret; |
1507 | 1508 | ||
@@ -1509,6 +1510,23 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) | |||
1509 | if (!ret) | 1510 | if (!ret) |
1510 | return 0; | 1511 | return 0; |
1511 | 1512 | ||
1513 | /* | ||
1514 | * before we join the transaction, try to do some of our IO. | ||
1515 | * This will limit the amount of IO that we have to do with | ||
1516 | * the transaction running. We're unlikely to need to do any | ||
1517 | * IO if the file extents are new, the disk_i_size checks | ||
1518 | * covers the most common case. | ||
1519 | */ | ||
1520 | if (start < BTRFS_I(inode)->disk_i_size) { | ||
1521 | path = btrfs_alloc_path(); | ||
1522 | if (path) { | ||
1523 | ret = btrfs_lookup_file_extent(NULL, root, path, | ||
1524 | inode->i_ino, | ||
1525 | start, 0); | ||
1526 | btrfs_free_path(path); | ||
1527 | } | ||
1528 | } | ||
1529 | |||
1512 | trans = btrfs_join_transaction(root, 1); | 1530 | trans = btrfs_join_transaction(root, 1); |
1513 | 1531 | ||
1514 | ordered_extent = btrfs_lookup_ordered_extent(inode, start); | 1532 | ordered_extent = btrfs_lookup_ordered_extent(inode, start); |
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 903edab3659a..01c9620bb001 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -192,6 +192,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, | |||
192 | h->alloc_exclude_nr = 0; | 192 | h->alloc_exclude_nr = 0; |
193 | h->alloc_exclude_start = 0; | 193 | h->alloc_exclude_start = 0; |
194 | h->delayed_ref_updates = 0; | 194 | h->delayed_ref_updates = 0; |
195 | |||
195 | root->fs_info->running_transaction->use_count++; | 196 | root->fs_info->running_transaction->use_count++; |
196 | mutex_unlock(&root->fs_info->trans_mutex); | 197 | mutex_unlock(&root->fs_info->trans_mutex); |
197 | return h; | 198 | return h; |
@@ -281,7 +282,6 @@ void btrfs_throttle(struct btrfs_root *root) | |||
281 | if (!root->fs_info->open_ioctl_trans) | 282 | if (!root->fs_info->open_ioctl_trans) |
282 | wait_current_trans(root); | 283 | wait_current_trans(root); |
283 | mutex_unlock(&root->fs_info->trans_mutex); | 284 | mutex_unlock(&root->fs_info->trans_mutex); |
284 | |||
285 | throttle_on_drops(root); | 285 | throttle_on_drops(root); |
286 | } | 286 | } |
287 | 287 | ||
@@ -298,6 +298,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
298 | if (cur && | 298 | if (cur && |
299 | trans->transaction->delayed_refs.num_heads_ready > 64) { | 299 | trans->transaction->delayed_refs.num_heads_ready > 64) { |
300 | trans->delayed_ref_updates = 0; | 300 | trans->delayed_ref_updates = 0; |
301 | |||
302 | /* | ||
303 | * do a full flush if the transaction is trying | ||
304 | * to close | ||
305 | */ | ||
306 | if (trans->transaction->delayed_refs.flushing) | ||
307 | cur = 0; | ||
301 | btrfs_run_delayed_refs(trans, root, cur); | 308 | btrfs_run_delayed_refs(trans, root, cur); |
302 | } else { | 309 | } else { |
303 | break; | 310 | break; |
@@ -666,6 +673,31 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) | |||
666 | } | 673 | } |
667 | 674 | ||
668 | /* | 675 | /* |
676 | * when dropping snapshots, we generate a ton of delayed refs, and it makes | ||
677 | * sense not to join the transaction while it is trying to flush the current | ||
678 | * queue of delayed refs out. | ||
679 | * | ||
680 | * This is used by the drop snapshot code only | ||
681 | */ | ||
682 | static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) | ||
683 | { | ||
684 | DEFINE_WAIT(wait); | ||
685 | |||
686 | mutex_lock(&info->trans_mutex); | ||
687 | while (info->running_transaction && | ||
688 | info->running_transaction->delayed_refs.flushing) { | ||
689 | prepare_to_wait(&info->transaction_wait, &wait, | ||
690 | TASK_UNINTERRUPTIBLE); | ||
691 | mutex_unlock(&info->trans_mutex); | ||
692 | schedule(); | ||
693 | mutex_lock(&info->trans_mutex); | ||
694 | finish_wait(&info->transaction_wait, &wait); | ||
695 | } | ||
696 | mutex_unlock(&info->trans_mutex); | ||
697 | return 0; | ||
698 | } | ||
699 | |||
700 | /* | ||
669 | * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on | 701 | * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on |
670 | * all of them | 702 | * all of them |
671 | */ | 703 | */ |
@@ -692,7 +724,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, | |||
692 | atomic_inc(&root->fs_info->throttles); | 724 | atomic_inc(&root->fs_info->throttles); |
693 | 725 | ||
694 | while (1) { | 726 | while (1) { |
727 | /* | ||
728 | * we don't want to jump in and create a bunch of | ||
729 | * delayed refs if the transaction is starting to close | ||
730 | */ | ||
731 | wait_transaction_pre_flush(tree_root->fs_info); | ||
695 | trans = btrfs_start_transaction(tree_root, 1); | 732 | trans = btrfs_start_transaction(tree_root, 1); |
733 | |||
734 | /* | ||
735 | * we've joined a transaction, make sure it isn't | ||
736 | * closing right now | ||
737 | */ | ||
738 | if (trans->transaction->delayed_refs.flushing) { | ||
739 | btrfs_end_transaction(trans, tree_root); | ||
740 | continue; | ||
741 | } | ||
742 | |||
696 | mutex_lock(&root->fs_info->drop_mutex); | 743 | mutex_lock(&root->fs_info->drop_mutex); |
697 | ret = btrfs_drop_snapshot(trans, dirty->root); | 744 | ret = btrfs_drop_snapshot(trans, dirty->root); |
698 | if (ret != -EAGAIN) | 745 | if (ret != -EAGAIN) |
@@ -932,20 +979,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
932 | ret = btrfs_run_delayed_refs(trans, root, 0); | 979 | ret = btrfs_run_delayed_refs(trans, root, 0); |
933 | BUG_ON(ret); | 980 | BUG_ON(ret); |
934 | 981 | ||
982 | cur_trans = trans->transaction; | ||
935 | /* | 983 | /* |
936 | * set the flushing flag so procs in this transaction have to | 984 | * set the flushing flag so procs in this transaction have to |
937 | * start sending their work down. | 985 | * start sending their work down. |
938 | */ | 986 | */ |
939 | trans->transaction->delayed_refs.flushing = 1; | 987 | cur_trans->delayed_refs.flushing = 1; |
940 | 988 | ||
941 | ret = btrfs_run_delayed_refs(trans, root, 0); | 989 | ret = btrfs_run_delayed_refs(trans, root, 0); |
942 | BUG_ON(ret); | 990 | BUG_ON(ret); |
943 | 991 | ||
944 | INIT_LIST_HEAD(&dirty_fs_roots); | ||
945 | mutex_lock(&root->fs_info->trans_mutex); | 992 | mutex_lock(&root->fs_info->trans_mutex); |
946 | if (trans->transaction->in_commit) { | 993 | INIT_LIST_HEAD(&dirty_fs_roots); |
947 | cur_trans = trans->transaction; | 994 | if (cur_trans->in_commit) { |
948 | trans->transaction->use_count++; | 995 | cur_trans->use_count++; |
949 | mutex_unlock(&root->fs_info->trans_mutex); | 996 | mutex_unlock(&root->fs_info->trans_mutex); |
950 | btrfs_end_transaction(trans, root); | 997 | btrfs_end_transaction(trans, root); |
951 | 998 | ||
@@ -968,7 +1015,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
968 | 1015 | ||
969 | trans->transaction->in_commit = 1; | 1016 | trans->transaction->in_commit = 1; |
970 | trans->transaction->blocked = 1; | 1017 | trans->transaction->blocked = 1; |
971 | cur_trans = trans->transaction; | ||
972 | if (cur_trans->list.prev != &root->fs_info->trans_list) { | 1018 | if (cur_trans->list.prev != &root->fs_info->trans_list) { |
973 | prev_trans = list_entry(cur_trans->list.prev, | 1019 | prev_trans = list_entry(cur_trans->list.prev, |
974 | struct btrfs_transaction, list); | 1020 | struct btrfs_transaction, list); |
@@ -1081,6 +1127,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1081 | btrfs_copy_pinned(root, pinned_copy); | 1127 | btrfs_copy_pinned(root, pinned_copy); |
1082 | 1128 | ||
1083 | trans->transaction->blocked = 0; | 1129 | trans->transaction->blocked = 0; |
1130 | |||
1084 | wake_up(&root->fs_info->transaction_throttle); | 1131 | wake_up(&root->fs_info->transaction_throttle); |
1085 | wake_up(&root->fs_info->transaction_wait); | 1132 | wake_up(&root->fs_info->transaction_wait); |
1086 | 1133 | ||
@@ -1107,6 +1154,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1107 | mutex_lock(&root->fs_info->trans_mutex); | 1154 | mutex_lock(&root->fs_info->trans_mutex); |
1108 | 1155 | ||
1109 | cur_trans->commit_done = 1; | 1156 | cur_trans->commit_done = 1; |
1157 | |||
1110 | root->fs_info->last_trans_committed = cur_trans->transid; | 1158 | root->fs_info->last_trans_committed = cur_trans->transid; |
1111 | wake_up(&cur_trans->commit_wait); | 1159 | wake_up(&cur_trans->commit_wait); |
1112 | 1160 | ||
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 94876709217f..94f5bde2b58d 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h | |||
@@ -23,7 +23,12 @@ | |||
23 | 23 | ||
24 | struct btrfs_transaction { | 24 | struct btrfs_transaction { |
25 | u64 transid; | 25 | u64 transid; |
26 | /* | ||
27 | * total writers in this transaction, it must be zero before the | ||
28 | * transaction can end | ||
29 | */ | ||
26 | unsigned long num_writers; | 30 | unsigned long num_writers; |
31 | |||
27 | unsigned long num_joined; | 32 | unsigned long num_joined; |
28 | int in_commit; | 33 | int in_commit; |
29 | int use_count; | 34 | int use_count; |