aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJosef Bacik <jbacik@fb.com>2015-09-24 16:17:39 -0400
committerChris Mason <clm@fb.com>2015-10-21 21:51:40 -0400
commit161c3549b45aeef05451b6822d8aaaf39c7bedce (patch)
treef14c534323f7b8d6bf181c9610df66313358360b
parenta408365c62762c30419018587cffd2b89836434e (diff)
Btrfs: change how we wait for pending ordered extents
We have a mechanism to make sure we don't lose updates for ordered extents that were logged in the transaction that is currently running. We add the ordered extent to a transaction list and then the transaction waits on all the ordered extents in that list. However, on substantially large file systems this list can be extremely large, and can give us soft lockups, since the ordered extents don't remove themselves from the list when they do complete. To fix this we simply add a counter to the transaction that is incremented any time we have a logged extent that needs to be completed in the current transaction. Then when the ordered extent finally completes it decrements the per transaction counter and wakes up the transaction if we are the last ones. This will eliminate the softlockup. Thanks, Signed-off-by: Josef Bacik <jbacik@fb.com> Signed-off-by: Chris Mason <clm@fb.com>
-rw-r--r--fs/btrfs/disk-io.c20
-rw-r--r--fs/btrfs/ordered-data.c64
-rw-r--r--fs/btrfs/ordered-data.h2
-rw-r--r--fs/btrfs/transaction.c34
-rw-r--r--fs/btrfs/transaction.h4
5 files changed, 60 insertions, 64 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bcbb596d9695..dceabb13ddc4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4326,25 +4326,6 @@ again:
4326 return 0; 4326 return 0;
4327} 4327}
4328 4328
4329static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
4330 struct btrfs_fs_info *fs_info)
4331{
4332 struct btrfs_ordered_extent *ordered;
4333
4334 spin_lock(&fs_info->trans_lock);
4335 while (!list_empty(&cur_trans->pending_ordered)) {
4336 ordered = list_first_entry(&cur_trans->pending_ordered,
4337 struct btrfs_ordered_extent,
4338 trans_list);
4339 list_del_init(&ordered->trans_list);
4340 spin_unlock(&fs_info->trans_lock);
4341
4342 btrfs_put_ordered_extent(ordered);
4343 spin_lock(&fs_info->trans_lock);
4344 }
4345 spin_unlock(&fs_info->trans_lock);
4346}
4347
4348void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, 4329void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4349 struct btrfs_root *root) 4330 struct btrfs_root *root)
4350{ 4331{
@@ -4356,7 +4337,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4356 cur_trans->state = TRANS_STATE_UNBLOCKED; 4337 cur_trans->state = TRANS_STATE_UNBLOCKED;
4357 wake_up(&root->fs_info->transaction_wait); 4338 wake_up(&root->fs_info->transaction_wait);
4358 4339
4359 btrfs_free_pending_ordered(cur_trans, root->fs_info);
4360 btrfs_destroy_delayed_inodes(root); 4340 btrfs_destroy_delayed_inodes(root);
4361 btrfs_assert_delayed_root_empty(root); 4341 btrfs_assert_delayed_root_empty(root);
4362 4342
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 071005f008c1..8c27292ea9ea 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -490,15 +490,16 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
490 490
491 spin_lock_irq(&log->log_extents_lock[index]); 491 spin_lock_irq(&log->log_extents_lock[index]);
492 while (!list_empty(&log->logged_list[index])) { 492 while (!list_empty(&log->logged_list[index])) {
493 struct inode *inode;
493 ordered = list_first_entry(&log->logged_list[index], 494 ordered = list_first_entry(&log->logged_list[index],
494 struct btrfs_ordered_extent, 495 struct btrfs_ordered_extent,
495 log_list); 496 log_list);
496 list_del_init(&ordered->log_list); 497 list_del_init(&ordered->log_list);
498 inode = ordered->inode;
497 spin_unlock_irq(&log->log_extents_lock[index]); 499 spin_unlock_irq(&log->log_extents_lock[index]);
498 500
499 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 501 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
500 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 502 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
501 struct inode *inode = ordered->inode;
502 u64 start = ordered->file_offset; 503 u64 start = ordered->file_offset;
503 u64 end = ordered->file_offset + ordered->len - 1; 504 u64 end = ordered->file_offset + ordered->len - 1;
504 505
@@ -509,20 +510,25 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
509 &ordered->flags)); 510 &ordered->flags));
510 511
511 /* 512 /*
512 * If our ordered extent completed it means it updated the 513 * In order to keep us from losing our ordered extent
513 * fs/subvol and csum trees already, so no need to make the 514 * information when committing the transaction we have to make
514 * current transaction's commit wait for it, as we end up 515 * sure that any logged extents are completed when we go to
515 * holding memory unnecessarily and delaying the inode's iput 516 * commit the transaction. To do this we simply increase the
516 * until the transaction commit (we schedule an iput for the 517 * current transactions pending_ordered counter and decrement it
517 * inode when the ordered extent's refcount drops to 0), which 518 * when the ordered extent completes.
518 * prevents it from being evictable until the transaction
519 * commits.
520 */ 519 */
521 if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) 520 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
522 btrfs_put_ordered_extent(ordered); 521 struct btrfs_ordered_inode_tree *tree;
523 else 522
524 list_add_tail(&ordered->trans_list, &trans->ordered); 523 tree = &BTRFS_I(inode)->ordered_tree;
525 524 spin_lock_irq(&tree->lock);
525 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
526 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
527 atomic_inc(&trans->transaction->pending_ordered);
528 }
529 spin_unlock_irq(&tree->lock);
530 }
531 btrfs_put_ordered_extent(ordered);
526 spin_lock_irq(&log->log_extents_lock[index]); 532 spin_lock_irq(&log->log_extents_lock[index]);
527 } 533 }
528 spin_unlock_irq(&log->log_extents_lock[index]); 534 spin_unlock_irq(&log->log_extents_lock[index]);
@@ -584,6 +590,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
584 struct btrfs_ordered_inode_tree *tree; 590 struct btrfs_ordered_inode_tree *tree;
585 struct btrfs_root *root = BTRFS_I(inode)->root; 591 struct btrfs_root *root = BTRFS_I(inode)->root;
586 struct rb_node *node; 592 struct rb_node *node;
593 bool dec_pending_ordered = false;
587 594
588 tree = &BTRFS_I(inode)->ordered_tree; 595 tree = &BTRFS_I(inode)->ordered_tree;
589 spin_lock_irq(&tree->lock); 596 spin_lock_irq(&tree->lock);
@@ -593,8 +600,37 @@ void btrfs_remove_ordered_extent(struct inode *inode,
593 if (tree->last == node) 600 if (tree->last == node)
594 tree->last = NULL; 601 tree->last = NULL;
595 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 602 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
603 if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags))
604 dec_pending_ordered = true;
596 spin_unlock_irq(&tree->lock); 605 spin_unlock_irq(&tree->lock);
597 606
607 /*
608 * The current running transaction is waiting on us, we need to let it
609 * know that we're complete and wake it up.
610 */
611 if (dec_pending_ordered) {
612 struct btrfs_transaction *trans;
613
614 /*
615 * The checks for trans are just a formality, it should be set,
616 * but if it isn't we don't want to deref/assert under the spin
617 * lock, so be nice and check if trans is set, but ASSERT() so
618 * if it isn't set a developer will notice.
619 */
620 spin_lock(&root->fs_info->trans_lock);
621 trans = root->fs_info->running_transaction;
622 if (trans)
623 atomic_inc(&trans->use_count);
624 spin_unlock(&root->fs_info->trans_lock);
625
626 ASSERT(trans);
627 if (trans) {
628 if (atomic_dec_and_test(&trans->pending_ordered))
629 wake_up(&trans->pending_wait);
630 btrfs_put_transaction(trans);
631 }
632 }
633
598 spin_lock(&root->ordered_extent_lock); 634 spin_lock(&root->ordered_extent_lock);
599 list_del_init(&entry->root_extent_list); 635 list_del_init(&entry->root_extent_list);
600 root->nr_ordered_extents--; 636 root->nr_ordered_extents--;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 7176cc0fe43f..23c96059cef2 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -73,6 +73,8 @@ struct btrfs_ordered_sum {
73 73
74#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent 74#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
75 * in the logging code. */ 75 * in the logging code. */
76#define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to
77 * complete in the current transaction. */
76struct btrfs_ordered_extent { 78struct btrfs_ordered_extent {
77 /* logical offset in the file */ 79 /* logical offset in the file */
78 u64 file_offset; 80 u64 file_offset;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9354e7a1247f..68a56c3cc555 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -232,6 +232,7 @@ loop:
232 extwriter_counter_init(cur_trans, type); 232 extwriter_counter_init(cur_trans, type);
233 init_waitqueue_head(&cur_trans->writer_wait); 233 init_waitqueue_head(&cur_trans->writer_wait);
234 init_waitqueue_head(&cur_trans->commit_wait); 234 init_waitqueue_head(&cur_trans->commit_wait);
235 init_waitqueue_head(&cur_trans->pending_wait);
235 cur_trans->state = TRANS_STATE_RUNNING; 236 cur_trans->state = TRANS_STATE_RUNNING;
236 /* 237 /*
237 * One for this trans handle, one so it will live on until we 238 * One for this trans handle, one so it will live on until we
@@ -239,6 +240,7 @@ loop:
239 */ 240 */
240 atomic_set(&cur_trans->use_count, 2); 241 atomic_set(&cur_trans->use_count, 2);
241 cur_trans->have_free_bgs = 0; 242 cur_trans->have_free_bgs = 0;
243 atomic_set(&cur_trans->pending_ordered, 0);
242 cur_trans->start_time = get_seconds(); 244 cur_trans->start_time = get_seconds();
243 cur_trans->dirty_bg_run = 0; 245 cur_trans->dirty_bg_run = 0;
244 246
@@ -266,7 +268,6 @@ loop:
266 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 268 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
267 INIT_LIST_HEAD(&cur_trans->pending_chunks); 269 INIT_LIST_HEAD(&cur_trans->pending_chunks);
268 INIT_LIST_HEAD(&cur_trans->switch_commits); 270 INIT_LIST_HEAD(&cur_trans->switch_commits);
269 INIT_LIST_HEAD(&cur_trans->pending_ordered);
270 INIT_LIST_HEAD(&cur_trans->dirty_bgs); 271 INIT_LIST_HEAD(&cur_trans->dirty_bgs);
271 INIT_LIST_HEAD(&cur_trans->io_bgs); 272 INIT_LIST_HEAD(&cur_trans->io_bgs);
272 INIT_LIST_HEAD(&cur_trans->dropped_roots); 273 INIT_LIST_HEAD(&cur_trans->dropped_roots);
@@ -551,7 +552,6 @@ again:
551 h->can_flush_pending_bgs = true; 552 h->can_flush_pending_bgs = true;
552 INIT_LIST_HEAD(&h->qgroup_ref_list); 553 INIT_LIST_HEAD(&h->qgroup_ref_list);
553 INIT_LIST_HEAD(&h->new_bgs); 554 INIT_LIST_HEAD(&h->new_bgs);
554 INIT_LIST_HEAD(&h->ordered);
555 555
556 smp_mb(); 556 smp_mb();
557 if (cur_trans->state >= TRANS_STATE_BLOCKED && 557 if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -784,12 +784,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
784 if (!list_empty(&trans->new_bgs)) 784 if (!list_empty(&trans->new_bgs))
785 btrfs_create_pending_block_groups(trans, root); 785 btrfs_create_pending_block_groups(trans, root);
786 786
787 if (!list_empty(&trans->ordered)) {
788 spin_lock(&info->trans_lock);
789 list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
790 spin_unlock(&info->trans_lock);
791 }
792
793 trans->delayed_ref_updates = 0; 787 trans->delayed_ref_updates = 0;
794 if (!trans->sync) { 788 if (!trans->sync) {
795 must_run_delayed_refs = 789 must_run_delayed_refs =
@@ -1788,25 +1782,10 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
1788} 1782}
1789 1783
1790static inline void 1784static inline void
1791btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans, 1785btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
1792 struct btrfs_fs_info *fs_info)
1793{ 1786{
1794 struct btrfs_ordered_extent *ordered; 1787 wait_event(cur_trans->pending_wait,
1795 1788 atomic_read(&cur_trans->pending_ordered) == 0);
1796 spin_lock(&fs_info->trans_lock);
1797 while (!list_empty(&cur_trans->pending_ordered)) {
1798 ordered = list_first_entry(&cur_trans->pending_ordered,
1799 struct btrfs_ordered_extent,
1800 trans_list);
1801 list_del_init(&ordered->trans_list);
1802 spin_unlock(&fs_info->trans_lock);
1803
1804 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
1805 &ordered->flags));
1806 btrfs_put_ordered_extent(ordered);
1807 spin_lock(&fs_info->trans_lock);
1808 }
1809 spin_unlock(&fs_info->trans_lock);
1810} 1789}
1811 1790
1812int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1791int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -1890,7 +1869,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1890 } 1869 }
1891 1870
1892 spin_lock(&root->fs_info->trans_lock); 1871 spin_lock(&root->fs_info->trans_lock);
1893 list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
1894 if (cur_trans->state >= TRANS_STATE_COMMIT_START) { 1872 if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
1895 spin_unlock(&root->fs_info->trans_lock); 1873 spin_unlock(&root->fs_info->trans_lock);
1896 atomic_inc(&cur_trans->use_count); 1874 atomic_inc(&cur_trans->use_count);
@@ -1949,7 +1927,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1949 1927
1950 btrfs_wait_delalloc_flush(root->fs_info); 1928 btrfs_wait_delalloc_flush(root->fs_info);
1951 1929
1952 btrfs_wait_pending_ordered(cur_trans, root->fs_info); 1930 btrfs_wait_pending_ordered(cur_trans);
1953 1931
1954 btrfs_scrub_pause(root); 1932 btrfs_scrub_pause(root);
1955 /* 1933 /*
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index db6bfd92f0ea..bf7b1ddf5993 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -46,6 +46,7 @@ struct btrfs_transaction {
46 */ 46 */
47 atomic_t num_writers; 47 atomic_t num_writers;
48 atomic_t use_count; 48 atomic_t use_count;
49 atomic_t pending_ordered;
49 50
50 /* 51 /*
51 * true if there is free bgs operations in this transaction 52 * true if there is free bgs operations in this transaction
@@ -59,9 +60,9 @@ struct btrfs_transaction {
59 unsigned long start_time; 60 unsigned long start_time;
60 wait_queue_head_t writer_wait; 61 wait_queue_head_t writer_wait;
61 wait_queue_head_t commit_wait; 62 wait_queue_head_t commit_wait;
63 wait_queue_head_t pending_wait;
62 struct list_head pending_snapshots; 64 struct list_head pending_snapshots;
63 struct list_head pending_chunks; 65 struct list_head pending_chunks;
64 struct list_head pending_ordered;
65 struct list_head switch_commits; 66 struct list_head switch_commits;
66 struct list_head dirty_bgs; 67 struct list_head dirty_bgs;
67 struct list_head io_bgs; 68 struct list_head io_bgs;
@@ -129,7 +130,6 @@ struct btrfs_trans_handle {
129 */ 130 */
130 struct btrfs_root *root; 131 struct btrfs_root *root;
131 struct seq_list delayed_ref_elem; 132 struct seq_list delayed_ref_elem;
132 struct list_head ordered;
133 struct list_head qgroup_ref_list; 133 struct list_head qgroup_ref_list;
134 struct list_head new_bgs; 134 struct list_head new_bgs;
135}; 135};