diff options
author | Jan Kara <jack@suse.cz> | 2013-06-04 12:08:56 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2013-06-04 12:08:56 -0400 |
commit | b34090e5e22a02fba0e4473056cce9420ad9dd0b (patch) | |
tree | 7ffb9ecd10ada2aefe9079c2df91405592132e47 /fs/jbd2 | |
parent | e5a120aeb57f40ae568a5ca1dd6ace53d0213582 (diff) |
jbd2: refine waiting for shadow buffers
Currently, when we add a buffer to a transaction, we wait until the
buffer is removed from the BJ_Shadow list (so that we prevent any changes
to a buffer that is just being written to the journal). This can take
unnecessarily long, as a lot happens between the time the buffer is
submitted to the journal and the time when we remove the buffer from
the BJ_Shadow list (e.g. we wait for all data buffers in the
transaction, we issue a cache flush, etc.). Also, this creates a
dependency of do_get_write_access() on transaction commit (namely
waiting for data IO to complete), which we want to avoid when
implementing transaction reservation.
So we modify the commit code to set a new BH_Shadow flag when the
temporary shadowing buffer is created, and we clear that flag once IO on
that buffer is complete. This allows do_get_write_access() to wait only
for the BH_Shadow bit and thus removes the dependency on data IO
completion.
Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/jbd2')
-rw-r--r-- | fs/jbd2/commit.c | 18 | ||||
-rw-r--r-- | fs/jbd2/journal.c | 2 | ||||
-rw-r--r-- | fs/jbd2/transaction.c | 44 |
3 files changed, 30 insertions, 34 deletions
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7c6f7eea2316..d73a0d808ec1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -30,15 +30,22 @@ | |||
30 | #include <trace/events/jbd2.h> | 30 | #include <trace/events/jbd2.h> |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * Default IO end handler for temporary BJ_IO buffer_heads. | 33 | * IO end handler for temporary buffer_heads handling writes to the journal. |
34 | */ | 34 | */ |
35 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | 35 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) |
36 | { | 36 | { |
37 | struct buffer_head *orig_bh = bh->b_private; | ||
38 | |||
37 | BUFFER_TRACE(bh, ""); | 39 | BUFFER_TRACE(bh, ""); |
38 | if (uptodate) | 40 | if (uptodate) |
39 | set_buffer_uptodate(bh); | 41 | set_buffer_uptodate(bh); |
40 | else | 42 | else |
41 | clear_buffer_uptodate(bh); | 43 | clear_buffer_uptodate(bh); |
44 | if (orig_bh) { | ||
45 | clear_bit_unlock(BH_Shadow, &orig_bh->b_state); | ||
46 | smp_mb__after_clear_bit(); | ||
47 | wake_up_bit(&orig_bh->b_state, BH_Shadow); | ||
48 | } | ||
42 | unlock_buffer(bh); | 49 | unlock_buffer(bh); |
43 | } | 50 | } |
44 | 51 | ||
@@ -832,6 +839,7 @@ start_journal_io: | |||
832 | bh = jh2bh(jh); | 839 | bh = jh2bh(jh); |
833 | clear_buffer_jwrite(bh); | 840 | clear_buffer_jwrite(bh); |
834 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); | 841 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); |
842 | J_ASSERT_BH(bh, !buffer_shadow(bh)); | ||
835 | 843 | ||
836 | /* The metadata is now released for reuse, but we need | 844 | /* The metadata is now released for reuse, but we need |
837 | to remember it against this transaction so that when | 845 | to remember it against this transaction so that when |
@@ -839,14 +847,6 @@ start_journal_io: | |||
839 | required. */ | 847 | required. */ |
840 | JBUFFER_TRACE(jh, "file as BJ_Forget"); | 848 | JBUFFER_TRACE(jh, "file as BJ_Forget"); |
841 | jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); | 849 | jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); |
842 | /* | ||
843 | * Wake up any transactions which were waiting for this IO to | ||
844 | * complete. The barrier must be here so that changes by | ||
845 | * jbd2_journal_file_buffer() take effect before wake_up_bit() | ||
846 | * does the waitqueue check. | ||
847 | */ | ||
848 | smp_mb(); | ||
849 | wake_up_bit(&bh->b_state, BH_Unshadow); | ||
850 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); | 850 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); |
851 | __brelse(bh); | 851 | __brelse(bh); |
852 | } | 852 | } |
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b0a8d1e4703e..5ef0712e2f7a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -451,6 +451,7 @@ repeat: | |||
451 | new_bh->b_size = bh_in->b_size; | 451 | new_bh->b_size = bh_in->b_size; |
452 | new_bh->b_bdev = journal->j_dev; | 452 | new_bh->b_bdev = journal->j_dev; |
453 | new_bh->b_blocknr = blocknr; | 453 | new_bh->b_blocknr = blocknr; |
454 | new_bh->b_private = bh_in; | ||
454 | set_buffer_mapped(new_bh); | 455 | set_buffer_mapped(new_bh); |
455 | set_buffer_dirty(new_bh); | 456 | set_buffer_dirty(new_bh); |
456 | 457 | ||
@@ -465,6 +466,7 @@ repeat: | |||
465 | spin_lock(&journal->j_list_lock); | 466 | spin_lock(&journal->j_list_lock); |
466 | __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); | 467 | __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); |
467 | spin_unlock(&journal->j_list_lock); | 468 | spin_unlock(&journal->j_list_lock); |
469 | set_buffer_shadow(bh_in); | ||
468 | jbd_unlock_bh_state(bh_in); | 470 | jbd_unlock_bh_state(bh_in); |
469 | 471 | ||
470 | return do_escape | (done_copy_out << 1); | 472 | return do_escape | (done_copy_out << 1); |
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index f1c5392e62b6..6f4248dd8759 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -619,6 +619,12 @@ static void warn_dirty_buffer(struct buffer_head *bh) | |||
619 | bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); | 619 | bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); |
620 | } | 620 | } |
621 | 621 | ||
622 | static int sleep_on_shadow_bh(void *word) | ||
623 | { | ||
624 | io_schedule(); | ||
625 | return 0; | ||
626 | } | ||
627 | |||
622 | /* | 628 | /* |
623 | * If the buffer is already part of the current transaction, then there | 629 | * If the buffer is already part of the current transaction, then there |
624 | * is nothing we need to do. If it is already part of a prior | 630 | * is nothing we need to do. If it is already part of a prior |
@@ -754,41 +760,29 @@ repeat: | |||
754 | * journaled. If the primary copy is already going to | 760 | * journaled. If the primary copy is already going to |
755 | * disk then we cannot do copy-out here. */ | 761 | * disk then we cannot do copy-out here. */ |
756 | 762 | ||
757 | if (jh->b_jlist == BJ_Shadow) { | 763 | if (buffer_shadow(bh)) { |
758 | DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); | ||
759 | wait_queue_head_t *wqh; | ||
760 | |||
761 | wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); | ||
762 | |||
763 | JBUFFER_TRACE(jh, "on shadow: sleep"); | 764 | JBUFFER_TRACE(jh, "on shadow: sleep"); |
764 | jbd_unlock_bh_state(bh); | 765 | jbd_unlock_bh_state(bh); |
765 | /* commit wakes up all shadow buffers after IO */ | 766 | wait_on_bit(&bh->b_state, BH_Shadow, |
766 | for ( ; ; ) { | 767 | sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE); |
767 | prepare_to_wait(wqh, &wait.wait, | ||
768 | TASK_UNINTERRUPTIBLE); | ||
769 | if (jh->b_jlist != BJ_Shadow) | ||
770 | break; | ||
771 | schedule(); | ||
772 | } | ||
773 | finish_wait(wqh, &wait.wait); | ||
774 | goto repeat; | 768 | goto repeat; |
775 | } | 769 | } |
776 | 770 | ||
777 | /* Only do the copy if the currently-owning transaction | 771 | /* |
778 | * still needs it. If it is on the Forget list, the | 772 | * Only do the copy if the currently-owning transaction still |
779 | * committing transaction is past that stage. The | 773 | * needs it. If buffer isn't on BJ_Metadata list, the |
780 | * buffer had better remain locked during the kmalloc, | 774 | * committing transaction is past that stage (here we use the |
781 | * but that should be true --- we hold the journal lock | 775 | * fact that BH_Shadow is set under bh_state lock together with |
782 | * still and the buffer is already on the BUF_JOURNAL | 776 | * refiling to BJ_Shadow list and at this point we know the |
783 | * list so won't be flushed. | 777 | * buffer doesn't have BH_Shadow set). |
784 | * | 778 | * |
785 | * Subtle point, though: if this is a get_undo_access, | 779 | * Subtle point, though: if this is a get_undo_access, |
786 | * then we will be relying on the frozen_data to contain | 780 | * then we will be relying on the frozen_data to contain |
787 | * the new value of the committed_data record after the | 781 | * the new value of the committed_data record after the |
788 | * transaction, so we HAVE to force the frozen_data copy | 782 | * transaction, so we HAVE to force the frozen_data copy |
789 | * in that case. */ | 783 | * in that case. |
790 | 784 | */ | |
791 | if (jh->b_jlist != BJ_Forget || force_copy) { | 785 | if (jh->b_jlist == BJ_Metadata || force_copy) { |
792 | JBUFFER_TRACE(jh, "generate frozen data"); | 786 | JBUFFER_TRACE(jh, "generate frozen data"); |
793 | if (!frozen_buffer) { | 787 | if (!frozen_buffer) { |
794 | JBUFFER_TRACE(jh, "allocate memory for buffer"); | 788 | JBUFFER_TRACE(jh, "allocate memory for buffer"); |