diff options
author | Jan Kara <jack@suse.cz> | 2013-06-04 12:08:56 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2013-06-04 12:08:56 -0400 |
commit | b34090e5e22a02fba0e4473056cce9420ad9dd0b (patch) | |
tree | 7ffb9ecd10ada2aefe9079c2df91405592132e47 | |
parent | e5a120aeb57f40ae568a5ca1dd6ace53d0213582 (diff) |
jbd2: refine waiting for shadow buffers
Currently when we add a buffer to a transaction, we wait until the
buffer is removed from the BJ_Shadow list (so that we prevent any
changes to the buffer while it is being written to the journal). This
can take unnecessarily long, as a lot happens between the time the
buffer is submitted to the journal and the time when we remove the
buffer from the BJ_Shadow list (e.g. we wait for all data buffers in
the transaction, we issue a cache flush, etc.). This also creates a
dependency of do_get_write_access() on transaction commit (namely
waiting for data IO to complete), which we want to avoid when
implementing transaction reservation.
So we modify the commit code to set a new BH_Shadow flag when the
temporary shadowing buffer is created, and we clear that flag once IO
on that buffer is complete. This allows do_get_write_access() to wait
only for the BH_Shadow bit and thus removes the dependency on data IO
completion.
Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r-- | fs/jbd2/commit.c | 18 | ||||
-rw-r--r-- | fs/jbd2/journal.c | 2 | ||||
-rw-r--r-- | fs/jbd2/transaction.c | 44 | ||||
-rw-r--r-- | include/linux/jbd.h | 25 | ||||
-rw-r--r-- | include/linux/jbd2.h | 28 | ||||
-rw-r--r-- | include/linux/jbd_common.h | 26 |
6 files changed, 83 insertions, 60 deletions
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7c6f7eea2316..d73a0d808ec1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -30,15 +30,22 @@ | |||
30 | #include <trace/events/jbd2.h> | 30 | #include <trace/events/jbd2.h> |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * Default IO end handler for temporary BJ_IO buffer_heads. | 33 | * IO end handler for temporary buffer_heads handling writes to the journal. |
34 | */ | 34 | */ |
35 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | 35 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) |
36 | { | 36 | { |
37 | struct buffer_head *orig_bh = bh->b_private; | ||
38 | |||
37 | BUFFER_TRACE(bh, ""); | 39 | BUFFER_TRACE(bh, ""); |
38 | if (uptodate) | 40 | if (uptodate) |
39 | set_buffer_uptodate(bh); | 41 | set_buffer_uptodate(bh); |
40 | else | 42 | else |
41 | clear_buffer_uptodate(bh); | 43 | clear_buffer_uptodate(bh); |
44 | if (orig_bh) { | ||
45 | clear_bit_unlock(BH_Shadow, &orig_bh->b_state); | ||
46 | smp_mb__after_clear_bit(); | ||
47 | wake_up_bit(&orig_bh->b_state, BH_Shadow); | ||
48 | } | ||
42 | unlock_buffer(bh); | 49 | unlock_buffer(bh); |
43 | } | 50 | } |
44 | 51 | ||
@@ -832,6 +839,7 @@ start_journal_io: | |||
832 | bh = jh2bh(jh); | 839 | bh = jh2bh(jh); |
833 | clear_buffer_jwrite(bh); | 840 | clear_buffer_jwrite(bh); |
834 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); | 841 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); |
842 | J_ASSERT_BH(bh, !buffer_shadow(bh)); | ||
835 | 843 | ||
836 | /* The metadata is now released for reuse, but we need | 844 | /* The metadata is now released for reuse, but we need |
837 | to remember it against this transaction so that when | 845 | to remember it against this transaction so that when |
@@ -839,14 +847,6 @@ start_journal_io: | |||
839 | required. */ | 847 | required. */ |
840 | JBUFFER_TRACE(jh, "file as BJ_Forget"); | 848 | JBUFFER_TRACE(jh, "file as BJ_Forget"); |
841 | jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); | 849 | jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); |
842 | /* | ||
843 | * Wake up any transactions which were waiting for this IO to | ||
844 | * complete. The barrier must be here so that changes by | ||
845 | * jbd2_journal_file_buffer() take effect before wake_up_bit() | ||
846 | * does the waitqueue check. | ||
847 | */ | ||
848 | smp_mb(); | ||
849 | wake_up_bit(&bh->b_state, BH_Unshadow); | ||
850 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); | 850 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); |
851 | __brelse(bh); | 851 | __brelse(bh); |
852 | } | 852 | } |
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b0a8d1e4703e..5ef0712e2f7a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -451,6 +451,7 @@ repeat: | |||
451 | new_bh->b_size = bh_in->b_size; | 451 | new_bh->b_size = bh_in->b_size; |
452 | new_bh->b_bdev = journal->j_dev; | 452 | new_bh->b_bdev = journal->j_dev; |
453 | new_bh->b_blocknr = blocknr; | 453 | new_bh->b_blocknr = blocknr; |
454 | new_bh->b_private = bh_in; | ||
454 | set_buffer_mapped(new_bh); | 455 | set_buffer_mapped(new_bh); |
455 | set_buffer_dirty(new_bh); | 456 | set_buffer_dirty(new_bh); |
456 | 457 | ||
@@ -465,6 +466,7 @@ repeat: | |||
465 | spin_lock(&journal->j_list_lock); | 466 | spin_lock(&journal->j_list_lock); |
466 | __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); | 467 | __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); |
467 | spin_unlock(&journal->j_list_lock); | 468 | spin_unlock(&journal->j_list_lock); |
469 | set_buffer_shadow(bh_in); | ||
468 | jbd_unlock_bh_state(bh_in); | 470 | jbd_unlock_bh_state(bh_in); |
469 | 471 | ||
470 | return do_escape | (done_copy_out << 1); | 472 | return do_escape | (done_copy_out << 1); |
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index f1c5392e62b6..6f4248dd8759 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -619,6 +619,12 @@ static void warn_dirty_buffer(struct buffer_head *bh) | |||
619 | bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); | 619 | bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); |
620 | } | 620 | } |
621 | 621 | ||
622 | static int sleep_on_shadow_bh(void *word) | ||
623 | { | ||
624 | io_schedule(); | ||
625 | return 0; | ||
626 | } | ||
627 | |||
622 | /* | 628 | /* |
623 | * If the buffer is already part of the current transaction, then there | 629 | * If the buffer is already part of the current transaction, then there |
624 | * is nothing we need to do. If it is already part of a prior | 630 | * is nothing we need to do. If it is already part of a prior |
@@ -754,41 +760,29 @@ repeat: | |||
754 | * journaled. If the primary copy is already going to | 760 | * journaled. If the primary copy is already going to |
755 | * disk then we cannot do copy-out here. */ | 761 | * disk then we cannot do copy-out here. */ |
756 | 762 | ||
757 | if (jh->b_jlist == BJ_Shadow) { | 763 | if (buffer_shadow(bh)) { |
758 | DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); | ||
759 | wait_queue_head_t *wqh; | ||
760 | |||
761 | wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); | ||
762 | |||
763 | JBUFFER_TRACE(jh, "on shadow: sleep"); | 764 | JBUFFER_TRACE(jh, "on shadow: sleep"); |
764 | jbd_unlock_bh_state(bh); | 765 | jbd_unlock_bh_state(bh); |
765 | /* commit wakes up all shadow buffers after IO */ | 766 | wait_on_bit(&bh->b_state, BH_Shadow, |
766 | for ( ; ; ) { | 767 | sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE); |
767 | prepare_to_wait(wqh, &wait.wait, | ||
768 | TASK_UNINTERRUPTIBLE); | ||
769 | if (jh->b_jlist != BJ_Shadow) | ||
770 | break; | ||
771 | schedule(); | ||
772 | } | ||
773 | finish_wait(wqh, &wait.wait); | ||
774 | goto repeat; | 768 | goto repeat; |
775 | } | 769 | } |
776 | 770 | ||
777 | /* Only do the copy if the currently-owning transaction | 771 | /* |
778 | * still needs it. If it is on the Forget list, the | 772 | * Only do the copy if the currently-owning transaction still |
779 | * committing transaction is past that stage. The | 773 | * needs it. If buffer isn't on BJ_Metadata list, the |
780 | * buffer had better remain locked during the kmalloc, | 774 | * committing transaction is past that stage (here we use the |
781 | * but that should be true --- we hold the journal lock | 775 | * fact that BH_Shadow is set under bh_state lock together with |
782 | * still and the buffer is already on the BUF_JOURNAL | 776 | * refiling to BJ_Shadow list and at this point we know the |
783 | * list so won't be flushed. | 777 | * buffer doesn't have BH_Shadow set). |
784 | * | 778 | * |
785 | * Subtle point, though: if this is a get_undo_access, | 779 | * Subtle point, though: if this is a get_undo_access, |
786 | * then we will be relying on the frozen_data to contain | 780 | * then we will be relying on the frozen_data to contain |
787 | * the new value of the committed_data record after the | 781 | * the new value of the committed_data record after the |
788 | * transaction, so we HAVE to force the frozen_data copy | 782 | * transaction, so we HAVE to force the frozen_data copy |
789 | * in that case. */ | 783 | * in that case. |
790 | 784 | */ | |
791 | if (jh->b_jlist != BJ_Forget || force_copy) { | 785 | if (jh->b_jlist == BJ_Metadata || force_copy) { |
792 | JBUFFER_TRACE(jh, "generate frozen data"); | 786 | JBUFFER_TRACE(jh, "generate frozen data"); |
793 | if (!frozen_buffer) { | 787 | if (!frozen_buffer) { |
794 | JBUFFER_TRACE(jh, "allocate memory for buffer"); | 788 | JBUFFER_TRACE(jh, "allocate memory for buffer"); |
diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 9c505f1aa1fd..2439054a6c9a 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h | |||
@@ -244,6 +244,31 @@ typedef struct journal_superblock_s | |||
244 | 244 | ||
245 | #include <linux/fs.h> | 245 | #include <linux/fs.h> |
246 | #include <linux/sched.h> | 246 | #include <linux/sched.h> |
247 | |||
248 | enum jbd_state_bits { | ||
249 | BH_JBD /* Has an attached ext3 journal_head */ | ||
250 | = BH_PrivateStart, | ||
251 | BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ | ||
252 | BH_Freed, /* Has been freed (truncated) */ | ||
253 | BH_Revoked, /* Has been revoked from the log */ | ||
254 | BH_RevokeValid, /* Revoked flag is valid */ | ||
255 | BH_JBDDirty, /* Is dirty but journaled */ | ||
256 | BH_State, /* Pins most journal_head state */ | ||
257 | BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ | ||
258 | BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ | ||
259 | BH_JBDPrivateStart, /* First bit available for private use by FS */ | ||
260 | }; | ||
261 | |||
262 | BUFFER_FNS(JBD, jbd) | ||
263 | BUFFER_FNS(JWrite, jwrite) | ||
264 | BUFFER_FNS(JBDDirty, jbddirty) | ||
265 | TAS_BUFFER_FNS(JBDDirty, jbddirty) | ||
266 | BUFFER_FNS(Revoked, revoked) | ||
267 | TAS_BUFFER_FNS(Revoked, revoked) | ||
268 | BUFFER_FNS(RevokeValid, revokevalid) | ||
269 | TAS_BUFFER_FNS(RevokeValid, revokevalid) | ||
270 | BUFFER_FNS(Freed, freed) | ||
271 | |||
247 | #include <linux/jbd_common.h> | 272 | #include <linux/jbd_common.h> |
248 | 273 | ||
249 | #define J_ASSERT(assert) BUG_ON(!(assert)) | 274 | #define J_ASSERT(assert) BUG_ON(!(assert)) |
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b7dc40da99e0..e33e84b3d5c8 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h | |||
@@ -302,6 +302,34 @@ typedef struct journal_superblock_s | |||
302 | 302 | ||
303 | #include <linux/fs.h> | 303 | #include <linux/fs.h> |
304 | #include <linux/sched.h> | 304 | #include <linux/sched.h> |
305 | |||
306 | enum jbd_state_bits { | ||
307 | BH_JBD /* Has an attached ext3 journal_head */ | ||
308 | = BH_PrivateStart, | ||
309 | BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ | ||
310 | BH_Freed, /* Has been freed (truncated) */ | ||
311 | BH_Revoked, /* Has been revoked from the log */ | ||
312 | BH_RevokeValid, /* Revoked flag is valid */ | ||
313 | BH_JBDDirty, /* Is dirty but journaled */ | ||
314 | BH_State, /* Pins most journal_head state */ | ||
315 | BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ | ||
316 | BH_Shadow, /* IO on shadow buffer is running */ | ||
317 | BH_Verified, /* Metadata block has been verified ok */ | ||
318 | BH_JBDPrivateStart, /* First bit available for private use by FS */ | ||
319 | }; | ||
320 | |||
321 | BUFFER_FNS(JBD, jbd) | ||
322 | BUFFER_FNS(JWrite, jwrite) | ||
323 | BUFFER_FNS(JBDDirty, jbddirty) | ||
324 | TAS_BUFFER_FNS(JBDDirty, jbddirty) | ||
325 | BUFFER_FNS(Revoked, revoked) | ||
326 | TAS_BUFFER_FNS(Revoked, revoked) | ||
327 | BUFFER_FNS(RevokeValid, revokevalid) | ||
328 | TAS_BUFFER_FNS(RevokeValid, revokevalid) | ||
329 | BUFFER_FNS(Freed, freed) | ||
330 | BUFFER_FNS(Shadow, shadow) | ||
331 | BUFFER_FNS(Verified, verified) | ||
332 | |||
305 | #include <linux/jbd_common.h> | 333 | #include <linux/jbd_common.h> |
306 | 334 | ||
307 | #define J_ASSERT(assert) BUG_ON(!(assert)) | 335 | #define J_ASSERT(assert) BUG_ON(!(assert)) |
diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h index 6133679bc4c0..b1f708976ffd 100644 --- a/include/linux/jbd_common.h +++ b/include/linux/jbd_common.h | |||
@@ -1,32 +1,6 @@ | |||
1 | #ifndef _LINUX_JBD_STATE_H | 1 | #ifndef _LINUX_JBD_STATE_H |
2 | #define _LINUX_JBD_STATE_H | 2 | #define _LINUX_JBD_STATE_H |
3 | 3 | ||
4 | enum jbd_state_bits { | ||
5 | BH_JBD /* Has an attached ext3 journal_head */ | ||
6 | = BH_PrivateStart, | ||
7 | BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ | ||
8 | BH_Freed, /* Has been freed (truncated) */ | ||
9 | BH_Revoked, /* Has been revoked from the log */ | ||
10 | BH_RevokeValid, /* Revoked flag is valid */ | ||
11 | BH_JBDDirty, /* Is dirty but journaled */ | ||
12 | BH_State, /* Pins most journal_head state */ | ||
13 | BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ | ||
14 | BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ | ||
15 | BH_Verified, /* Metadata block has been verified ok */ | ||
16 | BH_JBDPrivateStart, /* First bit available for private use by FS */ | ||
17 | }; | ||
18 | |||
19 | BUFFER_FNS(JBD, jbd) | ||
20 | BUFFER_FNS(JWrite, jwrite) | ||
21 | BUFFER_FNS(JBDDirty, jbddirty) | ||
22 | TAS_BUFFER_FNS(JBDDirty, jbddirty) | ||
23 | BUFFER_FNS(Revoked, revoked) | ||
24 | TAS_BUFFER_FNS(Revoked, revoked) | ||
25 | BUFFER_FNS(RevokeValid, revokevalid) | ||
26 | TAS_BUFFER_FNS(RevokeValid, revokevalid) | ||
27 | BUFFER_FNS(Freed, freed) | ||
28 | BUFFER_FNS(Verified, verified) | ||
29 | |||
30 | static inline struct buffer_head *jh2bh(struct journal_head *jh) | 4 | static inline struct buffer_head *jh2bh(struct journal_head *jh) |
31 | { | 5 | { |
32 | return jh->b_bh; | 6 | return jh->b_bh; |