aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Jan Kara <jack@suse.cz> 2013-06-04 12:08:56 -0400
committer: Theodore Ts'o <tytso@mit.edu> 2013-06-04 12:08:56 -0400
commit: b34090e5e22a02fba0e4473056cce9420ad9dd0b (patch)
tree: 7ffb9ecd10ada2aefe9079c2df91405592132e47
parent: e5a120aeb57f40ae568a5ca1dd6ace53d0213582 (diff)
jbd2: refine waiting for shadow buffers
Currently when we add a buffer to a transaction, we wait until the buffer is removed from the BJ_Shadow list (so that we prevent any changes to the buffer that is just being written to the journal). This can take unnecessarily long, as a lot happens between the time the buffer is submitted to the journal and the time when we remove the buffer from the BJ_Shadow list (e.g. we wait for all data buffers in the transaction, we issue a cache flush, etc.). This also creates a dependency of do_get_write_access() on transaction commit (namely waiting for data IO to complete), which we want to avoid when implementing transaction reservation.

So we modify the commit code to set a new BH_Shadow flag when the temporary shadowing buffer is created, and we clear that flag once IO on that buffer is complete. This allows do_get_write_access() to wait only for the BH_Shadow bit and thus removes the dependency on data IO completion.

Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r-- fs/jbd2/commit.c 18
-rw-r--r-- fs/jbd2/journal.c 2
-rw-r--r-- fs/jbd2/transaction.c 44
-rw-r--r-- include/linux/jbd.h 25
-rw-r--r-- include/linux/jbd2.h 28
-rw-r--r-- include/linux/jbd_common.h 26
6 files changed, 83 insertions, 60 deletions
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7c6f7eea2316..d73a0d808ec1 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -30,15 +30,22 @@
30#include <trace/events/jbd2.h> 30#include <trace/events/jbd2.h>
31 31
32/* 32/*
33 * Default IO end handler for temporary BJ_IO buffer_heads. 33 * IO end handler for temporary buffer_heads handling writes to the journal.
34 */ 34 */
35static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) 35static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
36{ 36{
37 struct buffer_head *orig_bh = bh->b_private;
38
37 BUFFER_TRACE(bh, ""); 39 BUFFER_TRACE(bh, "");
38 if (uptodate) 40 if (uptodate)
39 set_buffer_uptodate(bh); 41 set_buffer_uptodate(bh);
40 else 42 else
41 clear_buffer_uptodate(bh); 43 clear_buffer_uptodate(bh);
44 if (orig_bh) {
45 clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
46 smp_mb__after_clear_bit();
47 wake_up_bit(&orig_bh->b_state, BH_Shadow);
48 }
42 unlock_buffer(bh); 49 unlock_buffer(bh);
43} 50}
44 51
@@ -832,6 +839,7 @@ start_journal_io:
832 bh = jh2bh(jh); 839 bh = jh2bh(jh);
833 clear_buffer_jwrite(bh); 840 clear_buffer_jwrite(bh);
834 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 841 J_ASSERT_BH(bh, buffer_jbddirty(bh));
842 J_ASSERT_BH(bh, !buffer_shadow(bh));
835 843
836 /* The metadata is now released for reuse, but we need 844 /* The metadata is now released for reuse, but we need
837 to remember it against this transaction so that when 845 to remember it against this transaction so that when
@@ -839,14 +847,6 @@ start_journal_io:
839 required. */ 847 required. */
840 JBUFFER_TRACE(jh, "file as BJ_Forget"); 848 JBUFFER_TRACE(jh, "file as BJ_Forget");
841 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); 849 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
842 /*
843 * Wake up any transactions which were waiting for this IO to
844 * complete. The barrier must be here so that changes by
845 * jbd2_journal_file_buffer() take effect before wake_up_bit()
846 * does the waitqueue check.
847 */
848 smp_mb();
849 wake_up_bit(&bh->b_state, BH_Unshadow);
850 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 850 JBUFFER_TRACE(jh, "brelse shadowed buffer");
851 __brelse(bh); 851 __brelse(bh);
852 } 852 }
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b0a8d1e4703e..5ef0712e2f7a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -451,6 +451,7 @@ repeat:
451 new_bh->b_size = bh_in->b_size; 451 new_bh->b_size = bh_in->b_size;
452 new_bh->b_bdev = journal->j_dev; 452 new_bh->b_bdev = journal->j_dev;
453 new_bh->b_blocknr = blocknr; 453 new_bh->b_blocknr = blocknr;
454 new_bh->b_private = bh_in;
454 set_buffer_mapped(new_bh); 455 set_buffer_mapped(new_bh);
455 set_buffer_dirty(new_bh); 456 set_buffer_dirty(new_bh);
456 457
@@ -465,6 +466,7 @@ repeat:
465 spin_lock(&journal->j_list_lock); 466 spin_lock(&journal->j_list_lock);
466 __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); 467 __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
467 spin_unlock(&journal->j_list_lock); 468 spin_unlock(&journal->j_list_lock);
469 set_buffer_shadow(bh_in);
468 jbd_unlock_bh_state(bh_in); 470 jbd_unlock_bh_state(bh_in);
469 471
470 return do_escape | (done_copy_out << 1); 472 return do_escape | (done_copy_out << 1);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index f1c5392e62b6..6f4248dd8759 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -619,6 +619,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
619 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); 619 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
620} 620}
621 621
622static int sleep_on_shadow_bh(void *word)
623{
624 io_schedule();
625 return 0;
626}
627
622/* 628/*
623 * If the buffer is already part of the current transaction, then there 629 * If the buffer is already part of the current transaction, then there
624 * is nothing we need to do. If it is already part of a prior 630 * is nothing we need to do. If it is already part of a prior
@@ -754,41 +760,29 @@ repeat:
754 * journaled. If the primary copy is already going to 760 * journaled. If the primary copy is already going to
755 * disk then we cannot do copy-out here. */ 761 * disk then we cannot do copy-out here. */
756 762
757 if (jh->b_jlist == BJ_Shadow) { 763 if (buffer_shadow(bh)) {
758 DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
759 wait_queue_head_t *wqh;
760
761 wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
762
763 JBUFFER_TRACE(jh, "on shadow: sleep"); 764 JBUFFER_TRACE(jh, "on shadow: sleep");
764 jbd_unlock_bh_state(bh); 765 jbd_unlock_bh_state(bh);
765 /* commit wakes up all shadow buffers after IO */ 766 wait_on_bit(&bh->b_state, BH_Shadow,
766 for ( ; ; ) { 767 sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
767 prepare_to_wait(wqh, &wait.wait,
768 TASK_UNINTERRUPTIBLE);
769 if (jh->b_jlist != BJ_Shadow)
770 break;
771 schedule();
772 }
773 finish_wait(wqh, &wait.wait);
774 goto repeat; 768 goto repeat;
775 } 769 }
776 770
777 /* Only do the copy if the currently-owning transaction 771 /*
778 * still needs it. If it is on the Forget list, the 772 * Only do the copy if the currently-owning transaction still
779 * committing transaction is past that stage. The 773 * needs it. If buffer isn't on BJ_Metadata list, the
780 * buffer had better remain locked during the kmalloc, 774 * committing transaction is past that stage (here we use the
781 * but that should be true --- we hold the journal lock 775 * fact that BH_Shadow is set under bh_state lock together with
782 * still and the buffer is already on the BUF_JOURNAL 776 * refiling to BJ_Shadow list and at this point we know the
783 * list so won't be flushed. 777 * buffer doesn't have BH_Shadow set).
784 * 778 *
785 * Subtle point, though: if this is a get_undo_access, 779 * Subtle point, though: if this is a get_undo_access,
786 * then we will be relying on the frozen_data to contain 780 * then we will be relying on the frozen_data to contain
787 * the new value of the committed_data record after the 781 * the new value of the committed_data record after the
788 * transaction, so we HAVE to force the frozen_data copy 782 * transaction, so we HAVE to force the frozen_data copy
789 * in that case. */ 783 * in that case.
790 784 */
791 if (jh->b_jlist != BJ_Forget || force_copy) { 785 if (jh->b_jlist == BJ_Metadata || force_copy) {
792 JBUFFER_TRACE(jh, "generate frozen data"); 786 JBUFFER_TRACE(jh, "generate frozen data");
793 if (!frozen_buffer) { 787 if (!frozen_buffer) {
794 JBUFFER_TRACE(jh, "allocate memory for buffer"); 788 JBUFFER_TRACE(jh, "allocate memory for buffer");
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 9c505f1aa1fd..2439054a6c9a 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -244,6 +244,31 @@ typedef struct journal_superblock_s
244 244
245#include <linux/fs.h> 245#include <linux/fs.h>
246#include <linux/sched.h> 246#include <linux/sched.h>
247
248enum jbd_state_bits {
249 BH_JBD /* Has an attached ext3 journal_head */
250 = BH_PrivateStart,
251 BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
252 BH_Freed, /* Has been freed (truncated) */
253 BH_Revoked, /* Has been revoked from the log */
254 BH_RevokeValid, /* Revoked flag is valid */
255 BH_JBDDirty, /* Is dirty but journaled */
256 BH_State, /* Pins most journal_head state */
257 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
258 BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
259 BH_JBDPrivateStart, /* First bit available for private use by FS */
260};
261
262BUFFER_FNS(JBD, jbd)
263BUFFER_FNS(JWrite, jwrite)
264BUFFER_FNS(JBDDirty, jbddirty)
265TAS_BUFFER_FNS(JBDDirty, jbddirty)
266BUFFER_FNS(Revoked, revoked)
267TAS_BUFFER_FNS(Revoked, revoked)
268BUFFER_FNS(RevokeValid, revokevalid)
269TAS_BUFFER_FNS(RevokeValid, revokevalid)
270BUFFER_FNS(Freed, freed)
271
247#include <linux/jbd_common.h> 272#include <linux/jbd_common.h>
248 273
249#define J_ASSERT(assert) BUG_ON(!(assert)) 274#define J_ASSERT(assert) BUG_ON(!(assert))
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index b7dc40da99e0..e33e84b3d5c8 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -302,6 +302,34 @@ typedef struct journal_superblock_s
302 302
303#include <linux/fs.h> 303#include <linux/fs.h>
304#include <linux/sched.h> 304#include <linux/sched.h>
305
306enum jbd_state_bits {
307 BH_JBD /* Has an attached ext3 journal_head */
308 = BH_PrivateStart,
309 BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
310 BH_Freed, /* Has been freed (truncated) */
311 BH_Revoked, /* Has been revoked from the log */
312 BH_RevokeValid, /* Revoked flag is valid */
313 BH_JBDDirty, /* Is dirty but journaled */
314 BH_State, /* Pins most journal_head state */
315 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
316 BH_Shadow, /* IO on shadow buffer is running */
317 BH_Verified, /* Metadata block has been verified ok */
318 BH_JBDPrivateStart, /* First bit available for private use by FS */
319};
320
321BUFFER_FNS(JBD, jbd)
322BUFFER_FNS(JWrite, jwrite)
323BUFFER_FNS(JBDDirty, jbddirty)
324TAS_BUFFER_FNS(JBDDirty, jbddirty)
325BUFFER_FNS(Revoked, revoked)
326TAS_BUFFER_FNS(Revoked, revoked)
327BUFFER_FNS(RevokeValid, revokevalid)
328TAS_BUFFER_FNS(RevokeValid, revokevalid)
329BUFFER_FNS(Freed, freed)
330BUFFER_FNS(Shadow, shadow)
331BUFFER_FNS(Verified, verified)
332
305#include <linux/jbd_common.h> 333#include <linux/jbd_common.h>
306 334
307#define J_ASSERT(assert) BUG_ON(!(assert)) 335#define J_ASSERT(assert) BUG_ON(!(assert))
diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h
index 6133679bc4c0..b1f708976ffd 100644
--- a/include/linux/jbd_common.h
+++ b/include/linux/jbd_common.h
@@ -1,32 +1,6 @@
1#ifndef _LINUX_JBD_STATE_H 1#ifndef _LINUX_JBD_STATE_H
2#define _LINUX_JBD_STATE_H 2#define _LINUX_JBD_STATE_H
3 3
4enum jbd_state_bits {
5 BH_JBD /* Has an attached ext3 journal_head */
6 = BH_PrivateStart,
7 BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
8 BH_Freed, /* Has been freed (truncated) */
9 BH_Revoked, /* Has been revoked from the log */
10 BH_RevokeValid, /* Revoked flag is valid */
11 BH_JBDDirty, /* Is dirty but journaled */
12 BH_State, /* Pins most journal_head state */
13 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
14 BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
15 BH_Verified, /* Metadata block has been verified ok */
16 BH_JBDPrivateStart, /* First bit available for private use by FS */
17};
18
19BUFFER_FNS(JBD, jbd)
20BUFFER_FNS(JWrite, jwrite)
21BUFFER_FNS(JBDDirty, jbddirty)
22TAS_BUFFER_FNS(JBDDirty, jbddirty)
23BUFFER_FNS(Revoked, revoked)
24TAS_BUFFER_FNS(Revoked, revoked)
25BUFFER_FNS(RevokeValid, revokevalid)
26TAS_BUFFER_FNS(RevokeValid, revokevalid)
27BUFFER_FNS(Freed, freed)
28BUFFER_FNS(Verified, verified)
29
30static inline struct buffer_head *jh2bh(struct journal_head *jh) 4static inline struct buffer_head *jh2bh(struct journal_head *jh)
31{ 5{
32 return jh->b_bh; 6 return jh->b_bh;