md/r5cache: State machine for raid5-cache write back mode

This patch adds a state machine for raid5-cache. With a log device, the raid456 array could operate in two different modes (r5c_journal_mode): - write-back (R5C_MODE_WRITE_BACK) - write-through (R5C_MODE_WRITE_THROUGH) Existing code of raid5-cache only has write-through mode. For write-back cache, it is necessary to extend the state machine. With write-back cache, every stripe could operate in two different phases: - caching - writing-out In caching phase, the stripe handles writes as: - write to journal - return IO In writing-out phase, the stripe behaves as a stripe in write-through mode (R5C_MODE_WRITE_THROUGH). STRIPE_R5C_CACHING is added to sh->state to differentiate caching and writing-out phase. Please note: this is a "no-op" patch for raid5-cache write-through mode. The following detailed explanation is copied from the raid5-cache.c: /* * raid5 cache state machine * * With the RAID cache, each stripe works in two phases: * - caching phase * - writing-out phase * * These two phases are controlled by bit STRIPE_R5C_CACHING: * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase * * When there is no journal, or the journal is in write-through mode, * the stripe is always in writing-out phase. * * For write-back journal, the stripe is sent to caching phase on write * (r5c_handle_stripe_dirtying). r5c_make_stripe_write_out() kicks off * the write-out phase by clearing STRIPE_R5C_CACHING. * * Stripes in caching phase do not write the raid disks. Instead, all * writes are committed from the log device. Therefore, a stripe in * caching phase handles writes as: * - write to log device * - return IO * * Stripes in writing-out phase handle writes as: * - calculate parity * - write pending data and parity to journal * - write data and parity to raid disks * - return IO for pending writes */ Signed-off-by: Song Liu <songliubraving@fb.com> Signed-off-by: Shaohua Li <shli@fb.com>
author: Song Liu <songliubraving@fb.com> 2016-11-17 18:24:38 -0500
committer: Shaohua Li <shli@fb.com> 2016-11-18 16:26:07 -0500
commit: 2ded370373a400c20cf0c6e941e724e61582a867 (patch)
tree: 704038326bbbe6f2e7fddd1b98b1f0ae1d3d9f44 /drivers/md/raid5.c
parent: 937621c36e0ea1af2aceeaea412ba3bd80247199 (diff)
1 files changed, 41 insertions, 4 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 34895f3218d9..7c98eb06d1b2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4107,6 +4107,9 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
                        if (rdev && !test_bit(Faulty, &rdev->flags))
                                do_recovery = 1;
                }
+                if (test_bit(R5_InJournal, &dev->flags))
+                        s->injournal++;
        }
        if (test_bit(STRIPE_SYNCING, &sh->state)) {
                /* If there is a failed device being replaced,
@@ -4386,14 +4389,47 @@ static void handle_stripe(struct stripe_head *sh)
            || s.expanding)
                handle_stripe_fill(sh, &s, disks);
-        /* Now to consider new write requests and what else, if anything
+        /*
-         * should be read.  We do not handle new writes when:
+         * When the stripe finishes full journal write cycle (write to journal
+         * and raid disk), this is the clean up procedure so it is ready for
+         * next operation.
+         */
+        r5c_finish_stripe_write_out(conf, sh, &s);
+        /*
+         * Now to consider new write requests, cache write back and what else,
+         * if anything should be read.  We do not handle new writes when:
         * 1/ A 'write' operation (copy+xor) is already in flight.
         * 2/ A 'check' operation is in flight, as it may clobber the parity
         *    block.
+         * 3/ A r5c cache log write is in flight.
         */
-        if (s.to_write && !sh->reconstruct_state && !sh->check_state)
-                handle_stripe_dirtying(conf, sh, &s, disks);
+        if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
+                if (!r5c_is_writeback(conf->log)) {
+                        if (s.to_write)
+                                handle_stripe_dirtying(conf, sh, &s, disks);
+                } else { /* write back cache */
+                        int ret = 0;
+                        /* First, try handle writes in caching phase */
+                        if (s.to_write)
+                                ret = r5c_try_caching_write(conf, sh, &s,
+                                                            disks);
+                        /*
+                         * If caching phase failed: ret == -EAGAIN
+                         *    OR
+                         * stripe under reclaim: !caching && injournal
+                         *
+                         * fall back to handle_stripe_dirtying()
+                         */
+                        if (ret == -EAGAIN ||
+                            /* stripe under reclaim: !caching && injournal */
+                            (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
+                             s.injournal > 0))
+                                handle_stripe_dirtying(conf, sh, &s, disks);
+                }
+        }
        /* maybe we need to check and possibly fix the parity for this stripe
         * Any reads will already have been scheduled, so we just see if enough
@@ -5110,6 +5146,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
         * data on failed drives.
         */
        if (rw == READ && mddev->degraded == 0 &&
+            !r5c_is_writeback(conf->log) &&
            mddev->reshape_position == MaxSector) {
                bi = chunk_aligned_read(mddev, bi);
                if (!bi)
author	Song Liu <songliubraving@fb.com>	2016-11-17 18:24:38 -0500
committer	Shaohua Li <shli@fb.com>	2016-11-18 16:26:07 -0500
commit	2ded370373a400c20cf0c6e941e724e61582a867 (patch)
tree	704038326bbbe6f2e7fddd1b98b1f0ae1d3d9f44 /drivers/md/raid5.c
parent	937621c36e0ea1af2aceeaea412ba3bd80247199 (diff)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 34895f3218d9..7c98eb06d1b2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c
@@ -4107,6 +4107,9 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4107	if (rdev && !test_bit(Faulty, &rdev->flags))	4107	if (rdev && !test_bit(Faulty, &rdev->flags))
4108	do_recovery = 1;	4108	do_recovery = 1;
4109	}	4109	}
		4110
		4111	if (test_bit(R5_InJournal, &dev->flags))
		4112	s->injournal++;
4110	}	4113	}
4111	if (test_bit(STRIPE_SYNCING, &sh->state)) {	4114	if (test_bit(STRIPE_SYNCING, &sh->state)) {
4112	/* If there is a failed device being replaced,	4115	/* If there is a failed device being replaced,
@@ -4386,14 +4389,47 @@ static void handle_stripe(struct stripe_head *sh)
4386	|| s.expanding)	4389	|| s.expanding)
4387	handle_stripe_fill(sh, &s, disks);	4390	handle_stripe_fill(sh, &s, disks);
4388		4391
4389	/* Now to consider new write requests and what else, if anything	4392	/*
4390	* should be read. We do not handle new writes when:	4393	* When the stripe finishes full journal write cycle (write to journal
		4394	* and raid disk), this is the clean up procedure so it is ready for
		4395	* next operation.
		4396	*/
		4397	r5c_finish_stripe_write_out(conf, sh, &s);
		4398
		4399	/*
		4400	* Now to consider new write requests, cache write back and what else,
		4401	* if anything should be read. We do not handle new writes when:
4391	* 1/ A 'write' operation (copy+xor) is already in flight.	4402	* 1/ A 'write' operation (copy+xor) is already in flight.
4392	* 2/ A 'check' operation is in flight, as it may clobber the parity	4403	* 2/ A 'check' operation is in flight, as it may clobber the parity
4393	* block.	4404	* block.
		4405	* 3/ A r5c cache log write is in flight.
4394	*/	4406	*/
4395	if (s.to_write && !sh->reconstruct_state && !sh->check_state)	4407
4396	handle_stripe_dirtying(conf, sh, &s, disks);	4408	if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
		4409	if (!r5c_is_writeback(conf->log)) {
		4410	if (s.to_write)
		4411	handle_stripe_dirtying(conf, sh, &s, disks);
		4412	} else { /* write back cache */
		4413	int ret = 0;
		4414
		4415	/* First, try handle writes in caching phase */
		4416	if (s.to_write)
		4417	ret = r5c_try_caching_write(conf, sh, &s,
		4418	disks);
		4419	/*
		4420	* If caching phase failed: ret == -EAGAIN
		4421	* OR
		4422	* stripe under reclaim: !caching && injournal
		4423	*
		4424	* fall back to handle_stripe_dirtying()
		4425	*/
		4426	if (ret == -EAGAIN ||
		4427	/* stripe under reclaim: !caching && injournal */
		4428	(!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
		4429	s.injournal > 0))
		4430	handle_stripe_dirtying(conf, sh, &s, disks);
		4431	}
		4432	}
4397		4433
4398	/* maybe we need to check and possibly fix the parity for this stripe	4434	/* maybe we need to check and possibly fix the parity for this stripe
4399	* Any reads will already have been scheduled, so we just see if enough	4435	* Any reads will already have been scheduled, so we just see if enough
@@ -5110,6 +5146,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
5110	* data on failed drives.	5146	* data on failed drives.
5111	*/	5147	*/
5112	if (rw == READ && mddev->degraded == 0 &&	5148	if (rw == READ && mddev->degraded == 0 &&
		5149	!r5c_is_writeback(conf->log) &&
5113	mddev->reshape_position == MaxSector) {	5150	mddev->reshape_position == MaxSector) {
5114	bi = chunk_aligned_read(mddev, bi);	5151	bi = chunk_aligned_read(mddev, bi);
5115	if (!bi)	5152	if (!bi)