| author | Artur Paszkiewicz <artur.paszkiewicz@intel.com> | 2017-03-09 03:59:59 -0500 |
|---|---|---|
| committer | Shaohua Li <shli@fb.com> | 2017-03-16 19:55:54 -0400 |
| commit | 3418d036c81dcb604b7c7c71b209d5890a8418aa (patch) | |
| tree | d02a31103e09f82858bf149ebcb511e12ed6065a /drivers/md/raid5.c | |
| parent | ff875738edd44e3bc892d378deacc50bccc9d70c (diff) | |
raid5-ppl: Partial Parity Log write logging implementation
Implement the calculation of partial parity for a stripe and the PPL write
logging functionality. A description of PPL is added to the documentation;
more details can be found in the comments in raid5-ppl.c.
Attach a page to stripe_head for holding the partial parity data. Allocate
it only if mddev has the MD_HAS_PPL flag set.
Partial parity is the xor of the data chunks in a stripe that are not
modified by the write. It is calculated as follows (both cases yield the
same value; see the sketch after this list):
- reconstruct-write case:
xor the data from all the disks in the stripe that are not being updated
- read-modify-write case:
xor the old data from all the disks that are being updated, together with
the old parity
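Both cases produce the same value: the old parity is the xor of all the data chunks, so xoring it with the old data of the chunks being updated cancels them out, leaving exactly the xor of the unmodified chunks. A minimal userspace sketch of this equivalence on a toy stripe (illustrative names and sizes, not kernel code):

```c
/* Toy demonstration that the reconstruct-write and read-modify-write
 * formulas for partial parity agree.  Four "data disks", 8-byte
 * "chunks"; everything here is local to the example. */
#include <stdio.h>
#include <string.h>

#define NDISKS 4	/* data disks in the stripe */
#define CHUNK  8	/* bytes per chunk, stands in for a page */

static void xor_into(unsigned char *dst, const unsigned char *src)
{
	for (int i = 0; i < CHUNK; i++)
		dst[i] ^= src[i];
}

int main(void)
{
	unsigned char data[NDISKS][CHUNK] = {
		"AAAAAAA", "BBBBBBB", "CCCCCCC", "DDDDDDD"
	};
	int updated[NDISKS] = { 0, 1, 1, 0 };	/* disks 1 and 2 written */
	unsigned char parity[CHUNK] = { 0 };
	unsigned char pp_rcw[CHUNK] = { 0 };	/* reconstruct-write form */
	unsigned char pp_rmw[CHUNK];		/* read-modify-write form */

	/* old parity: xor of all data chunks */
	for (int d = 0; d < NDISKS; d++)
		xor_into(parity, data[d]);

	/* reconstruct-write: xor the chunks that are NOT being updated */
	for (int d = 0; d < NDISKS; d++)
		if (!updated[d])
			xor_into(pp_rcw, data[d]);

	/* read-modify-write: old parity xor old data of updated chunks */
	memcpy(pp_rmw, parity, CHUNK);
	for (int d = 0; d < NDISKS; d++)
		if (updated[d])
			xor_into(pp_rmw, data[d]);

	printf("formulas agree: %s\n",
	       memcmp(pp_rcw, pp_rmw, CHUNK) ? "no" : "yes");
	return 0;
}
```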
Implement it using the async_tx API and integrate it into raid_run_ops().
It must run while we still have access to the old data, so schedule it when
STRIPE_OP_BIODRAIN is set, but before ops_run_prexor5(). The result is
stored in sh->ppl_page.
Partial parity is not meaningful for a full stripe write, since every data
chunk is then modified and nothing remains to xor; it is not stored in the
log or used for recovery, so don't attempt to calculate it when the stripe
has STRIPE_FULL_WRITE set.
Put the PPL metadata structures in md_p.h because userspace tools (mdadm)
will also need to read/write the PPL.
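For orientation, the PPL header that precedes the partial parity data on disk has roughly the shape sketched below. This is a from-memory userspace paraphrase, not the authoritative definition; field names, sizes, and ordering should be checked against include/uapi/linux/raid/md_p.h from this patch set. All multi-byte fields are little-endian on disk.

```c
/* Approximate userspace sketch of the PPL on-disk metadata; the
 * definitions in md_p.h are authoritative.  Integers are stored
 * little-endian on disk. */
#include <stdint.h>

#define PPL_HEADER_SIZE		4096	/* one header block */
#define PPL_HDR_RESERVED	512	/* leading reserved area */

struct ppl_header_entry {
	uint64_t data_sector;	/* raid sector of the new data */
	uint32_t pp_size;	/* length of partial parity */
	uint32_t data_size;	/* length of data */
	uint32_t parity_disk;	/* member disk containing parity */
	uint32_t checksum;	/* checksum of this entry's partial parity */
} __attribute__((packed));

struct ppl_header {
	uint8_t  reserved[PPL_HDR_RESERVED];	/* filled with 0xff */
	uint32_t signature;	/* identifies the volume/family */
	uint32_t padding;	/* zero pad */
	uint64_t generation;	/* generation number of the header */
	uint32_t entries_count;	/* valid entries in the array below */
	uint32_t checksum;	/* checksum of the header itself */
	struct ppl_header_entry entries[];	/* fills out PPL_HEADER_SIZE */
} __attribute__((packed));
```

Each entry describes one contiguous data range and its partial parity, which ties into the one-entry-per-stripe_head restriction enforced in add_stripe_bio() below.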
For now, warn when PPL is used with the disk volatile write-back cache
enabled. The warning can be removed once flushing the disk cache before
writing the PPL is implemented.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 64
1 file changed, 61 insertions(+), 3 deletions(-)
```diff
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f575f40d2acb..6b86e0826afe 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -482,6 +482,11 @@ static void shrink_buffers(struct stripe_head *sh)
 		sh->dev[i].page = NULL;
 		put_page(p);
 	}
+
+	if (sh->ppl_page) {
+		put_page(sh->ppl_page);
+		sh->ppl_page = NULL;
+	}
 }
 
 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -498,6 +503,13 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 		sh->dev[i].page = page;
 		sh->dev[i].orig_page = page;
 	}
+
+	if (raid5_has_ppl(sh->raid_conf)) {
+		sh->ppl_page = alloc_page(gfp);
+		if (!sh->ppl_page)
+			return 1;
+	}
+
 	return 0;
 }
 
@@ -746,7 +758,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
 
-	if (conf->log)
+	if (conf->log || raid5_has_ppl(conf))
 		return false;
 	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
 	       !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
@@ -2093,6 +2105,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 		async_tx_ack(tx);
 	}
 
+	if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+		tx = ops_run_partial_parity(sh, percpu, tx);
+
 	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
 		if (level < 6)
 			tx = ops_run_prexor5(sh, percpu, tx);
@@ -3168,6 +3183,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 		s->locked++;
 	}
 
+	if (raid5_has_ppl(sh->raid_conf) &&
+	    test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+	    !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+	    test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+		set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
 	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
 		__func__, (unsigned long long)sh->sector,
 		s->locked, s->ops_request);
@@ -3215,6 +3236,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
 		goto overlap;
 
+	if (forwrite && raid5_has_ppl(conf)) {
+		/*
+		 * With PPL only writes to consecutive data chunks within a
+		 * stripe are allowed because for a single stripe_head we can
+		 * only have one PPL entry at a time, which describes one data
+		 * range.  Not really an overlap, but wait_for_overlap can be
+		 * used to handle this.
+		 */
+		sector_t sector;
+		sector_t first = 0;
+		sector_t last = 0;
+		int count = 0;
+		int i;
+
+		for (i = 0; i < sh->disks; i++) {
+			if (i != sh->pd_idx &&
+			    (i == dd_idx || sh->dev[i].towrite)) {
+				sector = sh->dev[i].sector;
+				if (count == 0 || sector < first)
+					first = sector;
+				if (sector > last)
+					last = sector;
+				count++;
+			}
+		}
+
+		if (first + conf->chunk_sectors * (count - 1) != last)
+			goto overlap;
+	}
+
 	if (!forwrite || previous)
 		clear_bit(STRIPE_BATCH_READY, &sh->state);
 
```
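The consecutiveness test above needs only the lowest and highest written data-chunk sectors plus a count: the dev[i].sector values of the data chunks in one stripe are spaced chunk_sectors apart in the array's logical address space, and each disk can appear at most once, so the written set is gap-free exactly when first + chunk_sectors * (count - 1) == last. A small standalone illustration of the arithmetic (toy values, not kernel code):

```c
/* Toy mirror of the consecutiveness check in add_stripe_bio() above.
 * With chunk_sectors = 8, chunks at sectors 16, 24, 32 are consecutive
 * (16 + 8 * (3 - 1) == 32), while 16, 24, 40 leave a gap at 32. */
#include <assert.h>
#include <stdint.h>

static int chunks_consecutive(uint64_t first, uint64_t last, int count,
			      uint64_t chunk_sectors)
{
	return first + chunk_sectors * (uint64_t)(count - 1) == last;
}

int main(void)
{
	assert(chunks_consecutive(16, 32, 3, 8));	/* 16, 24, 32: allowed */
	assert(!chunks_consecutive(16, 40, 3, 8));	/* gap at 32: overlap path */
	return 0;
}
```

The remaining hunks make PPL mutually exclusive with a journal device and reject resize and reshape while PPL is in use: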
```diff
@@ -7208,6 +7259,13 @@ static int raid5_run(struct mddev *mddev)
 		BUG_ON(mddev->delta_disks != 0);
 	}
 
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
+	    test_bit(MD_HAS_PPL, &mddev->flags)) {
+		pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
+			mdname(mddev));
+		clear_bit(MD_HAS_PPL, &mddev->flags);
+	}
+
 	if (mddev->private == NULL)
 		conf = setup_conf(mddev);
 	else
@@ -7689,7 +7747,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
 	sector_t newsize;
 	struct r5conf *conf = mddev->private;
 
-	if (conf->log)
+	if (conf->log || raid5_has_ppl(conf))
 		return -EINVAL;
 	sectors &= ~((sector_t)conf->chunk_sectors - 1);
 	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
@@ -7740,7 +7798,7 @@ static int check_reshape(struct mddev *mddev)
 {
 	struct r5conf *conf = mddev->private;
 
-	if (conf->log)
+	if (conf->log || raid5_has_ppl(conf))
 		return -EINVAL;
 	if (mddev->delta_disks == 0 &&
 	    mddev->new_layout == mddev->layout &&
```