| author | Artur Paszkiewicz <artur.paszkiewicz@intel.com> | 2017-03-09 03:59:59 -0500 |
|---|---|---|
| committer | Shaohua Li <shli@fb.com> | 2017-03-16 19:55:54 -0400 |
| commit | 3418d036c81dcb604b7c7c71b209d5890a8418aa (patch) | |
| tree | d02a31103e09f82858bf149ebcb511e12ed6065a /drivers/md/raid5.c | |
| parent | ff875738edd44e3bc892d378deacc50bccc9d70c (diff) | |
raid5-ppl: Partial Parity Log write logging implementation
Implement the calculation of partial parity for a stripe and the PPL write
logging functionality. A description of PPL is added to the documentation;
more details can be found in the comments in raid5-ppl.c.
Attach a page to stripe_head for holding the partial parity data. Allocate
it only if mddev has the MD_HAS_PPL flag set.
Partial parity is the xor of the data chunks in a stripe that are not
modified by the write. It is calculated as follows (both cases yield the
same value; see the sketch after this list):
- reconstruct-write case:
xor the data from all the disks in the stripe that are not being updated
- read-modify-write case:
xor the old data from all the disks that are being updated, together with
the old parity
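Both cases produce the same value: the old parity is the xor of all the data chunks, so xoring it with the old data of the chunks being updated cancels them out, leaving exactly the xor of the unmodified chunks. A minimal userspace sketch of this equivalence on a toy stripe (illustrative names and sizes, not kernel code):

```c
/* Toy demonstration that the reconstruct-write and read-modify-write
 * formulas for partial parity agree.  Four "data disks", 8-byte
 * "chunks"; everything here is local to the example. */
#include <stdio.h>
#include <string.h>

#define NDISKS 4	/* data disks in the stripe */
#define CHUNK  8	/* bytes per chunk, stands in for a page */

static void xor_into(unsigned char *dst, const unsigned char *src)
{
	for (int i = 0; i < CHUNK; i++)
		dst[i] ^= src[i];
}

int main(void)
{
	unsigned char data[NDISKS][CHUNK] = {
		"AAAAAAA", "BBBBBBB", "CCCCCCC", "DDDDDDD"
	};
	int updated[NDISKS] = { 0, 1, 1, 0 };	/* disks 1 and 2 written */
	unsigned char parity[CHUNK] = { 0 };
	unsigned char pp_rcw[CHUNK] = { 0 };	/* reconstruct-write form */
	unsigned char pp_rmw[CHUNK];		/* read-modify-write form */

	/* old parity: xor of all data chunks */
	for (int d = 0; d < NDISKS; d++)
		xor_into(parity, data[d]);

	/* reconstruct-write: xor the chunks that are NOT being updated */
	for (int d = 0; d < NDISKS; d++)
		if (!updated[d])
			xor_into(pp_rcw, data[d]);

	/* read-modify-write: old parity xor old data of updated chunks */
	memcpy(pp_rmw, parity, CHUNK);
	for (int d = 0; d < NDISKS; d++)
		if (updated[d])
			xor_into(pp_rmw, data[d]);

	printf("formulas agree: %s\n",
	       memcmp(pp_rcw, pp_rmw, CHUNK) ? "no" : "yes");
	return 0;
}
```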
Implement it using the async_tx API and integrate it into raid_run_ops().
It must run while we still have access to the old data, so schedule it when
STRIPE_OP_BIODRAIN is set, but before ops_run_prexor5(). The result is
stored in sh->ppl_page.
Partial parity is not meaningful for a full stripe write, since every data
chunk is then modified and nothing remains to xor; it is not stored in the
log or used for recovery, so don't attempt to calculate it when the stripe
has STRIPE_FULL_WRITE set.
Put the PPL metadata structures in md_p.h because userspace tools (mdadm)
will also need to read/write the PPL.
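For orientation, the PPL header that precedes the partial parity data on disk has roughly the shape sketched below. This is a from-memory userspace paraphrase, not the authoritative definition; field names, sizes, and ordering should be checked against include/uapi/linux/raid/md_p.h from this patch set. All multi-byte fields are little-endian on disk.

```c
/* Approximate userspace sketch of the PPL on-disk metadata; the
 * definitions in md_p.h are authoritative.  Integers are stored
 * little-endian on disk. */
#include <stdint.h>

#define PPL_HEADER_SIZE		4096	/* one header block */
#define PPL_HDR_RESERVED	512	/* leading reserved area */

struct ppl_header_entry {
	uint64_t data_sector;	/* raid sector of the new data */
	uint32_t pp_size;	/* length of partial parity */
	uint32_t data_size;	/* length of data */
	uint32_t parity_disk;	/* member disk containing parity */
	uint32_t checksum;	/* checksum of this entry's partial parity */
} __attribute__((packed));

struct ppl_header {
	uint8_t  reserved[PPL_HDR_RESERVED];	/* filled with 0xff */
	uint32_t signature;	/* identifies the volume/family */
	uint32_t padding;	/* zero pad */
	uint64_t generation;	/* generation number of the header */
	uint32_t entries_count;	/* valid entries in the array below */
	uint32_t checksum;	/* checksum of the header itself */
	struct ppl_header_entry entries[];	/* fills out PPL_HEADER_SIZE */
} __attribute__((packed));
```

Each entry describes one contiguous data range and its partial parity, which ties into the one-entry-per-stripe_head restriction enforced in add_stripe_bio() below.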
For now, warn when PPL is used with the disk volatile write-back cache
enabled. The warning can be removed once flushing the disk cache before
writing the PPL is implemented.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 64
1 file changed, 61 insertions(+), 3 deletions(-)
```diff
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f575f40d2acb..6b86e0826afe 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -482,6 +482,11 @@ static void shrink_buffers(struct stripe_head *sh)
 		sh->dev[i].page = NULL;
 		put_page(p);
 	}
+
+	if (sh->ppl_page) {
+		put_page(sh->ppl_page);
+		sh->ppl_page = NULL;
+	}
 }
 
 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -498,6 +503,13 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 		sh->dev[i].page = page;
 		sh->dev[i].orig_page = page;
 	}
+
+	if (raid5_has_ppl(sh->raid_conf)) {
+		sh->ppl_page = alloc_page(gfp);
+		if (!sh->ppl_page)
+			return 1;
+	}
+
 	return 0;
 }
 
@@ -746,7 +758,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
 
-	if (conf->log)
+	if (conf->log || raid5_has_ppl(conf))
 		return false;
 	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
 	       !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
@@ -2093,6 +2105,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 		async_tx_ack(tx);
 	}
 
+	if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+		tx = ops_run_partial_parity(sh, percpu, tx);
+
 	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
 		if (level < 6)
 			tx = ops_run_prexor5(sh, percpu, tx);
@@ -3168,6 +3183,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 		s->locked++;
 	}
 
+	if (raid5_has_ppl(sh->raid_conf) &&
+	    test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+	    !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+	    test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+		set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
 	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
 		__func__, (unsigned long long)sh->sector,
 		s->locked, s->ops_request);
@@ -3215,6 +3236,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
 		goto overlap;
 
+	if (forwrite && raid5_has_ppl(conf)) {
+		/*
+		 * With PPL only writes to consecutive data chunks within a
+		 * stripe are allowed because for a single stripe_head we can
+		 * only have one PPL entry at a time, which describes one data
+		 * range.  Not really an overlap, but wait_for_overlap can be
+		 * used to handle this.
+		 */
+		sector_t sector;
+		sector_t first = 0;
+		sector_t last = 0;
+		int count = 0;
+		int i;
+
+		for (i = 0; i < sh->disks; i++) {
+			if (i != sh->pd_idx &&
+			    (i == dd_idx || sh->dev[i].towrite)) {
+				sector = sh->dev[i].sector;
+				if (count == 0 || sector < first)
+					first = sector;
+				if (sector > last)
+					last = sector;
+				count++;
+			}
+		}
+
+		if (first + conf->chunk_sectors * (count - 1) != last)
+			goto overlap;
+	}
+
 	if (!forwrite || previous)
 		clear_bit(STRIPE_BATCH_READY, &sh->state);
 
```
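The consecutiveness test above needs only the lowest and highest written data-chunk sectors plus a count: the dev[i].sector values of the data chunks in one stripe are spaced chunk_sectors apart in the array's logical address space, and each disk can appear at most once, so the written set is gap-free exactly when first + chunk_sectors * (count - 1) == last. A small standalone illustration of the arithmetic (toy values, not kernel code):

```c
/* Toy mirror of the consecutiveness check in add_stripe_bio() above.
 * With chunk_sectors = 8, chunks at sectors 16, 24, 32 are consecutive
 * (16 + 8 * (3 - 1) == 32), while 16, 24, 40 leave a gap at 32. */
#include <assert.h>
#include <stdint.h>

static int chunks_consecutive(uint64_t first, uint64_t last, int count,
			      uint64_t chunk_sectors)
{
	return first + chunk_sectors * (uint64_t)(count - 1) == last;
}

int main(void)
{
	assert(chunks_consecutive(16, 32, 3, 8));	/* 16, 24, 32: allowed */
	assert(!chunks_consecutive(16, 40, 3, 8));	/* gap at 32: overlap path */
	return 0;
}
```

The remaining hunks make PPL mutually exclusive with a journal device and reject resize and reshape while PPL is in use: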
```diff
@@ -7208,6 +7259,13 @@ static int raid5_run(struct mddev *mddev)
 		BUG_ON(mddev->delta_disks != 0);
 	}
 
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
+	    test_bit(MD_HAS_PPL, &mddev->flags)) {
+		pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
+			mdname(mddev));
+		clear_bit(MD_HAS_PPL, &mddev->flags);
+	}
+
 	if (mddev->private == NULL)
 		conf = setup_conf(mddev);
 	else
@@ -7689,7 +7747,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
 	sector_t newsize;
 	struct r5conf *conf = mddev->private;
 
-	if (conf->log)
+	if (conf->log || raid5_has_ppl(conf))
 		return -EINVAL;
 	sectors &= ~((sector_t)conf->chunk_sectors - 1);
 	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
@@ -7740,7 +7798,7 @@ static int check_reshape(struct mddev *mddev)
 {
 	struct r5conf *conf = mddev->private;
 
-	if (conf->log)
+	if (conf->log || raid5_has_ppl(conf))
 		return -EINVAL;
 	if (mddev->delta_disks == 0 &&
 	    mddev->new_layout == mddev->layout &&
```