diff options
| author | NeilBrown <neilb@suse.de> | 2013-03-11 21:18:06 -0400 |
|---|---|---|
| committer | NeilBrown <neilb@suse.de> | 2013-03-19 22:20:59 -0400 |
| commit | f8dfcffd0472a0f353f34a567ad3f53568914d04 (patch) | |
| tree | 8e19d10fcd778cace960c49336f1323858dcb4d5 | |
| parent | 90584fc93d461520a888f691144f0879283b3624 (diff) | |
md/raid5: ensure sync and DISCARD don't happen at the same time.
A number of problems can occur due to races between
resync/recovery and discard.
- if sync_request calls handle_stripe() while a discard is
happening on the stripe, it might call handle_stripe_clean_event
before all of the individual discard requests have completed
(so some devices are still locked, but not all).
Since commit ca64cae96037de16e4af92678814f5d4bf0c1c65
("md/raid5: Make sure we clear R5_Discard when discard is finished."),
this will cause R5_Discard to be cleared for the parity device,
so handle_stripe_clean_event() will not be called when the other
devices do become unlocked, so their ->written will not be cleared.
This ultimately leads to a WARN_ON in init_stripe and a lock-up.
- If handle_stripe_clean_event() does clear R5_UPTODATE at an awkward
time for resync, it can lead to s->uptodate being less than disks
in handle_parity_checks5(), which triggers a BUG (because it is
one).
So:
- keep R5_Discard on the parity device until all other devices have
completed their discard request
- make sure we don't try to have a 'discard' and a 'sync' action at
the same time.
This involves a new stripe flag so we know when a 'discard' is
happening, and the use of R5_Overlap on the parity disk so that, when
a discard is wanted while a sync is active, we know to wake up
the discard at the appropriate time.
Discard support for RAID5 was added in 3.7, so this is suitable for
any -stable kernel since 3.7.
Cc: stable@vger.kernel.org (v3.7+)
Reported-by: Jes Sorensen <Jes.Sorensen@redhat.com>
Tested-by: Jes Sorensen <Jes.Sorensen@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
| -rw-r--r-- | drivers/md/raid5.c | 45 | ||||
| -rw-r--r-- | drivers/md/raid5.h | 1 |
2 files changed, 40 insertions, 6 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 52ba88a10668..42a899728748 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -2576,6 +2576,8 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, | |||
| 2576 | int i; | 2576 | int i; |
| 2577 | 2577 | ||
| 2578 | clear_bit(STRIPE_SYNCING, &sh->state); | 2578 | clear_bit(STRIPE_SYNCING, &sh->state); |
| 2579 | if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) | ||
| 2580 | wake_up(&conf->wait_for_overlap); | ||
| 2579 | s->syncing = 0; | 2581 | s->syncing = 0; |
| 2580 | s->replacing = 0; | 2582 | s->replacing = 0; |
| 2581 | /* There is nothing more to do for sync/check/repair. | 2583 | /* There is nothing more to do for sync/check/repair. |
| @@ -2749,6 +2751,7 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
| 2749 | { | 2751 | { |
| 2750 | int i; | 2752 | int i; |
| 2751 | struct r5dev *dev; | 2753 | struct r5dev *dev; |
| 2754 | int discard_pending = 0; | ||
| 2752 | 2755 | ||
| 2753 | for (i = disks; i--; ) | 2756 | for (i = disks; i--; ) |
| 2754 | if (sh->dev[i].written) { | 2757 | if (sh->dev[i].written) { |
| @@ -2777,9 +2780,23 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
| 2777 | STRIPE_SECTORS, | 2780 | STRIPE_SECTORS, |
| 2778 | !test_bit(STRIPE_DEGRADED, &sh->state), | 2781 | !test_bit(STRIPE_DEGRADED, &sh->state), |
| 2779 | 0); | 2782 | 0); |
| 2780 | } | 2783 | } else if (test_bit(R5_Discard, &dev->flags)) |
| 2781 | } else if (test_bit(R5_Discard, &sh->dev[i].flags)) | 2784 | discard_pending = 1; |
| 2782 | clear_bit(R5_Discard, &sh->dev[i].flags); | 2785 | } |
| 2786 | if (!discard_pending && | ||
| 2787 | test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { | ||
| 2788 | clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); | ||
| 2789 | clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); | ||
| 2790 | if (sh->qd_idx >= 0) { | ||
| 2791 | clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); | ||
| 2792 | clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); | ||
| 2793 | } | ||
| 2794 | /* now that discard is done we can proceed with any sync */ | ||
| 2795 | clear_bit(STRIPE_DISCARD, &sh->state); | ||
| 2796 | if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) | ||
| 2797 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 2798 | |||
| 2799 | } | ||
| 2783 | 2800 | ||
| 2784 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) | 2801 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) |
| 2785 | if (atomic_dec_and_test(&conf->pending_full_writes)) | 2802 | if (atomic_dec_and_test(&conf->pending_full_writes)) |
| @@ -3431,9 +3448,15 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 3431 | return; | 3448 | return; |
| 3432 | } | 3449 | } |
| 3433 | 3450 | ||
| 3434 | if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { | 3451 | if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { |
| 3435 | set_bit(STRIPE_SYNCING, &sh->state); | 3452 | spin_lock(&sh->stripe_lock); |
| 3436 | clear_bit(STRIPE_INSYNC, &sh->state); | 3453 | /* Cannot process 'sync' concurrently with 'discard' */ |
| 3454 | if (!test_bit(STRIPE_DISCARD, &sh->state) && | ||
| 3455 | test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { | ||
| 3456 | set_bit(STRIPE_SYNCING, &sh->state); | ||
| 3457 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
| 3458 | } | ||
| 3459 | spin_unlock(&sh->stripe_lock); | ||
| 3437 | } | 3460 | } |
| 3438 | clear_bit(STRIPE_DELAYED, &sh->state); | 3461 | clear_bit(STRIPE_DELAYED, &sh->state); |
| 3439 | 3462 | ||
| @@ -3593,6 +3616,8 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 3593 | test_bit(STRIPE_INSYNC, &sh->state)) { | 3616 | test_bit(STRIPE_INSYNC, &sh->state)) { |
| 3594 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); | 3617 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); |
| 3595 | clear_bit(STRIPE_SYNCING, &sh->state); | 3618 | clear_bit(STRIPE_SYNCING, &sh->state); |
| 3619 | if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) | ||
| 3620 | wake_up(&conf->wait_for_overlap); | ||
| 3596 | } | 3621 | } |
| 3597 | 3622 | ||
| 3598 | /* If the failed drives are just a ReadError, then we might need | 3623 | /* If the failed drives are just a ReadError, then we might need |
| @@ -4159,6 +4184,13 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) | |||
| 4159 | sh = get_active_stripe(conf, logical_sector, 0, 0, 0); | 4184 | sh = get_active_stripe(conf, logical_sector, 0, 0, 0); |
| 4160 | prepare_to_wait(&conf->wait_for_overlap, &w, | 4185 | prepare_to_wait(&conf->wait_for_overlap, &w, |
| 4161 | TASK_UNINTERRUPTIBLE); | 4186 | TASK_UNINTERRUPTIBLE); |
| 4187 | set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); | ||
| 4188 | if (test_bit(STRIPE_SYNCING, &sh->state)) { | ||
| 4189 | release_stripe(sh); | ||
| 4190 | schedule(); | ||
| 4191 | goto again; | ||
| 4192 | } | ||
| 4193 | clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); | ||
| 4162 | spin_lock_irq(&sh->stripe_lock); | 4194 | spin_lock_irq(&sh->stripe_lock); |
| 4163 | for (d = 0; d < conf->raid_disks; d++) { | 4195 | for (d = 0; d < conf->raid_disks; d++) { |
| 4164 | if (d == sh->pd_idx || d == sh->qd_idx) | 4196 | if (d == sh->pd_idx || d == sh->qd_idx) |
| @@ -4171,6 +4203,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) | |||
| 4171 | goto again; | 4203 | goto again; |
| 4172 | } | 4204 | } |
| 4173 | } | 4205 | } |
| 4206 | set_bit(STRIPE_DISCARD, &sh->state); | ||
| 4174 | finish_wait(&conf->wait_for_overlap, &w); | 4207 | finish_wait(&conf->wait_for_overlap, &w); |
| 4175 | for (d = 0; d < conf->raid_disks; d++) { | 4208 | for (d = 0; d < conf->raid_disks; d++) { |
| 4176 | if (d == sh->pd_idx || d == sh->qd_idx) | 4209 | if (d == sh->pd_idx || d == sh->qd_idx) |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 18b2c4a8a1fd..050a334e89c1 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
| @@ -323,6 +323,7 @@ enum { | |||
| 323 | STRIPE_COMPUTE_RUN, | 323 | STRIPE_COMPUTE_RUN, |
| 324 | STRIPE_OPS_REQ_PENDING, | 324 | STRIPE_OPS_REQ_PENDING, |
| 325 | STRIPE_ON_UNPLUG_LIST, | 325 | STRIPE_ON_UNPLUG_LIST, |
| 326 | STRIPE_DISCARD, | ||
| 326 | }; | 327 | }; |
| 327 | 328 | ||
| 328 | /* | 329 | /* |
