author    Linus Torvalds <torvalds@linux-foundation.org>  2015-05-29 13:35:21 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2015-05-29 13:35:21 -0400
commit    c492e2d464c82fdb82002569033325b4e1a7e5eb (patch)
tree      acb7110d8042fe2328bd065db47ee55f67f20ff2
parent    ace6a22a9fbcdba0ccf190f97b82a79ef2f44aeb (diff)
parent    56ccc1125bc141cf63927eda7febff4216dea2d3 (diff)
Merge tag 'md/4.1-rc5-fixes' of git://neil.brown.name/md
Pull more md bugfixes from Neil Brown:
 "Assorted fixes for new RAID5 stripe-batching functionality.

  Unfortunately this functionality was merged a little prematurely.
  The necessary testing and code review is now complete (or as complete
  as it can be) and the code passes a variety of tests and looks quite
  sensible.

  Also a fix for some recent locking changes - a race was introduced
  which causes a reshape request to sometimes fail.  No data safety
  issues"

* tag 'md/4.1-rc5-fixes' of git://neil.brown.name/md:
  md: fix race when unfreezing sync_action
  md/raid5: break stripe-batches when the array has failed.
  md/raid5: call break_stripe_batch_list from handle_stripe_clean_event
  md/raid5: be more selective about distributing flags across batch.
  md/raid5: add handle_flags arg to break_stripe_batch_list.
  md/raid5: duplicate some more handle_stripe_clean_event code in break_stripe_batch_list
  md/raid5: remove condition test from check_break_stripe_batch_list.
  md/raid5: Ensure a batch member is not handled prematurely.
  md/raid5: close race between STRIPE_BIT_DELAY and batching.
  md/raid5: ensure whole batch is delayed for all required bitmap updates.
-rw-r--r--  drivers/md/md.c      14
-rw-r--r--  drivers/md/raid5.c  146
-rw-r--r--  drivers/md/raid5.h    5
3 files changed, 98 insertions, 67 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 593a02476c78..27506302eb7a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4211,12 +4211,12 @@ action_store(struct mddev *mddev, const char *page, size_t len)
         if (!mddev->pers || !mddev->pers->sync_request)
                 return -EINVAL;
 
-        if (cmd_match(page, "frozen"))
-                set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-        else
-                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 
         if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
+                if (cmd_match(page, "frozen"))
+                        set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+                else
+                        clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                 flush_workqueue(md_misc_wq);
                 if (mddev->sync_thread) {
                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
@@ -4229,16 +4229,17 @@ action_store(struct mddev *mddev, const char *page, size_t len)
                     test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
                         return -EBUSY;
         else if (cmd_match(page, "resync"))
-                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
         else if (cmd_match(page, "recover")) {
+                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
-                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
         } else if (cmd_match(page, "reshape")) {
                 int err;
                 if (mddev->pers->start_reshape == NULL)
                         return -EINVAL;
                 err = mddev_lock(mddev);
                 if (!err) {
+                        clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                         err = mddev->pers->start_reshape(mddev);
                         mddev_unlock(mddev);
                 }
@@ -4250,6 +4251,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
                         set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
                 else if (!cmd_match(page, "repair"))
                         return -EINVAL;
+                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
                 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
         }
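The md.c hunks above are the "fix race when unfreezing sync_action" change: previously MD_RECOVERY_FROZEN was cleared up front for every non-"frozen" write, leaving a window in which md_check_recovery() could start an unwanted sync before the requested action (notably a reshape) had begun. The fix clears the bit only at the point each action actually starts. Below is a minimal user-space sketch of the resulting ordering, assuming C11 atomics; action_store_model and the two-bit recovery word are illustrative stand-ins, not the kernel API.

    #include <stdatomic.h>
    #include <stdio.h>
    #include <string.h>

    enum {
        MD_RECOVERY_FROZEN = 1u << 0,
        MD_RECOVERY_NEEDED = 1u << 1,
    };

    static _Atomic unsigned int recovery;

    static void action_store_model(const char *page)
    {
        if (!strcmp(page, "idle") || !strcmp(page, "frozen")) {
            if (!strcmp(page, "frozen"))
                atomic_fetch_or(&recovery, MD_RECOVERY_FROZEN);
            else
                atomic_fetch_and(&recovery, ~MD_RECOVERY_FROZEN);
            /* ...interrupt and reap any running sync thread... */
        } else if (!strcmp(page, "reshape")) {
            /* FROZEN is dropped only here, at the point the reshape
             * is actually started, so nothing can sneak in between */
            atomic_fetch_and(&recovery, ~MD_RECOVERY_FROZEN);
            /* ...start_reshape()... */
        } else {        /* resync / recover / check / repair */
            atomic_fetch_and(&recovery, ~MD_RECOVERY_FROZEN);
            atomic_fetch_or(&recovery, MD_RECOVERY_NEEDED);
        }
    }

    int main(void)
    {
        action_store_model("frozen");
        action_store_model("reshape");  /* unfreezes at the commit point */
        printf("FROZEN=%u\n", atomic_load(&recovery) & MD_RECOVERY_FROZEN);
        return 0;
    }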
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b9f2b9cc6060..553d54b87052 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -749,6 +749,7 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 static bool stripe_can_batch(struct stripe_head *sh)
 {
         return test_bit(STRIPE_BATCH_READY, &sh->state) &&
+               !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
                is_full_stripe_write(sh);
 }
 
@@ -837,6 +838,15 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
                     < IO_THRESHOLD)
                         md_wakeup_thread(conf->mddev->thread);
 
+        if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
+                int seq = sh->bm_seq;
+                if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
+                    sh->batch_head->bm_seq > seq)
+                        seq = sh->batch_head->bm_seq;
+                set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
+                sh->batch_head->bm_seq = seq;
+        }
+
         atomic_inc(&sh->count);
 unlock_out:
         unlock_two_stripes(head, sh);
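This hunk moves a member's bitmap delay onto its batch head when the member joins a batch: the head inherits STRIPE_BIT_DELAY and keeps the largest bm_seq, so the whole batch waits for the latest bitmap flush any member requires. A sketch of just that hand-off, with illustrative stand-ins for the stripe_head fields rather than the kernel structures:

    /* illustrative stand-ins for the relevant stripe_head fields */
    struct batch_stripe { int bit_delay; int bm_seq; };

    static void propagate_bit_delay(struct batch_stripe *sh,
                                    struct batch_stripe *head)
    {
        if (sh->bit_delay) {
            int seq = sh->bm_seq;

            sh->bit_delay = 0;
            /* keep the largest sequence number on the head, so the
             * whole batch is delayed until every member's bitmap
             * update has been flushed */
            if (head->bit_delay && head->bm_seq > seq)
                seq = head->bm_seq;
            head->bit_delay = 1;
            head->bm_seq = seq;
        }
    }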
@@ -2987,14 +2997,32 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
         pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
                 (unsigned long long)(*bip)->bi_iter.bi_sector,
                 (unsigned long long)sh->sector, dd_idx);
-        spin_unlock_irq(&sh->stripe_lock);
 
         if (conf->mddev->bitmap && firstwrite) {
+                /* Cannot hold spinlock over bitmap_startwrite,
+                 * but must ensure this isn't added to a batch until
+                 * we have added to the bitmap and set bm_seq.
+                 * So set STRIPE_BITMAP_PENDING to prevent
+                 * batching.
+                 * If multiple add_stripe_bio() calls race here they
+                 * must all set STRIPE_BITMAP_PENDING.  So only the first one
+                 * to complete "bitmap_startwrite" gets to set
+                 * STRIPE_BIT_DELAY.  This is important as once a stripe
+                 * is added to a batch, STRIPE_BIT_DELAY cannot be changed
+                 * any more.
+                 */
+                set_bit(STRIPE_BITMAP_PENDING, &sh->state);
+                spin_unlock_irq(&sh->stripe_lock);
                 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
                                   STRIPE_SECTORS, 0);
-                sh->bm_seq = conf->seq_flush+1;
-                set_bit(STRIPE_BIT_DELAY, &sh->state);
+                spin_lock_irq(&sh->stripe_lock);
+                clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
+                if (!sh->batch_head) {
+                        sh->bm_seq = conf->seq_flush+1;
+                        set_bit(STRIPE_BIT_DELAY, &sh->state);
+                }
         }
+        spin_unlock_irq(&sh->stripe_lock);
 
         if (stripe_can_batch(sh))
                 stripe_add_to_batch_list(conf, sh);
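The comment in this hunk describes the new protocol: stripe_lock cannot be held across bitmap_startwrite(), which may sleep, so STRIPE_BITMAP_PENDING pins the stripe out of any batch while the lock is dropped, and STRIPE_BIT_DELAY is set afterwards only if the stripe is still unbatched. A user-space model of that drop-and-retake pattern, assuming pthreads; the names are stand-ins for the kernel fields:

    #include <pthread.h>
    #include <stdbool.h>

    struct stripe {
        pthread_mutex_t lock;
        bool bitmap_pending;    /* STRIPE_BITMAP_PENDING stand-in */
        bool bit_delay;         /* STRIPE_BIT_DELAY stand-in */
        bool batched;           /* "sh->batch_head != NULL" stand-in */
        int bm_seq;
    };

    /* stands in for bitmap_startwrite(), which may sleep */
    static void bitmap_startwrite_model(void) { }

    static void add_write_model(struct stripe *sh, int seq_flush)
    {
        pthread_mutex_lock(&sh->lock);
        /* forbid batching while the lock is dropped */
        sh->bitmap_pending = true;
        pthread_mutex_unlock(&sh->lock);

        bitmap_startwrite_model();      /* must not run under the lock */

        pthread_mutex_lock(&sh->lock);
        sh->bitmap_pending = false;
        if (!sh->batched) {
            /* only a stripe not yet in a batch may set its delay */
            sh->bm_seq = seq_flush + 1;
            sh->bit_delay = true;
        }
        pthread_mutex_unlock(&sh->lock);
    }

    /* batching stays blocked while any writer is between the two
     * critical sections above */
    static bool can_batch_model(const struct stripe *sh)
    {
        return !sh->bitmap_pending;
    }

The new !test_bit(STRIPE_BITMAP_PENDING, ...) clause added to stripe_can_batch() in the first raid5.c hunk is the kernel-side counterpart of can_batch_model().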
@@ -3392,6 +3420,8 @@ static void handle_stripe_fill(struct stripe_head *sh,
         set_bit(STRIPE_HANDLE, &sh->state);
 }
 
+static void break_stripe_batch_list(struct stripe_head *head_sh,
+                                    unsigned long handle_flags);
 /* handle_stripe_clean_event
  * any written block on an uptodate or failed drive can be returned.
  * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
@@ -3405,7 +3435,6 @@ static void handle_stripe_clean_event(struct r5conf *conf,
         int discard_pending = 0;
         struct stripe_head *head_sh = sh;
         bool do_endio = false;
-        int wakeup_nr = 0;
 
         for (i = disks; i--; )
                 if (sh->dev[i].written) {
@@ -3494,44 +3523,8 @@ unhash:
         if (atomic_dec_and_test(&conf->pending_full_writes))
                 md_wakeup_thread(conf->mddev->thread);
 
-        if (!head_sh->batch_head || !do_endio)
-                return;
-        for (i = 0; i < head_sh->disks; i++) {
-                if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
-                        wakeup_nr++;
-        }
-        while (!list_empty(&head_sh->batch_list)) {
-                int i;
-                sh = list_first_entry(&head_sh->batch_list,
-                                      struct stripe_head, batch_list);
-                list_del_init(&sh->batch_list);
-
-                set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
-                              head_sh->state & ~((1 << STRIPE_ACTIVE) |
-                                                 (1 << STRIPE_PREREAD_ACTIVE) |
-                                                 STRIPE_EXPAND_SYNC_FLAG));
-                sh->check_state = head_sh->check_state;
-                sh->reconstruct_state = head_sh->reconstruct_state;
-                for (i = 0; i < sh->disks; i++) {
-                        if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-                                wakeup_nr++;
-                        sh->dev[i].flags = head_sh->dev[i].flags;
-                }
-
-                spin_lock_irq(&sh->stripe_lock);
-                sh->batch_head = NULL;
-                spin_unlock_irq(&sh->stripe_lock);
-                if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
-                        set_bit(STRIPE_HANDLE, &sh->state);
-                release_stripe(sh);
-        }
-        spin_lock_irq(&head_sh->stripe_lock);
-        head_sh->batch_head = NULL;
-        spin_unlock_irq(&head_sh->stripe_lock);
-        wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
-        if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
-                set_bit(STRIPE_HANDLE, &head_sh->state);
+        if (head_sh->batch_head && do_endio)
+                break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
 }
 
 static void handle_stripe_dirtying(struct r5conf *conf,
@@ -4172,9 +4165,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
 static int clear_batch_ready(struct stripe_head *sh)
 {
+        /* Return '1' if this is a member of batch, or
+         * '0' if it is a lone stripe or a head which can now be
+         * handled.
+         */
         struct stripe_head *tmp;
         if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
-                return 0;
+                return (sh->batch_head && sh->batch_head != sh);
         spin_lock(&sh->stripe_lock);
         if (!sh->batch_head) {
                 spin_unlock(&sh->stripe_lock);
@@ -4202,38 +4199,65 @@ static int clear_batch_ready(struct stripe_head *sh)
         return 0;
 }
 
-static void check_break_stripe_batch_list(struct stripe_head *sh)
+static void break_stripe_batch_list(struct stripe_head *head_sh,
+                                    unsigned long handle_flags)
 {
-        struct stripe_head *head_sh, *next;
+        struct stripe_head *sh, *next;
         int i;
-
-        if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
-                return;
-
-        head_sh = sh;
+        int do_wakeup = 0;
 
         list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
 
                 list_del_init(&sh->batch_list);
 
-                set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
-                              head_sh->state & ~((1 << STRIPE_ACTIVE) |
-                                                 (1 << STRIPE_PREREAD_ACTIVE) |
-                                                 (1 << STRIPE_DEGRADED) |
-                                                 STRIPE_EXPAND_SYNC_FLAG));
+                WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
+                                          (1 << STRIPE_SYNCING) |
+                                          (1 << STRIPE_REPLACED) |
+                                          (1 << STRIPE_PREREAD_ACTIVE) |
+                                          (1 << STRIPE_DELAYED) |
+                                          (1 << STRIPE_BIT_DELAY) |
+                                          (1 << STRIPE_FULL_WRITE) |
+                                          (1 << STRIPE_BIOFILL_RUN) |
+                                          (1 << STRIPE_COMPUTE_RUN) |
+                                          (1 << STRIPE_OPS_REQ_PENDING) |
+                                          (1 << STRIPE_DISCARD) |
+                                          (1 << STRIPE_BATCH_READY) |
+                                          (1 << STRIPE_BATCH_ERR) |
+                                          (1 << STRIPE_BITMAP_PENDING)));
+                WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
+                                               (1 << STRIPE_REPLACED)));
+
+                set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
+                                            (1 << STRIPE_DEGRADED)),
+                              head_sh->state & (1 << STRIPE_INSYNC));
+
                 sh->check_state = head_sh->check_state;
                 sh->reconstruct_state = head_sh->reconstruct_state;
-                for (i = 0; i < sh->disks; i++)
+                for (i = 0; i < sh->disks; i++) {
+                        if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+                                do_wakeup = 1;
                         sh->dev[i].flags = head_sh->dev[i].flags &
                                 (~((1 << R5_WriteError) | (1 << R5_Overlap)));
-
+                }
                 spin_lock_irq(&sh->stripe_lock);
                 sh->batch_head = NULL;
                 spin_unlock_irq(&sh->stripe_lock);
-
-                set_bit(STRIPE_HANDLE, &sh->state);
+                if (handle_flags == 0 ||
+                    sh->state & handle_flags)
+                        set_bit(STRIPE_HANDLE, &sh->state);
                 release_stripe(sh);
         }
+        spin_lock_irq(&head_sh->stripe_lock);
+        head_sh->batch_head = NULL;
+        spin_unlock_irq(&head_sh->stripe_lock);
+        for (i = 0; i < head_sh->disks; i++)
+                if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
+                        do_wakeup = 1;
+        if (head_sh->state & handle_flags)
+                set_bit(STRIPE_HANDLE, &head_sh->state);
+
+        if (do_wakeup)
+                wake_up(&head_sh->raid_conf->wait_for_overlap);
 }
 
 static void handle_stripe(struct stripe_head *sh)
@@ -4258,7 +4282,8 @@ static void handle_stripe(struct stripe_head *sh)
                 return;
         }
 
-        check_break_stripe_batch_list(sh);
+        if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
+                break_stripe_batch_list(sh, 0);
 
         if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
                 spin_lock(&sh->stripe_lock);
@@ -4312,6 +4337,7 @@ static void handle_stripe(struct stripe_head *sh)
         if (s.failed > conf->max_degraded) {
                 sh->check_state = 0;
                 sh->reconstruct_state = 0;
+                break_stripe_batch_list(sh, 0);
                 if (s.to_read+s.to_write+s.written)
                         handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
                 if (s.syncing + s.replacing)
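Taken together, these hunks give break_stripe_batch_list() two modes through its new handle_flags argument: the STRIPE_BATCH_ERR and array-failure paths pass 0, meaning every detached member is marked for handling, while handle_stripe_clean_event() passes STRIPE_EXPAND_SYNC_FLAGS so only members that still need expand/sync processing are re-queued (the head itself is re-queued only when its own state intersects the mask). The member test, restated on its own as an illustrative helper that is not in the patch:

    /* members: handle unconditionally when the mask is 0 (error paths),
     * otherwise only when their state intersects the mask */
    static inline int member_needs_handling(unsigned long state,
                                            unsigned long handle_flags)
    {
        return handle_flags == 0 || (state & handle_flags) != 0;
    }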
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 7dc0dd86074b..896d603ad0da 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -337,9 +337,12 @@ enum {
         STRIPE_ON_RELEASE_LIST,
         STRIPE_BATCH_READY,
         STRIPE_BATCH_ERR,
+        STRIPE_BITMAP_PENDING,  /* Being added to bitmap, don't add
+                                 * to batch yet.
+                                 */
 };
 
-#define STRIPE_EXPAND_SYNC_FLAG \
+#define STRIPE_EXPAND_SYNC_FLAGS \
         ((1 << STRIPE_EXPAND_SOURCE) |\
         (1 << STRIPE_EXPAND_READY) |\
         (1 << STRIPE_EXPANDING) |\
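Several hunks above lean on set_mask_bits(), whose effect is an atomic read-modify-write that clears the bits in 'mask' and then sets 'bits'. A user-space model of those semantics, assuming C11 atomics; the kernel implements this as a cmpxchg loop, and the helper name here is hypothetical:

    #include <stdatomic.h>

    /* *ptr = (*ptr & ~mask) | bits, performed atomically */
    static void set_mask_bits_model(_Atomic unsigned long *ptr,
                                    unsigned long mask,
                                    unsigned long bits)
    {
        unsigned long old = atomic_load(ptr);

        /* 'old' is reloaded on each failed attempt */
        while (!atomic_compare_exchange_weak(ptr, &old,
                                             (old & ~mask) | bits))
            ;
    }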