author	Linus Torvalds <torvalds@linux-foundation.org>	2013-11-20 16:05:25 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-11-20 16:05:25 -0500
commit	6d6e352c80f22c446d933ca8103e02bac1f09129 (patch)
tree	248a6a7ebc5ea95986da5bccdd6d75b255cf28e4 /drivers/md/md.c
parent	b4789b8e6be3151a955ade74872822f30e8cd914 (diff)
parent	60aaf933854511630e16be4efe0f96485e132de4 (diff)
Merge tag 'md/3.13' of git://neil.brown.name/md
Pull md update from Neil Brown:
 "Mostly optimisations and obscure bug fixes.
   - raid5 gets less lock contention
   - raid1 gets less contention between normal-io and resync-io during resync"

* tag 'md/3.13' of git://neil.brown.name/md:
  md/raid5: Use conf->device_lock protect changing of multi-thread resources.
  md/raid5: Before freeing old multi-thread worker, it should flush them.
  md/raid5: For stripe with R5_ReadNoMerge, we replace REQ_FLUSH with REQ_NOMERGE.
  UAPI: include <asm/byteorder.h> in linux/raid/md_p.h
  raid1: Rewrite the implementation of iobarrier.
  raid1: Add some macros to make code clearly.
  raid1: Replace raise_barrier/lower_barrier with freeze_array/unfreeze_array when reconfiguring the array.
  raid1: Add a field array_frozen to indicate whether raid in freeze state.
  md: Convert use of typedef ctl_table to struct ctl_table
  md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes.
  md: use MD_RECOVERY_INTR instead of kthread_should_stop in resync thread.
  md: fix some places where mddev_lock return value is not checked.
  raid5: Retry R5_ReadNoMerge flag when hit a read error.
  raid5: relieve lock contention in get_active_stripe()
  raid5: relieve lock contention in get_active_stripe()
  wait: add wait_event_cmd()
  md/raid5.c: add proper locking to error path of raid5_start_reshape.
  md: fix calculation of stacking limits on level change.
  raid5: Use slow_path to release stripe when mddev->thread is null
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--	drivers/md/md.c	133
1 file changed, 80 insertions(+), 53 deletions(-)
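The md.c changes pulled here largely implement one stop protocol: the stopping paths (md_set_readonly(), do_md_stop()) set MD_RECOVERY_FROZEN/MD_RECOVERY_INTR, wake the sync thread, drop the reconfig mutex and sleep on resync_wait until mddev->sync_thread is NULL, while md_do_sync() now polls MD_RECOVERY_INTR instead of kthread_should_stop() and md_reap_sync_thread() wakes resync_wait once the thread is gone. Below is a minimal userspace sketch of that hand-off using pthreads; it is an analogue only, not the kernel API, and every identifier in it (resync_worker, recovery_intr, sync_running, and so on) is invented for illustration.

/*
 * Userspace analogue (not kernel code) of the pattern this merge moves
 * md_do_sync() to: the resync worker polls an "interrupted" flag
 * (MD_RECOVERY_INTR in md) instead of asking whether it should stop,
 * and the stopping path sets the flag, wakes the worker, and then
 * waits on a condition (resync_wait in md) for it to finish.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t resync_wait = PTHREAD_COND_INITIALIZER;
static atomic_bool recovery_intr;	/* stands in for MD_RECOVERY_INTR */
static bool sync_running;		/* stands in for mddev->sync_thread != NULL */

static void *resync_worker(void *arg)
{
	for (int chunk = 0; chunk < 1000; chunk++) {
		if (atomic_load(&recovery_intr))
			break;			/* like "break" replacing "goto interrupted" */
		usleep(1000);			/* pretend to sync one chunk */
	}

	/* announce "finished resyncing", roughly where md wakes resync_wait */
	pthread_mutex_lock(&lock);
	sync_running = false;
	pthread_cond_broadcast(&resync_wait);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t tid;

	sync_running = true;
	pthread_create(&tid, NULL, resync_worker, NULL);

	/* stopping path: mark interrupted, then wait for the worker to drop out */
	atomic_store(&recovery_intr, true);
	pthread_mutex_lock(&lock);
	while (sync_running)
		pthread_cond_wait(&resync_wait, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(tid, NULL);
	printf("resync stopped cleanly\n");
	return 0;
}

Build with something like cc -std=c11 -pthread. The point being illustrated is the ordering used in the hunks below: the interrupt flag is published before the wake-up, and the stopper only proceeds once the worker has observably finished, which is what wait_event(resync_wait, mddev->sync_thread == NULL) does in md_set_readonly() and do_md_stop().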
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8766eabb0014..b6b7a2866c9e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev)
 
 static struct ctl_table_header *raid_table_header;
 
-static ctl_table raid_table[] = {
+static struct ctl_table raid_table[] = {
 	{
 		.procname	= "speed_limit_min",
 		.data		= &sysctl_speed_limit_min,
@@ -130,7 +130,7 @@ static ctl_table raid_table[] = {
 	{ }
 };
 
-static ctl_table raid_dir_table[] = {
+static struct ctl_table raid_dir_table[] = {
 	{
 		.procname	= "raid",
 		.maxlen		= 0,
@@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = {
 	{ }
 };
 
-static ctl_table raid_root_table[] = {
+static struct ctl_table raid_root_table[] = {
 	{
 		.procname	= "dev",
 		.maxlen		= 0,
@@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit)
 	goto retry;
 }
 
-static inline int mddev_lock(struct mddev * mddev)
+static inline int __must_check mddev_lock(struct mddev * mddev)
 {
 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }
 
+/* Sometimes we need to take the lock in a situation where
+ * failure due to interrupts is not acceptable.
+ */
+static inline void mddev_lock_nointr(struct mddev * mddev)
+{
+	mutex_lock(&mddev->reconfig_mutex);
+}
+
 static inline int mddev_is_locked(struct mddev *mddev)
 {
 	return mutex_is_locked(&mddev->reconfig_mutex);
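The hunk above splits the reconfig locking into an interruptible, __must_check mddev_lock() for user-initiated paths and an uninterruptible mddev_lock_nointr() for paths that cannot back out (per "md: fix some places where mddev_lock return value is not checked"). As a rough userspace illustration of why the annotation helps, with pthread_mutex_trylock() merely standing in for an acquire that can fail and every name below invented for the sketch:

/* Illustrative only: a __must_check lock helper forces callers to handle
 * the failure case, mirroring the mddev_lock()/mddev_lock_nointr() split. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#define __must_check __attribute__((warn_unused_result))

static pthread_mutex_t reconfig_mutex = PTHREAD_MUTEX_INITIALIZER;

/* user-initiated paths: may fail, caller must check (cf. mddev_lock) */
static __must_check int cfg_lock_interruptible(void)
{
	/* pthread mutexes are not signal-interruptible; trylock stands in
	 * for "may fail, so the result has to be checked" */
	return pthread_mutex_trylock(&reconfig_mutex) ? -EINTR : 0;
}

/* teardown paths that cannot back out (cf. mddev_lock_nointr) */
static void cfg_lock_nointr(void)
{
	pthread_mutex_lock(&reconfig_mutex);
}

int main(void)
{
	if (cfg_lock_interruptible() == 0) {	/* result must be checked or GCC warns */
		puts("reconfigure under lock");
		pthread_mutex_unlock(&reconfig_mutex);
	}

	cfg_lock_nointr();			/* e.g. a stop path: must not fail */
	puts("stop under lock");
	pthread_mutex_unlock(&reconfig_mutex);
	return 0;
}

With -Wall, dropping the return value of cfg_lock_interruptible() triggers a warn_unused_result warning, which is the same pressure __must_check puts on mddev_lock() callers.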
@@ -2978,7 +2986,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 		for_each_mddev(mddev, tmp) {
 			struct md_rdev *rdev2;
 
-			mddev_lock(mddev);
+			mddev_lock_nointr(mddev);
 			rdev_for_each(rdev2, mddev)
 				if (rdev->bdev == rdev2->bdev &&
 				    rdev != rdev2 &&
@@ -2994,7 +3002,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 					break;
 				}
 		}
-		mddev_lock(my_mddev);
+		mddev_lock_nointr(my_mddev);
 		if (overlap) {
 			/* Someone else could have slipped in a size
 			 * change here, but doing so is just silly.
@@ -3580,6 +3588,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 		mddev->in_sync = 1;
 		del_timer_sync(&mddev->safemode_timer);
 	}
+	blk_set_stacking_limits(&mddev->queue->limits);
 	pers->run(mddev);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	mddev_resume(mddev);
@@ -5258,7 +5267,7 @@ static void __md_stop_writes(struct mddev *mddev)
 
 void md_stop_writes(struct mddev *mddev)
 {
-	mddev_lock(mddev);
+	mddev_lock_nointr(mddev);
 	__md_stop_writes(mddev);
 	mddev_unlock(mddev);
 }
@@ -5291,20 +5300,35 @@ EXPORT_SYMBOL_GPL(md_stop);
 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 {
 	int err = 0;
+	int did_freeze = 0;
+
+	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+		did_freeze = 1;
+		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
+	if (mddev->sync_thread) {
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		/* Thread might be blocked waiting for metadata update
+		 * which will now never happen */
+		wake_up_process(mddev->sync_thread->tsk);
+	}
+	mddev_unlock(mddev);
+	wait_event(resync_wait, mddev->sync_thread == NULL);
+	mddev_lock_nointr(mddev);
+
 	mutex_lock(&mddev->open_mutex);
-	if (atomic_read(&mddev->openers) > !!bdev) {
+	if (atomic_read(&mddev->openers) > !!bdev ||
+	    mddev->sync_thread ||
+	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
 		printk("md: %s still in use.\n",mdname(mddev));
+		if (did_freeze) {
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
+		}
 		err = -EBUSY;
 		goto out;
 	}
-	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
-		/* Someone opened the device since we flushed it
-		 * so page cache could be dirty and it is too late
-		 * to flush. So abort
-		 */
-		mutex_unlock(&mddev->open_mutex);
-		return -EBUSY;
-	}
 	if (mddev->pers) {
 		__md_stop_writes(mddev);
 
@@ -5315,7 +5339,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 		set_disk_ro(mddev->gendisk, 1);
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 		err = 0;
 	}
 out:
 	mutex_unlock(&mddev->open_mutex);
@@ -5331,20 +5355,34 @@ static int do_md_stop(struct mddev * mddev, int mode,
 {
 	struct gendisk *disk = mddev->gendisk;
 	struct md_rdev *rdev;
+	int did_freeze = 0;
+
+	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+		did_freeze = 1;
+		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
+	if (mddev->sync_thread) {
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		/* Thread might be blocked waiting for metadata update
+		 * which will now never happen */
+		wake_up_process(mddev->sync_thread->tsk);
+	}
+	mddev_unlock(mddev);
+	wait_event(resync_wait, mddev->sync_thread == NULL);
+	mddev_lock_nointr(mddev);
 
 	mutex_lock(&mddev->open_mutex);
 	if (atomic_read(&mddev->openers) > !!bdev ||
-	    mddev->sysfs_active) {
+	    mddev->sysfs_active ||
+	    mddev->sync_thread ||
+	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
 		printk("md: %s still in use.\n",mdname(mddev));
 		mutex_unlock(&mddev->open_mutex);
-		return -EBUSY;
-	}
-	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
-		/* Someone opened the device since we flushed it
-		 * so page cache could be dirty and it is too late
-		 * to flush. So abort
-		 */
-		mutex_unlock(&mddev->open_mutex);
+		if (did_freeze) {
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
+		}
 		return -EBUSY;
 	}
 	if (mddev->pers) {
@@ -6551,7 +6589,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 					wait_event(mddev->sb_wait,
 						   !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
 						   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
-					mddev_lock(mddev);
+					mddev_lock_nointr(mddev);
 				}
 			} else {
 				err = -EROFS;
@@ -7361,9 +7399,6 @@ void md_do_sync(struct md_thread *thread)
 	mddev->curr_resync = 2;
 
  try_again:
-	if (kthread_should_stop())
-		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-
 	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 		goto skip;
 	for_each_mddev(mddev2, tmp) {
@@ -7388,7 +7423,7 @@ void md_do_sync(struct md_thread *thread)
 				 * be caught by 'softlockup'
 				 */
 				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
-				if (!kthread_should_stop() &&
+				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 				    mddev2->curr_resync >= mddev->curr_resync) {
 					printk(KERN_INFO "md: delaying %s of %s"
 					       " until %s has finished (they"
@@ -7464,7 +7499,7 @@ void md_do_sync(struct md_thread *thread)
 	last_check = 0;
 
 	if (j>2) {
 		printk(KERN_INFO
 		       "md: resuming %s of %s from checkpoint.\n",
 		       desc, mdname(mddev));
 		mddev->curr_resync = j;
@@ -7501,7 +7536,8 @@ void md_do_sync(struct md_thread *thread)
 			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 		}
 
-		while (j >= mddev->resync_max && !kthread_should_stop()) {
+		while (j >= mddev->resync_max &&
+		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 			/* As this condition is controlled by user-space,
 			 * we can block indefinitely, so use '_interruptible'
 			 * to avoid triggering warnings.
@@ -7509,17 +7545,18 @@ void md_do_sync(struct md_thread *thread)
 			flush_signals(current); /* just in case */
 			wait_event_interruptible(mddev->recovery_wait,
 						 mddev->resync_max > j
-						 || kthread_should_stop());
+						 || test_bit(MD_RECOVERY_INTR,
+							     &mddev->recovery));
 		}
 
-		if (kthread_should_stop())
-			goto interrupted;
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+			break;
 
 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
 						    currspeed < speed_min(mddev));
 		if (sectors == 0) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-			goto out;
+			break;
 		}
 
 		if (!skipped) { /* actual IO requested */
@@ -7556,10 +7593,8 @@ void md_do_sync(struct md_thread *thread)
 			last_mark = next;
 		}
 
-
-		if (kthread_should_stop())
-			goto interrupted;
-
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+			break;
 
 		/*
 		 * this loop exits only if either when we are slower than
@@ -7582,11 +7617,12 @@ void md_do_sync(struct md_thread *thread)
 			}
 		}
 	}
-	printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
+	printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
+	       test_bit(MD_RECOVERY_INTR, &mddev->recovery)
+	       ? "interrupted" : "done");
 	/*
 	 * this also signals 'finished resyncing' to md_stop
 	 */
- out:
 	blk_finish_plug(&plug);
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
@@ -7640,16 +7676,6 @@ void md_do_sync(struct md_thread *thread)
 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 	return;
-
- interrupted:
-	/*
-	 * got a signal, exit.
-	 */
-	printk(KERN_INFO
-	       "md: md_do_sync() got signal ... exiting\n");
-	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-	goto out;
-
 }
 EXPORT_SYMBOL_GPL(md_do_sync);
 
@@ -7894,6 +7920,7 @@ void md_reap_sync_thread(struct mddev *mddev)
 
 	/* resync has finished, collect result */
 	md_unregister_thread(&mddev->sync_thread);
+	wake_up(&resync_wait);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 		/* success...*/