author     Linus Torvalds <torvalds@linux-foundation.org>   2013-11-20 16:05:25 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-11-20 16:05:25 -0500
commit     6d6e352c80f22c446d933ca8103e02bac1f09129
tree       248a6a7ebc5ea95986da5bccdd6d75b255cf28e4  /drivers/md/md.c
parent     b4789b8e6be3151a955ade74872822f30e8cd914
parent     60aaf933854511630e16be4efe0f96485e132de4
Merge tag 'md/3.13' of git://neil.brown.name/md
Pull md update from Neil Brown:
"Mostly optimisations and obscure bug fixes.
- raid5 gets less lock contention
- raid1 gets less contention between normal-io and resync-io during
resync"
* tag 'md/3.13' of git://neil.brown.name/md:
md/raid5: Use conf->device_lock protect changing of multi-thread resources.
md/raid5: Before freeing old multi-thread worker, it should flush them.
md/raid5: For stripe with R5_ReadNoMerge, we replace REQ_FLUSH with REQ_NOMERGE.
UAPI: include <asm/byteorder.h> in linux/raid/md_p.h
raid1: Rewrite the implementation of iobarrier.
raid1: Add some macros to make code clearly.
raid1: Replace raise_barrier/lower_barrier with freeze_array/unfreeze_array when reconfiguring the array.
raid1: Add a field array_frozen to indicate whether raid in freeze state.
md: Convert use of typedef ctl_table to struct ctl_table
md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes.
md: use MD_RECOVERY_INTR instead of kthread_should_stop in resync thread.
md: fix some places where mddev_lock return value is not checked.
raid5: Retry R5_ReadNoMerge flag when hit a read error.
raid5: relieve lock contention in get_active_stripe()
raid5: relieve lock contention in get_active_stripe()
wait: add wait_event_cmd()
md/raid5.c: add proper locking to error path of raid5_start_reshape.
md: fix calculation of stacking limits on level change.
raid5: Use slow_path to release stripe when mddev->thread is null
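The "wait: add wait_event_cmd()" entry above adds the wait primitive that the get_active_stripe() lock-contention work relies on: it behaves like wait_event(), but runs one statement before each schedule() and another after waking, so a lock guarding the wait condition can be dropped while sleeping and retaken before the condition is rechecked. Below is a minimal, illustrative sketch of that pattern; demo_lock, demo_wq, demo_ready and the demo_* functions are invented for illustration and are not code from md or raid5.

#include <linux/types.h>
#include <linux/wait.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);           /* illustrative lock */
static DECLARE_WAIT_QUEUE_HEAD(demo_wq);     /* illustrative wait queue */
static bool demo_ready;                      /* condition, protected by demo_lock */

static void demo_wait_for_ready(void)
{
	spin_lock_irq(&demo_lock);
	/*
	 * wait_event_cmd(wq, condition, cmd1, cmd2): like wait_event(),
	 * but cmd1 runs before schedule() and cmd2 runs after waking,
	 * so demo_lock is not held across the sleep yet is held every
	 * time the condition is evaluated.
	 */
	wait_event_cmd(demo_wq, demo_ready,
		       spin_unlock_irq(&demo_lock),
		       spin_lock_irq(&demo_lock));
	/* demo_lock is held again here and demo_ready is true */
	spin_unlock_irq(&demo_lock);
}

static void demo_mark_ready(void)
{
	spin_lock_irq(&demo_lock);
	demo_ready = true;
	spin_unlock_irq(&demo_lock);
	wake_up(&demo_wq);
}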
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--   drivers/md/md.c   133
1 file changed, 80 insertions(+), 53 deletions(-)
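The diff below marks mddev_lock() as __must_check and adds mddev_lock_nointr() for paths where failing to take the lock because of a signal is not acceptable. A rough sketch of the two caller patterns this split implies, written as if the functions lived inside md.c where the helpers are defined; demo_reconfigure() and demo_teardown() are illustrative only, not real md.c functions.

/* Illustrative callers only; not functions from md.c. */

static int demo_reconfigure(struct mddev *mddev)
{
	int err;

	/* Interruptible path: an ioctl/sysfs style caller must now
	 * check the return value and propagate -EINTR. */
	err = mddev_lock(mddev);
	if (err)
		return err;
	/* ... reconfigure under mddev->reconfig_mutex ... */
	mddev_unlock(mddev);
	return 0;
}

static void demo_teardown(struct mddev *mddev)
{
	/* Teardown-style path where giving up on the lock is not an
	 * option: sleep uninterruptibly instead. */
	mddev_lock_nointr(mddev);
	/* ... stop writes, etc. ... */
	mddev_unlock(mddev);
}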
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8766eabb0014..b6b7a2866c9e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev)
 
 static struct ctl_table_header *raid_table_header;
 
-static ctl_table raid_table[] = {
+static struct ctl_table raid_table[] = {
 	{
 		.procname	= "speed_limit_min",
 		.data		= &sysctl_speed_limit_min,
@@ -130,7 +130,7 @@ static ctl_table raid_table[] = {
 	{ }
 };
 
-static ctl_table raid_dir_table[] = {
+static struct ctl_table raid_dir_table[] = {
 	{
 		.procname	= "raid",
 		.maxlen		= 0,
@@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = {
 	{ }
 };
 
-static ctl_table raid_root_table[] = {
+static struct ctl_table raid_root_table[] = {
 	{
 		.procname	= "dev",
 		.maxlen		= 0,
@@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit)
 	goto retry;
 }
 
-static inline int mddev_lock(struct mddev * mddev)
+static inline int __must_check mddev_lock(struct mddev * mddev)
 {
 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }
 
+/* Sometimes we need to take the lock in a situation where
+ * failure due to interrupts is not acceptable.
+ */
+static inline void mddev_lock_nointr(struct mddev * mddev)
+{
+	mutex_lock(&mddev->reconfig_mutex);
+}
+
 static inline int mddev_is_locked(struct mddev *mddev)
 {
 	return mutex_is_locked(&mddev->reconfig_mutex);
@@ -2978,7 +2986,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 		for_each_mddev(mddev, tmp) {
 			struct md_rdev *rdev2;
 
-			mddev_lock(mddev);
+			mddev_lock_nointr(mddev);
 			rdev_for_each(rdev2, mddev)
 				if (rdev->bdev == rdev2->bdev &&
 				    rdev != rdev2 &&
@@ -2994,7 +3002,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 					break;
 			}
 		}
-		mddev_lock(my_mddev);
+		mddev_lock_nointr(my_mddev);
 		if (overlap) {
 			/* Someone else could have slipped in a size
 			 * change here, but doing so is just silly.
@@ -3580,6 +3588,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 		mddev->in_sync = 1;
 		del_timer_sync(&mddev->safemode_timer);
 	}
+	blk_set_stacking_limits(&mddev->queue->limits);
 	pers->run(mddev);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	mddev_resume(mddev);
@@ -5258,7 +5267,7 @@ static void __md_stop_writes(struct mddev *mddev)
 
 void md_stop_writes(struct mddev *mddev)
 {
-	mddev_lock(mddev);
+	mddev_lock_nointr(mddev);
 	__md_stop_writes(mddev);
 	mddev_unlock(mddev);
 }
@@ -5291,20 +5300,35 @@ EXPORT_SYMBOL_GPL(md_stop);
 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 {
 	int err = 0;
+	int did_freeze = 0;
+
+	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+		did_freeze = 1;
+		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
+	if (mddev->sync_thread) {
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		/* Thread might be blocked waiting for metadata update
+		 * which will now never happen */
+		wake_up_process(mddev->sync_thread->tsk);
+	}
+	mddev_unlock(mddev);
+	wait_event(resync_wait, mddev->sync_thread == NULL);
+	mddev_lock_nointr(mddev);
+
 	mutex_lock(&mddev->open_mutex);
-	if (atomic_read(&mddev->openers) > !!bdev) {
+	if (atomic_read(&mddev->openers) > !!bdev ||
+	    mddev->sync_thread ||
+	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
 		printk("md: %s still in use.\n",mdname(mddev));
+		if (did_freeze) {
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
+		}
 		err = -EBUSY;
 		goto out;
 	}
-	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
-		/* Someone opened the device since we flushed it
-		 * so page cache could be dirty and it is too late
-		 * to flush. So abort
-		 */
-		mutex_unlock(&mddev->open_mutex);
-		return -EBUSY;
-	}
 	if (mddev->pers) {
 		__md_stop_writes(mddev);
 
@@ -5315,7 +5339,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 		set_disk_ro(mddev->gendisk, 1);
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 		err = 0;
 	}
 out:
 	mutex_unlock(&mddev->open_mutex);
@@ -5331,20 +5355,34 @@ static int do_md_stop(struct mddev * mddev, int mode,
 {
 	struct gendisk *disk = mddev->gendisk;
 	struct md_rdev *rdev;
+	int did_freeze = 0;
+
+	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+		did_freeze = 1;
+		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
+	if (mddev->sync_thread) {
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		/* Thread might be blocked waiting for metadata update
+		 * which will now never happen */
+		wake_up_process(mddev->sync_thread->tsk);
+	}
+	mddev_unlock(mddev);
+	wait_event(resync_wait, mddev->sync_thread == NULL);
+	mddev_lock_nointr(mddev);
 
 	mutex_lock(&mddev->open_mutex);
 	if (atomic_read(&mddev->openers) > !!bdev ||
-	    mddev->sysfs_active) {
+	    mddev->sysfs_active ||
+	    mddev->sync_thread ||
+	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
 		printk("md: %s still in use.\n",mdname(mddev));
 		mutex_unlock(&mddev->open_mutex);
-		return -EBUSY;
-	}
-	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
-		/* Someone opened the device since we flushed it
-		 * so page cache could be dirty and it is too late
-		 * to flush. So abort
-		 */
-		mutex_unlock(&mddev->open_mutex);
+		if (did_freeze) {
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
+		}
 		return -EBUSY;
 	}
 	if (mddev->pers) {
@@ -6551,7 +6589,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 			wait_event(mddev->sb_wait,
 				   !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
 				   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
-			mddev_lock(mddev);
+			mddev_lock_nointr(mddev);
 		}
 	} else {
 		err = -EROFS;
@@ -7361,9 +7399,6 @@ void md_do_sync(struct md_thread *thread)
 	mddev->curr_resync = 2;
 
  try_again:
-	if (kthread_should_stop())
-		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-
 	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 		goto skip;
 	for_each_mddev(mddev2, tmp) {
@@ -7388,7 +7423,7 @@ void md_do_sync(struct md_thread *thread)
 				 * be caught by 'softlockup'
 				 */
 				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
-				if (!kthread_should_stop() &&
+				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 				    mddev2->curr_resync >= mddev->curr_resync) {
 					printk(KERN_INFO "md: delaying %s of %s"
 					       " until %s has finished (they"
@@ -7464,7 +7499,7 @@ void md_do_sync(struct md_thread *thread)
 	last_check = 0;
 
 	if (j>2) {
 		printk(KERN_INFO
 		       "md: resuming %s of %s from checkpoint.\n",
 		       desc, mdname(mddev));
 		mddev->curr_resync = j;
@@ -7501,7 +7536,8 @@ void md_do_sync(struct md_thread *thread)
 			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 		}
 
-		while (j >= mddev->resync_max && !kthread_should_stop()) {
+		while (j >= mddev->resync_max &&
+		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 			/* As this condition is controlled by user-space,
 			 * we can block indefinitely, so use '_interruptible'
 			 * to avoid triggering warnings.
@@ -7509,17 +7545,18 @@ void md_do_sync(struct md_thread *thread)
 			flush_signals(current); /* just in case */
 			wait_event_interruptible(mddev->recovery_wait,
 						 mddev->resync_max > j
-						 || kthread_should_stop());
+						 || test_bit(MD_RECOVERY_INTR,
+							     &mddev->recovery));
 		}
 
-		if (kthread_should_stop())
-			goto interrupted;
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+			break;
 
 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
 						  currspeed < speed_min(mddev));
 		if (sectors == 0) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-			goto out;
+			break;
 		}
 
 		if (!skipped) { /* actual IO requested */
@@ -7556,10 +7593,8 @@ void md_do_sync(struct md_thread *thread)
 			last_mark = next;
 		}
 
-
-		if (kthread_should_stop())
-			goto interrupted;
-
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+			break;
 
 		/*
 		 * this loop exits only if either when we are slower than
@@ -7582,11 +7617,12 @@ void md_do_sync(struct md_thread *thread)
 			}
 		}
 	}
-	printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
+	printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
+	       test_bit(MD_RECOVERY_INTR, &mddev->recovery)
+	       ? "interrupted" : "done");
 	/*
 	 * this also signals 'finished resyncing' to md_stop
 	 */
- out:
 	blk_finish_plug(&plug);
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
@@ -7640,16 +7676,6 @@ void md_do_sync(struct md_thread *thread)
 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 	return;
-
- interrupted:
-	/*
-	 * got a signal, exit.
-	 */
-	printk(KERN_INFO
-	       "md: md_do_sync() got signal ... exiting\n");
-	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-	goto out;
-
 }
 EXPORT_SYMBOL_GPL(md_do_sync);
 
@@ -7894,6 +7920,7 @@ void md_reap_sync_thread(struct mddev *mddev)
 
 	/* resync has finished, collect result */
 	md_unregister_thread(&mddev->sync_thread);
+	wake_up(&resync_wait);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 		/* success...*/
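Read together, the md_set_readonly()/do_md_stop() hunks, the MD_RECOVERY_INTR conversion in md_do_sync(), and the wake_up(&resync_wait) added to md_reap_sync_thread() form a stop handshake that no longer depends on kthread_should_stop(). A condensed, illustrative sketch of that sequence as it appears in the diff above; demo_stop_sync_thread() is not a real md.c function, and the did_freeze bookkeeping and error paths are omitted.

/* Condensed from the md_set_readonly()/do_md_stop() hunks above; written
 * as if inside md.c, where resync_wait and the helpers are visible.
 * Caller holds mddev->reconfig_mutex. */
static void demo_stop_sync_thread(struct mddev *mddev)
{
	/* Stop new recovery from being scheduled. */
	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	md_wakeup_thread(mddev->thread);

	if (mddev->sync_thread) {
		/* Ask md_do_sync() to bail out at its next INTR check ... */
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		/* ... and kick it in case it is sleeping on a metadata
		 * update that will now never happen. */
		wake_up_process(mddev->sync_thread->tsk);
	}

	/* md_reap_sync_thread() now does wake_up(&resync_wait) right after
	 * md_unregister_thread(), so this wait terminates. */
	mddev_unlock(mddev);
	wait_event(resync_wait, mddev->sync_thread == NULL);
	mddev_lock_nointr(mddev);
}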