author      Linus Torvalds <torvalds@linux-foundation.org>    2013-11-20 16:05:25 -0500
committer   Linus Torvalds <torvalds@linux-foundation.org>    2013-11-20 16:05:25 -0500
commit      6d6e352c80f22c446d933ca8103e02bac1f09129
tree        248a6a7ebc5ea95986da5bccdd6d75b255cf28e4
parent      b4789b8e6be3151a955ade74872822f30e8cd914
parent      60aaf933854511630e16be4efe0f96485e132de4
Merge tag 'md/3.13' of git://neil.brown.name/md
Pull md update from Neil Brown:
"Mostly optimisations and obscure bug fixes.
- raid5 gets less lock contention
- raid1 gets less contention between normal-io and resync-io during
resync"
* tag 'md/3.13' of git://neil.brown.name/md:
md/raid5: Use conf->device_lock protect changing of multi-thread resources.
md/raid5: Before freeing old multi-thread worker, it should flush them.
md/raid5: For stripe with R5_ReadNoMerge, we replace REQ_FLUSH with REQ_NOMERGE.
UAPI: include <asm/byteorder.h> in linux/raid/md_p.h
raid1: Rewrite the implementation of iobarrier.
raid1: Add some macros to make code clearly.
raid1: Replace raise_barrier/lower_barrier with freeze_array/unfreeze_array when reconfiguring the array.
raid1: Add a field array_frozen to indicate whether raid in freeze state.
md: Convert use of typedef ctl_table to struct ctl_table
md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes.
md: use MD_RECOVERY_INTR instead of kthread_should_stop in resync thread.
md: fix some places where mddev_lock return value is not checked.
raid5: Retry R5_ReadNoMerge flag when hit a read error.
raid5: relieve lock contention in get_active_stripe()
raid5: relieve lock contention in get_active_stripe()
wait: add wait_event_cmd()
md/raid5.c: add proper locking to error path of raid5_start_reshape.
md: fix calculation of stacking limits on level change.
raid5: Use slow_path to release stripe when mddev->thread is null
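
For the "raid5 gets less lock contention" item above, the raid5.c hunks in the diff below split the single inactive list guarded by conf->device_lock into per-bucket lists, each protected by its own hash lock (see stripe_hash_locks_hash() and lock_device_hash_lock() in the diff). What follows is only a minimal userspace sketch of that locking shape, not kernel code: the bucket count and shift are assumed placeholders for NR_STRIPE_HASH_LOCKS and STRIPE_SHIFT from raid5.h (whose hunk is not shown here), and get_stripe()/touch_global_state() are invented stand-ins.

/*
 * Illustration only, in plain userspace C, of the per-bucket locking shape
 * the raid5.c hunks below introduce.  NR_BUCKETS and SECTOR_SHIFT are
 * assumed placeholder values for NR_STRIPE_HASH_LOCKS / STRIPE_SHIFT; the
 * functions are simplified stand-ins, not the kernel implementation.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NR_BUCKETS   8		/* assumed; mirrors NR_STRIPE_HASH_LOCKS */
#define BUCKET_MASK  (NR_BUCKETS - 1)
#define SECTOR_SHIFT 3		/* assumed placeholder for STRIPE_SHIFT */

static pthread_mutex_t hash_locks[NR_BUCKETS];	/* one lock per inactive list */
static pthread_mutex_t device_lock;		/* still guards global state  */

static int stripe_hash(uint64_t sector)
{
	return (sector >> SECTOR_SHIFT) & BUCKET_MASK;
}

/* Hot path: stripes that hash to different buckets no longer contend. */
static void get_stripe(uint64_t sector)
{
	int hash = stripe_hash(sector);

	pthread_mutex_lock(&hash_locks[hash]);
	/* look up or allocate the stripe under the bucket lock only */
	pthread_mutex_unlock(&hash_locks[hash]);
}

/* When shared lists must change, device_lock nests inside the bucket lock,
 * matching the ordering of lock_device_hash_lock() in the diff below. */
static void touch_global_state(uint64_t sector)
{
	int hash = stripe_hash(sector);

	pthread_mutex_lock(&hash_locks[hash]);
	pthread_mutex_lock(&device_lock);
	/* move the stripe between global lists */
	pthread_mutex_unlock(&device_lock);
	pthread_mutex_unlock(&hash_locks[hash]);
}

int main(void)
{
	int i;

	pthread_mutex_init(&device_lock, NULL);
	for (i = 0; i < NR_BUCKETS; i++)
		pthread_mutex_init(&hash_locks[i], NULL);

	get_stripe(4096);
	touch_global_state(4096);
	printf("sector 4096 maps to bucket %d\n", stripe_hash(4096));
	return 0;
}

Two writers touching stripes in different buckets now take different locks on the hot path, which is where the contention reduction comes from; the global device_lock is only taken, nested inside a bucket lock, when shared structures change.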
-rw-r--r--  drivers/md/md.c                 | 133
-rw-r--r--  drivers/md/raid1.c              | 162
-rw-r--r--  drivers/md/raid1.h              |  15
-rw-r--r--  drivers/md/raid10.c             |   6
-rw-r--r--  drivers/md/raid5.c              | 420
-rw-r--r--  drivers/md/raid5.h              |  16
-rw-r--r--  include/linux/wait.h            |  25
-rw-r--r--  include/uapi/linux/raid/md_p.h  |   1
8 files changed, 592 insertions, 186 deletions
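
Likewise, for "raid1 gets less contention between normal-io and resync-io during resync": the raid1.c and raid1.h hunks below replace the all-or-nothing barrier with a sliding window keyed off next_resync, so a write only has to wait when it overlaps the region resync is about to handle (see need_to_wait_for_sync() and the new start_next_window bookkeeping). Below is a rough userspace walk-through of that gating test with the window constants copied from the patch; write_must_wait() and its arguments are simplified stand-ins for the kernel code and assume a resync is currently active.

/*
 * Userspace walk-through of the raid1 write-vs-resync gating introduced by
 * the iobarrier rewrite in this pull (compare need_to_wait_for_sync() in the
 * raid1.c diff).  Constants are taken from the patch; the function itself is
 * a simplified stand-in, not kernel code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RESYNC_BLOCK_SIZE	(64 * 1024)
#define RESYNC_DEPTH		32
#define RESYNC_SECTORS		(RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_WINDOW		(RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS	(RESYNC_WINDOW >> 9)
#define NEXT_NORMALIO_DISTANCE	(3 * RESYNC_WINDOW_SECTORS)

/* A write covering sectors [start, end) must wait only while it overlaps
 * the region the resync thread is about to touch. */
static bool write_must_wait(uint64_t next_resync, uint64_t start, uint64_t end)
{
	if (next_resync < RESYNC_WINDOW_SECTORS)
		return true;				/* resync still near the start    */
	if (next_resync - RESYNC_WINDOW_SECTORS >= end)
		return false;				/* write entirely behind resync   */
	if (next_resync + NEXT_NORMALIO_DISTANCE <= start)
		return false;				/* write far ahead of resync      */
	return true;					/* write inside the active window */
}

int main(void)
{
	uint64_t next_resync = 100000;

	printf("behind resync : %d\n", write_must_wait(next_resync, 1000, 1008));
	printf("inside window : %d\n", write_must_wait(next_resync, 100200, 100208));
	printf("far ahead     : %d\n", write_must_wait(next_resync, 200000, 200008));
	return 0;
}

With next_resync at sector 100000 this prints 0, 1, 0: only the write that lands inside the active resync window has to wait, instead of every write blocking whenever the barrier is raised.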
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8766eabb0014..b6b7a2866c9e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev) | |||
112 | 112 | ||
113 | static struct ctl_table_header *raid_table_header; | 113 | static struct ctl_table_header *raid_table_header; |
114 | 114 | ||
115 | static ctl_table raid_table[] = { | 115 | static struct ctl_table raid_table[] = { |
116 | { | 116 | { |
117 | .procname = "speed_limit_min", | 117 | .procname = "speed_limit_min", |
118 | .data = &sysctl_speed_limit_min, | 118 | .data = &sysctl_speed_limit_min, |
@@ -130,7 +130,7 @@ static ctl_table raid_table[] = { | |||
130 | { } | 130 | { } |
131 | }; | 131 | }; |
132 | 132 | ||
133 | static ctl_table raid_dir_table[] = { | 133 | static struct ctl_table raid_dir_table[] = { |
134 | { | 134 | { |
135 | .procname = "raid", | 135 | .procname = "raid", |
136 | .maxlen = 0, | 136 | .maxlen = 0, |
@@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = { | |||
140 | { } | 140 | { } |
141 | }; | 141 | }; |
142 | 142 | ||
143 | static ctl_table raid_root_table[] = { | 143 | static struct ctl_table raid_root_table[] = { |
144 | { | 144 | { |
145 | .procname = "dev", | 145 | .procname = "dev", |
146 | .maxlen = 0, | 146 | .maxlen = 0, |
@@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit) | |||
562 | goto retry; | 562 | goto retry; |
563 | } | 563 | } |
564 | 564 | ||
565 | static inline int mddev_lock(struct mddev * mddev) | 565 | static inline int __must_check mddev_lock(struct mddev * mddev) |
566 | { | 566 | { |
567 | return mutex_lock_interruptible(&mddev->reconfig_mutex); | 567 | return mutex_lock_interruptible(&mddev->reconfig_mutex); |
568 | } | 568 | } |
569 | 569 | ||
570 | /* Sometimes we need to take the lock in a situation where | ||
571 | * failure due to interrupts is not acceptable. | ||
572 | */ | ||
573 | static inline void mddev_lock_nointr(struct mddev * mddev) | ||
574 | { | ||
575 | mutex_lock(&mddev->reconfig_mutex); | ||
576 | } | ||
577 | |||
570 | static inline int mddev_is_locked(struct mddev *mddev) | 578 | static inline int mddev_is_locked(struct mddev *mddev) |
571 | { | 579 | { |
572 | return mutex_is_locked(&mddev->reconfig_mutex); | 580 | return mutex_is_locked(&mddev->reconfig_mutex); |
@@ -2978,7 +2986,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2978 | for_each_mddev(mddev, tmp) { | 2986 | for_each_mddev(mddev, tmp) { |
2979 | struct md_rdev *rdev2; | 2987 | struct md_rdev *rdev2; |
2980 | 2988 | ||
2981 | mddev_lock(mddev); | 2989 | mddev_lock_nointr(mddev); |
2982 | rdev_for_each(rdev2, mddev) | 2990 | rdev_for_each(rdev2, mddev) |
2983 | if (rdev->bdev == rdev2->bdev && | 2991 | if (rdev->bdev == rdev2->bdev && |
2984 | rdev != rdev2 && | 2992 | rdev != rdev2 && |
@@ -2994,7 +3002,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2994 | break; | 3002 | break; |
2995 | } | 3003 | } |
2996 | } | 3004 | } |
2997 | mddev_lock(my_mddev); | 3005 | mddev_lock_nointr(my_mddev); |
2998 | if (overlap) { | 3006 | if (overlap) { |
2999 | /* Someone else could have slipped in a size | 3007 | /* Someone else could have slipped in a size |
3000 | * change here, but doing so is just silly. | 3008 | * change here, but doing so is just silly. |
@@ -3580,6 +3588,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
3580 | mddev->in_sync = 1; | 3588 | mddev->in_sync = 1; |
3581 | del_timer_sync(&mddev->safemode_timer); | 3589 | del_timer_sync(&mddev->safemode_timer); |
3582 | } | 3590 | } |
3591 | blk_set_stacking_limits(&mddev->queue->limits); | ||
3583 | pers->run(mddev); | 3592 | pers->run(mddev); |
3584 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 3593 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
3585 | mddev_resume(mddev); | 3594 | mddev_resume(mddev); |
@@ -5258,7 +5267,7 @@ static void __md_stop_writes(struct mddev *mddev) | |||
5258 | 5267 | ||
5259 | void md_stop_writes(struct mddev *mddev) | 5268 | void md_stop_writes(struct mddev *mddev) |
5260 | { | 5269 | { |
5261 | mddev_lock(mddev); | 5270 | mddev_lock_nointr(mddev); |
5262 | __md_stop_writes(mddev); | 5271 | __md_stop_writes(mddev); |
5263 | mddev_unlock(mddev); | 5272 | mddev_unlock(mddev); |
5264 | } | 5273 | } |
@@ -5291,20 +5300,35 @@ EXPORT_SYMBOL_GPL(md_stop); | |||
5291 | static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) | 5300 | static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) |
5292 | { | 5301 | { |
5293 | int err = 0; | 5302 | int err = 0; |
5303 | int did_freeze = 0; | ||
5304 | |||
5305 | if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { | ||
5306 | did_freeze = 1; | ||
5307 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | ||
5308 | md_wakeup_thread(mddev->thread); | ||
5309 | } | ||
5310 | if (mddev->sync_thread) { | ||
5311 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
5312 | /* Thread might be blocked waiting for metadata update | ||
5313 | * which will now never happen */ | ||
5314 | wake_up_process(mddev->sync_thread->tsk); | ||
5315 | } | ||
5316 | mddev_unlock(mddev); | ||
5317 | wait_event(resync_wait, mddev->sync_thread == NULL); | ||
5318 | mddev_lock_nointr(mddev); | ||
5319 | |||
5294 | mutex_lock(&mddev->open_mutex); | 5320 | mutex_lock(&mddev->open_mutex); |
5295 | if (atomic_read(&mddev->openers) > !!bdev) { | 5321 | if (atomic_read(&mddev->openers) > !!bdev || |
5322 | mddev->sync_thread || | ||
5323 | (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { | ||
5296 | printk("md: %s still in use.\n",mdname(mddev)); | 5324 | printk("md: %s still in use.\n",mdname(mddev)); |
5325 | if (did_freeze) { | ||
5326 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | ||
5327 | md_wakeup_thread(mddev->thread); | ||
5328 | } | ||
5297 | err = -EBUSY; | 5329 | err = -EBUSY; |
5298 | goto out; | 5330 | goto out; |
5299 | } | 5331 | } |
5300 | if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { | ||
5301 | /* Someone opened the device since we flushed it | ||
5302 | * so page cache could be dirty and it is too late | ||
5303 | * to flush. So abort | ||
5304 | */ | ||
5305 | mutex_unlock(&mddev->open_mutex); | ||
5306 | return -EBUSY; | ||
5307 | } | ||
5308 | if (mddev->pers) { | 5332 | if (mddev->pers) { |
5309 | __md_stop_writes(mddev); | 5333 | __md_stop_writes(mddev); |
5310 | 5334 | ||
@@ -5315,7 +5339,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) | |||
5315 | set_disk_ro(mddev->gendisk, 1); | 5339 | set_disk_ro(mddev->gendisk, 1); |
5316 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 5340 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
5317 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 5341 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
5318 | err = 0; | 5342 | err = 0; |
5319 | } | 5343 | } |
5320 | out: | 5344 | out: |
5321 | mutex_unlock(&mddev->open_mutex); | 5345 | mutex_unlock(&mddev->open_mutex); |
@@ -5331,20 +5355,34 @@ static int do_md_stop(struct mddev * mddev, int mode, | |||
5331 | { | 5355 | { |
5332 | struct gendisk *disk = mddev->gendisk; | 5356 | struct gendisk *disk = mddev->gendisk; |
5333 | struct md_rdev *rdev; | 5357 | struct md_rdev *rdev; |
5358 | int did_freeze = 0; | ||
5359 | |||
5360 | if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { | ||
5361 | did_freeze = 1; | ||
5362 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | ||
5363 | md_wakeup_thread(mddev->thread); | ||
5364 | } | ||
5365 | if (mddev->sync_thread) { | ||
5366 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
5367 | /* Thread might be blocked waiting for metadata update | ||
5368 | * which will now never happen */ | ||
5369 | wake_up_process(mddev->sync_thread->tsk); | ||
5370 | } | ||
5371 | mddev_unlock(mddev); | ||
5372 | wait_event(resync_wait, mddev->sync_thread == NULL); | ||
5373 | mddev_lock_nointr(mddev); | ||
5334 | 5374 | ||
5335 | mutex_lock(&mddev->open_mutex); | 5375 | mutex_lock(&mddev->open_mutex); |
5336 | if (atomic_read(&mddev->openers) > !!bdev || | 5376 | if (atomic_read(&mddev->openers) > !!bdev || |
5337 | mddev->sysfs_active) { | 5377 | mddev->sysfs_active || |
5378 | mddev->sync_thread || | ||
5379 | (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { | ||
5338 | printk("md: %s still in use.\n",mdname(mddev)); | 5380 | printk("md: %s still in use.\n",mdname(mddev)); |
5339 | mutex_unlock(&mddev->open_mutex); | 5381 | mutex_unlock(&mddev->open_mutex); |
5340 | return -EBUSY; | 5382 | if (did_freeze) { |
5341 | } | 5383 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
5342 | if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { | 5384 | md_wakeup_thread(mddev->thread); |
5343 | /* Someone opened the device since we flushed it | 5385 | } |
5344 | * so page cache could be dirty and it is too late | ||
5345 | * to flush. So abort | ||
5346 | */ | ||
5347 | mutex_unlock(&mddev->open_mutex); | ||
5348 | return -EBUSY; | 5386 | return -EBUSY; |
5349 | } | 5387 | } |
5350 | if (mddev->pers) { | 5388 | if (mddev->pers) { |
@@ -6551,7 +6589,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
6551 | wait_event(mddev->sb_wait, | 6589 | wait_event(mddev->sb_wait, |
6552 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) && | 6590 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) && |
6553 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | 6591 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); |
6554 | mddev_lock(mddev); | 6592 | mddev_lock_nointr(mddev); |
6555 | } | 6593 | } |
6556 | } else { | 6594 | } else { |
6557 | err = -EROFS; | 6595 | err = -EROFS; |
@@ -7361,9 +7399,6 @@ void md_do_sync(struct md_thread *thread) | |||
7361 | mddev->curr_resync = 2; | 7399 | mddev->curr_resync = 2; |
7362 | 7400 | ||
7363 | try_again: | 7401 | try_again: |
7364 | if (kthread_should_stop()) | ||
7365 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
7366 | |||
7367 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | 7402 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
7368 | goto skip; | 7403 | goto skip; |
7369 | for_each_mddev(mddev2, tmp) { | 7404 | for_each_mddev(mddev2, tmp) { |
@@ -7388,7 +7423,7 @@ void md_do_sync(struct md_thread *thread) | |||
7388 | * be caught by 'softlockup' | 7423 | * be caught by 'softlockup' |
7389 | */ | 7424 | */ |
7390 | prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); | 7425 | prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); |
7391 | if (!kthread_should_stop() && | 7426 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && |
7392 | mddev2->curr_resync >= mddev->curr_resync) { | 7427 | mddev2->curr_resync >= mddev->curr_resync) { |
7393 | printk(KERN_INFO "md: delaying %s of %s" | 7428 | printk(KERN_INFO "md: delaying %s of %s" |
7394 | " until %s has finished (they" | 7429 | " until %s has finished (they" |
@@ -7464,7 +7499,7 @@ void md_do_sync(struct md_thread *thread) | |||
7464 | last_check = 0; | 7499 | last_check = 0; |
7465 | 7500 | ||
7466 | if (j>2) { | 7501 | if (j>2) { |
7467 | printk(KERN_INFO | 7502 | printk(KERN_INFO |
7468 | "md: resuming %s of %s from checkpoint.\n", | 7503 | "md: resuming %s of %s from checkpoint.\n", |
7469 | desc, mdname(mddev)); | 7504 | desc, mdname(mddev)); |
7470 | mddev->curr_resync = j; | 7505 | mddev->curr_resync = j; |
@@ -7501,7 +7536,8 @@ void md_do_sync(struct md_thread *thread) | |||
7501 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 7536 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
7502 | } | 7537 | } |
7503 | 7538 | ||
7504 | while (j >= mddev->resync_max && !kthread_should_stop()) { | 7539 | while (j >= mddev->resync_max && |
7540 | !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | ||
7505 | /* As this condition is controlled by user-space, | 7541 | /* As this condition is controlled by user-space, |
7506 | * we can block indefinitely, so use '_interruptible' | 7542 | * we can block indefinitely, so use '_interruptible' |
7507 | * to avoid triggering warnings. | 7543 | * to avoid triggering warnings. |
@@ -7509,17 +7545,18 @@ void md_do_sync(struct md_thread *thread) | |||
7509 | flush_signals(current); /* just in case */ | 7545 | flush_signals(current); /* just in case */ |
7510 | wait_event_interruptible(mddev->recovery_wait, | 7546 | wait_event_interruptible(mddev->recovery_wait, |
7511 | mddev->resync_max > j | 7547 | mddev->resync_max > j |
7512 | || kthread_should_stop()); | 7548 | || test_bit(MD_RECOVERY_INTR, |
7549 | &mddev->recovery)); | ||
7513 | } | 7550 | } |
7514 | 7551 | ||
7515 | if (kthread_should_stop()) | 7552 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
7516 | goto interrupted; | 7553 | break; |
7517 | 7554 | ||
7518 | sectors = mddev->pers->sync_request(mddev, j, &skipped, | 7555 | sectors = mddev->pers->sync_request(mddev, j, &skipped, |
7519 | currspeed < speed_min(mddev)); | 7556 | currspeed < speed_min(mddev)); |
7520 | if (sectors == 0) { | 7557 | if (sectors == 0) { |
7521 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 7558 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
7522 | goto out; | 7559 | break; |
7523 | } | 7560 | } |
7524 | 7561 | ||
7525 | if (!skipped) { /* actual IO requested */ | 7562 | if (!skipped) { /* actual IO requested */ |
@@ -7556,10 +7593,8 @@ void md_do_sync(struct md_thread *thread) | |||
7556 | last_mark = next; | 7593 | last_mark = next; |
7557 | } | 7594 | } |
7558 | 7595 | ||
7559 | 7596 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | |
7560 | if (kthread_should_stop()) | 7597 | break; |
7561 | goto interrupted; | ||
7562 | |||
7563 | 7598 | ||
7564 | /* | 7599 | /* |
7565 | * this loop exits only if either when we are slower than | 7600 | * this loop exits only if either when we are slower than |
@@ -7582,11 +7617,12 @@ void md_do_sync(struct md_thread *thread) | |||
7582 | } | 7617 | } |
7583 | } | 7618 | } |
7584 | } | 7619 | } |
7585 | printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); | 7620 | printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, |
7621 | test_bit(MD_RECOVERY_INTR, &mddev->recovery) | ||
7622 | ? "interrupted" : "done"); | ||
7586 | /* | 7623 | /* |
7587 | * this also signals 'finished resyncing' to md_stop | 7624 | * this also signals 'finished resyncing' to md_stop |
7588 | */ | 7625 | */ |
7589 | out: | ||
7590 | blk_finish_plug(&plug); | 7626 | blk_finish_plug(&plug); |
7591 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); | 7627 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); |
7592 | 7628 | ||
@@ -7640,16 +7676,6 @@ void md_do_sync(struct md_thread *thread) | |||
7640 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); | 7676 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); |
7641 | md_wakeup_thread(mddev->thread); | 7677 | md_wakeup_thread(mddev->thread); |
7642 | return; | 7678 | return; |
7643 | |||
7644 | interrupted: | ||
7645 | /* | ||
7646 | * got a signal, exit. | ||
7647 | */ | ||
7648 | printk(KERN_INFO | ||
7649 | "md: md_do_sync() got signal ... exiting\n"); | ||
7650 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
7651 | goto out; | ||
7652 | |||
7653 | } | 7679 | } |
7654 | EXPORT_SYMBOL_GPL(md_do_sync); | 7680 | EXPORT_SYMBOL_GPL(md_do_sync); |
7655 | 7681 | ||
@@ -7894,6 +7920,7 @@ void md_reap_sync_thread(struct mddev *mddev) | |||
7894 | 7920 | ||
7895 | /* resync has finished, collect result */ | 7921 | /* resync has finished, collect result */ |
7896 | md_unregister_thread(&mddev->sync_thread); | 7922 | md_unregister_thread(&mddev->sync_thread); |
7923 | wake_up(&resync_wait); | ||
7897 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && | 7924 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && |
7898 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | 7925 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { |
7899 | /* success...*/ | 7926 | /* success...*/ |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index af6681b19776..1e5a540995e9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -66,7 +66,8 @@ | |||
66 | */ | 66 | */ |
67 | static int max_queued_requests = 1024; | 67 | static int max_queued_requests = 1024; |
68 | 68 | ||
69 | static void allow_barrier(struct r1conf *conf); | 69 | static void allow_barrier(struct r1conf *conf, sector_t start_next_window, |
70 | sector_t bi_sector); | ||
70 | static void lower_barrier(struct r1conf *conf); | 71 | static void lower_barrier(struct r1conf *conf); |
71 | 72 | ||
72 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) | 73 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) |
@@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data) | |||
84 | } | 85 | } |
85 | 86 | ||
86 | #define RESYNC_BLOCK_SIZE (64*1024) | 87 | #define RESYNC_BLOCK_SIZE (64*1024) |
87 | //#define RESYNC_BLOCK_SIZE PAGE_SIZE | 88 | #define RESYNC_DEPTH 32 |
88 | #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) | 89 | #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) |
89 | #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) | 90 | #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) |
90 | #define RESYNC_WINDOW (2048*1024) | 91 | #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) |
92 | #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) | ||
93 | #define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) | ||
91 | 94 | ||
92 | static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) | 95 | static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) |
93 | { | 96 | { |
@@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio) | |||
225 | struct bio *bio = r1_bio->master_bio; | 228 | struct bio *bio = r1_bio->master_bio; |
226 | int done; | 229 | int done; |
227 | struct r1conf *conf = r1_bio->mddev->private; | 230 | struct r1conf *conf = r1_bio->mddev->private; |
231 | sector_t start_next_window = r1_bio->start_next_window; | ||
232 | sector_t bi_sector = bio->bi_sector; | ||
228 | 233 | ||
229 | if (bio->bi_phys_segments) { | 234 | if (bio->bi_phys_segments) { |
230 | unsigned long flags; | 235 | unsigned long flags; |
@@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio) | |||
232 | bio->bi_phys_segments--; | 237 | bio->bi_phys_segments--; |
233 | done = (bio->bi_phys_segments == 0); | 238 | done = (bio->bi_phys_segments == 0); |
234 | spin_unlock_irqrestore(&conf->device_lock, flags); | 239 | spin_unlock_irqrestore(&conf->device_lock, flags); |
240 | /* | ||
241 | * make_request() might be waiting for | ||
242 | * bi_phys_segments to decrease | ||
243 | */ | ||
244 | wake_up(&conf->wait_barrier); | ||
235 | } else | 245 | } else |
236 | done = 1; | 246 | done = 1; |
237 | 247 | ||
@@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio) | |||
243 | * Wake up any possible resync thread that waits for the device | 253 | * Wake up any possible resync thread that waits for the device |
244 | * to go idle. | 254 | * to go idle. |
245 | */ | 255 | */ |
246 | allow_barrier(conf); | 256 | allow_barrier(conf, start_next_window, bi_sector); |
247 | } | 257 | } |
248 | } | 258 | } |
249 | 259 | ||
@@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf) | |||
814 | * there is no normal IO happeing. It must arrange to call | 824 | * there is no normal IO happeing. It must arrange to call |
815 | * lower_barrier when the particular background IO completes. | 825 | * lower_barrier when the particular background IO completes. |
816 | */ | 826 | */ |
817 | #define RESYNC_DEPTH 32 | ||
818 | |||
819 | static void raise_barrier(struct r1conf *conf) | 827 | static void raise_barrier(struct r1conf *conf) |
820 | { | 828 | { |
821 | spin_lock_irq(&conf->resync_lock); | 829 | spin_lock_irq(&conf->resync_lock); |
@@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf) | |||
827 | /* block any new IO from starting */ | 835 | /* block any new IO from starting */ |
828 | conf->barrier++; | 836 | conf->barrier++; |
829 | 837 | ||
830 | /* Now wait for all pending IO to complete */ | 838 | /* For these conditions we must wait: |
839 | * A: while the array is in frozen state | ||
840 | * B: while barrier >= RESYNC_DEPTH, meaning resync reach | ||
841 | * the max count which allowed. | ||
842 | * C: next_resync + RESYNC_SECTORS > start_next_window, meaning | ||
843 | * next resync will reach to the window which normal bios are | ||
844 | * handling. | ||
845 | */ | ||
831 | wait_event_lock_irq(conf->wait_barrier, | 846 | wait_event_lock_irq(conf->wait_barrier, |
832 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, | 847 | !conf->array_frozen && |
848 | conf->barrier < RESYNC_DEPTH && | ||
849 | (conf->start_next_window >= | ||
850 | conf->next_resync + RESYNC_SECTORS), | ||
833 | conf->resync_lock); | 851 | conf->resync_lock); |
834 | 852 | ||
835 | spin_unlock_irq(&conf->resync_lock); | 853 | spin_unlock_irq(&conf->resync_lock); |
@@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf) | |||
845 | wake_up(&conf->wait_barrier); | 863 | wake_up(&conf->wait_barrier); |
846 | } | 864 | } |
847 | 865 | ||
848 | static void wait_barrier(struct r1conf *conf) | 866 | static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) |
849 | { | 867 | { |
868 | bool wait = false; | ||
869 | |||
870 | if (conf->array_frozen || !bio) | ||
871 | wait = true; | ||
872 | else if (conf->barrier && bio_data_dir(bio) == WRITE) { | ||
873 | if (conf->next_resync < RESYNC_WINDOW_SECTORS) | ||
874 | wait = true; | ||
875 | else if ((conf->next_resync - RESYNC_WINDOW_SECTORS | ||
876 | >= bio_end_sector(bio)) || | ||
877 | (conf->next_resync + NEXT_NORMALIO_DISTANCE | ||
878 | <= bio->bi_sector)) | ||
879 | wait = false; | ||
880 | else | ||
881 | wait = true; | ||
882 | } | ||
883 | |||
884 | return wait; | ||
885 | } | ||
886 | |||
887 | static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) | ||
888 | { | ||
889 | sector_t sector = 0; | ||
890 | |||
850 | spin_lock_irq(&conf->resync_lock); | 891 | spin_lock_irq(&conf->resync_lock); |
851 | if (conf->barrier) { | 892 | if (need_to_wait_for_sync(conf, bio)) { |
852 | conf->nr_waiting++; | 893 | conf->nr_waiting++; |
853 | /* Wait for the barrier to drop. | 894 | /* Wait for the barrier to drop. |
854 | * However if there are already pending | 895 | * However if there are already pending |
@@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf) | |||
860 | * count down. | 901 | * count down. |
861 | */ | 902 | */ |
862 | wait_event_lock_irq(conf->wait_barrier, | 903 | wait_event_lock_irq(conf->wait_barrier, |
863 | !conf->barrier || | 904 | !conf->array_frozen && |
864 | (conf->nr_pending && | 905 | (!conf->barrier || |
906 | ((conf->start_next_window < | ||
907 | conf->next_resync + RESYNC_SECTORS) && | ||
865 | current->bio_list && | 908 | current->bio_list && |
866 | !bio_list_empty(current->bio_list)), | 909 | !bio_list_empty(current->bio_list))), |
867 | conf->resync_lock); | 910 | conf->resync_lock); |
868 | conf->nr_waiting--; | 911 | conf->nr_waiting--; |
869 | } | 912 | } |
913 | |||
914 | if (bio && bio_data_dir(bio) == WRITE) { | ||
915 | if (conf->next_resync + NEXT_NORMALIO_DISTANCE | ||
916 | <= bio->bi_sector) { | ||
917 | if (conf->start_next_window == MaxSector) | ||
918 | conf->start_next_window = | ||
919 | conf->next_resync + | ||
920 | NEXT_NORMALIO_DISTANCE; | ||
921 | |||
922 | if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) | ||
923 | <= bio->bi_sector) | ||
924 | conf->next_window_requests++; | ||
925 | else | ||
926 | conf->current_window_requests++; | ||
927 | } | ||
928 | if (bio->bi_sector >= conf->start_next_window) | ||
929 | sector = conf->start_next_window; | ||
930 | } | ||
931 | |||
870 | conf->nr_pending++; | 932 | conf->nr_pending++; |
871 | spin_unlock_irq(&conf->resync_lock); | 933 | spin_unlock_irq(&conf->resync_lock); |
934 | return sector; | ||
872 | } | 935 | } |
873 | 936 | ||
874 | static void allow_barrier(struct r1conf *conf) | 937 | static void allow_barrier(struct r1conf *conf, sector_t start_next_window, |
938 | sector_t bi_sector) | ||
875 | { | 939 | { |
876 | unsigned long flags; | 940 | unsigned long flags; |
941 | |||
877 | spin_lock_irqsave(&conf->resync_lock, flags); | 942 | spin_lock_irqsave(&conf->resync_lock, flags); |
878 | conf->nr_pending--; | 943 | conf->nr_pending--; |
944 | if (start_next_window) { | ||
945 | if (start_next_window == conf->start_next_window) { | ||
946 | if (conf->start_next_window + NEXT_NORMALIO_DISTANCE | ||
947 | <= bi_sector) | ||
948 | conf->next_window_requests--; | ||
949 | else | ||
950 | conf->current_window_requests--; | ||
951 | } else | ||
952 | conf->current_window_requests--; | ||
953 | |||
954 | if (!conf->current_window_requests) { | ||
955 | if (conf->next_window_requests) { | ||
956 | conf->current_window_requests = | ||
957 | conf->next_window_requests; | ||
958 | conf->next_window_requests = 0; | ||
959 | conf->start_next_window += | ||
960 | NEXT_NORMALIO_DISTANCE; | ||
961 | } else | ||
962 | conf->start_next_window = MaxSector; | ||
963 | } | ||
964 | } | ||
879 | spin_unlock_irqrestore(&conf->resync_lock, flags); | 965 | spin_unlock_irqrestore(&conf->resync_lock, flags); |
880 | wake_up(&conf->wait_barrier); | 966 | wake_up(&conf->wait_barrier); |
881 | } | 967 | } |
@@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra) | |||
884 | { | 970 | { |
885 | /* stop syncio and normal IO and wait for everything to | 971 | /* stop syncio and normal IO and wait for everything to |
886 | * go quite. | 972 | * go quite. |
887 | * We increment barrier and nr_waiting, and then | 973 | * We wait until nr_pending match nr_queued+extra |
888 | * wait until nr_pending match nr_queued+extra | ||
889 | * This is called in the context of one normal IO request | 974 | * This is called in the context of one normal IO request |
890 | * that has failed. Thus any sync request that might be pending | 975 | * that has failed. Thus any sync request that might be pending |
891 | * will be blocked by nr_pending, and we need to wait for | 976 | * will be blocked by nr_pending, and we need to wait for |
@@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra) | |||
895 | * we continue. | 980 | * we continue. |
896 | */ | 981 | */ |
897 | spin_lock_irq(&conf->resync_lock); | 982 | spin_lock_irq(&conf->resync_lock); |
898 | conf->barrier++; | 983 | conf->array_frozen = 1; |
899 | conf->nr_waiting++; | ||
900 | wait_event_lock_irq_cmd(conf->wait_barrier, | 984 | wait_event_lock_irq_cmd(conf->wait_barrier, |
901 | conf->nr_pending == conf->nr_queued+extra, | 985 | conf->nr_pending == conf->nr_queued+extra, |
902 | conf->resync_lock, | 986 | conf->resync_lock, |
@@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf) | |||
907 | { | 991 | { |
908 | /* reverse the effect of the freeze */ | 992 | /* reverse the effect of the freeze */ |
909 | spin_lock_irq(&conf->resync_lock); | 993 | spin_lock_irq(&conf->resync_lock); |
910 | conf->barrier--; | 994 | conf->array_frozen = 0; |
911 | conf->nr_waiting--; | ||
912 | wake_up(&conf->wait_barrier); | 995 | wake_up(&conf->wait_barrier); |
913 | spin_unlock_irq(&conf->resync_lock); | 996 | spin_unlock_irq(&conf->resync_lock); |
914 | } | 997 | } |
@@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1013 | int first_clone; | 1096 | int first_clone; |
1014 | int sectors_handled; | 1097 | int sectors_handled; |
1015 | int max_sectors; | 1098 | int max_sectors; |
1099 | sector_t start_next_window; | ||
1016 | 1100 | ||
1017 | /* | 1101 | /* |
1018 | * Register the new request and wait if the reconstruction | 1102 | * Register the new request and wait if the reconstruction |
@@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1042 | finish_wait(&conf->wait_barrier, &w); | 1126 | finish_wait(&conf->wait_barrier, &w); |
1043 | } | 1127 | } |
1044 | 1128 | ||
1045 | wait_barrier(conf); | 1129 | start_next_window = wait_barrier(conf, bio); |
1046 | 1130 | ||
1047 | bitmap = mddev->bitmap; | 1131 | bitmap = mddev->bitmap; |
1048 | 1132 | ||
@@ -1163,6 +1247,7 @@ read_again: | |||
1163 | 1247 | ||
1164 | disks = conf->raid_disks * 2; | 1248 | disks = conf->raid_disks * 2; |
1165 | retry_write: | 1249 | retry_write: |
1250 | r1_bio->start_next_window = start_next_window; | ||
1166 | blocked_rdev = NULL; | 1251 | blocked_rdev = NULL; |
1167 | rcu_read_lock(); | 1252 | rcu_read_lock(); |
1168 | max_sectors = r1_bio->sectors; | 1253 | max_sectors = r1_bio->sectors; |
@@ -1231,14 +1316,24 @@ read_again: | |||
1231 | if (unlikely(blocked_rdev)) { | 1316 | if (unlikely(blocked_rdev)) { |
1232 | /* Wait for this device to become unblocked */ | 1317 | /* Wait for this device to become unblocked */ |
1233 | int j; | 1318 | int j; |
1319 | sector_t old = start_next_window; | ||
1234 | 1320 | ||
1235 | for (j = 0; j < i; j++) | 1321 | for (j = 0; j < i; j++) |
1236 | if (r1_bio->bios[j]) | 1322 | if (r1_bio->bios[j]) |
1237 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); | 1323 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); |
1238 | r1_bio->state = 0; | 1324 | r1_bio->state = 0; |
1239 | allow_barrier(conf); | 1325 | allow_barrier(conf, start_next_window, bio->bi_sector); |
1240 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | 1326 | md_wait_for_blocked_rdev(blocked_rdev, mddev); |
1241 | wait_barrier(conf); | 1327 | start_next_window = wait_barrier(conf, bio); |
1328 | /* | ||
1329 | * We must make sure the multi r1bios of bio have | ||
1330 | * the same value of bi_phys_segments | ||
1331 | */ | ||
1332 | if (bio->bi_phys_segments && old && | ||
1333 | old != start_next_window) | ||
1334 | /* Wait for the former r1bio(s) to complete */ | ||
1335 | wait_event(conf->wait_barrier, | ||
1336 | bio->bi_phys_segments == 1); | ||
1242 | goto retry_write; | 1337 | goto retry_write; |
1243 | } | 1338 | } |
1244 | 1339 | ||
@@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf) | |||
1438 | 1533 | ||
1439 | static void close_sync(struct r1conf *conf) | 1534 | static void close_sync(struct r1conf *conf) |
1440 | { | 1535 | { |
1441 | wait_barrier(conf); | 1536 | wait_barrier(conf, NULL); |
1442 | allow_barrier(conf); | 1537 | allow_barrier(conf, 0, 0); |
1443 | 1538 | ||
1444 | mempool_destroy(conf->r1buf_pool); | 1539 | mempool_destroy(conf->r1buf_pool); |
1445 | conf->r1buf_pool = NULL; | 1540 | conf->r1buf_pool = NULL; |
1541 | |||
1542 | conf->next_resync = 0; | ||
1543 | conf->start_next_window = MaxSector; | ||
1446 | } | 1544 | } |
1447 | 1545 | ||
1448 | static int raid1_spare_active(struct mddev *mddev) | 1546 | static int raid1_spare_active(struct mddev *mddev) |
@@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2714 | conf->pending_count = 0; | 2812 | conf->pending_count = 0; |
2715 | conf->recovery_disabled = mddev->recovery_disabled - 1; | 2813 | conf->recovery_disabled = mddev->recovery_disabled - 1; |
2716 | 2814 | ||
2815 | conf->start_next_window = MaxSector; | ||
2816 | conf->current_window_requests = conf->next_window_requests = 0; | ||
2817 | |||
2717 | err = -EIO; | 2818 | err = -EIO; |
2718 | for (i = 0; i < conf->raid_disks * 2; i++) { | 2819 | for (i = 0; i < conf->raid_disks * 2; i++) { |
2719 | 2820 | ||
@@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev) | |||
2871 | atomic_read(&bitmap->behind_writes) == 0); | 2972 | atomic_read(&bitmap->behind_writes) == 0); |
2872 | } | 2973 | } |
2873 | 2974 | ||
2874 | raise_barrier(conf); | 2975 | freeze_array(conf, 0); |
2875 | lower_barrier(conf); | 2976 | unfreeze_array(conf); |
2876 | 2977 | ||
2877 | md_unregister_thread(&mddev->thread); | 2978 | md_unregister_thread(&mddev->thread); |
2878 | if (conf->r1bio_pool) | 2979 | if (conf->r1bio_pool) |
@@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state) | |||
3031 | wake_up(&conf->wait_barrier); | 3132 | wake_up(&conf->wait_barrier); |
3032 | break; | 3133 | break; |
3033 | case 1: | 3134 | case 1: |
3034 | raise_barrier(conf); | 3135 | freeze_array(conf, 0); |
3035 | break; | 3136 | break; |
3036 | case 0: | 3137 | case 0: |
3037 | lower_barrier(conf); | 3138 | unfreeze_array(conf); |
3038 | break; | 3139 | break; |
3039 | } | 3140 | } |
3040 | } | 3141 | } |
@@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev) | |||
3051 | mddev->new_chunk_sectors = 0; | 3152 | mddev->new_chunk_sectors = 0; |
3052 | conf = setup_conf(mddev); | 3153 | conf = setup_conf(mddev); |
3053 | if (!IS_ERR(conf)) | 3154 | if (!IS_ERR(conf)) |
3054 | conf->barrier = 1; | 3155 | /* Array must appear to be quiesced */ |
3156 | conf->array_frozen = 1; | ||
3055 | return conf; | 3157 | return conf; |
3056 | } | 3158 | } |
3057 | return ERR_PTR(-EINVAL); | 3159 | return ERR_PTR(-EINVAL); |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 0ff3715fb7eb..9bebca7bff2f 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -41,6 +41,19 @@ struct r1conf { | |||
41 | */ | 41 | */ |
42 | sector_t next_resync; | 42 | sector_t next_resync; |
43 | 43 | ||
44 | /* When raid1 starts resync, we divide array into four partitions | ||
45 | * |---------|--------------|---------------------|-------------| | ||
46 | * next_resync start_next_window end_window | ||
47 | * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE | ||
48 | * end_window = start_next_window + NEXT_NORMALIO_DISTANCE | ||
49 | * current_window_requests means the count of normalIO between | ||
50 | * start_next_window and end_window. | ||
51 | * next_window_requests means the count of normalIO after end_window. | ||
52 | * */ | ||
53 | sector_t start_next_window; | ||
54 | int current_window_requests; | ||
55 | int next_window_requests; | ||
56 | |||
44 | spinlock_t device_lock; | 57 | spinlock_t device_lock; |
45 | 58 | ||
46 | /* list of 'struct r1bio' that need to be processed by raid1d, | 59 | /* list of 'struct r1bio' that need to be processed by raid1d, |
@@ -65,6 +78,7 @@ struct r1conf { | |||
65 | int nr_waiting; | 78 | int nr_waiting; |
66 | int nr_queued; | 79 | int nr_queued; |
67 | int barrier; | 80 | int barrier; |
81 | int array_frozen; | ||
68 | 82 | ||
69 | /* Set to 1 if a full sync is needed, (fresh device added). | 83 | /* Set to 1 if a full sync is needed, (fresh device added). |
70 | * Cleared when a sync completes. | 84 | * Cleared when a sync completes. |
@@ -111,6 +125,7 @@ struct r1bio { | |||
111 | * in this BehindIO request | 125 | * in this BehindIO request |
112 | */ | 126 | */ |
113 | sector_t sector; | 127 | sector_t sector; |
128 | sector_t start_next_window; | ||
114 | int sectors; | 129 | int sectors; |
115 | unsigned long state; | 130 | unsigned long state; |
116 | struct mddev *mddev; | 131 | struct mddev *mddev; |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7c3508abb5e1..c504e8389e69 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4384,7 +4384,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, | |||
4384 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4384 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
4385 | md_wakeup_thread(mddev->thread); | 4385 | md_wakeup_thread(mddev->thread); |
4386 | wait_event(mddev->sb_wait, mddev->flags == 0 || | 4386 | wait_event(mddev->sb_wait, mddev->flags == 0 || |
4387 | kthread_should_stop()); | 4387 | test_bit(MD_RECOVERY_INTR, &mddev->recovery)); |
4388 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | ||
4389 | allow_barrier(conf); | ||
4390 | return sectors_done; | ||
4391 | } | ||
4388 | conf->reshape_safe = mddev->reshape_position; | 4392 | conf->reshape_safe = mddev->reshape_position; |
4389 | allow_barrier(conf); | 4393 | allow_barrier(conf); |
4390 | } | 4394 | } |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7f0e17a27aeb..47da0af6322b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) | |||
85 | return &conf->stripe_hashtbl[hash]; | 85 | return &conf->stripe_hashtbl[hash]; |
86 | } | 86 | } |
87 | 87 | ||
88 | static inline int stripe_hash_locks_hash(sector_t sect) | ||
89 | { | ||
90 | return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; | ||
91 | } | ||
92 | |||
93 | static inline void lock_device_hash_lock(struct r5conf *conf, int hash) | ||
94 | { | ||
95 | spin_lock_irq(conf->hash_locks + hash); | ||
96 | spin_lock(&conf->device_lock); | ||
97 | } | ||
98 | |||
99 | static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) | ||
100 | { | ||
101 | spin_unlock(&conf->device_lock); | ||
102 | spin_unlock_irq(conf->hash_locks + hash); | ||
103 | } | ||
104 | |||
105 | static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) | ||
106 | { | ||
107 | int i; | ||
108 | local_irq_disable(); | ||
109 | spin_lock(conf->hash_locks); | ||
110 | for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) | ||
111 | spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); | ||
112 | spin_lock(&conf->device_lock); | ||
113 | } | ||
114 | |||
115 | static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) | ||
116 | { | ||
117 | int i; | ||
118 | spin_unlock(&conf->device_lock); | ||
119 | for (i = NR_STRIPE_HASH_LOCKS; i; i--) | ||
120 | spin_unlock(conf->hash_locks + i - 1); | ||
121 | local_irq_enable(); | ||
122 | } | ||
123 | |||
88 | /* bio's attached to a stripe+device for I/O are linked together in bi_sector | 124 | /* bio's attached to a stripe+device for I/O are linked together in bi_sector |
89 | * order without overlap. There may be several bio's per stripe+device, and | 125 | * order without overlap. There may be several bio's per stripe+device, and |
90 | * a bio could span several devices. | 126 | * a bio could span several devices. |
@@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) | |||
249 | } | 285 | } |
250 | } | 286 | } |
251 | 287 | ||
252 | static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) | 288 | static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, |
289 | struct list_head *temp_inactive_list) | ||
253 | { | 290 | { |
254 | BUG_ON(!list_empty(&sh->lru)); | 291 | BUG_ON(!list_empty(&sh->lru)); |
255 | BUG_ON(atomic_read(&conf->active_stripes)==0); | 292 | BUG_ON(atomic_read(&conf->active_stripes)==0); |
@@ -278,23 +315,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) | |||
278 | < IO_THRESHOLD) | 315 | < IO_THRESHOLD) |
279 | md_wakeup_thread(conf->mddev->thread); | 316 | md_wakeup_thread(conf->mddev->thread); |
280 | atomic_dec(&conf->active_stripes); | 317 | atomic_dec(&conf->active_stripes); |
281 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) { | 318 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) |
282 | list_add_tail(&sh->lru, &conf->inactive_list); | 319 | list_add_tail(&sh->lru, temp_inactive_list); |
283 | wake_up(&conf->wait_for_stripe); | ||
284 | if (conf->retry_read_aligned) | ||
285 | md_wakeup_thread(conf->mddev->thread); | ||
286 | } | ||
287 | } | 320 | } |
288 | } | 321 | } |
289 | 322 | ||
290 | static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) | 323 | static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, |
324 | struct list_head *temp_inactive_list) | ||
291 | { | 325 | { |
292 | if (atomic_dec_and_test(&sh->count)) | 326 | if (atomic_dec_and_test(&sh->count)) |
293 | do_release_stripe(conf, sh); | 327 | do_release_stripe(conf, sh, temp_inactive_list); |
328 | } | ||
329 | |||
330 | /* | ||
331 | * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list | ||
332 | * | ||
333 | * Be careful: Only one task can add/delete stripes from temp_inactive_list at | ||
334 | * given time. Adding stripes only takes device lock, while deleting stripes | ||
335 | * only takes hash lock. | ||
336 | */ | ||
337 | static void release_inactive_stripe_list(struct r5conf *conf, | ||
338 | struct list_head *temp_inactive_list, | ||
339 | int hash) | ||
340 | { | ||
341 | int size; | ||
342 | bool do_wakeup = false; | ||
343 | unsigned long flags; | ||
344 | |||
345 | if (hash == NR_STRIPE_HASH_LOCKS) { | ||
346 | size = NR_STRIPE_HASH_LOCKS; | ||
347 | hash = NR_STRIPE_HASH_LOCKS - 1; | ||
348 | } else | ||
349 | size = 1; | ||
350 | while (size) { | ||
351 | struct list_head *list = &temp_inactive_list[size - 1]; | ||
352 | |||
353 | /* | ||
354 | * We don't hold any lock here yet, get_active_stripe() might | ||
355 | * remove stripes from the list | ||
356 | */ | ||
357 | if (!list_empty_careful(list)) { | ||
358 | spin_lock_irqsave(conf->hash_locks + hash, flags); | ||
359 | if (list_empty(conf->inactive_list + hash) && | ||
360 | !list_empty(list)) | ||
361 | atomic_dec(&conf->empty_inactive_list_nr); | ||
362 | list_splice_tail_init(list, conf->inactive_list + hash); | ||
363 | do_wakeup = true; | ||
364 | spin_unlock_irqrestore(conf->hash_locks + hash, flags); | ||
365 | } | ||
366 | size--; | ||
367 | hash--; | ||
368 | } | ||
369 | |||
370 | if (do_wakeup) { | ||
371 | wake_up(&conf->wait_for_stripe); | ||
372 | if (conf->retry_read_aligned) | ||
373 | md_wakeup_thread(conf->mddev->thread); | ||
374 | } | ||
294 | } | 375 | } |
295 | 376 | ||
296 | /* should hold conf->device_lock already */ | 377 | /* should hold conf->device_lock already */ |
297 | static int release_stripe_list(struct r5conf *conf) | 378 | static int release_stripe_list(struct r5conf *conf, |
379 | struct list_head *temp_inactive_list) | ||
298 | { | 380 | { |
299 | struct stripe_head *sh; | 381 | struct stripe_head *sh; |
300 | int count = 0; | 382 | int count = 0; |
@@ -303,6 +385,8 @@ static int release_stripe_list(struct r5conf *conf) | |||
303 | head = llist_del_all(&conf->released_stripes); | 385 | head = llist_del_all(&conf->released_stripes); |
304 | head = llist_reverse_order(head); | 386 | head = llist_reverse_order(head); |
305 | while (head) { | 387 | while (head) { |
388 | int hash; | ||
389 | |||
306 | sh = llist_entry(head, struct stripe_head, release_list); | 390 | sh = llist_entry(head, struct stripe_head, release_list); |
307 | head = llist_next(head); | 391 | head = llist_next(head); |
308 | /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ | 392 | /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ |
@@ -313,7 +397,8 @@ static int release_stripe_list(struct r5conf *conf) | |||
313 | * again, the count is always > 1. This is true for | 397 | * again, the count is always > 1. This is true for |
314 | * STRIPE_ON_UNPLUG_LIST bit too. | 398 | * STRIPE_ON_UNPLUG_LIST bit too. |
315 | */ | 399 | */ |
316 | __release_stripe(conf, sh); | 400 | hash = sh->hash_lock_index; |
401 | __release_stripe(conf, sh, &temp_inactive_list[hash]); | ||
317 | count++; | 402 | count++; |
318 | } | 403 | } |
319 | 404 | ||
@@ -324,9 +409,12 @@ static void release_stripe(struct stripe_head *sh) | |||
324 | { | 409 | { |
325 | struct r5conf *conf = sh->raid_conf; | 410 | struct r5conf *conf = sh->raid_conf; |
326 | unsigned long flags; | 411 | unsigned long flags; |
412 | struct list_head list; | ||
413 | int hash; | ||
327 | bool wakeup; | 414 | bool wakeup; |
328 | 415 | ||
329 | if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) | 416 | if (unlikely(!conf->mddev->thread) || |
417 | test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) | ||
330 | goto slow_path; | 418 | goto slow_path; |
331 | wakeup = llist_add(&sh->release_list, &conf->released_stripes); | 419 | wakeup = llist_add(&sh->release_list, &conf->released_stripes); |
332 | if (wakeup) | 420 | if (wakeup) |
@@ -336,8 +424,11 @@ slow_path: | |||
336 | local_irq_save(flags); | 424 | local_irq_save(flags); |
337 | /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ | 425 | /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ |
338 | if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { | 426 | if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { |
339 | do_release_stripe(conf, sh); | 427 | INIT_LIST_HEAD(&list); |
428 | hash = sh->hash_lock_index; | ||
429 | do_release_stripe(conf, sh, &list); | ||
340 | spin_unlock(&conf->device_lock); | 430 | spin_unlock(&conf->device_lock); |
431 | release_inactive_stripe_list(conf, &list, hash); | ||
341 | } | 432 | } |
342 | local_irq_restore(flags); | 433 | local_irq_restore(flags); |
343 | } | 434 | } |
@@ -362,18 +453,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) | |||
362 | 453 | ||
363 | 454 | ||
364 | /* find an idle stripe, make sure it is unhashed, and return it. */ | 455 | /* find an idle stripe, make sure it is unhashed, and return it. */ |
365 | static struct stripe_head *get_free_stripe(struct r5conf *conf) | 456 | static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) |
366 | { | 457 | { |
367 | struct stripe_head *sh = NULL; | 458 | struct stripe_head *sh = NULL; |
368 | struct list_head *first; | 459 | struct list_head *first; |
369 | 460 | ||
370 | if (list_empty(&conf->inactive_list)) | 461 | if (list_empty(conf->inactive_list + hash)) |
371 | goto out; | 462 | goto out; |
372 | first = conf->inactive_list.next; | 463 | first = (conf->inactive_list + hash)->next; |
373 | sh = list_entry(first, struct stripe_head, lru); | 464 | sh = list_entry(first, struct stripe_head, lru); |
374 | list_del_init(first); | 465 | list_del_init(first); |
375 | remove_hash(sh); | 466 | remove_hash(sh); |
376 | atomic_inc(&conf->active_stripes); | 467 | atomic_inc(&conf->active_stripes); |
468 | BUG_ON(hash != sh->hash_lock_index); | ||
469 | if (list_empty(conf->inactive_list + hash)) | ||
470 | atomic_inc(&conf->empty_inactive_list_nr); | ||
377 | out: | 471 | out: |
378 | return sh; | 472 | return sh; |
379 | } | 473 | } |
@@ -416,7 +510,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, | |||
416 | static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | 510 | static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) |
417 | { | 511 | { |
418 | struct r5conf *conf = sh->raid_conf; | 512 | struct r5conf *conf = sh->raid_conf; |
419 | int i; | 513 | int i, seq; |
420 | 514 | ||
421 | BUG_ON(atomic_read(&sh->count) != 0); | 515 | BUG_ON(atomic_read(&sh->count) != 0); |
422 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); | 516 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); |
@@ -426,7 +520,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | |||
426 | (unsigned long long)sh->sector); | 520 | (unsigned long long)sh->sector); |
427 | 521 | ||
428 | remove_hash(sh); | 522 | remove_hash(sh); |
429 | 523 | retry: | |
524 | seq = read_seqcount_begin(&conf->gen_lock); | ||
430 | sh->generation = conf->generation - previous; | 525 | sh->generation = conf->generation - previous; |
431 | sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; | 526 | sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; |
432 | sh->sector = sector; | 527 | sh->sector = sector; |
@@ -448,6 +543,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | |||
448 | dev->flags = 0; | 543 | dev->flags = 0; |
449 | raid5_build_block(sh, i, previous); | 544 | raid5_build_block(sh, i, previous); |
450 | } | 545 | } |
546 | if (read_seqcount_retry(&conf->gen_lock, seq)) | ||
547 | goto retry; | ||
451 | insert_hash(conf, sh); | 548 | insert_hash(conf, sh); |
452 | sh->cpu = smp_processor_id(); | 549 | sh->cpu = smp_processor_id(); |
453 | } | 550 | } |
@@ -552,29 +649,31 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
552 | int previous, int noblock, int noquiesce) | 649 | int previous, int noblock, int noquiesce) |
553 | { | 650 | { |
554 | struct stripe_head *sh; | 651 | struct stripe_head *sh; |
652 | int hash = stripe_hash_locks_hash(sector); | ||
555 | 653 | ||
556 | pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); | 654 | pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); |
557 | 655 | ||
558 | spin_lock_irq(&conf->device_lock); | 656 | spin_lock_irq(conf->hash_locks + hash); |
559 | 657 | ||
560 | do { | 658 | do { |
561 | wait_event_lock_irq(conf->wait_for_stripe, | 659 | wait_event_lock_irq(conf->wait_for_stripe, |
562 | conf->quiesce == 0 || noquiesce, | 660 | conf->quiesce == 0 || noquiesce, |
563 | conf->device_lock); | 661 | *(conf->hash_locks + hash)); |
564 | sh = __find_stripe(conf, sector, conf->generation - previous); | 662 | sh = __find_stripe(conf, sector, conf->generation - previous); |
565 | if (!sh) { | 663 | if (!sh) { |
566 | if (!conf->inactive_blocked) | 664 | if (!conf->inactive_blocked) |
567 | sh = get_free_stripe(conf); | 665 | sh = get_free_stripe(conf, hash); |
568 | if (noblock && sh == NULL) | 666 | if (noblock && sh == NULL) |
569 | break; | 667 | break; |
570 | if (!sh) { | 668 | if (!sh) { |
571 | conf->inactive_blocked = 1; | 669 | conf->inactive_blocked = 1; |
572 | wait_event_lock_irq(conf->wait_for_stripe, | 670 | wait_event_lock_irq( |
573 | !list_empty(&conf->inactive_list) && | 671 | conf->wait_for_stripe, |
574 | (atomic_read(&conf->active_stripes) | 672 | !list_empty(conf->inactive_list + hash) && |
575 | < (conf->max_nr_stripes *3/4) | 673 | (atomic_read(&conf->active_stripes) |
576 | || !conf->inactive_blocked), | 674 | < (conf->max_nr_stripes * 3 / 4) |
577 | conf->device_lock); | 675 | || !conf->inactive_blocked), |
676 | *(conf->hash_locks + hash)); | ||
578 | conf->inactive_blocked = 0; | 677 | conf->inactive_blocked = 0; |
579 | } else | 678 | } else |
580 | init_stripe(sh, sector, previous); | 679 | init_stripe(sh, sector, previous); |
@@ -585,9 +684,11 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
585 | && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) | 684 | && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) |
586 | && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); | 685 | && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); |
587 | } else { | 686 | } else { |
687 | spin_lock(&conf->device_lock); | ||
588 | if (!test_bit(STRIPE_HANDLE, &sh->state)) | 688 | if (!test_bit(STRIPE_HANDLE, &sh->state)) |
589 | atomic_inc(&conf->active_stripes); | 689 | atomic_inc(&conf->active_stripes); |
590 | if (list_empty(&sh->lru) && | 690 | if (list_empty(&sh->lru) && |
691 | !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) && | ||
591 | !test_bit(STRIPE_EXPANDING, &sh->state)) | 692 | !test_bit(STRIPE_EXPANDING, &sh->state)) |
592 | BUG(); | 693 | BUG(); |
593 | list_del_init(&sh->lru); | 694 | list_del_init(&sh->lru); |
@@ -595,6 +696,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
595 | sh->group->stripes_cnt--; | 696 | sh->group->stripes_cnt--; |
596 | sh->group = NULL; | 697 | sh->group = NULL; |
597 | } | 698 | } |
699 | spin_unlock(&conf->device_lock); | ||
598 | } | 700 | } |
599 | } | 701 | } |
600 | } while (sh == NULL); | 702 | } while (sh == NULL); |
@@ -602,7 +704,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
602 | if (sh) | 704 | if (sh) |
603 | atomic_inc(&sh->count); | 705 | atomic_inc(&sh->count); |
604 | 706 | ||
605 | spin_unlock_irq(&conf->device_lock); | 707 | spin_unlock_irq(conf->hash_locks + hash); |
606 | return sh; | 708 | return sh; |
607 | } | 709 | } |
608 | 710 | ||
@@ -758,7 +860,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
758 | bi->bi_sector = (sh->sector | 860 | bi->bi_sector = (sh->sector |
759 | + rdev->data_offset); | 861 | + rdev->data_offset); |
760 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) | 862 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) |
761 | bi->bi_rw |= REQ_FLUSH; | 863 | bi->bi_rw |= REQ_NOMERGE; |
762 | 864 | ||
763 | bi->bi_vcnt = 1; | 865 | bi->bi_vcnt = 1; |
764 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 866 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
@@ -1582,7 +1684,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
1582 | put_cpu(); | 1684 | put_cpu(); |
1583 | } | 1685 | } |
1584 | 1686 | ||
1585 | static int grow_one_stripe(struct r5conf *conf) | 1687 | static int grow_one_stripe(struct r5conf *conf, int hash) |
1586 | { | 1688 | { |
1587 | struct stripe_head *sh; | 1689 | struct stripe_head *sh; |
1588 | sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); | 1690 | sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); |
@@ -1598,6 +1700,7 @@ static int grow_one_stripe(struct r5conf *conf) | |||
1598 | kmem_cache_free(conf->slab_cache, sh); | 1700 | kmem_cache_free(conf->slab_cache, sh); |
1599 | return 0; | 1701 | return 0; |
1600 | } | 1702 | } |
1703 | sh->hash_lock_index = hash; | ||
1601 | /* we just created an active stripe so... */ | 1704 | /* we just created an active stripe so... */ |
1602 | atomic_set(&sh->count, 1); | 1705 | atomic_set(&sh->count, 1); |
1603 | atomic_inc(&conf->active_stripes); | 1706 | atomic_inc(&conf->active_stripes); |
@@ -1610,6 +1713,7 @@ static int grow_stripes(struct r5conf *conf, int num) | |||
1610 | { | 1713 | { |
1611 | struct kmem_cache *sc; | 1714 | struct kmem_cache *sc; |
1612 | int devs = max(conf->raid_disks, conf->previous_raid_disks); | 1715 | int devs = max(conf->raid_disks, conf->previous_raid_disks); |
1716 | int hash; | ||
1613 | 1717 | ||
1614 | if (conf->mddev->gendisk) | 1718 | if (conf->mddev->gendisk) |
1615 | sprintf(conf->cache_name[0], | 1719 | sprintf(conf->cache_name[0], |
@@ -1627,9 +1731,13 @@ static int grow_stripes(struct r5conf *conf, int num) | |||
1627 | return 1; | 1731 | return 1; |
1628 | conf->slab_cache = sc; | 1732 | conf->slab_cache = sc; |
1629 | conf->pool_size = devs; | 1733 | conf->pool_size = devs; |
1630 | while (num--) | 1734 | hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; |
1631 | if (!grow_one_stripe(conf)) | 1735 | while (num--) { |
1736 | if (!grow_one_stripe(conf, hash)) | ||
1632 | return 1; | 1737 | return 1; |
1738 | conf->max_nr_stripes++; | ||
1739 | hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; | ||
1740 | } | ||
1633 | return 0; | 1741 | return 0; |
1634 | } | 1742 | } |
1635 | 1743 | ||
@@ -1687,6 +1795,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
1687 | int err; | 1795 | int err; |
1688 | struct kmem_cache *sc; | 1796 | struct kmem_cache *sc; |
1689 | int i; | 1797 | int i; |
1798 | int hash, cnt; | ||
1690 | 1799 | ||
1691 | if (newsize <= conf->pool_size) | 1800 | if (newsize <= conf->pool_size) |
1692 | return 0; /* never bother to shrink */ | 1801 | return 0; /* never bother to shrink */ |
@@ -1726,19 +1835,29 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
1726 | * OK, we have enough stripes, start collecting inactive | 1835 | * OK, we have enough stripes, start collecting inactive |
1727 | * stripes and copying them over | 1836 | * stripes and copying them over |
1728 | */ | 1837 | */ |
1838 | hash = 0; | ||
1839 | cnt = 0; | ||
1729 | list_for_each_entry(nsh, &newstripes, lru) { | 1840 | list_for_each_entry(nsh, &newstripes, lru) { |
1730 | spin_lock_irq(&conf->device_lock); | 1841 | lock_device_hash_lock(conf, hash); |
1731 | wait_event_lock_irq(conf->wait_for_stripe, | 1842 | wait_event_cmd(conf->wait_for_stripe, |
1732 | !list_empty(&conf->inactive_list), | 1843 | !list_empty(conf->inactive_list + hash), |
1733 | conf->device_lock); | 1844 | unlock_device_hash_lock(conf, hash), |
1734 | osh = get_free_stripe(conf); | 1845 | lock_device_hash_lock(conf, hash)); |
1735 | spin_unlock_irq(&conf->device_lock); | 1846 | osh = get_free_stripe(conf, hash); |
1847 | unlock_device_hash_lock(conf, hash); | ||
1736 | atomic_set(&nsh->count, 1); | 1848 | atomic_set(&nsh->count, 1); |
1737 | for(i=0; i<conf->pool_size; i++) | 1849 | for(i=0; i<conf->pool_size; i++) |
1738 | nsh->dev[i].page = osh->dev[i].page; | 1850 | nsh->dev[i].page = osh->dev[i].page; |
1739 | for( ; i<newsize; i++) | 1851 | for( ; i<newsize; i++) |
1740 | nsh->dev[i].page = NULL; | 1852 | nsh->dev[i].page = NULL; |
1853 | nsh->hash_lock_index = hash; | ||
1741 | kmem_cache_free(conf->slab_cache, osh); | 1854 | kmem_cache_free(conf->slab_cache, osh); |
1855 | cnt++; | ||
1856 | if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + | ||
1857 | !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { | ||
1858 | hash++; | ||
1859 | cnt = 0; | ||
1860 | } | ||
1742 | } | 1861 | } |
1743 | kmem_cache_destroy(conf->slab_cache); | 1862 | kmem_cache_destroy(conf->slab_cache); |
1744 | 1863 | ||
@@ -1797,13 +1916,13 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
1797 | return err; | 1916 | return err; |
1798 | } | 1917 | } |
1799 | 1918 | ||
1800 | static int drop_one_stripe(struct r5conf *conf) | 1919 | static int drop_one_stripe(struct r5conf *conf, int hash) |
1801 | { | 1920 | { |
1802 | struct stripe_head *sh; | 1921 | struct stripe_head *sh; |
1803 | 1922 | ||
1804 | spin_lock_irq(&conf->device_lock); | 1923 | spin_lock_irq(conf->hash_locks + hash); |
1805 | sh = get_free_stripe(conf); | 1924 | sh = get_free_stripe(conf, hash); |
1806 | spin_unlock_irq(&conf->device_lock); | 1925 | spin_unlock_irq(conf->hash_locks + hash); |
1807 | if (!sh) | 1926 | if (!sh) |
1808 | return 0; | 1927 | return 0; |
1809 | BUG_ON(atomic_read(&sh->count)); | 1928 | BUG_ON(atomic_read(&sh->count)); |
@@ -1815,8 +1934,10 @@ static int drop_one_stripe(struct r5conf *conf) | |||
1815 | 1934 | ||
1816 | static void shrink_stripes(struct r5conf *conf) | 1935 | static void shrink_stripes(struct r5conf *conf) |
1817 | { | 1936 | { |
1818 | while (drop_one_stripe(conf)) | 1937 | int hash; |
1819 | ; | 1938 | for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) |
1939 | while (drop_one_stripe(conf, hash)) | ||
1940 | ; | ||
1820 | 1941 | ||
1821 | if (conf->slab_cache) | 1942 | if (conf->slab_cache) |
1822 | kmem_cache_destroy(conf->slab_cache); | 1943 | kmem_cache_destroy(conf->slab_cache); |
@@ -1921,6 +2042,9 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1921 | mdname(conf->mddev), bdn); | 2042 | mdname(conf->mddev), bdn); |
1922 | else | 2043 | else |
1923 | retry = 1; | 2044 | retry = 1; |
2045 | if (set_bad && test_bit(In_sync, &rdev->flags) | ||
2046 | && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) | ||
2047 | retry = 1; | ||
1924 | if (retry) | 2048 | if (retry) |
1925 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { | 2049 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { |
1926 | set_bit(R5_ReadError, &sh->dev[i].flags); | 2050 | set_bit(R5_ReadError, &sh->dev[i].flags); |
@@ -3900,7 +4024,8 @@ static void raid5_activate_delayed(struct r5conf *conf) | |||
3900 | } | 4024 | } |
3901 | } | 4025 | } |
3902 | 4026 | ||
3903 | static void activate_bit_delay(struct r5conf *conf) | 4027 | static void activate_bit_delay(struct r5conf *conf, |
4028 | struct list_head *temp_inactive_list) | ||
3904 | { | 4029 | { |
3905 | /* device_lock is held */ | 4030 | /* device_lock is held */ |
3906 | struct list_head head; | 4031 | struct list_head head; |
@@ -3908,9 +4033,11 @@ static void activate_bit_delay(struct r5conf *conf) | |||
3908 | list_del_init(&conf->bitmap_list); | 4033 | list_del_init(&conf->bitmap_list); |
3909 | while (!list_empty(&head)) { | 4034 | while (!list_empty(&head)) { |
3910 | struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); | 4035 | struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); |
4036 | int hash; | ||
3911 | list_del_init(&sh->lru); | 4037 | list_del_init(&sh->lru); |
3912 | atomic_inc(&sh->count); | 4038 | atomic_inc(&sh->count); |
3913 | __release_stripe(conf, sh); | 4039 | hash = sh->hash_lock_index; |
4040 | __release_stripe(conf, sh, &temp_inactive_list[hash]); | ||
3914 | } | 4041 | } |
3915 | } | 4042 | } |
3916 | 4043 | ||
@@ -3926,7 +4053,7 @@ int md_raid5_congested(struct mddev *mddev, int bits) | |||
3926 | return 1; | 4053 | return 1; |
3927 | if (conf->quiesce) | 4054 | if (conf->quiesce) |
3928 | return 1; | 4055 | return 1; |
3929 | if (list_empty_careful(&conf->inactive_list)) | 4056 | if (atomic_read(&conf->empty_inactive_list_nr)) |
3930 | return 1; | 4057 | return 1; |
3931 | 4058 | ||
3932 | return 0; | 4059 | return 0; |
@@ -4256,6 +4383,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) | |||
4256 | struct raid5_plug_cb { | 4383 | struct raid5_plug_cb { |
4257 | struct blk_plug_cb cb; | 4384 | struct blk_plug_cb cb; |
4258 | struct list_head list; | 4385 | struct list_head list; |
4386 | struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; | ||
4259 | }; | 4387 | }; |
4260 | 4388 | ||
4261 | static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) | 4389 | static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) |
@@ -4266,6 +4394,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) | |||
4266 | struct mddev *mddev = cb->cb.data; | 4394 | struct mddev *mddev = cb->cb.data; |
4267 | struct r5conf *conf = mddev->private; | 4395 | struct r5conf *conf = mddev->private; |
4268 | int cnt = 0; | 4396 | int cnt = 0; |
4397 | int hash; | ||
4269 | 4398 | ||
4270 | if (cb->list.next && !list_empty(&cb->list)) { | 4399 | if (cb->list.next && !list_empty(&cb->list)) { |
4271 | spin_lock_irq(&conf->device_lock); | 4400 | spin_lock_irq(&conf->device_lock); |
@@ -4283,11 +4412,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) | |||
4283 | * STRIPE_ON_RELEASE_LIST could be set here. In that | 4412 | * STRIPE_ON_RELEASE_LIST could be set here. In that |
4284 | * case, the count is always > 1 here | 4413 | * case, the count is always > 1 here |
4285 | */ | 4414 | */ |
4286 | __release_stripe(conf, sh); | 4415 | hash = sh->hash_lock_index; |
4416 | __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); | ||
4287 | cnt++; | 4417 | cnt++; |
4288 | } | 4418 | } |
4289 | spin_unlock_irq(&conf->device_lock); | 4419 | spin_unlock_irq(&conf->device_lock); |
4290 | } | 4420 | } |
4421 | release_inactive_stripe_list(conf, cb->temp_inactive_list, | ||
4422 | NR_STRIPE_HASH_LOCKS); | ||
4291 | if (mddev->queue) | 4423 | if (mddev->queue) |
4292 | trace_block_unplug(mddev->queue, cnt, !from_schedule); | 4424 | trace_block_unplug(mddev->queue, cnt, !from_schedule); |
4293 | kfree(cb); | 4425 | kfree(cb); |
@@ -4308,8 +4440,12 @@ static void release_stripe_plug(struct mddev *mddev, | |||
4308 | 4440 | ||
4309 | cb = container_of(blk_cb, struct raid5_plug_cb, cb); | 4441 | cb = container_of(blk_cb, struct raid5_plug_cb, cb); |
4310 | 4442 | ||
4311 | if (cb->list.next == NULL) | 4443 | if (cb->list.next == NULL) { |
4444 | int i; | ||
4312 | INIT_LIST_HEAD(&cb->list); | 4445 | INIT_LIST_HEAD(&cb->list); |
4446 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) | ||
4447 | INIT_LIST_HEAD(cb->temp_inactive_list + i); | ||
4448 | } | ||
4313 | 4449 | ||
4314 | if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) | 4450 | if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) |
4315 | list_add_tail(&sh->lru, &cb->list); | 4451 | list_add_tail(&sh->lru, &cb->list); |
@@ -4692,14 +4828,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4692 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { | 4828 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { |
4693 | /* Cannot proceed until we've updated the superblock... */ | 4829 | /* Cannot proceed until we've updated the superblock... */ |
4694 | wait_event(conf->wait_for_overlap, | 4830 | wait_event(conf->wait_for_overlap, |
4695 | atomic_read(&conf->reshape_stripes)==0); | 4831 | atomic_read(&conf->reshape_stripes)==0 |
4832 | || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | ||
4833 | if (atomic_read(&conf->reshape_stripes) != 0) | ||
4834 | return 0; | ||
4696 | mddev->reshape_position = conf->reshape_progress; | 4835 | mddev->reshape_position = conf->reshape_progress; |
4697 | mddev->curr_resync_completed = sector_nr; | 4836 | mddev->curr_resync_completed = sector_nr; |
4698 | conf->reshape_checkpoint = jiffies; | 4837 | conf->reshape_checkpoint = jiffies; |
4699 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4838 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
4700 | md_wakeup_thread(mddev->thread); | 4839 | md_wakeup_thread(mddev->thread); |
4701 | wait_event(mddev->sb_wait, mddev->flags == 0 || | 4840 | wait_event(mddev->sb_wait, mddev->flags == 0 || |
4702 | kthread_should_stop()); | 4841 | test_bit(MD_RECOVERY_INTR, &mddev->recovery)); |
4842 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
4843 | return 0; | ||
4703 | spin_lock_irq(&conf->device_lock); | 4844 | spin_lock_irq(&conf->device_lock); |
4704 | conf->reshape_safe = mddev->reshape_position; | 4845 | conf->reshape_safe = mddev->reshape_position; |
4705 | spin_unlock_irq(&conf->device_lock); | 4846 | spin_unlock_irq(&conf->device_lock); |
@@ -4782,7 +4923,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4782 | >= mddev->resync_max - mddev->curr_resync_completed) { | 4923 | >= mddev->resync_max - mddev->curr_resync_completed) { |
4783 | /* Cannot proceed until we've updated the superblock... */ | 4924 | /* Cannot proceed until we've updated the superblock... */ |
4784 | wait_event(conf->wait_for_overlap, | 4925 | wait_event(conf->wait_for_overlap, |
4785 | atomic_read(&conf->reshape_stripes) == 0); | 4926 | atomic_read(&conf->reshape_stripes) == 0 |
4927 | || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | ||
4928 | if (atomic_read(&conf->reshape_stripes) != 0) | ||
4929 | goto ret; | ||
4786 | mddev->reshape_position = conf->reshape_progress; | 4930 | mddev->reshape_position = conf->reshape_progress; |
4787 | mddev->curr_resync_completed = sector_nr; | 4931 | mddev->curr_resync_completed = sector_nr; |
4788 | conf->reshape_checkpoint = jiffies; | 4932 | conf->reshape_checkpoint = jiffies; |
@@ -4790,13 +4934,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4790 | md_wakeup_thread(mddev->thread); | 4934 | md_wakeup_thread(mddev->thread); |
4791 | wait_event(mddev->sb_wait, | 4935 | wait_event(mddev->sb_wait, |
4792 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) | 4936 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) |
4793 | || kthread_should_stop()); | 4937 | || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); |
4938 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
4939 | goto ret; | ||
4794 | spin_lock_irq(&conf->device_lock); | 4940 | spin_lock_irq(&conf->device_lock); |
4795 | conf->reshape_safe = mddev->reshape_position; | 4941 | conf->reshape_safe = mddev->reshape_position; |
4796 | spin_unlock_irq(&conf->device_lock); | 4942 | spin_unlock_irq(&conf->device_lock); |
4797 | wake_up(&conf->wait_for_overlap); | 4943 | wake_up(&conf->wait_for_overlap); |
4798 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 4944 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
4799 | } | 4945 | } |
4946 | ret: | ||
4800 | return reshape_sectors; | 4947 | return reshape_sectors; |
4801 | } | 4948 | } |
4802 | 4949 | ||
@@ -4954,27 +5101,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
4954 | } | 5101 | } |
4955 | 5102 | ||
4956 | static int handle_active_stripes(struct r5conf *conf, int group, | 5103 | static int handle_active_stripes(struct r5conf *conf, int group, |
4957 | struct r5worker *worker) | 5104 | struct r5worker *worker, |
5105 | struct list_head *temp_inactive_list) | ||
4958 | { | 5106 | { |
4959 | struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; | 5107 | struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; |
4960 | int i, batch_size = 0; | 5108 | int i, batch_size = 0, hash; |
5109 | bool release_inactive = false; | ||
4961 | 5110 | ||
4962 | while (batch_size < MAX_STRIPE_BATCH && | 5111 | while (batch_size < MAX_STRIPE_BATCH && |
4963 | (sh = __get_priority_stripe(conf, group)) != NULL) | 5112 | (sh = __get_priority_stripe(conf, group)) != NULL) |
4964 | batch[batch_size++] = sh; | 5113 | batch[batch_size++] = sh; |
4965 | 5114 | ||
4966 | if (batch_size == 0) | 5115 | if (batch_size == 0) { |
4967 | return batch_size; | 5116 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) |
5117 | if (!list_empty(temp_inactive_list + i)) | ||
5118 | break; | ||
5119 | if (i == NR_STRIPE_HASH_LOCKS) | ||
5120 | return batch_size; | ||
5121 | release_inactive = true; | ||
5122 | } | ||
4968 | spin_unlock_irq(&conf->device_lock); | 5123 | spin_unlock_irq(&conf->device_lock); |
4969 | 5124 | ||
5125 | release_inactive_stripe_list(conf, temp_inactive_list, | ||
5126 | NR_STRIPE_HASH_LOCKS); | ||
5127 | |||
5128 | if (release_inactive) { | ||
5129 | spin_lock_irq(&conf->device_lock); | ||
5130 | return 0; | ||
5131 | } | ||
5132 | |||
4970 | for (i = 0; i < batch_size; i++) | 5133 | for (i = 0; i < batch_size; i++) |
4971 | handle_stripe(batch[i]); | 5134 | handle_stripe(batch[i]); |
4972 | 5135 | ||
4973 | cond_resched(); | 5136 | cond_resched(); |
4974 | 5137 | ||
4975 | spin_lock_irq(&conf->device_lock); | 5138 | spin_lock_irq(&conf->device_lock); |
4976 | for (i = 0; i < batch_size; i++) | 5139 | for (i = 0; i < batch_size; i++) { |
4977 | __release_stripe(conf, batch[i]); | 5140 | hash = batch[i]->hash_lock_index; |
5141 | __release_stripe(conf, batch[i], &temp_inactive_list[hash]); | ||
5142 | } | ||
4978 | return batch_size; | 5143 | return batch_size; |
4979 | } | 5144 | } |
4980 | 5145 | ||
@@ -4995,9 +5160,10 @@ static void raid5_do_work(struct work_struct *work) | |||
4995 | while (1) { | 5160 | while (1) { |
4996 | int batch_size, released; | 5161 | int batch_size, released; |
4997 | 5162 | ||
4998 | released = release_stripe_list(conf); | 5163 | released = release_stripe_list(conf, worker->temp_inactive_list); |
4999 | 5164 | ||
5000 | batch_size = handle_active_stripes(conf, group_id, worker); | 5165 | batch_size = handle_active_stripes(conf, group_id, worker, |
5166 | worker->temp_inactive_list); | ||
5001 | worker->working = false; | 5167 | worker->working = false; |
5002 | if (!batch_size && !released) | 5168 | if (!batch_size && !released) |
5003 | break; | 5169 | break; |
@@ -5036,7 +5202,7 @@ static void raid5d(struct md_thread *thread) | |||
5036 | struct bio *bio; | 5202 | struct bio *bio; |
5037 | int batch_size, released; | 5203 | int batch_size, released; |
5038 | 5204 | ||
5039 | released = release_stripe_list(conf); | 5205 | released = release_stripe_list(conf, conf->temp_inactive_list); |
5040 | 5206 | ||
5041 | if ( | 5207 | if ( |
5042 | !list_empty(&conf->bitmap_list)) { | 5208 | !list_empty(&conf->bitmap_list)) { |
@@ -5046,7 +5212,7 @@ static void raid5d(struct md_thread *thread) | |||
5046 | bitmap_unplug(mddev->bitmap); | 5212 | bitmap_unplug(mddev->bitmap); |
5047 | spin_lock_irq(&conf->device_lock); | 5213 | spin_lock_irq(&conf->device_lock); |
5048 | conf->seq_write = conf->seq_flush; | 5214 | conf->seq_write = conf->seq_flush; |
5049 | activate_bit_delay(conf); | 5215 | activate_bit_delay(conf, conf->temp_inactive_list); |
5050 | } | 5216 | } |
5051 | raid5_activate_delayed(conf); | 5217 | raid5_activate_delayed(conf); |
5052 | 5218 | ||
@@ -5060,7 +5226,8 @@ static void raid5d(struct md_thread *thread) | |||
5060 | handled++; | 5226 | handled++; |
5061 | } | 5227 | } |
5062 | 5228 | ||
5063 | batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); | 5229 | batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, |
5230 | conf->temp_inactive_list); | ||
5064 | if (!batch_size && !released) | 5231 | if (!batch_size && !released) |
5065 | break; | 5232 | break; |
5066 | handled += batch_size; | 5233 | handled += batch_size; |
@@ -5096,22 +5263,29 @@ raid5_set_cache_size(struct mddev *mddev, int size) | |||
5096 | { | 5263 | { |
5097 | struct r5conf *conf = mddev->private; | 5264 | struct r5conf *conf = mddev->private; |
5098 | int err; | 5265 | int err; |
5266 | int hash; | ||
5099 | 5267 | ||
5100 | if (size <= 16 || size > 32768) | 5268 | if (size <= 16 || size > 32768) |
5101 | return -EINVAL; | 5269 | return -EINVAL; |
5270 | hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; | ||
5102 | while (size < conf->max_nr_stripes) { | 5271 | while (size < conf->max_nr_stripes) { |
5103 | if (drop_one_stripe(conf)) | 5272 | if (drop_one_stripe(conf, hash)) |
5104 | conf->max_nr_stripes--; | 5273 | conf->max_nr_stripes--; |
5105 | else | 5274 | else |
5106 | break; | 5275 | break; |
5276 | hash--; | ||
5277 | if (hash < 0) | ||
5278 | hash = NR_STRIPE_HASH_LOCKS - 1; | ||
5107 | } | 5279 | } |
5108 | err = md_allow_write(mddev); | 5280 | err = md_allow_write(mddev); |
5109 | if (err) | 5281 | if (err) |
5110 | return err; | 5282 | return err; |
5283 | hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; | ||
5111 | while (size > conf->max_nr_stripes) { | 5284 | while (size > conf->max_nr_stripes) { |
5112 | if (grow_one_stripe(conf)) | 5285 | if (grow_one_stripe(conf, hash)) |
5113 | conf->max_nr_stripes++; | 5286 | conf->max_nr_stripes++; |
5114 | else break; | 5287 | else break; |
5288 | hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; | ||
5115 | } | 5289 | } |
5116 | return 0; | 5290 | return 0; |
5117 | } | 5291 | } |
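The hash walk above mirrors how grow_one_stripe() hands out hash_lock_index round-robin, so stripe number k sits in bucket k % NR_STRIPE_HASH_LOCKS. For example, with 260 stripes in the cache the most recently added one is in bucket (260 - 1) % 8 = 3, so shrinking removes from buckets 3, 2, 1, 0, 7, ... while growing resumes at 260 % 8 = 4; either direction keeps the per-bucket stripe counts balanced.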
@@ -5199,15 +5373,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page) | |||
5199 | return 0; | 5373 | return 0; |
5200 | } | 5374 | } |
5201 | 5375 | ||
5202 | static int alloc_thread_groups(struct r5conf *conf, int cnt); | 5376 | static int alloc_thread_groups(struct r5conf *conf, int cnt, |
5377 | int *group_cnt, | ||
5378 | int *worker_cnt_per_group, | ||
5379 | struct r5worker_group **worker_groups); | ||
5203 | static ssize_t | 5380 | static ssize_t |
5204 | raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) | 5381 | raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) |
5205 | { | 5382 | { |
5206 | struct r5conf *conf = mddev->private; | 5383 | struct r5conf *conf = mddev->private; |
5207 | unsigned long new; | 5384 | unsigned long new; |
5208 | int err; | 5385 | int err; |
5209 | struct r5worker_group *old_groups; | 5386 | struct r5worker_group *new_groups, *old_groups; |
5210 | int old_group_cnt; | 5387 | int group_cnt, worker_cnt_per_group; |
5211 | 5388 | ||
5212 | if (len >= PAGE_SIZE) | 5389 | if (len >= PAGE_SIZE) |
5213 | return -EINVAL; | 5390 | return -EINVAL; |
@@ -5223,14 +5400,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) | |||
5223 | mddev_suspend(mddev); | 5400 | mddev_suspend(mddev); |
5224 | 5401 | ||
5225 | old_groups = conf->worker_groups; | 5402 | old_groups = conf->worker_groups; |
5226 | old_group_cnt = conf->worker_cnt_per_group; | 5403 | if (old_groups) |
5404 | flush_workqueue(raid5_wq); | ||
5405 | |||
5406 | err = alloc_thread_groups(conf, new, | ||
5407 | &group_cnt, &worker_cnt_per_group, | ||
5408 | &new_groups); | ||
5409 | if (!err) { | ||
5410 | spin_lock_irq(&conf->device_lock); | ||
5411 | conf->group_cnt = group_cnt; | ||
5412 | conf->worker_cnt_per_group = worker_cnt_per_group; | ||
5413 | conf->worker_groups = new_groups; | ||
5414 | spin_unlock_irq(&conf->device_lock); | ||
5227 | 5415 | ||
5228 | conf->worker_groups = NULL; | ||
5229 | err = alloc_thread_groups(conf, new); | ||
5230 | if (err) { | ||
5231 | conf->worker_groups = old_groups; | ||
5232 | conf->worker_cnt_per_group = old_group_cnt; | ||
5233 | } else { | ||
5234 | if (old_groups) | 5416 | if (old_groups) |
5235 | kfree(old_groups[0].workers); | 5417 | kfree(old_groups[0].workers); |
5236 | kfree(old_groups); | 5418 | kfree(old_groups); |
@@ -5260,40 +5442,47 @@ static struct attribute_group raid5_attrs_group = { | |||
5260 | .attrs = raid5_attrs, | 5442 | .attrs = raid5_attrs, |
5261 | }; | 5443 | }; |
5262 | 5444 | ||
5263 | static int alloc_thread_groups(struct r5conf *conf, int cnt) | 5445 | static int alloc_thread_groups(struct r5conf *conf, int cnt, |
5446 | int *group_cnt, | ||
5447 | int *worker_cnt_per_group, | ||
5448 | struct r5worker_group **worker_groups) | ||
5264 | { | 5449 | { |
5265 | int i, j; | 5450 | int i, j, k; |
5266 | ssize_t size; | 5451 | ssize_t size; |
5267 | struct r5worker *workers; | 5452 | struct r5worker *workers; |
5268 | 5453 | ||
5269 | conf->worker_cnt_per_group = cnt; | 5454 | *worker_cnt_per_group = cnt; |
5270 | if (cnt == 0) { | 5455 | if (cnt == 0) { |
5271 | conf->worker_groups = NULL; | 5456 | *group_cnt = 0; |
5457 | *worker_groups = NULL; | ||
5272 | return 0; | 5458 | return 0; |
5273 | } | 5459 | } |
5274 | conf->group_cnt = num_possible_nodes(); | 5460 | *group_cnt = num_possible_nodes(); |
5275 | size = sizeof(struct r5worker) * cnt; | 5461 | size = sizeof(struct r5worker) * cnt; |
5276 | workers = kzalloc(size * conf->group_cnt, GFP_NOIO); | 5462 | workers = kzalloc(size * *group_cnt, GFP_NOIO); |
5277 | conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * | 5463 | *worker_groups = kzalloc(sizeof(struct r5worker_group) * |
5278 | conf->group_cnt, GFP_NOIO); | 5464 | *group_cnt, GFP_NOIO); |
5279 | if (!conf->worker_groups || !workers) { | 5465 | if (!*worker_groups || !workers) { |
5280 | kfree(workers); | 5466 | kfree(workers); |
5281 | kfree(conf->worker_groups); | 5467 | kfree(*worker_groups); |
5282 | conf->worker_groups = NULL; | ||
5283 | return -ENOMEM; | 5468 | return -ENOMEM; |
5284 | } | 5469 | } |
5285 | 5470 | ||
5286 | for (i = 0; i < conf->group_cnt; i++) { | 5471 | for (i = 0; i < *group_cnt; i++) { |
5287 | struct r5worker_group *group; | 5472 | struct r5worker_group *group; |
5288 | 5473 | ||
5289 | group = &conf->worker_groups[i]; | 5474 | group = &(*worker_groups)[i]; |
5290 | INIT_LIST_HEAD(&group->handle_list); | 5475 | INIT_LIST_HEAD(&group->handle_list); |
5291 | group->conf = conf; | 5476 | group->conf = conf; |
5292 | group->workers = workers + i * cnt; | 5477 | group->workers = workers + i * cnt; |
5293 | 5478 | ||
5294 | for (j = 0; j < cnt; j++) { | 5479 | for (j = 0; j < cnt; j++) { |
5295 | group->workers[j].group = group; | 5480 | struct r5worker *worker = group->workers + j; |
5296 | INIT_WORK(&group->workers[j].work, raid5_do_work); | 5481 | worker->group = group; |
5482 | INIT_WORK(&worker->work, raid5_do_work); | ||
5483 | |||
5484 | for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) | ||
5485 | INIT_LIST_HEAD(worker->temp_inactive_list + k); | ||
5297 | } | 5486 | } |
5298 | } | 5487 | } |
5299 | 5488 | ||
@@ -5444,6 +5633,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5444 | struct md_rdev *rdev; | 5633 | struct md_rdev *rdev; |
5445 | struct disk_info *disk; | 5634 | struct disk_info *disk; |
5446 | char pers_name[6]; | 5635 | char pers_name[6]; |
5636 | int i; | ||
5637 | int group_cnt, worker_cnt_per_group; | ||
5638 | struct r5worker_group *new_group; | ||
5447 | 5639 | ||
5448 | if (mddev->new_level != 5 | 5640 | if (mddev->new_level != 5 |
5449 | && mddev->new_level != 4 | 5641 | && mddev->new_level != 4 |
@@ -5478,7 +5670,12 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5478 | if (conf == NULL) | 5670 | if (conf == NULL) |
5479 | goto abort; | 5671 | goto abort; |
5480 | /* Don't enable multi-threading by default*/ | 5672 | /* Don't enable multi-threading by default*/ |
5481 | if (alloc_thread_groups(conf, 0)) | 5673 | if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, |
5674 | &new_group)) { | ||
5675 | conf->group_cnt = group_cnt; | ||
5676 | conf->worker_cnt_per_group = worker_cnt_per_group; | ||
5677 | conf->worker_groups = new_group; | ||
5678 | } else | ||
5482 | goto abort; | 5679 | goto abort; |
5483 | spin_lock_init(&conf->device_lock); | 5680 | spin_lock_init(&conf->device_lock); |
5484 | seqcount_init(&conf->gen_lock); | 5681 | seqcount_init(&conf->gen_lock); |
@@ -5488,7 +5685,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5488 | INIT_LIST_HEAD(&conf->hold_list); | 5685 | INIT_LIST_HEAD(&conf->hold_list); |
5489 | INIT_LIST_HEAD(&conf->delayed_list); | 5686 | INIT_LIST_HEAD(&conf->delayed_list); |
5490 | INIT_LIST_HEAD(&conf->bitmap_list); | 5687 | INIT_LIST_HEAD(&conf->bitmap_list); |
5491 | INIT_LIST_HEAD(&conf->inactive_list); | ||
5492 | init_llist_head(&conf->released_stripes); | 5688 | init_llist_head(&conf->released_stripes); |
5493 | atomic_set(&conf->active_stripes, 0); | 5689 | atomic_set(&conf->active_stripes, 0); |
5494 | atomic_set(&conf->preread_active_stripes, 0); | 5690 | atomic_set(&conf->preread_active_stripes, 0); |
@@ -5514,6 +5710,21 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5514 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) | 5710 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) |
5515 | goto abort; | 5711 | goto abort; |
5516 | 5712 | ||
5713 | /* We init hash_locks[0] separately so that it can be used | ||
5714 | * as the reference lock in the spin_lock_nest_lock() call | ||
5715 | * in lock_all_device_hash_locks_irq in order to convince | ||
5716 | * lockdep that we know what we are doing. | ||
5717 | */ | ||
5718 | spin_lock_init(conf->hash_locks); | ||
5719 | for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) | ||
5720 | spin_lock_init(conf->hash_locks + i); | ||
5721 | |||
5722 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) | ||
5723 | INIT_LIST_HEAD(conf->inactive_list + i); | ||
5724 | |||
5725 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) | ||
5726 | INIT_LIST_HEAD(conf->temp_inactive_list + i); | ||
5727 | |||
5517 | conf->level = mddev->new_level; | 5728 | conf->level = mddev->new_level; |
5518 | if (raid5_alloc_percpu(conf) != 0) | 5729 | if (raid5_alloc_percpu(conf) != 0) |
5519 | goto abort; | 5730 | goto abort; |
@@ -5554,7 +5765,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5554 | else | 5765 | else |
5555 | conf->max_degraded = 1; | 5766 | conf->max_degraded = 1; |
5556 | conf->algorithm = mddev->new_layout; | 5767 | conf->algorithm = mddev->new_layout; |
5557 | conf->max_nr_stripes = NR_STRIPES; | ||
5558 | conf->reshape_progress = mddev->reshape_position; | 5768 | conf->reshape_progress = mddev->reshape_position; |
5559 | if (conf->reshape_progress != MaxSector) { | 5769 | if (conf->reshape_progress != MaxSector) { |
5560 | conf->prev_chunk_sectors = mddev->chunk_sectors; | 5770 | conf->prev_chunk_sectors = mddev->chunk_sectors; |
@@ -5563,7 +5773,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5563 | 5773 | ||
5564 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + | 5774 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + |
5565 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; | 5775 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; |
5566 | if (grow_stripes(conf, conf->max_nr_stripes)) { | 5776 | atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); |
5777 | if (grow_stripes(conf, NR_STRIPES)) { | ||
5567 | printk(KERN_ERR | 5778 | printk(KERN_ERR |
5568 | "md/raid:%s: couldn't allocate %dkB for buffers\n", | 5779 | "md/raid:%s: couldn't allocate %dkB for buffers\n", |
5569 | mdname(mddev), memory); | 5780 | mdname(mddev), memory); |
@@ -6369,12 +6580,18 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
6369 | if (!mddev->sync_thread) { | 6580 | if (!mddev->sync_thread) { |
6370 | mddev->recovery = 0; | 6581 | mddev->recovery = 0; |
6371 | spin_lock_irq(&conf->device_lock); | 6582 | spin_lock_irq(&conf->device_lock); |
6583 | write_seqcount_begin(&conf->gen_lock); | ||
6372 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; | 6584 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; |
6585 | mddev->new_chunk_sectors = | ||
6586 | conf->chunk_sectors = conf->prev_chunk_sectors; | ||
6587 | mddev->new_layout = conf->algorithm = conf->prev_algo; | ||
6373 | rdev_for_each(rdev, mddev) | 6588 | rdev_for_each(rdev, mddev) |
6374 | rdev->new_data_offset = rdev->data_offset; | 6589 | rdev->new_data_offset = rdev->data_offset; |
6375 | smp_wmb(); | 6590 | smp_wmb(); |
6591 | conf->generation --; | ||
6376 | conf->reshape_progress = MaxSector; | 6592 | conf->reshape_progress = MaxSector; |
6377 | mddev->reshape_position = MaxSector; | 6593 | mddev->reshape_position = MaxSector; |
6594 | write_seqcount_end(&conf->gen_lock); | ||
6378 | spin_unlock_irq(&conf->device_lock); | 6595 | spin_unlock_irq(&conf->device_lock); |
6379 | return -EAGAIN; | 6596 | return -EAGAIN; |
6380 | } | 6597 | } |
@@ -6462,27 +6679,28 @@ static void raid5_quiesce(struct mddev *mddev, int state) | |||
6462 | break; | 6679 | break; |
6463 | 6680 | ||
6464 | case 1: /* stop all writes */ | 6681 | case 1: /* stop all writes */ |
6465 | spin_lock_irq(&conf->device_lock); | 6682 | lock_all_device_hash_locks_irq(conf); |
6466 | /* '2' tells resync/reshape to pause so that all | 6683 | /* '2' tells resync/reshape to pause so that all |
6467 | * active stripes can drain | 6684 | * active stripes can drain |
6468 | */ | 6685 | */ |
6469 | conf->quiesce = 2; | 6686 | conf->quiesce = 2; |
6470 | wait_event_lock_irq(conf->wait_for_stripe, | 6687 | wait_event_cmd(conf->wait_for_stripe, |
6471 | atomic_read(&conf->active_stripes) == 0 && | 6688 | atomic_read(&conf->active_stripes) == 0 && |
6472 | atomic_read(&conf->active_aligned_reads) == 0, | 6689 | atomic_read(&conf->active_aligned_reads) == 0, |
6473 | conf->device_lock); | 6690 | unlock_all_device_hash_locks_irq(conf), |
6691 | lock_all_device_hash_locks_irq(conf)); | ||
6474 | conf->quiesce = 1; | 6692 | conf->quiesce = 1; |
6475 | spin_unlock_irq(&conf->device_lock); | 6693 | unlock_all_device_hash_locks_irq(conf); |
6476 | /* allow reshape to continue */ | 6694 | /* allow reshape to continue */ |
6477 | wake_up(&conf->wait_for_overlap); | 6695 | wake_up(&conf->wait_for_overlap); |
6478 | break; | 6696 | break; |
6479 | 6697 | ||
6480 | case 0: /* re-enable writes */ | 6698 | case 0: /* re-enable writes */ |
6481 | spin_lock_irq(&conf->device_lock); | 6699 | lock_all_device_hash_locks_irq(conf); |
6482 | conf->quiesce = 0; | 6700 | conf->quiesce = 0; |
6483 | wake_up(&conf->wait_for_stripe); | 6701 | wake_up(&conf->wait_for_stripe); |
6484 | wake_up(&conf->wait_for_overlap); | 6702 | wake_up(&conf->wait_for_overlap); |
6485 | spin_unlock_irq(&conf->device_lock); | 6703 | unlock_all_device_hash_locks_irq(conf); |
6486 | break; | 6704 | break; |
6487 | } | 6705 | } |
6488 | } | 6706 | } |
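The raid5.c hunks above lean on a handful of helpers — lock_device_hash_lock(), unlock_device_hash_lock(), lock_all_device_hash_locks_irq() and unlock_all_device_hash_locks_irq() — whose bodies sit earlier in the file and are not part of the hunks shown here. Going only by the hash_locks[] array and the lockdep comment added in setup_conf(), they plausibly look like the sketch below; treat it as an illustration of the nesting idea, not as the patch's code.

    #include <linux/spinlock.h>
    /* struct r5conf and NR_STRIPE_HASH_LOCKS come from drivers/md/raid5.h */

    static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
    {
            spin_lock_irq(conf->hash_locks + hash);
    }

    static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
    {
            spin_unlock_irq(conf->hash_locks + hash);
    }

    static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
    {
            int i;

            local_irq_disable();
            /* hash_locks[0] is the reference lock named in the setup_conf() comment */
            spin_lock(conf->hash_locks);
            for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
                    spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
    }

    static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
    {
            int i;

            for (i = NR_STRIPE_HASH_LOCKS - 1; i > 0; i--)
                    spin_unlock(conf->hash_locks + i);
            spin_unlock(conf->hash_locks);
            local_irq_enable();
    }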
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index b42e6b462eda..01ad8ae8f578 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -205,6 +205,7 @@ struct stripe_head { | |||
205 | short pd_idx; /* parity disk index */ | 205 | short pd_idx; /* parity disk index */ |
206 | short qd_idx; /* 'Q' disk index for raid6 */ | 206 | short qd_idx; /* 'Q' disk index for raid6 */ |
207 | short ddf_layout;/* use DDF ordering to calculate Q */ | 207 | short ddf_layout;/* use DDF ordering to calculate Q */ |
208 | short hash_lock_index; | ||
208 | unsigned long state; /* state flags */ | 209 | unsigned long state; /* state flags */ |
209 | atomic_t count; /* nr of active thread/requests */ | 210 | atomic_t count; /* nr of active thread/requests */ |
210 | int bm_seq; /* sequence number for bitmap flushes */ | 211 | int bm_seq; /* sequence number for bitmap flushes */ |
@@ -367,9 +368,18 @@ struct disk_info { | |||
367 | struct md_rdev *rdev, *replacement; | 368 | struct md_rdev *rdev, *replacement; |
368 | }; | 369 | }; |
369 | 370 | ||
371 | /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. | ||
372 | * This is because we sometimes take all the spinlocks | ||
373 | * and creating that much locking depth can cause | ||
374 | * problems. | ||
375 | */ | ||
376 | #define NR_STRIPE_HASH_LOCKS 8 | ||
377 | #define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1) | ||
378 | |||
370 | struct r5worker { | 379 | struct r5worker { |
371 | struct work_struct work; | 380 | struct work_struct work; |
372 | struct r5worker_group *group; | 381 | struct r5worker_group *group; |
382 | struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; | ||
373 | bool working; | 383 | bool working; |
374 | }; | 384 | }; |
375 | 385 | ||
@@ -382,6 +392,8 @@ struct r5worker_group { | |||
382 | 392 | ||
383 | struct r5conf { | 393 | struct r5conf { |
384 | struct hlist_head *stripe_hashtbl; | 394 | struct hlist_head *stripe_hashtbl; |
395 | /* only protect corresponding hash list and inactive_list */ | ||
396 | spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS]; | ||
385 | struct mddev *mddev; | 397 | struct mddev *mddev; |
386 | int chunk_sectors; | 398 | int chunk_sectors; |
387 | int level, algorithm; | 399 | int level, algorithm; |
@@ -462,7 +474,8 @@ struct r5conf { | |||
462 | * Free stripes pool | 474 | * Free stripes pool |
463 | */ | 475 | */ |
464 | atomic_t active_stripes; | 476 | atomic_t active_stripes; |
465 | struct list_head inactive_list; | 477 | struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; |
478 | atomic_t empty_inactive_list_nr; | ||
466 | struct llist_head released_stripes; | 479 | struct llist_head released_stripes; |
467 | wait_queue_head_t wait_for_stripe; | 480 | wait_queue_head_t wait_for_stripe; |
468 | wait_queue_head_t wait_for_overlap; | 481 | wait_queue_head_t wait_for_overlap; |
@@ -477,6 +490,7 @@ struct r5conf { | |||
477 | * the new thread here until we fully activate the array. | 490 | * the new thread here until we fully activate the array. |
478 | */ | 491 | */ |
479 | struct md_thread *thread; | 492 | struct md_thread *thread; |
493 | struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; | ||
480 | struct r5worker_group *worker_groups; | 494 | struct r5worker_group *worker_groups; |
481 | int group_cnt; | 495 | int group_cnt; |
482 | int worker_cnt_per_group; | 496 | int worker_cnt_per_group; |
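With the single inactive_list now split into NR_STRIPE_HASH_LOCKS buckets, every release path in the raid5.c diff first parks stripes on a temp_inactive_list and later calls release_inactive_stripe_list() to put them back. That function's body is outside the hunks shown here; the sketch below (deliberately named with a _sketch suffix) only illustrates the bookkeeping implied by the new fields: splice each temporary list onto the matching inactive_list under that bucket's hash lock, keep empty_inactive_list_nr in step, then wake wait_for_stripe.

    /* Illustrative only; not the implementation added by this patch. */
    static void release_inactive_stripe_list_sketch(struct r5conf *conf,
                                            struct list_head *temp_inactive_list,
                                            int nr_hashes)
    {
            int hash;
            bool do_wakeup = false;

            for (hash = 0; hash < nr_hashes; hash++) {
                    struct list_head *list = temp_inactive_list + hash;

                    if (list_empty_careful(list))
                            continue;
                    spin_lock_irq(conf->hash_locks + hash);
                    /* this bucket is about to become non-empty */
                    if (list_empty(conf->inactive_list + hash))
                            atomic_dec(&conf->empty_inactive_list_nr);
                    list_splice_tail_init(list, conf->inactive_list + hash);
                    spin_unlock_irq(conf->hash_locks + hash);
                    do_wakeup = true;
            }
            if (do_wakeup)
                    wake_up(&conf->wait_for_stripe);
    }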
diff --git a/include/linux/wait.h b/include/linux/wait.h index 61939ba30aa0..eaa00b10abaa 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h | |||
@@ -278,6 +278,31 @@ do { \ | |||
278 | __ret; \ | 278 | __ret; \ |
279 | }) | 279 | }) |
280 | 280 | ||
281 | #define __wait_event_cmd(wq, condition, cmd1, cmd2) \ | ||
282 | (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ | ||
283 | cmd1; schedule(); cmd2) | ||
284 | |||
285 | /** | ||
286 | * wait_event_cmd - sleep until a condition gets true | ||
287 | * @wq: the waitqueue to wait on | ||
288 | * @condition: a C expression for the event to wait for | ||
289 | * @cmd1: the command will be executed before sleep | ||
290 | * @cmd2: the command will be executed after sleep | ||
291 | * | ||
292 | * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the | ||
293 | * @condition evaluates to true. The @condition is checked each time | ||
294 | * the waitqueue @wq is woken up. | ||
295 | * | ||
296 | * wake_up() has to be called after changing any variable that could | ||
297 | * change the result of the wait condition. | ||
298 | */ | ||
299 | #define wait_event_cmd(wq, condition, cmd1, cmd2) \ | ||
300 | do { \ | ||
301 | if (condition) \ | ||
302 | break; \ | ||
303 | __wait_event_cmd(wq, condition, cmd1, cmd2); \ | ||
304 | } while (0) | ||
305 | |||
281 | #define __wait_event_interruptible(wq, condition) \ | 306 | #define __wait_event_interruptible(wq, condition) \ |
282 | ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ | 307 | ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ |
283 | schedule()) | 308 | schedule()) |
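wait_event_cmd() exists so raid5 can drop a spinlock just before it sleeps and retake it once woken, as resize_stripes() and raid5_quiesce() do above with the hash locks. A minimal usage sketch, where my_lock, my_wq and my_flag are made-up names rather than anything from the patch:

    #include <linux/spinlock.h>
    #include <linux/wait.h>

    static DEFINE_SPINLOCK(my_lock);
    static DECLARE_WAIT_QUEUE_HEAD(my_wq);
    static bool my_flag;           /* protected by my_lock */

    static void waiter(void)
    {
            spin_lock_irq(&my_lock);
            wait_event_cmd(my_wq, my_flag,
                           spin_unlock_irq(&my_lock),   /* cmd1: runs before schedule() */
                           spin_lock_irq(&my_lock));    /* cmd2: runs after waking up   */
            /* my_lock is held again here, and my_flag was seen true under it */
            spin_unlock_irq(&my_lock);
    }

    static void waker(void)
    {
            spin_lock_irq(&my_lock);
            my_flag = true;
            spin_unlock_irq(&my_lock);
            wake_up(&my_wq);
    }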
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index fe1a5406d4d9..f7cf7f351144 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h | |||
@@ -16,6 +16,7 @@ | |||
16 | #define _MD_P_H | 16 | #define _MD_P_H |
17 | 17 | ||
18 | #include <linux/types.h> | 18 | #include <linux/types.h> |
19 | #include <asm/byteorder.h> | ||
19 | 20 | ||
20 | /* | 21 | /* |
21 | * RAID superblock. | 22 | * RAID superblock. |
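One practical effect of the md_p.h change for userspace: the superblock structures in this header are declared with __le32/__le64 fields, and programs normally convert them with the __le*_to_cpu() helpers that <asm/byteorder.h> supplies, so having the header pull it in directly is a convenience for consumers. A small illustrative sketch (the level field is from the existing mdp_superblock_1 definition, not something this patch adds):

    #include <stdio.h>
    #include <linux/raid/md_p.h>    /* now includes <asm/byteorder.h> itself */

    static void print_array_level(const struct mdp_superblock_1 *sb)
    {
            /* v1.x superblock fields are stored little-endian on disk */
            printf("raid level: %d\n", (int)__le32_to_cpu(sb->level));
    }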