author     Linus Torvalds <torvalds@linux-foundation.org>   2013-11-20 16:05:25 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-11-20 16:05:25 -0500
commit     6d6e352c80f22c446d933ca8103e02bac1f09129 (patch)
tree       248a6a7ebc5ea95986da5bccdd6d75b255cf28e4
parent     b4789b8e6be3151a955ade74872822f30e8cd914 (diff)
parent     60aaf933854511630e16be4efe0f96485e132de4 (diff)
Merge tag 'md/3.13' of git://neil.brown.name/md
Pull md update from Neil Brown:
 "Mostly optimisations and obscure bug fixes.
   - raid5 gets less lock contention
   - raid1 gets less contention between normal-io and resync-io during resync"

* tag 'md/3.13' of git://neil.brown.name/md:
  md/raid5: Use conf->device_lock protect changing of multi-thread resources.
  md/raid5: Before freeing old multi-thread worker, it should flush them.
  md/raid5: For stripe with R5_ReadNoMerge, we replace REQ_FLUSH with REQ_NOMERGE.
  UAPI: include <asm/byteorder.h> in linux/raid/md_p.h
  raid1: Rewrite the implementation of iobarrier.
  raid1: Add some macros to make code clearly.
  raid1: Replace raise_barrier/lower_barrier with freeze_array/unfreeze_array when reconfiguring the array.
  raid1: Add a field array_frozen to indicate whether raid in freeze state.
  md: Convert use of typedef ctl_table to struct ctl_table
  md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes.
  md: use MD_RECOVERY_INTR instead of kthread_should_stop in resync thread.
  md: fix some places where mddev_lock return value is not checked.
  raid5: Retry R5_ReadNoMerge flag when hit a read error.
  raid5: relieve lock contention in get_active_stripe()
  raid5: relieve lock contention in get_active_stripe()
  wait: add wait_event_cmd()
  md/raid5.c: add proper locking to error path of raid5_start_reshape.
  md: fix calculation of stacking limits on level change.
  raid5: Use slow_path to release stripe when mddev->thread is null
-rw-r--r--  drivers/md/md.c                   133
-rw-r--r--  drivers/md/raid1.c                162
-rw-r--r--  drivers/md/raid1.h                 15
-rw-r--r--  drivers/md/raid10.c                 6
-rw-r--r--  drivers/md/raid5.c                420
-rw-r--r--  drivers/md/raid5.h                 16
-rw-r--r--  include/linux/wait.h               25
-rw-r--r--  include/uapi/linux/raid/md_p.h      1
8 files changed, 592 insertions(+), 186 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8766eabb0014..b6b7a2866c9e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev)
112 112
113static struct ctl_table_header *raid_table_header; 113static struct ctl_table_header *raid_table_header;
114 114
115static ctl_table raid_table[] = { 115static struct ctl_table raid_table[] = {
116 { 116 {
117 .procname = "speed_limit_min", 117 .procname = "speed_limit_min",
118 .data = &sysctl_speed_limit_min, 118 .data = &sysctl_speed_limit_min,
@@ -130,7 +130,7 @@ static ctl_table raid_table[] = {
130 { } 130 { }
131}; 131};
132 132
133static ctl_table raid_dir_table[] = { 133static struct ctl_table raid_dir_table[] = {
134 { 134 {
135 .procname = "raid", 135 .procname = "raid",
136 .maxlen = 0, 136 .maxlen = 0,
@@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = {
140 { } 140 { }
141}; 141};
142 142
143static ctl_table raid_root_table[] = { 143static struct ctl_table raid_root_table[] = {
144 { 144 {
145 .procname = "dev", 145 .procname = "dev",
146 .maxlen = 0, 146 .maxlen = 0,
@@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit)
562 goto retry; 562 goto retry;
563} 563}
564 564
565static inline int mddev_lock(struct mddev * mddev) 565static inline int __must_check mddev_lock(struct mddev * mddev)
566{ 566{
567 return mutex_lock_interruptible(&mddev->reconfig_mutex); 567 return mutex_lock_interruptible(&mddev->reconfig_mutex);
568} 568}
569 569
570/* Sometimes we need to take the lock in a situation where
571 * failure due to interrupts is not acceptable.
572 */
573static inline void mddev_lock_nointr(struct mddev * mddev)
574{
575 mutex_lock(&mddev->reconfig_mutex);
576}
577
570static inline int mddev_is_locked(struct mddev *mddev) 578static inline int mddev_is_locked(struct mddev *mddev)
571{ 579{
572 return mutex_is_locked(&mddev->reconfig_mutex); 580 return mutex_is_locked(&mddev->reconfig_mutex);
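The hunk above splits the locking into an interruptible variant (mddev_lock(), now marked __must_check) and mddev_lock_nointr() for paths that cannot tolerate -EINTR. A minimal sketch of the calling convention for the interruptible variant, with a hypothetical sysfs store function as the caller:

/* Sketch only: md_attr_store_example() is a hypothetical caller. */
static ssize_t md_attr_store_example(struct mddev *mddev,
                                     const char *buf, size_t len)
{
        int err = mddev_lock(mddev);    /* may return -EINTR */

        if (err)
                return err;             /* interrupted by a signal: report it */
        /* ... reconfigure under mddev->reconfig_mutex ... */
        mddev_unlock(mddev);
        return len;
}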
@@ -2978,7 +2986,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2978 for_each_mddev(mddev, tmp) { 2986 for_each_mddev(mddev, tmp) {
2979 struct md_rdev *rdev2; 2987 struct md_rdev *rdev2;
2980 2988
2981 mddev_lock(mddev); 2989 mddev_lock_nointr(mddev);
2982 rdev_for_each(rdev2, mddev) 2990 rdev_for_each(rdev2, mddev)
2983 if (rdev->bdev == rdev2->bdev && 2991 if (rdev->bdev == rdev2->bdev &&
2984 rdev != rdev2 && 2992 rdev != rdev2 &&
@@ -2994,7 +3002,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2994 break; 3002 break;
2995 } 3003 }
2996 } 3004 }
2997 mddev_lock(my_mddev); 3005 mddev_lock_nointr(my_mddev);
2998 if (overlap) { 3006 if (overlap) {
2999 /* Someone else could have slipped in a size 3007 /* Someone else could have slipped in a size
3000 * change here, but doing so is just silly. 3008 * change here, but doing so is just silly.
@@ -3580,6 +3588,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3580 mddev->in_sync = 1; 3588 mddev->in_sync = 1;
3581 del_timer_sync(&mddev->safemode_timer); 3589 del_timer_sync(&mddev->safemode_timer);
3582 } 3590 }
3591 blk_set_stacking_limits(&mddev->queue->limits);
3583 pers->run(mddev); 3592 pers->run(mddev);
3584 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3593 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3585 mddev_resume(mddev); 3594 mddev_resume(mddev);
@@ -5258,7 +5267,7 @@ static void __md_stop_writes(struct mddev *mddev)
5258 5267
5259void md_stop_writes(struct mddev *mddev) 5268void md_stop_writes(struct mddev *mddev)
5260{ 5269{
5261 mddev_lock(mddev); 5270 mddev_lock_nointr(mddev);
5262 __md_stop_writes(mddev); 5271 __md_stop_writes(mddev);
5263 mddev_unlock(mddev); 5272 mddev_unlock(mddev);
5264} 5273}
@@ -5291,20 +5300,35 @@ EXPORT_SYMBOL_GPL(md_stop);
5291static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 5300static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5292{ 5301{
5293 int err = 0; 5302 int err = 0;
5303 int did_freeze = 0;
5304
5305 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5306 did_freeze = 1;
5307 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5308 md_wakeup_thread(mddev->thread);
5309 }
5310 if (mddev->sync_thread) {
5311 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5312 /* Thread might be blocked waiting for metadata update
5313 * which will now never happen */
5314 wake_up_process(mddev->sync_thread->tsk);
5315 }
5316 mddev_unlock(mddev);
5317 wait_event(resync_wait, mddev->sync_thread == NULL);
5318 mddev_lock_nointr(mddev);
5319
5294 mutex_lock(&mddev->open_mutex); 5320 mutex_lock(&mddev->open_mutex);
5295 if (atomic_read(&mddev->openers) > !!bdev) { 5321 if (atomic_read(&mddev->openers) > !!bdev ||
5322 mddev->sync_thread ||
5323 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
5296 printk("md: %s still in use.\n",mdname(mddev)); 5324 printk("md: %s still in use.\n",mdname(mddev));
5325 if (did_freeze) {
5326 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5327 md_wakeup_thread(mddev->thread);
5328 }
5297 err = -EBUSY; 5329 err = -EBUSY;
5298 goto out; 5330 goto out;
5299 } 5331 }
5300 if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
5301 /* Someone opened the device since we flushed it
5302 * so page cache could be dirty and it is too late
5303 * to flush. So abort
5304 */
5305 mutex_unlock(&mddev->open_mutex);
5306 return -EBUSY;
5307 }
5308 if (mddev->pers) { 5332 if (mddev->pers) {
5309 __md_stop_writes(mddev); 5333 __md_stop_writes(mddev);
5310 5334
@@ -5315,7 +5339,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5315 set_disk_ro(mddev->gendisk, 1); 5339 set_disk_ro(mddev->gendisk, 1);
5316 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5340 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5317 sysfs_notify_dirent_safe(mddev->sysfs_state); 5341 sysfs_notify_dirent_safe(mddev->sysfs_state);
5318 err = 0; 5342 err = 0;
5319 } 5343 }
5320out: 5344out:
5321 mutex_unlock(&mddev->open_mutex); 5345 mutex_unlock(&mddev->open_mutex);
@@ -5331,20 +5355,34 @@ static int do_md_stop(struct mddev * mddev, int mode,
5331{ 5355{
5332 struct gendisk *disk = mddev->gendisk; 5356 struct gendisk *disk = mddev->gendisk;
5333 struct md_rdev *rdev; 5357 struct md_rdev *rdev;
5358 int did_freeze = 0;
5359
5360 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5361 did_freeze = 1;
5362 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5363 md_wakeup_thread(mddev->thread);
5364 }
5365 if (mddev->sync_thread) {
5366 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5367 /* Thread might be blocked waiting for metadata update
5368 * which will now never happen */
5369 wake_up_process(mddev->sync_thread->tsk);
5370 }
5371 mddev_unlock(mddev);
5372 wait_event(resync_wait, mddev->sync_thread == NULL);
5373 mddev_lock_nointr(mddev);
5334 5374
5335 mutex_lock(&mddev->open_mutex); 5375 mutex_lock(&mddev->open_mutex);
5336 if (atomic_read(&mddev->openers) > !!bdev || 5376 if (atomic_read(&mddev->openers) > !!bdev ||
5337 mddev->sysfs_active) { 5377 mddev->sysfs_active ||
5378 mddev->sync_thread ||
5379 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
5338 printk("md: %s still in use.\n",mdname(mddev)); 5380 printk("md: %s still in use.\n",mdname(mddev));
5339 mutex_unlock(&mddev->open_mutex); 5381 mutex_unlock(&mddev->open_mutex);
5340 return -EBUSY; 5382 if (did_freeze) {
5341 } 5383 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5342 if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { 5384 md_wakeup_thread(mddev->thread);
5343 /* Someone opened the device since we flushed it 5385 }
5344 * so page cache could be dirty and it is too late
5345 * to flush. So abort
5346 */
5347 mutex_unlock(&mddev->open_mutex);
5348 return -EBUSY; 5386 return -EBUSY;
5349 } 5387 }
5350 if (mddev->pers) { 5388 if (mddev->pers) {
@@ -6551,7 +6589,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6551 wait_event(mddev->sb_wait, 6589 wait_event(mddev->sb_wait,
6552 !test_bit(MD_CHANGE_DEVS, &mddev->flags) && 6590 !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
6553 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 6591 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6554 mddev_lock(mddev); 6592 mddev_lock_nointr(mddev);
6555 } 6593 }
6556 } else { 6594 } else {
6557 err = -EROFS; 6595 err = -EROFS;
@@ -7361,9 +7399,6 @@ void md_do_sync(struct md_thread *thread)
7361 mddev->curr_resync = 2; 7399 mddev->curr_resync = 2;
7362 7400
7363 try_again: 7401 try_again:
7364 if (kthread_should_stop())
7365 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7366
7367 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7402 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7368 goto skip; 7403 goto skip;
7369 for_each_mddev(mddev2, tmp) { 7404 for_each_mddev(mddev2, tmp) {
@@ -7388,7 +7423,7 @@ void md_do_sync(struct md_thread *thread)
7388 * be caught by 'softlockup' 7423 * be caught by 'softlockup'
7389 */ 7424 */
7390 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 7425 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
7391 if (!kthread_should_stop() && 7426 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7392 mddev2->curr_resync >= mddev->curr_resync) { 7427 mddev2->curr_resync >= mddev->curr_resync) {
7393 printk(KERN_INFO "md: delaying %s of %s" 7428 printk(KERN_INFO "md: delaying %s of %s"
7394 " until %s has finished (they" 7429 " until %s has finished (they"
@@ -7464,7 +7499,7 @@ void md_do_sync(struct md_thread *thread)
7464 last_check = 0; 7499 last_check = 0;
7465 7500
7466 if (j>2) { 7501 if (j>2) {
7467 printk(KERN_INFO 7502 printk(KERN_INFO
7468 "md: resuming %s of %s from checkpoint.\n", 7503 "md: resuming %s of %s from checkpoint.\n",
7469 desc, mdname(mddev)); 7504 desc, mdname(mddev));
7470 mddev->curr_resync = j; 7505 mddev->curr_resync = j;
@@ -7501,7 +7536,8 @@ void md_do_sync(struct md_thread *thread)
7501 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7536 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7502 } 7537 }
7503 7538
7504 while (j >= mddev->resync_max && !kthread_should_stop()) { 7539 while (j >= mddev->resync_max &&
7540 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7505 /* As this condition is controlled by user-space, 7541 /* As this condition is controlled by user-space,
7506 * we can block indefinitely, so use '_interruptible' 7542 * we can block indefinitely, so use '_interruptible'
7507 * to avoid triggering warnings. 7543 * to avoid triggering warnings.
@@ -7509,17 +7545,18 @@ void md_do_sync(struct md_thread *thread)
7509 flush_signals(current); /* just in case */ 7545 flush_signals(current); /* just in case */
7510 wait_event_interruptible(mddev->recovery_wait, 7546 wait_event_interruptible(mddev->recovery_wait,
7511 mddev->resync_max > j 7547 mddev->resync_max > j
7512 || kthread_should_stop()); 7548 || test_bit(MD_RECOVERY_INTR,
7549 &mddev->recovery));
7513 } 7550 }
7514 7551
7515 if (kthread_should_stop()) 7552 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7516 goto interrupted; 7553 break;
7517 7554
7518 sectors = mddev->pers->sync_request(mddev, j, &skipped, 7555 sectors = mddev->pers->sync_request(mddev, j, &skipped,
7519 currspeed < speed_min(mddev)); 7556 currspeed < speed_min(mddev));
7520 if (sectors == 0) { 7557 if (sectors == 0) {
7521 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7558 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7522 goto out; 7559 break;
7523 } 7560 }
7524 7561
7525 if (!skipped) { /* actual IO requested */ 7562 if (!skipped) { /* actual IO requested */
@@ -7556,10 +7593,8 @@ void md_do_sync(struct md_thread *thread)
7556 last_mark = next; 7593 last_mark = next;
7557 } 7594 }
7558 7595
7559 7596 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7560 if (kthread_should_stop()) 7597 break;
7561 goto interrupted;
7562
7563 7598
7564 /* 7599 /*
7565 * this loop exits only if either when we are slower than 7600 * this loop exits only if either when we are slower than
@@ -7582,11 +7617,12 @@ void md_do_sync(struct md_thread *thread)
7582 } 7617 }
7583 } 7618 }
7584 } 7619 }
7585 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 7620 printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
7621 test_bit(MD_RECOVERY_INTR, &mddev->recovery)
7622 ? "interrupted" : "done");
7586 /* 7623 /*
7587 * this also signals 'finished resyncing' to md_stop 7624 * this also signals 'finished resyncing' to md_stop
7588 */ 7625 */
7589 out:
7590 blk_finish_plug(&plug); 7626 blk_finish_plug(&plug);
7591 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 7627 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
7592 7628
@@ -7640,16 +7676,6 @@ void md_do_sync(struct md_thread *thread)
7640 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 7676 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
7641 md_wakeup_thread(mddev->thread); 7677 md_wakeup_thread(mddev->thread);
7642 return; 7678 return;
7643
7644 interrupted:
7645 /*
7646 * got a signal, exit.
7647 */
7648 printk(KERN_INFO
7649 "md: md_do_sync() got signal ... exiting\n");
7650 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7651 goto out;
7652
7653} 7679}
7654EXPORT_SYMBOL_GPL(md_do_sync); 7680EXPORT_SYMBOL_GPL(md_do_sync);
7655 7681
@@ -7894,6 +7920,7 @@ void md_reap_sync_thread(struct mddev *mddev)
7894 7920
7895 /* resync has finished, collect result */ 7921 /* resync has finished, collect result */
7896 md_unregister_thread(&mddev->sync_thread); 7922 md_unregister_thread(&mddev->sync_thread);
7923 wake_up(&resync_wait);
7897 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 7924 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7898 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 7925 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7899 /* success...*/ 7926 /* success...*/
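Taken together, the md.c hunks retire kthread_should_stop() as the resync-abort mechanism: the stop paths set MD_RECOVERY_INTR, wake the sync thread out of any metadata wait, drop the reconfig mutex and sleep on resync_wait until md_reap_sync_thread() (which now does wake_up(&resync_wait)) has reaped the thread. A condensed, hedged sketch of that stop-side handshake; the helper name is invented, the real code open-codes this sequence in md_set_readonly() and do_md_stop():

/* Condensed from the hunks above; illustrative, not verbatim kernel code. */
static void interrupt_and_reap_resync(struct mddev *mddev)
{
        if (mddev->sync_thread) {
                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                /* the thread might be blocked waiting for a metadata update */
                wake_up_process(mddev->sync_thread->tsk);
        }
        mddev_unlock(mddev);
        wait_event(resync_wait, mddev->sync_thread == NULL);
        mddev_lock_nointr(mddev);
}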
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index af6681b19776..1e5a540995e9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -66,7 +66,8 @@
66 */ 66 */
67static int max_queued_requests = 1024; 67static int max_queued_requests = 1024;
68 68
69static void allow_barrier(struct r1conf *conf); 69static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
70 sector_t bi_sector);
70static void lower_barrier(struct r1conf *conf); 71static void lower_barrier(struct r1conf *conf);
71 72
72static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 73static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
@@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data)
84} 85}
85 86
86#define RESYNC_BLOCK_SIZE (64*1024) 87#define RESYNC_BLOCK_SIZE (64*1024)
87//#define RESYNC_BLOCK_SIZE PAGE_SIZE 88#define RESYNC_DEPTH 32
88#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) 89#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
89#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) 90#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
90#define RESYNC_WINDOW (2048*1024) 91#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
92#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
93#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
91 94
92static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) 95static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
93{ 96{
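With the macro values above, the window arithmetic works out as: RESYNC_SECTORS = 64 KiB >> 9 = 128 sectors, RESYNC_WINDOW = 64 KiB * 32 = 2 MiB (the same value the old 2048*1024 literal encoded), RESYNC_WINDOW_SECTORS = 4096 sectors, and NEXT_NORMALIO_DISTANCE = 3 * 4096 = 12288 sectors (6 MiB). In other words, a normal write is only treated as safely ahead of resync once it starts at least 6 MiB beyond next_resync.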
@@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
225 struct bio *bio = r1_bio->master_bio; 228 struct bio *bio = r1_bio->master_bio;
226 int done; 229 int done;
227 struct r1conf *conf = r1_bio->mddev->private; 230 struct r1conf *conf = r1_bio->mddev->private;
231 sector_t start_next_window = r1_bio->start_next_window;
232 sector_t bi_sector = bio->bi_sector;
228 233
229 if (bio->bi_phys_segments) { 234 if (bio->bi_phys_segments) {
230 unsigned long flags; 235 unsigned long flags;
@@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
232 bio->bi_phys_segments--; 237 bio->bi_phys_segments--;
233 done = (bio->bi_phys_segments == 0); 238 done = (bio->bi_phys_segments == 0);
234 spin_unlock_irqrestore(&conf->device_lock, flags); 239 spin_unlock_irqrestore(&conf->device_lock, flags);
240 /*
241 * make_request() might be waiting for
242 * bi_phys_segments to decrease
243 */
244 wake_up(&conf->wait_barrier);
235 } else 245 } else
236 done = 1; 246 done = 1;
237 247
@@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
243 * Wake up any possible resync thread that waits for the device 253 * Wake up any possible resync thread that waits for the device
244 * to go idle. 254 * to go idle.
245 */ 255 */
246 allow_barrier(conf); 256 allow_barrier(conf, start_next_window, bi_sector);
247 } 257 }
248} 258}
249 259
@@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf)
814 * there is no normal IO happeing. It must arrange to call 824 * there is no normal IO happeing. It must arrange to call
815 * lower_barrier when the particular background IO completes. 825 * lower_barrier when the particular background IO completes.
816 */ 826 */
817#define RESYNC_DEPTH 32
818
819static void raise_barrier(struct r1conf *conf) 827static void raise_barrier(struct r1conf *conf)
820{ 828{
821 spin_lock_irq(&conf->resync_lock); 829 spin_lock_irq(&conf->resync_lock);
@@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf)
827 /* block any new IO from starting */ 835 /* block any new IO from starting */
828 conf->barrier++; 836 conf->barrier++;
829 837
830 /* Now wait for all pending IO to complete */ 838 /* For these conditions we must wait:
839 * A: while the array is in frozen state
840 * B: while barrier >= RESYNC_DEPTH, meaning resync reach
841 * the max count which allowed.
842 * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
843 * next resync will reach to the window which normal bios are
844 * handling.
845 */
831 wait_event_lock_irq(conf->wait_barrier, 846 wait_event_lock_irq(conf->wait_barrier,
832 !conf->nr_pending && conf->barrier < RESYNC_DEPTH, 847 !conf->array_frozen &&
848 conf->barrier < RESYNC_DEPTH &&
849 (conf->start_next_window >=
850 conf->next_resync + RESYNC_SECTORS),
833 conf->resync_lock); 851 conf->resync_lock);
834 852
835 spin_unlock_irq(&conf->resync_lock); 853 spin_unlock_irq(&conf->resync_lock);
@@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf)
845 wake_up(&conf->wait_barrier); 863 wake_up(&conf->wait_barrier);
846} 864}
847 865
848static void wait_barrier(struct r1conf *conf) 866static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
849{ 867{
868 bool wait = false;
869
870 if (conf->array_frozen || !bio)
871 wait = true;
872 else if (conf->barrier && bio_data_dir(bio) == WRITE) {
873 if (conf->next_resync < RESYNC_WINDOW_SECTORS)
874 wait = true;
875 else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
876 >= bio_end_sector(bio)) ||
877 (conf->next_resync + NEXT_NORMALIO_DISTANCE
878 <= bio->bi_sector))
879 wait = false;
880 else
881 wait = true;
882 }
883
884 return wait;
885}
886
887static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
888{
889 sector_t sector = 0;
890
850 spin_lock_irq(&conf->resync_lock); 891 spin_lock_irq(&conf->resync_lock);
851 if (conf->barrier) { 892 if (need_to_wait_for_sync(conf, bio)) {
852 conf->nr_waiting++; 893 conf->nr_waiting++;
853 /* Wait for the barrier to drop. 894 /* Wait for the barrier to drop.
854 * However if there are already pending 895 * However if there are already pending
@@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf)
860 * count down. 901 * count down.
861 */ 902 */
862 wait_event_lock_irq(conf->wait_barrier, 903 wait_event_lock_irq(conf->wait_barrier,
863 !conf->barrier || 904 !conf->array_frozen &&
864 (conf->nr_pending && 905 (!conf->barrier ||
906 ((conf->start_next_window <
907 conf->next_resync + RESYNC_SECTORS) &&
865 current->bio_list && 908 current->bio_list &&
866 !bio_list_empty(current->bio_list)), 909 !bio_list_empty(current->bio_list))),
867 conf->resync_lock); 910 conf->resync_lock);
868 conf->nr_waiting--; 911 conf->nr_waiting--;
869 } 912 }
913
914 if (bio && bio_data_dir(bio) == WRITE) {
915 if (conf->next_resync + NEXT_NORMALIO_DISTANCE
916 <= bio->bi_sector) {
917 if (conf->start_next_window == MaxSector)
918 conf->start_next_window =
919 conf->next_resync +
920 NEXT_NORMALIO_DISTANCE;
921
922 if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
923 <= bio->bi_sector)
924 conf->next_window_requests++;
925 else
926 conf->current_window_requests++;
927 }
928 if (bio->bi_sector >= conf->start_next_window)
929 sector = conf->start_next_window;
930 }
931
870 conf->nr_pending++; 932 conf->nr_pending++;
871 spin_unlock_irq(&conf->resync_lock); 933 spin_unlock_irq(&conf->resync_lock);
934 return sector;
872} 935}
873 936
874static void allow_barrier(struct r1conf *conf) 937static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
938 sector_t bi_sector)
875{ 939{
876 unsigned long flags; 940 unsigned long flags;
941
877 spin_lock_irqsave(&conf->resync_lock, flags); 942 spin_lock_irqsave(&conf->resync_lock, flags);
878 conf->nr_pending--; 943 conf->nr_pending--;
944 if (start_next_window) {
945 if (start_next_window == conf->start_next_window) {
946 if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
947 <= bi_sector)
948 conf->next_window_requests--;
949 else
950 conf->current_window_requests--;
951 } else
952 conf->current_window_requests--;
953
954 if (!conf->current_window_requests) {
955 if (conf->next_window_requests) {
956 conf->current_window_requests =
957 conf->next_window_requests;
958 conf->next_window_requests = 0;
959 conf->start_next_window +=
960 NEXT_NORMALIO_DISTANCE;
961 } else
962 conf->start_next_window = MaxSector;
963 }
964 }
879 spin_unlock_irqrestore(&conf->resync_lock, flags); 965 spin_unlock_irqrestore(&conf->resync_lock, flags);
880 wake_up(&conf->wait_barrier); 966 wake_up(&conf->wait_barrier);
881} 967}
@@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra)
884{ 970{
885 /* stop syncio and normal IO and wait for everything to 971 /* stop syncio and normal IO and wait for everything to
886 * go quite. 972 * go quite.
887 * We increment barrier and nr_waiting, and then 973 * We wait until nr_pending match nr_queued+extra
888 * wait until nr_pending match nr_queued+extra
889 * This is called in the context of one normal IO request 974 * This is called in the context of one normal IO request
890 * that has failed. Thus any sync request that might be pending 975 * that has failed. Thus any sync request that might be pending
891 * will be blocked by nr_pending, and we need to wait for 976 * will be blocked by nr_pending, and we need to wait for
@@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra)
895 * we continue. 980 * we continue.
896 */ 981 */
897 spin_lock_irq(&conf->resync_lock); 982 spin_lock_irq(&conf->resync_lock);
898 conf->barrier++; 983 conf->array_frozen = 1;
899 conf->nr_waiting++;
900 wait_event_lock_irq_cmd(conf->wait_barrier, 984 wait_event_lock_irq_cmd(conf->wait_barrier,
901 conf->nr_pending == conf->nr_queued+extra, 985 conf->nr_pending == conf->nr_queued+extra,
902 conf->resync_lock, 986 conf->resync_lock,
@@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf)
907{ 991{
908 /* reverse the effect of the freeze */ 992 /* reverse the effect of the freeze */
909 spin_lock_irq(&conf->resync_lock); 993 spin_lock_irq(&conf->resync_lock);
910 conf->barrier--; 994 conf->array_frozen = 0;
911 conf->nr_waiting--;
912 wake_up(&conf->wait_barrier); 995 wake_up(&conf->wait_barrier);
913 spin_unlock_irq(&conf->resync_lock); 996 spin_unlock_irq(&conf->resync_lock);
914} 997}
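With freezing no longer piggy-backing on barrier/nr_waiting, every "quiesce for reconfiguration" site now pairs freeze_array() with unfreeze_array(), as the stop() and raid1_quiesce() hunks further down show. A minimal sketch of the pattern; the surrounding function is hypothetical, the real callers are stop(), raid1_quiesce() and the error handlers:

/* Illustrative only. */
static void reconfigure_r1conf(struct r1conf *conf)
{
        freeze_array(conf, 0);  /* waits until nr_pending == nr_queued */
        /* ... array is quiesced: safe to tear down or swap resources ... */
        unfreeze_array(conf);   /* clears array_frozen, wakes wait_barrier() sleepers */
}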
@@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1013 int first_clone; 1096 int first_clone;
1014 int sectors_handled; 1097 int sectors_handled;
1015 int max_sectors; 1098 int max_sectors;
1099 sector_t start_next_window;
1016 1100
1017 /* 1101 /*
1018 * Register the new request and wait if the reconstruction 1102 * Register the new request and wait if the reconstruction
@@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1042 finish_wait(&conf->wait_barrier, &w); 1126 finish_wait(&conf->wait_barrier, &w);
1043 } 1127 }
1044 1128
1045 wait_barrier(conf); 1129 start_next_window = wait_barrier(conf, bio);
1046 1130
1047 bitmap = mddev->bitmap; 1131 bitmap = mddev->bitmap;
1048 1132
@@ -1163,6 +1247,7 @@ read_again:
1163 1247
1164 disks = conf->raid_disks * 2; 1248 disks = conf->raid_disks * 2;
1165 retry_write: 1249 retry_write:
1250 r1_bio->start_next_window = start_next_window;
1166 blocked_rdev = NULL; 1251 blocked_rdev = NULL;
1167 rcu_read_lock(); 1252 rcu_read_lock();
1168 max_sectors = r1_bio->sectors; 1253 max_sectors = r1_bio->sectors;
@@ -1231,14 +1316,24 @@ read_again:
1231 if (unlikely(blocked_rdev)) { 1316 if (unlikely(blocked_rdev)) {
1232 /* Wait for this device to become unblocked */ 1317 /* Wait for this device to become unblocked */
1233 int j; 1318 int j;
1319 sector_t old = start_next_window;
1234 1320
1235 for (j = 0; j < i; j++) 1321 for (j = 0; j < i; j++)
1236 if (r1_bio->bios[j]) 1322 if (r1_bio->bios[j])
1237 rdev_dec_pending(conf->mirrors[j].rdev, mddev); 1323 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
1238 r1_bio->state = 0; 1324 r1_bio->state = 0;
1239 allow_barrier(conf); 1325 allow_barrier(conf, start_next_window, bio->bi_sector);
1240 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1326 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1241 wait_barrier(conf); 1327 start_next_window = wait_barrier(conf, bio);
1328 /*
1329 * We must make sure the multi r1bios of bio have
1330 * the same value of bi_phys_segments
1331 */
1332 if (bio->bi_phys_segments && old &&
1333 old != start_next_window)
1334 /* Wait for the former r1bio(s) to complete */
1335 wait_event(conf->wait_barrier,
1336 bio->bi_phys_segments == 1);
1242 goto retry_write; 1337 goto retry_write;
1243 } 1338 }
1244 1339
@@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf)
1438 1533
1439static void close_sync(struct r1conf *conf) 1534static void close_sync(struct r1conf *conf)
1440{ 1535{
1441 wait_barrier(conf); 1536 wait_barrier(conf, NULL);
1442 allow_barrier(conf); 1537 allow_barrier(conf, 0, 0);
1443 1538
1444 mempool_destroy(conf->r1buf_pool); 1539 mempool_destroy(conf->r1buf_pool);
1445 conf->r1buf_pool = NULL; 1540 conf->r1buf_pool = NULL;
1541
1542 conf->next_resync = 0;
1543 conf->start_next_window = MaxSector;
1446} 1544}
1447 1545
1448static int raid1_spare_active(struct mddev *mddev) 1546static int raid1_spare_active(struct mddev *mddev)
@@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2714 conf->pending_count = 0; 2812 conf->pending_count = 0;
2715 conf->recovery_disabled = mddev->recovery_disabled - 1; 2813 conf->recovery_disabled = mddev->recovery_disabled - 1;
2716 2814
2815 conf->start_next_window = MaxSector;
2816 conf->current_window_requests = conf->next_window_requests = 0;
2817
2717 err = -EIO; 2818 err = -EIO;
2718 for (i = 0; i < conf->raid_disks * 2; i++) { 2819 for (i = 0; i < conf->raid_disks * 2; i++) {
2719 2820
@@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev)
2871 atomic_read(&bitmap->behind_writes) == 0); 2972 atomic_read(&bitmap->behind_writes) == 0);
2872 } 2973 }
2873 2974
2874 raise_barrier(conf); 2975 freeze_array(conf, 0);
2875 lower_barrier(conf); 2976 unfreeze_array(conf);
2876 2977
2877 md_unregister_thread(&mddev->thread); 2978 md_unregister_thread(&mddev->thread);
2878 if (conf->r1bio_pool) 2979 if (conf->r1bio_pool)
@@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state)
3031 wake_up(&conf->wait_barrier); 3132 wake_up(&conf->wait_barrier);
3032 break; 3133 break;
3033 case 1: 3134 case 1:
3034 raise_barrier(conf); 3135 freeze_array(conf, 0);
3035 break; 3136 break;
3036 case 0: 3137 case 0:
3037 lower_barrier(conf); 3138 unfreeze_array(conf);
3038 break; 3139 break;
3039 } 3140 }
3040} 3141}
@@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev)
3051 mddev->new_chunk_sectors = 0; 3152 mddev->new_chunk_sectors = 0;
3052 conf = setup_conf(mddev); 3153 conf = setup_conf(mddev);
3053 if (!IS_ERR(conf)) 3154 if (!IS_ERR(conf))
3054 conf->barrier = 1; 3155 /* Array must appear to be quiesced */
3156 conf->array_frozen = 1;
3055 return conf; 3157 return conf;
3056 } 3158 }
3057 return ERR_PTR(-EINVAL); 3159 return ERR_PTR(-EINVAL);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 0ff3715fb7eb..9bebca7bff2f 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -41,6 +41,19 @@ struct r1conf {
41 */ 41 */
42 sector_t next_resync; 42 sector_t next_resync;
43 43
44 /* When raid1 starts resync, we divide array into four partitions
45 * |---------|--------------|---------------------|-------------|
46 * next_resync start_next_window end_window
47 * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
48 * end_window = start_next_window + NEXT_NORMALIO_DISTANCE
49 * current_window_requests means the count of normalIO between
50 * start_next_window and end_window.
51 * next_window_requests means the count of normalIO after end_window.
52 * */
53 sector_t start_next_window;
54 int current_window_requests;
55 int next_window_requests;
56
44 spinlock_t device_lock; 57 spinlock_t device_lock;
45 58
46 /* list of 'struct r1bio' that need to be processed by raid1d, 59 /* list of 'struct r1bio' that need to be processed by raid1d,
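To put numbers on the comment above (using NEXT_NORMALIO_DISTANCE = 12288 sectors, as computed earlier): if next_resync = 100000, then start_next_window = 112288 and end_window = 124576. A write starting at sector 118000 falls between start_next_window and end_window, so it proceeds and bumps current_window_requests; one starting at 130000 falls beyond end_window and bumps next_window_requests; one starting at 105000 is inside the region resync is about to cover and has to sleep in wait_barrier() until the barrier drops.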
@@ -65,6 +78,7 @@ struct r1conf {
65 int nr_waiting; 78 int nr_waiting;
66 int nr_queued; 79 int nr_queued;
67 int barrier; 80 int barrier;
81 int array_frozen;
68 82
69 /* Set to 1 if a full sync is needed, (fresh device added). 83 /* Set to 1 if a full sync is needed, (fresh device added).
70 * Cleared when a sync completes. 84 * Cleared when a sync completes.
@@ -111,6 +125,7 @@ struct r1bio {
111 * in this BehindIO request 125 * in this BehindIO request
112 */ 126 */
113 sector_t sector; 127 sector_t sector;
128 sector_t start_next_window;
114 int sectors; 129 int sectors;
115 unsigned long state; 130 unsigned long state;
116 struct mddev *mddev; 131 struct mddev *mddev;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7c3508abb5e1..c504e8389e69 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4384,7 +4384,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4384 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4384 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4385 md_wakeup_thread(mddev->thread); 4385 md_wakeup_thread(mddev->thread);
4386 wait_event(mddev->sb_wait, mddev->flags == 0 || 4386 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4387 kthread_should_stop()); 4387 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4388 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4389 allow_barrier(conf);
4390 return sectors_done;
4391 }
4388 conf->reshape_safe = mddev->reshape_position; 4392 conf->reshape_safe = mddev->reshape_position;
4389 allow_barrier(conf); 4393 allow_barrier(conf);
4390 } 4394 }
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7f0e17a27aeb..47da0af6322b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
85 return &conf->stripe_hashtbl[hash]; 85 return &conf->stripe_hashtbl[hash];
86} 86}
87 87
88static inline int stripe_hash_locks_hash(sector_t sect)
89{
90 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
91}
92
93static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
94{
95 spin_lock_irq(conf->hash_locks + hash);
96 spin_lock(&conf->device_lock);
97}
98
99static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
100{
101 spin_unlock(&conf->device_lock);
102 spin_unlock_irq(conf->hash_locks + hash);
103}
104
105static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
106{
107 int i;
108 local_irq_disable();
109 spin_lock(conf->hash_locks);
110 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
111 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
112 spin_lock(&conf->device_lock);
113}
114
115static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
116{
117 int i;
118 spin_unlock(&conf->device_lock);
119 for (i = NR_STRIPE_HASH_LOCKS; i; i--)
120 spin_unlock(conf->hash_locks + i - 1);
121 local_irq_enable();
122}
123
88/* bio's attached to a stripe+device for I/O are linked together in bi_sector 124/* bio's attached to a stripe+device for I/O are linked together in bi_sector
89 * order without overlap. There may be several bio's per stripe+device, and 125 * order without overlap. There may be several bio's per stripe+device, and
90 * a bio could span several devices. 126 * a bio could span several devices.
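The helpers above shard the stripe cache: stripe_hash_locks_hash() maps a stripe's sector to one of NR_STRIPE_HASH_LOCKS spinlocks, each guarding its own inactive list. Assuming 4 KiB stripes (STRIPE_SHIFT = 3) and eight hash locks (the values used elsewhere in this series, though neither constant appears in this excerpt), sectors 0-7 map to hash 0, sectors 8-15 to hash 1, and sectors 64-71 wrap back to hash 0, so consecutive stripes land on different locks. When both locks are needed, device_lock nests inside the hash lock (lock_device_hash_lock() takes them in that order), which lets many get/release operations avoid device_lock entirely.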
@@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
249 } 285 }
250} 286}
251 287
252static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 288static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
289 struct list_head *temp_inactive_list)
253{ 290{
254 BUG_ON(!list_empty(&sh->lru)); 291 BUG_ON(!list_empty(&sh->lru));
255 BUG_ON(atomic_read(&conf->active_stripes)==0); 292 BUG_ON(atomic_read(&conf->active_stripes)==0);
@@ -278,23 +315,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
278 < IO_THRESHOLD) 315 < IO_THRESHOLD)
279 md_wakeup_thread(conf->mddev->thread); 316 md_wakeup_thread(conf->mddev->thread);
280 atomic_dec(&conf->active_stripes); 317 atomic_dec(&conf->active_stripes);
281 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 318 if (!test_bit(STRIPE_EXPANDING, &sh->state))
282 list_add_tail(&sh->lru, &conf->inactive_list); 319 list_add_tail(&sh->lru, temp_inactive_list);
283 wake_up(&conf->wait_for_stripe);
284 if (conf->retry_read_aligned)
285 md_wakeup_thread(conf->mddev->thread);
286 }
287 } 320 }
288} 321}
289 322
290static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 323static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
324 struct list_head *temp_inactive_list)
291{ 325{
292 if (atomic_dec_and_test(&sh->count)) 326 if (atomic_dec_and_test(&sh->count))
293 do_release_stripe(conf, sh); 327 do_release_stripe(conf, sh, temp_inactive_list);
328}
329
330/*
331 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
332 *
333 * Be careful: Only one task can add/delete stripes from temp_inactive_list at
334 * given time. Adding stripes only takes device lock, while deleting stripes
335 * only takes hash lock.
336 */
337static void release_inactive_stripe_list(struct r5conf *conf,
338 struct list_head *temp_inactive_list,
339 int hash)
340{
341 int size;
342 bool do_wakeup = false;
343 unsigned long flags;
344
345 if (hash == NR_STRIPE_HASH_LOCKS) {
346 size = NR_STRIPE_HASH_LOCKS;
347 hash = NR_STRIPE_HASH_LOCKS - 1;
348 } else
349 size = 1;
350 while (size) {
351 struct list_head *list = &temp_inactive_list[size - 1];
352
353 /*
354 * We don't hold any lock here yet, get_active_stripe() might
355 * remove stripes from the list
356 */
357 if (!list_empty_careful(list)) {
358 spin_lock_irqsave(conf->hash_locks + hash, flags);
359 if (list_empty(conf->inactive_list + hash) &&
360 !list_empty(list))
361 atomic_dec(&conf->empty_inactive_list_nr);
362 list_splice_tail_init(list, conf->inactive_list + hash);
363 do_wakeup = true;
364 spin_unlock_irqrestore(conf->hash_locks + hash, flags);
365 }
366 size--;
367 hash--;
368 }
369
370 if (do_wakeup) {
371 wake_up(&conf->wait_for_stripe);
372 if (conf->retry_read_aligned)
373 md_wakeup_thread(conf->mddev->thread);
374 }
294} 375}
295 376
296/* should hold conf->device_lock already */ 377/* should hold conf->device_lock already */
297static int release_stripe_list(struct r5conf *conf) 378static int release_stripe_list(struct r5conf *conf,
379 struct list_head *temp_inactive_list)
298{ 380{
299 struct stripe_head *sh; 381 struct stripe_head *sh;
300 int count = 0; 382 int count = 0;
@@ -303,6 +385,8 @@ static int release_stripe_list(struct r5conf *conf)
303 head = llist_del_all(&conf->released_stripes); 385 head = llist_del_all(&conf->released_stripes);
304 head = llist_reverse_order(head); 386 head = llist_reverse_order(head);
305 while (head) { 387 while (head) {
388 int hash;
389
306 sh = llist_entry(head, struct stripe_head, release_list); 390 sh = llist_entry(head, struct stripe_head, release_list);
307 head = llist_next(head); 391 head = llist_next(head);
308 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ 392 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
@@ -313,7 +397,8 @@ static int release_stripe_list(struct r5conf *conf)
313 * again, the count is always > 1. This is true for 397 * again, the count is always > 1. This is true for
314 * STRIPE_ON_UNPLUG_LIST bit too. 398 * STRIPE_ON_UNPLUG_LIST bit too.
315 */ 399 */
316 __release_stripe(conf, sh); 400 hash = sh->hash_lock_index;
401 __release_stripe(conf, sh, &temp_inactive_list[hash]);
317 count++; 402 count++;
318 } 403 }
319 404
@@ -324,9 +409,12 @@ static void release_stripe(struct stripe_head *sh)
324{ 409{
325 struct r5conf *conf = sh->raid_conf; 410 struct r5conf *conf = sh->raid_conf;
326 unsigned long flags; 411 unsigned long flags;
412 struct list_head list;
413 int hash;
327 bool wakeup; 414 bool wakeup;
328 415
329 if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 416 if (unlikely(!conf->mddev->thread) ||
417 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
330 goto slow_path; 418 goto slow_path;
331 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 419 wakeup = llist_add(&sh->release_list, &conf->released_stripes);
332 if (wakeup) 420 if (wakeup)
@@ -336,8 +424,11 @@ slow_path:
336 local_irq_save(flags); 424 local_irq_save(flags);
337 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 425 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
338 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 426 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
339 do_release_stripe(conf, sh); 427 INIT_LIST_HEAD(&list);
428 hash = sh->hash_lock_index;
429 do_release_stripe(conf, sh, &list);
340 spin_unlock(&conf->device_lock); 430 spin_unlock(&conf->device_lock);
431 release_inactive_stripe_list(conf, &list, hash);
341 } 432 }
342 local_irq_restore(flags); 433 local_irq_restore(flags);
343} 434}
@@ -362,18 +453,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
362 453
363 454
364/* find an idle stripe, make sure it is unhashed, and return it. */ 455/* find an idle stripe, make sure it is unhashed, and return it. */
365static struct stripe_head *get_free_stripe(struct r5conf *conf) 456static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
366{ 457{
367 struct stripe_head *sh = NULL; 458 struct stripe_head *sh = NULL;
368 struct list_head *first; 459 struct list_head *first;
369 460
370 if (list_empty(&conf->inactive_list)) 461 if (list_empty(conf->inactive_list + hash))
371 goto out; 462 goto out;
372 first = conf->inactive_list.next; 463 first = (conf->inactive_list + hash)->next;
373 sh = list_entry(first, struct stripe_head, lru); 464 sh = list_entry(first, struct stripe_head, lru);
374 list_del_init(first); 465 list_del_init(first);
375 remove_hash(sh); 466 remove_hash(sh);
376 atomic_inc(&conf->active_stripes); 467 atomic_inc(&conf->active_stripes);
468 BUG_ON(hash != sh->hash_lock_index);
469 if (list_empty(conf->inactive_list + hash))
470 atomic_inc(&conf->empty_inactive_list_nr);
377out: 471out:
378 return sh; 472 return sh;
379} 473}
@@ -416,7 +510,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
416static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 510static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
417{ 511{
418 struct r5conf *conf = sh->raid_conf; 512 struct r5conf *conf = sh->raid_conf;
419 int i; 513 int i, seq;
420 514
421 BUG_ON(atomic_read(&sh->count) != 0); 515 BUG_ON(atomic_read(&sh->count) != 0);
422 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 516 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
@@ -426,7 +520,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
426 (unsigned long long)sh->sector); 520 (unsigned long long)sh->sector);
427 521
428 remove_hash(sh); 522 remove_hash(sh);
429 523retry:
524 seq = read_seqcount_begin(&conf->gen_lock);
430 sh->generation = conf->generation - previous; 525 sh->generation = conf->generation - previous;
431 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 526 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
432 sh->sector = sector; 527 sh->sector = sector;
@@ -448,6 +543,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
448 dev->flags = 0; 543 dev->flags = 0;
449 raid5_build_block(sh, i, previous); 544 raid5_build_block(sh, i, previous);
450 } 545 }
546 if (read_seqcount_retry(&conf->gen_lock, seq))
547 goto retry;
451 insert_hash(conf, sh); 548 insert_hash(conf, sh);
452 sh->cpu = smp_processor_id(); 549 sh->cpu = smp_processor_id();
453} 550}
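init_stripe() now guards its setup with conf->gen_lock: if a reshape bumps the generation while the stripe is being initialised, read_seqcount_retry() fails and the whole block is redone. The writer side is not visible in this excerpt; a generic, hedged sketch of the seqcount pairing follows, with illustrative names that are not taken from the md code:

#include <linux/seqlock.h>

static seqcount_t cfg_seq;      /* seqcount_init(&cfg_seq) at setup, not shown */
static int cfg_value;

static void cfg_write(int v)    /* writers must already be serialised by a lock */
{
        write_seqcount_begin(&cfg_seq);
        cfg_value = v;
        write_seqcount_end(&cfg_seq);
}

static int cfg_read(void)
{
        unsigned seq;
        int v;

        do {
                seq = read_seqcount_begin(&cfg_seq);
                v = cfg_value;
        } while (read_seqcount_retry(&cfg_seq, seq));   /* raced with a writer: retry */
        return v;
}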
@@ -552,29 +649,31 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
552 int previous, int noblock, int noquiesce) 649 int previous, int noblock, int noquiesce)
553{ 650{
554 struct stripe_head *sh; 651 struct stripe_head *sh;
652 int hash = stripe_hash_locks_hash(sector);
555 653
556 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 654 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
557 655
558 spin_lock_irq(&conf->device_lock); 656 spin_lock_irq(conf->hash_locks + hash);
559 657
560 do { 658 do {
561 wait_event_lock_irq(conf->wait_for_stripe, 659 wait_event_lock_irq(conf->wait_for_stripe,
562 conf->quiesce == 0 || noquiesce, 660 conf->quiesce == 0 || noquiesce,
563 conf->device_lock); 661 *(conf->hash_locks + hash));
564 sh = __find_stripe(conf, sector, conf->generation - previous); 662 sh = __find_stripe(conf, sector, conf->generation - previous);
565 if (!sh) { 663 if (!sh) {
566 if (!conf->inactive_blocked) 664 if (!conf->inactive_blocked)
567 sh = get_free_stripe(conf); 665 sh = get_free_stripe(conf, hash);
568 if (noblock && sh == NULL) 666 if (noblock && sh == NULL)
569 break; 667 break;
570 if (!sh) { 668 if (!sh) {
571 conf->inactive_blocked = 1; 669 conf->inactive_blocked = 1;
572 wait_event_lock_irq(conf->wait_for_stripe, 670 wait_event_lock_irq(
573 !list_empty(&conf->inactive_list) && 671 conf->wait_for_stripe,
574 (atomic_read(&conf->active_stripes) 672 !list_empty(conf->inactive_list + hash) &&
575 < (conf->max_nr_stripes *3/4) 673 (atomic_read(&conf->active_stripes)
576 || !conf->inactive_blocked), 674 < (conf->max_nr_stripes * 3 / 4)
577 conf->device_lock); 675 || !conf->inactive_blocked),
676 *(conf->hash_locks + hash));
578 conf->inactive_blocked = 0; 677 conf->inactive_blocked = 0;
579 } else 678 } else
580 init_stripe(sh, sector, previous); 679 init_stripe(sh, sector, previous);
@@ -585,9 +684,11 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
585 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) 684 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
586 && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); 685 && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
587 } else { 686 } else {
687 spin_lock(&conf->device_lock);
588 if (!test_bit(STRIPE_HANDLE, &sh->state)) 688 if (!test_bit(STRIPE_HANDLE, &sh->state))
589 atomic_inc(&conf->active_stripes); 689 atomic_inc(&conf->active_stripes);
590 if (list_empty(&sh->lru) && 690 if (list_empty(&sh->lru) &&
691 !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) &&
591 !test_bit(STRIPE_EXPANDING, &sh->state)) 692 !test_bit(STRIPE_EXPANDING, &sh->state))
592 BUG(); 693 BUG();
593 list_del_init(&sh->lru); 694 list_del_init(&sh->lru);
@@ -595,6 +696,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
595 sh->group->stripes_cnt--; 696 sh->group->stripes_cnt--;
596 sh->group = NULL; 697 sh->group = NULL;
597 } 698 }
699 spin_unlock(&conf->device_lock);
598 } 700 }
599 } 701 }
600 } while (sh == NULL); 702 } while (sh == NULL);
@@ -602,7 +704,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
602 if (sh) 704 if (sh)
603 atomic_inc(&sh->count); 705 atomic_inc(&sh->count);
604 706
605 spin_unlock_irq(&conf->device_lock); 707 spin_unlock_irq(conf->hash_locks + hash);
606 return sh; 708 return sh;
607} 709}
608 710
@@ -758,7 +860,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
758 bi->bi_sector = (sh->sector 860 bi->bi_sector = (sh->sector
759 + rdev->data_offset); 861 + rdev->data_offset);
760 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 862 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
761 bi->bi_rw |= REQ_FLUSH; 863 bi->bi_rw |= REQ_NOMERGE;
762 864
763 bi->bi_vcnt = 1; 865 bi->bi_vcnt = 1;
764 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 866 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -1582,7 +1684,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1582 put_cpu(); 1684 put_cpu();
1583} 1685}
1584 1686
1585static int grow_one_stripe(struct r5conf *conf) 1687static int grow_one_stripe(struct r5conf *conf, int hash)
1586{ 1688{
1587 struct stripe_head *sh; 1689 struct stripe_head *sh;
1588 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1690 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
@@ -1598,6 +1700,7 @@ static int grow_one_stripe(struct r5conf *conf)
1598 kmem_cache_free(conf->slab_cache, sh); 1700 kmem_cache_free(conf->slab_cache, sh);
1599 return 0; 1701 return 0;
1600 } 1702 }
1703 sh->hash_lock_index = hash;
1601 /* we just created an active stripe so... */ 1704 /* we just created an active stripe so... */
1602 atomic_set(&sh->count, 1); 1705 atomic_set(&sh->count, 1);
1603 atomic_inc(&conf->active_stripes); 1706 atomic_inc(&conf->active_stripes);
@@ -1610,6 +1713,7 @@ static int grow_stripes(struct r5conf *conf, int num)
1610{ 1713{
1611 struct kmem_cache *sc; 1714 struct kmem_cache *sc;
1612 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1715 int devs = max(conf->raid_disks, conf->previous_raid_disks);
1716 int hash;
1613 1717
1614 if (conf->mddev->gendisk) 1718 if (conf->mddev->gendisk)
1615 sprintf(conf->cache_name[0], 1719 sprintf(conf->cache_name[0],
@@ -1627,9 +1731,13 @@ static int grow_stripes(struct r5conf *conf, int num)
1627 return 1; 1731 return 1;
1628 conf->slab_cache = sc; 1732 conf->slab_cache = sc;
1629 conf->pool_size = devs; 1733 conf->pool_size = devs;
1630 while (num--) 1734 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
1631 if (!grow_one_stripe(conf)) 1735 while (num--) {
1736 if (!grow_one_stripe(conf, hash))
1632 return 1; 1737 return 1;
1738 conf->max_nr_stripes++;
1739 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
1740 }
1633 return 0; 1741 return 0;
1634} 1742}
1635 1743
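The round-robin above spreads the stripe cache evenly over the hash lists: starting from an empty cache and assuming eight hash locks, growing 256 stripes leaves 32 on each per-hash inactive list. The resize_stripes() hunk below preserves that split; its quota per hash is max_nr_stripes / NR_STRIPE_HASH_LOCKS, plus one for the first (max_nr_stripes % NR_STRIPE_HASH_LOCKS) lists when the count does not divide evenly.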
@@ -1687,6 +1795,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1687 int err; 1795 int err;
1688 struct kmem_cache *sc; 1796 struct kmem_cache *sc;
1689 int i; 1797 int i;
1798 int hash, cnt;
1690 1799
1691 if (newsize <= conf->pool_size) 1800 if (newsize <= conf->pool_size)
1692 return 0; /* never bother to shrink */ 1801 return 0; /* never bother to shrink */
@@ -1726,19 +1835,29 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1726 * OK, we have enough stripes, start collecting inactive 1835 * OK, we have enough stripes, start collecting inactive
1727 * stripes and copying them over 1836 * stripes and copying them over
1728 */ 1837 */
1838 hash = 0;
1839 cnt = 0;
1729 list_for_each_entry(nsh, &newstripes, lru) { 1840 list_for_each_entry(nsh, &newstripes, lru) {
1730 spin_lock_irq(&conf->device_lock); 1841 lock_device_hash_lock(conf, hash);
1731 wait_event_lock_irq(conf->wait_for_stripe, 1842 wait_event_cmd(conf->wait_for_stripe,
1732 !list_empty(&conf->inactive_list), 1843 !list_empty(conf->inactive_list + hash),
1733 conf->device_lock); 1844 unlock_device_hash_lock(conf, hash),
1734 osh = get_free_stripe(conf); 1845 lock_device_hash_lock(conf, hash));
1735 spin_unlock_irq(&conf->device_lock); 1846 osh = get_free_stripe(conf, hash);
1847 unlock_device_hash_lock(conf, hash);
1736 atomic_set(&nsh->count, 1); 1848 atomic_set(&nsh->count, 1);
1737 for(i=0; i<conf->pool_size; i++) 1849 for(i=0; i<conf->pool_size; i++)
1738 nsh->dev[i].page = osh->dev[i].page; 1850 nsh->dev[i].page = osh->dev[i].page;
1739 for( ; i<newsize; i++) 1851 for( ; i<newsize; i++)
1740 nsh->dev[i].page = NULL; 1852 nsh->dev[i].page = NULL;
1853 nsh->hash_lock_index = hash;
1741 kmem_cache_free(conf->slab_cache, osh); 1854 kmem_cache_free(conf->slab_cache, osh);
1855 cnt++;
1856 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
1857 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
1858 hash++;
1859 cnt = 0;
1860 }
1742 } 1861 }
1743 kmem_cache_destroy(conf->slab_cache); 1862 kmem_cache_destroy(conf->slab_cache);
1744 1863
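resize_stripes() goes hand in hand with the new wait_event_cmd() added in this pull ("wait: add wait_event_cmd()" in the list above; the wait.h hunk itself is not part of this excerpt): with two nested locks in play (hash lock plus device_lock), the old wait_event_lock_irq() idiom no longer fits, so the caller passes explicit unlock/relock commands. An annotated restatement of the call used in the hunk above:

lock_device_hash_lock(conf, hash);               /* hash lock, then device_lock */
wait_event_cmd(conf->wait_for_stripe,
               !list_empty(conf->inactive_list + hash),
               unlock_device_hash_lock(conf, hash),     /* cmd1: runs before each sleep */
               lock_device_hash_lock(conf, hash));      /* cmd2: runs after each wake-up */
/* both locks are held again here, and whenever the condition is evaluated */
osh = get_free_stripe(conf, hash);
unlock_device_hash_lock(conf, hash);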
@@ -1797,13 +1916,13 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1797 return err; 1916 return err;
1798} 1917}
1799 1918
1800static int drop_one_stripe(struct r5conf *conf) 1919static int drop_one_stripe(struct r5conf *conf, int hash)
1801{ 1920{
1802 struct stripe_head *sh; 1921 struct stripe_head *sh;
1803 1922
1804 spin_lock_irq(&conf->device_lock); 1923 spin_lock_irq(conf->hash_locks + hash);
1805 sh = get_free_stripe(conf); 1924 sh = get_free_stripe(conf, hash);
1806 spin_unlock_irq(&conf->device_lock); 1925 spin_unlock_irq(conf->hash_locks + hash);
1807 if (!sh) 1926 if (!sh)
1808 return 0; 1927 return 0;
1809 BUG_ON(atomic_read(&sh->count)); 1928 BUG_ON(atomic_read(&sh->count));
@@ -1815,8 +1934,10 @@ static int drop_one_stripe(struct r5conf *conf)
1815 1934
1816static void shrink_stripes(struct r5conf *conf) 1935static void shrink_stripes(struct r5conf *conf)
1817{ 1936{
1818 while (drop_one_stripe(conf)) 1937 int hash;
1819 ; 1938 for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
1939 while (drop_one_stripe(conf, hash))
1940 ;
1820 1941
1821 if (conf->slab_cache) 1942 if (conf->slab_cache)
1822 kmem_cache_destroy(conf->slab_cache); 1943 kmem_cache_destroy(conf->slab_cache);
@@ -1921,6 +2042,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
1921 mdname(conf->mddev), bdn); 2042 mdname(conf->mddev), bdn);
1922 else 2043 else
1923 retry = 1; 2044 retry = 1;
2045 if (set_bad && test_bit(In_sync, &rdev->flags)
2046 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2047 retry = 1;
1924 if (retry) 2048 if (retry)
1925 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2049 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
1926 set_bit(R5_ReadError, &sh->dev[i].flags); 2050 set_bit(R5_ReadError, &sh->dev[i].flags);
@@ -3900,7 +4024,8 @@ static void raid5_activate_delayed(struct r5conf *conf)
3900 } 4024 }
3901} 4025}
3902 4026
3903static void activate_bit_delay(struct r5conf *conf) 4027static void activate_bit_delay(struct r5conf *conf,
4028 struct list_head *temp_inactive_list)
3904{ 4029{
3905 /* device_lock is held */ 4030 /* device_lock is held */
3906 struct list_head head; 4031 struct list_head head;
@@ -3908,9 +4033,11 @@ static void activate_bit_delay(struct r5conf *conf)
3908 list_del_init(&conf->bitmap_list); 4033 list_del_init(&conf->bitmap_list);
3909 while (!list_empty(&head)) { 4034 while (!list_empty(&head)) {
3910 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 4035 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
4036 int hash;
3911 list_del_init(&sh->lru); 4037 list_del_init(&sh->lru);
3912 atomic_inc(&sh->count); 4038 atomic_inc(&sh->count);
3913 __release_stripe(conf, sh); 4039 hash = sh->hash_lock_index;
4040 __release_stripe(conf, sh, &temp_inactive_list[hash]);
3914 } 4041 }
3915} 4042}
3916 4043
@@ -3926,7 +4053,7 @@ int md_raid5_congested(struct mddev *mddev, int bits)
3926 return 1; 4053 return 1;
3927 if (conf->quiesce) 4054 if (conf->quiesce)
3928 return 1; 4055 return 1;
3929 if (list_empty_careful(&conf->inactive_list)) 4056 if (atomic_read(&conf->empty_inactive_list_nr))
3930 return 1; 4057 return 1;
3931 4058
3932 return 0; 4059 return 0;
@@ -4256,6 +4383,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
4256struct raid5_plug_cb { 4383struct raid5_plug_cb {
4257 struct blk_plug_cb cb; 4384 struct blk_plug_cb cb;
4258 struct list_head list; 4385 struct list_head list;
4386 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
4259}; 4387};
4260 4388
4261static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 4389static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
@@ -4266,6 +4394,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4266 struct mddev *mddev = cb->cb.data; 4394 struct mddev *mddev = cb->cb.data;
4267 struct r5conf *conf = mddev->private; 4395 struct r5conf *conf = mddev->private;
4268 int cnt = 0; 4396 int cnt = 0;
4397 int hash;
4269 4398
4270 if (cb->list.next && !list_empty(&cb->list)) { 4399 if (cb->list.next && !list_empty(&cb->list)) {
4271 spin_lock_irq(&conf->device_lock); 4400 spin_lock_irq(&conf->device_lock);
@@ -4283,11 +4412,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4283 * STRIPE_ON_RELEASE_LIST could be set here. In that 4412 * STRIPE_ON_RELEASE_LIST could be set here. In that
4284 * case, the count is always > 1 here 4413 * case, the count is always > 1 here
4285 */ 4414 */
4286 __release_stripe(conf, sh); 4415 hash = sh->hash_lock_index;
4416 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
4287 cnt++; 4417 cnt++;
4288 } 4418 }
4289 spin_unlock_irq(&conf->device_lock); 4419 spin_unlock_irq(&conf->device_lock);
4290 } 4420 }
4421 release_inactive_stripe_list(conf, cb->temp_inactive_list,
4422 NR_STRIPE_HASH_LOCKS);
4291 if (mddev->queue) 4423 if (mddev->queue)
4292 trace_block_unplug(mddev->queue, cnt, !from_schedule); 4424 trace_block_unplug(mddev->queue, cnt, !from_schedule);
4293 kfree(cb); 4425 kfree(cb);
@@ -4308,8 +4440,12 @@ static void release_stripe_plug(struct mddev *mddev,
4308 4440
4309 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 4441 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
4310 4442
4311 if (cb->list.next == NULL) 4443 if (cb->list.next == NULL) {
4444 int i;
4312 INIT_LIST_HEAD(&cb->list); 4445 INIT_LIST_HEAD(&cb->list);
4446 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
4447 INIT_LIST_HEAD(cb->temp_inactive_list + i);
4448 }
4313 4449
4314 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 4450 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
4315 list_add_tail(&sh->lru, &cb->list); 4451 list_add_tail(&sh->lru, &cb->list);
@@ -4692,14 +4828,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4692 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4828 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4693 /* Cannot proceed until we've updated the superblock... */ 4829 /* Cannot proceed until we've updated the superblock... */
4694 wait_event(conf->wait_for_overlap, 4830 wait_event(conf->wait_for_overlap,
4695 atomic_read(&conf->reshape_stripes)==0); 4831 atomic_read(&conf->reshape_stripes)==0
4832 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4833 if (atomic_read(&conf->reshape_stripes) != 0)
4834 return 0;
4696 mddev->reshape_position = conf->reshape_progress; 4835 mddev->reshape_position = conf->reshape_progress;
4697 mddev->curr_resync_completed = sector_nr; 4836 mddev->curr_resync_completed = sector_nr;
4698 conf->reshape_checkpoint = jiffies; 4837 conf->reshape_checkpoint = jiffies;
4699 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4838 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4700 md_wakeup_thread(mddev->thread); 4839 md_wakeup_thread(mddev->thread);
4701 wait_event(mddev->sb_wait, mddev->flags == 0 || 4840 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4702 kthread_should_stop()); 4841 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4842 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4843 return 0;
4703 spin_lock_irq(&conf->device_lock); 4844 spin_lock_irq(&conf->device_lock);
4704 conf->reshape_safe = mddev->reshape_position; 4845 conf->reshape_safe = mddev->reshape_position;
4705 spin_unlock_irq(&conf->device_lock); 4846 spin_unlock_irq(&conf->device_lock);
@@ -4782,7 +4923,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4782 >= mddev->resync_max - mddev->curr_resync_completed) { 4923 >= mddev->resync_max - mddev->curr_resync_completed) {
4783 /* Cannot proceed until we've updated the superblock... */ 4924 /* Cannot proceed until we've updated the superblock... */
4784 wait_event(conf->wait_for_overlap, 4925 wait_event(conf->wait_for_overlap,
4785 atomic_read(&conf->reshape_stripes) == 0); 4926 atomic_read(&conf->reshape_stripes) == 0
4927 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4928 if (atomic_read(&conf->reshape_stripes) != 0)
4929 goto ret;
4786 mddev->reshape_position = conf->reshape_progress; 4930 mddev->reshape_position = conf->reshape_progress;
4787 mddev->curr_resync_completed = sector_nr; 4931 mddev->curr_resync_completed = sector_nr;
4788 conf->reshape_checkpoint = jiffies; 4932 conf->reshape_checkpoint = jiffies;
@@ -4790,13 +4934,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4790 md_wakeup_thread(mddev->thread); 4934 md_wakeup_thread(mddev->thread);
4791 wait_event(mddev->sb_wait, 4935 wait_event(mddev->sb_wait,
4792 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4936 !test_bit(MD_CHANGE_DEVS, &mddev->flags)
4793 || kthread_should_stop()); 4937 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4938 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4939 goto ret;
4794 spin_lock_irq(&conf->device_lock); 4940 spin_lock_irq(&conf->device_lock);
4795 conf->reshape_safe = mddev->reshape_position; 4941 conf->reshape_safe = mddev->reshape_position;
4796 spin_unlock_irq(&conf->device_lock); 4942 spin_unlock_irq(&conf->device_lock);
4797 wake_up(&conf->wait_for_overlap); 4943 wake_up(&conf->wait_for_overlap);
4798 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4944 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4799 } 4945 }
4946ret:
4800 return reshape_sectors; 4947 return reshape_sectors;
4801} 4948}
4802 4949
@@ -4954,27 +5101,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4954} 5101}
4955 5102
4956static int handle_active_stripes(struct r5conf *conf, int group, 5103static int handle_active_stripes(struct r5conf *conf, int group,
4957 struct r5worker *worker) 5104 struct r5worker *worker,
5105 struct list_head *temp_inactive_list)
4958{ 5106{
4959 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 5107 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
4960 int i, batch_size = 0; 5108 int i, batch_size = 0, hash;
5109 bool release_inactive = false;
4961 5110
4962 while (batch_size < MAX_STRIPE_BATCH && 5111 while (batch_size < MAX_STRIPE_BATCH &&
4963 (sh = __get_priority_stripe(conf, group)) != NULL) 5112 (sh = __get_priority_stripe(conf, group)) != NULL)
4964 batch[batch_size++] = sh; 5113 batch[batch_size++] = sh;
4965 5114
4966 if (batch_size == 0) 5115 if (batch_size == 0) {
4967 return batch_size; 5116 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5117 if (!list_empty(temp_inactive_list + i))
5118 break;
5119 if (i == NR_STRIPE_HASH_LOCKS)
5120 return batch_size;
5121 release_inactive = true;
5122 }
4968 spin_unlock_irq(&conf->device_lock); 5123 spin_unlock_irq(&conf->device_lock);
4969 5124
5125 release_inactive_stripe_list(conf, temp_inactive_list,
5126 NR_STRIPE_HASH_LOCKS);
5127
5128 if (release_inactive) {
5129 spin_lock_irq(&conf->device_lock);
5130 return 0;
5131 }
5132
4970 for (i = 0; i < batch_size; i++) 5133 for (i = 0; i < batch_size; i++)
4971 handle_stripe(batch[i]); 5134 handle_stripe(batch[i]);
4972 5135
4973 cond_resched(); 5136 cond_resched();
4974 5137
4975 spin_lock_irq(&conf->device_lock); 5138 spin_lock_irq(&conf->device_lock);
4976 for (i = 0; i < batch_size; i++) 5139 for (i = 0; i < batch_size; i++) {
4977 __release_stripe(conf, batch[i]); 5140 hash = batch[i]->hash_lock_index;
5141 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
5142 }
4978 return batch_size; 5143 return batch_size;
4979} 5144}
4980 5145
@@ -4995,9 +5160,10 @@ static void raid5_do_work(struct work_struct *work)
4995 while (1) { 5160 while (1) {
4996 int batch_size, released; 5161 int batch_size, released;
4997 5162
4998 released = release_stripe_list(conf); 5163 released = release_stripe_list(conf, worker->temp_inactive_list);
4999 5164
5000 batch_size = handle_active_stripes(conf, group_id, worker); 5165 batch_size = handle_active_stripes(conf, group_id, worker,
5166 worker->temp_inactive_list);
5001 worker->working = false; 5167 worker->working = false;
5002 if (!batch_size && !released) 5168 if (!batch_size && !released)
5003 break; 5169 break;
@@ -5036,7 +5202,7 @@ static void raid5d(struct md_thread *thread)
5036 struct bio *bio; 5202 struct bio *bio;
5037 int batch_size, released; 5203 int batch_size, released;
5038 5204
5039 released = release_stripe_list(conf); 5205 released = release_stripe_list(conf, conf->temp_inactive_list);
5040 5206
5041 if ( 5207 if (
5042 !list_empty(&conf->bitmap_list)) { 5208 !list_empty(&conf->bitmap_list)) {
@@ -5046,7 +5212,7 @@ static void raid5d(struct md_thread *thread)
5046 bitmap_unplug(mddev->bitmap); 5212 bitmap_unplug(mddev->bitmap);
5047 spin_lock_irq(&conf->device_lock); 5213 spin_lock_irq(&conf->device_lock);
5048 conf->seq_write = conf->seq_flush; 5214 conf->seq_write = conf->seq_flush;
5049 activate_bit_delay(conf); 5215 activate_bit_delay(conf, conf->temp_inactive_list);
5050 } 5216 }
5051 raid5_activate_delayed(conf); 5217 raid5_activate_delayed(conf);
5052 5218
@@ -5060,7 +5226,8 @@ static void raid5d(struct md_thread *thread)
5060 handled++; 5226 handled++;
5061 } 5227 }
5062 5228
5063 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); 5229 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
5230 conf->temp_inactive_list);
5064 if (!batch_size && !released) 5231 if (!batch_size && !released)
5065 break; 5232 break;
5066 handled += batch_size; 5233 handled += batch_size;
@@ -5096,22 +5263,29 @@ raid5_set_cache_size(struct mddev *mddev, int size)
5096{ 5263{
5097 struct r5conf *conf = mddev->private; 5264 struct r5conf *conf = mddev->private;
5098 int err; 5265 int err;
5266 int hash;
5099 5267
5100 if (size <= 16 || size > 32768) 5268 if (size <= 16 || size > 32768)
5101 return -EINVAL; 5269 return -EINVAL;
5270 hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
5102 while (size < conf->max_nr_stripes) { 5271 while (size < conf->max_nr_stripes) {
5103 if (drop_one_stripe(conf)) 5272 if (drop_one_stripe(conf, hash))
5104 conf->max_nr_stripes--; 5273 conf->max_nr_stripes--;
5105 else 5274 else
5106 break; 5275 break;
5276 hash--;
5277 if (hash < 0)
5278 hash = NR_STRIPE_HASH_LOCKS - 1;
5107 } 5279 }
5108 err = md_allow_write(mddev); 5280 err = md_allow_write(mddev);
5109 if (err) 5281 if (err)
5110 return err; 5282 return err;
5283 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
5111 while (size > conf->max_nr_stripes) { 5284 while (size > conf->max_nr_stripes) {
5112 if (grow_one_stripe(conf)) 5285 if (grow_one_stripe(conf, hash))
5113 conf->max_nr_stripes++; 5286 conf->max_nr_stripes++;
5114 else break; 5287 else break;
5288 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
5115 } 5289 }
5116 return 0; 5290 return 0;
5117} 5291}
@@ -5199,15 +5373,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
5199 return 0; 5373 return 0;
5200} 5374}
5201 5375
5202static int alloc_thread_groups(struct r5conf *conf, int cnt); 5376static int alloc_thread_groups(struct r5conf *conf, int cnt,
5377 int *group_cnt,
5378 int *worker_cnt_per_group,
5379 struct r5worker_group **worker_groups);
5203static ssize_t 5380static ssize_t
5204raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 5381raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
5205{ 5382{
5206 struct r5conf *conf = mddev->private; 5383 struct r5conf *conf = mddev->private;
5207 unsigned long new; 5384 unsigned long new;
5208 int err; 5385 int err;
5209 struct r5worker_group *old_groups; 5386 struct r5worker_group *new_groups, *old_groups;
5210 int old_group_cnt; 5387 int group_cnt, worker_cnt_per_group;
5211 5388
5212 if (len >= PAGE_SIZE) 5389 if (len >= PAGE_SIZE)
5213 return -EINVAL; 5390 return -EINVAL;
@@ -5223,14 +5400,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
5223 mddev_suspend(mddev); 5400 mddev_suspend(mddev);
5224 5401
5225 old_groups = conf->worker_groups; 5402 old_groups = conf->worker_groups;
5226 old_group_cnt = conf->worker_cnt_per_group; 5403 if (old_groups)
5404 flush_workqueue(raid5_wq);
5405
5406 err = alloc_thread_groups(conf, new,
5407 &group_cnt, &worker_cnt_per_group,
5408 &new_groups);
5409 if (!err) {
5410 spin_lock_irq(&conf->device_lock);
5411 conf->group_cnt = group_cnt;
5412 conf->worker_cnt_per_group = worker_cnt_per_group;
5413 conf->worker_groups = new_groups;
5414 spin_unlock_irq(&conf->device_lock);
5227 5415
5228 conf->worker_groups = NULL;
5229 err = alloc_thread_groups(conf, new);
5230 if (err) {
5231 conf->worker_groups = old_groups;
5232 conf->worker_cnt_per_group = old_group_cnt;
5233 } else {
5234 if (old_groups) 5416 if (old_groups)
5235 kfree(old_groups[0].workers); 5417 kfree(old_groups[0].workers);
5236 kfree(old_groups); 5418 kfree(old_groups);
@@ -5260,40 +5442,47 @@ static struct attribute_group raid5_attrs_group = {
5260 .attrs = raid5_attrs, 5442 .attrs = raid5_attrs,
5261}; 5443};
5262 5444
5263static int alloc_thread_groups(struct r5conf *conf, int cnt) 5445static int alloc_thread_groups(struct r5conf *conf, int cnt,
5446 int *group_cnt,
5447 int *worker_cnt_per_group,
5448 struct r5worker_group **worker_groups)
5264{ 5449{
5265 int i, j; 5450 int i, j, k;
5266 ssize_t size; 5451 ssize_t size;
5267 struct r5worker *workers; 5452 struct r5worker *workers;
5268 5453
5269 conf->worker_cnt_per_group = cnt; 5454 *worker_cnt_per_group = cnt;
5270 if (cnt == 0) { 5455 if (cnt == 0) {
5271 conf->worker_groups = NULL; 5456 *group_cnt = 0;
5457 *worker_groups = NULL;
5272 return 0; 5458 return 0;
5273 } 5459 }
5274 conf->group_cnt = num_possible_nodes(); 5460 *group_cnt = num_possible_nodes();
5275 size = sizeof(struct r5worker) * cnt; 5461 size = sizeof(struct r5worker) * cnt;
5276 workers = kzalloc(size * conf->group_cnt, GFP_NOIO); 5462 workers = kzalloc(size * *group_cnt, GFP_NOIO);
5277 conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * 5463 *worker_groups = kzalloc(sizeof(struct r5worker_group) *
5278 conf->group_cnt, GFP_NOIO); 5464 *group_cnt, GFP_NOIO);
5279 if (!conf->worker_groups || !workers) { 5465 if (!*worker_groups || !workers) {
5280 kfree(workers); 5466 kfree(workers);
5281 kfree(conf->worker_groups); 5467 kfree(*worker_groups);
5282 conf->worker_groups = NULL;
5283 return -ENOMEM; 5468 return -ENOMEM;
5284 } 5469 }
5285 5470
5286 for (i = 0; i < conf->group_cnt; i++) { 5471 for (i = 0; i < *group_cnt; i++) {
5287 struct r5worker_group *group; 5472 struct r5worker_group *group;
5288 5473
5289 group = &conf->worker_groups[i]; 5474 group = &(*worker_groups)[i];
5290 INIT_LIST_HEAD(&group->handle_list); 5475 INIT_LIST_HEAD(&group->handle_list);
5291 group->conf = conf; 5476 group->conf = conf;
5292 group->workers = workers + i * cnt; 5477 group->workers = workers + i * cnt;
5293 5478
5294 for (j = 0; j < cnt; j++) { 5479 for (j = 0; j < cnt; j++) {
5295 group->workers[j].group = group; 5480 struct r5worker *worker = group->workers + j;
5296 INIT_WORK(&group->workers[j].work, raid5_do_work); 5481 worker->group = group;
5482 INIT_WORK(&worker->work, raid5_do_work);
5483
5484 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
5485 INIT_LIST_HEAD(worker->temp_inactive_list + k);
5297 } 5486 }
5298 } 5487 }
5299 5488
@@ -5444,6 +5633,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5444 struct md_rdev *rdev; 5633 struct md_rdev *rdev;
5445 struct disk_info *disk; 5634 struct disk_info *disk;
5446 char pers_name[6]; 5635 char pers_name[6];
5636 int i;
5637 int group_cnt, worker_cnt_per_group;
5638 struct r5worker_group *new_group;
5447 5639
5448 if (mddev->new_level != 5 5640 if (mddev->new_level != 5
5449 && mddev->new_level != 4 5641 && mddev->new_level != 4
@@ -5478,7 +5670,12 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5478 if (conf == NULL) 5670 if (conf == NULL)
5479 goto abort; 5671 goto abort;
5480 /* Don't enable multi-threading by default*/ 5672 /* Don't enable multi-threading by default*/
5481 if (alloc_thread_groups(conf, 0)) 5673 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
5674 &new_group)) {
5675 conf->group_cnt = group_cnt;
5676 conf->worker_cnt_per_group = worker_cnt_per_group;
5677 conf->worker_groups = new_group;
5678 } else
5482 goto abort; 5679 goto abort;
5483 spin_lock_init(&conf->device_lock); 5680 spin_lock_init(&conf->device_lock);
5484 seqcount_init(&conf->gen_lock); 5681 seqcount_init(&conf->gen_lock);
@@ -5488,7 +5685,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5488 INIT_LIST_HEAD(&conf->hold_list); 5685 INIT_LIST_HEAD(&conf->hold_list);
5489 INIT_LIST_HEAD(&conf->delayed_list); 5686 INIT_LIST_HEAD(&conf->delayed_list);
5490 INIT_LIST_HEAD(&conf->bitmap_list); 5687 INIT_LIST_HEAD(&conf->bitmap_list);
5491 INIT_LIST_HEAD(&conf->inactive_list);
5492 init_llist_head(&conf->released_stripes); 5688 init_llist_head(&conf->released_stripes);
5493 atomic_set(&conf->active_stripes, 0); 5689 atomic_set(&conf->active_stripes, 0);
5494 atomic_set(&conf->preread_active_stripes, 0); 5690 atomic_set(&conf->preread_active_stripes, 0);
@@ -5514,6 +5710,21 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5514 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 5710 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
5515 goto abort; 5711 goto abort;
5516 5712
5713 /* We init hash_locks[0] separately so that it can be used
5714 * as the reference lock in the spin_lock_nest_lock() call
5715 * in lock_all_device_hash_locks_irq in order to convince
5716 * lockdep that we know what we are doing.
5717 */
5718 spin_lock_init(conf->hash_locks);
5719 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
5720 spin_lock_init(conf->hash_locks + i);
5721
5722 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5723 INIT_LIST_HEAD(conf->inactive_list + i);
5724
5725 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5726 INIT_LIST_HEAD(conf->temp_inactive_list + i);
5727
5517 conf->level = mddev->new_level; 5728 conf->level = mddev->new_level;
5518 if (raid5_alloc_percpu(conf) != 0) 5729 if (raid5_alloc_percpu(conf) != 0)
5519 goto abort; 5730 goto abort;
@@ -5554,7 +5765,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5554 else 5765 else
5555 conf->max_degraded = 1; 5766 conf->max_degraded = 1;
5556 conf->algorithm = mddev->new_layout; 5767 conf->algorithm = mddev->new_layout;
5557 conf->max_nr_stripes = NR_STRIPES;
5558 conf->reshape_progress = mddev->reshape_position; 5768 conf->reshape_progress = mddev->reshape_position;
5559 if (conf->reshape_progress != MaxSector) { 5769 if (conf->reshape_progress != MaxSector) {
5560 conf->prev_chunk_sectors = mddev->chunk_sectors; 5770 conf->prev_chunk_sectors = mddev->chunk_sectors;
@@ -5563,7 +5773,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5563 5773
5564 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 5774 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
5565 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 5775 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
5566 if (grow_stripes(conf, conf->max_nr_stripes)) { 5776 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
5777 if (grow_stripes(conf, NR_STRIPES)) {
5567 printk(KERN_ERR 5778 printk(KERN_ERR
5568 "md/raid:%s: couldn't allocate %dkB for buffers\n", 5779 "md/raid:%s: couldn't allocate %dkB for buffers\n",
5569 mdname(mddev), memory); 5780 mdname(mddev), memory);
@@ -6369,12 +6580,18 @@ static int raid5_start_reshape(struct mddev *mddev)
6369 if (!mddev->sync_thread) { 6580 if (!mddev->sync_thread) {
6370 mddev->recovery = 0; 6581 mddev->recovery = 0;
6371 spin_lock_irq(&conf->device_lock); 6582 spin_lock_irq(&conf->device_lock);
6583 write_seqcount_begin(&conf->gen_lock);
6372 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 6584 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
6585 mddev->new_chunk_sectors =
6586 conf->chunk_sectors = conf->prev_chunk_sectors;
6587 mddev->new_layout = conf->algorithm = conf->prev_algo;
6373 rdev_for_each(rdev, mddev) 6588 rdev_for_each(rdev, mddev)
6374 rdev->new_data_offset = rdev->data_offset; 6589 rdev->new_data_offset = rdev->data_offset;
6375 smp_wmb(); 6590 smp_wmb();
6591 conf->generation--;
6376 conf->reshape_progress = MaxSector; 6592 conf->reshape_progress = MaxSector;
6377 mddev->reshape_position = MaxSector; 6593 mddev->reshape_position = MaxSector;
6594 write_seqcount_end(&conf->gen_lock);
6378 spin_unlock_irq(&conf->device_lock); 6595 spin_unlock_irq(&conf->device_lock);
6379 return -EAGAIN; 6596 return -EAGAIN;
6380 } 6597 }
@@ -6462,27 +6679,28 @@ static void raid5_quiesce(struct mddev *mddev, int state)
6462 break; 6679 break;
6463 6680
6464 case 1: /* stop all writes */ 6681 case 1: /* stop all writes */
6465 spin_lock_irq(&conf->device_lock); 6682 lock_all_device_hash_locks_irq(conf);
6466 /* '2' tells resync/reshape to pause so that all 6683 /* '2' tells resync/reshape to pause so that all
6467 * active stripes can drain 6684 * active stripes can drain
6468 */ 6685 */
6469 conf->quiesce = 2; 6686 conf->quiesce = 2;
6470 wait_event_lock_irq(conf->wait_for_stripe, 6687 wait_event_cmd(conf->wait_for_stripe,
6471 atomic_read(&conf->active_stripes) == 0 && 6688 atomic_read(&conf->active_stripes) == 0 &&
6472 atomic_read(&conf->active_aligned_reads) == 0, 6689 atomic_read(&conf->active_aligned_reads) == 0,
6473 conf->device_lock); 6690 unlock_all_device_hash_locks_irq(conf),
6691 lock_all_device_hash_locks_irq(conf));
6474 conf->quiesce = 1; 6692 conf->quiesce = 1;
6475 spin_unlock_irq(&conf->device_lock); 6693 unlock_all_device_hash_locks_irq(conf);
6476 /* allow reshape to continue */ 6694 /* allow reshape to continue */
6477 wake_up(&conf->wait_for_overlap); 6695 wake_up(&conf->wait_for_overlap);
6478 break; 6696 break;
6479 6697
6480 case 0: /* re-enable writes */ 6698 case 0: /* re-enable writes */
6481 spin_lock_irq(&conf->device_lock); 6699 lock_all_device_hash_locks_irq(conf);
6482 conf->quiesce = 0; 6700 conf->quiesce = 0;
6483 wake_up(&conf->wait_for_stripe); 6701 wake_up(&conf->wait_for_stripe);
6484 wake_up(&conf->wait_for_overlap); 6702 wake_up(&conf->wait_for_overlap);
6485 spin_unlock_irq(&conf->device_lock); 6703 unlock_all_device_hash_locks_irq(conf);
6486 break; 6704 break;
6487 } 6705 }
6488} 6706}
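
The raid5.c hunks above replace the single device_lock/inactive_list pair with NR_STRIPE_HASH_LOCKS per-bucket locks and per-bucket inactive lists, and quiesce paths now take every bucket lock at once. Below is a minimal sketch of that take-them-all pattern, assuming the nest-lock scheme described in the setup_conf() comment; the example_* names are illustrative, not the helpers used in raid5.c.

/* Hedged approximation of the bucketed-lock pattern: hash_locks[0] acts
 * as the reference lock and the rest are nested under it, so lockdep
 * sees one locking class rather than eight independent ones.
 */
#include <linux/spinlock.h>

#define EXAMPLE_NR_HASH_LOCKS 8

struct example_conf {
	spinlock_t hash_locks[EXAMPLE_NR_HASH_LOCKS];
};

static void example_lock_all_hash_locks_irq(struct example_conf *conf)
{
	int i;

	local_irq_disable();
	spin_lock(conf->hash_locks);			/* reference lock */
	for (i = 1; i < EXAMPLE_NR_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
}

static void example_unlock_all_hash_locks_irq(struct example_conf *conf)
{
	int i;

	for (i = EXAMPLE_NR_HASH_LOCKS - 1; i >= 0; i--)
		spin_unlock(conf->hash_locks + i);
	local_irq_enable();
}

Single-stripe fast paths only take the one bucket lock selected by the stripe's hash_lock_index, which is what reduces contention on device_lock.
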
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index b42e6b462eda..01ad8ae8f578 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -205,6 +205,7 @@ struct stripe_head {
205 short pd_idx; /* parity disk index */ 205 short pd_idx; /* parity disk index */
206 short qd_idx; /* 'Q' disk index for raid6 */ 206 short qd_idx; /* 'Q' disk index for raid6 */
207 short ddf_layout;/* use DDF ordering to calculate Q */ 207 short ddf_layout;/* use DDF ordering to calculate Q */
208 short hash_lock_index;
208 unsigned long state; /* state flags */ 209 unsigned long state; /* state flags */
209 atomic_t count; /* nr of active thread/requests */ 210 atomic_t count; /* nr of active thread/requests */
210 int bm_seq; /* sequence number for bitmap flushes */ 211 int bm_seq; /* sequence number for bitmap flushes */
@@ -367,9 +368,18 @@ struct disk_info {
367 struct md_rdev *rdev, *replacement; 368 struct md_rdev *rdev, *replacement;
368}; 369};
369 370
371/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
372 * This is because we sometimes take all the spinlocks
373 * and creating that much locking depth can cause
374 * problems.
375 */
376#define NR_STRIPE_HASH_LOCKS 8
377#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
378
370struct r5worker { 379struct r5worker {
371 struct work_struct work; 380 struct work_struct work;
372 struct r5worker_group *group; 381 struct r5worker_group *group;
382 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
373 bool working; 383 bool working;
374}; 384};
375 385
@@ -382,6 +392,8 @@ struct r5worker_group {
382 392
383struct r5conf { 393struct r5conf {
384 struct hlist_head *stripe_hashtbl; 394 struct hlist_head *stripe_hashtbl;
395 /* only protect corresponding hash list and inactive_list */
396 spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS];
385 struct mddev *mddev; 397 struct mddev *mddev;
386 int chunk_sectors; 398 int chunk_sectors;
387 int level, algorithm; 399 int level, algorithm;
@@ -462,7 +474,8 @@ struct r5conf {
462 * Free stripes pool 474 * Free stripes pool
463 */ 475 */
464 atomic_t active_stripes; 476 atomic_t active_stripes;
465 struct list_head inactive_list; 477 struct list_head inactive_list[NR_STRIPE_HASH_LOCKS];
478 atomic_t empty_inactive_list_nr;
466 struct llist_head released_stripes; 479 struct llist_head released_stripes;
467 wait_queue_head_t wait_for_stripe; 480 wait_queue_head_t wait_for_stripe;
468 wait_queue_head_t wait_for_overlap; 481 wait_queue_head_t wait_for_overlap;
@@ -477,6 +490,7 @@ struct r5conf {
477 * the new thread here until we fully activate the array. 490 * the new thread here until we fully activate the array.
478 */ 491 */
479 struct md_thread *thread; 492 struct md_thread *thread;
493 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
480 struct r5worker_group *worker_groups; 494 struct r5worker_group *worker_groups;
481 int group_cnt; 495 int group_cnt;
482 int worker_cnt_per_group; 496 int worker_cnt_per_group;
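
The new hash_lock_index field and the STRIPE_HASH_LOCKS_MASK define imply a power-of-two fold from a stripe's sector-based hash down to one of the eight lock buckets. A minimal sketch of that mapping follows; the shift and the example_ name are assumptions for illustration, not the exact helper in raid5.c.

#include <linux/types.h>

#define EXAMPLE_NR_HASH_LOCKS 8
#define EXAMPLE_HASH_LOCKS_MASK (EXAMPLE_NR_HASH_LOCKS - 1)

/* Fold a stripe's starting sector into a lock bucket.  Masking works as
 * a modulus only because the bucket count here is a power of two.
 */
static inline int example_stripe_hash_locks_hash(sector_t sect)
{
	return (int)(sect >> 3) & EXAMPLE_HASH_LOCKS_MASK;	/* shift is illustrative */
}

The resulting bucket number is what stripe_head->hash_lock_index records, so the release paths above know which hash_locks[], inactive_list[], and temp_inactive_list[] entry a stripe belongs to.
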
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 61939ba30aa0..eaa00b10abaa 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -278,6 +278,31 @@ do { \
278 __ret; \ 278 __ret; \
279}) 279})
280 280
281#define __wait_event_cmd(wq, condition, cmd1, cmd2) \
282 (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
283 cmd1; schedule(); cmd2)
284
285/**
286 * wait_event_cmd - sleep until a condition gets true
287 * @wq: the waitqueue to wait on
288 * @condition: a C expression for the event to wait for
289 * @cmd1: the command to be executed before sleep
290 * @cmd2: the command to be executed after sleep
291 *
292 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
293 * @condition evaluates to true. The @condition is checked each time
294 * the waitqueue @wq is woken up.
295 *
296 * wake_up() has to be called after changing any variable that could
297 * change the result of the wait condition.
298 */
299#define wait_event_cmd(wq, condition, cmd1, cmd2) \
300do { \
301 if (condition) \
302 break; \
303 __wait_event_cmd(wq, condition, cmd1, cmd2); \
304} while (0)
305
281#define __wait_event_interruptible(wq, condition) \ 306#define __wait_event_interruptible(wq, condition) \
282 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ 307 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
283 schedule()) 308 schedule())
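
raid5_quiesce() above is the motivating user of the new wait_event_cmd(): cmd1 runs just before each schedule() and cmd2 just after, letting the caller drop and retake a set of locks around the sleep instead of relying on wait_event_lock_irq() with a single lock. A stand-alone sketch of that shape, with illustrative example_ names:

#include <linux/wait.h>
#include <linux/spinlock.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static DEFINE_SPINLOCK(example_lock);
static int example_pending;		/* protected by example_lock */

/* Sleep until example_pending drains to zero, releasing example_lock
 * around the sleep the same way raid5_quiesce() releases the device
 * hash locks around its wait.
 */
static void example_drain(void)
{
	spin_lock_irq(&example_lock);
	wait_event_cmd(example_wq, example_pending == 0,
		       spin_unlock_irq(&example_lock),	/* cmd1: before sleep */
		       spin_lock_irq(&example_lock));	/* cmd2: after sleep */
	spin_unlock_irq(&example_lock);
}

Whoever decrements example_pending must call wake_up(&example_wq) after the change, as the kernel-doc above requires.
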
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index fe1a5406d4d9..f7cf7f351144 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -16,6 +16,7 @@
16#define _MD_P_H 16#define _MD_P_H
17 17
18#include <linux/types.h> 18#include <linux/types.h>
19#include <asm/byteorder.h>
19 20
20/* 21/*
21 * RAID superblock. 22 * RAID superblock.