author	Linus Torvalds <torvalds@linux-foundation.org>	2013-11-20 16:05:25 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-11-20 16:05:25 -0500
commit	6d6e352c80f22c446d933ca8103e02bac1f09129 (patch)
tree	248a6a7ebc5ea95986da5bccdd6d75b255cf28e4 /drivers/md/raid5.c
parent	b4789b8e6be3151a955ade74872822f30e8cd914 (diff)
parent	60aaf933854511630e16be4efe0f96485e132de4 (diff)
Merge tag 'md/3.13' of git://neil.brown.name/md
Pull md update from Neil Brown:
 "Mostly optimisations and obscure bug fixes.
   - raid5 gets less lock contention
   - raid1 gets less contention between normal-io and resync-io during
     resync"

* tag 'md/3.13' of git://neil.brown.name/md:
  md/raid5: Use conf->device_lock protect changing of multi-thread resources.
  md/raid5: Before freeing old multi-thread worker, it should flush them.
  md/raid5: For stripe with R5_ReadNoMerge, we replace REQ_FLUSH with REQ_NOMERGE.
  UAPI: include <asm/byteorder.h> in linux/raid/md_p.h
  raid1: Rewrite the implementation of iobarrier.
  raid1: Add some macros to make code clearly.
  raid1: Replace raise_barrier/lower_barrier with freeze_array/unfreeze_array when reconfiguring the array.
  raid1: Add a field array_frozen to indicate whether raid in freeze state.
  md: Convert use of typedef ctl_table to struct ctl_table
  md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes.
  md: use MD_RECOVERY_INTR instead of kthread_should_stop in resync thread.
  md: fix some places where mddev_lock return value is not checked.
  raid5: Retry R5_ReadNoMerge flag when hit a read error.
  raid5: relieve lock contention in get_active_stripe()
  raid5: relieve lock contention in get_active_stripe()
  wait: add wait_event_cmd()
  md/raid5.c: add proper locking to error path of raid5_start_reshape.
  md: fix calculation of stacking limits on level change.
  raid5: Use slow_path to release stripe when mddev->thread is null
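The raid5 side of the contention relief splits the single inactive list, previously protected by device_lock alone, into NR_STRIPE_HASH_LOCKS buckets with one spinlock each, so get_active_stripe() callers that map to different buckets no longer serialize on a single lock. The short user-space sketch below only illustrates that bucketing idea; the hash_bucket struct, the pthread mutexes, the opaque stripe type and the STRIPE_SHIFT value are stand-ins for this sketch, not the kernel code in the diff that follows.

#include <pthread.h>
#include <stdint.h>

#define NR_STRIPE_HASH_LOCKS	8
#define STRIPE_HASH_LOCKS_MASK	(NR_STRIPE_HASH_LOCKS - 1)
#define STRIPE_SHIFT		3	/* illustrative: 4K stripes over 512B sectors */

struct stripe;					/* opaque in this sketch */

struct hash_bucket {
	pthread_mutex_t	lock;			/* stands in for conf->hash_locks[h] */
	struct stripe	*inactive;		/* stands in for conf->inactive_list + h */
};

static struct hash_bucket buckets[NR_STRIPE_HASH_LOCKS];

static void init_buckets(void)
{
	for (int h = 0; h < NR_STRIPE_HASH_LOCKS; h++) {
		pthread_mutex_init(&buckets[h].lock, NULL);
		buckets[h].inactive = NULL;
	}
}

/* Same sector-to-bucket mapping as stripe_hash_locks_hash() in the diff. */
static inline int stripe_hash(uint64_t sector)
{
	return (int)((sector >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK);
}

/* Work on one bucket: callers that hash to different buckets never contend. */
static void with_bucket_locked(uint64_t sector, void (*fn)(struct hash_bucket *))
{
	struct hash_bucket *b = &buckets[stripe_hash(sector)];

	pthread_mutex_lock(&b->lock);
	fn(b);
	pthread_mutex_unlock(&b->lock);
}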
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--	drivers/md/raid5.c	420
1 file changed, 319 insertions(+), 101 deletions(-)
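A second recurring idea in the hunks below is the temp_inactive_list batching: stripes released while device_lock is held are first collected on private per-hash lists, and release_inactive_stripe_list() later splices each batch into the shared per-hash inactive list, taking that hash lock only once per batch. The kernel-style sketch below is a rough model of that splice step using assumed placeholder state (NR_HASH, hash_locks, inactive), not the actual r5conf layout:

#include <linux/list.h>
#include <linux/spinlock.h>

#define NR_HASH 8			/* stands in for NR_STRIPE_HASH_LOCKS */

/* Placeholder shared state for this sketch, not the raid5 r5conf fields. */
static spinlock_t hash_locks[NR_HASH];
static struct list_head inactive[NR_HASH];

static void init_buckets(void)
{
	int h;

	for (h = 0; h < NR_HASH; h++) {
		spin_lock_init(&hash_locks[h]);
		INIT_LIST_HEAD(&inactive[h]);
	}
}

/*
 * Splice privately collected batches into the shared per-hash lists.
 * The temp[] lists need no lock while being filled because they are
 * private to the caller; each shared list is only touched under its
 * own hash lock, taken once per non-empty batch.
 */
static void release_batches(struct list_head *temp /* array of NR_HASH lists */)
{
	int h;

	for (h = 0; h < NR_HASH; h++) {
		if (list_empty(&temp[h]))
			continue;
		spin_lock_irq(&hash_locks[h]);
		list_splice_tail_init(&temp[h], &inactive[h]);
		spin_unlock_irq(&hash_locks[h]);
	}
}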
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7f0e17a27aeb..47da0af6322b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
85 return &conf->stripe_hashtbl[hash]; 85 return &conf->stripe_hashtbl[hash];
86} 86}
87 87
88static inline int stripe_hash_locks_hash(sector_t sect)
89{
90 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
91}
92
93static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
94{
95 spin_lock_irq(conf->hash_locks + hash);
96 spin_lock(&conf->device_lock);
97}
98
99static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
100{
101 spin_unlock(&conf->device_lock);
102 spin_unlock_irq(conf->hash_locks + hash);
103}
104
105static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
106{
107 int i;
108 local_irq_disable();
109 spin_lock(conf->hash_locks);
110 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
111 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
112 spin_lock(&conf->device_lock);
113}
114
115static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
116{
117 int i;
118 spin_unlock(&conf->device_lock);
119 for (i = NR_STRIPE_HASH_LOCKS; i; i--)
120 spin_unlock(conf->hash_locks + i - 1);
121 local_irq_enable();
122}
123
88/* bio's attached to a stripe+device for I/O are linked together in bi_sector 124/* bio's attached to a stripe+device for I/O are linked together in bi_sector
89 * order without overlap. There may be several bio's per stripe+device, and 125 * order without overlap. There may be several bio's per stripe+device, and
90 * a bio could span several devices. 126 * a bio could span several devices.
@@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
249 } 285 }
250} 286}
251 287
252static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 288static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
289 struct list_head *temp_inactive_list)
253{ 290{
254 BUG_ON(!list_empty(&sh->lru)); 291 BUG_ON(!list_empty(&sh->lru));
255 BUG_ON(atomic_read(&conf->active_stripes)==0); 292 BUG_ON(atomic_read(&conf->active_stripes)==0);
@@ -278,23 +315,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
278 < IO_THRESHOLD) 315 < IO_THRESHOLD)
279 md_wakeup_thread(conf->mddev->thread); 316 md_wakeup_thread(conf->mddev->thread);
280 atomic_dec(&conf->active_stripes); 317 atomic_dec(&conf->active_stripes);
281 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 318 if (!test_bit(STRIPE_EXPANDING, &sh->state))
282 list_add_tail(&sh->lru, &conf->inactive_list); 319 list_add_tail(&sh->lru, temp_inactive_list);
283 wake_up(&conf->wait_for_stripe);
284 if (conf->retry_read_aligned)
285 md_wakeup_thread(conf->mddev->thread);
286 }
287 } 320 }
288} 321}
289 322
290static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 323static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
324 struct list_head *temp_inactive_list)
291{ 325{
292 if (atomic_dec_and_test(&sh->count)) 326 if (atomic_dec_and_test(&sh->count))
293 do_release_stripe(conf, sh); 327 do_release_stripe(conf, sh, temp_inactive_list);
328}
329
330/*
331 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
332 *
333 * Be careful: Only one task can add/delete stripes from temp_inactive_list at
334 * given time. Adding stripes only takes device lock, while deleting stripes
335 * only takes hash lock.
336 */
337static void release_inactive_stripe_list(struct r5conf *conf,
338 struct list_head *temp_inactive_list,
339 int hash)
340{
341 int size;
342 bool do_wakeup = false;
343 unsigned long flags;
344
345 if (hash == NR_STRIPE_HASH_LOCKS) {
346 size = NR_STRIPE_HASH_LOCKS;
347 hash = NR_STRIPE_HASH_LOCKS - 1;
348 } else
349 size = 1;
350 while (size) {
351 struct list_head *list = &temp_inactive_list[size - 1];
352
353 /*
354 * We don't hold any lock here yet, get_active_stripe() might
355 * remove stripes from the list
356 */
357 if (!list_empty_careful(list)) {
358 spin_lock_irqsave(conf->hash_locks + hash, flags);
359 if (list_empty(conf->inactive_list + hash) &&
360 !list_empty(list))
361 atomic_dec(&conf->empty_inactive_list_nr);
362 list_splice_tail_init(list, conf->inactive_list + hash);
363 do_wakeup = true;
364 spin_unlock_irqrestore(conf->hash_locks + hash, flags);
365 }
366 size--;
367 hash--;
368 }
369
370 if (do_wakeup) {
371 wake_up(&conf->wait_for_stripe);
372 if (conf->retry_read_aligned)
373 md_wakeup_thread(conf->mddev->thread);
374 }
294} 375}
295 376
296/* should hold conf->device_lock already */ 377/* should hold conf->device_lock already */
297static int release_stripe_list(struct r5conf *conf) 378static int release_stripe_list(struct r5conf *conf,
379 struct list_head *temp_inactive_list)
298{ 380{
299 struct stripe_head *sh; 381 struct stripe_head *sh;
300 int count = 0; 382 int count = 0;
@@ -303,6 +385,8 @@ static int release_stripe_list(struct r5conf *conf)
303 head = llist_del_all(&conf->released_stripes); 385 head = llist_del_all(&conf->released_stripes);
304 head = llist_reverse_order(head); 386 head = llist_reverse_order(head);
305 while (head) { 387 while (head) {
388 int hash;
389
306 sh = llist_entry(head, struct stripe_head, release_list); 390 sh = llist_entry(head, struct stripe_head, release_list);
307 head = llist_next(head); 391 head = llist_next(head);
308 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ 392 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
@@ -313,7 +397,8 @@ static int release_stripe_list(struct r5conf *conf)
313 * again, the count is always > 1. This is true for 397 * again, the count is always > 1. This is true for
314 * STRIPE_ON_UNPLUG_LIST bit too. 398 * STRIPE_ON_UNPLUG_LIST bit too.
315 */ 399 */
316 __release_stripe(conf, sh); 400 hash = sh->hash_lock_index;
401 __release_stripe(conf, sh, &temp_inactive_list[hash]);
317 count++; 402 count++;
318 } 403 }
319 404
@@ -324,9 +409,12 @@ static void release_stripe(struct stripe_head *sh)
324{ 409{
325 struct r5conf *conf = sh->raid_conf; 410 struct r5conf *conf = sh->raid_conf;
326 unsigned long flags; 411 unsigned long flags;
412 struct list_head list;
413 int hash;
327 bool wakeup; 414 bool wakeup;
328 415
329 if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 416 if (unlikely(!conf->mddev->thread) ||
417 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
330 goto slow_path; 418 goto slow_path;
331 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 419 wakeup = llist_add(&sh->release_list, &conf->released_stripes);
332 if (wakeup) 420 if (wakeup)
@@ -336,8 +424,11 @@ slow_path:
336 local_irq_save(flags); 424 local_irq_save(flags);
337 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 425 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
338 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 426 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
339 do_release_stripe(conf, sh); 427 INIT_LIST_HEAD(&list);
428 hash = sh->hash_lock_index;
429 do_release_stripe(conf, sh, &list);
340 spin_unlock(&conf->device_lock); 430 spin_unlock(&conf->device_lock);
431 release_inactive_stripe_list(conf, &list, hash);
341 } 432 }
342 local_irq_restore(flags); 433 local_irq_restore(flags);
343} 434}
@@ -362,18 +453,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
362 453
363 454
364/* find an idle stripe, make sure it is unhashed, and return it. */ 455/* find an idle stripe, make sure it is unhashed, and return it. */
365static struct stripe_head *get_free_stripe(struct r5conf *conf) 456static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
366{ 457{
367 struct stripe_head *sh = NULL; 458 struct stripe_head *sh = NULL;
368 struct list_head *first; 459 struct list_head *first;
369 460
370 if (list_empty(&conf->inactive_list)) 461 if (list_empty(conf->inactive_list + hash))
371 goto out; 462 goto out;
372 first = conf->inactive_list.next; 463 first = (conf->inactive_list + hash)->next;
373 sh = list_entry(first, struct stripe_head, lru); 464 sh = list_entry(first, struct stripe_head, lru);
374 list_del_init(first); 465 list_del_init(first);
375 remove_hash(sh); 466 remove_hash(sh);
376 atomic_inc(&conf->active_stripes); 467 atomic_inc(&conf->active_stripes);
468 BUG_ON(hash != sh->hash_lock_index);
469 if (list_empty(conf->inactive_list + hash))
470 atomic_inc(&conf->empty_inactive_list_nr);
377out: 471out:
378 return sh; 472 return sh;
379} 473}
@@ -416,7 +510,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
416static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 510static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
417{ 511{
418 struct r5conf *conf = sh->raid_conf; 512 struct r5conf *conf = sh->raid_conf;
419 int i; 513 int i, seq;
420 514
421 BUG_ON(atomic_read(&sh->count) != 0); 515 BUG_ON(atomic_read(&sh->count) != 0);
422 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 516 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
@@ -426,7 +520,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
426 (unsigned long long)sh->sector); 520 (unsigned long long)sh->sector);
427 521
428 remove_hash(sh); 522 remove_hash(sh);
429 523retry:
524 seq = read_seqcount_begin(&conf->gen_lock);
430 sh->generation = conf->generation - previous; 525 sh->generation = conf->generation - previous;
431 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 526 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
432 sh->sector = sector; 527 sh->sector = sector;
@@ -448,6 +543,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
448 dev->flags = 0; 543 dev->flags = 0;
449 raid5_build_block(sh, i, previous); 544 raid5_build_block(sh, i, previous);
450 } 545 }
546 if (read_seqcount_retry(&conf->gen_lock, seq))
547 goto retry;
451 insert_hash(conf, sh); 548 insert_hash(conf, sh);
452 sh->cpu = smp_processor_id(); 549 sh->cpu = smp_processor_id();
453} 550}
@@ -552,29 +649,31 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
552 int previous, int noblock, int noquiesce) 649 int previous, int noblock, int noquiesce)
553{ 650{
554 struct stripe_head *sh; 651 struct stripe_head *sh;
652 int hash = stripe_hash_locks_hash(sector);
555 653
556 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 654 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
557 655
558 spin_lock_irq(&conf->device_lock); 656 spin_lock_irq(conf->hash_locks + hash);
559 657
560 do { 658 do {
561 wait_event_lock_irq(conf->wait_for_stripe, 659 wait_event_lock_irq(conf->wait_for_stripe,
562 conf->quiesce == 0 || noquiesce, 660 conf->quiesce == 0 || noquiesce,
563 conf->device_lock); 661 *(conf->hash_locks + hash));
564 sh = __find_stripe(conf, sector, conf->generation - previous); 662 sh = __find_stripe(conf, sector, conf->generation - previous);
565 if (!sh) { 663 if (!sh) {
566 if (!conf->inactive_blocked) 664 if (!conf->inactive_blocked)
567 sh = get_free_stripe(conf); 665 sh = get_free_stripe(conf, hash);
568 if (noblock && sh == NULL) 666 if (noblock && sh == NULL)
569 break; 667 break;
570 if (!sh) { 668 if (!sh) {
571 conf->inactive_blocked = 1; 669 conf->inactive_blocked = 1;
572 wait_event_lock_irq(conf->wait_for_stripe, 670 wait_event_lock_irq(
573 !list_empty(&conf->inactive_list) && 671 conf->wait_for_stripe,
574 (atomic_read(&conf->active_stripes) 672 !list_empty(conf->inactive_list + hash) &&
575 < (conf->max_nr_stripes *3/4) 673 (atomic_read(&conf->active_stripes)
576 || !conf->inactive_blocked), 674 < (conf->max_nr_stripes * 3 / 4)
577 conf->device_lock); 675 || !conf->inactive_blocked),
676 *(conf->hash_locks + hash));
578 conf->inactive_blocked = 0; 677 conf->inactive_blocked = 0;
579 } else 678 } else
580 init_stripe(sh, sector, previous); 679 init_stripe(sh, sector, previous);
@@ -585,9 +684,11 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
585 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) 684 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
586 && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); 685 && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
587 } else { 686 } else {
687 spin_lock(&conf->device_lock);
588 if (!test_bit(STRIPE_HANDLE, &sh->state)) 688 if (!test_bit(STRIPE_HANDLE, &sh->state))
589 atomic_inc(&conf->active_stripes); 689 atomic_inc(&conf->active_stripes);
590 if (list_empty(&sh->lru) && 690 if (list_empty(&sh->lru) &&
691 !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) &&
591 !test_bit(STRIPE_EXPANDING, &sh->state)) 692 !test_bit(STRIPE_EXPANDING, &sh->state))
592 BUG(); 693 BUG();
593 list_del_init(&sh->lru); 694 list_del_init(&sh->lru);
@@ -595,6 +696,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
595 sh->group->stripes_cnt--; 696 sh->group->stripes_cnt--;
596 sh->group = NULL; 697 sh->group = NULL;
597 } 698 }
699 spin_unlock(&conf->device_lock);
598 } 700 }
599 } 701 }
600 } while (sh == NULL); 702 } while (sh == NULL);
@@ -602,7 +704,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
602 if (sh) 704 if (sh)
603 atomic_inc(&sh->count); 705 atomic_inc(&sh->count);
604 706
605 spin_unlock_irq(&conf->device_lock); 707 spin_unlock_irq(conf->hash_locks + hash);
606 return sh; 708 return sh;
607} 709}
608 710
@@ -758,7 +860,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
758 bi->bi_sector = (sh->sector 860 bi->bi_sector = (sh->sector
759 + rdev->data_offset); 861 + rdev->data_offset);
760 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 862 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
761 bi->bi_rw |= REQ_FLUSH; 863 bi->bi_rw |= REQ_NOMERGE;
762 864
763 bi->bi_vcnt = 1; 865 bi->bi_vcnt = 1;
764 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 866 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -1582,7 +1684,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1582 put_cpu(); 1684 put_cpu();
1583} 1685}
1584 1686
1585static int grow_one_stripe(struct r5conf *conf) 1687static int grow_one_stripe(struct r5conf *conf, int hash)
1586{ 1688{
1587 struct stripe_head *sh; 1689 struct stripe_head *sh;
1588 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1690 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
@@ -1598,6 +1700,7 @@ static int grow_one_stripe(struct r5conf *conf)
1598 kmem_cache_free(conf->slab_cache, sh); 1700 kmem_cache_free(conf->slab_cache, sh);
1599 return 0; 1701 return 0;
1600 } 1702 }
1703 sh->hash_lock_index = hash;
1601 /* we just created an active stripe so... */ 1704 /* we just created an active stripe so... */
1602 atomic_set(&sh->count, 1); 1705 atomic_set(&sh->count, 1);
1603 atomic_inc(&conf->active_stripes); 1706 atomic_inc(&conf->active_stripes);
@@ -1610,6 +1713,7 @@ static int grow_stripes(struct r5conf *conf, int num)
1610{ 1713{
1611 struct kmem_cache *sc; 1714 struct kmem_cache *sc;
1612 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1715 int devs = max(conf->raid_disks, conf->previous_raid_disks);
1716 int hash;
1613 1717
1614 if (conf->mddev->gendisk) 1718 if (conf->mddev->gendisk)
1615 sprintf(conf->cache_name[0], 1719 sprintf(conf->cache_name[0],
@@ -1627,9 +1731,13 @@ static int grow_stripes(struct r5conf *conf, int num)
1627 return 1; 1731 return 1;
1628 conf->slab_cache = sc; 1732 conf->slab_cache = sc;
1629 conf->pool_size = devs; 1733 conf->pool_size = devs;
1630 while (num--) 1734 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
1631 if (!grow_one_stripe(conf)) 1735 while (num--) {
1736 if (!grow_one_stripe(conf, hash))
1632 return 1; 1737 return 1;
1738 conf->max_nr_stripes++;
1739 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
1740 }
1633 return 0; 1741 return 0;
1634} 1742}
1635 1743
@@ -1687,6 +1795,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1687 int err; 1795 int err;
1688 struct kmem_cache *sc; 1796 struct kmem_cache *sc;
1689 int i; 1797 int i;
1798 int hash, cnt;
1690 1799
1691 if (newsize <= conf->pool_size) 1800 if (newsize <= conf->pool_size)
1692 return 0; /* never bother to shrink */ 1801 return 0; /* never bother to shrink */
@@ -1726,19 +1835,29 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1726 * OK, we have enough stripes, start collecting inactive 1835 * OK, we have enough stripes, start collecting inactive
1727 * stripes and copying them over 1836 * stripes and copying them over
1728 */ 1837 */
1838 hash = 0;
1839 cnt = 0;
1729 list_for_each_entry(nsh, &newstripes, lru) { 1840 list_for_each_entry(nsh, &newstripes, lru) {
1730 spin_lock_irq(&conf->device_lock); 1841 lock_device_hash_lock(conf, hash);
1731 wait_event_lock_irq(conf->wait_for_stripe, 1842 wait_event_cmd(conf->wait_for_stripe,
1732 !list_empty(&conf->inactive_list), 1843 !list_empty(conf->inactive_list + hash),
1733 conf->device_lock); 1844 unlock_device_hash_lock(conf, hash),
1734 osh = get_free_stripe(conf); 1845 lock_device_hash_lock(conf, hash));
1735 spin_unlock_irq(&conf->device_lock); 1846 osh = get_free_stripe(conf, hash);
1847 unlock_device_hash_lock(conf, hash);
1736 atomic_set(&nsh->count, 1); 1848 atomic_set(&nsh->count, 1);
1737 for(i=0; i<conf->pool_size; i++) 1849 for(i=0; i<conf->pool_size; i++)
1738 nsh->dev[i].page = osh->dev[i].page; 1850 nsh->dev[i].page = osh->dev[i].page;
1739 for( ; i<newsize; i++) 1851 for( ; i<newsize; i++)
1740 nsh->dev[i].page = NULL; 1852 nsh->dev[i].page = NULL;
1853 nsh->hash_lock_index = hash;
1741 kmem_cache_free(conf->slab_cache, osh); 1854 kmem_cache_free(conf->slab_cache, osh);
1855 cnt++;
1856 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
1857 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
1858 hash++;
1859 cnt = 0;
1860 }
1742 } 1861 }
1743 kmem_cache_destroy(conf->slab_cache); 1862 kmem_cache_destroy(conf->slab_cache);
1744 1863
@@ -1797,13 +1916,13 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1797 return err; 1916 return err;
1798} 1917}
1799 1918
1800static int drop_one_stripe(struct r5conf *conf) 1919static int drop_one_stripe(struct r5conf *conf, int hash)
1801{ 1920{
1802 struct stripe_head *sh; 1921 struct stripe_head *sh;
1803 1922
1804 spin_lock_irq(&conf->device_lock); 1923 spin_lock_irq(conf->hash_locks + hash);
1805 sh = get_free_stripe(conf); 1924 sh = get_free_stripe(conf, hash);
1806 spin_unlock_irq(&conf->device_lock); 1925 spin_unlock_irq(conf->hash_locks + hash);
1807 if (!sh) 1926 if (!sh)
1808 return 0; 1927 return 0;
1809 BUG_ON(atomic_read(&sh->count)); 1928 BUG_ON(atomic_read(&sh->count));
@@ -1815,8 +1934,10 @@ static int drop_one_stripe(struct r5conf *conf)
1815 1934
1816static void shrink_stripes(struct r5conf *conf) 1935static void shrink_stripes(struct r5conf *conf)
1817{ 1936{
1818 while (drop_one_stripe(conf)) 1937 int hash;
1819 ; 1938 for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
1939 while (drop_one_stripe(conf, hash))
1940 ;
1820 1941
1821 if (conf->slab_cache) 1942 if (conf->slab_cache)
1822 kmem_cache_destroy(conf->slab_cache); 1943 kmem_cache_destroy(conf->slab_cache);
@@ -1921,6 +2042,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
1921 mdname(conf->mddev), bdn); 2042 mdname(conf->mddev), bdn);
1922 else 2043 else
1923 retry = 1; 2044 retry = 1;
2045 if (set_bad && test_bit(In_sync, &rdev->flags)
2046 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2047 retry = 1;
1924 if (retry) 2048 if (retry)
1925 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2049 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
1926 set_bit(R5_ReadError, &sh->dev[i].flags); 2050 set_bit(R5_ReadError, &sh->dev[i].flags);
@@ -3900,7 +4024,8 @@ static void raid5_activate_delayed(struct r5conf *conf)
3900 } 4024 }
3901} 4025}
3902 4026
3903static void activate_bit_delay(struct r5conf *conf) 4027static void activate_bit_delay(struct r5conf *conf,
4028 struct list_head *temp_inactive_list)
3904{ 4029{
3905 /* device_lock is held */ 4030 /* device_lock is held */
3906 struct list_head head; 4031 struct list_head head;
@@ -3908,9 +4033,11 @@ static void activate_bit_delay(struct r5conf *conf)
3908 list_del_init(&conf->bitmap_list); 4033 list_del_init(&conf->bitmap_list);
3909 while (!list_empty(&head)) { 4034 while (!list_empty(&head)) {
3910 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 4035 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
4036 int hash;
3911 list_del_init(&sh->lru); 4037 list_del_init(&sh->lru);
3912 atomic_inc(&sh->count); 4038 atomic_inc(&sh->count);
3913 __release_stripe(conf, sh); 4039 hash = sh->hash_lock_index;
4040 __release_stripe(conf, sh, &temp_inactive_list[hash]);
3914 } 4041 }
3915} 4042}
3916 4043
@@ -3926,7 +4053,7 @@ int md_raid5_congested(struct mddev *mddev, int bits)
3926 return 1; 4053 return 1;
3927 if (conf->quiesce) 4054 if (conf->quiesce)
3928 return 1; 4055 return 1;
3929 if (list_empty_careful(&conf->inactive_list)) 4056 if (atomic_read(&conf->empty_inactive_list_nr))
3930 return 1; 4057 return 1;
3931 4058
3932 return 0; 4059 return 0;
@@ -4256,6 +4383,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
4256struct raid5_plug_cb { 4383struct raid5_plug_cb {
4257 struct blk_plug_cb cb; 4384 struct blk_plug_cb cb;
4258 struct list_head list; 4385 struct list_head list;
4386 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
4259}; 4387};
4260 4388
4261static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 4389static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
@@ -4266,6 +4394,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4266 struct mddev *mddev = cb->cb.data; 4394 struct mddev *mddev = cb->cb.data;
4267 struct r5conf *conf = mddev->private; 4395 struct r5conf *conf = mddev->private;
4268 int cnt = 0; 4396 int cnt = 0;
4397 int hash;
4269 4398
4270 if (cb->list.next && !list_empty(&cb->list)) { 4399 if (cb->list.next && !list_empty(&cb->list)) {
4271 spin_lock_irq(&conf->device_lock); 4400 spin_lock_irq(&conf->device_lock);
@@ -4283,11 +4412,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4283 * STRIPE_ON_RELEASE_LIST could be set here. In that 4412 * STRIPE_ON_RELEASE_LIST could be set here. In that
4284 * case, the count is always > 1 here 4413 * case, the count is always > 1 here
4285 */ 4414 */
4286 __release_stripe(conf, sh); 4415 hash = sh->hash_lock_index;
4416 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
4287 cnt++; 4417 cnt++;
4288 } 4418 }
4289 spin_unlock_irq(&conf->device_lock); 4419 spin_unlock_irq(&conf->device_lock);
4290 } 4420 }
4421 release_inactive_stripe_list(conf, cb->temp_inactive_list,
4422 NR_STRIPE_HASH_LOCKS);
4291 if (mddev->queue) 4423 if (mddev->queue)
4292 trace_block_unplug(mddev->queue, cnt, !from_schedule); 4424 trace_block_unplug(mddev->queue, cnt, !from_schedule);
4293 kfree(cb); 4425 kfree(cb);
@@ -4308,8 +4440,12 @@ static void release_stripe_plug(struct mddev *mddev,
4308 4440
4309 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 4441 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
4310 4442
4311 if (cb->list.next == NULL) 4443 if (cb->list.next == NULL) {
4444 int i;
4312 INIT_LIST_HEAD(&cb->list); 4445 INIT_LIST_HEAD(&cb->list);
4446 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
4447 INIT_LIST_HEAD(cb->temp_inactive_list + i);
4448 }
4313 4449
4314 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 4450 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
4315 list_add_tail(&sh->lru, &cb->list); 4451 list_add_tail(&sh->lru, &cb->list);
@@ -4692,14 +4828,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4692 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4828 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4693 /* Cannot proceed until we've updated the superblock... */ 4829 /* Cannot proceed until we've updated the superblock... */
4694 wait_event(conf->wait_for_overlap, 4830 wait_event(conf->wait_for_overlap,
4695 atomic_read(&conf->reshape_stripes)==0); 4831 atomic_read(&conf->reshape_stripes)==0
4832 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4833 if (atomic_read(&conf->reshape_stripes) != 0)
4834 return 0;
4696 mddev->reshape_position = conf->reshape_progress; 4835 mddev->reshape_position = conf->reshape_progress;
4697 mddev->curr_resync_completed = sector_nr; 4836 mddev->curr_resync_completed = sector_nr;
4698 conf->reshape_checkpoint = jiffies; 4837 conf->reshape_checkpoint = jiffies;
4699 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4838 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4700 md_wakeup_thread(mddev->thread); 4839 md_wakeup_thread(mddev->thread);
4701 wait_event(mddev->sb_wait, mddev->flags == 0 || 4840 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4702 kthread_should_stop()); 4841 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4842 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4843 return 0;
4703 spin_lock_irq(&conf->device_lock); 4844 spin_lock_irq(&conf->device_lock);
4704 conf->reshape_safe = mddev->reshape_position; 4845 conf->reshape_safe = mddev->reshape_position;
4705 spin_unlock_irq(&conf->device_lock); 4846 spin_unlock_irq(&conf->device_lock);
@@ -4782,7 +4923,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4782 >= mddev->resync_max - mddev->curr_resync_completed) { 4923 >= mddev->resync_max - mddev->curr_resync_completed) {
4783 /* Cannot proceed until we've updated the superblock... */ 4924 /* Cannot proceed until we've updated the superblock... */
4784 wait_event(conf->wait_for_overlap, 4925 wait_event(conf->wait_for_overlap,
4785 atomic_read(&conf->reshape_stripes) == 0); 4926 atomic_read(&conf->reshape_stripes) == 0
4927 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4928 if (atomic_read(&conf->reshape_stripes) != 0)
4929 goto ret;
4786 mddev->reshape_position = conf->reshape_progress; 4930 mddev->reshape_position = conf->reshape_progress;
4787 mddev->curr_resync_completed = sector_nr; 4931 mddev->curr_resync_completed = sector_nr;
4788 conf->reshape_checkpoint = jiffies; 4932 conf->reshape_checkpoint = jiffies;
@@ -4790,13 +4934,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4790 md_wakeup_thread(mddev->thread); 4934 md_wakeup_thread(mddev->thread);
4791 wait_event(mddev->sb_wait, 4935 wait_event(mddev->sb_wait,
4792 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4936 !test_bit(MD_CHANGE_DEVS, &mddev->flags)
4793 || kthread_should_stop()); 4937 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4938 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4939 goto ret;
4794 spin_lock_irq(&conf->device_lock); 4940 spin_lock_irq(&conf->device_lock);
4795 conf->reshape_safe = mddev->reshape_position; 4941 conf->reshape_safe = mddev->reshape_position;
4796 spin_unlock_irq(&conf->device_lock); 4942 spin_unlock_irq(&conf->device_lock);
4797 wake_up(&conf->wait_for_overlap); 4943 wake_up(&conf->wait_for_overlap);
4798 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4944 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4799 } 4945 }
4946ret:
4800 return reshape_sectors; 4947 return reshape_sectors;
4801} 4948}
4802 4949
@@ -4954,27 +5101,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4954} 5101}
4955 5102
4956static int handle_active_stripes(struct r5conf *conf, int group, 5103static int handle_active_stripes(struct r5conf *conf, int group,
4957 struct r5worker *worker) 5104 struct r5worker *worker,
5105 struct list_head *temp_inactive_list)
4958{ 5106{
4959 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 5107 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
4960 int i, batch_size = 0; 5108 int i, batch_size = 0, hash;
5109 bool release_inactive = false;
4961 5110
4962 while (batch_size < MAX_STRIPE_BATCH && 5111 while (batch_size < MAX_STRIPE_BATCH &&
4963 (sh = __get_priority_stripe(conf, group)) != NULL) 5112 (sh = __get_priority_stripe(conf, group)) != NULL)
4964 batch[batch_size++] = sh; 5113 batch[batch_size++] = sh;
4965 5114
4966 if (batch_size == 0) 5115 if (batch_size == 0) {
4967 return batch_size; 5116 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5117 if (!list_empty(temp_inactive_list + i))
5118 break;
5119 if (i == NR_STRIPE_HASH_LOCKS)
5120 return batch_size;
5121 release_inactive = true;
5122 }
4968 spin_unlock_irq(&conf->device_lock); 5123 spin_unlock_irq(&conf->device_lock);
4969 5124
5125 release_inactive_stripe_list(conf, temp_inactive_list,
5126 NR_STRIPE_HASH_LOCKS);
5127
5128 if (release_inactive) {
5129 spin_lock_irq(&conf->device_lock);
5130 return 0;
5131 }
5132
4970 for (i = 0; i < batch_size; i++) 5133 for (i = 0; i < batch_size; i++)
4971 handle_stripe(batch[i]); 5134 handle_stripe(batch[i]);
4972 5135
4973 cond_resched(); 5136 cond_resched();
4974 5137
4975 spin_lock_irq(&conf->device_lock); 5138 spin_lock_irq(&conf->device_lock);
4976 for (i = 0; i < batch_size; i++) 5139 for (i = 0; i < batch_size; i++) {
4977 __release_stripe(conf, batch[i]); 5140 hash = batch[i]->hash_lock_index;
5141 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
5142 }
4978 return batch_size; 5143 return batch_size;
4979} 5144}
4980 5145
@@ -4995,9 +5160,10 @@ static void raid5_do_work(struct work_struct *work)
4995 while (1) { 5160 while (1) {
4996 int batch_size, released; 5161 int batch_size, released;
4997 5162
4998 released = release_stripe_list(conf); 5163 released = release_stripe_list(conf, worker->temp_inactive_list);
4999 5164
5000 batch_size = handle_active_stripes(conf, group_id, worker); 5165 batch_size = handle_active_stripes(conf, group_id, worker,
5166 worker->temp_inactive_list);
5001 worker->working = false; 5167 worker->working = false;
5002 if (!batch_size && !released) 5168 if (!batch_size && !released)
5003 break; 5169 break;
@@ -5036,7 +5202,7 @@ static void raid5d(struct md_thread *thread)
5036 struct bio *bio; 5202 struct bio *bio;
5037 int batch_size, released; 5203 int batch_size, released;
5038 5204
5039 released = release_stripe_list(conf); 5205 released = release_stripe_list(conf, conf->temp_inactive_list);
5040 5206
5041 if ( 5207 if (
5042 !list_empty(&conf->bitmap_list)) { 5208 !list_empty(&conf->bitmap_list)) {
@@ -5046,7 +5212,7 @@ static void raid5d(struct md_thread *thread)
5046 bitmap_unplug(mddev->bitmap); 5212 bitmap_unplug(mddev->bitmap);
5047 spin_lock_irq(&conf->device_lock); 5213 spin_lock_irq(&conf->device_lock);
5048 conf->seq_write = conf->seq_flush; 5214 conf->seq_write = conf->seq_flush;
5049 activate_bit_delay(conf); 5215 activate_bit_delay(conf, conf->temp_inactive_list);
5050 } 5216 }
5051 raid5_activate_delayed(conf); 5217 raid5_activate_delayed(conf);
5052 5218
@@ -5060,7 +5226,8 @@ static void raid5d(struct md_thread *thread)
5060 handled++; 5226 handled++;
5061 } 5227 }
5062 5228
5063 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); 5229 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
5230 conf->temp_inactive_list);
5064 if (!batch_size && !released) 5231 if (!batch_size && !released)
5065 break; 5232 break;
5066 handled += batch_size; 5233 handled += batch_size;
@@ -5096,22 +5263,29 @@ raid5_set_cache_size(struct mddev *mddev, int size)
5096{ 5263{
5097 struct r5conf *conf = mddev->private; 5264 struct r5conf *conf = mddev->private;
5098 int err; 5265 int err;
5266 int hash;
5099 5267
5100 if (size <= 16 || size > 32768) 5268 if (size <= 16 || size > 32768)
5101 return -EINVAL; 5269 return -EINVAL;
5270 hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
5102 while (size < conf->max_nr_stripes) { 5271 while (size < conf->max_nr_stripes) {
5103 if (drop_one_stripe(conf)) 5272 if (drop_one_stripe(conf, hash))
5104 conf->max_nr_stripes--; 5273 conf->max_nr_stripes--;
5105 else 5274 else
5106 break; 5275 break;
5276 hash--;
5277 if (hash < 0)
5278 hash = NR_STRIPE_HASH_LOCKS - 1;
5107 } 5279 }
5108 err = md_allow_write(mddev); 5280 err = md_allow_write(mddev);
5109 if (err) 5281 if (err)
5110 return err; 5282 return err;
5283 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
5111 while (size > conf->max_nr_stripes) { 5284 while (size > conf->max_nr_stripes) {
5112 if (grow_one_stripe(conf)) 5285 if (grow_one_stripe(conf, hash))
5113 conf->max_nr_stripes++; 5286 conf->max_nr_stripes++;
5114 else break; 5287 else break;
5288 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
5115 } 5289 }
5116 return 0; 5290 return 0;
5117} 5291}
@@ -5199,15 +5373,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
5199 return 0; 5373 return 0;
5200} 5374}
5201 5375
5202static int alloc_thread_groups(struct r5conf *conf, int cnt); 5376static int alloc_thread_groups(struct r5conf *conf, int cnt,
5377 int *group_cnt,
5378 int *worker_cnt_per_group,
5379 struct r5worker_group **worker_groups);
5203static ssize_t 5380static ssize_t
5204raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 5381raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
5205{ 5382{
5206 struct r5conf *conf = mddev->private; 5383 struct r5conf *conf = mddev->private;
5207 unsigned long new; 5384 unsigned long new;
5208 int err; 5385 int err;
5209 struct r5worker_group *old_groups; 5386 struct r5worker_group *new_groups, *old_groups;
5210 int old_group_cnt; 5387 int group_cnt, worker_cnt_per_group;
5211 5388
5212 if (len >= PAGE_SIZE) 5389 if (len >= PAGE_SIZE)
5213 return -EINVAL; 5390 return -EINVAL;
@@ -5223,14 +5400,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
5223 mddev_suspend(mddev); 5400 mddev_suspend(mddev);
5224 5401
5225 old_groups = conf->worker_groups; 5402 old_groups = conf->worker_groups;
5226 old_group_cnt = conf->worker_cnt_per_group; 5403 if (old_groups)
5404 flush_workqueue(raid5_wq);
5405
5406 err = alloc_thread_groups(conf, new,
5407 &group_cnt, &worker_cnt_per_group,
5408 &new_groups);
5409 if (!err) {
5410 spin_lock_irq(&conf->device_lock);
5411 conf->group_cnt = group_cnt;
5412 conf->worker_cnt_per_group = worker_cnt_per_group;
5413 conf->worker_groups = new_groups;
5414 spin_unlock_irq(&conf->device_lock);
5227 5415
5228 conf->worker_groups = NULL;
5229 err = alloc_thread_groups(conf, new);
5230 if (err) {
5231 conf->worker_groups = old_groups;
5232 conf->worker_cnt_per_group = old_group_cnt;
5233 } else {
5234 if (old_groups) 5416 if (old_groups)
5235 kfree(old_groups[0].workers); 5417 kfree(old_groups[0].workers);
5236 kfree(old_groups); 5418 kfree(old_groups);
@@ -5260,40 +5442,47 @@ static struct attribute_group raid5_attrs_group = {
5260 .attrs = raid5_attrs, 5442 .attrs = raid5_attrs,
5261}; 5443};
5262 5444
5263static int alloc_thread_groups(struct r5conf *conf, int cnt) 5445static int alloc_thread_groups(struct r5conf *conf, int cnt,
5446 int *group_cnt,
5447 int *worker_cnt_per_group,
5448 struct r5worker_group **worker_groups)
5264{ 5449{
5265 int i, j; 5450 int i, j, k;
5266 ssize_t size; 5451 ssize_t size;
5267 struct r5worker *workers; 5452 struct r5worker *workers;
5268 5453
5269 conf->worker_cnt_per_group = cnt; 5454 *worker_cnt_per_group = cnt;
5270 if (cnt == 0) { 5455 if (cnt == 0) {
5271 conf->worker_groups = NULL; 5456 *group_cnt = 0;
5457 *worker_groups = NULL;
5272 return 0; 5458 return 0;
5273 } 5459 }
5274 conf->group_cnt = num_possible_nodes(); 5460 *group_cnt = num_possible_nodes();
5275 size = sizeof(struct r5worker) * cnt; 5461 size = sizeof(struct r5worker) * cnt;
5276 workers = kzalloc(size * conf->group_cnt, GFP_NOIO); 5462 workers = kzalloc(size * *group_cnt, GFP_NOIO);
5277 conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * 5463 *worker_groups = kzalloc(sizeof(struct r5worker_group) *
5278 conf->group_cnt, GFP_NOIO); 5464 *group_cnt, GFP_NOIO);
5279 if (!conf->worker_groups || !workers) { 5465 if (!*worker_groups || !workers) {
5280 kfree(workers); 5466 kfree(workers);
5281 kfree(conf->worker_groups); 5467 kfree(*worker_groups);
5282 conf->worker_groups = NULL;
5283 return -ENOMEM; 5468 return -ENOMEM;
5284 } 5469 }
5285 5470
5286 for (i = 0; i < conf->group_cnt; i++) { 5471 for (i = 0; i < *group_cnt; i++) {
5287 struct r5worker_group *group; 5472 struct r5worker_group *group;
5288 5473
5289 group = &conf->worker_groups[i]; 5474 group = worker_groups[i];
5290 INIT_LIST_HEAD(&group->handle_list); 5475 INIT_LIST_HEAD(&group->handle_list);
5291 group->conf = conf; 5476 group->conf = conf;
5292 group->workers = workers + i * cnt; 5477 group->workers = workers + i * cnt;
5293 5478
5294 for (j = 0; j < cnt; j++) { 5479 for (j = 0; j < cnt; j++) {
5295 group->workers[j].group = group; 5480 struct r5worker *worker = group->workers + j;
5296 INIT_WORK(&group->workers[j].work, raid5_do_work); 5481 worker->group = group;
5482 INIT_WORK(&worker->work, raid5_do_work);
5483
5484 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
5485 INIT_LIST_HEAD(worker->temp_inactive_list + k);
5297 } 5486 }
5298 } 5487 }
5299 5488
@@ -5444,6 +5633,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5444 struct md_rdev *rdev; 5633 struct md_rdev *rdev;
5445 struct disk_info *disk; 5634 struct disk_info *disk;
5446 char pers_name[6]; 5635 char pers_name[6];
5636 int i;
5637 int group_cnt, worker_cnt_per_group;
5638 struct r5worker_group *new_group;
5447 5639
5448 if (mddev->new_level != 5 5640 if (mddev->new_level != 5
5449 && mddev->new_level != 4 5641 && mddev->new_level != 4
@@ -5478,7 +5670,12 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5478 if (conf == NULL) 5670 if (conf == NULL)
5479 goto abort; 5671 goto abort;
5480 /* Don't enable multi-threading by default*/ 5672 /* Don't enable multi-threading by default*/
5481 if (alloc_thread_groups(conf, 0)) 5673 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
5674 &new_group)) {
5675 conf->group_cnt = group_cnt;
5676 conf->worker_cnt_per_group = worker_cnt_per_group;
5677 conf->worker_groups = new_group;
5678 } else
5482 goto abort; 5679 goto abort;
5483 spin_lock_init(&conf->device_lock); 5680 spin_lock_init(&conf->device_lock);
5484 seqcount_init(&conf->gen_lock); 5681 seqcount_init(&conf->gen_lock);
@@ -5488,7 +5685,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5488 INIT_LIST_HEAD(&conf->hold_list); 5685 INIT_LIST_HEAD(&conf->hold_list);
5489 INIT_LIST_HEAD(&conf->delayed_list); 5686 INIT_LIST_HEAD(&conf->delayed_list);
5490 INIT_LIST_HEAD(&conf->bitmap_list); 5687 INIT_LIST_HEAD(&conf->bitmap_list);
5491 INIT_LIST_HEAD(&conf->inactive_list);
5492 init_llist_head(&conf->released_stripes); 5688 init_llist_head(&conf->released_stripes);
5493 atomic_set(&conf->active_stripes, 0); 5689 atomic_set(&conf->active_stripes, 0);
5494 atomic_set(&conf->preread_active_stripes, 0); 5690 atomic_set(&conf->preread_active_stripes, 0);
@@ -5514,6 +5710,21 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5514 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 5710 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
5515 goto abort; 5711 goto abort;
5516 5712
5713 /* We init hash_locks[0] separately to that it can be used
5714 * as the reference lock in the spin_lock_nest_lock() call
5715 * in lock_all_device_hash_locks_irq in order to convince
5716 * lockdep that we know what we are doing.
5717 */
5718 spin_lock_init(conf->hash_locks);
5719 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
5720 spin_lock_init(conf->hash_locks + i);
5721
5722 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5723 INIT_LIST_HEAD(conf->inactive_list + i);
5724
5725 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5726 INIT_LIST_HEAD(conf->temp_inactive_list + i);
5727
5517 conf->level = mddev->new_level; 5728 conf->level = mddev->new_level;
5518 if (raid5_alloc_percpu(conf) != 0) 5729 if (raid5_alloc_percpu(conf) != 0)
5519 goto abort; 5730 goto abort;
@@ -5554,7 +5765,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5554 else 5765 else
5555 conf->max_degraded = 1; 5766 conf->max_degraded = 1;
5556 conf->algorithm = mddev->new_layout; 5767 conf->algorithm = mddev->new_layout;
5557 conf->max_nr_stripes = NR_STRIPES;
5558 conf->reshape_progress = mddev->reshape_position; 5768 conf->reshape_progress = mddev->reshape_position;
5559 if (conf->reshape_progress != MaxSector) { 5769 if (conf->reshape_progress != MaxSector) {
5560 conf->prev_chunk_sectors = mddev->chunk_sectors; 5770 conf->prev_chunk_sectors = mddev->chunk_sectors;
@@ -5563,7 +5773,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5563 5773
5564 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 5774 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
5565 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 5775 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
5566 if (grow_stripes(conf, conf->max_nr_stripes)) { 5776 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
5777 if (grow_stripes(conf, NR_STRIPES)) {
5567 printk(KERN_ERR 5778 printk(KERN_ERR
5568 "md/raid:%s: couldn't allocate %dkB for buffers\n", 5779 "md/raid:%s: couldn't allocate %dkB for buffers\n",
5569 mdname(mddev), memory); 5780 mdname(mddev), memory);
@@ -6369,12 +6580,18 @@ static int raid5_start_reshape(struct mddev *mddev)
6369 if (!mddev->sync_thread) { 6580 if (!mddev->sync_thread) {
6370 mddev->recovery = 0; 6581 mddev->recovery = 0;
6371 spin_lock_irq(&conf->device_lock); 6582 spin_lock_irq(&conf->device_lock);
6583 write_seqcount_begin(&conf->gen_lock);
6372 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 6584 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
6585 mddev->new_chunk_sectors =
6586 conf->chunk_sectors = conf->prev_chunk_sectors;
6587 mddev->new_layout = conf->algorithm = conf->prev_algo;
6373 rdev_for_each(rdev, mddev) 6588 rdev_for_each(rdev, mddev)
6374 rdev->new_data_offset = rdev->data_offset; 6589 rdev->new_data_offset = rdev->data_offset;
6375 smp_wmb(); 6590 smp_wmb();
6591 conf->generation --;
6376 conf->reshape_progress = MaxSector; 6592 conf->reshape_progress = MaxSector;
6377 mddev->reshape_position = MaxSector; 6593 mddev->reshape_position = MaxSector;
6594 write_seqcount_end(&conf->gen_lock);
6378 spin_unlock_irq(&conf->device_lock); 6595 spin_unlock_irq(&conf->device_lock);
6379 return -EAGAIN; 6596 return -EAGAIN;
6380 } 6597 }
@@ -6462,27 +6679,28 @@ static void raid5_quiesce(struct mddev *mddev, int state)
6462 break; 6679 break;
6463 6680
6464 case 1: /* stop all writes */ 6681 case 1: /* stop all writes */
6465 spin_lock_irq(&conf->device_lock); 6682 lock_all_device_hash_locks_irq(conf);
6466 /* '2' tells resync/reshape to pause so that all 6683 /* '2' tells resync/reshape to pause so that all
6467 * active stripes can drain 6684 * active stripes can drain
6468 */ 6685 */
6469 conf->quiesce = 2; 6686 conf->quiesce = 2;
6470 wait_event_lock_irq(conf->wait_for_stripe, 6687 wait_event_cmd(conf->wait_for_stripe,
6471 atomic_read(&conf->active_stripes) == 0 && 6688 atomic_read(&conf->active_stripes) == 0 &&
6472 atomic_read(&conf->active_aligned_reads) == 0, 6689 atomic_read(&conf->active_aligned_reads) == 0,
6473 conf->device_lock); 6690 unlock_all_device_hash_locks_irq(conf),
6691 lock_all_device_hash_locks_irq(conf));
6474 conf->quiesce = 1; 6692 conf->quiesce = 1;
6475 spin_unlock_irq(&conf->device_lock); 6693 unlock_all_device_hash_locks_irq(conf);
6476 /* allow reshape to continue */ 6694 /* allow reshape to continue */
6477 wake_up(&conf->wait_for_overlap); 6695 wake_up(&conf->wait_for_overlap);
6478 break; 6696 break;
6479 6697
6480 case 0: /* re-enable writes */ 6698 case 0: /* re-enable writes */
6481 spin_lock_irq(&conf->device_lock); 6699 lock_all_device_hash_locks_irq(conf);
6482 conf->quiesce = 0; 6700 conf->quiesce = 0;
6483 wake_up(&conf->wait_for_stripe); 6701 wake_up(&conf->wait_for_stripe);
6484 wake_up(&conf->wait_for_overlap); 6702 wake_up(&conf->wait_for_overlap);
6485 spin_unlock_irq(&conf->device_lock); 6703 unlock_all_device_hash_locks_irq(conf);
6486 break; 6704 break;
6487 } 6705 }
6488} 6706}
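The quiesce and resize hunks above rely on wait_event_cmd(), added by the "wait: add wait_event_cmd()" commit in this pull: its third and fourth arguments run before and after every sleep, which lets the waiter drop its lock (in raid5_quiesce(), all of the hash locks) while sleeping and retake it before the condition is rechecked. A minimal sketch of the pattern with placeholder names (demo_lock, demo_wq, demo_done), not the raid5 state:

#include <linux/wait.h>
#include <linux/spinlock.h>

/* Placeholder state for this sketch only. */
static DEFINE_SPINLOCK(demo_lock);
static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_done;

static void wait_for_demo_done(void)
{
	spin_lock_irq(&demo_lock);
	wait_event_cmd(demo_wq, demo_done,
		       spin_unlock_irq(&demo_lock),	/* run before each sleep */
		       spin_lock_irq(&demo_lock));	/* run after each wakeup */
	/* demo_done was observed true with demo_lock held again */
	spin_unlock_irq(&demo_lock);
}

static void mark_demo_done(void)
{
	spin_lock_irq(&demo_lock);
	demo_done = 1;
	spin_unlock_irq(&demo_lock);
	wake_up(&demo_wq);
}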