-rw-r--r--  drivers/md/raid5.c | 317
-rw-r--r--  drivers/md/raid5.h |  15
2 files changed, 259 insertions(+), 73 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4bbcb7e26d12..93090b2afab4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
85 return &conf->stripe_hashtbl[hash]; 85 return &conf->stripe_hashtbl[hash];
86} 86}
87 87
88static inline int stripe_hash_locks_hash(sector_t sect)
89{
90 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
91}
92
93static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
94{
95 spin_lock_irq(conf->hash_locks + hash);
96 spin_lock(&conf->device_lock);
97}
98
99static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
100{
101 spin_unlock(&conf->device_lock);
102 spin_unlock_irq(conf->hash_locks + hash);
103}
104
105static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
106{
107 int i;
108 local_irq_disable();
109 spin_lock(conf->hash_locks);
110 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
111 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
112 spin_lock(&conf->device_lock);
113}
114
115static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
116{
117 int i;
118 spin_unlock(&conf->device_lock);
119 for (i = NR_STRIPE_HASH_LOCKS; i; i--)
120 spin_unlock(conf->hash_locks + i - 1);
121 local_irq_enable();
122}
123
88/* bio's attached to a stripe+device for I/O are linked together in bi_sector 124/* bio's attached to a stripe+device for I/O are linked together in bi_sector
89 * order without overlap. There may be several bio's per stripe+device, and 125 * order without overlap. There may be several bio's per stripe+device, and
90 * a bio could span several devices. 126 * a bio could span several devices.
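
The helpers added above are the core of this patch: stripe_hash_locks_hash() maps a stripe's start sector to one of NR_STRIPE_HASH_LOCKS buckets, and each bucket gets its own spinlock and its own inactive list, so cache lookups no longer serialize on the single device_lock. A rough standalone sketch of that mapping (not part of the patch; it assumes 4KiB pages, where STRIPE_SHIFT is 3, and the NR_STRIPE_HASH_LOCKS value of 8 defined by the raid5.h hunk below):

#include <stdio.h>

#define STRIPE_SHIFT		3	/* PAGE_SHIFT - 9 with 4KiB pages */
#define NR_STRIPE_HASH_LOCKS	8
#define STRIPE_HASH_LOCKS_MASK	(NR_STRIPE_HASH_LOCKS - 1)

/* same expression as the kernel helper, repeated in userspace for illustration */
static int stripe_hash_locks_hash(unsigned long long sect)
{
	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
}

int main(void)
{
	unsigned long long sect;

	/* step one stripe (8 sectors of 512 bytes) at a time */
	for (sect = 0; sect < 80; sect += 8)
		printf("sector %llu -> hash lock %d\n", sect,
		       stripe_hash_locks_hash(sect));
	return 0;
}

Consecutive stripes land on consecutive hash locks, so a streaming workload spreads across all eight inactive lists.
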
@@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
249 } 285 }
250} 286}
251 287
252static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 288static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
289 struct list_head *temp_inactive_list)
253{ 290{
254 BUG_ON(!list_empty(&sh->lru)); 291 BUG_ON(!list_empty(&sh->lru));
255 BUG_ON(atomic_read(&conf->active_stripes)==0); 292 BUG_ON(atomic_read(&conf->active_stripes)==0);
@@ -278,19 +315,60 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
278 < IO_THRESHOLD) 315 < IO_THRESHOLD)
279 md_wakeup_thread(conf->mddev->thread); 316 md_wakeup_thread(conf->mddev->thread);
280 atomic_dec(&conf->active_stripes); 317 atomic_dec(&conf->active_stripes);
281 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 318 if (!test_bit(STRIPE_EXPANDING, &sh->state))
282 list_add_tail(&sh->lru, &conf->inactive_list); 319 list_add_tail(&sh->lru, temp_inactive_list);
283 wake_up(&conf->wait_for_stripe);
284 if (conf->retry_read_aligned)
285 md_wakeup_thread(conf->mddev->thread);
286 }
287 } 320 }
288} 321}
289 322
290static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 323static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
324 struct list_head *temp_inactive_list)
291{ 325{
292 if (atomic_dec_and_test(&sh->count)) 326 if (atomic_dec_and_test(&sh->count))
293 do_release_stripe(conf, sh); 327 do_release_stripe(conf, sh, temp_inactive_list);
328}
329
330/*
331 * @hash may be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list is an array with one list per hash value.
332 *
333 * Be careful: Only one task may add/delete stripes from temp_inactive_list at a
334 * given time. Adding stripes only takes the device lock, while deleting stripes
335 * only takes the hash lock.
336 */
337static void release_inactive_stripe_list(struct r5conf *conf,
338 struct list_head *temp_inactive_list,
339 int hash)
340{
341 int size;
342 bool do_wakeup = false;
343 unsigned long flags;
344
345 if (hash == NR_STRIPE_HASH_LOCKS) {
346 size = NR_STRIPE_HASH_LOCKS;
347 hash = NR_STRIPE_HASH_LOCKS - 1;
348 } else
349 size = 1;
350 while (size) {
351 struct list_head *list = &temp_inactive_list[size - 1];
352
353 /*
354 * We don't hold any lock here yet, get_active_stripe() might
355 * remove stripes from the list
356 */
357 if (!list_empty_careful(list)) {
358 spin_lock_irqsave(conf->hash_locks + hash, flags);
359 list_splice_tail_init(list, conf->inactive_list + hash);
360 do_wakeup = true;
361 spin_unlock_irqrestore(conf->hash_locks + hash, flags);
362 }
363 size--;
364 hash--;
365 }
366
367 if (do_wakeup) {
368 wake_up(&conf->wait_for_stripe);
369 if (conf->retry_read_aligned)
370 md_wakeup_thread(conf->mddev->thread);
371 }
294} 372}
295 373
296static struct llist_node *llist_reverse_order(struct llist_node *head) 374static struct llist_node *llist_reverse_order(struct llist_node *head)
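
release_inactive_stripe_list() above is what makes the split safe without ever nesting a hash lock inside device_lock: callers first sort released stripes into per-hash temp_inactive_list entries while holding only device_lock, drop that lock, and then splice each non-empty temp list into the shared conf->inactive_list[hash] under the matching hash lock. A minimal userspace model of the two phases (hypothetical pthreads code; simple counters stand in for the kernel's intrusive lists, and release_batch/NR_HASH/inactive_pool are made-up names):

#include <pthread.h>
#include <stdio.h>

#define NR_HASH 8

static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t hash_locks[NR_HASH];
static int inactive_pool[NR_HASH];	/* stand-in for conf->inactive_list[] */

static void release_batch(const int *stripe_hash, int nr)
{
	int temp[NR_HASH] = { 0 };	/* stand-in for temp_inactive_list[] */
	int i;

	/* phase 1: under device_lock, only sort the releases into temp lists */
	pthread_mutex_lock(&device_lock);
	for (i = 0; i < nr; i++)
		temp[stripe_hash[i]]++;
	pthread_mutex_unlock(&device_lock);

	/* phase 2: per bucket, take only that hash lock and publish the batch */
	for (i = 0; i < NR_HASH; i++) {
		if (!temp[i])
			continue;
		pthread_mutex_lock(&hash_locks[i]);
		inactive_pool[i] += temp[i];
		pthread_mutex_unlock(&hash_locks[i]);
	}
}

int main(void)
{
	int hashes[] = { 0, 3, 3, 7, 1 };
	int i;

	for (i = 0; i < NR_HASH; i++)
		pthread_mutex_init(&hash_locks[i], NULL);
	release_batch(hashes, 5);
	for (i = 0; i < NR_HASH; i++)
		printf("hash %d: %d inactive\n", i, inactive_pool[i]);
	return 0;
}

This mirrors how raid5d, the worker threads and the unplug callback use their temp_inactive_list arrays later in the patch.
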
@@ -308,7 +386,8 @@ static struct llist_node *llist_reverse_order(struct llist_node *head)
308} 386}
309 387
310/* should hold conf->device_lock already */ 388/* should hold conf->device_lock already */
311static int release_stripe_list(struct r5conf *conf) 389static int release_stripe_list(struct r5conf *conf,
390 struct list_head *temp_inactive_list)
312{ 391{
313 struct stripe_head *sh; 392 struct stripe_head *sh;
314 int count = 0; 393 int count = 0;
@@ -317,6 +396,8 @@ static int release_stripe_list(struct r5conf *conf)
317 head = llist_del_all(&conf->released_stripes); 396 head = llist_del_all(&conf->released_stripes);
318 head = llist_reverse_order(head); 397 head = llist_reverse_order(head);
319 while (head) { 398 while (head) {
399 int hash;
400
320 sh = llist_entry(head, struct stripe_head, release_list); 401 sh = llist_entry(head, struct stripe_head, release_list);
321 head = llist_next(head); 402 head = llist_next(head);
322 /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */ 403 /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
@@ -327,7 +408,8 @@ static int release_stripe_list(struct r5conf *conf)
327 * again, the count is always > 1. This is true for 408 * again, the count is always > 1. This is true for
328 * STRIPE_ON_UNPLUG_LIST bit too. 409 * STRIPE_ON_UNPLUG_LIST bit too.
329 */ 410 */
330 __release_stripe(conf, sh); 411 hash = sh->hash_lock_index;
412 __release_stripe(conf, sh, &temp_inactive_list[hash]);
331 count++; 413 count++;
332 } 414 }
333 415
@@ -338,6 +420,8 @@ static void release_stripe(struct stripe_head *sh)
338{ 420{
339 struct r5conf *conf = sh->raid_conf; 421 struct r5conf *conf = sh->raid_conf;
340 unsigned long flags; 422 unsigned long flags;
423 struct list_head list;
424 int hash;
341 bool wakeup; 425 bool wakeup;
342 426
343 if (unlikely(!conf->mddev->thread) || 427 if (unlikely(!conf->mddev->thread) ||
@@ -351,8 +435,11 @@ slow_path:
351 local_irq_save(flags); 435 local_irq_save(flags);
352 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 436 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
353 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 437 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
354 do_release_stripe(conf, sh); 438 INIT_LIST_HEAD(&list);
439 hash = sh->hash_lock_index;
440 do_release_stripe(conf, sh, &list);
355 spin_unlock(&conf->device_lock); 441 spin_unlock(&conf->device_lock);
442 release_inactive_stripe_list(conf, &list, hash);
356 } 443 }
357 local_irq_restore(flags); 444 local_irq_restore(flags);
358} 445}
@@ -377,18 +464,19 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
377 464
378 465
379/* find an idle stripe, make sure it is unhashed, and return it. */ 466/* find an idle stripe, make sure it is unhashed, and return it. */
380static struct stripe_head *get_free_stripe(struct r5conf *conf) 467static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
381{ 468{
382 struct stripe_head *sh = NULL; 469 struct stripe_head *sh = NULL;
383 struct list_head *first; 470 struct list_head *first;
384 471
385 if (list_empty(&conf->inactive_list)) 472 if (list_empty(conf->inactive_list + hash))
386 goto out; 473 goto out;
387 first = conf->inactive_list.next; 474 first = (conf->inactive_list + hash)->next;
388 sh = list_entry(first, struct stripe_head, lru); 475 sh = list_entry(first, struct stripe_head, lru);
389 list_del_init(first); 476 list_del_init(first);
390 remove_hash(sh); 477 remove_hash(sh);
391 atomic_inc(&conf->active_stripes); 478 atomic_inc(&conf->active_stripes);
479 BUG_ON(hash != sh->hash_lock_index);
392out: 480out:
393 return sh; 481 return sh;
394} 482}
@@ -431,7 +519,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
431static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 519static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
432{ 520{
433 struct r5conf *conf = sh->raid_conf; 521 struct r5conf *conf = sh->raid_conf;
434 int i; 522 int i, seq;
435 523
436 BUG_ON(atomic_read(&sh->count) != 0); 524 BUG_ON(atomic_read(&sh->count) != 0);
437 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 525 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
@@ -441,7 +529,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
441 (unsigned long long)sh->sector); 529 (unsigned long long)sh->sector);
442 530
443 remove_hash(sh); 531 remove_hash(sh);
444 532retry:
533 seq = read_seqcount_begin(&conf->gen_lock);
445 sh->generation = conf->generation - previous; 534 sh->generation = conf->generation - previous;
446 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 535 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
447 sh->sector = sector; 536 sh->sector = sector;
@@ -463,6 +552,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
463 dev->flags = 0; 552 dev->flags = 0;
464 raid5_build_block(sh, i, previous); 553 raid5_build_block(sh, i, previous);
465 } 554 }
555 if (read_seqcount_retry(&conf->gen_lock, seq))
556 goto retry;
466 insert_hash(conf, sh); 557 insert_hash(conf, sh);
467 sh->cpu = smp_processor_id(); 558 sh->cpu = smp_processor_id();
468} 559}
@@ -567,29 +658,31 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
567 int previous, int noblock, int noquiesce) 658 int previous, int noblock, int noquiesce)
568{ 659{
569 struct stripe_head *sh; 660 struct stripe_head *sh;
661 int hash = stripe_hash_locks_hash(sector);
570 662
571 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 663 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
572 664
573 spin_lock_irq(&conf->device_lock); 665 spin_lock_irq(conf->hash_locks + hash);
574 666
575 do { 667 do {
576 wait_event_lock_irq(conf->wait_for_stripe, 668 wait_event_lock_irq(conf->wait_for_stripe,
577 conf->quiesce == 0 || noquiesce, 669 conf->quiesce == 0 || noquiesce,
578 conf->device_lock); 670 *(conf->hash_locks + hash));
579 sh = __find_stripe(conf, sector, conf->generation - previous); 671 sh = __find_stripe(conf, sector, conf->generation - previous);
580 if (!sh) { 672 if (!sh) {
581 if (!conf->inactive_blocked) 673 if (!conf->inactive_blocked)
582 sh = get_free_stripe(conf); 674 sh = get_free_stripe(conf, hash);
583 if (noblock && sh == NULL) 675 if (noblock && sh == NULL)
584 break; 676 break;
585 if (!sh) { 677 if (!sh) {
586 conf->inactive_blocked = 1; 678 conf->inactive_blocked = 1;
587 wait_event_lock_irq(conf->wait_for_stripe, 679 wait_event_lock_irq(
588 !list_empty(&conf->inactive_list) && 680 conf->wait_for_stripe,
589 (atomic_read(&conf->active_stripes) 681 !list_empty(conf->inactive_list + hash) &&
590 < (conf->max_nr_stripes *3/4) 682 (atomic_read(&conf->active_stripes)
591 || !conf->inactive_blocked), 683 < (conf->max_nr_stripes * 3 / 4)
592 conf->device_lock); 684 || !conf->inactive_blocked),
685 *(conf->hash_locks + hash));
593 conf->inactive_blocked = 0; 686 conf->inactive_blocked = 0;
594 } else 687 } else
595 init_stripe(sh, sector, previous); 688 init_stripe(sh, sector, previous);
@@ -600,9 +693,11 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
600 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) 693 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
601 && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); 694 && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
602 } else { 695 } else {
696 spin_lock(&conf->device_lock);
603 if (!test_bit(STRIPE_HANDLE, &sh->state)) 697 if (!test_bit(STRIPE_HANDLE, &sh->state))
604 atomic_inc(&conf->active_stripes); 698 atomic_inc(&conf->active_stripes);
605 if (list_empty(&sh->lru) && 699 if (list_empty(&sh->lru) &&
700 !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) &&
606 !test_bit(STRIPE_EXPANDING, &sh->state)) 701 !test_bit(STRIPE_EXPANDING, &sh->state))
607 BUG(); 702 BUG();
608 list_del_init(&sh->lru); 703 list_del_init(&sh->lru);
@@ -610,6 +705,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
610 sh->group->stripes_cnt--; 705 sh->group->stripes_cnt--;
611 sh->group = NULL; 706 sh->group = NULL;
612 } 707 }
708 spin_unlock(&conf->device_lock);
613 } 709 }
614 } 710 }
615 } while (sh == NULL); 711 } while (sh == NULL);
@@ -617,7 +713,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
617 if (sh) 713 if (sh)
618 atomic_inc(&sh->count); 714 atomic_inc(&sh->count);
619 715
620 spin_unlock_irq(&conf->device_lock); 716 spin_unlock_irq(conf->hash_locks + hash);
621 return sh; 717 return sh;
622} 718}
623 719
@@ -1597,7 +1693,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1597 put_cpu(); 1693 put_cpu();
1598} 1694}
1599 1695
1600static int grow_one_stripe(struct r5conf *conf) 1696static int grow_one_stripe(struct r5conf *conf, int hash)
1601{ 1697{
1602 struct stripe_head *sh; 1698 struct stripe_head *sh;
1603 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1699 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
@@ -1613,6 +1709,7 @@ static int grow_one_stripe(struct r5conf *conf)
1613 kmem_cache_free(conf->slab_cache, sh); 1709 kmem_cache_free(conf->slab_cache, sh);
1614 return 0; 1710 return 0;
1615 } 1711 }
1712 sh->hash_lock_index = hash;
1616 /* we just created an active stripe so... */ 1713 /* we just created an active stripe so... */
1617 atomic_set(&sh->count, 1); 1714 atomic_set(&sh->count, 1);
1618 atomic_inc(&conf->active_stripes); 1715 atomic_inc(&conf->active_stripes);
@@ -1625,6 +1722,7 @@ static int grow_stripes(struct r5conf *conf, int num)
1625{ 1722{
1626 struct kmem_cache *sc; 1723 struct kmem_cache *sc;
1627 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1724 int devs = max(conf->raid_disks, conf->previous_raid_disks);
1725 int hash;
1628 1726
1629 if (conf->mddev->gendisk) 1727 if (conf->mddev->gendisk)
1630 sprintf(conf->cache_name[0], 1728 sprintf(conf->cache_name[0],
@@ -1642,9 +1740,13 @@ static int grow_stripes(struct r5conf *conf, int num)
1642 return 1; 1740 return 1;
1643 conf->slab_cache = sc; 1741 conf->slab_cache = sc;
1644 conf->pool_size = devs; 1742 conf->pool_size = devs;
1645 while (num--) 1743 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
1646 if (!grow_one_stripe(conf)) 1744 while (num--) {
1745 if (!grow_one_stripe(conf, hash))
1647 return 1; 1746 return 1;
1747 conf->max_nr_stripes++;
1748 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
1749 }
1648 return 0; 1750 return 0;
1649} 1751}
1650 1752
@@ -1702,6 +1804,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1702 int err; 1804 int err;
1703 struct kmem_cache *sc; 1805 struct kmem_cache *sc;
1704 int i; 1806 int i;
1807 int hash, cnt;
1705 1808
1706 if (newsize <= conf->pool_size) 1809 if (newsize <= conf->pool_size)
1707 return 0; /* never bother to shrink */ 1810 return 0; /* never bother to shrink */
@@ -1741,19 +1844,29 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1741 * OK, we have enough stripes, start collecting inactive 1844 * OK, we have enough stripes, start collecting inactive
1742 * stripes and copying them over 1845 * stripes and copying them over
1743 */ 1846 */
1847 hash = 0;
1848 cnt = 0;
1744 list_for_each_entry(nsh, &newstripes, lru) { 1849 list_for_each_entry(nsh, &newstripes, lru) {
1745 spin_lock_irq(&conf->device_lock); 1850 lock_device_hash_lock(conf, hash);
1746 wait_event_lock_irq(conf->wait_for_stripe, 1851 wait_event_cmd(conf->wait_for_stripe,
1747 !list_empty(&conf->inactive_list), 1852 !list_empty(conf->inactive_list + hash),
1748 conf->device_lock); 1853 unlock_device_hash_lock(conf, hash),
1749 osh = get_free_stripe(conf); 1854 lock_device_hash_lock(conf, hash));
1750 spin_unlock_irq(&conf->device_lock); 1855 osh = get_free_stripe(conf, hash);
1856 unlock_device_hash_lock(conf, hash);
1751 atomic_set(&nsh->count, 1); 1857 atomic_set(&nsh->count, 1);
1752 for(i=0; i<conf->pool_size; i++) 1858 for(i=0; i<conf->pool_size; i++)
1753 nsh->dev[i].page = osh->dev[i].page; 1859 nsh->dev[i].page = osh->dev[i].page;
1754 for( ; i<newsize; i++) 1860 for( ; i<newsize; i++)
1755 nsh->dev[i].page = NULL; 1861 nsh->dev[i].page = NULL;
1862 nsh->hash_lock_index = hash;
1756 kmem_cache_free(conf->slab_cache, osh); 1863 kmem_cache_free(conf->slab_cache, osh);
1864 cnt++;
1865 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
1866 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
1867 hash++;
1868 cnt = 0;
1869 }
1757 } 1870 }
1758 kmem_cache_destroy(conf->slab_cache); 1871 kmem_cache_destroy(conf->slab_cache);
1759 1872
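
The cnt/hash bookkeeping above redistributes the stripes as evenly as possible while resize_stripes() copies them: each bucket gets max_nr_stripes / NR_STRIPE_HASH_LOCKS stripes, and the first max_nr_stripes % NR_STRIPE_HASH_LOCKS buckets get one extra. A quick standalone check of that quota expression (260 is only an illustrative cache size; the default NR_STRIPES of 256 divides evenly, giving 32 per bucket):

#include <stdio.h>

#define NR_STRIPE_HASH_LOCKS 8

int main(void)
{
	int max_nr_stripes = 260;	/* example value, not the default */
	int hash, quota, total = 0;

	for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) {
		quota = max_nr_stripes / NR_STRIPE_HASH_LOCKS +
			!!((max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash);
		total += quota;
		printf("hash %d: %d stripes\n", hash, quota);
	}
	printf("total: %d\n", total);	/* prints 260 */
	return 0;
}

It prints 33 stripes for hashes 0-3 and 32 for hashes 4-7, i.e. 260 in total.
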
@@ -1812,13 +1925,13 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1812 return err; 1925 return err;
1813} 1926}
1814 1927
1815static int drop_one_stripe(struct r5conf *conf) 1928static int drop_one_stripe(struct r5conf *conf, int hash)
1816{ 1929{
1817 struct stripe_head *sh; 1930 struct stripe_head *sh;
1818 1931
1819 spin_lock_irq(&conf->device_lock); 1932 spin_lock_irq(conf->hash_locks + hash);
1820 sh = get_free_stripe(conf); 1933 sh = get_free_stripe(conf, hash);
1821 spin_unlock_irq(&conf->device_lock); 1934 spin_unlock_irq(conf->hash_locks + hash);
1822 if (!sh) 1935 if (!sh)
1823 return 0; 1936 return 0;
1824 BUG_ON(atomic_read(&sh->count)); 1937 BUG_ON(atomic_read(&sh->count));
@@ -1830,8 +1943,10 @@ static int drop_one_stripe(struct r5conf *conf)
1830 1943
1831static void shrink_stripes(struct r5conf *conf) 1944static void shrink_stripes(struct r5conf *conf)
1832{ 1945{
1833 while (drop_one_stripe(conf)) 1946 int hash;
1834 ; 1947 for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
1948 while (drop_one_stripe(conf, hash))
1949 ;
1835 1950
1836 if (conf->slab_cache) 1951 if (conf->slab_cache)
1837 kmem_cache_destroy(conf->slab_cache); 1952 kmem_cache_destroy(conf->slab_cache);
@@ -3915,7 +4030,8 @@ static void raid5_activate_delayed(struct r5conf *conf)
3915 } 4030 }
3916} 4031}
3917 4032
3918static void activate_bit_delay(struct r5conf *conf) 4033static void activate_bit_delay(struct r5conf *conf,
4034 struct list_head *temp_inactive_list)
3919{ 4035{
3920 /* device_lock is held */ 4036 /* device_lock is held */
3921 struct list_head head; 4037 struct list_head head;
@@ -3923,9 +4039,11 @@ static void activate_bit_delay(struct r5conf *conf)
3923 list_del_init(&conf->bitmap_list); 4039 list_del_init(&conf->bitmap_list);
3924 while (!list_empty(&head)) { 4040 while (!list_empty(&head)) {
3925 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 4041 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
4042 int hash;
3926 list_del_init(&sh->lru); 4043 list_del_init(&sh->lru);
3927 atomic_inc(&sh->count); 4044 atomic_inc(&sh->count);
3928 __release_stripe(conf, sh); 4045 hash = sh->hash_lock_index;
4046 __release_stripe(conf, sh, &temp_inactive_list[hash]);
3929 } 4047 }
3930} 4048}
3931 4049
@@ -3941,7 +4059,7 @@ int md_raid5_congested(struct mddev *mddev, int bits)
3941 return 1; 4059 return 1;
3942 if (conf->quiesce) 4060 if (conf->quiesce)
3943 return 1; 4061 return 1;
3944 if (list_empty_careful(&conf->inactive_list)) 4062 if (atomic_read(&conf->active_stripes) == conf->max_nr_stripes)
3945 return 1; 4063 return 1;
3946 4064
3947 return 0; 4065 return 0;
@@ -4271,6 +4389,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
4271struct raid5_plug_cb { 4389struct raid5_plug_cb {
4272 struct blk_plug_cb cb; 4390 struct blk_plug_cb cb;
4273 struct list_head list; 4391 struct list_head list;
4392 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
4274}; 4393};
4275 4394
4276static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 4395static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
@@ -4281,6 +4400,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4281 struct mddev *mddev = cb->cb.data; 4400 struct mddev *mddev = cb->cb.data;
4282 struct r5conf *conf = mddev->private; 4401 struct r5conf *conf = mddev->private;
4283 int cnt = 0; 4402 int cnt = 0;
4403 int hash;
4284 4404
4285 if (cb->list.next && !list_empty(&cb->list)) { 4405 if (cb->list.next && !list_empty(&cb->list)) {
4286 spin_lock_irq(&conf->device_lock); 4406 spin_lock_irq(&conf->device_lock);
@@ -4298,11 +4418,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4298 * STRIPE_ON_RELEASE_LIST could be set here. In that 4418 * STRIPE_ON_RELEASE_LIST could be set here. In that
4299 * case, the count is always > 1 here 4419 * case, the count is always > 1 here
4300 */ 4420 */
4301 __release_stripe(conf, sh); 4421 hash = sh->hash_lock_index;
4422 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
4302 cnt++; 4423 cnt++;
4303 } 4424 }
4304 spin_unlock_irq(&conf->device_lock); 4425 spin_unlock_irq(&conf->device_lock);
4305 } 4426 }
4427 release_inactive_stripe_list(conf, cb->temp_inactive_list,
4428 NR_STRIPE_HASH_LOCKS);
4306 if (mddev->queue) 4429 if (mddev->queue)
4307 trace_block_unplug(mddev->queue, cnt, !from_schedule); 4430 trace_block_unplug(mddev->queue, cnt, !from_schedule);
4308 kfree(cb); 4431 kfree(cb);
@@ -4323,8 +4446,12 @@ static void release_stripe_plug(struct mddev *mddev,
4323 4446
4324 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 4447 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
4325 4448
4326 if (cb->list.next == NULL) 4449 if (cb->list.next == NULL) {
4450 int i;
4327 INIT_LIST_HEAD(&cb->list); 4451 INIT_LIST_HEAD(&cb->list);
4452 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
4453 INIT_LIST_HEAD(cb->temp_inactive_list + i);
4454 }
4328 4455
4329 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 4456 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
4330 list_add_tail(&sh->lru, &cb->list); 4457 list_add_tail(&sh->lru, &cb->list);
@@ -4969,27 +5096,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4969} 5096}
4970 5097
4971static int handle_active_stripes(struct r5conf *conf, int group, 5098static int handle_active_stripes(struct r5conf *conf, int group,
4972 struct r5worker *worker) 5099 struct r5worker *worker,
5100 struct list_head *temp_inactive_list)
4973{ 5101{
4974 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 5102 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
4975 int i, batch_size = 0; 5103 int i, batch_size = 0, hash;
5104 bool release_inactive = false;
4976 5105
4977 while (batch_size < MAX_STRIPE_BATCH && 5106 while (batch_size < MAX_STRIPE_BATCH &&
4978 (sh = __get_priority_stripe(conf, group)) != NULL) 5107 (sh = __get_priority_stripe(conf, group)) != NULL)
4979 batch[batch_size++] = sh; 5108 batch[batch_size++] = sh;
4980 5109
4981 if (batch_size == 0) 5110 if (batch_size == 0) {
4982 return batch_size; 5111 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5112 if (!list_empty(temp_inactive_list + i))
5113 break;
5114 if (i == NR_STRIPE_HASH_LOCKS)
5115 return batch_size;
5116 release_inactive = true;
5117 }
4983 spin_unlock_irq(&conf->device_lock); 5118 spin_unlock_irq(&conf->device_lock);
4984 5119
5120 release_inactive_stripe_list(conf, temp_inactive_list,
5121 NR_STRIPE_HASH_LOCKS);
5122
5123 if (release_inactive) {
5124 spin_lock_irq(&conf->device_lock);
5125 return 0;
5126 }
5127
4985 for (i = 0; i < batch_size; i++) 5128 for (i = 0; i < batch_size; i++)
4986 handle_stripe(batch[i]); 5129 handle_stripe(batch[i]);
4987 5130
4988 cond_resched(); 5131 cond_resched();
4989 5132
4990 spin_lock_irq(&conf->device_lock); 5133 spin_lock_irq(&conf->device_lock);
4991 for (i = 0; i < batch_size; i++) 5134 for (i = 0; i < batch_size; i++) {
4992 __release_stripe(conf, batch[i]); 5135 hash = batch[i]->hash_lock_index;
5136 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
5137 }
4993 return batch_size; 5138 return batch_size;
4994} 5139}
4995 5140
@@ -5010,9 +5155,10 @@ static void raid5_do_work(struct work_struct *work)
5010 while (1) { 5155 while (1) {
5011 int batch_size, released; 5156 int batch_size, released;
5012 5157
5013 released = release_stripe_list(conf); 5158 released = release_stripe_list(conf, worker->temp_inactive_list);
5014 5159
5015 batch_size = handle_active_stripes(conf, group_id, worker); 5160 batch_size = handle_active_stripes(conf, group_id, worker,
5161 worker->temp_inactive_list);
5016 worker->working = false; 5162 worker->working = false;
5017 if (!batch_size && !released) 5163 if (!batch_size && !released)
5018 break; 5164 break;
@@ -5051,7 +5197,7 @@ static void raid5d(struct md_thread *thread)
5051 struct bio *bio; 5197 struct bio *bio;
5052 int batch_size, released; 5198 int batch_size, released;
5053 5199
5054 released = release_stripe_list(conf); 5200 released = release_stripe_list(conf, conf->temp_inactive_list);
5055 5201
5056 if ( 5202 if (
5057 !list_empty(&conf->bitmap_list)) { 5203 !list_empty(&conf->bitmap_list)) {
@@ -5061,7 +5207,7 @@ static void raid5d(struct md_thread *thread)
5061 bitmap_unplug(mddev->bitmap); 5207 bitmap_unplug(mddev->bitmap);
5062 spin_lock_irq(&conf->device_lock); 5208 spin_lock_irq(&conf->device_lock);
5063 conf->seq_write = conf->seq_flush; 5209 conf->seq_write = conf->seq_flush;
5064 activate_bit_delay(conf); 5210 activate_bit_delay(conf, conf->temp_inactive_list);
5065 } 5211 }
5066 raid5_activate_delayed(conf); 5212 raid5_activate_delayed(conf);
5067 5213
@@ -5075,7 +5221,8 @@ static void raid5d(struct md_thread *thread)
5075 handled++; 5221 handled++;
5076 } 5222 }
5077 5223
5078 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); 5224 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
5225 conf->temp_inactive_list);
5079 if (!batch_size && !released) 5226 if (!batch_size && !released)
5080 break; 5227 break;
5081 handled += batch_size; 5228 handled += batch_size;
@@ -5111,22 +5258,29 @@ raid5_set_cache_size(struct mddev *mddev, int size)
5111{ 5258{
5112 struct r5conf *conf = mddev->private; 5259 struct r5conf *conf = mddev->private;
5113 int err; 5260 int err;
5261 int hash;
5114 5262
5115 if (size <= 16 || size > 32768) 5263 if (size <= 16 || size > 32768)
5116 return -EINVAL; 5264 return -EINVAL;
5265 hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
5117 while (size < conf->max_nr_stripes) { 5266 while (size < conf->max_nr_stripes) {
5118 if (drop_one_stripe(conf)) 5267 if (drop_one_stripe(conf, hash))
5119 conf->max_nr_stripes--; 5268 conf->max_nr_stripes--;
5120 else 5269 else
5121 break; 5270 break;
5271 hash--;
5272 if (hash < 0)
5273 hash = NR_STRIPE_HASH_LOCKS - 1;
5122 } 5274 }
5123 err = md_allow_write(mddev); 5275 err = md_allow_write(mddev);
5124 if (err) 5276 if (err)
5125 return err; 5277 return err;
5278 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
5126 while (size > conf->max_nr_stripes) { 5279 while (size > conf->max_nr_stripes) {
5127 if (grow_one_stripe(conf)) 5280 if (grow_one_stripe(conf, hash))
5128 conf->max_nr_stripes++; 5281 conf->max_nr_stripes++;
5129 else break; 5282 else break;
5283 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
5130 } 5284 }
5131 return 0; 5285 return 0;
5132} 5286}
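
raid5_set_cache_size() has to keep that per-bucket balance when the cache shrinks or grows at runtime, so it walks the hash values backwards from (max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS when dropping stripes and forwards from max_nr_stripes % NR_STRIPE_HASH_LOCKS when adding them. A small standalone sketch of that walk (example sizes only; the starting value 256 matches the default NR_STRIPES):

#include <stdio.h>

#define NR_STRIPE_HASH_LOCKS 8

int main(void)
{
	int max_nr_stripes = 256, size = 250, hash;

	hash = (max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
	while (size < max_nr_stripes) {
		printf("drop one stripe from hash %d\n", hash);
		max_nr_stripes--;
		hash = (hash - 1 + NR_STRIPE_HASH_LOCKS) % NR_STRIPE_HASH_LOCKS;
	}
	hash = max_nr_stripes % NR_STRIPE_HASH_LOCKS;
	while (max_nr_stripes < 256) {
		printf("grow one stripe on hash %d\n", hash);
		max_nr_stripes++;
		hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
	}
	return 0;
}

Shrinking from 256 to 250 drops one stripe each from hashes 7, 6, 5, 4, 3, 2; growing back adds them to hashes 2 through 7 again, so every bucket ends up where it started.
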
@@ -5277,7 +5431,7 @@ static struct attribute_group raid5_attrs_group = {
5277 5431
5278static int alloc_thread_groups(struct r5conf *conf, int cnt) 5432static int alloc_thread_groups(struct r5conf *conf, int cnt)
5279{ 5433{
5280 int i, j; 5434 int i, j, k;
5281 ssize_t size; 5435 ssize_t size;
5282 struct r5worker *workers; 5436 struct r5worker *workers;
5283 5437
@@ -5307,8 +5461,12 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt)
5307 group->workers = workers + i * cnt; 5461 group->workers = workers + i * cnt;
5308 5462
5309 for (j = 0; j < cnt; j++) { 5463 for (j = 0; j < cnt; j++) {
5310 group->workers[j].group = group; 5464 struct r5worker *worker = group->workers + j;
5311 INIT_WORK(&group->workers[j].work, raid5_do_work); 5465 worker->group = group;
5466 INIT_WORK(&worker->work, raid5_do_work);
5467
5468 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
5469 INIT_LIST_HEAD(worker->temp_inactive_list + k);
5312 } 5470 }
5313 } 5471 }
5314 5472
@@ -5459,6 +5617,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5459 struct md_rdev *rdev; 5617 struct md_rdev *rdev;
5460 struct disk_info *disk; 5618 struct disk_info *disk;
5461 char pers_name[6]; 5619 char pers_name[6];
5620 int i;
5462 5621
5463 if (mddev->new_level != 5 5622 if (mddev->new_level != 5
5464 && mddev->new_level != 4 5623 && mddev->new_level != 4
@@ -5503,7 +5662,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5503 INIT_LIST_HEAD(&conf->hold_list); 5662 INIT_LIST_HEAD(&conf->hold_list);
5504 INIT_LIST_HEAD(&conf->delayed_list); 5663 INIT_LIST_HEAD(&conf->delayed_list);
5505 INIT_LIST_HEAD(&conf->bitmap_list); 5664 INIT_LIST_HEAD(&conf->bitmap_list);
5506 INIT_LIST_HEAD(&conf->inactive_list);
5507 init_llist_head(&conf->released_stripes); 5665 init_llist_head(&conf->released_stripes);
5508 atomic_set(&conf->active_stripes, 0); 5666 atomic_set(&conf->active_stripes, 0);
5509 atomic_set(&conf->preread_active_stripes, 0); 5667 atomic_set(&conf->preread_active_stripes, 0);
@@ -5529,6 +5687,21 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5529 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 5687 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
5530 goto abort; 5688 goto abort;
5531 5689
5690 /* We init hash_locks[0] separately so that it can be used
5691 * as the reference lock in the spin_lock_nest_lock() call
5692 * in lock_all_device_hash_locks_irq in order to convince
5693 * lockdep that we know what we are doing.
5694 */
5695 spin_lock_init(conf->hash_locks);
5696 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
5697 spin_lock_init(conf->hash_locks + i);
5698
5699 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5700 INIT_LIST_HEAD(conf->inactive_list + i);
5701
5702 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5703 INIT_LIST_HEAD(conf->temp_inactive_list + i);
5704
5532 conf->level = mddev->new_level; 5705 conf->level = mddev->new_level;
5533 if (raid5_alloc_percpu(conf) != 0) 5706 if (raid5_alloc_percpu(conf) != 0)
5534 goto abort; 5707 goto abort;
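
The hash_locks[0]-first initialization above, together with lock_all_device_hash_locks_irq() earlier in the patch, encodes the one global lock order the whole series relies on: hash locks in ascending index order, then device_lock. The single-hash fast path (lock_device_hash_lock()) takes a subset of that order, so the two paths cannot deadlock against each other, and the spin_lock_nest_lock() against hash_locks[0] is only there to tell lockdep that the eight same-class locks are taken deliberately. A minimal userspace model of the two acquisition patterns (hypothetical pthreads code, not the kernel implementation; lock_one/lock_all/NR_HASH are made-up names):

#include <pthread.h>

#define NR_HASH 8

static pthread_mutex_t hash_locks[NR_HASH];
static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;

static void lock_one(int hash)		/* analogue of lock_device_hash_lock() */
{
	pthread_mutex_lock(&hash_locks[hash]);
	pthread_mutex_lock(&device_lock);
}

static void unlock_one(int hash)
{
	pthread_mutex_unlock(&device_lock);
	pthread_mutex_unlock(&hash_locks[hash]);
}

static void lock_all(void)		/* analogue of lock_all_device_hash_locks_irq() */
{
	int i;

	for (i = 0; i < NR_HASH; i++)	/* ascending order, always */
		pthread_mutex_lock(&hash_locks[i]);
	pthread_mutex_lock(&device_lock);
}

static void unlock_all(void)
{
	int i;

	pthread_mutex_unlock(&device_lock);
	for (i = NR_HASH - 1; i >= 0; i--)
		pthread_mutex_unlock(&hash_locks[i]);
}

int main(void)
{
	int i;

	for (i = 0; i < NR_HASH; i++)
		pthread_mutex_init(&hash_locks[i], NULL);
	lock_one(3);
	unlock_one(3);
	lock_all();
	unlock_all();
	return 0;
}
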
@@ -5569,7 +5742,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5569 else 5742 else
5570 conf->max_degraded = 1; 5743 conf->max_degraded = 1;
5571 conf->algorithm = mddev->new_layout; 5744 conf->algorithm = mddev->new_layout;
5572 conf->max_nr_stripes = NR_STRIPES;
5573 conf->reshape_progress = mddev->reshape_position; 5745 conf->reshape_progress = mddev->reshape_position;
5574 if (conf->reshape_progress != MaxSector) { 5746 if (conf->reshape_progress != MaxSector) {
5575 conf->prev_chunk_sectors = mddev->chunk_sectors; 5747 conf->prev_chunk_sectors = mddev->chunk_sectors;
@@ -5578,7 +5750,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5578 5750
5579 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 5751 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
5580 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 5752 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
5581 if (grow_stripes(conf, conf->max_nr_stripes)) { 5753 if (grow_stripes(conf, NR_STRIPES)) {
5582 printk(KERN_ERR 5754 printk(KERN_ERR
5583 "md/raid:%s: couldn't allocate %dkB for buffers\n", 5755 "md/raid:%s: couldn't allocate %dkB for buffers\n",
5584 mdname(mddev), memory); 5756 mdname(mddev), memory);
@@ -6483,27 +6655,28 @@ static void raid5_quiesce(struct mddev *mddev, int state)
6483 break; 6655 break;
6484 6656
6485 case 1: /* stop all writes */ 6657 case 1: /* stop all writes */
6486 spin_lock_irq(&conf->device_lock); 6658 lock_all_device_hash_locks_irq(conf);
6487 /* '2' tells resync/reshape to pause so that all 6659 /* '2' tells resync/reshape to pause so that all
6488 * active stripes can drain 6660 * active stripes can drain
6489 */ 6661 */
6490 conf->quiesce = 2; 6662 conf->quiesce = 2;
6491 wait_event_lock_irq(conf->wait_for_stripe, 6663 wait_event_cmd(conf->wait_for_stripe,
6492 atomic_read(&conf->active_stripes) == 0 && 6664 atomic_read(&conf->active_stripes) == 0 &&
6493 atomic_read(&conf->active_aligned_reads) == 0, 6665 atomic_read(&conf->active_aligned_reads) == 0,
6494 conf->device_lock); 6666 unlock_all_device_hash_locks_irq(conf),
6667 lock_all_device_hash_locks_irq(conf));
6495 conf->quiesce = 1; 6668 conf->quiesce = 1;
6496 spin_unlock_irq(&conf->device_lock); 6669 unlock_all_device_hash_locks_irq(conf);
6497 /* allow reshape to continue */ 6670 /* allow reshape to continue */
6498 wake_up(&conf->wait_for_overlap); 6671 wake_up(&conf->wait_for_overlap);
6499 break; 6672 break;
6500 6673
6501 case 0: /* re-enable writes */ 6674 case 0: /* re-enable writes */
6502 spin_lock_irq(&conf->device_lock); 6675 lock_all_device_hash_locks_irq(conf);
6503 conf->quiesce = 0; 6676 conf->quiesce = 0;
6504 wake_up(&conf->wait_for_stripe); 6677 wake_up(&conf->wait_for_stripe);
6505 wake_up(&conf->wait_for_overlap); 6678 wake_up(&conf->wait_for_overlap);
6506 spin_unlock_irq(&conf->device_lock); 6679 unlock_all_device_hash_locks_irq(conf);
6507 break; 6680 break;
6508 } 6681 }
6509} 6682}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2113ffa82c7a..a9e443a1116f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -205,6 +205,7 @@ struct stripe_head {
205 short pd_idx; /* parity disk index */ 205 short pd_idx; /* parity disk index */
206 short qd_idx; /* 'Q' disk index for raid6 */ 206 short qd_idx; /* 'Q' disk index for raid6 */
207 short ddf_layout;/* use DDF ordering to calculate Q */ 207 short ddf_layout;/* use DDF ordering to calculate Q */
208 short hash_lock_index;
208 unsigned long state; /* state flags */ 209 unsigned long state; /* state flags */
209 atomic_t count; /* nr of active thread/requests */ 210 atomic_t count; /* nr of active thread/requests */
210 int bm_seq; /* sequence number for bitmap flushes */ 211 int bm_seq; /* sequence number for bitmap flushes */
@@ -367,9 +368,18 @@ struct disk_info {
367 struct md_rdev *rdev, *replacement; 368 struct md_rdev *rdev, *replacement;
368}; 369};
369 370
371/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
372 * This is because we sometimes take all the spinlocks
373 * and creating that much locking depth can cause
374 * problems.
375 */
376#define NR_STRIPE_HASH_LOCKS 8
377#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
378
370struct r5worker { 379struct r5worker {
371 struct work_struct work; 380 struct work_struct work;
372 struct r5worker_group *group; 381 struct r5worker_group *group;
382 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
373 bool working; 383 bool working;
374}; 384};
375 385
@@ -382,6 +392,8 @@ struct r5worker_group {
382 392
383struct r5conf { 393struct r5conf {
384 struct hlist_head *stripe_hashtbl; 394 struct hlist_head *stripe_hashtbl;
395 /* each lock protects only the corresponding hash chains and inactive_list */
396 spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS];
385 struct mddev *mddev; 397 struct mddev *mddev;
386 int chunk_sectors; 398 int chunk_sectors;
387 int level, algorithm; 399 int level, algorithm;
@@ -462,7 +474,7 @@ struct r5conf {
462 * Free stripes pool 474 * Free stripes pool
463 */ 475 */
464 atomic_t active_stripes; 476 atomic_t active_stripes;
465 struct list_head inactive_list; 477 struct list_head inactive_list[NR_STRIPE_HASH_LOCKS];
466 struct llist_head released_stripes; 478 struct llist_head released_stripes;
467 wait_queue_head_t wait_for_stripe; 479 wait_queue_head_t wait_for_stripe;
468 wait_queue_head_t wait_for_overlap; 480 wait_queue_head_t wait_for_overlap;
@@ -477,6 +489,7 @@ struct r5conf {
477 * the new thread here until we fully activate the array. 489 * the new thread here until we fully activate the array.
478 */ 490 */
479 struct md_thread *thread; 491 struct md_thread *thread;
492 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
480 struct r5worker_group *worker_groups; 493 struct r5worker_group *worker_groups;
481 int group_cnt; 494 int group_cnt;
482 int worker_cnt_per_group; 495 int worker_cnt_per_group;