author    Shaohua Li <shli@kernel.org>  2013-11-13 23:16:17 -0500
committer NeilBrown <neilb@suse.de>     2013-11-13 23:20:58 -0500
commit    566c09c53455d7c4f1130928ef8071da1a24ea65 (patch)
tree      8cc5de15ad5aa9027ca028844c16de3dc9532e95 /drivers
parent    82e06c811163c4d853ed335d56c3378088bc89cc (diff)
raid5: relieve lock contention in get_active_stripe()
get_active_stripe() is the last place we have lock contention. It has two paths: one where the stripe isn't found and a new stripe is allocated, and one where the stripe is found.

The first path basically calls __find_stripe and init_stripe. It accesses conf->generation, conf->previous_raid_disks, conf->raid_disks, conf->prev_chunk_sectors, conf->chunk_sectors, conf->max_degraded, conf->prev_algo, conf->algorithm, the stripe_hashtbl and the inactive_list. Except for stripe_hashtbl and inactive_list, these fields change very rarely.

With this patch, we split the inactive_list and add new hash locks. Each free stripe belongs to a specific inactive list, determined by the stripe's lock_hash. Note that even a stripe that doesn't have a sector assigned yet has a lock_hash assigned. A stripe's inactive list is protected by a hash lock, which is also determined by its lock_hash. The lock_hash is derived from the current stripe_hashtbl hash, which guarantees that any stripe_hashtbl list is assigned to a specific lock_hash, so we can use the new hash lock to protect the stripe_hashtbl list too. The goal of the new hash locks is that we only need them in the first path of get_active_stripe(). Since there are several hash locks, lock contention is relieved significantly.

The first path of get_active_stripe() accesses other fields as well; since they change rarely, modifying them now requires taking conf->device_lock and all hash locks. For a slow path, this isn't a problem. If we need both device_lock and a hash lock, we always take the hash lock first.

The tricky part is release_stripe and friends, which need to take device_lock first. Neil's suggestion is to put inactive stripes on a temporary list and re-add them to the inactive_list after device_lock is released. In this way, we add stripes to the temporary list with device_lock held and remove stripes from the list with the hash lock held. We therefore don't allow concurrent access to a temporary list, which means each participant of release_stripe needs its own temporary list.

One downside is that free stripes are kept in their own inactive list and can't move across lists. By default we have 256 stripes in total and 8 lists, so each list holds 32 stripes. It's possible for one list to have a free stripe while another doesn't. The chance should be rare because stripe allocation is evenly distributed, and we can always allocate more stripes for the cache; several megabytes of memory isn't a big deal.

This completely removes the lock contention from the first path of get_active_stripe(). It slows down the second path a little, because we now need to take two locks, but since the hash lock isn't contended the overhead should be quite small (a few atomic instructions). The second path of get_active_stripe() (basically sequential writes, or random writes with a big request size) still has lock contention.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
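To make the locking scheme easier to follow before diving into the diff, here is a condensed sketch of the hash-lock helpers the patch introduces. Names and constants are taken from the diff itself; the code is abbreviated, not a verbatim copy of the patch.

	/* Eight hash locks; each one protects a slice of the stripe hash
	 * table plus the matching inactive_list[hash]. */
	#define NR_STRIPE_HASH_LOCKS 8
	#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)

	static inline int stripe_hash_locks_hash(sector_t sect)
	{
		/* lock_hash is derived from the same bits as the stripe hash,
		 * so every stripe_hashtbl chain maps to exactly one hash lock. */
		return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
	}

	/* Lock ordering: hash lock first, then device_lock. */
	static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
	{
		spin_lock_irq(conf->hash_locks + hash);
		spin_lock(&conf->device_lock);
	}

Callers that only touch one hash bucket (the common first path of get_active_stripe()) take just conf->hash_locks[hash]; the rare writers of fields such as conf->raid_disks or conf->algorithm take all hash locks plus device_lock via lock_all_device_hash_locks_irq(), as shown in the raid5.c hunks below.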
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/raid5.c  317
-rw-r--r--  drivers/md/raid5.h   15
2 files changed, 259 insertions, 73 deletions
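The release path in the diff that follows keeps device_lock and the hash locks apart by parking freed stripes on a per-caller temporary list. A minimal sketch of that pattern is shown here; the function names are illustrative only (the real code lives in do_release_stripe() and release_inactive_stripe_list() below), and wakeup and multi-list handling are omitted.

	/* Caller holds conf->device_lock: the freed stripe is only parked on
	 * the caller-private temporary list, so no hash lock is needed here. */
	static void park_inactive_stripe(struct stripe_head *sh,
					 struct list_head *temp_inactive_list)
	{
		list_add_tail(&sh->lru, temp_inactive_list);
	}

	/* Called after device_lock has been dropped: splice the private list
	 * back onto conf->inactive_list[hash] under that one hash lock. */
	static void splice_back_inactive(struct r5conf *conf,
					 struct list_head *temp_inactive_list,
					 int hash)
	{
		unsigned long flags;

		if (list_empty_careful(temp_inactive_list))
			return;
		spin_lock_irqsave(conf->hash_locks + hash, flags);
		list_splice_tail_init(temp_inactive_list, conf->inactive_list + hash);
		spin_unlock_irqrestore(conf->hash_locks + hash, flags);
		wake_up(&conf->wait_for_stripe);
	}

Because each temporary list belongs to exactly one task (a worker, raid5d, or a stack variable in release_stripe), adding under device_lock and removing under the hash lock never race with each other.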
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4bbcb7e26d12..93090b2afab4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
85 return &conf->stripe_hashtbl[hash]; 85 return &conf->stripe_hashtbl[hash];
86} 86}
87 87
88static inline int stripe_hash_locks_hash(sector_t sect)
89{
90 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
91}
92
93static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
94{
95 spin_lock_irq(conf->hash_locks + hash);
96 spin_lock(&conf->device_lock);
97}
98
99static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
100{
101 spin_unlock(&conf->device_lock);
102 spin_unlock_irq(conf->hash_locks + hash);
103}
104
105static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
106{
107 int i;
108 local_irq_disable();
109 spin_lock(conf->hash_locks);
110 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
111 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
112 spin_lock(&conf->device_lock);
113}
114
115static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
116{
117 int i;
118 spin_unlock(&conf->device_lock);
119 for (i = NR_STRIPE_HASH_LOCKS; i; i--)
120 spin_unlock(conf->hash_locks + i - 1);
121 local_irq_enable();
122}
123
88/* bio's attached to a stripe+device for I/O are linked together in bi_sector 124/* bio's attached to a stripe+device for I/O are linked together in bi_sector
89 * order without overlap. There may be several bio's per stripe+device, and 125 * order without overlap. There may be several bio's per stripe+device, and
90 * a bio could span several devices. 126 * a bio could span several devices.
@@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
249 } 285 }
250} 286}
251 287
252static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 288static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
289 struct list_head *temp_inactive_list)
253{ 290{
254 BUG_ON(!list_empty(&sh->lru)); 291 BUG_ON(!list_empty(&sh->lru));
255 BUG_ON(atomic_read(&conf->active_stripes)==0); 292 BUG_ON(atomic_read(&conf->active_stripes)==0);
@@ -278,19 +315,60 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
278 < IO_THRESHOLD) 315 < IO_THRESHOLD)
279 md_wakeup_thread(conf->mddev->thread); 316 md_wakeup_thread(conf->mddev->thread);
280 atomic_dec(&conf->active_stripes); 317 atomic_dec(&conf->active_stripes);
281 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 318 if (!test_bit(STRIPE_EXPANDING, &sh->state))
282 list_add_tail(&sh->lru, &conf->inactive_list); 319 list_add_tail(&sh->lru, temp_inactive_list);
283 wake_up(&conf->wait_for_stripe);
284 if (conf->retry_read_aligned)
285 md_wakeup_thread(conf->mddev->thread);
286 }
287 } 320 }
288} 321}
289 322
290static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 323static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
324 struct list_head *temp_inactive_list)
291{ 325{
292 if (atomic_dec_and_test(&sh->count)) 326 if (atomic_dec_and_test(&sh->count))
293 do_release_stripe(conf, sh); 327 do_release_stripe(conf, sh, temp_inactive_list);
328}
329
330/*
331 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
332 *
333 * Be careful: Only one task can add/delete stripes from temp_inactive_list at
334 * given time. Adding stripes only takes device lock, while deleting stripes
335 * only takes hash lock.
336 */
337static void release_inactive_stripe_list(struct r5conf *conf,
338 struct list_head *temp_inactive_list,
339 int hash)
340{
341 int size;
342 bool do_wakeup = false;
343 unsigned long flags;
344
345 if (hash == NR_STRIPE_HASH_LOCKS) {
346 size = NR_STRIPE_HASH_LOCKS;
347 hash = NR_STRIPE_HASH_LOCKS - 1;
348 } else
349 size = 1;
350 while (size) {
351 struct list_head *list = &temp_inactive_list[size - 1];
352
353 /*
354 * We don't hold any lock here yet, get_active_stripe() might
355 * remove stripes from the list
356 */
357 if (!list_empty_careful(list)) {
358 spin_lock_irqsave(conf->hash_locks + hash, flags);
359 list_splice_tail_init(list, conf->inactive_list + hash);
360 do_wakeup = true;
361 spin_unlock_irqrestore(conf->hash_locks + hash, flags);
362 }
363 size--;
364 hash--;
365 }
366
367 if (do_wakeup) {
368 wake_up(&conf->wait_for_stripe);
369 if (conf->retry_read_aligned)
370 md_wakeup_thread(conf->mddev->thread);
371 }
294} 372}
295 373
296static struct llist_node *llist_reverse_order(struct llist_node *head) 374static struct llist_node *llist_reverse_order(struct llist_node *head)
@@ -308,7 +386,8 @@ static struct llist_node *llist_reverse_order(struct llist_node *head)
308} 386}
309 387
310/* should hold conf->device_lock already */ 388/* should hold conf->device_lock already */
311static int release_stripe_list(struct r5conf *conf) 389static int release_stripe_list(struct r5conf *conf,
390 struct list_head *temp_inactive_list)
312{ 391{
313 struct stripe_head *sh; 392 struct stripe_head *sh;
314 int count = 0; 393 int count = 0;
@@ -317,6 +396,8 @@ static int release_stripe_list(struct r5conf *conf)
317 head = llist_del_all(&conf->released_stripes); 396 head = llist_del_all(&conf->released_stripes);
318 head = llist_reverse_order(head); 397 head = llist_reverse_order(head);
319 while (head) { 398 while (head) {
399 int hash;
400
320 sh = llist_entry(head, struct stripe_head, release_list); 401 sh = llist_entry(head, struct stripe_head, release_list);
321 head = llist_next(head); 402 head = llist_next(head);
322 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ 403 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
@@ -327,7 +408,8 @@ static int release_stripe_list(struct r5conf *conf)
327 * again, the count is always > 1. This is true for 408 * again, the count is always > 1. This is true for
328 * STRIPE_ON_UNPLUG_LIST bit too. 409 * STRIPE_ON_UNPLUG_LIST bit too.
329 */ 410 */
330 __release_stripe(conf, sh); 411 hash = sh->hash_lock_index;
412 __release_stripe(conf, sh, &temp_inactive_list[hash]);
331 count++; 413 count++;
332 } 414 }
333 415
@@ -338,6 +420,8 @@ static void release_stripe(struct stripe_head *sh)
338{ 420{
339 struct r5conf *conf = sh->raid_conf; 421 struct r5conf *conf = sh->raid_conf;
340 unsigned long flags; 422 unsigned long flags;
423 struct list_head list;
424 int hash;
341 bool wakeup; 425 bool wakeup;
342 426
343 if (unlikely(!conf->mddev->thread) || 427 if (unlikely(!conf->mddev->thread) ||
@@ -351,8 +435,11 @@ slow_path:
351 local_irq_save(flags); 435 local_irq_save(flags);
352 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 436 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
353 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 437 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
354 do_release_stripe(conf, sh); 438 INIT_LIST_HEAD(&list);
439 hash = sh->hash_lock_index;
440 do_release_stripe(conf, sh, &list);
355 spin_unlock(&conf->device_lock); 441 spin_unlock(&conf->device_lock);
442 release_inactive_stripe_list(conf, &list, hash);
356 } 443 }
357 local_irq_restore(flags); 444 local_irq_restore(flags);
358} 445}
@@ -377,18 +464,19 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
377 464
378 465
379/* find an idle stripe, make sure it is unhashed, and return it. */ 466/* find an idle stripe, make sure it is unhashed, and return it. */
380static struct stripe_head *get_free_stripe(struct r5conf *conf) 467static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
381{ 468{
382 struct stripe_head *sh = NULL; 469 struct stripe_head *sh = NULL;
383 struct list_head *first; 470 struct list_head *first;
384 471
385 if (list_empty(&conf->inactive_list)) 472 if (list_empty(conf->inactive_list + hash))
386 goto out; 473 goto out;
387 first = conf->inactive_list.next; 474 first = (conf->inactive_list + hash)->next;
388 sh = list_entry(first, struct stripe_head, lru); 475 sh = list_entry(first, struct stripe_head, lru);
389 list_del_init(first); 476 list_del_init(first);
390 remove_hash(sh); 477 remove_hash(sh);
391 atomic_inc(&conf->active_stripes); 478 atomic_inc(&conf->active_stripes);
479 BUG_ON(hash != sh->hash_lock_index);
392out: 480out:
393 return sh; 481 return sh;
394} 482}
@@ -431,7 +519,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
431static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 519static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
432{ 520{
433 struct r5conf *conf = sh->raid_conf; 521 struct r5conf *conf = sh->raid_conf;
434 int i; 522 int i, seq;
435 523
436 BUG_ON(atomic_read(&sh->count) != 0); 524 BUG_ON(atomic_read(&sh->count) != 0);
437 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 525 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
@@ -441,7 +529,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
441 (unsigned long long)sh->sector); 529 (unsigned long long)sh->sector);
442 530
443 remove_hash(sh); 531 remove_hash(sh);
444 532retry:
533 seq = read_seqcount_begin(&conf->gen_lock);
445 sh->generation = conf->generation - previous; 534 sh->generation = conf->generation - previous;
446 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 535 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
447 sh->sector = sector; 536 sh->sector = sector;
@@ -463,6 +552,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
463 dev->flags = 0; 552 dev->flags = 0;
464 raid5_build_block(sh, i, previous); 553 raid5_build_block(sh, i, previous);
465 } 554 }
555 if (read_seqcount_retry(&conf->gen_lock, seq))
556 goto retry;
466 insert_hash(conf, sh); 557 insert_hash(conf, sh);
467 sh->cpu = smp_processor_id(); 558 sh->cpu = smp_processor_id();
468} 559}
@@ -567,29 +658,31 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
567 int previous, int noblock, int noquiesce) 658 int previous, int noblock, int noquiesce)
568{ 659{
569 struct stripe_head *sh; 660 struct stripe_head *sh;
661 int hash = stripe_hash_locks_hash(sector);
570 662
571 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 663 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
572 664
573 spin_lock_irq(&conf->device_lock); 665 spin_lock_irq(conf->hash_locks + hash);
574 666
575 do { 667 do {
576 wait_event_lock_irq(conf->wait_for_stripe, 668 wait_event_lock_irq(conf->wait_for_stripe,
577 conf->quiesce == 0 || noquiesce, 669 conf->quiesce == 0 || noquiesce,
578 conf->device_lock); 670 *(conf->hash_locks + hash));
579 sh = __find_stripe(conf, sector, conf->generation - previous); 671 sh = __find_stripe(conf, sector, conf->generation - previous);
580 if (!sh) { 672 if (!sh) {
581 if (!conf->inactive_blocked) 673 if (!conf->inactive_blocked)
582 sh = get_free_stripe(conf); 674 sh = get_free_stripe(conf, hash);
583 if (noblock && sh == NULL) 675 if (noblock && sh == NULL)
584 break; 676 break;
585 if (!sh) { 677 if (!sh) {
586 conf->inactive_blocked = 1; 678 conf->inactive_blocked = 1;
587 wait_event_lock_irq(conf->wait_for_stripe, 679 wait_event_lock_irq(
588 !list_empty(&conf->inactive_list) && 680 conf->wait_for_stripe,
589 (atomic_read(&conf->active_stripes) 681 !list_empty(conf->inactive_list + hash) &&
590 < (conf->max_nr_stripes *3/4) 682 (atomic_read(&conf->active_stripes)
591 || !conf->inactive_blocked), 683 < (conf->max_nr_stripes * 3 / 4)
592 conf->device_lock); 684 || !conf->inactive_blocked),
685 *(conf->hash_locks + hash));
593 conf->inactive_blocked = 0; 686 conf->inactive_blocked = 0;
594 } else 687 } else
595 init_stripe(sh, sector, previous); 688 init_stripe(sh, sector, previous);
@@ -600,9 +693,11 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
600 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) 693 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
601 && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); 694 && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
602 } else { 695 } else {
696 spin_lock(&conf->device_lock);
603 if (!test_bit(STRIPE_HANDLE, &sh->state)) 697 if (!test_bit(STRIPE_HANDLE, &sh->state))
604 atomic_inc(&conf->active_stripes); 698 atomic_inc(&conf->active_stripes);
605 if (list_empty(&sh->lru) && 699 if (list_empty(&sh->lru) &&
700 !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) &&
606 !test_bit(STRIPE_EXPANDING, &sh->state)) 701 !test_bit(STRIPE_EXPANDING, &sh->state))
607 BUG(); 702 BUG();
608 list_del_init(&sh->lru); 703 list_del_init(&sh->lru);
@@ -610,6 +705,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
610 sh->group->stripes_cnt--; 705 sh->group->stripes_cnt--;
611 sh->group = NULL; 706 sh->group = NULL;
612 } 707 }
708 spin_unlock(&conf->device_lock);
613 } 709 }
614 } 710 }
615 } while (sh == NULL); 711 } while (sh == NULL);
@@ -617,7 +713,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
617 if (sh) 713 if (sh)
618 atomic_inc(&sh->count); 714 atomic_inc(&sh->count);
619 715
620 spin_unlock_irq(&conf->device_lock); 716 spin_unlock_irq(conf->hash_locks + hash);
621 return sh; 717 return sh;
622} 718}
623 719
@@ -1597,7 +1693,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1597 put_cpu(); 1693 put_cpu();
1598} 1694}
1599 1695
1600static int grow_one_stripe(struct r5conf *conf) 1696static int grow_one_stripe(struct r5conf *conf, int hash)
1601{ 1697{
1602 struct stripe_head *sh; 1698 struct stripe_head *sh;
1603 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1699 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
@@ -1613,6 +1709,7 @@ static int grow_one_stripe(struct r5conf *conf)
1613 kmem_cache_free(conf->slab_cache, sh); 1709 kmem_cache_free(conf->slab_cache, sh);
1614 return 0; 1710 return 0;
1615 } 1711 }
1712 sh->hash_lock_index = hash;
1616 /* we just created an active stripe so... */ 1713 /* we just created an active stripe so... */
1617 atomic_set(&sh->count, 1); 1714 atomic_set(&sh->count, 1);
1618 atomic_inc(&conf->active_stripes); 1715 atomic_inc(&conf->active_stripes);
@@ -1625,6 +1722,7 @@ static int grow_stripes(struct r5conf *conf, int num)
1625{ 1722{
1626 struct kmem_cache *sc; 1723 struct kmem_cache *sc;
1627 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1724 int devs = max(conf->raid_disks, conf->previous_raid_disks);
1725 int hash;
1628 1726
1629 if (conf->mddev->gendisk) 1727 if (conf->mddev->gendisk)
1630 sprintf(conf->cache_name[0], 1728 sprintf(conf->cache_name[0],
@@ -1642,9 +1740,13 @@ static int grow_stripes(struct r5conf *conf, int num)
1642 return 1; 1740 return 1;
1643 conf->slab_cache = sc; 1741 conf->slab_cache = sc;
1644 conf->pool_size = devs; 1742 conf->pool_size = devs;
1645 while (num--) 1743 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
1646 if (!grow_one_stripe(conf)) 1744 while (num--) {
1745 if (!grow_one_stripe(conf, hash))
1647 return 1; 1746 return 1;
1747 conf->max_nr_stripes++;
1748 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
1749 }
1648 return 0; 1750 return 0;
1649} 1751}
1650 1752
@@ -1702,6 +1804,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1702 int err; 1804 int err;
1703 struct kmem_cache *sc; 1805 struct kmem_cache *sc;
1704 int i; 1806 int i;
1807 int hash, cnt;
1705 1808
1706 if (newsize <= conf->pool_size) 1809 if (newsize <= conf->pool_size)
1707 return 0; /* never bother to shrink */ 1810 return 0; /* never bother to shrink */
@@ -1741,19 +1844,29 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1741 * OK, we have enough stripes, start collecting inactive 1844 * OK, we have enough stripes, start collecting inactive
1742 * stripes and copying them over 1845 * stripes and copying them over
1743 */ 1846 */
1847 hash = 0;
1848 cnt = 0;
1744 list_for_each_entry(nsh, &newstripes, lru) { 1849 list_for_each_entry(nsh, &newstripes, lru) {
1745 spin_lock_irq(&conf->device_lock); 1850 lock_device_hash_lock(conf, hash);
1746 wait_event_lock_irq(conf->wait_for_stripe, 1851 wait_event_cmd(conf->wait_for_stripe,
1747 !list_empty(&conf->inactive_list), 1852 !list_empty(conf->inactive_list + hash),
1748 conf->device_lock); 1853 unlock_device_hash_lock(conf, hash),
1749 osh = get_free_stripe(conf); 1854 lock_device_hash_lock(conf, hash));
1750 spin_unlock_irq(&conf->device_lock); 1855 osh = get_free_stripe(conf, hash);
1856 unlock_device_hash_lock(conf, hash);
1751 atomic_set(&nsh->count, 1); 1857 atomic_set(&nsh->count, 1);
1752 for(i=0; i<conf->pool_size; i++) 1858 for(i=0; i<conf->pool_size; i++)
1753 nsh->dev[i].page = osh->dev[i].page; 1859 nsh->dev[i].page = osh->dev[i].page;
1754 for( ; i<newsize; i++) 1860 for( ; i<newsize; i++)
1755 nsh->dev[i].page = NULL; 1861 nsh->dev[i].page = NULL;
1862 nsh->hash_lock_index = hash;
1756 kmem_cache_free(conf->slab_cache, osh); 1863 kmem_cache_free(conf->slab_cache, osh);
1864 cnt++;
1865 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
1866 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
1867 hash++;
1868 cnt = 0;
1869 }
1757 } 1870 }
1758 kmem_cache_destroy(conf->slab_cache); 1871 kmem_cache_destroy(conf->slab_cache);
1759 1872
@@ -1812,13 +1925,13 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1812 return err; 1925 return err;
1813} 1926}
1814 1927
1815static int drop_one_stripe(struct r5conf *conf) 1928static int drop_one_stripe(struct r5conf *conf, int hash)
1816{ 1929{
1817 struct stripe_head *sh; 1930 struct stripe_head *sh;
1818 1931
1819 spin_lock_irq(&conf->device_lock); 1932 spin_lock_irq(conf->hash_locks + hash);
1820 sh = get_free_stripe(conf); 1933 sh = get_free_stripe(conf, hash);
1821 spin_unlock_irq(&conf->device_lock); 1934 spin_unlock_irq(conf->hash_locks + hash);
1822 if (!sh) 1935 if (!sh)
1823 return 0; 1936 return 0;
1824 BUG_ON(atomic_read(&sh->count)); 1937 BUG_ON(atomic_read(&sh->count));
@@ -1830,8 +1943,10 @@ static int drop_one_stripe(struct r5conf *conf)
1830 1943
1831static void shrink_stripes(struct r5conf *conf) 1944static void shrink_stripes(struct r5conf *conf)
1832{ 1945{
1833 while (drop_one_stripe(conf)) 1946 int hash;
1834 ; 1947 for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
1948 while (drop_one_stripe(conf, hash))
1949 ;
1835 1950
1836 if (conf->slab_cache) 1951 if (conf->slab_cache)
1837 kmem_cache_destroy(conf->slab_cache); 1952 kmem_cache_destroy(conf->slab_cache);
@@ -3915,7 +4030,8 @@ static void raid5_activate_delayed(struct r5conf *conf)
3915 } 4030 }
3916} 4031}
3917 4032
3918static void activate_bit_delay(struct r5conf *conf) 4033static void activate_bit_delay(struct r5conf *conf,
4034 struct list_head *temp_inactive_list)
3919{ 4035{
3920 /* device_lock is held */ 4036 /* device_lock is held */
3921 struct list_head head; 4037 struct list_head head;
@@ -3923,9 +4039,11 @@ static void activate_bit_delay(struct r5conf *conf)
3923 list_del_init(&conf->bitmap_list); 4039 list_del_init(&conf->bitmap_list);
3924 while (!list_empty(&head)) { 4040 while (!list_empty(&head)) {
3925 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 4041 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
4042 int hash;
3926 list_del_init(&sh->lru); 4043 list_del_init(&sh->lru);
3927 atomic_inc(&sh->count); 4044 atomic_inc(&sh->count);
3928 __release_stripe(conf, sh); 4045 hash = sh->hash_lock_index;
4046 __release_stripe(conf, sh, &temp_inactive_list[hash]);
3929 } 4047 }
3930} 4048}
3931 4049
@@ -3941,7 +4059,7 @@ int md_raid5_congested(struct mddev *mddev, int bits)
3941 return 1; 4059 return 1;
3942 if (conf->quiesce) 4060 if (conf->quiesce)
3943 return 1; 4061 return 1;
3944 if (list_empty_careful(&conf->inactive_list)) 4062 if (atomic_read(&conf->active_stripes) == conf->max_nr_stripes)
3945 return 1; 4063 return 1;
3946 4064
3947 return 0; 4065 return 0;
@@ -4271,6 +4389,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
4271struct raid5_plug_cb { 4389struct raid5_plug_cb {
4272 struct blk_plug_cb cb; 4390 struct blk_plug_cb cb;
4273 struct list_head list; 4391 struct list_head list;
4392 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
4274}; 4393};
4275 4394
4276static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 4395static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
@@ -4281,6 +4400,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4281 struct mddev *mddev = cb->cb.data; 4400 struct mddev *mddev = cb->cb.data;
4282 struct r5conf *conf = mddev->private; 4401 struct r5conf *conf = mddev->private;
4283 int cnt = 0; 4402 int cnt = 0;
4403 int hash;
4284 4404
4285 if (cb->list.next && !list_empty(&cb->list)) { 4405 if (cb->list.next && !list_empty(&cb->list)) {
4286 spin_lock_irq(&conf->device_lock); 4406 spin_lock_irq(&conf->device_lock);
@@ -4298,11 +4418,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4298 * STRIPE_ON_RELEASE_LIST could be set here. In that 4418 * STRIPE_ON_RELEASE_LIST could be set here. In that
4299 * case, the count is always > 1 here 4419 * case, the count is always > 1 here
4300 */ 4420 */
4301 __release_stripe(conf, sh); 4421 hash = sh->hash_lock_index;
4422 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
4302 cnt++; 4423 cnt++;
4303 } 4424 }
4304 spin_unlock_irq(&conf->device_lock); 4425 spin_unlock_irq(&conf->device_lock);
4305 } 4426 }
4427 release_inactive_stripe_list(conf, cb->temp_inactive_list,
4428 NR_STRIPE_HASH_LOCKS);
4306 if (mddev->queue) 4429 if (mddev->queue)
4307 trace_block_unplug(mddev->queue, cnt, !from_schedule); 4430 trace_block_unplug(mddev->queue, cnt, !from_schedule);
4308 kfree(cb); 4431 kfree(cb);
@@ -4323,8 +4446,12 @@ static void release_stripe_plug(struct mddev *mddev,
4323 4446
4324 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 4447 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
4325 4448
4326 if (cb->list.next == NULL) 4449 if (cb->list.next == NULL) {
4450 int i;
4327 INIT_LIST_HEAD(&cb->list); 4451 INIT_LIST_HEAD(&cb->list);
4452 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
4453 INIT_LIST_HEAD(cb->temp_inactive_list + i);
4454 }
4328 4455
4329 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 4456 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
4330 list_add_tail(&sh->lru, &cb->list); 4457 list_add_tail(&sh->lru, &cb->list);
@@ -4969,27 +5096,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4969} 5096}
4970 5097
4971static int handle_active_stripes(struct r5conf *conf, int group, 5098static int handle_active_stripes(struct r5conf *conf, int group,
4972 struct r5worker *worker) 5099 struct r5worker *worker,
5100 struct list_head *temp_inactive_list)
4973{ 5101{
4974 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 5102 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
4975 int i, batch_size = 0; 5103 int i, batch_size = 0, hash;
5104 bool release_inactive = false;
4976 5105
4977 while (batch_size < MAX_STRIPE_BATCH && 5106 while (batch_size < MAX_STRIPE_BATCH &&
4978 (sh = __get_priority_stripe(conf, group)) != NULL) 5107 (sh = __get_priority_stripe(conf, group)) != NULL)
4979 batch[batch_size++] = sh; 5108 batch[batch_size++] = sh;
4980 5109
4981 if (batch_size == 0) 5110 if (batch_size == 0) {
4982 return batch_size; 5111 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5112 if (!list_empty(temp_inactive_list + i))
5113 break;
5114 if (i == NR_STRIPE_HASH_LOCKS)
5115 return batch_size;
5116 release_inactive = true;
5117 }
4983 spin_unlock_irq(&conf->device_lock); 5118 spin_unlock_irq(&conf->device_lock);
4984 5119
5120 release_inactive_stripe_list(conf, temp_inactive_list,
5121 NR_STRIPE_HASH_LOCKS);
5122
5123 if (release_inactive) {
5124 spin_lock_irq(&conf->device_lock);
5125 return 0;
5126 }
5127
4985 for (i = 0; i < batch_size; i++) 5128 for (i = 0; i < batch_size; i++)
4986 handle_stripe(batch[i]); 5129 handle_stripe(batch[i]);
4987 5130
4988 cond_resched(); 5131 cond_resched();
4989 5132
4990 spin_lock_irq(&conf->device_lock); 5133 spin_lock_irq(&conf->device_lock);
4991 for (i = 0; i < batch_size; i++) 5134 for (i = 0; i < batch_size; i++) {
4992 __release_stripe(conf, batch[i]); 5135 hash = batch[i]->hash_lock_index;
5136 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
5137 }
4993 return batch_size; 5138 return batch_size;
4994} 5139}
4995 5140
@@ -5010,9 +5155,10 @@ static void raid5_do_work(struct work_struct *work)
5010 while (1) { 5155 while (1) {
5011 int batch_size, released; 5156 int batch_size, released;
5012 5157
5013 released = release_stripe_list(conf); 5158 released = release_stripe_list(conf, worker->temp_inactive_list);
5014 5159
5015 batch_size = handle_active_stripes(conf, group_id, worker); 5160 batch_size = handle_active_stripes(conf, group_id, worker,
5161 worker->temp_inactive_list);
5016 worker->working = false; 5162 worker->working = false;
5017 if (!batch_size && !released) 5163 if (!batch_size && !released)
5018 break; 5164 break;
@@ -5051,7 +5197,7 @@ static void raid5d(struct md_thread *thread)
5051 struct bio *bio; 5197 struct bio *bio;
5052 int batch_size, released; 5198 int batch_size, released;
5053 5199
5054 released = release_stripe_list(conf); 5200 released = release_stripe_list(conf, conf->temp_inactive_list);
5055 5201
5056 if ( 5202 if (
5057 !list_empty(&conf->bitmap_list)) { 5203 !list_empty(&conf->bitmap_list)) {
@@ -5061,7 +5207,7 @@ static void raid5d(struct md_thread *thread)
5061 bitmap_unplug(mddev->bitmap); 5207 bitmap_unplug(mddev->bitmap);
5062 spin_lock_irq(&conf->device_lock); 5208 spin_lock_irq(&conf->device_lock);
5063 conf->seq_write = conf->seq_flush; 5209 conf->seq_write = conf->seq_flush;
5064 activate_bit_delay(conf); 5210 activate_bit_delay(conf, conf->temp_inactive_list);
5065 } 5211 }
5066 raid5_activate_delayed(conf); 5212 raid5_activate_delayed(conf);
5067 5213
@@ -5075,7 +5221,8 @@ static void raid5d(struct md_thread *thread)
5075 handled++; 5221 handled++;
5076 } 5222 }
5077 5223
5078 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); 5224 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
5225 conf->temp_inactive_list);
5079 if (!batch_size && !released) 5226 if (!batch_size && !released)
5080 break; 5227 break;
5081 handled += batch_size; 5228 handled += batch_size;
@@ -5111,22 +5258,29 @@ raid5_set_cache_size(struct mddev *mddev, int size)
5111{ 5258{
5112 struct r5conf *conf = mddev->private; 5259 struct r5conf *conf = mddev->private;
5113 int err; 5260 int err;
5261 int hash;
5114 5262
5115 if (size <= 16 || size > 32768) 5263 if (size <= 16 || size > 32768)
5116 return -EINVAL; 5264 return -EINVAL;
5265 hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
5117 while (size < conf->max_nr_stripes) { 5266 while (size < conf->max_nr_stripes) {
5118 if (drop_one_stripe(conf)) 5267 if (drop_one_stripe(conf, hash))
5119 conf->max_nr_stripes--; 5268 conf->max_nr_stripes--;
5120 else 5269 else
5121 break; 5270 break;
5271 hash--;
5272 if (hash < 0)
5273 hash = NR_STRIPE_HASH_LOCKS - 1;
5122 } 5274 }
5123 err = md_allow_write(mddev); 5275 err = md_allow_write(mddev);
5124 if (err) 5276 if (err)
5125 return err; 5277 return err;
5278 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
5126 while (size > conf->max_nr_stripes) { 5279 while (size > conf->max_nr_stripes) {
5127 if (grow_one_stripe(conf)) 5280 if (grow_one_stripe(conf, hash))
5128 conf->max_nr_stripes++; 5281 conf->max_nr_stripes++;
5129 else break; 5282 else break;
5283 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
5130 } 5284 }
5131 return 0; 5285 return 0;
5132} 5286}
@@ -5277,7 +5431,7 @@ static struct attribute_group raid5_attrs_group = {
5277 5431
5278static int alloc_thread_groups(struct r5conf *conf, int cnt) 5432static int alloc_thread_groups(struct r5conf *conf, int cnt)
5279{ 5433{
5280 int i, j; 5434 int i, j, k;
5281 ssize_t size; 5435 ssize_t size;
5282 struct r5worker *workers; 5436 struct r5worker *workers;
5283 5437
@@ -5307,8 +5461,12 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt)
5307 group->workers = workers + i * cnt; 5461 group->workers = workers + i * cnt;
5308 5462
5309 for (j = 0; j < cnt; j++) { 5463 for (j = 0; j < cnt; j++) {
5310 group->workers[j].group = group; 5464 struct r5worker *worker = group->workers + j;
5311 INIT_WORK(&group->workers[j].work, raid5_do_work); 5465 worker->group = group;
5466 INIT_WORK(&worker->work, raid5_do_work);
5467
5468 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
5469 INIT_LIST_HEAD(worker->temp_inactive_list + k);
5312 } 5470 }
5313 } 5471 }
5314 5472
@@ -5459,6 +5617,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5459 struct md_rdev *rdev; 5617 struct md_rdev *rdev;
5460 struct disk_info *disk; 5618 struct disk_info *disk;
5461 char pers_name[6]; 5619 char pers_name[6];
5620 int i;
5462 5621
5463 if (mddev->new_level != 5 5622 if (mddev->new_level != 5
5464 && mddev->new_level != 4 5623 && mddev->new_level != 4
@@ -5503,7 +5662,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5503 INIT_LIST_HEAD(&conf->hold_list); 5662 INIT_LIST_HEAD(&conf->hold_list);
5504 INIT_LIST_HEAD(&conf->delayed_list); 5663 INIT_LIST_HEAD(&conf->delayed_list);
5505 INIT_LIST_HEAD(&conf->bitmap_list); 5664 INIT_LIST_HEAD(&conf->bitmap_list);
5506 INIT_LIST_HEAD(&conf->inactive_list);
5507 init_llist_head(&conf->released_stripes); 5665 init_llist_head(&conf->released_stripes);
5508 atomic_set(&conf->active_stripes, 0); 5666 atomic_set(&conf->active_stripes, 0);
5509 atomic_set(&conf->preread_active_stripes, 0); 5667 atomic_set(&conf->preread_active_stripes, 0);
@@ -5529,6 +5687,21 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5529 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 5687 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
5530 goto abort; 5688 goto abort;
5531 5689
5690 /* We init hash_locks[0] separately to that it can be used
5691 * as the reference lock in the spin_lock_nest_lock() call
5692 * in lock_all_device_hash_locks_irq in order to convince
5693 * lockdep that we know what we are doing.
5694 */
5695 spin_lock_init(conf->hash_locks);
5696 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
5697 spin_lock_init(conf->hash_locks + i);
5698
5699 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5700 INIT_LIST_HEAD(conf->inactive_list + i);
5701
5702 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5703 INIT_LIST_HEAD(conf->temp_inactive_list + i);
5704
5532 conf->level = mddev->new_level; 5705 conf->level = mddev->new_level;
5533 if (raid5_alloc_percpu(conf) != 0) 5706 if (raid5_alloc_percpu(conf) != 0)
5534 goto abort; 5707 goto abort;
@@ -5569,7 +5742,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5569 else 5742 else
5570 conf->max_degraded = 1; 5743 conf->max_degraded = 1;
5571 conf->algorithm = mddev->new_layout; 5744 conf->algorithm = mddev->new_layout;
5572 conf->max_nr_stripes = NR_STRIPES;
5573 conf->reshape_progress = mddev->reshape_position; 5745 conf->reshape_progress = mddev->reshape_position;
5574 if (conf->reshape_progress != MaxSector) { 5746 if (conf->reshape_progress != MaxSector) {
5575 conf->prev_chunk_sectors = mddev->chunk_sectors; 5747 conf->prev_chunk_sectors = mddev->chunk_sectors;
@@ -5578,7 +5750,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5578 5750
5579 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 5751 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
5580 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 5752 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
5581 if (grow_stripes(conf, conf->max_nr_stripes)) { 5753 if (grow_stripes(conf, NR_STRIPES)) {
5582 printk(KERN_ERR 5754 printk(KERN_ERR
5583 "md/raid:%s: couldn't allocate %dkB for buffers\n", 5755 "md/raid:%s: couldn't allocate %dkB for buffers\n",
5584 mdname(mddev), memory); 5756 mdname(mddev), memory);
@@ -6483,27 +6655,28 @@ static void raid5_quiesce(struct mddev *mddev, int state)
6483 break; 6655 break;
6484 6656
6485 case 1: /* stop all writes */ 6657 case 1: /* stop all writes */
6486 spin_lock_irq(&conf->device_lock); 6658 lock_all_device_hash_locks_irq(conf);
6487 /* '2' tells resync/reshape to pause so that all 6659 /* '2' tells resync/reshape to pause so that all
6488 * active stripes can drain 6660 * active stripes can drain
6489 */ 6661 */
6490 conf->quiesce = 2; 6662 conf->quiesce = 2;
6491 wait_event_lock_irq(conf->wait_for_stripe, 6663 wait_event_cmd(conf->wait_for_stripe,
6492 atomic_read(&conf->active_stripes) == 0 && 6664 atomic_read(&conf->active_stripes) == 0 &&
6493 atomic_read(&conf->active_aligned_reads) == 0, 6665 atomic_read(&conf->active_aligned_reads) == 0,
6494 conf->device_lock); 6666 unlock_all_device_hash_locks_irq(conf),
6667 lock_all_device_hash_locks_irq(conf));
6495 conf->quiesce = 1; 6668 conf->quiesce = 1;
6496 spin_unlock_irq(&conf->device_lock); 6669 unlock_all_device_hash_locks_irq(conf);
6497 /* allow reshape to continue */ 6670 /* allow reshape to continue */
6498 wake_up(&conf->wait_for_overlap); 6671 wake_up(&conf->wait_for_overlap);
6499 break; 6672 break;
6500 6673
6501 case 0: /* re-enable writes */ 6674 case 0: /* re-enable writes */
6502 spin_lock_irq(&conf->device_lock); 6675 lock_all_device_hash_locks_irq(conf);
6503 conf->quiesce = 0; 6676 conf->quiesce = 0;
6504 wake_up(&conf->wait_for_stripe); 6677 wake_up(&conf->wait_for_stripe);
6505 wake_up(&conf->wait_for_overlap); 6678 wake_up(&conf->wait_for_overlap);
6506 spin_unlock_irq(&conf->device_lock); 6679 unlock_all_device_hash_locks_irq(conf);
6507 break; 6680 break;
6508 } 6681 }
6509} 6682}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2113ffa82c7a..a9e443a1116f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -205,6 +205,7 @@ struct stripe_head {
205 short pd_idx; /* parity disk index */ 205 short pd_idx; /* parity disk index */
206 short qd_idx; /* 'Q' disk index for raid6 */ 206 short qd_idx; /* 'Q' disk index for raid6 */
207 short ddf_layout;/* use DDF ordering to calculate Q */ 207 short ddf_layout;/* use DDF ordering to calculate Q */
208 short hash_lock_index;
208 unsigned long state; /* state flags */ 209 unsigned long state; /* state flags */
209 atomic_t count; /* nr of active thread/requests */ 210 atomic_t count; /* nr of active thread/requests */
210 int bm_seq; /* sequence number for bitmap flushes */ 211 int bm_seq; /* sequence number for bitmap flushes */
@@ -367,9 +368,18 @@ struct disk_info {
367 struct md_rdev *rdev, *replacement; 368 struct md_rdev *rdev, *replacement;
368}; 369};
369 370
371/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
372 * This is because we sometimes take all the spinlocks
373 * and creating that much locking depth can cause
374 * problems.
375 */
376#define NR_STRIPE_HASH_LOCKS 8
377#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
378
370struct r5worker { 379struct r5worker {
371 struct work_struct work; 380 struct work_struct work;
372 struct r5worker_group *group; 381 struct r5worker_group *group;
382 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
373 bool working; 383 bool working;
374}; 384};
375 385
@@ -382,6 +392,8 @@ struct r5worker_group {
382 392
383struct r5conf { 393struct r5conf {
384 struct hlist_head *stripe_hashtbl; 394 struct hlist_head *stripe_hashtbl;
395 /* only protect corresponding hash list and inactive_list */
396 spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS];
385 struct mddev *mddev; 397 struct mddev *mddev;
386 int chunk_sectors; 398 int chunk_sectors;
387 int level, algorithm; 399 int level, algorithm;
@@ -462,7 +474,7 @@ struct r5conf {
462 * Free stripes pool 474 * Free stripes pool
463 */ 475 */
464 atomic_t active_stripes; 476 atomic_t active_stripes;
465 struct list_head inactive_list; 477 struct list_head inactive_list[NR_STRIPE_HASH_LOCKS];
466 struct llist_head released_stripes; 478 struct llist_head released_stripes;
467 wait_queue_head_t wait_for_stripe; 479 wait_queue_head_t wait_for_stripe;
468 wait_queue_head_t wait_for_overlap; 480 wait_queue_head_t wait_for_overlap;
@@ -477,6 +489,7 @@ struct r5conf {
477 * the new thread here until we fully activate the array. 489 * the new thread here until we fully activate the array.
478 */ 490 */
479 struct md_thread *thread; 491 struct md_thread *thread;
492 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
480 struct r5worker_group *worker_groups; 493 struct r5worker_group *worker_groups;
481 int group_cnt; 494 int group_cnt;
482 int worker_cnt_per_group; 495 int worker_cnt_per_group;
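Putting the pieces together, the slow path of release_stripe() in the raid5.c hunk above ends up with the following shape (condensed from the diff; unrelated branches dropped):

	struct list_head list;
	unsigned long flags;
	int hash;

	local_irq_save(flags);
	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
		INIT_LIST_HEAD(&list);
		hash = sh->hash_lock_index;
		do_release_stripe(conf, sh, &list);	/* parks sh on &list */
		spin_unlock(&conf->device_lock);
		release_inactive_stripe_list(conf, &list, hash);
	}
	local_irq_restore(flags);

Since &list lives on the caller's stack, no other task can touch it between the device_lock section and the hash-lock section, which is exactly the constraint release_inactive_stripe_list() relies on.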