author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-11-20 16:05:25 -0500
---|---|---
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-11-20 16:05:25 -0500
commit | 6d6e352c80f22c446d933ca8103e02bac1f09129 (patch) |
tree | 248a6a7ebc5ea95986da5bccdd6d75b255cf28e4 /drivers/md/raid5.c |
parent | b4789b8e6be3151a955ade74872822f30e8cd914 (diff) |
parent | 60aaf933854511630e16be4efe0f96485e132de4 (diff) |
Merge tag 'md/3.13' of git://neil.brown.name/md
Pull md update from Neil Brown:
"Mostly optimisations and obscure bug fixes.
- raid5 gets less lock contention
- raid1 gets less contention between normal-io and resync-io during
resync"
* tag 'md/3.13' of git://neil.brown.name/md:
md/raid5: Use conf->device_lock to protect changing of multi-thread resources.
md/raid5: Before freeing old multi-thread workers, flush them.
md/raid5: For stripes with R5_ReadNoMerge, replace REQ_FLUSH with REQ_NOMERGE.
UAPI: include <asm/byteorder.h> in linux/raid/md_p.h
raid1: Rewrite the implementation of iobarrier.
raid1: Add some macros to make the code clearer.
raid1: Replace raise_barrier/lower_barrier with freeze_array/unfreeze_array when reconfiguring the array.
raid1: Add a field array_frozen to indicate whether the array is frozen.
md: Convert use of typedef ctl_table to struct ctl_table
md/raid5: avoid deadlock when the raid5 array has unacknowledged badblocks during md_stop_writes.
md: use MD_RECOVERY_INTR instead of kthread_should_stop in resync thread.
md: fix some places where mddev_lock return value is not checked.
raid5: Retry with the R5_ReadNoMerge flag after hitting a read error.
raid5: relieve lock contention in get_active_stripe()
raid5: relieve lock contention in get_active_stripe()
wait: add wait_event_cmd()
md/raid5.c: add proper locking to error path of raid5_start_reshape.
md: fix calculation of stacking limits on level change.
raid5: Use slow_path to release stripe when mddev->thread is null
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 420
1 file changed, 319 insertions(+), 101 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7f0e17a27aeb..47da0af6322b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) | |||
85 | return &conf->stripe_hashtbl[hash]; | 85 | return &conf->stripe_hashtbl[hash]; |
86 | } | 86 | } |
87 | 87 | ||
88 | static inline int stripe_hash_locks_hash(sector_t sect) | ||
89 | { | ||
90 | return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; | ||
91 | } | ||
92 | |||
93 | static inline void lock_device_hash_lock(struct r5conf *conf, int hash) | ||
94 | { | ||
95 | spin_lock_irq(conf->hash_locks + hash); | ||
96 | spin_lock(&conf->device_lock); | ||
97 | } | ||
98 | |||
99 | static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) | ||
100 | { | ||
101 | spin_unlock(&conf->device_lock); | ||
102 | spin_unlock_irq(conf->hash_locks + hash); | ||
103 | } | ||
104 | |||
105 | static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) | ||
106 | { | ||
107 | int i; | ||
108 | local_irq_disable(); | ||
109 | spin_lock(conf->hash_locks); | ||
110 | for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) | ||
111 | spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); | ||
112 | spin_lock(&conf->device_lock); | ||
113 | } | ||
114 | |||
115 | static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) | ||
116 | { | ||
117 | int i; | ||
118 | spin_unlock(&conf->device_lock); | ||
119 | for (i = NR_STRIPE_HASH_LOCKS; i; i--) | ||
120 | spin_unlock(conf->hash_locks + i - 1); | ||
121 | local_irq_enable(); | ||
122 | } | ||
123 | |||
88 | /* bio's attached to a stripe+device for I/O are linked together in bi_sector | 124 | /* bio's attached to a stripe+device for I/O are linked together in bi_sector |
89 | * order without overlap. There may be several bio's per stripe+device, and | 125 | * order without overlap. There may be several bio's per stripe+device, and |
90 | * a bio could span several devices. | 126 | * a bio could span several devices. |
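The new helpers split the single conf->device_lock hot spot into NR_STRIPE_HASH_LOCKS bucket locks with a fixed ordering: a hash lock is always taken before device_lock, and lock_all_device_hash_locks_irq() nests every bucket against hash_locks[0] via spin_lock_nest_lock() so lockdep accepts the bulk acquisition. A minimal userspace sketch of the partitioning idea, with pthread mutexes standing in for spinlocks (the NR_LOCKS and STRIPE_SHIFT values here are illustrative, not the kernel's):

```c
#include <pthread.h>
#include <stdint.h>

#define NR_LOCKS     8                /* stand-in for NR_STRIPE_HASH_LOCKS */
#define LOCKS_MASK   (NR_LOCKS - 1)
#define STRIPE_SHIFT 3                /* illustrative value */

static pthread_mutex_t hash_locks[NR_LOCKS]; /* one lock per hash bucket */
static pthread_mutex_t device_lock;          /* global lock, always taken second */

static void init_locks(void)
{
	pthread_mutex_init(&device_lock, NULL);
	for (int i = 0; i < NR_LOCKS; i++)
		pthread_mutex_init(&hash_locks[i], NULL);
}

static int stripe_hash_locks_hash(uint64_t sect)
{
	return (int)((sect >> STRIPE_SHIFT) & LOCKS_MASK);
}

/* Mirrors lock_device_hash_lock(): bucket lock first, then the global
 * lock, so every path agrees on the ordering and cannot deadlock. */
static void lock_bucket_and_device(int hash)
{
	pthread_mutex_lock(&hash_locks[hash]);
	pthread_mutex_lock(&device_lock);
}

static void unlock_bucket_and_device(int hash)
{
	pthread_mutex_unlock(&device_lock);
	pthread_mutex_unlock(&hash_locks[hash]);
}
```

Threads working on stripes in different buckets now contend only for device_lock's much shorter critical sections rather than for one lock covering the whole stripe cache.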
@@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) | |||
249 | } | 285 | } |
250 | } | 286 | } |
251 | 287 | ||
252 | static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) | 288 | static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, |
289 | struct list_head *temp_inactive_list) | ||
253 | { | 290 | { |
254 | BUG_ON(!list_empty(&sh->lru)); | 291 | BUG_ON(!list_empty(&sh->lru)); |
255 | BUG_ON(atomic_read(&conf->active_stripes)==0); | 292 | BUG_ON(atomic_read(&conf->active_stripes)==0); |
@@ -278,23 +315,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) | |||
278 | < IO_THRESHOLD) | 315 | < IO_THRESHOLD) |
279 | md_wakeup_thread(conf->mddev->thread); | 316 | md_wakeup_thread(conf->mddev->thread); |
280 | atomic_dec(&conf->active_stripes); | 317 | atomic_dec(&conf->active_stripes); |
281 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) { | 318 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) |
282 | list_add_tail(&sh->lru, &conf->inactive_list); | 319 | list_add_tail(&sh->lru, temp_inactive_list); |
283 | wake_up(&conf->wait_for_stripe); | ||
284 | if (conf->retry_read_aligned) | ||
285 | md_wakeup_thread(conf->mddev->thread); | ||
286 | } | ||
287 | } | 320 | } |
288 | } | 321 | } |
289 | 322 | ||
290 | static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) | 323 | static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, |
324 | struct list_head *temp_inactive_list) | ||
291 | { | 325 | { |
292 | if (atomic_dec_and_test(&sh->count)) | 326 | if (atomic_dec_and_test(&sh->count)) |
293 | do_release_stripe(conf, sh); | 327 | do_release_stripe(conf, sh, temp_inactive_list); |
328 | } | ||
329 | |||
330 | /* | ||
331 | * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list | ||
332 | * | ||
333 | * Be careful: Only one task can add/delete stripes from temp_inactive_list at | ||
334 | * given time. Adding stripes only takes device lock, while deleting stripes | ||
335 | * only takes hash lock. | ||
336 | */ | ||
337 | static void release_inactive_stripe_list(struct r5conf *conf, | ||
338 | struct list_head *temp_inactive_list, | ||
339 | int hash) | ||
340 | { | ||
341 | int size; | ||
342 | bool do_wakeup = false; | ||
343 | unsigned long flags; | ||
344 | |||
345 | if (hash == NR_STRIPE_HASH_LOCKS) { | ||
346 | size = NR_STRIPE_HASH_LOCKS; | ||
347 | hash = NR_STRIPE_HASH_LOCKS - 1; | ||
348 | } else | ||
349 | size = 1; | ||
350 | while (size) { | ||
351 | struct list_head *list = &temp_inactive_list[size - 1]; | ||
352 | |||
353 | /* | ||
354 | * We don't hold any lock here yet, get_active_stripe() might | ||
355 | * remove stripes from the list | ||
356 | */ | ||
357 | if (!list_empty_careful(list)) { | ||
358 | spin_lock_irqsave(conf->hash_locks + hash, flags); | ||
359 | if (list_empty(conf->inactive_list + hash) && | ||
360 | !list_empty(list)) | ||
361 | atomic_dec(&conf->empty_inactive_list_nr); | ||
362 | list_splice_tail_init(list, conf->inactive_list + hash); | ||
363 | do_wakeup = true; | ||
364 | spin_unlock_irqrestore(conf->hash_locks + hash, flags); | ||
365 | } | ||
366 | size--; | ||
367 | hash--; | ||
368 | } | ||
369 | |||
370 | if (do_wakeup) { | ||
371 | wake_up(&conf->wait_for_stripe); | ||
372 | if (conf->retry_read_aligned) | ||
373 | md_wakeup_thread(conf->mddev->thread); | ||
374 | } | ||
294 | } | 375 | } |
295 | 376 | ||
296 | /* should hold conf->device_lock already */ | 377 | /* should hold conf->device_lock already */ |
297 | static int release_stripe_list(struct r5conf *conf) | 378 | static int release_stripe_list(struct r5conf *conf, |
379 | struct list_head *temp_inactive_list) | ||
298 | { | 380 | { |
299 | struct stripe_head *sh; | 381 | struct stripe_head *sh; |
300 | int count = 0; | 382 | int count = 0; |
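release_inactive_stripe_list() is the second half of a two-phase release: do_release_stripe() merely queues a stripe on a caller-private temp_inactive_list while device_lock is held, and the splice into the shared per-bucket inactive_list happens later under only that bucket's hash lock, with a single wakeup covering the whole batch. A rough userspace sketch of the batching shape (the list layout and names are illustrative, not the kernel's list API):

```c
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

#define NR_BUCKETS 8

struct stripe {
	struct stripe *next;
};

static pthread_mutex_t bucket_lock[NR_BUCKETS]; /* pthread_mutex_init() at startup */
static struct stripe *inactive[NR_BUCKETS];     /* shared per-bucket free lists */
static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wait_for_stripe = PTHREAD_COND_INITIALIZER;

/* Splice a privately built batch into its bucket, holding only that
 * bucket's lock, then issue one wakeup for the whole batch. */
static void release_batch(struct stripe *batch, int hash)
{
	bool do_wakeup = false;

	if (batch) {
		struct stripe *tail = batch;

		while (tail->next)
			tail = tail->next;
		pthread_mutex_lock(&bucket_lock[hash]);
		tail->next = inactive[hash];
		inactive[hash] = batch;
		do_wakeup = true;
		pthread_mutex_unlock(&bucket_lock[hash]);
	}
	if (do_wakeup) {
		pthread_mutex_lock(&wait_lock);
		pthread_cond_broadcast(&wait_for_stripe);
		pthread_mutex_unlock(&wait_lock);
	}
}
```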
@@ -303,6 +385,8 @@ static int release_stripe_list(struct r5conf *conf) | |||
303 | head = llist_del_all(&conf->released_stripes); | 385 | head = llist_del_all(&conf->released_stripes); |
304 | head = llist_reverse_order(head); | 386 | head = llist_reverse_order(head); |
305 | while (head) { | 387 | while (head) { |
388 | int hash; | ||
389 | |||
306 | sh = llist_entry(head, struct stripe_head, release_list); | 390 | sh = llist_entry(head, struct stripe_head, release_list); |
307 | head = llist_next(head); | 391 | head = llist_next(head); |
308 | /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */ | 392 | /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */ |
@@ -313,7 +397,8 @@ static int release_stripe_list(struct r5conf *conf) | |||
313 | * again, the count is always > 1. This is true for | 397 | * again, the count is always > 1. This is true for |
314 | * STRIPE_ON_UNPLUG_LIST bit too. | 398 | * STRIPE_ON_UNPLUG_LIST bit too. |
315 | */ | 399 | */ |
316 | __release_stripe(conf, sh); | 400 | hash = sh->hash_lock_index; |
401 | __release_stripe(conf, sh, &temp_inactive_list[hash]); | ||
317 | count++; | 402 | count++; |
318 | } | 403 | } |
319 | 404 | ||
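The list being drained here, conf->released_stripes, is a lock-free llist: the release_stripe() fast path pushes onto it with no lock at all, and the consumer detaches the entire list in one atomic exchange, then reverses it to recover FIFO order. The same push / grab-all / reverse shape in portable C11 atomics (a sketch of the idea; the kernel uses llist_add(), llist_del_all() and llist_reverse_order()):

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct lnode {
	struct lnode *next;
};

static _Atomic(struct lnode *) released;

/* Lock-free push; returns true if the list was empty, the same
 * "first entry, kick the consumer" hint that llist_add() gives. */
static bool push(struct lnode *n)
{
	struct lnode *old = atomic_load(&released);

	do {
		n->next = old;
	} while (!atomic_compare_exchange_weak(&released, &old, n));
	return old == NULL;
}

/* Detach everything at once, then reverse so entries come back out
 * in the order they were pushed. */
static struct lnode *del_all_reversed(void)
{
	struct lnode *head = atomic_exchange(&released, NULL);
	struct lnode *rev = NULL;

	while (head) {
		struct lnode *next = head->next;

		head->next = rev;
		rev = head;
		head = next;
	}
	return rev;
}
```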
@@ -324,9 +409,12 @@ static void release_stripe(struct stripe_head *sh) | |||
324 | { | 409 | { |
325 | struct r5conf *conf = sh->raid_conf; | 410 | struct r5conf *conf = sh->raid_conf; |
326 | unsigned long flags; | 411 | unsigned long flags; |
412 | struct list_head list; | ||
413 | int hash; | ||
327 | bool wakeup; | 414 | bool wakeup; |
328 | 415 | ||
329 | if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) | 416 | if (unlikely(!conf->mddev->thread) || |
417 | test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) | ||
330 | goto slow_path; | 418 | goto slow_path; |
331 | wakeup = llist_add(&sh->release_list, &conf->released_stripes); | 419 | wakeup = llist_add(&sh->release_list, &conf->released_stripes); |
332 | if (wakeup) | 420 | if (wakeup) |
@@ -336,8 +424,11 @@ slow_path: | |||
336 | local_irq_save(flags); | 424 | local_irq_save(flags); |
337 | /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ | 425 | /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ |
338 | if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { | 426 | if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { |
339 | do_release_stripe(conf, sh); | 427 | INIT_LIST_HEAD(&list); |
428 | hash = sh->hash_lock_index; | ||
429 | do_release_stripe(conf, sh, &list); | ||
340 | spin_unlock(&conf->device_lock); | 430 | spin_unlock(&conf->device_lock); |
431 | release_inactive_stripe_list(conf, &list, hash); | ||
341 | } | 432 | } |
342 | local_irq_restore(flags); | 433 | local_irq_restore(flags); |
343 | } | 434 | } |
@@ -362,18 +453,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) | |||
362 | 453 | ||
363 | 454 | ||
364 | /* find an idle stripe, make sure it is unhashed, and return it. */ | 455 | /* find an idle stripe, make sure it is unhashed, and return it. */ |
365 | static struct stripe_head *get_free_stripe(struct r5conf *conf) | 456 | static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) |
366 | { | 457 | { |
367 | struct stripe_head *sh = NULL; | 458 | struct stripe_head *sh = NULL; |
368 | struct list_head *first; | 459 | struct list_head *first; |
369 | 460 | ||
370 | if (list_empty(&conf->inactive_list)) | 461 | if (list_empty(conf->inactive_list + hash)) |
371 | goto out; | 462 | goto out; |
372 | first = conf->inactive_list.next; | 463 | first = (conf->inactive_list + hash)->next; |
373 | sh = list_entry(first, struct stripe_head, lru); | 464 | sh = list_entry(first, struct stripe_head, lru); |
374 | list_del_init(first); | 465 | list_del_init(first); |
375 | remove_hash(sh); | 466 | remove_hash(sh); |
376 | atomic_inc(&conf->active_stripes); | 467 | atomic_inc(&conf->active_stripes); |
468 | BUG_ON(hash != sh->hash_lock_index); | ||
469 | if (list_empty(conf->inactive_list + hash)) | ||
470 | atomic_inc(&conf->empty_inactive_list_nr); | ||
377 | out: | 471 | out: |
378 | return sh; | 472 | return sh; |
379 | } | 473 | } |
@@ -416,7 +510,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, | |||
416 | static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | 510 | static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) |
417 | { | 511 | { |
418 | struct r5conf *conf = sh->raid_conf; | 512 | struct r5conf *conf = sh->raid_conf; |
419 | int i; | 513 | int i, seq; |
420 | 514 | ||
421 | BUG_ON(atomic_read(&sh->count) != 0); | 515 | BUG_ON(atomic_read(&sh->count) != 0); |
422 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); | 516 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); |
@@ -426,7 +520,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | |||
426 | (unsigned long long)sh->sector); | 520 | (unsigned long long)sh->sector); |
427 | 521 | ||
428 | remove_hash(sh); | 522 | remove_hash(sh); |
429 | 523 | retry: | |
524 | seq = read_seqcount_begin(&conf->gen_lock); | ||
430 | sh->generation = conf->generation - previous; | 525 | sh->generation = conf->generation - previous; |
431 | sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; | 526 | sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; |
432 | sh->sector = sector; | 527 | sh->sector = sector; |
@@ -448,6 +543,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | |||
448 | dev->flags = 0; | 543 | dev->flags = 0; |
449 | raid5_build_block(sh, i, previous); | 544 | raid5_build_block(sh, i, previous); |
450 | } | 545 | } |
546 | if (read_seqcount_retry(&conf->gen_lock, seq)) | ||
547 | goto retry; | ||
451 | insert_hash(conf, sh); | 548 | insert_hash(conf, sh); |
452 | sh->cpu = smp_processor_id(); | 549 | sh->cpu = smp_processor_id(); |
453 | } | 550 | } |
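The retry loop guards the generation-dependent fields (disk count, layout, parity placement) with conf->gen_lock, a seqcount: if raid5_start_reshape() publishes a new generation while this stripe is being initialised, the reader simply redoes the work instead of taking a lock. A simplified userspace analogue of the read side (illustrative only; the kernel's seqcount_t also provides write-side serialisation and memory barriers):

```c
#include <stdatomic.h>

static _Atomic unsigned int gen_seq;  /* even = stable, odd = write in progress */
static int raid_disks, generation;    /* fields guarded by the seqcount */

static unsigned int read_begin(void)
{
	unsigned int s;

	while ((s = atomic_load(&gen_seq)) & 1)
		;                              /* writer active: wait for even */
	return s;
}

static int read_retry(unsigned int s)
{
	return atomic_load(&gen_seq) != s;     /* changed => snapshot is torn */
}

/* Take a consistent snapshot of the guarded fields, retrying if a
 * writer raced with us -- the same shape as init_stripe()'s retry. */
static void snapshot_generation(int *disks, int *gen)
{
	unsigned int s;

	do {
		s = read_begin();
		*disks = raid_disks;
		*gen = generation;
	} while (read_retry(s));
}
```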
@@ -552,29 +649,31 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
552 | int previous, int noblock, int noquiesce) | 649 | int previous, int noblock, int noquiesce) |
553 | { | 650 | { |
554 | struct stripe_head *sh; | 651 | struct stripe_head *sh; |
652 | int hash = stripe_hash_locks_hash(sector); | ||
555 | 653 | ||
556 | pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); | 654 | pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); |
557 | 655 | ||
558 | spin_lock_irq(&conf->device_lock); | 656 | spin_lock_irq(conf->hash_locks + hash); |
559 | 657 | ||
560 | do { | 658 | do { |
561 | wait_event_lock_irq(conf->wait_for_stripe, | 659 | wait_event_lock_irq(conf->wait_for_stripe, |
562 | conf->quiesce == 0 || noquiesce, | 660 | conf->quiesce == 0 || noquiesce, |
563 | conf->device_lock); | 661 | *(conf->hash_locks + hash)); |
564 | sh = __find_stripe(conf, sector, conf->generation - previous); | 662 | sh = __find_stripe(conf, sector, conf->generation - previous); |
565 | if (!sh) { | 663 | if (!sh) { |
566 | if (!conf->inactive_blocked) | 664 | if (!conf->inactive_blocked) |
567 | sh = get_free_stripe(conf); | 665 | sh = get_free_stripe(conf, hash); |
568 | if (noblock && sh == NULL) | 666 | if (noblock && sh == NULL) |
569 | break; | 667 | break; |
570 | if (!sh) { | 668 | if (!sh) { |
571 | conf->inactive_blocked = 1; | 669 | conf->inactive_blocked = 1; |
572 | wait_event_lock_irq(conf->wait_for_stripe, | 670 | wait_event_lock_irq( |
573 | !list_empty(&conf->inactive_list) && | 671 | conf->wait_for_stripe, |
574 | (atomic_read(&conf->active_stripes) | 672 | !list_empty(conf->inactive_list + hash) && |
575 | < (conf->max_nr_stripes *3/4) | 673 | (atomic_read(&conf->active_stripes) |
576 | || !conf->inactive_blocked), | 674 | < (conf->max_nr_stripes * 3 / 4) |
577 | conf->device_lock); | 675 | || !conf->inactive_blocked), |
676 | *(conf->hash_locks + hash)); | ||
578 | conf->inactive_blocked = 0; | 677 | conf->inactive_blocked = 0; |
579 | } else | 678 | } else |
580 | init_stripe(sh, sector, previous); | 679 | init_stripe(sh, sector, previous); |
@@ -585,9 +684,11 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
585 | && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) | 684 | && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) |
586 | && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); | 685 | && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); |
587 | } else { | 686 | } else { |
687 | spin_lock(&conf->device_lock); | ||
588 | if (!test_bit(STRIPE_HANDLE, &sh->state)) | 688 | if (!test_bit(STRIPE_HANDLE, &sh->state)) |
589 | atomic_inc(&conf->active_stripes); | 689 | atomic_inc(&conf->active_stripes); |
590 | if (list_empty(&sh->lru) && | 690 | if (list_empty(&sh->lru) && |
691 | !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) && | ||
591 | !test_bit(STRIPE_EXPANDING, &sh->state)) | 692 | !test_bit(STRIPE_EXPANDING, &sh->state)) |
592 | BUG(); | 693 | BUG(); |
593 | list_del_init(&sh->lru); | 694 | list_del_init(&sh->lru); |
@@ -595,6 +696,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
595 | sh->group->stripes_cnt--; | 696 | sh->group->stripes_cnt--; |
596 | sh->group = NULL; | 697 | sh->group = NULL; |
597 | } | 698 | } |
699 | spin_unlock(&conf->device_lock); | ||
598 | } | 700 | } |
599 | } | 701 | } |
600 | } while (sh == NULL); | 702 | } while (sh == NULL); |
@@ -602,7 +704,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
602 | if (sh) | 704 | if (sh) |
603 | atomic_inc(&sh->count); | 705 | atomic_inc(&sh->count); |
604 | 706 | ||
605 | spin_unlock_irq(&conf->device_lock); | 707 | spin_unlock_irq(conf->hash_locks + hash); |
606 | return sh; | 708 | return sh; |
607 | } | 709 | } |
608 | 710 | ||
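With the conversion complete, get_active_stripe() does its lookup, allocation and sleeping entirely under one bucket's hash lock; device_lock is now taken only briefly, nested inside the hash lock, when an already-cached stripe's global accounting and scheduling lists need adjusting. Condensed into a userspace sketch (structures and helpers are illustrative, not the kernel's):

```c
#include <pthread.h>
#include <stddef.h>

#define NR_BUCKETS 8

struct stripe {
	struct stripe *next;
};

static pthread_mutex_t hash_locks[NR_BUCKETS]; /* pthread_mutex_init() at startup */
static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
static struct stripe *hashtbl[NR_BUCKETS];     /* active stripes, by bucket */
static struct stripe *inactive[NR_BUCKETS];    /* free stripes, by bucket */

/* Find-or-allocate touches only one bucket's lock; the global lock is
 * taken briefly and only on the already-active path. */
static struct stripe *get_stripe(int hash)
{
	struct stripe *sh;

	pthread_mutex_lock(&hash_locks[hash]);
	sh = hashtbl[hash];                  /* __find_stripe() analogue */
	if (sh) {
		pthread_mutex_lock(&device_lock);
		/* adjust global LRU / accounting, as the kernel does */
		pthread_mutex_unlock(&device_lock);
	} else if (inactive[hash]) {         /* get_free_stripe() analogue */
		sh = inactive[hash];
		inactive[hash] = sh->next;
		sh->next = hashtbl[hash];    /* init_stripe()/insert_hash() */
		hashtbl[hash] = sh;
	}
	pthread_mutex_unlock(&hash_locks[hash]);
	return sh;                           /* NULL = caller must wait */
}
```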
@@ -758,7 +860,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
758 | bi->bi_sector = (sh->sector | 860 | bi->bi_sector = (sh->sector |
759 | + rdev->data_offset); | 861 | + rdev->data_offset); |
760 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) | 862 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) |
761 | bi->bi_rw |= REQ_FLUSH; | 863 | bi->bi_rw |= REQ_NOMERGE; |
762 | 864 | ||
763 | bi->bi_vcnt = 1; | 865 | bi->bi_vcnt = 1; |
764 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 866 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
@@ -1582,7 +1684,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
1582 | put_cpu(); | 1684 | put_cpu(); |
1583 | } | 1685 | } |
1584 | 1686 | ||
1585 | static int grow_one_stripe(struct r5conf *conf) | 1687 | static int grow_one_stripe(struct r5conf *conf, int hash) |
1586 | { | 1688 | { |
1587 | struct stripe_head *sh; | 1689 | struct stripe_head *sh; |
1588 | sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); | 1690 | sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); |
@@ -1598,6 +1700,7 @@ static int grow_one_stripe(struct r5conf *conf) | |||
1598 | kmem_cache_free(conf->slab_cache, sh); | 1700 | kmem_cache_free(conf->slab_cache, sh); |
1599 | return 0; | 1701 | return 0; |
1600 | } | 1702 | } |
1703 | sh->hash_lock_index = hash; | ||
1601 | /* we just created an active stripe so... */ | 1704 | /* we just created an active stripe so... */ |
1602 | atomic_set(&sh->count, 1); | 1705 | atomic_set(&sh->count, 1); |
1603 | atomic_inc(&conf->active_stripes); | 1706 | atomic_inc(&conf->active_stripes); |
@@ -1610,6 +1713,7 @@ static int grow_stripes(struct r5conf *conf, int num) | |||
1610 | { | 1713 | { |
1611 | struct kmem_cache *sc; | 1714 | struct kmem_cache *sc; |
1612 | int devs = max(conf->raid_disks, conf->previous_raid_disks); | 1715 | int devs = max(conf->raid_disks, conf->previous_raid_disks); |
1716 | int hash; | ||
1613 | 1717 | ||
1614 | if (conf->mddev->gendisk) | 1718 | if (conf->mddev->gendisk) |
1615 | sprintf(conf->cache_name[0], | 1719 | sprintf(conf->cache_name[0], |
@@ -1627,9 +1731,13 @@ static int grow_stripes(struct r5conf *conf, int num) | |||
1627 | return 1; | 1731 | return 1; |
1628 | conf->slab_cache = sc; | 1732 | conf->slab_cache = sc; |
1629 | conf->pool_size = devs; | 1733 | conf->pool_size = devs; |
1630 | while (num--) | 1734 | hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; |
1631 | if (!grow_one_stripe(conf)) | 1735 | while (num--) { |
1736 | if (!grow_one_stripe(conf, hash)) | ||
1632 | return 1; | 1737 | return 1; |
1738 | conf->max_nr_stripes++; | ||
1739 | hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; | ||
1740 | } | ||
1633 | return 0; | 1741 | return 0; |
1634 | } | 1742 | } |
1635 | 1743 | ||
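grow_stripes() now deals new stripes out round-robin, so bucket populations never differ by more than one, and starting the walk at conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS keeps that property across repeated grow calls. A tiny standalone check of the distribution (the stripe count is just an example):

```c
#include <stdio.h>

#define NR_STRIPE_HASH_LOCKS 8

int main(void)
{
	int count[NR_STRIPE_HASH_LOCKS] = {0};
	int max_nr_stripes = 0, hash = 0, num = 260;

	/* Same loop shape as grow_stripes(): deal, count, advance. */
	while (num--) {
		count[hash]++;
		max_nr_stripes++;
		hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
	}
	for (int i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
		printf("bucket %d: %d stripes\n", i, count[i]);
	/* 260 over 8 buckets: buckets 0-3 hold 33, buckets 4-7 hold 32 */
	return 0;
}
```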
@@ -1687,6 +1795,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
1687 | int err; | 1795 | int err; |
1688 | struct kmem_cache *sc; | 1796 | struct kmem_cache *sc; |
1689 | int i; | 1797 | int i; |
1798 | int hash, cnt; | ||
1690 | 1799 | ||
1691 | if (newsize <= conf->pool_size) | 1800 | if (newsize <= conf->pool_size) |
1692 | return 0; /* never bother to shrink */ | 1801 | return 0; /* never bother to shrink */ |
@@ -1726,19 +1835,29 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
1726 | * OK, we have enough stripes, start collecting inactive | 1835 | * OK, we have enough stripes, start collecting inactive |
1727 | * stripes and copying them over | 1836 | * stripes and copying them over |
1728 | */ | 1837 | */ |
1838 | hash = 0; | ||
1839 | cnt = 0; | ||
1729 | list_for_each_entry(nsh, &newstripes, lru) { | 1840 | list_for_each_entry(nsh, &newstripes, lru) { |
1730 | spin_lock_irq(&conf->device_lock); | 1841 | lock_device_hash_lock(conf, hash); |
1731 | wait_event_lock_irq(conf->wait_for_stripe, | 1842 | wait_event_cmd(conf->wait_for_stripe, |
1732 | !list_empty(&conf->inactive_list), | 1843 | !list_empty(conf->inactive_list + hash), |
1733 | conf->device_lock); | 1844 | unlock_device_hash_lock(conf, hash), |
1734 | osh = get_free_stripe(conf); | 1845 | lock_device_hash_lock(conf, hash)); |
1735 | spin_unlock_irq(&conf->device_lock); | 1846 | osh = get_free_stripe(conf, hash); |
1847 | unlock_device_hash_lock(conf, hash); | ||
1736 | atomic_set(&nsh->count, 1); | 1848 | atomic_set(&nsh->count, 1); |
1737 | for(i=0; i<conf->pool_size; i++) | 1849 | for(i=0; i<conf->pool_size; i++) |
1738 | nsh->dev[i].page = osh->dev[i].page; | 1850 | nsh->dev[i].page = osh->dev[i].page; |
1739 | for( ; i<newsize; i++) | 1851 | for( ; i<newsize; i++) |
1740 | nsh->dev[i].page = NULL; | 1852 | nsh->dev[i].page = NULL; |
1853 | nsh->hash_lock_index = hash; | ||
1741 | kmem_cache_free(conf->slab_cache, osh); | 1854 | kmem_cache_free(conf->slab_cache, osh); |
1855 | cnt++; | ||
1856 | if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + | ||
1857 | !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { | ||
1858 | hash++; | ||
1859 | cnt = 0; | ||
1860 | } | ||
1742 | } | 1861 | } |
1743 | kmem_cache_destroy(conf->slab_cache); | 1862 | kmem_cache_destroy(conf->slab_cache); |
1744 | 1863 | ||
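The quota expression in this loop is the inverse of that round-robin deal: bucket hash may hold floor(max_nr_stripes / NR_STRIPE_HASH_LOCKS) stripes, plus one when hash falls below the remainder, so the copied-over stripes land with the same even spread. Worked standalone with the same example figure as above:

```c
#include <stdio.h>

#define NR_STRIPE_HASH_LOCKS 8

/* Per-bucket quota implied by resize_stripes(): an even share, with
 * the remainder going to the lowest-numbered buckets. */
static int bucket_quota(int max_nr_stripes, int hash)
{
	return max_nr_stripes / NR_STRIPE_HASH_LOCKS +
	       !!((max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash);
}

int main(void)
{
	for (int hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
		printf("bucket %d: quota %d\n", hash, bucket_quota(260, hash));
	/* prints 33 for buckets 0-3 and 32 for buckets 4-7 */
	return 0;
}
```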
@@ -1797,13 +1916,13 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
1797 | return err; | 1916 | return err; |
1798 | } | 1917 | } |
1799 | 1918 | ||
1800 | static int drop_one_stripe(struct r5conf *conf) | 1919 | static int drop_one_stripe(struct r5conf *conf, int hash) |
1801 | { | 1920 | { |
1802 | struct stripe_head *sh; | 1921 | struct stripe_head *sh; |
1803 | 1922 | ||
1804 | spin_lock_irq(&conf->device_lock); | 1923 | spin_lock_irq(conf->hash_locks + hash); |
1805 | sh = get_free_stripe(conf); | 1924 | sh = get_free_stripe(conf, hash); |
1806 | spin_unlock_irq(&conf->device_lock); | 1925 | spin_unlock_irq(conf->hash_locks + hash); |
1807 | if (!sh) | 1926 | if (!sh) |
1808 | return 0; | 1927 | return 0; |
1809 | BUG_ON(atomic_read(&sh->count)); | 1928 | BUG_ON(atomic_read(&sh->count)); |
@@ -1815,8 +1934,10 @@ static int drop_one_stripe(struct r5conf *conf) | |||
1815 | 1934 | ||
1816 | static void shrink_stripes(struct r5conf *conf) | 1935 | static void shrink_stripes(struct r5conf *conf) |
1817 | { | 1936 | { |
1818 | while (drop_one_stripe(conf)) | 1937 | int hash; |
1819 | ; | 1938 | for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) |
1939 | while (drop_one_stripe(conf, hash)) | ||
1940 | ; | ||
1820 | 1941 | ||
1821 | if (conf->slab_cache) | 1942 | if (conf->slab_cache) |
1822 | kmem_cache_destroy(conf->slab_cache); | 1943 | kmem_cache_destroy(conf->slab_cache); |
@@ -1921,6 +2042,9 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1921 | mdname(conf->mddev), bdn); | 2042 | mdname(conf->mddev), bdn); |
1922 | else | 2043 | else |
1923 | retry = 1; | 2044 | retry = 1; |
2045 | if (set_bad && test_bit(In_sync, &rdev->flags) | ||
2046 | && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) | ||
2047 | retry = 1; | ||
1924 | if (retry) | 2048 | if (retry) |
1925 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { | 2049 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { |
1926 | set_bit(R5_ReadError, &sh->dev[i].flags); | 2050 | set_bit(R5_ReadError, &sh->dev[i].flags); |
@@ -3900,7 +4024,8 @@ static void raid5_activate_delayed(struct r5conf *conf) | |||
3900 | } | 4024 | } |
3901 | } | 4025 | } |
3902 | 4026 | ||
3903 | static void activate_bit_delay(struct r5conf *conf) | 4027 | static void activate_bit_delay(struct r5conf *conf, |
4028 | struct list_head *temp_inactive_list) | ||
3904 | { | 4029 | { |
3905 | /* device_lock is held */ | 4030 | /* device_lock is held */ |
3906 | struct list_head head; | 4031 | struct list_head head; |
@@ -3908,9 +4033,11 @@ static void activate_bit_delay(struct r5conf *conf) | |||
3908 | list_del_init(&conf->bitmap_list); | 4033 | list_del_init(&conf->bitmap_list); |
3909 | while (!list_empty(&head)) { | 4034 | while (!list_empty(&head)) { |
3910 | struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); | 4035 | struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); |
4036 | int hash; | ||
3911 | list_del_init(&sh->lru); | 4037 | list_del_init(&sh->lru); |
3912 | atomic_inc(&sh->count); | 4038 | atomic_inc(&sh->count); |
3913 | __release_stripe(conf, sh); | 4039 | hash = sh->hash_lock_index; |
4040 | __release_stripe(conf, sh, &temp_inactive_list[hash]); | ||
3914 | } | 4041 | } |
3915 | } | 4042 | } |
3916 | 4043 | ||
@@ -3926,7 +4053,7 @@ int md_raid5_congested(struct mddev *mddev, int bits) | |||
3926 | return 1; | 4053 | return 1; |
3927 | if (conf->quiesce) | 4054 | if (conf->quiesce) |
3928 | return 1; | 4055 | return 1; |
3929 | if (list_empty_careful(&conf->inactive_list)) | 4056 | if (atomic_read(&conf->empty_inactive_list_nr)) |
3930 | return 1; | 4057 | return 1; |
3931 | 4058 | ||
3932 | return 0; | 4059 | return 0; |
@@ -4256,6 +4383,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) | |||
4256 | struct raid5_plug_cb { | 4383 | struct raid5_plug_cb { |
4257 | struct blk_plug_cb cb; | 4384 | struct blk_plug_cb cb; |
4258 | struct list_head list; | 4385 | struct list_head list; |
4386 | struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; | ||
4259 | }; | 4387 | }; |
4260 | 4388 | ||
4261 | static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) | 4389 | static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) |
@@ -4266,6 +4394,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) | |||
4266 | struct mddev *mddev = cb->cb.data; | 4394 | struct mddev *mddev = cb->cb.data; |
4267 | struct r5conf *conf = mddev->private; | 4395 | struct r5conf *conf = mddev->private; |
4268 | int cnt = 0; | 4396 | int cnt = 0; |
4397 | int hash; | ||
4269 | 4398 | ||
4270 | if (cb->list.next && !list_empty(&cb->list)) { | 4399 | if (cb->list.next && !list_empty(&cb->list)) { |
4271 | spin_lock_irq(&conf->device_lock); | 4400 | spin_lock_irq(&conf->device_lock); |
@@ -4283,11 +4412,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) | |||
4283 | * STRIPE_ON_RELEASE_LIST could be set here. In that | 4412 | * STRIPE_ON_RELEASE_LIST could be set here. In that |
4284 | * case, the count is always > 1 here | 4413 | * case, the count is always > 1 here |
4285 | */ | 4414 | */ |
4286 | __release_stripe(conf, sh); | 4415 | hash = sh->hash_lock_index; |
4416 | __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); | ||
4287 | cnt++; | 4417 | cnt++; |
4288 | } | 4418 | } |
4289 | spin_unlock_irq(&conf->device_lock); | 4419 | spin_unlock_irq(&conf->device_lock); |
4290 | } | 4420 | } |
4421 | release_inactive_stripe_list(conf, cb->temp_inactive_list, | ||
4422 | NR_STRIPE_HASH_LOCKS); | ||
4291 | if (mddev->queue) | 4423 | if (mddev->queue) |
4292 | trace_block_unplug(mddev->queue, cnt, !from_schedule); | 4424 | trace_block_unplug(mddev->queue, cnt, !from_schedule); |
4293 | kfree(cb); | 4425 | kfree(cb); |
@@ -4308,8 +4440,12 @@ static void release_stripe_plug(struct mddev *mddev, | |||
4308 | 4440 | ||
4309 | cb = container_of(blk_cb, struct raid5_plug_cb, cb); | 4441 | cb = container_of(blk_cb, struct raid5_plug_cb, cb); |
4310 | 4442 | ||
4311 | if (cb->list.next == NULL) | 4443 | if (cb->list.next == NULL) { |
4444 | int i; | ||
4312 | INIT_LIST_HEAD(&cb->list); | 4445 | INIT_LIST_HEAD(&cb->list); |
4446 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) | ||
4447 | INIT_LIST_HEAD(cb->temp_inactive_list + i); | ||
4448 | } | ||
4313 | 4449 | ||
4314 | if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) | 4450 | if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) |
4315 | list_add_tail(&sh->lru, &cb->list); | 4451 | list_add_tail(&sh->lru, &cb->list); |
@@ -4692,14 +4828,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4692 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { | 4828 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { |
4693 | /* Cannot proceed until we've updated the superblock... */ | 4829 | /* Cannot proceed until we've updated the superblock... */ |
4694 | wait_event(conf->wait_for_overlap, | 4830 | wait_event(conf->wait_for_overlap, |
4695 | atomic_read(&conf->reshape_stripes)==0); | 4831 | atomic_read(&conf->reshape_stripes)==0 |
4832 | || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | ||
4833 | if (atomic_read(&conf->reshape_stripes) != 0) | ||
4834 | return 0; | ||
4696 | mddev->reshape_position = conf->reshape_progress; | 4835 | mddev->reshape_position = conf->reshape_progress; |
4697 | mddev->curr_resync_completed = sector_nr; | 4836 | mddev->curr_resync_completed = sector_nr; |
4698 | conf->reshape_checkpoint = jiffies; | 4837 | conf->reshape_checkpoint = jiffies; |
4699 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4838 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
4700 | md_wakeup_thread(mddev->thread); | 4839 | md_wakeup_thread(mddev->thread); |
4701 | wait_event(mddev->sb_wait, mddev->flags == 0 || | 4840 | wait_event(mddev->sb_wait, mddev->flags == 0 || |
4702 | kthread_should_stop()); | 4841 | test_bit(MD_RECOVERY_INTR, &mddev->recovery)); |
4842 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
4843 | return 0; | ||
4703 | spin_lock_irq(&conf->device_lock); | 4844 | spin_lock_irq(&conf->device_lock); |
4704 | conf->reshape_safe = mddev->reshape_position; | 4845 | conf->reshape_safe = mddev->reshape_position; |
4705 | spin_unlock_irq(&conf->device_lock); | 4846 | spin_unlock_irq(&conf->device_lock); |
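Replacing kthread_should_stop() with MD_RECOVERY_INTR changes the shutdown contract: the reshape path now wakes either because all in-flight stripes drained or because someone flagged an interruption, and in the latter case it bails out before touching the superblock. The pattern as a condition-variable sketch (names are illustrative, not kernel API):

```c
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int reshape_stripes;   /* in-flight stripe count */
static bool recovery_intr;    /* analogue of MD_RECOVERY_INTR */

/* Returns false when the wait was interrupted, in which case the
 * caller must return without updating on-disk metadata. */
static bool wait_for_reshape_drain(void)
{
	bool ok;

	pthread_mutex_lock(&lock);
	while (reshape_stripes != 0 && !recovery_intr)
		pthread_cond_wait(&cond, &lock);
	ok = (reshape_stripes == 0);
	pthread_mutex_unlock(&lock);
	return ok;
}
```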
@@ -4782,7 +4923,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4782 | >= mddev->resync_max - mddev->curr_resync_completed) { | 4923 | >= mddev->resync_max - mddev->curr_resync_completed) { |
4783 | /* Cannot proceed until we've updated the superblock... */ | 4924 | /* Cannot proceed until we've updated the superblock... */ |
4784 | wait_event(conf->wait_for_overlap, | 4925 | wait_event(conf->wait_for_overlap, |
4785 | atomic_read(&conf->reshape_stripes) == 0); | 4926 | atomic_read(&conf->reshape_stripes) == 0 |
4927 | || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | ||
4928 | if (atomic_read(&conf->reshape_stripes) != 0) | ||
4929 | goto ret; | ||
4786 | mddev->reshape_position = conf->reshape_progress; | 4930 | mddev->reshape_position = conf->reshape_progress; |
4787 | mddev->curr_resync_completed = sector_nr; | 4931 | mddev->curr_resync_completed = sector_nr; |
4788 | conf->reshape_checkpoint = jiffies; | 4932 | conf->reshape_checkpoint = jiffies; |
@@ -4790,13 +4934,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4790 | md_wakeup_thread(mddev->thread); | 4934 | md_wakeup_thread(mddev->thread); |
4791 | wait_event(mddev->sb_wait, | 4935 | wait_event(mddev->sb_wait, |
4792 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) | 4936 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) |
4793 | || kthread_should_stop()); | 4937 | || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); |
4938 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
4939 | goto ret; | ||
4794 | spin_lock_irq(&conf->device_lock); | 4940 | spin_lock_irq(&conf->device_lock); |
4795 | conf->reshape_safe = mddev->reshape_position; | 4941 | conf->reshape_safe = mddev->reshape_position; |
4796 | spin_unlock_irq(&conf->device_lock); | 4942 | spin_unlock_irq(&conf->device_lock); |
4797 | wake_up(&conf->wait_for_overlap); | 4943 | wake_up(&conf->wait_for_overlap); |
4798 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 4944 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
4799 | } | 4945 | } |
4946 | ret: | ||
4800 | return reshape_sectors; | 4947 | return reshape_sectors; |
4801 | } | 4948 | } |
4802 | 4949 | ||
@@ -4954,27 +5101,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
4954 | } | 5101 | } |
4955 | 5102 | ||
4956 | static int handle_active_stripes(struct r5conf *conf, int group, | 5103 | static int handle_active_stripes(struct r5conf *conf, int group, |
4957 | struct r5worker *worker) | 5104 | struct r5worker *worker, |
5105 | struct list_head *temp_inactive_list) | ||
4958 | { | 5106 | { |
4959 | struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; | 5107 | struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; |
4960 | int i, batch_size = 0; | 5108 | int i, batch_size = 0, hash; |
5109 | bool release_inactive = false; | ||
4961 | 5110 | ||
4962 | while (batch_size < MAX_STRIPE_BATCH && | 5111 | while (batch_size < MAX_STRIPE_BATCH && |
4963 | (sh = __get_priority_stripe(conf, group)) != NULL) | 5112 | (sh = __get_priority_stripe(conf, group)) != NULL) |
4964 | batch[batch_size++] = sh; | 5113 | batch[batch_size++] = sh; |
4965 | 5114 | ||
4966 | if (batch_size == 0) | 5115 | if (batch_size == 0) { |
4967 | return batch_size; | 5116 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) |
5117 | if (!list_empty(temp_inactive_list + i)) | ||
5118 | break; | ||
5119 | if (i == NR_STRIPE_HASH_LOCKS) | ||
5120 | return batch_size; | ||
5121 | release_inactive = true; | ||
5122 | } | ||
4968 | spin_unlock_irq(&conf->device_lock); | 5123 | spin_unlock_irq(&conf->device_lock); |
4969 | 5124 | ||
5125 | release_inactive_stripe_list(conf, temp_inactive_list, | ||
5126 | NR_STRIPE_HASH_LOCKS); | ||
5127 | |||
5128 | if (release_inactive) { | ||
5129 | spin_lock_irq(&conf->device_lock); | ||
5130 | return 0; | ||
5131 | } | ||
5132 | |||
4970 | for (i = 0; i < batch_size; i++) | 5133 | for (i = 0; i < batch_size; i++) |
4971 | handle_stripe(batch[i]); | 5134 | handle_stripe(batch[i]); |
4972 | 5135 | ||
4973 | cond_resched(); | 5136 | cond_resched(); |
4974 | 5137 | ||
4975 | spin_lock_irq(&conf->device_lock); | 5138 | spin_lock_irq(&conf->device_lock); |
4976 | for (i = 0; i < batch_size; i++) | 5139 | for (i = 0; i < batch_size; i++) { |
4977 | __release_stripe(conf, batch[i]); | 5140 | hash = batch[i]->hash_lock_index; |
5141 | __release_stripe(conf, batch[i], &temp_inactive_list[hash]); | ||
5142 | } | ||
4978 | return batch_size; | 5143 | return batch_size; |
4979 | } | 5144 | } |
4980 | 5145 | ||
@@ -4995,9 +5160,10 @@ static void raid5_do_work(struct work_struct *work) | |||
4995 | while (1) { | 5160 | while (1) { |
4996 | int batch_size, released; | 5161 | int batch_size, released; |
4997 | 5162 | ||
4998 | released = release_stripe_list(conf); | 5163 | released = release_stripe_list(conf, worker->temp_inactive_list); |
4999 | 5164 | ||
5000 | batch_size = handle_active_stripes(conf, group_id, worker); | 5165 | batch_size = handle_active_stripes(conf, group_id, worker, |
5166 | worker->temp_inactive_list); | ||
5001 | worker->working = false; | 5167 | worker->working = false; |
5002 | if (!batch_size && !released) | 5168 | if (!batch_size && !released) |
5003 | break; | 5169 | break; |
@@ -5036,7 +5202,7 @@ static void raid5d(struct md_thread *thread) | |||
5036 | struct bio *bio; | 5202 | struct bio *bio; |
5037 | int batch_size, released; | 5203 | int batch_size, released; |
5038 | 5204 | ||
5039 | released = release_stripe_list(conf); | 5205 | released = release_stripe_list(conf, conf->temp_inactive_list); |
5040 | 5206 | ||
5041 | if ( | 5207 | if ( |
5042 | !list_empty(&conf->bitmap_list)) { | 5208 | !list_empty(&conf->bitmap_list)) { |
@@ -5046,7 +5212,7 @@ static void raid5d(struct md_thread *thread) | |||
5046 | bitmap_unplug(mddev->bitmap); | 5212 | bitmap_unplug(mddev->bitmap); |
5047 | spin_lock_irq(&conf->device_lock); | 5213 | spin_lock_irq(&conf->device_lock); |
5048 | conf->seq_write = conf->seq_flush; | 5214 | conf->seq_write = conf->seq_flush; |
5049 | activate_bit_delay(conf); | 5215 | activate_bit_delay(conf, conf->temp_inactive_list); |
5050 | } | 5216 | } |
5051 | raid5_activate_delayed(conf); | 5217 | raid5_activate_delayed(conf); |
5052 | 5218 | ||
@@ -5060,7 +5226,8 @@ static void raid5d(struct md_thread *thread) | |||
5060 | handled++; | 5226 | handled++; |
5061 | } | 5227 | } |
5062 | 5228 | ||
5063 | batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); | 5229 | batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, |
5230 | conf->temp_inactive_list); | ||
5064 | if (!batch_size && !released) | 5231 | if (!batch_size && !released) |
5065 | break; | 5232 | break; |
5066 | handled += batch_size; | 5233 | handled += batch_size; |
@@ -5096,22 +5263,29 @@ raid5_set_cache_size(struct mddev *mddev, int size) | |||
5096 | { | 5263 | { |
5097 | struct r5conf *conf = mddev->private; | 5264 | struct r5conf *conf = mddev->private; |
5098 | int err; | 5265 | int err; |
5266 | int hash; | ||
5099 | 5267 | ||
5100 | if (size <= 16 || size > 32768) | 5268 | if (size <= 16 || size > 32768) |
5101 | return -EINVAL; | 5269 | return -EINVAL; |
5270 | hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; | ||
5102 | while (size < conf->max_nr_stripes) { | 5271 | while (size < conf->max_nr_stripes) { |
5103 | if (drop_one_stripe(conf)) | 5272 | if (drop_one_stripe(conf, hash)) |
5104 | conf->max_nr_stripes--; | 5273 | conf->max_nr_stripes--; |
5105 | else | 5274 | else |
5106 | break; | 5275 | break; |
5276 | hash--; | ||
5277 | if (hash < 0) | ||
5278 | hash = NR_STRIPE_HASH_LOCKS - 1; | ||
5107 | } | 5279 | } |
5108 | err = md_allow_write(mddev); | 5280 | err = md_allow_write(mddev); |
5109 | if (err) | 5281 | if (err) |
5110 | return err; | 5282 | return err; |
5283 | hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; | ||
5111 | while (size > conf->max_nr_stripes) { | 5284 | while (size > conf->max_nr_stripes) { |
5112 | if (grow_one_stripe(conf)) | 5285 | if (grow_one_stripe(conf, hash)) |
5113 | conf->max_nr_stripes++; | 5286 | conf->max_nr_stripes++; |
5114 | else break; | 5287 | else break; |
5288 | hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; | ||
5115 | } | 5289 | } |
5116 | return 0; | 5290 | return 0; |
5117 | } | 5291 | } |
@@ -5199,15 +5373,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page) | |||
5199 | return 0; | 5373 | return 0; |
5200 | } | 5374 | } |
5201 | 5375 | ||
5202 | static int alloc_thread_groups(struct r5conf *conf, int cnt); | 5376 | static int alloc_thread_groups(struct r5conf *conf, int cnt, |
5377 | int *group_cnt, | ||
5378 | int *worker_cnt_per_group, | ||
5379 | struct r5worker_group **worker_groups); | ||
5203 | static ssize_t | 5380 | static ssize_t |
5204 | raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) | 5381 | raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) |
5205 | { | 5382 | { |
5206 | struct r5conf *conf = mddev->private; | 5383 | struct r5conf *conf = mddev->private; |
5207 | unsigned long new; | 5384 | unsigned long new; |
5208 | int err; | 5385 | int err; |
5209 | struct r5worker_group *old_groups; | 5386 | struct r5worker_group *new_groups, *old_groups; |
5210 | int old_group_cnt; | 5387 | int group_cnt, worker_cnt_per_group; |
5211 | 5388 | ||
5212 | if (len >= PAGE_SIZE) | 5389 | if (len >= PAGE_SIZE) |
5213 | return -EINVAL; | 5390 | return -EINVAL; |
@@ -5223,14 +5400,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) | |||
5223 | mddev_suspend(mddev); | 5400 | mddev_suspend(mddev); |
5224 | 5401 | ||
5225 | old_groups = conf->worker_groups; | 5402 | old_groups = conf->worker_groups; |
5226 | old_group_cnt = conf->worker_cnt_per_group; | 5403 | if (old_groups) |
5404 | flush_workqueue(raid5_wq); | ||
5405 | |||
5406 | err = alloc_thread_groups(conf, new, | ||
5407 | &group_cnt, &worker_cnt_per_group, | ||
5408 | &new_groups); | ||
5409 | if (!err) { | ||
5410 | spin_lock_irq(&conf->device_lock); | ||
5411 | conf->group_cnt = group_cnt; | ||
5412 | conf->worker_cnt_per_group = worker_cnt_per_group; | ||
5413 | conf->worker_groups = new_groups; | ||
5414 | spin_unlock_irq(&conf->device_lock); | ||
5227 | 5415 | ||
5228 | conf->worker_groups = NULL; | ||
5229 | err = alloc_thread_groups(conf, new); | ||
5230 | if (err) { | ||
5231 | conf->worker_groups = old_groups; | ||
5232 | conf->worker_cnt_per_group = old_group_cnt; | ||
5233 | } else { | ||
5234 | if (old_groups) | 5416 | if (old_groups) |
5235 | kfree(old_groups[0].workers); | 5417 | kfree(old_groups[0].workers); |
5236 | kfree(old_groups); | 5418 | kfree(old_groups); |
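raid5_store_group_thread_cnt() now follows an allocate-then-swap discipline: alloc_thread_groups() builds the replacement into out-parameters, the pointers are published under device_lock only on success, and the old groups are freed afterwards, so a failed allocation leaves the running configuration untouched (the previous code had to restore conf->worker_groups by hand on error). The generic shape, sketched in userspace C with illustrative types:

```c
#include <pthread.h>
#include <stdlib.h>

struct groups {
	int cnt;
	/* ... worker state ... */
};

static pthread_mutex_t conf_lock = PTHREAD_MUTEX_INITIALIZER;
static struct groups *worker_groups;   /* currently published configuration */

/* Build the replacement off to the side, publish it under the lock
 * only on success, free the old copy once nothing references it. */
static int replace_groups(int cnt)
{
	struct groups *new_groups = malloc(sizeof(*new_groups));
	struct groups *old_groups;

	if (!new_groups)
		return -1;                 /* old configuration still live */
	new_groups->cnt = cnt;

	pthread_mutex_lock(&conf_lock);
	old_groups = worker_groups;
	worker_groups = new_groups;        /* swap under the lock */
	pthread_mutex_unlock(&conf_lock);

	free(old_groups);
	return 0;
}
```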
@@ -5260,40 +5442,47 @@ static struct attribute_group raid5_attrs_group = { | |||
5260 | .attrs = raid5_attrs, | 5442 | .attrs = raid5_attrs, |
5261 | }; | 5443 | }; |
5262 | 5444 | ||
5263 | static int alloc_thread_groups(struct r5conf *conf, int cnt) | 5445 | static int alloc_thread_groups(struct r5conf *conf, int cnt, |
5446 | int *group_cnt, | ||
5447 | int *worker_cnt_per_group, | ||
5448 | struct r5worker_group **worker_groups) | ||
5264 | { | 5449 | { |
5265 | int i, j; | 5450 | int i, j, k; |
5266 | ssize_t size; | 5451 | ssize_t size; |
5267 | struct r5worker *workers; | 5452 | struct r5worker *workers; |
5268 | 5453 | ||
5269 | conf->worker_cnt_per_group = cnt; | 5454 | *worker_cnt_per_group = cnt; |
5270 | if (cnt == 0) { | 5455 | if (cnt == 0) { |
5271 | conf->worker_groups = NULL; | 5456 | *group_cnt = 0; |
5457 | *worker_groups = NULL; | ||
5272 | return 0; | 5458 | return 0; |
5273 | } | 5459 | } |
5274 | conf->group_cnt = num_possible_nodes(); | 5460 | *group_cnt = num_possible_nodes(); |
5275 | size = sizeof(struct r5worker) * cnt; | 5461 | size = sizeof(struct r5worker) * cnt; |
5276 | workers = kzalloc(size * conf->group_cnt, GFP_NOIO); | 5462 | workers = kzalloc(size * *group_cnt, GFP_NOIO); |
5277 | conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * | 5463 | *worker_groups = kzalloc(sizeof(struct r5worker_group) * |
5278 | conf->group_cnt, GFP_NOIO); | 5464 | *group_cnt, GFP_NOIO); |
5279 | if (!conf->worker_groups || !workers) { | 5465 | if (!*worker_groups || !workers) { |
5280 | kfree(workers); | 5466 | kfree(workers); |
5281 | kfree(conf->worker_groups); | 5467 | kfree(*worker_groups); |
5282 | conf->worker_groups = NULL; | ||
5283 | return -ENOMEM; | 5468 | return -ENOMEM; |
5284 | } | 5469 | } |
5285 | 5470 | ||
5286 | for (i = 0; i < conf->group_cnt; i++) { | 5471 | for (i = 0; i < *group_cnt; i++) { |
5287 | struct r5worker_group *group; | 5472 | struct r5worker_group *group; |
5288 | 5473 | ||
5289 | group = &conf->worker_groups[i]; | 5474 | group = &(*worker_groups)[i]; |
5290 | INIT_LIST_HEAD(&group->handle_list); | 5475 | INIT_LIST_HEAD(&group->handle_list); |
5291 | group->conf = conf; | 5476 | group->conf = conf; |
5292 | group->workers = workers + i * cnt; | 5477 | group->workers = workers + i * cnt; |
5293 | 5478 | ||
5294 | for (j = 0; j < cnt; j++) { | 5479 | for (j = 0; j < cnt; j++) { |
5295 | group->workers[j].group = group; | 5480 | struct r5worker *worker = group->workers + j; |
5296 | INIT_WORK(&group->workers[j].work, raid5_do_work); | 5481 | worker->group = group; |
5482 | INIT_WORK(&worker->work, raid5_do_work); | ||
5483 | |||
5484 | for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) | ||
5485 | INIT_LIST_HEAD(worker->temp_inactive_list + k); | ||
5297 | } | 5486 | } |
5298 | } | 5487 | } |
5299 | 5488 | ||
@@ -5444,6 +5633,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5444 | struct md_rdev *rdev; | 5633 | struct md_rdev *rdev; |
5445 | struct disk_info *disk; | 5634 | struct disk_info *disk; |
5446 | char pers_name[6]; | 5635 | char pers_name[6]; |
5636 | int i; | ||
5637 | int group_cnt, worker_cnt_per_group; | ||
5638 | struct r5worker_group *new_group; | ||
5447 | 5639 | ||
5448 | if (mddev->new_level != 5 | 5640 | if (mddev->new_level != 5 |
5449 | && mddev->new_level != 4 | 5641 | && mddev->new_level != 4 |
@@ -5478,7 +5670,12 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5478 | if (conf == NULL) | 5670 | if (conf == NULL) |
5479 | goto abort; | 5671 | goto abort; |
5480 | /* Don't enable multi-threading by default*/ | 5672 | /* Don't enable multi-threading by default*/ |
5481 | if (alloc_thread_groups(conf, 0)) | 5673 | if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, |
5674 | &new_group)) { | ||
5675 | conf->group_cnt = group_cnt; | ||
5676 | conf->worker_cnt_per_group = worker_cnt_per_group; | ||
5677 | conf->worker_groups = new_group; | ||
5678 | } else | ||
5482 | goto abort; | 5679 | goto abort; |
5483 | spin_lock_init(&conf->device_lock); | 5680 | spin_lock_init(&conf->device_lock); |
5484 | seqcount_init(&conf->gen_lock); | 5681 | seqcount_init(&conf->gen_lock); |
@@ -5488,7 +5685,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5488 | INIT_LIST_HEAD(&conf->hold_list); | 5685 | INIT_LIST_HEAD(&conf->hold_list); |
5489 | INIT_LIST_HEAD(&conf->delayed_list); | 5686 | INIT_LIST_HEAD(&conf->delayed_list); |
5490 | INIT_LIST_HEAD(&conf->bitmap_list); | 5687 | INIT_LIST_HEAD(&conf->bitmap_list); |
5491 | INIT_LIST_HEAD(&conf->inactive_list); | ||
5492 | init_llist_head(&conf->released_stripes); | 5688 | init_llist_head(&conf->released_stripes); |
5493 | atomic_set(&conf->active_stripes, 0); | 5689 | atomic_set(&conf->active_stripes, 0); |
5494 | atomic_set(&conf->preread_active_stripes, 0); | 5690 | atomic_set(&conf->preread_active_stripes, 0); |
@@ -5514,6 +5710,21 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5514 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) | 5710 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) |
5515 | goto abort; | 5711 | goto abort; |
5516 | 5712 | ||
5713 | /* We init hash_locks[0] separately so that it can be used | ||
5714 | * as the reference lock in the spin_lock_nest_lock() call | ||
5715 | * in lock_all_device_hash_locks_irq in order to convince | ||
5716 | * lockdep that we know what we are doing. | ||
5717 | */ | ||
5718 | spin_lock_init(conf->hash_locks); | ||
5719 | for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) | ||
5720 | spin_lock_init(conf->hash_locks + i); | ||
5721 | |||
5722 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) | ||
5723 | INIT_LIST_HEAD(conf->inactive_list + i); | ||
5724 | |||
5725 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) | ||
5726 | INIT_LIST_HEAD(conf->temp_inactive_list + i); | ||
5727 | |||
5517 | conf->level = mddev->new_level; | 5728 | conf->level = mddev->new_level; |
5518 | if (raid5_alloc_percpu(conf) != 0) | 5729 | if (raid5_alloc_percpu(conf) != 0) |
5519 | goto abort; | 5730 | goto abort; |
@@ -5554,7 +5765,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5554 | else | 5765 | else |
5555 | conf->max_degraded = 1; | 5766 | conf->max_degraded = 1; |
5556 | conf->algorithm = mddev->new_layout; | 5767 | conf->algorithm = mddev->new_layout; |
5557 | conf->max_nr_stripes = NR_STRIPES; | ||
5558 | conf->reshape_progress = mddev->reshape_position; | 5768 | conf->reshape_progress = mddev->reshape_position; |
5559 | if (conf->reshape_progress != MaxSector) { | 5769 | if (conf->reshape_progress != MaxSector) { |
5560 | conf->prev_chunk_sectors = mddev->chunk_sectors; | 5770 | conf->prev_chunk_sectors = mddev->chunk_sectors; |
@@ -5563,7 +5773,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
5563 | 5773 | ||
5564 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + | 5774 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + |
5565 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; | 5775 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; |
5566 | if (grow_stripes(conf, conf->max_nr_stripes)) { | 5776 | atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); |
5777 | if (grow_stripes(conf, NR_STRIPES)) { | ||
5567 | printk(KERN_ERR | 5778 | printk(KERN_ERR |
5568 | "md/raid:%s: couldn't allocate %dkB for buffers\n", | 5779 | "md/raid:%s: couldn't allocate %dkB for buffers\n", |
5569 | mdname(mddev), memory); | 5780 | mdname(mddev), memory); |
@@ -6369,12 +6580,18 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
6369 | if (!mddev->sync_thread) { | 6580 | if (!mddev->sync_thread) { |
6370 | mddev->recovery = 0; | 6581 | mddev->recovery = 0; |
6371 | spin_lock_irq(&conf->device_lock); | 6582 | spin_lock_irq(&conf->device_lock); |
6583 | write_seqcount_begin(&conf->gen_lock); | ||
6372 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; | 6584 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; |
6585 | mddev->new_chunk_sectors = | ||
6586 | conf->chunk_sectors = conf->prev_chunk_sectors; | ||
6587 | mddev->new_layout = conf->algorithm = conf->prev_algo; | ||
6373 | rdev_for_each(rdev, mddev) | 6588 | rdev_for_each(rdev, mddev) |
6374 | rdev->new_data_offset = rdev->data_offset; | 6589 | rdev->new_data_offset = rdev->data_offset; |
6375 | smp_wmb(); | 6590 | smp_wmb(); |
6591 | conf->generation --; | ||
6376 | conf->reshape_progress = MaxSector; | 6592 | conf->reshape_progress = MaxSector; |
6377 | mddev->reshape_position = MaxSector; | 6593 | mddev->reshape_position = MaxSector; |
6594 | write_seqcount_end(&conf->gen_lock); | ||
6378 | spin_unlock_irq(&conf->device_lock); | 6595 | spin_unlock_irq(&conf->device_lock); |
6379 | return -EAGAIN; | 6596 | return -EAGAIN; |
6380 | } | 6597 | } |
@@ -6462,27 +6679,28 @@ static void raid5_quiesce(struct mddev *mddev, int state) | |||
6462 | break; | 6679 | break; |
6463 | 6680 | ||
6464 | case 1: /* stop all writes */ | 6681 | case 1: /* stop all writes */ |
6465 | spin_lock_irq(&conf->device_lock); | 6682 | lock_all_device_hash_locks_irq(conf); |
6466 | /* '2' tells resync/reshape to pause so that all | 6683 | /* '2' tells resync/reshape to pause so that all |
6467 | * active stripes can drain | 6684 | * active stripes can drain |
6468 | */ | 6685 | */ |
6469 | conf->quiesce = 2; | 6686 | conf->quiesce = 2; |
6470 | wait_event_lock_irq(conf->wait_for_stripe, | 6687 | wait_event_cmd(conf->wait_for_stripe, |
6471 | atomic_read(&conf->active_stripes) == 0 && | 6688 | atomic_read(&conf->active_stripes) == 0 && |
6472 | atomic_read(&conf->active_aligned_reads) == 0, | 6689 | atomic_read(&conf->active_aligned_reads) == 0, |
6473 | conf->device_lock); | 6690 | unlock_all_device_hash_locks_irq(conf), |
6691 | lock_all_device_hash_locks_irq(conf)); | ||
6474 | conf->quiesce = 1; | 6692 | conf->quiesce = 1; |
6475 | spin_unlock_irq(&conf->device_lock); | 6693 | unlock_all_device_hash_locks_irq(conf); |
6476 | /* allow reshape to continue */ | 6694 | /* allow reshape to continue */ |
6477 | wake_up(&conf->wait_for_overlap); | 6695 | wake_up(&conf->wait_for_overlap); |
6478 | break; | 6696 | break; |
6479 | 6697 | ||
6480 | case 0: /* re-enable writes */ | 6698 | case 0: /* re-enable writes */ |
6481 | spin_lock_irq(&conf->device_lock); | 6699 | lock_all_device_hash_locks_irq(conf); |
6482 | conf->quiesce = 0; | 6700 | conf->quiesce = 0; |
6483 | wake_up(&conf->wait_for_stripe); | 6701 | wake_up(&conf->wait_for_stripe); |
6484 | wake_up(&conf->wait_for_overlap); | 6702 | wake_up(&conf->wait_for_overlap); |
6485 | spin_unlock_irq(&conf->device_lock); | 6703 | unlock_all_device_hash_locks_irq(conf); |
6486 | break; | 6704 | break; |
6487 | } | 6705 | } |
6488 | } | 6706 | } |
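raid5_quiesce() can no longer sleep while holding a single lock, so it uses the new wait_event_cmd(): the two extra arguments are commands run before sleeping and after waking, used here to drop and retake every hash lock around each wait. Roughly, in condition-variable terms (a deliberately naive sketch; the kernel's prepare_to_wait() machinery closes the lost-wakeup window this version leaves open between unlock_all() and the wait):

```c
#include <pthread.h>

#define NR_LOCKS 8

static pthread_mutex_t hash_locks[NR_LOCKS]; /* pthread_mutex_init() at startup */
static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wait_for_stripe = PTHREAD_COND_INITIALIZER;
static int active_stripes;

static void lock_all(void)
{
	for (int i = 0; i < NR_LOCKS; i++)
		pthread_mutex_lock(&hash_locks[i]);
}

static void unlock_all(void)
{
	for (int i = NR_LOCKS - 1; i >= 0; i--)
		pthread_mutex_unlock(&hash_locks[i]);
}

/* wait_event_cmd(wq, cond, cmd1, cmd2) analogue: cmd1 == unlock_all()
 * runs before sleeping, cmd2 == lock_all() before the recheck. */
static void quiesce_writes(void)
{
	lock_all();
	while (active_stripes != 0) {
		unlock_all();                          /* cmd1 */
		pthread_mutex_lock(&wait_lock);
		pthread_cond_wait(&wait_for_stripe, &wait_lock);
		pthread_mutex_unlock(&wait_lock);
		lock_all();                            /* cmd2 */
	}
	/* all writes drained; the kernel sets conf->quiesce here */
	unlock_all();
}
```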