Diffstat (limited to 'drivers/md/raid5.c')
| -rw-r--r-- | drivers/md/raid5.c | 826 |
1 file changed, 691 insertions, 135 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cd2f96b2c572..77dfd720aaa0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
| @@ -54,6 +54,7 @@ | |||
| 54 | #include <linux/slab.h> | 54 | #include <linux/slab.h> |
| 55 | #include <linux/ratelimit.h> | 55 | #include <linux/ratelimit.h> |
| 56 | #include <linux/nodemask.h> | 56 | #include <linux/nodemask.h> |
| 57 | #include <linux/flex_array.h> | ||
| 57 | #include <trace/events/block.h> | 58 | #include <trace/events/block.h> |
| 58 | 59 | ||
| 59 | #include "md.h" | 60 | #include "md.h" |
| @@ -496,7 +497,7 @@ static void shrink_buffers(struct stripe_head *sh) | |||
| 496 | } | 497 | } |
| 497 | } | 498 | } |
| 498 | 499 | ||
| 499 | static int grow_buffers(struct stripe_head *sh) | 500 | static int grow_buffers(struct stripe_head *sh, gfp_t gfp) |
| 500 | { | 501 | { |
| 501 | int i; | 502 | int i; |
| 502 | int num = sh->raid_conf->pool_size; | 503 | int num = sh->raid_conf->pool_size; |
| @@ -504,7 +505,7 @@ static int grow_buffers(struct stripe_head *sh) | |||
| 504 | for (i = 0; i < num; i++) { | 505 | for (i = 0; i < num; i++) { |
| 505 | struct page *page; | 506 | struct page *page; |
| 506 | 507 | ||
| 507 | if (!(page = alloc_page(GFP_KERNEL))) { | 508 | if (!(page = alloc_page(gfp))) { |
| 508 | return 1; | 509 | return 1; |
| 509 | } | 510 | } |
| 510 | sh->dev[i].page = page; | 511 | sh->dev[i].page = page; |
| @@ -525,6 +526,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | |||
| 525 | BUG_ON(atomic_read(&sh->count) != 0); | 526 | BUG_ON(atomic_read(&sh->count) != 0); |
| 526 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); | 527 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); |
| 527 | BUG_ON(stripe_operations_active(sh)); | 528 | BUG_ON(stripe_operations_active(sh)); |
| 529 | BUG_ON(sh->batch_head); | ||
| 528 | 530 | ||
| 529 | pr_debug("init_stripe called, stripe %llu\n", | 531 | pr_debug("init_stripe called, stripe %llu\n", |
| 530 | (unsigned long long)sector); | 532 | (unsigned long long)sector); |
| @@ -552,8 +554,10 @@ retry: | |||
| 552 | } | 554 | } |
| 553 | if (read_seqcount_retry(&conf->gen_lock, seq)) | 555 | if (read_seqcount_retry(&conf->gen_lock, seq)) |
| 554 | goto retry; | 556 | goto retry; |
| 557 | sh->overwrite_disks = 0; | ||
| 555 | insert_hash(conf, sh); | 558 | insert_hash(conf, sh); |
| 556 | sh->cpu = smp_processor_id(); | 559 | sh->cpu = smp_processor_id(); |
| 560 | set_bit(STRIPE_BATCH_READY, &sh->state); | ||
| 557 | } | 561 | } |
| 558 | 562 | ||
| 559 | static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, | 563 | static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, |
| @@ -668,20 +672,28 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
| 668 | *(conf->hash_locks + hash)); | 672 | *(conf->hash_locks + hash)); |
| 669 | sh = __find_stripe(conf, sector, conf->generation - previous); | 673 | sh = __find_stripe(conf, sector, conf->generation - previous); |
| 670 | if (!sh) { | 674 | if (!sh) { |
| 671 | if (!conf->inactive_blocked) | 675 | if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { |
| 672 | sh = get_free_stripe(conf, hash); | 676 | sh = get_free_stripe(conf, hash); |
| 677 | if (!sh && llist_empty(&conf->released_stripes) && | ||
| 678 | !test_bit(R5_DID_ALLOC, &conf->cache_state)) | ||
| 679 | set_bit(R5_ALLOC_MORE, | ||
| 680 | &conf->cache_state); | ||
| 681 | } | ||
| 673 | if (noblock && sh == NULL) | 682 | if (noblock && sh == NULL) |
| 674 | break; | 683 | break; |
| 675 | if (!sh) { | 684 | if (!sh) { |
| 676 | conf->inactive_blocked = 1; | 685 | set_bit(R5_INACTIVE_BLOCKED, |
| 686 | &conf->cache_state); | ||
| 677 | wait_event_lock_irq( | 687 | wait_event_lock_irq( |
| 678 | conf->wait_for_stripe, | 688 | conf->wait_for_stripe, |
| 679 | !list_empty(conf->inactive_list + hash) && | 689 | !list_empty(conf->inactive_list + hash) && |
| 680 | (atomic_read(&conf->active_stripes) | 690 | (atomic_read(&conf->active_stripes) |
| 681 | < (conf->max_nr_stripes * 3 / 4) | 691 | < (conf->max_nr_stripes * 3 / 4) |
| 682 | || !conf->inactive_blocked), | 692 | || !test_bit(R5_INACTIVE_BLOCKED, |
| 693 | &conf->cache_state)), | ||
| 683 | *(conf->hash_locks + hash)); | 694 | *(conf->hash_locks + hash)); |
| 684 | conf->inactive_blocked = 0; | 695 | clear_bit(R5_INACTIVE_BLOCKED, |
| 696 | &conf->cache_state); | ||
| 685 | } else { | 697 | } else { |
| 686 | init_stripe(sh, sector, previous); | 698 | init_stripe(sh, sector, previous); |
| 687 | atomic_inc(&sh->count); | 699 | atomic_inc(&sh->count); |
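
Note: the open-coded `conf->inactive_blocked` int becomes the R5_INACTIVE_BLOCKED bit of a shared `cache_state` word, joined by R5_ALLOC_MORE and R5_DID_ALLOC so get_active_stripe() can ask the md thread to grow the stripe cache when the free lists and released_stripes llist run dry. A minimal userspace sketch of this flag-word style, with C11 atomics standing in for set_bit/clear_bit/test_bit (bit names here are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stdbool.h>

enum {
        INACTIVE_BLOCKED = 1u << 0,     /* waiters throttled on the cache */
        ALLOC_MORE       = 1u << 1,     /* please grow the cache */
        DID_ALLOC        = 1u << 2,     /* a grow already happened */
};

static void set_flag(atomic_uint *state, unsigned int bit)
{
        atomic_fetch_or(state, bit);
}

static void clear_flag(atomic_uint *state, unsigned int bit)
{
        atomic_fetch_and(state, ~bit);
}

static bool test_flag(atomic_uint *state, unsigned int bit)
{
        return atomic_load(state) & bit;
}
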
| @@ -708,6 +720,130 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
| 708 | return sh; | 720 | return sh; |
| 709 | } | 721 | } |
| 710 | 722 | ||
| 723 | static bool is_full_stripe_write(struct stripe_head *sh) | ||
| 724 | { | ||
| 725 | BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); | ||
| 726 | return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); | ||
| 727 | } | ||
| 728 | |||
| 729 | static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) | ||
| 730 | { | ||
| 731 | local_irq_disable(); | ||
| 732 | if (sh1 > sh2) { | ||
| 733 | spin_lock(&sh2->stripe_lock); | ||
| 734 | spin_lock_nested(&sh1->stripe_lock, 1); | ||
| 735 | } else { | ||
| 736 | spin_lock(&sh1->stripe_lock); | ||
| 737 | spin_lock_nested(&sh2->stripe_lock, 1); | ||
| 738 | } | ||
| 739 | } | ||
| 740 | |||
| 741 | static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) | ||
| 742 | { | ||
| 743 | spin_unlock(&sh1->stripe_lock); | ||
| 744 | spin_unlock(&sh2->stripe_lock); | ||
| 745 | local_irq_enable(); | ||
| 746 | } | ||
| 747 | |||
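
lock_two_stripes()/unlock_two_stripes() above take the two stripe locks in a fixed address order with interrupts off, so two CPUs batching adjacent stripes against each other can never deadlock ABBA-style. A minimal userspace sketch of the same ordering discipline, with pthread mutexes standing in for stripe_lock (all names illustrative):

#include <pthread.h>
#include <stdint.h>

struct stripe { pthread_mutex_t lock; };

/* Take both locks in a globally consistent (address) order so threads
 * locking the same pair from opposite ends cannot wait in a cycle. */
static void lock_two(struct stripe *a, struct stripe *b)
{
        if ((uintptr_t)a > (uintptr_t)b) {
                pthread_mutex_lock(&b->lock);
                pthread_mutex_lock(&a->lock);
        } else {
                pthread_mutex_lock(&a->lock);
                pthread_mutex_lock(&b->lock);
        }
}

static void unlock_two(struct stripe *a, struct stripe *b)
{
        pthread_mutex_unlock(&a->lock);
        pthread_mutex_unlock(&b->lock);
}
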
| 748 | /* Only a freshly initialized, full-stripe normal write can be added to a batch list */ | ||
| 749 | static bool stripe_can_batch(struct stripe_head *sh) | ||
| 750 | { | ||
| 751 | return test_bit(STRIPE_BATCH_READY, &sh->state) && | ||
| 752 | is_full_stripe_write(sh); | ||
| 753 | } | ||
| 754 | |||
| 755 | /* we only search backwards (toward lower sectors) for a batch head */ | ||
| 756 | static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) | ||
| 757 | { | ||
| 758 | struct stripe_head *head; | ||
| 759 | sector_t head_sector, tmp_sec; | ||
| 760 | int hash; | ||
| 761 | int dd_idx; | ||
| 762 | |||
| 763 | if (!stripe_can_batch(sh)) | ||
| 764 | return; | ||
| 765 | /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ | ||
| 766 | tmp_sec = sh->sector; | ||
| 767 | if (!sector_div(tmp_sec, conf->chunk_sectors)) | ||
| 768 | return; | ||
| 769 | head_sector = sh->sector - STRIPE_SECTORS; | ||
| 770 | |||
| 771 | hash = stripe_hash_locks_hash(head_sector); | ||
| 772 | spin_lock_irq(conf->hash_locks + hash); | ||
| 773 | head = __find_stripe(conf, head_sector, conf->generation); | ||
| 774 | if (head && !atomic_inc_not_zero(&head->count)) { | ||
| 775 | spin_lock(&conf->device_lock); | ||
| 776 | if (!atomic_read(&head->count)) { | ||
| 777 | if (!test_bit(STRIPE_HANDLE, &head->state)) | ||
| 778 | atomic_inc(&conf->active_stripes); | ||
| 779 | BUG_ON(list_empty(&head->lru) && | ||
| 780 | !test_bit(STRIPE_EXPANDING, &head->state)); | ||
| 781 | list_del_init(&head->lru); | ||
| 782 | if (head->group) { | ||
| 783 | head->group->stripes_cnt--; | ||
| 784 | head->group = NULL; | ||
| 785 | } | ||
| 786 | } | ||
| 787 | atomic_inc(&head->count); | ||
| 788 | spin_unlock(&conf->device_lock); | ||
| 789 | } | ||
| 790 | spin_unlock_irq(conf->hash_locks + hash); | ||
| 791 | |||
| 792 | if (!head) | ||
| 793 | return; | ||
| 794 | if (!stripe_can_batch(head)) | ||
| 795 | goto out; | ||
| 796 | |||
| 797 | lock_two_stripes(head, sh); | ||
| 798 | /* clear_batch_ready() clears the flag */ | ||
| 799 | if (!stripe_can_batch(head) || !stripe_can_batch(sh)) | ||
| 800 | goto unlock_out; | ||
| 801 | |||
| 802 | if (sh->batch_head) | ||
| 803 | goto unlock_out; | ||
| 804 | |||
| 805 | dd_idx = 0; | ||
| 806 | while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) | ||
| 807 | dd_idx++; | ||
| 808 | if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw) | ||
| 809 | goto unlock_out; | ||
| 810 | |||
| 811 | if (head->batch_head) { | ||
| 812 | spin_lock(&head->batch_head->batch_lock); | ||
| 813 | /* This batch list is already running */ | ||
| 814 | if (!stripe_can_batch(head)) { | ||
| 815 | spin_unlock(&head->batch_head->batch_lock); | ||
| 816 | goto unlock_out; | ||
| 817 | } | ||
| 818 | |||
| 819 | /* | ||
| 820 | * at this point, head's BATCH_READY could be cleared, but we | ||
| 821 | * can still add the stripe to batch list | ||
| 822 | */ | ||
| 823 | list_add(&sh->batch_list, &head->batch_list); | ||
| 824 | spin_unlock(&head->batch_head->batch_lock); | ||
| 825 | |||
| 826 | sh->batch_head = head->batch_head; | ||
| 827 | } else { | ||
| 828 | head->batch_head = head; | ||
| 829 | sh->batch_head = head->batch_head; | ||
| 830 | spin_lock(&head->batch_lock); | ||
| 831 | list_add_tail(&sh->batch_list, &head->batch_list); | ||
| 832 | spin_unlock(&head->batch_lock); | ||
| 833 | } | ||
| 834 | |||
| 835 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
| 836 | if (atomic_dec_return(&conf->preread_active_stripes) | ||
| 837 | < IO_THRESHOLD) | ||
| 838 | md_wakeup_thread(conf->mddev->thread); | ||
| 839 | |||
| 840 | atomic_inc(&sh->count); | ||
| 841 | unlock_out: | ||
| 842 | unlock_two_stripes(head, sh); | ||
| 843 | out: | ||
| 844 | release_stripe(head); | ||
| 845 | } | ||
| 846 | |||
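
In stripe_add_to_batch_list() above, atomic_inc_not_zero(&head->count) takes a reference on the candidate head only if it is still live; when the count has already hit zero the stripe is on its way to an inactive list, so the code retries under device_lock and undoes the LRU bookkeeping a zero count implies. The primitive itself is just a compare-exchange loop; a C11 sketch of the idea (not the kernel implementation):

#include <stdatomic.h>
#include <stdbool.h>

/* Take a reference only while the object is still live: increment *v
 * unless it is zero.  A sketch of the atomic_inc_not_zero() idea. */
static bool inc_not_zero(atomic_int *v)
{
        int old = atomic_load(v);

        do {
                if (old == 0)
                        return false;   /* already headed for the free list */
        } while (!atomic_compare_exchange_weak(v, &old, old + 1));
        return true;
}
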
| 711 | /* Determine if 'data_offset' or 'new_data_offset' should be used | 847 | /* Determine if 'data_offset' or 'new_data_offset' should be used |
| 712 | * in this stripe_head. | 848 | * in this stripe_head. |
| 713 | */ | 849 | */ |
| @@ -738,6 +874,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 738 | { | 874 | { |
| 739 | struct r5conf *conf = sh->raid_conf; | 875 | struct r5conf *conf = sh->raid_conf; |
| 740 | int i, disks = sh->disks; | 876 | int i, disks = sh->disks; |
| 877 | struct stripe_head *head_sh = sh; | ||
| 741 | 878 | ||
| 742 | might_sleep(); | 879 | might_sleep(); |
| 743 | 880 | ||
| @@ -746,6 +883,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 746 | int replace_only = 0; | 883 | int replace_only = 0; |
| 747 | struct bio *bi, *rbi; | 884 | struct bio *bi, *rbi; |
| 748 | struct md_rdev *rdev, *rrdev = NULL; | 885 | struct md_rdev *rdev, *rrdev = NULL; |
| 886 | |||
| 887 | sh = head_sh; | ||
| 749 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { | 888 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { |
| 750 | if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) | 889 | if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) |
| 751 | rw = WRITE_FUA; | 890 | rw = WRITE_FUA; |
| @@ -764,6 +903,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 764 | if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) | 903 | if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) |
| 765 | rw |= REQ_SYNC; | 904 | rw |= REQ_SYNC; |
| 766 | 905 | ||
| 906 | again: | ||
| 767 | bi = &sh->dev[i].req; | 907 | bi = &sh->dev[i].req; |
| 768 | rbi = &sh->dev[i].rreq; /* For writing to replacement */ | 908 | rbi = &sh->dev[i].rreq; /* For writing to replacement */ |
| 769 | 909 | ||
| @@ -782,7 +922,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 782 | /* We raced and saw duplicates */ | 922 | /* We raced and saw duplicates */ |
| 783 | rrdev = NULL; | 923 | rrdev = NULL; |
| 784 | } else { | 924 | } else { |
| 785 | if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) | 925 | if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) |
| 786 | rdev = rrdev; | 926 | rdev = rrdev; |
| 787 | rrdev = NULL; | 927 | rrdev = NULL; |
| 788 | } | 928 | } |
| @@ -853,13 +993,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 853 | __func__, (unsigned long long)sh->sector, | 993 | __func__, (unsigned long long)sh->sector, |
| 854 | bi->bi_rw, i); | 994 | bi->bi_rw, i); |
| 855 | atomic_inc(&sh->count); | 995 | atomic_inc(&sh->count); |
| 996 | if (sh != head_sh) | ||
| 997 | atomic_inc(&head_sh->count); | ||
| 856 | if (use_new_offset(conf, sh)) | 998 | if (use_new_offset(conf, sh)) |
| 857 | bi->bi_iter.bi_sector = (sh->sector | 999 | bi->bi_iter.bi_sector = (sh->sector |
| 858 | + rdev->new_data_offset); | 1000 | + rdev->new_data_offset); |
| 859 | else | 1001 | else |
| 860 | bi->bi_iter.bi_sector = (sh->sector | 1002 | bi->bi_iter.bi_sector = (sh->sector |
| 861 | + rdev->data_offset); | 1003 | + rdev->data_offset); |
| 862 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) | 1004 | if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) |
| 863 | bi->bi_rw |= REQ_NOMERGE; | 1005 | bi->bi_rw |= REQ_NOMERGE; |
| 864 | 1006 | ||
| 865 | if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) | 1007 | if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) |
| @@ -903,6 +1045,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 903 | __func__, (unsigned long long)sh->sector, | 1045 | __func__, (unsigned long long)sh->sector, |
| 904 | rbi->bi_rw, i); | 1046 | rbi->bi_rw, i); |
| 905 | atomic_inc(&sh->count); | 1047 | atomic_inc(&sh->count); |
| 1048 | if (sh != head_sh) | ||
| 1049 | atomic_inc(&head_sh->count); | ||
| 906 | if (use_new_offset(conf, sh)) | 1050 | if (use_new_offset(conf, sh)) |
| 907 | rbi->bi_iter.bi_sector = (sh->sector | 1051 | rbi->bi_iter.bi_sector = (sh->sector |
| 908 | + rrdev->new_data_offset); | 1052 | + rrdev->new_data_offset); |
| @@ -934,8 +1078,18 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 934 | pr_debug("skip op %ld on disc %d for sector %llu\n", | 1078 | pr_debug("skip op %ld on disc %d for sector %llu\n", |
| 935 | bi->bi_rw, i, (unsigned long long)sh->sector); | 1079 | bi->bi_rw, i, (unsigned long long)sh->sector); |
| 936 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | 1080 | clear_bit(R5_LOCKED, &sh->dev[i].flags); |
| 1081 | if (sh->batch_head) | ||
| 1082 | set_bit(STRIPE_BATCH_ERR, | ||
| 1083 | &sh->batch_head->state); | ||
| 937 | set_bit(STRIPE_HANDLE, &sh->state); | 1084 | set_bit(STRIPE_HANDLE, &sh->state); |
| 938 | } | 1085 | } |
| 1086 | |||
| 1087 | if (!head_sh->batch_head) | ||
| 1088 | continue; | ||
| 1089 | sh = list_first_entry(&sh->batch_list, struct stripe_head, | ||
| 1090 | batch_list); | ||
| 1091 | if (sh != head_sh) | ||
| 1092 | goto again; | ||
| 939 | } | 1093 | } |
| 940 | } | 1094 | } |
| 941 | 1095 | ||
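
ops_run_io() now issues each device's I/O for every member of a batch: head_sh remembers the batch head, and the `goto again` at the bottom advances sh around the circular batch_list until it wraps back to head_sh. A userspace sketch of the same circular-list walk (hand-rolled nodes; the kernel uses struct list_head and list_first_entry()):

#include <stdio.h>

struct node { struct node *next, *prev; int id; };

static void link_after(struct node *pos, struct node *n)
{
        n->next = pos->next;
        n->prev = pos;
        pos->next->prev = n;
        pos->next = n;
}

/* Visit the head first, then every batched member, stopping when the
 * circular list wraps back to the head -- the ops_run_io() pattern. */
static void walk_batch(struct node *head)
{
        struct node *sh = head;

        do {
                printf("issue I/O for stripe %d\n", sh->id);
                sh = sh->next;  /* list_first_entry(&sh->batch_list, ...) */
        } while (sh != head);
}

int main(void)
{
        struct node head = { &head, &head, 0 };
        struct node a = { 0 }, b = { 0 };

        a.id = 1;
        b.id = 2;
        link_after(&head, &a);
        link_after(&a, &b);
        walk_batch(&head);      /* prints stripes 0, 1, 2 */
        return 0;
}
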
| @@ -1051,6 +1205,7 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
| 1051 | struct async_submit_ctl submit; | 1205 | struct async_submit_ctl submit; |
| 1052 | int i; | 1206 | int i; |
| 1053 | 1207 | ||
| 1208 | BUG_ON(sh->batch_head); | ||
| 1054 | pr_debug("%s: stripe %llu\n", __func__, | 1209 | pr_debug("%s: stripe %llu\n", __func__, |
| 1055 | (unsigned long long)sh->sector); | 1210 | (unsigned long long)sh->sector); |
| 1056 | 1211 | ||
| @@ -1109,16 +1264,28 @@ static void ops_complete_compute(void *stripe_head_ref) | |||
| 1109 | 1264 | ||
| 1110 | /* return a pointer to the address conversion region of the scribble buffer */ | 1265 | /* return a pointer to the address conversion region of the scribble buffer */ |
| 1111 | static addr_conv_t *to_addr_conv(struct stripe_head *sh, | 1266 | static addr_conv_t *to_addr_conv(struct stripe_head *sh, |
| 1112 | struct raid5_percpu *percpu) | 1267 | struct raid5_percpu *percpu, int i) |
| 1113 | { | 1268 | { |
| 1114 | return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); | 1269 | void *addr; |
| 1270 | |||
| 1271 | addr = flex_array_get(percpu->scribble, i); | ||
| 1272 | return addr + sizeof(struct page *) * (sh->disks + 2); | ||
| 1273 | } | ||
| 1274 | |||
| 1275 | /* return a pointer to the address conversion region of the scribble buffer */ | ||
| 1276 | static struct page **to_addr_page(struct raid5_percpu *percpu, int i) | ||
| 1277 | { | ||
| 1278 | void *addr; | ||
| 1279 | |||
| 1280 | addr = flex_array_get(percpu->scribble, i); | ||
| 1281 | return addr; | ||
| 1115 | } | 1282 | } |
| 1116 | 1283 | ||
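
Each flex_array element is one self-contained scribble region: disks+2 page pointers (the sources plus the P and Q destinations) followed by disks+2 addr_conv_t slots, and to_addr_page()/to_addr_conv() above simply return the two halves of element i. A sketch of the layout arithmetic under that assumption (addr_conv_t treated as pointer-sized here for illustration):

#include <stdio.h>

typedef struct { void *cookie; } addr_conv_t;   /* stand-in, pointer-sized */

/* One scribble element: (disks + 2) page pointers, then (disks + 2)
 * addr_conv_t slots -- the len computed by scribble_alloc() later on. */
static size_t element_len(int disks)
{
        return sizeof(void *) * (disks + 2) + sizeof(addr_conv_t) * (disks + 2);
}

/* The address-conversion region starts right past the page pointers,
 * exactly the offset to_addr_conv() adds to to_addr_page(). */
static addr_conv_t *addr_conv_region(void *element, int disks)
{
        return (addr_conv_t *)((char *)element + sizeof(void *) * (disks + 2));
}

int main(void)
{
        char element[4096];
        int disks = 6;          /* e.g. 4 data + P + Q */

        printf("element: %zu bytes, conv region at +%td\n",
               element_len(disks),
               (char *)addr_conv_region(element, disks) - element);
        return 0;
}
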
| 1117 | static struct dma_async_tx_descriptor * | 1284 | static struct dma_async_tx_descriptor * |
| 1118 | ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) | 1285 | ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) |
| 1119 | { | 1286 | { |
| 1120 | int disks = sh->disks; | 1287 | int disks = sh->disks; |
| 1121 | struct page **xor_srcs = percpu->scribble; | 1288 | struct page **xor_srcs = to_addr_page(percpu, 0); |
| 1122 | int target = sh->ops.target; | 1289 | int target = sh->ops.target; |
| 1123 | struct r5dev *tgt = &sh->dev[target]; | 1290 | struct r5dev *tgt = &sh->dev[target]; |
| 1124 | struct page *xor_dest = tgt->page; | 1291 | struct page *xor_dest = tgt->page; |
| @@ -1127,6 +1294,8 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
| 1127 | struct async_submit_ctl submit; | 1294 | struct async_submit_ctl submit; |
| 1128 | int i; | 1295 | int i; |
| 1129 | 1296 | ||
| 1297 | BUG_ON(sh->batch_head); | ||
| 1298 | |||
| 1130 | pr_debug("%s: stripe %llu block: %d\n", | 1299 | pr_debug("%s: stripe %llu block: %d\n", |
| 1131 | __func__, (unsigned long long)sh->sector, target); | 1300 | __func__, (unsigned long long)sh->sector, target); |
| 1132 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | 1301 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); |
| @@ -1138,7 +1307,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
| 1138 | atomic_inc(&sh->count); | 1307 | atomic_inc(&sh->count); |
| 1139 | 1308 | ||
| 1140 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, | 1309 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, |
| 1141 | ops_complete_compute, sh, to_addr_conv(sh, percpu)); | 1310 | ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); |
| 1142 | if (unlikely(count == 1)) | 1311 | if (unlikely(count == 1)) |
| 1143 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); | 1312 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); |
| 1144 | else | 1313 | else |
| @@ -1156,7 +1325,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
| 1156 | * destination buffer is recorded in srcs[count] and the Q destination | 1325 | * destination buffer is recorded in srcs[count] and the Q destination |
| 1157 | * is recorded in srcs[count+1]. | 1326 | * is recorded in srcs[count+1]. |
| 1158 | */ | 1327 | */ |
| 1159 | static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) | 1328 | static int set_syndrome_sources(struct page **srcs, |
| 1329 | struct stripe_head *sh, | ||
| 1330 | int srctype) | ||
| 1160 | { | 1331 | { |
| 1161 | int disks = sh->disks; | 1332 | int disks = sh->disks; |
| 1162 | int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); | 1333 | int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); |
| @@ -1171,8 +1342,15 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) | |||
| 1171 | i = d0_idx; | 1342 | i = d0_idx; |
| 1172 | do { | 1343 | do { |
| 1173 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | 1344 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); |
| 1345 | struct r5dev *dev = &sh->dev[i]; | ||
| 1174 | 1346 | ||
| 1175 | srcs[slot] = sh->dev[i].page; | 1347 | if (i == sh->qd_idx || i == sh->pd_idx || |
| 1348 | (srctype == SYNDROME_SRC_ALL) || | ||
| 1349 | (srctype == SYNDROME_SRC_WANT_DRAIN && | ||
| 1350 | test_bit(R5_Wantdrain, &dev->flags)) || | ||
| 1351 | (srctype == SYNDROME_SRC_WRITTEN && | ||
| 1352 | dev->written)) | ||
| 1353 | srcs[slot] = sh->dev[i].page; | ||
| 1176 | i = raid6_next_disk(i, disks); | 1354 | i = raid6_next_disk(i, disks); |
| 1177 | } while (i != d0_idx); | 1355 | } while (i != d0_idx); |
| 1178 | 1356 | ||
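
set_syndrome_sources() now takes a selector: SYNDROME_SRC_ALL feeds every block to the syndrome engine, SYNDROME_SRC_WANT_DRAIN picks only blocks about to be drained (so the new RAID6 prexor can subtract them from P/Q), and SYNDROME_SRC_WRITTEN picks only freshly written blocks for the post-drain recompute; parity slots always qualify, and unselected slots stay NULL from the earlier memset. A compact sketch of that filter shape (flags reduced to booleans, slot mapping omitted):

#include <stdbool.h>
#include <string.h>

enum src_type { SRC_ALL, SRC_WANT_DRAIN, SRC_WRITTEN };

struct dev { bool is_parity, want_drain, written; void *page; };

/* Fill srcs[] with the pages the async engine should read; slots that
 * do not qualify stay NULL, as after the memset in set_syndrome_sources(). */
static void select_sources(void **srcs, struct dev *devs, int n,
                           enum src_type t)
{
        memset(srcs, 0, sizeof(void *) * (size_t)n);
        for (int i = 0; i < n; i++) {
                struct dev *d = &devs[i];

                if (d->is_parity || t == SRC_ALL ||
                    (t == SRC_WANT_DRAIN && d->want_drain) ||
                    (t == SRC_WRITTEN && d->written))
                        srcs[i] = d->page;
        }
}
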
| @@ -1183,7 +1361,7 @@ static struct dma_async_tx_descriptor * | |||
| 1183 | ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) | 1361 | ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) |
| 1184 | { | 1362 | { |
| 1185 | int disks = sh->disks; | 1363 | int disks = sh->disks; |
| 1186 | struct page **blocks = percpu->scribble; | 1364 | struct page **blocks = to_addr_page(percpu, 0); |
| 1187 | int target; | 1365 | int target; |
| 1188 | int qd_idx = sh->qd_idx; | 1366 | int qd_idx = sh->qd_idx; |
| 1189 | struct dma_async_tx_descriptor *tx; | 1367 | struct dma_async_tx_descriptor *tx; |
| @@ -1193,6 +1371,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
| 1193 | int i; | 1371 | int i; |
| 1194 | int count; | 1372 | int count; |
| 1195 | 1373 | ||
| 1374 | BUG_ON(sh->batch_head); | ||
| 1196 | if (sh->ops.target < 0) | 1375 | if (sh->ops.target < 0) |
| 1197 | target = sh->ops.target2; | 1376 | target = sh->ops.target2; |
| 1198 | else if (sh->ops.target2 < 0) | 1377 | else if (sh->ops.target2 < 0) |
| @@ -1211,12 +1390,12 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
| 1211 | atomic_inc(&sh->count); | 1390 | atomic_inc(&sh->count); |
| 1212 | 1391 | ||
| 1213 | if (target == qd_idx) { | 1392 | if (target == qd_idx) { |
| 1214 | count = set_syndrome_sources(blocks, sh); | 1393 | count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); |
| 1215 | blocks[count] = NULL; /* regenerating p is not necessary */ | 1394 | blocks[count] = NULL; /* regenerating p is not necessary */ |
| 1216 | BUG_ON(blocks[count+1] != dest); /* q should already be set */ | 1395 | BUG_ON(blocks[count+1] != dest); /* q should already be set */ |
| 1217 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, | 1396 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, |
| 1218 | ops_complete_compute, sh, | 1397 | ops_complete_compute, sh, |
| 1219 | to_addr_conv(sh, percpu)); | 1398 | to_addr_conv(sh, percpu, 0)); |
| 1220 | tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); | 1399 | tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); |
| 1221 | } else { | 1400 | } else { |
| 1222 | /* Compute any data- or p-drive using XOR */ | 1401 | /* Compute any data- or p-drive using XOR */ |
| @@ -1229,7 +1408,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
| 1229 | 1408 | ||
| 1230 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, | 1409 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, |
| 1231 | NULL, ops_complete_compute, sh, | 1410 | NULL, ops_complete_compute, sh, |
| 1232 | to_addr_conv(sh, percpu)); | 1411 | to_addr_conv(sh, percpu, 0)); |
| 1233 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); | 1412 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); |
| 1234 | } | 1413 | } |
| 1235 | 1414 | ||
| @@ -1248,9 +1427,10 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
| 1248 | struct r5dev *tgt = &sh->dev[target]; | 1427 | struct r5dev *tgt = &sh->dev[target]; |
| 1249 | struct r5dev *tgt2 = &sh->dev[target2]; | 1428 | struct r5dev *tgt2 = &sh->dev[target2]; |
| 1250 | struct dma_async_tx_descriptor *tx; | 1429 | struct dma_async_tx_descriptor *tx; |
| 1251 | struct page **blocks = percpu->scribble; | 1430 | struct page **blocks = to_addr_page(percpu, 0); |
| 1252 | struct async_submit_ctl submit; | 1431 | struct async_submit_ctl submit; |
| 1253 | 1432 | ||
| 1433 | BUG_ON(sh->batch_head); | ||
| 1254 | pr_debug("%s: stripe %llu block1: %d block2: %d\n", | 1434 | pr_debug("%s: stripe %llu block1: %d block2: %d\n", |
| 1255 | __func__, (unsigned long long)sh->sector, target, target2); | 1435 | __func__, (unsigned long long)sh->sector, target, target2); |
| 1256 | BUG_ON(target < 0 || target2 < 0); | 1436 | BUG_ON(target < 0 || target2 < 0); |
| @@ -1290,7 +1470,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
| 1290 | /* Missing P+Q, just recompute */ | 1470 | /* Missing P+Q, just recompute */ |
| 1291 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, | 1471 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, |
| 1292 | ops_complete_compute, sh, | 1472 | ops_complete_compute, sh, |
| 1293 | to_addr_conv(sh, percpu)); | 1473 | to_addr_conv(sh, percpu, 0)); |
| 1294 | return async_gen_syndrome(blocks, 0, syndrome_disks+2, | 1474 | return async_gen_syndrome(blocks, 0, syndrome_disks+2, |
| 1295 | STRIPE_SIZE, &submit); | 1475 | STRIPE_SIZE, &submit); |
| 1296 | } else { | 1476 | } else { |
| @@ -1314,21 +1494,21 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
| 1314 | init_async_submit(&submit, | 1494 | init_async_submit(&submit, |
| 1315 | ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, | 1495 | ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, |
| 1316 | NULL, NULL, NULL, | 1496 | NULL, NULL, NULL, |
| 1317 | to_addr_conv(sh, percpu)); | 1497 | to_addr_conv(sh, percpu, 0)); |
| 1318 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, | 1498 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, |
| 1319 | &submit); | 1499 | &submit); |
| 1320 | 1500 | ||
| 1321 | count = set_syndrome_sources(blocks, sh); | 1501 | count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); |
| 1322 | init_async_submit(&submit, ASYNC_TX_FENCE, tx, | 1502 | init_async_submit(&submit, ASYNC_TX_FENCE, tx, |
| 1323 | ops_complete_compute, sh, | 1503 | ops_complete_compute, sh, |
| 1324 | to_addr_conv(sh, percpu)); | 1504 | to_addr_conv(sh, percpu, 0)); |
| 1325 | return async_gen_syndrome(blocks, 0, count+2, | 1505 | return async_gen_syndrome(blocks, 0, count+2, |
| 1326 | STRIPE_SIZE, &submit); | 1506 | STRIPE_SIZE, &submit); |
| 1327 | } | 1507 | } |
| 1328 | } else { | 1508 | } else { |
| 1329 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, | 1509 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, |
| 1330 | ops_complete_compute, sh, | 1510 | ops_complete_compute, sh, |
| 1331 | to_addr_conv(sh, percpu)); | 1511 | to_addr_conv(sh, percpu, 0)); |
| 1332 | if (failb == syndrome_disks) { | 1512 | if (failb == syndrome_disks) { |
| 1333 | /* We're missing D+P. */ | 1513 | /* We're missing D+P. */ |
| 1334 | return async_raid6_datap_recov(syndrome_disks+2, | 1514 | return async_raid6_datap_recov(syndrome_disks+2, |
| @@ -1352,17 +1532,18 @@ static void ops_complete_prexor(void *stripe_head_ref) | |||
| 1352 | } | 1532 | } |
| 1353 | 1533 | ||
| 1354 | static struct dma_async_tx_descriptor * | 1534 | static struct dma_async_tx_descriptor * |
| 1355 | ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, | 1535 | ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, |
| 1356 | struct dma_async_tx_descriptor *tx) | 1536 | struct dma_async_tx_descriptor *tx) |
| 1357 | { | 1537 | { |
| 1358 | int disks = sh->disks; | 1538 | int disks = sh->disks; |
| 1359 | struct page **xor_srcs = percpu->scribble; | 1539 | struct page **xor_srcs = to_addr_page(percpu, 0); |
| 1360 | int count = 0, pd_idx = sh->pd_idx, i; | 1540 | int count = 0, pd_idx = sh->pd_idx, i; |
| 1361 | struct async_submit_ctl submit; | 1541 | struct async_submit_ctl submit; |
| 1362 | 1542 | ||
| 1363 | /* existing parity data subtracted */ | 1543 | /* existing parity data subtracted */ |
| 1364 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 1544 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; |
| 1365 | 1545 | ||
| 1546 | BUG_ON(sh->batch_head); | ||
| 1366 | pr_debug("%s: stripe %llu\n", __func__, | 1547 | pr_debug("%s: stripe %llu\n", __func__, |
| 1367 | (unsigned long long)sh->sector); | 1548 | (unsigned long long)sh->sector); |
| 1368 | 1549 | ||
| @@ -1374,31 +1555,56 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
| 1374 | } | 1555 | } |
| 1375 | 1556 | ||
| 1376 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, | 1557 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, |
| 1377 | ops_complete_prexor, sh, to_addr_conv(sh, percpu)); | 1558 | ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); |
| 1378 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); | 1559 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
| 1379 | 1560 | ||
| 1380 | return tx; | 1561 | return tx; |
| 1381 | } | 1562 | } |
| 1382 | 1563 | ||
| 1383 | static struct dma_async_tx_descriptor * | 1564 | static struct dma_async_tx_descriptor * |
| 1565 | ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, | ||
| 1566 | struct dma_async_tx_descriptor *tx) | ||
| 1567 | { | ||
| 1568 | struct page **blocks = to_addr_page(percpu, 0); | ||
| 1569 | int count; | ||
| 1570 | struct async_submit_ctl submit; | ||
| 1571 | |||
| 1572 | pr_debug("%s: stripe %llu\n", __func__, | ||
| 1573 | (unsigned long long)sh->sector); | ||
| 1574 | |||
| 1575 | count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); | ||
| 1576 | |||
| 1577 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, | ||
| 1578 | ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); | ||
| 1579 | tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); | ||
| 1580 | |||
| 1581 | return tx; | ||
| 1582 | } | ||
| 1583 | |||
| 1584 | static struct dma_async_tx_descriptor * | ||
| 1384 | ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | 1585 | ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) |
| 1385 | { | 1586 | { |
| 1386 | int disks = sh->disks; | 1587 | int disks = sh->disks; |
| 1387 | int i; | 1588 | int i; |
| 1589 | struct stripe_head *head_sh = sh; | ||
| 1388 | 1590 | ||
| 1389 | pr_debug("%s: stripe %llu\n", __func__, | 1591 | pr_debug("%s: stripe %llu\n", __func__, |
| 1390 | (unsigned long long)sh->sector); | 1592 | (unsigned long long)sh->sector); |
| 1391 | 1593 | ||
| 1392 | for (i = disks; i--; ) { | 1594 | for (i = disks; i--; ) { |
| 1393 | struct r5dev *dev = &sh->dev[i]; | 1595 | struct r5dev *dev; |
| 1394 | struct bio *chosen; | 1596 | struct bio *chosen; |
| 1395 | 1597 | ||
| 1396 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { | 1598 | sh = head_sh; |
| 1599 | if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { | ||
| 1397 | struct bio *wbi; | 1600 | struct bio *wbi; |
| 1398 | 1601 | ||
| 1602 | again: | ||
| 1603 | dev = &sh->dev[i]; | ||
| 1399 | spin_lock_irq(&sh->stripe_lock); | 1604 | spin_lock_irq(&sh->stripe_lock); |
| 1400 | chosen = dev->towrite; | 1605 | chosen = dev->towrite; |
| 1401 | dev->towrite = NULL; | 1606 | dev->towrite = NULL; |
| 1607 | sh->overwrite_disks = 0; | ||
| 1402 | BUG_ON(dev->written); | 1608 | BUG_ON(dev->written); |
| 1403 | wbi = dev->written = chosen; | 1609 | wbi = dev->written = chosen; |
| 1404 | spin_unlock_irq(&sh->stripe_lock); | 1610 | spin_unlock_irq(&sh->stripe_lock); |
| @@ -1423,6 +1629,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
| 1423 | } | 1629 | } |
| 1424 | wbi = r5_next_bio(wbi, dev->sector); | 1630 | wbi = r5_next_bio(wbi, dev->sector); |
| 1425 | } | 1631 | } |
| 1632 | |||
| 1633 | if (head_sh->batch_head) { | ||
| 1634 | sh = list_first_entry(&sh->batch_list, | ||
| 1635 | struct stripe_head, | ||
| 1636 | batch_list); | ||
| 1637 | if (sh == head_sh) | ||
| 1638 | continue; | ||
| 1639 | goto again; | ||
| 1640 | } | ||
| 1426 | } | 1641 | } |
| 1427 | } | 1642 | } |
| 1428 | 1643 | ||
| @@ -1478,12 +1693,15 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
| 1478 | struct dma_async_tx_descriptor *tx) | 1693 | struct dma_async_tx_descriptor *tx) |
| 1479 | { | 1694 | { |
| 1480 | int disks = sh->disks; | 1695 | int disks = sh->disks; |
| 1481 | struct page **xor_srcs = percpu->scribble; | 1696 | struct page **xor_srcs; |
| 1482 | struct async_submit_ctl submit; | 1697 | struct async_submit_ctl submit; |
| 1483 | int count = 0, pd_idx = sh->pd_idx, i; | 1698 | int count, pd_idx = sh->pd_idx, i; |
| 1484 | struct page *xor_dest; | 1699 | struct page *xor_dest; |
| 1485 | int prexor = 0; | 1700 | int prexor = 0; |
| 1486 | unsigned long flags; | 1701 | unsigned long flags; |
| 1702 | int j = 0; | ||
| 1703 | struct stripe_head *head_sh = sh; | ||
| 1704 | int last_stripe; | ||
| 1487 | 1705 | ||
| 1488 | pr_debug("%s: stripe %llu\n", __func__, | 1706 | pr_debug("%s: stripe %llu\n", __func__, |
| 1489 | (unsigned long long)sh->sector); | 1707 | (unsigned long long)sh->sector); |
| @@ -1500,15 +1718,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
| 1500 | ops_complete_reconstruct(sh); | 1718 | ops_complete_reconstruct(sh); |
| 1501 | return; | 1719 | return; |
| 1502 | } | 1720 | } |
| 1721 | again: | ||
| 1722 | count = 0; | ||
| 1723 | xor_srcs = to_addr_page(percpu, j); | ||
| 1503 | /* check if prexor is active which means only process blocks | 1724 | /* check if prexor is active which means only process blocks |
| 1504 | * that are part of a read-modify-write (written) | 1725 | * that are part of a read-modify-write (written) |
| 1505 | */ | 1726 | */ |
| 1506 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { | 1727 | if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { |
| 1507 | prexor = 1; | 1728 | prexor = 1; |
| 1508 | xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 1729 | xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; |
| 1509 | for (i = disks; i--; ) { | 1730 | for (i = disks; i--; ) { |
| 1510 | struct r5dev *dev = &sh->dev[i]; | 1731 | struct r5dev *dev = &sh->dev[i]; |
| 1511 | if (dev->written) | 1732 | if (head_sh->dev[i].written) |
| 1512 | xor_srcs[count++] = dev->page; | 1733 | xor_srcs[count++] = dev->page; |
| 1513 | } | 1734 | } |
| 1514 | } else { | 1735 | } else { |
| @@ -1525,17 +1746,32 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
| 1525 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST | 1746 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST |
| 1526 | * for the synchronous xor case | 1747 | * for the synchronous xor case |
| 1527 | */ | 1748 | */ |
| 1528 | flags = ASYNC_TX_ACK | | 1749 | last_stripe = !head_sh->batch_head || |
| 1529 | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); | 1750 | list_first_entry(&sh->batch_list, |
| 1530 | 1751 | struct stripe_head, batch_list) == head_sh; | |
| 1531 | atomic_inc(&sh->count); | 1752 | if (last_stripe) { |
| 1753 | flags = ASYNC_TX_ACK | | ||
| 1754 | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); | ||
| 1755 | |||
| 1756 | atomic_inc(&head_sh->count); | ||
| 1757 | init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, | ||
| 1758 | to_addr_conv(sh, percpu, j)); | ||
| 1759 | } else { | ||
| 1760 | flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; | ||
| 1761 | init_async_submit(&submit, flags, tx, NULL, NULL, | ||
| 1762 | to_addr_conv(sh, percpu, j)); | ||
| 1763 | } | ||
| 1532 | 1764 | ||
| 1533 | init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, | ||
| 1534 | to_addr_conv(sh, percpu)); | ||
| 1535 | if (unlikely(count == 1)) | 1765 | if (unlikely(count == 1)) |
| 1536 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); | 1766 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); |
| 1537 | else | 1767 | else |
| 1538 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); | 1768 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
| 1769 | if (!last_stripe) { | ||
| 1770 | j++; | ||
| 1771 | sh = list_first_entry(&sh->batch_list, struct stripe_head, | ||
| 1772 | batch_list); | ||
| 1773 | goto again; | ||
| 1774 | } | ||
| 1539 | } | 1775 | } |
| 1540 | 1776 | ||
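
ops_run_reconstruct5() now submits one xor per batch member, chaining each to the previous tx, and only the final submission (last_stripe) carries ASYNC_TX_ACK plus the ops_complete_reconstruct callback against head_sh, so completion fires exactly once for the whole batch. A sketch of that submit-chain shape with plain function pointers (async_tx descriptors and chaining elided):

#include <stdio.h>

typedef void (*done_fn)(void *arg);

/* One submission per batch member, each chained to the previous; only
 * the last carries the completion callback, like the last_stripe logic. */
static void submit_chain(int nmembers, done_fn done, void *arg)
{
        for (int j = 0; j < nmembers; j++) {
                int last = (j == nmembers - 1);

                printf("submit member %d%s\n", j,
                       last ? " (ASYNC_TX_ACK + callback)" : "");
                if (last)
                        done(arg);      /* really invoked by the async engine */
        }
}

static void reconstruct_done(void *arg)
{
        printf("reconstruct complete for head %p\n", arg);
}

int main(void)
{
        int head_token = 0;

        submit_chain(3, reconstruct_done, &head_token);
        return 0;
}
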
| 1541 | static void | 1777 | static void |
| @@ -1543,8 +1779,12 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
| 1543 | struct dma_async_tx_descriptor *tx) | 1779 | struct dma_async_tx_descriptor *tx) |
| 1544 | { | 1780 | { |
| 1545 | struct async_submit_ctl submit; | 1781 | struct async_submit_ctl submit; |
| 1546 | struct page **blocks = percpu->scribble; | 1782 | struct page **blocks; |
| 1547 | int count, i; | 1783 | int count, i, j = 0; |
| 1784 | struct stripe_head *head_sh = sh; | ||
| 1785 | int last_stripe; | ||
| 1786 | int synflags; | ||
| 1787 | unsigned long txflags; | ||
| 1548 | 1788 | ||
| 1549 | pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); | 1789 | pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); |
| 1550 | 1790 | ||
| @@ -1562,13 +1802,36 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
| 1562 | return; | 1802 | return; |
| 1563 | } | 1803 | } |
| 1564 | 1804 | ||
| 1565 | count = set_syndrome_sources(blocks, sh); | 1805 | again: |
| 1806 | blocks = to_addr_page(percpu, j); | ||
| 1566 | 1807 | ||
| 1567 | atomic_inc(&sh->count); | 1808 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { |
| 1809 | synflags = SYNDROME_SRC_WRITTEN; | ||
| 1810 | txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; | ||
| 1811 | } else { | ||
| 1812 | synflags = SYNDROME_SRC_ALL; | ||
| 1813 | txflags = ASYNC_TX_ACK; | ||
| 1814 | } | ||
| 1815 | |||
| 1816 | count = set_syndrome_sources(blocks, sh, synflags); | ||
| 1817 | last_stripe = !head_sh->batch_head || | ||
| 1818 | list_first_entry(&sh->batch_list, | ||
| 1819 | struct stripe_head, batch_list) == head_sh; | ||
| 1568 | 1820 | ||
| 1569 | init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, | 1821 | if (last_stripe) { |
| 1570 | sh, to_addr_conv(sh, percpu)); | 1822 | atomic_inc(&head_sh->count); |
| 1823 | init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, | ||
| 1824 | head_sh, to_addr_conv(sh, percpu, j)); | ||
| 1825 | } else | ||
| 1826 | init_async_submit(&submit, 0, tx, NULL, NULL, | ||
| 1827 | to_addr_conv(sh, percpu, j)); | ||
| 1571 | async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); | 1828 | async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); |
| 1829 | if (!last_stripe) { | ||
| 1830 | j++; | ||
| 1831 | sh = list_first_entry(&sh->batch_list, struct stripe_head, | ||
| 1832 | batch_list); | ||
| 1833 | goto again; | ||
| 1834 | } | ||
| 1572 | } | 1835 | } |
| 1573 | 1836 | ||
| 1574 | static void ops_complete_check(void *stripe_head_ref) | 1837 | static void ops_complete_check(void *stripe_head_ref) |
| @@ -1589,7 +1852,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
| 1589 | int pd_idx = sh->pd_idx; | 1852 | int pd_idx = sh->pd_idx; |
| 1590 | int qd_idx = sh->qd_idx; | 1853 | int qd_idx = sh->qd_idx; |
| 1591 | struct page *xor_dest; | 1854 | struct page *xor_dest; |
| 1592 | struct page **xor_srcs = percpu->scribble; | 1855 | struct page **xor_srcs = to_addr_page(percpu, 0); |
| 1593 | struct dma_async_tx_descriptor *tx; | 1856 | struct dma_async_tx_descriptor *tx; |
| 1594 | struct async_submit_ctl submit; | 1857 | struct async_submit_ctl submit; |
| 1595 | int count; | 1858 | int count; |
| @@ -1598,6 +1861,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
| 1598 | pr_debug("%s: stripe %llu\n", __func__, | 1861 | pr_debug("%s: stripe %llu\n", __func__, |
| 1599 | (unsigned long long)sh->sector); | 1862 | (unsigned long long)sh->sector); |
| 1600 | 1863 | ||
| 1864 | BUG_ON(sh->batch_head); | ||
| 1601 | count = 0; | 1865 | count = 0; |
| 1602 | xor_dest = sh->dev[pd_idx].page; | 1866 | xor_dest = sh->dev[pd_idx].page; |
| 1603 | xor_srcs[count++] = xor_dest; | 1867 | xor_srcs[count++] = xor_dest; |
| @@ -1608,7 +1872,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
| 1608 | } | 1872 | } |
| 1609 | 1873 | ||
| 1610 | init_async_submit(&submit, 0, NULL, NULL, NULL, | 1874 | init_async_submit(&submit, 0, NULL, NULL, NULL, |
| 1611 | to_addr_conv(sh, percpu)); | 1875 | to_addr_conv(sh, percpu, 0)); |
| 1612 | tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 1876 | tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, |
| 1613 | &sh->ops.zero_sum_result, &submit); | 1877 | &sh->ops.zero_sum_result, &submit); |
| 1614 | 1878 | ||
| @@ -1619,20 +1883,21 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) | |||
| 1619 | 1883 | ||
| 1620 | static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) | 1884 | static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) |
| 1621 | { | 1885 | { |
| 1622 | struct page **srcs = percpu->scribble; | 1886 | struct page **srcs = to_addr_page(percpu, 0); |
| 1623 | struct async_submit_ctl submit; | 1887 | struct async_submit_ctl submit; |
| 1624 | int count; | 1888 | int count; |
| 1625 | 1889 | ||
| 1626 | pr_debug("%s: stripe %llu checkp: %d\n", __func__, | 1890 | pr_debug("%s: stripe %llu checkp: %d\n", __func__, |
| 1627 | (unsigned long long)sh->sector, checkp); | 1891 | (unsigned long long)sh->sector, checkp); |
| 1628 | 1892 | ||
| 1629 | count = set_syndrome_sources(srcs, sh); | 1893 | BUG_ON(sh->batch_head); |
| 1894 | count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); | ||
| 1630 | if (!checkp) | 1895 | if (!checkp) |
| 1631 | srcs[count] = NULL; | 1896 | srcs[count] = NULL; |
| 1632 | 1897 | ||
| 1633 | atomic_inc(&sh->count); | 1898 | atomic_inc(&sh->count); |
| 1634 | init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, | 1899 | init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, |
| 1635 | sh, to_addr_conv(sh, percpu)); | 1900 | sh, to_addr_conv(sh, percpu, 0)); |
| 1636 | async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, | 1901 | async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, |
| 1637 | &sh->ops.zero_sum_result, percpu->spare_page, &submit); | 1902 | &sh->ops.zero_sum_result, percpu->spare_page, &submit); |
| 1638 | } | 1903 | } |
| @@ -1667,8 +1932,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
| 1667 | async_tx_ack(tx); | 1932 | async_tx_ack(tx); |
| 1668 | } | 1933 | } |
| 1669 | 1934 | ||
| 1670 | if (test_bit(STRIPE_OP_PREXOR, &ops_request)) | 1935 | if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { |
| 1671 | tx = ops_run_prexor(sh, percpu, tx); | 1936 | if (level < 6) |
| 1937 | tx = ops_run_prexor5(sh, percpu, tx); | ||
| 1938 | else | ||
| 1939 | tx = ops_run_prexor6(sh, percpu, tx); | ||
| 1940 | } | ||
| 1672 | 1941 | ||
| 1673 | if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { | 1942 | if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { |
| 1674 | tx = ops_run_biodrain(sh, tx); | 1943 | tx = ops_run_biodrain(sh, tx); |
| @@ -1693,7 +1962,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
| 1693 | BUG(); | 1962 | BUG(); |
| 1694 | } | 1963 | } |
| 1695 | 1964 | ||
| 1696 | if (overlap_clear) | 1965 | if (overlap_clear && !sh->batch_head) |
| 1697 | for (i = disks; i--; ) { | 1966 | for (i = disks; i--; ) { |
| 1698 | struct r5dev *dev = &sh->dev[i]; | 1967 | struct r5dev *dev = &sh->dev[i]; |
| 1699 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) | 1968 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) |
| @@ -1702,10 +1971,10 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
| 1702 | put_cpu(); | 1971 | put_cpu(); |
| 1703 | } | 1972 | } |
| 1704 | 1973 | ||
| 1705 | static int grow_one_stripe(struct r5conf *conf, int hash) | 1974 | static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) |
| 1706 | { | 1975 | { |
| 1707 | struct stripe_head *sh; | 1976 | struct stripe_head *sh; |
| 1708 | sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); | 1977 | sh = kmem_cache_zalloc(conf->slab_cache, gfp); |
| 1709 | if (!sh) | 1978 | if (!sh) |
| 1710 | return 0; | 1979 | return 0; |
| 1711 | 1980 | ||
| @@ -1713,17 +1982,23 @@ static int grow_one_stripe(struct r5conf *conf, int hash) | |||
| 1713 | 1982 | ||
| 1714 | spin_lock_init(&sh->stripe_lock); | 1983 | spin_lock_init(&sh->stripe_lock); |
| 1715 | 1984 | ||
| 1716 | if (grow_buffers(sh)) { | 1985 | if (grow_buffers(sh, gfp)) { |
| 1717 | shrink_buffers(sh); | 1986 | shrink_buffers(sh); |
| 1718 | kmem_cache_free(conf->slab_cache, sh); | 1987 | kmem_cache_free(conf->slab_cache, sh); |
| 1719 | return 0; | 1988 | return 0; |
| 1720 | } | 1989 | } |
| 1721 | sh->hash_lock_index = hash; | 1990 | sh->hash_lock_index = |
| 1991 | conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; | ||
| 1722 | /* we just created an active stripe so... */ | 1992 | /* we just created an active stripe so... */ |
| 1723 | atomic_set(&sh->count, 1); | 1993 | atomic_set(&sh->count, 1); |
| 1724 | atomic_inc(&conf->active_stripes); | 1994 | atomic_inc(&conf->active_stripes); |
| 1725 | INIT_LIST_HEAD(&sh->lru); | 1995 | INIT_LIST_HEAD(&sh->lru); |
| 1996 | |||
| 1997 | spin_lock_init(&sh->batch_lock); | ||
| 1998 | INIT_LIST_HEAD(&sh->batch_list); | ||
| 1999 | sh->batch_head = NULL; | ||
| 1726 | release_stripe(sh); | 2000 | release_stripe(sh); |
| 2001 | conf->max_nr_stripes++; | ||
| 1727 | return 1; | 2002 | return 1; |
| 1728 | } | 2003 | } |
| 1729 | 2004 | ||
| @@ -1731,7 +2006,6 @@ static int grow_stripes(struct r5conf *conf, int num) | |||
| 1731 | { | 2006 | { |
| 1732 | struct kmem_cache *sc; | 2007 | struct kmem_cache *sc; |
| 1733 | int devs = max(conf->raid_disks, conf->previous_raid_disks); | 2008 | int devs = max(conf->raid_disks, conf->previous_raid_disks); |
| 1734 | int hash; | ||
| 1735 | 2009 | ||
| 1736 | if (conf->mddev->gendisk) | 2010 | if (conf->mddev->gendisk) |
| 1737 | sprintf(conf->cache_name[0], | 2011 | sprintf(conf->cache_name[0], |
| @@ -1749,13 +2023,10 @@ static int grow_stripes(struct r5conf *conf, int num) | |||
| 1749 | return 1; | 2023 | return 1; |
| 1750 | conf->slab_cache = sc; | 2024 | conf->slab_cache = sc; |
| 1751 | conf->pool_size = devs; | 2025 | conf->pool_size = devs; |
| 1752 | hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; | 2026 | while (num--) |
| 1753 | while (num--) { | 2027 | if (!grow_one_stripe(conf, GFP_KERNEL)) |
| 1754 | if (!grow_one_stripe(conf, hash)) | ||
| 1755 | return 1; | 2028 | return 1; |
| 1756 | conf->max_nr_stripes++; | 2029 | |
| 1757 | hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; | ||
| 1758 | } | ||
| 1759 | return 0; | 2030 | return 0; |
| 1760 | } | 2031 | } |
| 1761 | 2032 | ||
| @@ -1772,13 +2043,21 @@ static int grow_stripes(struct r5conf *conf, int num) | |||
| 1772 | * calculate over all devices (not just the data blocks), using zeros in place | 2043 | * calculate over all devices (not just the data blocks), using zeros in place |
| 1773 | * of the P and Q blocks. | 2044 | * of the P and Q blocks. |
| 1774 | */ | 2045 | */ |
| 1775 | static size_t scribble_len(int num) | 2046 | static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags) |
| 1776 | { | 2047 | { |
| 2048 | struct flex_array *ret; | ||
| 1777 | size_t len; | 2049 | size_t len; |
| 1778 | 2050 | ||
| 1779 | len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); | 2051 | len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); |
| 1780 | 2052 | ret = flex_array_alloc(len, cnt, flags); | |
| 1781 | return len; | 2053 | if (!ret) |
| 2054 | return NULL; | ||
| 2055 | /* always prealloc all elements, so no locking is required */ | ||
| 2056 | if (flex_array_prealloc(ret, 0, cnt, flags)) { | ||
| 2057 | flex_array_free(ret); | ||
| 2058 | return NULL; | ||
| 2059 | } | ||
| 2060 | return ret; | ||
| 1782 | } | 2061 | } |
| 1783 | 2062 | ||
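
Because a batch may span every stripe in a chunk, the per-CPU scribble buffer becomes a flex_array of chunk_sectors/STRIPE_SECTORS elements, each sized exactly like the old single buffer; preallocating all elements up front is what lets to_addr_page() call flex_array_get() with no locking. Rough footprint math under example geometry (numbers below are illustrative, not from the patch):

#include <stdio.h>

int main(void)
{
        int disks = 6;                          /* pool_size */
        int chunk_sectors = 1024;               /* 512 KiB chunk */
        int stripe_sectors = 8;                 /* STRIPE_SIZE = 4 KiB */
        /* per element: (disks + 2) page pointers + (disks + 2) addr_conv_t,
         * with addr_conv_t assumed pointer-sized for this estimate */
        size_t elem = 2 * sizeof(void *) * (disks + 2);
        int cnt = chunk_sectors / stripe_sectors;       /* stripes per chunk */

        printf("%d elements x %zu bytes = %zu bytes per CPU\n",
               cnt, elem, (size_t)cnt * elem);
        return 0;
}
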
| 1784 | static int resize_stripes(struct r5conf *conf, int newsize) | 2063 | static int resize_stripes(struct r5conf *conf, int newsize) |
| @@ -1896,16 +2175,16 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
| 1896 | err = -ENOMEM; | 2175 | err = -ENOMEM; |
| 1897 | 2176 | ||
| 1898 | get_online_cpus(); | 2177 | get_online_cpus(); |
| 1899 | conf->scribble_len = scribble_len(newsize); | ||
| 1900 | for_each_present_cpu(cpu) { | 2178 | for_each_present_cpu(cpu) { |
| 1901 | struct raid5_percpu *percpu; | 2179 | struct raid5_percpu *percpu; |
| 1902 | void *scribble; | 2180 | struct flex_array *scribble; |
| 1903 | 2181 | ||
| 1904 | percpu = per_cpu_ptr(conf->percpu, cpu); | 2182 | percpu = per_cpu_ptr(conf->percpu, cpu); |
| 1905 | scribble = kmalloc(conf->scribble_len, GFP_NOIO); | 2183 | scribble = scribble_alloc(newsize, conf->chunk_sectors / |
| 2184 | STRIPE_SECTORS, GFP_NOIO); | ||
| 1906 | 2185 | ||
| 1907 | if (scribble) { | 2186 | if (scribble) { |
| 1908 | kfree(percpu->scribble); | 2187 | flex_array_free(percpu->scribble); |
| 1909 | percpu->scribble = scribble; | 2188 | percpu->scribble = scribble; |
| 1910 | } else { | 2189 | } else { |
| 1911 | err = -ENOMEM; | 2190 | err = -ENOMEM; |
| @@ -1937,9 +2216,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
| 1937 | return err; | 2216 | return err; |
| 1938 | } | 2217 | } |
| 1939 | 2218 | ||
| 1940 | static int drop_one_stripe(struct r5conf *conf, int hash) | 2219 | static int drop_one_stripe(struct r5conf *conf) |
| 1941 | { | 2220 | { |
| 1942 | struct stripe_head *sh; | 2221 | struct stripe_head *sh; |
| 2222 | int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; | ||
| 1943 | 2223 | ||
| 1944 | spin_lock_irq(conf->hash_locks + hash); | 2224 | spin_lock_irq(conf->hash_locks + hash); |
| 1945 | sh = get_free_stripe(conf, hash); | 2225 | sh = get_free_stripe(conf, hash); |
| @@ -1950,15 +2230,15 @@ static int drop_one_stripe(struct r5conf *conf, int hash) | |||
| 1950 | shrink_buffers(sh); | 2230 | shrink_buffers(sh); |
| 1951 | kmem_cache_free(conf->slab_cache, sh); | 2231 | kmem_cache_free(conf->slab_cache, sh); |
| 1952 | atomic_dec(&conf->active_stripes); | 2232 | atomic_dec(&conf->active_stripes); |
| 2233 | conf->max_nr_stripes--; | ||
| 1953 | return 1; | 2234 | return 1; |
| 1954 | } | 2235 | } |
| 1955 | 2236 | ||
| 1956 | static void shrink_stripes(struct r5conf *conf) | 2237 | static void shrink_stripes(struct r5conf *conf) |
| 1957 | { | 2238 | { |
| 1958 | int hash; | 2239 | while (conf->max_nr_stripes && |
| 1959 | for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) | 2240 | drop_one_stripe(conf)) |
| 1960 | while (drop_one_stripe(conf, hash)) | 2241 | ; |
| 1961 | ; | ||
| 1962 | 2242 | ||
| 1963 | if (conf->slab_cache) | 2243 | if (conf->slab_cache) |
| 1964 | kmem_cache_destroy(conf->slab_cache); | 2244 | kmem_cache_destroy(conf->slab_cache); |
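
grow_one_stripe() and drop_one_stripe() now derive the hash-lock bucket from max_nr_stripes itself: growth assigns bucket n % NR_STRIPE_HASH_LOCKS and shrink frees from bucket (n - 1) % NR_STRIPE_HASH_LOCKS, so per-bucket stripe counts stay within one of each other however the cache is resized at runtime. A few lines to see the invariant (sketch):

#include <stdio.h>

#define NR_LOCKS 8      /* stands in for NR_STRIPE_HASH_LOCKS */

int main(void)
{
        int per_bucket[NR_LOCKS] = { 0 };
        int n = 0;      /* stands in for conf->max_nr_stripes */

        for (int i = 0; i < 27; i++)    /* grow: bucket n % NR_LOCKS */
                per_bucket[n++ % NR_LOCKS]++;
        for (int i = 0; i < 5; i++)     /* shrink: bucket (n - 1) % NR_LOCKS */
                per_bucket[--n % NR_LOCKS]--;

        for (int i = 0; i < NR_LOCKS; i++)      /* counts differ by at most 1 */
                printf("bucket %d: %d stripes\n", i, per_bucket[i]);
        return 0;
}
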
| @@ -2154,10 +2434,16 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
| 2154 | } | 2434 | } |
| 2155 | rdev_dec_pending(rdev, conf->mddev); | 2435 | rdev_dec_pending(rdev, conf->mddev); |
| 2156 | 2436 | ||
| 2437 | if (sh->batch_head && !uptodate) | ||
| 2438 | set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); | ||
| 2439 | |||
| 2157 | if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) | 2440 | if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) |
| 2158 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | 2441 | clear_bit(R5_LOCKED, &sh->dev[i].flags); |
| 2159 | set_bit(STRIPE_HANDLE, &sh->state); | 2442 | set_bit(STRIPE_HANDLE, &sh->state); |
| 2160 | release_stripe(sh); | 2443 | release_stripe(sh); |
| 2444 | |||
| 2445 | if (sh->batch_head && sh != sh->batch_head) | ||
| 2446 | release_stripe(sh->batch_head); | ||
| 2161 | } | 2447 | } |
| 2162 | 2448 | ||
| 2163 | static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); | 2449 | static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); |
| @@ -2535,7 +2821,7 @@ static void | |||
| 2535 | schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, | 2821 | schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, |
| 2536 | int rcw, int expand) | 2822 | int rcw, int expand) |
| 2537 | { | 2823 | { |
| 2538 | int i, pd_idx = sh->pd_idx, disks = sh->disks; | 2824 | int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; |
| 2539 | struct r5conf *conf = sh->raid_conf; | 2825 | struct r5conf *conf = sh->raid_conf; |
| 2540 | int level = conf->level; | 2826 | int level = conf->level; |
| 2541 | 2827 | ||
| @@ -2571,13 +2857,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, | |||
| 2571 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) | 2857 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) |
| 2572 | atomic_inc(&conf->pending_full_writes); | 2858 | atomic_inc(&conf->pending_full_writes); |
| 2573 | } else { | 2859 | } else { |
| 2574 | BUG_ON(level == 6); | ||
| 2575 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || | 2860 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || |
| 2576 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); | 2861 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); |
| 2862 | BUG_ON(level == 6 && | ||
| 2863 | (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || | ||
| 2864 | test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); | ||
| 2577 | 2865 | ||
| 2578 | for (i = disks; i--; ) { | 2866 | for (i = disks; i--; ) { |
| 2579 | struct r5dev *dev = &sh->dev[i]; | 2867 | struct r5dev *dev = &sh->dev[i]; |
| 2580 | if (i == pd_idx) | 2868 | if (i == pd_idx || i == qd_idx) |
| 2581 | continue; | 2869 | continue; |
| 2582 | 2870 | ||
| 2583 | if (dev->towrite && | 2871 | if (dev->towrite && |
| @@ -2624,7 +2912,8 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, | |||
| 2624 | * toread/towrite point to the first in a chain. | 2912 | * toread/towrite point to the first in a chain. |
| 2625 | * The bi_next chain must be in order. | 2913 | * The bi_next chain must be in order. |
| 2626 | */ | 2914 | */ |
| 2627 | static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) | 2915 | static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, |
| 2916 | int forwrite, int previous) | ||
| 2628 | { | 2917 | { |
| 2629 | struct bio **bip; | 2918 | struct bio **bip; |
| 2630 | struct r5conf *conf = sh->raid_conf; | 2919 | struct r5conf *conf = sh->raid_conf; |
| @@ -2643,6 +2932,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 2643 | * protect it. | 2932 | * protect it. |
| 2644 | */ | 2933 | */ |
| 2645 | spin_lock_irq(&sh->stripe_lock); | 2934 | spin_lock_irq(&sh->stripe_lock); |
| 2935 | /* Don't allow new IO added to stripes in batch list */ | ||
| 2936 | if (sh->batch_head) | ||
| 2937 | goto overlap; | ||
| 2646 | if (forwrite) { | 2938 | if (forwrite) { |
| 2647 | bip = &sh->dev[dd_idx].towrite; | 2939 | bip = &sh->dev[dd_idx].towrite; |
| 2648 | if (*bip == NULL) | 2940 | if (*bip == NULL) |
| @@ -2657,6 +2949,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 2657 | if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) | 2949 | if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) |
| 2658 | goto overlap; | 2950 | goto overlap; |
| 2659 | 2951 | ||
| 2952 | if (!forwrite || previous) | ||
| 2953 | clear_bit(STRIPE_BATCH_READY, &sh->state); | ||
| 2954 | |||
| 2660 | BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); | 2955 | BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); |
| 2661 | if (*bip) | 2956 | if (*bip) |
| 2662 | bi->bi_next = *bip; | 2957 | bi->bi_next = *bip; |
| @@ -2674,7 +2969,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 2674 | sector = bio_end_sector(bi); | 2969 | sector = bio_end_sector(bi); |
| 2675 | } | 2970 | } |
| 2676 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) | 2971 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) |
| 2677 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); | 2972 | if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) |
| 2973 | sh->overwrite_disks++; | ||
| 2678 | } | 2974 | } |
| 2679 | 2975 | ||
| 2680 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | 2976 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", |
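
add_stripe_bio() above counts a device into overwrite_disks at most once, using test_and_set_bit(R5_OVERWRITE) under stripe_lock, which is what makes the is_full_stripe_write() comparison cheap at batch time. The same idea in miniature (plain booleans in place of the atomic bit ops, single-threaded for clarity):

#include <stdbool.h>
#include <stdio.h>

struct dev { bool overwrite; };

/* Count a device at most once when its whole stripe range is covered;
 * in the kernel this is test_and_set_bit(R5_OVERWRITE) under stripe_lock. */
static void mark_overwrite(struct dev *d, int *overwrite_disks)
{
        if (!d->overwrite) {
                d->overwrite = true;
                (*overwrite_disks)++;
        }
}

int main(void)
{
        struct dev devs[4] = { { false }, { false }, { false }, { false } };
        int overwrite_disks = 0;

        mark_overwrite(&devs[1], &overwrite_disks);
        mark_overwrite(&devs[1], &overwrite_disks);     /* counted once */
        printf("overwrite_disks = %d\n", overwrite_disks);     /* 1 */
        return 0;
}
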
| @@ -2688,6 +2984,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 2688 | sh->bm_seq = conf->seq_flush+1; | 2984 | sh->bm_seq = conf->seq_flush+1; |
| 2689 | set_bit(STRIPE_BIT_DELAY, &sh->state); | 2985 | set_bit(STRIPE_BIT_DELAY, &sh->state); |
| 2690 | } | 2986 | } |
| 2987 | |||
| 2988 | if (stripe_can_batch(sh)) | ||
| 2989 | stripe_add_to_batch_list(conf, sh); | ||
| 2691 | return 1; | 2990 | return 1; |
| 2692 | 2991 | ||
| 2693 | overlap: | 2992 | overlap: |
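The overlap check above relies on each dev's toread/towrite pointing at a chain of bios kept sorted by start sector. A minimal user-space sketch of that insertion step, with simplified stand-ins for the kernel's bi_iter.bi_sector and bi_next fields (names and types here are illustrative, not the kernel's, and the real walk also handles partial-front overlap):

#include <stdbool.h>
#include <stddef.h>

struct bio {
        unsigned long long bi_sector;   /* first sector covered */
        unsigned int bi_sectors;        /* sectors in this bio */
        struct bio *bi_next;            /* next bio in the sorted chain */
};

static unsigned long long bio_end_sector(const struct bio *b)
{
        return b->bi_sector + b->bi_sectors;
}

/* Insert 'bi' into the sector-sorted chain at '*head'; refuse on overlap. */
static bool add_bio_sorted(struct bio **head, struct bio *bi)
{
        struct bio **bip = head;

        /* skip bios that end at or before our start sector */
        while (*bip && bio_end_sector(*bip) <= bi->bi_sector)
                bip = &(*bip)->bi_next;
        /* the next bio, if any, must start at or after our end sector */
        if (*bip && (*bip)->bi_sector < bio_end_sector(bi))
                return false;   /* overlap: caller sets R5_Overlap and waits */
        bi->bi_next = *bip;
        *bip = bi;
        return true;
}

On failure the real code takes the overlap: path shown above, setting R5_Overlap so the submitter waits and retries.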
| @@ -2720,6 +3019,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
| 2720 | struct bio **return_bi) | 3019 | struct bio **return_bi) |
| 2721 | { | 3020 | { |
| 2722 | int i; | 3021 | int i; |
| 3022 | BUG_ON(sh->batch_head); | ||
| 2723 | for (i = disks; i--; ) { | 3023 | for (i = disks; i--; ) { |
| 2724 | struct bio *bi; | 3024 | struct bio *bi; |
| 2725 | int bitmap_end = 0; | 3025 | int bitmap_end = 0; |
| @@ -2746,6 +3046,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
| 2746 | /* fail all writes first */ | 3046 | /* fail all writes first */ |
| 2747 | bi = sh->dev[i].towrite; | 3047 | bi = sh->dev[i].towrite; |
| 2748 | sh->dev[i].towrite = NULL; | 3048 | sh->dev[i].towrite = NULL; |
| 3049 | sh->overwrite_disks = 0; | ||
| 2749 | spin_unlock_irq(&sh->stripe_lock); | 3050 | spin_unlock_irq(&sh->stripe_lock); |
| 2750 | if (bi) | 3051 | if (bi) |
| 2751 | bitmap_end = 1; | 3052 | bitmap_end = 1; |
| @@ -2834,6 +3135,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, | |||
| 2834 | int abort = 0; | 3135 | int abort = 0; |
| 2835 | int i; | 3136 | int i; |
| 2836 | 3137 | ||
| 3138 | BUG_ON(sh->batch_head); | ||
| 2837 | clear_bit(STRIPE_SYNCING, &sh->state); | 3139 | clear_bit(STRIPE_SYNCING, &sh->state); |
| 2838 | if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) | 3140 | if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) |
| 2839 | wake_up(&conf->wait_for_overlap); | 3141 | wake_up(&conf->wait_for_overlap); |
| @@ -3064,6 +3366,7 @@ static void handle_stripe_fill(struct stripe_head *sh, | |||
| 3064 | { | 3366 | { |
| 3065 | int i; | 3367 | int i; |
| 3066 | 3368 | ||
| 3369 | BUG_ON(sh->batch_head); | ||
| 3067 | /* look for blocks to read/compute, skip this if a compute | 3370 | /* look for blocks to read/compute, skip this if a compute |
| 3068 | * is already in flight, or if the stripe contents are in the | 3371 | * is already in flight, or if the stripe contents are in the |
| 3069 | * midst of changing due to a write | 3372 | * midst of changing due to a write |
| @@ -3087,6 +3390,9 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
| 3087 | int i; | 3390 | int i; |
| 3088 | struct r5dev *dev; | 3391 | struct r5dev *dev; |
| 3089 | int discard_pending = 0; | 3392 | int discard_pending = 0; |
| 3393 | struct stripe_head *head_sh = sh; | ||
| 3394 | bool do_endio = false; | ||
| 3395 | int wakeup_nr = 0; | ||
| 3090 | 3396 | ||
| 3091 | for (i = disks; i--; ) | 3397 | for (i = disks; i--; ) |
| 3092 | if (sh->dev[i].written) { | 3398 | if (sh->dev[i].written) { |
| @@ -3102,8 +3408,11 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
| 3102 | clear_bit(R5_UPTODATE, &dev->flags); | 3408 | clear_bit(R5_UPTODATE, &dev->flags); |
| 3103 | if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { | 3409 | if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { |
| 3104 | WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); | 3410 | WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); |
| 3105 | dev->page = dev->orig_page; | ||
| 3106 | } | 3411 | } |
| 3412 | do_endio = true; | ||
| 3413 | |||
| 3414 | returnbi: | ||
| 3415 | dev->page = dev->orig_page; | ||
| 3107 | wbi = dev->written; | 3416 | wbi = dev->written; |
| 3108 | dev->written = NULL; | 3417 | dev->written = NULL; |
| 3109 | while (wbi && wbi->bi_iter.bi_sector < | 3418 | while (wbi && wbi->bi_iter.bi_sector < |
| @@ -3120,6 +3429,17 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
| 3120 | STRIPE_SECTORS, | 3429 | STRIPE_SECTORS, |
| 3121 | !test_bit(STRIPE_DEGRADED, &sh->state), | 3430 | !test_bit(STRIPE_DEGRADED, &sh->state), |
| 3122 | 0); | 3431 | 0); |
| 3432 | if (head_sh->batch_head) { | ||
| 3433 | sh = list_first_entry(&sh->batch_list, | ||
| 3434 | struct stripe_head, | ||
| 3435 | batch_list); | ||
| 3436 | if (sh != head_sh) { | ||
| 3437 | dev = &sh->dev[i]; | ||
| 3438 | goto returnbi; | ||
| 3439 | } | ||
| 3440 | } | ||
| 3441 | sh = head_sh; | ||
| 3442 | dev = &sh->dev[i]; | ||
| 3123 | } else if (test_bit(R5_Discard, &dev->flags)) | 3443 | } else if (test_bit(R5_Discard, &dev->flags)) |
| 3124 | discard_pending = 1; | 3444 | discard_pending = 1; |
| 3125 | WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); | 3445 | WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); |
| @@ -3141,8 +3461,17 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
| 3141 | * will be reinitialized | 3461 | * will be reinitialized |
| 3142 | */ | 3462 | */ |
| 3143 | spin_lock_irq(&conf->device_lock); | 3463 | spin_lock_irq(&conf->device_lock); |
| 3464 | unhash: | ||
| 3144 | remove_hash(sh); | 3465 | remove_hash(sh); |
| 3466 | if (head_sh->batch_head) { | ||
| 3467 | sh = list_first_entry(&sh->batch_list, | ||
| 3468 | struct stripe_head, batch_list); | ||
| 3469 | if (sh != head_sh) | ||
| 3470 | goto unhash; | ||
| 3471 | } | ||
| 3145 | spin_unlock_irq(&conf->device_lock); | 3472 | spin_unlock_irq(&conf->device_lock); |
| 3473 | sh = head_sh; | ||
| 3474 | |||
| 3146 | if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) | 3475 | if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) |
| 3147 | set_bit(STRIPE_HANDLE, &sh->state); | 3476 | set_bit(STRIPE_HANDLE, &sh->state); |
| 3148 | 3477 | ||
| @@ -3151,6 +3480,45 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
| 3151 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) | 3480 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) |
| 3152 | if (atomic_dec_and_test(&conf->pending_full_writes)) | 3481 | if (atomic_dec_and_test(&conf->pending_full_writes)) |
| 3153 | md_wakeup_thread(conf->mddev->thread); | 3482 | md_wakeup_thread(conf->mddev->thread); |
| 3483 | |||
| 3484 | if (!head_sh->batch_head || !do_endio) | ||
| 3485 | return; | ||
| 3486 | for (i = 0; i < head_sh->disks; i++) { | ||
| 3487 | if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) | ||
| 3488 | wakeup_nr++; | ||
| 3489 | } | ||
| 3490 | while (!list_empty(&head_sh->batch_list)) { | ||
| 3491 | int i; | ||
| 3492 | sh = list_first_entry(&head_sh->batch_list, | ||
| 3493 | struct stripe_head, batch_list); | ||
| 3494 | list_del_init(&sh->batch_list); | ||
| 3495 | |||
| 3496 | set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, | ||
| 3497 | head_sh->state & ~((1 << STRIPE_ACTIVE) | | ||
| 3498 | (1 << STRIPE_PREREAD_ACTIVE) | | ||
| 3499 | STRIPE_EXPAND_SYNC_FLAG)); | ||
| 3500 | sh->check_state = head_sh->check_state; | ||
| 3501 | sh->reconstruct_state = head_sh->reconstruct_state; | ||
| 3502 | for (i = 0; i < sh->disks; i++) { | ||
| 3503 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
| 3504 | wakeup_nr++; | ||
| 3505 | sh->dev[i].flags = head_sh->dev[i].flags; | ||
| 3506 | } | ||
| 3507 | |||
| 3508 | spin_lock_irq(&sh->stripe_lock); | ||
| 3509 | sh->batch_head = NULL; | ||
| 3510 | spin_unlock_irq(&sh->stripe_lock); | ||
| 3511 | if (sh->state & STRIPE_EXPAND_SYNC_FLAG) | ||
| 3512 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 3513 | release_stripe(sh); | ||
| 3514 | } | ||
| 3515 | |||
| 3516 | spin_lock_irq(&head_sh->stripe_lock); | ||
| 3517 | head_sh->batch_head = NULL; | ||
| 3518 | spin_unlock_irq(&head_sh->stripe_lock); | ||
| 3519 | wake_up_nr(&conf->wait_for_overlap, wakeup_nr); | ||
| 3520 | if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG) | ||
| 3521 | set_bit(STRIPE_HANDLE, &head_sh->state); | ||
| 3154 | } | 3522 | } |
| 3155 | 3523 | ||
| 3156 | static void handle_stripe_dirtying(struct r5conf *conf, | 3524 | static void handle_stripe_dirtying(struct r5conf *conf, |
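The set_mask_bits() calls above atomically rewrite a member stripe's state so that only its STRIPE_EXPAND_SYNC_FLAG bits survive, with everything else copied from the batch head (minus STRIPE_ACTIVE and STRIPE_PREREAD_ACTIVE). A user-space sketch of the primitive's semantics using C11 atomics; the kernel's version lives in <linux/bitops.h>, so this is an approximation of its behaviour, not its implementation:

#include <stdatomic.h>

/* Atomically: *v = (*v & ~mask) | bits; returns the previous value. */
static unsigned long set_mask_bits_sketch(_Atomic unsigned long *v,
                                          unsigned long mask,
                                          unsigned long bits)
{
        unsigned long old = atomic_load(v);
        unsigned long new;

        do {
                new = (old & ~mask) | bits;
        } while (!atomic_compare_exchange_weak(v, &old, new));

        return old;
}

So a call with mask = ~STRIPE_EXPAND_SYNC_FLAG keeps exactly the expand/sync bits of the target word and replaces the rest in one atomic step.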
| @@ -3161,28 +3529,27 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
| 3161 | int rmw = 0, rcw = 0, i; | 3529 | int rmw = 0, rcw = 0, i; |
| 3162 | sector_t recovery_cp = conf->mddev->recovery_cp; | 3530 | sector_t recovery_cp = conf->mddev->recovery_cp; |
| 3163 | 3531 | ||
| 3164 | /* RAID6 requires 'rcw' in current implementation. | 3532 | /* Check whether resync is now happening or should start. |
| 3165 | * Otherwise, check whether resync is now happening or should start. | ||
| 3166 | * If yes, then the array is dirty (after unclean shutdown or | 3533 | * If yes, then the array is dirty (after unclean shutdown or |
| 3167 | * initial creation), so parity in some stripes might be inconsistent. | 3534 | * initial creation), so parity in some stripes might be inconsistent. |
| 3168 | * In this case, we need to always do reconstruct-write, to ensure | 3535 | * In this case, we need to always do reconstruct-write, to ensure |
| 3169 | * that in case of drive failure or read-error correction, we | 3536 | * that in case of drive failure or read-error correction, we |
| 3170 | * generate correct data from the parity. | 3537 | * generate correct data from the parity. |
| 3171 | */ | 3538 | */ |
| 3172 | if (conf->max_degraded == 2 || | 3539 | if (conf->rmw_level == PARITY_DISABLE_RMW || |
| 3173 | (recovery_cp < MaxSector && sh->sector >= recovery_cp && | 3540 | (recovery_cp < MaxSector && sh->sector >= recovery_cp && |
| 3174 | s->failed == 0)) { | 3541 | s->failed == 0)) { |
| 3175 | /* Calculate the real rcw later - for now make it | 3542 | /* Calculate the real rcw later - for now make it |
| 3176 | * look like rcw is cheaper | 3543 | * look like rcw is cheaper |
| 3177 | */ | 3544 | */ |
| 3178 | rcw = 1; rmw = 2; | 3545 | rcw = 1; rmw = 2; |
| 3179 | pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", | 3546 | pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", |
| 3180 | conf->max_degraded, (unsigned long long)recovery_cp, | 3547 | conf->rmw_level, (unsigned long long)recovery_cp, |
| 3181 | (unsigned long long)sh->sector); | 3548 | (unsigned long long)sh->sector); |
| 3182 | } else for (i = disks; i--; ) { | 3549 | } else for (i = disks; i--; ) { |
| 3183 | /* would I have to read this buffer for read_modify_write */ | 3550 | /* would I have to read this buffer for read_modify_write */ |
| 3184 | struct r5dev *dev = &sh->dev[i]; | 3551 | struct r5dev *dev = &sh->dev[i]; |
| 3185 | if ((dev->towrite || i == sh->pd_idx) && | 3552 | if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && |
| 3186 | !test_bit(R5_LOCKED, &dev->flags) && | 3553 | !test_bit(R5_LOCKED, &dev->flags) && |
| 3187 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3554 | !(test_bit(R5_UPTODATE, &dev->flags) || |
| 3188 | test_bit(R5_Wantcompute, &dev->flags))) { | 3555 | test_bit(R5_Wantcompute, &dev->flags))) { |
| @@ -3192,7 +3559,8 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
| 3192 | rmw += 2*disks; /* cannot read it */ | 3559 | rmw += 2*disks; /* cannot read it */ |
| 3193 | } | 3560 | } |
| 3194 | /* Would I have to read this buffer for reconstruct_write */ | 3561 | /* Would I have to read this buffer for reconstruct_write */ |
| 3195 | if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && | 3562 | if (!test_bit(R5_OVERWRITE, &dev->flags) && |
| 3563 | i != sh->pd_idx && i != sh->qd_idx && | ||
| 3196 | !test_bit(R5_LOCKED, &dev->flags) && | 3564 | !test_bit(R5_LOCKED, &dev->flags) && |
| 3197 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3565 | !(test_bit(R5_UPTODATE, &dev->flags) || |
| 3198 | test_bit(R5_Wantcompute, &dev->flags))) { | 3566 | test_bit(R5_Wantcompute, &dev->flags))) { |
| @@ -3205,7 +3573,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
| 3205 | pr_debug("for sector %llu, rmw=%d rcw=%d\n", | 3573 | pr_debug("for sector %llu, rmw=%d rcw=%d\n", |
| 3206 | (unsigned long long)sh->sector, rmw, rcw); | 3574 | (unsigned long long)sh->sector, rmw, rcw); |
| 3207 | set_bit(STRIPE_HANDLE, &sh->state); | 3575 | set_bit(STRIPE_HANDLE, &sh->state); |
| 3208 | if (rmw < rcw && rmw > 0) { | 3576 | if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) { |
| 3209 | /* prefer read-modify-write, but need to get some data */ | 3577 | /* prefer read-modify-write, but need to get some data */ |
| 3210 | if (conf->mddev->queue) | 3578 | if (conf->mddev->queue) |
| 3211 | blk_add_trace_msg(conf->mddev->queue, | 3579 | blk_add_trace_msg(conf->mddev->queue, |
| @@ -3213,7 +3581,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
| 3213 | (unsigned long long)sh->sector, rmw); | 3581 | (unsigned long long)sh->sector, rmw); |
| 3214 | for (i = disks; i--; ) { | 3582 | for (i = disks; i--; ) { |
| 3215 | struct r5dev *dev = &sh->dev[i]; | 3583 | struct r5dev *dev = &sh->dev[i]; |
| 3216 | if ((dev->towrite || i == sh->pd_idx) && | 3584 | if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && |
| 3217 | !test_bit(R5_LOCKED, &dev->flags) && | 3585 | !test_bit(R5_LOCKED, &dev->flags) && |
| 3218 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3586 | !(test_bit(R5_UPTODATE, &dev->flags) || |
| 3219 | test_bit(R5_Wantcompute, &dev->flags)) && | 3587 | test_bit(R5_Wantcompute, &dev->flags)) && |
| @@ -3232,7 +3600,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
| 3232 | } | 3600 | } |
| 3233 | } | 3601 | } |
| 3234 | } | 3602 | } |
| 3235 | if (rcw <= rmw && rcw > 0) { | 3603 | if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) { |
| 3236 | /* want reconstruct write, but need to get some data */ | 3604 | /* want reconstruct write, but need to get some data */ |
| 3237 | int qread = 0; | 3605 | int qread = 0; |
| 3238 | rcw = 0; | 3606 | rcw = 0; |
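The rmw/rcw counters above estimate how many blocks each strategy would have to read before the write can proceed: read-modify-write needs the old contents of every block being written plus the parity block(s), while reconstruct-write needs every data block that is not fully overwritten. A condensed, self-contained sketch of that costing; it deliberately ignores the R5_LOCKED/R5_Wantcompute/R5_Insync refinements and the 2*disks penalty for unreadable blocks in the real code:

#include <stdbool.h>

struct dev_state {
        bool towrite;   /* block has pending write data */
        bool overwrite; /* pending write covers the whole block */
        bool uptodate;  /* block already valid in the stripe cache */
};

/* Return true if read-modify-write is the cheaper strategy. */
static bool prefer_rmw(const struct dev_state *dev, int disks,
                       int pd_idx, int qd_idx, bool tie_goes_to_rmw)
{
        int rmw = 0, rcw = 0, i;

        for (i = 0; i < disks; i++) {
                /* rmw must read old data of written blocks, plus parity */
                if ((dev[i].towrite || i == pd_idx || i == qd_idx) &&
                    !dev[i].uptodate)
                        rmw++;
                /* rcw must read every data block not fully overwritten */
                if (!dev[i].overwrite && i != pd_idx && i != qd_idx &&
                    !dev[i].uptodate)
                        rcw++;
        }

        return rmw > 0 && (rmw < rcw || (rmw == rcw && tie_goes_to_rmw));
}

With rmw_level == PARITY_ENABLE_RMW a tie goes to rmw, exactly as in the two branch conditions above.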
| @@ -3290,6 +3658,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, | |||
| 3290 | { | 3658 | { |
| 3291 | struct r5dev *dev = NULL; | 3659 | struct r5dev *dev = NULL; |
| 3292 | 3660 | ||
| 3661 | BUG_ON(sh->batch_head); | ||
| 3293 | set_bit(STRIPE_HANDLE, &sh->state); | 3662 | set_bit(STRIPE_HANDLE, &sh->state); |
| 3294 | 3663 | ||
| 3295 | switch (sh->check_state) { | 3664 | switch (sh->check_state) { |
| @@ -3380,6 +3749,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, | |||
| 3380 | int qd_idx = sh->qd_idx; | 3749 | int qd_idx = sh->qd_idx; |
| 3381 | struct r5dev *dev; | 3750 | struct r5dev *dev; |
| 3382 | 3751 | ||
| 3752 | BUG_ON(sh->batch_head); | ||
| 3383 | set_bit(STRIPE_HANDLE, &sh->state); | 3753 | set_bit(STRIPE_HANDLE, &sh->state); |
| 3384 | 3754 | ||
| 3385 | BUG_ON(s->failed > 2); | 3755 | BUG_ON(s->failed > 2); |
| @@ -3543,6 +3913,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) | |||
| 3543 | * copy some of them into a target stripe for expand. | 3913 | * copy some of them into a target stripe for expand. |
| 3544 | */ | 3914 | */ |
| 3545 | struct dma_async_tx_descriptor *tx = NULL; | 3915 | struct dma_async_tx_descriptor *tx = NULL; |
| 3916 | BUG_ON(sh->batch_head); | ||
| 3546 | clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); | 3917 | clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); |
| 3547 | for (i = 0; i < sh->disks; i++) | 3918 | for (i = 0; i < sh->disks; i++) |
| 3548 | if (i != sh->pd_idx && i != sh->qd_idx) { | 3919 | if (i != sh->pd_idx && i != sh->qd_idx) { |
| @@ -3615,8 +3986,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 3615 | 3986 | ||
| 3616 | memset(s, 0, sizeof(*s)); | 3987 | memset(s, 0, sizeof(*s)); |
| 3617 | 3988 | ||
| 3618 | s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | 3989 | s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; |
| 3619 | s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | 3990 | s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; |
| 3620 | s->failed_num[0] = -1; | 3991 | s->failed_num[0] = -1; |
| 3621 | s->failed_num[1] = -1; | 3992 | s->failed_num[1] = -1; |
| 3622 | 3993 | ||
| @@ -3786,6 +4157,80 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 3786 | rcu_read_unlock(); | 4157 | rcu_read_unlock(); |
| 3787 | } | 4158 | } |
| 3788 | 4159 | ||
| 4160 | static int clear_batch_ready(struct stripe_head *sh) | ||
| 4161 | { | ||
| 4162 | struct stripe_head *tmp; | ||
| 4163 | if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) | ||
| 4164 | return 0; | ||
| 4165 | spin_lock(&sh->stripe_lock); | ||
| 4166 | if (!sh->batch_head) { | ||
| 4167 | spin_unlock(&sh->stripe_lock); | ||
| 4168 | return 0; | ||
| 4169 | } | ||
| 4170 | |||
| 4171 | /* | ||
| 4172 | * This stripe could have been added to a batch list before we | ||
| 4173 | * checked BATCH_READY; if so, skip it. | ||
| 4174 | */ | ||
| 4175 | if (sh->batch_head != sh) { | ||
| 4176 | spin_unlock(&sh->stripe_lock); | ||
| 4177 | return 1; | ||
| 4178 | } | ||
| 4179 | spin_lock(&sh->batch_lock); | ||
| 4180 | list_for_each_entry(tmp, &sh->batch_list, batch_list) | ||
| 4181 | clear_bit(STRIPE_BATCH_READY, &tmp->state); | ||
| 4182 | spin_unlock(&sh->batch_lock); | ||
| 4183 | spin_unlock(&sh->stripe_lock); | ||
| 4184 | |||
| 4185 | /* | ||
| 4186 | * BATCH_READY is cleared, so no new stripes can be added; | ||
| 4187 | * batch_list can now be accessed without the lock. | ||
| 4188 | */ | ||
| 4189 | return 0; | ||
| 4190 | } | ||
| 4191 | |||
| 4192 | static void check_break_stripe_batch_list(struct stripe_head *sh) | ||
| 4193 | { | ||
| 4194 | struct stripe_head *head_sh, *next; | ||
| 4195 | int i; | ||
| 4196 | |||
| 4197 | if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) | ||
| 4198 | return; | ||
| 4199 | |||
| 4200 | head_sh = sh; | ||
| 4201 | do { | ||
| 4202 | sh = list_first_entry(&sh->batch_list, | ||
| 4203 | struct stripe_head, batch_list); | ||
| 4204 | BUG_ON(sh == head_sh); | ||
| 4205 | } while (!test_bit(STRIPE_DEGRADED, &sh->state)); | ||
| 4206 | |||
| 4207 | while (sh != head_sh) { | ||
| 4208 | next = list_first_entry(&sh->batch_list, | ||
| 4209 | struct stripe_head, batch_list); | ||
| 4210 | list_del_init(&sh->batch_list); | ||
| 4211 | |||
| 4212 | set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, | ||
| 4213 | head_sh->state & ~((1 << STRIPE_ACTIVE) | | ||
| 4214 | (1 << STRIPE_PREREAD_ACTIVE) | | ||
| 4215 | (1 << STRIPE_DEGRADED) | | ||
| 4216 | STRIPE_EXPAND_SYNC_FLAG)); | ||
| 4217 | sh->check_state = head_sh->check_state; | ||
| 4218 | sh->reconstruct_state = head_sh->reconstruct_state; | ||
| 4219 | for (i = 0; i < sh->disks; i++) | ||
| 4220 | sh->dev[i].flags = head_sh->dev[i].flags & | ||
| 4221 | (~((1 << R5_WriteError) | (1 << R5_Overlap))); | ||
| 4222 | |||
| 4223 | spin_lock_irq(&sh->stripe_lock); | ||
| 4224 | sh->batch_head = NULL; | ||
| 4225 | spin_unlock_irq(&sh->stripe_lock); | ||
| 4226 | |||
| 4227 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 4228 | release_stripe(sh); | ||
| 4229 | |||
| 4230 | sh = next; | ||
| 4231 | } | ||
| 4232 | } | ||
| 4233 | |||
| 3789 | static void handle_stripe(struct stripe_head *sh) | 4234 | static void handle_stripe(struct stripe_head *sh) |
| 3790 | { | 4235 | { |
| 3791 | struct stripe_head_state s; | 4236 | struct stripe_head_state s; |
| @@ -3803,7 +4248,14 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 3803 | return; | 4248 | return; |
| 3804 | } | 4249 | } |
| 3805 | 4250 | ||
| 3806 | if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { | 4251 | if (clear_batch_ready(sh)) { |
| 4252 | clear_bit_unlock(STRIPE_ACTIVE, &sh->state); | ||
| 4253 | return; | ||
| 4254 | } | ||
| 4255 | |||
| 4256 | check_break_stripe_batch_list(sh); | ||
| 4257 | |||
| 4258 | if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { | ||
| 3807 | spin_lock(&sh->stripe_lock); | 4259 | spin_lock(&sh->stripe_lock); |
| 3808 | /* Cannot process 'sync' concurrently with 'discard' */ | 4260 | /* Cannot process 'sync' concurrently with 'discard' */ |
| 3809 | if (!test_bit(STRIPE_DISCARD, &sh->state) && | 4261 | if (!test_bit(STRIPE_DISCARD, &sh->state) && |
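clear_batch_ready() above is a check-lock-recheck pattern: the lockless test_and_clear of STRIPE_BATCH_READY is cheap, but a stripe may join a batch between that test and taking stripe_lock, so batch membership is re-verified under the lock and members are skipped. A generic user-space sketch of the same pattern (the pthread/C11 names are illustrative, not the kernel locking API, and the return-value conventions differ from clear_batch_ready()):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct item {
        atomic_bool ready;              /* like STRIPE_BATCH_READY */
        struct item *batch_head;        /* non-NULL once batched */
        pthread_mutex_t lock;           /* like sh->stripe_lock */
};

/* Return true if the caller must skip this item because another
 * thread batched it after 'ready' was set but before we locked. */
static bool consumed_by_batch(struct item *it)
{
        bool skip;

        if (!atomic_exchange(&it->ready, false))
                return false;           /* flag already consumed elsewhere */

        pthread_mutex_lock(&it->lock);
        skip = (it->batch_head && it->batch_head != it);
        pthread_mutex_unlock(&it->lock);

        return skip;
}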
| @@ -4158,7 +4610,7 @@ static int raid5_congested(struct mddev *mddev, int bits) | |||
| 4158 | * how busy the stripe_cache is | 4610 | * how busy the stripe_cache is |
| 4159 | */ | 4611 | */ |
| 4160 | 4612 | ||
| 4161 | if (conf->inactive_blocked) | 4613 | if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) |
| 4162 | return 1; | 4614 | return 1; |
| 4163 | if (conf->quiesce) | 4615 | if (conf->quiesce) |
| 4164 | return 1; | 4616 | return 1; |
| @@ -4180,8 +4632,12 @@ static int raid5_mergeable_bvec(struct mddev *mddev, | |||
| 4180 | unsigned int chunk_sectors = mddev->chunk_sectors; | 4632 | unsigned int chunk_sectors = mddev->chunk_sectors; |
| 4181 | unsigned int bio_sectors = bvm->bi_size >> 9; | 4633 | unsigned int bio_sectors = bvm->bi_size >> 9; |
| 4182 | 4634 | ||
| 4183 | if ((bvm->bi_rw & 1) == WRITE) | 4635 | /* |
| 4184 | return biovec->bv_len; /* always allow writes to be mergeable */ | 4636 | * always allow writes to be mergeable, reads as well if the array |
| 4637 | * is degraded, since we'll go through the stripe cache anyway. | ||
| 4638 | */ | ||
| 4639 | if ((bvm->bi_rw & 1) == WRITE || mddev->degraded) | ||
| 4640 | return biovec->bv_len; | ||
| 4185 | 4641 | ||
| 4186 | if (mddev->new_chunk_sectors < mddev->chunk_sectors) | 4642 | if (mddev->new_chunk_sectors < mddev->chunk_sectors) |
| 4187 | chunk_sectors = mddev->new_chunk_sectors; | 4643 | chunk_sectors = mddev->new_chunk_sectors; |
| @@ -4603,12 +5059,14 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) | |||
| 4603 | } | 5059 | } |
| 4604 | set_bit(STRIPE_DISCARD, &sh->state); | 5060 | set_bit(STRIPE_DISCARD, &sh->state); |
| 4605 | finish_wait(&conf->wait_for_overlap, &w); | 5061 | finish_wait(&conf->wait_for_overlap, &w); |
| 5062 | sh->overwrite_disks = 0; | ||
| 4606 | for (d = 0; d < conf->raid_disks; d++) { | 5063 | for (d = 0; d < conf->raid_disks; d++) { |
| 4607 | if (d == sh->pd_idx || d == sh->qd_idx) | 5064 | if (d == sh->pd_idx || d == sh->qd_idx) |
| 4608 | continue; | 5065 | continue; |
| 4609 | sh->dev[d].towrite = bi; | 5066 | sh->dev[d].towrite = bi; |
| 4610 | set_bit(R5_OVERWRITE, &sh->dev[d].flags); | 5067 | set_bit(R5_OVERWRITE, &sh->dev[d].flags); |
| 4611 | raid5_inc_bi_active_stripes(bi); | 5068 | raid5_inc_bi_active_stripes(bi); |
| 5069 | sh->overwrite_disks++; | ||
| 4612 | } | 5070 | } |
| 4613 | spin_unlock_irq(&sh->stripe_lock); | 5071 | spin_unlock_irq(&sh->stripe_lock); |
| 4614 | if (conf->mddev->bitmap) { | 5072 | if (conf->mddev->bitmap) { |
| @@ -4656,7 +5114,12 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
| 4656 | 5114 | ||
| 4657 | md_write_start(mddev, bi); | 5115 | md_write_start(mddev, bi); |
| 4658 | 5116 | ||
| 4659 | if (rw == READ && | 5117 | /* |
| 5118 | * If the array is degraded, better not do a chunk-aligned read, | ||
| 5119 | * because we might later have to read it again in order to | ||
| 5120 | * reconstruct data on the failed drives. | ||
| 5121 | */ | ||
| 5122 | if (rw == READ && mddev->degraded == 0 && | ||
| 4660 | mddev->reshape_position == MaxSector && | 5123 | mddev->reshape_position == MaxSector && |
| 4661 | chunk_aligned_read(mddev,bi)) | 5124 | chunk_aligned_read(mddev,bi)) |
| 4662 | return; | 5125 | return; |
| @@ -4772,7 +5235,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
| 4772 | } | 5235 | } |
| 4773 | 5236 | ||
| 4774 | if (test_bit(STRIPE_EXPANDING, &sh->state) || | 5237 | if (test_bit(STRIPE_EXPANDING, &sh->state) || |
| 4775 | !add_stripe_bio(sh, bi, dd_idx, rw)) { | 5238 | !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { |
| 4776 | /* Stripe is busy expanding or | 5239 | /* Stripe is busy expanding or |
| 4777 | * add failed due to overlap. Flush everything | 5240 | * add failed due to overlap. Flush everything |
| 4778 | * and wait a while | 5241 | * and wait a while |
| @@ -4785,7 +5248,8 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
| 4785 | } | 5248 | } |
| 4786 | set_bit(STRIPE_HANDLE, &sh->state); | 5249 | set_bit(STRIPE_HANDLE, &sh->state); |
| 4787 | clear_bit(STRIPE_DELAYED, &sh->state); | 5250 | clear_bit(STRIPE_DELAYED, &sh->state); |
| 4788 | if ((bi->bi_rw & REQ_SYNC) && | 5251 | if ((!sh->batch_head || sh == sh->batch_head) && |
| 5252 | (bi->bi_rw & REQ_SYNC) && | ||
| 4789 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 5253 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
| 4790 | atomic_inc(&conf->preread_active_stripes); | 5254 | atomic_inc(&conf->preread_active_stripes); |
| 4791 | release_stripe_plug(mddev, sh); | 5255 | release_stripe_plug(mddev, sh); |
| @@ -5050,8 +5514,7 @@ ret: | |||
| 5050 | return reshape_sectors; | 5514 | return reshape_sectors; |
| 5051 | } | 5515 | } |
| 5052 | 5516 | ||
| 5053 | /* FIXME go_faster isn't used */ | 5517 | static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped) |
| 5054 | static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) | ||
| 5055 | { | 5518 | { |
| 5056 | struct r5conf *conf = mddev->private; | 5519 | struct r5conf *conf = mddev->private; |
| 5057 | struct stripe_head *sh; | 5520 | struct stripe_head *sh; |
| @@ -5186,7 +5649,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
| 5186 | return handled; | 5649 | return handled; |
| 5187 | } | 5650 | } |
| 5188 | 5651 | ||
| 5189 | if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { | 5652 | if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { |
| 5190 | release_stripe(sh); | 5653 | release_stripe(sh); |
| 5191 | raid5_set_bi_processed_stripes(raid_bio, scnt); | 5654 | raid5_set_bi_processed_stripes(raid_bio, scnt); |
| 5192 | conf->retry_read_aligned = raid_bio; | 5655 | conf->retry_read_aligned = raid_bio; |
| @@ -5312,6 +5775,8 @@ static void raid5d(struct md_thread *thread) | |||
| 5312 | int batch_size, released; | 5775 | int batch_size, released; |
| 5313 | 5776 | ||
| 5314 | released = release_stripe_list(conf, conf->temp_inactive_list); | 5777 | released = release_stripe_list(conf, conf->temp_inactive_list); |
| 5778 | if (released) | ||
| 5779 | clear_bit(R5_DID_ALLOC, &conf->cache_state); | ||
| 5315 | 5780 | ||
| 5316 | if ( | 5781 | if ( |
| 5317 | !list_empty(&conf->bitmap_list)) { | 5782 | !list_empty(&conf->bitmap_list)) { |
| @@ -5350,6 +5815,13 @@ static void raid5d(struct md_thread *thread) | |||
| 5350 | pr_debug("%d stripes handled\n", handled); | 5815 | pr_debug("%d stripes handled\n", handled); |
| 5351 | 5816 | ||
| 5352 | spin_unlock_irq(&conf->device_lock); | 5817 | spin_unlock_irq(&conf->device_lock); |
| 5818 | if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) { | ||
| 5819 | grow_one_stripe(conf, __GFP_NOWARN); | ||
| 5820 | /* Set the flag even if allocation failed. This helps | ||
| 5821 | * slow down allocation requests when memory is short. | ||
| 5822 | */ | ||
| 5823 | set_bit(R5_DID_ALLOC, &conf->cache_state); | ||
| 5824 | } | ||
| 5353 | 5825 | ||
| 5354 | async_tx_issue_pending_all(); | 5826 | async_tx_issue_pending_all(); |
| 5355 | blk_finish_plug(&plug); | 5827 | blk_finish_plug(&plug); |
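Note the deliberately slow grow path here: raid5d allocates at most one extra stripe head per wakeup, using __GFP_NOWARN so failures under memory pressure stay quiet, and R5_DID_ALLOC suppresses further R5_ALLOC_MORE requests until stripes are actually released back to the inactive lists (the clearing at the top of raid5d above).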
| @@ -5365,7 +5837,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page) | |||
| 5365 | spin_lock(&mddev->lock); | 5837 | spin_lock(&mddev->lock); |
| 5366 | conf = mddev->private; | 5838 | conf = mddev->private; |
| 5367 | if (conf) | 5839 | if (conf) |
| 5368 | ret = sprintf(page, "%d\n", conf->max_nr_stripes); | 5840 | ret = sprintf(page, "%d\n", conf->min_nr_stripes); |
| 5369 | spin_unlock(&mddev->lock); | 5841 | spin_unlock(&mddev->lock); |
| 5370 | return ret; | 5842 | return ret; |
| 5371 | } | 5843 | } |
| @@ -5375,30 +5847,24 @@ raid5_set_cache_size(struct mddev *mddev, int size) | |||
| 5375 | { | 5847 | { |
| 5376 | struct r5conf *conf = mddev->private; | 5848 | struct r5conf *conf = mddev->private; |
| 5377 | int err; | 5849 | int err; |
| 5378 | int hash; | ||
| 5379 | 5850 | ||
| 5380 | if (size <= 16 || size > 32768) | 5851 | if (size <= 16 || size > 32768) |
| 5381 | return -EINVAL; | 5852 | return -EINVAL; |
| 5382 | hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; | 5853 | |
| 5383 | while (size < conf->max_nr_stripes) { | 5854 | conf->min_nr_stripes = size; |
| 5384 | if (drop_one_stripe(conf, hash)) | 5855 | while (size < conf->max_nr_stripes && |
| 5385 | conf->max_nr_stripes--; | 5856 | drop_one_stripe(conf)) |
| 5386 | else | 5857 | ; |
| 5387 | break; | 5858 | |
| 5388 | hash--; | 5859 | |
| 5389 | if (hash < 0) | ||
| 5390 | hash = NR_STRIPE_HASH_LOCKS - 1; | ||
| 5391 | } | ||
| 5392 | err = md_allow_write(mddev); | 5860 | err = md_allow_write(mddev); |
| 5393 | if (err) | 5861 | if (err) |
| 5394 | return err; | 5862 | return err; |
| 5395 | hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; | 5863 | |
| 5396 | while (size > conf->max_nr_stripes) { | 5864 | while (size > conf->max_nr_stripes) |
| 5397 | if (grow_one_stripe(conf, hash)) | 5865 | if (!grow_one_stripe(conf, GFP_KERNEL)) |
| 5398 | conf->max_nr_stripes++; | 5866 | break; |
| 5399 | else break; | 5867 | |
| 5400 | hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; | ||
| 5401 | } | ||
| 5402 | return 0; | 5868 | return 0; |
| 5403 | } | 5869 | } |
| 5404 | EXPORT_SYMBOL(raid5_set_cache_size); | 5870 | EXPORT_SYMBOL(raid5_set_cache_size); |
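The net effect: stripe_cache_size becomes a floor rather than an exact pool size. min_nr_stripes records what the administrator asked for, max_nr_stripes tracks how many stripe heads are currently allocated (the pool may grow above the floor on demand), and the shrinker added below is only allowed to trim the surplus back down to min_nr_stripes.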
| @@ -5433,6 +5899,49 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, | |||
| 5433 | raid5_store_stripe_cache_size); | 5899 | raid5_store_stripe_cache_size); |
| 5434 | 5900 | ||
| 5435 | static ssize_t | 5901 | static ssize_t |
| 5902 | raid5_show_rmw_level(struct mddev *mddev, char *page) | ||
| 5903 | { | ||
| 5904 | struct r5conf *conf = mddev->private; | ||
| 5905 | if (conf) | ||
| 5906 | return sprintf(page, "%d\n", conf->rmw_level); | ||
| 5907 | else | ||
| 5908 | return 0; | ||
| 5909 | } | ||
| 5910 | |||
| 5911 | static ssize_t | ||
| 5912 | raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) | ||
| 5913 | { | ||
| 5914 | struct r5conf *conf = mddev->private; | ||
| 5915 | unsigned long new; | ||
| 5916 | |||
| 5917 | if (!conf) | ||
| 5918 | return -ENODEV; | ||
| 5919 | |||
| 5920 | if (len >= PAGE_SIZE) | ||
| 5921 | return -EINVAL; | ||
| 5922 | |||
| 5923 | if (kstrtoul(page, 10, &new)) | ||
| 5924 | return -EINVAL; | ||
| 5925 | |||
| 5926 | if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) | ||
| 5927 | return -EINVAL; | ||
| 5928 | |||
| 5929 | if (new != PARITY_DISABLE_RMW && | ||
| 5930 | new != PARITY_ENABLE_RMW && | ||
| 5931 | new != PARITY_PREFER_RMW) | ||
| 5932 | return -EINVAL; | ||
| 5933 | |||
| 5934 | conf->rmw_level = new; | ||
| 5935 | return len; | ||
| 5936 | } | ||
| 5937 | |||
| 5938 | static struct md_sysfs_entry | ||
| 5939 | raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, | ||
| 5940 | raid5_show_rmw_level, | ||
| 5941 | raid5_store_rmw_level); | ||
| 5942 | |||
| 5943 | |||
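Usage note, assuming the enum in raid5.h orders PARITY_DISABLE_RMW = 0, PARITY_ENABLE_RMW = 1, PARITY_PREFER_RMW = 2: writing 1 to /sys/block/mdX/md/rmw_level enables read-modify-write for RAID-6 when the active raid6 algorithm implements xor_syndrome(), and writing 0 forces reconstruct-write as before. Any other value, or a non-zero value without xor_syndrome support, is rejected with -EINVAL.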
| 5944 | static ssize_t | ||
| 5436 | raid5_show_preread_threshold(struct mddev *mddev, char *page) | 5945 | raid5_show_preread_threshold(struct mddev *mddev, char *page) |
| 5437 | { | 5946 | { |
| 5438 | struct r5conf *conf; | 5947 | struct r5conf *conf; |
| @@ -5463,7 +5972,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) | |||
| 5463 | conf = mddev->private; | 5972 | conf = mddev->private; |
| 5464 | if (!conf) | 5973 | if (!conf) |
| 5465 | err = -ENODEV; | 5974 | err = -ENODEV; |
| 5466 | else if (new > conf->max_nr_stripes) | 5975 | else if (new > conf->min_nr_stripes) |
| 5467 | err = -EINVAL; | 5976 | err = -EINVAL; |
| 5468 | else | 5977 | else |
| 5469 | conf->bypass_threshold = new; | 5978 | conf->bypass_threshold = new; |
| @@ -5618,6 +6127,7 @@ static struct attribute *raid5_attrs[] = { | |||
| 5618 | &raid5_preread_bypass_threshold.attr, | 6127 | &raid5_preread_bypass_threshold.attr, |
| 5619 | &raid5_group_thread_cnt.attr, | 6128 | &raid5_group_thread_cnt.attr, |
| 5620 | &raid5_skip_copy.attr, | 6129 | &raid5_skip_copy.attr, |
| 6130 | &raid5_rmw_level.attr, | ||
| 5621 | NULL, | 6131 | NULL, |
| 5622 | }; | 6132 | }; |
| 5623 | static struct attribute_group raid5_attrs_group = { | 6133 | static struct attribute_group raid5_attrs_group = { |
| @@ -5699,7 +6209,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) | |||
| 5699 | static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) | 6209 | static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) |
| 5700 | { | 6210 | { |
| 5701 | safe_put_page(percpu->spare_page); | 6211 | safe_put_page(percpu->spare_page); |
| 5702 | kfree(percpu->scribble); | 6212 | if (percpu->scribble) |
| 6213 | flex_array_free(percpu->scribble); | ||
| 5703 | percpu->spare_page = NULL; | 6214 | percpu->spare_page = NULL; |
| 5704 | percpu->scribble = NULL; | 6215 | percpu->scribble = NULL; |
| 5705 | } | 6216 | } |
| @@ -5709,7 +6220,9 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu | |||
| 5709 | if (conf->level == 6 && !percpu->spare_page) | 6220 | if (conf->level == 6 && !percpu->spare_page) |
| 5710 | percpu->spare_page = alloc_page(GFP_KERNEL); | 6221 | percpu->spare_page = alloc_page(GFP_KERNEL); |
| 5711 | if (!percpu->scribble) | 6222 | if (!percpu->scribble) |
| 5712 | percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); | 6223 | percpu->scribble = scribble_alloc(max(conf->raid_disks, |
| 6224 | conf->previous_raid_disks), conf->chunk_sectors / | ||
| 6225 | STRIPE_SECTORS, GFP_KERNEL); | ||
| 5713 | 6226 | ||
| 5714 | if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { | 6227 | if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { |
| 5715 | free_scratch_buffer(conf, percpu); | 6228 | free_scratch_buffer(conf, percpu); |
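The per-CPU scribble buffer, formerly one flat kmalloc() of scribble_len bytes, is now a flex_array sized from the device count (the max of the old and new raid_disks) and the stripes-per-chunk ratio; flex_array stores its elements in individual pages, so this scratch space avoids high-order allocations even for large arrays. scribble_alloc() is a helper introduced elsewhere in this patch, not shown in these hunks.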
| @@ -5740,6 +6253,8 @@ static void raid5_free_percpu(struct r5conf *conf) | |||
| 5740 | 6253 | ||
| 5741 | static void free_conf(struct r5conf *conf) | 6254 | static void free_conf(struct r5conf *conf) |
| 5742 | { | 6255 | { |
| 6256 | if (conf->shrinker.seeks) | ||
| 6257 | unregister_shrinker(&conf->shrinker); | ||
| 5743 | free_thread_groups(conf); | 6258 | free_thread_groups(conf); |
| 5744 | shrink_stripes(conf); | 6259 | shrink_stripes(conf); |
| 5745 | raid5_free_percpu(conf); | 6260 | raid5_free_percpu(conf); |
| @@ -5807,6 +6322,30 @@ static int raid5_alloc_percpu(struct r5conf *conf) | |||
| 5807 | return err; | 6322 | return err; |
| 5808 | } | 6323 | } |
| 5809 | 6324 | ||
| 6325 | static unsigned long raid5_cache_scan(struct shrinker *shrink, | ||
| 6326 | struct shrink_control *sc) | ||
| 6327 | { | ||
| 6328 | struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); | ||
| 6329 | int ret = 0; | ||
| 6330 | while (ret < sc->nr_to_scan) { | ||
| 6331 | if (drop_one_stripe(conf) == 0) | ||
| 6332 | return SHRINK_STOP; | ||
| 6333 | ret++; | ||
| 6334 | } | ||
| 6335 | return ret; | ||
| 6336 | } | ||
| 6337 | |||
| 6338 | static unsigned long raid5_cache_count(struct shrinker *shrink, | ||
| 6339 | struct shrink_control *sc) | ||
| 6340 | { | ||
| 6341 | struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); | ||
| 6342 | |||
| 6343 | if (conf->max_nr_stripes < conf->min_nr_stripes) | ||
| 6344 | /* unlikely, but not impossible */ | ||
| 6345 | return 0; | ||
| 6346 | return conf->max_nr_stripes - conf->min_nr_stripes; | ||
| 6347 | } | ||
| 6348 | |||
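These two callbacks follow the standard shrinker contract: count_objects() reports how many objects could be freed right now, and scan_objects() frees up to sc->nr_to_scan of them, returning the number actually freed or SHRINK_STOP when nothing can be reclaimed. A minimal sketch of the same shape for a hypothetical object cache (obj_cache and obj_cache_drop_one() are made up for illustration; the raid5 version returns SHRINK_STOP as soon as a drop fails):

#include <linux/kernel.h>
#include <linux/shrinker.h>
#include <linux/types.h>

struct obj_cache {
        struct shrinker shrinker;
        unsigned long nr_cached;        /* current pool size */
        unsigned long nr_min;           /* floor: never shrink below this */
};

static bool obj_cache_drop_one(struct obj_cache *c);    /* hypothetical */

static unsigned long obj_cache_count(struct shrinker *shrink,
                                     struct shrink_control *sc)
{
        struct obj_cache *c = container_of(shrink, struct obj_cache, shrinker);

        /* report only the surplus above the floor as reclaimable */
        return c->nr_cached > c->nr_min ? c->nr_cached - c->nr_min : 0;
}

static unsigned long obj_cache_scan(struct shrinker *shrink,
                                    struct shrink_control *sc)
{
        struct obj_cache *c = container_of(shrink, struct obj_cache, shrinker);
        unsigned long freed = 0;

        while (freed < sc->nr_to_scan) {
                if (!obj_cache_drop_one(c))     /* nothing left to drop */
                        return freed ? freed : SHRINK_STOP;
                freed++;
        }
        return freed;
}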
| 5810 | static struct r5conf *setup_conf(struct mddev *mddev) | 6349 | static struct r5conf *setup_conf(struct mddev *mddev) |
| 5811 | { | 6350 | { |
| 5812 | struct r5conf *conf; | 6351 | struct r5conf *conf; |
| @@ -5879,7 +6418,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 5879 | else | 6418 | else |
| 5880 | conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; | 6419 | conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; |
| 5881 | max_disks = max(conf->raid_disks, conf->previous_raid_disks); | 6420 | max_disks = max(conf->raid_disks, conf->previous_raid_disks); |
| 5882 | conf->scribble_len = scribble_len(max_disks); | ||
| 5883 | 6421 | ||
| 5884 | conf->disks = kzalloc(max_disks * sizeof(struct disk_info), | 6422 | conf->disks = kzalloc(max_disks * sizeof(struct disk_info), |
| 5885 | GFP_KERNEL); | 6423 | GFP_KERNEL); |
| @@ -5907,6 +6445,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 5907 | INIT_LIST_HEAD(conf->temp_inactive_list + i); | 6445 | INIT_LIST_HEAD(conf->temp_inactive_list + i); |
| 5908 | 6446 | ||
| 5909 | conf->level = mddev->new_level; | 6447 | conf->level = mddev->new_level; |
| 6448 | conf->chunk_sectors = mddev->new_chunk_sectors; | ||
| 5910 | if (raid5_alloc_percpu(conf) != 0) | 6449 | if (raid5_alloc_percpu(conf) != 0) |
| 5911 | goto abort; | 6450 | goto abort; |
| 5912 | 6451 | ||
| @@ -5939,12 +6478,17 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 5939 | conf->fullsync = 1; | 6478 | conf->fullsync = 1; |
| 5940 | } | 6479 | } |
| 5941 | 6480 | ||
| 5942 | conf->chunk_sectors = mddev->new_chunk_sectors; | ||
| 5943 | conf->level = mddev->new_level; | 6481 | conf->level = mddev->new_level; |
| 5944 | if (conf->level == 6) | 6482 | if (conf->level == 6) { |
| 5945 | conf->max_degraded = 2; | 6483 | conf->max_degraded = 2; |
| 5946 | else | 6484 | if (raid6_call.xor_syndrome) |
| 6485 | conf->rmw_level = PARITY_ENABLE_RMW; | ||
| 6486 | else | ||
| 6487 | conf->rmw_level = PARITY_DISABLE_RMW; | ||
| 6488 | } else { | ||
| 5947 | conf->max_degraded = 1; | 6489 | conf->max_degraded = 1; |
| 6490 | conf->rmw_level = PARITY_ENABLE_RMW; | ||
| 6491 | } | ||
| 5948 | conf->algorithm = mddev->new_layout; | 6492 | conf->algorithm = mddev->new_layout; |
| 5949 | conf->reshape_progress = mddev->reshape_position; | 6493 | conf->reshape_progress = mddev->reshape_position; |
| 5950 | if (conf->reshape_progress != MaxSector) { | 6494 | if (conf->reshape_progress != MaxSector) { |
| @@ -5952,10 +6496,11 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 5952 | conf->prev_algo = mddev->layout; | 6496 | conf->prev_algo = mddev->layout; |
| 5953 | } | 6497 | } |
| 5954 | 6498 | ||
| 5955 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + | 6499 | conf->min_nr_stripes = NR_STRIPES; |
| 6500 | memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + | ||
| 5956 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; | 6501 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; |
| 5957 | atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); | 6502 | atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); |
| 5958 | if (grow_stripes(conf, NR_STRIPES)) { | 6503 | if (grow_stripes(conf, conf->min_nr_stripes)) { |
| 5959 | printk(KERN_ERR | 6504 | printk(KERN_ERR |
| 5960 | "md/raid:%s: couldn't allocate %dkB for buffers\n", | 6505 | "md/raid:%s: couldn't allocate %dkB for buffers\n", |
| 5961 | mdname(mddev), memory); | 6506 | mdname(mddev), memory); |
| @@ -5963,6 +6508,17 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 5963 | } else | 6508 | } else |
| 5964 | printk(KERN_INFO "md/raid:%s: allocated %dkB\n", | 6509 | printk(KERN_INFO "md/raid:%s: allocated %dkB\n", |
| 5965 | mdname(mddev), memory); | 6510 | mdname(mddev), memory); |
| 6511 | /* | ||
| 6512 | * Losing a stripe head costs more than the time to refill it: | ||
| 6513 | * it reduces the queue depth and so can hurt throughput. | ||
| 6514 | * So set it rather large, scaled by the number of devices. | ||
| 6515 | */ | ||
| 6516 | conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; | ||
| 6517 | conf->shrinker.scan_objects = raid5_cache_scan; | ||
| 6518 | conf->shrinker.count_objects = raid5_cache_count; | ||
| 6519 | conf->shrinker.batch = 128; | ||
| 6520 | conf->shrinker.flags = 0; | ||
| 6521 | register_shrinker(&conf->shrinker); | ||
| 5966 | 6522 | ||
| 5967 | sprintf(pers_name, "raid%d", mddev->new_level); | 6523 | sprintf(pers_name, "raid%d", mddev->new_level); |
| 5968 | conf->thread = md_register_thread(raid5d, mddev, pers_name); | 6524 | conf->thread = md_register_thread(raid5d, mddev, pers_name); |
| @@ -6604,9 +7160,9 @@ static int check_stripe_cache(struct mddev *mddev) | |||
| 6604 | */ | 7160 | */ |
| 6605 | struct r5conf *conf = mddev->private; | 7161 | struct r5conf *conf = mddev->private; |
| 6606 | if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 | 7162 | if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 |
| 6607 | > conf->max_nr_stripes || | 7163 | > conf->min_nr_stripes || |
| 6608 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 | 7164 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 |
| 6609 | > conf->max_nr_stripes) { | 7165 | > conf->min_nr_stripes) { |
| 6610 | printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", | 7166 | printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", |
| 6611 | mdname(mddev), | 7167 | mdname(mddev), |
| 6612 | ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) | 7168 | ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) |
