author    Dan Williams <dan.j.williams@intel.com>  2009-07-14 14:50:52 -0400
committer Dan Williams <dan.j.williams@intel.com>  2009-08-29 22:09:26 -0400
commit    d6f38f31f3ad4b0dd33fe970988f14e7c65ef702 (patch)
tree      0b881c68e676376f2f0eccb2eb377dc3561d395f
parent    36d1c6476be51101778882897b315bd928c8c7b5 (diff)
md/raid5,6: add percpu scribble region for buffer lists
Use percpu memory rather than stack for storing the buffer lists used in
parity calculations.  Include space for dma address conversions and pass
that to async_tx via the async_submit_ctl.scribble pointer.

[ Impact: move memory pressure from stack to heap ]

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
-rw-r--r--  drivers/md/raid5.c  132
-rw-r--r--  drivers/md/raid5.h    8
2 files changed, 110 insertions(+), 30 deletions(-)
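Before the diff itself, a brief orientation may help: the scribble region introduced here is a single per-cpu allocation laid out as (disks + 2) struct page pointers for the xor source list, immediately followed by (disks + 2) address-conversion slots that async_tx uses for dma_map_page()/page_address() translation. The sizes and the offset below mirror scribble_len() and to_addr_conv() in the patch; everything else in this stand-alone, user-space sketch (the addr_conv_t stand-in, main(), the example disk count) is illustrative only and not part of the kernel change.

#include <stdio.h>
#include <stdlib.h>

struct page;                                 /* opaque, as in the kernel */
typedef struct { void *addr; } addr_conv_t;  /* stand-in for the real type */

/* mirrors scribble_len(): page-pointer list plus address-conversion area */
static size_t scribble_len(int num)
{
        return sizeof(struct page *) * (num + 2) +
               sizeof(addr_conv_t) * (num + 2);
}

/* mirrors to_addr_conv(): the conversion area starts right after the pointers */
static addr_conv_t *to_addr_conv(void *scribble, int disks)
{
        return (addr_conv_t *)((char *)scribble +
                               sizeof(struct page *) * (disks + 2));
}

int main(void)
{
        int disks = 6;                        /* e.g. a 6-device raid5 set */
        void *scribble = malloc(scribble_len(disks));
        struct page **xor_srcs;
        addr_conv_t *conv;

        if (!scribble)
                return 1;

        xor_srcs = scribble;                  /* buffer list lives up front */
        conv = to_addr_conv(scribble, disks); /* conversion area follows it */

        printf("scribble: %zu bytes, srcs at %p, conv at %p\n",
               scribble_len(disks), (void *)xor_srcs, (void *)conv);
        free(scribble);
        return 0;
}

In the patch this buffer is private to each cpu: raid5_run_ops() pins the cpu with get_cpu()/put_cpu() around the stripe operations, so no per-stripe allocation or locking is needed for the buffer lists.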
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5359236a1ec7..7727954cf726 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -642,11 +642,18 @@ static void ops_complete_compute5(void *stripe_head_ref)
 	release_stripe(sh);
 }
 
-static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
+/* return a pointer to the address conversion region of the scribble buffer */
+static addr_conv_t *to_addr_conv(struct stripe_head *sh,
+				 struct raid5_percpu *percpu)
+{
+	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
-	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
-	struct page *xor_srcs[disks];
+	struct page **xor_srcs = percpu->scribble;
 	int target = sh->ops.target;
 	struct r5dev *tgt = &sh->dev[target];
 	struct page *xor_dest = tgt->page;
@@ -666,7 +673,7 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
 	atomic_inc(&sh->count);
 
 	init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
-			  ops_complete_compute5, sh, NULL);
+			  ops_complete_compute5, sh, to_addr_conv(sh, percpu));
 	if (unlikely(count == 1))
 		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
 	else
@@ -684,11 +691,11 @@ static void ops_complete_prexor(void *stripe_head_ref)
 }
 
 static struct dma_async_tx_descriptor *
-ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
+	       struct dma_async_tx_descriptor *tx)
 {
-	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
-	struct page *xor_srcs[disks];
+	struct page **xor_srcs = percpu->scribble;
 	int count = 0, pd_idx = sh->pd_idx, i;
 	struct async_submit_ctl submit;
 
@@ -706,7 +713,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 	}
 
 	init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, tx,
-			  ops_complete_prexor, sh, NULL);
+			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
 	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 
 	return tx;
@@ -775,11 +782,11 @@ static void ops_complete_postxor(void *stripe_head_ref)
 }
 
 static void
-ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+ops_run_postxor(struct stripe_head *sh, struct raid5_percpu *percpu,
+		struct dma_async_tx_descriptor *tx)
 {
-	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
-	struct page *xor_srcs[disks];
+	struct page **xor_srcs = percpu->scribble;
 	struct async_submit_ctl submit;
 	int count = 0, pd_idx = sh->pd_idx, i;
 	struct page *xor_dest;
@@ -819,7 +826,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
 	atomic_inc(&sh->count);
 
-	init_async_submit(&submit, flags, tx, ops_complete_postxor, sh, NULL);
+	init_async_submit(&submit, flags, tx, ops_complete_postxor, sh,
+			  to_addr_conv(sh, percpu));
 	if (unlikely(count == 1))
 		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
 	else
@@ -838,11 +846,10 @@ static void ops_complete_check(void *stripe_head_ref)
 	release_stripe(sh);
 }
 
-static void ops_run_check(struct stripe_head *sh)
+static void ops_run_check(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
-	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
-	struct page *xor_srcs[disks];
+	struct page **xor_srcs = percpu->scribble;
 	struct dma_async_tx_descriptor *tx;
 	struct async_submit_ctl submit;
 
@@ -858,7 +865,8 @@ static void ops_run_check(struct stripe_head *sh)
 		xor_srcs[count++] = dev->page;
 	}
 
-	init_async_submit(&submit, 0, NULL, NULL, NULL, NULL);
+	init_async_submit(&submit, 0, NULL, NULL, NULL,
+			  to_addr_conv(sh, percpu));
 	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
 			   &sh->ops.zero_sum_result, &submit);
 
@@ -871,21 +879,26 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
 	struct dma_async_tx_descriptor *tx = NULL;
+	raid5_conf_t *conf = sh->raid_conf;
+	struct raid5_percpu *percpu;
+	unsigned long cpu;
 
+	cpu = get_cpu();
+	percpu = per_cpu_ptr(conf->percpu, cpu);
 	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
 		ops_run_biofill(sh);
 		overlap_clear++;
 	}
 
 	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
-		tx = ops_run_compute5(sh);
+		tx = ops_run_compute5(sh, percpu);
 		/* terminate the chain if postxor is not set to be run */
 		if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
 			async_tx_ack(tx);
 	}
 
 	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
-		tx = ops_run_prexor(sh, tx);
+		tx = ops_run_prexor(sh, percpu, tx);
 
 	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
 		tx = ops_run_biodrain(sh, tx);
@@ -893,10 +906,10 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
893 } 906 }
894 907
895 if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) 908 if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
896 ops_run_postxor(sh, tx); 909 ops_run_postxor(sh, percpu, tx);
897 910
898 if (test_bit(STRIPE_OP_CHECK, &ops_request)) 911 if (test_bit(STRIPE_OP_CHECK, &ops_request))
899 ops_run_check(sh); 912 ops_run_check(sh, percpu);
900 913
901 if (overlap_clear) 914 if (overlap_clear)
902 for (i = disks; i--; ) { 915 for (i = disks; i--; ) {
@@ -904,6 +917,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			if (test_and_clear_bit(R5_Overlap, &dev->flags))
 				wake_up(&sh->raid_conf->wait_for_overlap);
 		}
+	put_cpu();
 }
 
 static int grow_one_stripe(raid5_conf_t *conf)
@@ -953,6 +967,28 @@ static int grow_stripes(raid5_conf_t *conf, int num)
 	return 0;
 }
 
+/**
+ * scribble_len - return the required size of the scribble region
+ * @num - total number of disks in the array
+ *
+ * The size must be enough to contain:
+ * 1/ a struct page pointer for each device in the array +2
+ * 2/ room to convert each entry in (1) to its corresponding dma
+ *    (dma_map_page()) or page (page_address()) address.
+ *
+ * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
+ * calculate over all devices (not just the data blocks), using zeros in place
+ * of the P and Q blocks.
+ */
+static size_t scribble_len(int num)
+{
+	size_t len;
+
+	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
+
+	return len;
+}
+
 static int resize_stripes(raid5_conf_t *conf, int newsize)
 {
 	/* Make all the stripes able to hold 'newsize' devices.
@@ -981,6 +1017,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	struct stripe_head *osh, *nsh;
 	LIST_HEAD(newstripes);
 	struct disk_info *ndisks;
+	unsigned long cpu;
 	int err;
 	struct kmem_cache *sc;
 	int i;
@@ -1046,7 +1083,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	/* Step 3.
 	 * At this point, we are holding all the stripes so the array
 	 * is completely stalled, so now is a good time to resize
-	 * conf->disks.
+	 * conf->disks and the scribble region
 	 */
 	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
 	if (ndisks) {
@@ -1057,10 +1094,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	} else
 		err = -ENOMEM;
 
+	get_online_cpus();
+	conf->scribble_len = scribble_len(newsize);
+	for_each_present_cpu(cpu) {
+		struct raid5_percpu *percpu;
+		void *scribble;
+
+		percpu = per_cpu_ptr(conf->percpu, cpu);
+		scribble = kmalloc(conf->scribble_len, GFP_NOIO);
+
+		if (scribble) {
+			kfree(percpu->scribble);
+			percpu->scribble = scribble;
+		} else {
+			err = -ENOMEM;
+			break;
+		}
+	}
+	put_online_cpus();
+
 	/* Step 4, return new stripes to service */
 	while(!list_empty(&newstripes)) {
 		nsh = list_entry(newstripes.next, struct stripe_head, lru);
 		list_del_init(&nsh->lru);
+
 		for (i=conf->raid_disks; i < newsize; i++)
 			if (nsh->dev[i].page == NULL) {
 				struct page *p = alloc_page(GFP_NOIO);
@@ -4318,6 +4375,7 @@ static void raid5_free_percpu(raid5_conf_t *conf)
 	for_each_possible_cpu(cpu) {
 		percpu = per_cpu_ptr(conf->percpu, cpu);
 		safe_put_page(percpu->spare_page);
+		kfree(percpu->scribble);
 	}
 #ifdef CONFIG_HOTPLUG_CPU
 	unregister_cpu_notifier(&conf->cpu_notify);
@@ -4347,9 +4405,15 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		if (!percpu->spare_page)
+		if (conf->level == 6 && !percpu->spare_page)
 			percpu->spare_page = alloc_page(GFP_KERNEL);
-		if (!percpu->spare_page) {
+		if (!percpu->scribble)
+			percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
+
+		if (!percpu->scribble ||
+		    (conf->level == 6 && !percpu->spare_page)) {
+			safe_put_page(percpu->spare_page);
+			kfree(percpu->scribble);
 			pr_err("%s: failed memory allocation for cpu%ld\n",
 			       __func__, cpu);
 			return NOTIFY_BAD;
@@ -4358,7 +4422,9 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		safe_put_page(percpu->spare_page);
+		kfree(percpu->scribble);
 		percpu->spare_page = NULL;
+		percpu->scribble = NULL;
 		break;
 	default:
 		break;
@@ -4372,12 +4438,9 @@ static int raid5_alloc_percpu(raid5_conf_t *conf)
 	unsigned long cpu;
 	struct page *spare_page;
 	struct raid5_percpu *allcpus;
+	void *scribble;
 	int err;
 
-	/* the only percpu data is the raid6 spare page */
-	if (conf->level != 6)
-		return 0;
-
 	allcpus = alloc_percpu(struct raid5_percpu);
 	if (!allcpus)
 		return -ENOMEM;
@@ -4386,12 +4449,20 @@ static int raid5_alloc_percpu(raid5_conf_t *conf)
 	get_online_cpus();
 	err = 0;
 	for_each_present_cpu(cpu) {
-		spare_page = alloc_page(GFP_KERNEL);
-		if (!spare_page) {
+		if (conf->level == 6) {
+			spare_page = alloc_page(GFP_KERNEL);
+			if (!spare_page) {
+				err = -ENOMEM;
+				break;
+			}
+			per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
+		}
+		scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL);
+		if (!scribble) {
 			err = -ENOMEM;
 			break;
 		}
-		per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
+		per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
 	}
 #ifdef CONFIG_HOTPLUG_CPU
 	conf->cpu_notify.notifier_call = raid456_cpu_notify;
@@ -4443,6 +4514,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 		goto abort;
 
 	conf->raid_disks = mddev->raid_disks;
+	conf->scribble_len = scribble_len(conf->raid_disks);
 	if (mddev->reshape_position == MaxSector)
 		conf->previous_raid_disks = mddev->raid_disks;
 	else
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 07a7a4102f05..e7baabffee86 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -386,7 +386,15 @@ struct raid5_private_data {
 	/* per cpu variables */
 	struct raid5_percpu {
 		struct page	*spare_page; /* Used when checking P/Q in raid6 */
+		void		*scribble;   /* space for constructing buffer
+					      * lists and performing address
+					      * conversions
+					      */
 	} *percpu;
+	size_t			scribble_len; /* size of scribble region must be
+					       * associated with conf to handle
+					       * cpu hotplug while reshaping
+					       */
 #ifdef CONFIG_HOTPLUG_CPU
 	struct notifier_block	cpu_notify;
 #endif