author     Dan Williams <dan.j.williams@intel.com>  2009-09-08 20:42:29 -0400
committer  Dan Williams <dan.j.williams@intel.com>  2009-09-08 20:42:29 -0400
commit     f9dd2134374c8de6b911e2b8652c6c9622eaa658 (patch)
tree       c1b8f8d622941606b9e7247ab31d811ba4295011 /drivers/md/raid5.c
parent     4b652f0db3be891c7b76b109c3b55003b920fc96 (diff)
parent     07a3b417dc3d00802bd7b4874c3e811f0b015a7d (diff)
Merge branch 'md-raid6-accel' into ioat3.2

Conflicts:
	include/linux/dmaengine.h
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--  drivers/md/raid5.c  1486
1 files changed, 910 insertions, 576 deletions
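
Most of this diff is a mechanical conversion of raid5.c from the old async_tx calling convention, where every call carried an ASYNC_TX_* flags argument, a dependency descriptor and a completion callback, to the new struct async_submit_ctl interface brought in by the md-raid6-accel series; the merge also adds the RAID-6 helpers (async_gen_syndrome, async_syndrome_val) and a per-CPU "scribble" region that replaces the on-stack page-pointer arrays. Below is a minimal sketch of the calling-convention change, modelled on the async_copy_data() hunk further down; copy_one_page() is a hypothetical helper used only for illustration, not part of the patch.

#include <linux/async_tx.h>

/* Hypothetical helper, for illustration only -- not part of this patch. */
static struct dma_async_tx_descriptor *
copy_one_page(struct page *dest, struct page *src, size_t len,
	      struct dma_async_tx_descriptor *depend_tx)
{
	struct async_submit_ctl submit;

	/*
	 * Old interface (removed by this merge): dependency, flags and
	 * callback were trailing arguments on every call, e.g.
	 *
	 *   tx = async_memcpy(dest, src, 0, 0, len,
	 *                     ASYNC_TX_DEP_ACK, depend_tx, NULL, NULL);
	 *
	 * New interface: those parameters are bundled up front in an
	 * async_submit_ctl; the final NULL is the optional scribble
	 * (address-conversion) region used by the RAID-6 paths.
	 */
	init_async_submit(&submit, 0, depend_tx, NULL, NULL, NULL);
	return async_memcpy(dest, src, 0, 0, len, &submit);
}

The same pattern repeats for async_xor(), async_gen_syndrome() and the validation calls throughout the hunks that follow.
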
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index bb37fb1b2d82..0a5cf2171214 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -47,7 +47,9 @@
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48#include <linux/raid/pq.h> 48#include <linux/raid/pq.h>
49#include <linux/async_tx.h> 49#include <linux/async_tx.h>
50#include <linux/async.h>
50#include <linux/seq_file.h> 51#include <linux/seq_file.h>
52#include <linux/cpu.h>
51#include "md.h" 53#include "md.h"
52#include "raid5.h" 54#include "raid5.h"
53#include "bitmap.h" 55#include "bitmap.h"
@@ -499,11 +501,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
499 struct page *bio_page; 501 struct page *bio_page;
500 int i; 502 int i;
501 int page_offset; 503 int page_offset;
504 struct async_submit_ctl submit;
502 505
503 if (bio->bi_sector >= sector) 506 if (bio->bi_sector >= sector)
504 page_offset = (signed)(bio->bi_sector - sector) * 512; 507 page_offset = (signed)(bio->bi_sector - sector) * 512;
505 else 508 else
506 page_offset = (signed)(sector - bio->bi_sector) * -512; 509 page_offset = (signed)(sector - bio->bi_sector) * -512;
510
511 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
507 bio_for_each_segment(bvl, bio, i) { 512 bio_for_each_segment(bvl, bio, i) {
508 int len = bio_iovec_idx(bio, i)->bv_len; 513 int len = bio_iovec_idx(bio, i)->bv_len;
509 int clen; 514 int clen;
@@ -525,15 +530,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
525 bio_page = bio_iovec_idx(bio, i)->bv_page; 530 bio_page = bio_iovec_idx(bio, i)->bv_page;
526 if (frombio) 531 if (frombio)
527 tx = async_memcpy(page, bio_page, page_offset, 532 tx = async_memcpy(page, bio_page, page_offset,
528 b_offset, clen, 533 b_offset, clen, &submit);
529 ASYNC_TX_DEP_ACK,
530 tx, NULL, NULL);
531 else 534 else
532 tx = async_memcpy(bio_page, page, b_offset, 535 tx = async_memcpy(bio_page, page, b_offset,
533 page_offset, clen, 536 page_offset, clen, &submit);
534 ASYNC_TX_DEP_ACK,
535 tx, NULL, NULL);
536 } 537 }
538 /* chain the operations */
539 submit.depend_tx = tx;
540
537 if (clen < len) /* hit end of page */ 541 if (clen < len) /* hit end of page */
538 break; 542 break;
539 page_offset += len; 543 page_offset += len;
@@ -592,6 +596,7 @@ static void ops_run_biofill(struct stripe_head *sh)
592{ 596{
593 struct dma_async_tx_descriptor *tx = NULL; 597 struct dma_async_tx_descriptor *tx = NULL;
594 raid5_conf_t *conf = sh->raid_conf; 598 raid5_conf_t *conf = sh->raid_conf;
599 struct async_submit_ctl submit;
595 int i; 600 int i;
596 601
597 pr_debug("%s: stripe %llu\n", __func__, 602 pr_debug("%s: stripe %llu\n", __func__,
@@ -615,22 +620,34 @@ static void ops_run_biofill(struct stripe_head *sh)
615 } 620 }
616 621
617 atomic_inc(&sh->count); 622 atomic_inc(&sh->count);
618 async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 623 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
619 ops_complete_biofill, sh); 624 async_trigger_callback(&submit);
620} 625}
621 626
622static void ops_complete_compute5(void *stripe_head_ref) 627static void mark_target_uptodate(struct stripe_head *sh, int target)
623{ 628{
624 struct stripe_head *sh = stripe_head_ref; 629 struct r5dev *tgt;
625 int target = sh->ops.target;
626 struct r5dev *tgt = &sh->dev[target];
627 630
628 pr_debug("%s: stripe %llu\n", __func__, 631 if (target < 0)
629 (unsigned long long)sh->sector); 632 return;
630 633
634 tgt = &sh->dev[target];
631 set_bit(R5_UPTODATE, &tgt->flags); 635 set_bit(R5_UPTODATE, &tgt->flags);
632 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 636 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
633 clear_bit(R5_Wantcompute, &tgt->flags); 637 clear_bit(R5_Wantcompute, &tgt->flags);
638}
639
640static void ops_complete_compute(void *stripe_head_ref)
641{
642 struct stripe_head *sh = stripe_head_ref;
643
644 pr_debug("%s: stripe %llu\n", __func__,
645 (unsigned long long)sh->sector);
646
647 /* mark the computed target(s) as uptodate */
648 mark_target_uptodate(sh, sh->ops.target);
649 mark_target_uptodate(sh, sh->ops.target2);
650
634 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 651 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
635 if (sh->check_state == check_state_compute_run) 652 if (sh->check_state == check_state_compute_run)
636 sh->check_state = check_state_compute_result; 653 sh->check_state = check_state_compute_result;
@@ -638,16 +655,24 @@ static void ops_complete_compute5(void *stripe_head_ref)
638 release_stripe(sh); 655 release_stripe(sh);
639} 656}
640 657
641static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) 658/* return a pointer to the address conversion region of the scribble buffer */
659static addr_conv_t *to_addr_conv(struct stripe_head *sh,
660 struct raid5_percpu *percpu)
661{
662 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
663}
664
665static struct dma_async_tx_descriptor *
666ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
642{ 667{
643 /* kernel stack size limits the total number of disks */
644 int disks = sh->disks; 668 int disks = sh->disks;
645 struct page *xor_srcs[disks]; 669 struct page **xor_srcs = percpu->scribble;
646 int target = sh->ops.target; 670 int target = sh->ops.target;
647 struct r5dev *tgt = &sh->dev[target]; 671 struct r5dev *tgt = &sh->dev[target];
648 struct page *xor_dest = tgt->page; 672 struct page *xor_dest = tgt->page;
649 int count = 0; 673 int count = 0;
650 struct dma_async_tx_descriptor *tx; 674 struct dma_async_tx_descriptor *tx;
675 struct async_submit_ctl submit;
651 int i; 676 int i;
652 677
653 pr_debug("%s: stripe %llu block: %d\n", 678 pr_debug("%s: stripe %llu block: %d\n",
@@ -660,17 +685,207 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
660 685
661 atomic_inc(&sh->count); 686 atomic_inc(&sh->count);
662 687
688 init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
689 ops_complete_compute, sh, to_addr_conv(sh, percpu));
663 if (unlikely(count == 1)) 690 if (unlikely(count == 1))
664 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 691 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
665 0, NULL, ops_complete_compute5, sh);
666 else 692 else
667 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 693 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
668 ASYNC_TX_XOR_ZERO_DST, NULL,
669 ops_complete_compute5, sh);
670 694
671 return tx; 695 return tx;
672} 696}
673 697
698/* set_syndrome_sources - populate source buffers for gen_syndrome
699 * @srcs - (struct page *) array of size sh->disks
700 * @sh - stripe_head to parse
701 *
702 * Populates srcs in proper layout order for the stripe and returns the
703 * 'count' of sources to be used in a call to async_gen_syndrome. The P
704 * destination buffer is recorded in srcs[count] and the Q destination
705 * is recorded in srcs[count+1]].
706 */
707static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
708{
709 int disks = sh->disks;
710 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
711 int d0_idx = raid6_d0(sh);
712 int count;
713 int i;
714
715 for (i = 0; i < disks; i++)
716 srcs[i] = (void *)raid6_empty_zero_page;
717
718 count = 0;
719 i = d0_idx;
720 do {
721 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
722
723 srcs[slot] = sh->dev[i].page;
724 i = raid6_next_disk(i, disks);
725 } while (i != d0_idx);
726 BUG_ON(count != syndrome_disks);
727
728 return count;
729}
730
731static struct dma_async_tx_descriptor *
732ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
733{
734 int disks = sh->disks;
735 struct page **blocks = percpu->scribble;
736 int target;
737 int qd_idx = sh->qd_idx;
738 struct dma_async_tx_descriptor *tx;
739 struct async_submit_ctl submit;
740 struct r5dev *tgt;
741 struct page *dest;
742 int i;
743 int count;
744
745 if (sh->ops.target < 0)
746 target = sh->ops.target2;
747 else if (sh->ops.target2 < 0)
748 target = sh->ops.target;
749 else
750 /* we should only have one valid target */
751 BUG();
752 BUG_ON(target < 0);
753 pr_debug("%s: stripe %llu block: %d\n",
754 __func__, (unsigned long long)sh->sector, target);
755
756 tgt = &sh->dev[target];
757 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
758 dest = tgt->page;
759
760 atomic_inc(&sh->count);
761
762 if (target == qd_idx) {
763 count = set_syndrome_sources(blocks, sh);
764 blocks[count] = NULL; /* regenerating p is not necessary */
765 BUG_ON(blocks[count+1] != dest); /* q should already be set */
766 init_async_submit(&submit, 0, NULL, ops_complete_compute, sh,
767 to_addr_conv(sh, percpu));
768 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
769 } else {
770 /* Compute any data- or p-drive using XOR */
771 count = 0;
772 for (i = disks; i-- ; ) {
773 if (i == target || i == qd_idx)
774 continue;
775 blocks[count++] = sh->dev[i].page;
776 }
777
778 init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
779 ops_complete_compute, sh,
780 to_addr_conv(sh, percpu));
781 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
782 }
783
784 return tx;
785}
786
787static struct dma_async_tx_descriptor *
788ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
789{
790 int i, count, disks = sh->disks;
791 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
792 int d0_idx = raid6_d0(sh);
793 int faila = -1, failb = -1;
794 int target = sh->ops.target;
795 int target2 = sh->ops.target2;
796 struct r5dev *tgt = &sh->dev[target];
797 struct r5dev *tgt2 = &sh->dev[target2];
798 struct dma_async_tx_descriptor *tx;
799 struct page **blocks = percpu->scribble;
800 struct async_submit_ctl submit;
801
802 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
803 __func__, (unsigned long long)sh->sector, target, target2);
804 BUG_ON(target < 0 || target2 < 0);
805 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
806 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
807
808 /* we need to open-code set_syndrome_sources to handle to the
809 * slot number conversion for 'faila' and 'failb'
810 */
811 for (i = 0; i < disks ; i++)
812 blocks[i] = (void *)raid6_empty_zero_page;
813 count = 0;
814 i = d0_idx;
815 do {
816 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
817
818 blocks[slot] = sh->dev[i].page;
819
820 if (i == target)
821 faila = slot;
822 if (i == target2)
823 failb = slot;
824 i = raid6_next_disk(i, disks);
825 } while (i != d0_idx);
826 BUG_ON(count != syndrome_disks);
827
828 BUG_ON(faila == failb);
829 if (failb < faila)
830 swap(faila, failb);
831 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
832 __func__, (unsigned long long)sh->sector, faila, failb);
833
834 atomic_inc(&sh->count);
835
836 if (failb == syndrome_disks+1) {
837 /* Q disk is one of the missing disks */
838 if (faila == syndrome_disks) {
839 /* Missing P+Q, just recompute */
840 init_async_submit(&submit, 0, NULL, ops_complete_compute,
841 sh, to_addr_conv(sh, percpu));
842 return async_gen_syndrome(blocks, 0, count+2,
843 STRIPE_SIZE, &submit);
844 } else {
845 struct page *dest;
846 int data_target;
847 int qd_idx = sh->qd_idx;
848
849 /* Missing D+Q: recompute D from P, then recompute Q */
850 if (target == qd_idx)
851 data_target = target2;
852 else
853 data_target = target;
854
855 count = 0;
856 for (i = disks; i-- ; ) {
857 if (i == data_target || i == qd_idx)
858 continue;
859 blocks[count++] = sh->dev[i].page;
860 }
861 dest = sh->dev[data_target].page;
862 init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
863 NULL, NULL, to_addr_conv(sh, percpu));
864 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
865 &submit);
866
867 count = set_syndrome_sources(blocks, sh);
868 init_async_submit(&submit, 0, tx, ops_complete_compute,
869 sh, to_addr_conv(sh, percpu));
870 return async_gen_syndrome(blocks, 0, count+2,
871 STRIPE_SIZE, &submit);
872 }
873 }
874
875 init_async_submit(&submit, 0, NULL, ops_complete_compute, sh,
876 to_addr_conv(sh, percpu));
877 if (failb == syndrome_disks) {
878 /* We're missing D+P. */
879 return async_raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE,
880 faila, blocks, &submit);
881 } else {
882 /* We're missing D+D. */
883 return async_raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE,
884 faila, failb, blocks, &submit);
885 }
886}
887
888
674static void ops_complete_prexor(void *stripe_head_ref) 889static void ops_complete_prexor(void *stripe_head_ref)
675{ 890{
676 struct stripe_head *sh = stripe_head_ref; 891 struct stripe_head *sh = stripe_head_ref;
@@ -680,12 +895,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
680} 895}
681 896
682static struct dma_async_tx_descriptor * 897static struct dma_async_tx_descriptor *
683ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 898ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
899 struct dma_async_tx_descriptor *tx)
684{ 900{
685 /* kernel stack size limits the total number of disks */
686 int disks = sh->disks; 901 int disks = sh->disks;
687 struct page *xor_srcs[disks]; 902 struct page **xor_srcs = percpu->scribble;
688 int count = 0, pd_idx = sh->pd_idx, i; 903 int count = 0, pd_idx = sh->pd_idx, i;
904 struct async_submit_ctl submit;
689 905
690 /* existing parity data subtracted */ 906 /* existing parity data subtracted */
691 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 907 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
@@ -700,9 +916,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
700 xor_srcs[count++] = dev->page; 916 xor_srcs[count++] = dev->page;
701 } 917 }
702 918
703 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 919 init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, tx,
704 ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, 920 ops_complete_prexor, sh, to_addr_conv(sh, percpu));
705 ops_complete_prexor, sh); 921 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
706 922
707 return tx; 923 return tx;
708} 924}
@@ -742,17 +958,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
742 return tx; 958 return tx;
743} 959}
744 960
745static void ops_complete_postxor(void *stripe_head_ref) 961static void ops_complete_reconstruct(void *stripe_head_ref)
746{ 962{
747 struct stripe_head *sh = stripe_head_ref; 963 struct stripe_head *sh = stripe_head_ref;
748 int disks = sh->disks, i, pd_idx = sh->pd_idx; 964 int disks = sh->disks;
965 int pd_idx = sh->pd_idx;
966 int qd_idx = sh->qd_idx;
967 int i;
749 968
750 pr_debug("%s: stripe %llu\n", __func__, 969 pr_debug("%s: stripe %llu\n", __func__,
751 (unsigned long long)sh->sector); 970 (unsigned long long)sh->sector);
752 971
753 for (i = disks; i--; ) { 972 for (i = disks; i--; ) {
754 struct r5dev *dev = &sh->dev[i]; 973 struct r5dev *dev = &sh->dev[i];
755 if (dev->written || i == pd_idx) 974
975 if (dev->written || i == pd_idx || i == qd_idx)
756 set_bit(R5_UPTODATE, &dev->flags); 976 set_bit(R5_UPTODATE, &dev->flags);
757 } 977 }
758 978
@@ -770,12 +990,12 @@ static void ops_complete_postxor(void *stripe_head_ref)
770} 990}
771 991
772static void 992static void
773ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 993ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
994 struct dma_async_tx_descriptor *tx)
774{ 995{
775 /* kernel stack size limits the total number of disks */
776 int disks = sh->disks; 996 int disks = sh->disks;
777 struct page *xor_srcs[disks]; 997 struct page **xor_srcs = percpu->scribble;
778 998 struct async_submit_ctl submit;
779 int count = 0, pd_idx = sh->pd_idx, i; 999 int count = 0, pd_idx = sh->pd_idx, i;
780 struct page *xor_dest; 1000 struct page *xor_dest;
781 int prexor = 0; 1001 int prexor = 0;
@@ -809,18 +1029,36 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
809 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1029 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
810 * for the synchronous xor case 1030 * for the synchronous xor case
811 */ 1031 */
812 flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | 1032 flags = ASYNC_TX_ACK |
813 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1033 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
814 1034
815 atomic_inc(&sh->count); 1035 atomic_inc(&sh->count);
816 1036
817 if (unlikely(count == 1)) { 1037 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
818 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); 1038 to_addr_conv(sh, percpu));
819 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 1039 if (unlikely(count == 1))
820 flags, tx, ops_complete_postxor, sh); 1040 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
821 } else 1041 else
822 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1042 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
823 flags, tx, ops_complete_postxor, sh); 1043}
1044
1045static void
1046ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1047 struct dma_async_tx_descriptor *tx)
1048{
1049 struct async_submit_ctl submit;
1050 struct page **blocks = percpu->scribble;
1051 int count;
1052
1053 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1054
1055 count = set_syndrome_sources(blocks, sh);
1056
1057 atomic_inc(&sh->count);
1058
1059 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1060 sh, to_addr_conv(sh, percpu));
1061 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
824} 1062}
825 1063
826static void ops_complete_check(void *stripe_head_ref) 1064static void ops_complete_check(void *stripe_head_ref)
@@ -835,63 +1073,115 @@ static void ops_complete_check(void *stripe_head_ref)
835 release_stripe(sh); 1073 release_stripe(sh);
836} 1074}
837 1075
838static void ops_run_check(struct stripe_head *sh) 1076static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
839{ 1077{
840 /* kernel stack size limits the total number of disks */
841 int disks = sh->disks; 1078 int disks = sh->disks;
842 struct page *xor_srcs[disks]; 1079 int pd_idx = sh->pd_idx;
1080 int qd_idx = sh->qd_idx;
1081 struct page *xor_dest;
1082 struct page **xor_srcs = percpu->scribble;
843 struct dma_async_tx_descriptor *tx; 1083 struct dma_async_tx_descriptor *tx;
844 1084 struct async_submit_ctl submit;
845 int count = 0, pd_idx = sh->pd_idx, i; 1085 int count;
846 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1086 int i;
847 1087
848 pr_debug("%s: stripe %llu\n", __func__, 1088 pr_debug("%s: stripe %llu\n", __func__,
849 (unsigned long long)sh->sector); 1089 (unsigned long long)sh->sector);
850 1090
1091 count = 0;
1092 xor_dest = sh->dev[pd_idx].page;
1093 xor_srcs[count++] = xor_dest;
851 for (i = disks; i--; ) { 1094 for (i = disks; i--; ) {
852 struct r5dev *dev = &sh->dev[i]; 1095 if (i == pd_idx || i == qd_idx)
853 if (i != pd_idx) 1096 continue;
854 xor_srcs[count++] = dev->page; 1097 xor_srcs[count++] = sh->dev[i].page;
855 } 1098 }
856 1099
857 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1100 init_async_submit(&submit, 0, NULL, NULL, NULL,
858 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); 1101 to_addr_conv(sh, percpu));
1102 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1103 &sh->ops.zero_sum_result, &submit);
1104
1105 atomic_inc(&sh->count);
1106 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1107 tx = async_trigger_callback(&submit);
1108}
1109
1110static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1111{
1112 struct page **srcs = percpu->scribble;
1113 struct async_submit_ctl submit;
1114 int count;
1115
1116 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1117 (unsigned long long)sh->sector, checkp);
1118
1119 count = set_syndrome_sources(srcs, sh);
1120 if (!checkp)
1121 srcs[count] = NULL;
859 1122
860 atomic_inc(&sh->count); 1123 atomic_inc(&sh->count);
861 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 1124 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
862 ops_complete_check, sh); 1125 sh, to_addr_conv(sh, percpu));
1126 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1127 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
863} 1128}
864 1129
865static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) 1130static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
866{ 1131{
867 int overlap_clear = 0, i, disks = sh->disks; 1132 int overlap_clear = 0, i, disks = sh->disks;
868 struct dma_async_tx_descriptor *tx = NULL; 1133 struct dma_async_tx_descriptor *tx = NULL;
1134 raid5_conf_t *conf = sh->raid_conf;
1135 int level = conf->level;
1136 struct raid5_percpu *percpu;
1137 unsigned long cpu;
869 1138
1139 cpu = get_cpu();
1140 percpu = per_cpu_ptr(conf->percpu, cpu);
870 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1141 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
871 ops_run_biofill(sh); 1142 ops_run_biofill(sh);
872 overlap_clear++; 1143 overlap_clear++;
873 } 1144 }
874 1145
875 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1146 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
876 tx = ops_run_compute5(sh); 1147 if (level < 6)
877 /* terminate the chain if postxor is not set to be run */ 1148 tx = ops_run_compute5(sh, percpu);
878 if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) 1149 else {
1150 if (sh->ops.target2 < 0 || sh->ops.target < 0)
1151 tx = ops_run_compute6_1(sh, percpu);
1152 else
1153 tx = ops_run_compute6_2(sh, percpu);
1154 }
1155 /* terminate the chain if reconstruct is not set to be run */
1156 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
879 async_tx_ack(tx); 1157 async_tx_ack(tx);
880 } 1158 }
881 1159
882 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1160 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
883 tx = ops_run_prexor(sh, tx); 1161 tx = ops_run_prexor(sh, percpu, tx);
884 1162
885 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1163 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
886 tx = ops_run_biodrain(sh, tx); 1164 tx = ops_run_biodrain(sh, tx);
887 overlap_clear++; 1165 overlap_clear++;
888 } 1166 }
889 1167
890 if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) 1168 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
891 ops_run_postxor(sh, tx); 1169 if (level < 6)
1170 ops_run_reconstruct5(sh, percpu, tx);
1171 else
1172 ops_run_reconstruct6(sh, percpu, tx);
1173 }
892 1174
893 if (test_bit(STRIPE_OP_CHECK, &ops_request)) 1175 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
894 ops_run_check(sh); 1176 if (sh->check_state == check_state_run)
1177 ops_run_check_p(sh, percpu);
1178 else if (sh->check_state == check_state_run_q)
1179 ops_run_check_pq(sh, percpu, 0);
1180 else if (sh->check_state == check_state_run_pq)
1181 ops_run_check_pq(sh, percpu, 1);
1182 else
1183 BUG();
1184 }
895 1185
896 if (overlap_clear) 1186 if (overlap_clear)
897 for (i = disks; i--; ) { 1187 for (i = disks; i--; ) {
@@ -899,6 +1189,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
899 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1189 if (test_and_clear_bit(R5_Overlap, &dev->flags))
900 wake_up(&sh->raid_conf->wait_for_overlap); 1190 wake_up(&sh->raid_conf->wait_for_overlap);
901 } 1191 }
1192 put_cpu();
902} 1193}
903 1194
904static int grow_one_stripe(raid5_conf_t *conf) 1195static int grow_one_stripe(raid5_conf_t *conf)
@@ -948,6 +1239,28 @@ static int grow_stripes(raid5_conf_t *conf, int num)
948 return 0; 1239 return 0;
949} 1240}
950 1241
1242/**
1243 * scribble_len - return the required size of the scribble region
1244 * @num - total number of disks in the array
1245 *
1246 * The size must be enough to contain:
1247 * 1/ a struct page pointer for each device in the array +2
1248 * 2/ room to convert each entry in (1) to its corresponding dma
1249 * (dma_map_page()) or page (page_address()) address.
1250 *
1251 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1252 * calculate over all devices (not just the data blocks), using zeros in place
1253 * of the P and Q blocks.
1254 */
1255static size_t scribble_len(int num)
1256{
1257 size_t len;
1258
1259 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1260
1261 return len;
1262}
1263
951static int resize_stripes(raid5_conf_t *conf, int newsize) 1264static int resize_stripes(raid5_conf_t *conf, int newsize)
952{ 1265{
953 /* Make all the stripes able to hold 'newsize' devices. 1266 /* Make all the stripes able to hold 'newsize' devices.
@@ -976,6 +1289,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
976 struct stripe_head *osh, *nsh; 1289 struct stripe_head *osh, *nsh;
977 LIST_HEAD(newstripes); 1290 LIST_HEAD(newstripes);
978 struct disk_info *ndisks; 1291 struct disk_info *ndisks;
1292 unsigned long cpu;
979 int err; 1293 int err;
980 struct kmem_cache *sc; 1294 struct kmem_cache *sc;
981 int i; 1295 int i;
@@ -1041,7 +1355,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1041 /* Step 3. 1355 /* Step 3.
1042 * At this point, we are holding all the stripes so the array 1356 * At this point, we are holding all the stripes so the array
1043 * is completely stalled, so now is a good time to resize 1357 * is completely stalled, so now is a good time to resize
1044 * conf->disks. 1358 * conf->disks and the scribble region
1045 */ 1359 */
1046 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1360 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1047 if (ndisks) { 1361 if (ndisks) {
@@ -1052,10 +1366,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1052 } else 1366 } else
1053 err = -ENOMEM; 1367 err = -ENOMEM;
1054 1368
1369 get_online_cpus();
1370 conf->scribble_len = scribble_len(newsize);
1371 for_each_present_cpu(cpu) {
1372 struct raid5_percpu *percpu;
1373 void *scribble;
1374
1375 percpu = per_cpu_ptr(conf->percpu, cpu);
1376 scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1377
1378 if (scribble) {
1379 kfree(percpu->scribble);
1380 percpu->scribble = scribble;
1381 } else {
1382 err = -ENOMEM;
1383 break;
1384 }
1385 }
1386 put_online_cpus();
1387
1055 /* Step 4, return new stripes to service */ 1388 /* Step 4, return new stripes to service */
1056 while(!list_empty(&newstripes)) { 1389 while(!list_empty(&newstripes)) {
1057 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1390 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1058 list_del_init(&nsh->lru); 1391 list_del_init(&nsh->lru);
1392
1059 for (i=conf->raid_disks; i < newsize; i++) 1393 for (i=conf->raid_disks; i < newsize; i++)
1060 if (nsh->dev[i].page == NULL) { 1394 if (nsh->dev[i].page == NULL) {
1061 struct page *p = alloc_page(GFP_NOIO); 1395 struct page *p = alloc_page(GFP_NOIO);
@@ -1594,258 +1928,13 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1594} 1928}
1595 1929
1596 1930
1597
1598/*
1599 * Copy data between a page in the stripe cache, and one or more bion
1600 * The page could align with the middle of the bio, or there could be
1601 * several bion, each with several bio_vecs, which cover part of the page
1602 * Multiple bion are linked together on bi_next. There may be extras
1603 * at the end of this list. We ignore them.
1604 */
1605static void copy_data(int frombio, struct bio *bio,
1606 struct page *page,
1607 sector_t sector)
1608{
1609 char *pa = page_address(page);
1610 struct bio_vec *bvl;
1611 int i;
1612 int page_offset;
1613
1614 if (bio->bi_sector >= sector)
1615 page_offset = (signed)(bio->bi_sector - sector) * 512;
1616 else
1617 page_offset = (signed)(sector - bio->bi_sector) * -512;
1618 bio_for_each_segment(bvl, bio, i) {
1619 int len = bio_iovec_idx(bio,i)->bv_len;
1620 int clen;
1621 int b_offset = 0;
1622
1623 if (page_offset < 0) {
1624 b_offset = -page_offset;
1625 page_offset += b_offset;
1626 len -= b_offset;
1627 }
1628
1629 if (len > 0 && page_offset + len > STRIPE_SIZE)
1630 clen = STRIPE_SIZE - page_offset;
1631 else clen = len;
1632
1633 if (clen > 0) {
1634 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
1635 if (frombio)
1636 memcpy(pa+page_offset, ba+b_offset, clen);
1637 else
1638 memcpy(ba+b_offset, pa+page_offset, clen);
1639 __bio_kunmap_atomic(ba, KM_USER0);
1640 }
1641 if (clen < len) /* hit end of page */
1642 break;
1643 page_offset += len;
1644 }
1645}
1646
1647#define check_xor() do { \
1648 if (count == MAX_XOR_BLOCKS) { \
1649 xor_blocks(count, STRIPE_SIZE, dest, ptr);\
1650 count = 0; \
1651 } \
1652 } while(0)
1653
1654static void compute_parity6(struct stripe_head *sh, int method)
1655{
1656 raid5_conf_t *conf = sh->raid_conf;
1657 int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
1658 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1659 struct bio *chosen;
1660 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1661 void *ptrs[syndrome_disks+2];
1662
1663 pd_idx = sh->pd_idx;
1664 qd_idx = sh->qd_idx;
1665 d0_idx = raid6_d0(sh);
1666
1667 pr_debug("compute_parity, stripe %llu, method %d\n",
1668 (unsigned long long)sh->sector, method);
1669
1670 switch(method) {
1671 case READ_MODIFY_WRITE:
1672 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
1673 case RECONSTRUCT_WRITE:
1674 for (i= disks; i-- ;)
1675 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
1676 chosen = sh->dev[i].towrite;
1677 sh->dev[i].towrite = NULL;
1678
1679 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1680 wake_up(&conf->wait_for_overlap);
1681
1682 BUG_ON(sh->dev[i].written);
1683 sh->dev[i].written = chosen;
1684 }
1685 break;
1686 case CHECK_PARITY:
1687 BUG(); /* Not implemented yet */
1688 }
1689
1690 for (i = disks; i--;)
1691 if (sh->dev[i].written) {
1692 sector_t sector = sh->dev[i].sector;
1693 struct bio *wbi = sh->dev[i].written;
1694 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1695 copy_data(1, wbi, sh->dev[i].page, sector);
1696 wbi = r5_next_bio(wbi, sector);
1697 }
1698
1699 set_bit(R5_LOCKED, &sh->dev[i].flags);
1700 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1701 }
1702
1703 /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
1704
1705 for (i = 0; i < disks; i++)
1706 ptrs[i] = (void *)raid6_empty_zero_page;
1707
1708 count = 0;
1709 i = d0_idx;
1710 do {
1711 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1712
1713 ptrs[slot] = page_address(sh->dev[i].page);
1714 if (slot < syndrome_disks &&
1715 !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
1716 printk(KERN_ERR "block %d/%d not uptodate "
1717 "on parity calc\n", i, count);
1718 BUG();
1719 }
1720
1721 i = raid6_next_disk(i, disks);
1722 } while (i != d0_idx);
1723 BUG_ON(count != syndrome_disks);
1724
1725 raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
1726
1727 switch(method) {
1728 case RECONSTRUCT_WRITE:
1729 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1730 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1731 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1732 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
1733 break;
1734 case UPDATE_PARITY:
1735 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1736 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1737 break;
1738 }
1739}
1740
1741
1742/* Compute one missing block */
1743static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1744{
1745 int i, count, disks = sh->disks;
1746 void *ptr[MAX_XOR_BLOCKS], *dest, *p;
1747 int qd_idx = sh->qd_idx;
1748
1749 pr_debug("compute_block_1, stripe %llu, idx %d\n",
1750 (unsigned long long)sh->sector, dd_idx);
1751
1752 if ( dd_idx == qd_idx ) {
1753 /* We're actually computing the Q drive */
1754 compute_parity6(sh, UPDATE_PARITY);
1755 } else {
1756 dest = page_address(sh->dev[dd_idx].page);
1757 if (!nozero) memset(dest, 0, STRIPE_SIZE);
1758 count = 0;
1759 for (i = disks ; i--; ) {
1760 if (i == dd_idx || i == qd_idx)
1761 continue;
1762 p = page_address(sh->dev[i].page);
1763 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
1764 ptr[count++] = p;
1765 else
1766 printk("compute_block() %d, stripe %llu, %d"
1767 " not present\n", dd_idx,
1768 (unsigned long long)sh->sector, i);
1769
1770 check_xor();
1771 }
1772 if (count)
1773 xor_blocks(count, STRIPE_SIZE, dest, ptr);
1774 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1775 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1776 }
1777}
1778
1779/* Compute two missing blocks */
1780static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1781{
1782 int i, count, disks = sh->disks;
1783 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1784 int d0_idx = raid6_d0(sh);
1785 int faila = -1, failb = -1;
1786 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1787 void *ptrs[syndrome_disks+2];
1788
1789 for (i = 0; i < disks ; i++)
1790 ptrs[i] = (void *)raid6_empty_zero_page;
1791 count = 0;
1792 i = d0_idx;
1793 do {
1794 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1795
1796 ptrs[slot] = page_address(sh->dev[i].page);
1797
1798 if (i == dd_idx1)
1799 faila = slot;
1800 if (i == dd_idx2)
1801 failb = slot;
1802 i = raid6_next_disk(i, disks);
1803 } while (i != d0_idx);
1804 BUG_ON(count != syndrome_disks);
1805
1806 BUG_ON(faila == failb);
1807 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1808
1809 pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1810 (unsigned long long)sh->sector, dd_idx1, dd_idx2,
1811 faila, failb);
1812
1813 if (failb == syndrome_disks+1) {
1814 /* Q disk is one of the missing disks */
1815 if (faila == syndrome_disks) {
1816 /* Missing P+Q, just recompute */
1817 compute_parity6(sh, UPDATE_PARITY);
1818 return;
1819 } else {
1820 /* We're missing D+Q; recompute D from P */
1821 compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
1822 dd_idx2 : dd_idx1),
1823 0);
1824 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1825 return;
1826 }
1827 }
1828
1829 /* We're missing D+P or D+D; */
1830 if (failb == syndrome_disks) {
1831 /* We're missing D+P. */
1832 raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
1833 } else {
1834 /* We're missing D+D. */
1835 raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
1836 ptrs);
1837 }
1838
1839 /* Both the above update both missing blocks */
1840 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1841 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1842}
1843
1844static void 1931static void
1845schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, 1932schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
1846 int rcw, int expand) 1933 int rcw, int expand)
1847{ 1934{
1848 int i, pd_idx = sh->pd_idx, disks = sh->disks; 1935 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1936 raid5_conf_t *conf = sh->raid_conf;
1937 int level = conf->level;
1849 1938
1850 if (rcw) { 1939 if (rcw) {
1851 /* if we are not expanding this is a proper write request, and 1940 /* if we are not expanding this is a proper write request, and
@@ -1858,7 +1947,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1858 } else 1947 } else
1859 sh->reconstruct_state = reconstruct_state_run; 1948 sh->reconstruct_state = reconstruct_state_run;
1860 1949
1861 set_bit(STRIPE_OP_POSTXOR, &s->ops_request); 1950 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1862 1951
1863 for (i = disks; i--; ) { 1952 for (i = disks; i--; ) {
1864 struct r5dev *dev = &sh->dev[i]; 1953 struct r5dev *dev = &sh->dev[i];
@@ -1871,17 +1960,18 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1871 s->locked++; 1960 s->locked++;
1872 } 1961 }
1873 } 1962 }
1874 if (s->locked + 1 == disks) 1963 if (s->locked + conf->max_degraded == disks)
1875 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 1964 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1876 atomic_inc(&sh->raid_conf->pending_full_writes); 1965 atomic_inc(&conf->pending_full_writes);
1877 } else { 1966 } else {
1967 BUG_ON(level == 6);
1878 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 1968 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1879 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 1969 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1880 1970
1881 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 1971 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
1882 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 1972 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
1883 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 1973 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1884 set_bit(STRIPE_OP_POSTXOR, &s->ops_request); 1974 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1885 1975
1886 for (i = disks; i--; ) { 1976 for (i = disks; i--; ) {
1887 struct r5dev *dev = &sh->dev[i]; 1977 struct r5dev *dev = &sh->dev[i];
@@ -1899,13 +1989,22 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1899 } 1989 }
1900 } 1990 }
1901 1991
1902 /* keep the parity disk locked while asynchronous operations 1992 /* keep the parity disk(s) locked while asynchronous operations
1903 * are in flight 1993 * are in flight
1904 */ 1994 */
1905 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 1995 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1906 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1996 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1907 s->locked++; 1997 s->locked++;
1908 1998
1999 if (level == 6) {
2000 int qd_idx = sh->qd_idx;
2001 struct r5dev *dev = &sh->dev[qd_idx];
2002
2003 set_bit(R5_LOCKED, &dev->flags);
2004 clear_bit(R5_UPTODATE, &dev->flags);
2005 s->locked++;
2006 }
2007
1909 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2008 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
1910 __func__, (unsigned long long)sh->sector, 2009 __func__, (unsigned long long)sh->sector,
1911 s->locked, s->ops_request); 2010 s->locked, s->ops_request);
@@ -1986,13 +2085,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1986 2085
1987static void end_reshape(raid5_conf_t *conf); 2086static void end_reshape(raid5_conf_t *conf);
1988 2087
1989static int page_is_zero(struct page *p)
1990{
1991 char *a = page_address(p);
1992 return ((*(u32*)a) == 0 &&
1993 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1994}
1995
1996static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, 2088static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
1997 struct stripe_head *sh) 2089 struct stripe_head *sh)
1998{ 2090{
@@ -2133,9 +2225,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
2133 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2225 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2134 set_bit(R5_Wantcompute, &dev->flags); 2226 set_bit(R5_Wantcompute, &dev->flags);
2135 sh->ops.target = disk_idx; 2227 sh->ops.target = disk_idx;
2228 sh->ops.target2 = -1;
2136 s->req_compute = 1; 2229 s->req_compute = 1;
2137 /* Careful: from this point on 'uptodate' is in the eye 2230 /* Careful: from this point on 'uptodate' is in the eye
2138 * of raid5_run_ops which services 'compute' operations 2231 * of raid_run_ops which services 'compute' operations
2139 * before writes. R5_Wantcompute flags a block that will 2232 * before writes. R5_Wantcompute flags a block that will
2140 * be R5_UPTODATE by the time it is needed for a 2233 * be R5_UPTODATE by the time it is needed for a
2141 * subsequent operation. 2234 * subsequent operation.
@@ -2174,61 +2267,104 @@ static void handle_stripe_fill5(struct stripe_head *sh,
2174 set_bit(STRIPE_HANDLE, &sh->state); 2267 set_bit(STRIPE_HANDLE, &sh->state);
2175} 2268}
2176 2269
2177static void handle_stripe_fill6(struct stripe_head *sh, 2270/* fetch_block6 - checks the given member device to see if its data needs
2178 struct stripe_head_state *s, struct r6_state *r6s, 2271 * to be read or computed to satisfy a request.
2179 int disks) 2272 *
2273 * Returns 1 when no more member devices need to be checked, otherwise returns
2274 * 0 to tell the loop in handle_stripe_fill6 to continue
2275 */
2276static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2277 struct r6_state *r6s, int disk_idx, int disks)
2180{ 2278{
2181 int i; 2279 struct r5dev *dev = &sh->dev[disk_idx];
2182 for (i = disks; i--; ) { 2280 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
2183 struct r5dev *dev = &sh->dev[i]; 2281 &sh->dev[r6s->failed_num[1]] };
2184 if (!test_bit(R5_LOCKED, &dev->flags) && 2282
2185 !test_bit(R5_UPTODATE, &dev->flags) && 2283 if (!test_bit(R5_LOCKED, &dev->flags) &&
2186 (dev->toread || (dev->towrite && 2284 !test_bit(R5_UPTODATE, &dev->flags) &&
2187 !test_bit(R5_OVERWRITE, &dev->flags)) || 2285 (dev->toread ||
2188 s->syncing || s->expanding || 2286 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2189 (s->failed >= 1 && 2287 s->syncing || s->expanding ||
2190 (sh->dev[r6s->failed_num[0]].toread || 2288 (s->failed >= 1 &&
2191 s->to_write)) || 2289 (fdev[0]->toread || s->to_write)) ||
2192 (s->failed >= 2 && 2290 (s->failed >= 2 &&
2193 (sh->dev[r6s->failed_num[1]].toread || 2291 (fdev[1]->toread || s->to_write)))) {
2194 s->to_write)))) { 2292 /* we would like to get this block, possibly by computing it,
2195 /* we would like to get this block, possibly 2293 * otherwise read it if the backing disk is insync
2196 * by computing it, but we might not be able to 2294 */
2295 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2296 BUG_ON(test_bit(R5_Wantread, &dev->flags));
2297 if ((s->uptodate == disks - 1) &&
2298 (s->failed && (disk_idx == r6s->failed_num[0] ||
2299 disk_idx == r6s->failed_num[1]))) {
2300 /* have disk failed, and we're requested to fetch it;
2301 * do compute it
2197 */ 2302 */
2198 if ((s->uptodate == disks - 1) && 2303 pr_debug("Computing stripe %llu block %d\n",
2199 (s->failed && (i == r6s->failed_num[0] || 2304 (unsigned long long)sh->sector, disk_idx);
2200 i == r6s->failed_num[1]))) { 2305 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2201 pr_debug("Computing stripe %llu block %d\n", 2306 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2202 (unsigned long long)sh->sector, i); 2307 set_bit(R5_Wantcompute, &dev->flags);
2203 compute_block_1(sh, i, 0); 2308 sh->ops.target = disk_idx;
2204 s->uptodate++; 2309 sh->ops.target2 = -1; /* no 2nd target */
2205 } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { 2310 s->req_compute = 1;
2206 /* Computing 2-failure is *very* expensive; only 2311 s->uptodate++;
2207 * do it if failed >= 2 2312 return 1;
2208 */ 2313 } else if (s->uptodate == disks-2 && s->failed >= 2) {
2209 int other; 2314 /* Computing 2-failure is *very* expensive; only
2210 for (other = disks; other--; ) { 2315 * do it if failed >= 2
2211 if (other == i) 2316 */
2212 continue; 2317 int other;
2213 if (!test_bit(R5_UPTODATE, 2318 for (other = disks; other--; ) {
2214 &sh->dev[other].flags)) 2319 if (other == disk_idx)
2215 break; 2320 continue;
2216 } 2321 if (!test_bit(R5_UPTODATE,
2217 BUG_ON(other < 0); 2322 &sh->dev[other].flags))
2218 pr_debug("Computing stripe %llu blocks %d,%d\n", 2323 break;
2219 (unsigned long long)sh->sector,
2220 i, other);
2221 compute_block_2(sh, i, other);
2222 s->uptodate += 2;
2223 } else if (test_bit(R5_Insync, &dev->flags)) {
2224 set_bit(R5_LOCKED, &dev->flags);
2225 set_bit(R5_Wantread, &dev->flags);
2226 s->locked++;
2227 pr_debug("Reading block %d (sync=%d)\n",
2228 i, s->syncing);
2229 } 2324 }
2325 BUG_ON(other < 0);
2326 pr_debug("Computing stripe %llu blocks %d,%d\n",
2327 (unsigned long long)sh->sector,
2328 disk_idx, other);
2329 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2330 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2331 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
2332 set_bit(R5_Wantcompute, &sh->dev[other].flags);
2333 sh->ops.target = disk_idx;
2334 sh->ops.target2 = other;
2335 s->uptodate += 2;
2336 s->req_compute = 1;
2337 return 1;
2338 } else if (test_bit(R5_Insync, &dev->flags)) {
2339 set_bit(R5_LOCKED, &dev->flags);
2340 set_bit(R5_Wantread, &dev->flags);
2341 s->locked++;
2342 pr_debug("Reading block %d (sync=%d)\n",
2343 disk_idx, s->syncing);
2230 } 2344 }
2231 } 2345 }
2346
2347 return 0;
2348}
2349
2350/**
2351 * handle_stripe_fill6 - read or compute data to satisfy pending requests.
2352 */
2353static void handle_stripe_fill6(struct stripe_head *sh,
2354 struct stripe_head_state *s, struct r6_state *r6s,
2355 int disks)
2356{
2357 int i;
2358
2359 /* look for blocks to read/compute, skip this if a compute
2360 * is already in flight, or if the stripe contents are in the
2361 * midst of changing due to a write
2362 */
2363 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2364 !sh->reconstruct_state)
2365 for (i = disks; i--; )
2366 if (fetch_block6(sh, s, r6s, i, disks))
2367 break;
2232 set_bit(STRIPE_HANDLE, &sh->state); 2368 set_bit(STRIPE_HANDLE, &sh->state);
2233} 2369}
2234 2370
@@ -2362,114 +2498,61 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
2362 */ 2498 */
2363 /* since handle_stripe can be called at any time we need to handle the 2499 /* since handle_stripe can be called at any time we need to handle the
2364 * case where a compute block operation has been submitted and then a 2500 * case where a compute block operation has been submitted and then a
2365 * subsequent call wants to start a write request. raid5_run_ops only 2501 * subsequent call wants to start a write request. raid_run_ops only
2366 * handles the case where compute block and postxor are requested 2502 * handles the case where compute block and reconstruct are requested
2367 * simultaneously. If this is not the case then new writes need to be 2503 * simultaneously. If this is not the case then new writes need to be
2368 * held off until the compute completes. 2504 * held off until the compute completes.
2369 */ 2505 */
2370 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2506 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2371 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2507 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2372 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2508 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2373 schedule_reconstruction5(sh, s, rcw == 0, 0); 2509 schedule_reconstruction(sh, s, rcw == 0, 0);
2374} 2510}
2375 2511
2376static void handle_stripe_dirtying6(raid5_conf_t *conf, 2512static void handle_stripe_dirtying6(raid5_conf_t *conf,
2377 struct stripe_head *sh, struct stripe_head_state *s, 2513 struct stripe_head *sh, struct stripe_head_state *s,
2378 struct r6_state *r6s, int disks) 2514 struct r6_state *r6s, int disks)
2379{ 2515{
2380 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; 2516 int rcw = 0, pd_idx = sh->pd_idx, i;
2381 int qd_idx = sh->qd_idx; 2517 int qd_idx = sh->qd_idx;
2518
2519 set_bit(STRIPE_HANDLE, &sh->state);
2382 for (i = disks; i--; ) { 2520 for (i = disks; i--; ) {
2383 struct r5dev *dev = &sh->dev[i]; 2521 struct r5dev *dev = &sh->dev[i];
2384 /* Would I have to read this buffer for reconstruct_write */ 2522 /* check if we haven't enough data */
2385 if (!test_bit(R5_OVERWRITE, &dev->flags) 2523 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2386 && i != pd_idx && i != qd_idx 2524 i != pd_idx && i != qd_idx &&
2387 && (!test_bit(R5_LOCKED, &dev->flags) 2525 !test_bit(R5_LOCKED, &dev->flags) &&
2388 ) && 2526 !(test_bit(R5_UPTODATE, &dev->flags) ||
2389 !test_bit(R5_UPTODATE, &dev->flags)) { 2527 test_bit(R5_Wantcompute, &dev->flags))) {
2390 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2528 rcw++;
2391 else { 2529 if (!test_bit(R5_Insync, &dev->flags))
2392 pr_debug("raid6: must_compute: " 2530 continue; /* it's a failed drive */
2393 "disk %d flags=%#lx\n", i, dev->flags); 2531
2394 must_compute++; 2532 if (
2533 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2534 pr_debug("Read_old stripe %llu "
2535 "block %d for Reconstruct\n",
2536 (unsigned long long)sh->sector, i);
2537 set_bit(R5_LOCKED, &dev->flags);
2538 set_bit(R5_Wantread, &dev->flags);
2539 s->locked++;
2540 } else {
2541 pr_debug("Request delayed stripe %llu "
2542 "block %d for Reconstruct\n",
2543 (unsigned long long)sh->sector, i);
2544 set_bit(STRIPE_DELAYED, &sh->state);
2545 set_bit(STRIPE_HANDLE, &sh->state);
2395 } 2546 }
2396 } 2547 }
2397 } 2548 }
2398 pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
2399 (unsigned long long)sh->sector, rcw, must_compute);
2400 set_bit(STRIPE_HANDLE, &sh->state);
2401
2402 if (rcw > 0)
2403 /* want reconstruct write, but need to get some data */
2404 for (i = disks; i--; ) {
2405 struct r5dev *dev = &sh->dev[i];
2406 if (!test_bit(R5_OVERWRITE, &dev->flags)
2407 && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
2408 && !test_bit(R5_LOCKED, &dev->flags) &&
2409 !test_bit(R5_UPTODATE, &dev->flags) &&
2410 test_bit(R5_Insync, &dev->flags)) {
2411 if (
2412 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2413 pr_debug("Read_old stripe %llu "
2414 "block %d for Reconstruct\n",
2415 (unsigned long long)sh->sector, i);
2416 set_bit(R5_LOCKED, &dev->flags);
2417 set_bit(R5_Wantread, &dev->flags);
2418 s->locked++;
2419 } else {
2420 pr_debug("Request delayed stripe %llu "
2421 "block %d for Reconstruct\n",
2422 (unsigned long long)sh->sector, i);
2423 set_bit(STRIPE_DELAYED, &sh->state);
2424 set_bit(STRIPE_HANDLE, &sh->state);
2425 }
2426 }
2427 }
2428 /* now if nothing is locked, and if we have enough data, we can start a 2549 /* now if nothing is locked, and if we have enough data, we can start a
2429 * write request 2550 * write request
2430 */ 2551 */
2431 if (s->locked == 0 && rcw == 0 && 2552 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2553 s->locked == 0 && rcw == 0 &&
2432 !test_bit(STRIPE_BIT_DELAY, &sh->state)) { 2554 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2433 if (must_compute > 0) { 2555 schedule_reconstruction(sh, s, 1, 0);
2434 /* We have failed blocks and need to compute them */
2435 switch (s->failed) {
2436 case 0:
2437 BUG();
2438 case 1:
2439 compute_block_1(sh, r6s->failed_num[0], 0);
2440 break;
2441 case 2:
2442 compute_block_2(sh, r6s->failed_num[0],
2443 r6s->failed_num[1]);
2444 break;
2445 default: /* This request should have been failed? */
2446 BUG();
2447 }
2448 }
2449
2450 pr_debug("Computing parity for stripe %llu\n",
2451 (unsigned long long)sh->sector);
2452 compute_parity6(sh, RECONSTRUCT_WRITE);
2453 /* now every locked buffer is ready to be written */
2454 for (i = disks; i--; )
2455 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2456 pr_debug("Writing stripe %llu block %d\n",
2457 (unsigned long long)sh->sector, i);
2458 s->locked++;
2459 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2460 }
2461 if (s->locked == disks)
2462 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2463 atomic_inc(&conf->pending_full_writes);
2464 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2465 set_bit(STRIPE_INSYNC, &sh->state);
2466
2467 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2468 atomic_dec(&conf->preread_active_stripes);
2469 if (atomic_read(&conf->preread_active_stripes) <
2470 IO_THRESHOLD)
2471 md_wakeup_thread(conf->mddev->thread);
2472 }
2473 } 2556 }
2474} 2557}
2475 2558
@@ -2528,7 +2611,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2528 * we are done. Otherwise update the mismatch count and repair 2611 * we are done. Otherwise update the mismatch count and repair
2529 * parity if !MD_RECOVERY_CHECK 2612 * parity if !MD_RECOVERY_CHECK
2530 */ 2613 */
2531 if (sh->ops.zero_sum_result == 0) 2614 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
2532 /* parity is correct (on disc, 2615 /* parity is correct (on disc,
2533 * not in buffer any more) 2616 * not in buffer any more)
2534 */ 2617 */
@@ -2545,6 +2628,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2545 set_bit(R5_Wantcompute, 2628 set_bit(R5_Wantcompute,
2546 &sh->dev[sh->pd_idx].flags); 2629 &sh->dev[sh->pd_idx].flags);
2547 sh->ops.target = sh->pd_idx; 2630 sh->ops.target = sh->pd_idx;
2631 sh->ops.target2 = -1;
2548 s->uptodate++; 2632 s->uptodate++;
2549 } 2633 }
2550 } 2634 }
@@ -2561,67 +2645,74 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2561 2645
2562 2646
2563static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, 2647static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2564 struct stripe_head_state *s, 2648 struct stripe_head_state *s,
2565 struct r6_state *r6s, struct page *tmp_page, 2649 struct r6_state *r6s, int disks)
2566 int disks)
2567{ 2650{
2568 int update_p = 0, update_q = 0;
2569 struct r5dev *dev;
2570 int pd_idx = sh->pd_idx; 2651 int pd_idx = sh->pd_idx;
2571 int qd_idx = sh->qd_idx; 2652 int qd_idx = sh->qd_idx;
2653 struct r5dev *dev;
2572 2654
2573 set_bit(STRIPE_HANDLE, &sh->state); 2655 set_bit(STRIPE_HANDLE, &sh->state);
2574 2656
2575 BUG_ON(s->failed > 2); 2657 BUG_ON(s->failed > 2);
2576 BUG_ON(s->uptodate < disks); 2658
2577 /* Want to check and possibly repair P and Q. 2659 /* Want to check and possibly repair P and Q.
2578 * However there could be one 'failed' device, in which 2660 * However there could be one 'failed' device, in which
2579 * case we can only check one of them, possibly using the 2661 * case we can only check one of them, possibly using the
2580 * other to generate missing data 2662 * other to generate missing data
2581 */ 2663 */
2582 2664
2583 /* If !tmp_page, we cannot do the calculations, 2665 switch (sh->check_state) {
2584 * but as we have set STRIPE_HANDLE, we will soon be called 2666 case check_state_idle:
2585 * by stripe_handle with a tmp_page - just wait until then. 2667 /* start a new check operation if there are < 2 failures */
2586 */
2587 if (tmp_page) {
2588 if (s->failed == r6s->q_failed) { 2668 if (s->failed == r6s->q_failed) {
2589 /* The only possible failed device holds 'Q', so it 2669 /* The only possible failed device holds Q, so it
2590 * makes sense to check P (If anything else were failed, 2670 * makes sense to check P (If anything else were failed,
2591 * we would have used P to recreate it). 2671 * we would have used P to recreate it).
2592 */ 2672 */
2593 compute_block_1(sh, pd_idx, 1); 2673 sh->check_state = check_state_run;
2594 if (!page_is_zero(sh->dev[pd_idx].page)) {
2595 compute_block_1(sh, pd_idx, 0);
2596 update_p = 1;
2597 }
2598 } 2674 }
2599 if (!r6s->q_failed && s->failed < 2) { 2675 if (!r6s->q_failed && s->failed < 2) {
2600 /* q is not failed, and we didn't use it to generate 2676 /* Q is not failed, and we didn't use it to generate
2601 * anything, so it makes sense to check it 2677 * anything, so it makes sense to check it
2602 */ 2678 */
2603 memcpy(page_address(tmp_page), 2679 if (sh->check_state == check_state_run)
2604 page_address(sh->dev[qd_idx].page), 2680 sh->check_state = check_state_run_pq;
2605 STRIPE_SIZE); 2681 else
2606 compute_parity6(sh, UPDATE_PARITY); 2682 sh->check_state = check_state_run_q;
2607 if (memcmp(page_address(tmp_page),
2608 page_address(sh->dev[qd_idx].page),
2609 STRIPE_SIZE) != 0) {
2610 clear_bit(STRIPE_INSYNC, &sh->state);
2611 update_q = 1;
2612 }
2613 } 2683 }
2614 if (update_p || update_q) { 2684
2615 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2685 /* discard potentially stale zero_sum_result */
2616 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2686 sh->ops.zero_sum_result = 0;
2617 /* don't try to repair!! */ 2687
2618 update_p = update_q = 0; 2688 if (sh->check_state == check_state_run) {
2689 /* async_xor_zero_sum destroys the contents of P */
2690 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2691 s->uptodate--;
2619 } 2692 }
2693 if (sh->check_state >= check_state_run &&
2694 sh->check_state <= check_state_run_pq) {
2695 /* async_syndrome_zero_sum preserves P and Q, so
2696 * no need to mark them !uptodate here
2697 */
2698 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2699 break;
2700 }
2701
2702 /* we have 2-disk failure */
2703 BUG_ON(s->failed != 2);
2704 /* fall through */
2705 case check_state_compute_result:
2706 sh->check_state = check_state_idle;
2707
2708 /* check that a write has not made the stripe insync */
2709 if (test_bit(STRIPE_INSYNC, &sh->state))
2710 break;
2620 2711
2621 /* now write out any block on a failed drive, 2712 /* now write out any block on a failed drive,
2622 * or P or Q if they need it 2713 * or P or Q if they were recomputed
2623 */ 2714 */
2624 2715 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2625 if (s->failed == 2) { 2716 if (s->failed == 2) {
2626 dev = &sh->dev[r6s->failed_num[1]]; 2717 dev = &sh->dev[r6s->failed_num[1]];
2627 s->locked++; 2718 s->locked++;
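
Editorial note: the check_state_idle branch above boils down to a small decision table driven by s->failed and r6s->q_failed: with at most one failure, P is checkable when the only possible failure is Q, and Q is checkable whenever Q itself survived. The standalone sketch below models only that selection; the enum names mirror the check_state_* values in the patch, while pick_check_state() is a hypothetical helper written purely for illustration and is not part of the driver.

	#include <assert.h>
	#include <stdio.h>

	enum check_state { check_idle, check_run, check_run_q, check_run_pq };

	/* Hypothetical helper mirroring the check_state_idle branch:
	 * failed   - number of failed devices in the stripe (0..2)
	 * q_failed - 1 if the Q block's device is among the failed ones
	 */
	static enum check_state pick_check_state(int failed, int q_failed)
	{
		enum check_state st = check_idle;

		if (failed == q_failed)      /* only possible failure is Q: P is checkable */
			st = check_run;
		if (!q_failed && failed < 2) /* Q survived and was not used for recovery */
			st = (st == check_run) ? check_run_pq : check_run_q;
		return st;
	}

	int main(void)
	{
		assert(pick_check_state(0, 0) == check_run_pq); /* no failures: verify both */
		assert(pick_check_state(1, 1) == check_run);    /* Q failed: only P checkable */
		assert(pick_check_state(1, 0) == check_run_q);  /* a data or P disk failed */
		/* failed == 2 stays check_idle and falls through to compute_result */
		printf("check-state selection ok\n");
		return 0;
	}
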
@@ -2634,14 +2725,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2634 set_bit(R5_LOCKED, &dev->flags); 2725 set_bit(R5_LOCKED, &dev->flags);
2635 set_bit(R5_Wantwrite, &dev->flags); 2726 set_bit(R5_Wantwrite, &dev->flags);
2636 } 2727 }
2637 2728 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2638 if (update_p) {
2639 dev = &sh->dev[pd_idx]; 2729 dev = &sh->dev[pd_idx];
2640 s->locked++; 2730 s->locked++;
2641 set_bit(R5_LOCKED, &dev->flags); 2731 set_bit(R5_LOCKED, &dev->flags);
2642 set_bit(R5_Wantwrite, &dev->flags); 2732 set_bit(R5_Wantwrite, &dev->flags);
2643 } 2733 }
2644 if (update_q) { 2734 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2645 dev = &sh->dev[qd_idx]; 2735 dev = &sh->dev[qd_idx];
2646 s->locked++; 2736 s->locked++;
2647 set_bit(R5_LOCKED, &dev->flags); 2737 set_bit(R5_LOCKED, &dev->flags);
@@ -2650,6 +2740,70 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2650 clear_bit(STRIPE_DEGRADED, &sh->state); 2740 clear_bit(STRIPE_DEGRADED, &sh->state);
2651 2741
2652 set_bit(STRIPE_INSYNC, &sh->state); 2742 set_bit(STRIPE_INSYNC, &sh->state);
2743 break;
2744 case check_state_run:
2745 case check_state_run_q:
2746 case check_state_run_pq:
2747 break; /* we will be called again upon completion */
2748 case check_state_check_result:
2749 sh->check_state = check_state_idle;
2750
2751 /* handle a successful check operation, if parity is correct
2752 * we are done. Otherwise update the mismatch count and repair
2753 * parity if !MD_RECOVERY_CHECK
2754 */
2755 if (sh->ops.zero_sum_result == 0) {
2756 /* both parities are correct */
2757 if (!s->failed)
2758 set_bit(STRIPE_INSYNC, &sh->state);
2759 else {
2760 /* in contrast to the raid5 case we can validate
2761 * parity, but still have a failure to write
2762 * back
2763 */
2764 sh->check_state = check_state_compute_result;
2765 /* Returning at this point means that we may go
2766 * off and bring p and/or q uptodate again so
2767 * we make sure to check zero_sum_result again
2768 * to verify if p or q need writeback
2769 */
2770 }
2771 } else {
2772 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2773 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2774 /* don't try to repair!! */
2775 set_bit(STRIPE_INSYNC, &sh->state);
2776 else {
2777 int *target = &sh->ops.target;
2778
2779 sh->ops.target = -1;
2780 sh->ops.target2 = -1;
2781 sh->check_state = check_state_compute_run;
2782 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2783 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2784 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2785 set_bit(R5_Wantcompute,
2786 &sh->dev[pd_idx].flags);
2787 *target = pd_idx;
2788 target = &sh->ops.target2;
2789 s->uptodate++;
2790 }
2791 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2792 set_bit(R5_Wantcompute,
2793 &sh->dev[qd_idx].flags);
2794 *target = qd_idx;
2795 s->uptodate++;
2796 }
2797 }
2798 }
2799 break;
2800 case check_state_compute_run:
2801 break;
2802 default:
2803 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2804 __func__, sh->check_state,
2805 (unsigned long long) sh->sector);
2806 BUG();
2653 } 2807 }
2654} 2808}
2655 2809
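
Editorial note: once the zero-sum operation completes, check_state_check_result interprets the P/Q mismatch bits. The user-space sketch below models that branch only; it ignores the additional failed-device writeback path (zero_sum_result == 0 with s->failed set goes through check_state_compute_result instead), the bit values are illustrative stand-ins for the async_tx definitions, and plan_repair() is a hypothetical helper, since the real code schedules the repair through R5_Wantcompute and STRIPE_OP_COMPUTE_BLK rather than returning a structure.

	#include <stdbool.h>
	#include <stdio.h>

	/* Illustrative bit values; the kernel defines these in the async_tx headers. */
	#define SUM_CHECK_P_RESULT (1 << 0)
	#define SUM_CHECK_Q_RESULT (1 << 1)

	struct repair_plan {
		bool in_sync;      /* stripe can be marked STRIPE_INSYNC as-is */
		bool recompute_p;  /* schedule a recompute and writeback of P  */
		bool recompute_q;  /* schedule a recompute and writeback of Q  */
	};

	/* Hypothetical model of the check_state_check_result branch:
	 * zero_sum_result - mismatch bits reported by the parity check
	 * check_only      - true when MD_RECOVERY_CHECK forbids repairs
	 */
	static struct repair_plan plan_repair(unsigned long zero_sum_result, bool check_only)
	{
		struct repair_plan p = { false, false, false };

		if (zero_sum_result == 0)
			p.in_sync = true;               /* both parities verified */
		else if (check_only)
			p.in_sync = true;               /* count the mismatch, do not repair */
		else {
			p.recompute_p = zero_sum_result & SUM_CHECK_P_RESULT;
			p.recompute_q = zero_sum_result & SUM_CHECK_Q_RESULT;
		}
		return p;
	}

	int main(void)
	{
		struct repair_plan p = plan_repair(SUM_CHECK_Q_RESULT, false);

		printf("in_sync=%d recompute_p=%d recompute_q=%d\n",
		       p.in_sync, p.recompute_p, p.recompute_q);
		return 0;
	}
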
@@ -2667,6 +2821,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2667 if (i != sh->pd_idx && i != sh->qd_idx) { 2821 if (i != sh->pd_idx && i != sh->qd_idx) {
2668 int dd_idx, j; 2822 int dd_idx, j;
2669 struct stripe_head *sh2; 2823 struct stripe_head *sh2;
2824 struct async_submit_ctl submit;
2670 2825
2671 sector_t bn = compute_blocknr(sh, i, 1); 2826 sector_t bn = compute_blocknr(sh, i, 1);
2672 sector_t s = raid5_compute_sector(conf, bn, 0, 2827 sector_t s = raid5_compute_sector(conf, bn, 0,
@@ -2686,9 +2841,10 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2686 } 2841 }
2687 2842
2688 /* place all the copies on one channel */ 2843 /* place all the copies on one channel */
2844 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
2689 tx = async_memcpy(sh2->dev[dd_idx].page, 2845 tx = async_memcpy(sh2->dev[dd_idx].page,
2690 sh->dev[i].page, 0, 0, STRIPE_SIZE, 2846 sh->dev[i].page, 0, 0, STRIPE_SIZE,
2691 ASYNC_TX_DEP_ACK, tx, NULL, NULL); 2847 &submit);
2692 2848
2693 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 2849 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2694 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2850 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
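
Editorial note: the conversion above is representative of the whole series: the dependency descriptor, flags and completion callback that used to be trailing arguments of every async_* call now travel in a struct async_submit_ctl. A fragment sketch of the two calling styles, assuming tx, dst_page and src_page already exist in the caller:

	/* old convention: flags, dependency and callbacks are trailing arguments */
	tx = async_memcpy(dst_page, src_page, 0, 0, STRIPE_SIZE,
			  ASYNC_TX_DEP_ACK, tx, NULL, NULL);

	/* new convention: the same information travels in a submit context */
	struct async_submit_ctl submit;

	init_async_submit(&submit,
			  0,		/* async_tx flags */
			  tx,		/* depend_tx: chain after the previous descriptor */
			  NULL, NULL,	/* completion callback and its argument */
			  NULL);	/* scribble space for address conversion */
	tx = async_memcpy(dst_page, src_page, 0, 0, STRIPE_SIZE, &submit);
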
@@ -2974,7 +3130,7 @@ static bool handle_stripe5(struct stripe_head *sh)
2974 /* Need to write out all blocks after computing parity */ 3130 /* Need to write out all blocks after computing parity */
2975 sh->disks = conf->raid_disks; 3131 sh->disks = conf->raid_disks;
2976 stripe_set_idx(sh->sector, conf, 0, sh); 3132 stripe_set_idx(sh->sector, conf, 0, sh);
2977 schedule_reconstruction5(sh, &s, 1, 1); 3133 schedule_reconstruction(sh, &s, 1, 1);
2978 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3134 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
2979 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3135 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2980 atomic_dec(&conf->reshape_stripes); 3136 atomic_dec(&conf->reshape_stripes);
@@ -2994,7 +3150,7 @@ static bool handle_stripe5(struct stripe_head *sh)
2994 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3150 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2995 3151
2996 if (s.ops_request) 3152 if (s.ops_request)
2997 raid5_run_ops(sh, s.ops_request); 3153 raid_run_ops(sh, s.ops_request);
2998 3154
2999 ops_run_io(sh, &s); 3155 ops_run_io(sh, &s);
3000 3156
@@ -3003,7 +3159,7 @@ static bool handle_stripe5(struct stripe_head *sh)
3003 return blocked_rdev == NULL; 3159 return blocked_rdev == NULL;
3004} 3160}
3005 3161
3006static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 3162static bool handle_stripe6(struct stripe_head *sh)
3007{ 3163{
3008 raid5_conf_t *conf = sh->raid_conf; 3164 raid5_conf_t *conf = sh->raid_conf;
3009 int disks = sh->disks; 3165 int disks = sh->disks;
@@ -3015,9 +3171,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3015 mdk_rdev_t *blocked_rdev = NULL; 3171 mdk_rdev_t *blocked_rdev = NULL;
3016 3172
3017 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3173 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3018 "pd_idx=%d, qd_idx=%d\n", 3174 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
3019 (unsigned long long)sh->sector, sh->state, 3175 (unsigned long long)sh->sector, sh->state,
3020 atomic_read(&sh->count), pd_idx, qd_idx); 3176 atomic_read(&sh->count), pd_idx, qd_idx,
3177 sh->check_state, sh->reconstruct_state);
3021 memset(&s, 0, sizeof(s)); 3178 memset(&s, 0, sizeof(s));
3022 3179
3023 spin_lock(&sh->lock); 3180 spin_lock(&sh->lock);
@@ -3037,35 +3194,24 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3037 3194
3038 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3195 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3039 i, dev->flags, dev->toread, dev->towrite, dev->written); 3196 i, dev->flags, dev->toread, dev->towrite, dev->written);
3040 /* maybe we can reply to a read */ 3197 /* maybe we can reply to a read
3041 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { 3198 *
3042 struct bio *rbi, *rbi2; 3199 * new wantfill requests are only permitted while
3043 pr_debug("Return read for disc %d\n", i); 3200 * ops_complete_biofill is guaranteed to be inactive
3044 spin_lock_irq(&conf->device_lock); 3201 */
3045 rbi = dev->toread; 3202 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3046 dev->toread = NULL; 3203 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3047 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 3204 set_bit(R5_Wantfill, &dev->flags);
3048 wake_up(&conf->wait_for_overlap);
3049 spin_unlock_irq(&conf->device_lock);
3050 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
3051 copy_data(0, rbi, dev->page, dev->sector);
3052 rbi2 = r5_next_bio(rbi, dev->sector);
3053 spin_lock_irq(&conf->device_lock);
3054 if (!raid5_dec_bi_phys_segments(rbi)) {
3055 rbi->bi_next = return_bi;
3056 return_bi = rbi;
3057 }
3058 spin_unlock_irq(&conf->device_lock);
3059 rbi = rbi2;
3060 }
3061 }
3062 3205
3063 /* now count some things */ 3206 /* now count some things */
3064 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3207 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
3065 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3208 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
3209 if (test_bit(R5_Wantcompute, &dev->flags))
3210 BUG_ON(++s.compute > 2);
3066 3211
3067 3212 if (test_bit(R5_Wantfill, &dev->flags)) {
3068 if (dev->toread) 3213 s.to_fill++;
3214 } else if (dev->toread)
3069 s.to_read++; 3215 s.to_read++;
3070 if (dev->towrite) { 3216 if (dev->towrite) {
3071 s.to_write++; 3217 s.to_write++;
@@ -3106,6 +3252,11 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3106 blocked_rdev = NULL; 3252 blocked_rdev = NULL;
3107 } 3253 }
3108 3254
3255 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3256 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3257 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3258 }
3259
3109 pr_debug("locked=%d uptodate=%d to_read=%d" 3260 pr_debug("locked=%d uptodate=%d to_read=%d"
3110 " to_write=%d failed=%d failed_num=%d,%d\n", 3261 " to_write=%d failed=%d failed_num=%d,%d\n",
3111 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3262 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
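
Editorial note: instead of copying bio data out of the stripe cache inline under device_lock, the handler now only tags each satisfiable read with R5_Wantfill and raises a single STRIPE_OP_BIOFILL request; the copies happen later in raid_run_ops, off the lock, and are completed by ops_complete_biofill. The minimal standalone model below illustrates that defer-and-batch pattern; the flag names mirror the ones above, everything else (toy_dev, toy_stripe, examine, run_ops) is hypothetical.

	#include <stdio.h>

	#define R5_Wantfill        (1 << 0)   /* per-device: this read can be served */
	#define STRIPE_BIOFILL_RUN (1 << 0)   /* per-stripe: a biofill op is in flight */
	#define STRIPE_OP_BIOFILL  (1 << 0)   /* request bit consumed by the op runner */

	struct toy_dev    { unsigned flags; int uptodate; int has_read; };
	struct toy_stripe { unsigned state; struct toy_dev dev[6]; };

	/* Phase 1 (under the lock): tag devices, request at most one batched op. */
	static unsigned examine(struct toy_stripe *sh)
	{
		unsigned ops_request = 0;
		int i, to_fill = 0;

		for (i = 0; i < 6; i++)
			if (sh->dev[i].uptodate && sh->dev[i].has_read &&
			    !(sh->state & STRIPE_BIOFILL_RUN)) {
				sh->dev[i].flags |= R5_Wantfill;
				to_fill++;
			}
		if (to_fill && !(sh->state & STRIPE_BIOFILL_RUN)) {
			ops_request |= STRIPE_OP_BIOFILL;
			sh->state |= STRIPE_BIOFILL_RUN;
		}
		return ops_request;
	}

	/* Phase 2 (outside the lock): one pass services every tagged device. */
	static void run_ops(struct toy_stripe *sh, unsigned ops_request)
	{
		int i;

		if (!(ops_request & STRIPE_OP_BIOFILL))
			return;
		for (i = 0; i < 6; i++)
			if (sh->dev[i].flags & R5_Wantfill)
				printf("copy cached data to the waiting bio for disc %d\n", i);
		/* the completion callback would clear STRIPE_BIOFILL_RUN here */
	}

	int main(void)
	{
		struct toy_stripe sh = { 0, { { 0, 1, 1 }, { 0, 1, 0 }, { 0, 0, 1 } } };

		run_ops(&sh, examine(&sh));
		return 0;
	}
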
@@ -3146,19 +3297,62 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3146 * or to load a block that is being partially written. 3297 * or to load a block that is being partially written.
3147 */ 3298 */
3148 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 3299 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3149 (s.syncing && (s.uptodate < disks)) || s.expanding) 3300 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3150 handle_stripe_fill6(sh, &s, &r6s, disks); 3301 handle_stripe_fill6(sh, &s, &r6s, disks);
3151 3302
3152 /* now to consider writing and what else, if anything should be read */ 3303 /* Now we check to see if any write operations have recently
3153 if (s.to_write) 3304 * completed
3305 */
3306 if (sh->reconstruct_state == reconstruct_state_drain_result) {
3307 int qd_idx = sh->qd_idx;
3308
3309 sh->reconstruct_state = reconstruct_state_idle;
3310 /* All the 'written' buffers and the parity blocks are ready to
3311 * be written back to disk
3312 */
3313 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3314 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags));
3315 for (i = disks; i--; ) {
3316 dev = &sh->dev[i];
3317 if (test_bit(R5_LOCKED, &dev->flags) &&
3318 (i == sh->pd_idx || i == qd_idx ||
3319 dev->written)) {
3320 pr_debug("Writing block %d\n", i);
3321 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
3322 set_bit(R5_Wantwrite, &dev->flags);
3323 if (!test_bit(R5_Insync, &dev->flags) ||
3324 ((i == sh->pd_idx || i == qd_idx) &&
3325 s.failed == 0))
3326 set_bit(STRIPE_INSYNC, &sh->state);
3327 }
3328 }
3329 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
3330 atomic_dec(&conf->preread_active_stripes);
3331 if (atomic_read(&conf->preread_active_stripes) <
3332 IO_THRESHOLD)
3333 md_wakeup_thread(conf->mddev->thread);
3334 }
3335 }
3336
3337 /* Now to consider new write requests and what else, if anything
3338 * should be read. We do not handle new writes when:
3339 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight.
3340 * 2/ A 'check' operation is in flight, as it may clobber the parity
3341 * block.
3342 */
3343 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3154 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); 3344 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
3155 3345
3156 /* maybe we need to check and possibly fix the parity for this stripe 3346 /* maybe we need to check and possibly fix the parity for this stripe
3157 * Any reads will already have been scheduled, so we just see if enough 3347 * Any reads will already have been scheduled, so we just see if enough
3158 * data is available 3348 * data is available. The parity check is held off while parity
3349 * dependent operations are in flight.
3159 */ 3350 */
3160 if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) 3351 if (sh->check_state ||
3161 handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); 3352 (s.syncing && s.locked == 0 &&
3353 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3354 !test_bit(STRIPE_INSYNC, &sh->state)))
3355 handle_parity_checks6(conf, sh, &s, &r6s, disks);
3162 3356
3163 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3357 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3164 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3358 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
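
Editorial note: two gates in this hunk keep the asynchronous state machines from stepping on each other: new writes are deferred while a reconstruct or a check is in flight, and a parity check is only started (or resumed) when no compute is running and nothing is locked. The kernel-side fragment below restates the two conditions; may_start_write() and may_check_parity() are hypothetical helper names, not functions in the driver.

	/* May handle_stripe_dirtying6() be called this pass? */
	static inline int may_start_write(struct stripe_head *sh,
					  struct stripe_head_state *s)
	{
		return s->to_write && !sh->reconstruct_state && !sh->check_state;
	}

	/* May handle_parity_checks6() be called this pass? */
	static inline int may_check_parity(struct stripe_head *sh,
					   struct stripe_head_state *s)
	{
		return sh->check_state ||	/* resume an in-progress check */
		       (s->syncing && s->locked == 0 &&
			!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
			!test_bit(STRIPE_INSYNC, &sh->state));
	}
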
@@ -3179,15 +3373,29 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3179 set_bit(R5_Wantwrite, &dev->flags); 3373 set_bit(R5_Wantwrite, &dev->flags);
3180 set_bit(R5_ReWrite, &dev->flags); 3374 set_bit(R5_ReWrite, &dev->flags);
3181 set_bit(R5_LOCKED, &dev->flags); 3375 set_bit(R5_LOCKED, &dev->flags);
3376 s.locked++;
3182 } else { 3377 } else {
3183 /* let's read it back */ 3378 /* let's read it back */
3184 set_bit(R5_Wantread, &dev->flags); 3379 set_bit(R5_Wantread, &dev->flags);
3185 set_bit(R5_LOCKED, &dev->flags); 3380 set_bit(R5_LOCKED, &dev->flags);
3381 s.locked++;
3186 } 3382 }
3187 } 3383 }
3188 } 3384 }
3189 3385
3190 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 3386 /* Finish reconstruct operations initiated by the expansion process */
3387 if (sh->reconstruct_state == reconstruct_state_result) {
3388 sh->reconstruct_state = reconstruct_state_idle;
3389 clear_bit(STRIPE_EXPANDING, &sh->state);
3390 for (i = conf->raid_disks; i--; ) {
3391 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3392 set_bit(R5_LOCKED, &sh->dev[i].flags);
3393 s.locked++;
3394 }
3395 }
3396
3397 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3398 !sh->reconstruct_state) {
3191 struct stripe_head *sh2 3399 struct stripe_head *sh2
3192 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3400 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3193 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { 3401 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
@@ -3208,14 +3416,8 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3208 /* Need to write out all blocks after computing P&Q */ 3416 /* Need to write out all blocks after computing P&Q */
3209 sh->disks = conf->raid_disks; 3417 sh->disks = conf->raid_disks;
3210 stripe_set_idx(sh->sector, conf, 0, sh); 3418 stripe_set_idx(sh->sector, conf, 0, sh);
3211 compute_parity6(sh, RECONSTRUCT_WRITE); 3419 schedule_reconstruction(sh, &s, 1, 1);
3212 for (i = conf->raid_disks ; i-- ; ) { 3420 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3213 set_bit(R5_LOCKED, &sh->dev[i].flags);
3214 s.locked++;
3215 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3216 }
3217 clear_bit(STRIPE_EXPANDING, &sh->state);
3218 } else if (s.expanded) {
3219 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3421 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3220 atomic_dec(&conf->reshape_stripes); 3422 atomic_dec(&conf->reshape_stripes);
3221 wake_up(&conf->wait_for_overlap); 3423 wake_up(&conf->wait_for_overlap);
@@ -3233,6 +3435,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3233 if (unlikely(blocked_rdev)) 3435 if (unlikely(blocked_rdev))
3234 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3436 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3235 3437
3438 if (s.ops_request)
3439 raid_run_ops(sh, s.ops_request);
3440
3236 ops_run_io(sh, &s); 3441 ops_run_io(sh, &s);
3237 3442
3238 return_io(return_bi); 3443 return_io(return_bi);
@@ -3241,16 +3446,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3241} 3446}
3242 3447
3243/* returns true if the stripe was handled */ 3448/* returns true if the stripe was handled */
3244static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) 3449static bool handle_stripe(struct stripe_head *sh)
3245{ 3450{
3246 if (sh->raid_conf->level == 6) 3451 if (sh->raid_conf->level == 6)
3247 return handle_stripe6(sh, tmp_page); 3452 return handle_stripe6(sh);
3248 else 3453 else
3249 return handle_stripe5(sh); 3454 return handle_stripe5(sh);
3250} 3455}
3251 3456
3252
3253
3254static void raid5_activate_delayed(raid5_conf_t *conf) 3457static void raid5_activate_delayed(raid5_conf_t *conf)
3255{ 3458{
3256 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3459 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
@@ -4046,7 +4249,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
4046 spin_unlock(&sh->lock); 4249 spin_unlock(&sh->lock);
4047 4250
4048 /* wait for any blocked device to be handled */ 4251 /* wait for any blocked device to be handled */
4049 while(unlikely(!handle_stripe(sh, NULL))) 4252 while (unlikely(!handle_stripe(sh)))
4050 ; 4253 ;
4051 release_stripe(sh); 4254 release_stripe(sh);
4052 4255
@@ -4103,7 +4306,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4103 return handled; 4306 return handled;
4104 } 4307 }
4105 4308
4106 handle_stripe(sh, NULL); 4309 handle_stripe(sh);
4107 release_stripe(sh); 4310 release_stripe(sh);
4108 handled++; 4311 handled++;
4109 } 4312 }
@@ -4117,6 +4320,36 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4117 return handled; 4320 return handled;
4118} 4321}
4119 4322
4323#ifdef CONFIG_MULTICORE_RAID456
4324static void __process_stripe(void *param, async_cookie_t cookie)
4325{
4326 struct stripe_head *sh = param;
4327
4328 handle_stripe(sh);
4329 release_stripe(sh);
4330}
4331
4332static void process_stripe(struct stripe_head *sh, struct list_head *domain)
4333{
4334 async_schedule_domain(__process_stripe, sh, domain);
4335}
4336
4337static void synchronize_stripe_processing(struct list_head *domain)
4338{
4339 async_synchronize_full_domain(domain);
4340}
4341#else
4342static void process_stripe(struct stripe_head *sh, struct list_head *domain)
4343{
4344 handle_stripe(sh);
4345 release_stripe(sh);
4346 cond_resched();
4347}
4348
4349static void synchronize_stripe_processing(struct list_head *domain)
4350{
4351}
4352#endif
4120 4353
4121 4354
4122/* 4355/*
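Editorial note: with CONFIG_MULTICORE_RAID456, process_stripe() hands each stripe to the generic async thread pool and tags it with a per-batch completion domain, so the daemon can wait for exactly the stripes it scheduled and nothing else. A minimal kernel-side sketch of that async-domain pattern in isolation; toy_work() and toy_batch() are hypothetical and stand in for handle_stripe()/raid5d:

	#include <linux/async.h>
	#include <linux/kernel.h>

	/* Hypothetical worker: each call may run on a pooled kernel thread. */
	static void toy_work(void *data, async_cookie_t cookie)
	{
		printk(KERN_INFO "processed item %ld (cookie %llu)\n",
		       (long)data, (unsigned long long)cookie);
	}

	static void toy_batch(void)
	{
		LIST_HEAD(domain);	/* groups these jobs for a targeted wait */
		long i;

		for (i = 0; i < 8; i++)
			async_schedule_domain(toy_work, (void *)i, &domain);

		/* waits only for the jobs scheduled on this domain,
		 * not for the global async queue */
		async_synchronize_full_domain(&domain);
	}
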
@@ -4131,6 +4364,7 @@ static void raid5d(mddev_t *mddev)
4131 struct stripe_head *sh; 4364 struct stripe_head *sh;
4132 raid5_conf_t *conf = mddev_to_conf(mddev); 4365 raid5_conf_t *conf = mddev_to_conf(mddev);
4133 int handled; 4366 int handled;
4367 LIST_HEAD(raid_domain);
4134 4368
4135 pr_debug("+++ raid5d active\n"); 4369 pr_debug("+++ raid5d active\n");
4136 4370
@@ -4167,8 +4401,7 @@ static void raid5d(mddev_t *mddev)
4167 spin_unlock_irq(&conf->device_lock); 4401 spin_unlock_irq(&conf->device_lock);
4168 4402
4169 handled++; 4403 handled++;
4170 handle_stripe(sh, conf->spare_page); 4404 process_stripe(sh, &raid_domain);
4171 release_stripe(sh);
4172 4405
4173 spin_lock_irq(&conf->device_lock); 4406 spin_lock_irq(&conf->device_lock);
4174 } 4407 }
@@ -4176,6 +4409,7 @@ static void raid5d(mddev_t *mddev)
4176 4409
4177 spin_unlock_irq(&conf->device_lock); 4410 spin_unlock_irq(&conf->device_lock);
4178 4411
4412 synchronize_stripe_processing(&raid_domain);
4179 async_tx_issue_pending_all(); 4413 async_tx_issue_pending_all();
4180 unplug_slaves(mddev); 4414 unplug_slaves(mddev);
4181 4415
@@ -4308,6 +4542,118 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4308 return sectors * (raid_disks - conf->max_degraded); 4542 return sectors * (raid_disks - conf->max_degraded);
4309} 4543}
4310 4544
4545static void raid5_free_percpu(raid5_conf_t *conf)
4546{
4547 struct raid5_percpu *percpu;
4548 unsigned long cpu;
4549
4550 if (!conf->percpu)
4551 return;
4552
4553 get_online_cpus();
4554 for_each_possible_cpu(cpu) {
4555 percpu = per_cpu_ptr(conf->percpu, cpu);
4556 safe_put_page(percpu->spare_page);
4557 kfree(percpu->scribble);
4558 }
4559#ifdef CONFIG_HOTPLUG_CPU
4560 unregister_cpu_notifier(&conf->cpu_notify);
4561#endif
4562 put_online_cpus();
4563
4564 free_percpu(conf->percpu);
4565}
4566
4567static void free_conf(raid5_conf_t *conf)
4568{
4569 shrink_stripes(conf);
4570 raid5_free_percpu(conf);
4571 kfree(conf->disks);
4572 kfree(conf->stripe_hashtbl);
4573 kfree(conf);
4574}
4575
4576#ifdef CONFIG_HOTPLUG_CPU
4577static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
4578 void *hcpu)
4579{
4580 raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
4581 long cpu = (long)hcpu;
4582 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
4583
4584 switch (action) {
4585 case CPU_UP_PREPARE:
4586 case CPU_UP_PREPARE_FROZEN:
4587 if (conf->level == 6 && !percpu->spare_page)
4588 percpu->spare_page = alloc_page(GFP_KERNEL);
4589 if (!percpu->scribble)
4590 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4591
4592 if (!percpu->scribble ||
4593 (conf->level == 6 && !percpu->spare_page)) {
4594 safe_put_page(percpu->spare_page);
4595 kfree(percpu->scribble);
4596 pr_err("%s: failed memory allocation for cpu%ld\n",
4597 __func__, cpu);
4598 return NOTIFY_BAD;
4599 }
4600 break;
4601 case CPU_DEAD:
4602 case CPU_DEAD_FROZEN:
4603 safe_put_page(percpu->spare_page);
4604 kfree(percpu->scribble);
4605 percpu->spare_page = NULL;
4606 percpu->scribble = NULL;
4607 break;
4608 default:
4609 break;
4610 }
4611 return NOTIFY_OK;
4612}
4613#endif
4614
4615static int raid5_alloc_percpu(raid5_conf_t *conf)
4616{
4617 unsigned long cpu;
4618 struct page *spare_page;
4619 struct raid5_percpu *allcpus;
4620 void *scribble;
4621 int err;
4622
4623 allcpus = alloc_percpu(struct raid5_percpu);
4624 if (!allcpus)
4625 return -ENOMEM;
4626 conf->percpu = allcpus;
4627
4628 get_online_cpus();
4629 err = 0;
4630 for_each_present_cpu(cpu) {
4631 if (conf->level == 6) {
4632 spare_page = alloc_page(GFP_KERNEL);
4633 if (!spare_page) {
4634 err = -ENOMEM;
4635 break;
4636 }
4637 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
4638 }
4639 scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL);
4640 if (!scribble) {
4641 err = -ENOMEM;
4642 break;
4643 }
4644 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
4645 }
4646#ifdef CONFIG_HOTPLUG_CPU
4647 conf->cpu_notify.notifier_call = raid456_cpu_notify;
4648 conf->cpu_notify.priority = 0;
4649 if (err == 0)
4650 err = register_cpu_notifier(&conf->cpu_notify);
4651#endif
4652 put_online_cpus();
4653
4654 return err;
4655}
4656
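Editorial note: the per-CPU spare_page and scribble buffers replace the single conf->spare_page, giving every CPU that processes stripes its own scratch space. The allocation and hotplug-notifier side is shown above; the consumer side is not part of this hunk, but the usual per-CPU access pattern looks roughly like the sketch below, where do_parity_work() is a hypothetical stand-in for the actual async operations.

	static void do_stripe_ops(raid5_conf_t *conf, struct stripe_head *sh)
	{
		struct raid5_percpu *percpu;
		int cpu;

		cpu = get_cpu();		/* pin to this CPU, disable preemption */
		percpu = per_cpu_ptr(conf->percpu, cpu);

		/* percpu->scribble: per-CPU address/coefficient scratch for async_tx;
		 * percpu->spare_page: per-CPU page for RAID-6 parity verification
		 */
		do_parity_work(sh, percpu->scribble, percpu->spare_page); /* hypothetical */

		put_cpu();
	}
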
4311static raid5_conf_t *setup_conf(mddev_t *mddev) 4657static raid5_conf_t *setup_conf(mddev_t *mddev)
4312{ 4658{
4313 raid5_conf_t *conf; 4659 raid5_conf_t *conf;
@@ -4347,6 +4693,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4347 goto abort; 4693 goto abort;
4348 4694
4349 conf->raid_disks = mddev->raid_disks; 4695 conf->raid_disks = mddev->raid_disks;
4696 conf->scribble_len = scribble_len(conf->raid_disks);
4350 if (mddev->reshape_position == MaxSector) 4697 if (mddev->reshape_position == MaxSector)
4351 conf->previous_raid_disks = mddev->raid_disks; 4698 conf->previous_raid_disks = mddev->raid_disks;
4352 else 4699 else
@@ -4362,11 +4709,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4362 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4709 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4363 goto abort; 4710 goto abort;
4364 4711
4365 if (mddev->new_level == 6) { 4712 conf->level = mddev->new_level;
4366 conf->spare_page = alloc_page(GFP_KERNEL); 4713 if (raid5_alloc_percpu(conf) != 0)
4367 if (!conf->spare_page) 4714 goto abort;
4368 goto abort; 4715
4369 }
4370 spin_lock_init(&conf->device_lock); 4716 spin_lock_init(&conf->device_lock);
4371 init_waitqueue_head(&conf->wait_for_stripe); 4717 init_waitqueue_head(&conf->wait_for_stripe);
4372 init_waitqueue_head(&conf->wait_for_overlap); 4718 init_waitqueue_head(&conf->wait_for_overlap);
@@ -4402,7 +4748,6 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4402 } 4748 }
4403 4749
4404 conf->chunk_size = mddev->new_chunk; 4750 conf->chunk_size = mddev->new_chunk;
4405 conf->level = mddev->new_level;
4406 if (conf->level == 6) 4751 if (conf->level == 6)
4407 conf->max_degraded = 2; 4752 conf->max_degraded = 2;
4408 else 4753 else
@@ -4437,11 +4782,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4437 4782
4438 abort: 4783 abort:
4439 if (conf) { 4784 if (conf) {
4440 shrink_stripes(conf); 4785 free_conf(conf);
4441 safe_put_page(conf->spare_page);
4442 kfree(conf->disks);
4443 kfree(conf->stripe_hashtbl);
4444 kfree(conf);
4445 return ERR_PTR(-EIO); 4786 return ERR_PTR(-EIO);
4446 } else 4787 } else
4447 return ERR_PTR(-ENOMEM); 4788 return ERR_PTR(-ENOMEM);
@@ -4607,12 +4948,8 @@ abort:
4607 md_unregister_thread(mddev->thread); 4948 md_unregister_thread(mddev->thread);
4608 mddev->thread = NULL; 4949 mddev->thread = NULL;
4609 if (conf) { 4950 if (conf) {
4610 shrink_stripes(conf);
4611 print_raid5_conf(conf); 4951 print_raid5_conf(conf);
4612 safe_put_page(conf->spare_page); 4952 free_conf(conf);
4613 kfree(conf->disks);
4614 kfree(conf->stripe_hashtbl);
4615 kfree(conf);
4616 } 4953 }
4617 mddev->private = NULL; 4954 mddev->private = NULL;
4618 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); 4955 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
@@ -4627,13 +4964,10 @@ static int stop(mddev_t *mddev)
4627 4964
4628 md_unregister_thread(mddev->thread); 4965 md_unregister_thread(mddev->thread);
4629 mddev->thread = NULL; 4966 mddev->thread = NULL;
4630 shrink_stripes(conf);
4631 kfree(conf->stripe_hashtbl);
4632 mddev->queue->backing_dev_info.congested_fn = NULL; 4967 mddev->queue->backing_dev_info.congested_fn = NULL;
4633 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 4968 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
4634 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); 4969 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
4635 kfree(conf->disks); 4970 free_conf(conf);
4636 kfree(conf);
4637 mddev->private = NULL; 4971 mddev->private = NULL;
4638 return 0; 4972 return 0;
4639} 4973}