about summary refs log tree commit diff stats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/Kconfig 26
-rw-r--r-- drivers/md/raid5.c 1494
-rw-r--r-- drivers/md/raid5.h 28
3 files changed, 967 insertions, 581 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 020f9573fd82..2158377a1359 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -124,6 +124,8 @@ config MD_RAID456
124 select MD_RAID6_PQ 124 select MD_RAID6_PQ
125 select ASYNC_MEMCPY 125 select ASYNC_MEMCPY
126 select ASYNC_XOR 126 select ASYNC_XOR
127 select ASYNC_PQ
128 select ASYNC_RAID6_RECOV
127 ---help--- 129 ---help---
128 A RAID-5 set of N drives with a capacity of C MB per drive provides 130 A RAID-5 set of N drives with a capacity of C MB per drive provides
129 the capacity of C * (N - 1) MB, and protects against a failure 131 the capacity of C * (N - 1) MB, and protects against a failure
@@ -152,9 +154,33 @@ config MD_RAID456
152 154
153 If unsure, say Y. 155 If unsure, say Y.
154 156
157config MULTICORE_RAID456
158 bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
159 depends on MD_RAID456
160 depends on SMP
161 depends on EXPERIMENTAL
162 ---help---
163 Enable the raid456 module to dispatch per-stripe raid operations to a
164 thread pool.
165
166 If unsure, say N.
167
155config MD_RAID6_PQ 168config MD_RAID6_PQ
156 tristate 169 tristate
157 170
171config ASYNC_RAID6_TEST
172 tristate "Self test for hardware accelerated raid6 recovery"
173 depends on MD_RAID6_PQ
174 select ASYNC_RAID6_RECOV
175 ---help---
176 This is a one-shot self test that permutes through the
177 recovery of all the possible two disk failure scenarios for an
178 N-disk array. Recovery is performed with the asynchronous
179 raid6 recovery routines, and will optionally use an offload
180 engine if one is available.
181
182 If unsure, say N.
183
158config MD_MULTIPATH 184config MD_MULTIPATH
159 tristate "Multipath I/O support" 185 tristate "Multipath I/O support"
160 depends on BLK_DEV_MD 186 depends on BLK_DEV_MD
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f9f991e6e138..cac6f4d3a143 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -47,7 +47,9 @@
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48#include <linux/raid/pq.h> 48#include <linux/raid/pq.h>
49#include <linux/async_tx.h> 49#include <linux/async_tx.h>
50#include <linux/async.h>
50#include <linux/seq_file.h> 51#include <linux/seq_file.h>
52#include <linux/cpu.h>
51#include "md.h" 53#include "md.h"
52#include "raid5.h" 54#include "raid5.h"
53#include "bitmap.h" 55#include "bitmap.h"
@@ -499,11 +501,18 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
499 struct page *bio_page; 501 struct page *bio_page;
500 int i; 502 int i;
501 int page_offset; 503 int page_offset;
504 struct async_submit_ctl submit;
505 enum async_tx_flags flags = 0;
502 506
503 if (bio->bi_sector >= sector) 507 if (bio->bi_sector >= sector)
504 page_offset = (signed)(bio->bi_sector - sector) * 512; 508 page_offset = (signed)(bio->bi_sector - sector) * 512;
505 else 509 else
506 page_offset = (signed)(sector - bio->bi_sector) * -512; 510 page_offset = (signed)(sector - bio->bi_sector) * -512;
511
512 if (frombio)
513 flags |= ASYNC_TX_FENCE;
514 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
515
507 bio_for_each_segment(bvl, bio, i) { 516 bio_for_each_segment(bvl, bio, i) {
508 int len = bio_iovec_idx(bio, i)->bv_len; 517 int len = bio_iovec_idx(bio, i)->bv_len;
509 int clen; 518 int clen;
@@ -525,15 +534,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
525 bio_page = bio_iovec_idx(bio, i)->bv_page; 534 bio_page = bio_iovec_idx(bio, i)->bv_page;
526 if (frombio) 535 if (frombio)
527 tx = async_memcpy(page, bio_page, page_offset, 536 tx = async_memcpy(page, bio_page, page_offset,
528 b_offset, clen, 537 b_offset, clen, &submit);
529 ASYNC_TX_DEP_ACK,
530 tx, NULL, NULL);
531 else 538 else
532 tx = async_memcpy(bio_page, page, b_offset, 539 tx = async_memcpy(bio_page, page, b_offset,
533 page_offset, clen, 540 page_offset, clen, &submit);
534 ASYNC_TX_DEP_ACK,
535 tx, NULL, NULL);
536 } 541 }
542 /* chain the operations */
543 submit.depend_tx = tx;
544
537 if (clen < len) /* hit end of page */ 545 if (clen < len) /* hit end of page */
538 break; 546 break;
539 page_offset += len; 547 page_offset += len;
@@ -592,6 +600,7 @@ static void ops_run_biofill(struct stripe_head *sh)
592{ 600{
593 struct dma_async_tx_descriptor *tx = NULL; 601 struct dma_async_tx_descriptor *tx = NULL;
594 raid5_conf_t *conf = sh->raid_conf; 602 raid5_conf_t *conf = sh->raid_conf;
603 struct async_submit_ctl submit;
595 int i; 604 int i;
596 605
597 pr_debug("%s: stripe %llu\n", __func__, 606 pr_debug("%s: stripe %llu\n", __func__,
@@ -615,22 +624,34 @@ static void ops_run_biofill(struct stripe_head *sh)
615 } 624 }
616 625
617 atomic_inc(&sh->count); 626 atomic_inc(&sh->count);
618 async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 627 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
619 ops_complete_biofill, sh); 628 async_trigger_callback(&submit);
620} 629}
621 630
622static void ops_complete_compute5(void *stripe_head_ref) 631static void mark_target_uptodate(struct stripe_head *sh, int target)
623{ 632{
624 struct stripe_head *sh = stripe_head_ref; 633 struct r5dev *tgt;
625 int target = sh->ops.target;
626 struct r5dev *tgt = &sh->dev[target];
627 634
628 pr_debug("%s: stripe %llu\n", __func__, 635 if (target < 0)
629 (unsigned long long)sh->sector); 636 return;
630 637
638 tgt = &sh->dev[target];
631 set_bit(R5_UPTODATE, &tgt->flags); 639 set_bit(R5_UPTODATE, &tgt->flags);
632 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 640 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
633 clear_bit(R5_Wantcompute, &tgt->flags); 641 clear_bit(R5_Wantcompute, &tgt->flags);
642}
643
644static void ops_complete_compute(void *stripe_head_ref)
645{
646 struct stripe_head *sh = stripe_head_ref;
647
648 pr_debug("%s: stripe %llu\n", __func__,
649 (unsigned long long)sh->sector);
650
651 /* mark the computed target(s) as uptodate */
652 mark_target_uptodate(sh, sh->ops.target);
653 mark_target_uptodate(sh, sh->ops.target2);
654
634 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 655 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
635 if (sh->check_state == check_state_compute_run) 656 if (sh->check_state == check_state_compute_run)
636 sh->check_state = check_state_compute_result; 657 sh->check_state = check_state_compute_result;
@@ -638,16 +659,24 @@ static void ops_complete_compute5(void *stripe_head_ref)
638 release_stripe(sh); 659 release_stripe(sh);
639} 660}
640 661
641static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) 662/* return a pointer to the address conversion region of the scribble buffer */
663static addr_conv_t *to_addr_conv(struct stripe_head *sh,
664 struct raid5_percpu *percpu)
665{
666 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
667}
668
669static struct dma_async_tx_descriptor *
670ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
642{ 671{
643 /* kernel stack size limits the total number of disks */
644 int disks = sh->disks; 672 int disks = sh->disks;
645 struct page *xor_srcs[disks]; 673 struct page **xor_srcs = percpu->scribble;
646 int target = sh->ops.target; 674 int target = sh->ops.target;
647 struct r5dev *tgt = &sh->dev[target]; 675 struct r5dev *tgt = &sh->dev[target];
648 struct page *xor_dest = tgt->page; 676 struct page *xor_dest = tgt->page;
649 int count = 0; 677 int count = 0;
650 struct dma_async_tx_descriptor *tx; 678 struct dma_async_tx_descriptor *tx;
679 struct async_submit_ctl submit;
651 int i; 680 int i;
652 681
653 pr_debug("%s: stripe %llu block: %d\n", 682 pr_debug("%s: stripe %llu block: %d\n",
@@ -660,17 +689,212 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
660 689
661 atomic_inc(&sh->count); 690 atomic_inc(&sh->count);
662 691
692 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
693 ops_complete_compute, sh, to_addr_conv(sh, percpu));
663 if (unlikely(count == 1)) 694 if (unlikely(count == 1))
664 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 695 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
665 0, NULL, ops_complete_compute5, sh);
666 else 696 else
667 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 697 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
668 ASYNC_TX_XOR_ZERO_DST, NULL,
669 ops_complete_compute5, sh);
670 698
671 return tx; 699 return tx;
672} 700}
673 701
702/* set_syndrome_sources - populate source buffers for gen_syndrome
703 * @srcs - (struct page *) array of size sh->disks
704 * @sh - stripe_head to parse
705 *
706 * Populates srcs in proper layout order for the stripe and returns the
707 * 'count' of sources to be used in a call to async_gen_syndrome. The P
708 * destination buffer is recorded in srcs[count] and the Q destination
709 * is recorded in srcs[count+1].
710 */
711static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
712{
713 int disks = sh->disks;
714 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
715 int d0_idx = raid6_d0(sh);
716 int count;
717 int i;
718
719 for (i = 0; i < disks; i++)
720 srcs[i] = (void *)raid6_empty_zero_page;
721
722 count = 0;
723 i = d0_idx;
724 do {
725 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
726
727 srcs[slot] = sh->dev[i].page;
728 i = raid6_next_disk(i, disks);
729 } while (i != d0_idx);
730 BUG_ON(count != syndrome_disks);
731
732 return count;
733}
734
735static struct dma_async_tx_descriptor *
736ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
737{
738 int disks = sh->disks;
739 struct page **blocks = percpu->scribble;
740 int target;
741 int qd_idx = sh->qd_idx;
742 struct dma_async_tx_descriptor *tx;
743 struct async_submit_ctl submit;
744 struct r5dev *tgt;
745 struct page *dest;
746 int i;
747 int count;
748
749 if (sh->ops.target < 0)
750 target = sh->ops.target2;
751 else if (sh->ops.target2 < 0)
752 target = sh->ops.target;
753 else
754 /* we should only have one valid target */
755 BUG();
756 BUG_ON(target < 0);
757 pr_debug("%s: stripe %llu block: %d\n",
758 __func__, (unsigned long long)sh->sector, target);
759
760 tgt = &sh->dev[target];
761 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
762 dest = tgt->page;
763
764 atomic_inc(&sh->count);
765
766 if (target == qd_idx) {
767 count = set_syndrome_sources(blocks, sh);
768 blocks[count] = NULL; /* regenerating p is not necessary */
769 BUG_ON(blocks[count+1] != dest); /* q should already be set */
770 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
771 ops_complete_compute, sh,
772 to_addr_conv(sh, percpu));
773 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
774 } else {
775 /* Compute any data- or p-drive using XOR */
776 count = 0;
777 for (i = disks; i-- ; ) {
778 if (i == target || i == qd_idx)
779 continue;
780 blocks[count++] = sh->dev[i].page;
781 }
782
783 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
784 NULL, ops_complete_compute, sh,
785 to_addr_conv(sh, percpu));
786 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
787 }
788
789 return tx;
790}
791
792static struct dma_async_tx_descriptor *
793ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
794{
795 int i, count, disks = sh->disks;
796 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
797 int d0_idx = raid6_d0(sh);
798 int faila = -1, failb = -1;
799 int target = sh->ops.target;
800 int target2 = sh->ops.target2;
801 struct r5dev *tgt = &sh->dev[target];
802 struct r5dev *tgt2 = &sh->dev[target2];
803 struct dma_async_tx_descriptor *tx;
804 struct page **blocks = percpu->scribble;
805 struct async_submit_ctl submit;
806
807 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
808 __func__, (unsigned long long)sh->sector, target, target2);
809 BUG_ON(target < 0 || target2 < 0);
810 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
811 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
812
813 /* we need to open-code set_syndrome_sources to handle the
814 * slot number conversion for 'faila' and 'failb'
815 */
816 for (i = 0; i < disks ; i++)
817 blocks[i] = (void *)raid6_empty_zero_page;
818 count = 0;
819 i = d0_idx;
820 do {
821 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
822
823 blocks[slot] = sh->dev[i].page;
824
825 if (i == target)
826 faila = slot;
827 if (i == target2)
828 failb = slot;
829 i = raid6_next_disk(i, disks);
830 } while (i != d0_idx);
831 BUG_ON(count != syndrome_disks);
832
833 BUG_ON(faila == failb);
834 if (failb < faila)
835 swap(faila, failb);
836 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
837 __func__, (unsigned long long)sh->sector, faila, failb);
838
839 atomic_inc(&sh->count);
840
841 if (failb == syndrome_disks+1) {
842 /* Q disk is one of the missing disks */
843 if (faila == syndrome_disks) {
844 /* Missing P+Q, just recompute */
845 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
846 ops_complete_compute, sh,
847 to_addr_conv(sh, percpu));
848 return async_gen_syndrome(blocks, 0, count+2,
849 STRIPE_SIZE, &submit);
850 } else {
851 struct page *dest;
852 int data_target;
853 int qd_idx = sh->qd_idx;
854
855 /* Missing D+Q: recompute D from P, then recompute Q */
856 if (target == qd_idx)
857 data_target = target2;
858 else
859 data_target = target;
860
861 count = 0;
862 for (i = disks; i-- ; ) {
863 if (i == data_target || i == qd_idx)
864 continue;
865 blocks[count++] = sh->dev[i].page;
866 }
867 dest = sh->dev[data_target].page;
868 init_async_submit(&submit,
869 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
870 NULL, NULL, NULL,
871 to_addr_conv(sh, percpu));
872 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
873 &submit);
874
875 count = set_syndrome_sources(blocks, sh);
876 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
877 ops_complete_compute, sh,
878 to_addr_conv(sh, percpu));
879 return async_gen_syndrome(blocks, 0, count+2,
880 STRIPE_SIZE, &submit);
881 }
882 }
883
884 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute,
885 sh, to_addr_conv(sh, percpu));
886 if (failb == syndrome_disks) {
887 /* We're missing D+P. */
888 return async_raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE,
889 faila, blocks, &submit);
890 } else {
891 /* We're missing D+D. */
892 return async_raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE,
893 faila, failb, blocks, &submit);
894 }
895}
896
897
674static void ops_complete_prexor(void *stripe_head_ref) 898static void ops_complete_prexor(void *stripe_head_ref)
675{ 899{
676 struct stripe_head *sh = stripe_head_ref; 900 struct stripe_head *sh = stripe_head_ref;
@@ -680,12 +904,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
680} 904}
681 905
682static struct dma_async_tx_descriptor * 906static struct dma_async_tx_descriptor *
683ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 907ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
908 struct dma_async_tx_descriptor *tx)
684{ 909{
685 /* kernel stack size limits the total number of disks */
686 int disks = sh->disks; 910 int disks = sh->disks;
687 struct page *xor_srcs[disks]; 911 struct page **xor_srcs = percpu->scribble;
688 int count = 0, pd_idx = sh->pd_idx, i; 912 int count = 0, pd_idx = sh->pd_idx, i;
913 struct async_submit_ctl submit;
689 914
690 /* existing parity data subtracted */ 915 /* existing parity data subtracted */
691 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 916 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
@@ -700,9 +925,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
700 xor_srcs[count++] = dev->page; 925 xor_srcs[count++] = dev->page;
701 } 926 }
702 927
703 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 928 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
704 ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, 929 ops_complete_prexor, sh, to_addr_conv(sh, percpu));
705 ops_complete_prexor, sh); 930 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
706 931
707 return tx; 932 return tx;
708} 933}
@@ -742,17 +967,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
742 return tx; 967 return tx;
743} 968}
744 969
745static void ops_complete_postxor(void *stripe_head_ref) 970static void ops_complete_reconstruct(void *stripe_head_ref)
746{ 971{
747 struct stripe_head *sh = stripe_head_ref; 972 struct stripe_head *sh = stripe_head_ref;
748 int disks = sh->disks, i, pd_idx = sh->pd_idx; 973 int disks = sh->disks;
974 int pd_idx = sh->pd_idx;
975 int qd_idx = sh->qd_idx;
976 int i;
749 977
750 pr_debug("%s: stripe %llu\n", __func__, 978 pr_debug("%s: stripe %llu\n", __func__,
751 (unsigned long long)sh->sector); 979 (unsigned long long)sh->sector);
752 980
753 for (i = disks; i--; ) { 981 for (i = disks; i--; ) {
754 struct r5dev *dev = &sh->dev[i]; 982 struct r5dev *dev = &sh->dev[i];
755 if (dev->written || i == pd_idx) 983
984 if (dev->written || i == pd_idx || i == qd_idx)
756 set_bit(R5_UPTODATE, &dev->flags); 985 set_bit(R5_UPTODATE, &dev->flags);
757 } 986 }
758 987
@@ -770,12 +999,12 @@ static void ops_complete_postxor(void *stripe_head_ref)
770} 999}
771 1000
772static void 1001static void
773ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1002ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1003 struct dma_async_tx_descriptor *tx)
774{ 1004{
775 /* kernel stack size limits the total number of disks */
776 int disks = sh->disks; 1005 int disks = sh->disks;
777 struct page *xor_srcs[disks]; 1006 struct page **xor_srcs = percpu->scribble;
778 1007 struct async_submit_ctl submit;
779 int count = 0, pd_idx = sh->pd_idx, i; 1008 int count = 0, pd_idx = sh->pd_idx, i;
780 struct page *xor_dest; 1009 struct page *xor_dest;
781 int prexor = 0; 1010 int prexor = 0;
@@ -809,18 +1038,36 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
809 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1038 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
810 * for the synchronous xor case 1039 * for the synchronous xor case
811 */ 1040 */
812 flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | 1041 flags = ASYNC_TX_ACK |
813 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1042 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
814 1043
815 atomic_inc(&sh->count); 1044 atomic_inc(&sh->count);
816 1045
817 if (unlikely(count == 1)) { 1046 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
818 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); 1047 to_addr_conv(sh, percpu));
819 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 1048 if (unlikely(count == 1))
820 flags, tx, ops_complete_postxor, sh); 1049 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
821 } else 1050 else
822 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1051 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
823 flags, tx, ops_complete_postxor, sh); 1052}
1053
1054static void
1055ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1056 struct dma_async_tx_descriptor *tx)
1057{
1058 struct async_submit_ctl submit;
1059 struct page **blocks = percpu->scribble;
1060 int count;
1061
1062 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1063
1064 count = set_syndrome_sources(blocks, sh);
1065
1066 atomic_inc(&sh->count);
1067
1068 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1069 sh, to_addr_conv(sh, percpu));
1070 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
824} 1071}
825 1072
826static void ops_complete_check(void *stripe_head_ref) 1073static void ops_complete_check(void *stripe_head_ref)
@@ -835,63 +1082,115 @@ static void ops_complete_check(void *stripe_head_ref)
835 release_stripe(sh); 1082 release_stripe(sh);
836} 1083}
837 1084
838static void ops_run_check(struct stripe_head *sh) 1085static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
839{ 1086{
840 /* kernel stack size limits the total number of disks */
841 int disks = sh->disks; 1087 int disks = sh->disks;
842 struct page *xor_srcs[disks]; 1088 int pd_idx = sh->pd_idx;
1089 int qd_idx = sh->qd_idx;
1090 struct page *xor_dest;
1091 struct page **xor_srcs = percpu->scribble;
843 struct dma_async_tx_descriptor *tx; 1092 struct dma_async_tx_descriptor *tx;
844 1093 struct async_submit_ctl submit;
845 int count = 0, pd_idx = sh->pd_idx, i; 1094 int count;
846 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1095 int i;
847 1096
848 pr_debug("%s: stripe %llu\n", __func__, 1097 pr_debug("%s: stripe %llu\n", __func__,
849 (unsigned long long)sh->sector); 1098 (unsigned long long)sh->sector);
850 1099
1100 count = 0;
1101 xor_dest = sh->dev[pd_idx].page;
1102 xor_srcs[count++] = xor_dest;
851 for (i = disks; i--; ) { 1103 for (i = disks; i--; ) {
852 struct r5dev *dev = &sh->dev[i]; 1104 if (i == pd_idx || i == qd_idx)
853 if (i != pd_idx) 1105 continue;
854 xor_srcs[count++] = dev->page; 1106 xor_srcs[count++] = sh->dev[i].page;
855 } 1107 }
856 1108
857 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1109 init_async_submit(&submit, 0, NULL, NULL, NULL,
858 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); 1110 to_addr_conv(sh, percpu));
1111 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1112 &sh->ops.zero_sum_result, &submit);
1113
1114 atomic_inc(&sh->count);
1115 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1116 tx = async_trigger_callback(&submit);
1117}
1118
1119static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1120{
1121 struct page **srcs = percpu->scribble;
1122 struct async_submit_ctl submit;
1123 int count;
1124
1125 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1126 (unsigned long long)sh->sector, checkp);
1127
1128 count = set_syndrome_sources(srcs, sh);
1129 if (!checkp)
1130 srcs[count] = NULL;
859 1131
860 atomic_inc(&sh->count); 1132 atomic_inc(&sh->count);
861 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 1133 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
862 ops_complete_check, sh); 1134 sh, to_addr_conv(sh, percpu));
1135 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1136 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
863} 1137}
864 1138
865static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) 1139static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
866{ 1140{
867 int overlap_clear = 0, i, disks = sh->disks; 1141 int overlap_clear = 0, i, disks = sh->disks;
868 struct dma_async_tx_descriptor *tx = NULL; 1142 struct dma_async_tx_descriptor *tx = NULL;
1143 raid5_conf_t *conf = sh->raid_conf;
1144 int level = conf->level;
1145 struct raid5_percpu *percpu;
1146 unsigned long cpu;
869 1147
1148 cpu = get_cpu();
1149 percpu = per_cpu_ptr(conf->percpu, cpu);
870 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1150 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
871 ops_run_biofill(sh); 1151 ops_run_biofill(sh);
872 overlap_clear++; 1152 overlap_clear++;
873 } 1153 }
874 1154
875 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1155 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
876 tx = ops_run_compute5(sh); 1156 if (level < 6)
877 /* terminate the chain if postxor is not set to be run */ 1157 tx = ops_run_compute5(sh, percpu);
878 if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) 1158 else {
1159 if (sh->ops.target2 < 0 || sh->ops.target < 0)
1160 tx = ops_run_compute6_1(sh, percpu);
1161 else
1162 tx = ops_run_compute6_2(sh, percpu);
1163 }
1164 /* terminate the chain if reconstruct is not set to be run */
1165 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
879 async_tx_ack(tx); 1166 async_tx_ack(tx);
880 } 1167 }
881 1168
882 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1169 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
883 tx = ops_run_prexor(sh, tx); 1170 tx = ops_run_prexor(sh, percpu, tx);
884 1171
885 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1172 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
886 tx = ops_run_biodrain(sh, tx); 1173 tx = ops_run_biodrain(sh, tx);
887 overlap_clear++; 1174 overlap_clear++;
888 } 1175 }
889 1176
890 if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) 1177 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
891 ops_run_postxor(sh, tx); 1178 if (level < 6)
1179 ops_run_reconstruct5(sh, percpu, tx);
1180 else
1181 ops_run_reconstruct6(sh, percpu, tx);
1182 }
892 1183
893 if (test_bit(STRIPE_OP_CHECK, &ops_request)) 1184 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
894 ops_run_check(sh); 1185 if (sh->check_state == check_state_run)
1186 ops_run_check_p(sh, percpu);
1187 else if (sh->check_state == check_state_run_q)
1188 ops_run_check_pq(sh, percpu, 0);
1189 else if (sh->check_state == check_state_run_pq)
1190 ops_run_check_pq(sh, percpu, 1);
1191 else
1192 BUG();
1193 }
895 1194
896 if (overlap_clear) 1195 if (overlap_clear)
897 for (i = disks; i--; ) { 1196 for (i = disks; i--; ) {
@@ -899,6 +1198,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
899 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1198 if (test_and_clear_bit(R5_Overlap, &dev->flags))
900 wake_up(&sh->raid_conf->wait_for_overlap); 1199 wake_up(&sh->raid_conf->wait_for_overlap);
901 } 1200 }
1201 put_cpu();
902} 1202}
903 1203
904static int grow_one_stripe(raid5_conf_t *conf) 1204static int grow_one_stripe(raid5_conf_t *conf)
@@ -948,6 +1248,28 @@ static int grow_stripes(raid5_conf_t *conf, int num)
948 return 0; 1248 return 0;
949} 1249}
950 1250
1251/**
1252 * scribble_len - return the required size of the scribble region
1253 * @num - total number of disks in the array
1254 *
1255 * The size must be enough to contain:
1256 * 1/ a struct page pointer for each device in the array +2
1257 * 2/ room to convert each entry in (1) to its corresponding dma
1258 * (dma_map_page()) or page (page_address()) address.
1259 *
1260 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1261 * calculate over all devices (not just the data blocks), using zeros in place
1262 * of the P and Q blocks.
1263 */
1264static size_t scribble_len(int num)
1265{
1266 size_t len;
1267
1268 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1269
1270 return len;
1271}
1272
951static int resize_stripes(raid5_conf_t *conf, int newsize) 1273static int resize_stripes(raid5_conf_t *conf, int newsize)
952{ 1274{
953 /* Make all the stripes able to hold 'newsize' devices. 1275 /* Make all the stripes able to hold 'newsize' devices.
@@ -976,6 +1298,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
976 struct stripe_head *osh, *nsh; 1298 struct stripe_head *osh, *nsh;
977 LIST_HEAD(newstripes); 1299 LIST_HEAD(newstripes);
978 struct disk_info *ndisks; 1300 struct disk_info *ndisks;
1301 unsigned long cpu;
979 int err; 1302 int err;
980 struct kmem_cache *sc; 1303 struct kmem_cache *sc;
981 int i; 1304 int i;
@@ -1041,7 +1364,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1041 /* Step 3. 1364 /* Step 3.
1042 * At this point, we are holding all the stripes so the array 1365 * At this point, we are holding all the stripes so the array
1043 * is completely stalled, so now is a good time to resize 1366 * is completely stalled, so now is a good time to resize
1044 * conf->disks. 1367 * conf->disks and the scribble region
1045 */ 1368 */
1046 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1369 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1047 if (ndisks) { 1370 if (ndisks) {
@@ -1052,10 +1375,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1052 } else 1375 } else
1053 err = -ENOMEM; 1376 err = -ENOMEM;
1054 1377
1378 get_online_cpus();
1379 conf->scribble_len = scribble_len(newsize);
1380 for_each_present_cpu(cpu) {
1381 struct raid5_percpu *percpu;
1382 void *scribble;
1383
1384 percpu = per_cpu_ptr(conf->percpu, cpu);
1385 scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1386
1387 if (scribble) {
1388 kfree(percpu->scribble);
1389 percpu->scribble = scribble;
1390 } else {
1391 err = -ENOMEM;
1392 break;
1393 }
1394 }
1395 put_online_cpus();
1396
1055 /* Step 4, return new stripes to service */ 1397 /* Step 4, return new stripes to service */
1056 while(!list_empty(&newstripes)) { 1398 while(!list_empty(&newstripes)) {
1057 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1399 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1058 list_del_init(&nsh->lru); 1400 list_del_init(&nsh->lru);
1401
1059 for (i=conf->raid_disks; i < newsize; i++) 1402 for (i=conf->raid_disks; i < newsize; i++)
1060 if (nsh->dev[i].page == NULL) { 1403 if (nsh->dev[i].page == NULL) {
1061 struct page *p = alloc_page(GFP_NOIO); 1404 struct page *p = alloc_page(GFP_NOIO);
@@ -1594,258 +1937,13 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1594} 1937}
1595 1938
1596 1939
1597
1598/*
1599 * Copy data between a page in the stripe cache, and one or more bion
1600 * The page could align with the middle of the bio, or there could be
1601 * several bion, each with several bio_vecs, which cover part of the page
1602 * Multiple bion are linked together on bi_next. There may be extras
1603 * at the end of this list. We ignore them.
1604 */
1605static void copy_data(int frombio, struct bio *bio,
1606 struct page *page,
1607 sector_t sector)
1608{
1609 char *pa = page_address(page);
1610 struct bio_vec *bvl;
1611 int i;
1612 int page_offset;
1613
1614 if (bio->bi_sector >= sector)
1615 page_offset = (signed)(bio->bi_sector - sector) * 512;
1616 else
1617 page_offset = (signed)(sector - bio->bi_sector) * -512;
1618 bio_for_each_segment(bvl, bio, i) {
1619 int len = bio_iovec_idx(bio,i)->bv_len;
1620 int clen;
1621 int b_offset = 0;
1622
1623 if (page_offset < 0) {
1624 b_offset = -page_offset;
1625 page_offset += b_offset;
1626 len -= b_offset;
1627 }
1628
1629 if (len > 0 && page_offset + len > STRIPE_SIZE)
1630 clen = STRIPE_SIZE - page_offset;
1631 else clen = len;
1632
1633 if (clen > 0) {
1634 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
1635 if (frombio)
1636 memcpy(pa+page_offset, ba+b_offset, clen);
1637 else
1638 memcpy(ba+b_offset, pa+page_offset, clen);
1639 __bio_kunmap_atomic(ba, KM_USER0);
1640 }
1641 if (clen < len) /* hit end of page */
1642 break;
1643 page_offset += len;
1644 }
1645}
1646
1647#define check_xor() do { \
1648 if (count == MAX_XOR_BLOCKS) { \
1649 xor_blocks(count, STRIPE_SIZE, dest, ptr);\
1650 count = 0; \
1651 } \
1652 } while(0)
1653
1654static void compute_parity6(struct stripe_head *sh, int method)
1655{
1656 raid5_conf_t *conf = sh->raid_conf;
1657 int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
1658 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1659 struct bio *chosen;
1660 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1661 void *ptrs[syndrome_disks+2];
1662
1663 pd_idx = sh->pd_idx;
1664 qd_idx = sh->qd_idx;
1665 d0_idx = raid6_d0(sh);
1666
1667 pr_debug("compute_parity, stripe %llu, method %d\n",
1668 (unsigned long long)sh->sector, method);
1669
1670 switch(method) {
1671 case READ_MODIFY_WRITE:
1672 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
1673 case RECONSTRUCT_WRITE:
1674 for (i= disks; i-- ;)
1675 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
1676 chosen = sh->dev[i].towrite;
1677 sh->dev[i].towrite = NULL;
1678
1679 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1680 wake_up(&conf->wait_for_overlap);
1681
1682 BUG_ON(sh->dev[i].written);
1683 sh->dev[i].written = chosen;
1684 }
1685 break;
1686 case CHECK_PARITY:
1687 BUG(); /* Not implemented yet */
1688 }
1689
1690 for (i = disks; i--;)
1691 if (sh->dev[i].written) {
1692 sector_t sector = sh->dev[i].sector;
1693 struct bio *wbi = sh->dev[i].written;
1694 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1695 copy_data(1, wbi, sh->dev[i].page, sector);
1696 wbi = r5_next_bio(wbi, sector);
1697 }
1698
1699 set_bit(R5_LOCKED, &sh->dev[i].flags);
1700 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1701 }
1702
1703 /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
1704
1705 for (i = 0; i < disks; i++)
1706 ptrs[i] = (void *)raid6_empty_zero_page;
1707
1708 count = 0;
1709 i = d0_idx;
1710 do {
1711 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1712
1713 ptrs[slot] = page_address(sh->dev[i].page);
1714 if (slot < syndrome_disks &&
1715 !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
1716 printk(KERN_ERR "block %d/%d not uptodate "
1717 "on parity calc\n", i, count);
1718 BUG();
1719 }
1720
1721 i = raid6_next_disk(i, disks);
1722 } while (i != d0_idx);
1723 BUG_ON(count != syndrome_disks);
1724
1725 raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
1726
1727 switch(method) {
1728 case RECONSTRUCT_WRITE:
1729 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1730 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1731 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1732 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
1733 break;
1734 case UPDATE_PARITY:
1735 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1736 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1737 break;
1738 }
1739}
1740
1741
1742/* Compute one missing block */
1743static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1744{
1745 int i, count, disks = sh->disks;
1746 void *ptr[MAX_XOR_BLOCKS], *dest, *p;
1747 int qd_idx = sh->qd_idx;
1748
1749 pr_debug("compute_block_1, stripe %llu, idx %d\n",
1750 (unsigned long long)sh->sector, dd_idx);
1751
1752 if ( dd_idx == qd_idx ) {
1753 /* We're actually computing the Q drive */
1754 compute_parity6(sh, UPDATE_PARITY);
1755 } else {
1756 dest = page_address(sh->dev[dd_idx].page);
1757 if (!nozero) memset(dest, 0, STRIPE_SIZE);
1758 count = 0;
1759 for (i = disks ; i--; ) {
1760 if (i == dd_idx || i == qd_idx)
1761 continue;
1762 p = page_address(sh->dev[i].page);
1763 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
1764 ptr[count++] = p;
1765 else
1766 printk("compute_block() %d, stripe %llu, %d"
1767 " not present\n", dd_idx,
1768 (unsigned long long)sh->sector, i);
1769
1770 check_xor();
1771 }
1772 if (count)
1773 xor_blocks(count, STRIPE_SIZE, dest, ptr);
1774 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1775 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1776 }
1777}
1778
1779/* Compute two missing blocks */
1780static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1781{
1782 int i, count, disks = sh->disks;
1783 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1784 int d0_idx = raid6_d0(sh);
1785 int faila = -1, failb = -1;
1786 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1787 void *ptrs[syndrome_disks+2];
1788
1789 for (i = 0; i < disks ; i++)
1790 ptrs[i] = (void *)raid6_empty_zero_page;
1791 count = 0;
1792 i = d0_idx;
1793 do {
1794 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1795
1796 ptrs[slot] = page_address(sh->dev[i].page);
1797
1798 if (i == dd_idx1)
1799 faila = slot;
1800 if (i == dd_idx2)
1801 failb = slot;
1802 i = raid6_next_disk(i, disks);
1803 } while (i != d0_idx);
1804 BUG_ON(count != syndrome_disks);
1805
1806 BUG_ON(faila == failb);
1807 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1808
1809 pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1810 (unsigned long long)sh->sector, dd_idx1, dd_idx2,
1811 faila, failb);
1812
1813 if (failb == syndrome_disks+1) {
1814 /* Q disk is one of the missing disks */
1815 if (faila == syndrome_disks) {
1816 /* Missing P+Q, just recompute */
1817 compute_parity6(sh, UPDATE_PARITY);
1818 return;
1819 } else {
1820 /* We're missing D+Q; recompute D from P */
1821 compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
1822 dd_idx2 : dd_idx1),
1823 0);
1824 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1825 return;
1826 }
1827 }
1828
1829 /* We're missing D+P or D+D; */
1830 if (failb == syndrome_disks) {
1831 /* We're missing D+P. */
1832 raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
1833 } else {
1834 /* We're missing D+D. */
1835 raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
1836 ptrs);
1837 }
1838
1839 /* Both the above update both missing blocks */
1840 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1841 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1842}
1843
1844static void 1940static void
1845schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, 1941schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
1846 int rcw, int expand) 1942 int rcw, int expand)
1847{ 1943{
1848 int i, pd_idx = sh->pd_idx, disks = sh->disks; 1944 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1945 raid5_conf_t *conf = sh->raid_conf;
1946 int level = conf->level;
1849 1947
1850 if (rcw) { 1948 if (rcw) {
1851 /* if we are not expanding this is a proper write request, and 1949 /* if we are not expanding this is a proper write request, and
@@ -1858,7 +1956,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1858 } else 1956 } else
1859 sh->reconstruct_state = reconstruct_state_run; 1957 sh->reconstruct_state = reconstruct_state_run;
1860 1958
1861 set_bit(STRIPE_OP_POSTXOR, &s->ops_request); 1959 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1862 1960
1863 for (i = disks; i--; ) { 1961 for (i = disks; i--; ) {
1864 struct r5dev *dev = &sh->dev[i]; 1962 struct r5dev *dev = &sh->dev[i];
@@ -1871,17 +1969,18 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1871 s->locked++; 1969 s->locked++;
1872 } 1970 }
1873 } 1971 }
1874 if (s->locked + 1 == disks) 1972 if (s->locked + conf->max_degraded == disks)
1875 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 1973 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1876 atomic_inc(&sh->raid_conf->pending_full_writes); 1974 atomic_inc(&conf->pending_full_writes);
1877 } else { 1975 } else {
1976 BUG_ON(level == 6);
1878 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 1977 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1879 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 1978 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1880 1979
1881 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 1980 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
1882 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 1981 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
1883 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 1982 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1884 set_bit(STRIPE_OP_POSTXOR, &s->ops_request); 1983 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1885 1984
1886 for (i = disks; i--; ) { 1985 for (i = disks; i--; ) {
1887 struct r5dev *dev = &sh->dev[i]; 1986 struct r5dev *dev = &sh->dev[i];
@@ -1899,13 +1998,22 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1899 } 1998 }
1900 } 1999 }
1901 2000
1902 /* keep the parity disk locked while asynchronous operations 2001 /* keep the parity disk(s) locked while asynchronous operations
1903 * are in flight 2002 * are in flight
1904 */ 2003 */
1905 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2004 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1906 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2005 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1907 s->locked++; 2006 s->locked++;
1908 2007
2008 if (level == 6) {
2009 int qd_idx = sh->qd_idx;
2010 struct r5dev *dev = &sh->dev[qd_idx];
2011
2012 set_bit(R5_LOCKED, &dev->flags);
2013 clear_bit(R5_UPTODATE, &dev->flags);
2014 s->locked++;
2015 }
2016
1909 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2017 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
1910 __func__, (unsigned long long)sh->sector, 2018 __func__, (unsigned long long)sh->sector,
1911 s->locked, s->ops_request); 2019 s->locked, s->ops_request);
@@ -1986,13 +2094,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1986 2094
1987static void end_reshape(raid5_conf_t *conf); 2095static void end_reshape(raid5_conf_t *conf);
1988 2096
1989static int page_is_zero(struct page *p)
1990{
1991 char *a = page_address(p);
1992 return ((*(u32*)a) == 0 &&
1993 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1994}
1995
1996static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, 2097static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
1997 struct stripe_head *sh) 2098 struct stripe_head *sh)
1998{ 2099{
@@ -2132,9 +2233,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
2132 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2233 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2133 set_bit(R5_Wantcompute, &dev->flags); 2234 set_bit(R5_Wantcompute, &dev->flags);
2134 sh->ops.target = disk_idx; 2235 sh->ops.target = disk_idx;
2236 sh->ops.target2 = -1;
2135 s->req_compute = 1; 2237 s->req_compute = 1;
2136 /* Careful: from this point on 'uptodate' is in the eye 2238 /* Careful: from this point on 'uptodate' is in the eye
2137 * of raid5_run_ops which services 'compute' operations 2239 * of raid_run_ops which services 'compute' operations
2138 * before writes. R5_Wantcompute flags a block that will 2240 * before writes. R5_Wantcompute flags a block that will
2139 * be R5_UPTODATE by the time it is needed for a 2241 * be R5_UPTODATE by the time it is needed for a
2140 * subsequent operation. 2242 * subsequent operation.
@@ -2173,61 +2275,104 @@ static void handle_stripe_fill5(struct stripe_head *sh,
2173 set_bit(STRIPE_HANDLE, &sh->state); 2275 set_bit(STRIPE_HANDLE, &sh->state);
2174} 2276}
2175 2277
2176static void handle_stripe_fill6(struct stripe_head *sh, 2278/* fetch_block6 - checks the given member device to see if its data needs
2177 struct stripe_head_state *s, struct r6_state *r6s, 2279 * to be read or computed to satisfy a request.
2178 int disks) 2280 *
2281 * Returns 1 when no more member devices need to be checked, otherwise returns
2282 * 0 to tell the loop in handle_stripe_fill6 to continue
2283 */
2284static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2285 struct r6_state *r6s, int disk_idx, int disks)
2179{ 2286{
2180 int i; 2287 struct r5dev *dev = &sh->dev[disk_idx];
2181 for (i = disks; i--; ) { 2288 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
2182 struct r5dev *dev = &sh->dev[i]; 2289 &sh->dev[r6s->failed_num[1]] };
2183 if (!test_bit(R5_LOCKED, &dev->flags) && 2290
2184 !test_bit(R5_UPTODATE, &dev->flags) && 2291 if (!test_bit(R5_LOCKED, &dev->flags) &&
2185 (dev->toread || (dev->towrite && 2292 !test_bit(R5_UPTODATE, &dev->flags) &&
2186 !test_bit(R5_OVERWRITE, &dev->flags)) || 2293 (dev->toread ||
2187 s->syncing || s->expanding || 2294 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2188 (s->failed >= 1 && 2295 s->syncing || s->expanding ||
2189 (sh->dev[r6s->failed_num[0]].toread || 2296 (s->failed >= 1 &&
2190 s->to_write)) || 2297 (fdev[0]->toread || s->to_write)) ||
2191 (s->failed >= 2 && 2298 (s->failed >= 2 &&
2192 (sh->dev[r6s->failed_num[1]].toread || 2299 (fdev[1]->toread || s->to_write)))) {
2193 s->to_write)))) { 2300 /* we would like to get this block, possibly by computing it,
2194 /* we would like to get this block, possibly 2301 * otherwise read it if the backing disk is insync
2195 * by computing it, but we might not be able to 2302 */
2303 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2304 BUG_ON(test_bit(R5_Wantread, &dev->flags));
2305 if ((s->uptodate == disks - 1) &&
2306 (s->failed && (disk_idx == r6s->failed_num[0] ||
2307 disk_idx == r6s->failed_num[1]))) {
2308 /* have disk failed, and we're requested to fetch it;
2309 * do compute it
2196 */ 2310 */
2197 if ((s->uptodate == disks - 1) && 2311 pr_debug("Computing stripe %llu block %d\n",
2198 (s->failed && (i == r6s->failed_num[0] || 2312 (unsigned long long)sh->sector, disk_idx);
2199 i == r6s->failed_num[1]))) { 2313 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2200 pr_debug("Computing stripe %llu block %d\n", 2314 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2201 (unsigned long long)sh->sector, i); 2315 set_bit(R5_Wantcompute, &dev->flags);
2202 compute_block_1(sh, i, 0); 2316 sh->ops.target = disk_idx;
2203 s->uptodate++; 2317 sh->ops.target2 = -1; /* no 2nd target */
2204 } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { 2318 s->req_compute = 1;
2205 /* Computing 2-failure is *very* expensive; only 2319 s->uptodate++;
2206 * do it if failed >= 2 2320 return 1;
2207 */ 2321 } else if (s->uptodate == disks-2 && s->failed >= 2) {
2208 int other; 2322 /* Computing 2-failure is *very* expensive; only
2209 for (other = disks; other--; ) { 2323 * do it if failed >= 2
2210 if (other == i) 2324 */
2211 continue; 2325 int other;
2212 if (!test_bit(R5_UPTODATE, 2326 for (other = disks; other--; ) {
2213 &sh->dev[other].flags)) 2327 if (other == disk_idx)
2214 break; 2328 continue;
2215 } 2329 if (!test_bit(R5_UPTODATE,
2216 BUG_ON(other < 0); 2330 &sh->dev[other].flags))
2217 pr_debug("Computing stripe %llu blocks %d,%d\n", 2331 break;
2218 (unsigned long long)sh->sector,
2219 i, other);
2220 compute_block_2(sh, i, other);
2221 s->uptodate += 2;
2222 } else if (test_bit(R5_Insync, &dev->flags)) {
2223 set_bit(R5_LOCKED, &dev->flags);
2224 set_bit(R5_Wantread, &dev->flags);
2225 s->locked++;
2226 pr_debug("Reading block %d (sync=%d)\n",
2227 i, s->syncing);
2228 } 2332 }
2333 BUG_ON(other < 0);
2334 pr_debug("Computing stripe %llu blocks %d,%d\n",
2335 (unsigned long long)sh->sector,
2336 disk_idx, other);
2337 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2338 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2339 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
2340 set_bit(R5_Wantcompute, &sh->dev[other].flags);
2341 sh->ops.target = disk_idx;
2342 sh->ops.target2 = other;
2343 s->uptodate += 2;
2344 s->req_compute = 1;
2345 return 1;
2346 } else if (test_bit(R5_Insync, &dev->flags)) {
2347 set_bit(R5_LOCKED, &dev->flags);
2348 set_bit(R5_Wantread, &dev->flags);
2349 s->locked++;
2350 pr_debug("Reading block %d (sync=%d)\n",
2351 disk_idx, s->syncing);
2229 } 2352 }
2230 } 2353 }
2354
2355 return 0;
2356}
2357
2358/**
2359 * handle_stripe_fill6 - read or compute data to satisfy pending requests.
2360 */
2361static void handle_stripe_fill6(struct stripe_head *sh,
2362 struct stripe_head_state *s, struct r6_state *r6s,
2363 int disks)
2364{
2365 int i;
2366
2367 /* look for blocks to read/compute, skip this if a compute
2368 * is already in flight, or if the stripe contents are in the
2369 * midst of changing due to a write
2370 */
2371 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2372 !sh->reconstruct_state)
2373 for (i = disks; i--; )
2374 if (fetch_block6(sh, s, r6s, i, disks))
2375 break;
2231 set_bit(STRIPE_HANDLE, &sh->state); 2376 set_bit(STRIPE_HANDLE, &sh->state);
2232} 2377}
2233 2378
@@ -2361,114 +2506,61 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
2361 */ 2506 */
2362 /* since handle_stripe can be called at any time we need to handle the 2507 /* since handle_stripe can be called at any time we need to handle the
2363 * case where a compute block operation has been submitted and then a 2508 * case where a compute block operation has been submitted and then a
2364 * subsequent call wants to start a write request. raid5_run_ops only 2509 * subsequent call wants to start a write request. raid_run_ops only
2365 * handles the case where compute block and postxor are requested 2510 * handles the case where compute block and reconstruct are requested
2366 * simultaneously. If this is not the case then new writes need to be 2511 * simultaneously. If this is not the case then new writes need to be
2367 * held off until the compute completes. 2512 * held off until the compute completes.
2368 */ 2513 */
2369 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2514 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2370 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2515 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2371 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2516 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2372 schedule_reconstruction5(sh, s, rcw == 0, 0); 2517 schedule_reconstruction(sh, s, rcw == 0, 0);
2373} 2518}
2374 2519
2375static void handle_stripe_dirtying6(raid5_conf_t *conf, 2520static void handle_stripe_dirtying6(raid5_conf_t *conf,
2376 struct stripe_head *sh, struct stripe_head_state *s, 2521 struct stripe_head *sh, struct stripe_head_state *s,
2377 struct r6_state *r6s, int disks) 2522 struct r6_state *r6s, int disks)
2378{ 2523{
2379 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; 2524 int rcw = 0, pd_idx = sh->pd_idx, i;
2380 int qd_idx = sh->qd_idx; 2525 int qd_idx = sh->qd_idx;
2526
2527 set_bit(STRIPE_HANDLE, &sh->state);
2381 for (i = disks; i--; ) { 2528 for (i = disks; i--; ) {
2382 struct r5dev *dev = &sh->dev[i]; 2529 struct r5dev *dev = &sh->dev[i];
2383 /* Would I have to read this buffer for reconstruct_write */ 2530 /* check if we haven't enough data */
2384 if (!test_bit(R5_OVERWRITE, &dev->flags) 2531 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2385 && i != pd_idx && i != qd_idx 2532 i != pd_idx && i != qd_idx &&
2386 && (!test_bit(R5_LOCKED, &dev->flags) 2533 !test_bit(R5_LOCKED, &dev->flags) &&
2387 ) && 2534 !(test_bit(R5_UPTODATE, &dev->flags) ||
2388 !test_bit(R5_UPTODATE, &dev->flags)) { 2535 test_bit(R5_Wantcompute, &dev->flags))) {
2389 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2536 rcw++;
2390 else { 2537 if (!test_bit(R5_Insync, &dev->flags))
2391 pr_debug("raid6: must_compute: " 2538 continue; /* it's a failed drive */
2392 "disk %d flags=%#lx\n", i, dev->flags); 2539
2393 must_compute++; 2540 if (
2541 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2542 pr_debug("Read_old stripe %llu "
2543 "block %d for Reconstruct\n",
2544 (unsigned long long)sh->sector, i);
2545 set_bit(R5_LOCKED, &dev->flags);
2546 set_bit(R5_Wantread, &dev->flags);
2547 s->locked++;
2548 } else {
2549 pr_debug("Request delayed stripe %llu "
2550 "block %d for Reconstruct\n",
2551 (unsigned long long)sh->sector, i);
2552 set_bit(STRIPE_DELAYED, &sh->state);
2553 set_bit(STRIPE_HANDLE, &sh->state);
2394 } 2554 }
2395 } 2555 }
2396 } 2556 }
2397 pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
2398 (unsigned long long)sh->sector, rcw, must_compute);
2399 set_bit(STRIPE_HANDLE, &sh->state);
2400
2401 if (rcw > 0)
2402 /* want reconstruct write, but need to get some data */
2403 for (i = disks; i--; ) {
2404 struct r5dev *dev = &sh->dev[i];
2405 if (!test_bit(R5_OVERWRITE, &dev->flags)
2406 && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
2407 && !test_bit(R5_LOCKED, &dev->flags) &&
2408 !test_bit(R5_UPTODATE, &dev->flags) &&
2409 test_bit(R5_Insync, &dev->flags)) {
2410 if (
2411 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2412 pr_debug("Read_old stripe %llu "
2413 "block %d for Reconstruct\n",
2414 (unsigned long long)sh->sector, i);
2415 set_bit(R5_LOCKED, &dev->flags);
2416 set_bit(R5_Wantread, &dev->flags);
2417 s->locked++;
2418 } else {
2419 pr_debug("Request delayed stripe %llu "
2420 "block %d for Reconstruct\n",
2421 (unsigned long long)sh->sector, i);
2422 set_bit(STRIPE_DELAYED, &sh->state);
2423 set_bit(STRIPE_HANDLE, &sh->state);
2424 }
2425 }
2426 }
2427 /* now if nothing is locked, and if we have enough data, we can start a 2557 /* now if nothing is locked, and if we have enough data, we can start a
2428 * write request 2558 * write request
2429 */ 2559 */
2430 if (s->locked == 0 && rcw == 0 && 2560 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2561 s->locked == 0 && rcw == 0 &&
2431 !test_bit(STRIPE_BIT_DELAY, &sh->state)) { 2562 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2432 if (must_compute > 0) { 2563 schedule_reconstruction(sh, s, 1, 0);
2433 /* We have failed blocks and need to compute them */
2434 switch (s->failed) {
2435 case 0:
2436 BUG();
2437 case 1:
2438 compute_block_1(sh, r6s->failed_num[0], 0);
2439 break;
2440 case 2:
2441 compute_block_2(sh, r6s->failed_num[0],
2442 r6s->failed_num[1]);
2443 break;
2444 default: /* This request should have been failed? */
2445 BUG();
2446 }
2447 }
2448
2449 pr_debug("Computing parity for stripe %llu\n",
2450 (unsigned long long)sh->sector);
2451 compute_parity6(sh, RECONSTRUCT_WRITE);
2452 /* now every locked buffer is ready to be written */
2453 for (i = disks; i--; )
2454 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2455 pr_debug("Writing stripe %llu block %d\n",
2456 (unsigned long long)sh->sector, i);
2457 s->locked++;
2458 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2459 }
2460 if (s->locked == disks)
2461 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2462 atomic_inc(&conf->pending_full_writes);
2463 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2464 set_bit(STRIPE_INSYNC, &sh->state);
2465
2466 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2467 atomic_dec(&conf->preread_active_stripes);
2468 if (atomic_read(&conf->preread_active_stripes) <
2469 IO_THRESHOLD)
2470 md_wakeup_thread(conf->mddev->thread);
2471 }
2472 } 2564 }
2473} 2565}
2474 2566
@@ -2527,7 +2619,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2527 * we are done. Otherwise update the mismatch count and repair 2619 * we are done. Otherwise update the mismatch count and repair
2528 * parity if !MD_RECOVERY_CHECK 2620 * parity if !MD_RECOVERY_CHECK
2529 */ 2621 */
2530 if (sh->ops.zero_sum_result == 0) 2622 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
2531 /* parity is correct (on disc, 2623 /* parity is correct (on disc,
2532 * not in buffer any more) 2624 * not in buffer any more)
2533 */ 2625 */
@@ -2544,6 +2636,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2544 set_bit(R5_Wantcompute, 2636 set_bit(R5_Wantcompute,
2545 &sh->dev[sh->pd_idx].flags); 2637 &sh->dev[sh->pd_idx].flags);
2546 sh->ops.target = sh->pd_idx; 2638 sh->ops.target = sh->pd_idx;
2639 sh->ops.target2 = -1;
2547 s->uptodate++; 2640 s->uptodate++;
2548 } 2641 }
2549 } 2642 }
@@ -2560,67 +2653,74 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2560 2653
2561 2654
2562static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, 2655static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2563 struct stripe_head_state *s, 2656 struct stripe_head_state *s,
2564 struct r6_state *r6s, struct page *tmp_page, 2657 struct r6_state *r6s, int disks)
2565 int disks)
2566{ 2658{
2567 int update_p = 0, update_q = 0;
2568 struct r5dev *dev;
2569 int pd_idx = sh->pd_idx; 2659 int pd_idx = sh->pd_idx;
2570 int qd_idx = sh->qd_idx; 2660 int qd_idx = sh->qd_idx;
2661 struct r5dev *dev;
2571 2662
2572 set_bit(STRIPE_HANDLE, &sh->state); 2663 set_bit(STRIPE_HANDLE, &sh->state);
2573 2664
2574 BUG_ON(s->failed > 2); 2665 BUG_ON(s->failed > 2);
2575 BUG_ON(s->uptodate < disks); 2666
2576 /* Want to check and possibly repair P and Q. 2667 /* Want to check and possibly repair P and Q.
2577 * However there could be one 'failed' device, in which 2668 * However there could be one 'failed' device, in which
2578 * case we can only check one of them, possibly using the 2669 * case we can only check one of them, possibly using the
2579 * other to generate missing data 2670 * other to generate missing data
2580 */ 2671 */
2581 2672
2582 /* If !tmp_page, we cannot do the calculations, 2673 switch (sh->check_state) {
2583 * but as we have set STRIPE_HANDLE, we will soon be called 2674 case check_state_idle:
2584 * by stripe_handle with a tmp_page - just wait until then. 2675 /* start a new check operation if there are < 2 failures */
2585 */
2586 if (tmp_page) {
2587 if (s->failed == r6s->q_failed) { 2676 if (s->failed == r6s->q_failed) {
2588 /* The only possible failed device holds 'Q', so it 2677 /* The only possible failed device holds Q, so it
2589 * makes sense to check P (If anything else were failed, 2678 * makes sense to check P (If anything else were failed,
2590 * we would have used P to recreate it). 2679 * we would have used P to recreate it).
2591 */ 2680 */
2592 compute_block_1(sh, pd_idx, 1); 2681 sh->check_state = check_state_run;
2593 if (!page_is_zero(sh->dev[pd_idx].page)) {
2594 compute_block_1(sh, pd_idx, 0);
2595 update_p = 1;
2596 }
2597 } 2682 }
2598 if (!r6s->q_failed && s->failed < 2) { 2683 if (!r6s->q_failed && s->failed < 2) {
2599 /* q is not failed, and we didn't use it to generate 2684 /* Q is not failed, and we didn't use it to generate
2600 * anything, so it makes sense to check it 2685 * anything, so it makes sense to check it
2601 */ 2686 */
2602 memcpy(page_address(tmp_page), 2687 if (sh->check_state == check_state_run)
2603 page_address(sh->dev[qd_idx].page), 2688 sh->check_state = check_state_run_pq;
2604 STRIPE_SIZE); 2689 else
2605 compute_parity6(sh, UPDATE_PARITY); 2690 sh->check_state = check_state_run_q;
2606 if (memcmp(page_address(tmp_page),
2607 page_address(sh->dev[qd_idx].page),
2608 STRIPE_SIZE) != 0) {
2609 clear_bit(STRIPE_INSYNC, &sh->state);
2610 update_q = 1;
2611 }
2612 } 2691 }
2613 if (update_p || update_q) { 2692
2614 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2693 /* discard potentially stale zero_sum_result */
2615 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2694 sh->ops.zero_sum_result = 0;
2616 /* don't try to repair!! */ 2695
2617 update_p = update_q = 0; 2696 if (sh->check_state == check_state_run) {
2697 /* async_xor_zero_sum destroys the contents of P */
2698 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2699 s->uptodate--;
2700 }
2701 if (sh->check_state >= check_state_run &&
2702 sh->check_state <= check_state_run_pq) {
2703 /* async_syndrome_zero_sum preserves P and Q, so
2704 * no need to mark them !uptodate here
2705 */
2706 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2707 break;
2618 } 2708 }
2619 2709
2710 /* we have 2-disk failure */
2711 BUG_ON(s->failed != 2);
2712 /* fall through */
2713 case check_state_compute_result:
2714 sh->check_state = check_state_idle;
2715
2716 /* check that a write has not made the stripe insync */
2717 if (test_bit(STRIPE_INSYNC, &sh->state))
2718 break;
2719
2620 /* now write out any block on a failed drive, 2720 /* now write out any block on a failed drive,
2621 * or P or Q if they need it 2721 * or P or Q if they were recomputed
2622 */ 2722 */
2623 2723 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2624 if (s->failed == 2) { 2724 if (s->failed == 2) {
2625 dev = &sh->dev[r6s->failed_num[1]]; 2725 dev = &sh->dev[r6s->failed_num[1]];
2626 s->locked++; 2726 s->locked++;
@@ -2633,14 +2733,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2633 set_bit(R5_LOCKED, &dev->flags); 2733 set_bit(R5_LOCKED, &dev->flags);
2634 set_bit(R5_Wantwrite, &dev->flags); 2734 set_bit(R5_Wantwrite, &dev->flags);
2635 } 2735 }
2636 2736 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2637 if (update_p) {
2638 dev = &sh->dev[pd_idx]; 2737 dev = &sh->dev[pd_idx];
2639 s->locked++; 2738 s->locked++;
2640 set_bit(R5_LOCKED, &dev->flags); 2739 set_bit(R5_LOCKED, &dev->flags);
2641 set_bit(R5_Wantwrite, &dev->flags); 2740 set_bit(R5_Wantwrite, &dev->flags);
2642 } 2741 }
2643 if (update_q) { 2742 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2644 dev = &sh->dev[qd_idx]; 2743 dev = &sh->dev[qd_idx];
2645 s->locked++; 2744 s->locked++;
2646 set_bit(R5_LOCKED, &dev->flags); 2745 set_bit(R5_LOCKED, &dev->flags);
@@ -2649,6 +2748,70 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2649 clear_bit(STRIPE_DEGRADED, &sh->state); 2748 clear_bit(STRIPE_DEGRADED, &sh->state);
2650 2749
2651 set_bit(STRIPE_INSYNC, &sh->state); 2750 set_bit(STRIPE_INSYNC, &sh->state);
2751 break;
2752 case check_state_run:
2753 case check_state_run_q:
2754 case check_state_run_pq:
2755 break; /* we will be called again upon completion */
2756 case check_state_check_result:
2757 sh->check_state = check_state_idle;
2758
2759 /* handle a successful check operation, if parity is correct
2760 * we are done. Otherwise update the mismatch count and repair
2761 * parity if !MD_RECOVERY_CHECK
2762 */
2763 if (sh->ops.zero_sum_result == 0) {
2764 /* both parities are correct */
2765 if (!s->failed)
2766 set_bit(STRIPE_INSYNC, &sh->state);
2767 else {
2768 /* in contrast to the raid5 case we can validate
2769 * parity, but still have a failure to write
2770 * back
2771 */
2772 sh->check_state = check_state_compute_result;
2773 /* Returning at this point means that we may go
2774 * off and bring p and/or q uptodate again so
2775 * we make sure to check zero_sum_result again
2776 * to verify if p or q need writeback
2777 */
2778 }
2779 } else {
2780 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2781 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2782 /* don't try to repair!! */
2783 set_bit(STRIPE_INSYNC, &sh->state);
2784 else {
2785 int *target = &sh->ops.target;
2786
2787 sh->ops.target = -1;
2788 sh->ops.target2 = -1;
2789 sh->check_state = check_state_compute_run;
2790 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2791 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2792 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2793 set_bit(R5_Wantcompute,
2794 &sh->dev[pd_idx].flags);
2795 *target = pd_idx;
2796 target = &sh->ops.target2;
2797 s->uptodate++;
2798 }
2799 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2800 set_bit(R5_Wantcompute,
2801 &sh->dev[qd_idx].flags);
2802 *target = qd_idx;
2803 s->uptodate++;
2804 }
2805 }
2806 }
2807 break;
2808 case check_state_compute_run:
2809 break;
2810 default:
2811 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2812 __func__, sh->check_state,
2813 (unsigned long long) sh->sector);
2814 BUG();
2652 } 2815 }
2653} 2816}
2654 2817
@@ -2666,6 +2829,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2666 if (i != sh->pd_idx && i != sh->qd_idx) { 2829 if (i != sh->pd_idx && i != sh->qd_idx) {
2667 int dd_idx, j; 2830 int dd_idx, j;
2668 struct stripe_head *sh2; 2831 struct stripe_head *sh2;
2832 struct async_submit_ctl submit;
2669 2833
2670 sector_t bn = compute_blocknr(sh, i, 1); 2834 sector_t bn = compute_blocknr(sh, i, 1);
2671 sector_t s = raid5_compute_sector(conf, bn, 0, 2835 sector_t s = raid5_compute_sector(conf, bn, 0,
@@ -2685,9 +2849,10 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2685 } 2849 }
2686 2850
2687 /* place all the copies on one channel */ 2851 /* place all the copies on one channel */
2852 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
2688 tx = async_memcpy(sh2->dev[dd_idx].page, 2853 tx = async_memcpy(sh2->dev[dd_idx].page,
2689 sh->dev[i].page, 0, 0, STRIPE_SIZE, 2854 sh->dev[i].page, 0, 0, STRIPE_SIZE,
2690 ASYNC_TX_DEP_ACK, tx, NULL, NULL); 2855 &submit);
2691 2856
2692 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 2857 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2693 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2858 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
@@ -2973,7 +3138,7 @@ static bool handle_stripe5(struct stripe_head *sh)
2973 /* Need to write out all blocks after computing parity */ 3138 /* Need to write out all blocks after computing parity */
2974 sh->disks = conf->raid_disks; 3139 sh->disks = conf->raid_disks;
2975 stripe_set_idx(sh->sector, conf, 0, sh); 3140 stripe_set_idx(sh->sector, conf, 0, sh);
2976 schedule_reconstruction5(sh, &s, 1, 1); 3141 schedule_reconstruction(sh, &s, 1, 1);
2977 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3142 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
2978 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3143 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2979 atomic_dec(&conf->reshape_stripes); 3144 atomic_dec(&conf->reshape_stripes);
@@ -2993,7 +3158,7 @@ static bool handle_stripe5(struct stripe_head *sh)
2993 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3158 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2994 3159
2995 if (s.ops_request) 3160 if (s.ops_request)
2996 raid5_run_ops(sh, s.ops_request); 3161 raid_run_ops(sh, s.ops_request);
2997 3162
2998 ops_run_io(sh, &s); 3163 ops_run_io(sh, &s);
2999 3164
@@ -3002,7 +3167,7 @@ static bool handle_stripe5(struct stripe_head *sh)
3002 return blocked_rdev == NULL; 3167 return blocked_rdev == NULL;
3003} 3168}
3004 3169
3005static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 3170static bool handle_stripe6(struct stripe_head *sh)
3006{ 3171{
3007 raid5_conf_t *conf = sh->raid_conf; 3172 raid5_conf_t *conf = sh->raid_conf;
3008 int disks = sh->disks; 3173 int disks = sh->disks;
@@ -3014,9 +3179,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3014 mdk_rdev_t *blocked_rdev = NULL; 3179 mdk_rdev_t *blocked_rdev = NULL;
3015 3180
3016 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3181 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3017 "pd_idx=%d, qd_idx=%d\n", 3182 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
3018 (unsigned long long)sh->sector, sh->state, 3183 (unsigned long long)sh->sector, sh->state,
3019 atomic_read(&sh->count), pd_idx, qd_idx); 3184 atomic_read(&sh->count), pd_idx, qd_idx,
3185 sh->check_state, sh->reconstruct_state);
3020 memset(&s, 0, sizeof(s)); 3186 memset(&s, 0, sizeof(s));
3021 3187
3022 spin_lock(&sh->lock); 3188 spin_lock(&sh->lock);
@@ -3036,35 +3202,24 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3036 3202
3037 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3203 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3038 i, dev->flags, dev->toread, dev->towrite, dev->written); 3204 i, dev->flags, dev->toread, dev->towrite, dev->written);
3039 /* maybe we can reply to a read */ 3205 /* maybe we can reply to a read
3040 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { 3206 *
3041 struct bio *rbi, *rbi2; 3207 * new wantfill requests are only permitted while
3042 pr_debug("Return read for disc %d\n", i); 3208 * ops_complete_biofill is guaranteed to be inactive
3043 spin_lock_irq(&conf->device_lock); 3209 */
3044 rbi = dev->toread; 3210 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3045 dev->toread = NULL; 3211 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3046 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 3212 set_bit(R5_Wantfill, &dev->flags);
3047 wake_up(&conf->wait_for_overlap);
3048 spin_unlock_irq(&conf->device_lock);
3049 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
3050 copy_data(0, rbi, dev->page, dev->sector);
3051 rbi2 = r5_next_bio(rbi, dev->sector);
3052 spin_lock_irq(&conf->device_lock);
3053 if (!raid5_dec_bi_phys_segments(rbi)) {
3054 rbi->bi_next = return_bi;
3055 return_bi = rbi;
3056 }
3057 spin_unlock_irq(&conf->device_lock);
3058 rbi = rbi2;
3059 }
3060 }
3061 3213
3062 /* now count some things */ 3214 /* now count some things */
3063 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3215 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
3064 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3216 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
3217 if (test_bit(R5_Wantcompute, &dev->flags))
3218 BUG_ON(++s.compute > 2);
3065 3219
3066 3220 if (test_bit(R5_Wantfill, &dev->flags)) {
3067 if (dev->toread) 3221 s.to_fill++;
3222 } else if (dev->toread)
3068 s.to_read++; 3223 s.to_read++;
3069 if (dev->towrite) { 3224 if (dev->towrite) {
3070 s.to_write++; 3225 s.to_write++;
@@ -3105,6 +3260,11 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3105 blocked_rdev = NULL; 3260 blocked_rdev = NULL;
3106 } 3261 }
3107 3262
3263 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3264 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3265 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3266 }
3267
3108 pr_debug("locked=%d uptodate=%d to_read=%d" 3268 pr_debug("locked=%d uptodate=%d to_read=%d"
3109 " to_write=%d failed=%d failed_num=%d,%d\n", 3269 " to_write=%d failed=%d failed_num=%d,%d\n",
3110 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3270 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
@@ -3145,19 +3305,62 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3145 * or to load a block that is being partially written. 3305 * or to load a block that is being partially written.
3146 */ 3306 */
3147 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 3307 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3148 (s.syncing && (s.uptodate < disks)) || s.expanding) 3308 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3149 handle_stripe_fill6(sh, &s, &r6s, disks); 3309 handle_stripe_fill6(sh, &s, &r6s, disks);
3150 3310
3151 /* now to consider writing and what else, if anything should be read */ 3311 /* Now we check to see if any write operations have recently
3152 if (s.to_write) 3312 * completed
3313 */
3314 if (sh->reconstruct_state == reconstruct_state_drain_result) {
3315 int qd_idx = sh->qd_idx;
3316
3317 sh->reconstruct_state = reconstruct_state_idle;
3318 /* All the 'written' buffers and the parity blocks are ready to
3319 * be written back to disk
3320 */
3321 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3322 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags));
3323 for (i = disks; i--; ) {
3324 dev = &sh->dev[i];
3325 if (test_bit(R5_LOCKED, &dev->flags) &&
3326 (i == sh->pd_idx || i == qd_idx ||
3327 dev->written)) {
3328 pr_debug("Writing block %d\n", i);
3329 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
3330 set_bit(R5_Wantwrite, &dev->flags);
3331 if (!test_bit(R5_Insync, &dev->flags) ||
3332 ((i == sh->pd_idx || i == qd_idx) &&
3333 s.failed == 0))
3334 set_bit(STRIPE_INSYNC, &sh->state);
3335 }
3336 }
3337 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
3338 atomic_dec(&conf->preread_active_stripes);
3339 if (atomic_read(&conf->preread_active_stripes) <
3340 IO_THRESHOLD)
3341 md_wakeup_thread(conf->mddev->thread);
3342 }
3343 }
3344
3345 /* Now to consider new write requests and what else, if anything
3346 * should be read. We do not handle new writes when:
3347 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight.
3348 * 2/ A 'check' operation is in flight, as it may clobber the parity
3349 * block.
3350 */
3351 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3153 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); 3352 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
3154 3353
3155 /* maybe we need to check and possibly fix the parity for this stripe 3354 /* maybe we need to check and possibly fix the parity for this stripe
3156 * Any reads will already have been scheduled, so we just see if enough 3355 * Any reads will already have been scheduled, so we just see if enough
3157 * data is available 3356 * data is available. The parity check is held off while parity
3357 * dependent operations are in flight.
3158 */ 3358 */
3159 if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) 3359 if (sh->check_state ||
3160 handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); 3360 (s.syncing && s.locked == 0 &&
3361 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3362 !test_bit(STRIPE_INSYNC, &sh->state)))
3363 handle_parity_checks6(conf, sh, &s, &r6s, disks);
3161 3364
3162 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3365 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3163 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3366 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
@@ -3178,15 +3381,29 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3178 set_bit(R5_Wantwrite, &dev->flags); 3381 set_bit(R5_Wantwrite, &dev->flags);
3179 set_bit(R5_ReWrite, &dev->flags); 3382 set_bit(R5_ReWrite, &dev->flags);
3180 set_bit(R5_LOCKED, &dev->flags); 3383 set_bit(R5_LOCKED, &dev->flags);
3384 s.locked++;
3181 } else { 3385 } else {
3182 /* let's read it back */ 3386 /* let's read it back */
3183 set_bit(R5_Wantread, &dev->flags); 3387 set_bit(R5_Wantread, &dev->flags);
3184 set_bit(R5_LOCKED, &dev->flags); 3388 set_bit(R5_LOCKED, &dev->flags);
3389 s.locked++;
3185 } 3390 }
3186 } 3391 }
3187 } 3392 }
3188 3393
3189 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 3394 /* Finish reconstruct operations initiated by the expansion process */
3395 if (sh->reconstruct_state == reconstruct_state_result) {
3396 sh->reconstruct_state = reconstruct_state_idle;
3397 clear_bit(STRIPE_EXPANDING, &sh->state);
3398 for (i = conf->raid_disks; i--; ) {
3399 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3400 set_bit(R5_LOCKED, &sh->dev[i].flags);
3401 s.locked++;
3402 }
3403 }
3404
3405 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3406 !sh->reconstruct_state) {
3190 struct stripe_head *sh2 3407 struct stripe_head *sh2
3191 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3408 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3192 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { 3409 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
@@ -3207,14 +3424,8 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3207 /* Need to write out all blocks after computing P&Q */ 3424 /* Need to write out all blocks after computing P&Q */
3208 sh->disks = conf->raid_disks; 3425 sh->disks = conf->raid_disks;
3209 stripe_set_idx(sh->sector, conf, 0, sh); 3426 stripe_set_idx(sh->sector, conf, 0, sh);
3210 compute_parity6(sh, RECONSTRUCT_WRITE); 3427 schedule_reconstruction(sh, &s, 1, 1);
3211 for (i = conf->raid_disks ; i-- ; ) { 3428 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3212 set_bit(R5_LOCKED, &sh->dev[i].flags);
3213 s.locked++;
3214 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3215 }
3216 clear_bit(STRIPE_EXPANDING, &sh->state);
3217 } else if (s.expanded) {
3218 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3429 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3219 atomic_dec(&conf->reshape_stripes); 3430 atomic_dec(&conf->reshape_stripes);
3220 wake_up(&conf->wait_for_overlap); 3431 wake_up(&conf->wait_for_overlap);
@@ -3232,6 +3443,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3232 if (unlikely(blocked_rdev)) 3443 if (unlikely(blocked_rdev))
3233 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3444 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3234 3445
3446 if (s.ops_request)
3447 raid_run_ops(sh, s.ops_request);
3448
3235 ops_run_io(sh, &s); 3449 ops_run_io(sh, &s);
3236 3450
3237 return_io(return_bi); 3451 return_io(return_bi);
@@ -3240,16 +3454,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3240} 3454}
3241 3455
3242/* returns true if the stripe was handled */ 3456/* returns true if the stripe was handled */
3243static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) 3457static bool handle_stripe(struct stripe_head *sh)
3244{ 3458{
3245 if (sh->raid_conf->level == 6) 3459 if (sh->raid_conf->level == 6)
3246 return handle_stripe6(sh, tmp_page); 3460 return handle_stripe6(sh);
3247 else 3461 else
3248 return handle_stripe5(sh); 3462 return handle_stripe5(sh);
3249} 3463}
3250 3464
3251
3252
3253static void raid5_activate_delayed(raid5_conf_t *conf) 3465static void raid5_activate_delayed(raid5_conf_t *conf)
3254{ 3466{
3255 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3467 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
@@ -4046,7 +4258,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
4046 spin_unlock(&sh->lock); 4258 spin_unlock(&sh->lock);
4047 4259
4048 /* wait for any blocked device to be handled */ 4260 /* wait for any blocked device to be handled */
4049 while(unlikely(!handle_stripe(sh, NULL))) 4261 while (unlikely(!handle_stripe(sh)))
4050 ; 4262 ;
4051 release_stripe(sh); 4263 release_stripe(sh);
4052 4264
@@ -4103,7 +4315,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4103 return handled; 4315 return handled;
4104 } 4316 }
4105 4317
4106 handle_stripe(sh, NULL); 4318 handle_stripe(sh);
4107 release_stripe(sh); 4319 release_stripe(sh);
4108 handled++; 4320 handled++;
4109 } 4321 }
@@ -4117,6 +4329,36 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4117 return handled; 4329 return handled;
4118} 4330}
4119 4331
#ifndef CONFIG_MULTICORE_RAID456
/*
 * Single-threaded build: handle the stripe and drop the reference in the
 * caller's context, then give the scheduler a chance to run something else.
 * @domain is unused in this variant.
 */
static void process_stripe(struct stripe_head *sh, struct list_head *domain)
{
	handle_stripe(sh);
	release_stripe(sh);
	cond_resched();
}

/* Nothing is deferred in the single-threaded build, so this is a no-op. */
static void synchronize_stripe_processing(struct list_head *domain)
{
}
#else
/*
 * Callback handed to async_schedule_domain() by process_stripe().
 * @param is the stripe_head to work on; @cookie is not used.
 */
static void __process_stripe(void *param, async_cookie_t cookie)
{
	struct stripe_head *stripe = param;

	handle_stripe(stripe);
	release_stripe(stripe);
}

/*
 * Multicore build: queue the stripe on @domain; the handle/release pair is
 * performed by __process_stripe().
 */
static void process_stripe(struct stripe_head *sh, struct list_head *domain)
{
	async_schedule_domain(__process_stripe, sh, domain);
}

/* Pass @domain to async_synchronize_full_domain(). */
static void synchronize_stripe_processing(struct list_head *domain)
{
	async_synchronize_full_domain(domain);
}
#endif /* CONFIG_MULTICORE_RAID456 */
4120 4362
4121 4363
4122/* 4364/*
@@ -4131,6 +4373,7 @@ static void raid5d(mddev_t *mddev)
4131 struct stripe_head *sh; 4373 struct stripe_head *sh;
4132 raid5_conf_t *conf = mddev->private; 4374 raid5_conf_t *conf = mddev->private;
4133 int handled; 4375 int handled;
4376 LIST_HEAD(raid_domain);
4134 4377
4135 pr_debug("+++ raid5d active\n"); 4378 pr_debug("+++ raid5d active\n");
4136 4379
@@ -4167,8 +4410,7 @@ static void raid5d(mddev_t *mddev)
4167 spin_unlock_irq(&conf->device_lock); 4410 spin_unlock_irq(&conf->device_lock);
4168 4411
4169 handled++; 4412 handled++;
4170 handle_stripe(sh, conf->spare_page); 4413 process_stripe(sh, &raid_domain);
4171 release_stripe(sh);
4172 4414
4173 spin_lock_irq(&conf->device_lock); 4415 spin_lock_irq(&conf->device_lock);
4174 } 4416 }
@@ -4176,6 +4418,7 @@ static void raid5d(mddev_t *mddev)
4176 4418
4177 spin_unlock_irq(&conf->device_lock); 4419 spin_unlock_irq(&conf->device_lock);
4178 4420
4421 synchronize_stripe_processing(&raid_domain);
4179 async_tx_issue_pending_all(); 4422 async_tx_issue_pending_all();
4180 unplug_slaves(mddev); 4423 unplug_slaves(mddev);
4181 4424
@@ -4308,6 +4551,118 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4308 return sectors * (raid_disks - conf->max_degraded); 4551 return sectors * (raid_disks - conf->max_degraded);
4309} 4552}
4310 4553
4554static void raid5_free_percpu(raid5_conf_t *conf)
4555{
4556 struct raid5_percpu *percpu;
4557 unsigned long cpu;
4558
4559 if (!conf->percpu)
4560 return;
4561
4562 get_online_cpus();
4563 for_each_possible_cpu(cpu) {
4564 percpu = per_cpu_ptr(conf->percpu, cpu);
4565 safe_put_page(percpu->spare_page);
4566 kfree(percpu->scribble);
4567 }
4568#ifdef CONFIG_HOTPLUG_CPU
4569 unregister_cpu_notifier(&conf->cpu_notify);
4570#endif
4571 put_online_cpus();
4572
4573 free_percpu(conf->percpu);
4574}
4575
4576static void free_conf(raid5_conf_t *conf)
4577{
4578 shrink_stripes(conf);
4579 raid5_free_percpu(conf);
4580 kfree(conf->disks);
4581 kfree(conf->stripe_hashtbl);
4582 kfree(conf);
4583}
4584
4585#ifdef CONFIG_HOTPLUG_CPU
4586static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
4587 void *hcpu)
4588{
4589 raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
4590 long cpu = (long)hcpu;
4591 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
4592
4593 switch (action) {
4594 case CPU_UP_PREPARE:
4595 case CPU_UP_PREPARE_FROZEN:
4596 if (conf->level == 6 && !percpu->spare_page)
4597 percpu->spare_page = alloc_page(GFP_KERNEL);
4598 if (!percpu->scribble)
4599 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4600
4601 if (!percpu->scribble ||
4602 (conf->level == 6 && !percpu->spare_page)) {
4603 safe_put_page(percpu->spare_page);
4604 kfree(percpu->scribble);
4605 pr_err("%s: failed memory allocation for cpu%ld\n",
4606 __func__, cpu);
4607 return NOTIFY_BAD;
4608 }
4609 break;
4610 case CPU_DEAD:
4611 case CPU_DEAD_FROZEN:
4612 safe_put_page(percpu->spare_page);
4613 kfree(percpu->scribble);
4614 percpu->spare_page = NULL;
4615 percpu->scribble = NULL;
4616 break;
4617 default:
4618 break;
4619 }
4620 return NOTIFY_OK;
4621}
4622#endif
4623
4624static int raid5_alloc_percpu(raid5_conf_t *conf)
4625{
4626 unsigned long cpu;
4627 struct page *spare_page;
4628 struct raid5_percpu *allcpus;
4629 void *scribble;
4630 int err;
4631
4632 allcpus = alloc_percpu(struct raid5_percpu);
4633 if (!allcpus)
4634 return -ENOMEM;
4635 conf->percpu = allcpus;
4636
4637 get_online_cpus();
4638 err = 0;
4639 for_each_present_cpu(cpu) {
4640 if (conf->level == 6) {
4641 spare_page = alloc_page(GFP_KERNEL);
4642 if (!spare_page) {
4643 err = -ENOMEM;
4644 break;
4645 }
4646 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
4647 }
4648 scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL);
4649 if (!scribble) {
4650 err = -ENOMEM;
4651 break;
4652 }
4653 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
4654 }
4655#ifdef CONFIG_HOTPLUG_CPU
4656 conf->cpu_notify.notifier_call = raid456_cpu_notify;
4657 conf->cpu_notify.priority = 0;
4658 if (err == 0)
4659 err = register_cpu_notifier(&conf->cpu_notify);
4660#endif
4661 put_online_cpus();
4662
4663 return err;
4664}
4665
4311static raid5_conf_t *setup_conf(mddev_t *mddev) 4666static raid5_conf_t *setup_conf(mddev_t *mddev)
4312{ 4667{
4313 raid5_conf_t *conf; 4668 raid5_conf_t *conf;
@@ -4349,6 +4704,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4349 goto abort; 4704 goto abort;
4350 4705
4351 conf->raid_disks = mddev->raid_disks; 4706 conf->raid_disks = mddev->raid_disks;
4707 conf->scribble_len = scribble_len(conf->raid_disks);
4352 if (mddev->reshape_position == MaxSector) 4708 if (mddev->reshape_position == MaxSector)
4353 conf->previous_raid_disks = mddev->raid_disks; 4709 conf->previous_raid_disks = mddev->raid_disks;
4354 else 4710 else
@@ -4364,11 +4720,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4364 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4720 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4365 goto abort; 4721 goto abort;
4366 4722
4367 if (mddev->new_level == 6) { 4723 conf->level = mddev->new_level;
4368 conf->spare_page = alloc_page(GFP_KERNEL); 4724 if (raid5_alloc_percpu(conf) != 0)
4369 if (!conf->spare_page) 4725 goto abort;
4370 goto abort; 4726
4371 }
4372 spin_lock_init(&conf->device_lock); 4727 spin_lock_init(&conf->device_lock);
4373 init_waitqueue_head(&conf->wait_for_stripe); 4728 init_waitqueue_head(&conf->wait_for_stripe);
4374 init_waitqueue_head(&conf->wait_for_overlap); 4729 init_waitqueue_head(&conf->wait_for_overlap);
@@ -4439,11 +4794,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4439 4794
4440 abort: 4795 abort:
4441 if (conf) { 4796 if (conf) {
4442 shrink_stripes(conf); 4797 free_conf(conf);
4443 safe_put_page(conf->spare_page);
4444 kfree(conf->disks);
4445 kfree(conf->stripe_hashtbl);
4446 kfree(conf);
4447 return ERR_PTR(-EIO); 4798 return ERR_PTR(-EIO);
4448 } else 4799 } else
4449 return ERR_PTR(-ENOMEM); 4800 return ERR_PTR(-ENOMEM);
@@ -4613,12 +4964,8 @@ abort:
4613 md_unregister_thread(mddev->thread); 4964 md_unregister_thread(mddev->thread);
4614 mddev->thread = NULL; 4965 mddev->thread = NULL;
4615 if (conf) { 4966 if (conf) {
4616 shrink_stripes(conf);
4617 print_raid5_conf(conf); 4967 print_raid5_conf(conf);
4618 safe_put_page(conf->spare_page); 4968 free_conf(conf);
4619 kfree(conf->disks);
4620 kfree(conf->stripe_hashtbl);
4621 kfree(conf);
4622 } 4969 }
4623 mddev->private = NULL; 4970 mddev->private = NULL;
4624 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); 4971 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
@@ -4633,13 +4980,10 @@ static int stop(mddev_t *mddev)
4633 4980
4634 md_unregister_thread(mddev->thread); 4981 md_unregister_thread(mddev->thread);
4635 mddev->thread = NULL; 4982 mddev->thread = NULL;
4636 shrink_stripes(conf);
4637 kfree(conf->stripe_hashtbl);
4638 mddev->queue->backing_dev_info.congested_fn = NULL; 4983 mddev->queue->backing_dev_info.congested_fn = NULL;
4639 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 4984 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
4640 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); 4985 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
4641 kfree(conf->disks); 4986 free_conf(conf);
4642 kfree(conf);
4643 mddev->private = NULL; 4987 mddev->private = NULL;
4644 return 0; 4988 return 0;
4645} 4989}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 9459689c4ea0..2390e0e83daf 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -2,6 +2,7 @@
2#define _RAID5_H 2#define _RAID5_H
3 3
4#include <linux/raid/xor.h> 4#include <linux/raid/xor.h>
5#include <linux/dmaengine.h>
5 6
6/* 7/*
7 * 8 *
@@ -175,7 +176,9 @@
175 */ 176 */
176enum check_states { 177enum check_states {
177 check_state_idle = 0, 178 check_state_idle = 0,
178 check_state_run, /* parity check */ 179 check_state_run, /* xor parity check */
180 check_state_run_q, /* q-parity check */
181 check_state_run_pq, /* pq dual parity check */
179 check_state_check_result, 182 check_state_check_result,
180 check_state_compute_run, /* parity repair */ 183 check_state_compute_run, /* parity repair */
181 check_state_compute_result, 184 check_state_compute_result,
@@ -215,8 +218,8 @@ struct stripe_head {
215 * @target - STRIPE_OP_COMPUTE_BLK target 218 * @target - STRIPE_OP_COMPUTE_BLK target
216 */ 219 */
217 struct stripe_operations { 220 struct stripe_operations {
218 int target; 221 int target, target2;
219 u32 zero_sum_result; 222 enum sum_check_flags zero_sum_result;
220 } ops; 223 } ops;
221 struct r5dev { 224 struct r5dev {
222 struct bio req; 225 struct bio req;
@@ -298,7 +301,7 @@ struct r6_state {
298#define STRIPE_OP_COMPUTE_BLK 1 301#define STRIPE_OP_COMPUTE_BLK 1
299#define STRIPE_OP_PREXOR 2 302#define STRIPE_OP_PREXOR 2
300#define STRIPE_OP_BIODRAIN 3 303#define STRIPE_OP_BIODRAIN 3
301#define STRIPE_OP_POSTXOR 4 304#define STRIPE_OP_RECONSTRUCT 4
302#define STRIPE_OP_CHECK 5 305#define STRIPE_OP_CHECK 5
303 306
304/* 307/*
@@ -385,8 +388,21 @@ struct raid5_private_data {
385 * (fresh device added). 388 * (fresh device added).
386 * Cleared when a sync completes. 389 * Cleared when a sync completes.
387 */ 390 */
388 391 /* per cpu variables */
389 struct page *spare_page; /* Used when checking P/Q in raid6 */ 392 struct raid5_percpu {
393 struct page *spare_page; /* Used when checking P/Q in raid6 */
394 void *scribble; /* space for constructing buffer
395 * lists and performing address
396 * conversions
397 */
398 } *percpu;
399 size_t scribble_len; /* size of scribble region must be
400 * associated with conf to handle
401 * cpu hotplug while reshaping
402 */
403#ifdef CONFIG_HOTPLUG_CPU
404 struct notifier_block cpu_notify;
405#endif
390 406
391 /* 407 /*
392 * Free stripes pool 408 * Free stripes pool