author     Dan Williams <dan.j.williams@intel.com>  2009-09-08 20:42:29 -0400
committer  Dan Williams <dan.j.williams@intel.com>  2009-09-08 20:42:29 -0400
commit     f9dd2134374c8de6b911e2b8652c6c9622eaa658 (patch)
tree       c1b8f8d622941606b9e7247ab31d811ba4295011 /drivers/md
parent     4b652f0db3be891c7b76b109c3b55003b920fc96 (diff)
parent     07a3b417dc3d00802bd7b4874c3e811f0b015a7d (diff)
Merge branch 'md-raid6-accel' into ioat3.2
Conflicts:
	include/linux/dmaengine.h
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig    26
-rw-r--r--  drivers/md/raid5.c  1486
-rw-r--r--  drivers/md/raid5.h    28
3 files changed, 958 insertions, 582 deletions
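
The bulk of the raid5.c changes below convert the old async_tx calling convention, in which the flags, the dependency descriptor, and the completion callback were passed as trailing arguments, to the new struct async_submit_ctl interface. A minimal before/after sketch of that conversion follows; it is not code from this tree, the helper name and its parameters are invented for illustration, and only the init_async_submit()/async_xor() signatures are taken from the hunks below.

/* Sketch only: the async_tx API conversion applied throughout this merge.
 * stripe_xor_example() and its arguments are hypothetical.
 */
#include <linux/async_tx.h>

static struct dma_async_tx_descriptor *
stripe_xor_example(struct page *dest, struct page **srcs, int src_cnt,
                   size_t len, struct dma_async_tx_descriptor *depend_tx,
                   dma_async_tx_callback done, void *done_arg,
                   addr_conv_t *addr_conv)
{
        struct async_submit_ctl submit;

        /* old style (the calls being removed below):
         *      tx = async_xor(dest, srcs, 0, src_cnt, len,
         *                     ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_DEP_ACK,
         *                     depend_tx, done, done_arg);
         */

        /* new style: all submission parameters travel in one control struct */
        init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, depend_tx,
                          done, done_arg, addr_conv);
        return async_xor(dest, srcs, 0, src_cnt, len, &submit);
}
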
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 36e0675be9f7..09c0c6e49ab5 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -124,6 +124,8 @@ config MD_RAID456
124 select MD_RAID6_PQ 124 select MD_RAID6_PQ
125 select ASYNC_MEMCPY 125 select ASYNC_MEMCPY
126 select ASYNC_XOR 126 select ASYNC_XOR
127 select ASYNC_PQ
128 select ASYNC_RAID6_RECOV
127 ---help--- 129 ---help---
128 A RAID-5 set of N drives with a capacity of C MB per drive provides 130 A RAID-5 set of N drives with a capacity of C MB per drive provides
129 the capacity of C * (N - 1) MB, and protects against a failure 131 the capacity of C * (N - 1) MB, and protects against a failure
@@ -152,9 +154,33 @@ config MD_RAID456
152 154
153 If unsure, say Y. 155 If unsure, say Y.
154 156
157config MULTICORE_RAID456
158 bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
159 depends on MD_RAID456
160 depends on SMP
161 depends on EXPERIMENTAL
162 ---help---
163 Enable the raid456 module to dispatch per-stripe raid operations to a
164 thread pool.
165
166 If unsure, say N.
167
155config MD_RAID6_PQ 168config MD_RAID6_PQ
156 tristate 169 tristate
157 170
171config ASYNC_RAID6_TEST
172 tristate "Self test for hardware accelerated raid6 recovery"
173 depends on MD_RAID6_PQ
174 select ASYNC_RAID6_RECOV
175 ---help---
176 This is a one-shot self test that permutes through the
177 recovery of all the possible two disk failure scenarios for a
178 N-disk array. Recovery is performed with the asynchronous
179 raid6 recovery routines, and will optionally use an offload
180 engine if one is available.
181
182 If unsure, say N.
183
158config MD_MULTIPATH 184config MD_MULTIPATH
159 tristate "Multipath I/O support" 185 tristate "Multipath I/O support"
160 depends on BLK_DEV_MD 186 depends on BLK_DEV_MD
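
The raid5.c diff that follows also replaces the on-stack struct page *xor_srcs[disks] arrays with a preallocated per-cpu "scribble" region, sized by scribble_len() and carved up by to_addr_conv(). Its layout is restated below as a sketch; the example_* names are not in the tree, only the arithmetic mirrors the hunks that follow.

/* Sketch of the per-cpu scribble layout used by the raid5.c changes below:
 * a (disks + 2)-entry page-pointer array followed by a matching
 * address-conversion area; the +2 covers the P and Q destinations.
 */
#include <linux/async_tx.h>

static size_t example_scribble_len(int disks)
{
        return sizeof(struct page *) * (disks + 2) +
               sizeof(addr_conv_t) * (disks + 2);
}

/* the addr_conv_t region starts right after the page-pointer array */
static addr_conv_t *example_to_addr_conv(void *scribble, int disks)
{
        return scribble + sizeof(struct page *) * (disks + 2);
}
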
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index bb37fb1b2d82..0a5cf2171214 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -47,7 +47,9 @@
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48#include <linux/raid/pq.h> 48#include <linux/raid/pq.h>
49#include <linux/async_tx.h> 49#include <linux/async_tx.h>
50#include <linux/async.h>
50#include <linux/seq_file.h> 51#include <linux/seq_file.h>
52#include <linux/cpu.h>
51#include "md.h" 53#include "md.h"
52#include "raid5.h" 54#include "raid5.h"
53#include "bitmap.h" 55#include "bitmap.h"
@@ -499,11 +501,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
499 struct page *bio_page; 501 struct page *bio_page;
500 int i; 502 int i;
501 int page_offset; 503 int page_offset;
504 struct async_submit_ctl submit;
502 505
503 if (bio->bi_sector >= sector) 506 if (bio->bi_sector >= sector)
504 page_offset = (signed)(bio->bi_sector - sector) * 512; 507 page_offset = (signed)(bio->bi_sector - sector) * 512;
505 else 508 else
506 page_offset = (signed)(sector - bio->bi_sector) * -512; 509 page_offset = (signed)(sector - bio->bi_sector) * -512;
510
511 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
507 bio_for_each_segment(bvl, bio, i) { 512 bio_for_each_segment(bvl, bio, i) {
508 int len = bio_iovec_idx(bio, i)->bv_len; 513 int len = bio_iovec_idx(bio, i)->bv_len;
509 int clen; 514 int clen;
@@ -525,15 +530,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
525 bio_page = bio_iovec_idx(bio, i)->bv_page; 530 bio_page = bio_iovec_idx(bio, i)->bv_page;
526 if (frombio) 531 if (frombio)
527 tx = async_memcpy(page, bio_page, page_offset, 532 tx = async_memcpy(page, bio_page, page_offset,
528 b_offset, clen, 533 b_offset, clen, &submit);
529 ASYNC_TX_DEP_ACK,
530 tx, NULL, NULL);
531 else 534 else
532 tx = async_memcpy(bio_page, page, b_offset, 535 tx = async_memcpy(bio_page, page, b_offset,
533 page_offset, clen, 536 page_offset, clen, &submit);
534 ASYNC_TX_DEP_ACK,
535 tx, NULL, NULL);
536 } 537 }
538 /* chain the operations */
539 submit.depend_tx = tx;
540
537 if (clen < len) /* hit end of page */ 541 if (clen < len) /* hit end of page */
538 break; 542 break;
539 page_offset += len; 543 page_offset += len;
@@ -592,6 +596,7 @@ static void ops_run_biofill(struct stripe_head *sh)
592{ 596{
593 struct dma_async_tx_descriptor *tx = NULL; 597 struct dma_async_tx_descriptor *tx = NULL;
594 raid5_conf_t *conf = sh->raid_conf; 598 raid5_conf_t *conf = sh->raid_conf;
599 struct async_submit_ctl submit;
595 int i; 600 int i;
596 601
597 pr_debug("%s: stripe %llu\n", __func__, 602 pr_debug("%s: stripe %llu\n", __func__,
@@ -615,22 +620,34 @@ static void ops_run_biofill(struct stripe_head *sh)
615 } 620 }
616 621
617 atomic_inc(&sh->count); 622 atomic_inc(&sh->count);
618 async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 623 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
619 ops_complete_biofill, sh); 624 async_trigger_callback(&submit);
620} 625}
621 626
622static void ops_complete_compute5(void *stripe_head_ref) 627static void mark_target_uptodate(struct stripe_head *sh, int target)
623{ 628{
624 struct stripe_head *sh = stripe_head_ref; 629 struct r5dev *tgt;
625 int target = sh->ops.target;
626 struct r5dev *tgt = &sh->dev[target];
627 630
628 pr_debug("%s: stripe %llu\n", __func__, 631 if (target < 0)
629 (unsigned long long)sh->sector); 632 return;
630 633
634 tgt = &sh->dev[target];
631 set_bit(R5_UPTODATE, &tgt->flags); 635 set_bit(R5_UPTODATE, &tgt->flags);
632 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 636 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
633 clear_bit(R5_Wantcompute, &tgt->flags); 637 clear_bit(R5_Wantcompute, &tgt->flags);
638}
639
640static void ops_complete_compute(void *stripe_head_ref)
641{
642 struct stripe_head *sh = stripe_head_ref;
643
644 pr_debug("%s: stripe %llu\n", __func__,
645 (unsigned long long)sh->sector);
646
647 /* mark the computed target(s) as uptodate */
648 mark_target_uptodate(sh, sh->ops.target);
649 mark_target_uptodate(sh, sh->ops.target2);
650
634 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 651 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
635 if (sh->check_state == check_state_compute_run) 652 if (sh->check_state == check_state_compute_run)
636 sh->check_state = check_state_compute_result; 653 sh->check_state = check_state_compute_result;
@@ -638,16 +655,24 @@ static void ops_complete_compute5(void *stripe_head_ref)
638 release_stripe(sh); 655 release_stripe(sh);
639} 656}
640 657
641static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) 658/* return a pointer to the address conversion region of the scribble buffer */
659static addr_conv_t *to_addr_conv(struct stripe_head *sh,
660 struct raid5_percpu *percpu)
661{
662 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
663}
664
665static struct dma_async_tx_descriptor *
666ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
642{ 667{
643 /* kernel stack size limits the total number of disks */
644 int disks = sh->disks; 668 int disks = sh->disks;
645 struct page *xor_srcs[disks]; 669 struct page **xor_srcs = percpu->scribble;
646 int target = sh->ops.target; 670 int target = sh->ops.target;
647 struct r5dev *tgt = &sh->dev[target]; 671 struct r5dev *tgt = &sh->dev[target];
648 struct page *xor_dest = tgt->page; 672 struct page *xor_dest = tgt->page;
649 int count = 0; 673 int count = 0;
650 struct dma_async_tx_descriptor *tx; 674 struct dma_async_tx_descriptor *tx;
675 struct async_submit_ctl submit;
651 int i; 676 int i;
652 677
653 pr_debug("%s: stripe %llu block: %d\n", 678 pr_debug("%s: stripe %llu block: %d\n",
@@ -660,17 +685,207 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
660 685
661 atomic_inc(&sh->count); 686 atomic_inc(&sh->count);
662 687
688 init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
689 ops_complete_compute, sh, to_addr_conv(sh, percpu));
663 if (unlikely(count == 1)) 690 if (unlikely(count == 1))
664 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 691 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
665 0, NULL, ops_complete_compute5, sh);
666 else 692 else
667 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 693 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
668 ASYNC_TX_XOR_ZERO_DST, NULL,
669 ops_complete_compute5, sh);
670 694
671 return tx; 695 return tx;
672} 696}
673 697
698/* set_syndrome_sources - populate source buffers for gen_syndrome
699 * @srcs - (struct page *) array of size sh->disks
700 * @sh - stripe_head to parse
701 *
702 * Populates srcs in proper layout order for the stripe and returns the
703 * 'count' of sources to be used in a call to async_gen_syndrome. The P
704 * destination buffer is recorded in srcs[count] and the Q destination
705 * is recorded in srcs[count+1]].
706 */
707static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
708{
709 int disks = sh->disks;
710 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
711 int d0_idx = raid6_d0(sh);
712 int count;
713 int i;
714
715 for (i = 0; i < disks; i++)
716 srcs[i] = (void *)raid6_empty_zero_page;
717
718 count = 0;
719 i = d0_idx;
720 do {
721 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
722
723 srcs[slot] = sh->dev[i].page;
724 i = raid6_next_disk(i, disks);
725 } while (i != d0_idx);
726 BUG_ON(count != syndrome_disks);
727
728 return count;
729}
730
731static struct dma_async_tx_descriptor *
732ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
733{
734 int disks = sh->disks;
735 struct page **blocks = percpu->scribble;
736 int target;
737 int qd_idx = sh->qd_idx;
738 struct dma_async_tx_descriptor *tx;
739 struct async_submit_ctl submit;
740 struct r5dev *tgt;
741 struct page *dest;
742 int i;
743 int count;
744
745 if (sh->ops.target < 0)
746 target = sh->ops.target2;
747 else if (sh->ops.target2 < 0)
748 target = sh->ops.target;
749 else
750 /* we should only have one valid target */
751 BUG();
752 BUG_ON(target < 0);
753 pr_debug("%s: stripe %llu block: %d\n",
754 __func__, (unsigned long long)sh->sector, target);
755
756 tgt = &sh->dev[target];
757 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
758 dest = tgt->page;
759
760 atomic_inc(&sh->count);
761
762 if (target == qd_idx) {
763 count = set_syndrome_sources(blocks, sh);
764 blocks[count] = NULL; /* regenerating p is not necessary */
765 BUG_ON(blocks[count+1] != dest); /* q should already be set */
766 init_async_submit(&submit, 0, NULL, ops_complete_compute, sh,
767 to_addr_conv(sh, percpu));
768 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
769 } else {
770 /* Compute any data- or p-drive using XOR */
771 count = 0;
772 for (i = disks; i-- ; ) {
773 if (i == target || i == qd_idx)
774 continue;
775 blocks[count++] = sh->dev[i].page;
776 }
777
778 init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
779 ops_complete_compute, sh,
780 to_addr_conv(sh, percpu));
781 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
782 }
783
784 return tx;
785}
786
787static struct dma_async_tx_descriptor *
788ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
789{
790 int i, count, disks = sh->disks;
791 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
792 int d0_idx = raid6_d0(sh);
793 int faila = -1, failb = -1;
794 int target = sh->ops.target;
795 int target2 = sh->ops.target2;
796 struct r5dev *tgt = &sh->dev[target];
797 struct r5dev *tgt2 = &sh->dev[target2];
798 struct dma_async_tx_descriptor *tx;
799 struct page **blocks = percpu->scribble;
800 struct async_submit_ctl submit;
801
802 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
803 __func__, (unsigned long long)sh->sector, target, target2);
804 BUG_ON(target < 0 || target2 < 0);
805 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
806 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
807
808 /* we need to open-code set_syndrome_sources to handle to the
809 * slot number conversion for 'faila' and 'failb'
810 */
811 for (i = 0; i < disks ; i++)
812 blocks[i] = (void *)raid6_empty_zero_page;
813 count = 0;
814 i = d0_idx;
815 do {
816 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
817
818 blocks[slot] = sh->dev[i].page;
819
820 if (i == target)
821 faila = slot;
822 if (i == target2)
823 failb = slot;
824 i = raid6_next_disk(i, disks);
825 } while (i != d0_idx);
826 BUG_ON(count != syndrome_disks);
827
828 BUG_ON(faila == failb);
829 if (failb < faila)
830 swap(faila, failb);
831 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
832 __func__, (unsigned long long)sh->sector, faila, failb);
833
834 atomic_inc(&sh->count);
835
836 if (failb == syndrome_disks+1) {
837 /* Q disk is one of the missing disks */
838 if (faila == syndrome_disks) {
839 /* Missing P+Q, just recompute */
840 init_async_submit(&submit, 0, NULL, ops_complete_compute,
841 sh, to_addr_conv(sh, percpu));
842 return async_gen_syndrome(blocks, 0, count+2,
843 STRIPE_SIZE, &submit);
844 } else {
845 struct page *dest;
846 int data_target;
847 int qd_idx = sh->qd_idx;
848
849 /* Missing D+Q: recompute D from P, then recompute Q */
850 if (target == qd_idx)
851 data_target = target2;
852 else
853 data_target = target;
854
855 count = 0;
856 for (i = disks; i-- ; ) {
857 if (i == data_target || i == qd_idx)
858 continue;
859 blocks[count++] = sh->dev[i].page;
860 }
861 dest = sh->dev[data_target].page;
862 init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
863 NULL, NULL, to_addr_conv(sh, percpu));
864 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
865 &submit);
866
867 count = set_syndrome_sources(blocks, sh);
868 init_async_submit(&submit, 0, tx, ops_complete_compute,
869 sh, to_addr_conv(sh, percpu));
870 return async_gen_syndrome(blocks, 0, count+2,
871 STRIPE_SIZE, &submit);
872 }
873 }
874
875 init_async_submit(&submit, 0, NULL, ops_complete_compute, sh,
876 to_addr_conv(sh, percpu));
877 if (failb == syndrome_disks) {
878 /* We're missing D+P. */
879 return async_raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE,
880 faila, blocks, &submit);
881 } else {
882 /* We're missing D+D. */
883 return async_raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE,
884 faila, failb, blocks, &submit);
885 }
886}
887
888
674static void ops_complete_prexor(void *stripe_head_ref) 889static void ops_complete_prexor(void *stripe_head_ref)
675{ 890{
676 struct stripe_head *sh = stripe_head_ref; 891 struct stripe_head *sh = stripe_head_ref;
@@ -680,12 +895,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
680} 895}
681 896
682static struct dma_async_tx_descriptor * 897static struct dma_async_tx_descriptor *
683ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 898ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
899 struct dma_async_tx_descriptor *tx)
684{ 900{
685 /* kernel stack size limits the total number of disks */
686 int disks = sh->disks; 901 int disks = sh->disks;
687 struct page *xor_srcs[disks]; 902 struct page **xor_srcs = percpu->scribble;
688 int count = 0, pd_idx = sh->pd_idx, i; 903 int count = 0, pd_idx = sh->pd_idx, i;
904 struct async_submit_ctl submit;
689 905
690 /* existing parity data subtracted */ 906 /* existing parity data subtracted */
691 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 907 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
@@ -700,9 +916,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
700 xor_srcs[count++] = dev->page; 916 xor_srcs[count++] = dev->page;
701 } 917 }
702 918
703 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 919 init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, tx,
704 ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, 920 ops_complete_prexor, sh, to_addr_conv(sh, percpu));
705 ops_complete_prexor, sh); 921 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
706 922
707 return tx; 923 return tx;
708} 924}
@@ -742,17 +958,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
742 return tx; 958 return tx;
743} 959}
744 960
745static void ops_complete_postxor(void *stripe_head_ref) 961static void ops_complete_reconstruct(void *stripe_head_ref)
746{ 962{
747 struct stripe_head *sh = stripe_head_ref; 963 struct stripe_head *sh = stripe_head_ref;
748 int disks = sh->disks, i, pd_idx = sh->pd_idx; 964 int disks = sh->disks;
965 int pd_idx = sh->pd_idx;
966 int qd_idx = sh->qd_idx;
967 int i;
749 968
750 pr_debug("%s: stripe %llu\n", __func__, 969 pr_debug("%s: stripe %llu\n", __func__,
751 (unsigned long long)sh->sector); 970 (unsigned long long)sh->sector);
752 971
753 for (i = disks; i--; ) { 972 for (i = disks; i--; ) {
754 struct r5dev *dev = &sh->dev[i]; 973 struct r5dev *dev = &sh->dev[i];
755 if (dev->written || i == pd_idx) 974
975 if (dev->written || i == pd_idx || i == qd_idx)
756 set_bit(R5_UPTODATE, &dev->flags); 976 set_bit(R5_UPTODATE, &dev->flags);
757 } 977 }
758 978
@@ -770,12 +990,12 @@ static void ops_complete_postxor(void *stripe_head_ref)
770} 990}
771 991
772static void 992static void
773ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 993ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
994 struct dma_async_tx_descriptor *tx)
774{ 995{
775 /* kernel stack size limits the total number of disks */
776 int disks = sh->disks; 996 int disks = sh->disks;
777 struct page *xor_srcs[disks]; 997 struct page **xor_srcs = percpu->scribble;
778 998 struct async_submit_ctl submit;
779 int count = 0, pd_idx = sh->pd_idx, i; 999 int count = 0, pd_idx = sh->pd_idx, i;
780 struct page *xor_dest; 1000 struct page *xor_dest;
781 int prexor = 0; 1001 int prexor = 0;
@@ -809,18 +1029,36 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
809 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1029 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
810 * for the synchronous xor case 1030 * for the synchronous xor case
811 */ 1031 */
812 flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | 1032 flags = ASYNC_TX_ACK |
813 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1033 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
814 1034
815 atomic_inc(&sh->count); 1035 atomic_inc(&sh->count);
816 1036
817 if (unlikely(count == 1)) { 1037 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
818 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); 1038 to_addr_conv(sh, percpu));
819 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 1039 if (unlikely(count == 1))
820 flags, tx, ops_complete_postxor, sh); 1040 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
821 } else 1041 else
822 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1042 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
823 flags, tx, ops_complete_postxor, sh); 1043}
1044
1045static void
1046ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1047 struct dma_async_tx_descriptor *tx)
1048{
1049 struct async_submit_ctl submit;
1050 struct page **blocks = percpu->scribble;
1051 int count;
1052
1053 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1054
1055 count = set_syndrome_sources(blocks, sh);
1056
1057 atomic_inc(&sh->count);
1058
1059 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1060 sh, to_addr_conv(sh, percpu));
1061 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
824} 1062}
825 1063
826static void ops_complete_check(void *stripe_head_ref) 1064static void ops_complete_check(void *stripe_head_ref)
@@ -835,63 +1073,115 @@ static void ops_complete_check(void *stripe_head_ref)
835 release_stripe(sh); 1073 release_stripe(sh);
836} 1074}
837 1075
838static void ops_run_check(struct stripe_head *sh) 1076static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
839{ 1077{
840 /* kernel stack size limits the total number of disks */
841 int disks = sh->disks; 1078 int disks = sh->disks;
842 struct page *xor_srcs[disks]; 1079 int pd_idx = sh->pd_idx;
1080 int qd_idx = sh->qd_idx;
1081 struct page *xor_dest;
1082 struct page **xor_srcs = percpu->scribble;
843 struct dma_async_tx_descriptor *tx; 1083 struct dma_async_tx_descriptor *tx;
844 1084 struct async_submit_ctl submit;
845 int count = 0, pd_idx = sh->pd_idx, i; 1085 int count;
846 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1086 int i;
847 1087
848 pr_debug("%s: stripe %llu\n", __func__, 1088 pr_debug("%s: stripe %llu\n", __func__,
849 (unsigned long long)sh->sector); 1089 (unsigned long long)sh->sector);
850 1090
1091 count = 0;
1092 xor_dest = sh->dev[pd_idx].page;
1093 xor_srcs[count++] = xor_dest;
851 for (i = disks; i--; ) { 1094 for (i = disks; i--; ) {
852 struct r5dev *dev = &sh->dev[i]; 1095 if (i == pd_idx || i == qd_idx)
853 if (i != pd_idx) 1096 continue;
854 xor_srcs[count++] = dev->page; 1097 xor_srcs[count++] = sh->dev[i].page;
855 } 1098 }
856 1099
857 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1100 init_async_submit(&submit, 0, NULL, NULL, NULL,
858 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); 1101 to_addr_conv(sh, percpu));
1102 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1103 &sh->ops.zero_sum_result, &submit);
1104
1105 atomic_inc(&sh->count);
1106 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1107 tx = async_trigger_callback(&submit);
1108}
1109
1110static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1111{
1112 struct page **srcs = percpu->scribble;
1113 struct async_submit_ctl submit;
1114 int count;
1115
1116 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1117 (unsigned long long)sh->sector, checkp);
1118
1119 count = set_syndrome_sources(srcs, sh);
1120 if (!checkp)
1121 srcs[count] = NULL;
859 1122
860 atomic_inc(&sh->count); 1123 atomic_inc(&sh->count);
861 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 1124 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
862 ops_complete_check, sh); 1125 sh, to_addr_conv(sh, percpu));
1126 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1127 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
863} 1128}
864 1129
865static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) 1130static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
866{ 1131{
867 int overlap_clear = 0, i, disks = sh->disks; 1132 int overlap_clear = 0, i, disks = sh->disks;
868 struct dma_async_tx_descriptor *tx = NULL; 1133 struct dma_async_tx_descriptor *tx = NULL;
1134 raid5_conf_t *conf = sh->raid_conf;
1135 int level = conf->level;
1136 struct raid5_percpu *percpu;
1137 unsigned long cpu;
869 1138
1139 cpu = get_cpu();
1140 percpu = per_cpu_ptr(conf->percpu, cpu);
870 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1141 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
871 ops_run_biofill(sh); 1142 ops_run_biofill(sh);
872 overlap_clear++; 1143 overlap_clear++;
873 } 1144 }
874 1145
875 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1146 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
876 tx = ops_run_compute5(sh); 1147 if (level < 6)
877 /* terminate the chain if postxor is not set to be run */ 1148 tx = ops_run_compute5(sh, percpu);
878 if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) 1149 else {
1150 if (sh->ops.target2 < 0 || sh->ops.target < 0)
1151 tx = ops_run_compute6_1(sh, percpu);
1152 else
1153 tx = ops_run_compute6_2(sh, percpu);
1154 }
1155 /* terminate the chain if reconstruct is not set to be run */
1156 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
879 async_tx_ack(tx); 1157 async_tx_ack(tx);
880 } 1158 }
881 1159
882 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1160 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
883 tx = ops_run_prexor(sh, tx); 1161 tx = ops_run_prexor(sh, percpu, tx);
884 1162
885 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1163 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
886 tx = ops_run_biodrain(sh, tx); 1164 tx = ops_run_biodrain(sh, tx);
887 overlap_clear++; 1165 overlap_clear++;
888 } 1166 }
889 1167
890 if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) 1168 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
891 ops_run_postxor(sh, tx); 1169 if (level < 6)
1170 ops_run_reconstruct5(sh, percpu, tx);
1171 else
1172 ops_run_reconstruct6(sh, percpu, tx);
1173 }
892 1174
893 if (test_bit(STRIPE_OP_CHECK, &ops_request)) 1175 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
894 ops_run_check(sh); 1176 if (sh->check_state == check_state_run)
1177 ops_run_check_p(sh, percpu);
1178 else if (sh->check_state == check_state_run_q)
1179 ops_run_check_pq(sh, percpu, 0);
1180 else if (sh->check_state == check_state_run_pq)
1181 ops_run_check_pq(sh, percpu, 1);
1182 else
1183 BUG();
1184 }
895 1185
896 if (overlap_clear) 1186 if (overlap_clear)
897 for (i = disks; i--; ) { 1187 for (i = disks; i--; ) {
@@ -899,6 +1189,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
899 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1189 if (test_and_clear_bit(R5_Overlap, &dev->flags))
900 wake_up(&sh->raid_conf->wait_for_overlap); 1190 wake_up(&sh->raid_conf->wait_for_overlap);
901 } 1191 }
1192 put_cpu();
902} 1193}
903 1194
904static int grow_one_stripe(raid5_conf_t *conf) 1195static int grow_one_stripe(raid5_conf_t *conf)
@@ -948,6 +1239,28 @@ static int grow_stripes(raid5_conf_t *conf, int num)
948 return 0; 1239 return 0;
949} 1240}
950 1241
1242/**
1243 * scribble_len - return the required size of the scribble region
1244 * @num - total number of disks in the array
1245 *
1246 * The size must be enough to contain:
1247 * 1/ a struct page pointer for each device in the array +2
1248 * 2/ room to convert each entry in (1) to its corresponding dma
1249 * (dma_map_page()) or page (page_address()) address.
1250 *
1251 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1252 * calculate over all devices (not just the data blocks), using zeros in place
1253 * of the P and Q blocks.
1254 */
1255static size_t scribble_len(int num)
1256{
1257 size_t len;
1258
1259 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1260
1261 return len;
1262}
1263
951static int resize_stripes(raid5_conf_t *conf, int newsize) 1264static int resize_stripes(raid5_conf_t *conf, int newsize)
952{ 1265{
953 /* Make all the stripes able to hold 'newsize' devices. 1266 /* Make all the stripes able to hold 'newsize' devices.
@@ -976,6 +1289,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
976 struct stripe_head *osh, *nsh; 1289 struct stripe_head *osh, *nsh;
977 LIST_HEAD(newstripes); 1290 LIST_HEAD(newstripes);
978 struct disk_info *ndisks; 1291 struct disk_info *ndisks;
1292 unsigned long cpu;
979 int err; 1293 int err;
980 struct kmem_cache *sc; 1294 struct kmem_cache *sc;
981 int i; 1295 int i;
@@ -1041,7 +1355,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1041 /* Step 3. 1355 /* Step 3.
1042 * At this point, we are holding all the stripes so the array 1356 * At this point, we are holding all the stripes so the array
1043 * is completely stalled, so now is a good time to resize 1357 * is completely stalled, so now is a good time to resize
1044 * conf->disks. 1358 * conf->disks and the scribble region
1045 */ 1359 */
1046 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1360 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1047 if (ndisks) { 1361 if (ndisks) {
@@ -1052,10 +1366,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1052 } else 1366 } else
1053 err = -ENOMEM; 1367 err = -ENOMEM;
1054 1368
1369 get_online_cpus();
1370 conf->scribble_len = scribble_len(newsize);
1371 for_each_present_cpu(cpu) {
1372 struct raid5_percpu *percpu;
1373 void *scribble;
1374
1375 percpu = per_cpu_ptr(conf->percpu, cpu);
1376 scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1377
1378 if (scribble) {
1379 kfree(percpu->scribble);
1380 percpu->scribble = scribble;
1381 } else {
1382 err = -ENOMEM;
1383 break;
1384 }
1385 }
1386 put_online_cpus();
1387
1055 /* Step 4, return new stripes to service */ 1388 /* Step 4, return new stripes to service */
1056 while(!list_empty(&newstripes)) { 1389 while(!list_empty(&newstripes)) {
1057 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1390 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1058 list_del_init(&nsh->lru); 1391 list_del_init(&nsh->lru);
1392
1059 for (i=conf->raid_disks; i < newsize; i++) 1393 for (i=conf->raid_disks; i < newsize; i++)
1060 if (nsh->dev[i].page == NULL) { 1394 if (nsh->dev[i].page == NULL) {
1061 struct page *p = alloc_page(GFP_NOIO); 1395 struct page *p = alloc_page(GFP_NOIO);
@@ -1594,258 +1928,13 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1594} 1928}
1595 1929
1596 1930
1597
1598/*
1599 * Copy data between a page in the stripe cache, and one or more bion
1600 * The page could align with the middle of the bio, or there could be
1601 * several bion, each with several bio_vecs, which cover part of the page
1602 * Multiple bion are linked together on bi_next. There may be extras
1603 * at the end of this list. We ignore them.
1604 */
1605static void copy_data(int frombio, struct bio *bio,
1606 struct page *page,
1607 sector_t sector)
1608{
1609 char *pa = page_address(page);
1610 struct bio_vec *bvl;
1611 int i;
1612 int page_offset;
1613
1614 if (bio->bi_sector >= sector)
1615 page_offset = (signed)(bio->bi_sector - sector) * 512;
1616 else
1617 page_offset = (signed)(sector - bio->bi_sector) * -512;
1618 bio_for_each_segment(bvl, bio, i) {
1619 int len = bio_iovec_idx(bio,i)->bv_len;
1620 int clen;
1621 int b_offset = 0;
1622
1623 if (page_offset < 0) {
1624 b_offset = -page_offset;
1625 page_offset += b_offset;
1626 len -= b_offset;
1627 }
1628
1629 if (len > 0 && page_offset + len > STRIPE_SIZE)
1630 clen = STRIPE_SIZE - page_offset;
1631 else clen = len;
1632
1633 if (clen > 0) {
1634 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
1635 if (frombio)
1636 memcpy(pa+page_offset, ba+b_offset, clen);
1637 else
1638 memcpy(ba+b_offset, pa+page_offset, clen);
1639 __bio_kunmap_atomic(ba, KM_USER0);
1640 }
1641 if (clen < len) /* hit end of page */
1642 break;
1643 page_offset += len;
1644 }
1645}
1646
1647#define check_xor() do { \
1648 if (count == MAX_XOR_BLOCKS) { \
1649 xor_blocks(count, STRIPE_SIZE, dest, ptr);\
1650 count = 0; \
1651 } \
1652 } while(0)
1653
1654static void compute_parity6(struct stripe_head *sh, int method)
1655{
1656 raid5_conf_t *conf = sh->raid_conf;
1657 int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
1658 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1659 struct bio *chosen;
1660 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1661 void *ptrs[syndrome_disks+2];
1662
1663 pd_idx = sh->pd_idx;
1664 qd_idx = sh->qd_idx;
1665 d0_idx = raid6_d0(sh);
1666
1667 pr_debug("compute_parity, stripe %llu, method %d\n",
1668 (unsigned long long)sh->sector, method);
1669
1670 switch(method) {
1671 case READ_MODIFY_WRITE:
1672 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
1673 case RECONSTRUCT_WRITE:
1674 for (i= disks; i-- ;)
1675 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
1676 chosen = sh->dev[i].towrite;
1677 sh->dev[i].towrite = NULL;
1678
1679 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1680 wake_up(&conf->wait_for_overlap);
1681
1682 BUG_ON(sh->dev[i].written);
1683 sh->dev[i].written = chosen;
1684 }
1685 break;
1686 case CHECK_PARITY:
1687 BUG(); /* Not implemented yet */
1688 }
1689
1690 for (i = disks; i--;)
1691 if (sh->dev[i].written) {
1692 sector_t sector = sh->dev[i].sector;
1693 struct bio *wbi = sh->dev[i].written;
1694 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1695 copy_data(1, wbi, sh->dev[i].page, sector);
1696 wbi = r5_next_bio(wbi, sector);
1697 }
1698
1699 set_bit(R5_LOCKED, &sh->dev[i].flags);
1700 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1701 }
1702
1703 /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
1704
1705 for (i = 0; i < disks; i++)
1706 ptrs[i] = (void *)raid6_empty_zero_page;
1707
1708 count = 0;
1709 i = d0_idx;
1710 do {
1711 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1712
1713 ptrs[slot] = page_address(sh->dev[i].page);
1714 if (slot < syndrome_disks &&
1715 !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
1716 printk(KERN_ERR "block %d/%d not uptodate "
1717 "on parity calc\n", i, count);
1718 BUG();
1719 }
1720
1721 i = raid6_next_disk(i, disks);
1722 } while (i != d0_idx);
1723 BUG_ON(count != syndrome_disks);
1724
1725 raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
1726
1727 switch(method) {
1728 case RECONSTRUCT_WRITE:
1729 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1730 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1731 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1732 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
1733 break;
1734 case UPDATE_PARITY:
1735 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1736 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1737 break;
1738 }
1739}
1740
1741
1742/* Compute one missing block */
1743static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1744{
1745 int i, count, disks = sh->disks;
1746 void *ptr[MAX_XOR_BLOCKS], *dest, *p;
1747 int qd_idx = sh->qd_idx;
1748
1749 pr_debug("compute_block_1, stripe %llu, idx %d\n",
1750 (unsigned long long)sh->sector, dd_idx);
1751
1752 if ( dd_idx == qd_idx ) {
1753 /* We're actually computing the Q drive */
1754 compute_parity6(sh, UPDATE_PARITY);
1755 } else {
1756 dest = page_address(sh->dev[dd_idx].page);
1757 if (!nozero) memset(dest, 0, STRIPE_SIZE);
1758 count = 0;
1759 for (i = disks ; i--; ) {
1760 if (i == dd_idx || i == qd_idx)
1761 continue;
1762 p = page_address(sh->dev[i].page);
1763 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
1764 ptr[count++] = p;
1765 else
1766 printk("compute_block() %d, stripe %llu, %d"
1767 " not present\n", dd_idx,
1768 (unsigned long long)sh->sector, i);
1769
1770 check_xor();
1771 }
1772 if (count)
1773 xor_blocks(count, STRIPE_SIZE, dest, ptr);
1774 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1775 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1776 }
1777}
1778
1779/* Compute two missing blocks */
1780static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1781{
1782 int i, count, disks = sh->disks;
1783 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1784 int d0_idx = raid6_d0(sh);
1785 int faila = -1, failb = -1;
1786 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1787 void *ptrs[syndrome_disks+2];
1788
1789 for (i = 0; i < disks ; i++)
1790 ptrs[i] = (void *)raid6_empty_zero_page;
1791 count = 0;
1792 i = d0_idx;
1793 do {
1794 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1795
1796 ptrs[slot] = page_address(sh->dev[i].page);
1797
1798 if (i == dd_idx1)
1799 faila = slot;
1800 if (i == dd_idx2)
1801 failb = slot;
1802 i = raid6_next_disk(i, disks);
1803 } while (i != d0_idx);
1804 BUG_ON(count != syndrome_disks);
1805
1806 BUG_ON(faila == failb);
1807 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1808
1809 pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1810 (unsigned long long)sh->sector, dd_idx1, dd_idx2,
1811 faila, failb);
1812
1813 if (failb == syndrome_disks+1) {
1814 /* Q disk is one of the missing disks */
1815 if (faila == syndrome_disks) {
1816 /* Missing P+Q, just recompute */
1817 compute_parity6(sh, UPDATE_PARITY);
1818 return;
1819 } else {
1820 /* We're missing D+Q; recompute D from P */
1821 compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
1822 dd_idx2 : dd_idx1),
1823 0);
1824 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1825 return;
1826 }
1827 }
1828
1829 /* We're missing D+P or D+D; */
1830 if (failb == syndrome_disks) {
1831 /* We're missing D+P. */
1832 raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
1833 } else {
1834 /* We're missing D+D. */
1835 raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
1836 ptrs);
1837 }
1838
1839 /* Both the above update both missing blocks */
1840 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1841 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1842}
1843
1844static void 1931static void
1845schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, 1932schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
1846 int rcw, int expand) 1933 int rcw, int expand)
1847{ 1934{
1848 int i, pd_idx = sh->pd_idx, disks = sh->disks; 1935 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1936 raid5_conf_t *conf = sh->raid_conf;
1937 int level = conf->level;
1849 1938
1850 if (rcw) { 1939 if (rcw) {
1851 /* if we are not expanding this is a proper write request, and 1940 /* if we are not expanding this is a proper write request, and
@@ -1858,7 +1947,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1858 } else 1947 } else
1859 sh->reconstruct_state = reconstruct_state_run; 1948 sh->reconstruct_state = reconstruct_state_run;
1860 1949
1861 set_bit(STRIPE_OP_POSTXOR, &s->ops_request); 1950 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1862 1951
1863 for (i = disks; i--; ) { 1952 for (i = disks; i--; ) {
1864 struct r5dev *dev = &sh->dev[i]; 1953 struct r5dev *dev = &sh->dev[i];
@@ -1871,17 +1960,18 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1871 s->locked++; 1960 s->locked++;
1872 } 1961 }
1873 } 1962 }
1874 if (s->locked + 1 == disks) 1963 if (s->locked + conf->max_degraded == disks)
1875 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 1964 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1876 atomic_inc(&sh->raid_conf->pending_full_writes); 1965 atomic_inc(&conf->pending_full_writes);
1877 } else { 1966 } else {
1967 BUG_ON(level == 6);
1878 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 1968 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1879 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 1969 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1880 1970
1881 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 1971 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
1882 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 1972 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
1883 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 1973 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1884 set_bit(STRIPE_OP_POSTXOR, &s->ops_request); 1974 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1885 1975
1886 for (i = disks; i--; ) { 1976 for (i = disks; i--; ) {
1887 struct r5dev *dev = &sh->dev[i]; 1977 struct r5dev *dev = &sh->dev[i];
@@ -1899,13 +1989,22 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1899 } 1989 }
1900 } 1990 }
1901 1991
1902 /* keep the parity disk locked while asynchronous operations 1992 /* keep the parity disk(s) locked while asynchronous operations
1903 * are in flight 1993 * are in flight
1904 */ 1994 */
1905 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 1995 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1906 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1996 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1907 s->locked++; 1997 s->locked++;
1908 1998
1999 if (level == 6) {
2000 int qd_idx = sh->qd_idx;
2001 struct r5dev *dev = &sh->dev[qd_idx];
2002
2003 set_bit(R5_LOCKED, &dev->flags);
2004 clear_bit(R5_UPTODATE, &dev->flags);
2005 s->locked++;
2006 }
2007
1909 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2008 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
1910 __func__, (unsigned long long)sh->sector, 2009 __func__, (unsigned long long)sh->sector,
1911 s->locked, s->ops_request); 2010 s->locked, s->ops_request);
@@ -1986,13 +2085,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1986 2085
1987static void end_reshape(raid5_conf_t *conf); 2086static void end_reshape(raid5_conf_t *conf);
1988 2087
1989static int page_is_zero(struct page *p)
1990{
1991 char *a = page_address(p);
1992 return ((*(u32*)a) == 0 &&
1993 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1994}
1995
1996static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, 2088static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
1997 struct stripe_head *sh) 2089 struct stripe_head *sh)
1998{ 2090{
@@ -2133,9 +2225,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
2133 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2225 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2134 set_bit(R5_Wantcompute, &dev->flags); 2226 set_bit(R5_Wantcompute, &dev->flags);
2135 sh->ops.target = disk_idx; 2227 sh->ops.target = disk_idx;
2228 sh->ops.target2 = -1;
2136 s->req_compute = 1; 2229 s->req_compute = 1;
2137 /* Careful: from this point on 'uptodate' is in the eye 2230 /* Careful: from this point on 'uptodate' is in the eye
2138 * of raid5_run_ops which services 'compute' operations 2231 * of raid_run_ops which services 'compute' operations
2139 * before writes. R5_Wantcompute flags a block that will 2232 * before writes. R5_Wantcompute flags a block that will
2140 * be R5_UPTODATE by the time it is needed for a 2233 * be R5_UPTODATE by the time it is needed for a
2141 * subsequent operation. 2234 * subsequent operation.
@@ -2174,61 +2267,104 @@ static void handle_stripe_fill5(struct stripe_head *sh,
2174 set_bit(STRIPE_HANDLE, &sh->state); 2267 set_bit(STRIPE_HANDLE, &sh->state);
2175} 2268}
2176 2269
2177static void handle_stripe_fill6(struct stripe_head *sh, 2270/* fetch_block6 - checks the given member device to see if its data needs
2178 struct stripe_head_state *s, struct r6_state *r6s, 2271 * to be read or computed to satisfy a request.
2179 int disks) 2272 *
2273 * Returns 1 when no more member devices need to be checked, otherwise returns
2274 * 0 to tell the loop in handle_stripe_fill6 to continue
2275 */
2276static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2277 struct r6_state *r6s, int disk_idx, int disks)
2180{ 2278{
2181 int i; 2279 struct r5dev *dev = &sh->dev[disk_idx];
2182 for (i = disks; i--; ) { 2280 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
2183 struct r5dev *dev = &sh->dev[i]; 2281 &sh->dev[r6s->failed_num[1]] };
2184 if (!test_bit(R5_LOCKED, &dev->flags) && 2282
2185 !test_bit(R5_UPTODATE, &dev->flags) && 2283 if (!test_bit(R5_LOCKED, &dev->flags) &&
2186 (dev->toread || (dev->towrite && 2284 !test_bit(R5_UPTODATE, &dev->flags) &&
2187 !test_bit(R5_OVERWRITE, &dev->flags)) || 2285 (dev->toread ||
2188 s->syncing || s->expanding || 2286 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2189 (s->failed >= 1 && 2287 s->syncing || s->expanding ||
2190 (sh->dev[r6s->failed_num[0]].toread || 2288 (s->failed >= 1 &&
2191 s->to_write)) || 2289 (fdev[0]->toread || s->to_write)) ||
2192 (s->failed >= 2 && 2290 (s->failed >= 2 &&
2193 (sh->dev[r6s->failed_num[1]].toread || 2291 (fdev[1]->toread || s->to_write)))) {
2194 s->to_write)))) { 2292 /* we would like to get this block, possibly by computing it,
2195 /* we would like to get this block, possibly 2293 * otherwise read it if the backing disk is insync
2196 * by computing it, but we might not be able to 2294 */
2295 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2296 BUG_ON(test_bit(R5_Wantread, &dev->flags));
2297 if ((s->uptodate == disks - 1) &&
2298 (s->failed && (disk_idx == r6s->failed_num[0] ||
2299 disk_idx == r6s->failed_num[1]))) {
2300 /* have disk failed, and we're requested to fetch it;
2301 * do compute it
2197 */ 2302 */
2198 if ((s->uptodate == disks - 1) && 2303 pr_debug("Computing stripe %llu block %d\n",
2199 (s->failed && (i == r6s->failed_num[0] || 2304 (unsigned long long)sh->sector, disk_idx);
2200 i == r6s->failed_num[1]))) { 2305 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2201 pr_debug("Computing stripe %llu block %d\n", 2306 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2202 (unsigned long long)sh->sector, i); 2307 set_bit(R5_Wantcompute, &dev->flags);
2203 compute_block_1(sh, i, 0); 2308 sh->ops.target = disk_idx;
2204 s->uptodate++; 2309 sh->ops.target2 = -1; /* no 2nd target */
2205 } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { 2310 s->req_compute = 1;
2206 /* Computing 2-failure is *very* expensive; only 2311 s->uptodate++;
2207 * do it if failed >= 2 2312 return 1;
2208 */ 2313 } else if (s->uptodate == disks-2 && s->failed >= 2) {
2209 int other; 2314 /* Computing 2-failure is *very* expensive; only
2210 for (other = disks; other--; ) { 2315 * do it if failed >= 2
2211 if (other == i) 2316 */
2212 continue; 2317 int other;
2213 if (!test_bit(R5_UPTODATE, 2318 for (other = disks; other--; ) {
2214 &sh->dev[other].flags)) 2319 if (other == disk_idx)
2215 break; 2320 continue;
2216 } 2321 if (!test_bit(R5_UPTODATE,
2217 BUG_ON(other < 0); 2322 &sh->dev[other].flags))
2218 pr_debug("Computing stripe %llu blocks %d,%d\n", 2323 break;
2219 (unsigned long long)sh->sector,
2220 i, other);
2221 compute_block_2(sh, i, other);
2222 s->uptodate += 2;
2223 } else if (test_bit(R5_Insync, &dev->flags)) {
2224 set_bit(R5_LOCKED, &dev->flags);
2225 set_bit(R5_Wantread, &dev->flags);
2226 s->locked++;
2227 pr_debug("Reading block %d (sync=%d)\n",
2228 i, s->syncing);
2229 } 2324 }
2325 BUG_ON(other < 0);
2326 pr_debug("Computing stripe %llu blocks %d,%d\n",
2327 (unsigned long long)sh->sector,
2328 disk_idx, other);
2329 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2330 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2331 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
2332 set_bit(R5_Wantcompute, &sh->dev[other].flags);
2333 sh->ops.target = disk_idx;
2334 sh->ops.target2 = other;
2335 s->uptodate += 2;
2336 s->req_compute = 1;
2337 return 1;
2338 } else if (test_bit(R5_Insync, &dev->flags)) {
2339 set_bit(R5_LOCKED, &dev->flags);
2340 set_bit(R5_Wantread, &dev->flags);
2341 s->locked++;
2342 pr_debug("Reading block %d (sync=%d)\n",
2343 disk_idx, s->syncing);
2230 } 2344 }
2231 } 2345 }
2346
2347 return 0;
2348}
2349
2350/**
2351 * handle_stripe_fill6 - read or compute data to satisfy pending requests.
2352 */
2353static void handle_stripe_fill6(struct stripe_head *sh,
2354 struct stripe_head_state *s, struct r6_state *r6s,
2355 int disks)
2356{
2357 int i;
2358
2359 /* look for blocks to read/compute, skip this if a compute
2360 * is already in flight, or if the stripe contents are in the
2361 * midst of changing due to a write
2362 */
2363 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2364 !sh->reconstruct_state)
2365 for (i = disks; i--; )
2366 if (fetch_block6(sh, s, r6s, i, disks))
2367 break;
2232 set_bit(STRIPE_HANDLE, &sh->state); 2368 set_bit(STRIPE_HANDLE, &sh->state);
2233} 2369}
2234 2370
@@ -2362,114 +2498,61 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
2362 */ 2498 */
2363 /* since handle_stripe can be called at any time we need to handle the 2499 /* since handle_stripe can be called at any time we need to handle the
2364 * case where a compute block operation has been submitted and then a 2500 * case where a compute block operation has been submitted and then a
2365 * subsequent call wants to start a write request. raid5_run_ops only 2501 * subsequent call wants to start a write request. raid_run_ops only
2366 * handles the case where compute block and postxor are requested 2502 * handles the case where compute block and reconstruct are requested
2367 * simultaneously. If this is not the case then new writes need to be 2503 * simultaneously. If this is not the case then new writes need to be
2368 * held off until the compute completes. 2504 * held off until the compute completes.
2369 */ 2505 */
2370 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2506 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2371 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2507 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2372 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2508 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2373 schedule_reconstruction5(sh, s, rcw == 0, 0); 2509 schedule_reconstruction(sh, s, rcw == 0, 0);
2374} 2510}
2375 2511
2376static void handle_stripe_dirtying6(raid5_conf_t *conf, 2512static void handle_stripe_dirtying6(raid5_conf_t *conf,
2377 struct stripe_head *sh, struct stripe_head_state *s, 2513 struct stripe_head *sh, struct stripe_head_state *s,
2378 struct r6_state *r6s, int disks) 2514 struct r6_state *r6s, int disks)
2379{ 2515{
2380 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; 2516 int rcw = 0, pd_idx = sh->pd_idx, i;
2381 int qd_idx = sh->qd_idx; 2517 int qd_idx = sh->qd_idx;
2518
2519 set_bit(STRIPE_HANDLE, &sh->state);
2382 for (i = disks; i--; ) { 2520 for (i = disks; i--; ) {
2383 struct r5dev *dev = &sh->dev[i]; 2521 struct r5dev *dev = &sh->dev[i];
2384 /* Would I have to read this buffer for reconstruct_write */ 2522 /* check if we haven't enough data */
2385 if (!test_bit(R5_OVERWRITE, &dev->flags) 2523 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2386 && i != pd_idx && i != qd_idx 2524 i != pd_idx && i != qd_idx &&
2387 && (!test_bit(R5_LOCKED, &dev->flags) 2525 !test_bit(R5_LOCKED, &dev->flags) &&
2388 ) && 2526 !(test_bit(R5_UPTODATE, &dev->flags) ||
2389 !test_bit(R5_UPTODATE, &dev->flags)) { 2527 test_bit(R5_Wantcompute, &dev->flags))) {
2390 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2528 rcw++;
2391 else { 2529 if (!test_bit(R5_Insync, &dev->flags))
2392 pr_debug("raid6: must_compute: " 2530 continue; /* it's a failed drive */
2393 "disk %d flags=%#lx\n", i, dev->flags); 2531
2394 must_compute++; 2532 if (
2533 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2534 pr_debug("Read_old stripe %llu "
2535 "block %d for Reconstruct\n",
2536 (unsigned long long)sh->sector, i);
2537 set_bit(R5_LOCKED, &dev->flags);
2538 set_bit(R5_Wantread, &dev->flags);
2539 s->locked++;
2540 } else {
2541 pr_debug("Request delayed stripe %llu "
2542 "block %d for Reconstruct\n",
2543 (unsigned long long)sh->sector, i);
2544 set_bit(STRIPE_DELAYED, &sh->state);
2545 set_bit(STRIPE_HANDLE, &sh->state);
2395 } 2546 }
2396 } 2547 }
2397 } 2548 }
2398 pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
2399 (unsigned long long)sh->sector, rcw, must_compute);
2400 set_bit(STRIPE_HANDLE, &sh->state);
2401
2402 if (rcw > 0)
2403 /* want reconstruct write, but need to get some data */
2404 for (i = disks; i--; ) {
2405 struct r5dev *dev = &sh->dev[i];
2406 if (!test_bit(R5_OVERWRITE, &dev->flags)
2407 && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
2408 && !test_bit(R5_LOCKED, &dev->flags) &&
2409 !test_bit(R5_UPTODATE, &dev->flags) &&
2410 test_bit(R5_Insync, &dev->flags)) {
2411 if (
2412 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2413 pr_debug("Read_old stripe %llu "
2414 "block %d for Reconstruct\n",
2415 (unsigned long long)sh->sector, i);
2416 set_bit(R5_LOCKED, &dev->flags);
2417 set_bit(R5_Wantread, &dev->flags);
2418 s->locked++;
2419 } else {
2420 pr_debug("Request delayed stripe %llu "
2421 "block %d for Reconstruct\n",
2422 (unsigned long long)sh->sector, i);
2423 set_bit(STRIPE_DELAYED, &sh->state);
2424 set_bit(STRIPE_HANDLE, &sh->state);
2425 }
2426 }
2427 }
2428 /* now if nothing is locked, and if we have enough data, we can start a 2549 /* now if nothing is locked, and if we have enough data, we can start a
2429 * write request 2550 * write request
2430 */ 2551 */
2431 if (s->locked == 0 && rcw == 0 && 2552 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2553 s->locked == 0 && rcw == 0 &&
2432 !test_bit(STRIPE_BIT_DELAY, &sh->state)) { 2554 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2433 if (must_compute > 0) { 2555 schedule_reconstruction(sh, s, 1, 0);
2434 /* We have failed blocks and need to compute them */
2435 switch (s->failed) {
2436 case 0:
2437 BUG();
2438 case 1:
2439 compute_block_1(sh, r6s->failed_num[0], 0);
2440 break;
2441 case 2:
2442 compute_block_2(sh, r6s->failed_num[0],
2443 r6s->failed_num[1]);
2444 break;
2445 default: /* This request should have been failed? */
2446 BUG();
2447 }
2448 }
2449
2450 pr_debug("Computing parity for stripe %llu\n",
2451 (unsigned long long)sh->sector);
2452 compute_parity6(sh, RECONSTRUCT_WRITE);
2453 /* now every locked buffer is ready to be written */
2454 for (i = disks; i--; )
2455 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2456 pr_debug("Writing stripe %llu block %d\n",
2457 (unsigned long long)sh->sector, i);
2458 s->locked++;
2459 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2460 }
2461 if (s->locked == disks)
2462 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2463 atomic_inc(&conf->pending_full_writes);
2464 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2465 set_bit(STRIPE_INSYNC, &sh->state);
2466
2467 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2468 atomic_dec(&conf->preread_active_stripes);
2469 if (atomic_read(&conf->preread_active_stripes) <
2470 IO_THRESHOLD)
2471 md_wakeup_thread(conf->mddev->thread);
2472 }
2473 } 2556 }
2474} 2557}
2475 2558
@@ -2528,7 +2611,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2528 * we are done. Otherwise update the mismatch count and repair 2611 * we are done. Otherwise update the mismatch count and repair
2529 * parity if !MD_RECOVERY_CHECK 2612 * parity if !MD_RECOVERY_CHECK
2530 */ 2613 */
2531 if (sh->ops.zero_sum_result == 0) 2614 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
2532 /* parity is correct (on disc, 2615 /* parity is correct (on disc,
2533 * not in buffer any more) 2616 * not in buffer any more)
2534 */ 2617 */
@@ -2545,6 +2628,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2545 set_bit(R5_Wantcompute, 2628 set_bit(R5_Wantcompute,
2546 &sh->dev[sh->pd_idx].flags); 2629 &sh->dev[sh->pd_idx].flags);
2547 sh->ops.target = sh->pd_idx; 2630 sh->ops.target = sh->pd_idx;
2631 sh->ops.target2 = -1;
2548 s->uptodate++; 2632 s->uptodate++;
2549 } 2633 }
2550 } 2634 }
@@ -2561,67 +2645,74 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2561 2645
2562 2646
2563static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, 2647static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2564 struct stripe_head_state *s, 2648 struct stripe_head_state *s,
2565 struct r6_state *r6s, struct page *tmp_page, 2649 struct r6_state *r6s, int disks)
2566 int disks)
2567{ 2650{
2568 int update_p = 0, update_q = 0;
2569 struct r5dev *dev;
2570 int pd_idx = sh->pd_idx; 2651 int pd_idx = sh->pd_idx;
2571 int qd_idx = sh->qd_idx; 2652 int qd_idx = sh->qd_idx;
2653 struct r5dev *dev;
2572 2654
2573 set_bit(STRIPE_HANDLE, &sh->state); 2655 set_bit(STRIPE_HANDLE, &sh->state);
2574 2656
2575 BUG_ON(s->failed > 2); 2657 BUG_ON(s->failed > 2);
2576 BUG_ON(s->uptodate < disks); 2658
2577 /* Want to check and possibly repair P and Q. 2659 /* Want to check and possibly repair P and Q.
2578 * However there could be one 'failed' device, in which 2660 * However there could be one 'failed' device, in which
2579 * case we can only check one of them, possibly using the 2661 * case we can only check one of them, possibly using the
2580 * other to generate missing data 2662 * other to generate missing data
2581 */ 2663 */
2582 2664
2583 /* If !tmp_page, we cannot do the calculations, 2665 switch (sh->check_state) {
2584 * but as we have set STRIPE_HANDLE, we will soon be called 2666 case check_state_idle:
2585 * by stripe_handle with a tmp_page - just wait until then. 2667 /* start a new check operation if there are < 2 failures */
2586 */
2587 if (tmp_page) {
2588 if (s->failed == r6s->q_failed) { 2668 if (s->failed == r6s->q_failed) {
2589 /* The only possible failed device holds 'Q', so it 2669 /* The only possible failed device holds Q, so it
2590 * makes sense to check P (If anything else were failed, 2670 * makes sense to check P (If anything else were failed,
2591 * we would have used P to recreate it). 2671 * we would have used P to recreate it).
2592 */ 2672 */
2593 compute_block_1(sh, pd_idx, 1); 2673 sh->check_state = check_state_run;
2594 if (!page_is_zero(sh->dev[pd_idx].page)) {
2595 compute_block_1(sh, pd_idx, 0);
2596 update_p = 1;
2597 }
2598 } 2674 }
2599 if (!r6s->q_failed && s->failed < 2) { 2675 if (!r6s->q_failed && s->failed < 2) {
2600 /* q is not failed, and we didn't use it to generate 2676 /* Q is not failed, and we didn't use it to generate
2601 * anything, so it makes sense to check it 2677 * anything, so it makes sense to check it
2602 */ 2678 */
2603 memcpy(page_address(tmp_page), 2679 if (sh->check_state == check_state_run)
2604 page_address(sh->dev[qd_idx].page), 2680 sh->check_state = check_state_run_pq;
2605 STRIPE_SIZE); 2681 else
2606 compute_parity6(sh, UPDATE_PARITY); 2682 sh->check_state = check_state_run_q;
2607 if (memcmp(page_address(tmp_page),
2608 page_address(sh->dev[qd_idx].page),
2609 STRIPE_SIZE) != 0) {
2610 clear_bit(STRIPE_INSYNC, &sh->state);
2611 update_q = 1;
2612 }
2613 } 2683 }
2614 if (update_p || update_q) { 2684
2615 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2685 /* discard potentially stale zero_sum_result */
2616 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2686 sh->ops.zero_sum_result = 0;
2617 /* don't try to repair!! */ 2687
2618 update_p = update_q = 0; 2688 if (sh->check_state == check_state_run) {
2689 /* async_xor_zero_sum destroys the contents of P */
2690 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2691 s->uptodate--;
2619 } 2692 }
2693 if (sh->check_state >= check_state_run &&
2694 sh->check_state <= check_state_run_pq) {
2695 /* async_syndrome_zero_sum preserves P and Q, so
2696 * no need to mark them !uptodate here
2697 */
2698 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2699 break;
2700 }
2701
2702 /* we have 2-disk failure */
2703 BUG_ON(s->failed != 2);
2704 /* fall through */
2705 case check_state_compute_result:
2706 sh->check_state = check_state_idle;
2707
2708 /* check that a write has not made the stripe insync */
2709 if (test_bit(STRIPE_INSYNC, &sh->state))
2710 break;
2620 2711
2621 /* now write out any block on a failed drive, 2712 /* now write out any block on a failed drive,
2622 * or P or Q if they need it 2713 * or P or Q if they were recomputed
2623 */ 2714 */
2624 2715 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2625 if (s->failed == 2) { 2716 if (s->failed == 2) {
2626 dev = &sh->dev[r6s->failed_num[1]]; 2717 dev = &sh->dev[r6s->failed_num[1]];
2627 s->locked++; 2718 s->locked++;
@@ -2634,14 +2725,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2634 set_bit(R5_LOCKED, &dev->flags); 2725 set_bit(R5_LOCKED, &dev->flags);
2635 set_bit(R5_Wantwrite, &dev->flags); 2726 set_bit(R5_Wantwrite, &dev->flags);
2636 } 2727 }
2637 2728 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2638 if (update_p) {
2639 dev = &sh->dev[pd_idx]; 2729 dev = &sh->dev[pd_idx];
2640 s->locked++; 2730 s->locked++;
2641 set_bit(R5_LOCKED, &dev->flags); 2731 set_bit(R5_LOCKED, &dev->flags);
2642 set_bit(R5_Wantwrite, &dev->flags); 2732 set_bit(R5_Wantwrite, &dev->flags);
2643 } 2733 }
2644 if (update_q) { 2734 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2645 dev = &sh->dev[qd_idx]; 2735 dev = &sh->dev[qd_idx];
2646 s->locked++; 2736 s->locked++;
2647 set_bit(R5_LOCKED, &dev->flags); 2737 set_bit(R5_LOCKED, &dev->flags);
@@ -2650,6 +2740,70 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2650 clear_bit(STRIPE_DEGRADED, &sh->state); 2740 clear_bit(STRIPE_DEGRADED, &sh->state);
2651 2741
2652 set_bit(STRIPE_INSYNC, &sh->state); 2742 set_bit(STRIPE_INSYNC, &sh->state);
2743 break;
2744 case check_state_run:
2745 case check_state_run_q:
2746 case check_state_run_pq:
2747 break; /* we will be called again upon completion */
2748 case check_state_check_result:
2749 sh->check_state = check_state_idle;
2750
2751 /* handle a successful check operation, if parity is correct
2752 * we are done. Otherwise update the mismatch count and repair
2753 * parity if !MD_RECOVERY_CHECK
2754 */
2755 if (sh->ops.zero_sum_result == 0) {
2756 /* both parities are correct */
2757 if (!s->failed)
2758 set_bit(STRIPE_INSYNC, &sh->state);
2759 else {
2760 /* in contrast to the raid5 case we can validate
2761 * parity, but still have a failure to write
2762 * back
2763 */
2764 sh->check_state = check_state_compute_result;
2765 /* Returning at this point means that we may go
2766 * off and bring p and/or q uptodate again so
2767 * we make sure to check zero_sum_result again
2768 * to verify if p or q need writeback
2769 */
2770 }
2771 } else {
2772 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2773 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2774 /* don't try to repair!! */
2775 set_bit(STRIPE_INSYNC, &sh->state);
2776 else {
2777 int *target = &sh->ops.target;
2778
2779 sh->ops.target = -1;
2780 sh->ops.target2 = -1;
2781 sh->check_state = check_state_compute_run;
2782 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2783 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2784 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2785 set_bit(R5_Wantcompute,
2786 &sh->dev[pd_idx].flags);
2787 *target = pd_idx;
2788 target = &sh->ops.target2;
2789 s->uptodate++;
2790 }
2791 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2792 set_bit(R5_Wantcompute,
2793 &sh->dev[qd_idx].flags);
2794 *target = qd_idx;
2795 s->uptodate++;
2796 }
2797 }
2798 }
2799 break;
2800 case check_state_compute_run:
2801 break;
2802 default:
2803 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2804 __func__, sh->check_state,
2805 (unsigned long long) sh->sector);
2806 BUG();
2653 } 2807 }
2654} 2808}
2655 2809
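
Every repair decision in the raid6 check state machine above keys off the bit-mask that the asynchronous zero-sum operation leaves in sh->ops.zero_sum_result. A minimal sketch of how that mask is read, assuming only the SUM_CHECK_* flag names used in the hunk; the helper and its messages are hypothetical:

#include <linux/kernel.h>
#include <linux/async_tx.h>

/* Interpret the combined P/Q zero-sum result for one stripe. */
static void report_parity_check(enum sum_check_flags zero_sum_result)
{
	if (!zero_sum_result) {
		pr_debug("P and Q both verified, stripe is in sync\n");
		return;
	}
	if (zero_sum_result & SUM_CHECK_P_RESULT)
		pr_debug("P mismatch: recompute and rewrite the P block\n");
	if (zero_sum_result & SUM_CHECK_Q_RESULT)
		pr_debug("Q mismatch: recompute and rewrite the Q block\n");
}

Because the two bits are independent, one check pass can schedule a P-only, Q-only, or dual recompute, which is exactly what the check_state_check_result arm does via target and target2.
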
@@ -2667,6 +2821,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2667 if (i != sh->pd_idx && i != sh->qd_idx) { 2821 if (i != sh->pd_idx && i != sh->qd_idx) {
2668 int dd_idx, j; 2822 int dd_idx, j;
2669 struct stripe_head *sh2; 2823 struct stripe_head *sh2;
2824 struct async_submit_ctl submit;
2670 2825
2671 sector_t bn = compute_blocknr(sh, i, 1); 2826 sector_t bn = compute_blocknr(sh, i, 1);
2672 sector_t s = raid5_compute_sector(conf, bn, 0, 2827 sector_t s = raid5_compute_sector(conf, bn, 0,
@@ -2686,9 +2841,10 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2686 } 2841 }
2687 2842
2688 /* place all the copies on one channel */ 2843 /* place all the copies on one channel */
2844 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
2689 tx = async_memcpy(sh2->dev[dd_idx].page, 2845 tx = async_memcpy(sh2->dev[dd_idx].page,
2690 sh->dev[i].page, 0, 0, STRIPE_SIZE, 2846 sh->dev[i].page, 0, 0, STRIPE_SIZE,
2691 ASYNC_TX_DEP_ACK, tx, NULL, NULL); 2847 &submit);
2692 2848
2693 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 2849 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2694 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2850 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
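
The hunk above also shows the new async_tx calling convention: the dependency chain, flags, and completion callback now travel in a struct async_submit_ctl rather than being passed to every async_* call individually. A minimal sketch of that pattern, mirroring the async_memcpy() call made here; the wrapper name and its page arguments are placeholders:

#include <linux/async_tx.h>

/* Chain one page copy behind an earlier async_tx descriptor. */
static struct dma_async_tx_descriptor *
copy_stripe_page(struct page *dest, struct page *src, size_t len,
		 struct dma_async_tx_descriptor *depend_tx)
{
	struct async_submit_ctl submit;

	/* flags=0, depend on depend_tx, no callback, no scribble space */
	init_async_submit(&submit, 0, depend_tx, NULL, NULL, NULL);
	return async_memcpy(dest, src, 0, 0, len, &submit);
}

Passing NULL scribble space is fine for a plain copy; the xor/pq routines use that slot for address-conversion lists.
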
@@ -2974,7 +3130,7 @@ static bool handle_stripe5(struct stripe_head *sh)
2974 /* Need to write out all blocks after computing parity */ 3130 /* Need to write out all blocks after computing parity */
2975 sh->disks = conf->raid_disks; 3131 sh->disks = conf->raid_disks;
2976 stripe_set_idx(sh->sector, conf, 0, sh); 3132 stripe_set_idx(sh->sector, conf, 0, sh);
2977 schedule_reconstruction5(sh, &s, 1, 1); 3133 schedule_reconstruction(sh, &s, 1, 1);
2978 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3134 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
2979 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3135 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2980 atomic_dec(&conf->reshape_stripes); 3136 atomic_dec(&conf->reshape_stripes);
@@ -2994,7 +3150,7 @@ static bool handle_stripe5(struct stripe_head *sh)
2994 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3150 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2995 3151
2996 if (s.ops_request) 3152 if (s.ops_request)
2997 raid5_run_ops(sh, s.ops_request); 3153 raid_run_ops(sh, s.ops_request);
2998 3154
2999 ops_run_io(sh, &s); 3155 ops_run_io(sh, &s);
3000 3156
@@ -3003,7 +3159,7 @@ static bool handle_stripe5(struct stripe_head *sh)
3003 return blocked_rdev == NULL; 3159 return blocked_rdev == NULL;
3004} 3160}
3005 3161
3006static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 3162static bool handle_stripe6(struct stripe_head *sh)
3007{ 3163{
3008 raid5_conf_t *conf = sh->raid_conf; 3164 raid5_conf_t *conf = sh->raid_conf;
3009 int disks = sh->disks; 3165 int disks = sh->disks;
@@ -3015,9 +3171,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3015 mdk_rdev_t *blocked_rdev = NULL; 3171 mdk_rdev_t *blocked_rdev = NULL;
3016 3172
3017 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3173 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3018 "pd_idx=%d, qd_idx=%d\n", 3174 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
3019 (unsigned long long)sh->sector, sh->state, 3175 (unsigned long long)sh->sector, sh->state,
3020 atomic_read(&sh->count), pd_idx, qd_idx); 3176 atomic_read(&sh->count), pd_idx, qd_idx,
3177 sh->check_state, sh->reconstruct_state);
3021 memset(&s, 0, sizeof(s)); 3178 memset(&s, 0, sizeof(s));
3022 3179
3023 spin_lock(&sh->lock); 3180 spin_lock(&sh->lock);
@@ -3037,35 +3194,24 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3037 3194
3038 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3195 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3039 i, dev->flags, dev->toread, dev->towrite, dev->written); 3196 i, dev->flags, dev->toread, dev->towrite, dev->written);
3040 /* maybe we can reply to a read */ 3197 /* maybe we can reply to a read
3041 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { 3198 *
3042 struct bio *rbi, *rbi2; 3199 * new wantfill requests are only permitted while
3043 pr_debug("Return read for disc %d\n", i); 3200 * ops_complete_biofill is guaranteed to be inactive
3044 spin_lock_irq(&conf->device_lock); 3201 */
3045 rbi = dev->toread; 3202 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3046 dev->toread = NULL; 3203 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3047 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 3204 set_bit(R5_Wantfill, &dev->flags);
3048 wake_up(&conf->wait_for_overlap);
3049 spin_unlock_irq(&conf->device_lock);
3050 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
3051 copy_data(0, rbi, dev->page, dev->sector);
3052 rbi2 = r5_next_bio(rbi, dev->sector);
3053 spin_lock_irq(&conf->device_lock);
3054 if (!raid5_dec_bi_phys_segments(rbi)) {
3055 rbi->bi_next = return_bi;
3056 return_bi = rbi;
3057 }
3058 spin_unlock_irq(&conf->device_lock);
3059 rbi = rbi2;
3060 }
3061 }
3062 3205
3063 /* now count some things */ 3206 /* now count some things */
3064 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3207 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
3065 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3208 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
3209 if (test_bit(R5_Wantcompute, &dev->flags))
3210 BUG_ON(++s.compute > 2);
3066 3211
3067 3212 if (test_bit(R5_Wantfill, &dev->flags)) {
3068 if (dev->toread) 3213 s.to_fill++;
3214 } else if (dev->toread)
3069 s.to_read++; 3215 s.to_read++;
3070 if (dev->towrite) { 3216 if (dev->towrite) {
3071 s.to_write++; 3217 s.to_write++;
@@ -3106,6 +3252,11 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3106 blocked_rdev = NULL; 3252 blocked_rdev = NULL;
3107 } 3253 }
3108 3254
3255 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3256 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3257 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3258 }
3259
3109 pr_debug("locked=%d uptodate=%d to_read=%d" 3260 pr_debug("locked=%d uptodate=%d to_read=%d"
3110 " to_write=%d failed=%d failed_num=%d,%d\n", 3261 " to_write=%d failed=%d failed_num=%d,%d\n",
3111 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3262 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
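
As the new comment notes, read completion is no longer serviced inline under device_lock: R5_Wantfill candidates are gathered only while no biofill can be running, and a single STRIPE_OP_BIOFILL request then services the whole stripe. The same two-step logic, pulled into a standalone helper for clarity (the function is hypothetical and elides the surrounding locking; the types and flags are the driver's own):

static void queue_stripe_biofill(struct stripe_head *sh,
				 struct stripe_head_state *s)
{
	int i;

	/* gather only while ops_complete_biofill is guaranteed inactive */
	if (test_bit(STRIPE_BIOFILL_RUN, &sh->state))
		return;

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
			set_bit(R5_Wantfill, &dev->flags);
			s->to_fill++;
		}
	}

	/* one biofill operation copies data out to every flagged bio */
	if (s->to_fill) {
		set_bit(STRIPE_OP_BIOFILL, &s->ops_request);
		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
	}
}
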
@@ -3146,19 +3297,62 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3146 * or to load a block that is being partially written. 3297 * or to load a block that is being partially written.
3147 */ 3298 */
3148 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 3299 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3149 (s.syncing && (s.uptodate < disks)) || s.expanding) 3300 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3150 handle_stripe_fill6(sh, &s, &r6s, disks); 3301 handle_stripe_fill6(sh, &s, &r6s, disks);
3151 3302
3152 /* now to consider writing and what else, if anything should be read */ 3303 /* Now we check to see if any write operations have recently
3153 if (s.to_write) 3304 * completed
3305 */
3306 if (sh->reconstruct_state == reconstruct_state_drain_result) {
3307 int qd_idx = sh->qd_idx;
3308
3309 sh->reconstruct_state = reconstruct_state_idle;
3310 /* All the 'written' buffers and the parity blocks are ready to
3311 * be written back to disk
3312 */
3313 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3314 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags));
3315 for (i = disks; i--; ) {
3316 dev = &sh->dev[i];
3317 if (test_bit(R5_LOCKED, &dev->flags) &&
3318 (i == sh->pd_idx || i == qd_idx ||
3319 dev->written)) {
3320 pr_debug("Writing block %d\n", i);
3321 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
3322 set_bit(R5_Wantwrite, &dev->flags);
3323 if (!test_bit(R5_Insync, &dev->flags) ||
3324 ((i == sh->pd_idx || i == qd_idx) &&
3325 s.failed == 0))
3326 set_bit(STRIPE_INSYNC, &sh->state);
3327 }
3328 }
3329 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
3330 atomic_dec(&conf->preread_active_stripes);
3331 if (atomic_read(&conf->preread_active_stripes) <
3332 IO_THRESHOLD)
3333 md_wakeup_thread(conf->mddev->thread);
3334 }
3335 }
3336
3337 /* Now to consider new write requests and what else, if anything
3338 * should be read. We do not handle new writes when:
3339 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight.
3340 * 2/ A 'check' operation is in flight, as it may clobber the parity
3341 * block.
3342 */
3343 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3154 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); 3344 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
3155 3345
3156 /* maybe we need to check and possibly fix the parity for this stripe 3346 /* maybe we need to check and possibly fix the parity for this stripe
3157 * Any reads will already have been scheduled, so we just see if enough 3347 * Any reads will already have been scheduled, so we just see if enough
3158 * data is available 3348 * data is available. The parity check is held off while parity
3349 * dependent operations are in flight.
3159 */ 3350 */
3160 if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) 3351 if (sh->check_state ||
3161 handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); 3352 (s.syncing && s.locked == 0 &&
3353 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3354 !test_bit(STRIPE_INSYNC, &sh->state)))
3355 handle_parity_checks6(conf, sh, &s, &r6s, disks);
3162 3356
3163 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3357 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3164 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3358 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
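
The two comments above describe the new ordering constraints: fresh writes are deferred while a drain+gen_syndrome or a check could clobber the parity blocks, and the parity check itself waits for parity-dependent operations to retire. Restated as predicates purely for readability (the helper names are hypothetical; the conditions mirror handle_stripe6() above):

static bool may_start_new_writes(struct stripe_head *sh)
{
	/* a reconstruct or a check in flight may clobber parity */
	return !sh->reconstruct_state && !sh->check_state;
}

static bool should_run_parity_check(struct stripe_head *sh,
				    struct stripe_head_state *s)
{
	return sh->check_state ||		/* finish an in-progress check */
	       (s->syncing && s->locked == 0 &&
		!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
		!test_bit(STRIPE_INSYNC, &sh->state));
}
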
@@ -3179,15 +3373,29 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3179 set_bit(R5_Wantwrite, &dev->flags); 3373 set_bit(R5_Wantwrite, &dev->flags);
3180 set_bit(R5_ReWrite, &dev->flags); 3374 set_bit(R5_ReWrite, &dev->flags);
3181 set_bit(R5_LOCKED, &dev->flags); 3375 set_bit(R5_LOCKED, &dev->flags);
3376 s.locked++;
3182 } else { 3377 } else {
3183 /* let's read it back */ 3378 /* let's read it back */
3184 set_bit(R5_Wantread, &dev->flags); 3379 set_bit(R5_Wantread, &dev->flags);
3185 set_bit(R5_LOCKED, &dev->flags); 3380 set_bit(R5_LOCKED, &dev->flags);
3381 s.locked++;
3186 } 3382 }
3187 } 3383 }
3188 } 3384 }
3189 3385
3190 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 3386 /* Finish reconstruct operations initiated by the expansion process */
3387 if (sh->reconstruct_state == reconstruct_state_result) {
3388 sh->reconstruct_state = reconstruct_state_idle;
3389 clear_bit(STRIPE_EXPANDING, &sh->state);
3390 for (i = conf->raid_disks; i--; ) {
3391 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3392 set_bit(R5_LOCKED, &sh->dev[i].flags);
3393 s.locked++;
3394 }
3395 }
3396
3397 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3398 !sh->reconstruct_state) {
3191 struct stripe_head *sh2 3399 struct stripe_head *sh2
3192 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3400 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3193 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { 3401 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
@@ -3208,14 +3416,8 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3208 /* Need to write out all blocks after computing P&Q */ 3416 /* Need to write out all blocks after computing P&Q */
3209 sh->disks = conf->raid_disks; 3417 sh->disks = conf->raid_disks;
3210 stripe_set_idx(sh->sector, conf, 0, sh); 3418 stripe_set_idx(sh->sector, conf, 0, sh);
3211 compute_parity6(sh, RECONSTRUCT_WRITE); 3419 schedule_reconstruction(sh, &s, 1, 1);
3212 for (i = conf->raid_disks ; i-- ; ) { 3420 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3213 set_bit(R5_LOCKED, &sh->dev[i].flags);
3214 s.locked++;
3215 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3216 }
3217 clear_bit(STRIPE_EXPANDING, &sh->state);
3218 } else if (s.expanded) {
3219 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3421 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3220 atomic_dec(&conf->reshape_stripes); 3422 atomic_dec(&conf->reshape_stripes);
3221 wake_up(&conf->wait_for_overlap); 3423 wake_up(&conf->wait_for_overlap);
@@ -3233,6 +3435,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3233 if (unlikely(blocked_rdev)) 3435 if (unlikely(blocked_rdev))
3234 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3436 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3235 3437
3438 if (s.ops_request)
3439 raid_run_ops(sh, s.ops_request);
3440
3236 ops_run_io(sh, &s); 3441 ops_run_io(sh, &s);
3237 3442
3238 return_io(return_bi); 3443 return_io(return_bi);
@@ -3241,16 +3446,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3241} 3446}
3242 3447
3243/* returns true if the stripe was handled */ 3448/* returns true if the stripe was handled */
3244static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) 3449static bool handle_stripe(struct stripe_head *sh)
3245{ 3450{
3246 if (sh->raid_conf->level == 6) 3451 if (sh->raid_conf->level == 6)
3247 return handle_stripe6(sh, tmp_page); 3452 return handle_stripe6(sh);
3248 else 3453 else
3249 return handle_stripe5(sh); 3454 return handle_stripe5(sh);
3250} 3455}
3251 3456
3252
3253
3254static void raid5_activate_delayed(raid5_conf_t *conf) 3457static void raid5_activate_delayed(raid5_conf_t *conf)
3255{ 3458{
3256 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3459 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
@@ -4046,7 +4249,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
4046 spin_unlock(&sh->lock); 4249 spin_unlock(&sh->lock);
4047 4250
4048 /* wait for any blocked device to be handled */ 4251 /* wait for any blocked device to be handled */
4049 while(unlikely(!handle_stripe(sh, NULL))) 4252 while (unlikely(!handle_stripe(sh)))
4050 ; 4253 ;
4051 release_stripe(sh); 4254 release_stripe(sh);
4052 4255
@@ -4103,7 +4306,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4103 return handled; 4306 return handled;
4104 } 4307 }
4105 4308
4106 handle_stripe(sh, NULL); 4309 handle_stripe(sh);
4107 release_stripe(sh); 4310 release_stripe(sh);
4108 handled++; 4311 handled++;
4109 } 4312 }
@@ -4117,6 +4320,36 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4117 return handled; 4320 return handled;
4118} 4321}
4119 4322
4323#ifdef CONFIG_MULTICORE_RAID456
4324static void __process_stripe(void *param, async_cookie_t cookie)
4325{
4326 struct stripe_head *sh = param;
4327
4328 handle_stripe(sh);
4329 release_stripe(sh);
4330}
4331
4332static void process_stripe(struct stripe_head *sh, struct list_head *domain)
4333{
4334 async_schedule_domain(__process_stripe, sh, domain);
4335}
4336
4337static void synchronize_stripe_processing(struct list_head *domain)
4338{
4339 async_synchronize_full_domain(domain);
4340}
4341#else
4342static void process_stripe(struct stripe_head *sh, struct list_head *domain)
4343{
4344 handle_stripe(sh);
4345 release_stripe(sh);
4346 cond_resched();
4347}
4348
4349static void synchronize_stripe_processing(struct list_head *domain)
4350{
4351}
4352#endif
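
The multicore path is built on the async function-call API: each stripe becomes a cookie-ordered work item scheduled into a private domain, and raid5d later waits on just that domain before issuing pending DMA. A minimal, self-contained sketch of that API as it existed in this kernel generation, where the domain is a plain list_head (later kernels wrap it in struct async_domain); the worker and its payload are hypothetical:

#include <linux/async.h>
#include <linux/kernel.h>
#include <linux/list.h>

static void do_unit_of_work(void *data, async_cookie_t cookie)
{
	/* runs in a pool thread once scheduled */
	pr_debug("processing item %p (cookie %llu)\n",
		 data, (unsigned long long)cookie);
}

static void process_batch(void **items, int nr)
{
	LIST_HEAD(domain);	/* private synchronization domain */
	int i;

	for (i = 0; i < nr; i++)
		async_schedule_domain(do_unit_of_work, items[i], &domain);

	/* wait only for work queued in this domain, not all async work */
	async_synchronize_full_domain(&domain);
}
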
4120 4353
4121 4354
4122/* 4355/*
@@ -4131,6 +4364,7 @@ static void raid5d(mddev_t *mddev)
4131 struct stripe_head *sh; 4364 struct stripe_head *sh;
4132 raid5_conf_t *conf = mddev_to_conf(mddev); 4365 raid5_conf_t *conf = mddev_to_conf(mddev);
4133 int handled; 4366 int handled;
4367 LIST_HEAD(raid_domain);
4134 4368
4135 pr_debug("+++ raid5d active\n"); 4369 pr_debug("+++ raid5d active\n");
4136 4370
@@ -4167,8 +4401,7 @@ static void raid5d(mddev_t *mddev)
4167 spin_unlock_irq(&conf->device_lock); 4401 spin_unlock_irq(&conf->device_lock);
4168 4402
4169 handled++; 4403 handled++;
4170 handle_stripe(sh, conf->spare_page); 4404 process_stripe(sh, &raid_domain);
4171 release_stripe(sh);
4172 4405
4173 spin_lock_irq(&conf->device_lock); 4406 spin_lock_irq(&conf->device_lock);
4174 } 4407 }
@@ -4176,6 +4409,7 @@ static void raid5d(mddev_t *mddev)
4176 4409
4177 spin_unlock_irq(&conf->device_lock); 4410 spin_unlock_irq(&conf->device_lock);
4178 4411
4412 synchronize_stripe_processing(&raid_domain);
4179 async_tx_issue_pending_all(); 4413 async_tx_issue_pending_all();
4180 unplug_slaves(mddev); 4414 unplug_slaves(mddev);
4181 4415
@@ -4308,6 +4542,118 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4308 return sectors * (raid_disks - conf->max_degraded); 4542 return sectors * (raid_disks - conf->max_degraded);
4309} 4543}
4310 4544
4545static void raid5_free_percpu(raid5_conf_t *conf)
4546{
4547 struct raid5_percpu *percpu;
4548 unsigned long cpu;
4549
4550 if (!conf->percpu)
4551 return;
4552
4553 get_online_cpus();
4554 for_each_possible_cpu(cpu) {
4555 percpu = per_cpu_ptr(conf->percpu, cpu);
4556 safe_put_page(percpu->spare_page);
4557 kfree(percpu->scribble);
4558 }
4559#ifdef CONFIG_HOTPLUG_CPU
4560 unregister_cpu_notifier(&conf->cpu_notify);
4561#endif
4562 put_online_cpus();
4563
4564 free_percpu(conf->percpu);
4565}
4566
4567static void free_conf(raid5_conf_t *conf)
4568{
4569 shrink_stripes(conf);
4570 raid5_free_percpu(conf);
4571 kfree(conf->disks);
4572 kfree(conf->stripe_hashtbl);
4573 kfree(conf);
4574}
4575
4576#ifdef CONFIG_HOTPLUG_CPU
4577static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
4578 void *hcpu)
4579{
4580 raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
4581 long cpu = (long)hcpu;
4582 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
4583
4584 switch (action) {
4585 case CPU_UP_PREPARE:
4586 case CPU_UP_PREPARE_FROZEN:
4587 if (conf->level == 6 && !percpu->spare_page)
4588 percpu->spare_page = alloc_page(GFP_KERNEL);
4589 if (!percpu->scribble)
4590 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4591
4592 if (!percpu->scribble ||
4593 (conf->level == 6 && !percpu->spare_page)) {
4594 safe_put_page(percpu->spare_page);
4595 kfree(percpu->scribble);
4596 pr_err("%s: failed memory allocation for cpu%ld\n",
4597 __func__, cpu);
4598 return NOTIFY_BAD;
4599 }
4600 break;
4601 case CPU_DEAD:
4602 case CPU_DEAD_FROZEN:
4603 safe_put_page(percpu->spare_page);
4604 kfree(percpu->scribble);
4605 percpu->spare_page = NULL;
4606 percpu->scribble = NULL;
4607 break;
4608 default:
4609 break;
4610 }
4611 return NOTIFY_OK;
4612}
4613#endif
4614
4615static int raid5_alloc_percpu(raid5_conf_t *conf)
4616{
4617 unsigned long cpu;
4618 struct page *spare_page;
4619 struct raid5_percpu *allcpus;
4620 void *scribble;
4621 int err;
4622
4623 allcpus = alloc_percpu(struct raid5_percpu);
4624 if (!allcpus)
4625 return -ENOMEM;
4626 conf->percpu = allcpus;
4627
4628 get_online_cpus();
4629 err = 0;
4630 for_each_present_cpu(cpu) {
4631 if (conf->level == 6) {
4632 spare_page = alloc_page(GFP_KERNEL);
4633 if (!spare_page) {
4634 err = -ENOMEM;
4635 break;
4636 }
4637 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
4638 }
4639 scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL);
4640 if (!scribble) {
4641 err = -ENOMEM;
4642 break;
4643 }
4644 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
4645 }
4646#ifdef CONFIG_HOTPLUG_CPU
4647 conf->cpu_notify.notifier_call = raid456_cpu_notify;
4648 conf->cpu_notify.priority = 0;
4649 if (err == 0)
4650 err = register_cpu_notifier(&conf->cpu_notify);
4651#endif
4652 put_online_cpus();
4653
4654 return err;
4655}
4656
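
raid5_alloc_percpu() follows the usual pattern for per-CPU resources that must track CPU hotplug: populate entries for the CPUs present at setup, then register a notifier that allocates on CPU_UP_PREPARE and frees on CPU_DEAD. A stripped-down sketch of that pattern with a made-up payload so it stands alone (names and sizes are placeholders; error cleanup is left to the teardown path, as in the patch):

#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/slab.h>

#define MY_BUF_SIZE 4096		/* placeholder payload size */

struct my_percpu {			/* hypothetical per-CPU payload */
	void *buffer;
};

static struct my_percpu *my_pcpu;

static int my_cpu_notify(struct notifier_block *nb, unsigned long action,
			 void *hcpu)
{
	struct my_percpu *p = per_cpu_ptr(my_pcpu, (long)hcpu);

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		p->buffer = kmalloc(MY_BUF_SIZE, GFP_KERNEL);
		if (!p->buffer)
			return NOTIFY_BAD;
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		kfree(p->buffer);
		p->buffer = NULL;
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb = { .notifier_call = my_cpu_notify };

static int my_alloc_percpu(void)
{
	unsigned long cpu;
	int err = 0;

	my_pcpu = alloc_percpu(struct my_percpu);
	if (!my_pcpu)
		return -ENOMEM;

	get_online_cpus();		/* keep the set of CPUs stable */
	for_each_present_cpu(cpu) {
		void *buf = kmalloc(MY_BUF_SIZE, GFP_KERNEL);

		if (!buf) {
			err = -ENOMEM;
			break;
		}
		per_cpu_ptr(my_pcpu, cpu)->buffer = buf;
	}
	if (!err)
		err = register_cpu_notifier(&my_cpu_nb);
	put_online_cpus();

	return err;
}
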
4311static raid5_conf_t *setup_conf(mddev_t *mddev) 4657static raid5_conf_t *setup_conf(mddev_t *mddev)
4312{ 4658{
4313 raid5_conf_t *conf; 4659 raid5_conf_t *conf;
@@ -4347,6 +4693,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4347 goto abort; 4693 goto abort;
4348 4694
4349 conf->raid_disks = mddev->raid_disks; 4695 conf->raid_disks = mddev->raid_disks;
4696 conf->scribble_len = scribble_len(conf->raid_disks);
4350 if (mddev->reshape_position == MaxSector) 4697 if (mddev->reshape_position == MaxSector)
4351 conf->previous_raid_disks = mddev->raid_disks; 4698 conf->previous_raid_disks = mddev->raid_disks;
4352 else 4699 else
@@ -4362,11 +4709,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4362 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4709 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4363 goto abort; 4710 goto abort;
4364 4711
4365 if (mddev->new_level == 6) { 4712 conf->level = mddev->new_level;
4366 conf->spare_page = alloc_page(GFP_KERNEL); 4713 if (raid5_alloc_percpu(conf) != 0)
4367 if (!conf->spare_page) 4714 goto abort;
4368 goto abort; 4715
4369 }
4370 spin_lock_init(&conf->device_lock); 4716 spin_lock_init(&conf->device_lock);
4371 init_waitqueue_head(&conf->wait_for_stripe); 4717 init_waitqueue_head(&conf->wait_for_stripe);
4372 init_waitqueue_head(&conf->wait_for_overlap); 4718 init_waitqueue_head(&conf->wait_for_overlap);
@@ -4402,7 +4748,6 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4402 } 4748 }
4403 4749
4404 conf->chunk_size = mddev->new_chunk; 4750 conf->chunk_size = mddev->new_chunk;
4405 conf->level = mddev->new_level;
4406 if (conf->level == 6) 4751 if (conf->level == 6)
4407 conf->max_degraded = 2; 4752 conf->max_degraded = 2;
4408 else 4753 else
@@ -4437,11 +4782,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4437 4782
4438 abort: 4783 abort:
4439 if (conf) { 4784 if (conf) {
4440 shrink_stripes(conf); 4785 free_conf(conf);
4441 safe_put_page(conf->spare_page);
4442 kfree(conf->disks);
4443 kfree(conf->stripe_hashtbl);
4444 kfree(conf);
4445 return ERR_PTR(-EIO); 4786 return ERR_PTR(-EIO);
4446 } else 4787 } else
4447 return ERR_PTR(-ENOMEM); 4788 return ERR_PTR(-ENOMEM);
@@ -4607,12 +4948,8 @@ abort:
4607 md_unregister_thread(mddev->thread); 4948 md_unregister_thread(mddev->thread);
4608 mddev->thread = NULL; 4949 mddev->thread = NULL;
4609 if (conf) { 4950 if (conf) {
4610 shrink_stripes(conf);
4611 print_raid5_conf(conf); 4951 print_raid5_conf(conf);
4612 safe_put_page(conf->spare_page); 4952 free_conf(conf);
4613 kfree(conf->disks);
4614 kfree(conf->stripe_hashtbl);
4615 kfree(conf);
4616 } 4953 }
4617 mddev->private = NULL; 4954 mddev->private = NULL;
4618 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); 4955 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
@@ -4627,13 +4964,10 @@ static int stop(mddev_t *mddev)
4627 4964
4628 md_unregister_thread(mddev->thread); 4965 md_unregister_thread(mddev->thread);
4629 mddev->thread = NULL; 4966 mddev->thread = NULL;
4630 shrink_stripes(conf);
4631 kfree(conf->stripe_hashtbl);
4632 mddev->queue->backing_dev_info.congested_fn = NULL; 4967 mddev->queue->backing_dev_info.congested_fn = NULL;
4633 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 4968 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
4634 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); 4969 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
4635 kfree(conf->disks); 4970 free_conf(conf);
4636 kfree(conf);
4637 mddev->private = NULL; 4971 mddev->private = NULL;
4638 return 0; 4972 return 0;
4639} 4973}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 52ba99954dec..116d0b44b2a9 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -2,6 +2,7 @@
2#define _RAID5_H 2#define _RAID5_H
3 3
4#include <linux/raid/xor.h> 4#include <linux/raid/xor.h>
5#include <linux/dmaengine.h>
5 6
6/* 7/*
7 * 8 *
@@ -175,7 +176,9 @@
175 */ 176 */
176enum check_states { 177enum check_states {
177 check_state_idle = 0, 178 check_state_idle = 0,
178 check_state_run, /* parity check */ 179 check_state_run, /* xor parity check */
180 check_state_run_q, /* q-parity check */
181 check_state_run_pq, /* pq dual parity check */
179 check_state_check_result, 182 check_state_check_result,
180 check_state_compute_run, /* parity repair */ 183 check_state_compute_run, /* parity repair */
181 check_state_compute_result, 184 check_state_compute_result,
@@ -215,8 +218,8 @@ struct stripe_head {
215 * @target - STRIPE_OP_COMPUTE_BLK target 218 * @target - STRIPE_OP_COMPUTE_BLK target
216 */ 219 */
217 struct stripe_operations { 220 struct stripe_operations {
218 int target; 221 int target, target2;
219 u32 zero_sum_result; 222 enum sum_check_flags zero_sum_result;
220 } ops; 223 } ops;
221 struct r5dev { 224 struct r5dev {
222 struct bio req; 225 struct bio req;
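
ops.zero_sum_result changes from a raw u32 to enum sum_check_flags so a single field can flag P and Q mismatches independently, and target2 lets one compute request name both blocks to rebuild. The flag type is declared with the new async_tx primitives; roughly the shape below, reconstructed from how the raid5.c hunks use it (check include/linux/async_tx.h in this series for the authoritative definition):

/* one bit per parity block, so one zero-sum pass reports both */
enum sum_check_bits {
	SUM_CHECK_P = 0,	/* xor (P) parity mismatch */
	SUM_CHECK_Q = 1,	/* Reed-Solomon (Q) syndrome mismatch */
};

enum sum_check_flags {
	SUM_CHECK_P_RESULT = (1 << SUM_CHECK_P),
	SUM_CHECK_Q_RESULT = (1 << SUM_CHECK_Q),
};
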
@@ -298,7 +301,7 @@ struct r6_state {
298#define STRIPE_OP_COMPUTE_BLK 1 301#define STRIPE_OP_COMPUTE_BLK 1
299#define STRIPE_OP_PREXOR 2 302#define STRIPE_OP_PREXOR 2
300#define STRIPE_OP_BIODRAIN 3 303#define STRIPE_OP_BIODRAIN 3
301#define STRIPE_OP_POSTXOR 4 304#define STRIPE_OP_RECONSTRUCT 4
302#define STRIPE_OP_CHECK 5 305#define STRIPE_OP_CHECK 5
303 306
304/* 307/*
@@ -383,8 +386,21 @@ struct raid5_private_data {
383 * (fresh device added). 386 * (fresh device added).
384 * Cleared when a sync completes. 387 * Cleared when a sync completes.
385 */ 388 */
386 389 /* per cpu variables */
387 struct page *spare_page; /* Used when checking P/Q in raid6 */ 390 struct raid5_percpu {
391 struct page *spare_page; /* Used when checking P/Q in raid6 */
392 void *scribble; /* space for constructing buffer
393 * lists and performing address
394 * conversions
395 */
396 } *percpu;
397 size_t scribble_len; /* size of scribble region must be
398 * associated with conf to handle
399 * cpu hotplug while reshaping
400 */
401#ifdef CONFIG_HOTPLUG_CPU
402 struct notifier_block cpu_notify;
403#endif
388 404
389 /* 405 /*
390 * Free stripes pool 406 * Free stripes pool
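
With spare_page and scribble now hanging off conf->percpu, a stripe operation takes whatever CPU it happens to run on, pins it for the duration of the submission, and uses that CPU's buffers; this is roughly the shape of what raid_run_ops() does elsewhere in this series. A minimal sketch, with the actual buffer use left as a placeholder comment:

#include <linux/percpu.h>
#include <linux/smp.h>		/* get_cpu()/put_cpu() */

#include "raid5.h"

static void run_one_stripe_op(struct stripe_head *sh)
{
	raid5_conf_t *conf = sh->raid_conf;
	struct raid5_percpu *percpu;
	int cpu;

	cpu = get_cpu();	/* disable preemption: these buffers are ours */
	percpu = per_cpu_ptr(conf->percpu, cpu);

	/* ... build the page and address-conversion lists in
	 * percpu->scribble, and use percpu->spare_page as the throw-away
	 * destination when validating raid6 parity ...
	 */

	put_cpu();
}
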