diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 26 | ||||
-rw-r--r-- | drivers/md/raid5.c | 1494 | ||||
-rw-r--r-- | drivers/md/raid5.h | 28 |
3 files changed, 967 insertions, 581 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 020f9573fd82..2158377a1359 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -124,6 +124,8 @@ config MD_RAID456 | |||
124 | select MD_RAID6_PQ | 124 | select MD_RAID6_PQ |
125 | select ASYNC_MEMCPY | 125 | select ASYNC_MEMCPY |
126 | select ASYNC_XOR | 126 | select ASYNC_XOR |
127 | select ASYNC_PQ | ||
128 | select ASYNC_RAID6_RECOV | ||
127 | ---help--- | 129 | ---help--- |
128 | A RAID-5 set of N drives with a capacity of C MB per drive provides | 130 | A RAID-5 set of N drives with a capacity of C MB per drive provides |
129 | the capacity of C * (N - 1) MB, and protects against a failure | 131 | the capacity of C * (N - 1) MB, and protects against a failure |
@@ -152,9 +154,33 @@ config MD_RAID456 | |||
152 | 154 | ||
153 | If unsure, say Y. | 155 | If unsure, say Y. |
154 | 156 | ||
157 | config MULTICORE_RAID456 | ||
158 | bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)" | ||
159 | depends on MD_RAID456 | ||
160 | depends on SMP | ||
161 | depends on EXPERIMENTAL | ||
162 | ---help--- | ||
163 | Enable the raid456 module to dispatch per-stripe raid operations to a | ||
164 | thread pool. | ||
165 | |||
166 | If unsure, say N. | ||
167 | |||
155 | config MD_RAID6_PQ | 168 | config MD_RAID6_PQ |
156 | tristate | 169 | tristate |
157 | 170 | ||
171 | config ASYNC_RAID6_TEST | ||
172 | tristate "Self test for hardware accelerated raid6 recovery" | ||
173 | depends on MD_RAID6_PQ | ||
174 | select ASYNC_RAID6_RECOV | ||
175 | ---help--- | ||
176 | This is a one-shot self test that permutes through the | ||
177 | recovery of all the possible two-disk failure scenarios for an | ||
178 | N-disk array. Recovery is performed with the asynchronous | ||
179 | raid6 recovery routines, and will optionally use an offload | ||
180 | engine if one is available. | ||
181 | |||
182 | If unsure, say N. | ||
183 | |||
158 | config MD_MULTIPATH | 184 | config MD_MULTIPATH |
159 | tristate "Multipath I/O support" | 185 | tristate "Multipath I/O support" |
160 | depends on BLK_DEV_MD | 186 | depends on BLK_DEV_MD |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f9f991e6e138..cac6f4d3a143 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -47,7 +47,9 @@ | |||
47 | #include <linux/kthread.h> | 47 | #include <linux/kthread.h> |
48 | #include <linux/raid/pq.h> | 48 | #include <linux/raid/pq.h> |
49 | #include <linux/async_tx.h> | 49 | #include <linux/async_tx.h> |
50 | #include <linux/async.h> | ||
50 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
52 | #include <linux/cpu.h> | ||
51 | #include "md.h" | 53 | #include "md.h" |
52 | #include "raid5.h" | 54 | #include "raid5.h" |
53 | #include "bitmap.h" | 55 | #include "bitmap.h" |
@@ -499,11 +501,18 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, | |||
499 | struct page *bio_page; | 501 | struct page *bio_page; |
500 | int i; | 502 | int i; |
501 | int page_offset; | 503 | int page_offset; |
504 | struct async_submit_ctl submit; | ||
505 | enum async_tx_flags flags = 0; | ||
502 | 506 | ||
503 | if (bio->bi_sector >= sector) | 507 | if (bio->bi_sector >= sector) |
504 | page_offset = (signed)(bio->bi_sector - sector) * 512; | 508 | page_offset = (signed)(bio->bi_sector - sector) * 512; |
505 | else | 509 | else |
506 | page_offset = (signed)(sector - bio->bi_sector) * -512; | 510 | page_offset = (signed)(sector - bio->bi_sector) * -512; |
511 | |||
512 | if (frombio) | ||
513 | flags |= ASYNC_TX_FENCE; | ||
514 | init_async_submit(&submit, flags, tx, NULL, NULL, NULL); | ||
515 | |||
507 | bio_for_each_segment(bvl, bio, i) { | 516 | bio_for_each_segment(bvl, bio, i) { |
508 | int len = bio_iovec_idx(bio, i)->bv_len; | 517 | int len = bio_iovec_idx(bio, i)->bv_len; |
509 | int clen; | 518 | int clen; |
@@ -525,15 +534,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, | |||
525 | bio_page = bio_iovec_idx(bio, i)->bv_page; | 534 | bio_page = bio_iovec_idx(bio, i)->bv_page; |
526 | if (frombio) | 535 | if (frombio) |
527 | tx = async_memcpy(page, bio_page, page_offset, | 536 | tx = async_memcpy(page, bio_page, page_offset, |
528 | b_offset, clen, | 537 | b_offset, clen, &submit); |
529 | ASYNC_TX_DEP_ACK, | ||
530 | tx, NULL, NULL); | ||
531 | else | 538 | else |
532 | tx = async_memcpy(bio_page, page, b_offset, | 539 | tx = async_memcpy(bio_page, page, b_offset, |
533 | page_offset, clen, | 540 | page_offset, clen, &submit); |
534 | ASYNC_TX_DEP_ACK, | ||
535 | tx, NULL, NULL); | ||
536 | } | 541 | } |
542 | /* chain the operations */ | ||
543 | submit.depend_tx = tx; | ||
544 | |||
537 | if (clen < len) /* hit end of page */ | 545 | if (clen < len) /* hit end of page */ |
538 | break; | 546 | break; |
539 | page_offset += len; | 547 | page_offset += len; |
@@ -592,6 +600,7 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
592 | { | 600 | { |
593 | struct dma_async_tx_descriptor *tx = NULL; | 601 | struct dma_async_tx_descriptor *tx = NULL; |
594 | raid5_conf_t *conf = sh->raid_conf; | 602 | raid5_conf_t *conf = sh->raid_conf; |
603 | struct async_submit_ctl submit; | ||
595 | int i; | 604 | int i; |
596 | 605 | ||
597 | pr_debug("%s: stripe %llu\n", __func__, | 606 | pr_debug("%s: stripe %llu\n", __func__, |
@@ -615,22 +624,34 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
615 | } | 624 | } |
616 | 625 | ||
617 | atomic_inc(&sh->count); | 626 | atomic_inc(&sh->count); |
618 | async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, | 627 | init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); |
619 | ops_complete_biofill, sh); | 628 | async_trigger_callback(&submit); |
620 | } | 629 | } |
621 | 630 | ||
622 | static void ops_complete_compute5(void *stripe_head_ref) | 631 | static void mark_target_uptodate(struct stripe_head *sh, int target) |
623 | { | 632 | { |
624 | struct stripe_head *sh = stripe_head_ref; | 633 | struct r5dev *tgt; |
625 | int target = sh->ops.target; | ||
626 | struct r5dev *tgt = &sh->dev[target]; | ||
627 | 634 | ||
628 | pr_debug("%s: stripe %llu\n", __func__, | 635 | if (target < 0) |
629 | (unsigned long long)sh->sector); | 636 | return; |
630 | 637 | ||
638 | tgt = &sh->dev[target]; | ||
631 | set_bit(R5_UPTODATE, &tgt->flags); | 639 | set_bit(R5_UPTODATE, &tgt->flags); |
632 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | 640 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); |
633 | clear_bit(R5_Wantcompute, &tgt->flags); | 641 | clear_bit(R5_Wantcompute, &tgt->flags); |
642 | } | ||
643 | |||
644 | static void ops_complete_compute(void *stripe_head_ref) | ||
645 | { | ||
646 | struct stripe_head *sh = stripe_head_ref; | ||
647 | |||
648 | pr_debug("%s: stripe %llu\n", __func__, | ||
649 | (unsigned long long)sh->sector); | ||
650 | |||
651 | /* mark the computed target(s) as uptodate */ | ||
652 | mark_target_uptodate(sh, sh->ops.target); | ||
653 | mark_target_uptodate(sh, sh->ops.target2); | ||
654 | |||
634 | clear_bit(STRIPE_COMPUTE_RUN, &sh->state); | 655 | clear_bit(STRIPE_COMPUTE_RUN, &sh->state); |
635 | if (sh->check_state == check_state_compute_run) | 656 | if (sh->check_state == check_state_compute_run) |
636 | sh->check_state = check_state_compute_result; | 657 | sh->check_state = check_state_compute_result; |
@@ -638,16 +659,24 @@ static void ops_complete_compute5(void *stripe_head_ref) | |||
638 | release_stripe(sh); | 659 | release_stripe(sh); |
639 | } | 660 | } |
640 | 661 | ||
641 | static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) | 662 | /* return a pointer to the address conversion region of the scribble buffer */ |
663 | static addr_conv_t *to_addr_conv(struct stripe_head *sh, | ||
664 | struct raid5_percpu *percpu) | ||
665 | { | ||
666 | return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); | ||
667 | } | ||
668 | |||
669 | static struct dma_async_tx_descriptor * | ||
670 | ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) | ||
642 | { | 671 | { |
643 | /* kernel stack size limits the total number of disks */ | ||
644 | int disks = sh->disks; | 672 | int disks = sh->disks; |
645 | struct page *xor_srcs[disks]; | 673 | struct page **xor_srcs = percpu->scribble; |
646 | int target = sh->ops.target; | 674 | int target = sh->ops.target; |
647 | struct r5dev *tgt = &sh->dev[target]; | 675 | struct r5dev *tgt = &sh->dev[target]; |
648 | struct page *xor_dest = tgt->page; | 676 | struct page *xor_dest = tgt->page; |
649 | int count = 0; | 677 | int count = 0; |
650 | struct dma_async_tx_descriptor *tx; | 678 | struct dma_async_tx_descriptor *tx; |
679 | struct async_submit_ctl submit; | ||
651 | int i; | 680 | int i; |
652 | 681 | ||
653 | pr_debug("%s: stripe %llu block: %d\n", | 682 | pr_debug("%s: stripe %llu block: %d\n", |
@@ -660,17 +689,212 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) | |||
660 | 689 | ||
661 | atomic_inc(&sh->count); | 690 | atomic_inc(&sh->count); |
662 | 691 | ||
692 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, | ||
693 | ops_complete_compute, sh, to_addr_conv(sh, percpu)); | ||
663 | if (unlikely(count == 1)) | 694 | if (unlikely(count == 1)) |
664 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, | 695 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); |
665 | 0, NULL, ops_complete_compute5, sh); | ||
666 | else | 696 | else |
667 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 697 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
668 | ASYNC_TX_XOR_ZERO_DST, NULL, | ||
669 | ops_complete_compute5, sh); | ||
670 | 698 | ||
671 | return tx; | 699 | return tx; |
672 | } | 700 | } |
673 | 701 | ||
702 | /* set_syndrome_sources - populate source buffers for gen_syndrome | ||
703 | * @srcs - (struct page *) array of size sh->disks | ||
704 | * @sh - stripe_head to parse | ||
705 | * | ||
706 | * Populates srcs in proper layout order for the stripe and returns the | ||
707 | * 'count' of sources to be used in a call to async_gen_syndrome. The P | ||
708 | * destination buffer is recorded in srcs[count] and the Q destination | ||
709 | * is recorded in srcs[count+1]. | ||
710 | */ | ||
711 | static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) | ||
712 | { | ||
713 | int disks = sh->disks; | ||
714 | int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); | ||
715 | int d0_idx = raid6_d0(sh); | ||
716 | int count; | ||
717 | int i; | ||
718 | |||
719 | for (i = 0; i < disks; i++) | ||
720 | srcs[i] = (void *)raid6_empty_zero_page; | ||
721 | |||
722 | count = 0; | ||
723 | i = d0_idx; | ||
724 | do { | ||
725 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
726 | |||
727 | srcs[slot] = sh->dev[i].page; | ||
728 | i = raid6_next_disk(i, disks); | ||
729 | } while (i != d0_idx); | ||
730 | BUG_ON(count != syndrome_disks); | ||
731 | |||
732 | return count; | ||
733 | } | ||
734 | |||
735 | static struct dma_async_tx_descriptor * | ||
736 | ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) | ||
737 | { | ||
738 | int disks = sh->disks; | ||
739 | struct page **blocks = percpu->scribble; | ||
740 | int target; | ||
741 | int qd_idx = sh->qd_idx; | ||
742 | struct dma_async_tx_descriptor *tx; | ||
743 | struct async_submit_ctl submit; | ||
744 | struct r5dev *tgt; | ||
745 | struct page *dest; | ||
746 | int i; | ||
747 | int count; | ||
748 | |||
749 | if (sh->ops.target < 0) | ||
750 | target = sh->ops.target2; | ||
751 | else if (sh->ops.target2 < 0) | ||
752 | target = sh->ops.target; | ||
753 | else | ||
754 | /* we should only have one valid target */ | ||
755 | BUG(); | ||
756 | BUG_ON(target < 0); | ||
757 | pr_debug("%s: stripe %llu block: %d\n", | ||
758 | __func__, (unsigned long long)sh->sector, target); | ||
759 | |||
760 | tgt = &sh->dev[target]; | ||
761 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | ||
762 | dest = tgt->page; | ||
763 | |||
764 | atomic_inc(&sh->count); | ||
765 | |||
766 | if (target == qd_idx) { | ||
767 | count = set_syndrome_sources(blocks, sh); | ||
768 | blocks[count] = NULL; /* regenerating p is not necessary */ | ||
769 | BUG_ON(blocks[count+1] != dest); /* q should already be set */ | ||
770 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, | ||
771 | ops_complete_compute, sh, | ||
772 | to_addr_conv(sh, percpu)); | ||
773 | tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); | ||
774 | } else { | ||
775 | /* Compute any data- or p-drive using XOR */ | ||
776 | count = 0; | ||
777 | for (i = disks; i-- ; ) { | ||
778 | if (i == target || i == qd_idx) | ||
779 | continue; | ||
780 | blocks[count++] = sh->dev[i].page; | ||
781 | } | ||
782 | |||
783 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, | ||
784 | NULL, ops_complete_compute, sh, | ||
785 | to_addr_conv(sh, percpu)); | ||
786 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); | ||
787 | } | ||
788 | |||
789 | return tx; | ||
790 | } | ||
791 | |||
792 | static struct dma_async_tx_descriptor * | ||
793 | ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) | ||
794 | { | ||
795 | int i, count, disks = sh->disks; | ||
796 | int syndrome_disks = sh->ddf_layout ? disks : disks-2; | ||
797 | int d0_idx = raid6_d0(sh); | ||
798 | int faila = -1, failb = -1; | ||
799 | int target = sh->ops.target; | ||
800 | int target2 = sh->ops.target2; | ||
801 | struct r5dev *tgt = &sh->dev[target]; | ||
802 | struct r5dev *tgt2 = &sh->dev[target2]; | ||
803 | struct dma_async_tx_descriptor *tx; | ||
804 | struct page **blocks = percpu->scribble; | ||
805 | struct async_submit_ctl submit; | ||
806 | |||
807 | pr_debug("%s: stripe %llu block1: %d block2: %d\n", | ||
808 | __func__, (unsigned long long)sh->sector, target, target2); | ||
809 | BUG_ON(target < 0 || target2 < 0); | ||
810 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | ||
811 | BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); | ||
812 | |||
813 | /* we need to open-code set_syndrome_sources to handle the | ||
814 | * slot number conversion for 'faila' and 'failb' | ||
815 | */ | ||
816 | for (i = 0; i < disks ; i++) | ||
817 | blocks[i] = (void *)raid6_empty_zero_page; | ||
818 | count = 0; | ||
819 | i = d0_idx; | ||
820 | do { | ||
821 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
822 | |||
823 | blocks[slot] = sh->dev[i].page; | ||
824 | |||
825 | if (i == target) | ||
826 | faila = slot; | ||
827 | if (i == target2) | ||
828 | failb = slot; | ||
829 | i = raid6_next_disk(i, disks); | ||
830 | } while (i != d0_idx); | ||
831 | BUG_ON(count != syndrome_disks); | ||
832 | |||
833 | BUG_ON(faila == failb); | ||
834 | if (failb < faila) | ||
835 | swap(faila, failb); | ||
836 | pr_debug("%s: stripe: %llu faila: %d failb: %d\n", | ||
837 | __func__, (unsigned long long)sh->sector, faila, failb); | ||
838 | |||
839 | atomic_inc(&sh->count); | ||
840 | |||
841 | if (failb == syndrome_disks+1) { | ||
842 | /* Q disk is one of the missing disks */ | ||
843 | if (faila == syndrome_disks) { | ||
844 | /* Missing P+Q, just recompute */ | ||
845 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, | ||
846 | ops_complete_compute, sh, | ||
847 | to_addr_conv(sh, percpu)); | ||
848 | return async_gen_syndrome(blocks, 0, count+2, | ||
849 | STRIPE_SIZE, &submit); | ||
850 | } else { | ||
851 | struct page *dest; | ||
852 | int data_target; | ||
853 | int qd_idx = sh->qd_idx; | ||
854 | |||
855 | /* Missing D+Q: recompute D from P, then recompute Q */ | ||
856 | if (target == qd_idx) | ||
857 | data_target = target2; | ||
858 | else | ||
859 | data_target = target; | ||
860 | |||
861 | count = 0; | ||
862 | for (i = disks; i-- ; ) { | ||
863 | if (i == data_target || i == qd_idx) | ||
864 | continue; | ||
865 | blocks[count++] = sh->dev[i].page; | ||
866 | } | ||
867 | dest = sh->dev[data_target].page; | ||
868 | init_async_submit(&submit, | ||
869 | ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, | ||
870 | NULL, NULL, NULL, | ||
871 | to_addr_conv(sh, percpu)); | ||
872 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, | ||
873 | &submit); | ||
874 | |||
875 | count = set_syndrome_sources(blocks, sh); | ||
876 | init_async_submit(&submit, ASYNC_TX_FENCE, tx, | ||
877 | ops_complete_compute, sh, | ||
878 | to_addr_conv(sh, percpu)); | ||
879 | return async_gen_syndrome(blocks, 0, count+2, | ||
880 | STRIPE_SIZE, &submit); | ||
881 | } | ||
882 | } | ||
883 | |||
884 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, | ||
885 | sh, to_addr_conv(sh, percpu)); | ||
886 | if (failb == syndrome_disks) { | ||
887 | /* We're missing D+P. */ | ||
888 | return async_raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, | ||
889 | faila, blocks, &submit); | ||
890 | } else { | ||
891 | /* We're missing D+D. */ | ||
892 | return async_raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, | ||
893 | faila, failb, blocks, &submit); | ||
894 | } | ||
895 | } | ||
896 | |||
897 | |||
674 | static void ops_complete_prexor(void *stripe_head_ref) | 898 | static void ops_complete_prexor(void *stripe_head_ref) |
675 | { | 899 | { |
676 | struct stripe_head *sh = stripe_head_ref; | 900 | struct stripe_head *sh = stripe_head_ref; |
@@ -680,12 +904,13 @@ static void ops_complete_prexor(void *stripe_head_ref) | |||
680 | } | 904 | } |
681 | 905 | ||
682 | static struct dma_async_tx_descriptor * | 906 | static struct dma_async_tx_descriptor * |
683 | ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | 907 | ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, |
908 | struct dma_async_tx_descriptor *tx) | ||
684 | { | 909 | { |
685 | /* kernel stack size limits the total number of disks */ | ||
686 | int disks = sh->disks; | 910 | int disks = sh->disks; |
687 | struct page *xor_srcs[disks]; | 911 | struct page **xor_srcs = percpu->scribble; |
688 | int count = 0, pd_idx = sh->pd_idx, i; | 912 | int count = 0, pd_idx = sh->pd_idx, i; |
913 | struct async_submit_ctl submit; | ||
689 | 914 | ||
690 | /* existing parity data subtracted */ | 915 | /* existing parity data subtracted */ |
691 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 916 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; |
@@ -700,9 +925,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
700 | xor_srcs[count++] = dev->page; | 925 | xor_srcs[count++] = dev->page; |
701 | } | 926 | } |
702 | 927 | ||
703 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 928 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, |
704 | ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, | 929 | ops_complete_prexor, sh, to_addr_conv(sh, percpu)); |
705 | ops_complete_prexor, sh); | 930 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
706 | 931 | ||
707 | return tx; | 932 | return tx; |
708 | } | 933 | } |
@@ -742,17 +967,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
742 | return tx; | 967 | return tx; |
743 | } | 968 | } |
744 | 969 | ||
745 | static void ops_complete_postxor(void *stripe_head_ref) | 970 | static void ops_complete_reconstruct(void *stripe_head_ref) |
746 | { | 971 | { |
747 | struct stripe_head *sh = stripe_head_ref; | 972 | struct stripe_head *sh = stripe_head_ref; |
748 | int disks = sh->disks, i, pd_idx = sh->pd_idx; | 973 | int disks = sh->disks; |
974 | int pd_idx = sh->pd_idx; | ||
975 | int qd_idx = sh->qd_idx; | ||
976 | int i; | ||
749 | 977 | ||
750 | pr_debug("%s: stripe %llu\n", __func__, | 978 | pr_debug("%s: stripe %llu\n", __func__, |
751 | (unsigned long long)sh->sector); | 979 | (unsigned long long)sh->sector); |
752 | 980 | ||
753 | for (i = disks; i--; ) { | 981 | for (i = disks; i--; ) { |
754 | struct r5dev *dev = &sh->dev[i]; | 982 | struct r5dev *dev = &sh->dev[i]; |
755 | if (dev->written || i == pd_idx) | 983 | |
984 | if (dev->written || i == pd_idx || i == qd_idx) | ||
756 | set_bit(R5_UPTODATE, &dev->flags); | 985 | set_bit(R5_UPTODATE, &dev->flags); |
757 | } | 986 | } |
758 | 987 | ||
@@ -770,12 +999,12 @@ static void ops_complete_postxor(void *stripe_head_ref) | |||
770 | } | 999 | } |
771 | 1000 | ||
772 | static void | 1001 | static void |
773 | ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | 1002 | ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, |
1003 | struct dma_async_tx_descriptor *tx) | ||
774 | { | 1004 | { |
775 | /* kernel stack size limits the total number of disks */ | ||
776 | int disks = sh->disks; | 1005 | int disks = sh->disks; |
777 | struct page *xor_srcs[disks]; | 1006 | struct page **xor_srcs = percpu->scribble; |
778 | 1007 | struct async_submit_ctl submit; | |
779 | int count = 0, pd_idx = sh->pd_idx, i; | 1008 | int count = 0, pd_idx = sh->pd_idx, i; |
780 | struct page *xor_dest; | 1009 | struct page *xor_dest; |
781 | int prexor = 0; | 1010 | int prexor = 0; |
@@ -809,18 +1038,36 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
809 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST | 1038 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST |
810 | * for the synchronous xor case | 1039 | * for the synchronous xor case |
811 | */ | 1040 | */ |
812 | flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | | 1041 | flags = ASYNC_TX_ACK | |
813 | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); | 1042 | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); |
814 | 1043 | ||
815 | atomic_inc(&sh->count); | 1044 | atomic_inc(&sh->count); |
816 | 1045 | ||
817 | if (unlikely(count == 1)) { | 1046 | init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, |
818 | flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); | 1047 | to_addr_conv(sh, percpu)); |
819 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, | 1048 | if (unlikely(count == 1)) |
820 | flags, tx, ops_complete_postxor, sh); | 1049 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); |
821 | } else | 1050 | else |
822 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 1051 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
823 | flags, tx, ops_complete_postxor, sh); | 1052 | } |
1053 | |||
1054 | static void | ||
1055 | ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, | ||
1056 | struct dma_async_tx_descriptor *tx) | ||
1057 | { | ||
1058 | struct async_submit_ctl submit; | ||
1059 | struct page **blocks = percpu->scribble; | ||
1060 | int count; | ||
1061 | |||
1062 | pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); | ||
1063 | |||
1064 | count = set_syndrome_sources(blocks, sh); | ||
1065 | |||
1066 | atomic_inc(&sh->count); | ||
1067 | |||
1068 | init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, | ||
1069 | sh, to_addr_conv(sh, percpu)); | ||
1070 | async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); | ||
824 | } | 1071 | } |
825 | 1072 | ||
826 | static void ops_complete_check(void *stripe_head_ref) | 1073 | static void ops_complete_check(void *stripe_head_ref) |
@@ -835,63 +1082,115 @@ static void ops_complete_check(void *stripe_head_ref) | |||
835 | release_stripe(sh); | 1082 | release_stripe(sh); |
836 | } | 1083 | } |
837 | 1084 | ||
838 | static void ops_run_check(struct stripe_head *sh) | 1085 | static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) |
839 | { | 1086 | { |
840 | /* kernel stack size limits the total number of disks */ | ||
841 | int disks = sh->disks; | 1087 | int disks = sh->disks; |
842 | struct page *xor_srcs[disks]; | 1088 | int pd_idx = sh->pd_idx; |
1089 | int qd_idx = sh->qd_idx; | ||
1090 | struct page *xor_dest; | ||
1091 | struct page **xor_srcs = percpu->scribble; | ||
843 | struct dma_async_tx_descriptor *tx; | 1092 | struct dma_async_tx_descriptor *tx; |
844 | 1093 | struct async_submit_ctl submit; | |
845 | int count = 0, pd_idx = sh->pd_idx, i; | 1094 | int count; |
846 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 1095 | int i; |
847 | 1096 | ||
848 | pr_debug("%s: stripe %llu\n", __func__, | 1097 | pr_debug("%s: stripe %llu\n", __func__, |
849 | (unsigned long long)sh->sector); | 1098 | (unsigned long long)sh->sector); |
850 | 1099 | ||
1100 | count = 0; | ||
1101 | xor_dest = sh->dev[pd_idx].page; | ||
1102 | xor_srcs[count++] = xor_dest; | ||
851 | for (i = disks; i--; ) { | 1103 | for (i = disks; i--; ) { |
852 | struct r5dev *dev = &sh->dev[i]; | 1104 | if (i == pd_idx || i == qd_idx) |
853 | if (i != pd_idx) | 1105 | continue; |
854 | xor_srcs[count++] = dev->page; | 1106 | xor_srcs[count++] = sh->dev[i].page; |
855 | } | 1107 | } |
856 | 1108 | ||
857 | tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 1109 | init_async_submit(&submit, 0, NULL, NULL, NULL, |
858 | &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); | 1110 | to_addr_conv(sh, percpu)); |
1111 | tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | ||
1112 | &sh->ops.zero_sum_result, &submit); | ||
1113 | |||
1114 | atomic_inc(&sh->count); | ||
1115 | init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); | ||
1116 | tx = async_trigger_callback(&submit); | ||
1117 | } | ||
1118 | |||
1119 | static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) | ||
1120 | { | ||
1121 | struct page **srcs = percpu->scribble; | ||
1122 | struct async_submit_ctl submit; | ||
1123 | int count; | ||
1124 | |||
1125 | pr_debug("%s: stripe %llu checkp: %d\n", __func__, | ||
1126 | (unsigned long long)sh->sector, checkp); | ||
1127 | |||
1128 | count = set_syndrome_sources(srcs, sh); | ||
1129 | if (!checkp) | ||
1130 | srcs[count] = NULL; | ||
859 | 1131 | ||
860 | atomic_inc(&sh->count); | 1132 | atomic_inc(&sh->count); |
861 | tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, | 1133 | init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, |
862 | ops_complete_check, sh); | 1134 | sh, to_addr_conv(sh, percpu)); |
1135 | async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, | ||
1136 | &sh->ops.zero_sum_result, percpu->spare_page, &submit); | ||
863 | } | 1137 | } |
864 | 1138 | ||
865 | static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) | 1139 | static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) |
866 | { | 1140 | { |
867 | int overlap_clear = 0, i, disks = sh->disks; | 1141 | int overlap_clear = 0, i, disks = sh->disks; |
868 | struct dma_async_tx_descriptor *tx = NULL; | 1142 | struct dma_async_tx_descriptor *tx = NULL; |
1143 | raid5_conf_t *conf = sh->raid_conf; | ||
1144 | int level = conf->level; | ||
1145 | struct raid5_percpu *percpu; | ||
1146 | unsigned long cpu; | ||
869 | 1147 | ||
1148 | cpu = get_cpu(); | ||
1149 | percpu = per_cpu_ptr(conf->percpu, cpu); | ||
870 | if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { | 1150 | if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { |
871 | ops_run_biofill(sh); | 1151 | ops_run_biofill(sh); |
872 | overlap_clear++; | 1152 | overlap_clear++; |
873 | } | 1153 | } |
874 | 1154 | ||
875 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { | 1155 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { |
876 | tx = ops_run_compute5(sh); | 1156 | if (level < 6) |
877 | /* terminate the chain if postxor is not set to be run */ | 1157 | tx = ops_run_compute5(sh, percpu); |
878 | if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) | 1158 | else { |
1159 | if (sh->ops.target2 < 0 || sh->ops.target < 0) | ||
1160 | tx = ops_run_compute6_1(sh, percpu); | ||
1161 | else | ||
1162 | tx = ops_run_compute6_2(sh, percpu); | ||
1163 | } | ||
1164 | /* terminate the chain if reconstruct is not set to be run */ | ||
1165 | if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) | ||
879 | async_tx_ack(tx); | 1166 | async_tx_ack(tx); |
880 | } | 1167 | } |
881 | 1168 | ||
882 | if (test_bit(STRIPE_OP_PREXOR, &ops_request)) | 1169 | if (test_bit(STRIPE_OP_PREXOR, &ops_request)) |
883 | tx = ops_run_prexor(sh, tx); | 1170 | tx = ops_run_prexor(sh, percpu, tx); |
884 | 1171 | ||
885 | if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { | 1172 | if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { |
886 | tx = ops_run_biodrain(sh, tx); | 1173 | tx = ops_run_biodrain(sh, tx); |
887 | overlap_clear++; | 1174 | overlap_clear++; |
888 | } | 1175 | } |
889 | 1176 | ||
890 | if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) | 1177 | if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { |
891 | ops_run_postxor(sh, tx); | 1178 | if (level < 6) |
1179 | ops_run_reconstruct5(sh, percpu, tx); | ||
1180 | else | ||
1181 | ops_run_reconstruct6(sh, percpu, tx); | ||
1182 | } | ||
892 | 1183 | ||
893 | if (test_bit(STRIPE_OP_CHECK, &ops_request)) | 1184 | if (test_bit(STRIPE_OP_CHECK, &ops_request)) { |
894 | ops_run_check(sh); | 1185 | if (sh->check_state == check_state_run) |
1186 | ops_run_check_p(sh, percpu); | ||
1187 | else if (sh->check_state == check_state_run_q) | ||
1188 | ops_run_check_pq(sh, percpu, 0); | ||
1189 | else if (sh->check_state == check_state_run_pq) | ||
1190 | ops_run_check_pq(sh, percpu, 1); | ||
1191 | else | ||
1192 | BUG(); | ||
1193 | } | ||
895 | 1194 | ||
896 | if (overlap_clear) | 1195 | if (overlap_clear) |
897 | for (i = disks; i--; ) { | 1196 | for (i = disks; i--; ) { |
@@ -899,6 +1198,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
899 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) | 1198 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) |
900 | wake_up(&sh->raid_conf->wait_for_overlap); | 1199 | wake_up(&sh->raid_conf->wait_for_overlap); |
901 | } | 1200 | } |
1201 | put_cpu(); | ||
902 | } | 1202 | } |
903 | 1203 | ||
904 | static int grow_one_stripe(raid5_conf_t *conf) | 1204 | static int grow_one_stripe(raid5_conf_t *conf) |
@@ -948,6 +1248,28 @@ static int grow_stripes(raid5_conf_t *conf, int num) | |||
948 | return 0; | 1248 | return 0; |
949 | } | 1249 | } |
950 | 1250 | ||
1251 | /** | ||
1252 | * scribble_len - return the required size of the scribble region | ||
1253 | * @num - total number of disks in the array | ||
1254 | * | ||
1255 | * The size must be enough to contain: | ||
1256 | * 1/ a struct page pointer for each device in the array +2 | ||
1257 | * 2/ room to convert each entry in (1) to its corresponding dma | ||
1258 | * (dma_map_page()) or page (page_address()) address. | ||
1259 | * | ||
1260 | * Note: the +2 is for the destination buffers of the ddf/raid6 case where we | ||
1261 | * calculate over all devices (not just the data blocks), using zeros in place | ||
1262 | * of the P and Q blocks. | ||
1263 | */ | ||
1264 | static size_t scribble_len(int num) | ||
1265 | { | ||
1266 | size_t len; | ||
1267 | |||
1268 | len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); | ||
1269 | |||
1270 | return len; | ||
1271 | } | ||
1272 | |||
951 | static int resize_stripes(raid5_conf_t *conf, int newsize) | 1273 | static int resize_stripes(raid5_conf_t *conf, int newsize) |
952 | { | 1274 | { |
953 | /* Make all the stripes able to hold 'newsize' devices. | 1275 | /* Make all the stripes able to hold 'newsize' devices. |
@@ -976,6 +1298,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
976 | struct stripe_head *osh, *nsh; | 1298 | struct stripe_head *osh, *nsh; |
977 | LIST_HEAD(newstripes); | 1299 | LIST_HEAD(newstripes); |
978 | struct disk_info *ndisks; | 1300 | struct disk_info *ndisks; |
1301 | unsigned long cpu; | ||
979 | int err; | 1302 | int err; |
980 | struct kmem_cache *sc; | 1303 | struct kmem_cache *sc; |
981 | int i; | 1304 | int i; |
@@ -1041,7 +1364,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
1041 | /* Step 3. | 1364 | /* Step 3. |
1042 | * At this point, we are holding all the stripes so the array | 1365 | * At this point, we are holding all the stripes so the array |
1043 | * is completely stalled, so now is a good time to resize | 1366 | * is completely stalled, so now is a good time to resize |
1044 | * conf->disks. | 1367 | * conf->disks and the scribble region |
1045 | */ | 1368 | */ |
1046 | ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); | 1369 | ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); |
1047 | if (ndisks) { | 1370 | if (ndisks) { |
@@ -1052,10 +1375,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
1052 | } else | 1375 | } else |
1053 | err = -ENOMEM; | 1376 | err = -ENOMEM; |
1054 | 1377 | ||
1378 | get_online_cpus(); | ||
1379 | conf->scribble_len = scribble_len(newsize); | ||
1380 | for_each_present_cpu(cpu) { | ||
1381 | struct raid5_percpu *percpu; | ||
1382 | void *scribble; | ||
1383 | |||
1384 | percpu = per_cpu_ptr(conf->percpu, cpu); | ||
1385 | scribble = kmalloc(conf->scribble_len, GFP_NOIO); | ||
1386 | |||
1387 | if (scribble) { | ||
1388 | kfree(percpu->scribble); | ||
1389 | percpu->scribble = scribble; | ||
1390 | } else { | ||
1391 | err = -ENOMEM; | ||
1392 | break; | ||
1393 | } | ||
1394 | } | ||
1395 | put_online_cpus(); | ||
1396 | |||
1055 | /* Step 4, return new stripes to service */ | 1397 | /* Step 4, return new stripes to service */ |
1056 | while(!list_empty(&newstripes)) { | 1398 | while(!list_empty(&newstripes)) { |
1057 | nsh = list_entry(newstripes.next, struct stripe_head, lru); | 1399 | nsh = list_entry(newstripes.next, struct stripe_head, lru); |
1058 | list_del_init(&nsh->lru); | 1400 | list_del_init(&nsh->lru); |
1401 | |||
1059 | for (i=conf->raid_disks; i < newsize; i++) | 1402 | for (i=conf->raid_disks; i < newsize; i++) |
1060 | if (nsh->dev[i].page == NULL) { | 1403 | if (nsh->dev[i].page == NULL) { |
1061 | struct page *p = alloc_page(GFP_NOIO); | 1404 | struct page *p = alloc_page(GFP_NOIO); |
@@ -1594,258 +1937,13 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
1594 | } | 1937 | } |
1595 | 1938 | ||
1596 | 1939 | ||
1597 | |||
1598 | /* | ||
1599 | * Copy data between a page in the stripe cache, and one or more bion | ||
1600 | * The page could align with the middle of the bio, or there could be | ||
1601 | * several bion, each with several bio_vecs, which cover part of the page | ||
1602 | * Multiple bion are linked together on bi_next. There may be extras | ||
1603 | * at the end of this list. We ignore them. | ||
1604 | */ | ||
1605 | static void copy_data(int frombio, struct bio *bio, | ||
1606 | struct page *page, | ||
1607 | sector_t sector) | ||
1608 | { | ||
1609 | char *pa = page_address(page); | ||
1610 | struct bio_vec *bvl; | ||
1611 | int i; | ||
1612 | int page_offset; | ||
1613 | |||
1614 | if (bio->bi_sector >= sector) | ||
1615 | page_offset = (signed)(bio->bi_sector - sector) * 512; | ||
1616 | else | ||
1617 | page_offset = (signed)(sector - bio->bi_sector) * -512; | ||
1618 | bio_for_each_segment(bvl, bio, i) { | ||
1619 | int len = bio_iovec_idx(bio,i)->bv_len; | ||
1620 | int clen; | ||
1621 | int b_offset = 0; | ||
1622 | |||
1623 | if (page_offset < 0) { | ||
1624 | b_offset = -page_offset; | ||
1625 | page_offset += b_offset; | ||
1626 | len -= b_offset; | ||
1627 | } | ||
1628 | |||
1629 | if (len > 0 && page_offset + len > STRIPE_SIZE) | ||
1630 | clen = STRIPE_SIZE - page_offset; | ||
1631 | else clen = len; | ||
1632 | |||
1633 | if (clen > 0) { | ||
1634 | char *ba = __bio_kmap_atomic(bio, i, KM_USER0); | ||
1635 | if (frombio) | ||
1636 | memcpy(pa+page_offset, ba+b_offset, clen); | ||
1637 | else | ||
1638 | memcpy(ba+b_offset, pa+page_offset, clen); | ||
1639 | __bio_kunmap_atomic(ba, KM_USER0); | ||
1640 | } | ||
1641 | if (clen < len) /* hit end of page */ | ||
1642 | break; | ||
1643 | page_offset += len; | ||
1644 | } | ||
1645 | } | ||
1646 | |||
1647 | #define check_xor() do { \ | ||
1648 | if (count == MAX_XOR_BLOCKS) { \ | ||
1649 | xor_blocks(count, STRIPE_SIZE, dest, ptr);\ | ||
1650 | count = 0; \ | ||
1651 | } \ | ||
1652 | } while(0) | ||
1653 | |||
1654 | static void compute_parity6(struct stripe_head *sh, int method) | ||
1655 | { | ||
1656 | raid5_conf_t *conf = sh->raid_conf; | ||
1657 | int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count; | ||
1658 | int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); | ||
1659 | struct bio *chosen; | ||
1660 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ | ||
1661 | void *ptrs[syndrome_disks+2]; | ||
1662 | |||
1663 | pd_idx = sh->pd_idx; | ||
1664 | qd_idx = sh->qd_idx; | ||
1665 | d0_idx = raid6_d0(sh); | ||
1666 | |||
1667 | pr_debug("compute_parity, stripe %llu, method %d\n", | ||
1668 | (unsigned long long)sh->sector, method); | ||
1669 | |||
1670 | switch(method) { | ||
1671 | case READ_MODIFY_WRITE: | ||
1672 | BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */ | ||
1673 | case RECONSTRUCT_WRITE: | ||
1674 | for (i= disks; i-- ;) | ||
1675 | if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) { | ||
1676 | chosen = sh->dev[i].towrite; | ||
1677 | sh->dev[i].towrite = NULL; | ||
1678 | |||
1679 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
1680 | wake_up(&conf->wait_for_overlap); | ||
1681 | |||
1682 | BUG_ON(sh->dev[i].written); | ||
1683 | sh->dev[i].written = chosen; | ||
1684 | } | ||
1685 | break; | ||
1686 | case CHECK_PARITY: | ||
1687 | BUG(); /* Not implemented yet */ | ||
1688 | } | ||
1689 | |||
1690 | for (i = disks; i--;) | ||
1691 | if (sh->dev[i].written) { | ||
1692 | sector_t sector = sh->dev[i].sector; | ||
1693 | struct bio *wbi = sh->dev[i].written; | ||
1694 | while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { | ||
1695 | copy_data(1, wbi, sh->dev[i].page, sector); | ||
1696 | wbi = r5_next_bio(wbi, sector); | ||
1697 | } | ||
1698 | |||
1699 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
1700 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
1701 | } | ||
1702 | |||
1703 | /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/ | ||
1704 | |||
1705 | for (i = 0; i < disks; i++) | ||
1706 | ptrs[i] = (void *)raid6_empty_zero_page; | ||
1707 | |||
1708 | count = 0; | ||
1709 | i = d0_idx; | ||
1710 | do { | ||
1711 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
1712 | |||
1713 | ptrs[slot] = page_address(sh->dev[i].page); | ||
1714 | if (slot < syndrome_disks && | ||
1715 | !test_bit(R5_UPTODATE, &sh->dev[i].flags)) { | ||
1716 | printk(KERN_ERR "block %d/%d not uptodate " | ||
1717 | "on parity calc\n", i, count); | ||
1718 | BUG(); | ||
1719 | } | ||
1720 | |||
1721 | i = raid6_next_disk(i, disks); | ||
1722 | } while (i != d0_idx); | ||
1723 | BUG_ON(count != syndrome_disks); | ||
1724 | |||
1725 | raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs); | ||
1726 | |||
1727 | switch(method) { | ||
1728 | case RECONSTRUCT_WRITE: | ||
1729 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
1730 | set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); | ||
1731 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); | ||
1732 | set_bit(R5_LOCKED, &sh->dev[qd_idx].flags); | ||
1733 | break; | ||
1734 | case UPDATE_PARITY: | ||
1735 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
1736 | set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); | ||
1737 | break; | ||
1738 | } | ||
1739 | } | ||
1740 | |||
1741 | |||
1742 | /* Compute one missing block */ | ||
1743 | static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) | ||
1744 | { | ||
1745 | int i, count, disks = sh->disks; | ||
1746 | void *ptr[MAX_XOR_BLOCKS], *dest, *p; | ||
1747 | int qd_idx = sh->qd_idx; | ||
1748 | |||
1749 | pr_debug("compute_block_1, stripe %llu, idx %d\n", | ||
1750 | (unsigned long long)sh->sector, dd_idx); | ||
1751 | |||
1752 | if ( dd_idx == qd_idx ) { | ||
1753 | /* We're actually computing the Q drive */ | ||
1754 | compute_parity6(sh, UPDATE_PARITY); | ||
1755 | } else { | ||
1756 | dest = page_address(sh->dev[dd_idx].page); | ||
1757 | if (!nozero) memset(dest, 0, STRIPE_SIZE); | ||
1758 | count = 0; | ||
1759 | for (i = disks ; i--; ) { | ||
1760 | if (i == dd_idx || i == qd_idx) | ||
1761 | continue; | ||
1762 | p = page_address(sh->dev[i].page); | ||
1763 | if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) | ||
1764 | ptr[count++] = p; | ||
1765 | else | ||
1766 | printk("compute_block() %d, stripe %llu, %d" | ||
1767 | " not present\n", dd_idx, | ||
1768 | (unsigned long long)sh->sector, i); | ||
1769 | |||
1770 | check_xor(); | ||
1771 | } | ||
1772 | if (count) | ||
1773 | xor_blocks(count, STRIPE_SIZE, dest, ptr); | ||
1774 | if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | ||
1775 | else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | ||
1776 | } | ||
1777 | } | ||
1778 | |||
1779 | /* Compute two missing blocks */ | ||
1780 | static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) | ||
1781 | { | ||
1782 | int i, count, disks = sh->disks; | ||
1783 | int syndrome_disks = sh->ddf_layout ? disks : disks-2; | ||
1784 | int d0_idx = raid6_d0(sh); | ||
1785 | int faila = -1, failb = -1; | ||
1786 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ | ||
1787 | void *ptrs[syndrome_disks+2]; | ||
1788 | |||
1789 | for (i = 0; i < disks ; i++) | ||
1790 | ptrs[i] = (void *)raid6_empty_zero_page; | ||
1791 | count = 0; | ||
1792 | i = d0_idx; | ||
1793 | do { | ||
1794 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
1795 | |||
1796 | ptrs[slot] = page_address(sh->dev[i].page); | ||
1797 | |||
1798 | if (i == dd_idx1) | ||
1799 | faila = slot; | ||
1800 | if (i == dd_idx2) | ||
1801 | failb = slot; | ||
1802 | i = raid6_next_disk(i, disks); | ||
1803 | } while (i != d0_idx); | ||
1804 | BUG_ON(count != syndrome_disks); | ||
1805 | |||
1806 | BUG_ON(faila == failb); | ||
1807 | if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } | ||
1808 | |||
1809 | pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", | ||
1810 | (unsigned long long)sh->sector, dd_idx1, dd_idx2, | ||
1811 | faila, failb); | ||
1812 | |||
1813 | if (failb == syndrome_disks+1) { | ||
1814 | /* Q disk is one of the missing disks */ | ||
1815 | if (faila == syndrome_disks) { | ||
1816 | /* Missing P+Q, just recompute */ | ||
1817 | compute_parity6(sh, UPDATE_PARITY); | ||
1818 | return; | ||
1819 | } else { | ||
1820 | /* We're missing D+Q; recompute D from P */ | ||
1821 | compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ? | ||
1822 | dd_idx2 : dd_idx1), | ||
1823 | 0); | ||
1824 | compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ | ||
1825 | return; | ||
1826 | } | ||
1827 | } | ||
1828 | |||
1829 | /* We're missing D+P or D+D; */ | ||
1830 | if (failb == syndrome_disks) { | ||
1831 | /* We're missing D+P. */ | ||
1832 | raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs); | ||
1833 | } else { | ||
1834 | /* We're missing D+D. */ | ||
1835 | raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb, | ||
1836 | ptrs); | ||
1837 | } | ||
1838 | |||
1839 | /* Both the above update both missing blocks */ | ||
1840 | set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); | ||
1841 | set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); | ||
1842 | } | ||
1843 | |||
1844 | static void | 1940 | static void |
1845 | schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | 1941 | schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, |
1846 | int rcw, int expand) | 1942 | int rcw, int expand) |
1847 | { | 1943 | { |
1848 | int i, pd_idx = sh->pd_idx, disks = sh->disks; | 1944 | int i, pd_idx = sh->pd_idx, disks = sh->disks; |
1945 | raid5_conf_t *conf = sh->raid_conf; | ||
1946 | int level = conf->level; | ||
1849 | 1947 | ||
1850 | if (rcw) { | 1948 | if (rcw) { |
1851 | /* if we are not expanding this is a proper write request, and | 1949 | /* if we are not expanding this is a proper write request, and |
@@ -1858,7 +1956,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | |||
1858 | } else | 1956 | } else |
1859 | sh->reconstruct_state = reconstruct_state_run; | 1957 | sh->reconstruct_state = reconstruct_state_run; |
1860 | 1958 | ||
1861 | set_bit(STRIPE_OP_POSTXOR, &s->ops_request); | 1959 | set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); |
1862 | 1960 | ||
1863 | for (i = disks; i--; ) { | 1961 | for (i = disks; i--; ) { |
1864 | struct r5dev *dev = &sh->dev[i]; | 1962 | struct r5dev *dev = &sh->dev[i]; |
@@ -1871,17 +1969,18 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | |||
1871 | s->locked++; | 1969 | s->locked++; |
1872 | } | 1970 | } |
1873 | } | 1971 | } |
1874 | if (s->locked + 1 == disks) | 1972 | if (s->locked + conf->max_degraded == disks) |
1875 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) | 1973 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) |
1876 | atomic_inc(&sh->raid_conf->pending_full_writes); | 1974 | atomic_inc(&conf->pending_full_writes); |
1877 | } else { | 1975 | } else { |
1976 | BUG_ON(level == 6); | ||
1878 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || | 1977 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || |
1879 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); | 1978 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); |
1880 | 1979 | ||
1881 | sh->reconstruct_state = reconstruct_state_prexor_drain_run; | 1980 | sh->reconstruct_state = reconstruct_state_prexor_drain_run; |
1882 | set_bit(STRIPE_OP_PREXOR, &s->ops_request); | 1981 | set_bit(STRIPE_OP_PREXOR, &s->ops_request); |
1883 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); | 1982 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); |
1884 | set_bit(STRIPE_OP_POSTXOR, &s->ops_request); | 1983 | set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); |
1885 | 1984 | ||
1886 | for (i = disks; i--; ) { | 1985 | for (i = disks; i--; ) { |
1887 | struct r5dev *dev = &sh->dev[i]; | 1986 | struct r5dev *dev = &sh->dev[i]; |
@@ -1899,13 +1998,22 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | |||
1899 | } | 1998 | } |
1900 | } | 1999 | } |
1901 | 2000 | ||
1902 | /* keep the parity disk locked while asynchronous operations | 2001 | /* keep the parity disk(s) locked while asynchronous operations |
1903 | * are in flight | 2002 | * are in flight |
1904 | */ | 2003 | */ |
1905 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); | 2004 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); |
1906 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | 2005 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); |
1907 | s->locked++; | 2006 | s->locked++; |
1908 | 2007 | ||
2008 | if (level == 6) { | ||
2009 | int qd_idx = sh->qd_idx; | ||
2010 | struct r5dev *dev = &sh->dev[qd_idx]; | ||
2011 | |||
2012 | set_bit(R5_LOCKED, &dev->flags); | ||
2013 | clear_bit(R5_UPTODATE, &dev->flags); | ||
2014 | s->locked++; | ||
2015 | } | ||
2016 | |||
1909 | pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", | 2017 | pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", |
1910 | __func__, (unsigned long long)sh->sector, | 2018 | __func__, (unsigned long long)sh->sector, |
1911 | s->locked, s->ops_request); | 2019 | s->locked, s->ops_request); |
@@ -1986,13 +2094,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
1986 | 2094 | ||
1987 | static void end_reshape(raid5_conf_t *conf); | 2095 | static void end_reshape(raid5_conf_t *conf); |
1988 | 2096 | ||
1989 | static int page_is_zero(struct page *p) | ||
1990 | { | ||
1991 | char *a = page_address(p); | ||
1992 | return ((*(u32*)a) == 0 && | ||
1993 | memcmp(a, a+4, STRIPE_SIZE-4)==0); | ||
1994 | } | ||
1995 | |||
1996 | static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, | 2097 | static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, |
1997 | struct stripe_head *sh) | 2098 | struct stripe_head *sh) |
1998 | { | 2099 | { |
@@ -2132,9 +2233,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, | |||
2132 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | 2233 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); |
2133 | set_bit(R5_Wantcompute, &dev->flags); | 2234 | set_bit(R5_Wantcompute, &dev->flags); |
2134 | sh->ops.target = disk_idx; | 2235 | sh->ops.target = disk_idx; |
2236 | sh->ops.target2 = -1; | ||
2135 | s->req_compute = 1; | 2237 | s->req_compute = 1; |
2136 | /* Careful: from this point on 'uptodate' is in the eye | 2238 | /* Careful: from this point on 'uptodate' is in the eye |
2137 | * of raid5_run_ops which services 'compute' operations | 2239 | * of raid_run_ops which services 'compute' operations |
2138 | * before writes. R5_Wantcompute flags a block that will | 2240 | * before writes. R5_Wantcompute flags a block that will |
2139 | * be R5_UPTODATE by the time it is needed for a | 2241 | * be R5_UPTODATE by the time it is needed for a |
2140 | * subsequent operation. | 2242 | * subsequent operation. |
@@ -2173,61 +2275,104 @@ static void handle_stripe_fill5(struct stripe_head *sh, | |||
2173 | set_bit(STRIPE_HANDLE, &sh->state); | 2275 | set_bit(STRIPE_HANDLE, &sh->state); |
2174 | } | 2276 | } |
2175 | 2277 | ||
2176 | static void handle_stripe_fill6(struct stripe_head *sh, | 2278 | /* fetch_block6 - checks the given member device to see if its data needs |
2177 | struct stripe_head_state *s, struct r6_state *r6s, | 2279 | * to be read or computed to satisfy a request. |
2178 | int disks) | 2280 | * |
2281 | * Returns 1 when no more member devices need to be checked, otherwise returns | ||
2282 | * 0 to tell the loop in handle_stripe_fill6 to continue | ||
2283 | */ | ||
2284 | static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | ||
2285 | struct r6_state *r6s, int disk_idx, int disks) | ||
2179 | { | 2286 | { |
2180 | int i; | 2287 | struct r5dev *dev = &sh->dev[disk_idx]; |
2181 | for (i = disks; i--; ) { | 2288 | struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], |
2182 | struct r5dev *dev = &sh->dev[i]; | 2289 | &sh->dev[r6s->failed_num[1]] }; |
2183 | if (!test_bit(R5_LOCKED, &dev->flags) && | 2290 | |
2184 | !test_bit(R5_UPTODATE, &dev->flags) && | 2291 | if (!test_bit(R5_LOCKED, &dev->flags) && |
2185 | (dev->toread || (dev->towrite && | 2292 | !test_bit(R5_UPTODATE, &dev->flags) && |
2186 | !test_bit(R5_OVERWRITE, &dev->flags)) || | 2293 | (dev->toread || |
2187 | s->syncing || s->expanding || | 2294 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || |
2188 | (s->failed >= 1 && | 2295 | s->syncing || s->expanding || |
2189 | (sh->dev[r6s->failed_num[0]].toread || | 2296 | (s->failed >= 1 && |
2190 | s->to_write)) || | 2297 | (fdev[0]->toread || s->to_write)) || |
2191 | (s->failed >= 2 && | 2298 | (s->failed >= 2 && |
2192 | (sh->dev[r6s->failed_num[1]].toread || | 2299 | (fdev[1]->toread || s->to_write)))) { |
2193 | s->to_write)))) { | 2300 | /* we would like to get this block, possibly by computing it, |
2194 | /* we would like to get this block, possibly | 2301 | * otherwise read it if the backing disk is insync |
2195 | * by computing it, but we might not be able to | 2302 | */ |
2303 | BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); | ||
2304 | BUG_ON(test_bit(R5_Wantread, &dev->flags)); | ||
2305 | if ((s->uptodate == disks - 1) && | ||
2306 | (s->failed && (disk_idx == r6s->failed_num[0] || | ||
2307 | disk_idx == r6s->failed_num[1]))) { | ||
2308 | /* have disk failed, and we're requested to fetch it; | ||
2309 | * do compute it | ||
2196 | */ | 2310 | */ |
2197 | if ((s->uptodate == disks - 1) && | 2311 | pr_debug("Computing stripe %llu block %d\n", |
2198 | (s->failed && (i == r6s->failed_num[0] || | 2312 | (unsigned long long)sh->sector, disk_idx); |
2199 | i == r6s->failed_num[1]))) { | 2313 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); |
2200 | pr_debug("Computing stripe %llu block %d\n", | 2314 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); |
2201 | (unsigned long long)sh->sector, i); | 2315 | set_bit(R5_Wantcompute, &dev->flags); |
2202 | compute_block_1(sh, i, 0); | 2316 | sh->ops.target = disk_idx; |
2203 | s->uptodate++; | 2317 | sh->ops.target2 = -1; /* no 2nd target */ |
2204 | } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { | 2318 | s->req_compute = 1; |
2205 | /* Computing 2-failure is *very* expensive; only | 2319 | s->uptodate++; |
2206 | * do it if failed >= 2 | 2320 | return 1; |
2207 | */ | 2321 | } else if (s->uptodate == disks-2 && s->failed >= 2) { |
2208 | int other; | 2322 | /* Computing 2-failure is *very* expensive; only |
2209 | for (other = disks; other--; ) { | 2323 | * do it if failed >= 2 |
2210 | if (other == i) | 2324 | */ |
2211 | continue; | 2325 | int other; |
2212 | if (!test_bit(R5_UPTODATE, | 2326 | for (other = disks; other--; ) { |
2213 | &sh->dev[other].flags)) | 2327 | if (other == disk_idx) |
2214 | break; | 2328 | continue; |
2215 | } | 2329 | if (!test_bit(R5_UPTODATE, |
2216 | BUG_ON(other < 0); | 2330 | &sh->dev[other].flags)) |
2217 | pr_debug("Computing stripe %llu blocks %d,%d\n", | 2331 | break; |
2218 | (unsigned long long)sh->sector, | ||
2219 | i, other); | ||
2220 | compute_block_2(sh, i, other); | ||
2221 | s->uptodate += 2; | ||
2222 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
2223 | set_bit(R5_LOCKED, &dev->flags); | ||
2224 | set_bit(R5_Wantread, &dev->flags); | ||
2225 | s->locked++; | ||
2226 | pr_debug("Reading block %d (sync=%d)\n", | ||
2227 | i, s->syncing); | ||
2228 | } | 2332 | } |
2333 | BUG_ON(other < 0); | ||
2334 | pr_debug("Computing stripe %llu blocks %d,%d\n", | ||
2335 | (unsigned long long)sh->sector, | ||
2336 | disk_idx, other); | ||
2337 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
2338 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
2339 | set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); | ||
2340 | set_bit(R5_Wantcompute, &sh->dev[other].flags); | ||
2341 | sh->ops.target = disk_idx; | ||
2342 | sh->ops.target2 = other; | ||
2343 | s->uptodate += 2; | ||
2344 | s->req_compute = 1; | ||
2345 | return 1; | ||
2346 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
2347 | set_bit(R5_LOCKED, &dev->flags); | ||
2348 | set_bit(R5_Wantread, &dev->flags); | ||
2349 | s->locked++; | ||
2350 | pr_debug("Reading block %d (sync=%d)\n", | ||
2351 | disk_idx, s->syncing); | ||
2229 | } | 2352 | } |
2230 | } | 2353 | } |
2354 | |||
2355 | return 0; | ||
2356 | } | ||
2357 | |||
2358 | /** | ||
2359 | * handle_stripe_fill6 - read or compute data to satisfy pending requests. | ||
2360 | */ | ||
2361 | static void handle_stripe_fill6(struct stripe_head *sh, | ||
2362 | struct stripe_head_state *s, struct r6_state *r6s, | ||
2363 | int disks) | ||
2364 | { | ||
2365 | int i; | ||
2366 | |||
2367 | /* look for blocks to read/compute, skip this if a compute | ||
2368 | * is already in flight, or if the stripe contents are in the | ||
2369 | * midst of changing due to a write | ||
2370 | */ | ||
2371 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && | ||
2372 | !sh->reconstruct_state) | ||
2373 | for (i = disks; i--; ) | ||
2374 | if (fetch_block6(sh, s, r6s, i, disks)) | ||
2375 | break; | ||
2231 | set_bit(STRIPE_HANDLE, &sh->state); | 2376 | set_bit(STRIPE_HANDLE, &sh->state); |
2232 | } | 2377 | } |
2233 | 2378 | ||
@@ -2361,114 +2506,61 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
2361 | */ | 2506 | */ |
2362 | /* since handle_stripe can be called at any time we need to handle the | 2507 | /* since handle_stripe can be called at any time we need to handle the |
2363 | * case where a compute block operation has been submitted and then a | 2508 | * case where a compute block operation has been submitted and then a |
2364 | * subsequent call wants to start a write request. raid5_run_ops only | 2509 | * subsequent call wants to start a write request. raid_run_ops only |
2365 | * handles the case where compute block and postxor are requested | 2510 | * handles the case where compute block and reconstruct are requested |
2366 | * simultaneously. If this is not the case then new writes need to be | 2511 | * simultaneously. If this is not the case then new writes need to be |
2367 | * held off until the compute completes. | 2512 | * held off until the compute completes. |
2368 | */ | 2513 | */ |
2369 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && | 2514 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && |
2370 | (s->locked == 0 && (rcw == 0 || rmw == 0) && | 2515 | (s->locked == 0 && (rcw == 0 || rmw == 0) && |
2371 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) | 2516 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) |
2372 | schedule_reconstruction5(sh, s, rcw == 0, 0); | 2517 | schedule_reconstruction(sh, s, rcw == 0, 0); |
2373 | } | 2518 | } |
2374 | 2519 | ||
2375 | static void handle_stripe_dirtying6(raid5_conf_t *conf, | 2520 | static void handle_stripe_dirtying6(raid5_conf_t *conf, |
2376 | struct stripe_head *sh, struct stripe_head_state *s, | 2521 | struct stripe_head *sh, struct stripe_head_state *s, |
2377 | struct r6_state *r6s, int disks) | 2522 | struct r6_state *r6s, int disks) |
2378 | { | 2523 | { |
2379 | int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; | 2524 | int rcw = 0, pd_idx = sh->pd_idx, i; |
2380 | int qd_idx = sh->qd_idx; | 2525 | int qd_idx = sh->qd_idx; |
2526 | |||
2527 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2381 | for (i = disks; i--; ) { | 2528 | for (i = disks; i--; ) { |
2382 | struct r5dev *dev = &sh->dev[i]; | 2529 | struct r5dev *dev = &sh->dev[i]; |
2383 | /* Would I have to read this buffer for reconstruct_write */ | 2530 | /* check if we haven't enough data */ |
2384 | if (!test_bit(R5_OVERWRITE, &dev->flags) | 2531 | if (!test_bit(R5_OVERWRITE, &dev->flags) && |
2385 | && i != pd_idx && i != qd_idx | 2532 | i != pd_idx && i != qd_idx && |
2386 | && (!test_bit(R5_LOCKED, &dev->flags) | 2533 | !test_bit(R5_LOCKED, &dev->flags) && |
2387 | ) && | 2534 | !(test_bit(R5_UPTODATE, &dev->flags) || |
2388 | !test_bit(R5_UPTODATE, &dev->flags)) { | 2535 | test_bit(R5_Wantcompute, &dev->flags))) { |
2389 | if (test_bit(R5_Insync, &dev->flags)) rcw++; | 2536 | rcw++; |
2390 | else { | 2537 | if (!test_bit(R5_Insync, &dev->flags)) |
2391 | pr_debug("raid6: must_compute: " | 2538 | continue; /* it's a failed drive */ |
2392 | "disk %d flags=%#lx\n", i, dev->flags); | 2539 | |
2393 | must_compute++; | 2540 | if ( |
2541 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
2542 | pr_debug("Read_old stripe %llu " | ||
2543 | "block %d for Reconstruct\n", | ||
2544 | (unsigned long long)sh->sector, i); | ||
2545 | set_bit(R5_LOCKED, &dev->flags); | ||
2546 | set_bit(R5_Wantread, &dev->flags); | ||
2547 | s->locked++; | ||
2548 | } else { | ||
2549 | pr_debug("Request delayed stripe %llu " | ||
2550 | "block %d for Reconstruct\n", | ||
2551 | (unsigned long long)sh->sector, i); | ||
2552 | set_bit(STRIPE_DELAYED, &sh->state); | ||
2553 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2394 | } | 2554 | } |
2395 | } | 2555 | } |
2396 | } | 2556 | } |
2397 | pr_debug("for sector %llu, rcw=%d, must_compute=%d\n", | ||
2398 | (unsigned long long)sh->sector, rcw, must_compute); | ||
2399 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2400 | |||
2401 | if (rcw > 0) | ||
2402 | /* want reconstruct write, but need to get some data */ | ||
2403 | for (i = disks; i--; ) { | ||
2404 | struct r5dev *dev = &sh->dev[i]; | ||
2405 | if (!test_bit(R5_OVERWRITE, &dev->flags) | ||
2406 | && !(s->failed == 0 && (i == pd_idx || i == qd_idx)) | ||
2407 | && !test_bit(R5_LOCKED, &dev->flags) && | ||
2408 | !test_bit(R5_UPTODATE, &dev->flags) && | ||
2409 | test_bit(R5_Insync, &dev->flags)) { | ||
2410 | if ( | ||
2411 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
2412 | pr_debug("Read_old stripe %llu " | ||
2413 | "block %d for Reconstruct\n", | ||
2414 | (unsigned long long)sh->sector, i); | ||
2415 | set_bit(R5_LOCKED, &dev->flags); | ||
2416 | set_bit(R5_Wantread, &dev->flags); | ||
2417 | s->locked++; | ||
2418 | } else { | ||
2419 | pr_debug("Request delayed stripe %llu " | ||
2420 | "block %d for Reconstruct\n", | ||
2421 | (unsigned long long)sh->sector, i); | ||
2422 | set_bit(STRIPE_DELAYED, &sh->state); | ||
2423 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2424 | } | ||
2425 | } | ||
2426 | } | ||
2427 | /* now if nothing is locked, and if we have enough data, we can start a | 2557 | /* now if nothing is locked, and if we have enough data, we can start a |
2428 | * write request | 2558 | * write request |
2429 | */ | 2559 | */ |
2430 | if (s->locked == 0 && rcw == 0 && | 2560 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && |
2561 | s->locked == 0 && rcw == 0 && | ||
2431 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | 2562 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { |
2432 | if (must_compute > 0) { | 2563 | schedule_reconstruction(sh, s, 1, 0); |
2433 | /* We have failed blocks and need to compute them */ | ||
2434 | switch (s->failed) { | ||
2435 | case 0: | ||
2436 | BUG(); | ||
2437 | case 1: | ||
2438 | compute_block_1(sh, r6s->failed_num[0], 0); | ||
2439 | break; | ||
2440 | case 2: | ||
2441 | compute_block_2(sh, r6s->failed_num[0], | ||
2442 | r6s->failed_num[1]); | ||
2443 | break; | ||
2444 | default: /* This request should have been failed? */ | ||
2445 | BUG(); | ||
2446 | } | ||
2447 | } | ||
2448 | |||
2449 | pr_debug("Computing parity for stripe %llu\n", | ||
2450 | (unsigned long long)sh->sector); | ||
2451 | compute_parity6(sh, RECONSTRUCT_WRITE); | ||
2452 | /* now every locked buffer is ready to be written */ | ||
2453 | for (i = disks; i--; ) | ||
2454 | if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { | ||
2455 | pr_debug("Writing stripe %llu block %d\n", | ||
2456 | (unsigned long long)sh->sector, i); | ||
2457 | s->locked++; | ||
2458 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
2459 | } | ||
2460 | if (s->locked == disks) | ||
2461 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) | ||
2462 | atomic_inc(&conf->pending_full_writes); | ||
2463 | /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ | ||
2464 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2465 | |||
2466 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
2467 | atomic_dec(&conf->preread_active_stripes); | ||
2468 | if (atomic_read(&conf->preread_active_stripes) < | ||
2469 | IO_THRESHOLD) | ||
2470 | md_wakeup_thread(conf->mddev->thread); | ||
2471 | } | ||
2472 | } | 2564 | } |
2473 | } | 2565 | } |
2474 | 2566 | ||
@@ -2527,7 +2619,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2527 | * we are done. Otherwise update the mismatch count and repair | 2619 | * we are done. Otherwise update the mismatch count and repair |
2528 | * parity if !MD_RECOVERY_CHECK | 2620 | * parity if !MD_RECOVERY_CHECK |
2529 | */ | 2621 | */ |
2530 | if (sh->ops.zero_sum_result == 0) | 2622 | if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) |
2531 | /* parity is correct (on disc, | 2623 | /* parity is correct (on disc, |
2532 | * not in buffer any more) | 2624 | * not in buffer any more) |
2533 | */ | 2625 | */ |
@@ -2544,6 +2636,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2544 | set_bit(R5_Wantcompute, | 2636 | set_bit(R5_Wantcompute, |
2545 | &sh->dev[sh->pd_idx].flags); | 2637 | &sh->dev[sh->pd_idx].flags); |
2546 | sh->ops.target = sh->pd_idx; | 2638 | sh->ops.target = sh->pd_idx; |
2639 | sh->ops.target2 = -1; | ||
2547 | s->uptodate++; | 2640 | s->uptodate++; |
2548 | } | 2641 | } |
2549 | } | 2642 | } |
@@ -2560,67 +2653,74 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2560 | 2653 | ||
2561 | 2654 | ||
2562 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | 2655 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, |
2563 | struct stripe_head_state *s, | 2656 | struct stripe_head_state *s, |
2564 | struct r6_state *r6s, struct page *tmp_page, | 2657 | struct r6_state *r6s, int disks) |
2565 | int disks) | ||
2566 | { | 2658 | { |
2567 | int update_p = 0, update_q = 0; | ||
2568 | struct r5dev *dev; | ||
2569 | int pd_idx = sh->pd_idx; | 2659 | int pd_idx = sh->pd_idx; |
2570 | int qd_idx = sh->qd_idx; | 2660 | int qd_idx = sh->qd_idx; |
2661 | struct r5dev *dev; | ||
2571 | 2662 | ||
2572 | set_bit(STRIPE_HANDLE, &sh->state); | 2663 | set_bit(STRIPE_HANDLE, &sh->state); |
2573 | 2664 | ||
2574 | BUG_ON(s->failed > 2); | 2665 | BUG_ON(s->failed > 2); |
2575 | BUG_ON(s->uptodate < disks); | 2666 | |
2576 | /* Want to check and possibly repair P and Q. | 2667 | /* Want to check and possibly repair P and Q. |
2577 | * However there could be one 'failed' device, in which | 2668 | * However there could be one 'failed' device, in which |
2578 | * case we can only check one of them, possibly using the | 2669 | * case we can only check one of them, possibly using the |
2579 | * other to generate missing data | 2670 | * other to generate missing data |
2580 | */ | 2671 | */ |
2581 | 2672 | ||
2582 | /* If !tmp_page, we cannot do the calculations, | 2673 | switch (sh->check_state) { |
2583 | * but as we have set STRIPE_HANDLE, we will soon be called | 2674 | case check_state_idle: |
2584 | * by stripe_handle with a tmp_page - just wait until then. | 2675 | /* start a new check operation if there are < 2 failures */ |
2585 | */ | ||
2586 | if (tmp_page) { | ||
2587 | if (s->failed == r6s->q_failed) { | 2676 | if (s->failed == r6s->q_failed) { |
2588 | /* The only possible failed device holds 'Q', so it | 2677 | /* The only possible failed device holds Q, so it |
2589 | * makes sense to check P (If anything else were failed, | 2678 | * makes sense to check P (If anything else were failed, |
2590 | * we would have used P to recreate it). | 2679 | * we would have used P to recreate it). |
2591 | */ | 2680 | */ |
2592 | compute_block_1(sh, pd_idx, 1); | 2681 | sh->check_state = check_state_run; |
2593 | if (!page_is_zero(sh->dev[pd_idx].page)) { | ||
2594 | compute_block_1(sh, pd_idx, 0); | ||
2595 | update_p = 1; | ||
2596 | } | ||
2597 | } | 2682 | } |
2598 | if (!r6s->q_failed && s->failed < 2) { | 2683 | if (!r6s->q_failed && s->failed < 2) { |
2599 | /* q is not failed, and we didn't use it to generate | 2684 | /* Q is not failed, and we didn't use it to generate |
2600 | * anything, so it makes sense to check it | 2685 | * anything, so it makes sense to check it |
2601 | */ | 2686 | */ |
2602 | memcpy(page_address(tmp_page), | 2687 | if (sh->check_state == check_state_run) |
2603 | page_address(sh->dev[qd_idx].page), | 2688 | sh->check_state = check_state_run_pq; |
2604 | STRIPE_SIZE); | 2689 | else |
2605 | compute_parity6(sh, UPDATE_PARITY); | 2690 | sh->check_state = check_state_run_q; |
2606 | if (memcmp(page_address(tmp_page), | ||
2607 | page_address(sh->dev[qd_idx].page), | ||
2608 | STRIPE_SIZE) != 0) { | ||
2609 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
2610 | update_q = 1; | ||
2611 | } | ||
2612 | } | 2691 | } |
2613 | if (update_p || update_q) { | 2692 | |
2614 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | 2693 | /* discard potentially stale zero_sum_result */ |
2615 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | 2694 | sh->ops.zero_sum_result = 0; |
2616 | /* don't try to repair!! */ | 2695 | |
2617 | update_p = update_q = 0; | 2696 | if (sh->check_state == check_state_run) { |
2697 | /* async_xor_zero_sum destroys the contents of P */ | ||
2698 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
2699 | s->uptodate--; | ||
2700 | } | ||
2701 | if (sh->check_state >= check_state_run && | ||
2702 | sh->check_state <= check_state_run_pq) { | ||
2703 | /* async_syndrome_zero_sum preserves P and Q, so | ||
2704 | * no need to mark them !uptodate here | ||
2705 | */ | ||
2706 | set_bit(STRIPE_OP_CHECK, &s->ops_request); | ||
2707 | break; | ||
2618 | } | 2708 | } |
2619 | 2709 | ||
2710 | /* we have 2-disk failure */ | ||
2711 | BUG_ON(s->failed != 2); | ||
2712 | /* fall through */ | ||
2713 | case check_state_compute_result: | ||
2714 | sh->check_state = check_state_idle; | ||
2715 | |||
2716 | /* check that a write has not made the stripe insync */ | ||
2717 | if (test_bit(STRIPE_INSYNC, &sh->state)) | ||
2718 | break; | ||
2719 | |||
2620 | /* now write out any block on a failed drive, | 2720 | /* now write out any block on a failed drive, |
2621 | * or P or Q if they need it | 2721 | * or P or Q if they were recomputed |
2622 | */ | 2722 | */ |
2623 | 2723 | BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ | |
2624 | if (s->failed == 2) { | 2724 | if (s->failed == 2) { |
2625 | dev = &sh->dev[r6s->failed_num[1]]; | 2725 | dev = &sh->dev[r6s->failed_num[1]]; |
2626 | s->locked++; | 2726 | s->locked++; |
@@ -2633,14 +2733,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
2633 | set_bit(R5_LOCKED, &dev->flags); | 2733 | set_bit(R5_LOCKED, &dev->flags); |
2634 | set_bit(R5_Wantwrite, &dev->flags); | 2734 | set_bit(R5_Wantwrite, &dev->flags); |
2635 | } | 2735 | } |
2636 | 2736 | if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { | |
2637 | if (update_p) { | ||
2638 | dev = &sh->dev[pd_idx]; | 2737 | dev = &sh->dev[pd_idx]; |
2639 | s->locked++; | 2738 | s->locked++; |
2640 | set_bit(R5_LOCKED, &dev->flags); | 2739 | set_bit(R5_LOCKED, &dev->flags); |
2641 | set_bit(R5_Wantwrite, &dev->flags); | 2740 | set_bit(R5_Wantwrite, &dev->flags); |
2642 | } | 2741 | } |
2643 | if (update_q) { | 2742 | if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { |
2644 | dev = &sh->dev[qd_idx]; | 2743 | dev = &sh->dev[qd_idx]; |
2645 | s->locked++; | 2744 | s->locked++; |
2646 | set_bit(R5_LOCKED, &dev->flags); | 2745 | set_bit(R5_LOCKED, &dev->flags); |
@@ -2649,6 +2748,70 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
2649 | clear_bit(STRIPE_DEGRADED, &sh->state); | 2748 | clear_bit(STRIPE_DEGRADED, &sh->state); |
2650 | 2749 | ||
2651 | set_bit(STRIPE_INSYNC, &sh->state); | 2750 | set_bit(STRIPE_INSYNC, &sh->state); |
2751 | break; | ||
2752 | case check_state_run: | ||
2753 | case check_state_run_q: | ||
2754 | case check_state_run_pq: | ||
2755 | break; /* we will be called again upon completion */ | ||
2756 | case check_state_check_result: | ||
2757 | sh->check_state = check_state_idle; | ||
2758 | |||
2759 | /* handle a successful check operation, if parity is correct | ||
2760 | * we are done. Otherwise update the mismatch count and repair | ||
2761 | * parity if !MD_RECOVERY_CHECK | ||
2762 | */ | ||
2763 | if (sh->ops.zero_sum_result == 0) { | ||
2764 | /* both parities are correct */ | ||
2765 | if (!s->failed) | ||
2766 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2767 | else { | ||
2768 | /* in contrast to the raid5 case we can validate | ||
2769 | * parity, but still have a failure to write | ||
2770 | * back | ||
2771 | */ | ||
2772 | sh->check_state = check_state_compute_result; | ||
2773 | /* Returning at this point means that we may go | ||
2774 | * off and bring p and/or q uptodate again so | ||
2775 | * we make sure to check zero_sum_result again | ||
2776 | * to verify if p or q need writeback | ||
2777 | */ | ||
2778 | } | ||
2779 | } else { | ||
2780 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | ||
2781 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
2782 | /* don't try to repair!! */ | ||
2783 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2784 | else { | ||
2785 | int *target = &sh->ops.target; | ||
2786 | |||
2787 | sh->ops.target = -1; | ||
2788 | sh->ops.target2 = -1; | ||
2789 | sh->check_state = check_state_compute_run; | ||
2790 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
2791 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
2792 | if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { | ||
2793 | set_bit(R5_Wantcompute, | ||
2794 | &sh->dev[pd_idx].flags); | ||
2795 | *target = pd_idx; | ||
2796 | target = &sh->ops.target2; | ||
2797 | s->uptodate++; | ||
2798 | } | ||
2799 | if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { | ||
2800 | set_bit(R5_Wantcompute, | ||
2801 | &sh->dev[qd_idx].flags); | ||
2802 | *target = qd_idx; | ||
2803 | s->uptodate++; | ||
2804 | } | ||
2805 | } | ||
2806 | } | ||
2807 | break; | ||
2808 | case check_state_compute_run: | ||
2809 | break; | ||
2810 | default: | ||
2811 | printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", | ||
2812 | __func__, sh->check_state, | ||
2813 | (unsigned long long) sh->sector); | ||
2814 | BUG(); | ||
2652 | } | 2815 | } |
2653 | } | 2816 | } |
2654 | 2817 | ||
@@ -2666,6 +2829,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
2666 | if (i != sh->pd_idx && i != sh->qd_idx) { | 2829 | if (i != sh->pd_idx && i != sh->qd_idx) { |
2667 | int dd_idx, j; | 2830 | int dd_idx, j; |
2668 | struct stripe_head *sh2; | 2831 | struct stripe_head *sh2; |
2832 | struct async_submit_ctl submit; | ||
2669 | 2833 | ||
2670 | sector_t bn = compute_blocknr(sh, i, 1); | 2834 | sector_t bn = compute_blocknr(sh, i, 1); |
2671 | sector_t s = raid5_compute_sector(conf, bn, 0, | 2835 | sector_t s = raid5_compute_sector(conf, bn, 0, |
@@ -2685,9 +2849,10 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
2685 | } | 2849 | } |
2686 | 2850 | ||
2687 | /* place all the copies on one channel */ | 2851 | /* place all the copies on one channel */ |
2852 | init_async_submit(&submit, 0, tx, NULL, NULL, NULL); | ||
2688 | tx = async_memcpy(sh2->dev[dd_idx].page, | 2853 | tx = async_memcpy(sh2->dev[dd_idx].page, |
2689 | sh->dev[i].page, 0, 0, STRIPE_SIZE, | 2854 | sh->dev[i].page, 0, 0, STRIPE_SIZE, |
2690 | ASYNC_TX_DEP_ACK, tx, NULL, NULL); | 2855 | &submit); |
2691 | 2856 | ||
2692 | set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); | 2857 | set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); |
2693 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); | 2858 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); |
@@ -2973,7 +3138,7 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
2973 | /* Need to write out all blocks after computing parity */ | 3138 | /* Need to write out all blocks after computing parity */ |
2974 | sh->disks = conf->raid_disks; | 3139 | sh->disks = conf->raid_disks; |
2975 | stripe_set_idx(sh->sector, conf, 0, sh); | 3140 | stripe_set_idx(sh->sector, conf, 0, sh); |
2976 | schedule_reconstruction5(sh, &s, 1, 1); | 3141 | schedule_reconstruction(sh, &s, 1, 1); |
2977 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { | 3142 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { |
2978 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | 3143 | clear_bit(STRIPE_EXPAND_READY, &sh->state); |
2979 | atomic_dec(&conf->reshape_stripes); | 3144 | atomic_dec(&conf->reshape_stripes); |
@@ -2993,7 +3158,7 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
2993 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 3158 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); |
2994 | 3159 | ||
2995 | if (s.ops_request) | 3160 | if (s.ops_request) |
2996 | raid5_run_ops(sh, s.ops_request); | 3161 | raid_run_ops(sh, s.ops_request); |
2997 | 3162 | ||
2998 | ops_run_io(sh, &s); | 3163 | ops_run_io(sh, &s); |
2999 | 3164 | ||
@@ -3002,7 +3167,7 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
3002 | return blocked_rdev == NULL; | 3167 | return blocked_rdev == NULL; |
3003 | } | 3168 | } |
3004 | 3169 | ||
3005 | static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | 3170 | static bool handle_stripe6(struct stripe_head *sh) |
3006 | { | 3171 | { |
3007 | raid5_conf_t *conf = sh->raid_conf; | 3172 | raid5_conf_t *conf = sh->raid_conf; |
3008 | int disks = sh->disks; | 3173 | int disks = sh->disks; |
@@ -3014,9 +3179,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3014 | mdk_rdev_t *blocked_rdev = NULL; | 3179 | mdk_rdev_t *blocked_rdev = NULL; |
3015 | 3180 | ||
3016 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " | 3181 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " |
3017 | "pd_idx=%d, qd_idx=%d\n", | 3182 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", |
3018 | (unsigned long long)sh->sector, sh->state, | 3183 | (unsigned long long)sh->sector, sh->state, |
3019 | atomic_read(&sh->count), pd_idx, qd_idx); | 3184 | atomic_read(&sh->count), pd_idx, qd_idx, |
3185 | sh->check_state, sh->reconstruct_state); | ||
3020 | memset(&s, 0, sizeof(s)); | 3186 | memset(&s, 0, sizeof(s)); |
3021 | 3187 | ||
3022 | spin_lock(&sh->lock); | 3188 | spin_lock(&sh->lock); |
@@ -3036,35 +3202,24 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3036 | 3202 | ||
3037 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", | 3203 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", |
3038 | i, dev->flags, dev->toread, dev->towrite, dev->written); | 3204 | i, dev->flags, dev->toread, dev->towrite, dev->written); |
3039 | /* maybe we can reply to a read */ | 3205 | /* maybe we can reply to a read |
3040 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { | 3206 | * |
3041 | struct bio *rbi, *rbi2; | 3207 | * new wantfill requests are only permitted while |
3042 | pr_debug("Return read for disc %d\n", i); | 3208 | * ops_complete_biofill is guaranteed to be inactive |
3043 | spin_lock_irq(&conf->device_lock); | 3209 | */ |
3044 | rbi = dev->toread; | 3210 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && |
3045 | dev->toread = NULL; | 3211 | !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) |
3046 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) | 3212 | set_bit(R5_Wantfill, &dev->flags); |
3047 | wake_up(&conf->wait_for_overlap); | ||
3048 | spin_unlock_irq(&conf->device_lock); | ||
3049 | while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { | ||
3050 | copy_data(0, rbi, dev->page, dev->sector); | ||
3051 | rbi2 = r5_next_bio(rbi, dev->sector); | ||
3052 | spin_lock_irq(&conf->device_lock); | ||
3053 | if (!raid5_dec_bi_phys_segments(rbi)) { | ||
3054 | rbi->bi_next = return_bi; | ||
3055 | return_bi = rbi; | ||
3056 | } | ||
3057 | spin_unlock_irq(&conf->device_lock); | ||
3058 | rbi = rbi2; | ||
3059 | } | ||
3060 | } | ||
3061 | 3213 | ||
3062 | /* now count some things */ | 3214 | /* now count some things */ |
3063 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; | 3215 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; |
3064 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; | 3216 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; |
3217 | if (test_bit(R5_Wantcompute, &dev->flags)) | ||
3218 | BUG_ON(++s.compute > 2); | ||
3065 | 3219 | ||
3066 | 3220 | if (test_bit(R5_Wantfill, &dev->flags)) { | |
3067 | if (dev->toread) | 3221 | s.to_fill++; |
3222 | } else if (dev->toread) | ||
3068 | s.to_read++; | 3223 | s.to_read++; |
3069 | if (dev->towrite) { | 3224 | if (dev->towrite) { |
3070 | s.to_write++; | 3225 | s.to_write++; |
@@ -3105,6 +3260,11 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3105 | blocked_rdev = NULL; | 3260 | blocked_rdev = NULL; |
3106 | } | 3261 | } |
3107 | 3262 | ||
3263 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { | ||
3264 | set_bit(STRIPE_OP_BIOFILL, &s.ops_request); | ||
3265 | set_bit(STRIPE_BIOFILL_RUN, &sh->state); | ||
3266 | } | ||
3267 | |||
3108 | pr_debug("locked=%d uptodate=%d to_read=%d" | 3268 | pr_debug("locked=%d uptodate=%d to_read=%d" |
3109 | " to_write=%d failed=%d failed_num=%d,%d\n", | 3269 | " to_write=%d failed=%d failed_num=%d,%d\n", |
3110 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, | 3270 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, |
@@ -3145,19 +3305,62 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3145 | * or to load a block that is being partially written. | 3305 | * or to load a block that is being partially written. |
3146 | */ | 3306 | */ |
3147 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || | 3307 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || |
3148 | (s.syncing && (s.uptodate < disks)) || s.expanding) | 3308 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) |
3149 | handle_stripe_fill6(sh, &s, &r6s, disks); | 3309 | handle_stripe_fill6(sh, &s, &r6s, disks); |
3150 | 3310 | ||
3151 | /* now to consider writing and what else, if anything should be read */ | 3311 | /* Now we check to see if any write operations have recently |
3152 | if (s.to_write) | 3312 | * completed |
3313 | */ | ||
3314 | if (sh->reconstruct_state == reconstruct_state_drain_result) { | ||
3315 | int qd_idx = sh->qd_idx; | ||
3316 | |||
3317 | sh->reconstruct_state = reconstruct_state_idle; | ||
3318 | /* All the 'written' buffers and the parity blocks are ready to | ||
3319 | * be written back to disk | ||
3320 | */ | ||
3321 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); | ||
3322 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); | ||
3323 | for (i = disks; i--; ) { | ||
3324 | dev = &sh->dev[i]; | ||
3325 | if (test_bit(R5_LOCKED, &dev->flags) && | ||
3326 | (i == sh->pd_idx || i == qd_idx || | ||
3327 | dev->written)) { | ||
3328 | pr_debug("Writing block %d\n", i); | ||
3329 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); | ||
3330 | set_bit(R5_Wantwrite, &dev->flags); | ||
3331 | if (!test_bit(R5_Insync, &dev->flags) || | ||
3332 | ((i == sh->pd_idx || i == qd_idx) && | ||
3333 | s.failed == 0)) | ||
3334 | set_bit(STRIPE_INSYNC, &sh->state); | ||
3335 | } | ||
3336 | } | ||
3337 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
3338 | atomic_dec(&conf->preread_active_stripes); | ||
3339 | if (atomic_read(&conf->preread_active_stripes) < | ||
3340 | IO_THRESHOLD) | ||
3341 | md_wakeup_thread(conf->mddev->thread); | ||
3342 | } | ||
3343 | } | ||
3344 | |||
3345 | /* Now to consider new write requests and what else, if anything | ||
3346 | * should be read. We do not handle new writes when: | ||
3347 | * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. | ||
3348 | * 2/ A 'check' operation is in flight, as it may clobber the parity | ||
3349 | * block. | ||
3350 | */ | ||
3351 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) | ||
3153 | handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); | 3352 | handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); |
3154 | 3353 | ||
3155 | /* maybe we need to check and possibly fix the parity for this stripe | 3354 | /* maybe we need to check and possibly fix the parity for this stripe |
3156 | * Any reads will already have been scheduled, so we just see if enough | 3355 | * Any reads will already have been scheduled, so we just see if enough |
3157 | * data is available | 3356 | * data is available. The parity check is held off while parity |
3357 | * dependent operations are in flight. | ||
3158 | */ | 3358 | */ |
3159 | if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) | 3359 | if (sh->check_state || |
3160 | handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); | 3360 | (s.syncing && s.locked == 0 && |
3361 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && | ||
3362 | !test_bit(STRIPE_INSYNC, &sh->state))) | ||
3363 | handle_parity_checks6(conf, sh, &s, &r6s, disks); | ||
3161 | 3364 | ||
3162 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 3365 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
3163 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | 3366 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); |
@@ -3178,15 +3381,29 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3178 | set_bit(R5_Wantwrite, &dev->flags); | 3381 | set_bit(R5_Wantwrite, &dev->flags); |
3179 | set_bit(R5_ReWrite, &dev->flags); | 3382 | set_bit(R5_ReWrite, &dev->flags); |
3180 | set_bit(R5_LOCKED, &dev->flags); | 3383 | set_bit(R5_LOCKED, &dev->flags); |
3384 | s.locked++; | ||
3181 | } else { | 3385 | } else { |
3182 | /* let's read it back */ | 3386 | /* let's read it back */ |
3183 | set_bit(R5_Wantread, &dev->flags); | 3387 | set_bit(R5_Wantread, &dev->flags); |
3184 | set_bit(R5_LOCKED, &dev->flags); | 3388 | set_bit(R5_LOCKED, &dev->flags); |
3389 | s.locked++; | ||
3185 | } | 3390 | } |
3186 | } | 3391 | } |
3187 | } | 3392 | } |
3188 | 3393 | ||
3189 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { | 3394 | /* Finish reconstruct operations initiated by the expansion process */ |
3395 | if (sh->reconstruct_state == reconstruct_state_result) { | ||
3396 | sh->reconstruct_state = reconstruct_state_idle; | ||
3397 | clear_bit(STRIPE_EXPANDING, &sh->state); | ||
3398 | for (i = conf->raid_disks; i--; ) { | ||
3399 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
3400 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
3401 | s.locked++; | ||
3402 | } | ||
3403 | } | ||
3404 | |||
3405 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && | ||
3406 | !sh->reconstruct_state) { | ||
3190 | struct stripe_head *sh2 | 3407 | struct stripe_head *sh2 |
3191 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | 3408 | = get_active_stripe(conf, sh->sector, 1, 1, 1); |
3192 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { | 3409 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { |
@@ -3207,14 +3424,8 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3207 | /* Need to write out all blocks after computing P&Q */ | 3424 | /* Need to write out all blocks after computing P&Q */ |
3208 | sh->disks = conf->raid_disks; | 3425 | sh->disks = conf->raid_disks; |
3209 | stripe_set_idx(sh->sector, conf, 0, sh); | 3426 | stripe_set_idx(sh->sector, conf, 0, sh); |
3210 | compute_parity6(sh, RECONSTRUCT_WRITE); | 3427 | schedule_reconstruction(sh, &s, 1, 1); |
3211 | for (i = conf->raid_disks ; i-- ; ) { | 3428 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { |
3212 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
3213 | s.locked++; | ||
3214 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
3215 | } | ||
3216 | clear_bit(STRIPE_EXPANDING, &sh->state); | ||
3217 | } else if (s.expanded) { | ||
3218 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | 3429 | clear_bit(STRIPE_EXPAND_READY, &sh->state); |
3219 | atomic_dec(&conf->reshape_stripes); | 3430 | atomic_dec(&conf->reshape_stripes); |
3220 | wake_up(&conf->wait_for_overlap); | 3431 | wake_up(&conf->wait_for_overlap); |
@@ -3232,6 +3443,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3232 | if (unlikely(blocked_rdev)) | 3443 | if (unlikely(blocked_rdev)) |
3233 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 3444 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); |
3234 | 3445 | ||
3446 | if (s.ops_request) | ||
3447 | raid_run_ops(sh, s.ops_request); | ||
3448 | |||
3235 | ops_run_io(sh, &s); | 3449 | ops_run_io(sh, &s); |
3236 | 3450 | ||
3237 | return_io(return_bi); | 3451 | return_io(return_bi); |
@@ -3240,16 +3454,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3240 | } | 3454 | } |
3241 | 3455 | ||
3242 | /* returns true if the stripe was handled */ | 3456 | /* returns true if the stripe was handled */ |
3243 | static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) | 3457 | static bool handle_stripe(struct stripe_head *sh) |
3244 | { | 3458 | { |
3245 | if (sh->raid_conf->level == 6) | 3459 | if (sh->raid_conf->level == 6) |
3246 | return handle_stripe6(sh, tmp_page); | 3460 | return handle_stripe6(sh); |
3247 | else | 3461 | else |
3248 | return handle_stripe5(sh); | 3462 | return handle_stripe5(sh); |
3249 | } | 3463 | } |
3250 | 3464 | ||
3251 | |||
3252 | |||
3253 | static void raid5_activate_delayed(raid5_conf_t *conf) | 3465 | static void raid5_activate_delayed(raid5_conf_t *conf) |
3254 | { | 3466 | { |
3255 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { | 3467 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { |
@@ -4046,7 +4258,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
4046 | spin_unlock(&sh->lock); | 4258 | spin_unlock(&sh->lock); |
4047 | 4259 | ||
4048 | /* wait for any blocked device to be handled */ | 4260 | /* wait for any blocked device to be handled */ |
4049 | while(unlikely(!handle_stripe(sh, NULL))) | 4261 | while (unlikely(!handle_stripe(sh))) |
4050 | ; | 4262 | ; |
4051 | release_stripe(sh); | 4263 | release_stripe(sh); |
4052 | 4264 | ||
@@ -4103,7 +4315,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) | |||
4103 | return handled; | 4315 | return handled; |
4104 | } | 4316 | } |
4105 | 4317 | ||
4106 | handle_stripe(sh, NULL); | 4318 | handle_stripe(sh); |
4107 | release_stripe(sh); | 4319 | release_stripe(sh); |
4108 | handled++; | 4320 | handled++; |
4109 | } | 4321 | } |
@@ -4117,6 +4329,36 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) | |||
4117 | return handled; | 4329 | return handled; |
4118 | } | 4330 | } |
4119 | 4331 | ||
#ifdef CONFIG_MULTICORE_RAID456
/*
 * Callback handed to async_schedule_domain(): @param is the stripe_head
 * that process_stripe() queued.  The stripe is run through handle_stripe()
 * and then passed to release_stripe().  @cookie is not used.
 */
static void __process_stripe(void *param, async_cookie_t cookie)
{
	struct stripe_head *stripe = param;

	handle_stripe(stripe);
	release_stripe(stripe);
}

/*
 * Multicore variant: do not handle @sh here, schedule __process_stripe()
 * for it on @domain instead.
 */
static void process_stripe(struct stripe_head *sh, struct list_head *domain)
{
	async_schedule_domain(__process_stripe, sh, domain);
}

/*
 * Multicore variant: hand @domain to async_synchronize_full_domain()
 * (presumably waits for the work scheduled on it - confirm against the
 * async API).
 */
static void synchronize_stripe_processing(struct list_head *domain)
{
	async_synchronize_full_domain(domain);
}
#else
/*
 * Single-threaded variant: handle and release @sh inline, then offer to
 * reschedule.  @domain is accepted only to keep the signature identical
 * to the multicore variant.
 */
static void process_stripe(struct stripe_head *sh, struct list_head *domain)
{
	handle_stripe(sh);
	release_stripe(sh);
	cond_resched();
}

/*
 * Single-threaded variant: process_stripe() already did all the work
 * inline, so there is nothing to do here.
 */
static void synchronize_stripe_processing(struct list_head *domain)
{
}
#endif
4120 | 4362 | ||
4121 | 4363 | ||
4122 | /* | 4364 | /* |
@@ -4131,6 +4373,7 @@ static void raid5d(mddev_t *mddev) | |||
4131 | struct stripe_head *sh; | 4373 | struct stripe_head *sh; |
4132 | raid5_conf_t *conf = mddev->private; | 4374 | raid5_conf_t *conf = mddev->private; |
4133 | int handled; | 4375 | int handled; |
4376 | LIST_HEAD(raid_domain); | ||
4134 | 4377 | ||
4135 | pr_debug("+++ raid5d active\n"); | 4378 | pr_debug("+++ raid5d active\n"); |
4136 | 4379 | ||
@@ -4167,8 +4410,7 @@ static void raid5d(mddev_t *mddev) | |||
4167 | spin_unlock_irq(&conf->device_lock); | 4410 | spin_unlock_irq(&conf->device_lock); |
4168 | 4411 | ||
4169 | handled++; | 4412 | handled++; |
4170 | handle_stripe(sh, conf->spare_page); | 4413 | process_stripe(sh, &raid_domain); |
4171 | release_stripe(sh); | ||
4172 | 4414 | ||
4173 | spin_lock_irq(&conf->device_lock); | 4415 | spin_lock_irq(&conf->device_lock); |
4174 | } | 4416 | } |
@@ -4176,6 +4418,7 @@ static void raid5d(mddev_t *mddev) | |||
4176 | 4418 | ||
4177 | spin_unlock_irq(&conf->device_lock); | 4419 | spin_unlock_irq(&conf->device_lock); |
4178 | 4420 | ||
4421 | synchronize_stripe_processing(&raid_domain); | ||
4179 | async_tx_issue_pending_all(); | 4422 | async_tx_issue_pending_all(); |
4180 | unplug_slaves(mddev); | 4423 | unplug_slaves(mddev); |
4181 | 4424 | ||
@@ -4308,6 +4551,118 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
4308 | return sectors * (raid_disks - conf->max_degraded); | 4551 | return sectors * (raid_disks - conf->max_degraded); |
4309 | } | 4552 | } |
4310 | 4553 | ||
4554 | static void raid5_free_percpu(raid5_conf_t *conf) | ||
4555 | { | ||
4556 | struct raid5_percpu *percpu; | ||
4557 | unsigned long cpu; | ||
4558 | |||
4559 | if (!conf->percpu) | ||
4560 | return; | ||
4561 | |||
4562 | get_online_cpus(); | ||
4563 | for_each_possible_cpu(cpu) { | ||
4564 | percpu = per_cpu_ptr(conf->percpu, cpu); | ||
4565 | safe_put_page(percpu->spare_page); | ||
4566 | kfree(percpu->scribble); | ||
4567 | } | ||
4568 | #ifdef CONFIG_HOTPLUG_CPU | ||
4569 | unregister_cpu_notifier(&conf->cpu_notify); | ||
4570 | #endif | ||
4571 | put_online_cpus(); | ||
4572 | |||
4573 | free_percpu(conf->percpu); | ||
4574 | } | ||
4575 | |||
4576 | static void free_conf(raid5_conf_t *conf) | ||
4577 | { | ||
4578 | shrink_stripes(conf); | ||
4579 | raid5_free_percpu(conf); | ||
4580 | kfree(conf->disks); | ||
4581 | kfree(conf->stripe_hashtbl); | ||
4582 | kfree(conf); | ||
4583 | } | ||
4584 | |||
4585 | #ifdef CONFIG_HOTPLUG_CPU | ||
4586 | static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, | ||
4587 | void *hcpu) | ||
4588 | { | ||
4589 | raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify); | ||
4590 | long cpu = (long)hcpu; | ||
4591 | struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); | ||
4592 | |||
4593 | switch (action) { | ||
4594 | case CPU_UP_PREPARE: | ||
4595 | case CPU_UP_PREPARE_FROZEN: | ||
4596 | if (conf->level == 6 && !percpu->spare_page) | ||
4597 | percpu->spare_page = alloc_page(GFP_KERNEL); | ||
4598 | if (!percpu->scribble) | ||
4599 | percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); | ||
4600 | |||
4601 | if (!percpu->scribble || | ||
4602 | (conf->level == 6 && !percpu->spare_page)) { | ||
4603 | safe_put_page(percpu->spare_page); | ||
4604 | kfree(percpu->scribble); | ||
4605 | pr_err("%s: failed memory allocation for cpu%ld\n", | ||
4606 | __func__, cpu); | ||
4607 | return NOTIFY_BAD; | ||
4608 | } | ||
4609 | break; | ||
4610 | case CPU_DEAD: | ||
4611 | case CPU_DEAD_FROZEN: | ||
4612 | safe_put_page(percpu->spare_page); | ||
4613 | kfree(percpu->scribble); | ||
4614 | percpu->spare_page = NULL; | ||
4615 | percpu->scribble = NULL; | ||
4616 | break; | ||
4617 | default: | ||
4618 | break; | ||
4619 | } | ||
4620 | return NOTIFY_OK; | ||
4621 | } | ||
4622 | #endif | ||
4623 | |||
4624 | static int raid5_alloc_percpu(raid5_conf_t *conf) | ||
4625 | { | ||
4626 | unsigned long cpu; | ||
4627 | struct page *spare_page; | ||
4628 | struct raid5_percpu *allcpus; | ||
4629 | void *scribble; | ||
4630 | int err; | ||
4631 | |||
4632 | allcpus = alloc_percpu(struct raid5_percpu); | ||
4633 | if (!allcpus) | ||
4634 | return -ENOMEM; | ||
4635 | conf->percpu = allcpus; | ||
4636 | |||
4637 | get_online_cpus(); | ||
4638 | err = 0; | ||
4639 | for_each_present_cpu(cpu) { | ||
4640 | if (conf->level == 6) { | ||
4641 | spare_page = alloc_page(GFP_KERNEL); | ||
4642 | if (!spare_page) { | ||
4643 | err = -ENOMEM; | ||
4644 | break; | ||
4645 | } | ||
4646 | per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; | ||
4647 | } | ||
4648 | scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL); | ||
4649 | if (!scribble) { | ||
4650 | err = -ENOMEM; | ||
4651 | break; | ||
4652 | } | ||
4653 | per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; | ||
4654 | } | ||
4655 | #ifdef CONFIG_HOTPLUG_CPU | ||
4656 | conf->cpu_notify.notifier_call = raid456_cpu_notify; | ||
4657 | conf->cpu_notify.priority = 0; | ||
4658 | if (err == 0) | ||
4659 | err = register_cpu_notifier(&conf->cpu_notify); | ||
4660 | #endif | ||
4661 | put_online_cpus(); | ||
4662 | |||
4663 | return err; | ||
4664 | } | ||
4665 | |||
4311 | static raid5_conf_t *setup_conf(mddev_t *mddev) | 4666 | static raid5_conf_t *setup_conf(mddev_t *mddev) |
4312 | { | 4667 | { |
4313 | raid5_conf_t *conf; | 4668 | raid5_conf_t *conf; |
@@ -4349,6 +4704,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4349 | goto abort; | 4704 | goto abort; |
4350 | 4705 | ||
4351 | conf->raid_disks = mddev->raid_disks; | 4706 | conf->raid_disks = mddev->raid_disks; |
4707 | conf->scribble_len = scribble_len(conf->raid_disks); | ||
4352 | if (mddev->reshape_position == MaxSector) | 4708 | if (mddev->reshape_position == MaxSector) |
4353 | conf->previous_raid_disks = mddev->raid_disks; | 4709 | conf->previous_raid_disks = mddev->raid_disks; |
4354 | else | 4710 | else |
@@ -4364,11 +4720,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4364 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) | 4720 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) |
4365 | goto abort; | 4721 | goto abort; |
4366 | 4722 | ||
4367 | if (mddev->new_level == 6) { | 4723 | conf->level = mddev->new_level; |
4368 | conf->spare_page = alloc_page(GFP_KERNEL); | 4724 | if (raid5_alloc_percpu(conf) != 0) |
4369 | if (!conf->spare_page) | 4725 | goto abort; |
4370 | goto abort; | 4726 | |
4371 | } | ||
4372 | spin_lock_init(&conf->device_lock); | 4727 | spin_lock_init(&conf->device_lock); |
4373 | init_waitqueue_head(&conf->wait_for_stripe); | 4728 | init_waitqueue_head(&conf->wait_for_stripe); |
4374 | init_waitqueue_head(&conf->wait_for_overlap); | 4729 | init_waitqueue_head(&conf->wait_for_overlap); |
@@ -4439,11 +4794,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4439 | 4794 | ||
4440 | abort: | 4795 | abort: |
4441 | if (conf) { | 4796 | if (conf) { |
4442 | shrink_stripes(conf); | 4797 | free_conf(conf); |
4443 | safe_put_page(conf->spare_page); | ||
4444 | kfree(conf->disks); | ||
4445 | kfree(conf->stripe_hashtbl); | ||
4446 | kfree(conf); | ||
4447 | return ERR_PTR(-EIO); | 4798 | return ERR_PTR(-EIO); |
4448 | } else | 4799 | } else |
4449 | return ERR_PTR(-ENOMEM); | 4800 | return ERR_PTR(-ENOMEM); |
@@ -4613,12 +4964,8 @@ abort: | |||
4613 | md_unregister_thread(mddev->thread); | 4964 | md_unregister_thread(mddev->thread); |
4614 | mddev->thread = NULL; | 4965 | mddev->thread = NULL; |
4615 | if (conf) { | 4966 | if (conf) { |
4616 | shrink_stripes(conf); | ||
4617 | print_raid5_conf(conf); | 4967 | print_raid5_conf(conf); |
4618 | safe_put_page(conf->spare_page); | 4968 | free_conf(conf); |
4619 | kfree(conf->disks); | ||
4620 | kfree(conf->stripe_hashtbl); | ||
4621 | kfree(conf); | ||
4622 | } | 4969 | } |
4623 | mddev->private = NULL; | 4970 | mddev->private = NULL; |
4624 | printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); | 4971 | printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); |
@@ -4633,13 +4980,10 @@ static int stop(mddev_t *mddev) | |||
4633 | 4980 | ||
4634 | md_unregister_thread(mddev->thread); | 4981 | md_unregister_thread(mddev->thread); |
4635 | mddev->thread = NULL; | 4982 | mddev->thread = NULL; |
4636 | shrink_stripes(conf); | ||
4637 | kfree(conf->stripe_hashtbl); | ||
4638 | mddev->queue->backing_dev_info.congested_fn = NULL; | 4983 | mddev->queue->backing_dev_info.congested_fn = NULL; |
4639 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 4984 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ |
4640 | sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); | 4985 | sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); |
4641 | kfree(conf->disks); | 4986 | free_conf(conf); |
4642 | kfree(conf); | ||
4643 | mddev->private = NULL; | 4987 | mddev->private = NULL; |
4644 | return 0; | 4988 | return 0; |
4645 | } | 4989 | } |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 9459689c4ea0..2390e0e83daf 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #define _RAID5_H | 2 | #define _RAID5_H |
3 | 3 | ||
4 | #include <linux/raid/xor.h> | 4 | #include <linux/raid/xor.h> |
5 | #include <linux/dmaengine.h> | ||
5 | 6 | ||
6 | /* | 7 | /* |
7 | * | 8 | * |
@@ -175,7 +176,9 @@ | |||
175 | */ | 176 | */ |
176 | enum check_states { | 177 | enum check_states { |
177 | check_state_idle = 0, | 178 | check_state_idle = 0, |
178 | check_state_run, /* parity check */ | 179 | check_state_run, /* xor parity check */ |
180 | check_state_run_q, /* q-parity check */ | ||
181 | check_state_run_pq, /* pq dual parity check */ | ||
179 | check_state_check_result, | 182 | check_state_check_result, |
180 | check_state_compute_run, /* parity repair */ | 183 | check_state_compute_run, /* parity repair */ |
181 | check_state_compute_result, | 184 | check_state_compute_result, |
@@ -215,8 +218,8 @@ struct stripe_head { | |||
215 | * @target - STRIPE_OP_COMPUTE_BLK target | 218 | * @target - STRIPE_OP_COMPUTE_BLK target |
216 | */ | 219 | */ |
217 | struct stripe_operations { | 220 | struct stripe_operations { |
218 | int target; | 221 | int target, target2; |
219 | u32 zero_sum_result; | 222 | enum sum_check_flags zero_sum_result; |
220 | } ops; | 223 | } ops; |
221 | struct r5dev { | 224 | struct r5dev { |
222 | struct bio req; | 225 | struct bio req; |
@@ -298,7 +301,7 @@ struct r6_state { | |||
298 | #define STRIPE_OP_COMPUTE_BLK 1 | 301 | #define STRIPE_OP_COMPUTE_BLK 1 |
299 | #define STRIPE_OP_PREXOR 2 | 302 | #define STRIPE_OP_PREXOR 2 |
300 | #define STRIPE_OP_BIODRAIN 3 | 303 | #define STRIPE_OP_BIODRAIN 3 |
301 | #define STRIPE_OP_POSTXOR 4 | 304 | #define STRIPE_OP_RECONSTRUCT 4 |
302 | #define STRIPE_OP_CHECK 5 | 305 | #define STRIPE_OP_CHECK 5 |
303 | 306 | ||
304 | /* | 307 | /* |
@@ -385,8 +388,21 @@ struct raid5_private_data { | |||
385 | * (fresh device added). | 388 | * (fresh device added). |
386 | * Cleared when a sync completes. | 389 | * Cleared when a sync completes. |
387 | */ | 390 | */ |
388 | 391 | /* per cpu variables */ | |
389 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | 392 | struct raid5_percpu { |
393 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | ||
394 | void *scribble; /* space for constructing buffer | ||
395 | * lists and performing address | ||
396 | * conversions | ||
397 | */ | ||
398 | } *percpu; | ||
399 | size_t scribble_len; /* size of scribble region must be | ||
400 | * associated with conf to handle | ||
401 | * cpu hotplug while reshaping | ||
402 | */ | ||
403 | #ifdef CONFIG_HOTPLUG_CPU | ||
404 | struct notifier_block cpu_notify; | ||
405 | #endif | ||
390 | 406 | ||
391 | /* | 407 | /* |
392 | * Free stripes pool | 408 | * Free stripes pool |