diff options
author | NeilBrown <neilb@suse.de> | 2009-09-23 04:31:11 -0400 |
---|---|---|
committer | NeilBrown <neilb@suse.de> | 2009-09-23 04:31:11 -0400 |
commit | 4b3df5668c8ebaebd8d66a5a94374be3e3b2ef0c (patch) | |
tree | 51a231742e211143f5845edf4b09d1712dcd2771 /drivers/md | |
parent | 1ef04fefe2241087d9db7e9615c3f11b516e36cf (diff) | |
parent | 1f6672d44c1ae7408b43c06170ec34eb0a0e9b9f (diff) |
Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/djbw/async_tx into for-linus
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 26 | ||||
-rw-r--r-- | drivers/md/raid5.c | 1475 | ||||
-rw-r--r-- | drivers/md/raid5.h | 28 |
3 files changed, 961 insertions, 568 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 020f9573fd82..2158377a1359 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -124,6 +124,8 @@ config MD_RAID456 | |||
124 | select MD_RAID6_PQ | 124 | select MD_RAID6_PQ |
125 | select ASYNC_MEMCPY | 125 | select ASYNC_MEMCPY |
126 | select ASYNC_XOR | 126 | select ASYNC_XOR |
127 | select ASYNC_PQ | ||
128 | select ASYNC_RAID6_RECOV | ||
127 | ---help--- | 129 | ---help--- |
128 | A RAID-5 set of N drives with a capacity of C MB per drive provides | 130 | A RAID-5 set of N drives with a capacity of C MB per drive provides |
129 | the capacity of C * (N - 1) MB, and protects against a failure | 131 | the capacity of C * (N - 1) MB, and protects against a failure |
@@ -152,9 +154,33 @@ config MD_RAID456 | |||
152 | 154 | ||
153 | If unsure, say Y. | 155 | If unsure, say Y. |
154 | 156 | ||
157 | config MULTICORE_RAID456 | ||
158 | bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)" | ||
159 | depends on MD_RAID456 | ||
160 | depends on SMP | ||
161 | depends on EXPERIMENTAL | ||
162 | ---help--- | ||
163 | Enable the raid456 module to dispatch per-stripe raid operations to a | ||
164 | thread pool. | ||
165 | |||
166 | If unsure, say N. | ||
167 | |||
155 | config MD_RAID6_PQ | 168 | config MD_RAID6_PQ |
156 | tristate | 169 | tristate |
157 | 170 | ||
171 | config ASYNC_RAID6_TEST | ||
172 | tristate "Self test for hardware accelerated raid6 recovery" | ||
173 | depends on MD_RAID6_PQ | ||
174 | select ASYNC_RAID6_RECOV | ||
175 | ---help--- | ||
176 | This is a one-shot self test that permutes through the | ||
177 | recovery of all the possible two disk failure scenarios for a | ||
178 | N-disk array. Recovery is performed with the asynchronous | ||
179 | raid6 recovery routines, and will optionally use an offload | ||
180 | engine if one is available. | ||
181 | |||
182 | If unsure, say N. | ||
183 | |||
158 | config MD_MULTIPATH | 184 | config MD_MULTIPATH |
159 | tristate "Multipath I/O support" | 185 | tristate "Multipath I/O support" |
160 | depends on BLK_DEV_MD | 186 | depends on BLK_DEV_MD |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9db84c98a41d..94829804ab7f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -47,7 +47,9 @@ | |||
47 | #include <linux/kthread.h> | 47 | #include <linux/kthread.h> |
48 | #include <linux/raid/pq.h> | 48 | #include <linux/raid/pq.h> |
49 | #include <linux/async_tx.h> | 49 | #include <linux/async_tx.h> |
50 | #include <linux/async.h> | ||
50 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
52 | #include <linux/cpu.h> | ||
51 | #include "md.h" | 53 | #include "md.h" |
52 | #include "raid5.h" | 54 | #include "raid5.h" |
53 | #include "bitmap.h" | 55 | #include "bitmap.h" |
@@ -499,11 +501,18 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, | |||
499 | struct page *bio_page; | 501 | struct page *bio_page; |
500 | int i; | 502 | int i; |
501 | int page_offset; | 503 | int page_offset; |
504 | struct async_submit_ctl submit; | ||
505 | enum async_tx_flags flags = 0; | ||
502 | 506 | ||
503 | if (bio->bi_sector >= sector) | 507 | if (bio->bi_sector >= sector) |
504 | page_offset = (signed)(bio->bi_sector - sector) * 512; | 508 | page_offset = (signed)(bio->bi_sector - sector) * 512; |
505 | else | 509 | else |
506 | page_offset = (signed)(sector - bio->bi_sector) * -512; | 510 | page_offset = (signed)(sector - bio->bi_sector) * -512; |
511 | |||
512 | if (frombio) | ||
513 | flags |= ASYNC_TX_FENCE; | ||
514 | init_async_submit(&submit, flags, tx, NULL, NULL, NULL); | ||
515 | |||
507 | bio_for_each_segment(bvl, bio, i) { | 516 | bio_for_each_segment(bvl, bio, i) { |
508 | int len = bio_iovec_idx(bio, i)->bv_len; | 517 | int len = bio_iovec_idx(bio, i)->bv_len; |
509 | int clen; | 518 | int clen; |
@@ -525,15 +534,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, | |||
525 | bio_page = bio_iovec_idx(bio, i)->bv_page; | 534 | bio_page = bio_iovec_idx(bio, i)->bv_page; |
526 | if (frombio) | 535 | if (frombio) |
527 | tx = async_memcpy(page, bio_page, page_offset, | 536 | tx = async_memcpy(page, bio_page, page_offset, |
528 | b_offset, clen, | 537 | b_offset, clen, &submit); |
529 | ASYNC_TX_DEP_ACK, | ||
530 | tx, NULL, NULL); | ||
531 | else | 538 | else |
532 | tx = async_memcpy(bio_page, page, b_offset, | 539 | tx = async_memcpy(bio_page, page, b_offset, |
533 | page_offset, clen, | 540 | page_offset, clen, &submit); |
534 | ASYNC_TX_DEP_ACK, | ||
535 | tx, NULL, NULL); | ||
536 | } | 541 | } |
542 | /* chain the operations */ | ||
543 | submit.depend_tx = tx; | ||
544 | |||
537 | if (clen < len) /* hit end of page */ | 545 | if (clen < len) /* hit end of page */ |
538 | break; | 546 | break; |
539 | page_offset += len; | 547 | page_offset += len; |
@@ -592,6 +600,7 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
592 | { | 600 | { |
593 | struct dma_async_tx_descriptor *tx = NULL; | 601 | struct dma_async_tx_descriptor *tx = NULL; |
594 | raid5_conf_t *conf = sh->raid_conf; | 602 | raid5_conf_t *conf = sh->raid_conf; |
603 | struct async_submit_ctl submit; | ||
595 | int i; | 604 | int i; |
596 | 605 | ||
597 | pr_debug("%s: stripe %llu\n", __func__, | 606 | pr_debug("%s: stripe %llu\n", __func__, |
@@ -615,22 +624,34 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
615 | } | 624 | } |
616 | 625 | ||
617 | atomic_inc(&sh->count); | 626 | atomic_inc(&sh->count); |
618 | async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, | 627 | init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); |
619 | ops_complete_biofill, sh); | 628 | async_trigger_callback(&submit); |
620 | } | 629 | } |
621 | 630 | ||
622 | static void ops_complete_compute5(void *stripe_head_ref) | 631 | static void mark_target_uptodate(struct stripe_head *sh, int target) |
623 | { | 632 | { |
624 | struct stripe_head *sh = stripe_head_ref; | 633 | struct r5dev *tgt; |
625 | int target = sh->ops.target; | ||
626 | struct r5dev *tgt = &sh->dev[target]; | ||
627 | 634 | ||
628 | pr_debug("%s: stripe %llu\n", __func__, | 635 | if (target < 0) |
629 | (unsigned long long)sh->sector); | 636 | return; |
630 | 637 | ||
638 | tgt = &sh->dev[target]; | ||
631 | set_bit(R5_UPTODATE, &tgt->flags); | 639 | set_bit(R5_UPTODATE, &tgt->flags); |
632 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | 640 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); |
633 | clear_bit(R5_Wantcompute, &tgt->flags); | 641 | clear_bit(R5_Wantcompute, &tgt->flags); |
642 | } | ||
643 | |||
644 | static void ops_complete_compute(void *stripe_head_ref) | ||
645 | { | ||
646 | struct stripe_head *sh = stripe_head_ref; | ||
647 | |||
648 | pr_debug("%s: stripe %llu\n", __func__, | ||
649 | (unsigned long long)sh->sector); | ||
650 | |||
651 | /* mark the computed target(s) as uptodate */ | ||
652 | mark_target_uptodate(sh, sh->ops.target); | ||
653 | mark_target_uptodate(sh, sh->ops.target2); | ||
654 | |||
634 | clear_bit(STRIPE_COMPUTE_RUN, &sh->state); | 655 | clear_bit(STRIPE_COMPUTE_RUN, &sh->state); |
635 | if (sh->check_state == check_state_compute_run) | 656 | if (sh->check_state == check_state_compute_run) |
636 | sh->check_state = check_state_compute_result; | 657 | sh->check_state = check_state_compute_result; |
@@ -638,16 +659,24 @@ static void ops_complete_compute5(void *stripe_head_ref) | |||
638 | release_stripe(sh); | 659 | release_stripe(sh); |
639 | } | 660 | } |
640 | 661 | ||
641 | static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) | 662 | /* return a pointer to the address conversion region of the scribble buffer */ |
663 | static addr_conv_t *to_addr_conv(struct stripe_head *sh, | ||
664 | struct raid5_percpu *percpu) | ||
665 | { | ||
666 | return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); | ||
667 | } | ||
668 | |||
669 | static struct dma_async_tx_descriptor * | ||
670 | ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) | ||
642 | { | 671 | { |
643 | /* kernel stack size limits the total number of disks */ | ||
644 | int disks = sh->disks; | 672 | int disks = sh->disks; |
645 | struct page *xor_srcs[disks]; | 673 | struct page **xor_srcs = percpu->scribble; |
646 | int target = sh->ops.target; | 674 | int target = sh->ops.target; |
647 | struct r5dev *tgt = &sh->dev[target]; | 675 | struct r5dev *tgt = &sh->dev[target]; |
648 | struct page *xor_dest = tgt->page; | 676 | struct page *xor_dest = tgt->page; |
649 | int count = 0; | 677 | int count = 0; |
650 | struct dma_async_tx_descriptor *tx; | 678 | struct dma_async_tx_descriptor *tx; |
679 | struct async_submit_ctl submit; | ||
651 | int i; | 680 | int i; |
652 | 681 | ||
653 | pr_debug("%s: stripe %llu block: %d\n", | 682 | pr_debug("%s: stripe %llu block: %d\n", |
@@ -660,17 +689,215 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) | |||
660 | 689 | ||
661 | atomic_inc(&sh->count); | 690 | atomic_inc(&sh->count); |
662 | 691 | ||
692 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, | ||
693 | ops_complete_compute, sh, to_addr_conv(sh, percpu)); | ||
663 | if (unlikely(count == 1)) | 694 | if (unlikely(count == 1)) |
664 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, | 695 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); |
665 | 0, NULL, ops_complete_compute5, sh); | ||
666 | else | 696 | else |
667 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 697 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
668 | ASYNC_TX_XOR_ZERO_DST, NULL, | ||
669 | ops_complete_compute5, sh); | ||
670 | 698 | ||
671 | return tx; | 699 | return tx; |
672 | } | 700 | } |
673 | 701 | ||
702 | /* set_syndrome_sources - populate source buffers for gen_syndrome | ||
703 | * @srcs - (struct page *) array of size sh->disks | ||
704 | * @sh - stripe_head to parse | ||
705 | * | ||
706 | * Populates srcs in proper layout order for the stripe and returns the | ||
707 | * 'count' of sources to be used in a call to async_gen_syndrome. The P | ||
708 | * destination buffer is recorded in srcs[count] and the Q destination | ||
709 | * is recorded in srcs[count+1]]. | ||
710 | */ | ||
711 | static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) | ||
712 | { | ||
713 | int disks = sh->disks; | ||
714 | int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); | ||
715 | int d0_idx = raid6_d0(sh); | ||
716 | int count; | ||
717 | int i; | ||
718 | |||
719 | for (i = 0; i < disks; i++) | ||
720 | srcs[i] = (void *)raid6_empty_zero_page; | ||
721 | |||
722 | count = 0; | ||
723 | i = d0_idx; | ||
724 | do { | ||
725 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
726 | |||
727 | srcs[slot] = sh->dev[i].page; | ||
728 | i = raid6_next_disk(i, disks); | ||
729 | } while (i != d0_idx); | ||
730 | BUG_ON(count != syndrome_disks); | ||
731 | |||
732 | return count; | ||
733 | } | ||
734 | |||
735 | static struct dma_async_tx_descriptor * | ||
736 | ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) | ||
737 | { | ||
738 | int disks = sh->disks; | ||
739 | struct page **blocks = percpu->scribble; | ||
740 | int target; | ||
741 | int qd_idx = sh->qd_idx; | ||
742 | struct dma_async_tx_descriptor *tx; | ||
743 | struct async_submit_ctl submit; | ||
744 | struct r5dev *tgt; | ||
745 | struct page *dest; | ||
746 | int i; | ||
747 | int count; | ||
748 | |||
749 | if (sh->ops.target < 0) | ||
750 | target = sh->ops.target2; | ||
751 | else if (sh->ops.target2 < 0) | ||
752 | target = sh->ops.target; | ||
753 | else | ||
754 | /* we should only have one valid target */ | ||
755 | BUG(); | ||
756 | BUG_ON(target < 0); | ||
757 | pr_debug("%s: stripe %llu block: %d\n", | ||
758 | __func__, (unsigned long long)sh->sector, target); | ||
759 | |||
760 | tgt = &sh->dev[target]; | ||
761 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | ||
762 | dest = tgt->page; | ||
763 | |||
764 | atomic_inc(&sh->count); | ||
765 | |||
766 | if (target == qd_idx) { | ||
767 | count = set_syndrome_sources(blocks, sh); | ||
768 | blocks[count] = NULL; /* regenerating p is not necessary */ | ||
769 | BUG_ON(blocks[count+1] != dest); /* q should already be set */ | ||
770 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, | ||
771 | ops_complete_compute, sh, | ||
772 | to_addr_conv(sh, percpu)); | ||
773 | tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); | ||
774 | } else { | ||
775 | /* Compute any data- or p-drive using XOR */ | ||
776 | count = 0; | ||
777 | for (i = disks; i-- ; ) { | ||
778 | if (i == target || i == qd_idx) | ||
779 | continue; | ||
780 | blocks[count++] = sh->dev[i].page; | ||
781 | } | ||
782 | |||
783 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, | ||
784 | NULL, ops_complete_compute, sh, | ||
785 | to_addr_conv(sh, percpu)); | ||
786 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); | ||
787 | } | ||
788 | |||
789 | return tx; | ||
790 | } | ||
791 | |||
792 | static struct dma_async_tx_descriptor * | ||
793 | ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) | ||
794 | { | ||
795 | int i, count, disks = sh->disks; | ||
796 | int syndrome_disks = sh->ddf_layout ? disks : disks-2; | ||
797 | int d0_idx = raid6_d0(sh); | ||
798 | int faila = -1, failb = -1; | ||
799 | int target = sh->ops.target; | ||
800 | int target2 = sh->ops.target2; | ||
801 | struct r5dev *tgt = &sh->dev[target]; | ||
802 | struct r5dev *tgt2 = &sh->dev[target2]; | ||
803 | struct dma_async_tx_descriptor *tx; | ||
804 | struct page **blocks = percpu->scribble; | ||
805 | struct async_submit_ctl submit; | ||
806 | |||
807 | pr_debug("%s: stripe %llu block1: %d block2: %d\n", | ||
808 | __func__, (unsigned long long)sh->sector, target, target2); | ||
809 | BUG_ON(target < 0 || target2 < 0); | ||
810 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | ||
811 | BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); | ||
812 | |||
813 | /* we need to open-code set_syndrome_sources to handle the | ||
814 | * slot number conversion for 'faila' and 'failb' | ||
815 | */ | ||
816 | for (i = 0; i < disks ; i++) | ||
817 | blocks[i] = (void *)raid6_empty_zero_page; | ||
818 | count = 0; | ||
819 | i = d0_idx; | ||
820 | do { | ||
821 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
822 | |||
823 | blocks[slot] = sh->dev[i].page; | ||
824 | |||
825 | if (i == target) | ||
826 | faila = slot; | ||
827 | if (i == target2) | ||
828 | failb = slot; | ||
829 | i = raid6_next_disk(i, disks); | ||
830 | } while (i != d0_idx); | ||
831 | BUG_ON(count != syndrome_disks); | ||
832 | |||
833 | BUG_ON(faila == failb); | ||
834 | if (failb < faila) | ||
835 | swap(faila, failb); | ||
836 | pr_debug("%s: stripe: %llu faila: %d failb: %d\n", | ||
837 | __func__, (unsigned long long)sh->sector, faila, failb); | ||
838 | |||
839 | atomic_inc(&sh->count); | ||
840 | |||
841 | if (failb == syndrome_disks+1) { | ||
842 | /* Q disk is one of the missing disks */ | ||
843 | if (faila == syndrome_disks) { | ||
844 | /* Missing P+Q, just recompute */ | ||
845 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, | ||
846 | ops_complete_compute, sh, | ||
847 | to_addr_conv(sh, percpu)); | ||
848 | return async_gen_syndrome(blocks, 0, count+2, | ||
849 | STRIPE_SIZE, &submit); | ||
850 | } else { | ||
851 | struct page *dest; | ||
852 | int data_target; | ||
853 | int qd_idx = sh->qd_idx; | ||
854 | |||
855 | /* Missing D+Q: recompute D from P, then recompute Q */ | ||
856 | if (target == qd_idx) | ||
857 | data_target = target2; | ||
858 | else | ||
859 | data_target = target; | ||
860 | |||
861 | count = 0; | ||
862 | for (i = disks; i-- ; ) { | ||
863 | if (i == data_target || i == qd_idx) | ||
864 | continue; | ||
865 | blocks[count++] = sh->dev[i].page; | ||
866 | } | ||
867 | dest = sh->dev[data_target].page; | ||
868 | init_async_submit(&submit, | ||
869 | ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, | ||
870 | NULL, NULL, NULL, | ||
871 | to_addr_conv(sh, percpu)); | ||
872 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, | ||
873 | &submit); | ||
874 | |||
875 | count = set_syndrome_sources(blocks, sh); | ||
876 | init_async_submit(&submit, ASYNC_TX_FENCE, tx, | ||
877 | ops_complete_compute, sh, | ||
878 | to_addr_conv(sh, percpu)); | ||
879 | return async_gen_syndrome(blocks, 0, count+2, | ||
880 | STRIPE_SIZE, &submit); | ||
881 | } | ||
882 | } else { | ||
883 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, | ||
884 | ops_complete_compute, sh, | ||
885 | to_addr_conv(sh, percpu)); | ||
886 | if (failb == syndrome_disks) { | ||
887 | /* We're missing D+P. */ | ||
888 | return async_raid6_datap_recov(syndrome_disks+2, | ||
889 | STRIPE_SIZE, faila, | ||
890 | blocks, &submit); | ||
891 | } else { | ||
892 | /* We're missing D+D. */ | ||
893 | return async_raid6_2data_recov(syndrome_disks+2, | ||
894 | STRIPE_SIZE, faila, failb, | ||
895 | blocks, &submit); | ||
896 | } | ||
897 | } | ||
898 | } | ||
899 | |||
900 | |||
674 | static void ops_complete_prexor(void *stripe_head_ref) | 901 | static void ops_complete_prexor(void *stripe_head_ref) |
675 | { | 902 | { |
676 | struct stripe_head *sh = stripe_head_ref; | 903 | struct stripe_head *sh = stripe_head_ref; |
@@ -680,12 +907,13 @@ static void ops_complete_prexor(void *stripe_head_ref) | |||
680 | } | 907 | } |
681 | 908 | ||
682 | static struct dma_async_tx_descriptor * | 909 | static struct dma_async_tx_descriptor * |
683 | ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | 910 | ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, |
911 | struct dma_async_tx_descriptor *tx) | ||
684 | { | 912 | { |
685 | /* kernel stack size limits the total number of disks */ | ||
686 | int disks = sh->disks; | 913 | int disks = sh->disks; |
687 | struct page *xor_srcs[disks]; | 914 | struct page **xor_srcs = percpu->scribble; |
688 | int count = 0, pd_idx = sh->pd_idx, i; | 915 | int count = 0, pd_idx = sh->pd_idx, i; |
916 | struct async_submit_ctl submit; | ||
689 | 917 | ||
690 | /* existing parity data subtracted */ | 918 | /* existing parity data subtracted */ |
691 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 919 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; |
@@ -700,9 +928,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
700 | xor_srcs[count++] = dev->page; | 928 | xor_srcs[count++] = dev->page; |
701 | } | 929 | } |
702 | 930 | ||
703 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 931 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, |
704 | ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, | 932 | ops_complete_prexor, sh, to_addr_conv(sh, percpu)); |
705 | ops_complete_prexor, sh); | 933 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
706 | 934 | ||
707 | return tx; | 935 | return tx; |
708 | } | 936 | } |
@@ -742,17 +970,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
742 | return tx; | 970 | return tx; |
743 | } | 971 | } |
744 | 972 | ||
745 | static void ops_complete_postxor(void *stripe_head_ref) | 973 | static void ops_complete_reconstruct(void *stripe_head_ref) |
746 | { | 974 | { |
747 | struct stripe_head *sh = stripe_head_ref; | 975 | struct stripe_head *sh = stripe_head_ref; |
748 | int disks = sh->disks, i, pd_idx = sh->pd_idx; | 976 | int disks = sh->disks; |
977 | int pd_idx = sh->pd_idx; | ||
978 | int qd_idx = sh->qd_idx; | ||
979 | int i; | ||
749 | 980 | ||
750 | pr_debug("%s: stripe %llu\n", __func__, | 981 | pr_debug("%s: stripe %llu\n", __func__, |
751 | (unsigned long long)sh->sector); | 982 | (unsigned long long)sh->sector); |
752 | 983 | ||
753 | for (i = disks; i--; ) { | 984 | for (i = disks; i--; ) { |
754 | struct r5dev *dev = &sh->dev[i]; | 985 | struct r5dev *dev = &sh->dev[i]; |
755 | if (dev->written || i == pd_idx) | 986 | |
987 | if (dev->written || i == pd_idx || i == qd_idx) | ||
756 | set_bit(R5_UPTODATE, &dev->flags); | 988 | set_bit(R5_UPTODATE, &dev->flags); |
757 | } | 989 | } |
758 | 990 | ||
@@ -770,12 +1002,12 @@ static void ops_complete_postxor(void *stripe_head_ref) | |||
770 | } | 1002 | } |
771 | 1003 | ||
772 | static void | 1004 | static void |
773 | ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | 1005 | ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, |
1006 | struct dma_async_tx_descriptor *tx) | ||
774 | { | 1007 | { |
775 | /* kernel stack size limits the total number of disks */ | ||
776 | int disks = sh->disks; | 1008 | int disks = sh->disks; |
777 | struct page *xor_srcs[disks]; | 1009 | struct page **xor_srcs = percpu->scribble; |
778 | 1010 | struct async_submit_ctl submit; | |
779 | int count = 0, pd_idx = sh->pd_idx, i; | 1011 | int count = 0, pd_idx = sh->pd_idx, i; |
780 | struct page *xor_dest; | 1012 | struct page *xor_dest; |
781 | int prexor = 0; | 1013 | int prexor = 0; |
@@ -809,18 +1041,36 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
809 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST | 1041 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST |
810 | * for the synchronous xor case | 1042 | * for the synchronous xor case |
811 | */ | 1043 | */ |
812 | flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | | 1044 | flags = ASYNC_TX_ACK | |
813 | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); | 1045 | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); |
814 | 1046 | ||
815 | atomic_inc(&sh->count); | 1047 | atomic_inc(&sh->count); |
816 | 1048 | ||
817 | if (unlikely(count == 1)) { | 1049 | init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, |
818 | flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); | 1050 | to_addr_conv(sh, percpu)); |
819 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, | 1051 | if (unlikely(count == 1)) |
820 | flags, tx, ops_complete_postxor, sh); | 1052 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); |
821 | } else | 1053 | else |
822 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 1054 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
823 | flags, tx, ops_complete_postxor, sh); | 1055 | } |
1056 | |||
1057 | static void | ||
1058 | ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, | ||
1059 | struct dma_async_tx_descriptor *tx) | ||
1060 | { | ||
1061 | struct async_submit_ctl submit; | ||
1062 | struct page **blocks = percpu->scribble; | ||
1063 | int count; | ||
1064 | |||
1065 | pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); | ||
1066 | |||
1067 | count = set_syndrome_sources(blocks, sh); | ||
1068 | |||
1069 | atomic_inc(&sh->count); | ||
1070 | |||
1071 | init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, | ||
1072 | sh, to_addr_conv(sh, percpu)); | ||
1073 | async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); | ||
824 | } | 1074 | } |
825 | 1075 | ||
826 | static void ops_complete_check(void *stripe_head_ref) | 1076 | static void ops_complete_check(void *stripe_head_ref) |
@@ -835,63 +1085,115 @@ static void ops_complete_check(void *stripe_head_ref) | |||
835 | release_stripe(sh); | 1085 | release_stripe(sh); |
836 | } | 1086 | } |
837 | 1087 | ||
838 | static void ops_run_check(struct stripe_head *sh) | 1088 | static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) |
839 | { | 1089 | { |
840 | /* kernel stack size limits the total number of disks */ | ||
841 | int disks = sh->disks; | 1090 | int disks = sh->disks; |
842 | struct page *xor_srcs[disks]; | 1091 | int pd_idx = sh->pd_idx; |
1092 | int qd_idx = sh->qd_idx; | ||
1093 | struct page *xor_dest; | ||
1094 | struct page **xor_srcs = percpu->scribble; | ||
843 | struct dma_async_tx_descriptor *tx; | 1095 | struct dma_async_tx_descriptor *tx; |
844 | 1096 | struct async_submit_ctl submit; | |
845 | int count = 0, pd_idx = sh->pd_idx, i; | 1097 | int count; |
846 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 1098 | int i; |
847 | 1099 | ||
848 | pr_debug("%s: stripe %llu\n", __func__, | 1100 | pr_debug("%s: stripe %llu\n", __func__, |
849 | (unsigned long long)sh->sector); | 1101 | (unsigned long long)sh->sector); |
850 | 1102 | ||
1103 | count = 0; | ||
1104 | xor_dest = sh->dev[pd_idx].page; | ||
1105 | xor_srcs[count++] = xor_dest; | ||
851 | for (i = disks; i--; ) { | 1106 | for (i = disks; i--; ) { |
852 | struct r5dev *dev = &sh->dev[i]; | 1107 | if (i == pd_idx || i == qd_idx) |
853 | if (i != pd_idx) | 1108 | continue; |
854 | xor_srcs[count++] = dev->page; | 1109 | xor_srcs[count++] = sh->dev[i].page; |
855 | } | 1110 | } |
856 | 1111 | ||
857 | tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 1112 | init_async_submit(&submit, 0, NULL, NULL, NULL, |
858 | &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); | 1113 | to_addr_conv(sh, percpu)); |
1114 | tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | ||
1115 | &sh->ops.zero_sum_result, &submit); | ||
1116 | |||
1117 | atomic_inc(&sh->count); | ||
1118 | init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); | ||
1119 | tx = async_trigger_callback(&submit); | ||
1120 | } | ||
1121 | |||
1122 | static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) | ||
1123 | { | ||
1124 | struct page **srcs = percpu->scribble; | ||
1125 | struct async_submit_ctl submit; | ||
1126 | int count; | ||
1127 | |||
1128 | pr_debug("%s: stripe %llu checkp: %d\n", __func__, | ||
1129 | (unsigned long long)sh->sector, checkp); | ||
1130 | |||
1131 | count = set_syndrome_sources(srcs, sh); | ||
1132 | if (!checkp) | ||
1133 | srcs[count] = NULL; | ||
859 | 1134 | ||
860 | atomic_inc(&sh->count); | 1135 | atomic_inc(&sh->count); |
861 | tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, | 1136 | init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, |
862 | ops_complete_check, sh); | 1137 | sh, to_addr_conv(sh, percpu)); |
1138 | async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, | ||
1139 | &sh->ops.zero_sum_result, percpu->spare_page, &submit); | ||
863 | } | 1140 | } |
864 | 1141 | ||
865 | static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) | 1142 | static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) |
866 | { | 1143 | { |
867 | int overlap_clear = 0, i, disks = sh->disks; | 1144 | int overlap_clear = 0, i, disks = sh->disks; |
868 | struct dma_async_tx_descriptor *tx = NULL; | 1145 | struct dma_async_tx_descriptor *tx = NULL; |
1146 | raid5_conf_t *conf = sh->raid_conf; | ||
1147 | int level = conf->level; | ||
1148 | struct raid5_percpu *percpu; | ||
1149 | unsigned long cpu; | ||
869 | 1150 | ||
1151 | cpu = get_cpu(); | ||
1152 | percpu = per_cpu_ptr(conf->percpu, cpu); | ||
870 | if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { | 1153 | if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { |
871 | ops_run_biofill(sh); | 1154 | ops_run_biofill(sh); |
872 | overlap_clear++; | 1155 | overlap_clear++; |
873 | } | 1156 | } |
874 | 1157 | ||
875 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { | 1158 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { |
876 | tx = ops_run_compute5(sh); | 1159 | if (level < 6) |
877 | /* terminate the chain if postxor is not set to be run */ | 1160 | tx = ops_run_compute5(sh, percpu); |
878 | if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) | 1161 | else { |
1162 | if (sh->ops.target2 < 0 || sh->ops.target < 0) | ||
1163 | tx = ops_run_compute6_1(sh, percpu); | ||
1164 | else | ||
1165 | tx = ops_run_compute6_2(sh, percpu); | ||
1166 | } | ||
1167 | /* terminate the chain if reconstruct is not set to be run */ | ||
1168 | if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) | ||
879 | async_tx_ack(tx); | 1169 | async_tx_ack(tx); |
880 | } | 1170 | } |
881 | 1171 | ||
882 | if (test_bit(STRIPE_OP_PREXOR, &ops_request)) | 1172 | if (test_bit(STRIPE_OP_PREXOR, &ops_request)) |
883 | tx = ops_run_prexor(sh, tx); | 1173 | tx = ops_run_prexor(sh, percpu, tx); |
884 | 1174 | ||
885 | if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { | 1175 | if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { |
886 | tx = ops_run_biodrain(sh, tx); | 1176 | tx = ops_run_biodrain(sh, tx); |
887 | overlap_clear++; | 1177 | overlap_clear++; |
888 | } | 1178 | } |
889 | 1179 | ||
890 | if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) | 1180 | if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { |
891 | ops_run_postxor(sh, tx); | 1181 | if (level < 6) |
1182 | ops_run_reconstruct5(sh, percpu, tx); | ||
1183 | else | ||
1184 | ops_run_reconstruct6(sh, percpu, tx); | ||
1185 | } | ||
892 | 1186 | ||
893 | if (test_bit(STRIPE_OP_CHECK, &ops_request)) | 1187 | if (test_bit(STRIPE_OP_CHECK, &ops_request)) { |
894 | ops_run_check(sh); | 1188 | if (sh->check_state == check_state_run) |
1189 | ops_run_check_p(sh, percpu); | ||
1190 | else if (sh->check_state == check_state_run_q) | ||
1191 | ops_run_check_pq(sh, percpu, 0); | ||
1192 | else if (sh->check_state == check_state_run_pq) | ||
1193 | ops_run_check_pq(sh, percpu, 1); | ||
1194 | else | ||
1195 | BUG(); | ||
1196 | } | ||
895 | 1197 | ||
896 | if (overlap_clear) | 1198 | if (overlap_clear) |
897 | for (i = disks; i--; ) { | 1199 | for (i = disks; i--; ) { |
@@ -899,6 +1201,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
899 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) | 1201 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) |
900 | wake_up(&sh->raid_conf->wait_for_overlap); | 1202 | wake_up(&sh->raid_conf->wait_for_overlap); |
901 | } | 1203 | } |
1204 | put_cpu(); | ||
902 | } | 1205 | } |
903 | 1206 | ||
904 | static int grow_one_stripe(raid5_conf_t *conf) | 1207 | static int grow_one_stripe(raid5_conf_t *conf) |
@@ -948,6 +1251,28 @@ static int grow_stripes(raid5_conf_t *conf, int num) | |||
948 | return 0; | 1251 | return 0; |
949 | } | 1252 | } |
950 | 1253 | ||
1254 | /** | ||
1255 | * scribble_len - return the required size of the scribble region | ||
1256 | * @num - total number of disks in the array | ||
1257 | * | ||
1258 | * The size must be enough to contain: | ||
1259 | * 1/ a struct page pointer for each device in the array +2 | ||
1260 | * 2/ room to convert each entry in (1) to its corresponding dma | ||
1261 | * (dma_map_page()) or page (page_address()) address. | ||
1262 | * | ||
1263 | * Note: the +2 is for the destination buffers of the ddf/raid6 case where we | ||
1264 | * calculate over all devices (not just the data blocks), using zeros in place | ||
1265 | * of the P and Q blocks. | ||
1266 | */ | ||
1267 | static size_t scribble_len(int num) | ||
1268 | { | ||
1269 | size_t len; | ||
1270 | |||
1271 | len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); | ||
1272 | |||
1273 | return len; | ||
1274 | } | ||
1275 | |||
951 | static int resize_stripes(raid5_conf_t *conf, int newsize) | 1276 | static int resize_stripes(raid5_conf_t *conf, int newsize) |
952 | { | 1277 | { |
953 | /* Make all the stripes able to hold 'newsize' devices. | 1278 | /* Make all the stripes able to hold 'newsize' devices. |
@@ -976,6 +1301,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
976 | struct stripe_head *osh, *nsh; | 1301 | struct stripe_head *osh, *nsh; |
977 | LIST_HEAD(newstripes); | 1302 | LIST_HEAD(newstripes); |
978 | struct disk_info *ndisks; | 1303 | struct disk_info *ndisks; |
1304 | unsigned long cpu; | ||
979 | int err; | 1305 | int err; |
980 | struct kmem_cache *sc; | 1306 | struct kmem_cache *sc; |
981 | int i; | 1307 | int i; |
@@ -1041,7 +1367,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
1041 | /* Step 3. | 1367 | /* Step 3. |
1042 | * At this point, we are holding all the stripes so the array | 1368 | * At this point, we are holding all the stripes so the array |
1043 | * is completely stalled, so now is a good time to resize | 1369 | * is completely stalled, so now is a good time to resize |
1044 | * conf->disks. | 1370 | * conf->disks and the scribble region |
1045 | */ | 1371 | */ |
1046 | ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); | 1372 | ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); |
1047 | if (ndisks) { | 1373 | if (ndisks) { |
@@ -1052,10 +1378,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
1052 | } else | 1378 | } else |
1053 | err = -ENOMEM; | 1379 | err = -ENOMEM; |
1054 | 1380 | ||
1381 | get_online_cpus(); | ||
1382 | conf->scribble_len = scribble_len(newsize); | ||
1383 | for_each_present_cpu(cpu) { | ||
1384 | struct raid5_percpu *percpu; | ||
1385 | void *scribble; | ||
1386 | |||
1387 | percpu = per_cpu_ptr(conf->percpu, cpu); | ||
1388 | scribble = kmalloc(conf->scribble_len, GFP_NOIO); | ||
1389 | |||
1390 | if (scribble) { | ||
1391 | kfree(percpu->scribble); | ||
1392 | percpu->scribble = scribble; | ||
1393 | } else { | ||
1394 | err = -ENOMEM; | ||
1395 | break; | ||
1396 | } | ||
1397 | } | ||
1398 | put_online_cpus(); | ||
1399 | |||
1055 | /* Step 4, return new stripes to service */ | 1400 | /* Step 4, return new stripes to service */ |
1056 | while(!list_empty(&newstripes)) { | 1401 | while(!list_empty(&newstripes)) { |
1057 | nsh = list_entry(newstripes.next, struct stripe_head, lru); | 1402 | nsh = list_entry(newstripes.next, struct stripe_head, lru); |
1058 | list_del_init(&nsh->lru); | 1403 | list_del_init(&nsh->lru); |
1404 | |||
1059 | for (i=conf->raid_disks; i < newsize; i++) | 1405 | for (i=conf->raid_disks; i < newsize; i++) |
1060 | if (nsh->dev[i].page == NULL) { | 1406 | if (nsh->dev[i].page == NULL) { |
1061 | struct page *p = alloc_page(GFP_NOIO); | 1407 | struct page *p = alloc_page(GFP_NOIO); |
@@ -1594,258 +1940,13 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
1594 | } | 1940 | } |
1595 | 1941 | ||
1596 | 1942 | ||
1597 | |||
1598 | /* | ||
1599 | * Copy data between a page in the stripe cache, and one or more bion | ||
1600 | * The page could align with the middle of the bio, or there could be | ||
1601 | * several bion, each with several bio_vecs, which cover part of the page | ||
1602 | * Multiple bion are linked together on bi_next. There may be extras | ||
1603 | * at the end of this list. We ignore them. | ||
1604 | */ | ||
1605 | static void copy_data(int frombio, struct bio *bio, | ||
1606 | struct page *page, | ||
1607 | sector_t sector) | ||
1608 | { | ||
1609 | char *pa = page_address(page); | ||
1610 | struct bio_vec *bvl; | ||
1611 | int i; | ||
1612 | int page_offset; | ||
1613 | |||
1614 | if (bio->bi_sector >= sector) | ||
1615 | page_offset = (signed)(bio->bi_sector - sector) * 512; | ||
1616 | else | ||
1617 | page_offset = (signed)(sector - bio->bi_sector) * -512; | ||
1618 | bio_for_each_segment(bvl, bio, i) { | ||
1619 | int len = bio_iovec_idx(bio,i)->bv_len; | ||
1620 | int clen; | ||
1621 | int b_offset = 0; | ||
1622 | |||
1623 | if (page_offset < 0) { | ||
1624 | b_offset = -page_offset; | ||
1625 | page_offset += b_offset; | ||
1626 | len -= b_offset; | ||
1627 | } | ||
1628 | |||
1629 | if (len > 0 && page_offset + len > STRIPE_SIZE) | ||
1630 | clen = STRIPE_SIZE - page_offset; | ||
1631 | else clen = len; | ||
1632 | |||
1633 | if (clen > 0) { | ||
1634 | char *ba = __bio_kmap_atomic(bio, i, KM_USER0); | ||
1635 | if (frombio) | ||
1636 | memcpy(pa+page_offset, ba+b_offset, clen); | ||
1637 | else | ||
1638 | memcpy(ba+b_offset, pa+page_offset, clen); | ||
1639 | __bio_kunmap_atomic(ba, KM_USER0); | ||
1640 | } | ||
1641 | if (clen < len) /* hit end of page */ | ||
1642 | break; | ||
1643 | page_offset += len; | ||
1644 | } | ||
1645 | } | ||
1646 | |||
1647 | #define check_xor() do { \ | ||
1648 | if (count == MAX_XOR_BLOCKS) { \ | ||
1649 | xor_blocks(count, STRIPE_SIZE, dest, ptr);\ | ||
1650 | count = 0; \ | ||
1651 | } \ | ||
1652 | } while(0) | ||
1653 | |||
1654 | static void compute_parity6(struct stripe_head *sh, int method) | ||
1655 | { | ||
1656 | raid5_conf_t *conf = sh->raid_conf; | ||
1657 | int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count; | ||
1658 | int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); | ||
1659 | struct bio *chosen; | ||
1660 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ | ||
1661 | void *ptrs[syndrome_disks+2]; | ||
1662 | |||
1663 | pd_idx = sh->pd_idx; | ||
1664 | qd_idx = sh->qd_idx; | ||
1665 | d0_idx = raid6_d0(sh); | ||
1666 | |||
1667 | pr_debug("compute_parity, stripe %llu, method %d\n", | ||
1668 | (unsigned long long)sh->sector, method); | ||
1669 | |||
1670 | switch(method) { | ||
1671 | case READ_MODIFY_WRITE: | ||
1672 | BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */ | ||
1673 | case RECONSTRUCT_WRITE: | ||
1674 | for (i= disks; i-- ;) | ||
1675 | if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) { | ||
1676 | chosen = sh->dev[i].towrite; | ||
1677 | sh->dev[i].towrite = NULL; | ||
1678 | |||
1679 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
1680 | wake_up(&conf->wait_for_overlap); | ||
1681 | |||
1682 | BUG_ON(sh->dev[i].written); | ||
1683 | sh->dev[i].written = chosen; | ||
1684 | } | ||
1685 | break; | ||
1686 | case CHECK_PARITY: | ||
1687 | BUG(); /* Not implemented yet */ | ||
1688 | } | ||
1689 | |||
1690 | for (i = disks; i--;) | ||
1691 | if (sh->dev[i].written) { | ||
1692 | sector_t sector = sh->dev[i].sector; | ||
1693 | struct bio *wbi = sh->dev[i].written; | ||
1694 | while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { | ||
1695 | copy_data(1, wbi, sh->dev[i].page, sector); | ||
1696 | wbi = r5_next_bio(wbi, sector); | ||
1697 | } | ||
1698 | |||
1699 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
1700 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
1701 | } | ||
1702 | |||
1703 | /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/ | ||
1704 | |||
1705 | for (i = 0; i < disks; i++) | ||
1706 | ptrs[i] = (void *)raid6_empty_zero_page; | ||
1707 | |||
1708 | count = 0; | ||
1709 | i = d0_idx; | ||
1710 | do { | ||
1711 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
1712 | |||
1713 | ptrs[slot] = page_address(sh->dev[i].page); | ||
1714 | if (slot < syndrome_disks && | ||
1715 | !test_bit(R5_UPTODATE, &sh->dev[i].flags)) { | ||
1716 | printk(KERN_ERR "block %d/%d not uptodate " | ||
1717 | "on parity calc\n", i, count); | ||
1718 | BUG(); | ||
1719 | } | ||
1720 | |||
1721 | i = raid6_next_disk(i, disks); | ||
1722 | } while (i != d0_idx); | ||
1723 | BUG_ON(count != syndrome_disks); | ||
1724 | |||
1725 | raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs); | ||
1726 | |||
1727 | switch(method) { | ||
1728 | case RECONSTRUCT_WRITE: | ||
1729 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
1730 | set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); | ||
1731 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); | ||
1732 | set_bit(R5_LOCKED, &sh->dev[qd_idx].flags); | ||
1733 | break; | ||
1734 | case UPDATE_PARITY: | ||
1735 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
1736 | set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); | ||
1737 | break; | ||
1738 | } | ||
1739 | } | ||
1740 | |||
1741 | |||
1742 | /* Compute one missing block */ | ||
1743 | static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) | ||
1744 | { | ||
1745 | int i, count, disks = sh->disks; | ||
1746 | void *ptr[MAX_XOR_BLOCKS], *dest, *p; | ||
1747 | int qd_idx = sh->qd_idx; | ||
1748 | |||
1749 | pr_debug("compute_block_1, stripe %llu, idx %d\n", | ||
1750 | (unsigned long long)sh->sector, dd_idx); | ||
1751 | |||
1752 | if ( dd_idx == qd_idx ) { | ||
1753 | /* We're actually computing the Q drive */ | ||
1754 | compute_parity6(sh, UPDATE_PARITY); | ||
1755 | } else { | ||
1756 | dest = page_address(sh->dev[dd_idx].page); | ||
1757 | if (!nozero) memset(dest, 0, STRIPE_SIZE); | ||
1758 | count = 0; | ||
1759 | for (i = disks ; i--; ) { | ||
1760 | if (i == dd_idx || i == qd_idx) | ||
1761 | continue; | ||
1762 | p = page_address(sh->dev[i].page); | ||
1763 | if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) | ||
1764 | ptr[count++] = p; | ||
1765 | else | ||
1766 | printk("compute_block() %d, stripe %llu, %d" | ||
1767 | " not present\n", dd_idx, | ||
1768 | (unsigned long long)sh->sector, i); | ||
1769 | |||
1770 | check_xor(); | ||
1771 | } | ||
1772 | if (count) | ||
1773 | xor_blocks(count, STRIPE_SIZE, dest, ptr); | ||
1774 | if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | ||
1775 | else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | ||
1776 | } | ||
1777 | } | ||
1778 | |||
1779 | /* Compute two missing blocks */ | ||
1780 | static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) | ||
1781 | { | ||
1782 | int i, count, disks = sh->disks; | ||
1783 | int syndrome_disks = sh->ddf_layout ? disks : disks-2; | ||
1784 | int d0_idx = raid6_d0(sh); | ||
1785 | int faila = -1, failb = -1; | ||
1786 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ | ||
1787 | void *ptrs[syndrome_disks+2]; | ||
1788 | |||
1789 | for (i = 0; i < disks ; i++) | ||
1790 | ptrs[i] = (void *)raid6_empty_zero_page; | ||
1791 | count = 0; | ||
1792 | i = d0_idx; | ||
1793 | do { | ||
1794 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
1795 | |||
1796 | ptrs[slot] = page_address(sh->dev[i].page); | ||
1797 | |||
1798 | if (i == dd_idx1) | ||
1799 | faila = slot; | ||
1800 | if (i == dd_idx2) | ||
1801 | failb = slot; | ||
1802 | i = raid6_next_disk(i, disks); | ||
1803 | } while (i != d0_idx); | ||
1804 | BUG_ON(count != syndrome_disks); | ||
1805 | |||
1806 | BUG_ON(faila == failb); | ||
1807 | if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } | ||
1808 | |||
1809 | pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", | ||
1810 | (unsigned long long)sh->sector, dd_idx1, dd_idx2, | ||
1811 | faila, failb); | ||
1812 | |||
1813 | if (failb == syndrome_disks+1) { | ||
1814 | /* Q disk is one of the missing disks */ | ||
1815 | if (faila == syndrome_disks) { | ||
1816 | /* Missing P+Q, just recompute */ | ||
1817 | compute_parity6(sh, UPDATE_PARITY); | ||
1818 | return; | ||
1819 | } else { | ||
1820 | /* We're missing D+Q; recompute D from P */ | ||
1821 | compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ? | ||
1822 | dd_idx2 : dd_idx1), | ||
1823 | 0); | ||
1824 | compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ | ||
1825 | return; | ||
1826 | } | ||
1827 | } | ||
1828 | |||
1829 | /* We're missing D+P or D+D; */ | ||
1830 | if (failb == syndrome_disks) { | ||
1831 | /* We're missing D+P. */ | ||
1832 | raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs); | ||
1833 | } else { | ||
1834 | /* We're missing D+D. */ | ||
1835 | raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb, | ||
1836 | ptrs); | ||
1837 | } | ||
1838 | |||
1839 | /* Both the above update both missing blocks */ | ||
1840 | set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); | ||
1841 | set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); | ||
1842 | } | ||
1843 | |||
1844 | static void | 1943 | static void |
1845 | schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | 1944 | schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, |
1846 | int rcw, int expand) | 1945 | int rcw, int expand) |
1847 | { | 1946 | { |
1848 | int i, pd_idx = sh->pd_idx, disks = sh->disks; | 1947 | int i, pd_idx = sh->pd_idx, disks = sh->disks; |
1948 | raid5_conf_t *conf = sh->raid_conf; | ||
1949 | int level = conf->level; | ||
1849 | 1950 | ||
1850 | if (rcw) { | 1951 | if (rcw) { |
1851 | /* if we are not expanding this is a proper write request, and | 1952 | /* if we are not expanding this is a proper write request, and |
@@ -1858,7 +1959,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | |||
1858 | } else | 1959 | } else |
1859 | sh->reconstruct_state = reconstruct_state_run; | 1960 | sh->reconstruct_state = reconstruct_state_run; |
1860 | 1961 | ||
1861 | set_bit(STRIPE_OP_POSTXOR, &s->ops_request); | 1962 | set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); |
1862 | 1963 | ||
1863 | for (i = disks; i--; ) { | 1964 | for (i = disks; i--; ) { |
1864 | struct r5dev *dev = &sh->dev[i]; | 1965 | struct r5dev *dev = &sh->dev[i]; |
@@ -1871,17 +1972,18 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | |||
1871 | s->locked++; | 1972 | s->locked++; |
1872 | } | 1973 | } |
1873 | } | 1974 | } |
1874 | if (s->locked + 1 == disks) | 1975 | if (s->locked + conf->max_degraded == disks) |
1875 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) | 1976 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) |
1876 | atomic_inc(&sh->raid_conf->pending_full_writes); | 1977 | atomic_inc(&conf->pending_full_writes); |
1877 | } else { | 1978 | } else { |
1979 | BUG_ON(level == 6); | ||
1878 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || | 1980 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || |
1879 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); | 1981 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); |
1880 | 1982 | ||
1881 | sh->reconstruct_state = reconstruct_state_prexor_drain_run; | 1983 | sh->reconstruct_state = reconstruct_state_prexor_drain_run; |
1882 | set_bit(STRIPE_OP_PREXOR, &s->ops_request); | 1984 | set_bit(STRIPE_OP_PREXOR, &s->ops_request); |
1883 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); | 1985 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); |
1884 | set_bit(STRIPE_OP_POSTXOR, &s->ops_request); | 1986 | set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); |
1885 | 1987 | ||
1886 | for (i = disks; i--; ) { | 1988 | for (i = disks; i--; ) { |
1887 | struct r5dev *dev = &sh->dev[i]; | 1989 | struct r5dev *dev = &sh->dev[i]; |
@@ -1899,13 +2001,22 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | |||
1899 | } | 2001 | } |
1900 | } | 2002 | } |
1901 | 2003 | ||
1902 | /* keep the parity disk locked while asynchronous operations | 2004 | /* keep the parity disk(s) locked while asynchronous operations |
1903 | * are in flight | 2005 | * are in flight |
1904 | */ | 2006 | */ |
1905 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); | 2007 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); |
1906 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | 2008 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); |
1907 | s->locked++; | 2009 | s->locked++; |
1908 | 2010 | ||
2011 | if (level == 6) { | ||
2012 | int qd_idx = sh->qd_idx; | ||
2013 | struct r5dev *dev = &sh->dev[qd_idx]; | ||
2014 | |||
2015 | set_bit(R5_LOCKED, &dev->flags); | ||
2016 | clear_bit(R5_UPTODATE, &dev->flags); | ||
2017 | s->locked++; | ||
2018 | } | ||
2019 | |||
1909 | pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", | 2020 | pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", |
1910 | __func__, (unsigned long long)sh->sector, | 2021 | __func__, (unsigned long long)sh->sector, |
1911 | s->locked, s->ops_request); | 2022 | s->locked, s->ops_request); |
@@ -1986,13 +2097,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
1986 | 2097 | ||
1987 | static void end_reshape(raid5_conf_t *conf); | 2098 | static void end_reshape(raid5_conf_t *conf); |
1988 | 2099 | ||
1989 | static int page_is_zero(struct page *p) | ||
1990 | { | ||
1991 | char *a = page_address(p); | ||
1992 | return ((*(u32*)a) == 0 && | ||
1993 | memcmp(a, a+4, STRIPE_SIZE-4)==0); | ||
1994 | } | ||
1995 | |||
1996 | static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, | 2100 | static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, |
1997 | struct stripe_head *sh) | 2101 | struct stripe_head *sh) |
1998 | { | 2102 | { |
@@ -2132,9 +2236,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, | |||
2132 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | 2236 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); |
2133 | set_bit(R5_Wantcompute, &dev->flags); | 2237 | set_bit(R5_Wantcompute, &dev->flags); |
2134 | sh->ops.target = disk_idx; | 2238 | sh->ops.target = disk_idx; |
2239 | sh->ops.target2 = -1; | ||
2135 | s->req_compute = 1; | 2240 | s->req_compute = 1; |
2136 | /* Careful: from this point on 'uptodate' is in the eye | 2241 | /* Careful: from this point on 'uptodate' is in the eye |
2137 | * of raid5_run_ops which services 'compute' operations | 2242 | * of raid_run_ops which services 'compute' operations |
2138 | * before writes. R5_Wantcompute flags a block that will | 2243 | * before writes. R5_Wantcompute flags a block that will |
2139 | * be R5_UPTODATE by the time it is needed for a | 2244 | * be R5_UPTODATE by the time it is needed for a |
2140 | * subsequent operation. | 2245 | * subsequent operation. |
@@ -2173,61 +2278,104 @@ static void handle_stripe_fill5(struct stripe_head *sh, | |||
2173 | set_bit(STRIPE_HANDLE, &sh->state); | 2278 | set_bit(STRIPE_HANDLE, &sh->state); |
2174 | } | 2279 | } |
2175 | 2280 | ||
2176 | static void handle_stripe_fill6(struct stripe_head *sh, | 2281 | /* fetch_block6 - checks the given member device to see if its data needs |
2177 | struct stripe_head_state *s, struct r6_state *r6s, | 2282 | * to be read or computed to satisfy a request. |
2178 | int disks) | 2283 | * |
2284 | * Returns 1 when no more member devices need to be checked, otherwise returns | ||
2285 | * 0 to tell the loop in handle_stripe_fill6 to continue | ||
2286 | */ | ||
2287 | static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | ||
2288 | struct r6_state *r6s, int disk_idx, int disks) | ||
2179 | { | 2289 | { |
2180 | int i; | 2290 | struct r5dev *dev = &sh->dev[disk_idx]; |
2181 | for (i = disks; i--; ) { | 2291 | struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], |
2182 | struct r5dev *dev = &sh->dev[i]; | 2292 | &sh->dev[r6s->failed_num[1]] }; |
2183 | if (!test_bit(R5_LOCKED, &dev->flags) && | 2293 | |
2184 | !test_bit(R5_UPTODATE, &dev->flags) && | 2294 | if (!test_bit(R5_LOCKED, &dev->flags) && |
2185 | (dev->toread || (dev->towrite && | 2295 | !test_bit(R5_UPTODATE, &dev->flags) && |
2186 | !test_bit(R5_OVERWRITE, &dev->flags)) || | 2296 | (dev->toread || |
2187 | s->syncing || s->expanding || | 2297 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || |
2188 | (s->failed >= 1 && | 2298 | s->syncing || s->expanding || |
2189 | (sh->dev[r6s->failed_num[0]].toread || | 2299 | (s->failed >= 1 && |
2190 | s->to_write)) || | 2300 | (fdev[0]->toread || s->to_write)) || |
2191 | (s->failed >= 2 && | 2301 | (s->failed >= 2 && |
2192 | (sh->dev[r6s->failed_num[1]].toread || | 2302 | (fdev[1]->toread || s->to_write)))) { |
2193 | s->to_write)))) { | 2303 | /* we would like to get this block, possibly by computing it, |
2194 | /* we would like to get this block, possibly | 2304 | * otherwise read it if the backing disk is insync |
2195 | * by computing it, but we might not be able to | 2305 | */ |
2306 | BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); | ||
2307 | BUG_ON(test_bit(R5_Wantread, &dev->flags)); | ||
2308 | if ((s->uptodate == disks - 1) && | ||
2309 | (s->failed && (disk_idx == r6s->failed_num[0] || | ||
2310 | disk_idx == r6s->failed_num[1]))) { | ||
2311 | /* have disk failed, and we're requested to fetch it; | ||
2312 | * do compute it | ||
2196 | */ | 2313 | */ |
2197 | if ((s->uptodate == disks - 1) && | 2314 | pr_debug("Computing stripe %llu block %d\n", |
2198 | (s->failed && (i == r6s->failed_num[0] || | 2315 | (unsigned long long)sh->sector, disk_idx); |
2199 | i == r6s->failed_num[1]))) { | 2316 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); |
2200 | pr_debug("Computing stripe %llu block %d\n", | 2317 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); |
2201 | (unsigned long long)sh->sector, i); | 2318 | set_bit(R5_Wantcompute, &dev->flags); |
2202 | compute_block_1(sh, i, 0); | 2319 | sh->ops.target = disk_idx; |
2203 | s->uptodate++; | 2320 | sh->ops.target2 = -1; /* no 2nd target */ |
2204 | } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { | 2321 | s->req_compute = 1; |
2205 | /* Computing 2-failure is *very* expensive; only | 2322 | s->uptodate++; |
2206 | * do it if failed >= 2 | 2323 | return 1; |
2207 | */ | 2324 | } else if (s->uptodate == disks-2 && s->failed >= 2) { |
2208 | int other; | 2325 | /* Computing 2-failure is *very* expensive; only |
2209 | for (other = disks; other--; ) { | 2326 | * do it if failed >= 2 |
2210 | if (other == i) | 2327 | */ |
2211 | continue; | 2328 | int other; |
2212 | if (!test_bit(R5_UPTODATE, | 2329 | for (other = disks; other--; ) { |
2213 | &sh->dev[other].flags)) | 2330 | if (other == disk_idx) |
2214 | break; | 2331 | continue; |
2215 | } | 2332 | if (!test_bit(R5_UPTODATE, |
2216 | BUG_ON(other < 0); | 2333 | &sh->dev[other].flags)) |
2217 | pr_debug("Computing stripe %llu blocks %d,%d\n", | 2334 | break; |
2218 | (unsigned long long)sh->sector, | ||
2219 | i, other); | ||
2220 | compute_block_2(sh, i, other); | ||
2221 | s->uptodate += 2; | ||
2222 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
2223 | set_bit(R5_LOCKED, &dev->flags); | ||
2224 | set_bit(R5_Wantread, &dev->flags); | ||
2225 | s->locked++; | ||
2226 | pr_debug("Reading block %d (sync=%d)\n", | ||
2227 | i, s->syncing); | ||
2228 | } | 2335 | } |
2336 | BUG_ON(other < 0); | ||
2337 | pr_debug("Computing stripe %llu blocks %d,%d\n", | ||
2338 | (unsigned long long)sh->sector, | ||
2339 | disk_idx, other); | ||
2340 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
2341 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
2342 | set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); | ||
2343 | set_bit(R5_Wantcompute, &sh->dev[other].flags); | ||
2344 | sh->ops.target = disk_idx; | ||
2345 | sh->ops.target2 = other; | ||
2346 | s->uptodate += 2; | ||
2347 | s->req_compute = 1; | ||
2348 | return 1; | ||
2349 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
2350 | set_bit(R5_LOCKED, &dev->flags); | ||
2351 | set_bit(R5_Wantread, &dev->flags); | ||
2352 | s->locked++; | ||
2353 | pr_debug("Reading block %d (sync=%d)\n", | ||
2354 | disk_idx, s->syncing); | ||
2229 | } | 2355 | } |
2230 | } | 2356 | } |
2357 | |||
2358 | return 0; | ||
2359 | } | ||
2360 | |||
2361 | /** | ||
2362 | * handle_stripe_fill6 - read or compute data to satisfy pending requests. | ||
2363 | */ | ||
2364 | static void handle_stripe_fill6(struct stripe_head *sh, | ||
2365 | struct stripe_head_state *s, struct r6_state *r6s, | ||
2366 | int disks) | ||
2367 | { | ||
2368 | int i; | ||
2369 | |||
2370 | /* look for blocks to read/compute, skip this if a compute | ||
2371 | * is already in flight, or if the stripe contents are in the | ||
2372 | * midst of changing due to a write | ||
2373 | */ | ||
2374 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && | ||
2375 | !sh->reconstruct_state) | ||
2376 | for (i = disks; i--; ) | ||
2377 | if (fetch_block6(sh, s, r6s, i, disks)) | ||
2378 | break; | ||
2231 | set_bit(STRIPE_HANDLE, &sh->state); | 2379 | set_bit(STRIPE_HANDLE, &sh->state); |
2232 | } | 2380 | } |
2233 | 2381 | ||
@@ -2361,114 +2509,61 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
2361 | */ | 2509 | */ |
2362 | /* since handle_stripe can be called at any time we need to handle the | 2510 | /* since handle_stripe can be called at any time we need to handle the |
2363 | * case where a compute block operation has been submitted and then a | 2511 | * case where a compute block operation has been submitted and then a |
2364 | * subsequent call wants to start a write request. raid5_run_ops only | 2512 | * subsequent call wants to start a write request. raid_run_ops only |
2365 | * handles the case where compute block and postxor are requested | 2513 | * handles the case where compute block and reconstruct are requested |
2366 | * simultaneously. If this is not the case then new writes need to be | 2514 | * simultaneously. If this is not the case then new writes need to be |
2367 | * held off until the compute completes. | 2515 | * held off until the compute completes. |
2368 | */ | 2516 | */ |
2369 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && | 2517 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && |
2370 | (s->locked == 0 && (rcw == 0 || rmw == 0) && | 2518 | (s->locked == 0 && (rcw == 0 || rmw == 0) && |
2371 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) | 2519 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) |
2372 | schedule_reconstruction5(sh, s, rcw == 0, 0); | 2520 | schedule_reconstruction(sh, s, rcw == 0, 0); |
2373 | } | 2521 | } |
2374 | 2522 | ||
2375 | static void handle_stripe_dirtying6(raid5_conf_t *conf, | 2523 | static void handle_stripe_dirtying6(raid5_conf_t *conf, |
2376 | struct stripe_head *sh, struct stripe_head_state *s, | 2524 | struct stripe_head *sh, struct stripe_head_state *s, |
2377 | struct r6_state *r6s, int disks) | 2525 | struct r6_state *r6s, int disks) |
2378 | { | 2526 | { |
2379 | int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; | 2527 | int rcw = 0, pd_idx = sh->pd_idx, i; |
2380 | int qd_idx = sh->qd_idx; | 2528 | int qd_idx = sh->qd_idx; |
2529 | |||
2530 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2381 | for (i = disks; i--; ) { | 2531 | for (i = disks; i--; ) { |
2382 | struct r5dev *dev = &sh->dev[i]; | 2532 | struct r5dev *dev = &sh->dev[i]; |
2383 | /* Would I have to read this buffer for reconstruct_write */ | 2533 | /* check if we haven't enough data */ |
2384 | if (!test_bit(R5_OVERWRITE, &dev->flags) | 2534 | if (!test_bit(R5_OVERWRITE, &dev->flags) && |
2385 | && i != pd_idx && i != qd_idx | 2535 | i != pd_idx && i != qd_idx && |
2386 | && (!test_bit(R5_LOCKED, &dev->flags) | 2536 | !test_bit(R5_LOCKED, &dev->flags) && |
2387 | ) && | 2537 | !(test_bit(R5_UPTODATE, &dev->flags) || |
2388 | !test_bit(R5_UPTODATE, &dev->flags)) { | 2538 | test_bit(R5_Wantcompute, &dev->flags))) { |
2389 | if (test_bit(R5_Insync, &dev->flags)) rcw++; | 2539 | rcw++; |
2390 | else { | 2540 | if (!test_bit(R5_Insync, &dev->flags)) |
2391 | pr_debug("raid6: must_compute: " | 2541 | continue; /* it's a failed drive */ |
2392 | "disk %d flags=%#lx\n", i, dev->flags); | 2542 | |
2393 | must_compute++; | 2543 | if ( |
2544 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
2545 | pr_debug("Read_old stripe %llu " | ||
2546 | "block %d for Reconstruct\n", | ||
2547 | (unsigned long long)sh->sector, i); | ||
2548 | set_bit(R5_LOCKED, &dev->flags); | ||
2549 | set_bit(R5_Wantread, &dev->flags); | ||
2550 | s->locked++; | ||
2551 | } else { | ||
2552 | pr_debug("Request delayed stripe %llu " | ||
2553 | "block %d for Reconstruct\n", | ||
2554 | (unsigned long long)sh->sector, i); | ||
2555 | set_bit(STRIPE_DELAYED, &sh->state); | ||
2556 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2394 | } | 2557 | } |
2395 | } | 2558 | } |
2396 | } | 2559 | } |
2397 | pr_debug("for sector %llu, rcw=%d, must_compute=%d\n", | ||
2398 | (unsigned long long)sh->sector, rcw, must_compute); | ||
2399 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2400 | |||
2401 | if (rcw > 0) | ||
2402 | /* want reconstruct write, but need to get some data */ | ||
2403 | for (i = disks; i--; ) { | ||
2404 | struct r5dev *dev = &sh->dev[i]; | ||
2405 | if (!test_bit(R5_OVERWRITE, &dev->flags) | ||
2406 | && !(s->failed == 0 && (i == pd_idx || i == qd_idx)) | ||
2407 | && !test_bit(R5_LOCKED, &dev->flags) && | ||
2408 | !test_bit(R5_UPTODATE, &dev->flags) && | ||
2409 | test_bit(R5_Insync, &dev->flags)) { | ||
2410 | if ( | ||
2411 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
2412 | pr_debug("Read_old stripe %llu " | ||
2413 | "block %d for Reconstruct\n", | ||
2414 | (unsigned long long)sh->sector, i); | ||
2415 | set_bit(R5_LOCKED, &dev->flags); | ||
2416 | set_bit(R5_Wantread, &dev->flags); | ||
2417 | s->locked++; | ||
2418 | } else { | ||
2419 | pr_debug("Request delayed stripe %llu " | ||
2420 | "block %d for Reconstruct\n", | ||
2421 | (unsigned long long)sh->sector, i); | ||
2422 | set_bit(STRIPE_DELAYED, &sh->state); | ||
2423 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2424 | } | ||
2425 | } | ||
2426 | } | ||
2427 | /* now if nothing is locked, and if we have enough data, we can start a | 2560 | /* now if nothing is locked, and if we have enough data, we can start a |
2428 | * write request | 2561 | * write request |
2429 | */ | 2562 | */ |
2430 | if (s->locked == 0 && rcw == 0 && | 2563 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && |
2564 | s->locked == 0 && rcw == 0 && | ||
2431 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | 2565 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { |
2432 | if (must_compute > 0) { | 2566 | schedule_reconstruction(sh, s, 1, 0); |
2433 | /* We have failed blocks and need to compute them */ | ||
2434 | switch (s->failed) { | ||
2435 | case 0: | ||
2436 | BUG(); | ||
2437 | case 1: | ||
2438 | compute_block_1(sh, r6s->failed_num[0], 0); | ||
2439 | break; | ||
2440 | case 2: | ||
2441 | compute_block_2(sh, r6s->failed_num[0], | ||
2442 | r6s->failed_num[1]); | ||
2443 | break; | ||
2444 | default: /* This request should have been failed? */ | ||
2445 | BUG(); | ||
2446 | } | ||
2447 | } | ||
2448 | |||
2449 | pr_debug("Computing parity for stripe %llu\n", | ||
2450 | (unsigned long long)sh->sector); | ||
2451 | compute_parity6(sh, RECONSTRUCT_WRITE); | ||
2452 | /* now every locked buffer is ready to be written */ | ||
2453 | for (i = disks; i--; ) | ||
2454 | if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { | ||
2455 | pr_debug("Writing stripe %llu block %d\n", | ||
2456 | (unsigned long long)sh->sector, i); | ||
2457 | s->locked++; | ||
2458 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
2459 | } | ||
2460 | if (s->locked == disks) | ||
2461 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) | ||
2462 | atomic_inc(&conf->pending_full_writes); | ||
2463 | /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ | ||
2464 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2465 | |||
2466 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
2467 | atomic_dec(&conf->preread_active_stripes); | ||
2468 | if (atomic_read(&conf->preread_active_stripes) < | ||
2469 | IO_THRESHOLD) | ||
2470 | md_wakeup_thread(conf->mddev->thread); | ||
2471 | } | ||
2472 | } | 2567 | } |
2473 | } | 2568 | } |
2474 | 2569 | ||
@@ -2527,7 +2622,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2527 | * we are done. Otherwise update the mismatch count and repair | 2622 | * we are done. Otherwise update the mismatch count and repair |
2528 | * parity if !MD_RECOVERY_CHECK | 2623 | * parity if !MD_RECOVERY_CHECK |
2529 | */ | 2624 | */ |
2530 | if (sh->ops.zero_sum_result == 0) | 2625 | if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) |
2531 | /* parity is correct (on disc, | 2626 | /* parity is correct (on disc, |
2532 | * not in buffer any more) | 2627 | * not in buffer any more) |
2533 | */ | 2628 | */ |
@@ -2544,6 +2639,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2544 | set_bit(R5_Wantcompute, | 2639 | set_bit(R5_Wantcompute, |
2545 | &sh->dev[sh->pd_idx].flags); | 2640 | &sh->dev[sh->pd_idx].flags); |
2546 | sh->ops.target = sh->pd_idx; | 2641 | sh->ops.target = sh->pd_idx; |
2642 | sh->ops.target2 = -1; | ||
2547 | s->uptodate++; | 2643 | s->uptodate++; |
2548 | } | 2644 | } |
2549 | } | 2645 | } |
@@ -2560,67 +2656,74 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2560 | 2656 | ||
2561 | 2657 | ||
2562 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | 2658 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, |
2563 | struct stripe_head_state *s, | 2659 | struct stripe_head_state *s, |
2564 | struct r6_state *r6s, struct page *tmp_page, | 2660 | struct r6_state *r6s, int disks) |
2565 | int disks) | ||
2566 | { | 2661 | { |
2567 | int update_p = 0, update_q = 0; | ||
2568 | struct r5dev *dev; | ||
2569 | int pd_idx = sh->pd_idx; | 2662 | int pd_idx = sh->pd_idx; |
2570 | int qd_idx = sh->qd_idx; | 2663 | int qd_idx = sh->qd_idx; |
2664 | struct r5dev *dev; | ||
2571 | 2665 | ||
2572 | set_bit(STRIPE_HANDLE, &sh->state); | 2666 | set_bit(STRIPE_HANDLE, &sh->state); |
2573 | 2667 | ||
2574 | BUG_ON(s->failed > 2); | 2668 | BUG_ON(s->failed > 2); |
2575 | BUG_ON(s->uptodate < disks); | 2669 | |
2576 | /* Want to check and possibly repair P and Q. | 2670 | /* Want to check and possibly repair P and Q. |
2577 | * However there could be one 'failed' device, in which | 2671 | * However there could be one 'failed' device, in which |
2578 | * case we can only check one of them, possibly using the | 2672 | * case we can only check one of them, possibly using the |
2579 | * other to generate missing data | 2673 | * other to generate missing data |
2580 | */ | 2674 | */ |
2581 | 2675 | ||
2582 | /* If !tmp_page, we cannot do the calculations, | 2676 | switch (sh->check_state) { |
2583 | * but as we have set STRIPE_HANDLE, we will soon be called | 2677 | case check_state_idle: |
2584 | * by stripe_handle with a tmp_page - just wait until then. | 2678 | /* start a new check operation if there are < 2 failures */ |
2585 | */ | ||
2586 | if (tmp_page) { | ||
2587 | if (s->failed == r6s->q_failed) { | 2679 | if (s->failed == r6s->q_failed) { |
2588 | /* The only possible failed device holds 'Q', so it | 2680 | /* The only possible failed device holds Q, so it |
2589 | * makes sense to check P (If anything else were failed, | 2681 | * makes sense to check P (If anything else were failed, |
2590 | * we would have used P to recreate it). | 2682 | * we would have used P to recreate it). |
2591 | */ | 2683 | */ |
2592 | compute_block_1(sh, pd_idx, 1); | 2684 | sh->check_state = check_state_run; |
2593 | if (!page_is_zero(sh->dev[pd_idx].page)) { | ||
2594 | compute_block_1(sh, pd_idx, 0); | ||
2595 | update_p = 1; | ||
2596 | } | ||
2597 | } | 2685 | } |
2598 | if (!r6s->q_failed && s->failed < 2) { | 2686 | if (!r6s->q_failed && s->failed < 2) { |
2599 | /* q is not failed, and we didn't use it to generate | 2687 | /* Q is not failed, and we didn't use it to generate |
2600 | * anything, so it makes sense to check it | 2688 | * anything, so it makes sense to check it |
2601 | */ | 2689 | */ |
2602 | memcpy(page_address(tmp_page), | 2690 | if (sh->check_state == check_state_run) |
2603 | page_address(sh->dev[qd_idx].page), | 2691 | sh->check_state = check_state_run_pq; |
2604 | STRIPE_SIZE); | 2692 | else |
2605 | compute_parity6(sh, UPDATE_PARITY); | 2693 | sh->check_state = check_state_run_q; |
2606 | if (memcmp(page_address(tmp_page), | ||
2607 | page_address(sh->dev[qd_idx].page), | ||
2608 | STRIPE_SIZE) != 0) { | ||
2609 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
2610 | update_q = 1; | ||
2611 | } | ||
2612 | } | 2694 | } |
2613 | if (update_p || update_q) { | 2695 | |
2614 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | 2696 | /* discard potentially stale zero_sum_result */ |
2615 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | 2697 | sh->ops.zero_sum_result = 0; |
2616 | /* don't try to repair!! */ | 2698 | |
2617 | update_p = update_q = 0; | 2699 | if (sh->check_state == check_state_run) { |
2700 | /* async_xor_zero_sum destroys the contents of P */ | ||
2701 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
2702 | s->uptodate--; | ||
2703 | } | ||
2704 | if (sh->check_state >= check_state_run && | ||
2705 | sh->check_state <= check_state_run_pq) { | ||
2706 | /* async_syndrome_zero_sum preserves P and Q, so | ||
2707 | * no need to mark them !uptodate here | ||
2708 | */ | ||
2709 | set_bit(STRIPE_OP_CHECK, &s->ops_request); | ||
2710 | break; | ||
2618 | } | 2711 | } |
2619 | 2712 | ||
2713 | /* we have 2-disk failure */ | ||
2714 | BUG_ON(s->failed != 2); | ||
2715 | /* fall through */ | ||
2716 | case check_state_compute_result: | ||
2717 | sh->check_state = check_state_idle; | ||
2718 | |||
2719 | /* check that a write has not made the stripe insync */ | ||
2720 | if (test_bit(STRIPE_INSYNC, &sh->state)) | ||
2721 | break; | ||
2722 | |||
2620 | /* now write out any block on a failed drive, | 2723 | /* now write out any block on a failed drive, |
2621 | * or P or Q if they need it | 2724 | * or P or Q if they were recomputed |
2622 | */ | 2725 | */ |
2623 | 2726 | BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ | |
2624 | if (s->failed == 2) { | 2727 | if (s->failed == 2) { |
2625 | dev = &sh->dev[r6s->failed_num[1]]; | 2728 | dev = &sh->dev[r6s->failed_num[1]]; |
2626 | s->locked++; | 2729 | s->locked++; |
@@ -2633,14 +2736,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
2633 | set_bit(R5_LOCKED, &dev->flags); | 2736 | set_bit(R5_LOCKED, &dev->flags); |
2634 | set_bit(R5_Wantwrite, &dev->flags); | 2737 | set_bit(R5_Wantwrite, &dev->flags); |
2635 | } | 2738 | } |
2636 | 2739 | if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { | |
2637 | if (update_p) { | ||
2638 | dev = &sh->dev[pd_idx]; | 2740 | dev = &sh->dev[pd_idx]; |
2639 | s->locked++; | 2741 | s->locked++; |
2640 | set_bit(R5_LOCKED, &dev->flags); | 2742 | set_bit(R5_LOCKED, &dev->flags); |
2641 | set_bit(R5_Wantwrite, &dev->flags); | 2743 | set_bit(R5_Wantwrite, &dev->flags); |
2642 | } | 2744 | } |
2643 | if (update_q) { | 2745 | if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { |
2644 | dev = &sh->dev[qd_idx]; | 2746 | dev = &sh->dev[qd_idx]; |
2645 | s->locked++; | 2747 | s->locked++; |
2646 | set_bit(R5_LOCKED, &dev->flags); | 2748 | set_bit(R5_LOCKED, &dev->flags); |
@@ -2649,6 +2751,70 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
2649 | clear_bit(STRIPE_DEGRADED, &sh->state); | 2751 | clear_bit(STRIPE_DEGRADED, &sh->state); |
2650 | 2752 | ||
2651 | set_bit(STRIPE_INSYNC, &sh->state); | 2753 | set_bit(STRIPE_INSYNC, &sh->state); |
2754 | break; | ||
2755 | case check_state_run: | ||
2756 | case check_state_run_q: | ||
2757 | case check_state_run_pq: | ||
2758 | break; /* we will be called again upon completion */ | ||
2759 | case check_state_check_result: | ||
2760 | sh->check_state = check_state_idle; | ||
2761 | |||
2762 | /* handle a successful check operation, if parity is correct | ||
2763 | * we are done. Otherwise update the mismatch count and repair | ||
2764 | * parity if !MD_RECOVERY_CHECK | ||
2765 | */ | ||
2766 | if (sh->ops.zero_sum_result == 0) { | ||
2767 | /* both parities are correct */ | ||
2768 | if (!s->failed) | ||
2769 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2770 | else { | ||
2771 | /* in contrast to the raid5 case we can validate | ||
2772 | * parity, but still have a failure to write | ||
2773 | * back | ||
2774 | */ | ||
2775 | sh->check_state = check_state_compute_result; | ||
2776 | /* Returning at this point means that we may go | ||
2777 | * off and bring p and/or q uptodate again so | ||
2778 | * we make sure to check zero_sum_result again | ||
2779 | * to verify if p or q need writeback | ||
2780 | */ | ||
2781 | } | ||
2782 | } else { | ||
2783 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | ||
2784 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
2785 | /* don't try to repair!! */ | ||
2786 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2787 | else { | ||
2788 | int *target = &sh->ops.target; | ||
2789 | |||
2790 | sh->ops.target = -1; | ||
2791 | sh->ops.target2 = -1; | ||
2792 | sh->check_state = check_state_compute_run; | ||
2793 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
2794 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
2795 | if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { | ||
2796 | set_bit(R5_Wantcompute, | ||
2797 | &sh->dev[pd_idx].flags); | ||
2798 | *target = pd_idx; | ||
2799 | target = &sh->ops.target2; | ||
2800 | s->uptodate++; | ||
2801 | } | ||
2802 | if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { | ||
2803 | set_bit(R5_Wantcompute, | ||
2804 | &sh->dev[qd_idx].flags); | ||
2805 | *target = qd_idx; | ||
2806 | s->uptodate++; | ||
2807 | } | ||
2808 | } | ||
2809 | } | ||
2810 | break; | ||
2811 | case check_state_compute_run: | ||
2812 | break; | ||
2813 | default: | ||
2814 | printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", | ||
2815 | __func__, sh->check_state, | ||
2816 | (unsigned long long) sh->sector); | ||
2817 | BUG(); | ||
2652 | } | 2818 | } |
2653 | } | 2819 | } |
2654 | 2820 | ||
@@ -2666,6 +2832,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
2666 | if (i != sh->pd_idx && i != sh->qd_idx) { | 2832 | if (i != sh->pd_idx && i != sh->qd_idx) { |
2667 | int dd_idx, j; | 2833 | int dd_idx, j; |
2668 | struct stripe_head *sh2; | 2834 | struct stripe_head *sh2; |
2835 | struct async_submit_ctl submit; | ||
2669 | 2836 | ||
2670 | sector_t bn = compute_blocknr(sh, i, 1); | 2837 | sector_t bn = compute_blocknr(sh, i, 1); |
2671 | sector_t s = raid5_compute_sector(conf, bn, 0, | 2838 | sector_t s = raid5_compute_sector(conf, bn, 0, |
@@ -2685,9 +2852,10 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
2685 | } | 2852 | } |
2686 | 2853 | ||
2687 | /* place all the copies on one channel */ | 2854 | /* place all the copies on one channel */ |
2855 | init_async_submit(&submit, 0, tx, NULL, NULL, NULL); | ||
2688 | tx = async_memcpy(sh2->dev[dd_idx].page, | 2856 | tx = async_memcpy(sh2->dev[dd_idx].page, |
2689 | sh->dev[i].page, 0, 0, STRIPE_SIZE, | 2857 | sh->dev[i].page, 0, 0, STRIPE_SIZE, |
2690 | ASYNC_TX_DEP_ACK, tx, NULL, NULL); | 2858 | &submit); |
2691 | 2859 | ||
2692 | set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); | 2860 | set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); |
2693 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); | 2861 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); |
@@ -2974,7 +3142,7 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
2974 | /* Need to write out all blocks after computing parity */ | 3142 | /* Need to write out all blocks after computing parity */ |
2975 | sh->disks = conf->raid_disks; | 3143 | sh->disks = conf->raid_disks; |
2976 | stripe_set_idx(sh->sector, conf, 0, sh); | 3144 | stripe_set_idx(sh->sector, conf, 0, sh); |
2977 | schedule_reconstruction5(sh, &s, 1, 1); | 3145 | schedule_reconstruction(sh, &s, 1, 1); |
2978 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { | 3146 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { |
2979 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | 3147 | clear_bit(STRIPE_EXPAND_READY, &sh->state); |
2980 | atomic_dec(&conf->reshape_stripes); | 3148 | atomic_dec(&conf->reshape_stripes); |
@@ -2994,7 +3162,7 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
2994 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 3162 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); |
2995 | 3163 | ||
2996 | if (s.ops_request) | 3164 | if (s.ops_request) |
2997 | raid5_run_ops(sh, s.ops_request); | 3165 | raid_run_ops(sh, s.ops_request); |
2998 | 3166 | ||
2999 | ops_run_io(sh, &s); | 3167 | ops_run_io(sh, &s); |
3000 | 3168 | ||
@@ -3003,7 +3171,7 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
3003 | return blocked_rdev == NULL; | 3171 | return blocked_rdev == NULL; |
3004 | } | 3172 | } |
3005 | 3173 | ||
3006 | static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | 3174 | static bool handle_stripe6(struct stripe_head *sh) |
3007 | { | 3175 | { |
3008 | raid5_conf_t *conf = sh->raid_conf; | 3176 | raid5_conf_t *conf = sh->raid_conf; |
3009 | int disks = sh->disks; | 3177 | int disks = sh->disks; |
@@ -3015,9 +3183,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3015 | mdk_rdev_t *blocked_rdev = NULL; | 3183 | mdk_rdev_t *blocked_rdev = NULL; |
3016 | 3184 | ||
3017 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " | 3185 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " |
3018 | "pd_idx=%d, qd_idx=%d\n", | 3186 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", |
3019 | (unsigned long long)sh->sector, sh->state, | 3187 | (unsigned long long)sh->sector, sh->state, |
3020 | atomic_read(&sh->count), pd_idx, qd_idx); | 3188 | atomic_read(&sh->count), pd_idx, qd_idx, |
3189 | sh->check_state, sh->reconstruct_state); | ||
3021 | memset(&s, 0, sizeof(s)); | 3190 | memset(&s, 0, sizeof(s)); |
3022 | 3191 | ||
3023 | spin_lock(&sh->lock); | 3192 | spin_lock(&sh->lock); |
@@ -3037,35 +3206,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3037 | 3206 | ||
3038 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", | 3207 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", |
3039 | i, dev->flags, dev->toread, dev->towrite, dev->written); | 3208 | i, dev->flags, dev->toread, dev->towrite, dev->written); |
3040 | /* maybe we can reply to a read */ | 3209 | /* maybe we can reply to a read |
3041 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { | 3210 | * |
3042 | struct bio *rbi, *rbi2; | 3211 | * new wantfill requests are only permitted while |
3043 | pr_debug("Return read for disc %d\n", i); | 3212 | * ops_complete_biofill is guaranteed to be inactive |
3044 | spin_lock_irq(&conf->device_lock); | 3213 | */ |
3045 | rbi = dev->toread; | 3214 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && |
3046 | dev->toread = NULL; | 3215 | !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) |
3047 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) | 3216 | set_bit(R5_Wantfill, &dev->flags); |
3048 | wake_up(&conf->wait_for_overlap); | ||
3049 | spin_unlock_irq(&conf->device_lock); | ||
3050 | while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { | ||
3051 | copy_data(0, rbi, dev->page, dev->sector); | ||
3052 | rbi2 = r5_next_bio(rbi, dev->sector); | ||
3053 | spin_lock_irq(&conf->device_lock); | ||
3054 | if (!raid5_dec_bi_phys_segments(rbi)) { | ||
3055 | rbi->bi_next = return_bi; | ||
3056 | return_bi = rbi; | ||
3057 | } | ||
3058 | spin_unlock_irq(&conf->device_lock); | ||
3059 | rbi = rbi2; | ||
3060 | } | ||
3061 | } | ||
3062 | 3217 | ||
3063 | /* now count some things */ | 3218 | /* now count some things */ |
3064 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; | 3219 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; |
3065 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; | 3220 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; |
3221 | if (test_bit(R5_Wantcompute, &dev->flags)) { | ||
3222 | s.compute++; | ||
3223 | BUG_ON(s.compute > 2); | ||
3224 | } | ||
3066 | 3225 | ||
3067 | 3226 | if (test_bit(R5_Wantfill, &dev->flags)) { | |
3068 | if (dev->toread) | 3227 | s.to_fill++; |
3228 | } else if (dev->toread) | ||
3069 | s.to_read++; | 3229 | s.to_read++; |
3070 | if (dev->towrite) { | 3230 | if (dev->towrite) { |
3071 | s.to_write++; | 3231 | s.to_write++; |
@@ -3106,6 +3266,11 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3106 | blocked_rdev = NULL; | 3266 | blocked_rdev = NULL; |
3107 | } | 3267 | } |
3108 | 3268 | ||
3269 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { | ||
3270 | set_bit(STRIPE_OP_BIOFILL, &s.ops_request); | ||
3271 | set_bit(STRIPE_BIOFILL_RUN, &sh->state); | ||
3272 | } | ||
3273 | |||
3109 | pr_debug("locked=%d uptodate=%d to_read=%d" | 3274 | pr_debug("locked=%d uptodate=%d to_read=%d" |
3110 | " to_write=%d failed=%d failed_num=%d,%d\n", | 3275 | " to_write=%d failed=%d failed_num=%d,%d\n", |
3111 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, | 3276 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, |
@@ -3146,19 +3311,62 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3146 | * or to load a block that is being partially written. | 3311 | * or to load a block that is being partially written. |
3147 | */ | 3312 | */ |
3148 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || | 3313 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || |
3149 | (s.syncing && (s.uptodate < disks)) || s.expanding) | 3314 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) |
3150 | handle_stripe_fill6(sh, &s, &r6s, disks); | 3315 | handle_stripe_fill6(sh, &s, &r6s, disks); |
3151 | 3316 | ||
3152 | /* now to consider writing and what else, if anything should be read */ | 3317 | /* Now we check to see if any write operations have recently |
3153 | if (s.to_write) | 3318 | * completed |
3319 | */ | ||
3320 | if (sh->reconstruct_state == reconstruct_state_drain_result) { | ||
3321 | int qd_idx = sh->qd_idx; | ||
3322 | |||
3323 | sh->reconstruct_state = reconstruct_state_idle; | ||
3324 | /* All the 'written' buffers and the parity blocks are ready to | ||
3325 | * be written back to disk | ||
3326 | */ | ||
3327 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); | ||
3328 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); | ||
3329 | for (i = disks; i--; ) { | ||
3330 | dev = &sh->dev[i]; | ||
3331 | if (test_bit(R5_LOCKED, &dev->flags) && | ||
3332 | (i == sh->pd_idx || i == qd_idx || | ||
3333 | dev->written)) { | ||
3334 | pr_debug("Writing block %d\n", i); | ||
3335 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); | ||
3336 | set_bit(R5_Wantwrite, &dev->flags); | ||
3337 | if (!test_bit(R5_Insync, &dev->flags) || | ||
3338 | ((i == sh->pd_idx || i == qd_idx) && | ||
3339 | s.failed == 0)) | ||
3340 | set_bit(STRIPE_INSYNC, &sh->state); | ||
3341 | } | ||
3342 | } | ||
3343 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
3344 | atomic_dec(&conf->preread_active_stripes); | ||
3345 | if (atomic_read(&conf->preread_active_stripes) < | ||
3346 | IO_THRESHOLD) | ||
3347 | md_wakeup_thread(conf->mddev->thread); | ||
3348 | } | ||
3349 | } | ||
3350 | |||
3351 | /* Now to consider new write requests and what else, if anything | ||
3352 | * should be read. We do not handle new writes when: | ||
3353 | * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. | ||
3354 | * 2/ A 'check' operation is in flight, as it may clobber the parity | ||
3355 | * block. | ||
3356 | */ | ||
3357 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) | ||
3154 | handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); | 3358 | handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); |
3155 | 3359 | ||
3156 | /* maybe we need to check and possibly fix the parity for this stripe | 3360 | /* maybe we need to check and possibly fix the parity for this stripe |
3157 | * Any reads will already have been scheduled, so we just see if enough | 3361 | * Any reads will already have been scheduled, so we just see if enough |
3158 | * data is available | 3362 | * data is available. The parity check is held off while parity |
3363 | * dependent operations are in flight. | ||
3159 | */ | 3364 | */ |
3160 | if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) | 3365 | if (sh->check_state || |
3161 | handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); | 3366 | (s.syncing && s.locked == 0 && |
3367 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && | ||
3368 | !test_bit(STRIPE_INSYNC, &sh->state))) | ||
3369 | handle_parity_checks6(conf, sh, &s, &r6s, disks); | ||
3162 | 3370 | ||
3163 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 3371 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
3164 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | 3372 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); |
@@ -3179,15 +3387,29 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3179 | set_bit(R5_Wantwrite, &dev->flags); | 3387 | set_bit(R5_Wantwrite, &dev->flags); |
3180 | set_bit(R5_ReWrite, &dev->flags); | 3388 | set_bit(R5_ReWrite, &dev->flags); |
3181 | set_bit(R5_LOCKED, &dev->flags); | 3389 | set_bit(R5_LOCKED, &dev->flags); |
3390 | s.locked++; | ||
3182 | } else { | 3391 | } else { |
3183 | /* let's read it back */ | 3392 | /* let's read it back */ |
3184 | set_bit(R5_Wantread, &dev->flags); | 3393 | set_bit(R5_Wantread, &dev->flags); |
3185 | set_bit(R5_LOCKED, &dev->flags); | 3394 | set_bit(R5_LOCKED, &dev->flags); |
3395 | s.locked++; | ||
3186 | } | 3396 | } |
3187 | } | 3397 | } |
3188 | } | 3398 | } |
3189 | 3399 | ||
3190 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { | 3400 | /* Finish reconstruct operations initiated by the expansion process */ |
3401 | if (sh->reconstruct_state == reconstruct_state_result) { | ||
3402 | sh->reconstruct_state = reconstruct_state_idle; | ||
3403 | clear_bit(STRIPE_EXPANDING, &sh->state); | ||
3404 | for (i = conf->raid_disks; i--; ) { | ||
3405 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
3406 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
3407 | s.locked++; | ||
3408 | } | ||
3409 | } | ||
3410 | |||
3411 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && | ||
3412 | !sh->reconstruct_state) { | ||
3191 | struct stripe_head *sh2 | 3413 | struct stripe_head *sh2 |
3192 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | 3414 | = get_active_stripe(conf, sh->sector, 1, 1, 1); |
3193 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { | 3415 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { |
@@ -3208,14 +3430,8 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3208 | /* Need to write out all blocks after computing P&Q */ | 3430 | /* Need to write out all blocks after computing P&Q */ |
3209 | sh->disks = conf->raid_disks; | 3431 | sh->disks = conf->raid_disks; |
3210 | stripe_set_idx(sh->sector, conf, 0, sh); | 3432 | stripe_set_idx(sh->sector, conf, 0, sh); |
3211 | compute_parity6(sh, RECONSTRUCT_WRITE); | 3433 | schedule_reconstruction(sh, &s, 1, 1); |
3212 | for (i = conf->raid_disks ; i-- ; ) { | 3434 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { |
3213 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
3214 | s.locked++; | ||
3215 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
3216 | } | ||
3217 | clear_bit(STRIPE_EXPANDING, &sh->state); | ||
3218 | } else if (s.expanded) { | ||
3219 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | 3435 | clear_bit(STRIPE_EXPAND_READY, &sh->state); |
3220 | atomic_dec(&conf->reshape_stripes); | 3436 | atomic_dec(&conf->reshape_stripes); |
3221 | wake_up(&conf->wait_for_overlap); | 3437 | wake_up(&conf->wait_for_overlap); |
@@ -3233,6 +3449,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3233 | if (unlikely(blocked_rdev)) | 3449 | if (unlikely(blocked_rdev)) |
3234 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 3450 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); |
3235 | 3451 | ||
3452 | if (s.ops_request) | ||
3453 | raid_run_ops(sh, s.ops_request); | ||
3454 | |||
3236 | ops_run_io(sh, &s); | 3455 | ops_run_io(sh, &s); |
3237 | 3456 | ||
3238 | return_io(return_bi); | 3457 | return_io(return_bi); |
@@ -3241,16 +3460,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3241 | } | 3460 | } |
3242 | 3461 | ||
3243 | /* returns true if the stripe was handled */ | 3462 | /* returns true if the stripe was handled */ |
3244 | static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) | 3463 | static bool handle_stripe(struct stripe_head *sh) |
3245 | { | 3464 | { |
3246 | if (sh->raid_conf->level == 6) | 3465 | if (sh->raid_conf->level == 6) |
3247 | return handle_stripe6(sh, tmp_page); | 3466 | return handle_stripe6(sh); |
3248 | else | 3467 | else |
3249 | return handle_stripe5(sh); | 3468 | return handle_stripe5(sh); |
3250 | } | 3469 | } |
3251 | 3470 | ||
3252 | |||
3253 | |||
3254 | static void raid5_activate_delayed(raid5_conf_t *conf) | 3471 | static void raid5_activate_delayed(raid5_conf_t *conf) |
3255 | { | 3472 | { |
3256 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { | 3473 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { |
@@ -4061,7 +4278,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
4061 | spin_unlock(&sh->lock); | 4278 | spin_unlock(&sh->lock); |
4062 | 4279 | ||
4063 | /* wait for any blocked device to be handled */ | 4280 | /* wait for any blocked device to be handled */ |
4064 | while(unlikely(!handle_stripe(sh, NULL))) | 4281 | while (unlikely(!handle_stripe(sh))) |
4065 | ; | 4282 | ; |
4066 | release_stripe(sh); | 4283 | release_stripe(sh); |
4067 | 4284 | ||
@@ -4118,7 +4335,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) | |||
4118 | return handled; | 4335 | return handled; |
4119 | } | 4336 | } |
4120 | 4337 | ||
4121 | handle_stripe(sh, NULL); | 4338 | handle_stripe(sh); |
4122 | release_stripe(sh); | 4339 | release_stripe(sh); |
4123 | handled++; | 4340 | handled++; |
4124 | } | 4341 | } |
@@ -4132,6 +4349,36 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) | |||
4132 | return handled; | 4349 | return handled; |
4133 | } | 4350 | } |
4134 | 4351 | ||
4352 | #ifdef CONFIG_MULTICORE_RAID456 | ||
4353 | static void __process_stripe(void *param, async_cookie_t cookie) | ||
4354 | { | ||
4355 | struct stripe_head *sh = param; | ||
4356 | |||
4357 | handle_stripe(sh); | ||
4358 | release_stripe(sh); | ||
4359 | } | ||
4360 | |||
4361 | static void process_stripe(struct stripe_head *sh, struct list_head *domain) | ||
4362 | { | ||
4363 | async_schedule_domain(__process_stripe, sh, domain); | ||
4364 | } | ||
4365 | |||
4366 | static void synchronize_stripe_processing(struct list_head *domain) | ||
4367 | { | ||
4368 | async_synchronize_full_domain(domain); | ||
4369 | } | ||
4370 | #else | ||
4371 | static void process_stripe(struct stripe_head *sh, struct list_head *domain) | ||
4372 | { | ||
4373 | handle_stripe(sh); | ||
4374 | release_stripe(sh); | ||
4375 | cond_resched(); | ||
4376 | } | ||
4377 | |||
4378 | static void synchronize_stripe_processing(struct list_head *domain) | ||
4379 | { | ||
4380 | } | ||
4381 | #endif | ||
4135 | 4382 | ||
4136 | 4383 | ||
4137 | /* | 4384 | /* |
@@ -4146,6 +4393,7 @@ static void raid5d(mddev_t *mddev) | |||
4146 | struct stripe_head *sh; | 4393 | struct stripe_head *sh; |
4147 | raid5_conf_t *conf = mddev->private; | 4394 | raid5_conf_t *conf = mddev->private; |
4148 | int handled; | 4395 | int handled; |
4396 | LIST_HEAD(raid_domain); | ||
4149 | 4397 | ||
4150 | pr_debug("+++ raid5d active\n"); | 4398 | pr_debug("+++ raid5d active\n"); |
4151 | 4399 | ||
@@ -4182,8 +4430,7 @@ static void raid5d(mddev_t *mddev) | |||
4182 | spin_unlock_irq(&conf->device_lock); | 4430 | spin_unlock_irq(&conf->device_lock); |
4183 | 4431 | ||
4184 | handled++; | 4432 | handled++; |
4185 | handle_stripe(sh, conf->spare_page); | 4433 | process_stripe(sh, &raid_domain); |
4186 | release_stripe(sh); | ||
4187 | 4434 | ||
4188 | spin_lock_irq(&conf->device_lock); | 4435 | spin_lock_irq(&conf->device_lock); |
4189 | } | 4436 | } |
@@ -4191,6 +4438,7 @@ static void raid5d(mddev_t *mddev) | |||
4191 | 4438 | ||
4192 | spin_unlock_irq(&conf->device_lock); | 4439 | spin_unlock_irq(&conf->device_lock); |
4193 | 4440 | ||
4441 | synchronize_stripe_processing(&raid_domain); | ||
4194 | async_tx_issue_pending_all(); | 4442 | async_tx_issue_pending_all(); |
4195 | unplug_slaves(mddev); | 4443 | unplug_slaves(mddev); |
4196 | 4444 | ||
@@ -4323,15 +4571,118 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
4323 | return sectors * (raid_disks - conf->max_degraded); | 4571 | return sectors * (raid_disks - conf->max_degraded); |
4324 | } | 4572 | } |
4325 | 4573 | ||
4574 | static void raid5_free_percpu(raid5_conf_t *conf) | ||
4575 | { | ||
4576 | struct raid5_percpu *percpu; | ||
4577 | unsigned long cpu; | ||
4578 | |||
4579 | if (!conf->percpu) | ||
4580 | return; | ||
4581 | |||
4582 | get_online_cpus(); | ||
4583 | for_each_possible_cpu(cpu) { | ||
4584 | percpu = per_cpu_ptr(conf->percpu, cpu); | ||
4585 | safe_put_page(percpu->spare_page); | ||
4586 | kfree(percpu->scribble); | ||
4587 | } | ||
4588 | #ifdef CONFIG_HOTPLUG_CPU | ||
4589 | unregister_cpu_notifier(&conf->cpu_notify); | ||
4590 | #endif | ||
4591 | put_online_cpus(); | ||
4592 | |||
4593 | free_percpu(conf->percpu); | ||
4594 | } | ||
4595 | |||
4326 | static void free_conf(raid5_conf_t *conf) | 4596 | static void free_conf(raid5_conf_t *conf) |
4327 | { | 4597 | { |
4328 | shrink_stripes(conf); | 4598 | shrink_stripes(conf); |
4329 | safe_put_page(conf->spare_page); | 4599 | raid5_free_percpu(conf); |
4330 | kfree(conf->disks); | 4600 | kfree(conf->disks); |
4331 | kfree(conf->stripe_hashtbl); | 4601 | kfree(conf->stripe_hashtbl); |
4332 | kfree(conf); | 4602 | kfree(conf); |
4333 | } | 4603 | } |
4334 | 4604 | ||
4605 | #ifdef CONFIG_HOTPLUG_CPU | ||
4606 | static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, | ||
4607 | void *hcpu) | ||
4608 | { | ||
4609 | raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify); | ||
4610 | long cpu = (long)hcpu; | ||
4611 | struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); | ||
4612 | |||
4613 | switch (action) { | ||
4614 | case CPU_UP_PREPARE: | ||
4615 | case CPU_UP_PREPARE_FROZEN: | ||
4616 | if (conf->level == 6 && !percpu->spare_page) | ||
4617 | percpu->spare_page = alloc_page(GFP_KERNEL); | ||
4618 | if (!percpu->scribble) | ||
4619 | percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); | ||
4620 | |||
4621 | if (!percpu->scribble || | ||
4622 | (conf->level == 6 && !percpu->spare_page)) { | ||
4623 | safe_put_page(percpu->spare_page); | ||
4624 | kfree(percpu->scribble); | ||
4625 | pr_err("%s: failed memory allocation for cpu%ld\n", | ||
4626 | __func__, cpu); | ||
4627 | return NOTIFY_BAD; | ||
4628 | } | ||
4629 | break; | ||
4630 | case CPU_DEAD: | ||
4631 | case CPU_DEAD_FROZEN: | ||
4632 | safe_put_page(percpu->spare_page); | ||
4633 | kfree(percpu->scribble); | ||
4634 | percpu->spare_page = NULL; | ||
4635 | percpu->scribble = NULL; | ||
4636 | break; | ||
4637 | default: | ||
4638 | break; | ||
4639 | } | ||
4640 | return NOTIFY_OK; | ||
4641 | } | ||
4642 | #endif | ||
4643 | |||
4644 | static int raid5_alloc_percpu(raid5_conf_t *conf) | ||
4645 | { | ||
4646 | unsigned long cpu; | ||
4647 | struct page *spare_page; | ||
4648 | struct raid5_percpu *allcpus; | ||
4649 | void *scribble; | ||
4650 | int err; | ||
4651 | |||
4652 | allcpus = alloc_percpu(struct raid5_percpu); | ||
4653 | if (!allcpus) | ||
4654 | return -ENOMEM; | ||
4655 | conf->percpu = allcpus; | ||
4656 | |||
4657 | get_online_cpus(); | ||
4658 | err = 0; | ||
4659 | for_each_present_cpu(cpu) { | ||
4660 | if (conf->level == 6) { | ||
4661 | spare_page = alloc_page(GFP_KERNEL); | ||
4662 | if (!spare_page) { | ||
4663 | err = -ENOMEM; | ||
4664 | break; | ||
4665 | } | ||
4666 | per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; | ||
4667 | } | ||
4668 | scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL); | ||
4669 | if (!scribble) { | ||
4670 | err = -ENOMEM; | ||
4671 | break; | ||
4672 | } | ||
4673 | per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; | ||
4674 | } | ||
4675 | #ifdef CONFIG_HOTPLUG_CPU | ||
4676 | conf->cpu_notify.notifier_call = raid456_cpu_notify; | ||
4677 | conf->cpu_notify.priority = 0; | ||
4678 | if (err == 0) | ||
4679 | err = register_cpu_notifier(&conf->cpu_notify); | ||
4680 | #endif | ||
4681 | put_online_cpus(); | ||
4682 | |||
4683 | return err; | ||
4684 | } | ||
4685 | |||
4335 | static raid5_conf_t *setup_conf(mddev_t *mddev) | 4686 | static raid5_conf_t *setup_conf(mddev_t *mddev) |
4336 | { | 4687 | { |
4337 | raid5_conf_t *conf; | 4688 | raid5_conf_t *conf; |
@@ -4373,6 +4724,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4373 | goto abort; | 4724 | goto abort; |
4374 | 4725 | ||
4375 | conf->raid_disks = mddev->raid_disks; | 4726 | conf->raid_disks = mddev->raid_disks; |
4727 | conf->scribble_len = scribble_len(conf->raid_disks); | ||
4376 | if (mddev->reshape_position == MaxSector) | 4728 | if (mddev->reshape_position == MaxSector) |
4377 | conf->previous_raid_disks = mddev->raid_disks; | 4729 | conf->previous_raid_disks = mddev->raid_disks; |
4378 | else | 4730 | else |
@@ -4388,11 +4740,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4388 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) | 4740 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) |
4389 | goto abort; | 4741 | goto abort; |
4390 | 4742 | ||
4391 | if (mddev->new_level == 6) { | 4743 | conf->level = mddev->new_level; |
4392 | conf->spare_page = alloc_page(GFP_KERNEL); | 4744 | if (raid5_alloc_percpu(conf) != 0) |
4393 | if (!conf->spare_page) | 4745 | goto abort; |
4394 | goto abort; | 4746 | |
4395 | } | ||
4396 | spin_lock_init(&conf->device_lock); | 4747 | spin_lock_init(&conf->device_lock); |
4397 | init_waitqueue_head(&conf->wait_for_stripe); | 4748 | init_waitqueue_head(&conf->wait_for_stripe); |
4398 | init_waitqueue_head(&conf->wait_for_overlap); | 4749 | init_waitqueue_head(&conf->wait_for_overlap); |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 9459689c4ea0..2390e0e83daf 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #define _RAID5_H | 2 | #define _RAID5_H |
3 | 3 | ||
4 | #include <linux/raid/xor.h> | 4 | #include <linux/raid/xor.h> |
5 | #include <linux/dmaengine.h> | ||
5 | 6 | ||
6 | /* | 7 | /* |
7 | * | 8 | * |
@@ -175,7 +176,9 @@ | |||
175 | */ | 176 | */ |
176 | enum check_states { | 177 | enum check_states { |
177 | check_state_idle = 0, | 178 | check_state_idle = 0, |
178 | check_state_run, /* parity check */ | 179 | check_state_run, /* xor parity check */ |
180 | check_state_run_q, /* q-parity check */ | ||
181 | check_state_run_pq, /* pq dual parity check */ | ||
179 | check_state_check_result, | 182 | check_state_check_result, |
180 | check_state_compute_run, /* parity repair */ | 183 | check_state_compute_run, /* parity repair */ |
181 | check_state_compute_result, | 184 | check_state_compute_result, |
@@ -215,8 +218,8 @@ struct stripe_head { | |||
215 | * @target - STRIPE_OP_COMPUTE_BLK target | 218 | * @target - STRIPE_OP_COMPUTE_BLK target |
216 | */ | 219 | */ |
217 | struct stripe_operations { | 220 | struct stripe_operations { |
218 | int target; | 221 | int target, target2; |
219 | u32 zero_sum_result; | 222 | enum sum_check_flags zero_sum_result; |
220 | } ops; | 223 | } ops; |
221 | struct r5dev { | 224 | struct r5dev { |
222 | struct bio req; | 225 | struct bio req; |
@@ -298,7 +301,7 @@ struct r6_state { | |||
298 | #define STRIPE_OP_COMPUTE_BLK 1 | 301 | #define STRIPE_OP_COMPUTE_BLK 1 |
299 | #define STRIPE_OP_PREXOR 2 | 302 | #define STRIPE_OP_PREXOR 2 |
300 | #define STRIPE_OP_BIODRAIN 3 | 303 | #define STRIPE_OP_BIODRAIN 3 |
301 | #define STRIPE_OP_POSTXOR 4 | 304 | #define STRIPE_OP_RECONSTRUCT 4 |
302 | #define STRIPE_OP_CHECK 5 | 305 | #define STRIPE_OP_CHECK 5 |
303 | 306 | ||
304 | /* | 307 | /* |
@@ -385,8 +388,21 @@ struct raid5_private_data { | |||
385 | * (fresh device added). | 388 | * (fresh device added). |
386 | * Cleared when a sync completes. | 389 | * Cleared when a sync completes. |
387 | */ | 390 | */ |
388 | 391 | /* per cpu variables */ | |
389 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | 392 | struct raid5_percpu { |
393 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | ||
394 | void *scribble; /* space for constructing buffer | ||
395 | * lists and performing address | ||
396 | * conversions | ||
397 | */ | ||
398 | } *percpu; | ||
399 | size_t scribble_len; /* size of scribble region must be | ||
400 | * associated with conf to handle | ||
401 | * cpu hotplug while reshaping | ||
402 | */ | ||
403 | #ifdef CONFIG_HOTPLUG_CPU | ||
404 | struct notifier_block cpu_notify; | ||
405 | #endif | ||
390 | 406 | ||
391 | /* | 407 | /* |
392 | * Free stripes pool | 408 | * Free stripes pool |