author    NeilBrown <neilb@suse.de>    2006-06-26 03:27:38 -0400
committer Linus Torvalds <torvalds@g5.osdl.org>    2006-06-26 12:58:37 -0400
commit    16a53ecc35f2a80dc285be2e769768847d89ca37 (patch)
tree      19d005f19af68e9b98efaf16885bb60498f540c1
parent    16f17b39f385212b73278a76d482cdcaaebe6c02 (diff)
[PATCH] md: merge raid5 and raid6 code
There is a lot of commonality between raid5.c and raid6main.c. This patch merges both into one module called raid456. This saves a lot of code, and paves the way for online raid5->raid6 migrations.

There is still duplication, e.g. between handle_stripe5 and handle_stripe6. This will probably be cleaned up later.

Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--  drivers/md/Kconfig              38
-rw-r--r--  drivers/md/Makefile              5
-rw-r--r--  drivers/md/raid5.c            1059
-rw-r--r--  drivers/md/raid6main.c        2427
-rw-r--r--  include/linux/raid/raid5.h       1
5 files changed, 1003 insertions, 2527 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index ac25a48362ac..f657aa7ec78c 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -104,8 +104,8 @@ config MD_RAID10
104 104
105 If unsure, say Y. 105 If unsure, say Y.
106 106
107config MD_RAID5 107config MD_RAID456
108 tristate "RAID-4/RAID-5 mode" 108 tristate "RAID-4/RAID-5/RAID-6 mode"
109 depends on BLK_DEV_MD 109 depends on BLK_DEV_MD
110 ---help--- 110 ---help---
111 A RAID-5 set of N drives with a capacity of C MB per drive provides 111 A RAID-5 set of N drives with a capacity of C MB per drive provides
@@ -116,14 +116,22 @@ config MD_RAID5
116 while a RAID-5 set distributes the parity across the drives in one 116 while a RAID-5 set distributes the parity across the drives in one
117 of the available parity distribution methods. 117 of the available parity distribution methods.
118 118
119 A RAID-6 set of N drives with a capacity of C MB per drive
120 provides the capacity of C * (N - 2) MB, and protects
121 against a failure of any two drives. For a given sector
122 (row) number, (N - 2) drives contain data sectors, and two
123 drives contains two independent redundancy syndromes. Like
124 RAID-5, RAID-6 distributes the syndromes across the drives
125 in one of the available parity distribution methods.
126
119 Information about Software RAID on Linux is contained in the 127 Information about Software RAID on Linux is contained in the
120 Software-RAID mini-HOWTO, available from 128 Software-RAID mini-HOWTO, available from
121 <http://www.tldp.org/docs.html#howto>. There you will also 129 <http://www.tldp.org/docs.html#howto>. There you will also
122 learn where to get the supporting user space utilities raidtools. 130 learn where to get the supporting user space utilities raidtools.
123 131
124 If you want to use such a RAID-4/RAID-5 set, say Y. To 132 If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y. To
125 compile this code as a module, choose M here: the module 133 compile this code as a module, choose M here: the module
126 will be called raid5. 134 will be called raid456.
127 135
128 If unsure, say Y. 136 If unsure, say Y.
129 137
@@ -154,28 +162,6 @@ config MD_RAID5_RESHAPE
154 There should be enough spares already present to make the new 162 There should be enough spares already present to make the new
155 array workable. 163 array workable.
156 164
157config MD_RAID6
158 tristate "RAID-6 mode"
159 depends on BLK_DEV_MD
160 ---help---
161 A RAID-6 set of N drives with a capacity of C MB per drive
162 provides the capacity of C * (N - 2) MB, and protects
163 against a failure of any two drives. For a given sector
164 (row) number, (N - 2) drives contain data sectors, and two
165 drives contains two independent redundancy syndromes. Like
166 RAID-5, RAID-6 distributes the syndromes across the drives
167 in one of the available parity distribution methods.
168
169 RAID-6 requires mdadm-1.5.0 or later, available at:
170
171 ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/
172
173 If you want to use such a RAID-6 set, say Y. To compile
174 this code as a module, choose M here: the module will be
175 called raid6.
176
177 If unsure, say Y.
178
179config MD_MULTIPATH 165config MD_MULTIPATH
180 tristate "Multipath I/O support" 166 tristate "Multipath I/O support"
181 depends on BLK_DEV_MD 167 depends on BLK_DEV_MD
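The capacity figures quoted in the Kconfig help text above can be sanity-checked with a small, purely illustrative calculation. This sketch is not part of the patch; the drive count and per-drive size below are made-up values.

/* Illustrative sketch only -- not part of this patch.  It restates the
 * help text: an N-drive set with C MB per drive yields C * (N - 1) MB
 * usable for RAID-5 and C * (N - 2) MB for RAID-6, because RAID-6 keeps
 * two independent redundancy syndromes per stripe.
 */
#include <stdio.h>

int main(void)
{
	int  n    = 6;      /* hypothetical number of drives */
	long c_mb = 250000; /* hypothetical capacity per drive, in MB */

	printf("RAID-5 usable capacity: %ld MB\n", c_mb * (n - 1));
	printf("RAID-6 usable capacity: %ld MB\n", c_mb * (n - 2));
	return 0;
}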
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index d3efedf6a6ad..34957a68d921 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -8,7 +8,7 @@ dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o
8dm-snapshot-objs := dm-snap.o dm-exception-store.o 8dm-snapshot-objs := dm-snap.o dm-exception-store.o
9dm-mirror-objs := dm-log.o dm-raid1.o 9dm-mirror-objs := dm-log.o dm-raid1.o
10md-mod-objs := md.o bitmap.o 10md-mod-objs := md.o bitmap.o
11raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \ 11raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \
12 raid6int1.o raid6int2.o raid6int4.o \ 12 raid6int1.o raid6int2.o raid6int4.o \
13 raid6int8.o raid6int16.o raid6int32.o \ 13 raid6int8.o raid6int16.o raid6int32.o \
14 raid6altivec1.o raid6altivec2.o raid6altivec4.o \ 14 raid6altivec1.o raid6altivec2.o raid6altivec4.o \
@@ -25,8 +25,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o
25obj-$(CONFIG_MD_RAID0) += raid0.o 25obj-$(CONFIG_MD_RAID0) += raid0.o
26obj-$(CONFIG_MD_RAID1) += raid1.o 26obj-$(CONFIG_MD_RAID1) += raid1.o
27obj-$(CONFIG_MD_RAID10) += raid10.o 27obj-$(CONFIG_MD_RAID10) += raid10.o
28obj-$(CONFIG_MD_RAID5) += raid5.o xor.o 28obj-$(CONFIG_MD_RAID456) += raid456.o xor.o
29obj-$(CONFIG_MD_RAID6) += raid6.o xor.o
30obj-$(CONFIG_MD_MULTIPATH) += multipath.o 29obj-$(CONFIG_MD_MULTIPATH) += multipath.o
31obj-$(CONFIG_MD_FAULTY) += faulty.o 30obj-$(CONFIG_MD_FAULTY) += faulty.o
32obj-$(CONFIG_BLK_DEV_MD) += md-mod.o 31obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 122e64e557b1..9ba73074df04 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2,8 +2,11 @@
2 * raid5.c : Multiple Devices driver for Linux 2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar 4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
5 * 6 *
6 * RAID-5 management functions. 7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible
9 * by donating a test server!
7 * 10 *
8 * This program is free software; you can redistribute it and/or modify 11 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by 12 * it under the terms of the GNU General Public License as published by
@@ -19,11 +22,11 @@
19#include <linux/config.h> 22#include <linux/config.h>
20#include <linux/module.h> 23#include <linux/module.h>
21#include <linux/slab.h> 24#include <linux/slab.h>
22#include <linux/raid/raid5.h>
23#include <linux/highmem.h> 25#include <linux/highmem.h>
24#include <linux/bitops.h> 26#include <linux/bitops.h>
25#include <linux/kthread.h> 27#include <linux/kthread.h>
26#include <asm/atomic.h> 28#include <asm/atomic.h>
29#include "raid6.h"
27 30
28#include <linux/raid/bitmap.h> 31#include <linux/raid/bitmap.h>
29 32
@@ -68,6 +71,16 @@
68#define __inline__ 71#define __inline__
69#endif 72#endif
70 73
74#if !RAID6_USE_EMPTY_ZERO_PAGE
75/* In .bss so it's zeroed */
76const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
77#endif
78
79static inline int raid6_next_disk(int disk, int raid_disks)
80{
81 disk++;
82 return (disk < raid_disks) ? disk : 0;
83}
71static void print_raid5_conf (raid5_conf_t *conf); 84static void print_raid5_conf (raid5_conf_t *conf);
72 85
73static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 86static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
@@ -104,7 +117,7 @@ static void release_stripe(struct stripe_head *sh)
104{ 117{
105 raid5_conf_t *conf = sh->raid_conf; 118 raid5_conf_t *conf = sh->raid_conf;
106 unsigned long flags; 119 unsigned long flags;
107 120
108 spin_lock_irqsave(&conf->device_lock, flags); 121 spin_lock_irqsave(&conf->device_lock, flags);
109 __release_stripe(conf, sh); 122 __release_stripe(conf, sh);
110 spin_unlock_irqrestore(&conf->device_lock, flags); 123 spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -117,7 +130,7 @@ static inline void remove_hash(struct stripe_head *sh)
117 hlist_del_init(&sh->hash); 130 hlist_del_init(&sh->hash);
118} 131}
119 132
120static void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) 133static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
121{ 134{
122 struct hlist_head *hp = stripe_hash(conf, sh->sector); 135 struct hlist_head *hp = stripe_hash(conf, sh->sector);
123 136
@@ -190,7 +203,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
190 (unsigned long long)sh->sector); 203 (unsigned long long)sh->sector);
191 204
192 remove_hash(sh); 205 remove_hash(sh);
193 206
194 sh->sector = sector; 207 sh->sector = sector;
195 sh->pd_idx = pd_idx; 208 sh->pd_idx = pd_idx;
196 sh->state = 0; 209 sh->state = 0;
@@ -269,8 +282,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
269 } else { 282 } else {
270 if (!test_bit(STRIPE_HANDLE, &sh->state)) 283 if (!test_bit(STRIPE_HANDLE, &sh->state))
271 atomic_inc(&conf->active_stripes); 284 atomic_inc(&conf->active_stripes);
272 if (!list_empty(&sh->lru)) 285 if (list_empty(&sh->lru))
273 list_del_init(&sh->lru); 286 BUG();
287 list_del_init(&sh->lru);
274 } 288 }
275 } 289 }
276 } while (sh == NULL); 290 } while (sh == NULL);
@@ -321,10 +335,9 @@ static int grow_stripes(raid5_conf_t *conf, int num)
321 return 1; 335 return 1;
322 conf->slab_cache = sc; 336 conf->slab_cache = sc;
323 conf->pool_size = devs; 337 conf->pool_size = devs;
324 while (num--) { 338 while (num--)
325 if (!grow_one_stripe(conf)) 339 if (!grow_one_stripe(conf))
326 return 1; 340 return 1;
327 }
328 return 0; 341 return 0;
329} 342}
330 343
@@ -631,8 +644,7 @@ static void raid5_build_block (struct stripe_head *sh, int i)
631 dev->req.bi_private = sh; 644 dev->req.bi_private = sh;
632 645
633 dev->flags = 0; 646 dev->flags = 0;
634 if (i != sh->pd_idx) 647 dev->sector = compute_blocknr(sh, i);
635 dev->sector = compute_blocknr(sh, i);
636} 648}
637 649
638static void error(mddev_t *mddev, mdk_rdev_t *rdev) 650static void error(mddev_t *mddev, mdk_rdev_t *rdev)
@@ -659,7 +671,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
659 " Operation continuing on %d devices\n", 671 " Operation continuing on %d devices\n",
660 bdevname(rdev->bdev,b), conf->working_disks); 672 bdevname(rdev->bdev,b), conf->working_disks);
661 } 673 }
662} 674}
663 675
664/* 676/*
665 * Input: a 'big' sector number, 677 * Input: a 'big' sector number,
@@ -697,9 +709,12 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
697 /* 709 /*
698 * Select the parity disk based on the user selected algorithm. 710 * Select the parity disk based on the user selected algorithm.
699 */ 711 */
700 if (conf->level == 4) 712 switch(conf->level) {
713 case 4:
701 *pd_idx = data_disks; 714 *pd_idx = data_disks;
702 else switch (conf->algorithm) { 715 break;
716 case 5:
717 switch (conf->algorithm) {
703 case ALGORITHM_LEFT_ASYMMETRIC: 718 case ALGORITHM_LEFT_ASYMMETRIC:
704 *pd_idx = data_disks - stripe % raid_disks; 719 *pd_idx = data_disks - stripe % raid_disks;
705 if (*dd_idx >= *pd_idx) 720 if (*dd_idx >= *pd_idx)
@@ -721,6 +736,39 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
721 default: 736 default:
722 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 737 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
723 conf->algorithm); 738 conf->algorithm);
739 }
740 break;
741 case 6:
742
743 /**** FIX THIS ****/
744 switch (conf->algorithm) {
745 case ALGORITHM_LEFT_ASYMMETRIC:
746 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
747 if (*pd_idx == raid_disks-1)
748 (*dd_idx)++; /* Q D D D P */
749 else if (*dd_idx >= *pd_idx)
750 (*dd_idx) += 2; /* D D P Q D */
751 break;
752 case ALGORITHM_RIGHT_ASYMMETRIC:
753 *pd_idx = stripe % raid_disks;
754 if (*pd_idx == raid_disks-1)
755 (*dd_idx)++; /* Q D D D P */
756 else if (*dd_idx >= *pd_idx)
757 (*dd_idx) += 2; /* D D P Q D */
758 break;
759 case ALGORITHM_LEFT_SYMMETRIC:
760 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
761 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
762 break;
763 case ALGORITHM_RIGHT_SYMMETRIC:
764 *pd_idx = stripe % raid_disks;
765 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
766 break;
767 default:
768 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
769 conf->algorithm);
770 }
771 break;
724 } 772 }
725 773
726 /* 774 /*
@@ -742,12 +790,17 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
742 int chunk_number, dummy1, dummy2, dd_idx = i; 790 int chunk_number, dummy1, dummy2, dd_idx = i;
743 sector_t r_sector; 791 sector_t r_sector;
744 792
793
745 chunk_offset = sector_div(new_sector, sectors_per_chunk); 794 chunk_offset = sector_div(new_sector, sectors_per_chunk);
746 stripe = new_sector; 795 stripe = new_sector;
747 BUG_ON(new_sector != stripe); 796 BUG_ON(new_sector != stripe);
748 797
749 798 if (i == sh->pd_idx)
750 switch (conf->algorithm) { 799 return 0;
800 switch(conf->level) {
801 case 4: break;
802 case 5:
803 switch (conf->algorithm) {
751 case ALGORITHM_LEFT_ASYMMETRIC: 804 case ALGORITHM_LEFT_ASYMMETRIC:
752 case ALGORITHM_RIGHT_ASYMMETRIC: 805 case ALGORITHM_RIGHT_ASYMMETRIC:
753 if (i > sh->pd_idx) 806 if (i > sh->pd_idx)
@@ -761,7 +814,37 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
761 break; 814 break;
762 default: 815 default:
763 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 816 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
817 conf->algorithm);
818 }
819 break;
820 case 6:
821 data_disks = raid_disks - 2;
822 if (i == raid6_next_disk(sh->pd_idx, raid_disks))
823 return 0; /* It is the Q disk */
824 switch (conf->algorithm) {
825 case ALGORITHM_LEFT_ASYMMETRIC:
826 case ALGORITHM_RIGHT_ASYMMETRIC:
827 if (sh->pd_idx == raid_disks-1)
828 i--; /* Q D D D P */
829 else if (i > sh->pd_idx)
830 i -= 2; /* D D P Q D */
831 break;
832 case ALGORITHM_LEFT_SYMMETRIC:
833 case ALGORITHM_RIGHT_SYMMETRIC:
834 if (sh->pd_idx == raid_disks-1)
835 i--; /* Q D D D P */
836 else {
837 /* D D P Q D */
838 if (i < sh->pd_idx)
839 i += raid_disks;
840 i -= (sh->pd_idx + 2);
841 }
842 break;
843 default:
844 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
764 conf->algorithm); 845 conf->algorithm);
846 }
847 break;
765 } 848 }
766 849
767 chunk_number = stripe * data_disks + i; 850 chunk_number = stripe * data_disks + i;
@@ -778,10 +861,11 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
778 861
779 862
780/* 863/*
781 * Copy data between a page in the stripe cache, and a bio. 864 * Copy data between a page in the stripe cache, and one or more bion
782 * There are no alignment or size guarantees between the page or the 865 * The page could align with the middle of the bio, or there could be
783 * bio except that there is some overlap. 866 * several bion, each with several bio_vecs, which cover part of the page
784 * All iovecs in the bio must be considered. 867 * Multiple bion are linked together on bi_next. There may be extras
868 * at the end of this list. We ignore them.
785 */ 869 */
786static void copy_data(int frombio, struct bio *bio, 870static void copy_data(int frombio, struct bio *bio,
787 struct page *page, 871 struct page *page,
@@ -810,7 +894,7 @@ static void copy_data(int frombio, struct bio *bio,
810 if (len > 0 && page_offset + len > STRIPE_SIZE) 894 if (len > 0 && page_offset + len > STRIPE_SIZE)
811 clen = STRIPE_SIZE - page_offset; 895 clen = STRIPE_SIZE - page_offset;
812 else clen = len; 896 else clen = len;
813 897
814 if (clen > 0) { 898 if (clen > 0) {
815 char *ba = __bio_kmap_atomic(bio, i, KM_USER0); 899 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
816 if (frombio) 900 if (frombio)
@@ -862,14 +946,14 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
862 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); 946 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
863} 947}
864 948
865static void compute_parity(struct stripe_head *sh, int method) 949static void compute_parity5(struct stripe_head *sh, int method)
866{ 950{
867 raid5_conf_t *conf = sh->raid_conf; 951 raid5_conf_t *conf = sh->raid_conf;
868 int i, pd_idx = sh->pd_idx, disks = sh->disks, count; 952 int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
869 void *ptr[MAX_XOR_BLOCKS]; 953 void *ptr[MAX_XOR_BLOCKS];
870 struct bio *chosen; 954 struct bio *chosen;
871 955
872 PRINTK("compute_parity, stripe %llu, method %d\n", 956 PRINTK("compute_parity5, stripe %llu, method %d\n",
873 (unsigned long long)sh->sector, method); 957 (unsigned long long)sh->sector, method);
874 958
875 count = 1; 959 count = 1;
@@ -956,9 +1040,195 @@ static void compute_parity(struct stripe_head *sh, int method)
956 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1040 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
957} 1041}
958 1042
1043static void compute_parity6(struct stripe_head *sh, int method)
1044{
1045 raid6_conf_t *conf = sh->raid_conf;
1046 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
1047 struct bio *chosen;
1048 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1049 void *ptrs[disks];
1050
1051 qd_idx = raid6_next_disk(pd_idx, disks);
1052 d0_idx = raid6_next_disk(qd_idx, disks);
1053
1054 PRINTK("compute_parity, stripe %llu, method %d\n",
1055 (unsigned long long)sh->sector, method);
1056
1057 switch(method) {
1058 case READ_MODIFY_WRITE:
1059 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
1060 case RECONSTRUCT_WRITE:
1061 for (i= disks; i-- ;)
1062 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
1063 chosen = sh->dev[i].towrite;
1064 sh->dev[i].towrite = NULL;
1065
1066 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1067 wake_up(&conf->wait_for_overlap);
1068
1069 if (sh->dev[i].written) BUG();
1070 sh->dev[i].written = chosen;
1071 }
1072 break;
1073 case CHECK_PARITY:
1074 BUG(); /* Not implemented yet */
1075 }
1076
1077 for (i = disks; i--;)
1078 if (sh->dev[i].written) {
1079 sector_t sector = sh->dev[i].sector;
1080 struct bio *wbi = sh->dev[i].written;
1081 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1082 copy_data(1, wbi, sh->dev[i].page, sector);
1083 wbi = r5_next_bio(wbi, sector);
1084 }
1085
1086 set_bit(R5_LOCKED, &sh->dev[i].flags);
1087 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1088 }
1089
1090// switch(method) {
1091// case RECONSTRUCT_WRITE:
1092// case CHECK_PARITY:
1093// case UPDATE_PARITY:
1094 /* Note that unlike RAID-5, the ordering of the disks matters greatly. */
1095 /* FIX: Is this ordering of drives even remotely optimal? */
1096 count = 0;
1097 i = d0_idx;
1098 do {
1099 ptrs[count++] = page_address(sh->dev[i].page);
1100 if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1101 printk("block %d/%d not uptodate on parity calc\n", i,count);
1102 i = raid6_next_disk(i, disks);
1103 } while ( i != d0_idx );
1104// break;
1105// }
1106
1107 raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
1108
1109 switch(method) {
1110 case RECONSTRUCT_WRITE:
1111 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1112 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1113 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1114 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
1115 break;
1116 case UPDATE_PARITY:
1117 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1118 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1119 break;
1120 }
1121}
1122
1123
1124/* Compute one missing block */
1125static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1126{
1127 raid6_conf_t *conf = sh->raid_conf;
1128 int i, count, disks = conf->raid_disks;
1129 void *ptr[MAX_XOR_BLOCKS], *p;
1130 int pd_idx = sh->pd_idx;
1131 int qd_idx = raid6_next_disk(pd_idx, disks);
1132
1133 PRINTK("compute_block_1, stripe %llu, idx %d\n",
1134 (unsigned long long)sh->sector, dd_idx);
1135
1136 if ( dd_idx == qd_idx ) {
1137 /* We're actually computing the Q drive */
1138 compute_parity6(sh, UPDATE_PARITY);
1139 } else {
1140 ptr[0] = page_address(sh->dev[dd_idx].page);
1141 if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
1142 count = 1;
1143 for (i = disks ; i--; ) {
1144 if (i == dd_idx || i == qd_idx)
1145 continue;
1146 p = page_address(sh->dev[i].page);
1147 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
1148 ptr[count++] = p;
1149 else
1150 printk("compute_block() %d, stripe %llu, %d"
1151 " not present\n", dd_idx,
1152 (unsigned long long)sh->sector, i);
1153
1154 check_xor();
1155 }
1156 if (count != 1)
1157 xor_block(count, STRIPE_SIZE, ptr);
1158 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1159 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1160 }
1161}
1162
1163/* Compute two missing blocks */
1164static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1165{
1166 raid6_conf_t *conf = sh->raid_conf;
1167 int i, count, disks = conf->raid_disks;
1168 int pd_idx = sh->pd_idx;
1169 int qd_idx = raid6_next_disk(pd_idx, disks);
1170 int d0_idx = raid6_next_disk(qd_idx, disks);
1171 int faila, failb;
1172
1173 /* faila and failb are disk numbers relative to d0_idx */
1174 /* pd_idx become disks-2 and qd_idx become disks-1 */
1175 faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
1176 failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
1177
1178 BUG_ON(faila == failb);
1179 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1180
1181 PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1182 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
1183
1184 if ( failb == disks-1 ) {
1185 /* Q disk is one of the missing disks */
1186 if ( faila == disks-2 ) {
1187 /* Missing P+Q, just recompute */
1188 compute_parity6(sh, UPDATE_PARITY);
1189 return;
1190 } else {
1191 /* We're missing D+Q; recompute D from P */
1192 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
1193 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1194 return;
1195 }
1196 }
1197
1198 /* We're missing D+P or D+D; build pointer table */
1199 {
1200 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1201 void *ptrs[disks];
1202
1203 count = 0;
1204 i = d0_idx;
1205 do {
1206 ptrs[count++] = page_address(sh->dev[i].page);
1207 i = raid6_next_disk(i, disks);
1208 if (i != dd_idx1 && i != dd_idx2 &&
1209 !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1210 printk("compute_2 with missing block %d/%d\n", count, i);
1211 } while ( i != d0_idx );
1212
1213 if ( failb == disks-2 ) {
1214 /* We're missing D+P. */
1215 raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
1216 } else {
1217 /* We're missing D+D. */
1218 raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
1219 }
1220
1221 /* Both the above update both missing blocks */
1222 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1223 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1224 }
1225}
1226
1227
1228
959/* 1229/*
960 * Each stripe/dev can have one or more bion attached. 1230 * Each stripe/dev can have one or more bion attached.
961 * toread/towrite point to the first in a chain. 1231 * toread/towrite point to the first in a chain.
962 * The bi_next chain must be in order. 1232 * The bi_next chain must be in order.
963 */ 1233 */
964static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 1234static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
@@ -1031,6 +1301,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1031 1301
1032static void end_reshape(raid5_conf_t *conf); 1302static void end_reshape(raid5_conf_t *conf);
1033 1303
1304static int page_is_zero(struct page *p)
1305{
1306 char *a = page_address(p);
1307 return ((*(u32*)a) == 0 &&
1308 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1309}
1310
1034static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) 1311static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1035{ 1312{
1036 int sectors_per_chunk = conf->chunk_size >> 9; 1313 int sectors_per_chunk = conf->chunk_size >> 9;
@@ -1062,7 +1339,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1062 * 1339 *
1063 */ 1340 */
1064 1341
1065static void handle_stripe(struct stripe_head *sh) 1342static void handle_stripe5(struct stripe_head *sh)
1066{ 1343{
1067 raid5_conf_t *conf = sh->raid_conf; 1344 raid5_conf_t *conf = sh->raid_conf;
1068 int disks = sh->disks; 1345 int disks = sh->disks;
@@ -1394,7 +1671,7 @@ static void handle_stripe(struct stripe_head *sh)
1394 if (locked == 0 && (rcw == 0 ||rmw == 0) && 1671 if (locked == 0 && (rcw == 0 ||rmw == 0) &&
1395 !test_bit(STRIPE_BIT_DELAY, &sh->state)) { 1672 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1396 PRINTK("Computing parity...\n"); 1673 PRINTK("Computing parity...\n");
1397 compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); 1674 compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1398 /* now every locked buffer is ready to be written */ 1675 /* now every locked buffer is ready to be written */
1399 for (i=disks; i--;) 1676 for (i=disks; i--;)
1400 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { 1677 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
@@ -1421,13 +1698,10 @@ static void handle_stripe(struct stripe_head *sh)
1421 !test_bit(STRIPE_INSYNC, &sh->state)) { 1698 !test_bit(STRIPE_INSYNC, &sh->state)) {
1422 set_bit(STRIPE_HANDLE, &sh->state); 1699 set_bit(STRIPE_HANDLE, &sh->state);
1423 if (failed == 0) { 1700 if (failed == 0) {
1424 char *pagea;
1425 BUG_ON(uptodate != disks); 1701 BUG_ON(uptodate != disks);
1426 compute_parity(sh, CHECK_PARITY); 1702 compute_parity5(sh, CHECK_PARITY);
1427 uptodate--; 1703 uptodate--;
1428 pagea = page_address(sh->dev[sh->pd_idx].page); 1704 if (page_is_zero(sh->dev[sh->pd_idx].page)) {
1429 if ((*(u32*)pagea) == 0 &&
1430 !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
1431 /* parity is correct (on disc, not in buffer any more) */ 1705 /* parity is correct (on disc, not in buffer any more) */
1432 set_bit(STRIPE_INSYNC, &sh->state); 1706 set_bit(STRIPE_INSYNC, &sh->state);
1433 } else { 1707 } else {
@@ -1487,7 +1761,7 @@ static void handle_stripe(struct stripe_head *sh)
1487 /* Need to write out all blocks after computing parity */ 1761 /* Need to write out all blocks after computing parity */
1488 sh->disks = conf->raid_disks; 1762 sh->disks = conf->raid_disks;
1489 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); 1763 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
1490 compute_parity(sh, RECONSTRUCT_WRITE); 1764 compute_parity5(sh, RECONSTRUCT_WRITE);
1491 for (i= conf->raid_disks; i--;) { 1765 for (i= conf->raid_disks; i--;) {
1492 set_bit(R5_LOCKED, &sh->dev[i].flags); 1766 set_bit(R5_LOCKED, &sh->dev[i].flags);
1493 locked++; 1767 locked++;
@@ -1615,6 +1889,569 @@ static void handle_stripe(struct stripe_head *sh)
1615 } 1889 }
1616} 1890}
1617 1891
1892static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1893{
1894 raid6_conf_t *conf = sh->raid_conf;
1895 int disks = conf->raid_disks;
1896 struct bio *return_bi= NULL;
1897 struct bio *bi;
1898 int i;
1899 int syncing;
1900 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1901 int non_overwrite = 0;
1902 int failed_num[2] = {0, 0};
1903 struct r5dev *dev, *pdev, *qdev;
1904 int pd_idx = sh->pd_idx;
1905 int qd_idx = raid6_next_disk(pd_idx, disks);
1906 int p_failed, q_failed;
1907
1908 PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
1909 (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
1910 pd_idx, qd_idx);
1911
1912 spin_lock(&sh->lock);
1913 clear_bit(STRIPE_HANDLE, &sh->state);
1914 clear_bit(STRIPE_DELAYED, &sh->state);
1915
1916 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1917 /* Now to look around and see what can be done */
1918
1919 rcu_read_lock();
1920 for (i=disks; i--; ) {
1921 mdk_rdev_t *rdev;
1922 dev = &sh->dev[i];
1923 clear_bit(R5_Insync, &dev->flags);
1924
1925 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
1926 i, dev->flags, dev->toread, dev->towrite, dev->written);
1927 /* maybe we can reply to a read */
1928 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
1929 struct bio *rbi, *rbi2;
1930 PRINTK("Return read for disc %d\n", i);
1931 spin_lock_irq(&conf->device_lock);
1932 rbi = dev->toread;
1933 dev->toread = NULL;
1934 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1935 wake_up(&conf->wait_for_overlap);
1936 spin_unlock_irq(&conf->device_lock);
1937 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1938 copy_data(0, rbi, dev->page, dev->sector);
1939 rbi2 = r5_next_bio(rbi, dev->sector);
1940 spin_lock_irq(&conf->device_lock);
1941 if (--rbi->bi_phys_segments == 0) {
1942 rbi->bi_next = return_bi;
1943 return_bi = rbi;
1944 }
1945 spin_unlock_irq(&conf->device_lock);
1946 rbi = rbi2;
1947 }
1948 }
1949
1950 /* now count some things */
1951 if (test_bit(R5_LOCKED, &dev->flags)) locked++;
1952 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
1953
1954
1955 if (dev->toread) to_read++;
1956 if (dev->towrite) {
1957 to_write++;
1958 if (!test_bit(R5_OVERWRITE, &dev->flags))
1959 non_overwrite++;
1960 }
1961 if (dev->written) written++;
1962 rdev = rcu_dereference(conf->disks[i].rdev);
1963 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1964 /* The ReadError flag will just be confusing now */
1965 clear_bit(R5_ReadError, &dev->flags);
1966 clear_bit(R5_ReWrite, &dev->flags);
1967 }
1968 if (!rdev || !test_bit(In_sync, &rdev->flags)
1969 || test_bit(R5_ReadError, &dev->flags)) {
1970 if ( failed < 2 )
1971 failed_num[failed] = i;
1972 failed++;
1973 } else
1974 set_bit(R5_Insync, &dev->flags);
1975 }
1976 rcu_read_unlock();
1977 PRINTK("locked=%d uptodate=%d to_read=%d"
1978 " to_write=%d failed=%d failed_num=%d,%d\n",
1979 locked, uptodate, to_read, to_write, failed,
1980 failed_num[0], failed_num[1]);
1981 /* check if the array has lost >2 devices and, if so, some requests might
1982 * need to be failed
1983 */
1984 if (failed > 2 && to_read+to_write+written) {
1985 for (i=disks; i--; ) {
1986 int bitmap_end = 0;
1987
1988 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1989 mdk_rdev_t *rdev;
1990 rcu_read_lock();
1991 rdev = rcu_dereference(conf->disks[i].rdev);
1992 if (rdev && test_bit(In_sync, &rdev->flags))
1993 /* multiple read failures in one stripe */
1994 md_error(conf->mddev, rdev);
1995 rcu_read_unlock();
1996 }
1997
1998 spin_lock_irq(&conf->device_lock);
1999 /* fail all writes first */
2000 bi = sh->dev[i].towrite;
2001 sh->dev[i].towrite = NULL;
2002 if (bi) { to_write--; bitmap_end = 1; }
2003
2004 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2005 wake_up(&conf->wait_for_overlap);
2006
2007 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2008 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2009 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2010 if (--bi->bi_phys_segments == 0) {
2011 md_write_end(conf->mddev);
2012 bi->bi_next = return_bi;
2013 return_bi = bi;
2014 }
2015 bi = nextbi;
2016 }
2017 /* and fail all 'written' */
2018 bi = sh->dev[i].written;
2019 sh->dev[i].written = NULL;
2020 if (bi) bitmap_end = 1;
2021 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
2022 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2023 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2024 if (--bi->bi_phys_segments == 0) {
2025 md_write_end(conf->mddev);
2026 bi->bi_next = return_bi;
2027 return_bi = bi;
2028 }
2029 bi = bi2;
2030 }
2031
2032 /* fail any reads if this device is non-operational */
2033 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2034 test_bit(R5_ReadError, &sh->dev[i].flags)) {
2035 bi = sh->dev[i].toread;
2036 sh->dev[i].toread = NULL;
2037 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2038 wake_up(&conf->wait_for_overlap);
2039 if (bi) to_read--;
2040 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2041 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2042 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2043 if (--bi->bi_phys_segments == 0) {
2044 bi->bi_next = return_bi;
2045 return_bi = bi;
2046 }
2047 bi = nextbi;
2048 }
2049 }
2050 spin_unlock_irq(&conf->device_lock);
2051 if (bitmap_end)
2052 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2053 STRIPE_SECTORS, 0, 0);
2054 }
2055 }
2056 if (failed > 2 && syncing) {
2057 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2058 clear_bit(STRIPE_SYNCING, &sh->state);
2059 syncing = 0;
2060 }
2061
2062 /*
2063 * might be able to return some write requests if the parity blocks
2064 * are safe, or on a failed drive
2065 */
2066 pdev = &sh->dev[pd_idx];
2067 p_failed = (failed >= 1 && failed_num[0] == pd_idx)
2068 || (failed >= 2 && failed_num[1] == pd_idx);
2069 qdev = &sh->dev[qd_idx];
2070 q_failed = (failed >= 1 && failed_num[0] == qd_idx)
2071 || (failed >= 2 && failed_num[1] == qd_idx);
2072
2073 if ( written &&
2074 ( p_failed || ((test_bit(R5_Insync, &pdev->flags)
2075 && !test_bit(R5_LOCKED, &pdev->flags)
2076 && test_bit(R5_UPTODATE, &pdev->flags))) ) &&
2077 ( q_failed || ((test_bit(R5_Insync, &qdev->flags)
2078 && !test_bit(R5_LOCKED, &qdev->flags)
2079 && test_bit(R5_UPTODATE, &qdev->flags))) ) ) {
2080 /* any written block on an uptodate or failed drive can be
2081 * returned. Note that if we 'wrote' to a failed drive,
2082 * it will be UPTODATE, but never LOCKED, so we don't need
2083 * to test 'failed' directly.
2084 */
2085 for (i=disks; i--; )
2086 if (sh->dev[i].written) {
2087 dev = &sh->dev[i];
2088 if (!test_bit(R5_LOCKED, &dev->flags) &&
2089 test_bit(R5_UPTODATE, &dev->flags) ) {
2090 /* We can return any write requests */
2091 int bitmap_end = 0;
2092 struct bio *wbi, *wbi2;
2093 PRINTK("Return write for stripe %llu disc %d\n",
2094 (unsigned long long)sh->sector, i);
2095 spin_lock_irq(&conf->device_lock);
2096 wbi = dev->written;
2097 dev->written = NULL;
2098 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
2099 wbi2 = r5_next_bio(wbi, dev->sector);
2100 if (--wbi->bi_phys_segments == 0) {
2101 md_write_end(conf->mddev);
2102 wbi->bi_next = return_bi;
2103 return_bi = wbi;
2104 }
2105 wbi = wbi2;
2106 }
2107 if (dev->towrite == NULL)
2108 bitmap_end = 1;
2109 spin_unlock_irq(&conf->device_lock);
2110 if (bitmap_end)
2111 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2112 STRIPE_SECTORS,
2113 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
2114 }
2115 }
2116 }
2117
2118 /* Now we might consider reading some blocks, either to check/generate
2119 * parity, or to satisfy requests
2120 * or to load a block that is being partially written.
2121 */
2122 if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
2123 for (i=disks; i--;) {
2124 dev = &sh->dev[i];
2125 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2126 (dev->toread ||
2127 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2128 syncing ||
2129 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
2130 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
2131 )
2132 ) {
2133 /* we would like to get this block, possibly
2134 * by computing it, but we might not be able to
2135 */
2136 if (uptodate == disks-1) {
2137 PRINTK("Computing stripe %llu block %d\n",
2138 (unsigned long long)sh->sector, i);
2139 compute_block_1(sh, i, 0);
2140 uptodate++;
2141 } else if ( uptodate == disks-2 && failed >= 2 ) {
2142 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
2143 int other;
2144 for (other=disks; other--;) {
2145 if ( other == i )
2146 continue;
2147 if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
2148 break;
2149 }
2150 BUG_ON(other < 0);
2151 PRINTK("Computing stripe %llu blocks %d,%d\n",
2152 (unsigned long long)sh->sector, i, other);
2153 compute_block_2(sh, i, other);
2154 uptodate += 2;
2155 } else if (test_bit(R5_Insync, &dev->flags)) {
2156 set_bit(R5_LOCKED, &dev->flags);
2157 set_bit(R5_Wantread, &dev->flags);
2158#if 0
2159 /* if I am just reading this block and we don't have
2160 a failed drive, or any pending writes then sidestep the cache */
2161 if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
2162 ! syncing && !failed && !to_write) {
2163 sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
2164 sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
2165 }
2166#endif
2167 locked++;
2168 PRINTK("Reading block %d (sync=%d)\n",
2169 i, syncing);
2170 }
2171 }
2172 }
2173 set_bit(STRIPE_HANDLE, &sh->state);
2174 }
2175
2176 /* now to consider writing and what else, if anything should be read */
2177 if (to_write) {
2178 int rcw=0, must_compute=0;
2179 for (i=disks ; i--;) {
2180 dev = &sh->dev[i];
2181 /* Would I have to read this buffer for reconstruct_write */
2182 if (!test_bit(R5_OVERWRITE, &dev->flags)
2183 && i != pd_idx && i != qd_idx
2184 && (!test_bit(R5_LOCKED, &dev->flags)
2185#if 0
2186 || sh->bh_page[i] != bh->b_page
2187#endif
2188 ) &&
2189 !test_bit(R5_UPTODATE, &dev->flags)) {
2190 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2191 else {
2192 PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
2193 must_compute++;
2194 }
2195 }
2196 }
2197 PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
2198 (unsigned long long)sh->sector, rcw, must_compute);
2199 set_bit(STRIPE_HANDLE, &sh->state);
2200
2201 if (rcw > 0)
2202 /* want reconstruct write, but need to get some data */
2203 for (i=disks; i--;) {
2204 dev = &sh->dev[i];
2205 if (!test_bit(R5_OVERWRITE, &dev->flags)
2206 && !(failed == 0 && (i == pd_idx || i == qd_idx))
2207 && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2208 test_bit(R5_Insync, &dev->flags)) {
2209 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
2210 {
2211 PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
2212 (unsigned long long)sh->sector, i);
2213 set_bit(R5_LOCKED, &dev->flags);
2214 set_bit(R5_Wantread, &dev->flags);
2215 locked++;
2216 } else {
2217 PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
2218 (unsigned long long)sh->sector, i);
2219 set_bit(STRIPE_DELAYED, &sh->state);
2220 set_bit(STRIPE_HANDLE, &sh->state);
2221 }
2222 }
2223 }
2224 /* now if nothing is locked, and if we have enough data, we can start a write request */
2225 if (locked == 0 && rcw == 0 &&
2226 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2227 if ( must_compute > 0 ) {
2228 /* We have failed blocks and need to compute them */
2229 switch ( failed ) {
2230 case 0: BUG();
2231 case 1: compute_block_1(sh, failed_num[0], 0); break;
2232 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
2233 default: BUG(); /* This request should have been failed? */
2234 }
2235 }
2236
2237 PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
2238 compute_parity6(sh, RECONSTRUCT_WRITE);
2239 /* now every locked buffer is ready to be written */
2240 for (i=disks; i--;)
2241 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2242 PRINTK("Writing stripe %llu block %d\n",
2243 (unsigned long long)sh->sector, i);
2244 locked++;
2245 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2246 }
2247 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2248 set_bit(STRIPE_INSYNC, &sh->state);
2249
2250 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2251 atomic_dec(&conf->preread_active_stripes);
2252 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
2253 md_wakeup_thread(conf->mddev->thread);
2254 }
2255 }
2256 }
2257
2258 /* maybe we need to check and possibly fix the parity for this stripe
2259 * Any reads will already have been scheduled, so we just see if enough data
2260 * is available
2261 */
2262 if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
2263 int update_p = 0, update_q = 0;
2264 struct r5dev *dev;
2265
2266 set_bit(STRIPE_HANDLE, &sh->state);
2267
2268 BUG_ON(failed>2);
2269 BUG_ON(uptodate < disks);
2270 /* Want to check and possibly repair P and Q.
2271 * However there could be one 'failed' device, in which
2272 * case we can only check one of them, possibly using the
2273 * other to generate missing data
2274 */
2275
2276 /* If !tmp_page, we cannot do the calculations,
2277 * but as we have set STRIPE_HANDLE, we will soon be called
2278 * by stripe_handle with a tmp_page - just wait until then.
2279 */
2280 if (tmp_page) {
2281 if (failed == q_failed) {
2282 /* The only possible failed device holds 'Q', so it makes
2283 * sense to check P (If anything else were failed, we would
2284 * have used P to recreate it).
2285 */
2286 compute_block_1(sh, pd_idx, 1);
2287 if (!page_is_zero(sh->dev[pd_idx].page)) {
2288 compute_block_1(sh,pd_idx,0);
2289 update_p = 1;
2290 }
2291 }
2292 if (!q_failed && failed < 2) {
2293 /* q is not failed, and we didn't use it to generate
2294 * anything, so it makes sense to check it
2295 */
2296 memcpy(page_address(tmp_page),
2297 page_address(sh->dev[qd_idx].page),
2298 STRIPE_SIZE);
2299 compute_parity6(sh, UPDATE_PARITY);
2300 if (memcmp(page_address(tmp_page),
2301 page_address(sh->dev[qd_idx].page),
2302 STRIPE_SIZE)!= 0) {
2303 clear_bit(STRIPE_INSYNC, &sh->state);
2304 update_q = 1;
2305 }
2306 }
2307 if (update_p || update_q) {
2308 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2309 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2310 /* don't try to repair!! */
2311 update_p = update_q = 0;
2312 }
2313
2314 /* now write out any block on a failed drive,
2315 * or P or Q if they need it
2316 */
2317
2318 if (failed == 2) {
2319 dev = &sh->dev[failed_num[1]];
2320 locked++;
2321 set_bit(R5_LOCKED, &dev->flags);
2322 set_bit(R5_Wantwrite, &dev->flags);
2323 }
2324 if (failed >= 1) {
2325 dev = &sh->dev[failed_num[0]];
2326 locked++;
2327 set_bit(R5_LOCKED, &dev->flags);
2328 set_bit(R5_Wantwrite, &dev->flags);
2329 }
2330
2331 if (update_p) {
2332 dev = &sh->dev[pd_idx];
2333 locked ++;
2334 set_bit(R5_LOCKED, &dev->flags);
2335 set_bit(R5_Wantwrite, &dev->flags);
2336 }
2337 if (update_q) {
2338 dev = &sh->dev[qd_idx];
2339 locked++;
2340 set_bit(R5_LOCKED, &dev->flags);
2341 set_bit(R5_Wantwrite, &dev->flags);
2342 }
2343 clear_bit(STRIPE_DEGRADED, &sh->state);
2344
2345 set_bit(STRIPE_INSYNC, &sh->state);
2346 }
2347 }
2348
2349 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
2350 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
2351 clear_bit(STRIPE_SYNCING, &sh->state);
2352 }
2353
2354 /* If the failed drives are just a ReadError, then we might need
2355 * to progress the repair/check process
2356 */
2357 if (failed <= 2 && ! conf->mddev->ro)
2358 for (i=0; i<failed;i++) {
2359 dev = &sh->dev[failed_num[i]];
2360 if (test_bit(R5_ReadError, &dev->flags)
2361 && !test_bit(R5_LOCKED, &dev->flags)
2362 && test_bit(R5_UPTODATE, &dev->flags)
2363 ) {
2364 if (!test_bit(R5_ReWrite, &dev->flags)) {
2365 set_bit(R5_Wantwrite, &dev->flags);
2366 set_bit(R5_ReWrite, &dev->flags);
2367 set_bit(R5_LOCKED, &dev->flags);
2368 } else {
2369 /* let's read it back */
2370 set_bit(R5_Wantread, &dev->flags);
2371 set_bit(R5_LOCKED, &dev->flags);
2372 }
2373 }
2374 }
2375 spin_unlock(&sh->lock);
2376
2377 while ((bi=return_bi)) {
2378 int bytes = bi->bi_size;
2379
2380 return_bi = bi->bi_next;
2381 bi->bi_next = NULL;
2382 bi->bi_size = 0;
2383 bi->bi_end_io(bi, bytes, 0);
2384 }
2385 for (i=disks; i-- ;) {
2386 int rw;
2387 struct bio *bi;
2388 mdk_rdev_t *rdev;
2389 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
2390 rw = 1;
2391 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
2392 rw = 0;
2393 else
2394 continue;
2395
2396 bi = &sh->dev[i].req;
2397
2398 bi->bi_rw = rw;
2399 if (rw)
2400 bi->bi_end_io = raid5_end_write_request;
2401 else
2402 bi->bi_end_io = raid5_end_read_request;
2403
2404 rcu_read_lock();
2405 rdev = rcu_dereference(conf->disks[i].rdev);
2406 if (rdev && test_bit(Faulty, &rdev->flags))
2407 rdev = NULL;
2408 if (rdev)
2409 atomic_inc(&rdev->nr_pending);
2410 rcu_read_unlock();
2411
2412 if (rdev) {
2413 if (syncing)
2414 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
2415
2416 bi->bi_bdev = rdev->bdev;
2417 PRINTK("for %llu schedule op %ld on disc %d\n",
2418 (unsigned long long)sh->sector, bi->bi_rw, i);
2419 atomic_inc(&sh->count);
2420 bi->bi_sector = sh->sector + rdev->data_offset;
2421 bi->bi_flags = 1 << BIO_UPTODATE;
2422 bi->bi_vcnt = 1;
2423 bi->bi_max_vecs = 1;
2424 bi->bi_idx = 0;
2425 bi->bi_io_vec = &sh->dev[i].vec;
2426 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
2427 bi->bi_io_vec[0].bv_offset = 0;
2428 bi->bi_size = STRIPE_SIZE;
2429 bi->bi_next = NULL;
2430 if (rw == WRITE &&
2431 test_bit(R5_ReWrite, &sh->dev[i].flags))
2432 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2433 generic_make_request(bi);
2434 } else {
2435 if (rw == 1)
2436 set_bit(STRIPE_DEGRADED, &sh->state);
2437 PRINTK("skip op %ld on disc %d for sector %llu\n",
2438 bi->bi_rw, i, (unsigned long long)sh->sector);
2439 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2440 set_bit(STRIPE_HANDLE, &sh->state);
2441 }
2442 }
2443}
2444
2445static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
2446{
2447 if (sh->raid_conf->level == 6)
2448 handle_stripe6(sh, tmp_page);
2449 else
2450 handle_stripe5(sh);
2451}
2452
2453
2454
1618static void raid5_activate_delayed(raid5_conf_t *conf) 2455static void raid5_activate_delayed(raid5_conf_t *conf)
1619{ 2456{
1620 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 2457 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
@@ -1753,7 +2590,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
1753 2590
1754 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 2591 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1755 DEFINE_WAIT(w); 2592 DEFINE_WAIT(w);
1756 int disks; 2593 int disks, data_disks;
1757 2594
1758 retry: 2595 retry:
1759 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 2596 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
@@ -1781,7 +2618,9 @@ static int make_request(request_queue_t *q, struct bio * bi)
1781 } 2618 }
1782 spin_unlock_irq(&conf->device_lock); 2619 spin_unlock_irq(&conf->device_lock);
1783 } 2620 }
1784 new_sector = raid5_compute_sector(logical_sector, disks, disks - 1, 2621 data_disks = disks - conf->max_degraded;
2622
2623 new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
1785 &dd_idx, &pd_idx, conf); 2624 &dd_idx, &pd_idx, conf);
1786 PRINTK("raid5: make_request, sector %llu logical %llu\n", 2625 PRINTK("raid5: make_request, sector %llu logical %llu\n",
1787 (unsigned long long)new_sector, 2626 (unsigned long long)new_sector,
@@ -1833,7 +2672,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
1833 } 2672 }
1834 finish_wait(&conf->wait_for_overlap, &w); 2673 finish_wait(&conf->wait_for_overlap, &w);
1835 raid5_plug_device(conf); 2674 raid5_plug_device(conf);
1836 handle_stripe(sh); 2675 handle_stripe(sh, NULL);
1837 release_stripe(sh); 2676 release_stripe(sh);
1838 } else { 2677 } else {
1839 /* cannot get stripe for read-ahead, just give-up */ 2678 /* cannot get stripe for read-ahead, just give-up */
@@ -1849,7 +2688,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
1849 if (remaining == 0) { 2688 if (remaining == 0) {
1850 int bytes = bi->bi_size; 2689 int bytes = bi->bi_size;
1851 2690
1852 if ( bio_data_dir(bi) == WRITE ) 2691 if ( rw == WRITE )
1853 md_write_end(mddev); 2692 md_write_end(mddev);
1854 bi->bi_size = 0; 2693 bi->bi_size = 0;
1855 bi->bi_end_io(bi, bytes, 0); 2694 bi->bi_end_io(bi, bytes, 0);
@@ -1865,9 +2704,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1865 int pd_idx; 2704 int pd_idx;
1866 sector_t first_sector, last_sector; 2705 sector_t first_sector, last_sector;
1867 int raid_disks = conf->raid_disks; 2706 int raid_disks = conf->raid_disks;
1868 int data_disks = raid_disks-1; 2707 int data_disks = raid_disks - conf->max_degraded;
1869 sector_t max_sector = mddev->size << 1; 2708 sector_t max_sector = mddev->size << 1;
1870 int sync_blocks; 2709 int sync_blocks;
2710 int still_degraded = 0;
2711 int i;
1871 2712
1872 if (sector_nr >= max_sector) { 2713 if (sector_nr >= max_sector) {
1873 /* just being told to finish up .. nothing much to do */ 2714 /* just being told to finish up .. nothing much to do */
@@ -1880,7 +2721,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1880 if (mddev->curr_resync < max_sector) /* aborted */ 2721 if (mddev->curr_resync < max_sector) /* aborted */
1881 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 2722 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1882 &sync_blocks, 1); 2723 &sync_blocks, 1);
1883 else /* compelted sync */ 2724 else /* completed sync */
1884 conf->fullsync = 0; 2725 conf->fullsync = 0;
1885 bitmap_close_sync(mddev->bitmap); 2726 bitmap_close_sync(mddev->bitmap);
1886 2727
@@ -2003,11 +2844,12 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
2003 } 2844 }
2004 return conf->chunk_size>>9; 2845 return conf->chunk_size>>9;
2005 } 2846 }
2006 /* if there is 1 or more failed drives and we are trying 2847 /* if there is too many failed drives and we are trying
2007 * to resync, then assert that we are finished, because there is 2848 * to resync, then assert that we are finished, because there is
2008 * nothing we can do. 2849 * nothing we can do.
2009 */ 2850 */
2010 if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2851 if (mddev->degraded >= (data_disks - raid_disks) &&
2852 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2011 sector_t rv = (mddev->size << 1) - sector_nr; 2853 sector_t rv = (mddev->size << 1) - sector_nr;
2012 *skipped = 1; 2854 *skipped = 1;
2013 return rv; 2855 return rv;
@@ -2026,17 +2868,26 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
2026 if (sh == NULL) { 2868 if (sh == NULL) {
2027 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); 2869 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
2028 /* make sure we don't swamp the stripe cache if someone else 2870 /* make sure we don't swamp the stripe cache if someone else
2029 * is trying to get access 2871 * is trying to get access
2030 */ 2872 */
2031 schedule_timeout_uninterruptible(1); 2873 schedule_timeout_uninterruptible(1);
2032 } 2874 }
2033 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); 2875 /* Need to check if array will still be degraded after recovery/resync
2034 spin_lock(&sh->lock); 2876 * We don't need to check the 'failed' flag as when that gets set,
2877 * recovery aborts.
2878 */
2879 for (i=0; i<mddev->raid_disks; i++)
2880 if (conf->disks[i].rdev == NULL)
2881 still_degraded = 1;
2882
2883 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
2884
2885 spin_lock(&sh->lock);
2035 set_bit(STRIPE_SYNCING, &sh->state); 2886 set_bit(STRIPE_SYNCING, &sh->state);
2036 clear_bit(STRIPE_INSYNC, &sh->state); 2887 clear_bit(STRIPE_INSYNC, &sh->state);
2037 spin_unlock(&sh->lock); 2888 spin_unlock(&sh->lock);
2038 2889
2039 handle_stripe(sh); 2890 handle_stripe(sh, NULL);
2040 release_stripe(sh); 2891 release_stripe(sh);
2041 2892
2042 return STRIPE_SECTORS; 2893 return STRIPE_SECTORS;
@@ -2091,7 +2942,7 @@ static void raid5d (mddev_t *mddev)
2091 spin_unlock_irq(&conf->device_lock); 2942 spin_unlock_irq(&conf->device_lock);
2092 2943
2093 handled++; 2944 handled++;
2094 handle_stripe(sh); 2945 handle_stripe(sh, conf->spare_page);
2095 release_stripe(sh); 2946 release_stripe(sh);
2096 2947
2097 spin_lock_irq(&conf->device_lock); 2948 spin_lock_irq(&conf->device_lock);
@@ -2181,8 +3032,8 @@ static int run(mddev_t *mddev)
2181 struct disk_info *disk; 3032 struct disk_info *disk;
2182 struct list_head *tmp; 3033 struct list_head *tmp;
2183 3034
2184 if (mddev->level != 5 && mddev->level != 4) { 3035 if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
2185 printk(KERN_ERR "raid5: %s: raid level not set to 4/5 (%d)\n", 3036 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
2186 mdname(mddev), mddev->level); 3037 mdname(mddev), mddev->level);
2187 return -EIO; 3038 return -EIO;
2188 } 3039 }
@@ -2251,6 +3102,11 @@ static int run(mddev_t *mddev)
2251 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 3102 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
2252 goto abort; 3103 goto abort;
2253 3104
3105 if (mddev->level == 6) {
3106 conf->spare_page = alloc_page(GFP_KERNEL);
3107 if (!conf->spare_page)
3108 goto abort;
3109 }
2254 spin_lock_init(&conf->device_lock); 3110 spin_lock_init(&conf->device_lock);
2255 init_waitqueue_head(&conf->wait_for_stripe); 3111 init_waitqueue_head(&conf->wait_for_stripe);
2256 init_waitqueue_head(&conf->wait_for_overlap); 3112 init_waitqueue_head(&conf->wait_for_overlap);
@@ -2282,12 +3138,16 @@ static int run(mddev_t *mddev)
2282 } 3138 }
2283 3139
2284 /* 3140 /*
2285 * 0 for a fully functional array, 1 for a degraded array. 3141 * 0 for a fully functional array, 1 or 2 for a degraded array.
2286 */ 3142 */
2287 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; 3143 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
2288 conf->mddev = mddev; 3144 conf->mddev = mddev;
2289 conf->chunk_size = mddev->chunk_size; 3145 conf->chunk_size = mddev->chunk_size;
2290 conf->level = mddev->level; 3146 conf->level = mddev->level;
3147 if (conf->level == 6)
3148 conf->max_degraded = 2;
3149 else
3150 conf->max_degraded = 1;
2291 conf->algorithm = mddev->layout; 3151 conf->algorithm = mddev->layout;
2292 conf->max_nr_stripes = NR_STRIPES; 3152 conf->max_nr_stripes = NR_STRIPES;
2293 conf->expand_progress = mddev->reshape_position; 3153 conf->expand_progress = mddev->reshape_position;
@@ -2296,6 +3156,11 @@ static int run(mddev_t *mddev)
2296 mddev->size &= ~(mddev->chunk_size/1024 -1); 3156 mddev->size &= ~(mddev->chunk_size/1024 -1);
2297 mddev->resync_max_sectors = mddev->size << 1; 3157 mddev->resync_max_sectors = mddev->size << 1;
2298 3158
3159 if (conf->level == 6 && conf->raid_disks < 4) {
3160 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
3161 mdname(mddev), conf->raid_disks);
3162 goto abort;
3163 }
2299 if (!conf->chunk_size || conf->chunk_size % 4) { 3164 if (!conf->chunk_size || conf->chunk_size % 4) {
2300 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", 3165 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
2301 conf->chunk_size, mdname(mddev)); 3166 conf->chunk_size, mdname(mddev));
@@ -2307,14 +3172,14 @@ static int run(mddev_t *mddev)
2307 conf->algorithm, mdname(mddev)); 3172 conf->algorithm, mdname(mddev));
2308 goto abort; 3173 goto abort;
2309 } 3174 }
2310 if (mddev->degraded > 1) { 3175 if (mddev->degraded > conf->max_degraded) {
2311 printk(KERN_ERR "raid5: not enough operational devices for %s" 3176 printk(KERN_ERR "raid5: not enough operational devices for %s"
2312 " (%d/%d failed)\n", 3177 " (%d/%d failed)\n",
2313 mdname(mddev), conf->failed_disks, conf->raid_disks); 3178 mdname(mddev), conf->failed_disks, conf->raid_disks);
2314 goto abort; 3179 goto abort;
2315 } 3180 }
2316 3181
2317 if (mddev->degraded == 1 && 3182 if (mddev->degraded > 0 &&
2318 mddev->recovery_cp != MaxSector) { 3183 mddev->recovery_cp != MaxSector) {
2319 if (mddev->ok_start_degraded) 3184 if (mddev->ok_start_degraded)
2320 printk(KERN_WARNING 3185 printk(KERN_WARNING
@@ -2379,10 +3244,11 @@ static int run(mddev_t *mddev)
2379 } 3244 }
2380 3245
2381 /* read-ahead size must cover two whole stripes, which is 3246 /* read-ahead size must cover two whole stripes, which is
2382 * 2 * (n-1) * chunksize where 'n' is the number of raid devices 3247 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
2383 */ 3248 */
2384 { 3249 {
2385 int stripe = (mddev->raid_disks-1) * 3250 int data_disks = conf->previous_raid_disks - conf->max_degraded;
3251 int stripe = data_disks *
2386 (mddev->chunk_size / PAGE_SIZE); 3252 (mddev->chunk_size / PAGE_SIZE);
2387 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 3253 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
2388 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 3254 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
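The read-ahead window still has to cover two full stripes; what changes is that the number of data chunks per stripe is now raid_disks minus max_degraded instead of a hard-coded raid_disks - 1. A small userspace sketch of the same arithmetic, with PAGE_SIZE, the chunk size and the disk counts as assumed example values:

/* sketch: pages of read-ahead needed to cover two whole stripes (illustrative) */
#include <stdio.h>

int main(void)
{
        unsigned int page_size    = 4096;            /* assumed PAGE_SIZE         */
        unsigned int chunk_size   = 64 * 1024;       /* assumed 64 KiB chunks     */
        unsigned int raid_disks   = 6;               /* assumed RAID-6 of 6 disks */
        unsigned int max_degraded = 2;
        unsigned int data_disks   = raid_disks - max_degraded;
        unsigned int stripe       = data_disks * (chunk_size / page_size);

        printf("one stripe spans %u pages; ra_pages should be at least %u\n",
               stripe, 2 * stripe);
        return 0;
}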
@@ -2393,12 +3259,14 @@ static int run(mddev_t *mddev)
2393 3259
2394 mddev->queue->unplug_fn = raid5_unplug_device; 3260 mddev->queue->unplug_fn = raid5_unplug_device;
2395 mddev->queue->issue_flush_fn = raid5_issue_flush; 3261 mddev->queue->issue_flush_fn = raid5_issue_flush;
2396 mddev->array_size = mddev->size * (conf->previous_raid_disks - 1); 3262 mddev->array_size = mddev->size * (conf->previous_raid_disks -
3263 conf->max_degraded);
2397 3264
2398 return 0; 3265 return 0;
2399abort: 3266abort:
2400 if (conf) { 3267 if (conf) {
2401 print_raid5_conf(conf); 3268 print_raid5_conf(conf);
3269 safe_put_page(conf->spare_page);
2402 kfree(conf->disks); 3270 kfree(conf->disks);
2403 kfree(conf->stripe_hashtbl); 3271 kfree(conf->stripe_hashtbl);
2404 kfree(conf); 3272 kfree(conf);
@@ -2427,23 +3295,23 @@ static int stop(mddev_t *mddev)
2427} 3295}
2428 3296
2429#if RAID5_DEBUG 3297#if RAID5_DEBUG
2430static void print_sh (struct stripe_head *sh) 3298static void print_sh (struct seq_file *seq, struct stripe_head *sh)
2431{ 3299{
2432 int i; 3300 int i;
2433 3301
2434 printk("sh %llu, pd_idx %d, state %ld.\n", 3302 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
2435 (unsigned long long)sh->sector, sh->pd_idx, sh->state); 3303 (unsigned long long)sh->sector, sh->pd_idx, sh->state);
2436 printk("sh %llu, count %d.\n", 3304 seq_printf(seq, "sh %llu, count %d.\n",
2437 (unsigned long long)sh->sector, atomic_read(&sh->count)); 3305 (unsigned long long)sh->sector, atomic_read(&sh->count));
2438 printk("sh %llu, ", (unsigned long long)sh->sector); 3306 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
2439 for (i = 0; i < sh->disks; i++) { 3307 for (i = 0; i < sh->disks; i++) {
2440 printk("(cache%d: %p %ld) ", 3308 seq_printf(seq, "(cache%d: %p %ld) ",
2441 i, sh->dev[i].page, sh->dev[i].flags); 3309 i, sh->dev[i].page, sh->dev[i].flags);
2442 } 3310 }
2443 printk("\n"); 3311 seq_printf(seq, "\n");
2444} 3312}
2445 3313
2446static void printall (raid5_conf_t *conf) 3314static void printall (struct seq_file *seq, raid5_conf_t *conf)
2447{ 3315{
2448 struct stripe_head *sh; 3316 struct stripe_head *sh;
2449 struct hlist_node *hn; 3317 struct hlist_node *hn;
@@ -2454,7 +3322,7 @@ static void printall (raid5_conf_t *conf)
2454 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { 3322 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
2455 if (sh->raid_conf != conf) 3323 if (sh->raid_conf != conf)
2456 continue; 3324 continue;
2457 print_sh(sh); 3325 print_sh(seq, sh);
2458 } 3326 }
2459 } 3327 }
2460 spin_unlock_irq(&conf->device_lock); 3328 spin_unlock_irq(&conf->device_lock);
@@ -2474,9 +3342,8 @@ static void status (struct seq_file *seq, mddev_t *mddev)
2474 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 3342 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
2475 seq_printf (seq, "]"); 3343 seq_printf (seq, "]");
2476#if RAID5_DEBUG 3344#if RAID5_DEBUG
2477#define D(x) \ 3345 seq_printf (seq, "\n");
2478 seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x)) 3346 printall(seq, conf);
2479 printall(conf);
2480#endif 3347#endif
2481} 3348}
2482 3349
@@ -2560,14 +3427,20 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
2560 int disk; 3427 int disk;
2561 struct disk_info *p; 3428 struct disk_info *p;
2562 3429
2563 if (mddev->degraded > 1) 3430 if (mddev->degraded > conf->max_degraded)
2564 /* no point adding a device */ 3431 /* no point adding a device */
2565 return 0; 3432 return 0;
2566 3433
2567 /* 3434 /*
2568 * find the disk ... 3435 * find the disk ... but prefer rdev->saved_raid_disk
3436 * if possible.
2569 */ 3437 */
2570 for (disk=0; disk < conf->raid_disks; disk++) 3438 if (rdev->saved_raid_disk >= 0 &&
3439 conf->disks[rdev->saved_raid_disk].rdev == NULL)
3440 disk = rdev->saved_raid_disk;
3441 else
3442 disk = 0;
3443 for ( ; disk < conf->raid_disks; disk++)
2571 if ((p=conf->disks + disk)->rdev == NULL) { 3444 if ((p=conf->disks + disk)->rdev == NULL) {
2572 clear_bit(In_sync, &rdev->flags); 3445 clear_bit(In_sync, &rdev->flags);
2573 rdev->raid_disk = disk; 3446 rdev->raid_disk = disk;
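raid5_add_disk() now prefers to return a device to the slot recorded in rdev->saved_raid_disk, falling back to a scan from slot 0 only when that slot is already occupied; this keeps bitmap-based recovery cheap for a device that briefly disappeared. A toy userspace model of the slot choice (the slot table and device names are illustrative, not kernel structures):

/* sketch: slot choice when re-adding a device (toy table, illustrative only) */
#include <stdio.h>

int main(void)
{
        const char *slot[6] = { "sda", "sdb", NULL, "sdd", NULL, "sdf" };
        int saved_raid_disk = 4;   /* assumed: the slot this device held before failing */
        int raid_disks = 6, disk;

        /* prefer the remembered slot if it is still empty, else scan from 0 */
        if (saved_raid_disk >= 0 && slot[saved_raid_disk] == NULL)
                disk = saved_raid_disk;
        else
                disk = 0;
        for ( ; disk < raid_disks; disk++)
                if (slot[disk] == NULL)
                        break;

        printf("re-adding device into slot %d\n", disk);
        return 0;
}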
@@ -2590,8 +3463,10 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
2590 * any io in the removed space completes, but it hardly seems 3463 * any io in the removed space completes, but it hardly seems
2591 * worth it. 3464 * worth it.
2592 */ 3465 */
3466 raid5_conf_t *conf = mddev_to_conf(mddev);
3467
2593 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 3468 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
2594 mddev->array_size = (sectors * (mddev->raid_disks-1))>>1; 3469 mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
2595 set_capacity(mddev->gendisk, mddev->array_size << 1); 3470 set_capacity(mddev->gendisk, mddev->array_size << 1);
2596 mddev->changed = 1; 3471 mddev->changed = 1;
2597 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { 3472 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
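raid5_resize() now reads max_degraded from the conf instead of assuming a single parity disk: the new per-device sector count is rounded down to whole chunks, multiplied by the number of data disks, and shifted right by one to convert 512-byte sectors into the KiB units md stores in array_size. A worked userspace sketch of that arithmetic with assumed example values:

/* sketch: per-device sectors -> md array_size in KiB (illustrative values) */
#include <stdio.h>

int main(void)
{
        unsigned long long sectors = 1953525168ULL;            /* assumed ~1 TB member   */
        unsigned long long chunk_sectors = (64 * 1024) / 512;  /* assumed 64 KiB chunks  */
        unsigned int raid_disks = 6, max_degraded = 2;          /* assumed RAID-6 of 6    */
        unsigned long long array_kib;

        sectors &= ~(chunk_sectors - 1);                        /* whole chunks only      */
        array_kib = (sectors * (raid_disks - max_degraded)) >> 1;

        printf("usable array size: %llu KiB\n", array_kib);
        return 0;
}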
@@ -2731,6 +3606,17 @@ static void end_reshape(raid5_conf_t *conf)
2731 conf->expand_progress = MaxSector; 3606 conf->expand_progress = MaxSector;
2732 spin_unlock_irq(&conf->device_lock); 3607 spin_unlock_irq(&conf->device_lock);
2733 conf->mddev->reshape_position = MaxSector; 3608 conf->mddev->reshape_position = MaxSector;
3609
3610 /* read-ahead size must cover two whole stripes, which is
3611 * 2 * (datadisks) * chunksize where 'datadisks' is the number of data devices
3612 */
3613 {
3614 int data_disks = conf->previous_raid_disks - conf->max_degraded;
3615 int stripe = data_disks *
3616 (conf->mddev->chunk_size / PAGE_SIZE);
3617 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3618 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3619 }
2734 } 3620 }
2735} 3621}
2736 3622
@@ -2762,6 +3648,23 @@ static void raid5_quiesce(mddev_t *mddev, int state)
2762 } 3648 }
2763} 3649}
2764 3650
3651static struct mdk_personality raid6_personality =
3652{
3653 .name = "raid6",
3654 .level = 6,
3655 .owner = THIS_MODULE,
3656 .make_request = make_request,
3657 .run = run,
3658 .stop = stop,
3659 .status = status,
3660 .error_handler = error,
3661 .hot_add_disk = raid5_add_disk,
3662 .hot_remove_disk= raid5_remove_disk,
3663 .spare_active = raid5_spare_active,
3664 .sync_request = sync_request,
3665 .resize = raid5_resize,
3666 .quiesce = raid5_quiesce,
3667};
2765static struct mdk_personality raid5_personality = 3668static struct mdk_personality raid5_personality =
2766{ 3669{
2767 .name = "raid5", 3670 .name = "raid5",
@@ -2804,6 +3707,12 @@ static struct mdk_personality raid4_personality =
2804 3707
2805static int __init raid5_init(void) 3708static int __init raid5_init(void)
2806{ 3709{
3710 int e;
3711
3712 e = raid6_select_algo();
3713 if ( e )
3714 return e;
3715 register_md_personality(&raid6_personality);
2807 register_md_personality(&raid5_personality); 3716 register_md_personality(&raid5_personality);
2808 register_md_personality(&raid4_personality); 3717 register_md_personality(&raid4_personality);
2809 return 0; 3718 return 0;
@@ -2811,6 +3720,7 @@ static int __init raid5_init(void)
2811 3720
2812static void raid5_exit(void) 3721static void raid5_exit(void)
2813{ 3722{
3723 unregister_md_personality(&raid6_personality);
2814 unregister_md_personality(&raid5_personality); 3724 unregister_md_personality(&raid5_personality);
2815 unregister_md_personality(&raid4_personality); 3725 unregister_md_personality(&raid4_personality);
2816} 3726}
@@ -2823,3 +3733,10 @@ MODULE_ALIAS("md-raid5");
2823MODULE_ALIAS("md-raid4"); 3733MODULE_ALIAS("md-raid4");
2824MODULE_ALIAS("md-level-5"); 3734MODULE_ALIAS("md-level-5");
2825MODULE_ALIAS("md-level-4"); 3735MODULE_ALIAS("md-level-4");
3736MODULE_ALIAS("md-personality-8"); /* RAID6 */
3737MODULE_ALIAS("md-raid6");
3738MODULE_ALIAS("md-level-6");
3739
3740/* This used to be two separate modules, they were: */
3741MODULE_ALIAS("raid5");
3742MODULE_ALIAS("raid6");
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
deleted file mode 100644
index e53d2d96ea3a..000000000000
--- a/drivers/md/raid6main.c
+++ /dev/null
@@ -1,2427 +0,0 @@
1/*
2 * raid6main.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-6 management functions. This code is derived from raid5.c.
8 * Last merge from raid5.c bkcvs version 1.79 (kernel 2.6.1).
9 *
10 * Thanks to Penguin Computing for making the RAID-6 development possible
11 * by donating a test server!
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2, or (at your option)
16 * any later version.
17 *
18 * You should have received a copy of the GNU General Public License
19 * (for example /usr/src/linux/COPYING); if not, write to the Free
20 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 */
22
23
24#include <linux/config.h>
25#include <linux/module.h>
26#include <linux/slab.h>
27#include <linux/highmem.h>
28#include <linux/bitops.h>
29#include <asm/atomic.h>
30#include "raid6.h"
31
32#include <linux/raid/bitmap.h>
33
34/*
35 * Stripe cache
36 */
37
38#define NR_STRIPES 256
39#define STRIPE_SIZE PAGE_SIZE
40#define STRIPE_SHIFT (PAGE_SHIFT - 9)
41#define STRIPE_SECTORS (STRIPE_SIZE>>9)
42#define IO_THRESHOLD 1
43#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
44#define HASH_MASK (NR_HASH - 1)
45
46#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
47
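The stripe cache behind these macros is a power-of-two hash keyed on the stripe's starting sector: shifting by STRIPE_SHIFT makes every sector of one stripe page map to the same bucket, and NR_HASH is chosen so the bucket array fills exactly one page. A userspace sketch of the bucket computation, assuming 4 KiB pages and one pointer per hlist_head:

/* sketch: which hash bucket a stripe's sector lands in (illustrative values) */
#include <stdio.h>

int main(void)
{
        unsigned int page_shift   = 12;              /* assumed 4 KiB pages       */
        unsigned int stripe_shift = page_shift - 9;  /* 512-byte sectors per page */
        unsigned int nr_hash      = (1u << page_shift) / sizeof(void *); /* one pointer per hlist_head (assumption) */
        unsigned int hash_mask    = nr_hash - 1;
        unsigned long long sector = 123456;

        printf("sector %llu hashes to bucket %llu of %u\n",
               sector, (sector >> stripe_shift) & hash_mask, nr_hash);
        return 0;
}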
48/* bio's attached to a stripe+device for I/O are linked together in bi_sector
49 * order without overlap. There may be several bio's per stripe+device, and
50 * a bio could span several devices.
51 * When walking this list for a particular stripe+device, we must never proceed
52 * beyond a bio that extends past this device, as the next bio might no longer
53 * be valid.
54 * This macro is used to determine the 'next' bio in the list, given the sector
55 * of the current stripe+device
56 */
57#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
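The r5_next_bio() rule above stops the walk along a stripe+device's bio chain at the first bio that extends past the current stripe chunk, since anything beyond it may belong to a different device. A toy userspace model of that cut-off, using a stand-in struct (bi_sector, bi_size, bi_next only; not the kernel's struct bio) and an assumed 4 KiB stripe:

/* toy model of the r5_next_bio() cut-off rule (stand-in struct, assumed 4 KiB stripe) */
#include <stdio.h>

struct toy_bio {
        unsigned long long bi_sector;   /* start, in 512-byte sectors */
        unsigned int bi_size;           /* length, in bytes           */
        struct toy_bio *bi_next;
};

#define TOY_STRIPE_SECTORS 8            /* assumed: 4096 >> 9 */
#define toy_next_bio(bio, sect) \
        (((bio)->bi_sector + ((bio)->bi_size >> 9) < (sect) + TOY_STRIPE_SECTORS) ? \
                (bio)->bi_next : NULL)

int main(void)
{
        struct toy_bio b2 = { 12, 4096, NULL };    /* ends at sector 20: past the chunk */
        struct toy_bio b1 = {  8, 2048, &b2 };     /* ends at sector 12: inside it      */

        printf("after b1: %s\n", toy_next_bio(&b1, 8) ? "keep walking" : "stop");
        printf("after b2: %s\n", toy_next_bio(&b2, 8) ? "keep walking" : "stop");
        return 0;
}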
58/*
59 * The following can be used to debug the driver
60 */
61#define RAID6_DEBUG 0 /* Extremely verbose printk */
62#define RAID6_PARANOIA 1 /* Check spinlocks */
63#define RAID6_DUMPSTATE 0 /* Include stripe cache state in /proc/mdstat */
64#if RAID6_PARANOIA && defined(CONFIG_SMP)
65# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
66#else
67# define CHECK_DEVLOCK()
68#endif
69
70#define PRINTK(x...) ((void)(RAID6_DEBUG && printk(KERN_DEBUG x)))
71#if RAID6_DEBUG
72#undef inline
73#undef __inline__
74#define inline
75#define __inline__
76#endif
77
78#if !RAID6_USE_EMPTY_ZERO_PAGE
79/* In .bss so it's zeroed */
80const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
81#endif
82
83static inline int raid6_next_disk(int disk, int raid_disks)
84{
85 disk++;
86 return (disk < raid_disks) ? disk : 0;
87}
88
89static void print_raid6_conf (raid6_conf_t *conf);
90
91static void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh)
92{
93 if (atomic_dec_and_test(&sh->count)) {
94 BUG_ON(!list_empty(&sh->lru));
95 BUG_ON(atomic_read(&conf->active_stripes)==0);
96 if (test_bit(STRIPE_HANDLE, &sh->state)) {
97 if (test_bit(STRIPE_DELAYED, &sh->state))
98 list_add_tail(&sh->lru, &conf->delayed_list);
99 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
100 conf->seq_write == sh->bm_seq)
101 list_add_tail(&sh->lru, &conf->bitmap_list);
102 else {
103 clear_bit(STRIPE_BIT_DELAY, &sh->state);
104 list_add_tail(&sh->lru, &conf->handle_list);
105 }
106 md_wakeup_thread(conf->mddev->thread);
107 } else {
108 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
109 atomic_dec(&conf->preread_active_stripes);
110 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
111 md_wakeup_thread(conf->mddev->thread);
112 }
113 list_add_tail(&sh->lru, &conf->inactive_list);
114 atomic_dec(&conf->active_stripes);
115 if (!conf->inactive_blocked ||
116 atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4))
117 wake_up(&conf->wait_for_stripe);
118 }
119 }
120}
121static void release_stripe(struct stripe_head *sh)
122{
123 raid6_conf_t *conf = sh->raid_conf;
124 unsigned long flags;
125
126 spin_lock_irqsave(&conf->device_lock, flags);
127 __release_stripe(conf, sh);
128 spin_unlock_irqrestore(&conf->device_lock, flags);
129}
130
131static inline void remove_hash(struct stripe_head *sh)
132{
133 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
134
135 hlist_del_init(&sh->hash);
136}
137
138static inline void insert_hash(raid6_conf_t *conf, struct stripe_head *sh)
139{
140 struct hlist_head *hp = stripe_hash(conf, sh->sector);
141
142 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
143
144 CHECK_DEVLOCK();
145 hlist_add_head(&sh->hash, hp);
146}
147
148
149/* find an idle stripe, make sure it is unhashed, and return it. */
150static struct stripe_head *get_free_stripe(raid6_conf_t *conf)
151{
152 struct stripe_head *sh = NULL;
153 struct list_head *first;
154
155 CHECK_DEVLOCK();
156 if (list_empty(&conf->inactive_list))
157 goto out;
158 first = conf->inactive_list.next;
159 sh = list_entry(first, struct stripe_head, lru);
160 list_del_init(first);
161 remove_hash(sh);
162 atomic_inc(&conf->active_stripes);
163out:
164 return sh;
165}
166
167static void shrink_buffers(struct stripe_head *sh, int num)
168{
169 struct page *p;
170 int i;
171
172 for (i=0; i<num ; i++) {
173 p = sh->dev[i].page;
174 if (!p)
175 continue;
176 sh->dev[i].page = NULL;
177 put_page(p);
178 }
179}
180
181static int grow_buffers(struct stripe_head *sh, int num)
182{
183 int i;
184
185 for (i=0; i<num; i++) {
186 struct page *page;
187
188 if (!(page = alloc_page(GFP_KERNEL))) {
189 return 1;
190 }
191 sh->dev[i].page = page;
192 }
193 return 0;
194}
195
196static void raid6_build_block (struct stripe_head *sh, int i);
197
198static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
199{
200 raid6_conf_t *conf = sh->raid_conf;
201 int disks = conf->raid_disks, i;
202
203 BUG_ON(atomic_read(&sh->count) != 0);
204 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
205
206 CHECK_DEVLOCK();
207 PRINTK("init_stripe called, stripe %llu\n",
208 (unsigned long long)sh->sector);
209
210 remove_hash(sh);
211
212 sh->sector = sector;
213 sh->pd_idx = pd_idx;
214 sh->state = 0;
215
216 for (i=disks; i--; ) {
217 struct r5dev *dev = &sh->dev[i];
218
219 if (dev->toread || dev->towrite || dev->written ||
220 test_bit(R5_LOCKED, &dev->flags)) {
221 PRINTK("sector=%llx i=%d %p %p %p %d\n",
222 (unsigned long long)sh->sector, i, dev->toread,
223 dev->towrite, dev->written,
224 test_bit(R5_LOCKED, &dev->flags));
225 BUG();
226 }
227 dev->flags = 0;
228 raid6_build_block(sh, i);
229 }
230 insert_hash(conf, sh);
231}
232
233static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector)
234{
235 struct stripe_head *sh;
236 struct hlist_node *hn;
237
238 CHECK_DEVLOCK();
239 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
240 hlist_for_each_entry (sh, hn, stripe_hash(conf, sector), hash)
241 if (sh->sector == sector)
242 return sh;
243 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
244 return NULL;
245}
246
247static void unplug_slaves(mddev_t *mddev);
248
249static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector,
250 int pd_idx, int noblock)
251{
252 struct stripe_head *sh;
253
254 PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector);
255
256 spin_lock_irq(&conf->device_lock);
257
258 do {
259 wait_event_lock_irq(conf->wait_for_stripe,
260 conf->quiesce == 0,
261 conf->device_lock, /* nothing */);
262 sh = __find_stripe(conf, sector);
263 if (!sh) {
264 if (!conf->inactive_blocked)
265 sh = get_free_stripe(conf);
266 if (noblock && sh == NULL)
267 break;
268 if (!sh) {
269 conf->inactive_blocked = 1;
270 wait_event_lock_irq(conf->wait_for_stripe,
271 !list_empty(&conf->inactive_list) &&
272 (atomic_read(&conf->active_stripes)
273 < (conf->max_nr_stripes *3/4)
274 || !conf->inactive_blocked),
275 conf->device_lock,
276 unplug_slaves(conf->mddev);
277 );
278 conf->inactive_blocked = 0;
279 } else
280 init_stripe(sh, sector, pd_idx);
281 } else {
282 if (atomic_read(&sh->count)) {
283 BUG_ON(!list_empty(&sh->lru));
284 } else {
285 if (!test_bit(STRIPE_HANDLE, &sh->state))
286 atomic_inc(&conf->active_stripes);
287 BUG_ON(list_empty(&sh->lru));
288 list_del_init(&sh->lru);
289 }
290 }
291 } while (sh == NULL);
292
293 if (sh)
294 atomic_inc(&sh->count);
295
296 spin_unlock_irq(&conf->device_lock);
297 return sh;
298}
299
300static int grow_one_stripe(raid6_conf_t *conf)
301{
302 struct stripe_head *sh;
303 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
304 if (!sh)
305 return 0;
306 memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
307 sh->raid_conf = conf;
308 spin_lock_init(&sh->lock);
309
310 if (grow_buffers(sh, conf->raid_disks)) {
311 shrink_buffers(sh, conf->raid_disks);
312 kmem_cache_free(conf->slab_cache, sh);
313 return 0;
314 }
315 /* we just created an active stripe so... */
316 atomic_set(&sh->count, 1);
317 atomic_inc(&conf->active_stripes);
318 INIT_LIST_HEAD(&sh->lru);
319 release_stripe(sh);
320 return 1;
321}
322
323static int grow_stripes(raid6_conf_t *conf, int num)
324{
325 kmem_cache_t *sc;
326 int devs = conf->raid_disks;
327
328 sprintf(conf->cache_name[0], "raid6/%s", mdname(conf->mddev));
329
330 sc = kmem_cache_create(conf->cache_name[0],
331 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
332 0, 0, NULL, NULL);
333 if (!sc)
334 return 1;
335 conf->slab_cache = sc;
336 while (num--)
337 if (!grow_one_stripe(conf))
338 return 1;
339 return 0;
340}
341
342static int drop_one_stripe(raid6_conf_t *conf)
343{
344 struct stripe_head *sh;
345 spin_lock_irq(&conf->device_lock);
346 sh = get_free_stripe(conf);
347 spin_unlock_irq(&conf->device_lock);
348 if (!sh)
349 return 0;
350 BUG_ON(atomic_read(&sh->count));
351 shrink_buffers(sh, conf->raid_disks);
352 kmem_cache_free(conf->slab_cache, sh);
353 atomic_dec(&conf->active_stripes);
354 return 1;
355}
356
357static void shrink_stripes(raid6_conf_t *conf)
358{
359 while (drop_one_stripe(conf))
360 ;
361
362 if (conf->slab_cache)
363 kmem_cache_destroy(conf->slab_cache);
364 conf->slab_cache = NULL;
365}
366
367static int raid6_end_read_request(struct bio * bi, unsigned int bytes_done,
368 int error)
369{
370 struct stripe_head *sh = bi->bi_private;
371 raid6_conf_t *conf = sh->raid_conf;
372 int disks = conf->raid_disks, i;
373 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
374
375 if (bi->bi_size)
376 return 1;
377
378 for (i=0 ; i<disks; i++)
379 if (bi == &sh->dev[i].req)
380 break;
381
382 PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n",
383 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
384 uptodate);
385 if (i == disks) {
386 BUG();
387 return 0;
388 }
389
390 if (uptodate) {
391#if 0
392 struct bio *bio;
393 unsigned long flags;
394 spin_lock_irqsave(&conf->device_lock, flags);
395 /* we can return a buffer if we bypassed the cache or
396 * if the top buffer is not in highmem. If there are
397 * multiple buffers, leave the extra work to
398 * handle_stripe
399 */
400 buffer = sh->bh_read[i];
401 if (buffer &&
402 (!PageHighMem(buffer->b_page)
403 || buffer->b_page == bh->b_page )
404 ) {
405 sh->bh_read[i] = buffer->b_reqnext;
406 buffer->b_reqnext = NULL;
407 } else
408 buffer = NULL;
409 spin_unlock_irqrestore(&conf->device_lock, flags);
410 if (sh->bh_page[i]==bh->b_page)
411 set_buffer_uptodate(bh);
412 if (buffer) {
413 if (buffer->b_page != bh->b_page)
414 memcpy(buffer->b_data, bh->b_data, bh->b_size);
415 buffer->b_end_io(buffer, 1);
416 }
417#else
418 set_bit(R5_UPTODATE, &sh->dev[i].flags);
419#endif
420 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
421 printk(KERN_INFO "raid6: read error corrected!!\n");
422 clear_bit(R5_ReadError, &sh->dev[i].flags);
423 clear_bit(R5_ReWrite, &sh->dev[i].flags);
424 }
425 if (atomic_read(&conf->disks[i].rdev->read_errors))
426 atomic_set(&conf->disks[i].rdev->read_errors, 0);
427 } else {
428 int retry = 0;
429 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
430 atomic_inc(&conf->disks[i].rdev->read_errors);
431 if (conf->mddev->degraded)
432 printk(KERN_WARNING "raid6: read error not correctable.\n");
433 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
434 /* Oh, no!!! */
435 printk(KERN_WARNING "raid6: read error NOT corrected!!\n");
436 else if (atomic_read(&conf->disks[i].rdev->read_errors)
437 > conf->max_nr_stripes)
438 printk(KERN_WARNING
439 "raid6: Too many read errors, failing device.\n");
440 else
441 retry = 1;
442 if (retry)
443 set_bit(R5_ReadError, &sh->dev[i].flags);
444 else {
445 clear_bit(R5_ReadError, &sh->dev[i].flags);
446 clear_bit(R5_ReWrite, &sh->dev[i].flags);
447 md_error(conf->mddev, conf->disks[i].rdev);
448 }
449 }
450 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
451#if 0
452 /* must restore b_page before unlocking buffer... */
453 if (sh->bh_page[i] != bh->b_page) {
454 bh->b_page = sh->bh_page[i];
455 bh->b_data = page_address(bh->b_page);
456 clear_buffer_uptodate(bh);
457 }
458#endif
459 clear_bit(R5_LOCKED, &sh->dev[i].flags);
460 set_bit(STRIPE_HANDLE, &sh->state);
461 release_stripe(sh);
462 return 0;
463}
464
465static int raid6_end_write_request (struct bio *bi, unsigned int bytes_done,
466 int error)
467{
468 struct stripe_head *sh = bi->bi_private;
469 raid6_conf_t *conf = sh->raid_conf;
470 int disks = conf->raid_disks, i;
471 unsigned long flags;
472 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
473
474 if (bi->bi_size)
475 return 1;
476
477 for (i=0 ; i<disks; i++)
478 if (bi == &sh->dev[i].req)
479 break;
480
481 PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
482 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
483 uptodate);
484 if (i == disks) {
485 BUG();
486 return 0;
487 }
488
489 spin_lock_irqsave(&conf->device_lock, flags);
490 if (!uptodate)
491 md_error(conf->mddev, conf->disks[i].rdev);
492
493 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
494
495 clear_bit(R5_LOCKED, &sh->dev[i].flags);
496 set_bit(STRIPE_HANDLE, &sh->state);
497 __release_stripe(conf, sh);
498 spin_unlock_irqrestore(&conf->device_lock, flags);
499 return 0;
500}
501
502
503static sector_t compute_blocknr(struct stripe_head *sh, int i);
504
505static void raid6_build_block (struct stripe_head *sh, int i)
506{
507 struct r5dev *dev = &sh->dev[i];
508 int pd_idx = sh->pd_idx;
509 int qd_idx = raid6_next_disk(pd_idx, sh->raid_conf->raid_disks);
510
511 bio_init(&dev->req);
512 dev->req.bi_io_vec = &dev->vec;
513 dev->req.bi_vcnt++;
514 dev->req.bi_max_vecs++;
515 dev->vec.bv_page = dev->page;
516 dev->vec.bv_len = STRIPE_SIZE;
517 dev->vec.bv_offset = 0;
518
519 dev->req.bi_sector = sh->sector;
520 dev->req.bi_private = sh;
521
522 dev->flags = 0;
523 if (i != pd_idx && i != qd_idx)
524 dev->sector = compute_blocknr(sh, i);
525}
526
527static void error(mddev_t *mddev, mdk_rdev_t *rdev)
528{
529 char b[BDEVNAME_SIZE];
530 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
531 PRINTK("raid6: error called\n");
532
533 if (!test_bit(Faulty, &rdev->flags)) {
534 mddev->sb_dirty = 1;
535 if (test_bit(In_sync, &rdev->flags)) {
536 conf->working_disks--;
537 mddev->degraded++;
538 conf->failed_disks++;
539 clear_bit(In_sync, &rdev->flags);
540 /*
541 * if recovery was running, make sure it aborts.
542 */
543 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
544 }
545 set_bit(Faulty, &rdev->flags);
546 printk (KERN_ALERT
547 "raid6: Disk failure on %s, disabling device."
548 " Operation continuing on %d devices\n",
549 bdevname(rdev->bdev,b), conf->working_disks);
550 }
551}
552
553/*
554 * Input: a 'big' sector number,
555 * Output: index of the data and parity disk, and the sector # in them.
556 */
557static sector_t raid6_compute_sector(sector_t r_sector, unsigned int raid_disks,
558 unsigned int data_disks, unsigned int * dd_idx,
559 unsigned int * pd_idx, raid6_conf_t *conf)
560{
561 long stripe;
562 unsigned long chunk_number;
563 unsigned int chunk_offset;
564 sector_t new_sector;
565 int sectors_per_chunk = conf->chunk_size >> 9;
566
567 /* First compute the information on this sector */
568
569 /*
570 * Compute the chunk number and the sector offset inside the chunk
571 */
572 chunk_offset = sector_div(r_sector, sectors_per_chunk);
573 chunk_number = r_sector;
574 if ( r_sector != chunk_number ) {
575 printk(KERN_CRIT "raid6: ERROR: r_sector = %llu, chunk_number = %lu\n",
576 (unsigned long long)r_sector, (unsigned long)chunk_number);
577 BUG();
578 }
579
580 /*
581 * Compute the stripe number
582 */
583 stripe = chunk_number / data_disks;
584
585 /*
586 * Compute the data disk and parity disk indexes inside the stripe
587 */
588 *dd_idx = chunk_number % data_disks;
589
590 /*
591 * Select the parity disk based on the user selected algorithm.
592 */
593
594 /**** FIX THIS ****/
595 switch (conf->algorithm) {
596 case ALGORITHM_LEFT_ASYMMETRIC:
597 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
598 if (*pd_idx == raid_disks-1)
599 (*dd_idx)++; /* Q D D D P */
600 else if (*dd_idx >= *pd_idx)
601 (*dd_idx) += 2; /* D D P Q D */
602 break;
603 case ALGORITHM_RIGHT_ASYMMETRIC:
604 *pd_idx = stripe % raid_disks;
605 if (*pd_idx == raid_disks-1)
606 (*dd_idx)++; /* Q D D D P */
607 else if (*dd_idx >= *pd_idx)
608 (*dd_idx) += 2; /* D D P Q D */
609 break;
610 case ALGORITHM_LEFT_SYMMETRIC:
611 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
612 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
613 break;
614 case ALGORITHM_RIGHT_SYMMETRIC:
615 *pd_idx = stripe % raid_disks;
616 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
617 break;
618 default:
619 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
620 conf->algorithm);
621 }
622
623 PRINTK("raid6: chunk_number = %lu, pd_idx = %u, dd_idx = %u\n",
624 chunk_number, *pd_idx, *dd_idx);
625
626 /*
627 * Finally, compute the new sector number
628 */
629 new_sector = (sector_t) stripe * sectors_per_chunk + chunk_offset;
630 return new_sector;
631}
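A worked example makes the layout switch above easier to follow: with ALGORITHM_LEFT_ASYMMETRIC the parity disk steps backwards one slot per stripe, Q always occupies the next disk after P (wrapping to disk 0 when P is last, which is why the data index gets bumped in the 'Q D D D P' case), and data indexes are shifted around the P/Q pair. A userspace sketch of just that case for an assumed 5-disk array:

/* sketch: LEFT_ASYMMETRIC placement for an assumed 5-disk RAID-6 (illustrative) */
#include <stdio.h>

int main(void)
{
        int raid_disks = 5, data_disks = raid_disks - 2;
        long chunk_number;

        for (chunk_number = 0; chunk_number < 6; chunk_number++) {
                long stripe = chunk_number / data_disks;
                int dd_idx  = chunk_number % data_disks;
                int pd_idx  = raid_disks - 1 - (int)(stripe % raid_disks);
                int qd_idx  = (pd_idx + 1) % raid_disks;   /* Q always follows P */

                if (pd_idx == raid_disks - 1)
                        dd_idx++;                          /* Q D D D P */
                else if (dd_idx >= pd_idx)
                        dd_idx += 2;                       /* D D P Q D */

                printf("chunk %ld: stripe %ld, P=%d Q=%d, data on disk %d\n",
                       chunk_number, stripe, pd_idx, qd_idx, dd_idx);
        }
        return 0;
}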
632
633
634static sector_t compute_blocknr(struct stripe_head *sh, int i)
635{
636 raid6_conf_t *conf = sh->raid_conf;
637 int raid_disks = conf->raid_disks, data_disks = raid_disks - 2;
638 sector_t new_sector = sh->sector, check;
639 int sectors_per_chunk = conf->chunk_size >> 9;
640 sector_t stripe;
641 int chunk_offset;
642 int chunk_number, dummy1, dummy2, dd_idx = i;
643 sector_t r_sector;
644 int i0 = i;
645
646 chunk_offset = sector_div(new_sector, sectors_per_chunk);
647 stripe = new_sector;
648 if ( new_sector != stripe ) {
649 printk(KERN_CRIT "raid6: ERROR: new_sector = %llu, stripe = %lu\n",
650 (unsigned long long)new_sector, (unsigned long)stripe);
651 BUG();
652 }
653
654 switch (conf->algorithm) {
655 case ALGORITHM_LEFT_ASYMMETRIC:
656 case ALGORITHM_RIGHT_ASYMMETRIC:
657 if (sh->pd_idx == raid_disks-1)
658 i--; /* Q D D D P */
659 else if (i > sh->pd_idx)
660 i -= 2; /* D D P Q D */
661 break;
662 case ALGORITHM_LEFT_SYMMETRIC:
663 case ALGORITHM_RIGHT_SYMMETRIC:
664 if (sh->pd_idx == raid_disks-1)
665 i--; /* Q D D D P */
666 else {
667 /* D D P Q D */
668 if (i < sh->pd_idx)
669 i += raid_disks;
670 i -= (sh->pd_idx + 2);
671 }
672 break;
673 default:
674 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
675 conf->algorithm);
676 }
677
678 PRINTK("raid6: compute_blocknr: pd_idx = %u, i0 = %u, i = %u\n", sh->pd_idx, i0, i);
679
680 chunk_number = stripe * data_disks + i;
681 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
682
683 check = raid6_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
684 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
685 printk(KERN_CRIT "raid6: compute_blocknr: map not correct\n");
686 return 0;
687 }
688 return r_sector;
689}
690
691
692
693/*
694 * Copy data between a page in the stripe cache, and one or more bion
695 * The page could align with the middle of the bio, or there could be
696 * several bion, each with several bio_vecs, which cover part of the page
697 * Multiple bion are linked together on bi_next. There may be extras
698 * at the end of this list. We ignore them.
699 */
700static void copy_data(int frombio, struct bio *bio,
701 struct page *page,
702 sector_t sector)
703{
704 char *pa = page_address(page);
705 struct bio_vec *bvl;
706 int i;
707 int page_offset;
708
709 if (bio->bi_sector >= sector)
710 page_offset = (signed)(bio->bi_sector - sector) * 512;
711 else
712 page_offset = (signed)(sector - bio->bi_sector) * -512;
713 bio_for_each_segment(bvl, bio, i) {
714 int len = bio_iovec_idx(bio,i)->bv_len;
715 int clen;
716 int b_offset = 0;
717
718 if (page_offset < 0) {
719 b_offset = -page_offset;
720 page_offset += b_offset;
721 len -= b_offset;
722 }
723
724 if (len > 0 && page_offset + len > STRIPE_SIZE)
725 clen = STRIPE_SIZE - page_offset;
726 else clen = len;
727
728 if (clen > 0) {
729 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
730 if (frombio)
731 memcpy(pa+page_offset, ba+b_offset, clen);
732 else
733 memcpy(ba+b_offset, pa+page_offset, clen);
734 __bio_kunmap_atomic(ba, KM_USER0);
735 }
736 if (clen < len) /* hit end of page */
737 break;
738 page_offset += len;
739 }
740}
741
742#define check_xor() do { \
743 if (count == MAX_XOR_BLOCKS) { \
744 xor_block(count, STRIPE_SIZE, ptr); \
745 count = 1; \
746 } \
747 } while(0)
748
749/* Compute P and Q syndromes */
750static void compute_parity(struct stripe_head *sh, int method)
751{
752 raid6_conf_t *conf = sh->raid_conf;
753 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
754 struct bio *chosen;
755 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
756 void *ptrs[disks];
757
758 qd_idx = raid6_next_disk(pd_idx, disks);
759 d0_idx = raid6_next_disk(qd_idx, disks);
760
761 PRINTK("compute_parity, stripe %llu, method %d\n",
762 (unsigned long long)sh->sector, method);
763
764 switch(method) {
765 case READ_MODIFY_WRITE:
766 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
767 case RECONSTRUCT_WRITE:
768 for (i= disks; i-- ;)
769 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
770 chosen = sh->dev[i].towrite;
771 sh->dev[i].towrite = NULL;
772
773 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
774 wake_up(&conf->wait_for_overlap);
775
776 BUG_ON(sh->dev[i].written);
777 sh->dev[i].written = chosen;
778 }
779 break;
780 case CHECK_PARITY:
781 BUG(); /* Not implemented yet */
782 }
783
784 for (i = disks; i--;)
785 if (sh->dev[i].written) {
786 sector_t sector = sh->dev[i].sector;
787 struct bio *wbi = sh->dev[i].written;
788 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
789 copy_data(1, wbi, sh->dev[i].page, sector);
790 wbi = r5_next_bio(wbi, sector);
791 }
792
793 set_bit(R5_LOCKED, &sh->dev[i].flags);
794 set_bit(R5_UPTODATE, &sh->dev[i].flags);
795 }
796
797// switch(method) {
798// case RECONSTRUCT_WRITE:
799// case CHECK_PARITY:
800// case UPDATE_PARITY:
801 /* Note that unlike RAID-5, the ordering of the disks matters greatly. */
802 /* FIX: Is this ordering of drives even remotely optimal? */
803 count = 0;
804 i = d0_idx;
805 do {
806 ptrs[count++] = page_address(sh->dev[i].page);
807 if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
808 printk("block %d/%d not uptodate on parity calc\n", i,count);
809 i = raid6_next_disk(i, disks);
810 } while ( i != d0_idx );
811// break;
812// }
813
814 raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
815
816 switch(method) {
817 case RECONSTRUCT_WRITE:
818 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
819 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
820 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
821 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
822 break;
823 case UPDATE_PARITY:
824 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
825 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
826 break;
827 }
828}
829
830/* Compute one missing block */
831static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
832{
833 raid6_conf_t *conf = sh->raid_conf;
834 int i, count, disks = conf->raid_disks;
835 void *ptr[MAX_XOR_BLOCKS], *p;
836 int pd_idx = sh->pd_idx;
837 int qd_idx = raid6_next_disk(pd_idx, disks);
838
839 PRINTK("compute_block_1, stripe %llu, idx %d\n",
840 (unsigned long long)sh->sector, dd_idx);
841
842 if ( dd_idx == qd_idx ) {
843 /* We're actually computing the Q drive */
844 compute_parity(sh, UPDATE_PARITY);
845 } else {
846 ptr[0] = page_address(sh->dev[dd_idx].page);
847 if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
848 count = 1;
849 for (i = disks ; i--; ) {
850 if (i == dd_idx || i == qd_idx)
851 continue;
852 p = page_address(sh->dev[i].page);
853 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
854 ptr[count++] = p;
855 else
856 printk("compute_block() %d, stripe %llu, %d"
857 " not present\n", dd_idx,
858 (unsigned long long)sh->sector, i);
859
860 check_xor();
861 }
862 if (count != 1)
863 xor_block(count, STRIPE_SIZE, ptr);
864 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
865 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
866 }
867}
868
869/* Compute two missing blocks */
870static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
871{
872 raid6_conf_t *conf = sh->raid_conf;
873 int i, count, disks = conf->raid_disks;
874 int pd_idx = sh->pd_idx;
875 int qd_idx = raid6_next_disk(pd_idx, disks);
876 int d0_idx = raid6_next_disk(qd_idx, disks);
877 int faila, failb;
878
879 /* faila and failb are disk numbers relative to d0_idx */
880 /* pd_idx become disks-2 and qd_idx become disks-1 */
881 faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
882 failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
883
884 BUG_ON(faila == failb);
885 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
886
887 PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
888 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
889
890 if ( failb == disks-1 ) {
891 /* Q disk is one of the missing disks */
892 if ( faila == disks-2 ) {
893 /* Missing P+Q, just recompute */
894 compute_parity(sh, UPDATE_PARITY);
895 return;
896 } else {
897 /* We're missing D+Q; recompute D from P */
898 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
899 compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */
900 return;
901 }
902 }
903
904 /* We're missing D+P or D+D; build pointer table */
905 {
906 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
907 void *ptrs[disks];
908
909 count = 0;
910 i = d0_idx;
911 do {
912 ptrs[count++] = page_address(sh->dev[i].page);
913 i = raid6_next_disk(i, disks);
914 if (i != dd_idx1 && i != dd_idx2 &&
915 !test_bit(R5_UPTODATE, &sh->dev[i].flags))
916 printk("compute_2 with missing block %d/%d\n", count, i);
917 } while ( i != d0_idx );
918
919 if ( failb == disks-2 ) {
920 /* We're missing D+P. */
921 raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
922 } else {
923 /* We're missing D+D. */
924 raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
925 }
926
927 /* Both the above update both missing blocks */
928 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
929 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
930 }
931}
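The case analysis in compute_block_2() is easier to see once the failed positions have been rotated relative to d0: P then sits at disks-2 and Q at disks-1, and the choice of recovery routine depends only on whether those two slots are among the missing ones. A compact userspace restatement of that dispatch (the helper name and the example values are illustrative, not part of the driver):

/* sketch: which RAID-6 recovery routine the two missing slots select (illustrative) */
#include <stdio.h>

static const char *recovery_path(int faila, int failb, int disks)
{
        /* faila < failb, both counted from the first data disk, so after the
         * rotation in compute_block_2() P sits at disks-2 and Q at disks-1 */
        if (failb == disks - 1)
                return (faila == disks - 2) ? "P+Q missing: recompute both syndromes"
                                            : "D+Q missing: rebuild D from P, then recompute Q";
        if (failb == disks - 2)
                return "D+P missing: raid6_datap_recov()";
        return "D+D missing: raid6_2data_recov()";
}

int main(void)
{
        int disks = 6;                       /* assumed array width */
        printf("%s\n", recovery_path(1, 3, disks));
        printf("%s\n", recovery_path(2, 4, disks));
        printf("%s\n", recovery_path(4, 5, disks));
        return 0;
}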
932
933
934/*
935 * Each stripe/dev can have one or more bion attached.
936 * toread/towrite point to the first in a chain.
937 * The bi_next chain must be in order.
938 */
939static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
940{
941 struct bio **bip;
942 raid6_conf_t *conf = sh->raid_conf;
943 int firstwrite=0;
944
945 PRINTK("adding bh b#%llu to stripe s#%llu\n",
946 (unsigned long long)bi->bi_sector,
947 (unsigned long long)sh->sector);
948
949
950 spin_lock(&sh->lock);
951 spin_lock_irq(&conf->device_lock);
952 if (forwrite) {
953 bip = &sh->dev[dd_idx].towrite;
954 if (*bip == NULL && sh->dev[dd_idx].written == NULL)
955 firstwrite = 1;
956 } else
957 bip = &sh->dev[dd_idx].toread;
958 while (*bip && (*bip)->bi_sector < bi->bi_sector) {
959 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
960 goto overlap;
961 bip = &(*bip)->bi_next;
962 }
963 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
964 goto overlap;
965
966 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
967 if (*bip)
968 bi->bi_next = *bip;
969 *bip = bi;
970 bi->bi_phys_segments ++;
971 spin_unlock_irq(&conf->device_lock);
972 spin_unlock(&sh->lock);
973
974 PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n",
975 (unsigned long long)bi->bi_sector,
976 (unsigned long long)sh->sector, dd_idx);
977
978 if (conf->mddev->bitmap && firstwrite) {
979 sh->bm_seq = conf->seq_write;
980 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
981 STRIPE_SECTORS, 0);
982 set_bit(STRIPE_BIT_DELAY, &sh->state);
983 }
984
985 if (forwrite) {
986 /* check if page is covered */
987 sector_t sector = sh->dev[dd_idx].sector;
988 for (bi=sh->dev[dd_idx].towrite;
989 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
990 bi && bi->bi_sector <= sector;
991 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
992 if (bi->bi_sector + (bi->bi_size>>9) >= sector)
993 sector = bi->bi_sector + (bi->bi_size>>9);
994 }
995 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
996 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
997 }
998 return 1;
999
1000 overlap:
1001 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
1002 spin_unlock_irq(&conf->device_lock);
1003 spin_unlock(&sh->lock);
1004 return 0;
1005}
1006
1007
1008static int page_is_zero(struct page *p)
1009{
1010 char *a = page_address(p);
1011 return ((*(u32*)a) == 0 &&
1012 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1013}
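page_is_zero() is what lets the sync path verify P in place: compute_block_1(sh, pd_idx, 1) XORs the data blocks over the existing parity without zeroing it first, so a correct parity block collapses to all zeroes, and the zero test itself needs no scratch page because memcmp() compares the buffer against itself shifted by four bytes. A userspace sketch of the same check, assuming a 4 KiB buffer:

/* sketch: an all-zero test without a scratch buffer, mirroring page_is_zero() */
#include <stdio.h>
#include <string.h>

static int buf_is_zero(const char *a, size_t len)
{
        unsigned int first;

        memcpy(&first, a, sizeof(first));   /* stand-in for the kernel's direct u32 read */
        /* the overlapping memcmp() is only 0 when every byte equals the byte 4 before it */
        return first == 0 && memcmp(a, a + 4, len - 4) == 0;
}

int main(void)
{
        static char page[4096];             /* assumed 4 KiB stripe page, zero-initialised */

        printf("clean parity page: %d\n", buf_is_zero(page, sizeof(page)));
        page[100] = 1;
        printf("dirty parity page: %d\n", buf_is_zero(page, sizeof(page)));
        return 0;
}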
1014/*
1015 * handle_stripe - do things to a stripe.
1016 *
1017 * We lock the stripe and then examine the state of various bits
1018 * to see what needs to be done.
1019 * Possible results:
1020 * return some read request which now have data
1021 * return some write requests which are safely on disc
1022 * schedule a read on some buffers
1023 * schedule a write of some buffers
1024 * return confirmation of parity correctness
1025 *
1026 * Parity calculations are done inside the stripe lock
1027 * buffers are taken off read_list or write_list, and bh_cache buffers
1028 * get BH_Lock set before the stripe lock is released.
1029 *
1030 */
1031
1032static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
1033{
1034 raid6_conf_t *conf = sh->raid_conf;
1035 int disks = conf->raid_disks;
1036 struct bio *return_bi= NULL;
1037 struct bio *bi;
1038 int i;
1039 int syncing;
1040 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1041 int non_overwrite = 0;
1042 int failed_num[2] = {0, 0};
1043 struct r5dev *dev, *pdev, *qdev;
1044 int pd_idx = sh->pd_idx;
1045 int qd_idx = raid6_next_disk(pd_idx, disks);
1046 int p_failed, q_failed;
1047
1048 PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
1049 (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
1050 pd_idx, qd_idx);
1051
1052 spin_lock(&sh->lock);
1053 clear_bit(STRIPE_HANDLE, &sh->state);
1054 clear_bit(STRIPE_DELAYED, &sh->state);
1055
1056 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1057 /* Now to look around and see what can be done */
1058
1059 rcu_read_lock();
1060 for (i=disks; i--; ) {
1061 mdk_rdev_t *rdev;
1062 dev = &sh->dev[i];
1063 clear_bit(R5_Insync, &dev->flags);
1064
1065 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
1066 i, dev->flags, dev->toread, dev->towrite, dev->written);
1067 /* maybe we can reply to a read */
1068 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
1069 struct bio *rbi, *rbi2;
1070 PRINTK("Return read for disc %d\n", i);
1071 spin_lock_irq(&conf->device_lock);
1072 rbi = dev->toread;
1073 dev->toread = NULL;
1074 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1075 wake_up(&conf->wait_for_overlap);
1076 spin_unlock_irq(&conf->device_lock);
1077 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1078 copy_data(0, rbi, dev->page, dev->sector);
1079 rbi2 = r5_next_bio(rbi, dev->sector);
1080 spin_lock_irq(&conf->device_lock);
1081 if (--rbi->bi_phys_segments == 0) {
1082 rbi->bi_next = return_bi;
1083 return_bi = rbi;
1084 }
1085 spin_unlock_irq(&conf->device_lock);
1086 rbi = rbi2;
1087 }
1088 }
1089
1090 /* now count some things */
1091 if (test_bit(R5_LOCKED, &dev->flags)) locked++;
1092 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
1093
1094
1095 if (dev->toread) to_read++;
1096 if (dev->towrite) {
1097 to_write++;
1098 if (!test_bit(R5_OVERWRITE, &dev->flags))
1099 non_overwrite++;
1100 }
1101 if (dev->written) written++;
1102 rdev = rcu_dereference(conf->disks[i].rdev);
1103 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1104 /* The ReadError flag will just be confusing now */
1105 clear_bit(R5_ReadError, &dev->flags);
1106 clear_bit(R5_ReWrite, &dev->flags);
1107 }
1108 if (!rdev || !test_bit(In_sync, &rdev->flags)
1109 || test_bit(R5_ReadError, &dev->flags)) {
1110 if ( failed < 2 )
1111 failed_num[failed] = i;
1112 failed++;
1113 } else
1114 set_bit(R5_Insync, &dev->flags);
1115 }
1116 rcu_read_unlock();
1117 PRINTK("locked=%d uptodate=%d to_read=%d"
1118 " to_write=%d failed=%d failed_num=%d,%d\n",
1119 locked, uptodate, to_read, to_write, failed,
1120 failed_num[0], failed_num[1]);
1121 /* check if the array has lost >2 devices and, if so, some requests might
1122 * need to be failed
1123 */
1124 if (failed > 2 && to_read+to_write+written) {
1125 for (i=disks; i--; ) {
1126 int bitmap_end = 0;
1127
1128 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1129 mdk_rdev_t *rdev;
1130 rcu_read_lock();
1131 rdev = rcu_dereference(conf->disks[i].rdev);
1132 if (rdev && test_bit(In_sync, &rdev->flags))
1133 /* multiple read failures in one stripe */
1134 md_error(conf->mddev, rdev);
1135 rcu_read_unlock();
1136 }
1137
1138 spin_lock_irq(&conf->device_lock);
1139 /* fail all writes first */
1140 bi = sh->dev[i].towrite;
1141 sh->dev[i].towrite = NULL;
1142 if (bi) { to_write--; bitmap_end = 1; }
1143
1144 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1145 wake_up(&conf->wait_for_overlap);
1146
1147 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1148 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1149 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1150 if (--bi->bi_phys_segments == 0) {
1151 md_write_end(conf->mddev);
1152 bi->bi_next = return_bi;
1153 return_bi = bi;
1154 }
1155 bi = nextbi;
1156 }
1157 /* and fail all 'written' */
1158 bi = sh->dev[i].written;
1159 sh->dev[i].written = NULL;
1160 if (bi) bitmap_end = 1;
1161 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
1162 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1163 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1164 if (--bi->bi_phys_segments == 0) {
1165 md_write_end(conf->mddev);
1166 bi->bi_next = return_bi;
1167 return_bi = bi;
1168 }
1169 bi = bi2;
1170 }
1171
1172 /* fail any reads if this device is non-operational */
1173 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1174 test_bit(R5_ReadError, &sh->dev[i].flags)) {
1175 bi = sh->dev[i].toread;
1176 sh->dev[i].toread = NULL;
1177 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1178 wake_up(&conf->wait_for_overlap);
1179 if (bi) to_read--;
1180 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1181 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1182 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1183 if (--bi->bi_phys_segments == 0) {
1184 bi->bi_next = return_bi;
1185 return_bi = bi;
1186 }
1187 bi = nextbi;
1188 }
1189 }
1190 spin_unlock_irq(&conf->device_lock);
1191 if (bitmap_end)
1192 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1193 STRIPE_SECTORS, 0, 0);
1194 }
1195 }
1196 if (failed > 2 && syncing) {
1197 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
1198 clear_bit(STRIPE_SYNCING, &sh->state);
1199 syncing = 0;
1200 }
1201
1202 /*
1203 * might be able to return some write requests if the parity blocks
1204 * are safe, or on a failed drive
1205 */
1206 pdev = &sh->dev[pd_idx];
1207 p_failed = (failed >= 1 && failed_num[0] == pd_idx)
1208 || (failed >= 2 && failed_num[1] == pd_idx);
1209 qdev = &sh->dev[qd_idx];
1210 q_failed = (failed >= 1 && failed_num[0] == qd_idx)
1211 || (failed >= 2 && failed_num[1] == qd_idx);
1212
1213 if ( written &&
1214 ( p_failed || ((test_bit(R5_Insync, &pdev->flags)
1215 && !test_bit(R5_LOCKED, &pdev->flags)
1216 && test_bit(R5_UPTODATE, &pdev->flags))) ) &&
1217 ( q_failed || ((test_bit(R5_Insync, &qdev->flags)
1218 && !test_bit(R5_LOCKED, &qdev->flags)
1219 && test_bit(R5_UPTODATE, &qdev->flags))) ) ) {
1220 /* any written block on an uptodate or failed drive can be
1221 * returned. Note that if we 'wrote' to a failed drive,
1222 * it will be UPTODATE, but never LOCKED, so we don't need
1223 * to test 'failed' directly.
1224 */
1225 for (i=disks; i--; )
1226 if (sh->dev[i].written) {
1227 dev = &sh->dev[i];
1228 if (!test_bit(R5_LOCKED, &dev->flags) &&
1229 test_bit(R5_UPTODATE, &dev->flags) ) {
1230 /* We can return any write requests */
1231 int bitmap_end = 0;
1232 struct bio *wbi, *wbi2;
1233 PRINTK("Return write for stripe %llu disc %d\n",
1234 (unsigned long long)sh->sector, i);
1235 spin_lock_irq(&conf->device_lock);
1236 wbi = dev->written;
1237 dev->written = NULL;
1238 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1239 wbi2 = r5_next_bio(wbi, dev->sector);
1240 if (--wbi->bi_phys_segments == 0) {
1241 md_write_end(conf->mddev);
1242 wbi->bi_next = return_bi;
1243 return_bi = wbi;
1244 }
1245 wbi = wbi2;
1246 }
1247 if (dev->towrite == NULL)
1248 bitmap_end = 1;
1249 spin_unlock_irq(&conf->device_lock);
1250 if (bitmap_end)
1251 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1252 STRIPE_SECTORS,
1253 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
1254 }
1255 }
1256 }
1257
1258 /* Now we might consider reading some blocks, either to check/generate
1259 * parity, or to satisfy requests
1260 * or to load a block that is being partially written.
1261 */
1262 if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
1263 for (i=disks; i--;) {
1264 dev = &sh->dev[i];
1265 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1266 (dev->toread ||
1267 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1268 syncing ||
1269 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
1270 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
1271 )
1272 ) {
1273 /* we would like to get this block, possibly
1274 * by computing it, but we might not be able to
1275 */
1276 if (uptodate == disks-1) {
1277 PRINTK("Computing stripe %llu block %d\n",
1278 (unsigned long long)sh->sector, i);
1279 compute_block_1(sh, i, 0);
1280 uptodate++;
1281 } else if ( uptodate == disks-2 && failed >= 2 ) {
1282 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
1283 int other;
1284 for (other=disks; other--;) {
1285 if ( other == i )
1286 continue;
1287 if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
1288 break;
1289 }
1290 BUG_ON(other < 0);
1291 PRINTK("Computing stripe %llu blocks %d,%d\n",
1292 (unsigned long long)sh->sector, i, other);
1293 compute_block_2(sh, i, other);
1294 uptodate += 2;
1295 } else if (test_bit(R5_Insync, &dev->flags)) {
1296 set_bit(R5_LOCKED, &dev->flags);
1297 set_bit(R5_Wantread, &dev->flags);
1298#if 0
1299 /* if I am just reading this block and we don't have
1300 a failed drive, or any pending writes then sidestep the cache */
1301 if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
1302 ! syncing && !failed && !to_write) {
1303 sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
1304 sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
1305 }
1306#endif
1307 locked++;
1308 PRINTK("Reading block %d (sync=%d)\n",
1309 i, syncing);
1310 }
1311 }
1312 }
1313 set_bit(STRIPE_HANDLE, &sh->state);
1314 }
1315
1316 /* now to consider writing and what else, if anything should be read */
1317 if (to_write) {
1318 int rcw=0, must_compute=0;
1319 for (i=disks ; i--;) {
1320 dev = &sh->dev[i];
1321 /* Would I have to read this buffer for reconstruct_write */
1322 if (!test_bit(R5_OVERWRITE, &dev->flags)
1323 && i != pd_idx && i != qd_idx
1324 && (!test_bit(R5_LOCKED, &dev->flags)
1325#if 0
1326 || sh->bh_page[i] != bh->b_page
1327#endif
1328 ) &&
1329 !test_bit(R5_UPTODATE, &dev->flags)) {
1330 if (test_bit(R5_Insync, &dev->flags)) rcw++;
1331 else {
1332 PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
1333 must_compute++;
1334 }
1335 }
1336 }
1337 PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
1338 (unsigned long long)sh->sector, rcw, must_compute);
1339 set_bit(STRIPE_HANDLE, &sh->state);
1340
1341 if (rcw > 0)
1342 /* want reconstruct write, but need to get some data */
1343 for (i=disks; i--;) {
1344 dev = &sh->dev[i];
1345 if (!test_bit(R5_OVERWRITE, &dev->flags)
1346 && !(failed == 0 && (i == pd_idx || i == qd_idx))
1347 && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1348 test_bit(R5_Insync, &dev->flags)) {
1349 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1350 {
1351 PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
1352 (unsigned long long)sh->sector, i);
1353 set_bit(R5_LOCKED, &dev->flags);
1354 set_bit(R5_Wantread, &dev->flags);
1355 locked++;
1356 } else {
1357 PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
1358 (unsigned long long)sh->sector, i);
1359 set_bit(STRIPE_DELAYED, &sh->state);
1360 set_bit(STRIPE_HANDLE, &sh->state);
1361 }
1362 }
1363 }
1364 /* now if nothing is locked, and if we have enough data, we can start a write request */
1365 if (locked == 0 && rcw == 0 &&
1366 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1367 if ( must_compute > 0 ) {
1368 /* We have failed blocks and need to compute them */
1369 switch ( failed ) {
1370 case 0: BUG();
1371 case 1: compute_block_1(sh, failed_num[0], 0); break;
1372 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
1373 default: BUG(); /* This request should have been failed? */
1374 }
1375 }
1376
1377 PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
1378 compute_parity(sh, RECONSTRUCT_WRITE);
1379 /* now every locked buffer is ready to be written */
1380 for (i=disks; i--;)
1381 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
1382 PRINTK("Writing stripe %llu block %d\n",
1383 (unsigned long long)sh->sector, i);
1384 locked++;
1385 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1386 }
1387 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
1388 set_bit(STRIPE_INSYNC, &sh->state);
1389
1390 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1391 atomic_dec(&conf->preread_active_stripes);
1392 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
1393 md_wakeup_thread(conf->mddev->thread);
1394 }
1395 }
1396 }
1397
1398 /* maybe we need to check and possibly fix the parity for this stripe
1399 * Any reads will already have been scheduled, so we just see if enough data
1400 * is available
1401 */
1402 if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
1403 int update_p = 0, update_q = 0;
1404 struct r5dev *dev;
1405
1406 set_bit(STRIPE_HANDLE, &sh->state);
1407
1408 BUG_ON(failed>2);
1409 BUG_ON(uptodate < disks);
1410 /* Want to check and possibly repair P and Q.
1411 * However there could be one 'failed' device, in which
1412 * case we can only check one of them, possibly using the
1413 * other to generate missing data
1414 */
1415
1416 /* If !tmp_page, we cannot do the calculations,
1417 * but as we have set STRIPE_HANDLE, we will soon be called
1418 * by stripe_handle with a tmp_page - just wait until then.
1419 */
1420 if (tmp_page) {
1421 if (failed == q_failed) {
1422 /* The only possible failed device holds 'Q', so it makes
1423 * sense to check P (If anything else were failed, we would
1424 * have used P to recreate it).
1425 */
1426 compute_block_1(sh, pd_idx, 1);
1427 if (!page_is_zero(sh->dev[pd_idx].page)) {
1428 compute_block_1(sh,pd_idx,0);
1429 update_p = 1;
1430 }
1431 }
1432 if (!q_failed && failed < 2) {
1433 /* q is not failed, and we didn't use it to generate
1434 * anything, so it makes sense to check it
1435 */
1436 memcpy(page_address(tmp_page),
1437 page_address(sh->dev[qd_idx].page),
1438 STRIPE_SIZE);
1439 compute_parity(sh, UPDATE_PARITY);
1440 if (memcmp(page_address(tmp_page),
1441 page_address(sh->dev[qd_idx].page),
1442 STRIPE_SIZE)!= 0) {
1443 clear_bit(STRIPE_INSYNC, &sh->state);
1444 update_q = 1;
1445 }
1446 }
1447 if (update_p || update_q) {
1448 conf->mddev->resync_mismatches += STRIPE_SECTORS;
1449 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1450 /* don't try to repair!! */
1451 update_p = update_q = 0;
1452 }
1453
1454 /* now write out any block on a failed drive,
1455 * or P or Q if they need it
1456 */
1457
1458 if (failed == 2) {
1459 dev = &sh->dev[failed_num[1]];
1460 locked++;
1461 set_bit(R5_LOCKED, &dev->flags);
1462 set_bit(R5_Wantwrite, &dev->flags);
1463 }
1464 if (failed >= 1) {
1465 dev = &sh->dev[failed_num[0]];
1466 locked++;
1467 set_bit(R5_LOCKED, &dev->flags);
1468 set_bit(R5_Wantwrite, &dev->flags);
1469 }
1470
1471 if (update_p) {
1472 dev = &sh->dev[pd_idx];
1473 locked ++;
1474 set_bit(R5_LOCKED, &dev->flags);
1475 set_bit(R5_Wantwrite, &dev->flags);
1476 }
1477 if (update_q) {
1478 dev = &sh->dev[qd_idx];
1479 locked++;
1480 set_bit(R5_LOCKED, &dev->flags);
1481 set_bit(R5_Wantwrite, &dev->flags);
1482 }
1483 clear_bit(STRIPE_DEGRADED, &sh->state);
1484
1485 set_bit(STRIPE_INSYNC, &sh->state);
1486 }
1487 }
1488
1489 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1490 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1491 clear_bit(STRIPE_SYNCING, &sh->state);
1492 }
1493
1494 /* If the failed drives are just a ReadError, then we might need
1495 * to progress the repair/check process
1496 */
1497 if (failed <= 2 && ! conf->mddev->ro)
1498 for (i=0; i<failed;i++) {
1499 dev = &sh->dev[failed_num[i]];
1500 if (test_bit(R5_ReadError, &dev->flags)
1501 && !test_bit(R5_LOCKED, &dev->flags)
1502 && test_bit(R5_UPTODATE, &dev->flags)
1503 ) {
1504 if (!test_bit(R5_ReWrite, &dev->flags)) {
1505 set_bit(R5_Wantwrite, &dev->flags);
1506 set_bit(R5_ReWrite, &dev->flags);
1507 set_bit(R5_LOCKED, &dev->flags);
1508 } else {
1509 /* let's read it back */
1510 set_bit(R5_Wantread, &dev->flags);
1511 set_bit(R5_LOCKED, &dev->flags);
1512 }
1513 }
1514 }
1515 spin_unlock(&sh->lock);
1516
1517 while ((bi=return_bi)) {
1518 int bytes = bi->bi_size;
1519
1520 return_bi = bi->bi_next;
1521 bi->bi_next = NULL;
1522 bi->bi_size = 0;
1523 bi->bi_end_io(bi, bytes, 0);
1524 }
1525 for (i=disks; i-- ;) {
1526 int rw;
1527 struct bio *bi;
1528 mdk_rdev_t *rdev;
1529 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
1530 rw = 1;
1531 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1532 rw = 0;
1533 else
1534 continue;
1535
1536 bi = &sh->dev[i].req;
1537
1538 bi->bi_rw = rw;
1539 if (rw)
1540 bi->bi_end_io = raid6_end_write_request;
1541 else
1542 bi->bi_end_io = raid6_end_read_request;
1543
1544 rcu_read_lock();
1545 rdev = rcu_dereference(conf->disks[i].rdev);
1546 if (rdev && test_bit(Faulty, &rdev->flags))
1547 rdev = NULL;
1548 if (rdev)
1549 atomic_inc(&rdev->nr_pending);
1550 rcu_read_unlock();
1551
1552 if (rdev) {
1553 if (syncing)
1554 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1555
1556 bi->bi_bdev = rdev->bdev;
1557 PRINTK("for %llu schedule op %ld on disc %d\n",
1558 (unsigned long long)sh->sector, bi->bi_rw, i);
1559 atomic_inc(&sh->count);
1560 bi->bi_sector = sh->sector + rdev->data_offset;
1561 bi->bi_flags = 1 << BIO_UPTODATE;
1562 bi->bi_vcnt = 1;
1563 bi->bi_max_vecs = 1;
1564 bi->bi_idx = 0;
1565 bi->bi_io_vec = &sh->dev[i].vec;
1566 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1567 bi->bi_io_vec[0].bv_offset = 0;
1568 bi->bi_size = STRIPE_SIZE;
1569 bi->bi_next = NULL;
1570 if (rw == WRITE &&
1571 test_bit(R5_ReWrite, &sh->dev[i].flags))
1572 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1573 generic_make_request(bi);
1574 } else {
1575 if (rw == 1)
1576 set_bit(STRIPE_DEGRADED, &sh->state);
1577 PRINTK("skip op %ld on disc %d for sector %llu\n",
1578 bi->bi_rw, i, (unsigned long long)sh->sector);
1579 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1580 set_bit(STRIPE_HANDLE, &sh->state);
1581 }
1582 }
1583}
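
The tail of handle_stripe above recovers from a plain medium read error in two passes: it first rewrites the block from reconstructed data (marking the device with R5_ReWrite), then on the next pass reads the block back to confirm the rewrite took. A minimal user-space sketch of that two-pass handshake, with simplified stand-in flags rather than the real R5_* bits:

#include <stdio.h>

/* Simplified stand-in flags; the driver uses R5_* bits in dev->flags. */
enum { ReadError = 1, ReWrite = 2, Wantwrite = 4, Wantread = 8, Locked = 16 };

/* One pass of the repair decision for a device that failed only with a
 * read error (the real code also requires the block to be up to date). */
static void progress_read_error(unsigned *flags)
{
    if (!(*flags & ReadError) || (*flags & Locked))
        return;
    if (!(*flags & ReWrite)) {
        /* first pass: rewrite the block from reconstructed data */
        *flags |= Wantwrite | ReWrite | Locked;
    } else {
        /* second pass: read it back to verify the rewrite took */
        *flags |= Wantread | Locked;
    }
}

int main(void)
{
    unsigned flags = ReadError;
    progress_read_error(&flags);     /* schedules the rewrite */
    printf("after pass 1: %#x\n", flags);
    flags &= ~Locked;                /* pretend the write completed */
    progress_read_error(&flags);     /* schedules the verify read */
    printf("after pass 2: %#x\n", flags);
    return 0;
}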
1584
1585static void raid6_activate_delayed(raid6_conf_t *conf)
1586{
1587 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
1588 while (!list_empty(&conf->delayed_list)) {
1589 struct list_head *l = conf->delayed_list.next;
1590 struct stripe_head *sh;
1591 sh = list_entry(l, struct stripe_head, lru);
1592 list_del_init(l);
1593 clear_bit(STRIPE_DELAYED, &sh->state);
1594 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1595 atomic_inc(&conf->preread_active_stripes);
1596 list_add_tail(&sh->lru, &conf->handle_list);
1597 }
1598 }
1599}
1600
1601static void activate_bit_delay(raid6_conf_t *conf)
1602{
1603 /* device_lock is held */
1604 struct list_head head;
1605 list_add(&head, &conf->bitmap_list);
1606 list_del_init(&conf->bitmap_list);
1607 while (!list_empty(&head)) {
1608 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
1609 list_del_init(&sh->lru);
1610 atomic_inc(&sh->count);
1611 __release_stripe(conf, sh);
1612 }
1613}
1614
1615static void unplug_slaves(mddev_t *mddev)
1616{
1617 raid6_conf_t *conf = mddev_to_conf(mddev);
1618 int i;
1619
1620 rcu_read_lock();
1621 for (i=0; i<mddev->raid_disks; i++) {
1622 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
1623 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
1624 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
1625
1626 atomic_inc(&rdev->nr_pending);
1627 rcu_read_unlock();
1628
1629 if (r_queue->unplug_fn)
1630 r_queue->unplug_fn(r_queue);
1631
1632 rdev_dec_pending(rdev, mddev);
1633 rcu_read_lock();
1634 }
1635 }
1636 rcu_read_unlock();
1637}
1638
1639static void raid6_unplug_device(request_queue_t *q)
1640{
1641 mddev_t *mddev = q->queuedata;
1642 raid6_conf_t *conf = mddev_to_conf(mddev);
1643 unsigned long flags;
1644
1645 spin_lock_irqsave(&conf->device_lock, flags);
1646
1647 if (blk_remove_plug(q)) {
1648 conf->seq_flush++;
1649 raid6_activate_delayed(conf);
1650 }
1651 md_wakeup_thread(mddev->thread);
1652
1653 spin_unlock_irqrestore(&conf->device_lock, flags);
1654
1655 unplug_slaves(mddev);
1656}
1657
1658static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk,
1659 sector_t *error_sector)
1660{
1661 mddev_t *mddev = q->queuedata;
1662 raid6_conf_t *conf = mddev_to_conf(mddev);
1663 int i, ret = 0;
1664
1665 rcu_read_lock();
1666 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
1667 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
1668 if (rdev && !test_bit(Faulty, &rdev->flags)) {
1669 struct block_device *bdev = rdev->bdev;
1670 request_queue_t *r_queue = bdev_get_queue(bdev);
1671
1672 if (!r_queue->issue_flush_fn)
1673 ret = -EOPNOTSUPP;
1674 else {
1675 atomic_inc(&rdev->nr_pending);
1676 rcu_read_unlock();
1677 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
1678 error_sector);
1679 rdev_dec_pending(rdev, mddev);
1680 rcu_read_lock();
1681 }
1682 }
1683 }
1684 rcu_read_unlock();
1685 return ret;
1686}
1687
1688static inline void raid6_plug_device(raid6_conf_t *conf)
1689{
1690 spin_lock_irq(&conf->device_lock);
1691 blk_plug_device(conf->mddev->queue);
1692 spin_unlock_irq(&conf->device_lock);
1693}
1694
1695static int make_request (request_queue_t *q, struct bio * bi)
1696{
1697 mddev_t *mddev = q->queuedata;
1698 raid6_conf_t *conf = mddev_to_conf(mddev);
1699 const unsigned int raid_disks = conf->raid_disks;
1700 const unsigned int data_disks = raid_disks - 2;
1701 unsigned int dd_idx, pd_idx;
1702 sector_t new_sector;
1703 sector_t logical_sector, last_sector;
1704 struct stripe_head *sh;
1705 const int rw = bio_data_dir(bi);
1706
1707 if (unlikely(bio_barrier(bi))) {
1708 bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
1709 return 0;
1710 }
1711
1712 md_write_start(mddev, bi);
1713
1714 disk_stat_inc(mddev->gendisk, ios[rw]);
1715 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
1716
1717 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
1718 last_sector = bi->bi_sector + (bi->bi_size>>9);
1719
1720 bi->bi_next = NULL;
1721 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
1722
1723 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1724 DEFINE_WAIT(w);
1725
1726 new_sector = raid6_compute_sector(logical_sector,
1727 raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1728
1729 PRINTK("raid6: make_request, sector %llu logical %llu\n",
1730 (unsigned long long)new_sector,
1731 (unsigned long long)logical_sector);
1732
1733 retry:
1734 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
1735 sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
1736 if (sh) {
1737 if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
1738 /* Add failed due to overlap. Flush everything
1739 * and wait a while
1740 */
1741 raid6_unplug_device(mddev->queue);
1742 release_stripe(sh);
1743 schedule();
1744 goto retry;
1745 }
1746 finish_wait(&conf->wait_for_overlap, &w);
1747 raid6_plug_device(conf);
1748 handle_stripe(sh, NULL);
1749 release_stripe(sh);
1750 } else {
1751 /* cannot get stripe for read-ahead, just give-up */
1752 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1753 finish_wait(&conf->wait_for_overlap, &w);
1754 break;
1755 }
1756
1757 }
1758 spin_lock_irq(&conf->device_lock);
1759 if (--bi->bi_phys_segments == 0) {
1760 int bytes = bi->bi_size;
1761
1762 if (rw == WRITE )
1763 md_write_end(mddev);
1764 bi->bi_size = 0;
1765 bi->bi_end_io(bi, bytes, 0);
1766 }
1767 spin_unlock_irq(&conf->device_lock);
1768 return 0;
1769}
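
make_request above processes a bio one stripe-sized piece at a time: it rounds the starting sector down to a STRIPE_SECTORS boundary, derives the end sector from bi_size, and attaches each piece to its stripe. A small sketch of just that arithmetic, assuming STRIPE_SECTORS is 8 (one 4 KiB page of 512-byte sectors, the usual configuration) and hypothetical bio values:

#include <stdio.h>
#include <stdint.h>

#define STRIPE_SECTORS 8ULL   /* assumed: one 4 KiB page of 512-byte sectors */

int main(void)
{
    uint64_t bi_sector = 1027;          /* hypothetical bio start (sectors) */
    unsigned bi_size   = 3 * 4096;      /* hypothetical bio length (bytes)  */

    uint64_t logical = bi_sector & ~(STRIPE_SECTORS - 1);  /* round down  */
    uint64_t last    = bi_sector + (bi_size >> 9);         /* end sector  */

    for (uint64_t s = logical; s < last; s += STRIPE_SECTORS)
        printf("stripe-sized piece starting at sector %llu\n",
               (unsigned long long)s);
    return 0;
}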
1770
1771/* FIXME go_faster isn't used */
1772static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1773{
1774 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
1775 struct stripe_head *sh;
1776 int sectors_per_chunk = conf->chunk_size >> 9;
1777 sector_t x;
1778 unsigned long stripe;
1779 int chunk_offset;
1780 int dd_idx, pd_idx;
1781 sector_t first_sector;
1782 int raid_disks = conf->raid_disks;
1783 int data_disks = raid_disks - 2;
1784 sector_t max_sector = mddev->size << 1;
1785 int sync_blocks;
1786 int still_degraded = 0;
1787 int i;
1788
1789 if (sector_nr >= max_sector) {
1790 /* just being told to finish up .. nothing much to do */
1791 unplug_slaves(mddev);
1792
1793 if (mddev->curr_resync < max_sector) /* aborted */
1794 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1795 &sync_blocks, 1);
1796 else /* completed sync */
1797 conf->fullsync = 0;
1798 bitmap_close_sync(mddev->bitmap);
1799
1800 return 0;
1801 }
1802 /* if there are 2 or more failed drives and we are trying
1803 * to resync, then assert that we are finished, because there is
1804 * nothing we can do.
1805 */
1806 if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1807 sector_t rv = (mddev->size << 1) - sector_nr;
1808 *skipped = 1;
1809 return rv;
1810 }
1811 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1812 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
1813 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
1814 /* we can skip this block, and probably more */
1815 sync_blocks /= STRIPE_SECTORS;
1816 *skipped = 1;
1817 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
1818 }
1819
1820 x = sector_nr;
1821 chunk_offset = sector_div(x, sectors_per_chunk);
1822 stripe = x;
1823 BUG_ON(x != stripe);
1824
1825 first_sector = raid6_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
1826 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1827 sh = get_active_stripe(conf, sector_nr, pd_idx, 1);
1828 if (sh == NULL) {
1829 sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
1830 /* make sure we don't swamp the stripe cache if someone else
1831 * is trying to get access
1832 */
1833 schedule_timeout_uninterruptible(1);
1834 }
1835 /* Need to check if array will still be degraded after recovery/resync
1836 * We don't need to check the 'failed' flag as when that gets set,
1837 * recovery aborts.
1838 */
1839 for (i=0; i<mddev->raid_disks; i++)
1840 if (conf->disks[i].rdev == NULL)
1841 still_degraded = 1;
1842
1843 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
1844
1845 spin_lock(&sh->lock);
1846 set_bit(STRIPE_SYNCING, &sh->state);
1847 clear_bit(STRIPE_INSYNC, &sh->state);
1848 spin_unlock(&sh->lock);
1849
1850 handle_stripe(sh, NULL);
1851 release_stripe(sh);
1852
1853 return STRIPE_SECTORS;
1854}
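
sync_request above converts the resync position into a stripe number plus an offset within the chunk via sector_div(), then recombines them as stripe * data_disks * sectors_per_chunk + chunk_offset before asking raid6_compute_sector() where that virtual sector lives. A user-space sketch of the decomposition; sector_div() here is a stand-in function (the kernel version is a macro that divides its first argument in place and returns the remainder), and the array geometry is hypothetical:

#include <stdio.h>
#include <stdint.h>

/* User-space stand-in for the kernel's sector_div():
 * divides *x in place and returns the remainder. */
static uint32_t sector_div(uint64_t *x, uint32_t d)
{
    uint32_t rem = (uint32_t)(*x % d);
    *x /= d;
    return rem;
}

int main(void)
{
    uint64_t sector_nr = 70000;            /* hypothetical resync position    */
    uint32_t sectors_per_chunk = 128;      /* e.g. a 64 KiB chunk in sectors  */
    int data_disks = 4;                    /* e.g. 6 drives, 2 hold P and Q   */

    uint64_t x = sector_nr;
    uint32_t chunk_offset = sector_div(&x, sectors_per_chunk);
    uint64_t stripe = x;

    /* The virtual sector fed to raid6_compute_sector() in the code above. */
    uint64_t virt = stripe * data_disks * sectors_per_chunk + chunk_offset;
    printf("stripe %llu, offset %u, virtual sector %llu\n",
           (unsigned long long)stripe, chunk_offset, (unsigned long long)virt);
    return 0;
}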
1855
1856/*
1857 * This is our raid6 kernel thread.
1858 *
1859 * We scan the hash table for stripes which can be handled now.
1860 * During the scan, completed stripes are saved for us by the interrupt
1861 * handler, so that they will not have to wait for our next wakeup.
1862 */
1863static void raid6d (mddev_t *mddev)
1864{
1865 struct stripe_head *sh;
1866 raid6_conf_t *conf = mddev_to_conf(mddev);
1867 int handled;
1868
1869 PRINTK("+++ raid6d active\n");
1870
1871 md_check_recovery(mddev);
1872
1873 handled = 0;
1874 spin_lock_irq(&conf->device_lock);
1875 while (1) {
1876 struct list_head *first;
1877
1878 if (conf->seq_flush - conf->seq_write > 0) {
1879 int seq = conf->seq_flush;
1880 spin_unlock_irq(&conf->device_lock);
1881 bitmap_unplug(mddev->bitmap);
1882 spin_lock_irq(&conf->device_lock);
1883 conf->seq_write = seq;
1884 activate_bit_delay(conf);
1885 }
1886
1887 if (list_empty(&conf->handle_list) &&
1888 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1889 !blk_queue_plugged(mddev->queue) &&
1890 !list_empty(&conf->delayed_list))
1891 raid6_activate_delayed(conf);
1892
1893 if (list_empty(&conf->handle_list))
1894 break;
1895
1896 first = conf->handle_list.next;
1897 sh = list_entry(first, struct stripe_head, lru);
1898
1899 list_del_init(first);
1900 atomic_inc(&sh->count);
1901 BUG_ON(atomic_read(&sh->count)!= 1);
1902 spin_unlock_irq(&conf->device_lock);
1903
1904 handled++;
1905 handle_stripe(sh, conf->spare_page);
1906 release_stripe(sh);
1907
1908 spin_lock_irq(&conf->device_lock);
1909 }
1910 PRINTK("%d stripes handled\n", handled);
1911
1912 spin_unlock_irq(&conf->device_lock);
1913
1914 unplug_slaves(mddev);
1915
1916 PRINTK("--- raid6d inactive\n");
1917}
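
raid6d above is the per-array worker thread: it holds device_lock while popping one stripe at a time off handle_list, drops the lock for the actual handle_stripe() work, and reacquires it to fetch the next entry. A stripped-down pthread sketch of that drop-the-lock-while-working loop, with a toy stripe list standing in for conf->handle_list:

#include <pthread.h>
#include <stdio.h>

struct stripe { struct stripe *next; int id; };

static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
static struct stripe *handle_list;   /* toy stand-in for conf->handle_list */

static void handle_stripe(struct stripe *sh) { printf("handled stripe %d\n", sh->id); }

/* Mirrors the loop shape in raid6d(): pop under the lock, work with it dropped. */
static int raid_worker(void)
{
    int handled = 0;
    pthread_mutex_lock(&device_lock);
    while (handle_list) {
        struct stripe *sh = handle_list;
        handle_list = sh->next;
        pthread_mutex_unlock(&device_lock);

        handle_stripe(sh);
        handled++;

        pthread_mutex_lock(&device_lock);
    }
    pthread_mutex_unlock(&device_lock);
    return handled;
}

int main(void)
{
    struct stripe s2 = { 0, 2 }, s1 = { &s2, 1 };
    handle_list = &s1;
    printf("%d stripes handled\n", raid_worker());
    return 0;
}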
1918
1919static ssize_t
1920raid6_show_stripe_cache_size(mddev_t *mddev, char *page)
1921{
1922 raid6_conf_t *conf = mddev_to_conf(mddev);
1923 if (conf)
1924 return sprintf(page, "%d\n", conf->max_nr_stripes);
1925 else
1926 return 0;
1927}
1928
1929static ssize_t
1930raid6_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
1931{
1932 raid6_conf_t *conf = mddev_to_conf(mddev);
1933 char *end;
1934 int new;
1935 if (len >= PAGE_SIZE)
1936 return -EINVAL;
1937 if (!conf)
1938 return -ENODEV;
1939
1940 new = simple_strtoul(page, &end, 10);
1941 if (!*page || (*end && *end != '\n') )
1942 return -EINVAL;
1943 if (new <= 16 || new > 32768)
1944 return -EINVAL;
1945 while (new < conf->max_nr_stripes) {
1946 if (drop_one_stripe(conf))
1947 conf->max_nr_stripes--;
1948 else
1949 break;
1950 }
1951 while (new > conf->max_nr_stripes) {
1952 if (grow_one_stripe(conf))
1953 conf->max_nr_stripes++;
1954 else break;
1955 }
1956 return len;
1957}
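
The store routine above accepts a decimal value written to the stripe_cache_size sysfs attribute, rejects anything outside (16, 32768], and then grows or shrinks the cache one stripe at a time toward the new target. A user-space sketch of just the parse-and-validate step, using strtol() in place of the kernel's simple_strtoul():

#include <stdio.h>
#include <stdlib.h>

/* Parse a sysfs-style "stripe_cache_size" write the way the store routine
 * above does: a decimal number, optionally followed by a newline, with
 * only values in (16, 32768] accepted. */
static int parse_stripe_cache_size(const char *page, int *out)
{
    char *end;
    long new = strtol(page, &end, 10);

    if (!*page || (*end && *end != '\n'))
        return -1;                  /* -EINVAL in the kernel */
    if (new <= 16 || new > 32768)
        return -1;
    *out = (int)new;
    return 0;
}

int main(void)
{
    int v;
    printf("\"512\\n\" -> %d\n", parse_stripe_cache_size("512\n", &v) ? -1 : v);
    printf("\"8\"     -> %s\n", parse_stripe_cache_size("8", &v) ? "rejected" : "ok");
    printf("\"12x\"   -> %s\n", parse_stripe_cache_size("12x", &v) ? "rejected" : "ok");
    return 0;
}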
1958
1959static struct md_sysfs_entry
1960raid6_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
1961 raid6_show_stripe_cache_size,
1962 raid6_store_stripe_cache_size);
1963
1964static ssize_t
1965stripe_cache_active_show(mddev_t *mddev, char *page)
1966{
1967 raid6_conf_t *conf = mddev_to_conf(mddev);
1968 if (conf)
1969 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
1970 else
1971 return 0;
1972}
1973
1974static struct md_sysfs_entry
1975raid6_stripecache_active = __ATTR_RO(stripe_cache_active);
1976
1977static struct attribute *raid6_attrs[] = {
1978 &raid6_stripecache_size.attr,
1979 &raid6_stripecache_active.attr,
1980 NULL,
1981};
1982static struct attribute_group raid6_attrs_group = {
1983 .name = NULL,
1984 .attrs = raid6_attrs,
1985};
1986
1987static int run(mddev_t *mddev)
1988{
1989 raid6_conf_t *conf;
1990 int raid_disk, memory;
1991 mdk_rdev_t *rdev;
1992 struct disk_info *disk;
1993 struct list_head *tmp;
1994
1995 if (mddev->level != 6) {
1996 PRINTK("raid6: %s: raid level not set to 6 (%d)\n", mdname(mddev), mddev->level);
1997 return -EIO;
1998 }
1999
2000 mddev->private = kzalloc(sizeof (raid6_conf_t), GFP_KERNEL);
2001 if ((conf = mddev->private) == NULL)
2002 goto abort;
2003 conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info),
2004 GFP_KERNEL);
2005 if (!conf->disks)
2006 goto abort;
2007
2008 conf->mddev = mddev;
2009
2010 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
2011 goto abort;
2012
2013 conf->spare_page = alloc_page(GFP_KERNEL);
2014 if (!conf->spare_page)
2015 goto abort;
2016
2017 spin_lock_init(&conf->device_lock);
2018 init_waitqueue_head(&conf->wait_for_stripe);
2019 init_waitqueue_head(&conf->wait_for_overlap);
2020 INIT_LIST_HEAD(&conf->handle_list);
2021 INIT_LIST_HEAD(&conf->delayed_list);
2022 INIT_LIST_HEAD(&conf->bitmap_list);
2023 INIT_LIST_HEAD(&conf->inactive_list);
2024 atomic_set(&conf->active_stripes, 0);
2025 atomic_set(&conf->preread_active_stripes, 0);
2026
2027 PRINTK("raid6: run(%s) called.\n", mdname(mddev));
2028
2029 ITERATE_RDEV(mddev,rdev,tmp) {
2030 raid_disk = rdev->raid_disk;
2031 if (raid_disk >= mddev->raid_disks
2032 || raid_disk < 0)
2033 continue;
2034 disk = conf->disks + raid_disk;
2035
2036 disk->rdev = rdev;
2037
2038 if (test_bit(In_sync, &rdev->flags)) {
2039 char b[BDEVNAME_SIZE];
2040 printk(KERN_INFO "raid6: device %s operational as raid"
2041 " disk %d\n", bdevname(rdev->bdev,b),
2042 raid_disk);
2043 conf->working_disks++;
2044 }
2045 }
2046
2047 conf->raid_disks = mddev->raid_disks;
2048
2049 /*
2050 * 0 for a fully functional array, 1 or 2 for a degraded array.
2051 */
2052 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
2053 conf->mddev = mddev;
2054 conf->chunk_size = mddev->chunk_size;
2055 conf->level = mddev->level;
2056 conf->algorithm = mddev->layout;
2057 conf->max_nr_stripes = NR_STRIPES;
2058
2059 /* device size must be a multiple of chunk size */
2060 mddev->size &= ~(mddev->chunk_size/1024 -1);
2061 mddev->resync_max_sectors = mddev->size << 1;
2062
2063 if (conf->raid_disks < 4) {
2064 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
2065 mdname(mddev), conf->raid_disks);
2066 goto abort;
2067 }
2068 if (!conf->chunk_size || conf->chunk_size % 4) {
2069 printk(KERN_ERR "raid6: invalid chunk size %d for %s\n",
2070 conf->chunk_size, mdname(mddev));
2071 goto abort;
2072 }
2073 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
2074 printk(KERN_ERR
2075 "raid6: unsupported parity algorithm %d for %s\n",
2076 conf->algorithm, mdname(mddev));
2077 goto abort;
2078 }
2079 if (mddev->degraded > 2) {
2080 printk(KERN_ERR "raid6: not enough operational devices for %s"
2081 " (%d/%d failed)\n",
2082 mdname(mddev), conf->failed_disks, conf->raid_disks);
2083 goto abort;
2084 }
2085
2086 if (mddev->degraded > 0 &&
2087 mddev->recovery_cp != MaxSector) {
2088 if (mddev->ok_start_degraded)
2089 printk(KERN_WARNING "raid6: starting dirty degraded array:%s"
2090 "- data corruption possible.\n",
2091 mdname(mddev));
2092 else {
2093 printk(KERN_ERR "raid6: cannot start dirty degraded array"
2094 " for %s\n", mdname(mddev));
2095 goto abort;
2096 }
2097 }
2098
2099 {
2100 mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6");
2101 if (!mddev->thread) {
2102 printk(KERN_ERR
2103 "raid6: couldn't allocate thread for %s\n",
2104 mdname(mddev));
2105 goto abort;
2106 }
2107 }
2108
2109 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
2110 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
2111 if (grow_stripes(conf, conf->max_nr_stripes)) {
2112 printk(KERN_ERR
2113 "raid6: couldn't allocate %dkB for buffers\n", memory);
2114 shrink_stripes(conf);
2115 md_unregister_thread(mddev->thread);
2116 goto abort;
2117 } else
2118 printk(KERN_INFO "raid6: allocated %dkB for %s\n",
2119 memory, mdname(mddev));
2120
2121 if (mddev->degraded == 0)
2122 printk(KERN_INFO "raid6: raid level %d set %s active with %d out of %d"
2123 " devices, algorithm %d\n", conf->level, mdname(mddev),
2124 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
2125 conf->algorithm);
2126 else
2127 printk(KERN_ALERT "raid6: raid level %d set %s active with %d"
2128 " out of %d devices, algorithm %d\n", conf->level,
2129 mdname(mddev), mddev->raid_disks - mddev->degraded,
2130 mddev->raid_disks, conf->algorithm);
2131
2132 print_raid6_conf(conf);
2133
2134 /* read-ahead size must cover two whole stripes, which is
2135 * 2 * (n-2) * chunksize where 'n' is the number of raid devices
2136 */
2137 {
2138 int stripe = (mddev->raid_disks-2) *
2139 (mddev->chunk_size / PAGE_SIZE);
2140 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
2141 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
2142 }
2143
2144 /* Ok, everything is just fine now */
2145 sysfs_create_group(&mddev->kobj, &raid6_attrs_group);
2146
2147 mddev->array_size = mddev->size * (mddev->raid_disks - 2);
2148
2149 mddev->queue->unplug_fn = raid6_unplug_device;
2150 mddev->queue->issue_flush_fn = raid6_issue_flush;
2151 return 0;
2152abort:
2153 if (conf) {
2154 print_raid6_conf(conf);
2155 safe_put_page(conf->spare_page);
2156 kfree(conf->stripe_hashtbl);
2157 kfree(conf->disks);
2158 kfree(conf);
2159 }
2160 mddev->private = NULL;
2161 printk(KERN_ALERT "raid6: failed to run raid set %s\n", mdname(mddev));
2162 return -EIO;
2163}
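
Two of the numbers computed in run() above are worth seeing with concrete values: the read-ahead window, which must cover two whole stripes (2 * (n-2) * chunk_size, expressed in pages), and the stripe-cache memory estimate. The sketch below assumes a hypothetical 6-drive array with 64 KiB chunks and 4 KiB pages; the struct sizes and the 256-stripe default are placeholders standing in for the kernel's sizeof() values and NR_STRIPES:

#include <stdio.h>

int main(void)
{
    /* Hypothetical array: 6 member disks, 64 KiB chunks, 4 KiB pages. */
    int raid_disks = 6;
    long chunk_size = 64 * 1024;
    long page_size  = 4096;

    /* Read-ahead must cover two whole stripes: 2 * (n-2) * chunk_size,
     * expressed in pages as in run() above. */
    long stripe_pages = (raid_disks - 2) * (chunk_size / page_size);
    printf("read-ahead: %ld pages (%ld KiB)\n",
           2 * stripe_pages, 2 * stripe_pages * page_size / 1024);

    /* Stripe-cache memory estimate, mirroring the formula in run().
     * The struct sizes are placeholders; the kernel uses sizeof() directly. */
    long nr_stripes = 256;              /* assumed NR_STRIPES default     */
    long sizeof_stripe_head = 512;      /* placeholder, arch-dependent    */
    long sizeof_bio = 128;              /* placeholder, kernel-dependent  */
    long memory = nr_stripes *
        (sizeof_stripe_head + raid_disks * (sizeof_bio + page_size)) / 1024;
    printf("stripe cache: about %ld KiB\n", memory);
    return 0;
}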
2164
2165
2166
2167static int stop (mddev_t *mddev)
2168{
2169 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
2170
2171 md_unregister_thread(mddev->thread);
2172 mddev->thread = NULL;
2173 shrink_stripes(conf);
2174 kfree(conf->stripe_hashtbl);
2175 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2176 sysfs_remove_group(&mddev->kobj, &raid6_attrs_group);
2177 kfree(conf);
2178 mddev->private = NULL;
2179 return 0;
2180}
2181
2182#if RAID6_DUMPSTATE
2183static void print_sh (struct seq_file *seq, struct stripe_head *sh)
2184{
2185 int i;
2186
2187 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
2188 (unsigned long long)sh->sector, sh->pd_idx, sh->state);
2189 seq_printf(seq, "sh %llu, count %d.\n",
2190 (unsigned long long)sh->sector, atomic_read(&sh->count));
2191 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
2192 for (i = 0; i < sh->raid_conf->raid_disks; i++) {
2193 seq_printf(seq, "(cache%d: %p %ld) ",
2194 i, sh->dev[i].page, sh->dev[i].flags);
2195 }
2196 seq_printf(seq, "\n");
2197}
2198
2199static void printall (struct seq_file *seq, raid6_conf_t *conf)
2200{
2201 struct stripe_head *sh;
2202 struct hlist_node *hn;
2203 int i;
2204
2205 spin_lock_irq(&conf->device_lock);
2206 for (i = 0; i < NR_HASH; i++) {
2207 sh = conf->stripe_hashtbl[i];
2208 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
2209 if (sh->raid_conf != conf)
2210 continue;
2211 print_sh(seq, sh);
2212 }
2213 }
2214 spin_unlock_irq(&conf->device_lock);
2215}
2216#endif
2217
2218static void status (struct seq_file *seq, mddev_t *mddev)
2219{
2220 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
2221 int i;
2222
2223 seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
2224 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
2225 for (i = 0; i < conf->raid_disks; i++)
2226 seq_printf (seq, "%s",
2227 conf->disks[i].rdev &&
2228 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
2229 seq_printf (seq, "]");
2230#if RAID6_DUMPSTATE
2231 seq_printf (seq, "\n");
2232 printall(seq, conf);
2233#endif
2234}
2235
2236static void print_raid6_conf (raid6_conf_t *conf)
2237{
2238 int i;
2239 struct disk_info *tmp;
2240
2241 printk("RAID6 conf printout:\n");
2242 if (!conf) {
2243 printk("(conf==NULL)\n");
2244 return;
2245 }
2246 printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
2247 conf->working_disks, conf->failed_disks);
2248
2249 for (i = 0; i < conf->raid_disks; i++) {
2250 char b[BDEVNAME_SIZE];
2251 tmp = conf->disks + i;
2252 if (tmp->rdev)
2253 printk(" disk %d, o:%d, dev:%s\n",
2254 i, !test_bit(Faulty, &tmp->rdev->flags),
2255 bdevname(tmp->rdev->bdev,b));
2256 }
2257}
2258
2259static int raid6_spare_active(mddev_t *mddev)
2260{
2261 int i;
2262 raid6_conf_t *conf = mddev->private;
2263 struct disk_info *tmp;
2264
2265 for (i = 0; i < conf->raid_disks; i++) {
2266 tmp = conf->disks + i;
2267 if (tmp->rdev
2268 && !test_bit(Faulty, &tmp->rdev->flags)
2269 && !test_bit(In_sync, &tmp->rdev->flags)) {
2270 mddev->degraded--;
2271 conf->failed_disks--;
2272 conf->working_disks++;
2273 set_bit(In_sync, &tmp->rdev->flags);
2274 }
2275 }
2276 print_raid6_conf(conf);
2277 return 0;
2278}
2279
2280static int raid6_remove_disk(mddev_t *mddev, int number)
2281{
2282 raid6_conf_t *conf = mddev->private;
2283 int err = 0;
2284 mdk_rdev_t *rdev;
2285 struct disk_info *p = conf->disks + number;
2286
2287 print_raid6_conf(conf);
2288 rdev = p->rdev;
2289 if (rdev) {
2290 if (test_bit(In_sync, &rdev->flags) ||
2291 atomic_read(&rdev->nr_pending)) {
2292 err = -EBUSY;
2293 goto abort;
2294 }
2295 p->rdev = NULL;
2296 synchronize_rcu();
2297 if (atomic_read(&rdev->nr_pending)) {
2298 /* lost the race, try later */
2299 err = -EBUSY;
2300 p->rdev = rdev;
2301 }
2302 }
2303
2304abort:
2305
2306 print_raid6_conf(conf);
2307 return err;
2308}
2309
2310static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
2311{
2312 raid6_conf_t *conf = mddev->private;
2313 int found = 0;
2314 int disk;
2315 struct disk_info *p;
2316
2317 if (mddev->degraded > 2)
2318 /* no point adding a device */
2319 return 0;
2320 /*
2321 * find the disk ... but prefer rdev->saved_raid_disk
2322 * if possible.
2323 */
2324 if (rdev->saved_raid_disk >= 0 &&
2325 conf->disks[rdev->saved_raid_disk].rdev == NULL)
2326 disk = rdev->saved_raid_disk;
2327 else
2328 disk = 0;
2329 for ( ; disk < mddev->raid_disks; disk++)
2330 if ((p=conf->disks + disk)->rdev == NULL) {
2331 clear_bit(In_sync, &rdev->flags);
2332 rdev->raid_disk = disk;
2333 found = 1;
2334 if (rdev->saved_raid_disk != disk)
2335 conf->fullsync = 1;
2336 rcu_assign_pointer(p->rdev, rdev);
2337 break;
2338 }
2339 print_raid6_conf(conf);
2340 return found;
2341}
2342
2343static int raid6_resize(mddev_t *mddev, sector_t sectors)
2344{
2345 /* no resync is happening, and there is enough space
2346 * on all devices, so we can resize.
2347 * We need to make sure resync covers any new space.
2348 * If the array is shrinking we should possibly wait until
2349 * any io in the removed space completes, but it hardly seems
2350 * worth it.
2351 */
2352 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
2353 mddev->array_size = (sectors * (mddev->raid_disks-2))>>1;
2354 set_capacity(mddev->gendisk, mddev->array_size << 1);
2355 mddev->changed = 1;
2356 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
2357 mddev->recovery_cp = mddev->size << 1;
2358 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2359 }
2360 mddev->size = sectors /2;
2361 mddev->resync_max_sectors = sectors;
2362 return 0;
2363}
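
raid6_resize above rounds the new per-device size down to a whole number of chunks and exports (raid_disks - 2) data disks' worth of it, with array_size kept in 1 KiB units (hence the shift right by one from 512-byte sectors). A short worked example with hypothetical numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t sectors    = 1000005;   /* hypothetical per-device size (512 B sectors) */
    uint64_t chunk_size = 64 * 1024; /* bytes */
    int raid_disks = 6;

    /* Round down to a whole number of chunks, as raid6_resize() does. */
    sectors &= ~(chunk_size / 512 - 1);

    /* array_size is kept in 1 KiB blocks; two disks' worth holds P and Q. */
    uint64_t array_size = sectors * (raid_disks - 2) >> 1;
    printf("usable per-device sectors: %llu, array size: %llu KiB\n",
           (unsigned long long)sectors, (unsigned long long)array_size);
    return 0;
}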
2364
2365static void raid6_quiesce(mddev_t *mddev, int state)
2366{
2367 raid6_conf_t *conf = mddev_to_conf(mddev);
2368
2369 switch(state) {
2370 case 1: /* stop all writes */
2371 spin_lock_irq(&conf->device_lock);
2372 conf->quiesce = 1;
2373 wait_event_lock_irq(conf->wait_for_stripe,
2374 atomic_read(&conf->active_stripes) == 0,
2375 conf->device_lock, /* nothing */);
2376 spin_unlock_irq(&conf->device_lock);
2377 break;
2378
2379 case 0: /* re-enable writes */
2380 spin_lock_irq(&conf->device_lock);
2381 conf->quiesce = 0;
2382 wake_up(&conf->wait_for_stripe);
2383 spin_unlock_irq(&conf->device_lock);
2384 break;
2385 }
2386}
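
raid6_quiesce above stops writes by setting conf->quiesce and sleeping, with device_lock released while waiting, until active_stripes reaches zero; state 0 clears the flag and wakes anyone blocked on the stripe cache. A user-space sketch of the same pattern, with a pthread condition variable standing in for the kernel wait queue:

#include <pthread.h>
#include <stdio.h>

/* Simplified stand-ins for the conf state used by raid6_quiesce(). */
static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wait_for_stripe = PTHREAD_COND_INITIALIZER;
static int quiesce;
static int active_stripes;

static void raid_quiesce(int state)
{
    pthread_mutex_lock(&device_lock);
    if (state == 1) {                    /* stop all writes */
        quiesce = 1;
        while (active_stripes != 0)      /* wait_event_lock_irq() analogue */
            pthread_cond_wait(&wait_for_stripe, &device_lock);
    } else {                             /* re-enable writes */
        quiesce = 0;
        pthread_cond_broadcast(&wait_for_stripe);
    }
    pthread_mutex_unlock(&device_lock);
}

int main(void)
{
    raid_quiesce(1);
    printf("quiesced with %d active stripes\n", active_stripes);
    raid_quiesce(0);
    return 0;
}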
2387
2388static struct mdk_personality raid6_personality =
2389{
2390 .name = "raid6",
2391 .level = 6,
2392 .owner = THIS_MODULE,
2393 .make_request = make_request,
2394 .run = run,
2395 .stop = stop,
2396 .status = status,
2397 .error_handler = error,
2398 .hot_add_disk = raid6_add_disk,
2399 .hot_remove_disk= raid6_remove_disk,
2400 .spare_active = raid6_spare_active,
2401 .sync_request = sync_request,
2402 .resize = raid6_resize,
2403 .quiesce = raid6_quiesce,
2404};
2405
2406static int __init raid6_init(void)
2407{
2408 int e;
2409
2410 e = raid6_select_algo();
2411 if ( e )
2412 return e;
2413
2414 return register_md_personality(&raid6_personality);
2415}
2416
2417static void raid6_exit (void)
2418{
2419 unregister_md_personality(&raid6_personality);
2420}
2421
2422module_init(raid6_init);
2423module_exit(raid6_exit);
2424MODULE_LICENSE("GPL");
2425MODULE_ALIAS("md-personality-8"); /* RAID6 */
2426MODULE_ALIAS("md-raid6");
2427MODULE_ALIAS("md-level-6");
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 914af667044f..20ed4c997636 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -212,6 +212,7 @@ struct raid5_private_data {
212 	mddev_t			*mddev; 212
213 	struct disk_info	*spare; 213
214 	int			chunk_size, level, algorithm; 214
215 	int			max_degraded;
215 	int			raid_disks, working_disks, failed_disks; 216
216 int max_nr_stripes; 217 int max_nr_stripes;
217 218