Diffstat (limited to 'drivers/md')
54 files changed, 28483 insertions, 0 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
new file mode 100644
index 000000000000..ac43f98062fd
--- /dev/null
+++ b/drivers/md/Kconfig
@@ -0,0 +1,240 @@ | |||
1 | # | ||
2 | # Block device driver configuration | ||
3 | # | ||
4 | |||
5 | menu "Multi-device support (RAID and LVM)" | ||
6 | |||
7 | config MD | ||
8 | bool "Multiple devices driver support (RAID and LVM)" | ||
9 | help | ||
10 | Support multiple physical spindles through a single logical device. | ||
11 | Required for RAID and logical volume management. | ||
12 | |||
13 | config BLK_DEV_MD | ||
14 | tristate "RAID support" | ||
15 | depends on MD | ||
16 | ---help--- | ||
17 | This driver lets you combine several hard disk partitions into one | ||
18 | logical block device. This can be used to simply append one | ||
19 | partition to another one or to combine several redundant hard disks | ||
20 | into a RAID1/4/5 device so as to provide protection against hard | ||
21 | disk failures. This is called "Software RAID" since the combining of | ||
22 | the partitions is done by the kernel. "Hardware RAID" means that the | ||
23 | combining is done by a dedicated controller; if you have such a | ||
24 | controller, you do not need to say Y here. | ||
25 | |||
26 | More information about Software RAID on Linux is contained in the | ||
27 | Software RAID mini-HOWTO, available from | ||
28 | <http://www.tldp.org/docs.html#howto>. There you will also learn | ||
29 | where to get the supporting user space utilities raidtools. | ||
30 | |||
31 | If unsure, say N. | ||
32 | |||
33 | config MD_LINEAR | ||
34 | tristate "Linear (append) mode" | ||
35 | depends on BLK_DEV_MD | ||
36 | ---help--- | ||
37 | If you say Y here, then your multiple devices driver will be able to | ||
38 | use the so-called linear mode, i.e. it will combine the hard disk | ||
39 | partitions by simply appending one to the other. | ||
40 | |||
41 | To compile this as a module, choose M here: the module | ||
42 | will be called linear. | ||
43 | |||
44 | If unsure, say Y. | ||
45 | |||
46 | config MD_RAID0 | ||
47 | tristate "RAID-0 (striping) mode" | ||
48 | depends on BLK_DEV_MD | ||
49 | ---help--- | ||
50 | If you say Y here, then your multiple devices driver will be able to | ||
51 | use the so-called raid0 mode, i.e. it will combine the hard disk | ||
52 | partitions into one logical device in such a fashion as to fill them | ||
53 | up evenly, one chunk here and one chunk there. This will increase | ||
54 | the throughput rate if the partitions reside on distinct disks. | ||
55 | |||
56 | Information about Software RAID on Linux is contained in the | ||
57 | Software-RAID mini-HOWTO, available from | ||
58 | <http://www.tldp.org/docs.html#howto>. There you will also | ||
59 | learn where to get the supporting user space utilities raidtools. | ||
60 | |||
61 | To compile this as a module, choose M here: the module | ||
62 | will be called raid0. | ||
63 | |||
64 | If unsure, say Y. | ||
65 | |||
66 | config MD_RAID1 | ||
67 | tristate "RAID-1 (mirroring) mode" | ||
68 | depends on BLK_DEV_MD | ||
69 | ---help--- | ||
70 | A RAID-1 set consists of several disk drives which are exact copies | ||
71 | of each other. In the event of a mirror failure, the RAID driver | ||
72 | will continue to use the operational mirrors in the set, providing | ||
73 | an error free MD (multiple device) to the higher levels of the | ||
74 | kernel. In a set with N drives, the available space is the capacity | ||
75 | of a single drive, and the set protects against a failure of (N - 1) | ||
76 | drives. | ||
77 | |||
78 | Information about Software RAID on Linux is contained in the | ||
79 | Software-RAID mini-HOWTO, available from | ||
80 | <http://www.tldp.org/docs.html#howto>. There you will also | ||
81 | learn where to get the supporting user space utilities raidtools. | ||
82 | |||
83 | If you want to use such a RAID-1 set, say Y. To compile this code | ||
84 | as a module, choose M here: the module will be called raid1. | ||
85 | |||
86 | If unsure, say Y. | ||
87 | |||
88 | config MD_RAID10 | ||
89 | tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)" | ||
90 | depends on BLK_DEV_MD && EXPERIMENTAL | ||
91 | ---help--- | ||
92 | RAID-10 provides a combination of striping (RAID-0) and | ||
93 | mirroring (RAID-1) with easier configuration and a more flexible | ||
94 | layout. | ||
95 | Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to | ||
96 | be the same size (or at least, only as much space as the smallest | ||
97 | device provides will be used). | ||
98 | RAID-10 provides a variety of layouts that provide different levels | ||
99 | of redundancy and performance. | ||
100 | |||
101 | RAID-10 requires mdadm-1.7.0 or later, available at: | ||
102 | |||
103 | ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/ | ||
104 | |||
105 | If unsure, say Y. | ||
106 | |||
107 | config MD_RAID5 | ||
108 | tristate "RAID-4/RAID-5 mode" | ||
109 | depends on BLK_DEV_MD | ||
110 | ---help--- | ||
111 | A RAID-5 set of N drives with a capacity of C MB per drive provides | ||
112 | the capacity of C * (N - 1) MB, and protects against a failure | ||
113 | of a single drive. For a given sector (row) number, (N - 1) drives | ||
114 | contain data sectors, and one drive contains the parity protection. | ||
115 | For a RAID-4 set, the parity blocks are present on a single drive, | ||
116 | while a RAID-5 set distributes the parity across the drives in one | ||
117 | of the available parity distribution methods. | ||
118 | |||
119 | Information about Software RAID on Linux is contained in the | ||
120 | Software-RAID mini-HOWTO, available from | ||
121 | <http://www.tldp.org/docs.html#howto>. There you will also | ||
122 | learn where to get the supporting user space utilities raidtools. | ||
123 | |||
124 | If you want to use such a RAID-4/RAID-5 set, say Y. To | ||
125 | compile this code as a module, choose M here: the module | ||
126 | will be called raid5. | ||
127 | |||
128 | If unsure, say Y. | ||
129 | |||
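A worked example for orientation (illustrative numbers, not part of the original help text): four drives of 500 MB each give C * (N - 1) = 500 * 3 = 1500 MB of usable capacity, with one drive's worth of space consumed by parity, and the set survives the failure of any single drive.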
130 | config MD_RAID6 | ||
131 | tristate "RAID-6 mode" | ||
132 | depends on BLK_DEV_MD | ||
133 | ---help--- | ||
134 | A RAID-6 set of N drives with a capacity of C MB per drive | ||
135 | provides the capacity of C * (N - 2) MB, and protects | ||
136 | against a failure of any two drives. For a given sector | ||
137 | (row) number, (N - 2) drives contain data sectors, and two | ||
138 | drives contain two independent redundancy syndromes. Like | ||
139 | RAID-5, RAID-6 distributes the syndromes across the drives | ||
140 | in one of the available parity distribution methods. | ||
141 | |||
142 | RAID-6 requires mdadm-1.5.0 or later, available at: | ||
143 | |||
144 | ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/ | ||
145 | |||
146 | If you want to use such a RAID-6 set, say Y. To compile | ||
147 | this code as a module, choose M here: the module will be | ||
148 | called raid6. | ||
149 | |||
150 | If unsure, say Y. | ||
151 | |||
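For comparison (again illustrative): the same four 500 MB drives arranged as RAID-6 give C * (N - 2) = 1000 MB of usable capacity, but the two syndromes allow any two of the drives to fail without data loss.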
152 | config MD_MULTIPATH | ||
153 | tristate "Multipath I/O support" | ||
154 | depends on BLK_DEV_MD | ||
155 | help | ||
156 | Multipath-IO is the ability of certain devices to address the same | ||
157 | physical disk over multiple 'IO paths'. The code ensures that such | ||
158 | paths can be defined and handled at runtime, and ensures that a | ||
159 | transparent failover to the backup path(s) happens if an I/O error | ||
160 | arrives on the primary path. | ||
161 | |||
162 | If unsure, say N. | ||
163 | |||
164 | config MD_FAULTY | ||
165 | tristate "Faulty test module for MD" | ||
166 | depends on BLK_DEV_MD | ||
167 | help | ||
168 | The "faulty" module allows for a block device that occasionally returns | ||
169 | read or write errors. It is useful for testing. | ||
170 | |||
171 | If unsure, say N. | ||
172 | |||
173 | config BLK_DEV_DM | ||
174 | tristate "Device mapper support" | ||
175 | depends on MD | ||
176 | ---help--- | ||
177 | Device-mapper is a low level volume manager. It works by allowing | ||
178 | people to specify mappings for ranges of logical sectors. Various | ||
179 | mapping types are available; in addition, people may write their own | ||
180 | modules containing custom mappings if they wish. | ||
181 | |||
182 | Higher level volume managers such as LVM2 use this driver. | ||
183 | |||
184 | To compile this as a module, choose M here: the module will be | ||
185 | called dm-mod. | ||
186 | |||
187 | If unsure, say N. | ||
188 | |||
189 | config DM_CRYPT | ||
190 | tristate "Crypt target support" | ||
191 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
192 | select CRYPTO | ||
193 | ---help--- | ||
194 | This device-mapper target allows you to create a device that | ||
195 | transparently encrypts the data on it. You'll need to activate | ||
196 | the ciphers you're going to use in the cryptoapi configuration. | ||
197 | |||
198 | Information on how to use dm-crypt can be found on | ||
199 | |||
200 | <http://www.saout.de/misc/dm-crypt/> | ||
201 | |||
202 | To compile this code as a module, choose M here: the module will | ||
203 | be called dm-crypt. | ||
204 | |||
205 | If unsure, say N. | ||
206 | |||
207 | config DM_SNAPSHOT | ||
208 | tristate "Snapshot target (EXPERIMENTAL)" | ||
209 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
210 | ---help--- | ||
211 | Allow volume managers to take writeable snapshots of a device. | ||
212 | |||
213 | config DM_MIRROR | ||
214 | tristate "Mirror target (EXPERIMENTAL)" | ||
215 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
216 | ---help--- | ||
217 | Allow volume managers to mirror logical volumes; this is also | ||
218 | needed for live data migration tools such as 'pvmove'. | ||
219 | |||
220 | config DM_ZERO | ||
221 | tristate "Zero target (EXPERIMENTAL)" | ||
222 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
223 | ---help--- | ||
224 | A target that discards writes, and returns all zeroes for | ||
225 | reads. Useful in some recovery situations. | ||
226 | |||
227 | config DM_MULTIPATH | ||
228 | tristate "Multipath target (EXPERIMENTAL)" | ||
229 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
230 | ---help--- | ||
231 | Allow volume managers to support multipath hardware. | ||
232 | |||
233 | config DM_MULTIPATH_EMC | ||
234 | tristate "EMC CX/AX multipath support (EXPERIMENTAL)" | ||
235 | depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL | ||
236 | ---help--- | ||
237 | Multipath support for EMC CX/AX series hardware. | ||
238 | |||
239 | endmenu | ||
240 | |||
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
new file mode 100644
index 000000000000..90de9c146a5f
--- /dev/null
+++ b/drivers/md/Makefile
@@ -0,0 +1,107 @@ | |||
1 | # | ||
2 | # Makefile for the kernel software RAID and LVM drivers. | ||
3 | # | ||
4 | |||
5 | dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ | ||
6 | dm-ioctl.o dm-io.o kcopyd.o | ||
7 | dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o | ||
8 | dm-snapshot-objs := dm-snap.o dm-exception-store.o | ||
9 | dm-mirror-objs := dm-log.o dm-raid1.o | ||
10 | raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \ | ||
11 | raid6int1.o raid6int2.o raid6int4.o \ | ||
12 | raid6int8.o raid6int16.o raid6int32.o \ | ||
13 | raid6altivec1.o raid6altivec2.o raid6altivec4.o \ | ||
14 | raid6altivec8.o \ | ||
15 | raid6mmx.o raid6sse1.o raid6sse2.o | ||
16 | hostprogs-y := mktables | ||
17 | |||
18 | # Note: link order is important. All raid personalities | ||
19 | # and xor.o must come before md.o, as they each initialise | ||
20 | # themselves, and md.o may use the personalities when it | ||
21 | # auto-initialises. | ||
22 | |||
23 | obj-$(CONFIG_MD_LINEAR) += linear.o | ||
24 | obj-$(CONFIG_MD_RAID0) += raid0.o | ||
25 | obj-$(CONFIG_MD_RAID1) += raid1.o | ||
26 | obj-$(CONFIG_MD_RAID10) += raid10.o | ||
27 | obj-$(CONFIG_MD_RAID5) += raid5.o xor.o | ||
28 | obj-$(CONFIG_MD_RAID6) += raid6.o xor.o | ||
29 | obj-$(CONFIG_MD_MULTIPATH) += multipath.o | ||
30 | obj-$(CONFIG_MD_FAULTY) += faulty.o | ||
31 | obj-$(CONFIG_BLK_DEV_MD) += md.o | ||
32 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | ||
33 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o | ||
34 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o | ||
35 | obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o | ||
36 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | ||
37 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o | ||
38 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | ||
39 | |||
40 | quiet_cmd_unroll = UNROLL $@ | ||
41 | cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \ | ||
42 | < $< > $@ || ( rm -f $@ && exit 1 ) | ||
43 | |||
44 | ifeq ($(CONFIG_ALTIVEC),y) | ||
45 | altivec_flags := -maltivec -mabi=altivec | ||
46 | endif | ||
47 | |||
48 | targets += raid6int1.c | ||
49 | $(obj)/raid6int1.c: UNROLL := 1 | ||
50 | $(obj)/raid6int1.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE | ||
51 | $(call if_changed,unroll) | ||
52 | |||
53 | targets += raid6int2.c | ||
54 | $(obj)/raid6int2.c: UNROLL := 2 | ||
55 | $(obj)/raid6int2.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE | ||
56 | $(call if_changed,unroll) | ||
57 | |||
58 | targets += raid6int4.c | ||
59 | $(obj)/raid6int4.c: UNROLL := 4 | ||
60 | $(obj)/raid6int4.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE | ||
61 | $(call if_changed,unroll) | ||
62 | |||
63 | targets += raid6int8.c | ||
64 | $(obj)/raid6int8.c: UNROLL := 8 | ||
65 | $(obj)/raid6int8.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE | ||
66 | $(call if_changed,unroll) | ||
67 | |||
68 | targets += raid6int16.c | ||
69 | $(obj)/raid6int16.c: UNROLL := 16 | ||
70 | $(obj)/raid6int16.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE | ||
71 | $(call if_changed,unroll) | ||
72 | |||
73 | targets += raid6int32.c | ||
74 | $(obj)/raid6int32.c: UNROLL := 32 | ||
75 | $(obj)/raid6int32.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE | ||
76 | $(call if_changed,unroll) | ||
77 | |||
78 | CFLAGS_raid6altivec1.o += $(altivec_flags) | ||
79 | targets += raid6altivec1.c | ||
80 | $(obj)/raid6altivec1.c: UNROLL := 1 | ||
81 | $(obj)/raid6altivec1.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE | ||
82 | $(call if_changed,unroll) | ||
83 | |||
84 | CFLAGS_raid6altivec2.o += $(altivec_flags) | ||
85 | targets += raid6altivec2.c | ||
86 | $(obj)/raid6altivec2.c: UNROLL := 2 | ||
87 | $(obj)/raid6altivec2.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE | ||
88 | $(call if_changed,unroll) | ||
89 | |||
90 | CFLAGS_raid6altivec4.o += $(altivec_flags) | ||
91 | targets += raid6altivec4.c | ||
92 | $(obj)/raid6altivec4.c: UNROLL := 4 | ||
93 | $(obj)/raid6altivec4.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE | ||
94 | $(call if_changed,unroll) | ||
95 | |||
96 | CFLAGS_raid6altivec8.o += $(altivec_flags) | ||
97 | targets += raid6altivec8.c | ||
98 | $(obj)/raid6altivec8.c: UNROLL := 8 | ||
99 | $(obj)/raid6altivec8.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE | ||
100 | $(call if_changed,unroll) | ||
101 | |||
102 | quiet_cmd_mktable = TABLE $@ | ||
103 | cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) | ||
104 | |||
105 | targets += raid6tables.c | ||
106 | $(obj)/raid6tables.c: $(obj)/mktables FORCE | ||
107 | $(call if_changed,mktable) | ||
diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h
new file mode 100644
index 000000000000..bc021e1fd4d1
--- /dev/null
+++ b/drivers/md/dm-bio-list.h
@@ -0,0 +1,68 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2004 Red Hat UK Ltd. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef DM_BIO_LIST_H | ||
8 | #define DM_BIO_LIST_H | ||
9 | |||
10 | #include <linux/bio.h> | ||
11 | |||
12 | struct bio_list { | ||
13 | struct bio *head; | ||
14 | struct bio *tail; | ||
15 | }; | ||
16 | |||
17 | static inline void bio_list_init(struct bio_list *bl) | ||
18 | { | ||
19 | bl->head = bl->tail = NULL; | ||
20 | } | ||
21 | |||
22 | static inline void bio_list_add(struct bio_list *bl, struct bio *bio) | ||
23 | { | ||
24 | bio->bi_next = NULL; | ||
25 | |||
26 | if (bl->tail) | ||
27 | bl->tail->bi_next = bio; | ||
28 | else | ||
29 | bl->head = bio; | ||
30 | |||
31 | bl->tail = bio; | ||
32 | } | ||
33 | |||
34 | static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) | ||
35 | { | ||
36 | if (bl->tail) | ||
37 | bl->tail->bi_next = bl2->head; | ||
38 | else | ||
39 | bl->head = bl2->head; | ||
40 | |||
41 | bl->tail = bl2->tail; | ||
42 | } | ||
43 | |||
44 | static inline struct bio *bio_list_pop(struct bio_list *bl) | ||
45 | { | ||
46 | struct bio *bio = bl->head; | ||
47 | |||
48 | if (bio) { | ||
49 | bl->head = bl->head->bi_next; | ||
50 | if (!bl->head) | ||
51 | bl->tail = NULL; | ||
52 | |||
53 | bio->bi_next = NULL; | ||
54 | } | ||
55 | |||
56 | return bio; | ||
57 | } | ||
58 | |||
59 | static inline struct bio *bio_list_get(struct bio_list *bl) | ||
60 | { | ||
61 | struct bio *bio = bl->head; | ||
62 | |||
63 | bl->head = bl->tail = NULL; | ||
64 | |||
65 | return bio; | ||
66 | } | ||
67 | |||
68 | #endif | ||
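A minimal usage sketch (hypothetical caller and function names, not part of this patch) showing how a target might park bios on a bio_list and drain it later with the helpers defined above:

#include "dm-bio-list.h"

/* Hypothetical example: defer incoming bios and resubmit them later. */
static struct bio_list deferred_bios;

static void example_defer_init(void)
{
	bio_list_init(&deferred_bios);
}

static void example_defer_bio(struct bio *bio)
{
	/* appends at the tail; bio_list_add() also clears bio->bi_next */
	bio_list_add(&deferred_bios, bio);
}

static void example_flush_deferred(void)
{
	struct bio *bio;

	/* pop from the head until the list is empty */
	while ((bio = bio_list_pop(&deferred_bios)))
		generic_make_request(bio);
}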
diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
new file mode 100644
index 000000000000..d3ec217847d6
--- /dev/null
+++ b/drivers/md/dm-bio-record.h
@@ -0,0 +1,45 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef DM_BIO_RECORD_H | ||
8 | #define DM_BIO_RECORD_H | ||
9 | |||
10 | #include <linux/bio.h> | ||
11 | |||
12 | /* | ||
13 | * There are lots of mutable fields in the bio struct that get | ||
14 | * changed by the lower levels of the block layer. Some targets, | ||
15 | * such as multipath, may wish to resubmit a bio on error. The | ||
16 | * functions in this file help the target record and restore the | ||
17 | * original bio state. | ||
18 | */ | ||
19 | struct dm_bio_details { | ||
20 | sector_t bi_sector; | ||
21 | struct block_device *bi_bdev; | ||
22 | unsigned int bi_size; | ||
23 | unsigned short bi_idx; | ||
24 | unsigned long bi_flags; | ||
25 | }; | ||
26 | |||
27 | static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) | ||
28 | { | ||
29 | bd->bi_sector = bio->bi_sector; | ||
30 | bd->bi_bdev = bio->bi_bdev; | ||
31 | bd->bi_size = bio->bi_size; | ||
32 | bd->bi_idx = bio->bi_idx; | ||
33 | bd->bi_flags = bio->bi_flags; | ||
34 | } | ||
35 | |||
36 | static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) | ||
37 | { | ||
38 | bio->bi_sector = bd->bi_sector; | ||
39 | bio->bi_bdev = bd->bi_bdev; | ||
40 | bio->bi_size = bd->bi_size; | ||
41 | bio->bi_idx = bd->bi_idx; | ||
42 | bio->bi_flags = bd->bi_flags; | ||
43 | } | ||
44 | |||
45 | #endif | ||
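A minimal sketch of the intended use (hypothetical function, not part of this patch): snapshot the bio before the target remaps it, and restore the snapshot if the I/O has to be retried:

#include "dm-bio-record.h"

/* Hypothetical example: save and restore a bio around a failed submission. */
static void example_record_and_retry(struct bio *bio)
{
	struct dm_bio_details details;

	/* snapshot bi_sector, bi_bdev, bi_size, bi_idx and bi_flags */
	dm_bio_record(&details, bio);

	/* ... the target remaps and submits the bio; assume the I/O fails ... */

	/* undo the block layer's modifications so the bio can be resubmitted */
	dm_bio_restore(&details, bio);
}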
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
new file mode 100644
index 000000000000..77619a56e2bf
--- /dev/null
+++ b/drivers/md/dm-crypt.c
@@ -0,0 +1,977 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Christophe Saout <christophe@saout.de> | ||
3 | * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | */ | ||
7 | |||
8 | #include <linux/module.h> | ||
9 | #include <linux/init.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/bio.h> | ||
12 | #include <linux/blkdev.h> | ||
13 | #include <linux/mempool.h> | ||
14 | #include <linux/slab.h> | ||
15 | #include <linux/crypto.h> | ||
16 | #include <linux/workqueue.h> | ||
17 | #include <asm/atomic.h> | ||
18 | #include <asm/scatterlist.h> | ||
19 | #include <asm/page.h> | ||
20 | |||
21 | #include "dm.h" | ||
22 | |||
23 | #define PFX "crypt: " | ||
24 | |||
25 | /* | ||
26 | * per bio private data | ||
27 | */ | ||
28 | struct crypt_io { | ||
29 | struct dm_target *target; | ||
30 | struct bio *bio; | ||
31 | struct bio *first_clone; | ||
32 | struct work_struct work; | ||
33 | atomic_t pending; | ||
34 | int error; | ||
35 | }; | ||
36 | |||
37 | /* | ||
38 | * context holding the current state of a multi-part conversion | ||
39 | */ | ||
40 | struct convert_context { | ||
41 | struct bio *bio_in; | ||
42 | struct bio *bio_out; | ||
43 | unsigned int offset_in; | ||
44 | unsigned int offset_out; | ||
45 | unsigned int idx_in; | ||
46 | unsigned int idx_out; | ||
47 | sector_t sector; | ||
48 | int write; | ||
49 | }; | ||
50 | |||
51 | struct crypt_config; | ||
52 | |||
53 | struct crypt_iv_operations { | ||
54 | int (*ctr)(struct crypt_config *cc, struct dm_target *ti, | ||
55 | const char *opts); | ||
56 | void (*dtr)(struct crypt_config *cc); | ||
57 | const char *(*status)(struct crypt_config *cc); | ||
58 | int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); | ||
59 | }; | ||
60 | |||
61 | /* | ||
62 | * Crypt: maps a linear range of a block device | ||
63 | * and encrypts / decrypts at the same time. | ||
64 | */ | ||
65 | struct crypt_config { | ||
66 | struct dm_dev *dev; | ||
67 | sector_t start; | ||
68 | |||
69 | /* | ||
70 | * pool for per bio private data and | ||
71 | * for encryption buffer pages | ||
72 | */ | ||
73 | mempool_t *io_pool; | ||
74 | mempool_t *page_pool; | ||
75 | |||
76 | /* | ||
77 | * crypto related data | ||
78 | */ | ||
79 | struct crypt_iv_operations *iv_gen_ops; | ||
80 | char *iv_mode; | ||
81 | void *iv_gen_private; | ||
82 | sector_t iv_offset; | ||
83 | unsigned int iv_size; | ||
84 | |||
85 | struct crypto_tfm *tfm; | ||
86 | unsigned int key_size; | ||
87 | u8 key[0]; | ||
88 | }; | ||
89 | |||
90 | #define MIN_IOS 256 | ||
91 | #define MIN_POOL_PAGES 32 | ||
92 | #define MIN_BIO_PAGES 8 | ||
93 | |||
94 | static kmem_cache_t *_crypt_io_pool; | ||
95 | |||
96 | /* | ||
97 | * Mempool alloc and free functions for the page | ||
98 | */ | ||
99 | static void *mempool_alloc_page(unsigned int __nocast gfp_mask, void *data) | ||
100 | { | ||
101 | return alloc_page(gfp_mask); | ||
102 | } | ||
103 | |||
104 | static void mempool_free_page(void *page, void *data) | ||
105 | { | ||
106 | __free_page(page); | ||
107 | } | ||
108 | |||
109 | |||
110 | /* | ||
111 | * Different IV generation algorithms: | ||
112 | * | ||
113 | * plain: the initial vector is the 32-bit low-endian version of the sector | ||
114 | * number, padded with zeros if necessary. | ||
115 | * | ||
116 | * ess_iv: "encrypted sector|salt initial vector", the sector number is | ||
117 | * encrypted with the bulk cipher using a salt as key. The salt | ||
118 | * should be derived from the bulk cipher's key via hashing. | ||
119 | * | ||
120 | * plumb: unimplemented, see: | ||
121 | * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 | ||
122 | */ | ||
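/*
 * Worked example (illustrative): with a 16-byte IV, sector 5 gives the
 * "plain" IV bytes 05 00 00 00 followed by twelve zero bytes, i.e. the
 * low 32 bits of the sector number in little-endian order, zero padded.
 */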
123 | |||
124 | static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | ||
125 | { | ||
126 | memset(iv, 0, cc->iv_size); | ||
127 | *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); | ||
128 | |||
129 | return 0; | ||
130 | } | ||
131 | |||
132 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | ||
133 | const char *opts) | ||
134 | { | ||
135 | struct crypto_tfm *essiv_tfm; | ||
136 | struct crypto_tfm *hash_tfm; | ||
137 | struct scatterlist sg; | ||
138 | unsigned int saltsize; | ||
139 | u8 *salt; | ||
140 | |||
141 | if (opts == NULL) { | ||
142 | ti->error = PFX "Digest algorithm missing for ESSIV mode"; | ||
143 | return -EINVAL; | ||
144 | } | ||
145 | |||
146 | /* Hash the cipher key with the given hash algorithm */ | ||
147 | hash_tfm = crypto_alloc_tfm(opts, 0); | ||
148 | if (hash_tfm == NULL) { | ||
149 | ti->error = PFX "Error initializing ESSIV hash"; | ||
150 | return -EINVAL; | ||
151 | } | ||
152 | |||
153 | if (crypto_tfm_alg_type(hash_tfm) != CRYPTO_ALG_TYPE_DIGEST) { | ||
154 | ti->error = PFX "Expected digest algorithm for ESSIV hash"; | ||
155 | crypto_free_tfm(hash_tfm); | ||
156 | return -EINVAL; | ||
157 | } | ||
158 | |||
159 | saltsize = crypto_tfm_alg_digestsize(hash_tfm); | ||
160 | salt = kmalloc(saltsize, GFP_KERNEL); | ||
161 | if (salt == NULL) { | ||
162 | ti->error = PFX "Error kmallocing salt storage in ESSIV"; | ||
163 | crypto_free_tfm(hash_tfm); | ||
164 | return -ENOMEM; | ||
165 | } | ||
166 | |||
167 | sg.page = virt_to_page(cc->key); | ||
168 | sg.offset = offset_in_page(cc->key); | ||
169 | sg.length = cc->key_size; | ||
170 | crypto_digest_digest(hash_tfm, &sg, 1, salt); | ||
171 | crypto_free_tfm(hash_tfm); | ||
172 | |||
173 | /* Setup the essiv_tfm with the given salt */ | ||
174 | essiv_tfm = crypto_alloc_tfm(crypto_tfm_alg_name(cc->tfm), | ||
175 | CRYPTO_TFM_MODE_ECB); | ||
176 | if (essiv_tfm == NULL) { | ||
177 | ti->error = PFX "Error allocating crypto tfm for ESSIV"; | ||
178 | kfree(salt); | ||
179 | return -EINVAL; | ||
180 | } | ||
181 | if (crypto_tfm_alg_blocksize(essiv_tfm) | ||
182 | != crypto_tfm_alg_ivsize(cc->tfm)) { | ||
183 | ti->error = PFX "Block size of ESSIV cipher does " | ||
184 | "not match IV size of block cipher"; | ||
185 | crypto_free_tfm(essiv_tfm); | ||
186 | kfree(salt); | ||
187 | return -EINVAL; | ||
188 | } | ||
189 | if (crypto_cipher_setkey(essiv_tfm, salt, saltsize) < 0) { | ||
190 | ti->error = PFX "Failed to set key for ESSIV cipher"; | ||
191 | crypto_free_tfm(essiv_tfm); | ||
192 | kfree(salt); | ||
193 | return -EINVAL; | ||
194 | } | ||
195 | kfree(salt); | ||
196 | |||
197 | cc->iv_gen_private = (void *)essiv_tfm; | ||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) | ||
202 | { | ||
203 | crypto_free_tfm((struct crypto_tfm *)cc->iv_gen_private); | ||
204 | cc->iv_gen_private = NULL; | ||
205 | } | ||
206 | |||
207 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | ||
208 | { | ||
209 | struct scatterlist sg = { NULL, }; | ||
210 | |||
211 | memset(iv, 0, cc->iv_size); | ||
212 | *(u64 *)iv = cpu_to_le64(sector); | ||
213 | |||
214 | sg.page = virt_to_page(iv); | ||
215 | sg.offset = offset_in_page(iv); | ||
216 | sg.length = cc->iv_size; | ||
217 | crypto_cipher_encrypt((struct crypto_tfm *)cc->iv_gen_private, | ||
218 | &sg, &sg, cc->iv_size); | ||
219 | |||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | static struct crypt_iv_operations crypt_iv_plain_ops = { | ||
224 | .generator = crypt_iv_plain_gen | ||
225 | }; | ||
226 | |||
227 | static struct crypt_iv_operations crypt_iv_essiv_ops = { | ||
228 | .ctr = crypt_iv_essiv_ctr, | ||
229 | .dtr = crypt_iv_essiv_dtr, | ||
230 | .generator = crypt_iv_essiv_gen | ||
231 | }; | ||
232 | |||
233 | |||
234 | static inline int | ||
235 | crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out, | ||
236 | struct scatterlist *in, unsigned int length, | ||
237 | int write, sector_t sector) | ||
238 | { | ||
239 | u8 iv[cc->iv_size]; | ||
240 | int r; | ||
241 | |||
242 | if (cc->iv_gen_ops) { | ||
243 | r = cc->iv_gen_ops->generator(cc, iv, sector); | ||
244 | if (r < 0) | ||
245 | return r; | ||
246 | |||
247 | if (write) | ||
248 | r = crypto_cipher_encrypt_iv(cc->tfm, out, in, length, iv); | ||
249 | else | ||
250 | r = crypto_cipher_decrypt_iv(cc->tfm, out, in, length, iv); | ||
251 | } else { | ||
252 | if (write) | ||
253 | r = crypto_cipher_encrypt(cc->tfm, out, in, length); | ||
254 | else | ||
255 | r = crypto_cipher_decrypt(cc->tfm, out, in, length); | ||
256 | } | ||
257 | |||
258 | return r; | ||
259 | } | ||
260 | |||
261 | static void | ||
262 | crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx, | ||
263 | struct bio *bio_out, struct bio *bio_in, | ||
264 | sector_t sector, int write) | ||
265 | { | ||
266 | ctx->bio_in = bio_in; | ||
267 | ctx->bio_out = bio_out; | ||
268 | ctx->offset_in = 0; | ||
269 | ctx->offset_out = 0; | ||
270 | ctx->idx_in = bio_in ? bio_in->bi_idx : 0; | ||
271 | ctx->idx_out = bio_out ? bio_out->bi_idx : 0; | ||
272 | ctx->sector = sector + cc->iv_offset; | ||
273 | ctx->write = write; | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * Encrypt / decrypt data from one bio to another one (can be the same one) | ||
278 | */ | ||
279 | static int crypt_convert(struct crypt_config *cc, | ||
280 | struct convert_context *ctx) | ||
281 | { | ||
282 | int r = 0; | ||
283 | |||
284 | while(ctx->idx_in < ctx->bio_in->bi_vcnt && | ||
285 | ctx->idx_out < ctx->bio_out->bi_vcnt) { | ||
286 | struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); | ||
287 | struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); | ||
288 | struct scatterlist sg_in = { | ||
289 | .page = bv_in->bv_page, | ||
290 | .offset = bv_in->bv_offset + ctx->offset_in, | ||
291 | .length = 1 << SECTOR_SHIFT | ||
292 | }; | ||
293 | struct scatterlist sg_out = { | ||
294 | .page = bv_out->bv_page, | ||
295 | .offset = bv_out->bv_offset + ctx->offset_out, | ||
296 | .length = 1 << SECTOR_SHIFT | ||
297 | }; | ||
298 | |||
299 | ctx->offset_in += sg_in.length; | ||
300 | if (ctx->offset_in >= bv_in->bv_len) { | ||
301 | ctx->offset_in = 0; | ||
302 | ctx->idx_in++; | ||
303 | } | ||
304 | |||
305 | ctx->offset_out += sg_out.length; | ||
306 | if (ctx->offset_out >= bv_out->bv_len) { | ||
307 | ctx->offset_out = 0; | ||
308 | ctx->idx_out++; | ||
309 | } | ||
310 | |||
311 | r = crypt_convert_scatterlist(cc, &sg_out, &sg_in, sg_in.length, | ||
312 | ctx->write, ctx->sector); | ||
313 | if (r < 0) | ||
314 | break; | ||
315 | |||
316 | ctx->sector++; | ||
317 | } | ||
318 | |||
319 | return r; | ||
320 | } | ||
321 | |||
322 | /* | ||
323 | * Generate a new unfragmented bio with the given size | ||
324 | * This should never violate the device limitations | ||
325 | * May return a smaller bio when running out of pages | ||
326 | */ | ||
327 | static struct bio * | ||
328 | crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, | ||
329 | struct bio *base_bio, unsigned int *bio_vec_idx) | ||
330 | { | ||
331 | struct bio *bio; | ||
332 | unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
333 | int gfp_mask = GFP_NOIO | __GFP_HIGHMEM; | ||
334 | unsigned long flags = current->flags; | ||
335 | unsigned int i; | ||
336 | |||
337 | /* | ||
338 | * Tell VM to act less aggressively and fail earlier. | ||
339 | * This is not necessary but increases throughput. | ||
340 | * FIXME: Is this really intelligent? | ||
341 | */ | ||
342 | current->flags &= ~PF_MEMALLOC; | ||
343 | |||
344 | if (base_bio) | ||
345 | bio = bio_clone(base_bio, GFP_NOIO); | ||
346 | else | ||
347 | bio = bio_alloc(GFP_NOIO, nr_iovecs); | ||
348 | if (!bio) { | ||
349 | if (flags & PF_MEMALLOC) | ||
350 | current->flags |= PF_MEMALLOC; | ||
351 | return NULL; | ||
352 | } | ||
353 | |||
354 | /* if the last bio was not complete, continue where that one ended */ | ||
355 | bio->bi_idx = *bio_vec_idx; | ||
356 | bio->bi_vcnt = *bio_vec_idx; | ||
357 | bio->bi_size = 0; | ||
358 | bio->bi_flags &= ~(1 << BIO_SEG_VALID); | ||
359 | |||
360 | /* bio->bi_idx pages have already been allocated */ | ||
361 | size -= bio->bi_idx * PAGE_SIZE; | ||
362 | |||
363 | for(i = bio->bi_idx; i < nr_iovecs; i++) { | ||
364 | struct bio_vec *bv = bio_iovec_idx(bio, i); | ||
365 | |||
366 | bv->bv_page = mempool_alloc(cc->page_pool, gfp_mask); | ||
367 | if (!bv->bv_page) | ||
368 | break; | ||
369 | |||
370 | /* | ||
371 | * if additional pages cannot be allocated without waiting, | ||
372 | * return a partially allocated bio, the caller will then try | ||
373 | * to allocate additional bios while submitting this partial bio | ||
374 | */ | ||
375 | if ((i - bio->bi_idx) == (MIN_BIO_PAGES - 1)) | ||
376 | gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; | ||
377 | |||
378 | bv->bv_offset = 0; | ||
379 | if (size > PAGE_SIZE) | ||
380 | bv->bv_len = PAGE_SIZE; | ||
381 | else | ||
382 | bv->bv_len = size; | ||
383 | |||
384 | bio->bi_size += bv->bv_len; | ||
385 | bio->bi_vcnt++; | ||
386 | size -= bv->bv_len; | ||
387 | } | ||
388 | |||
389 | if (flags & PF_MEMALLOC) | ||
390 | current->flags |= PF_MEMALLOC; | ||
391 | |||
392 | if (!bio->bi_size) { | ||
393 | bio_put(bio); | ||
394 | return NULL; | ||
395 | } | ||
396 | |||
397 | /* | ||
398 | * Remember the last bio_vec allocated to be able | ||
399 | * to correctly continue after the splitting. | ||
400 | */ | ||
401 | *bio_vec_idx = bio->bi_vcnt; | ||
402 | |||
403 | return bio; | ||
404 | } | ||
405 | |||
406 | static void crypt_free_buffer_pages(struct crypt_config *cc, | ||
407 | struct bio *bio, unsigned int bytes) | ||
408 | { | ||
409 | unsigned int i, start, end; | ||
410 | struct bio_vec *bv; | ||
411 | |||
412 | /* | ||
413 | * This is ugly, but Jens Axboe thinks that using bi_idx in the | ||
414 | * endio function is too dangerous at the moment, so I calculate the | ||
415 | * correct position using bi_vcnt and bi_size. | ||
416 | * The bv_offset and bv_len fields might already be modified but we | ||
417 | * know that we always allocated whole pages. | ||
418 | * A fix to the bi_idx issue in the kernel is in the works, so | ||
419 | * we will hopefully be able to revert to the cleaner solution soon. | ||
420 | */ | ||
421 | i = bio->bi_vcnt - 1; | ||
422 | bv = bio_iovec_idx(bio, i); | ||
423 | end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - bio->bi_size; | ||
424 | start = end - bytes; | ||
425 | |||
426 | start >>= PAGE_SHIFT; | ||
427 | if (!bio->bi_size) | ||
428 | end = bio->bi_vcnt; | ||
429 | else | ||
430 | end >>= PAGE_SHIFT; | ||
431 | |||
432 | for(i = start; i < end; i++) { | ||
433 | bv = bio_iovec_idx(bio, i); | ||
434 | BUG_ON(!bv->bv_page); | ||
435 | mempool_free(bv->bv_page, cc->page_pool); | ||
436 | bv->bv_page = NULL; | ||
437 | } | ||
438 | } | ||
439 | |||
440 | /* | ||
441 | * One of the bios was finished. Check for completion of | ||
442 | * the whole request and correctly clean up the buffer. | ||
443 | */ | ||
444 | static void dec_pending(struct crypt_io *io, int error) | ||
445 | { | ||
446 | struct crypt_config *cc = (struct crypt_config *) io->target->private; | ||
447 | |||
448 | if (error < 0) | ||
449 | io->error = error; | ||
450 | |||
451 | if (!atomic_dec_and_test(&io->pending)) | ||
452 | return; | ||
453 | |||
454 | if (io->first_clone) | ||
455 | bio_put(io->first_clone); | ||
456 | |||
457 | bio_endio(io->bio, io->bio->bi_size, io->error); | ||
458 | |||
459 | mempool_free(io, cc->io_pool); | ||
460 | } | ||
461 | |||
462 | /* | ||
463 | * kcryptd: | ||
464 | * | ||
465 | * Needed because it would be very unwise to do decryption in an | ||
466 | * interrupt context, so bios returning from read requests get | ||
467 | * queued here. | ||
468 | */ | ||
469 | static struct workqueue_struct *_kcryptd_workqueue; | ||
470 | |||
471 | static void kcryptd_do_work(void *data) | ||
472 | { | ||
473 | struct crypt_io *io = (struct crypt_io *) data; | ||
474 | struct crypt_config *cc = (struct crypt_config *) io->target->private; | ||
475 | struct convert_context ctx; | ||
476 | int r; | ||
477 | |||
478 | crypt_convert_init(cc, &ctx, io->bio, io->bio, | ||
479 | io->bio->bi_sector - io->target->begin, 0); | ||
480 | r = crypt_convert(cc, &ctx); | ||
481 | |||
482 | dec_pending(io, r); | ||
483 | } | ||
484 | |||
485 | static void kcryptd_queue_io(struct crypt_io *io) | ||
486 | { | ||
487 | INIT_WORK(&io->work, kcryptd_do_work, io); | ||
488 | queue_work(_kcryptd_workqueue, &io->work); | ||
489 | } | ||
490 | |||
491 | /* | ||
492 | * Decode key from its hex representation | ||
493 | */ | ||
494 | static int crypt_decode_key(u8 *key, char *hex, unsigned int size) | ||
495 | { | ||
496 | char buffer[3]; | ||
497 | char *endp; | ||
498 | unsigned int i; | ||
499 | |||
500 | buffer[2] = '\0'; | ||
501 | |||
502 | for(i = 0; i < size; i++) { | ||
503 | buffer[0] = *hex++; | ||
504 | buffer[1] = *hex++; | ||
505 | |||
506 | key[i] = (u8)simple_strtoul(buffer, &endp, 16); | ||
507 | |||
508 | if (endp != &buffer[2]) | ||
509 | return -EINVAL; | ||
510 | } | ||
511 | |||
512 | if (*hex != '\0') | ||
513 | return -EINVAL; | ||
514 | |||
515 | return 0; | ||
516 | } | ||
517 | |||
518 | /* | ||
519 | * Encode key into its hex representation | ||
520 | */ | ||
521 | static void crypt_encode_key(char *hex, u8 *key, unsigned int size) | ||
522 | { | ||
523 | unsigned int i; | ||
524 | |||
525 | for(i = 0; i < size; i++) { | ||
526 | sprintf(hex, "%02x", *key); | ||
527 | hex += 2; | ||
528 | key++; | ||
529 | } | ||
530 | } | ||
531 | |||
532 | /* | ||
533 | * Construct an encryption mapping: | ||
534 | * <cipher> <key> <iv_offset> <dev_path> <start> | ||
535 | */ | ||
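/*
 * Illustrative example only (hypothetical device and key, not part of this
 * patch): a 2048-sector (1 MiB) mapping using AES in CBC mode with an ESSIV
 * derived via sha256 could be described by a table line such as
 *
 *   0 2048 crypt aes-cbc-essiv:sha256 <hex-encoded key> 0 /dev/sdb1 0
 *
 * "aes-cbc-essiv:sha256" is split into cipher, chain mode and IV mode by
 * the constructor below; "-" may be passed instead of a key to select a
 * zero-length key, and the final 0 is the start sector on /dev/sdb1.
 */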
536 | static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
537 | { | ||
538 | struct crypt_config *cc; | ||
539 | struct crypto_tfm *tfm; | ||
540 | char *tmp; | ||
541 | char *cipher; | ||
542 | char *chainmode; | ||
543 | char *ivmode; | ||
544 | char *ivopts; | ||
545 | unsigned int crypto_flags; | ||
546 | unsigned int key_size; | ||
547 | |||
548 | if (argc != 5) { | ||
549 | ti->error = PFX "Not enough arguments"; | ||
550 | return -EINVAL; | ||
551 | } | ||
552 | |||
553 | tmp = argv[0]; | ||
554 | cipher = strsep(&tmp, "-"); | ||
555 | chainmode = strsep(&tmp, "-"); | ||
556 | ivopts = strsep(&tmp, "-"); | ||
557 | ivmode = strsep(&ivopts, ":"); | ||
558 | |||
559 | if (tmp) | ||
560 | DMWARN(PFX "Unexpected additional cipher options"); | ||
561 | |||
562 | key_size = strlen(argv[1]) >> 1; | ||
563 | |||
564 | cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); | ||
565 | if (cc == NULL) { | ||
566 | ti->error = | ||
567 | PFX "Cannot allocate transparent encryption context"; | ||
568 | return -ENOMEM; | ||
569 | } | ||
570 | |||
571 | cc->key_size = key_size; | ||
572 | if ((!key_size && strcmp(argv[1], "-") != 0) || | ||
573 | (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) { | ||
574 | ti->error = PFX "Error decoding key"; | ||
575 | goto bad1; | ||
576 | } | ||
577 | |||
578 | /* Compatibility mode for old dm-crypt cipher strings */ | ||
579 | if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { | ||
580 | chainmode = "cbc"; | ||
581 | ivmode = "plain"; | ||
582 | } | ||
583 | |||
584 | /* Choose crypto_flags according to chainmode */ | ||
585 | if (strcmp(chainmode, "cbc") == 0) | ||
586 | crypto_flags = CRYPTO_TFM_MODE_CBC; | ||
587 | else if (strcmp(chainmode, "ecb") == 0) | ||
588 | crypto_flags = CRYPTO_TFM_MODE_ECB; | ||
589 | else { | ||
590 | ti->error = PFX "Unknown chaining mode"; | ||
591 | goto bad1; | ||
592 | } | ||
593 | |||
594 | if (crypto_flags != CRYPTO_TFM_MODE_ECB && !ivmode) { | ||
595 | ti->error = PFX "This chaining mode requires an IV mechanism"; | ||
596 | goto bad1; | ||
597 | } | ||
598 | |||
599 | tfm = crypto_alloc_tfm(cipher, crypto_flags); | ||
600 | if (!tfm) { | ||
601 | ti->error = PFX "Error allocating crypto tfm"; | ||
602 | goto bad1; | ||
603 | } | ||
604 | if (crypto_tfm_alg_type(tfm) != CRYPTO_ALG_TYPE_CIPHER) { | ||
605 | ti->error = PFX "Expected cipher algorithm"; | ||
606 | goto bad2; | ||
607 | } | ||
608 | |||
609 | cc->tfm = tfm; | ||
610 | |||
611 | /* | ||
612 | * Choose ivmode. Valid modes: "plain", "essiv:<esshash>". | ||
613 | * See the comments at the IV generation code above. | ||
614 | */ | ||
615 | |||
616 | if (ivmode == NULL) | ||
617 | cc->iv_gen_ops = NULL; | ||
618 | else if (strcmp(ivmode, "plain") == 0) | ||
619 | cc->iv_gen_ops = &crypt_iv_plain_ops; | ||
620 | else if (strcmp(ivmode, "essiv") == 0) | ||
621 | cc->iv_gen_ops = &crypt_iv_essiv_ops; | ||
622 | else { | ||
623 | ti->error = PFX "Invalid IV mode"; | ||
624 | goto bad2; | ||
625 | } | ||
626 | |||
627 | if (cc->iv_gen_ops && cc->iv_gen_ops->ctr && | ||
628 | cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) | ||
629 | goto bad2; | ||
630 | |||
631 | if (tfm->crt_cipher.cit_decrypt_iv && tfm->crt_cipher.cit_encrypt_iv) | ||
632 | /* at least a 64 bit sector number should fit in our buffer */ | ||
633 | cc->iv_size = max(crypto_tfm_alg_ivsize(tfm), | ||
634 | (unsigned int)(sizeof(u64) / sizeof(u8))); | ||
635 | else { | ||
636 | cc->iv_size = 0; | ||
637 | if (cc->iv_gen_ops) { | ||
638 | DMWARN(PFX "Selected cipher does not support IVs"); | ||
639 | if (cc->iv_gen_ops->dtr) | ||
640 | cc->iv_gen_ops->dtr(cc); | ||
641 | cc->iv_gen_ops = NULL; | ||
642 | } | ||
643 | } | ||
644 | |||
645 | cc->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, | ||
646 | mempool_free_slab, _crypt_io_pool); | ||
647 | if (!cc->io_pool) { | ||
648 | ti->error = PFX "Cannot allocate crypt io mempool"; | ||
649 | goto bad3; | ||
650 | } | ||
651 | |||
652 | cc->page_pool = mempool_create(MIN_POOL_PAGES, mempool_alloc_page, | ||
653 | mempool_free_page, NULL); | ||
654 | if (!cc->page_pool) { | ||
655 | ti->error = PFX "Cannot allocate page mempool"; | ||
656 | goto bad4; | ||
657 | } | ||
658 | |||
659 | if (tfm->crt_cipher.cit_setkey(tfm, cc->key, key_size) < 0) { | ||
660 | ti->error = PFX "Error setting key"; | ||
661 | goto bad5; | ||
662 | } | ||
663 | |||
664 | if (sscanf(argv[2], SECTOR_FORMAT, &cc->iv_offset) != 1) { | ||
665 | ti->error = PFX "Invalid iv_offset sector"; | ||
666 | goto bad5; | ||
667 | } | ||
668 | |||
669 | if (sscanf(argv[4], SECTOR_FORMAT, &cc->start) != 1) { | ||
670 | ti->error = PFX "Invalid device sector"; | ||
671 | goto bad5; | ||
672 | } | ||
673 | |||
674 | if (dm_get_device(ti, argv[3], cc->start, ti->len, | ||
675 | dm_table_get_mode(ti->table), &cc->dev)) { | ||
676 | ti->error = PFX "Device lookup failed"; | ||
677 | goto bad5; | ||
678 | } | ||
679 | |||
680 | if (ivmode && cc->iv_gen_ops) { | ||
681 | if (ivopts) | ||
682 | *(ivopts - 1) = ':'; | ||
683 | cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL); | ||
684 | if (!cc->iv_mode) { | ||
685 | ti->error = PFX "Error kmallocing iv_mode string"; | ||
686 | goto bad5; | ||
687 | } | ||
688 | strcpy(cc->iv_mode, ivmode); | ||
689 | } else | ||
690 | cc->iv_mode = NULL; | ||
691 | |||
692 | ti->private = cc; | ||
693 | return 0; | ||
694 | |||
695 | bad5: | ||
696 | mempool_destroy(cc->page_pool); | ||
697 | bad4: | ||
698 | mempool_destroy(cc->io_pool); | ||
699 | bad3: | ||
700 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) | ||
701 | cc->iv_gen_ops->dtr(cc); | ||
702 | bad2: | ||
703 | crypto_free_tfm(tfm); | ||
704 | bad1: | ||
705 | kfree(cc); | ||
706 | return -EINVAL; | ||
707 | } | ||
708 | |||
709 | static void crypt_dtr(struct dm_target *ti) | ||
710 | { | ||
711 | struct crypt_config *cc = (struct crypt_config *) ti->private; | ||
712 | |||
713 | mempool_destroy(cc->page_pool); | ||
714 | mempool_destroy(cc->io_pool); | ||
715 | |||
716 | if (cc->iv_mode) | ||
717 | kfree(cc->iv_mode); | ||
718 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) | ||
719 | cc->iv_gen_ops->dtr(cc); | ||
720 | crypto_free_tfm(cc->tfm); | ||
721 | dm_put_device(ti, cc->dev); | ||
722 | kfree(cc); | ||
723 | } | ||
724 | |||
725 | static int crypt_endio(struct bio *bio, unsigned int done, int error) | ||
726 | { | ||
727 | struct crypt_io *io = (struct crypt_io *) bio->bi_private; | ||
728 | struct crypt_config *cc = (struct crypt_config *) io->target->private; | ||
729 | |||
730 | if (bio_data_dir(bio) == WRITE) { | ||
731 | /* | ||
732 | * free the processed pages, even if | ||
733 | * it's only a partially completed write | ||
734 | */ | ||
735 | crypt_free_buffer_pages(cc, bio, done); | ||
736 | } | ||
737 | |||
738 | if (bio->bi_size) | ||
739 | return 1; | ||
740 | |||
741 | bio_put(bio); | ||
742 | |||
743 | /* | ||
744 | * successful reads are decrypted by the worker thread | ||
745 | */ | ||
746 | if ((bio_data_dir(bio) == READ) | ||
747 | && bio_flagged(bio, BIO_UPTODATE)) { | ||
748 | kcryptd_queue_io(io); | ||
749 | return 0; | ||
750 | } | ||
751 | |||
752 | dec_pending(io, error); | ||
753 | return error; | ||
754 | } | ||
755 | |||
756 | static inline struct bio * | ||
757 | crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct bio *bio, | ||
758 | sector_t sector, unsigned int *bvec_idx, | ||
759 | struct convert_context *ctx) | ||
760 | { | ||
761 | struct bio *clone; | ||
762 | |||
763 | if (bio_data_dir(bio) == WRITE) { | ||
764 | clone = crypt_alloc_buffer(cc, bio->bi_size, | ||
765 | io->first_clone, bvec_idx); | ||
766 | if (clone) { | ||
767 | ctx->bio_out = clone; | ||
768 | if (crypt_convert(cc, ctx) < 0) { | ||
769 | crypt_free_buffer_pages(cc, clone, | ||
770 | clone->bi_size); | ||
771 | bio_put(clone); | ||
772 | return NULL; | ||
773 | } | ||
774 | } | ||
775 | } else { | ||
776 | /* | ||
777 | * The block layer might modify the bvec array, so always | ||
778 | * copy the required bvecs because we need the original | ||
779 | * one in order to decrypt the whole bio data *afterwards*. | ||
780 | */ | ||
781 | clone = bio_alloc(GFP_NOIO, bio_segments(bio)); | ||
782 | if (clone) { | ||
783 | clone->bi_idx = 0; | ||
784 | clone->bi_vcnt = bio_segments(bio); | ||
785 | clone->bi_size = bio->bi_size; | ||
786 | memcpy(clone->bi_io_vec, bio_iovec(bio), | ||
787 | sizeof(struct bio_vec) * clone->bi_vcnt); | ||
788 | } | ||
789 | } | ||
790 | |||
791 | if (!clone) | ||
792 | return NULL; | ||
793 | |||
794 | clone->bi_private = io; | ||
795 | clone->bi_end_io = crypt_endio; | ||
796 | clone->bi_bdev = cc->dev->bdev; | ||
797 | clone->bi_sector = cc->start + sector; | ||
798 | clone->bi_rw = bio->bi_rw; | ||
799 | |||
800 | return clone; | ||
801 | } | ||
802 | |||
803 | static int crypt_map(struct dm_target *ti, struct bio *bio, | ||
804 | union map_info *map_context) | ||
805 | { | ||
806 | struct crypt_config *cc = (struct crypt_config *) ti->private; | ||
807 | struct crypt_io *io = mempool_alloc(cc->io_pool, GFP_NOIO); | ||
808 | struct convert_context ctx; | ||
809 | struct bio *clone; | ||
810 | unsigned int remaining = bio->bi_size; | ||
811 | sector_t sector = bio->bi_sector - ti->begin; | ||
812 | unsigned int bvec_idx = 0; | ||
813 | |||
814 | io->target = ti; | ||
815 | io->bio = bio; | ||
816 | io->first_clone = NULL; | ||
817 | io->error = 0; | ||
818 | atomic_set(&io->pending, 1); /* hold a reference */ | ||
819 | |||
820 | if (bio_data_dir(bio) == WRITE) | ||
821 | crypt_convert_init(cc, &ctx, NULL, bio, sector, 1); | ||
822 | |||
823 | /* | ||
824 | * The allocated buffers can be smaller than the whole bio, | ||
825 | * so repeat the whole process until all the data can be handled. | ||
826 | */ | ||
827 | while (remaining) { | ||
828 | clone = crypt_clone(cc, io, bio, sector, &bvec_idx, &ctx); | ||
829 | if (!clone) | ||
830 | goto cleanup; | ||
831 | |||
832 | if (!io->first_clone) { | ||
833 | /* | ||
834 | * hold a reference to the first clone, because it | ||
835 | * holds the bio_vec array and that can't be freed | ||
836 | * before all other clones are released | ||
837 | */ | ||
838 | bio_get(clone); | ||
839 | io->first_clone = clone; | ||
840 | } | ||
841 | atomic_inc(&io->pending); | ||
842 | |||
843 | remaining -= clone->bi_size; | ||
844 | sector += bio_sectors(clone); | ||
845 | |||
846 | generic_make_request(clone); | ||
847 | |||
848 | /* out of memory -> run queues */ | ||
849 | if (remaining) | ||
850 | blk_congestion_wait(bio_data_dir(clone), HZ/100); | ||
851 | } | ||
852 | |||
853 | /* drop reference, clones could have returned before we reach this */ | ||
854 | dec_pending(io, 0); | ||
855 | return 0; | ||
856 | |||
857 | cleanup: | ||
858 | if (io->first_clone) { | ||
859 | dec_pending(io, -ENOMEM); | ||
860 | return 0; | ||
861 | } | ||
862 | |||
863 | /* if no bio has been dispatched yet, we can directly return the error */ | ||
864 | mempool_free(io, cc->io_pool); | ||
865 | return -ENOMEM; | ||
866 | } | ||
867 | |||
868 | static int crypt_status(struct dm_target *ti, status_type_t type, | ||
869 | char *result, unsigned int maxlen) | ||
870 | { | ||
871 | struct crypt_config *cc = (struct crypt_config *) ti->private; | ||
872 | const char *cipher; | ||
873 | const char *chainmode = NULL; | ||
874 | unsigned int sz = 0; | ||
875 | |||
876 | switch (type) { | ||
877 | case STATUSTYPE_INFO: | ||
878 | result[0] = '\0'; | ||
879 | break; | ||
880 | |||
881 | case STATUSTYPE_TABLE: | ||
882 | cipher = crypto_tfm_alg_name(cc->tfm); | ||
883 | |||
884 | switch(cc->tfm->crt_cipher.cit_mode) { | ||
885 | case CRYPTO_TFM_MODE_CBC: | ||
886 | chainmode = "cbc"; | ||
887 | break; | ||
888 | case CRYPTO_TFM_MODE_ECB: | ||
889 | chainmode = "ecb"; | ||
890 | break; | ||
891 | default: | ||
892 | BUG(); | ||
893 | } | ||
894 | |||
895 | if (cc->iv_mode) | ||
896 | DMEMIT("%s-%s-%s ", cipher, chainmode, cc->iv_mode); | ||
897 | else | ||
898 | DMEMIT("%s-%s ", cipher, chainmode); | ||
899 | |||
900 | if (cc->key_size > 0) { | ||
901 | if ((maxlen - sz) < ((cc->key_size << 1) + 1)) | ||
902 | return -ENOMEM; | ||
903 | |||
904 | crypt_encode_key(result + sz, cc->key, cc->key_size); | ||
905 | sz += cc->key_size << 1; | ||
906 | } else { | ||
907 | if (sz >= maxlen) | ||
908 | return -ENOMEM; | ||
909 | result[sz++] = '-'; | ||
910 | } | ||
911 | |||
912 | DMEMIT(" " SECTOR_FORMAT " %s " SECTOR_FORMAT, | ||
913 | cc->iv_offset, cc->dev->name, cc->start); | ||
914 | break; | ||
915 | } | ||
916 | return 0; | ||
917 | } | ||
918 | |||
919 | static struct target_type crypt_target = { | ||
920 | .name = "crypt", | ||
921 | .version= {1, 1, 0}, | ||
922 | .module = THIS_MODULE, | ||
923 | .ctr = crypt_ctr, | ||
924 | .dtr = crypt_dtr, | ||
925 | .map = crypt_map, | ||
926 | .status = crypt_status, | ||
927 | }; | ||
928 | |||
929 | static int __init dm_crypt_init(void) | ||
930 | { | ||
931 | int r; | ||
932 | |||
933 | _crypt_io_pool = kmem_cache_create("dm-crypt_io", | ||
934 | sizeof(struct crypt_io), | ||
935 | 0, 0, NULL, NULL); | ||
936 | if (!_crypt_io_pool) | ||
937 | return -ENOMEM; | ||
938 | |||
939 | _kcryptd_workqueue = create_workqueue("kcryptd"); | ||
940 | if (!_kcryptd_workqueue) { | ||
941 | r = -ENOMEM; | ||
942 | DMERR(PFX "couldn't create kcryptd"); | ||
943 | goto bad1; | ||
944 | } | ||
945 | |||
946 | r = dm_register_target(&crypt_target); | ||
947 | if (r < 0) { | ||
948 | DMERR(PFX "register failed %d", r); | ||
949 | goto bad2; | ||
950 | } | ||
951 | |||
952 | return 0; | ||
953 | |||
954 | bad2: | ||
955 | destroy_workqueue(_kcryptd_workqueue); | ||
956 | bad1: | ||
957 | kmem_cache_destroy(_crypt_io_pool); | ||
958 | return r; | ||
959 | } | ||
960 | |||
961 | static void __exit dm_crypt_exit(void) | ||
962 | { | ||
963 | int r = dm_unregister_target(&crypt_target); | ||
964 | |||
965 | if (r < 0) | ||
966 | DMERR(PFX "unregister failed %d", r); | ||
967 | |||
968 | destroy_workqueue(_kcryptd_workqueue); | ||
969 | kmem_cache_destroy(_crypt_io_pool); | ||
970 | } | ||
971 | |||
972 | module_init(dm_crypt_init); | ||
973 | module_exit(dm_crypt_exit); | ||
974 | |||
975 | MODULE_AUTHOR("Christophe Saout <christophe@saout.de>"); | ||
976 | MODULE_DESCRIPTION(DM_NAME " target for transparent encryption / decryption"); | ||
977 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c
new file mode 100644
index 000000000000..700658664594
--- /dev/null
+++ b/drivers/md/dm-emc.c
@@ -0,0 +1,359 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2004 SUSE LINUX Products GmbH. All rights reserved. | ||
3 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | * | ||
7 | * Multipath support for EMC CLARiiON AX/CX-series hardware. | ||
8 | */ | ||
9 | |||
10 | #include "dm.h" | ||
11 | #include "dm-hw-handler.h" | ||
12 | #include <scsi/scsi.h> | ||
13 | #include <scsi/scsi_cmnd.h> | ||
14 | |||
15 | struct emc_handler { | ||
16 | spinlock_t lock; | ||
17 | |||
18 | /* Whether we should send the short trespass command (FC-series) | ||
19 | * or the long version (default for AX/CX CLARiiON arrays). */ | ||
20 | unsigned short_trespass; | ||
21 | /* Whether or not to honor SCSI reservations when initiating a | ||
22 | * switch-over. Default: Don't. */ | ||
23 | unsigned hr; | ||
24 | |||
25 | unsigned char sense[SCSI_SENSE_BUFFERSIZE]; | ||
26 | }; | ||
27 | |||
28 | #define TRESPASS_PAGE 0x22 | ||
29 | #define EMC_FAILOVER_TIMEOUT (60 * HZ) | ||
30 | |||
31 | /* Code borrowed from dm-lsi-rdac by Mike Christie */ | ||
32 | |||
33 | static inline void free_bio(struct bio *bio) | ||
34 | { | ||
35 | __free_page(bio->bi_io_vec[0].bv_page); | ||
36 | bio_put(bio); | ||
37 | } | ||
38 | |||
39 | static int emc_endio(struct bio *bio, unsigned int bytes_done, int error) | ||
40 | { | ||
41 | struct path *path = bio->bi_private; | ||
42 | |||
43 | if (bio->bi_size) | ||
44 | return 1; | ||
45 | |||
46 | /* We also need to look at the sense keys here to decide whether | ||
47 | * or not to switch to the next PG etc. | ||
48 | * | ||
49 | * For now simple logic: either it works or it doesn't. | ||
50 | */ | ||
51 | if (error) | ||
52 | dm_pg_init_complete(path, MP_FAIL_PATH); | ||
53 | else | ||
54 | dm_pg_init_complete(path, 0); | ||
55 | |||
56 | /* request is freed in block layer */ | ||
57 | free_bio(bio); | ||
58 | |||
59 | return 0; | ||
60 | } | ||
61 | |||
62 | static struct bio *get_failover_bio(struct path *path, unsigned data_size) | ||
63 | { | ||
64 | struct bio *bio; | ||
65 | struct page *page; | ||
66 | |||
67 | bio = bio_alloc(GFP_ATOMIC, 1); | ||
68 | if (!bio) { | ||
69 | DMERR("dm-emc: get_failover_bio: bio_alloc() failed."); | ||
70 | return NULL; | ||
71 | } | ||
72 | |||
73 | bio->bi_rw |= (1 << BIO_RW); | ||
74 | bio->bi_bdev = path->dev->bdev; | ||
75 | bio->bi_sector = 0; | ||
76 | bio->bi_private = path; | ||
77 | bio->bi_end_io = emc_endio; | ||
78 | |||
79 | page = alloc_page(GFP_ATOMIC); | ||
80 | if (!page) { | ||
81 | DMERR("dm-emc: get_failover_bio: alloc_page() failed."); | ||
82 | bio_put(bio); | ||
83 | return NULL; | ||
84 | } | ||
85 | |||
86 | if (bio_add_page(bio, page, data_size, 0) != data_size) { | ||
87 | DMERR("dm-emc: get_failover_bio: bio_add_page() failed."); | ||
88 | __free_page(page); | ||
89 | bio_put(bio); | ||
90 | return NULL; | ||
91 | } | ||
92 | |||
93 | return bio; | ||
94 | } | ||
95 | |||
96 | static struct request *get_failover_req(struct emc_handler *h, | ||
97 | struct bio *bio, struct path *path) | ||
98 | { | ||
99 | struct request *rq; | ||
100 | struct block_device *bdev = bio->bi_bdev; | ||
101 | struct request_queue *q = bdev_get_queue(bdev); | ||
102 | |||
103 | /* FIXME: Figure out why it fails with GFP_ATOMIC. */ | ||
104 | rq = blk_get_request(q, WRITE, __GFP_WAIT); | ||
105 | if (!rq) { | ||
106 | DMERR("dm-emc: get_failover_req: blk_get_request failed"); | ||
107 | return NULL; | ||
108 | } | ||
109 | |||
110 | rq->bio = rq->biotail = bio; | ||
111 | blk_rq_bio_prep(q, rq, bio); | ||
112 | |||
113 | rq->rq_disk = bdev->bd_contains->bd_disk; | ||
114 | |||
115 | /* bio backed request: don't set data */ | ||
116 | rq->buffer = rq->data = NULL; | ||
117 | /* rq data_len used for pc cmd's request_bufflen */ | ||
118 | rq->data_len = bio->bi_size; | ||
119 | |||
120 | rq->sense = h->sense; | ||
121 | memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE); | ||
122 | rq->sense_len = 0; | ||
123 | |||
124 | memset(&rq->cmd, 0, BLK_MAX_CDB); | ||
125 | |||
126 | rq->timeout = EMC_FAILOVER_TIMEOUT; | ||
127 | rq->flags |= (REQ_BLOCK_PC | REQ_FAILFAST | REQ_NOMERGE); | ||
128 | |||
129 | return rq; | ||
130 | } | ||
131 | |||
132 | static struct request *emc_trespass_get(struct emc_handler *h, | ||
133 | struct path *path) | ||
134 | { | ||
135 | struct bio *bio; | ||
136 | struct request *rq; | ||
137 | unsigned char *page22; | ||
138 | unsigned char long_trespass_pg[] = { | ||
139 | 0, 0, 0, 0, | ||
140 | TRESPASS_PAGE, /* Page code */ | ||
141 | 0x09, /* Page length - 2 */ | ||
142 | h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */ | ||
143 | 0xff, 0xff, /* Trespass target */ | ||
144 | 0, 0, 0, 0, 0, 0 /* Reserved bytes / unknown */ | ||
145 | }; | ||
146 | unsigned char short_trespass_pg[] = { | ||
147 | 0, 0, 0, 0, | ||
148 | TRESPASS_PAGE, /* Page code */ | ||
149 | 0x02, /* Page length - 2 */ | ||
150 | h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */ | ||
151 | 0xff, /* Trespass target */ | ||
152 | }; | ||
153 | unsigned data_size = h->short_trespass ? sizeof(short_trespass_pg) : | ||
154 | sizeof(long_trespass_pg); | ||
155 | |||
156 | /* get bio backing */ | ||
157 | if (data_size > PAGE_SIZE) | ||
158 | /* this should never happen */ | ||
159 | return NULL; | ||
160 | |||
161 | bio = get_failover_bio(path, data_size); | ||
162 | if (!bio) { | ||
163 | DMERR("dm-emc: emc_trespass_get: no bio"); | ||
164 | return NULL; | ||
165 | } | ||
166 | |||
167 | page22 = (unsigned char *)bio_data(bio); | ||
168 | memset(page22, 0, data_size); | ||
169 | |||
170 | memcpy(page22, h->short_trespass ? | ||
171 | short_trespass_pg : long_trespass_pg, data_size); | ||
172 | |||
173 | /* get request for block layer packet command */ | ||
174 | rq = get_failover_req(h, bio, path); | ||
175 | if (!rq) { | ||
176 | DMERR("dm-emc: emc_trespass_get: no rq"); | ||
177 | free_bio(bio); | ||
178 | return NULL; | ||
179 | } | ||
180 | |||
181 | /* Prepare the command. */ | ||
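	/*
	 * MODE SELECT(6): cmd[1] = 0x10 sets the PF (page format) bit and
	 * cmd[4] is the parameter list length, i.e. data_size, the buffer
	 * built above.
	 */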
182 | rq->cmd[0] = MODE_SELECT; | ||
183 | rq->cmd[1] = 0x10; | ||
184 | rq->cmd[4] = data_size; | ||
185 | rq->cmd_len = COMMAND_SIZE(rq->cmd[0]); | ||
186 | |||
187 | return rq; | ||
188 | } | ||
189 | |||
190 | static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed, | ||
191 | struct path *path) | ||
192 | { | ||
193 | struct request *rq; | ||
194 | struct request_queue *q = bdev_get_queue(path->dev->bdev); | ||
195 | |||
196 | /* | ||
197 | * We can either blindly init the pg (then look at the sense), | ||
198 | * or we can send some commands to get the state here (then | ||
199 | * possibly send the failover command), or we can also have the | ||
200 | * initial state passed into us and then get an update here. | ||
201 | */ | ||
202 | if (!q) { | ||
203 | DMINFO("dm-emc: emc_pg_init: no queue"); | ||
204 | goto fail_path; | ||
205 | } | ||
206 | |||
207 | /* FIXME: The request should be pre-allocated. */ | ||
208 | rq = emc_trespass_get(hwh->context, path); | ||
209 | if (!rq) { | ||
210 | DMERR("dm-emc: emc_pg_init: no rq"); | ||
211 | goto fail_path; | ||
212 | } | ||
213 | |||
214 | DMINFO("dm-emc: emc_pg_init: sending switch-over command"); | ||
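	/*
	 * Insert at the head of the request queue so the trespass command
	 * is issued ahead of any I/O already queued for this path.
	 */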
215 | elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1); | ||
216 | return; | ||
217 | |||
218 | fail_path: | ||
219 | dm_pg_init_complete(path, MP_FAIL_PATH); | ||
220 | } | ||
221 | |||
222 | static struct emc_handler *alloc_emc_handler(void) | ||
223 | { | ||
224 | struct emc_handler *h = kmalloc(sizeof(*h), GFP_KERNEL); | ||
225 | |||
226 | if (h) | ||
227 | spin_lock_init(&h->lock); | ||
228 | |||
229 | return h; | ||
230 | } | ||
231 | |||
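/*
 * Multipath table arguments for this handler:
 *   <none>                  defaults: long trespass, honor reservation off
 *   <short_trespass> <hr>   each flag is 0 or 1
 */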
232 | static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv) | ||
233 | { | ||
234 | struct emc_handler *h; | ||
235 | unsigned hr, short_trespass; | ||
236 | |||
237 | if (argc == 0) { | ||
238 | /* No arguments: use defaults */ | ||
239 | hr = 0; | ||
240 | short_trespass = 0; | ||
241 | } else if (argc != 2) { | ||
242 | DMWARN("dm-emc hwhandler: incorrect number of arguments"); | ||
243 | return -EINVAL; | ||
244 | } else { | ||
245 | if ((sscanf(argv[0], "%u", &short_trespass) != 1) | ||
246 | || (short_trespass > 1)) { | ||
247 | DMWARN("dm-emc: invalid trespass mode selected"); | ||
248 | return -EINVAL; | ||
249 | } | ||
250 | |||
251 | if ((sscanf(argv[1], "%u", &hr) != 1) | ||
252 | || (hr > 1)) { | ||
253 | DMWARN("dm-emc: invalid honor reservation flag selected"); | ||
254 | return -EINVAL; | ||
255 | } | ||
256 | } | ||
257 | |||
258 | h = alloc_emc_handler(); | ||
259 | if (!h) | ||
260 | return -ENOMEM; | ||
261 | |||
262 | memset(h, 0, sizeof(*h)); | ||
263 | |||
264 | hwh->context = h; | ||
265 | |||
266 | if ((h->short_trespass = short_trespass)) | ||
267 | DMWARN("dm-emc: short trespass command will be sent"); | ||
268 | else | ||
269 | DMWARN("dm-emc: long trespass command will be sent"); | ||
270 | |||
271 | if ((h->hr = hr)) | ||
272 | DMWARN("dm-emc: honor reservation bit will be set"); | ||
273 | else | ||
274 | DMWARN("dm-emc: honor reservation bit will not be set (default)"); | ||
275 | |||
276 | return 0; | ||
277 | } | ||
278 | |||
279 | static void emc_destroy(struct hw_handler *hwh) | ||
280 | { | ||
281 | struct emc_handler *h = (struct emc_handler *) hwh->context; | ||
282 | |||
283 | kfree(h); | ||
284 | hwh->context = NULL; | ||
285 | } | ||
286 | |||
287 | static unsigned emc_error(struct hw_handler *hwh, struct bio *bio) | ||
288 | { | ||
289 | /* FIXME: Patch from axboe still missing */ | ||
290 | #if 0 | ||
291 | int sense; | ||
292 | |||
293 | if (bio->bi_error & BIO_SENSE) { | ||
294 | sense = bio->bi_error & 0xffffff; /* sense key / asc / ascq */ | ||
295 | |||
296 | if (sense == 0x020403) { | ||
297 | /* LUN Not Ready - Manual Intervention Required | ||
298 | * indicates this is a passive path. | ||
299 | * | ||
300 | * FIXME: However, if this is seen and EVPD C0 | ||
301 | * indicates that this is due to a NDU in | ||
302 | * progress, we should set FAIL_PATH too. | ||
303 | * This indicates we might have to do a SCSI | ||
304 | * inquiry in the end_io path. Ugh. */ | ||
305 | return MP_BYPASS_PG | MP_RETRY_IO; | ||
306 | } else if (sense == 0x052501) { | ||
307 | /* An array based copy is in progress. Do not | ||
308 | * fail the path, do not bypass to another PG, | ||
309 | * do not retry. Fail the IO immediately. | ||
310 | * (Actually this is the same conclusion as in | ||
311 | * the default handler, but lets make sure.) */ | ||
312 | return 0; | ||
313 | } else if (sense == 0x062900) { | ||
314 | /* Unit Attention Code. This is the first IO | ||
315 | * to the new path, so just retry. */ | ||
316 | return MP_RETRY_IO; | ||
317 | } | ||
318 | } | ||
319 | #endif | ||
320 | |||
321 | /* Try default handler */ | ||
322 | return dm_scsi_err_handler(hwh, bio); | ||
323 | } | ||
324 | |||
325 | static struct hw_handler_type emc_hwh = { | ||
326 | .name = "emc", | ||
327 | .module = THIS_MODULE, | ||
328 | .create = emc_create, | ||
329 | .destroy = emc_destroy, | ||
330 | .pg_init = emc_pg_init, | ||
331 | .error = emc_error, | ||
332 | }; | ||
333 | |||
334 | static int __init dm_emc_init(void) | ||
335 | { | ||
336 | int r = dm_register_hw_handler(&emc_hwh); | ||
337 | |||
338 | if (r < 0) | ||
339 | DMERR("emc: register failed %d", r); | ||
340 | else | ||
341 | DMINFO("dm-emc version 0.0.3 loaded"); | ||
342 | |||
343 | return r; | ||
344 | } | ||
345 | |||
346 | static void __exit dm_emc_exit(void) | ||
347 | { | ||
348 | int r = dm_unregister_hw_handler(&emc_hwh); | ||
349 | |||
350 | if (r < 0) | ||
351 | DMERR("emc: unregister failed %d", r); | ||
352 | } | ||
353 | |||
354 | module_init(dm_emc_init); | ||
355 | module_exit(dm_emc_exit); | ||
356 | |||
357 | MODULE_DESCRIPTION(DM_NAME " EMC CX/AX/FC-family multipath"); | ||
358 | MODULE_AUTHOR("Lars Marowsky-Bree <lmb@suse.de>"); | ||
359 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c new file mode 100644 index 000000000000..17212b4201a1 --- /dev/null +++ b/drivers/md/dm-exception-store.c | |||
@@ -0,0 +1,648 @@ | |||
1 | /* | ||
2 | * dm-snapshot.c | ||
3 | * | ||
4 | * Copyright (C) 2001-2002 Sistina Software (UK) Limited. | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | */ | ||
8 | |||
9 | #include "dm.h" | ||
10 | #include "dm-snap.h" | ||
11 | #include "dm-io.h" | ||
12 | #include "kcopyd.h" | ||
13 | |||
14 | #include <linux/mm.h> | ||
15 | #include <linux/pagemap.h> | ||
16 | #include <linux/vmalloc.h> | ||
17 | #include <linux/slab.h> | ||
18 | |||
19 | /*----------------------------------------------------------------- | ||
20 | * Persistent snapshots, by persistent we mean that the snapshot | ||
21 | * will survive a reboot. | ||
22 | *---------------------------------------------------------------*/ | ||
23 | |||
24 | /* | ||
25 | * We need to store a record of which parts of the origin have | ||
26 | * been copied to the snapshot device. The snapshot code | ||
27 | * requires that we copy exception chunks to chunk aligned areas | ||
28 | * of the COW store. It makes sense therefore, to store the | ||
29 | * metadata in chunk size blocks. | ||
30 | * | ||
31 | * There is no backward or forward compatibility implemented, | ||
32 | * snapshots with different disk versions than the kernel will | ||
33 | * not be usable. It is expected that "lvcreate" will blank out | ||
34 | * the start of a fresh COW device before calling the snapshot | ||
35 | * constructor. | ||
36 | * | ||
37 | * The first chunk of the COW device just contains the header. | ||
38 | * After this there is a chunk filled with exception metadata, | ||
39 | * followed by as many exception chunks as can fit in the | ||
40 | * metadata areas. | ||
41 | * | ||
42 | * All on disk structures are in little-endian format. The end | ||
43 | * of the exceptions info is indicated by an exception with a | ||
44 | * new_chunk of 0, which is invalid since it would point to the | ||
45 | * header chunk. | ||
46 | */ | ||
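/*
 * A worked example of the layout (illustrative figures only): with
 * 64-sector (32K) chunks and 16-byte disk_exceptions, each metadata
 * area indexes 2048 data chunks.  Chunk 0 is the header, chunk 1 is
 * metadata area 0, chunks 2-2049 hold the corresponding data, chunk
 * 2050 is metadata area 1, and so on; metadata area 'a' therefore
 * lives in chunk 1 + (2048 + 1) * a, which is what area_io() computes.
 */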
47 | |||
48 | /* | ||
49 | * Magic for persistent snapshots: "SnAp" - Feeble isn't it. | ||
50 | */ | ||
51 | #define SNAP_MAGIC 0x70416e53 | ||
52 | |||
53 | /* | ||
54 | * The on-disk version of the metadata. | ||
55 | */ | ||
56 | #define SNAPSHOT_DISK_VERSION 1 | ||
57 | |||
58 | struct disk_header { | ||
59 | uint32_t magic; | ||
60 | |||
61 | /* | ||
62 | * Is this snapshot valid. There is no way of recovering | ||
63 | * an invalid snapshot. | ||
64 | */ | ||
65 | uint32_t valid; | ||
66 | |||
67 | /* | ||
68 | * Simple, incrementing version.  No backward | ||
69 | * compatibility. | ||
70 | */ | ||
71 | uint32_t version; | ||
72 | |||
73 | /* In sectors */ | ||
74 | uint32_t chunk_size; | ||
75 | }; | ||
76 | |||
77 | struct disk_exception { | ||
78 | uint64_t old_chunk; | ||
79 | uint64_t new_chunk; | ||
80 | }; | ||
81 | |||
82 | struct commit_callback { | ||
83 | void (*callback)(void *, int success); | ||
84 | void *context; | ||
85 | }; | ||
86 | |||
87 | /* | ||
88 | * The top level structure for a persistent exception store. | ||
89 | */ | ||
90 | struct pstore { | ||
91 | struct dm_snapshot *snap; /* up pointer to my snapshot */ | ||
92 | int version; | ||
93 | int valid; | ||
94 | uint32_t chunk_size; | ||
95 | uint32_t exceptions_per_area; | ||
96 | |||
97 | /* | ||
98 | * Now that we have an asynchronous kcopyd there is no | ||
99 | * need for large chunk sizes, so it won't hurt to have a | ||
100 | * whole chunk's worth of metadata in memory at once. | ||
101 | */ | ||
102 | void *area; | ||
103 | |||
104 | /* | ||
105 | * Used to keep track of which metadata area the data in | ||
106 | * 'chunk' refers to. | ||
107 | */ | ||
108 | uint32_t current_area; | ||
109 | |||
110 | /* | ||
111 | * The next free chunk for an exception. | ||
112 | */ | ||
113 | uint32_t next_free; | ||
114 | |||
115 | /* | ||
116 | * The index of next free exception in the current | ||
117 | * metadata area. | ||
118 | */ | ||
119 | uint32_t current_committed; | ||
120 | |||
121 | atomic_t pending_count; | ||
122 | uint32_t callback_count; | ||
123 | struct commit_callback *callbacks; | ||
124 | }; | ||
125 | |||
126 | static inline unsigned int sectors_to_pages(unsigned int sectors) | ||
127 | { | ||
128 | return sectors / (PAGE_SIZE >> 9); | ||
129 | } | ||
130 | |||
131 | static int alloc_area(struct pstore *ps) | ||
132 | { | ||
133 | int r = -ENOMEM; | ||
134 | size_t len; | ||
135 | |||
136 | len = ps->chunk_size << SECTOR_SHIFT; | ||
137 | |||
138 | /* | ||
139 | * Allocate the chunk_size block of memory that will hold | ||
140 | * a single metadata area. | ||
141 | */ | ||
142 | ps->area = vmalloc(len); | ||
143 | if (!ps->area) | ||
144 | return r; | ||
145 | |||
146 | return 0; | ||
147 | } | ||
148 | |||
149 | static void free_area(struct pstore *ps) | ||
150 | { | ||
151 | vfree(ps->area); | ||
152 | } | ||
153 | |||
154 | /* | ||
155 | * Read or write a chunk aligned and sized block of data from a device. | ||
156 | */ | ||
157 | static int chunk_io(struct pstore *ps, uint32_t chunk, int rw) | ||
158 | { | ||
159 | struct io_region where; | ||
160 | unsigned long bits; | ||
161 | |||
162 | where.bdev = ps->snap->cow->bdev; | ||
163 | where.sector = ps->chunk_size * chunk; | ||
164 | where.count = ps->chunk_size; | ||
165 | |||
166 | return dm_io_sync_vm(1, &where, rw, ps->area, &bits); | ||
167 | } | ||
168 | |||
169 | /* | ||
170 | * Read or write a metadata area. Remembering to skip the first | ||
171 | * chunk which holds the header. | ||
172 | */ | ||
173 | static int area_io(struct pstore *ps, uint32_t area, int rw) | ||
174 | { | ||
175 | int r; | ||
176 | uint32_t chunk; | ||
177 | |||
178 | /* convert a metadata area index to a chunk index */ | ||
179 | chunk = 1 + ((ps->exceptions_per_area + 1) * area); | ||
180 | |||
181 | r = chunk_io(ps, chunk, rw); | ||
182 | if (r) | ||
183 | return r; | ||
184 | |||
185 | ps->current_area = area; | ||
186 | return 0; | ||
187 | } | ||
188 | |||
189 | static int zero_area(struct pstore *ps, uint32_t area) | ||
190 | { | ||
191 | memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); | ||
192 | return area_io(ps, area, WRITE); | ||
193 | } | ||
194 | |||
195 | static int read_header(struct pstore *ps, int *new_snapshot) | ||
196 | { | ||
197 | int r; | ||
198 | struct disk_header *dh; | ||
199 | |||
200 | r = chunk_io(ps, 0, READ); | ||
201 | if (r) | ||
202 | return r; | ||
203 | |||
204 | dh = (struct disk_header *) ps->area; | ||
205 | |||
206 | if (le32_to_cpu(dh->magic) == 0) { | ||
207 | *new_snapshot = 1; | ||
208 | |||
209 | } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) { | ||
210 | *new_snapshot = 0; | ||
211 | ps->valid = le32_to_cpu(dh->valid); | ||
212 | ps->version = le32_to_cpu(dh->version); | ||
213 | ps->chunk_size = le32_to_cpu(dh->chunk_size); | ||
214 | |||
215 | } else { | ||
216 | DMWARN("Invalid/corrupt snapshot"); | ||
217 | r = -ENXIO; | ||
218 | } | ||
219 | |||
220 | return r; | ||
221 | } | ||
222 | |||
223 | static int write_header(struct pstore *ps) | ||
224 | { | ||
225 | struct disk_header *dh; | ||
226 | |||
227 | memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); | ||
228 | |||
229 | dh = (struct disk_header *) ps->area; | ||
230 | dh->magic = cpu_to_le32(SNAP_MAGIC); | ||
231 | dh->valid = cpu_to_le32(ps->valid); | ||
232 | dh->version = cpu_to_le32(ps->version); | ||
233 | dh->chunk_size = cpu_to_le32(ps->chunk_size); | ||
234 | |||
235 | return chunk_io(ps, 0, WRITE); | ||
236 | } | ||
237 | |||
238 | /* | ||
239 | * Access functions for the disk exceptions, these do the endian conversions. | ||
240 | */ | ||
241 | static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) | ||
242 | { | ||
243 | if (index >= ps->exceptions_per_area) | ||
244 | return NULL; | ||
245 | |||
246 | return ((struct disk_exception *) ps->area) + index; | ||
247 | } | ||
248 | |||
249 | static int read_exception(struct pstore *ps, | ||
250 | uint32_t index, struct disk_exception *result) | ||
251 | { | ||
252 | struct disk_exception *e; | ||
253 | |||
254 | e = get_exception(ps, index); | ||
255 | if (!e) | ||
256 | return -EINVAL; | ||
257 | |||
258 | /* copy it */ | ||
259 | result->old_chunk = le64_to_cpu(e->old_chunk); | ||
260 | result->new_chunk = le64_to_cpu(e->new_chunk); | ||
261 | |||
262 | return 0; | ||
263 | } | ||
264 | |||
265 | static int write_exception(struct pstore *ps, | ||
266 | uint32_t index, struct disk_exception *de) | ||
267 | { | ||
268 | struct disk_exception *e; | ||
269 | |||
270 | e = get_exception(ps, index); | ||
271 | if (!e) | ||
272 | return -EINVAL; | ||
273 | |||
274 | /* copy it */ | ||
275 | e->old_chunk = cpu_to_le64(de->old_chunk); | ||
276 | e->new_chunk = cpu_to_le64(de->new_chunk); | ||
277 | |||
278 | return 0; | ||
279 | } | ||
280 | |||
281 | /* | ||
282 | * Registers the exceptions that are present in the current area. | ||
283 | * 'full' is filled in to indicate if the area has been | ||
284 | * filled. | ||
285 | */ | ||
286 | static int insert_exceptions(struct pstore *ps, int *full) | ||
287 | { | ||
288 | int r; | ||
289 | unsigned int i; | ||
290 | struct disk_exception de; | ||
291 | |||
292 | /* presume the area is full */ | ||
293 | *full = 1; | ||
294 | |||
295 | for (i = 0; i < ps->exceptions_per_area; i++) { | ||
296 | r = read_exception(ps, i, &de); | ||
297 | |||
298 | if (r) | ||
299 | return r; | ||
300 | |||
301 | /* | ||
302 | * If the new_chunk is pointing at the start of | ||
303 | * the COW device, where the first metadata area | ||
304 | * is, we know that we've hit the end of the | ||
305 | * exceptions. Therefore the area is not full. | ||
306 | */ | ||
307 | if (de.new_chunk == 0LL) { | ||
308 | ps->current_committed = i; | ||
309 | *full = 0; | ||
310 | break; | ||
311 | } | ||
312 | |||
313 | /* | ||
314 | * Keep track of the start of the free chunks. | ||
315 | */ | ||
316 | if (ps->next_free <= de.new_chunk) | ||
317 | ps->next_free = de.new_chunk + 1; | ||
318 | |||
319 | /* | ||
320 | * Otherwise we add the exception to the snapshot. | ||
321 | */ | ||
322 | r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk); | ||
323 | if (r) | ||
324 | return r; | ||
325 | } | ||
326 | |||
327 | return 0; | ||
328 | } | ||
329 | |||
330 | static int read_exceptions(struct pstore *ps) | ||
331 | { | ||
332 | uint32_t area; | ||
333 | int r, full = 1; | ||
334 | |||
335 | /* | ||
336 | * Keep reading chunks and inserting exceptions until | ||
337 | * we find a partially full area. | ||
338 | */ | ||
339 | for (area = 0; full; area++) { | ||
340 | r = area_io(ps, area, READ); | ||
341 | if (r) | ||
342 | return r; | ||
343 | |||
344 | r = insert_exceptions(ps, &full); | ||
345 | if (r) | ||
346 | return r; | ||
347 | } | ||
348 | |||
349 | return 0; | ||
350 | } | ||
351 | |||
352 | static inline struct pstore *get_info(struct exception_store *store) | ||
353 | { | ||
354 | return (struct pstore *) store->context; | ||
355 | } | ||
356 | |||
357 | static void persistent_fraction_full(struct exception_store *store, | ||
358 | sector_t *numerator, sector_t *denominator) | ||
359 | { | ||
360 | *numerator = get_info(store)->next_free * store->snap->chunk_size; | ||
361 | *denominator = get_dev_size(store->snap->cow->bdev); | ||
362 | } | ||
363 | |||
364 | static void persistent_destroy(struct exception_store *store) | ||
365 | { | ||
366 | struct pstore *ps = get_info(store); | ||
367 | |||
368 | dm_io_put(sectors_to_pages(ps->chunk_size)); | ||
369 | vfree(ps->callbacks); | ||
370 | free_area(ps); | ||
371 | kfree(ps); | ||
372 | } | ||
373 | |||
374 | static int persistent_read_metadata(struct exception_store *store) | ||
375 | { | ||
376 | int r, new_snapshot; | ||
377 | struct pstore *ps = get_info(store); | ||
378 | |||
379 | /* | ||
380 | * Read the snapshot header. | ||
381 | */ | ||
382 | r = read_header(ps, &new_snapshot); | ||
383 | if (r) | ||
384 | return r; | ||
385 | |||
386 | /* | ||
387 | * Do we need to set up a new snapshot? | ||
388 | */ | ||
389 | if (new_snapshot) { | ||
390 | r = write_header(ps); | ||
391 | if (r) { | ||
392 | DMWARN("write_header failed"); | ||
393 | return r; | ||
394 | } | ||
395 | |||
396 | r = zero_area(ps, 0); | ||
397 | if (r) { | ||
398 | DMWARN("zero_area(0) failed"); | ||
399 | return r; | ||
400 | } | ||
401 | |||
402 | } else { | ||
403 | /* | ||
404 | * Sanity checks. | ||
405 | */ | ||
406 | if (!ps->valid) { | ||
407 | DMWARN("snapshot is marked invalid"); | ||
408 | return -EINVAL; | ||
409 | } | ||
410 | |||
411 | if (ps->version != SNAPSHOT_DISK_VERSION) { | ||
412 | DMWARN("unable to handle snapshot disk version %d", | ||
413 | ps->version); | ||
414 | return -EINVAL; | ||
415 | } | ||
416 | |||
417 | /* | ||
418 | * Read the metadata. | ||
419 | */ | ||
420 | r = read_exceptions(ps); | ||
421 | if (r) | ||
422 | return r; | ||
423 | } | ||
424 | |||
425 | return 0; | ||
426 | } | ||
427 | |||
428 | static int persistent_prepare(struct exception_store *store, | ||
429 | struct exception *e) | ||
430 | { | ||
431 | struct pstore *ps = get_info(store); | ||
432 | uint32_t stride; | ||
433 | sector_t size = get_dev_size(store->snap->cow->bdev); | ||
434 | |||
435 | /* Is there enough room ? */ | ||
436 | if (size < ((ps->next_free + 1) * store->snap->chunk_size)) | ||
437 | return -ENOSPC; | ||
438 | |||
439 | e->new_chunk = ps->next_free; | ||
440 | |||
441 | /* | ||
442 | * Move onto the next free chunk, skipping the metadata chunks | ||
443 | * (those whose index is 1 modulo exceptions_per_area + 1). | ||
444 | */ | ||
445 | stride = (ps->exceptions_per_area + 1); | ||
446 | if ((++ps->next_free % stride) == 1) | ||
447 | ps->next_free++; | ||
448 | |||
449 | atomic_inc(&ps->pending_count); | ||
450 | return 0; | ||
451 | } | ||
452 | |||
453 | static void persistent_commit(struct exception_store *store, | ||
454 | struct exception *e, | ||
455 | void (*callback) (void *, int success), | ||
456 | void *callback_context) | ||
457 | { | ||
458 | int r; | ||
459 | unsigned int i; | ||
460 | struct pstore *ps = get_info(store); | ||
461 | struct disk_exception de; | ||
462 | struct commit_callback *cb; | ||
463 | |||
464 | de.old_chunk = e->old_chunk; | ||
465 | de.new_chunk = e->new_chunk; | ||
466 | write_exception(ps, ps->current_committed++, &de); | ||
467 | |||
468 | /* | ||
469 | * Add the callback to the back of the array. This code | ||
470 | * is the only place where the callback array is | ||
471 | * manipulated, and we know that it will never be called | ||
472 | * multiple times concurrently. | ||
473 | */ | ||
474 | cb = ps->callbacks + ps->callback_count++; | ||
475 | cb->callback = callback; | ||
476 | cb->context = callback_context; | ||
477 | |||
478 | /* | ||
479 | * If there are no more exceptions in flight, or we have | ||
480 | * filled this metadata area we commit the exceptions to | ||
481 | * disk. | ||
482 | */ | ||
483 | if (atomic_dec_and_test(&ps->pending_count) || | ||
484 | (ps->current_committed == ps->exceptions_per_area)) { | ||
485 | r = area_io(ps, ps->current_area, WRITE); | ||
486 | if (r) | ||
487 | ps->valid = 0; | ||
488 | |||
489 | for (i = 0; i < ps->callback_count; i++) { | ||
490 | cb = ps->callbacks + i; | ||
491 | cb->callback(cb->context, r == 0 ? 1 : 0); | ||
492 | } | ||
493 | |||
494 | ps->callback_count = 0; | ||
495 | } | ||
496 | |||
497 | /* | ||
498 | * Have we completely filled the current area ? | ||
499 | */ | ||
500 | if (ps->current_committed == ps->exceptions_per_area) { | ||
501 | ps->current_committed = 0; | ||
502 | r = zero_area(ps, ps->current_area + 1); | ||
503 | if (r) | ||
504 | ps->valid = 0; | ||
505 | } | ||
506 | } | ||
507 | |||
508 | static void persistent_drop(struct exception_store *store) | ||
509 | { | ||
510 | struct pstore *ps = get_info(store); | ||
511 | |||
512 | ps->valid = 0; | ||
513 | if (write_header(ps)) | ||
514 | DMWARN("write header failed"); | ||
515 | } | ||
516 | |||
517 | int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) | ||
518 | { | ||
519 | int r; | ||
520 | struct pstore *ps; | ||
521 | |||
522 | r = dm_io_get(sectors_to_pages(chunk_size)); | ||
523 | if (r) | ||
524 | return r; | ||
525 | |||
526 | /* allocate the pstore */ | ||
527 | ps = kmalloc(sizeof(*ps), GFP_KERNEL); | ||
528 | if (!ps) { | ||
529 | r = -ENOMEM; | ||
530 | goto bad; | ||
531 | } | ||
532 | |||
533 | ps->snap = store->snap; | ||
534 | ps->valid = 1; | ||
535 | ps->version = SNAPSHOT_DISK_VERSION; | ||
536 | ps->chunk_size = chunk_size; | ||
537 | ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) / | ||
538 | sizeof(struct disk_exception); | ||
539 | ps->next_free = 2; /* skipping the header and first area */ | ||
540 | ps->current_committed = 0; | ||
541 | |||
542 | r = alloc_area(ps); | ||
543 | if (r) | ||
544 | goto bad; | ||
545 | |||
546 | /* | ||
547 | * Allocate space for all the callbacks. | ||
548 | */ | ||
549 | ps->callback_count = 0; | ||
550 | atomic_set(&ps->pending_count, 0); | ||
551 | ps->callbacks = dm_vcalloc(ps->exceptions_per_area, | ||
552 | sizeof(*ps->callbacks)); | ||
553 | |||
554 | if (!ps->callbacks) { | ||
555 | r = -ENOMEM; | ||
556 | goto bad; | ||
557 | } | ||
558 | |||
559 | store->destroy = persistent_destroy; | ||
560 | store->read_metadata = persistent_read_metadata; | ||
561 | store->prepare_exception = persistent_prepare; | ||
562 | store->commit_exception = persistent_commit; | ||
563 | store->drop_snapshot = persistent_drop; | ||
564 | store->fraction_full = persistent_fraction_full; | ||
565 | store->context = ps; | ||
566 | |||
567 | return 0; | ||
568 | |||
569 | bad: | ||
570 | dm_io_put(sectors_to_pages(chunk_size)); | ||
571 | if (ps) { | ||
572 | if (ps->area) | ||
573 | free_area(ps); | ||
574 | |||
575 | kfree(ps); | ||
576 | } | ||
577 | return r; | ||
578 | } | ||
579 | |||
580 | /*----------------------------------------------------------------- | ||
581 | * Implementation of the store for non-persistent snapshots. | ||
582 | *---------------------------------------------------------------*/ | ||
583 | struct transient_c { | ||
584 | sector_t next_free; | ||
585 | }; | ||
586 | |||
587 | static void transient_destroy(struct exception_store *store) | ||
588 | { | ||
589 | kfree(store->context); | ||
590 | } | ||
591 | |||
592 | static int transient_read_metadata(struct exception_store *store) | ||
593 | { | ||
594 | return 0; | ||
595 | } | ||
596 | |||
597 | static int transient_prepare(struct exception_store *store, struct exception *e) | ||
598 | { | ||
599 | struct transient_c *tc = (struct transient_c *) store->context; | ||
600 | sector_t size = get_dev_size(store->snap->cow->bdev); | ||
601 | |||
602 | if (size < (tc->next_free + store->snap->chunk_size)) | ||
603 | return -ENOSPC; | ||
604 | |||
605 | e->new_chunk = sector_to_chunk(store->snap, tc->next_free); | ||
606 | tc->next_free += store->snap->chunk_size; | ||
607 | |||
608 | return 0; | ||
609 | } | ||
610 | |||
611 | static void transient_commit(struct exception_store *store, | ||
612 | struct exception *e, | ||
613 | void (*callback) (void *, int success), | ||
614 | void *callback_context) | ||
615 | { | ||
616 | /* Just succeed */ | ||
617 | callback(callback_context, 1); | ||
618 | } | ||
619 | |||
620 | static void transient_fraction_full(struct exception_store *store, | ||
621 | sector_t *numerator, sector_t *denominator) | ||
622 | { | ||
623 | *numerator = ((struct transient_c *) store->context)->next_free; | ||
624 | *denominator = get_dev_size(store->snap->cow->bdev); | ||
625 | } | ||
626 | |||
627 | int dm_create_transient(struct exception_store *store, | ||
628 | struct dm_snapshot *s, int blocksize) | ||
629 | { | ||
630 | struct transient_c *tc; | ||
631 | |||
632 | memset(store, 0, sizeof(*store)); | ||
633 | store->destroy = transient_destroy; | ||
634 | store->read_metadata = transient_read_metadata; | ||
635 | store->prepare_exception = transient_prepare; | ||
636 | store->commit_exception = transient_commit; | ||
637 | store->fraction_full = transient_fraction_full; | ||
638 | store->snap = s; | ||
639 | |||
640 | tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); | ||
641 | if (!tc) | ||
642 | return -ENOMEM; | ||
643 | |||
644 | tc->next_free = 0; | ||
645 | store->context = tc; | ||
646 | |||
647 | return 0; | ||
648 | } | ||
diff --git a/drivers/md/dm-hw-handler.c b/drivers/md/dm-hw-handler.c new file mode 100644 index 000000000000..ae63772e44c9 --- /dev/null +++ b/drivers/md/dm-hw-handler.c | |||
@@ -0,0 +1,216 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | * | ||
6 | * Multipath hardware handler registration. | ||
7 | */ | ||
8 | |||
9 | #include "dm.h" | ||
10 | #include "dm-hw-handler.h" | ||
11 | |||
12 | #include <linux/slab.h> | ||
13 | |||
14 | struct hwh_internal { | ||
15 | struct hw_handler_type hwht; | ||
16 | |||
17 | struct list_head list; | ||
18 | long use; | ||
19 | }; | ||
20 | |||
21 | #define hwht_to_hwhi(__hwht) container_of((__hwht), struct hwh_internal, hwht) | ||
22 | |||
23 | static LIST_HEAD(_hw_handlers); | ||
24 | static DECLARE_RWSEM(_hwh_lock); | ||
25 | |||
26 | static struct hwh_internal *__find_hw_handler_type(const char *name) | ||
27 | { | ||
28 | struct hwh_internal *hwhi; | ||
29 | |||
30 | list_for_each_entry(hwhi, &_hw_handlers, list) { | ||
31 | if (!strcmp(name, hwhi->hwht.name)) | ||
32 | return hwhi; | ||
33 | } | ||
34 | |||
35 | return NULL; | ||
36 | } | ||
37 | |||
38 | static struct hwh_internal *get_hw_handler(const char *name) | ||
39 | { | ||
40 | struct hwh_internal *hwhi; | ||
41 | |||
42 | down_read(&_hwh_lock); | ||
43 | hwhi = __find_hw_handler_type(name); | ||
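	/*
	 * Take a reference on the handler's module only for the first
	 * user; subsequent users just bump the use count under the
	 * read lock.
	 */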
44 | if (hwhi) { | ||
45 | if ((hwhi->use == 0) && !try_module_get(hwhi->hwht.module)) | ||
46 | hwhi = NULL; | ||
47 | else | ||
48 | hwhi->use++; | ||
49 | } | ||
50 | up_read(&_hwh_lock); | ||
51 | |||
52 | return hwhi; | ||
53 | } | ||
54 | |||
55 | struct hw_handler_type *dm_get_hw_handler(const char *name) | ||
56 | { | ||
57 | struct hwh_internal *hwhi; | ||
58 | |||
59 | if (!name) | ||
60 | return NULL; | ||
61 | |||
62 | hwhi = get_hw_handler(name); | ||
63 | if (!hwhi) { | ||
64 | request_module("dm-%s", name); | ||
65 | hwhi = get_hw_handler(name); | ||
66 | } | ||
67 | |||
68 | return hwhi ? &hwhi->hwht : NULL; | ||
69 | } | ||
70 | |||
71 | void dm_put_hw_handler(struct hw_handler_type *hwht) | ||
72 | { | ||
73 | struct hwh_internal *hwhi; | ||
74 | |||
75 | if (!hwht) | ||
76 | return; | ||
77 | |||
78 | down_read(&_hwh_lock); | ||
79 | hwhi = __find_hw_handler_type(hwht->name); | ||
80 | if (!hwhi) | ||
81 | goto out; | ||
82 | |||
83 | if (--hwhi->use == 0) | ||
84 | module_put(hwhi->hwht.module); | ||
85 | |||
86 | if (hwhi->use < 0) | ||
87 | BUG(); | ||
88 | |||
89 | out: | ||
90 | up_read(&_hwh_lock); | ||
91 | } | ||
92 | |||
93 | static struct hwh_internal *_alloc_hw_handler(struct hw_handler_type *hwht) | ||
94 | { | ||
95 | struct hwh_internal *hwhi = kmalloc(sizeof(*hwhi), GFP_KERNEL); | ||
96 | |||
97 | if (hwhi) { | ||
98 | memset(hwhi, 0, sizeof(*hwhi)); | ||
99 | hwhi->hwht = *hwht; | ||
100 | } | ||
101 | |||
102 | return hwhi; | ||
103 | } | ||
104 | |||
105 | int dm_register_hw_handler(struct hw_handler_type *hwht) | ||
106 | { | ||
107 | int r = 0; | ||
108 | struct hwh_internal *hwhi = _alloc_hw_handler(hwht); | ||
109 | |||
110 | if (!hwhi) | ||
111 | return -ENOMEM; | ||
112 | |||
113 | down_write(&_hwh_lock); | ||
114 | |||
115 | if (__find_hw_handler_type(hwht->name)) { | ||
116 | kfree(hwhi); | ||
117 | r = -EEXIST; | ||
118 | } else | ||
119 | list_add(&hwhi->list, &_hw_handlers); | ||
120 | |||
121 | up_write(&_hwh_lock); | ||
122 | |||
123 | return r; | ||
124 | } | ||
125 | |||
126 | int dm_unregister_hw_handler(struct hw_handler_type *hwht) | ||
127 | { | ||
128 | struct hwh_internal *hwhi; | ||
129 | |||
130 | down_write(&_hwh_lock); | ||
131 | |||
132 | hwhi = __find_hw_handler_type(hwht->name); | ||
133 | if (!hwhi) { | ||
134 | up_write(&_hwh_lock); | ||
135 | return -EINVAL; | ||
136 | } | ||
137 | |||
138 | if (hwhi->use) { | ||
139 | up_write(&_hwh_lock); | ||
140 | return -ETXTBSY; | ||
141 | } | ||
142 | |||
143 | list_del(&hwhi->list); | ||
144 | |||
145 | up_write(&_hwh_lock); | ||
146 | |||
147 | kfree(hwhi); | ||
148 | |||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio) | ||
153 | { | ||
154 | #if 0 | ||
155 | int sense_key, asc, ascq; | ||
156 | |||
157 | if (bio->bi_error & BIO_SENSE) { | ||
158 | /* FIXME: This is just an initial guess. */ | ||
159 | /* key / asc / ascq */ | ||
160 | sense_key = (bio->bi_error >> 16) & 0xff; | ||
161 | asc = (bio->bi_error >> 8) & 0xff; | ||
162 | ascq = bio->bi_error & 0xff; | ||
163 | |||
164 | switch (sense_key) { | ||
165 | /* This block as a whole comes from the device. | ||
166 | * So no point retrying on another path. */ | ||
167 | case 0x03: /* Medium error */ | ||
168 | case 0x05: /* Illegal request */ | ||
169 | case 0x07: /* Data protect */ | ||
170 | case 0x08: /* Blank check */ | ||
171 | case 0x0a: /* copy aborted */ | ||
172 | case 0x0c: /* obsolete - no clue ;-) */ | ||
173 | case 0x0d: /* volume overflow */ | ||
174 | case 0x0e: /* data miscompare */ | ||
175 | case 0x0f: /* reserved - no idea either. */ | ||
176 | return MP_ERROR_IO; | ||
177 | |||
178 | /* For these errors it's unclear whether they | ||
179 | * come from the device or the controller. | ||
180 | * So just let's try a different path, and if | ||
181 | * it eventually succeeds, user-space will clear | ||
182 | * the paths again... */ | ||
183 | case 0x02: /* Not ready */ | ||
184 | case 0x04: /* Hardware error */ | ||
185 | case 0x09: /* vendor specific */ | ||
186 | case 0x0b: /* Aborted command */ | ||
187 | return MP_FAIL_PATH; | ||
188 | |||
189 | case 0x06: /* Unit attention - might want to decode */ | ||
190 | if (asc == 0x04 && ascq == 0x01) | ||
191 | /* "Unit in the process of | ||
192 | * becoming ready" */ | ||
193 | return 0; | ||
194 | return MP_FAIL_PATH; | ||
195 | |||
196 | /* FIXME: For Unit Not Ready we may want | ||
197 | * to have a generic pg activation | ||
198 | * feature (START_UNIT). */ | ||
199 | |||
200 | /* Should these two ever end up in the | ||
201 | * error path? I don't think so. */ | ||
202 | case 0x00: /* No sense */ | ||
203 | case 0x01: /* Recovered error */ | ||
204 | return 0; | ||
205 | } | ||
206 | } | ||
207 | #endif | ||
208 | |||
209 | /* We have no idea how to decode the other kinds of errors, | ||
210 | * so assume a generic error condition. */ | ||
211 | return MP_FAIL_PATH; | ||
212 | } | ||
213 | |||
214 | EXPORT_SYMBOL_GPL(dm_register_hw_handler); | ||
215 | EXPORT_SYMBOL_GPL(dm_unregister_hw_handler); | ||
216 | EXPORT_SYMBOL_GPL(dm_scsi_err_handler); | ||
diff --git a/drivers/md/dm-hw-handler.h b/drivers/md/dm-hw-handler.h new file mode 100644 index 000000000000..15f5629e231a --- /dev/null +++ b/drivers/md/dm-hw-handler.h | |||
@@ -0,0 +1,61 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | * | ||
6 | * Multipath hardware handler registration. | ||
7 | */ | ||
8 | |||
9 | #ifndef DM_HW_HANDLER_H | ||
10 | #define DM_HW_HANDLER_H | ||
11 | |||
12 | #include <linux/device-mapper.h> | ||
13 | |||
14 | #include "dm-mpath.h" | ||
15 | |||
16 | struct hw_handler_type; | ||
17 | struct hw_handler { | ||
18 | struct hw_handler_type *type; | ||
19 | void *context; | ||
20 | }; | ||
21 | |||
22 | /* | ||
23 | * Information about a hardware handler type.  The create method | ||
24 | * constructs a handler instance from custom table arguments. | ||
25 | */ | ||
26 | struct hw_handler_type { | ||
27 | char *name; | ||
28 | struct module *module; | ||
29 | |||
30 | int (*create) (struct hw_handler *handler, unsigned int argc, | ||
31 | char **argv); | ||
32 | void (*destroy) (struct hw_handler *hwh); | ||
33 | |||
34 | void (*pg_init) (struct hw_handler *hwh, unsigned bypassed, | ||
35 | struct path *path); | ||
36 | unsigned (*error) (struct hw_handler *hwh, struct bio *bio); | ||
37 | int (*status) (struct hw_handler *hwh, status_type_t type, | ||
38 | char *result, unsigned int maxlen); | ||
39 | }; | ||
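/*
 * Rough lifecycle (a sketch of how dm-mpath drives a handler): create()
 * when the multipath table is loaded, pg_init() before a priority group
 * is used, error() to decode a failed bio into the MP_* flags below,
 * status() for table/status output and destroy() on table teardown.
 */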
40 | |||
41 | /* Register a hardware handler */ | ||
42 | int dm_register_hw_handler(struct hw_handler_type *type); | ||
43 | |||
44 | /* Unregister a hardware handler */ | ||
45 | int dm_unregister_hw_handler(struct hw_handler_type *type); | ||
46 | |||
47 | /* Returns a registered hardware handler type */ | ||
48 | struct hw_handler_type *dm_get_hw_handler(const char *name); | ||
49 | |||
50 | /* Releases a hardware handler */ | ||
51 | void dm_put_hw_handler(struct hw_handler_type *hwht); | ||
52 | |||
53 | /* Default err function */ | ||
54 | unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio); | ||
55 | |||
56 | /* Error flags for err and dm_pg_init_complete */ | ||
57 | #define MP_FAIL_PATH 1 | ||
58 | #define MP_BYPASS_PG 2 | ||
59 | #define MP_ERROR_IO 4 /* Don't retry this I/O */ | ||
60 | |||
61 | #endif | ||
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c new file mode 100644 index 000000000000..45754bb6a799 --- /dev/null +++ b/drivers/md/dm-io.c | |||
@@ -0,0 +1,426 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm-io.h" | ||
8 | |||
9 | #include <linux/bio.h> | ||
10 | #include <linux/mempool.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/slab.h> | ||
14 | |||
15 | static struct bio_set *_bios; | ||
16 | |||
17 | /* FIXME: can we shrink this ? */ | ||
18 | struct io { | ||
19 | unsigned long error; | ||
20 | atomic_t count; | ||
21 | struct task_struct *sleeper; | ||
22 | io_notify_fn callback; | ||
23 | void *context; | ||
24 | }; | ||
25 | |||
26 | /* | ||
27 | * io contexts are only dynamically allocated for asynchronous | ||
28 | * io. Since async io is likely to be the majority of io we'll | ||
29 | * have the same number of io contexts as buffer heads ! (FIXME: | ||
30 | * must reduce this). | ||
31 | */ | ||
32 | static unsigned _num_ios; | ||
33 | static mempool_t *_io_pool; | ||
34 | |||
35 | static void *alloc_io(unsigned int __nocast gfp_mask, void *pool_data) | ||
36 | { | ||
37 | return kmalloc(sizeof(struct io), gfp_mask); | ||
38 | } | ||
39 | |||
40 | static void free_io(void *element, void *pool_data) | ||
41 | { | ||
42 | kfree(element); | ||
43 | } | ||
44 | |||
45 | static unsigned int pages_to_ios(unsigned int pages) | ||
46 | { | ||
47 | return 4 * pages; /* too many ? */ | ||
48 | } | ||
49 | |||
50 | static int resize_pool(unsigned int new_ios) | ||
51 | { | ||
52 | int r = 0; | ||
53 | |||
54 | if (_io_pool) { | ||
55 | if (new_ios == 0) { | ||
56 | /* free off the pool */ | ||
57 | mempool_destroy(_io_pool); | ||
58 | _io_pool = NULL; | ||
59 | bioset_free(_bios); | ||
60 | |||
61 | } else { | ||
62 | /* resize the pool */ | ||
63 | r = mempool_resize(_io_pool, new_ios, GFP_KERNEL); | ||
64 | } | ||
65 | |||
66 | } else { | ||
67 | /* create new pool */ | ||
68 | _io_pool = mempool_create(new_ios, alloc_io, free_io, NULL); | ||
69 | if (!_io_pool) | ||
70 | return -ENOMEM; | ||
71 | |||
72 | _bios = bioset_create(16, 16, 4); | ||
73 | if (!_bios) { | ||
74 | mempool_destroy(_io_pool); | ||
75 | _io_pool = NULL; | ||
76 | return -ENOMEM; | ||
77 | } | ||
78 | } | ||
79 | |||
80 | if (!r) | ||
81 | _num_ios = new_ios; | ||
82 | |||
83 | return r; | ||
84 | } | ||
85 | |||
86 | int dm_io_get(unsigned int num_pages) | ||
87 | { | ||
88 | return resize_pool(_num_ios + pages_to_ios(num_pages)); | ||
89 | } | ||
90 | |||
91 | void dm_io_put(unsigned int num_pages) | ||
92 | { | ||
93 | resize_pool(_num_ios - pages_to_ios(num_pages)); | ||
94 | } | ||
95 | |||
96 | /*----------------------------------------------------------------- | ||
97 | * We need to keep track of which region a bio is doing io for. | ||
98 | * In order to save a memory allocation we store this in the last | ||
99 | * bvec, which we know is unused (blech). | ||
100 | * XXX This is ugly and can OOPS with some configs... find another way. | ||
101 | *---------------------------------------------------------------*/ | ||
102 | static inline void bio_set_region(struct bio *bio, unsigned region) | ||
103 | { | ||
104 | bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len = region; | ||
105 | } | ||
106 | |||
107 | static inline unsigned bio_get_region(struct bio *bio) | ||
108 | { | ||
109 | return bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len; | ||
110 | } | ||
111 | |||
112 | /*----------------------------------------------------------------- | ||
113 | * We need an io object to keep track of the number of bios that | ||
114 | * have been dispatched for a particular io. | ||
115 | *---------------------------------------------------------------*/ | ||
116 | static void dec_count(struct io *io, unsigned int region, int error) | ||
117 | { | ||
118 | if (error) | ||
119 | set_bit(region, &io->error); | ||
120 | |||
121 | if (atomic_dec_and_test(&io->count)) { | ||
122 | if (io->sleeper) | ||
123 | wake_up_process(io->sleeper); | ||
124 | |||
125 | else { | ||
126 | int r = io->error; | ||
127 | io_notify_fn fn = io->callback; | ||
128 | void *context = io->context; | ||
129 | |||
130 | mempool_free(io, _io_pool); | ||
131 | fn(r, context); | ||
132 | } | ||
133 | } | ||
134 | } | ||
135 | |||
136 | static int endio(struct bio *bio, unsigned int done, int error) | ||
137 | { | ||
138 | struct io *io = (struct io *) bio->bi_private; | ||
139 | |||
140 | /* keep going until we've finished */ | ||
141 | if (bio->bi_size) | ||
142 | return 1; | ||
143 | |||
144 | if (error && bio_data_dir(bio) == READ) | ||
145 | zero_fill_bio(bio); | ||
146 | |||
147 | dec_count(io, bio_get_region(bio), error); | ||
148 | bio_put(bio); | ||
149 | |||
150 | return 0; | ||
151 | } | ||
152 | |||
153 | /*----------------------------------------------------------------- | ||
154 | * These little objects provide an abstraction for getting a new | ||
155 | * destination page for io. | ||
156 | *---------------------------------------------------------------*/ | ||
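/*
 * Three flavours follow: one walks a struct page_list, one walks a
 * bio_vec array and one walks a vmalloc()d region.
 */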
157 | struct dpages { | ||
158 | void (*get_page)(struct dpages *dp, | ||
159 | struct page **p, unsigned long *len, unsigned *offset); | ||
160 | void (*next_page)(struct dpages *dp); | ||
161 | |||
162 | unsigned context_u; | ||
163 | void *context_ptr; | ||
164 | }; | ||
165 | |||
166 | /* | ||
167 | * Functions for getting the pages from a list. | ||
168 | */ | ||
169 | static void list_get_page(struct dpages *dp, | ||
170 | struct page **p, unsigned long *len, unsigned *offset) | ||
171 | { | ||
172 | unsigned o = dp->context_u; | ||
173 | struct page_list *pl = (struct page_list *) dp->context_ptr; | ||
174 | |||
175 | *p = pl->page; | ||
176 | *len = PAGE_SIZE - o; | ||
177 | *offset = o; | ||
178 | } | ||
179 | |||
180 | static void list_next_page(struct dpages *dp) | ||
181 | { | ||
182 | struct page_list *pl = (struct page_list *) dp->context_ptr; | ||
183 | dp->context_ptr = pl->next; | ||
184 | dp->context_u = 0; | ||
185 | } | ||
186 | |||
187 | static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset) | ||
188 | { | ||
189 | dp->get_page = list_get_page; | ||
190 | dp->next_page = list_next_page; | ||
191 | dp->context_u = offset; | ||
192 | dp->context_ptr = pl; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * Functions for getting the pages from a bvec. | ||
197 | */ | ||
198 | static void bvec_get_page(struct dpages *dp, | ||
199 | struct page **p, unsigned long *len, unsigned *offset) | ||
200 | { | ||
201 | struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; | ||
202 | *p = bvec->bv_page; | ||
203 | *len = bvec->bv_len; | ||
204 | *offset = bvec->bv_offset; | ||
205 | } | ||
206 | |||
207 | static void bvec_next_page(struct dpages *dp) | ||
208 | { | ||
209 | struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; | ||
210 | dp->context_ptr = bvec + 1; | ||
211 | } | ||
212 | |||
213 | static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec) | ||
214 | { | ||
215 | dp->get_page = bvec_get_page; | ||
216 | dp->next_page = bvec_next_page; | ||
217 | dp->context_ptr = bvec; | ||
218 | } | ||
219 | |||
220 | static void vm_get_page(struct dpages *dp, | ||
221 | struct page **p, unsigned long *len, unsigned *offset) | ||
222 | { | ||
223 | *p = vmalloc_to_page(dp->context_ptr); | ||
224 | *offset = dp->context_u; | ||
225 | *len = PAGE_SIZE - dp->context_u; | ||
226 | } | ||
227 | |||
228 | static void vm_next_page(struct dpages *dp) | ||
229 | { | ||
230 | dp->context_ptr += PAGE_SIZE - dp->context_u; | ||
231 | dp->context_u = 0; | ||
232 | } | ||
233 | |||
234 | static void vm_dp_init(struct dpages *dp, void *data) | ||
235 | { | ||
236 | dp->get_page = vm_get_page; | ||
237 | dp->next_page = vm_next_page; | ||
238 | dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1); | ||
239 | dp->context_ptr = data; | ||
240 | } | ||
241 | |||
242 | /*----------------------------------------------------------------- | ||
243 | * IO routines that accept a list of pages. | ||
244 | *---------------------------------------------------------------*/ | ||
245 | static void do_region(int rw, unsigned int region, struct io_region *where, | ||
246 | struct dpages *dp, struct io *io) | ||
247 | { | ||
248 | struct bio *bio; | ||
249 | struct page *page; | ||
250 | unsigned long len; | ||
251 | unsigned offset; | ||
252 | unsigned num_bvecs; | ||
253 | sector_t remaining = where->count; | ||
254 | |||
255 | while (remaining) { | ||
256 | /* | ||
257 | * Allocate a suitably sized bio: one bvec per page, plus one | ||
258 | * for rounding and an extra bvec for bio_get/set_region(). | ||
259 | */ | ||
260 | num_bvecs = (remaining / (PAGE_SIZE >> 9)) + 2; | ||
261 | bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios); | ||
262 | bio->bi_sector = where->sector + (where->count - remaining); | ||
263 | bio->bi_bdev = where->bdev; | ||
264 | bio->bi_end_io = endio; | ||
265 | bio->bi_private = io; | ||
266 | bio_set_region(bio, region); | ||
267 | |||
268 | /* | ||
269 | * Try and add as many pages as possible. | ||
270 | */ | ||
271 | while (remaining) { | ||
272 | dp->get_page(dp, &page, &len, &offset); | ||
273 | len = min(len, to_bytes(remaining)); | ||
274 | if (!bio_add_page(bio, page, len, offset)) | ||
275 | break; | ||
276 | |||
277 | offset = 0; | ||
278 | remaining -= to_sector(len); | ||
279 | dp->next_page(dp); | ||
280 | } | ||
281 | |||
282 | atomic_inc(&io->count); | ||
283 | submit_bio(rw, bio); | ||
284 | } | ||
285 | } | ||
286 | |||
287 | static void dispatch_io(int rw, unsigned int num_regions, | ||
288 | struct io_region *where, struct dpages *dp, | ||
289 | struct io *io, int sync) | ||
290 | { | ||
291 | int i; | ||
292 | struct dpages old_pages = *dp; | ||
293 | |||
294 | if (sync) | ||
295 | rw |= (1 << BIO_RW_SYNC); | ||
296 | |||
297 | /* | ||
298 | * For multiple regions we need to be careful to rewind | ||
299 | * the dp object for each call to do_region. | ||
300 | */ | ||
301 | for (i = 0; i < num_regions; i++) { | ||
302 | *dp = old_pages; | ||
303 | if (where[i].count) | ||
304 | do_region(rw, i, where + i, dp, io); | ||
305 | } | ||
306 | |||
307 | /* | ||
308 | * Drop the extra reference that we were holding to avoid | ||
309 | * the io being completed too early. | ||
310 | */ | ||
311 | dec_count(io, 0, 0); | ||
312 | } | ||
313 | |||
314 | static int sync_io(unsigned int num_regions, struct io_region *where, | ||
315 | int rw, struct dpages *dp, unsigned long *error_bits) | ||
316 | { | ||
317 | struct io io; | ||
318 | |||
319 | if (num_regions > 1 && rw != WRITE) { | ||
320 | WARN_ON(1); | ||
321 | return -EIO; | ||
322 | } | ||
323 | |||
324 | io.error = 0; | ||
325 | atomic_set(&io.count, 1); /* see dispatch_io() */ | ||
326 | io.sleeper = current; | ||
327 | |||
328 | dispatch_io(rw, num_regions, where, dp, &io, 1); | ||
329 | |||
330 | while (1) { | ||
331 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
332 | |||
333 | if (!atomic_read(&io.count) || signal_pending(current)) | ||
334 | break; | ||
335 | |||
336 | io_schedule(); | ||
337 | } | ||
338 | set_current_state(TASK_RUNNING); | ||
339 | |||
340 | if (atomic_read(&io.count)) | ||
341 | return -EINTR; | ||
342 | |||
343 | *error_bits = io.error; | ||
344 | return io.error ? -EIO : 0; | ||
345 | } | ||
346 | |||
347 | static int async_io(unsigned int num_regions, struct io_region *where, int rw, | ||
348 | struct dpages *dp, io_notify_fn fn, void *context) | ||
349 | { | ||
350 | struct io *io; | ||
351 | |||
352 | if (num_regions > 1 && rw != WRITE) { | ||
353 | WARN_ON(1); | ||
354 | fn(1, context); | ||
355 | return -EIO; | ||
356 | } | ||
357 | |||
358 | io = mempool_alloc(_io_pool, GFP_NOIO); | ||
359 | io->error = 0; | ||
360 | atomic_set(&io->count, 1); /* see dispatch_io() */ | ||
361 | io->sleeper = NULL; | ||
362 | io->callback = fn; | ||
363 | io->context = context; | ||
364 | |||
365 | dispatch_io(rw, num_regions, where, dp, io, 0); | ||
366 | return 0; | ||
367 | } | ||
368 | |||
369 | int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, | ||
370 | struct page_list *pl, unsigned int offset, | ||
371 | unsigned long *error_bits) | ||
372 | { | ||
373 | struct dpages dp; | ||
374 | list_dp_init(&dp, pl, offset); | ||
375 | return sync_io(num_regions, where, rw, &dp, error_bits); | ||
376 | } | ||
377 | |||
378 | int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw, | ||
379 | struct bio_vec *bvec, unsigned long *error_bits) | ||
380 | { | ||
381 | struct dpages dp; | ||
382 | bvec_dp_init(&dp, bvec); | ||
383 | return sync_io(num_regions, where, rw, &dp, error_bits); | ||
384 | } | ||
385 | |||
386 | int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, | ||
387 | void *data, unsigned long *error_bits) | ||
388 | { | ||
389 | struct dpages dp; | ||
390 | vm_dp_init(&dp, data); | ||
391 | return sync_io(num_regions, where, rw, &dp, error_bits); | ||
392 | } | ||
393 | |||
394 | int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, | ||
395 | struct page_list *pl, unsigned int offset, | ||
396 | io_notify_fn fn, void *context) | ||
397 | { | ||
398 | struct dpages dp; | ||
399 | list_dp_init(&dp, pl, offset); | ||
400 | return async_io(num_regions, where, rw, &dp, fn, context); | ||
401 | } | ||
402 | |||
403 | int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw, | ||
404 | struct bio_vec *bvec, io_notify_fn fn, void *context) | ||
405 | { | ||
406 | struct dpages dp; | ||
407 | bvec_dp_init(&dp, bvec); | ||
408 | return async_io(num_regions, where, rw, &dp, fn, context); | ||
409 | } | ||
410 | |||
411 | int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw, | ||
412 | void *data, io_notify_fn fn, void *context) | ||
413 | { | ||
414 | struct dpages dp; | ||
415 | vm_dp_init(&dp, data); | ||
416 | return async_io(num_regions, where, rw, &dp, fn, context); | ||
417 | } | ||
418 | |||
419 | EXPORT_SYMBOL(dm_io_get); | ||
420 | EXPORT_SYMBOL(dm_io_put); | ||
421 | EXPORT_SYMBOL(dm_io_sync); | ||
422 | EXPORT_SYMBOL(dm_io_async); | ||
423 | EXPORT_SYMBOL(dm_io_sync_bvec); | ||
424 | EXPORT_SYMBOL(dm_io_async_bvec); | ||
425 | EXPORT_SYMBOL(dm_io_sync_vm); | ||
426 | EXPORT_SYMBOL(dm_io_async_vm); | ||
diff --git a/drivers/md/dm-io.h b/drivers/md/dm-io.h new file mode 100644 index 000000000000..1a77f3265706 --- /dev/null +++ b/drivers/md/dm-io.h | |||
@@ -0,0 +1,77 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef _DM_IO_H | ||
8 | #define _DM_IO_H | ||
9 | |||
10 | #include "dm.h" | ||
11 | |||
12 | /* FIXME make this configurable */ | ||
13 | #define DM_MAX_IO_REGIONS 8 | ||
14 | |||
15 | struct io_region { | ||
16 | struct block_device *bdev; | ||
17 | sector_t sector; | ||
18 | sector_t count; | ||
19 | }; | ||
20 | |||
21 | struct page_list { | ||
22 | struct page_list *next; | ||
23 | struct page *page; | ||
24 | }; | ||
25 | |||
26 | |||
27 | /* | ||
28 | * 'error' is a bitset, with each bit indicating whether an error | ||
29 | * occurred doing io to the corresponding region. | ||
30 | */ | ||
31 | typedef void (*io_notify_fn)(unsigned long error, void *context); | ||
32 | |||
33 | |||
34 | /* | ||
35 | * Before anyone uses the IO interface they should call | ||
36 | * dm_io_get(), specifying roughly how many pages they are | ||
37 | * expecting to perform io on concurrently. | ||
38 | * | ||
39 | * This function may block. | ||
40 | */ | ||
41 | int dm_io_get(unsigned int num_pages); | ||
42 | void dm_io_put(unsigned int num_pages); | ||
43 | |||
44 | /* | ||
45 | * Synchronous IO. | ||
46 | * | ||
47 | * Please ensure that the rw flag in the functions below is | ||
48 | * either READ or WRITE, i.e. we don't take READA.  Any | ||
49 | * regions with a zero count field will be ignored. | ||
50 | */ | ||
51 | int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, | ||
52 | struct page_list *pl, unsigned int offset, | ||
53 | unsigned long *error_bits); | ||
54 | |||
55 | int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw, | ||
56 | struct bio_vec *bvec, unsigned long *error_bits); | ||
57 | |||
58 | int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, | ||
59 | void *data, unsigned long *error_bits); | ||
60 | |||
61 | /* | ||
62 | * Asynchronous IO. | ||
63 | * | ||
64 | * The 'where' array may be safely allocated on the stack since | ||
65 | * the function takes a copy. | ||
66 | */ | ||
67 | int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, | ||
68 | struct page_list *pl, unsigned int offset, | ||
69 | io_notify_fn fn, void *context); | ||
70 | |||
71 | int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw, | ||
72 | struct bio_vec *bvec, io_notify_fn fn, void *context); | ||
73 | |||
74 | int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw, | ||
75 | void *data, io_notify_fn fn, void *context); | ||
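
/*
 * Illustrative use (a sketch, not part of the original interface
 * comments): synchronously read 'count' sectors into a vmalloc'd
 * buffer 'data'.
 *
 *	struct io_region where = {
 *		.bdev = bdev,
 *		.sector = start,
 *		.count = count,
 *	};
 *	unsigned long error_bits;
 *	int r;
 *
 *	r = dm_io_get(count / (PAGE_SIZE >> 9));
 *	if (!r) {
 *		r = dm_io_sync_vm(1, &where, READ, data, &error_bits);
 *		dm_io_put(count / (PAGE_SIZE >> 9));
 *	}
 */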
76 | |||
77 | #endif | ||
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c new file mode 100644 index 000000000000..ee3c869d9701 --- /dev/null +++ b/drivers/md/dm-ioctl.c | |||
@@ -0,0 +1,1416 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. | ||
3 | * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | */ | ||
7 | |||
8 | #include "dm.h" | ||
9 | |||
10 | #include <linux/module.h> | ||
11 | #include <linux/vmalloc.h> | ||
12 | #include <linux/miscdevice.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/wait.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/devfs_fs_kernel.h> | ||
17 | #include <linux/dm-ioctl.h> | ||
18 | |||
19 | #include <asm/uaccess.h> | ||
20 | |||
21 | #define DM_DRIVER_EMAIL "dm-devel@redhat.com" | ||
22 | |||
23 | /*----------------------------------------------------------------- | ||
24 | * The ioctl interface needs to be able to look up devices by | ||
25 | * name or uuid. | ||
26 | *---------------------------------------------------------------*/ | ||
27 | struct hash_cell { | ||
28 | struct list_head name_list; | ||
29 | struct list_head uuid_list; | ||
30 | |||
31 | char *name; | ||
32 | char *uuid; | ||
33 | struct mapped_device *md; | ||
34 | struct dm_table *new_map; | ||
35 | }; | ||
36 | |||
37 | struct vers_iter { | ||
38 | size_t param_size; | ||
39 | struct dm_target_versions *vers, *old_vers; | ||
40 | char *end; | ||
41 | uint32_t flags; | ||
42 | }; | ||
43 | |||
44 | |||
45 | #define NUM_BUCKETS 64 | ||
46 | #define MASK_BUCKETS (NUM_BUCKETS - 1) | ||
47 | static struct list_head _name_buckets[NUM_BUCKETS]; | ||
48 | static struct list_head _uuid_buckets[NUM_BUCKETS]; | ||
49 | |||
50 | static void dm_hash_remove_all(void); | ||
51 | |||
52 | /* | ||
53 | * Guards access to both hash tables. | ||
54 | */ | ||
55 | static DECLARE_RWSEM(_hash_lock); | ||
56 | |||
57 | static void init_buckets(struct list_head *buckets) | ||
58 | { | ||
59 | unsigned int i; | ||
60 | |||
61 | for (i = 0; i < NUM_BUCKETS; i++) | ||
62 | INIT_LIST_HEAD(buckets + i); | ||
63 | } | ||
64 | |||
65 | static int dm_hash_init(void) | ||
66 | { | ||
67 | init_buckets(_name_buckets); | ||
68 | init_buckets(_uuid_buckets); | ||
69 | devfs_mk_dir(DM_DIR); | ||
70 | return 0; | ||
71 | } | ||
72 | |||
73 | static void dm_hash_exit(void) | ||
74 | { | ||
75 | dm_hash_remove_all(); | ||
76 | devfs_remove(DM_DIR); | ||
77 | } | ||
78 | |||
79 | /*----------------------------------------------------------------- | ||
80 | * Hash function: | ||
81 | * We're not really concerned with the str hash function being | ||
82 | * fast since it's only used by the ioctl interface. | ||
83 | *---------------------------------------------------------------*/ | ||
84 | static unsigned int hash_str(const char *str) | ||
85 | { | ||
86 | const unsigned int hash_mult = 2654435387U; | ||
87 | unsigned int h = 0; | ||
88 | |||
89 | while (*str) | ||
90 | h = (h + (unsigned int) *str++) * hash_mult; | ||
91 | |||
92 | return h & MASK_BUCKETS; | ||
93 | } | ||
94 | |||
95 | /*----------------------------------------------------------------- | ||
96 | * Code for looking up a device by name | ||
97 | *---------------------------------------------------------------*/ | ||
98 | static struct hash_cell *__get_name_cell(const char *str) | ||
99 | { | ||
100 | struct hash_cell *hc; | ||
101 | unsigned int h = hash_str(str); | ||
102 | |||
103 | list_for_each_entry (hc, _name_buckets + h, name_list) | ||
104 | if (!strcmp(hc->name, str)) | ||
105 | return hc; | ||
106 | |||
107 | return NULL; | ||
108 | } | ||
109 | |||
110 | static struct hash_cell *__get_uuid_cell(const char *str) | ||
111 | { | ||
112 | struct hash_cell *hc; | ||
113 | unsigned int h = hash_str(str); | ||
114 | |||
115 | list_for_each_entry (hc, _uuid_buckets + h, uuid_list) | ||
116 | if (!strcmp(hc->uuid, str)) | ||
117 | return hc; | ||
118 | |||
119 | return NULL; | ||
120 | } | ||
121 | |||
122 | /*----------------------------------------------------------------- | ||
123 | * Inserting, removing and renaming a device. | ||
124 | *---------------------------------------------------------------*/ | ||
125 | static inline char *kstrdup(const char *str) | ||
126 | { | ||
127 | char *r = kmalloc(strlen(str) + 1, GFP_KERNEL); | ||
128 | if (r) | ||
129 | strcpy(r, str); | ||
130 | return r; | ||
131 | } | ||
132 | |||
133 | static struct hash_cell *alloc_cell(const char *name, const char *uuid, | ||
134 | struct mapped_device *md) | ||
135 | { | ||
136 | struct hash_cell *hc; | ||
137 | |||
138 | hc = kmalloc(sizeof(*hc), GFP_KERNEL); | ||
139 | if (!hc) | ||
140 | return NULL; | ||
141 | |||
142 | hc->name = kstrdup(name); | ||
143 | if (!hc->name) { | ||
144 | kfree(hc); | ||
145 | return NULL; | ||
146 | } | ||
147 | |||
148 | if (!uuid) | ||
149 | hc->uuid = NULL; | ||
150 | |||
151 | else { | ||
152 | hc->uuid = kstrdup(uuid); | ||
153 | if (!hc->uuid) { | ||
154 | kfree(hc->name); | ||
155 | kfree(hc); | ||
156 | return NULL; | ||
157 | } | ||
158 | } | ||
159 | |||
160 | INIT_LIST_HEAD(&hc->name_list); | ||
161 | INIT_LIST_HEAD(&hc->uuid_list); | ||
162 | hc->md = md; | ||
163 | hc->new_map = NULL; | ||
164 | return hc; | ||
165 | } | ||
166 | |||
167 | static void free_cell(struct hash_cell *hc) | ||
168 | { | ||
169 | if (hc) { | ||
170 | kfree(hc->name); | ||
171 | kfree(hc->uuid); | ||
172 | kfree(hc); | ||
173 | } | ||
174 | } | ||
175 | |||
176 | /* | ||
177 | * devfs stuff. | ||
178 | */ | ||
179 | static int register_with_devfs(struct hash_cell *hc) | ||
180 | { | ||
181 | struct gendisk *disk = dm_disk(hc->md); | ||
182 | |||
183 | devfs_mk_bdev(MKDEV(disk->major, disk->first_minor), | ||
184 | S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP, | ||
185 | DM_DIR "/%s", hc->name); | ||
186 | return 0; | ||
187 | } | ||
188 | |||
189 | static int unregister_with_devfs(struct hash_cell *hc) | ||
190 | { | ||
191 | devfs_remove(DM_DIR"/%s", hc->name); | ||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * The kdev_t and uuid of a device can never change once it is | ||
197 | * initially inserted. | ||
198 | */ | ||
199 | static int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md) | ||
200 | { | ||
201 | struct hash_cell *cell; | ||
202 | |||
203 | /* | ||
204 | * Allocate the new cells. | ||
205 | */ | ||
206 | cell = alloc_cell(name, uuid, md); | ||
207 | if (!cell) | ||
208 | return -ENOMEM; | ||
209 | |||
210 | /* | ||
211 | * Insert the cell into both hash tables. | ||
212 | */ | ||
213 | down_write(&_hash_lock); | ||
214 | if (__get_name_cell(name)) | ||
215 | goto bad; | ||
216 | |||
217 | list_add(&cell->name_list, _name_buckets + hash_str(name)); | ||
218 | |||
219 | if (uuid) { | ||
220 | if (__get_uuid_cell(uuid)) { | ||
221 | list_del(&cell->name_list); | ||
222 | goto bad; | ||
223 | } | ||
224 | list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); | ||
225 | } | ||
226 | register_with_devfs(cell); | ||
227 | dm_get(md); | ||
228 | dm_set_mdptr(md, cell); | ||
229 | up_write(&_hash_lock); | ||
230 | |||
231 | return 0; | ||
232 | |||
233 | bad: | ||
234 | up_write(&_hash_lock); | ||
235 | free_cell(cell); | ||
236 | return -EBUSY; | ||
237 | } | ||
238 | |||
239 | static void __hash_remove(struct hash_cell *hc) | ||
240 | { | ||
241 | /* remove from the dev hash */ | ||
242 | list_del(&hc->uuid_list); | ||
243 | list_del(&hc->name_list); | ||
244 | unregister_with_devfs(hc); | ||
245 | dm_set_mdptr(hc->md, NULL); | ||
246 | dm_put(hc->md); | ||
247 | if (hc->new_map) | ||
248 | dm_table_put(hc->new_map); | ||
249 | free_cell(hc); | ||
250 | } | ||
251 | |||
252 | static void dm_hash_remove_all(void) | ||
253 | { | ||
254 | int i; | ||
255 | struct hash_cell *hc; | ||
256 | struct list_head *tmp, *n; | ||
257 | |||
258 | down_write(&_hash_lock); | ||
259 | for (i = 0; i < NUM_BUCKETS; i++) { | ||
260 | list_for_each_safe (tmp, n, _name_buckets + i) { | ||
261 | hc = list_entry(tmp, struct hash_cell, name_list); | ||
262 | __hash_remove(hc); | ||
263 | } | ||
264 | } | ||
265 | up_write(&_hash_lock); | ||
266 | } | ||
267 | |||
268 | static int dm_hash_rename(const char *old, const char *new) | ||
269 | { | ||
270 | char *new_name, *old_name; | ||
271 | struct hash_cell *hc; | ||
272 | |||
273 | /* | ||
274 | * duplicate new. | ||
275 | */ | ||
276 | new_name = kstrdup(new); | ||
277 | if (!new_name) | ||
278 | return -ENOMEM; | ||
279 | |||
280 | down_write(&_hash_lock); | ||
281 | |||
282 | /* | ||
283 | * Is the new name free? | ||
284 | */ | ||
285 | hc = __get_name_cell(new); | ||
286 | if (hc) { | ||
287 | DMWARN("asked to rename to an already existing name %s -> %s", | ||
288 | old, new); | ||
289 | up_write(&_hash_lock); | ||
290 | kfree(new_name); | ||
291 | return -EBUSY; | ||
292 | } | ||
293 | |||
294 | /* | ||
295 | * Is there such a device as 'old' ? | ||
296 | */ | ||
297 | hc = __get_name_cell(old); | ||
298 | if (!hc) { | ||
299 | DMWARN("asked to rename a non existent device %s -> %s", | ||
300 | old, new); | ||
301 | up_write(&_hash_lock); | ||
302 | kfree(new_name); | ||
303 | return -ENXIO; | ||
304 | } | ||
305 | |||
306 | /* | ||
307 | * rename and move the name cell. | ||
308 | */ | ||
309 | unregister_with_devfs(hc); | ||
310 | |||
311 | list_del(&hc->name_list); | ||
312 | old_name = hc->name; | ||
313 | hc->name = new_name; | ||
314 | list_add(&hc->name_list, _name_buckets + hash_str(new_name)); | ||
315 | |||
316 | /* rename the device node in devfs */ | ||
317 | register_with_devfs(hc); | ||
318 | |||
319 | up_write(&_hash_lock); | ||
320 | kfree(old_name); | ||
321 | return 0; | ||
322 | } | ||
323 | |||
324 | /*----------------------------------------------------------------- | ||
325 | * Implementation of the ioctl commands | ||
326 | *---------------------------------------------------------------*/ | ||
327 | /* | ||
328 | * All the ioctl commands get dispatched to functions with this | ||
329 | * prototype. | ||
330 | */ | ||
331 | typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size); | ||
332 | |||
333 | static int remove_all(struct dm_ioctl *param, size_t param_size) | ||
334 | { | ||
335 | dm_hash_remove_all(); | ||
336 | param->data_size = 0; | ||
337 | return 0; | ||
338 | } | ||
339 | |||
340 | /* | ||
341 | * Round up the ptr to an 8-byte boundary. | ||
342 | */ | ||
343 | #define ALIGN_MASK 7 | ||
344 | static inline void *align_ptr(void *ptr) | ||
345 | { | ||
346 | return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK); | ||
347 | } | ||
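
The effect of align_ptr() is easier to see with concrete numbers. Below is a user-space sketch of the same round-up-to-8-bytes arithmetic, written with uintptr_t instead of GCC's arithmetic on void pointers; the sample addresses are arbitrary.

#include <stdio.h>
#include <stdint.h>

#define ALIGN_MASK 7

/* Round an address up to the next 8-byte boundary. */
static uintptr_t align_up(uintptr_t p)
{
	return (p + ALIGN_MASK) & ~(uintptr_t) ALIGN_MASK;
}

int main(void)
{
	uintptr_t samples[] = { 0x1000, 0x1001, 0x1007, 0x1008 };
	unsigned int i;

	for (i = 0; i < 4; i++)
		printf("%#lx -> %#lx\n",
		       (unsigned long) samples[i],
		       (unsigned long) align_up(samples[i]));
	return 0;
}
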
348 | |||
349 | /* | ||
350 | * Retrieves the data payload buffer from an already allocated | ||
351 | * struct dm_ioctl. | ||
352 | */ | ||
353 | static void *get_result_buffer(struct dm_ioctl *param, size_t param_size, | ||
354 | size_t *len) | ||
355 | { | ||
356 | param->data_start = align_ptr(param + 1) - (void *) param; | ||
357 | |||
358 | if (param->data_start < param_size) | ||
359 | *len = param_size - param->data_start; | ||
360 | else | ||
361 | *len = 0; | ||
362 | |||
363 | return ((void *) param) + param->data_start; | ||
364 | } | ||
365 | |||
366 | static int list_devices(struct dm_ioctl *param, size_t param_size) | ||
367 | { | ||
368 | unsigned int i; | ||
369 | struct hash_cell *hc; | ||
370 | size_t len, needed = 0; | ||
371 | struct gendisk *disk; | ||
372 | struct dm_name_list *nl, *old_nl = NULL; | ||
373 | |||
374 | down_write(&_hash_lock); | ||
375 | |||
376 | /* | ||
377 | * Loop through all the devices working out how much | ||
378 | * space we need. | ||
379 | */ | ||
380 | for (i = 0; i < NUM_BUCKETS; i++) { | ||
381 | list_for_each_entry (hc, _name_buckets + i, name_list) { | ||
382 | needed += sizeof(struct dm_name_list); | ||
383 | needed += strlen(hc->name) + 1; | ||
384 | needed += ALIGN_MASK; | ||
385 | } | ||
386 | } | ||
387 | |||
388 | /* | ||
389 | * Grab our output buffer. | ||
390 | */ | ||
391 | nl = get_result_buffer(param, param_size, &len); | ||
392 | if (len < needed) { | ||
393 | param->flags |= DM_BUFFER_FULL_FLAG; | ||
394 | goto out; | ||
395 | } | ||
396 | param->data_size = param->data_start + needed; | ||
397 | |||
398 | nl->dev = 0; /* Flags no data */ | ||
399 | |||
400 | /* | ||
401 | * Now loop through filling out the names. | ||
402 | */ | ||
403 | for (i = 0; i < NUM_BUCKETS; i++) { | ||
404 | list_for_each_entry (hc, _name_buckets + i, name_list) { | ||
405 | if (old_nl) | ||
406 | old_nl->next = (uint32_t) ((void *) nl - | ||
407 | (void *) old_nl); | ||
408 | disk = dm_disk(hc->md); | ||
409 | nl->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); | ||
410 | nl->next = 0; | ||
411 | strcpy(nl->name, hc->name); | ||
412 | |||
413 | old_nl = nl; | ||
414 | nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1); | ||
415 | } | ||
416 | } | ||
417 | |||
418 | out: | ||
419 | up_write(&_hash_lock); | ||
420 | return 0; | ||
421 | } | ||
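
The buffer built by list_devices() is a chain of variable-length dm_name_list records, linked by byte offsets in the 'next' field and terminated by next == 0, with dev == 0 flagging an empty list. A user-space caller might walk it roughly as below; this is a sketch that assumes 'buf' already points param->data_start bytes into a dm_ioctl buffer filled in by DM_LIST_DEVICES, and it omits the ioctl itself and all error handling.

#include <stdio.h>
#include <linux/dm-ioctl.h>

/* Walk the dm_name_list chain packed by list_devices(). */
static void walk_name_list(void *buf)
{
	struct dm_name_list *nl = buf;

	if (!nl->dev)		/* dev == 0 flags an empty list */
		return;

	for (;;) {
		printf("%s (dev %llu)\n", nl->name,
		       (unsigned long long) nl->dev);
		if (!nl->next)	/* next == 0 ends the chain */
			break;
		nl = (struct dm_name_list *) ((char *) nl + nl->next);
	}
}
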
422 | |||
423 | static void list_version_get_needed(struct target_type *tt, void *needed_param) | ||
424 | { | ||
425 | size_t *needed = needed_param; | ||
426 | |||
427 | *needed += strlen(tt->name); | ||
428 | *needed += sizeof(tt->version); | ||
429 | *needed += ALIGN_MASK; | ||
430 | } | ||
431 | |||
432 | static void list_version_get_info(struct target_type *tt, void *param) | ||
433 | { | ||
434 | struct vers_iter *info = param; | ||
435 | |||
436 | /* Check space - it might have changed since the first iteration */ | ||
437 | if ((char *)info->vers + sizeof(tt->version) + strlen(tt->name) + 1 > | ||
438 | info->end) { | ||
439 | |||
440 | info->flags = DM_BUFFER_FULL_FLAG; | ||
441 | return; | ||
442 | } | ||
443 | |||
444 | if (info->old_vers) | ||
445 | info->old_vers->next = (uint32_t) ((void *)info->vers - | ||
446 | (void *)info->old_vers); | ||
447 | info->vers->version[0] = tt->version[0]; | ||
448 | info->vers->version[1] = tt->version[1]; | ||
449 | info->vers->version[2] = tt->version[2]; | ||
450 | info->vers->next = 0; | ||
451 | strcpy(info->vers->name, tt->name); | ||
452 | |||
453 | info->old_vers = info->vers; | ||
454 | info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1); | ||
455 | } | ||
456 | |||
457 | static int list_versions(struct dm_ioctl *param, size_t param_size) | ||
458 | { | ||
459 | size_t len, needed = 0; | ||
460 | struct dm_target_versions *vers; | ||
461 | struct vers_iter iter_info; | ||
462 | |||
463 | /* | ||
464 | * Loop through all the registered target types working out how much | ||
465 | * space we need. | ||
466 | */ | ||
467 | dm_target_iterate(list_version_get_needed, &needed); | ||
468 | |||
469 | /* | ||
470 | * Grab our output buffer. | ||
471 | */ | ||
472 | vers = get_result_buffer(param, param_size, &len); | ||
473 | if (len < needed) { | ||
474 | param->flags |= DM_BUFFER_FULL_FLAG; | ||
475 | goto out; | ||
476 | } | ||
477 | param->data_size = param->data_start + needed; | ||
478 | |||
479 | iter_info.param_size = param_size; | ||
480 | iter_info.old_vers = NULL; | ||
481 | iter_info.vers = vers; | ||
482 | iter_info.flags = 0; | ||
483 | iter_info.end = (char *)vers+len; | ||
484 | |||
485 | /* | ||
486 | * Now loop through filling out the names & versions. | ||
487 | */ | ||
488 | dm_target_iterate(list_version_get_info, &iter_info); | ||
489 | param->flags |= iter_info.flags; | ||
490 | |||
491 | out: | ||
492 | return 0; | ||
493 | } | ||
494 | |||
495 | |||
496 | |||
497 | static int check_name(const char *name) | ||
498 | { | ||
499 | if (strchr(name, '/')) { | ||
500 | DMWARN("invalid device name"); | ||
501 | return -EINVAL; | ||
502 | } | ||
503 | |||
504 | return 0; | ||
505 | } | ||
506 | |||
507 | /* | ||
508 | * Fills in a dm_ioctl structure, ready for sending back to | ||
509 | * userland. | ||
510 | */ | ||
511 | static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) | ||
512 | { | ||
513 | struct gendisk *disk = dm_disk(md); | ||
514 | struct dm_table *table; | ||
515 | struct block_device *bdev; | ||
516 | |||
517 | param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | | ||
518 | DM_ACTIVE_PRESENT_FLAG); | ||
519 | |||
520 | if (dm_suspended(md)) | ||
521 | param->flags |= DM_SUSPEND_FLAG; | ||
522 | |||
523 | param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); | ||
524 | |||
525 | if (!(param->flags & DM_SKIP_BDGET_FLAG)) { | ||
526 | bdev = bdget_disk(disk, 0); | ||
527 | if (!bdev) | ||
528 | return -ENXIO; | ||
529 | |||
530 | /* | ||
531 | * Yes, this will be out of date by the time it gets back | ||
532 | * to userland, but it is still very useful for | ||
533 | * debugging. | ||
534 | */ | ||
535 | param->open_count = bdev->bd_openers; | ||
536 | bdput(bdev); | ||
537 | } else | ||
538 | param->open_count = -1; | ||
539 | |||
540 | if (disk->policy) | ||
541 | param->flags |= DM_READONLY_FLAG; | ||
542 | |||
543 | param->event_nr = dm_get_event_nr(md); | ||
544 | |||
545 | table = dm_get_table(md); | ||
546 | if (table) { | ||
547 | param->flags |= DM_ACTIVE_PRESENT_FLAG; | ||
548 | param->target_count = dm_table_get_num_targets(table); | ||
549 | dm_table_put(table); | ||
550 | } else | ||
551 | param->target_count = 0; | ||
552 | |||
553 | return 0; | ||
554 | } | ||
555 | |||
556 | static int dev_create(struct dm_ioctl *param, size_t param_size) | ||
557 | { | ||
558 | int r; | ||
559 | struct mapped_device *md; | ||
560 | |||
561 | r = check_name(param->name); | ||
562 | if (r) | ||
563 | return r; | ||
564 | |||
565 | if (param->flags & DM_PERSISTENT_DEV_FLAG) | ||
566 | r = dm_create_with_minor(MINOR(huge_decode_dev(param->dev)), &md); | ||
567 | else | ||
568 | r = dm_create(&md); | ||
569 | |||
570 | if (r) | ||
571 | return r; | ||
572 | |||
573 | r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md); | ||
574 | if (r) { | ||
575 | dm_put(md); | ||
576 | return r; | ||
577 | } | ||
578 | |||
579 | param->flags &= ~DM_INACTIVE_PRESENT_FLAG; | ||
580 | |||
581 | r = __dev_status(md, param); | ||
582 | dm_put(md); | ||
583 | |||
584 | return r; | ||
585 | } | ||
586 | |||
587 | /* | ||
588 | * Always use UUID for lookups if it's present, otherwise use name or dev. | ||
589 | */ | ||
590 | static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) | ||
591 | { | ||
592 | if (*param->uuid) | ||
593 | return __get_uuid_cell(param->uuid); | ||
594 | else if (*param->name) | ||
595 | return __get_name_cell(param->name); | ||
596 | else | ||
597 | return dm_get_mdptr(huge_decode_dev(param->dev)); | ||
598 | } | ||
599 | |||
600 | static inline struct mapped_device *find_device(struct dm_ioctl *param) | ||
601 | { | ||
602 | struct hash_cell *hc; | ||
603 | struct mapped_device *md = NULL; | ||
604 | |||
605 | down_read(&_hash_lock); | ||
606 | hc = __find_device_hash_cell(param); | ||
607 | if (hc) { | ||
608 | md = hc->md; | ||
609 | dm_get(md); | ||
610 | |||
611 | /* | ||
612 | * Sneakily write in both the name and the uuid | ||
613 | * while we have the cell. | ||
614 | */ | ||
615 | strncpy(param->name, hc->name, sizeof(param->name)); | ||
616 | if (hc->uuid) | ||
617 | strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1); | ||
618 | else | ||
619 | param->uuid[0] = '\0'; | ||
620 | |||
621 | if (hc->new_map) | ||
622 | param->flags |= DM_INACTIVE_PRESENT_FLAG; | ||
623 | else | ||
624 | param->flags &= ~DM_INACTIVE_PRESENT_FLAG; | ||
625 | } | ||
626 | up_read(&_hash_lock); | ||
627 | |||
628 | return md; | ||
629 | } | ||
630 | |||
631 | static int dev_remove(struct dm_ioctl *param, size_t param_size) | ||
632 | { | ||
633 | struct hash_cell *hc; | ||
634 | |||
635 | down_write(&_hash_lock); | ||
636 | hc = __find_device_hash_cell(param); | ||
637 | |||
638 | if (!hc) { | ||
639 | DMWARN("device doesn't appear to be in the dev hash table."); | ||
640 | up_write(&_hash_lock); | ||
641 | return -ENXIO; | ||
642 | } | ||
643 | |||
644 | __hash_remove(hc); | ||
645 | up_write(&_hash_lock); | ||
646 | param->data_size = 0; | ||
647 | return 0; | ||
648 | } | ||
649 | |||
650 | /* | ||
651 | * Check a string doesn't overrun the chunk of | ||
652 | * memory we copied from userland. | ||
653 | */ | ||
654 | static int invalid_str(char *str, void *end) | ||
655 | { | ||
656 | while ((void *) str < end) | ||
657 | if (!*str++) | ||
658 | return 0; | ||
659 | |||
660 | return -EINVAL; | ||
661 | } | ||
662 | |||
663 | static int dev_rename(struct dm_ioctl *param, size_t param_size) | ||
664 | { | ||
665 | int r; | ||
666 | char *new_name = (char *) param + param->data_start; | ||
667 | |||
668 | if (new_name < (char *) (param + 1) || | ||
669 | invalid_str(new_name, (void *) param + param_size)) { | ||
670 | DMWARN("Invalid new logical volume name supplied."); | ||
671 | return -EINVAL; | ||
672 | } | ||
673 | |||
674 | r = check_name(new_name); | ||
675 | if (r) | ||
676 | return r; | ||
677 | |||
678 | param->data_size = 0; | ||
679 | return dm_hash_rename(param->name, new_name); | ||
680 | } | ||
681 | |||
682 | static int do_suspend(struct dm_ioctl *param) | ||
683 | { | ||
684 | int r = 0; | ||
685 | struct mapped_device *md; | ||
686 | |||
687 | md = find_device(param); | ||
688 | if (!md) | ||
689 | return -ENXIO; | ||
690 | |||
691 | if (!dm_suspended(md)) | ||
692 | r = dm_suspend(md); | ||
693 | |||
694 | if (!r) | ||
695 | r = __dev_status(md, param); | ||
696 | |||
697 | dm_put(md); | ||
698 | return r; | ||
699 | } | ||
700 | |||
701 | static int do_resume(struct dm_ioctl *param) | ||
702 | { | ||
703 | int r = 0; | ||
704 | struct hash_cell *hc; | ||
705 | struct mapped_device *md; | ||
706 | struct dm_table *new_map; | ||
707 | |||
708 | down_write(&_hash_lock); | ||
709 | |||
710 | hc = __find_device_hash_cell(param); | ||
711 | if (!hc) { | ||
712 | DMWARN("device doesn't appear to be in the dev hash table."); | ||
713 | up_write(&_hash_lock); | ||
714 | return -ENXIO; | ||
715 | } | ||
716 | |||
717 | md = hc->md; | ||
718 | dm_get(md); | ||
719 | |||
720 | new_map = hc->new_map; | ||
721 | hc->new_map = NULL; | ||
722 | param->flags &= ~DM_INACTIVE_PRESENT_FLAG; | ||
723 | |||
724 | up_write(&_hash_lock); | ||
725 | |||
726 | /* Do we need to load a new map ? */ | ||
727 | if (new_map) { | ||
728 | /* Suspend if it isn't already suspended */ | ||
729 | if (!dm_suspended(md)) | ||
730 | dm_suspend(md); | ||
731 | |||
732 | r = dm_swap_table(md, new_map); | ||
733 | if (r) { | ||
734 | dm_put(md); | ||
735 | dm_table_put(new_map); | ||
736 | return r; | ||
737 | } | ||
738 | |||
739 | if (dm_table_get_mode(new_map) & FMODE_WRITE) | ||
740 | set_disk_ro(dm_disk(md), 0); | ||
741 | else | ||
742 | set_disk_ro(dm_disk(md), 1); | ||
743 | |||
744 | dm_table_put(new_map); | ||
745 | } | ||
746 | |||
747 | if (dm_suspended(md)) | ||
748 | r = dm_resume(md); | ||
749 | |||
750 | if (!r) | ||
751 | r = __dev_status(md, param); | ||
752 | |||
753 | dm_put(md); | ||
754 | return r; | ||
755 | } | ||
756 | |||
757 | /* | ||
758 | * Set or unset the suspension state of a device. | ||
759 | * If the device already is in the requested state we just return its status. | ||
760 | */ | ||
761 | static int dev_suspend(struct dm_ioctl *param, size_t param_size) | ||
762 | { | ||
763 | if (param->flags & DM_SUSPEND_FLAG) | ||
764 | return do_suspend(param); | ||
765 | |||
766 | return do_resume(param); | ||
767 | } | ||
768 | |||
769 | /* | ||
770 | * Copies device info back to user space, used by | ||
771 | * the create and info ioctls. | ||
772 | */ | ||
773 | static int dev_status(struct dm_ioctl *param, size_t param_size) | ||
774 | { | ||
775 | int r; | ||
776 | struct mapped_device *md; | ||
777 | |||
778 | md = find_device(param); | ||
779 | if (!md) | ||
780 | return -ENXIO; | ||
781 | |||
782 | r = __dev_status(md, param); | ||
783 | dm_put(md); | ||
784 | return r; | ||
785 | } | ||
786 | |||
787 | /* | ||
788 | * Build up the status struct for each target | ||
789 | */ | ||
790 | static void retrieve_status(struct dm_table *table, | ||
791 | struct dm_ioctl *param, size_t param_size) | ||
792 | { | ||
793 | unsigned int i, num_targets; | ||
794 | struct dm_target_spec *spec; | ||
795 | char *outbuf, *outptr; | ||
796 | status_type_t type; | ||
797 | size_t remaining, len, used = 0; | ||
798 | |||
799 | outptr = outbuf = get_result_buffer(param, param_size, &len); | ||
800 | |||
801 | if (param->flags & DM_STATUS_TABLE_FLAG) | ||
802 | type = STATUSTYPE_TABLE; | ||
803 | else | ||
804 | type = STATUSTYPE_INFO; | ||
805 | |||
806 | /* Get all the target info */ | ||
807 | num_targets = dm_table_get_num_targets(table); | ||
808 | for (i = 0; i < num_targets; i++) { | ||
809 | struct dm_target *ti = dm_table_get_target(table, i); | ||
810 | |||
811 | remaining = len - (outptr - outbuf); | ||
812 | if (remaining <= sizeof(struct dm_target_spec)) { | ||
813 | param->flags |= DM_BUFFER_FULL_FLAG; | ||
814 | break; | ||
815 | } | ||
816 | |||
817 | spec = (struct dm_target_spec *) outptr; | ||
818 | |||
819 | spec->status = 0; | ||
820 | spec->sector_start = ti->begin; | ||
821 | spec->length = ti->len; | ||
822 | strncpy(spec->target_type, ti->type->name, | ||
823 | sizeof(spec->target_type)); | ||
824 | |||
825 | outptr += sizeof(struct dm_target_spec); | ||
826 | remaining = len - (outptr - outbuf); | ||
827 | if (remaining <= 0) { | ||
828 | param->flags |= DM_BUFFER_FULL_FLAG; | ||
829 | break; | ||
830 | } | ||
831 | |||
832 | /* Get the status/table string from the target driver */ | ||
833 | if (ti->type->status) { | ||
834 | if (ti->type->status(ti, type, outptr, remaining)) { | ||
835 | param->flags |= DM_BUFFER_FULL_FLAG; | ||
836 | break; | ||
837 | } | ||
838 | } else | ||
839 | outptr[0] = '\0'; | ||
840 | |||
841 | outptr += strlen(outptr) + 1; | ||
842 | used = param->data_start + (outptr - outbuf); | ||
843 | |||
844 | outptr = align_ptr(outptr); | ||
845 | spec->next = outptr - outbuf; | ||
846 | } | ||
847 | |||
848 | if (used) | ||
849 | param->data_size = used; | ||
850 | |||
851 | param->target_count = num_targets; | ||
852 | } | ||
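
Note that in this output format spec->next is an offset from the start of the data area, not from the current record (the table-load direction, handled further down, uses record-relative offsets). A user-space sketch of walking the packed records, assuming 'dmi' is a dm_ioctl buffer already filled in by DM_TABLE_STATUS or DM_DEV_WAIT:

#include <stdio.h>
#include <stdint.h>
#include <linux/dm-ioctl.h>

/* Walk the dm_target_spec/status pairs packed by retrieve_status(). */
static void walk_status(struct dm_ioctl *dmi)
{
	char *data = (char *) dmi + dmi->data_start;
	uint32_t offset = 0, i;

	for (i = 0; i < dmi->target_count; i++) {
		struct dm_target_spec *spec =
			(struct dm_target_spec *) (data + offset);
		char *text = (char *) (spec + 1);

		printf("%llu %llu %s %s\n",
		       (unsigned long long) spec->sector_start,
		       (unsigned long long) spec->length,
		       spec->target_type, text);
		offset = spec->next;	/* offset from the data area start */
	}
}
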
853 | |||
854 | /* | ||
855 | * Wait for a device to report an event | ||
856 | */ | ||
857 | static int dev_wait(struct dm_ioctl *param, size_t param_size) | ||
858 | { | ||
859 | int r; | ||
860 | struct mapped_device *md; | ||
861 | struct dm_table *table; | ||
862 | |||
863 | md = find_device(param); | ||
864 | if (!md) | ||
865 | return -ENXIO; | ||
866 | |||
867 | /* | ||
868 | * Wait for a notification event | ||
869 | */ | ||
870 | if (dm_wait_event(md, param->event_nr)) { | ||
871 | r = -ERESTARTSYS; | ||
872 | goto out; | ||
873 | } | ||
874 | |||
875 | /* | ||
876 | * The userland program is going to want to know what | ||
877 | * changed to trigger the event, so we may as well tell | ||
878 | * it and save an ioctl. | ||
879 | */ | ||
880 | r = __dev_status(md, param); | ||
881 | if (r) | ||
882 | goto out; | ||
883 | |||
884 | table = dm_get_table(md); | ||
885 | if (table) { | ||
886 | retrieve_status(table, param, param_size); | ||
887 | dm_table_put(table); | ||
888 | } | ||
889 | |||
890 | out: | ||
891 | dm_put(md); | ||
892 | return r; | ||
893 | } | ||
894 | |||
895 | static inline int get_mode(struct dm_ioctl *param) | ||
896 | { | ||
897 | int mode = FMODE_READ | FMODE_WRITE; | ||
898 | |||
899 | if (param->flags & DM_READONLY_FLAG) | ||
900 | mode = FMODE_READ; | ||
901 | |||
902 | return mode; | ||
903 | } | ||
904 | |||
905 | static int next_target(struct dm_target_spec *last, uint32_t next, void *end, | ||
906 | struct dm_target_spec **spec, char **target_params) | ||
907 | { | ||
908 | *spec = (struct dm_target_spec *) ((unsigned char *) last + next); | ||
909 | *target_params = (char *) (*spec + 1); | ||
910 | |||
911 | if (*spec < (last + 1)) | ||
912 | return -EINVAL; | ||
913 | |||
914 | return invalid_str(*target_params, end); | ||
915 | } | ||
916 | |||
917 | static int populate_table(struct dm_table *table, | ||
918 | struct dm_ioctl *param, size_t param_size) | ||
919 | { | ||
920 | int r; | ||
921 | unsigned int i = 0; | ||
922 | struct dm_target_spec *spec = (struct dm_target_spec *) param; | ||
923 | uint32_t next = param->data_start; | ||
924 | void *end = (void *) param + param_size; | ||
925 | char *target_params; | ||
926 | |||
927 | if (!param->target_count) { | ||
928 | DMWARN("populate_table: no targets specified"); | ||
929 | return -EINVAL; | ||
930 | } | ||
931 | |||
932 | for (i = 0; i < param->target_count; i++) { | ||
933 | |||
934 | r = next_target(spec, next, end, &spec, &target_params); | ||
935 | if (r) { | ||
936 | DMWARN("unable to find target"); | ||
937 | return r; | ||
938 | } | ||
939 | |||
940 | r = dm_table_add_target(table, spec->target_type, | ||
941 | (sector_t) spec->sector_start, | ||
942 | (sector_t) spec->length, | ||
943 | target_params); | ||
944 | if (r) { | ||
945 | DMWARN("error adding target to table"); | ||
946 | return r; | ||
947 | } | ||
948 | |||
949 | next = spec->next; | ||
950 | } | ||
951 | |||
952 | return dm_table_complete(table); | ||
953 | } | ||
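
Seen from the other side, DM_TABLE_LOAD expects the payload area (starting at param->data_start) to contain param->target_count dm_target_spec records, each immediately followed by its NUL-terminated parameter string, with spec->next giving the byte offset from the start of the current record to the next one. A sketch of serialising a single hypothetical "linear" target; the device path, length and alignment choices are made up for illustration.

#include <string.h>
#include <stdint.h>
#include <linux/dm-ioctl.h>

/* Serialise one target into the payload area.  Returns the 8-byte
 * aligned size stored in spec->next, or 0 if it does not fit. */
static uint32_t pack_one_target(char *data, size_t avail)
{
	struct dm_target_spec *spec = (struct dm_target_spec *) data;
	const char *params = "/dev/sdb1 0";	/* <dev_path> <offset> */
	size_t len = sizeof(*spec) + strlen(params) + 1;

	if (len > avail)
		return 0;

	memset(spec, 0, sizeof(*spec));
	spec->sector_start = 0;
	spec->length = 204800;			/* 100 MiB in 512-byte sectors */
	strncpy(spec->target_type, "linear", sizeof(spec->target_type));
	strcpy((char *) (spec + 1), params);

	/* Offset to the next record; the kernel only follows it while
	 * more targets remain, so the last record's value is unused. */
	spec->next = (uint32_t) ((len + 7) & ~(size_t) 7);
	return spec->next;
}
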
954 | |||
955 | static int table_load(struct dm_ioctl *param, size_t param_size) | ||
956 | { | ||
957 | int r; | ||
958 | struct hash_cell *hc; | ||
959 | struct dm_table *t; | ||
960 | |||
961 | r = dm_table_create(&t, get_mode(param), param->target_count); | ||
962 | if (r) | ||
963 | return r; | ||
964 | |||
965 | r = populate_table(t, param, param_size); | ||
966 | if (r) { | ||
967 | dm_table_put(t); | ||
968 | return r; | ||
969 | } | ||
970 | |||
971 | down_write(&_hash_lock); | ||
972 | hc = __find_device_hash_cell(param); | ||
973 | if (!hc) { | ||
974 | DMWARN("device doesn't appear to be in the dev hash table."); | ||
975 | up_write(&_hash_lock); | ||
976 | return -ENXIO; | ||
977 | } | ||
978 | |||
979 | if (hc->new_map) | ||
980 | dm_table_put(hc->new_map); | ||
981 | hc->new_map = t; | ||
982 | param->flags |= DM_INACTIVE_PRESENT_FLAG; | ||
983 | |||
984 | r = __dev_status(hc->md, param); | ||
985 | up_write(&_hash_lock); | ||
986 | return r; | ||
987 | } | ||
988 | |||
989 | static int table_clear(struct dm_ioctl *param, size_t param_size) | ||
990 | { | ||
991 | int r; | ||
992 | struct hash_cell *hc; | ||
993 | |||
994 | down_write(&_hash_lock); | ||
995 | |||
996 | hc = __find_device_hash_cell(param); | ||
997 | if (!hc) { | ||
998 | DMWARN("device doesn't appear to be in the dev hash table."); | ||
999 | up_write(&_hash_lock); | ||
1000 | return -ENXIO; | ||
1001 | } | ||
1002 | |||
1003 | if (hc->new_map) { | ||
1004 | dm_table_put(hc->new_map); | ||
1005 | hc->new_map = NULL; | ||
1006 | } | ||
1007 | |||
1008 | param->flags &= ~DM_INACTIVE_PRESENT_FLAG; | ||
1009 | |||
1010 | r = __dev_status(hc->md, param); | ||
1011 | up_write(&_hash_lock); | ||
1012 | return r; | ||
1013 | } | ||
1014 | |||
1015 | /* | ||
1016 | * Retrieves a list of devices used by a particular dm device. | ||
1017 | */ | ||
1018 | static void retrieve_deps(struct dm_table *table, | ||
1019 | struct dm_ioctl *param, size_t param_size) | ||
1020 | { | ||
1021 | unsigned int count = 0; | ||
1022 | struct list_head *tmp; | ||
1023 | size_t len, needed; | ||
1024 | struct dm_dev *dd; | ||
1025 | struct dm_target_deps *deps; | ||
1026 | |||
1027 | deps = get_result_buffer(param, param_size, &len); | ||
1028 | |||
1029 | /* | ||
1030 | * Count the devices. | ||
1031 | */ | ||
1032 | list_for_each (tmp, dm_table_get_devices(table)) | ||
1033 | count++; | ||
1034 | |||
1035 | /* | ||
1036 | * Check we have enough space. | ||
1037 | */ | ||
1038 | needed = sizeof(*deps) + (sizeof(*deps->dev) * count); | ||
1039 | if (len < needed) { | ||
1040 | param->flags |= DM_BUFFER_FULL_FLAG; | ||
1041 | return; | ||
1042 | } | ||
1043 | |||
1044 | /* | ||
1045 | * Fill in the devices. | ||
1046 | */ | ||
1047 | deps->count = count; | ||
1048 | count = 0; | ||
1049 | list_for_each_entry (dd, dm_table_get_devices(table), list) | ||
1050 | deps->dev[count++] = huge_encode_dev(dd->bdev->bd_dev); | ||
1051 | |||
1052 | param->data_size = param->data_start + needed; | ||
1053 | } | ||
1054 | |||
1055 | static int table_deps(struct dm_ioctl *param, size_t param_size) | ||
1056 | { | ||
1057 | int r = 0; | ||
1058 | struct mapped_device *md; | ||
1059 | struct dm_table *table; | ||
1060 | |||
1061 | md = find_device(param); | ||
1062 | if (!md) | ||
1063 | return -ENXIO; | ||
1064 | |||
1065 | r = __dev_status(md, param); | ||
1066 | if (r) | ||
1067 | goto out; | ||
1068 | |||
1069 | table = dm_get_table(md); | ||
1070 | if (table) { | ||
1071 | retrieve_deps(table, param, param_size); | ||
1072 | dm_table_put(table); | ||
1073 | } | ||
1074 | |||
1075 | out: | ||
1076 | dm_put(md); | ||
1077 | return r; | ||
1078 | } | ||
1079 | |||
1080 | /* | ||
1081 | * Return the status of a device as a text string for each | ||
1082 | * target. | ||
1083 | */ | ||
1084 | static int table_status(struct dm_ioctl *param, size_t param_size) | ||
1085 | { | ||
1086 | int r; | ||
1087 | struct mapped_device *md; | ||
1088 | struct dm_table *table; | ||
1089 | |||
1090 | md = find_device(param); | ||
1091 | if (!md) | ||
1092 | return -ENXIO; | ||
1093 | |||
1094 | r = __dev_status(md, param); | ||
1095 | if (r) | ||
1096 | goto out; | ||
1097 | |||
1098 | table = dm_get_table(md); | ||
1099 | if (table) { | ||
1100 | retrieve_status(table, param, param_size); | ||
1101 | dm_table_put(table); | ||
1102 | } | ||
1103 | |||
1104 | out: | ||
1105 | dm_put(md); | ||
1106 | return r; | ||
1107 | } | ||
1108 | |||
1109 | /* | ||
1110 | * Pass a message to the target that's at the supplied device offset. | ||
1111 | */ | ||
1112 | static int target_message(struct dm_ioctl *param, size_t param_size) | ||
1113 | { | ||
1114 | int r, argc; | ||
1115 | char **argv; | ||
1116 | struct mapped_device *md; | ||
1117 | struct dm_table *table; | ||
1118 | struct dm_target *ti; | ||
1119 | struct dm_target_msg *tmsg = (void *) param + param->data_start; | ||
1120 | |||
1121 | md = find_device(param); | ||
1122 | if (!md) | ||
1123 | return -ENXIO; | ||
1124 | |||
1125 | r = __dev_status(md, param); | ||
1126 | if (r) | ||
1127 | goto out; | ||
1128 | |||
1129 | if (tmsg < (struct dm_target_msg *) (param + 1) || | ||
1130 | invalid_str(tmsg->message, (void *) param + param_size)) { | ||
1131 | DMWARN("Invalid target message parameters."); | ||
1132 | r = -EINVAL; | ||
1133 | goto out; | ||
1134 | } | ||
1135 | |||
1136 | r = dm_split_args(&argc, &argv, tmsg->message); | ||
1137 | if (r) { | ||
1138 | DMWARN("Failed to split target message parameters"); | ||
1139 | goto out; | ||
1140 | } | ||
1141 | |||
1142 | table = dm_get_table(md); | ||
1143 | if (!table) | ||
1144 | goto out_argv; | ||
1145 | |||
1146 | if (tmsg->sector >= dm_table_get_size(table)) { | ||
1147 | DMWARN("Target message sector outside device."); | ||
1148 | r = -EINVAL; | ||
1149 | goto out_table; | ||
1150 | } | ||
1151 | |||
1152 | ti = dm_table_find_target(table, tmsg->sector); | ||
1153 | if (ti->type->message) | ||
1154 | r = ti->type->message(ti, argc, argv); | ||
1155 | else { | ||
1156 | DMWARN("Target type does not support messages"); | ||
1157 | r = -EINVAL; | ||
1158 | } | ||
1159 | |||
1160 | out_table: | ||
1161 | dm_table_put(table); | ||
1162 | out_argv: | ||
1163 | kfree(argv); | ||
1164 | out: | ||
1165 | param->data_size = 0; | ||
1166 | dm_put(md); | ||
1167 | return r; | ||
1168 | } | ||
1169 | |||
1170 | /*----------------------------------------------------------------- | ||
1171 | * Implementation of open/close/ioctl on the special char | ||
1172 | * device. | ||
1173 | *---------------------------------------------------------------*/ | ||
1174 | static ioctl_fn lookup_ioctl(unsigned int cmd) | ||
1175 | { | ||
1176 | static struct { | ||
1177 | int cmd; | ||
1178 | ioctl_fn fn; | ||
1179 | } _ioctls[] = { | ||
1180 | {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */ | ||
1181 | {DM_REMOVE_ALL_CMD, remove_all}, | ||
1182 | {DM_LIST_DEVICES_CMD, list_devices}, | ||
1183 | |||
1184 | {DM_DEV_CREATE_CMD, dev_create}, | ||
1185 | {DM_DEV_REMOVE_CMD, dev_remove}, | ||
1186 | {DM_DEV_RENAME_CMD, dev_rename}, | ||
1187 | {DM_DEV_SUSPEND_CMD, dev_suspend}, | ||
1188 | {DM_DEV_STATUS_CMD, dev_status}, | ||
1189 | {DM_DEV_WAIT_CMD, dev_wait}, | ||
1190 | |||
1191 | {DM_TABLE_LOAD_CMD, table_load}, | ||
1192 | {DM_TABLE_CLEAR_CMD, table_clear}, | ||
1193 | {DM_TABLE_DEPS_CMD, table_deps}, | ||
1194 | {DM_TABLE_STATUS_CMD, table_status}, | ||
1195 | |||
1196 | {DM_LIST_VERSIONS_CMD, list_versions}, | ||
1197 | |||
1198 | {DM_TARGET_MSG_CMD, target_message} | ||
1199 | }; | ||
1200 | |||
1201 | return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; | ||
1202 | } | ||
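
The table above is indexed by the ioctl's command number, which ctl_ioctl() extracts with _IOC_NR(). A tiny user-space illustration of that indexing, printing the slot used by a few of the commands and assuming the usual <linux/dm-ioctl.h> definitions:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

int main(void)
{
	/* The *_CMD value embedded in each ioctl number selects the slot. */
	printf("DM_DEV_CREATE -> slot %u\n", (unsigned) _IOC_NR(DM_DEV_CREATE));
	printf("DM_TABLE_LOAD -> slot %u\n", (unsigned) _IOC_NR(DM_TABLE_LOAD));
	printf("DM_TARGET_MSG -> slot %u\n", (unsigned) _IOC_NR(DM_TARGET_MSG));
	return 0;
}
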
1203 | |||
1204 | /* | ||
1205 | * As well as checking the version compatibility this always | ||
1206 | * copies the kernel interface version out. | ||
1207 | */ | ||
1208 | static int check_version(unsigned int cmd, struct dm_ioctl __user *user) | ||
1209 | { | ||
1210 | uint32_t version[3]; | ||
1211 | int r = 0; | ||
1212 | |||
1213 | if (copy_from_user(version, user->version, sizeof(version))) | ||
1214 | return -EFAULT; | ||
1215 | |||
1216 | if ((DM_VERSION_MAJOR != version[0]) || | ||
1217 | (DM_VERSION_MINOR < version[1])) { | ||
1218 | DMWARN("ioctl interface mismatch: " | ||
1219 | "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)", | ||
1220 | DM_VERSION_MAJOR, DM_VERSION_MINOR, | ||
1221 | DM_VERSION_PATCHLEVEL, | ||
1222 | version[0], version[1], version[2], cmd); | ||
1223 | r = -EINVAL; | ||
1224 | } | ||
1225 | |||
1226 | /* | ||
1227 | * Fill in the kernel version. | ||
1228 | */ | ||
1229 | version[0] = DM_VERSION_MAJOR; | ||
1230 | version[1] = DM_VERSION_MINOR; | ||
1231 | version[2] = DM_VERSION_PATCHLEVEL; | ||
1232 | if (copy_to_user(user->version, version, sizeof(version))) | ||
1233 | return -EFAULT; | ||
1234 | |||
1235 | return r; | ||
1236 | } | ||
1237 | |||
1238 | static void free_params(struct dm_ioctl *param) | ||
1239 | { | ||
1240 | vfree(param); | ||
1241 | } | ||
1242 | |||
1243 | static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) | ||
1244 | { | ||
1245 | struct dm_ioctl tmp, *dmi; | ||
1246 | |||
1247 | if (copy_from_user(&tmp, user, sizeof(tmp))) | ||
1248 | return -EFAULT; | ||
1249 | |||
1250 | if (tmp.data_size < sizeof(tmp)) | ||
1251 | return -EINVAL; | ||
1252 | |||
1253 | dmi = (struct dm_ioctl *) vmalloc(tmp.data_size); | ||
1254 | if (!dmi) | ||
1255 | return -ENOMEM; | ||
1256 | |||
1257 | if (copy_from_user(dmi, user, tmp.data_size)) { | ||
1258 | vfree(dmi); | ||
1259 | return -EFAULT; | ||
1260 | } | ||
1261 | |||
1262 | *param = dmi; | ||
1263 | return 0; | ||
1264 | } | ||
1265 | |||
1266 | static int validate_params(uint cmd, struct dm_ioctl *param) | ||
1267 | { | ||
1268 | /* Always clear this flag */ | ||
1269 | param->flags &= ~DM_BUFFER_FULL_FLAG; | ||
1270 | |||
1271 | /* Ignores parameters */ | ||
1272 | if (cmd == DM_REMOVE_ALL_CMD || | ||
1273 | cmd == DM_LIST_DEVICES_CMD || | ||
1274 | cmd == DM_LIST_VERSIONS_CMD) | ||
1275 | return 0; | ||
1276 | |||
1277 | if ((cmd == DM_DEV_CREATE_CMD)) { | ||
1278 | if (!*param->name) { | ||
1279 | DMWARN("name not supplied when creating device"); | ||
1280 | return -EINVAL; | ||
1281 | } | ||
1282 | } else if ((*param->uuid && *param->name)) { | ||
1283 | DMWARN("only supply one of name or uuid, cmd(%u)", cmd); | ||
1284 | return -EINVAL; | ||
1285 | } | ||
1286 | |||
1287 | /* Ensure strings are terminated */ | ||
1288 | param->name[DM_NAME_LEN - 1] = '\0'; | ||
1289 | param->uuid[DM_UUID_LEN - 1] = '\0'; | ||
1290 | |||
1291 | return 0; | ||
1292 | } | ||
1293 | |||
1294 | static int ctl_ioctl(struct inode *inode, struct file *file, | ||
1295 | uint command, ulong u) | ||
1296 | { | ||
1297 | int r = 0; | ||
1298 | unsigned int cmd; | ||
1299 | struct dm_ioctl *param; | ||
1300 | struct dm_ioctl __user *user = (struct dm_ioctl __user *) u; | ||
1301 | ioctl_fn fn = NULL; | ||
1302 | size_t param_size; | ||
1303 | |||
1304 | /* only root can play with this */ | ||
1305 | if (!capable(CAP_SYS_ADMIN)) | ||
1306 | return -EACCES; | ||
1307 | |||
1308 | if (_IOC_TYPE(command) != DM_IOCTL) | ||
1309 | return -ENOTTY; | ||
1310 | |||
1311 | cmd = _IOC_NR(command); | ||
1312 | |||
1313 | /* | ||
1314 | * Check the interface version passed in. This also | ||
1315 | * writes out the kernel's interface version. | ||
1316 | */ | ||
1317 | r = check_version(cmd, user); | ||
1318 | if (r) | ||
1319 | return r; | ||
1320 | |||
1321 | /* | ||
1322 | * Nothing more to do for the version command. | ||
1323 | */ | ||
1324 | if (cmd == DM_VERSION_CMD) | ||
1325 | return 0; | ||
1326 | |||
1327 | fn = lookup_ioctl(cmd); | ||
1328 | if (!fn) { | ||
1329 | DMWARN("dm_ctl_ioctl: unknown command 0x%x", command); | ||
1330 | return -ENOTTY; | ||
1331 | } | ||
1332 | |||
1333 | /* | ||
1334 | * Trying to avoid low memory issues when a device is | ||
1335 | * suspended. | ||
1336 | */ | ||
1337 | current->flags |= PF_MEMALLOC; | ||
1338 | |||
1339 | /* | ||
1340 | * Copy the parameters into kernel space. | ||
1341 | */ | ||
1342 | r = copy_params(user, ¶m); | ||
1343 | if (r) { | ||
1344 | current->flags &= ~PF_MEMALLOC; | ||
1345 | return r; | ||
1346 | } | ||
1347 | |||
1348 | /* | ||
1349 | * FIXME: eventually we will remove the PF_MEMALLOC flag | ||
1350 | * here. However the tools still do nasty things like | ||
1351 | * 'load' while a device is suspended. | ||
1352 | */ | ||
1353 | |||
1354 | r = validate_params(cmd, param); | ||
1355 | if (r) | ||
1356 | goto out; | ||
1357 | |||
1358 | param_size = param->data_size; | ||
1359 | param->data_size = sizeof(*param); | ||
1360 | r = fn(param, param_size); | ||
1361 | |||
1362 | /* | ||
1363 | * Copy the results back to userland. | ||
1364 | */ | ||
1365 | if (!r && copy_to_user(user, param, param->data_size)) | ||
1366 | r = -EFAULT; | ||
1367 | |||
1368 | out: | ||
1369 | free_params(param); | ||
1370 | current->flags &= ~PF_MEMALLOC; | ||
1371 | return r; | ||
1372 | } | ||
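
Putting the pieces together, a minimal user-space caller only has to open the control node and fill in the version triplet before issuing a command. The sketch below issues DM_VERSION, which exercises check_version() and nothing else; it assumes the control node is visible as /dev/mapper/control and keeps error handling to a minimum.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

int main(void)
{
	struct dm_ioctl dmi;
	int fd = open("/dev/mapper/control", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&dmi, 0, sizeof(dmi));
	dmi.version[0] = DM_VERSION_MAJOR;	/* must match the kernel's major */
	dmi.version[1] = DM_VERSION_MINOR;	/* must not exceed the kernel's minor */
	dmi.version[2] = DM_VERSION_PATCHLEVEL;
	dmi.data_size = sizeof(dmi);		/* no extra payload for this command */

	if (ioctl(fd, DM_VERSION, &dmi) < 0)
		perror("DM_VERSION");
	else
		printf("kernel dm interface %u.%u.%u\n",
		       dmi.version[0], dmi.version[1], dmi.version[2]);

	close(fd);
	return 0;
}
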
1373 | |||
1374 | static struct file_operations _ctl_fops = { | ||
1375 | .ioctl = ctl_ioctl, | ||
1376 | .owner = THIS_MODULE, | ||
1377 | }; | ||
1378 | |||
1379 | static struct miscdevice _dm_misc = { | ||
1380 | .minor = MISC_DYNAMIC_MINOR, | ||
1381 | .name = DM_NAME, | ||
1382 | .devfs_name = "mapper/control", | ||
1383 | .fops = &_ctl_fops | ||
1384 | }; | ||
1385 | |||
1386 | /* | ||
1387 | * Create misc character device and link to DM_DIR/control. | ||
1388 | */ | ||
1389 | int __init dm_interface_init(void) | ||
1390 | { | ||
1391 | int r; | ||
1392 | |||
1393 | r = dm_hash_init(); | ||
1394 | if (r) | ||
1395 | return r; | ||
1396 | |||
1397 | r = misc_register(&_dm_misc); | ||
1398 | if (r) { | ||
1399 | DMERR("misc_register failed for control device"); | ||
1400 | dm_hash_exit(); | ||
1401 | return r; | ||
1402 | } | ||
1403 | |||
1404 | DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR, | ||
1405 | DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA, | ||
1406 | DM_DRIVER_EMAIL); | ||
1407 | return 0; | ||
1408 | } | ||
1409 | |||
1410 | void dm_interface_exit(void) | ||
1411 | { | ||
1412 | if (misc_deregister(&_dm_misc) < 0) | ||
1413 | DMERR("misc_deregister failed for control device"); | ||
1414 | |||
1415 | dm_hash_exit(); | ||
1416 | } | ||
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c new file mode 100644 index 000000000000..6a2cd5dc8a63 --- /dev/null +++ b/drivers/md/dm-linear.c | |||
@@ -0,0 +1,123 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001-2003 Sistina Software (UK) Limited. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm.h" | ||
8 | |||
9 | #include <linux/module.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/blkdev.h> | ||
12 | #include <linux/bio.h> | ||
13 | #include <linux/slab.h> | ||
14 | |||
15 | /* | ||
16 | * Linear: maps a linear range of a device. | ||
17 | */ | ||
18 | struct linear_c { | ||
19 | struct dm_dev *dev; | ||
20 | sector_t start; | ||
21 | }; | ||
22 | |||
23 | /* | ||
24 | * Construct a linear mapping: <dev_path> <offset> | ||
25 | */ | ||
26 | static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
27 | { | ||
28 | struct linear_c *lc; | ||
29 | |||
30 | if (argc != 2) { | ||
31 | ti->error = "dm-linear: Invalid argument count"; | ||
32 | return -EINVAL; | ||
33 | } | ||
34 | |||
35 | lc = kmalloc(sizeof(*lc), GFP_KERNEL); | ||
36 | if (lc == NULL) { | ||
37 | ti->error = "dm-linear: Cannot allocate linear context"; | ||
38 | return -ENOMEM; | ||
39 | } | ||
40 | |||
41 | if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) { | ||
42 | ti->error = "dm-linear: Invalid device sector"; | ||
43 | goto bad; | ||
44 | } | ||
45 | |||
46 | if (dm_get_device(ti, argv[0], lc->start, ti->len, | ||
47 | dm_table_get_mode(ti->table), &lc->dev)) { | ||
48 | ti->error = "dm-linear: Device lookup failed"; | ||
49 | goto bad; | ||
50 | } | ||
51 | |||
52 | ti->private = lc; | ||
53 | return 0; | ||
54 | |||
55 | bad: | ||
56 | kfree(lc); | ||
57 | return -EINVAL; | ||
58 | } | ||
59 | |||
60 | static void linear_dtr(struct dm_target *ti) | ||
61 | { | ||
62 | struct linear_c *lc = (struct linear_c *) ti->private; | ||
63 | |||
64 | dm_put_device(ti, lc->dev); | ||
65 | kfree(lc); | ||
66 | } | ||
67 | |||
68 | static int linear_map(struct dm_target *ti, struct bio *bio, | ||
69 | union map_info *map_context) | ||
70 | { | ||
71 | struct linear_c *lc = (struct linear_c *) ti->private; | ||
72 | |||
73 | bio->bi_bdev = lc->dev->bdev; | ||
74 | bio->bi_sector = lc->start + (bio->bi_sector - ti->begin); | ||
75 | |||
76 | return 1; | ||
77 | } | ||
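
The remapping is a single addition: the bio's offset within the target is added to the start sector on the backing device. A worked example with made-up numbers (a target beginning at sector 0 of the dm device, backed by a region of the underlying device starting at sector 2048):

#include <stdio.h>

int main(void)
{
	unsigned long long ti_begin = 0;	/* ti->begin: target start on the dm device */
	unsigned long long lc_start = 2048;	/* lc->start: offset into the backing device */
	unsigned long long bio_sector = 512;	/* incoming bio->bi_sector */

	unsigned long long mapped = lc_start + (bio_sector - ti_begin);

	printf("dm sector %llu -> backing-device sector %llu\n",
	       bio_sector, mapped);
	return 0;
}

In dmsetup terms the table line for such a mapping would typically look like "0 <len> linear /dev/sdb1 2048", the last two fields being argv[0] and argv[1] in the constructor above.
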
78 | |||
79 | static int linear_status(struct dm_target *ti, status_type_t type, | ||
80 | char *result, unsigned int maxlen) | ||
81 | { | ||
82 | struct linear_c *lc = (struct linear_c *) ti->private; | ||
83 | |||
84 | switch (type) { | ||
85 | case STATUSTYPE_INFO: | ||
86 | result[0] = '\0'; | ||
87 | break; | ||
88 | |||
89 | case STATUSTYPE_TABLE: | ||
90 | snprintf(result, maxlen, "%s " SECTOR_FORMAT, lc->dev->name, | ||
91 | lc->start); | ||
92 | break; | ||
93 | } | ||
94 | return 0; | ||
95 | } | ||
96 | |||
97 | static struct target_type linear_target = { | ||
98 | .name = "linear", | ||
99 | .version= {1, 0, 1}, | ||
100 | .module = THIS_MODULE, | ||
101 | .ctr = linear_ctr, | ||
102 | .dtr = linear_dtr, | ||
103 | .map = linear_map, | ||
104 | .status = linear_status, | ||
105 | }; | ||
106 | |||
107 | int __init dm_linear_init(void) | ||
108 | { | ||
109 | int r = dm_register_target(&linear_target); | ||
110 | |||
111 | if (r < 0) | ||
112 | DMERR("linear: register failed %d", r); | ||
113 | |||
114 | return r; | ||
115 | } | ||
116 | |||
117 | void dm_linear_exit(void) | ||
118 | { | ||
119 | int r = dm_unregister_target(&linear_target); | ||
120 | |||
121 | if (r < 0) | ||
122 | DMERR("linear: unregister failed %d", r); | ||
123 | } | ||
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c new file mode 100644 index 000000000000..e110655eabdb --- /dev/null +++ b/drivers/md/dm-log.c | |||
@@ -0,0 +1,711 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software | ||
3 | * | ||
4 | * This file is released under the LGPL. | ||
5 | */ | ||
6 | |||
7 | #include <linux/init.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/vmalloc.h> | ||
11 | |||
12 | #include "dm-log.h" | ||
13 | #include "dm-io.h" | ||
14 | |||
15 | static LIST_HEAD(_log_types); | ||
16 | static DEFINE_SPINLOCK(_lock); | ||
17 | |||
18 | int dm_register_dirty_log_type(struct dirty_log_type *type) | ||
19 | { | ||
20 | spin_lock(&_lock); | ||
21 | type->use_count = 0; | ||
22 | list_add(&type->list, &_log_types); | ||
23 | spin_unlock(&_lock); | ||
24 | |||
25 | return 0; | ||
26 | } | ||
27 | |||
28 | int dm_unregister_dirty_log_type(struct dirty_log_type *type) | ||
29 | { | ||
30 | spin_lock(&_lock); | ||
31 | |||
32 | if (type->use_count) | ||
33 | DMWARN("Attempt to unregister a log type that is still in use"); | ||
34 | else | ||
35 | list_del(&type->list); | ||
36 | |||
37 | spin_unlock(&_lock); | ||
38 | |||
39 | return 0; | ||
40 | } | ||
41 | |||
42 | static struct dirty_log_type *get_type(const char *type_name) | ||
43 | { | ||
44 | struct dirty_log_type *type; | ||
45 | |||
46 | spin_lock(&_lock); | ||
47 | list_for_each_entry (type, &_log_types, list) | ||
48 | if (!strcmp(type_name, type->name)) { | ||
49 | if (!type->use_count && !try_module_get(type->module)){ | ||
50 | spin_unlock(&_lock); | ||
51 | return NULL; | ||
52 | } | ||
53 | type->use_count++; | ||
54 | spin_unlock(&_lock); | ||
55 | return type; | ||
56 | } | ||
57 | |||
58 | spin_unlock(&_lock); | ||
59 | return NULL; | ||
60 | } | ||
61 | |||
62 | static void put_type(struct dirty_log_type *type) | ||
63 | { | ||
64 | spin_lock(&_lock); | ||
65 | if (!--type->use_count) | ||
66 | module_put(type->module); | ||
67 | spin_unlock(&_lock); | ||
68 | } | ||
69 | |||
70 | struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti, | ||
71 | unsigned int argc, char **argv) | ||
72 | { | ||
73 | struct dirty_log_type *type; | ||
74 | struct dirty_log *log; | ||
75 | |||
76 | log = kmalloc(sizeof(*log), GFP_KERNEL); | ||
77 | if (!log) | ||
78 | return NULL; | ||
79 | |||
80 | type = get_type(type_name); | ||
81 | if (!type) { | ||
82 | kfree(log); | ||
83 | return NULL; | ||
84 | } | ||
85 | |||
86 | log->type = type; | ||
87 | if (type->ctr(log, ti, argc, argv)) { | ||
88 | kfree(log); | ||
89 | put_type(type); | ||
90 | return NULL; | ||
91 | } | ||
92 | |||
93 | return log; | ||
94 | } | ||
95 | |||
96 | void dm_destroy_dirty_log(struct dirty_log *log) | ||
97 | { | ||
98 | log->type->dtr(log); | ||
99 | put_type(log->type); | ||
100 | kfree(log); | ||
101 | } | ||
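
For context, a target that wants a dirty log goes through this create/destroy pair rather than calling a log type's constructor directly. A hypothetical kernel-side sketch, assuming the declarations from dm-log.h; the function name, the choice of the "core" type and the argv values are invented for illustration.

/* Attach a core dirty log sized by made-up table arguments. */
static int example_attach_log(struct dm_target *ti)
{
	char *log_argv[] = { "1024", "sync" };	/* region_size, then [no]sync */
	struct dirty_log *log;

	log = dm_create_dirty_log("core", ti, 2, log_argv);
	if (!log)
		return -EINVAL;

	/* ... hand 'log' over to the replication machinery ... */

	dm_destroy_dirty_log(log);
	return 0;
}
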
102 | |||
103 | /*----------------------------------------------------------------- | ||
104 | * Persistent and core logs share a lot of their implementation. | ||
105 | * FIXME: need a reload method to be called from a resume | ||
106 | *---------------------------------------------------------------*/ | ||
107 | /* | ||
108 | * Magic for persistent mirrors: "MiRr" | ||
109 | */ | ||
110 | #define MIRROR_MAGIC 0x4D695272 | ||
111 | |||
112 | /* | ||
113 | * The on-disk version of the metadata. | ||
114 | */ | ||
115 | #define MIRROR_DISK_VERSION 1 | ||
116 | #define LOG_OFFSET 2 | ||
117 | |||
118 | struct log_header { | ||
119 | uint32_t magic; | ||
120 | |||
121 | /* | ||
122 | * Simple, incrementing version. no backward | ||
123 | * compatibility. | ||
124 | */ | ||
125 | uint32_t version; | ||
126 | sector_t nr_regions; | ||
127 | }; | ||
128 | |||
129 | struct log_c { | ||
130 | struct dm_target *ti; | ||
131 | int touched; | ||
132 | uint32_t region_size; | ||
133 | unsigned int region_count; | ||
134 | region_t sync_count; | ||
135 | |||
136 | unsigned bitset_uint32_count; | ||
137 | uint32_t *clean_bits; | ||
138 | uint32_t *sync_bits; | ||
139 | uint32_t *recovering_bits; /* FIXME: this seems excessive */ | ||
140 | |||
141 | int sync_search; | ||
142 | |||
143 | /* Resync flag */ | ||
144 | enum sync { | ||
145 | DEFAULTSYNC, /* Synchronize if necessary */ | ||
146 | NOSYNC, /* Devices known to be already in sync */ | ||
147 | FORCESYNC, /* Force a sync to happen */ | ||
148 | } sync; | ||
149 | |||
150 | /* | ||
151 | * Disk log fields | ||
152 | */ | ||
153 | struct dm_dev *log_dev; | ||
154 | struct log_header header; | ||
155 | |||
156 | struct io_region header_location; | ||
157 | struct log_header *disk_header; | ||
158 | |||
159 | struct io_region bits_location; | ||
160 | uint32_t *disk_bits; | ||
161 | }; | ||
162 | |||
163 | /* | ||
164 | * The touched member needs to be updated every time we access | ||
165 | * one of the bitsets. | ||
166 | */ | ||
167 | static inline int log_test_bit(uint32_t *bs, unsigned bit) | ||
168 | { | ||
169 | return test_bit(bit, (unsigned long *) bs) ? 1 : 0; | ||
170 | } | ||
171 | |||
172 | static inline void log_set_bit(struct log_c *l, | ||
173 | uint32_t *bs, unsigned bit) | ||
174 | { | ||
175 | set_bit(bit, (unsigned long *) bs); | ||
176 | l->touched = 1; | ||
177 | } | ||
178 | |||
179 | static inline void log_clear_bit(struct log_c *l, | ||
180 | uint32_t *bs, unsigned bit) | ||
181 | { | ||
182 | clear_bit(bit, (unsigned long *) bs); | ||
183 | l->touched = 1; | ||
184 | } | ||
185 | |||
186 | /*---------------------------------------------------------------- | ||
187 | * Header IO | ||
188 | *--------------------------------------------------------------*/ | ||
189 | static void header_to_disk(struct log_header *core, struct log_header *disk) | ||
190 | { | ||
191 | disk->magic = cpu_to_le32(core->magic); | ||
192 | disk->version = cpu_to_le32(core->version); | ||
193 | disk->nr_regions = cpu_to_le64(core->nr_regions); | ||
194 | } | ||
195 | |||
196 | static void header_from_disk(struct log_header *core, struct log_header *disk) | ||
197 | { | ||
198 | core->magic = le32_to_cpu(disk->magic); | ||
199 | core->version = le32_to_cpu(disk->version); | ||
200 | core->nr_regions = le64_to_cpu(disk->nr_regions); | ||
201 | } | ||
202 | |||
203 | static int read_header(struct log_c *log) | ||
204 | { | ||
205 | int r; | ||
206 | unsigned long ebits; | ||
207 | |||
208 | r = dm_io_sync_vm(1, &log->header_location, READ, | ||
209 | log->disk_header, &ebits); | ||
210 | if (r) | ||
211 | return r; | ||
212 | |||
213 | header_from_disk(&log->header, log->disk_header); | ||
214 | |||
215 | /* New log required? */ | ||
216 | if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) { | ||
217 | log->header.magic = MIRROR_MAGIC; | ||
218 | log->header.version = MIRROR_DISK_VERSION; | ||
219 | log->header.nr_regions = 0; | ||
220 | } | ||
221 | |||
222 | if (log->header.version != MIRROR_DISK_VERSION) { | ||
223 | DMWARN("incompatible disk log version"); | ||
224 | return -EINVAL; | ||
225 | } | ||
226 | |||
227 | return 0; | ||
228 | } | ||
229 | |||
230 | static inline int write_header(struct log_c *log) | ||
231 | { | ||
232 | unsigned long ebits; | ||
233 | |||
234 | header_to_disk(&log->header, log->disk_header); | ||
235 | return dm_io_sync_vm(1, &log->header_location, WRITE, | ||
236 | log->disk_header, &ebits); | ||
237 | } | ||
238 | |||
239 | /*---------------------------------------------------------------- | ||
240 | * Bits IO | ||
241 | *--------------------------------------------------------------*/ | ||
242 | static inline void bits_to_core(uint32_t *core, uint32_t *disk, unsigned count) | ||
243 | { | ||
244 | unsigned i; | ||
245 | |||
246 | for (i = 0; i < count; i++) | ||
247 | core[i] = le32_to_cpu(disk[i]); | ||
248 | } | ||
249 | |||
250 | static inline void bits_to_disk(uint32_t *core, uint32_t *disk, unsigned count) | ||
251 | { | ||
252 | unsigned i; | ||
253 | |||
254 | /* copy across the clean/dirty bitset */ | ||
255 | for (i = 0; i < count; i++) | ||
256 | disk[i] = cpu_to_le32(core[i]); | ||
257 | } | ||
258 | |||
259 | static int read_bits(struct log_c *log) | ||
260 | { | ||
261 | int r; | ||
262 | unsigned long ebits; | ||
263 | |||
264 | r = dm_io_sync_vm(1, &log->bits_location, READ, | ||
265 | log->disk_bits, &ebits); | ||
266 | if (r) | ||
267 | return r; | ||
268 | |||
269 | bits_to_core(log->clean_bits, log->disk_bits, | ||
270 | log->bitset_uint32_count); | ||
271 | return 0; | ||
272 | } | ||
273 | |||
274 | static int write_bits(struct log_c *log) | ||
275 | { | ||
276 | unsigned long ebits; | ||
277 | bits_to_disk(log->clean_bits, log->disk_bits, | ||
278 | log->bitset_uint32_count); | ||
279 | return dm_io_sync_vm(1, &log->bits_location, WRITE, | ||
280 | log->disk_bits, &ebits); | ||
281 | } | ||
282 | |||
283 | /*---------------------------------------------------------------- | ||
284 | * core log constructor/destructor | ||
285 | * | ||
286 | * argv contains region_size followed optionally by [no]sync | ||
287 | *--------------------------------------------------------------*/ | ||
288 | #define BYTE_SHIFT 3 | ||
289 | static int core_ctr(struct dirty_log *log, struct dm_target *ti, | ||
290 | unsigned int argc, char **argv) | ||
291 | { | ||
292 | enum sync sync = DEFAULTSYNC; | ||
293 | |||
294 | struct log_c *lc; | ||
295 | uint32_t region_size; | ||
296 | unsigned int region_count; | ||
297 | size_t bitset_size; | ||
298 | |||
299 | if (argc < 1 || argc > 2) { | ||
300 | DMWARN("wrong number of arguments to mirror log"); | ||
301 | return -EINVAL; | ||
302 | } | ||
303 | |||
304 | if (argc > 1) { | ||
305 | if (!strcmp(argv[1], "sync")) | ||
306 | sync = FORCESYNC; | ||
307 | else if (!strcmp(argv[1], "nosync")) | ||
308 | sync = NOSYNC; | ||
309 | else { | ||
310 | DMWARN("unrecognised sync argument to mirror log: %s", | ||
311 | argv[1]); | ||
312 | return -EINVAL; | ||
313 | } | ||
314 | } | ||
315 | |||
316 | if (sscanf(argv[0], "%u", ®ion_size) != 1) { | ||
317 | DMWARN("invalid region size string"); | ||
318 | return -EINVAL; | ||
319 | } | ||
320 | |||
321 | region_count = dm_sector_div_up(ti->len, region_size); | ||
322 | |||
323 | lc = kmalloc(sizeof(*lc), GFP_KERNEL); | ||
324 | if (!lc) { | ||
325 | DMWARN("couldn't allocate core log"); | ||
326 | return -ENOMEM; | ||
327 | } | ||
328 | |||
329 | lc->ti = ti; | ||
330 | lc->touched = 0; | ||
331 | lc->region_size = region_size; | ||
332 | lc->region_count = region_count; | ||
333 | lc->sync = sync; | ||
334 | |||
335 | /* | ||
336 | * Work out how many words we need to hold the bitset. | ||
337 | */ | ||
338 | bitset_size = dm_round_up(region_count, | ||
339 | sizeof(*lc->clean_bits) << BYTE_SHIFT); | ||
340 | bitset_size >>= BYTE_SHIFT; | ||
341 | |||
342 | lc->bitset_uint32_count = bitset_size / 4; | ||
343 | lc->clean_bits = vmalloc(bitset_size); | ||
344 | if (!lc->clean_bits) { | ||
345 | DMWARN("couldn't allocate clean bitset"); | ||
346 | kfree(lc); | ||
347 | return -ENOMEM; | ||
348 | } | ||
349 | memset(lc->clean_bits, -1, bitset_size); | ||
350 | |||
351 | lc->sync_bits = vmalloc(bitset_size); | ||
352 | if (!lc->sync_bits) { | ||
353 | DMWARN("couldn't allocate sync bitset"); | ||
354 | vfree(lc->clean_bits); | ||
355 | kfree(lc); | ||
356 | return -ENOMEM; | ||
357 | } | ||
358 | memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); | ||
359 | lc->sync_count = (sync == NOSYNC) ? region_count : 0; | ||
360 | |||
361 | lc->recovering_bits = vmalloc(bitset_size); | ||
362 | if (!lc->recovering_bits) { | ||
363 | DMWARN("couldn't allocate sync bitset"); | ||
364 | vfree(lc->sync_bits); | ||
365 | vfree(lc->clean_bits); | ||
366 | kfree(lc); | ||
367 | return -ENOMEM; | ||
368 | } | ||
369 | memset(lc->recovering_bits, 0, bitset_size); | ||
370 | lc->sync_search = 0; | ||
371 | log->context = lc; | ||
372 | return 0; | ||
373 | } | ||
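
The bitset sizing above rounds the region count up to a whole number of 32-bit words. A small user-space sketch of the same arithmetic with invented numbers, a 409600-sector target split into 1024-sector regions:

#include <stdio.h>
#include <stdint.h>

#define BYTE_SHIFT 3

/* Same idea as dm_round_up(): round n up to a multiple of m. */
static unsigned long long round_up(unsigned long long n, unsigned long long m)
{
	return ((n + m - 1) / m) * m;
}

int main(void)
{
	unsigned long long ti_len = 409600, region_size = 1024;
	unsigned long long region_count = (ti_len + region_size - 1) / region_size;
	unsigned long long bitset_size =
		round_up(region_count, sizeof(uint32_t) << BYTE_SHIFT);

	bitset_size >>= BYTE_SHIFT;		/* bits -> bytes */

	printf("%llu regions -> %llu bytes -> %llu uint32 words per bitset\n",
	       region_count, bitset_size, bitset_size / 4);
	return 0;
}
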
374 | |||
375 | static void core_dtr(struct dirty_log *log) | ||
376 | { | ||
377 | struct log_c *lc = (struct log_c *) log->context; | ||
378 | vfree(lc->clean_bits); | ||
379 | vfree(lc->sync_bits); | ||
380 | vfree(lc->recovering_bits); | ||
381 | kfree(lc); | ||
382 | } | ||
383 | |||
384 | /*---------------------------------------------------------------- | ||
385 | * disk log constructor/destructor | ||
386 | * | ||
387 | * argv contains log_device region_size followed optionally by [no]sync | ||
388 | *--------------------------------------------------------------*/ | ||
389 | static int disk_ctr(struct dirty_log *log, struct dm_target *ti, | ||
390 | unsigned int argc, char **argv) | ||
391 | { | ||
392 | int r; | ||
393 | size_t size; | ||
394 | struct log_c *lc; | ||
395 | struct dm_dev *dev; | ||
396 | |||
397 | if (argc < 2 || argc > 3) { | ||
398 | DMWARN("wrong number of arguments to disk mirror log"); | ||
399 | return -EINVAL; | ||
400 | } | ||
401 | |||
402 | r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */, | ||
403 | FMODE_READ | FMODE_WRITE, &dev); | ||
404 | if (r) | ||
405 | return r; | ||
406 | |||
407 | r = core_ctr(log, ti, argc - 1, argv + 1); | ||
408 | if (r) { | ||
409 | dm_put_device(ti, dev); | ||
410 | return r; | ||
411 | } | ||
412 | |||
413 | lc = (struct log_c *) log->context; | ||
414 | lc->log_dev = dev; | ||
415 | |||
416 | /* setup the disk header fields */ | ||
417 | lc->header_location.bdev = lc->log_dev->bdev; | ||
418 | lc->header_location.sector = 0; | ||
419 | lc->header_location.count = 1; | ||
420 | |||
421 | /* | ||
422 | * We can't read less than this amount, even though we'll | ||
423 | * not be using most of this space. | ||
424 | */ | ||
425 | lc->disk_header = vmalloc(1 << SECTOR_SHIFT); | ||
426 | if (!lc->disk_header) | ||
427 | goto bad; | ||
428 | |||
429 | /* setup the disk bitset fields */ | ||
430 | lc->bits_location.bdev = lc->log_dev->bdev; | ||
431 | lc->bits_location.sector = LOG_OFFSET; | ||
432 | |||
433 | size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t), | ||
434 | 1 << SECTOR_SHIFT); | ||
435 | lc->bits_location.count = size >> SECTOR_SHIFT; | ||
436 | lc->disk_bits = vmalloc(size); | ||
437 | if (!lc->disk_bits) { | ||
438 | vfree(lc->disk_header); | ||
439 | goto bad; | ||
440 | } | ||
441 | return 0; | ||
442 | |||
443 | bad: | ||
444 | dm_put_device(ti, lc->log_dev); | ||
445 | core_dtr(log); | ||
446 | return -ENOMEM; | ||
447 | } | ||
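For illustration only (device number and region size are made up), the argv described in the comment above might look like:

    disk log:  253:3 1024 nosync    /* log device, region size in sectors, optional [no]sync */
    core log:  1024                 /* region size only, optional [no]sync */

On the log device itself, the constructor places the header in sector 0 (header_location) and the bitset starting at LOG_OFFSET (bits_location), padded up to whole sectors.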
448 | |||
449 | static void disk_dtr(struct dirty_log *log) | ||
450 | { | ||
451 | struct log_c *lc = (struct log_c *) log->context; | ||
452 | dm_put_device(lc->ti, lc->log_dev); | ||
453 | vfree(lc->disk_header); | ||
454 | vfree(lc->disk_bits); | ||
455 | core_dtr(log); | ||
456 | } | ||
457 | |||
458 | static int count_bits32(uint32_t *addr, unsigned size) | ||
459 | { | ||
460 | int count = 0, i; | ||
461 | |||
462 | for (i = 0; i < size; i++) { | ||
463 | count += hweight32(*(addr+i)); | ||
464 | } | ||
465 | return count; | ||
466 | } | ||
467 | |||
468 | static int disk_resume(struct dirty_log *log) | ||
469 | { | ||
470 | int r; | ||
471 | unsigned i; | ||
472 | struct log_c *lc = (struct log_c *) log->context; | ||
473 | size_t size = lc->bitset_uint32_count * sizeof(uint32_t); | ||
474 | |||
475 | /* read the disk header */ | ||
476 | r = read_header(lc); | ||
477 | if (r) | ||
478 | return r; | ||
479 | |||
480 | /* read the bits */ | ||
481 | r = read_bits(lc); | ||
482 | if (r) | ||
483 | return r; | ||
484 | |||
485 | /* set or clear any new bits */ | ||
486 | if (lc->sync == NOSYNC) | ||
487 | for (i = lc->header.nr_regions; i < lc->region_count; i++) | ||
488 | /* FIXME: amazingly inefficient */ | ||
489 | log_set_bit(lc, lc->clean_bits, i); | ||
490 | else | ||
491 | for (i = lc->header.nr_regions; i < lc->region_count; i++) | ||
492 | /* FIXME: amazingly inefficient */ | ||
493 | log_clear_bit(lc, lc->clean_bits, i); | ||
494 | |||
495 | /* copy clean across to sync */ | ||
496 | memcpy(lc->sync_bits, lc->clean_bits, size); | ||
497 | lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); | ||
498 | |||
499 | /* write the bits */ | ||
500 | r = write_bits(lc); | ||
501 | if (r) | ||
502 | return r; | ||
503 | |||
504 | /* set the correct number of regions in the header */ | ||
505 | lc->header.nr_regions = lc->region_count; | ||
506 | |||
507 | /* write the new header */ | ||
508 | return write_header(lc); | ||
509 | } | ||
510 | |||
511 | static uint32_t core_get_region_size(struct dirty_log *log) | ||
512 | { | ||
513 | struct log_c *lc = (struct log_c *) log->context; | ||
514 | return lc->region_size; | ||
515 | } | ||
516 | |||
517 | static int core_is_clean(struct dirty_log *log, region_t region) | ||
518 | { | ||
519 | struct log_c *lc = (struct log_c *) log->context; | ||
520 | return log_test_bit(lc->clean_bits, region); | ||
521 | } | ||
522 | |||
523 | static int core_in_sync(struct dirty_log *log, region_t region, int block) | ||
524 | { | ||
525 | struct log_c *lc = (struct log_c *) log->context; | ||
526 | return log_test_bit(lc->sync_bits, region); | ||
527 | } | ||
528 | |||
529 | static int core_flush(struct dirty_log *log) | ||
530 | { | ||
531 | /* no op */ | ||
532 | return 0; | ||
533 | } | ||
534 | |||
535 | static int disk_flush(struct dirty_log *log) | ||
536 | { | ||
537 | int r; | ||
538 | struct log_c *lc = (struct log_c *) log->context; | ||
539 | |||
540 | /* only write if the log has changed */ | ||
541 | if (!lc->touched) | ||
542 | return 0; | ||
543 | |||
544 | r = write_bits(lc); | ||
545 | if (!r) | ||
546 | lc->touched = 0; | ||
547 | |||
548 | return r; | ||
549 | } | ||
550 | |||
551 | static void core_mark_region(struct dirty_log *log, region_t region) | ||
552 | { | ||
553 | struct log_c *lc = (struct log_c *) log->context; | ||
554 | log_clear_bit(lc, lc->clean_bits, region); | ||
555 | } | ||
556 | |||
557 | static void core_clear_region(struct dirty_log *log, region_t region) | ||
558 | { | ||
559 | struct log_c *lc = (struct log_c *) log->context; | ||
560 | log_set_bit(lc, lc->clean_bits, region); | ||
561 | } | ||
562 | |||
563 | static int core_get_resync_work(struct dirty_log *log, region_t *region) | ||
564 | { | ||
565 | struct log_c *lc = (struct log_c *) log->context; | ||
566 | |||
567 | if (lc->sync_search >= lc->region_count) | ||
568 | return 0; | ||
569 | |||
570 | do { | ||
571 | *region = find_next_zero_bit((unsigned long *) lc->sync_bits, | ||
572 | lc->region_count, | ||
573 | lc->sync_search); | ||
574 | lc->sync_search = *region + 1; | ||
575 | |||
576 | if (*region == lc->region_count) | ||
577 | return 0; | ||
578 | |||
579 | } while (log_test_bit(lc->recovering_bits, *region)); | ||
580 | |||
581 | log_set_bit(lc, lc->recovering_bits, *region); | ||
582 | return 1; | ||
583 | } | ||
584 | |||
585 | static void core_complete_resync_work(struct dirty_log *log, region_t region, | ||
586 | int success) | ||
587 | { | ||
588 | struct log_c *lc = (struct log_c *) log->context; | ||
589 | |||
590 | log_clear_bit(lc, lc->recovering_bits, region); | ||
591 | if (success) { | ||
592 | log_set_bit(lc, lc->sync_bits, region); | ||
593 | lc->sync_count++; | ||
594 | } | ||
595 | } | ||
596 | |||
597 | static region_t core_get_sync_count(struct dirty_log *log) | ||
598 | { | ||
599 | struct log_c *lc = (struct log_c *) log->context; | ||
600 | |||
601 | return lc->sync_count; | ||
602 | } | ||
603 | |||
604 | #define DMEMIT_SYNC \ | ||
605 | if (lc->sync != DEFAULTSYNC) \ | ||
606 | DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "") | ||
607 | |||
608 | static int core_status(struct dirty_log *log, status_type_t status, | ||
609 | char *result, unsigned int maxlen) | ||
610 | { | ||
611 | int sz = 0; | ||
612 | struct log_c *lc = log->context; | ||
613 | |||
614 | switch(status) { | ||
615 | case STATUSTYPE_INFO: | ||
616 | break; | ||
617 | |||
618 | case STATUSTYPE_TABLE: | ||
619 | DMEMIT("%s %u %u ", log->type->name, | ||
620 | lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size); | ||
621 | DMEMIT_SYNC; | ||
622 | } | ||
623 | |||
624 | return sz; | ||
625 | } | ||
626 | |||
627 | static int disk_status(struct dirty_log *log, status_type_t status, | ||
628 | char *result, unsigned int maxlen) | ||
629 | { | ||
630 | int sz = 0; | ||
631 | char buffer[16]; | ||
632 | struct log_c *lc = log->context; | ||
633 | |||
634 | switch(status) { | ||
635 | case STATUSTYPE_INFO: | ||
636 | break; | ||
637 | |||
638 | case STATUSTYPE_TABLE: | ||
639 | format_dev_t(buffer, lc->log_dev->bdev->bd_dev); | ||
640 | DMEMIT("%s %u %s %u ", log->type->name, | ||
641 | lc->sync == DEFAULTSYNC ? 2 : 3, buffer, | ||
642 | lc->region_size); | ||
643 | DMEMIT_SYNC; | ||
644 | } | ||
645 | |||
646 | return sz; | ||
647 | } | ||
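Reading the two DMEMIT formats above, the STATUSTYPE_TABLE output looks like the following (log device and region size hypothetical); the leading count is the number of arguments that follow the log type name:

    core 1 1024
    core 2 1024 nosync
    disk 2 253:3 1024
    disk 3 253:3 1024 nosync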
648 | |||
649 | static struct dirty_log_type _core_type = { | ||
650 | .name = "core", | ||
651 | .module = THIS_MODULE, | ||
652 | .ctr = core_ctr, | ||
653 | .dtr = core_dtr, | ||
654 | .get_region_size = core_get_region_size, | ||
655 | .is_clean = core_is_clean, | ||
656 | .in_sync = core_in_sync, | ||
657 | .flush = core_flush, | ||
658 | .mark_region = core_mark_region, | ||
659 | .clear_region = core_clear_region, | ||
660 | .get_resync_work = core_get_resync_work, | ||
661 | .complete_resync_work = core_complete_resync_work, | ||
662 | .get_sync_count = core_get_sync_count, | ||
663 | .status = core_status, | ||
664 | }; | ||
665 | |||
666 | static struct dirty_log_type _disk_type = { | ||
667 | .name = "disk", | ||
668 | .module = THIS_MODULE, | ||
669 | .ctr = disk_ctr, | ||
670 | .dtr = disk_dtr, | ||
671 | .suspend = disk_flush, | ||
672 | .resume = disk_resume, | ||
673 | .get_region_size = core_get_region_size, | ||
674 | .is_clean = core_is_clean, | ||
675 | .in_sync = core_in_sync, | ||
676 | .flush = disk_flush, | ||
677 | .mark_region = core_mark_region, | ||
678 | .clear_region = core_clear_region, | ||
679 | .get_resync_work = core_get_resync_work, | ||
680 | .complete_resync_work = core_complete_resync_work, | ||
681 | .get_sync_count = core_get_sync_count, | ||
682 | .status = disk_status, | ||
683 | }; | ||
684 | |||
685 | int __init dm_dirty_log_init(void) | ||
686 | { | ||
687 | int r; | ||
688 | |||
689 | r = dm_register_dirty_log_type(&_core_type); | ||
690 | if (r) | ||
691 | DMWARN("couldn't register core log"); | ||
692 | |||
693 | r = dm_register_dirty_log_type(&_disk_type); | ||
694 | if (r) { | ||
695 | DMWARN("couldn't register disk type"); | ||
696 | dm_unregister_dirty_log_type(&_core_type); | ||
697 | } | ||
698 | |||
699 | return r; | ||
700 | } | ||
701 | |||
702 | void dm_dirty_log_exit(void) | ||
703 | { | ||
704 | dm_unregister_dirty_log_type(&_disk_type); | ||
705 | dm_unregister_dirty_log_type(&_core_type); | ||
706 | } | ||
707 | |||
708 | EXPORT_SYMBOL(dm_register_dirty_log_type); | ||
709 | EXPORT_SYMBOL(dm_unregister_dirty_log_type); | ||
710 | EXPORT_SYMBOL(dm_create_dirty_log); | ||
711 | EXPORT_SYMBOL(dm_destroy_dirty_log); | ||
diff --git a/drivers/md/dm-log.h b/drivers/md/dm-log.h new file mode 100644 index 000000000000..5ae5309ebf28 --- /dev/null +++ b/drivers/md/dm-log.h | |||
@@ -0,0 +1,130 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software | ||
3 | * | ||
4 | * This file is released under the LGPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef DM_DIRTY_LOG | ||
8 | #define DM_DIRTY_LOG | ||
9 | |||
10 | #include "dm.h" | ||
11 | |||
12 | typedef sector_t region_t; | ||
13 | |||
14 | struct dirty_log_type; | ||
15 | |||
16 | struct dirty_log { | ||
17 | struct dirty_log_type *type; | ||
18 | void *context; | ||
19 | }; | ||
20 | |||
21 | struct dirty_log_type { | ||
22 | struct list_head list; | ||
23 | const char *name; | ||
24 | struct module *module; | ||
25 | unsigned int use_count; | ||
26 | |||
27 | int (*ctr)(struct dirty_log *log, struct dm_target *ti, | ||
28 | unsigned int argc, char **argv); | ||
29 | void (*dtr)(struct dirty_log *log); | ||
30 | |||
31 | /* | ||
32 | * There are times when we don't want the log to touch | ||
33 | * the disk. | ||
34 | */ | ||
35 | int (*suspend)(struct dirty_log *log); | ||
36 | int (*resume)(struct dirty_log *log); | ||
37 | |||
38 | /* | ||
39 | * Retrieves the smallest size of region that the log can | ||
40 | * deal with. | ||
41 | */ | ||
42 | uint32_t (*get_region_size)(struct dirty_log *log); | ||
43 | |||
44 | /* | ||
45 | * A predicate to say whether a region is clean or not. | ||
46 | * May block. | ||
47 | */ | ||
48 | int (*is_clean)(struct dirty_log *log, region_t region); | ||
49 | |||
50 | /* | ||
51 | * Returns: 0, 1, -EWOULDBLOCK, < 0 | ||
52 | * | ||
53 | * A predicate function to check whether the given | ||
54 | * region is in sync. | ||
55 | * | ||
56 | * If -EWOULDBLOCK is returned the state of the region is | ||
57 | * unknown, typically this will result in a read being | ||
58 | * passed to a daemon to deal with, since a daemon is | ||
59 | * allowed to block. | ||
60 | */ | ||
61 | int (*in_sync)(struct dirty_log *log, region_t region, int can_block); | ||
62 | |||
63 | /* | ||
64 | * Flush the current log state (eg, to disk). This | ||
65 | * function may block. | ||
66 | */ | ||
67 | int (*flush)(struct dirty_log *log); | ||
68 | |||
69 | /* | ||
70 | * Mark an area as clean or dirty. These functions may | ||
71 | * block, though for performance reasons blocking should | ||
72 | * be extremely rare (eg, allocating another chunk of | ||
73 | * memory for some reason). | ||
74 | */ | ||
75 | void (*mark_region)(struct dirty_log *log, region_t region); | ||
76 | void (*clear_region)(struct dirty_log *log, region_t region); | ||
77 | |||
78 | /* | ||
79 | * Returns: <0 (error), 0 (no region), 1 (region) | ||
80 | * | ||
81 | * The mirrord will need to perform recovery on regions of | ||
82 | * the mirror that are in the NOSYNC state. This | ||
83 | * function asks the log to tell the caller about the | ||
84 | * next region that this machine should recover. | ||
85 | * | ||
86 | * Do not confuse this function with 'in_sync()', one | ||
87 | * tells you if an area is synchronised, the other | ||
88 | * assigns recovery work. | ||
89 | */ | ||
90 | int (*get_resync_work)(struct dirty_log *log, region_t *region); | ||
91 | |||
92 | /* | ||
93 | * This notifies the log that the resync of an area has | ||
94 | * been completed. The log should then mark this region | ||
95 | * as CLEAN. | ||
96 | */ | ||
97 | void (*complete_resync_work)(struct dirty_log *log, | ||
98 | region_t region, int success); | ||
99 | |||
100 | /* | ||
101 | * Returns the number of regions that are in sync. | ||
102 | */ | ||
103 | region_t (*get_sync_count)(struct dirty_log *log); | ||
104 | |||
105 | /* | ||
106 | * Support function for mirror status requests. | ||
107 | */ | ||
108 | int (*status)(struct dirty_log *log, status_type_t status_type, | ||
109 | char *result, unsigned int maxlen); | ||
110 | }; | ||
111 | |||
112 | int dm_register_dirty_log_type(struct dirty_log_type *type); | ||
113 | int dm_unregister_dirty_log_type(struct dirty_log_type *type); | ||
114 | |||
115 | |||
116 | /* | ||
117 | * Make sure you use these two functions, rather than calling | ||
118 | * type->constructor/destructor() directly. | ||
119 | */ | ||
120 | struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti, | ||
121 | unsigned int argc, char **argv); | ||
122 | void dm_destroy_dirty_log(struct dirty_log *log); | ||
123 | |||
124 | /* | ||
125 | * init/exit functions. | ||
126 | */ | ||
127 | int dm_dirty_log_init(void); | ||
128 | void dm_dirty_log_exit(void); | ||
129 | |||
130 | #endif | ||
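A minimal consumer sketch (hypothetical; the real dm-raid1 recovery path is more involved) showing how get_resync_work() and complete_resync_work() are intended to be paired, with a flush at the end (a no-op for the core log):

static void recover_all(struct dirty_log *log)
{
	region_t region;

	/* Hand out NOSYNC regions one at a time and record the outcome. */
	while (log->type->get_resync_work(log, &region) == 1) {
		/* recover_one_region() is a hypothetical helper that
		 * returns 0 on success. */
		int err = recover_one_region(region);

		log->type->complete_resync_work(log, region, !err);
	}

	/* Push the updated state out (eg, to disk for the disk log). */
	log->type->flush(log);
}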
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c new file mode 100644 index 000000000000..43763a0bd096 --- /dev/null +++ b/drivers/md/dm-mpath.c | |||
@@ -0,0 +1,1302 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software Limited. | ||
3 | * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | */ | ||
7 | |||
8 | #include "dm.h" | ||
9 | #include "dm-path-selector.h" | ||
10 | #include "dm-hw-handler.h" | ||
11 | #include "dm-bio-list.h" | ||
12 | #include "dm-bio-record.h" | ||
13 | |||
14 | #include <linux/ctype.h> | ||
15 | #include <linux/init.h> | ||
16 | #include <linux/mempool.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/pagemap.h> | ||
19 | #include <linux/slab.h> | ||
20 | #include <linux/time.h> | ||
21 | #include <linux/workqueue.h> | ||
22 | #include <asm/atomic.h> | ||
23 | |||
24 | #define MESG_STR(x) x, sizeof(x) | ||
25 | |||
26 | /* Path properties */ | ||
27 | struct pgpath { | ||
28 | struct list_head list; | ||
29 | |||
30 | struct priority_group *pg; /* Owning PG */ | ||
31 | unsigned fail_count; /* Cumulative failure count */ | ||
32 | |||
33 | struct path path; | ||
34 | }; | ||
35 | |||
36 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) | ||
37 | |||
38 | /* | ||
39 | * Paths are grouped into Priority Groups and numbered from 1 upwards. | ||
40 | * Each has a path selector which controls which path gets used. | ||
41 | */ | ||
42 | struct priority_group { | ||
43 | struct list_head list; | ||
44 | |||
45 | struct multipath *m; /* Owning multipath instance */ | ||
46 | struct path_selector ps; | ||
47 | |||
48 | unsigned pg_num; /* Reference number */ | ||
49 | unsigned bypassed; /* Temporarily bypass this PG? */ | ||
50 | |||
51 | unsigned nr_pgpaths; /* Number of paths in PG */ | ||
52 | struct list_head pgpaths; | ||
53 | }; | ||
54 | |||
55 | /* Multipath context */ | ||
56 | struct multipath { | ||
57 | struct list_head list; | ||
58 | struct dm_target *ti; | ||
59 | |||
60 | spinlock_t lock; | ||
61 | |||
62 | struct hw_handler hw_handler; | ||
63 | unsigned nr_priority_groups; | ||
64 | struct list_head priority_groups; | ||
65 | unsigned pg_init_required; /* pg_init needs calling? */ | ||
66 | |||
67 | unsigned nr_valid_paths; /* Total number of usable paths */ | ||
68 | struct pgpath *current_pgpath; | ||
69 | struct priority_group *current_pg; | ||
70 | struct priority_group *next_pg; /* Switch to this PG if set */ | ||
71 | unsigned repeat_count; /* I/Os left before calling PS again */ | ||
72 | |||
73 | unsigned queue_io; /* Must we queue all I/O? */ | ||
74 | unsigned queue_if_no_path; /* Queue I/O if last path fails? */ | ||
75 | unsigned suspended; /* Has dm core suspended our I/O? */ | ||
76 | |||
77 | struct work_struct process_queued_ios; | ||
78 | struct bio_list queued_ios; | ||
79 | unsigned queue_size; | ||
80 | |||
81 | struct work_struct trigger_event; | ||
82 | |||
83 | /* | ||
84 | * We must use a mempool of mpath_io structs so that we | ||
85 | * can resubmit bios on error. | ||
86 | */ | ||
87 | mempool_t *mpio_pool; | ||
88 | }; | ||
89 | |||
90 | /* | ||
91 | * Context information attached to each bio we process. | ||
92 | */ | ||
93 | struct mpath_io { | ||
94 | struct pgpath *pgpath; | ||
95 | struct dm_bio_details details; | ||
96 | }; | ||
97 | |||
98 | typedef int (*action_fn) (struct pgpath *pgpath); | ||
99 | |||
100 | #define MIN_IOS 256 /* Mempool size */ | ||
101 | |||
102 | static kmem_cache_t *_mpio_cache; | ||
103 | |||
104 | static void process_queued_ios(void *data); | ||
105 | static void trigger_event(void *data); | ||
106 | |||
107 | |||
108 | /*----------------------------------------------- | ||
109 | * Allocation routines | ||
110 | *-----------------------------------------------*/ | ||
111 | |||
112 | static struct pgpath *alloc_pgpath(void) | ||
113 | { | ||
114 | struct pgpath *pgpath = kmalloc(sizeof(*pgpath), GFP_KERNEL); | ||
115 | |||
116 | if (pgpath) { | ||
117 | memset(pgpath, 0, sizeof(*pgpath)); | ||
118 | pgpath->path.is_active = 1; | ||
119 | } | ||
120 | |||
121 | return pgpath; | ||
122 | } | ||
123 | |||
124 | static inline void free_pgpath(struct pgpath *pgpath) | ||
125 | { | ||
126 | kfree(pgpath); | ||
127 | } | ||
128 | |||
129 | static struct priority_group *alloc_priority_group(void) | ||
130 | { | ||
131 | struct priority_group *pg; | ||
132 | |||
133 | pg = kmalloc(sizeof(*pg), GFP_KERNEL); | ||
134 | if (!pg) | ||
135 | return NULL; | ||
136 | |||
137 | memset(pg, 0, sizeof(*pg)); | ||
138 | INIT_LIST_HEAD(&pg->pgpaths); | ||
139 | |||
140 | return pg; | ||
141 | } | ||
142 | |||
143 | static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) | ||
144 | { | ||
145 | struct pgpath *pgpath, *tmp; | ||
146 | |||
147 | list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { | ||
148 | list_del(&pgpath->list); | ||
149 | dm_put_device(ti, pgpath->path.dev); | ||
150 | free_pgpath(pgpath); | ||
151 | } | ||
152 | } | ||
153 | |||
154 | static void free_priority_group(struct priority_group *pg, | ||
155 | struct dm_target *ti) | ||
156 | { | ||
157 | struct path_selector *ps = &pg->ps; | ||
158 | |||
159 | if (ps->type) { | ||
160 | ps->type->destroy(ps); | ||
161 | dm_put_path_selector(ps->type); | ||
162 | } | ||
163 | |||
164 | free_pgpaths(&pg->pgpaths, ti); | ||
165 | kfree(pg); | ||
166 | } | ||
167 | |||
168 | static struct multipath *alloc_multipath(void) | ||
169 | { | ||
170 | struct multipath *m; | ||
171 | |||
172 | m = kmalloc(sizeof(*m), GFP_KERNEL); | ||
173 | if (m) { | ||
174 | memset(m, 0, sizeof(*m)); | ||
175 | INIT_LIST_HEAD(&m->priority_groups); | ||
176 | spin_lock_init(&m->lock); | ||
177 | m->queue_io = 1; | ||
178 | INIT_WORK(&m->process_queued_ios, process_queued_ios, m); | ||
179 | INIT_WORK(&m->trigger_event, trigger_event, m); | ||
180 | m->mpio_pool = mempool_create(MIN_IOS, mempool_alloc_slab, | ||
181 | mempool_free_slab, _mpio_cache); | ||
182 | if (!m->mpio_pool) { | ||
183 | kfree(m); | ||
184 | return NULL; | ||
185 | } | ||
186 | } | ||
187 | |||
188 | return m; | ||
189 | } | ||
190 | |||
191 | static void free_multipath(struct multipath *m) | ||
192 | { | ||
193 | struct priority_group *pg, *tmp; | ||
194 | struct hw_handler *hwh = &m->hw_handler; | ||
195 | |||
196 | list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) { | ||
197 | list_del(&pg->list); | ||
198 | free_priority_group(pg, m->ti); | ||
199 | } | ||
200 | |||
201 | if (hwh->type) { | ||
202 | hwh->type->destroy(hwh); | ||
203 | dm_put_hw_handler(hwh->type); | ||
204 | } | ||
205 | |||
206 | mempool_destroy(m->mpio_pool); | ||
207 | kfree(m); | ||
208 | } | ||
209 | |||
210 | |||
211 | /*----------------------------------------------- | ||
212 | * Path selection | ||
213 | *-----------------------------------------------*/ | ||
214 | |||
215 | static void __switch_pg(struct multipath *m, struct pgpath *pgpath) | ||
216 | { | ||
217 | struct hw_handler *hwh = &m->hw_handler; | ||
218 | |||
219 | m->current_pg = pgpath->pg; | ||
220 | |||
221 | /* Must we initialise the PG first, and queue I/O till it's ready? */ | ||
222 | if (hwh->type && hwh->type->pg_init) { | ||
223 | m->pg_init_required = 1; | ||
224 | m->queue_io = 1; | ||
225 | } else { | ||
226 | m->pg_init_required = 0; | ||
227 | m->queue_io = 0; | ||
228 | } | ||
229 | } | ||
230 | |||
231 | static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) | ||
232 | { | ||
233 | struct path *path; | ||
234 | |||
235 | path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); | ||
236 | if (!path) | ||
237 | return -ENXIO; | ||
238 | |||
239 | m->current_pgpath = path_to_pgpath(path); | ||
240 | |||
241 | if (m->current_pg != pg) | ||
242 | __switch_pg(m, m->current_pgpath); | ||
243 | |||
244 | return 0; | ||
245 | } | ||
246 | |||
247 | static void __choose_pgpath(struct multipath *m) | ||
248 | { | ||
249 | struct priority_group *pg; | ||
250 | unsigned bypassed = 1; | ||
251 | |||
252 | if (!m->nr_valid_paths) | ||
253 | goto failed; | ||
254 | |||
255 | /* Were we instructed to switch PG? */ | ||
256 | if (m->next_pg) { | ||
257 | pg = m->next_pg; | ||
258 | m->next_pg = NULL; | ||
259 | if (!__choose_path_in_pg(m, pg)) | ||
260 | return; | ||
261 | } | ||
262 | |||
263 | /* Don't change PG until it has no remaining paths */ | ||
264 | if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) | ||
265 | return; | ||
266 | |||
267 | /* | ||
268 | * Loop through priority groups until we find a valid path. | ||
269 | * First time we skip PGs marked 'bypassed'. | ||
270 | * Second time we only try the ones we skipped. | ||
271 | */ | ||
272 | do { | ||
273 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
274 | if (pg->bypassed == bypassed) | ||
275 | continue; | ||
276 | if (!__choose_path_in_pg(m, pg)) | ||
277 | return; | ||
278 | } | ||
279 | } while (bypassed--); | ||
280 | |||
281 | failed: | ||
282 | m->current_pgpath = NULL; | ||
283 | m->current_pg = NULL; | ||
284 | } | ||
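A brief trace of the fallback order above, for a table where PG1 is currently bypassed and PG2 is not:

    pass 1 (bypassed == 1): skip PG1 (bypassed), try PG2
    pass 2 (bypassed == 0): skip PG2, try PG1 as a last resort
    both passes fail:       current_pgpath = current_pg = NULL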
285 | |||
286 | static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio, | ||
287 | unsigned was_queued) | ||
288 | { | ||
289 | int r = 1; | ||
290 | unsigned long flags; | ||
291 | struct pgpath *pgpath; | ||
292 | |||
293 | spin_lock_irqsave(&m->lock, flags); | ||
294 | |||
295 | /* Do we need to select a new pgpath? */ | ||
296 | if (!m->current_pgpath || | ||
297 | (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) | ||
298 | __choose_pgpath(m); | ||
299 | |||
300 | pgpath = m->current_pgpath; | ||
301 | |||
302 | if (was_queued) | ||
303 | m->queue_size--; | ||
304 | |||
305 | if ((pgpath && m->queue_io) || | ||
306 | (!pgpath && m->queue_if_no_path && !m->suspended)) { | ||
307 | /* Queue for the daemon to resubmit */ | ||
308 | bio_list_add(&m->queued_ios, bio); | ||
309 | m->queue_size++; | ||
310 | if (m->pg_init_required || !m->queue_io) | ||
311 | schedule_work(&m->process_queued_ios); | ||
312 | pgpath = NULL; | ||
313 | r = 0; | ||
314 | } else if (!pgpath) | ||
315 | r = -EIO; /* Failed */ | ||
316 | else | ||
317 | bio->bi_bdev = pgpath->path.dev->bdev; | ||
318 | |||
319 | mpio->pgpath = pgpath; | ||
320 | |||
321 | spin_unlock_irqrestore(&m->lock, flags); | ||
322 | |||
323 | return r; | ||
324 | } | ||
325 | |||
326 | /* | ||
327 | * If we run out of usable paths, should we queue I/O or error it? | ||
328 | */ | ||
329 | static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path) | ||
330 | { | ||
331 | unsigned long flags; | ||
332 | |||
333 | spin_lock_irqsave(&m->lock, flags); | ||
334 | |||
335 | m->queue_if_no_path = queue_if_no_path; | ||
336 | if (!m->queue_if_no_path) | ||
337 | schedule_work(&m->process_queued_ios); | ||
338 | |||
339 | spin_unlock_irqrestore(&m->lock, flags); | ||
340 | |||
341 | return 0; | ||
342 | } | ||
343 | |||
344 | /*----------------------------------------------------------------- | ||
345 | * The multipath daemon is responsible for resubmitting queued ios. | ||
346 | *---------------------------------------------------------------*/ | ||
347 | |||
348 | static void dispatch_queued_ios(struct multipath *m) | ||
349 | { | ||
350 | int r; | ||
351 | unsigned long flags; | ||
352 | struct bio *bio = NULL, *next; | ||
353 | struct mpath_io *mpio; | ||
354 | union map_info *info; | ||
355 | |||
356 | spin_lock_irqsave(&m->lock, flags); | ||
357 | bio = bio_list_get(&m->queued_ios); | ||
358 | spin_unlock_irqrestore(&m->lock, flags); | ||
359 | |||
360 | while (bio) { | ||
361 | next = bio->bi_next; | ||
362 | bio->bi_next = NULL; | ||
363 | |||
364 | info = dm_get_mapinfo(bio); | ||
365 | mpio = info->ptr; | ||
366 | |||
367 | r = map_io(m, bio, mpio, 1); | ||
368 | if (r < 0) | ||
369 | bio_endio(bio, bio->bi_size, r); | ||
370 | else if (r == 1) | ||
371 | generic_make_request(bio); | ||
372 | |||
373 | bio = next; | ||
374 | } | ||
375 | } | ||
376 | |||
377 | static void process_queued_ios(void *data) | ||
378 | { | ||
379 | struct multipath *m = (struct multipath *) data; | ||
380 | struct hw_handler *hwh = &m->hw_handler; | ||
381 | struct pgpath *pgpath; | ||
382 | unsigned init_required, must_queue = 0; | ||
383 | unsigned long flags; | ||
384 | |||
385 | spin_lock_irqsave(&m->lock, flags); | ||
386 | |||
387 | if (!m->current_pgpath) | ||
388 | __choose_pgpath(m); | ||
389 | |||
390 | pgpath = m->current_pgpath; | ||
391 | |||
392 | if ((pgpath && m->queue_io) || | ||
393 | (!pgpath && m->queue_if_no_path && !m->suspended)) | ||
394 | must_queue = 1; | ||
395 | |||
396 | init_required = m->pg_init_required; | ||
397 | if (init_required) | ||
398 | m->pg_init_required = 0; | ||
399 | |||
400 | spin_unlock_irqrestore(&m->lock, flags); | ||
401 | |||
402 | if (init_required) | ||
403 | hwh->type->pg_init(hwh, pgpath->pg->bypassed, &pgpath->path); | ||
404 | |||
405 | if (!must_queue) | ||
406 | dispatch_queued_ios(m); | ||
407 | } | ||
408 | |||
409 | /* | ||
410 | * An event is triggered whenever a path is taken out of use. | ||
411 | * Includes path failure and PG bypass. | ||
412 | */ | ||
413 | static void trigger_event(void *data) | ||
414 | { | ||
415 | struct multipath *m = (struct multipath *) data; | ||
416 | |||
417 | dm_table_event(m->ti->table); | ||
418 | } | ||
419 | |||
420 | /*----------------------------------------------------------------- | ||
421 | * Constructor/argument parsing: | ||
422 | * <#multipath feature args> [<arg>]* | ||
423 | * <#hw_handler args> [hw_handler [<arg>]*] | ||
424 | * <#priority groups> | ||
425 | * <initial priority group> | ||
426 | * [<selector> <#selector args> [<arg>]* | ||
427 | * <#paths> <#per-path selector args> | ||
428 | * [<path> [<arg>]* ]+ ]+ | ||
429 | *---------------------------------------------------------------*/ | ||
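For illustration, a hypothetical table string matching the grammar above (device numbers are made up, and it is assumed here that the round-robin selector takes no selector args and one per-path repeat_count argument):

    0 0 2 1 round-robin 0 2 1 8:16 1000 8:32 1000 round-robin 0 1 1 8:48 1000

    0 0              no multipath feature args, no hw_handler args
    2 1              two priority groups, start with group 1
    round-robin 0    PG1 selector, zero selector args
    2 1              two paths in PG1, one selector arg per path
    8:16 1000        first path device and its repeat_count
    8:32 1000        second path device and its repeat_count
    round-robin 0 1 1 8:48 1000    PG2: same selector, a single path

Replacing the leading "0" with "1 queue_if_no_path" would enable queueing when all paths have failed, per parse_features() below.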
430 | struct param { | ||
431 | unsigned min; | ||
432 | unsigned max; | ||
433 | char *error; | ||
434 | }; | ||
435 | |||
436 | #define ESTR(s) ("dm-multipath: " s) | ||
437 | |||
438 | static int read_param(struct param *param, char *str, unsigned *v, char **error) | ||
439 | { | ||
440 | if (!str || | ||
441 | (sscanf(str, "%u", v) != 1) || | ||
442 | (*v < param->min) || | ||
443 | (*v > param->max)) { | ||
444 | *error = param->error; | ||
445 | return -EINVAL; | ||
446 | } | ||
447 | |||
448 | return 0; | ||
449 | } | ||
450 | |||
451 | struct arg_set { | ||
452 | unsigned argc; | ||
453 | char **argv; | ||
454 | }; | ||
455 | |||
456 | static char *shift(struct arg_set *as) | ||
457 | { | ||
458 | char *r; | ||
459 | |||
460 | if (as->argc) { | ||
461 | as->argc--; | ||
462 | r = *as->argv; | ||
463 | as->argv++; | ||
464 | return r; | ||
465 | } | ||
466 | |||
467 | return NULL; | ||
468 | } | ||
469 | |||
470 | static void consume(struct arg_set *as, unsigned n) | ||
471 | { | ||
472 | BUG_ON(as->argc < n); | ||
473 | as->argc -= n; | ||
474 | as->argv += n; | ||
475 | } | ||
476 | |||
477 | static int parse_path_selector(struct arg_set *as, struct priority_group *pg, | ||
478 | struct dm_target *ti) | ||
479 | { | ||
480 | int r; | ||
481 | struct path_selector_type *pst; | ||
482 | unsigned ps_argc; | ||
483 | |||
484 | static struct param _params[] = { | ||
485 | {0, 1024, ESTR("invalid number of path selector args")}, | ||
486 | }; | ||
487 | |||
488 | pst = dm_get_path_selector(shift(as)); | ||
489 | if (!pst) { | ||
490 | ti->error = ESTR("unknown path selector type"); | ||
491 | return -EINVAL; | ||
492 | } | ||
493 | |||
494 | r = read_param(_params, shift(as), &ps_argc, &ti->error); | ||
495 | if (r) | ||
496 | return -EINVAL; | ||
497 | |||
498 | r = pst->create(&pg->ps, ps_argc, as->argv); | ||
499 | if (r) { | ||
500 | dm_put_path_selector(pst); | ||
501 | ti->error = ESTR("path selector constructor failed"); | ||
502 | return r; | ||
503 | } | ||
504 | |||
505 | pg->ps.type = pst; | ||
506 | consume(as, ps_argc); | ||
507 | |||
508 | return 0; | ||
509 | } | ||
510 | |||
511 | static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, | ||
512 | struct dm_target *ti) | ||
513 | { | ||
514 | int r; | ||
515 | struct pgpath *p; | ||
516 | |||
517 | /* we need at least a path arg */ | ||
518 | if (as->argc < 1) { | ||
519 | ti->error = ESTR("no device given"); | ||
520 | return NULL; | ||
521 | } | ||
522 | |||
523 | p = alloc_pgpath(); | ||
524 | if (!p) | ||
525 | return NULL; | ||
526 | |||
527 | r = dm_get_device(ti, shift(as), ti->begin, ti->len, | ||
528 | dm_table_get_mode(ti->table), &p->path.dev); | ||
529 | if (r) { | ||
530 | ti->error = ESTR("error getting device"); | ||
531 | goto bad; | ||
532 | } | ||
533 | |||
534 | r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); | ||
535 | if (r) { | ||
536 | dm_put_device(ti, p->path.dev); | ||
537 | goto bad; | ||
538 | } | ||
539 | |||
540 | return p; | ||
541 | |||
542 | bad: | ||
543 | free_pgpath(p); | ||
544 | return NULL; | ||
545 | } | ||
546 | |||
547 | static struct priority_group *parse_priority_group(struct arg_set *as, | ||
548 | struct multipath *m, | ||
549 | struct dm_target *ti) | ||
550 | { | ||
551 | static struct param _params[] = { | ||
552 | {1, 1024, ESTR("invalid number of paths")}, | ||
553 | {0, 1024, ESTR("invalid number of selector args")} | ||
554 | }; | ||
555 | |||
556 | int r; | ||
557 | unsigned i, nr_selector_args, nr_params; | ||
558 | struct priority_group *pg; | ||
559 | |||
560 | if (as->argc < 2) { | ||
561 | as->argc = 0; | ||
562 | ti->error = ESTR("not enough priority group arguments"); | ||
563 | return NULL; | ||
564 | } | ||
565 | |||
566 | pg = alloc_priority_group(); | ||
567 | if (!pg) { | ||
568 | ti->error = ESTR("couldn't allocate priority group"); | ||
569 | return NULL; | ||
570 | } | ||
571 | pg->m = m; | ||
572 | |||
573 | r = parse_path_selector(as, pg, ti); | ||
574 | if (r) | ||
575 | goto bad; | ||
576 | |||
577 | /* | ||
578 | * read the paths | ||
579 | */ | ||
580 | r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error); | ||
581 | if (r) | ||
582 | goto bad; | ||
583 | |||
584 | r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error); | ||
585 | if (r) | ||
586 | goto bad; | ||
587 | |||
588 | nr_params = 1 + nr_selector_args; | ||
589 | for (i = 0; i < pg->nr_pgpaths; i++) { | ||
590 | struct pgpath *pgpath; | ||
591 | struct arg_set path_args; | ||
592 | |||
593 | if (as->argc < nr_params) | ||
594 | goto bad; | ||
595 | |||
596 | path_args.argc = nr_params; | ||
597 | path_args.argv = as->argv; | ||
598 | |||
599 | pgpath = parse_path(&path_args, &pg->ps, ti); | ||
600 | if (!pgpath) | ||
601 | goto bad; | ||
602 | |||
603 | pgpath->pg = pg; | ||
604 | list_add_tail(&pgpath->list, &pg->pgpaths); | ||
605 | consume(as, nr_params); | ||
606 | } | ||
607 | |||
608 | return pg; | ||
609 | |||
610 | bad: | ||
611 | free_priority_group(pg, ti); | ||
612 | return NULL; | ||
613 | } | ||
614 | |||
615 | static int parse_hw_handler(struct arg_set *as, struct multipath *m, | ||
616 | struct dm_target *ti) | ||
617 | { | ||
618 | int r; | ||
619 | struct hw_handler_type *hwht; | ||
620 | unsigned hw_argc; | ||
621 | |||
622 | static struct param _params[] = { | ||
623 | {0, 1024, ESTR("invalid number of hardware handler args")}, | ||
624 | }; | ||
625 | |||
626 | r = read_param(_params, shift(as), &hw_argc, &ti->error); | ||
627 | if (r) | ||
628 | return -EINVAL; | ||
629 | |||
630 | if (!hw_argc) | ||
631 | return 0; | ||
632 | |||
633 | hwht = dm_get_hw_handler(shift(as)); | ||
634 | if (!hwht) { | ||
635 | ti->error = ESTR("unknown hardware handler type"); | ||
636 | return -EINVAL; | ||
637 | } | ||
638 | |||
639 | r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv); | ||
640 | if (r) { | ||
641 | dm_put_hw_handler(hwht); | ||
642 | ti->error = ESTR("hardware handler constructor failed"); | ||
643 | return r; | ||
644 | } | ||
645 | |||
646 | m->hw_handler.type = hwht; | ||
647 | consume(as, hw_argc - 1); | ||
648 | |||
649 | return 0; | ||
650 | } | ||
651 | |||
652 | static int parse_features(struct arg_set *as, struct multipath *m, | ||
653 | struct dm_target *ti) | ||
654 | { | ||
655 | int r; | ||
656 | unsigned argc; | ||
657 | |||
658 | static struct param _params[] = { | ||
659 | {0, 1, ESTR("invalid number of feature args")}, | ||
660 | }; | ||
661 | |||
662 | r = read_param(_params, shift(as), &argc, &ti->error); | ||
663 | if (r) | ||
664 | return -EINVAL; | ||
665 | |||
666 | if (!argc) | ||
667 | return 0; | ||
668 | |||
669 | if (!strnicmp(shift(as), MESG_STR("queue_if_no_path"))) | ||
670 | return queue_if_no_path(m, 1); | ||
671 | else { | ||
672 | ti->error = "Unrecognised multipath feature request"; | ||
673 | return -EINVAL; | ||
674 | } | ||
675 | } | ||
676 | |||
677 | static int multipath_ctr(struct dm_target *ti, unsigned int argc, | ||
678 | char **argv) | ||
679 | { | ||
680 | /* target parameters */ | ||
681 | static struct param _params[] = { | ||
682 | {1, 1024, ESTR("invalid number of priority groups")}, | ||
683 | {1, 1024, ESTR("invalid initial priority group number")}, | ||
684 | }; | ||
685 | |||
686 | int r; | ||
687 | struct multipath *m; | ||
688 | struct arg_set as; | ||
689 | unsigned pg_count = 0; | ||
690 | unsigned next_pg_num; | ||
691 | |||
692 | as.argc = argc; | ||
693 | as.argv = argv; | ||
694 | |||
695 | m = alloc_multipath(); | ||
696 | if (!m) { | ||
697 | ti->error = ESTR("can't allocate multipath"); | ||
698 | return -EINVAL; | ||
699 | } | ||
700 | |||
701 | r = parse_features(&as, m, ti); | ||
702 | if (r) | ||
703 | goto bad; | ||
704 | |||
705 | r = parse_hw_handler(&as, m, ti); | ||
706 | if (r) | ||
707 | goto bad; | ||
708 | |||
709 | r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error); | ||
710 | if (r) | ||
711 | goto bad; | ||
712 | |||
713 | r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error); | ||
714 | if (r) | ||
715 | goto bad; | ||
716 | |||
717 | /* parse the priority groups */ | ||
718 | while (as.argc) { | ||
719 | struct priority_group *pg; | ||
720 | |||
721 | pg = parse_priority_group(&as, m, ti); | ||
722 | if (!pg) { | ||
723 | r = -EINVAL; | ||
724 | goto bad; | ||
725 | } | ||
726 | |||
727 | m->nr_valid_paths += pg->nr_pgpaths; | ||
728 | list_add_tail(&pg->list, &m->priority_groups); | ||
729 | pg_count++; | ||
730 | pg->pg_num = pg_count; | ||
731 | if (!--next_pg_num) | ||
732 | m->next_pg = pg; | ||
733 | } | ||
734 | |||
735 | if (pg_count != m->nr_priority_groups) { | ||
736 | ti->error = ESTR("priority group count mismatch"); | ||
737 | r = -EINVAL; | ||
738 | goto bad; | ||
739 | } | ||
740 | |||
741 | ti->private = m; | ||
742 | m->ti = ti; | ||
743 | |||
744 | return 0; | ||
745 | |||
746 | bad: | ||
747 | free_multipath(m); | ||
748 | return r; | ||
749 | } | ||
750 | |||
751 | static void multipath_dtr(struct dm_target *ti) | ||
752 | { | ||
753 | struct multipath *m = (struct multipath *) ti->private; | ||
754 | free_multipath(m); | ||
755 | } | ||
756 | |||
757 | /* | ||
758 | * Map bios, recording original fields for later in case we have to resubmit | ||
759 | */ | ||
760 | static int multipath_map(struct dm_target *ti, struct bio *bio, | ||
761 | union map_info *map_context) | ||
762 | { | ||
763 | int r; | ||
764 | struct mpath_io *mpio; | ||
765 | struct multipath *m = (struct multipath *) ti->private; | ||
766 | |||
767 | mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); | ||
768 | dm_bio_record(&mpio->details, bio); | ||
769 | |||
770 | map_context->ptr = mpio; | ||
771 | bio->bi_rw |= (1 << BIO_RW_FAILFAST); | ||
772 | r = map_io(m, bio, mpio, 0); | ||
773 | if (r < 0) | ||
774 | mempool_free(mpio, m->mpio_pool); | ||
775 | |||
776 | return r; | ||
777 | } | ||
778 | |||
779 | /* | ||
780 | * Take a path out of use. | ||
781 | */ | ||
782 | static int fail_path(struct pgpath *pgpath) | ||
783 | { | ||
784 | unsigned long flags; | ||
785 | struct multipath *m = pgpath->pg->m; | ||
786 | |||
787 | spin_lock_irqsave(&m->lock, flags); | ||
788 | |||
789 | if (!pgpath->path.is_active) | ||
790 | goto out; | ||
791 | |||
792 | DMWARN("dm-multipath: Failing path %s.", pgpath->path.dev->name); | ||
793 | |||
794 | pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); | ||
795 | pgpath->path.is_active = 0; | ||
796 | pgpath->fail_count++; | ||
797 | |||
798 | m->nr_valid_paths--; | ||
799 | |||
800 | if (pgpath == m->current_pgpath) | ||
801 | m->current_pgpath = NULL; | ||
802 | |||
803 | schedule_work(&m->trigger_event); | ||
804 | |||
805 | out: | ||
806 | spin_unlock_irqrestore(&m->lock, flags); | ||
807 | |||
808 | return 0; | ||
809 | } | ||
810 | |||
811 | /* | ||
812 | * Reinstate a previously-failed path | ||
813 | */ | ||
814 | static int reinstate_path(struct pgpath *pgpath) | ||
815 | { | ||
816 | int r = 0; | ||
817 | unsigned long flags; | ||
818 | struct multipath *m = pgpath->pg->m; | ||
819 | |||
820 | spin_lock_irqsave(&m->lock, flags); | ||
821 | |||
822 | if (pgpath->path.is_active) | ||
823 | goto out; | ||
824 | |||
825 | if (!pgpath->pg->ps.type->reinstate_path) { | ||
826 | DMWARN("Reinstate path not supported by path selector %s", | ||
827 | pgpath->pg->ps.type->name); | ||
828 | r = -EINVAL; | ||
829 | goto out; | ||
830 | } | ||
831 | |||
832 | r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); | ||
833 | if (r) | ||
834 | goto out; | ||
835 | |||
836 | pgpath->path.is_active = 1; | ||
837 | |||
838 | m->current_pgpath = NULL; | ||
839 | if (!m->nr_valid_paths++) | ||
840 | schedule_work(&m->process_queued_ios); | ||
841 | |||
842 | schedule_work(&m->trigger_event); | ||
843 | |||
844 | out: | ||
845 | spin_unlock_irqrestore(&m->lock, flags); | ||
846 | |||
847 | return r; | ||
848 | } | ||
849 | |||
850 | /* | ||
851 | * Fail or reinstate all paths that match the provided struct dm_dev. | ||
852 | */ | ||
853 | static int action_dev(struct multipath *m, struct dm_dev *dev, | ||
854 | action_fn action) | ||
855 | { | ||
856 | int r = 0; | ||
857 | struct pgpath *pgpath; | ||
858 | struct priority_group *pg; | ||
859 | |||
860 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
861 | list_for_each_entry(pgpath, &pg->pgpaths, list) { | ||
862 | if (pgpath->path.dev == dev) | ||
863 | r = action(pgpath); | ||
864 | } | ||
865 | } | ||
866 | |||
867 | return r; | ||
868 | } | ||
869 | |||
870 | /* | ||
871 | * Temporarily try to avoid having to use the specified PG | ||
872 | */ | ||
873 | static void bypass_pg(struct multipath *m, struct priority_group *pg, | ||
874 | int bypassed) | ||
875 | { | ||
876 | unsigned long flags; | ||
877 | |||
878 | spin_lock_irqsave(&m->lock, flags); | ||
879 | |||
880 | pg->bypassed = bypassed; | ||
881 | m->current_pgpath = NULL; | ||
882 | m->current_pg = NULL; | ||
883 | |||
884 | spin_unlock_irqrestore(&m->lock, flags); | ||
885 | |||
886 | schedule_work(&m->trigger_event); | ||
887 | } | ||
888 | |||
889 | /* | ||
890 | * Switch to using the specified PG from the next I/O that gets mapped | ||
891 | */ | ||
892 | static int switch_pg_num(struct multipath *m, const char *pgstr) | ||
893 | { | ||
894 | struct priority_group *pg; | ||
895 | unsigned pgnum; | ||
896 | unsigned long flags; | ||
897 | |||
898 | if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || | ||
899 | (pgnum > m->nr_priority_groups)) { | ||
900 | DMWARN("invalid PG number supplied to switch_pg_num"); | ||
901 | return -EINVAL; | ||
902 | } | ||
903 | |||
904 | spin_lock_irqsave(&m->lock, flags); | ||
905 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
906 | pg->bypassed = 0; | ||
907 | if (--pgnum) | ||
908 | continue; | ||
909 | |||
910 | m->current_pgpath = NULL; | ||
911 | m->current_pg = NULL; | ||
912 | m->next_pg = pg; | ||
913 | } | ||
914 | spin_unlock_irqrestore(&m->lock, flags); | ||
915 | |||
916 | schedule_work(&m->trigger_event); | ||
917 | return 0; | ||
918 | } | ||
919 | |||
920 | /* | ||
921 | * Set/clear bypassed status of a PG. | ||
922 | * PGs are numbered upwards from 1 in the order they were declared. | ||
923 | */ | ||
924 | static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) | ||
925 | { | ||
926 | struct priority_group *pg; | ||
927 | unsigned pgnum; | ||
928 | |||
929 | if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || | ||
930 | (pgnum > m->nr_priority_groups)) { | ||
931 | DMWARN("invalid PG number supplied to bypass_pg"); | ||
932 | return -EINVAL; | ||
933 | } | ||
934 | |||
935 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
936 | if (!--pgnum) | ||
937 | break; | ||
938 | } | ||
939 | |||
940 | bypass_pg(m, pg, bypassed); | ||
941 | return 0; | ||
942 | } | ||
943 | |||
944 | /* | ||
945 | * pg_init must call this when it has completed its initialisation | ||
946 | */ | ||
947 | void dm_pg_init_complete(struct path *path, unsigned err_flags) | ||
948 | { | ||
949 | struct pgpath *pgpath = path_to_pgpath(path); | ||
950 | struct priority_group *pg = pgpath->pg; | ||
951 | struct multipath *m = pg->m; | ||
952 | unsigned long flags; | ||
953 | |||
954 | /* We insist on failing the path if the PG is already bypassed. */ | ||
955 | if (err_flags && pg->bypassed) | ||
956 | err_flags |= MP_FAIL_PATH; | ||
957 | |||
958 | if (err_flags & MP_FAIL_PATH) | ||
959 | fail_path(pgpath); | ||
960 | |||
961 | if (err_flags & MP_BYPASS_PG) | ||
962 | bypass_pg(m, pg, 1); | ||
963 | |||
964 | spin_lock_irqsave(&m->lock, flags); | ||
965 | if (!err_flags) | ||
966 | m->queue_io = 0; | ||
967 | else { | ||
968 | m->current_pgpath = NULL; | ||
969 | m->current_pg = NULL; | ||
970 | } | ||
971 | schedule_work(&m->process_queued_ios); | ||
972 | spin_unlock_irqrestore(&m->lock, flags); | ||
973 | } | ||
974 | |||
975 | /* | ||
976 | * end_io handling | ||
977 | */ | ||
978 | static int do_end_io(struct multipath *m, struct bio *bio, | ||
979 | int error, struct mpath_io *mpio) | ||
980 | { | ||
981 | struct hw_handler *hwh = &m->hw_handler; | ||
982 | unsigned err_flags = MP_FAIL_PATH; /* Default behavior */ | ||
983 | |||
984 | if (!error) | ||
985 | return 0; /* I/O complete */ | ||
986 | |||
987 | spin_lock(&m->lock); | ||
988 | if (!m->nr_valid_paths) { | ||
989 | if (!m->queue_if_no_path || m->suspended) { | ||
990 | spin_unlock(&m->lock); | ||
991 | return -EIO; | ||
992 | } else { | ||
993 | spin_unlock(&m->lock); | ||
994 | goto requeue; | ||
995 | } | ||
996 | } | ||
997 | spin_unlock(&m->lock); | ||
998 | |||
999 | if (hwh->type && hwh->type->error) | ||
1000 | err_flags = hwh->type->error(hwh, bio); | ||
1001 | |||
1002 | if (mpio->pgpath) { | ||
1003 | if (err_flags & MP_FAIL_PATH) | ||
1004 | fail_path(mpio->pgpath); | ||
1005 | |||
1006 | if (err_flags & MP_BYPASS_PG) | ||
1007 | bypass_pg(m, mpio->pgpath->pg, 1); | ||
1008 | } | ||
1009 | |||
1010 | if (err_flags & MP_ERROR_IO) | ||
1011 | return -EIO; | ||
1012 | |||
1013 | requeue: | ||
1014 | dm_bio_restore(&mpio->details, bio); | ||
1015 | |||
1016 | /* queue for the daemon to resubmit or fail */ | ||
1017 | spin_lock(&m->lock); | ||
1018 | bio_list_add(&m->queued_ios, bio); | ||
1019 | m->queue_size++; | ||
1020 | if (!m->queue_io) | ||
1021 | schedule_work(&m->process_queued_ios); | ||
1022 | spin_unlock(&m->lock); | ||
1023 | |||
1024 | return 1; /* io not complete */ | ||
1025 | } | ||
1026 | |||
1027 | static int multipath_end_io(struct dm_target *ti, struct bio *bio, | ||
1028 | int error, union map_info *map_context) | ||
1029 | { | ||
1030 | struct multipath *m = (struct multipath *) ti->private; | ||
1031 | struct mpath_io *mpio = (struct mpath_io *) map_context->ptr; | ||
1032 | struct pgpath *pgpath = mpio->pgpath; | ||
1033 | struct path_selector *ps; | ||
1034 | int r; | ||
1035 | |||
1036 | r = do_end_io(m, bio, error, mpio); | ||
1037 | if (pgpath) { | ||
1038 | ps = &pgpath->pg->ps; | ||
1039 | if (ps->type->end_io) | ||
1040 | ps->type->end_io(ps, &pgpath->path); | ||
1041 | } | ||
1042 | if (r <= 0) | ||
1043 | mempool_free(mpio, m->mpio_pool); | ||
1044 | |||
1045 | return r; | ||
1046 | } | ||
1047 | |||
1048 | /* | ||
1049 | * Suspend can't complete until all the I/O is processed so if | ||
1050 | * the last path failed we will now error any queued I/O. | ||
1051 | */ | ||
1052 | static void multipath_presuspend(struct dm_target *ti) | ||
1053 | { | ||
1054 | struct multipath *m = (struct multipath *) ti->private; | ||
1055 | unsigned long flags; | ||
1056 | |||
1057 | spin_lock_irqsave(&m->lock, flags); | ||
1058 | m->suspended = 1; | ||
1059 | if (m->queue_if_no_path) | ||
1060 | schedule_work(&m->process_queued_ios); | ||
1061 | spin_unlock_irqrestore(&m->lock, flags); | ||
1062 | } | ||
1063 | |||
1064 | static void multipath_resume(struct dm_target *ti) | ||
1065 | { | ||
1066 | struct multipath *m = (struct multipath *) ti->private; | ||
1067 | unsigned long flags; | ||
1068 | |||
1069 | spin_lock_irqsave(&m->lock, flags); | ||
1070 | m->suspended = 0; | ||
1071 | spin_unlock_irqrestore(&m->lock, flags); | ||
1072 | } | ||
1073 | |||
1074 | /* | ||
1075 | * Info output has the following format: | ||
1076 | * num_multipath_feature_args [multipath_feature_args]* | ||
1077 | * num_handler_status_args [handler_status_args]* | ||
1078 | * num_groups init_group_number | ||
1079 | * [A|D|E num_ps_status_args [ps_status_args]* | ||
1080 | * num_paths num_selector_args | ||
1081 | * [path_dev A|F fail_count [selector_args]* ]+ ]+ | ||
1082 | * | ||
1083 | * Table output has the following format (identical to the constructor string): | ||
1084 | * num_feature_args [features_args]* | ||
1085 | * num_handler_args hw_handler [hw_handler_args]* | ||
1086 | * num_groups init_group_number | ||
1087 | * [priority selector-name num_ps_args [ps_args]* | ||
1088 | * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ | ||
1089 | */ | ||
1090 | static int multipath_status(struct dm_target *ti, status_type_t type, | ||
1091 | char *result, unsigned int maxlen) | ||
1092 | { | ||
1093 | int sz = 0; | ||
1094 | unsigned long flags; | ||
1095 | struct multipath *m = (struct multipath *) ti->private; | ||
1096 | struct hw_handler *hwh = &m->hw_handler; | ||
1097 | struct priority_group *pg; | ||
1098 | struct pgpath *p; | ||
1099 | unsigned pg_num; | ||
1100 | char state; | ||
1101 | |||
1102 | spin_lock_irqsave(&m->lock, flags); | ||
1103 | |||
1104 | /* Features */ | ||
1105 | if (type == STATUSTYPE_INFO) | ||
1106 | DMEMIT("1 %u ", m->queue_size); | ||
1107 | else if (m->queue_if_no_path) | ||
1108 | DMEMIT("1 queue_if_no_path "); | ||
1109 | else | ||
1110 | DMEMIT("0 "); | ||
1111 | |||
1112 | if (hwh->type && hwh->type->status) | ||
1113 | sz += hwh->type->status(hwh, type, result + sz, maxlen - sz); | ||
1114 | else if (!hwh->type || type == STATUSTYPE_INFO) | ||
1115 | DMEMIT("0 "); | ||
1116 | else | ||
1117 | DMEMIT("1 %s ", hwh->type->name); | ||
1118 | |||
1119 | DMEMIT("%u ", m->nr_priority_groups); | ||
1120 | |||
1121 | if (m->next_pg) | ||
1122 | pg_num = m->next_pg->pg_num; | ||
1123 | else if (m->current_pg) | ||
1124 | pg_num = m->current_pg->pg_num; | ||
1125 | else | ||
1126 | pg_num = 1; | ||
1127 | |||
1128 | DMEMIT("%u ", pg_num); | ||
1129 | |||
1130 | switch (type) { | ||
1131 | case STATUSTYPE_INFO: | ||
1132 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
1133 | if (pg->bypassed) | ||
1134 | state = 'D'; /* Disabled */ | ||
1135 | else if (pg == m->current_pg) | ||
1136 | state = 'A'; /* Currently Active */ | ||
1137 | else | ||
1138 | state = 'E'; /* Enabled */ | ||
1139 | |||
1140 | DMEMIT("%c ", state); | ||
1141 | |||
1142 | if (pg->ps.type->status) | ||
1143 | sz += pg->ps.type->status(&pg->ps, NULL, type, | ||
1144 | result + sz, | ||
1145 | maxlen - sz); | ||
1146 | else | ||
1147 | DMEMIT("0 "); | ||
1148 | |||
1149 | DMEMIT("%u %u ", pg->nr_pgpaths, | ||
1150 | pg->ps.type->info_args); | ||
1151 | |||
1152 | list_for_each_entry(p, &pg->pgpaths, list) { | ||
1153 | DMEMIT("%s %s %u ", p->path.dev->name, | ||
1154 | p->path.is_active ? "A" : "F", | ||
1155 | p->fail_count); | ||
1156 | if (pg->ps.type->status) | ||
1157 | sz += pg->ps.type->status(&pg->ps, | ||
1158 | &p->path, type, result + sz, | ||
1159 | maxlen - sz); | ||
1160 | } | ||
1161 | } | ||
1162 | break; | ||
1163 | |||
1164 | case STATUSTYPE_TABLE: | ||
1165 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
1166 | DMEMIT("%s ", pg->ps.type->name); | ||
1167 | |||
1168 | if (pg->ps.type->status) | ||
1169 | sz += pg->ps.type->status(&pg->ps, NULL, type, | ||
1170 | result + sz, | ||
1171 | maxlen - sz); | ||
1172 | else | ||
1173 | DMEMIT("0 "); | ||
1174 | |||
1175 | DMEMIT("%u %u ", pg->nr_pgpaths, | ||
1176 | pg->ps.type->table_args); | ||
1177 | |||
1178 | list_for_each_entry(p, &pg->pgpaths, list) { | ||
1179 | DMEMIT("%s ", p->path.dev->name); | ||
1180 | if (pg->ps.type->status) | ||
1181 | sz += pg->ps.type->status(&pg->ps, | ||
1182 | &p->path, type, result + sz, | ||
1183 | maxlen - sz); | ||
1184 | } | ||
1185 | } | ||
1186 | break; | ||
1187 | } | ||
1188 | |||
1189 | spin_unlock_irqrestore(&m->lock, flags); | ||
1190 | |||
1191 | return 0; | ||
1192 | } | ||
1193 | |||
1194 | static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) | ||
1195 | { | ||
1196 | int r; | ||
1197 | struct dm_dev *dev; | ||
1198 | struct multipath *m = (struct multipath *) ti->private; | ||
1199 | action_fn action; | ||
1200 | |||
1201 | if (argc == 1) { | ||
1202 | if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) | ||
1203 | return queue_if_no_path(m, 1); | ||
1204 | else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) | ||
1205 | return queue_if_no_path(m, 0); | ||
1206 | } | ||
1207 | |||
1208 | if (argc != 2) | ||
1209 | goto error; | ||
1210 | |||
1211 | if (!strnicmp(argv[0], MESG_STR("disable_group"))) | ||
1212 | return bypass_pg_num(m, argv[1], 1); | ||
1213 | else if (!strnicmp(argv[0], MESG_STR("enable_group"))) | ||
1214 | return bypass_pg_num(m, argv[1], 0); | ||
1215 | else if (!strnicmp(argv[0], MESG_STR("switch_group"))) | ||
1216 | return switch_pg_num(m, argv[1]); | ||
1217 | else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) | ||
1218 | action = reinstate_path; | ||
1219 | else if (!strnicmp(argv[0], MESG_STR("fail_path"))) | ||
1220 | action = fail_path; | ||
1221 | else | ||
1222 | goto error; | ||
1223 | |||
1224 | r = dm_get_device(ti, argv[1], ti->begin, ti->len, | ||
1225 | dm_table_get_mode(ti->table), &dev); | ||
1226 | if (r) { | ||
1227 | DMWARN("dm-multipath message: error getting device %s", | ||
1228 | argv[1]); | ||
1229 | return -EINVAL; | ||
1230 | } | ||
1231 | |||
1232 | r = action_dev(m, dev, action); | ||
1233 | |||
1234 | dm_put_device(ti, dev); | ||
1235 | |||
1236 | return r; | ||
1237 | |||
1238 | error: | ||
1239 | DMWARN("Unrecognised multipath message received."); | ||
1240 | return -EINVAL; | ||
1241 | } | ||
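The message strings parsed above can be sent from user space with dmsetup's message command (the target name, sector 0 and device numbers below are hypothetical; path arguments are resolved via dm_get_device(), so a block device path works too):

    dmsetup message mpath0 0 fail_path 8:32
    dmsetup message mpath0 0 reinstate_path 8:32
    dmsetup message mpath0 0 disable_group 2
    dmsetup message mpath0 0 enable_group 2
    dmsetup message mpath0 0 switch_group 2
    dmsetup message mpath0 0 queue_if_no_path
    dmsetup message mpath0 0 fail_if_no_path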
1242 | |||
1243 | /*----------------------------------------------------------------- | ||
1244 | * Module setup | ||
1245 | *---------------------------------------------------------------*/ | ||
1246 | static struct target_type multipath_target = { | ||
1247 | .name = "multipath", | ||
1248 | .version = {1, 0, 4}, | ||
1249 | .module = THIS_MODULE, | ||
1250 | .ctr = multipath_ctr, | ||
1251 | .dtr = multipath_dtr, | ||
1252 | .map = multipath_map, | ||
1253 | .end_io = multipath_end_io, | ||
1254 | .presuspend = multipath_presuspend, | ||
1255 | .resume = multipath_resume, | ||
1256 | .status = multipath_status, | ||
1257 | .message = multipath_message, | ||
1258 | }; | ||
1259 | |||
1260 | static int __init dm_multipath_init(void) | ||
1261 | { | ||
1262 | int r; | ||
1263 | |||
1264 | /* allocate a slab for the dm_ios */ | ||
1265 | _mpio_cache = kmem_cache_create("dm_mpath", sizeof(struct mpath_io), | ||
1266 | 0, 0, NULL, NULL); | ||
1267 | if (!_mpio_cache) | ||
1268 | return -ENOMEM; | ||
1269 | |||
1270 | r = dm_register_target(&multipath_target); | ||
1271 | if (r < 0) { | ||
1272 | DMERR("%s: register failed %d", multipath_target.name, r); | ||
1273 | kmem_cache_destroy(_mpio_cache); | ||
1274 | return -EINVAL; | ||
1275 | } | ||
1276 | |||
1277 | DMINFO("dm-multipath version %u.%u.%u loaded", | ||
1278 | multipath_target.version[0], multipath_target.version[1], | ||
1279 | multipath_target.version[2]); | ||
1280 | |||
1281 | return r; | ||
1282 | } | ||
1283 | |||
1284 | static void __exit dm_multipath_exit(void) | ||
1285 | { | ||
1286 | int r; | ||
1287 | |||
1288 | r = dm_unregister_target(&multipath_target); | ||
1289 | if (r < 0) | ||
1290 | DMERR("%s: target unregister failed %d", | ||
1291 | multipath_target.name, r); | ||
1292 | kmem_cache_destroy(_mpio_cache); | ||
1293 | } | ||
1294 | |||
1295 | EXPORT_SYMBOL_GPL(dm_pg_init_complete); | ||
1296 | |||
1297 | module_init(dm_multipath_init); | ||
1298 | module_exit(dm_multipath_exit); | ||
1299 | |||
1300 | MODULE_DESCRIPTION(DM_NAME " multipath target"); | ||
1301 | MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); | ||
1302 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h new file mode 100644 index 000000000000..8a4bf2b6d52e --- /dev/null +++ b/drivers/md/dm-mpath.h | |||
@@ -0,0 +1,25 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | * | ||
6 | * Multipath. | ||
7 | */ | ||
8 | |||
9 | #ifndef DM_MPATH_H | ||
10 | #define DM_MPATH_H | ||
11 | |||
12 | struct dm_dev; | ||
13 | |||
14 | struct path { | ||
15 | struct dm_dev *dev; /* Read-only */ | ||
16 | unsigned is_active; /* Read-only */ | ||
17 | |||
18 | void *pscontext; /* For path-selector use */ | ||
19 | void *hwhcontext; /* For hw-handler use */ | ||
20 | }; | ||
21 | |||
22 | /* Callback for hwh_pg_init_fn to use when complete */ | ||
23 | void dm_pg_init_complete(struct path *path, unsigned err_flags); | ||
24 | |||
25 | #endif | ||
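A minimal sketch of the completion contract, assuming the pg_init hook signature used by dm-mpath.c above (hw_handler, bypassed flag, path) with a void return; a real handler would normally issue device commands first and pass MP_FAIL_PATH and/or MP_BYPASS_PG in err_flags on error:

/* Sketch only: a 'null' hardware handler whose pg_init needs no device
 * commands and reports success immediately, letting queued I/O restart. */
static void null_pg_init(struct hw_handler *hwh, unsigned bypassed,
			 struct path *path)
{
	dm_pg_init_complete(path, 0);	/* no error flags */
}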
diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c new file mode 100644 index 000000000000..ac5c4bbec6c1 --- /dev/null +++ b/drivers/md/dm-path-selector.c | |||
@@ -0,0 +1,156 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software. | ||
3 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * Module Author: Heinz Mauelshagen | ||
6 | * | ||
7 | * This file is released under the GPL. | ||
8 | * | ||
9 | * Path selector registration. | ||
10 | */ | ||
11 | |||
12 | #include "dm.h" | ||
13 | #include "dm-path-selector.h" | ||
14 | |||
15 | #include <linux/slab.h> | ||
16 | |||
17 | struct ps_internal { | ||
18 | struct path_selector_type pst; | ||
19 | |||
20 | struct list_head list; | ||
21 | long use; | ||
22 | }; | ||
23 | |||
24 | #define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst) | ||
25 | |||
26 | static LIST_HEAD(_path_selectors); | ||
27 | static DECLARE_RWSEM(_ps_lock); | ||
28 | |||
29 | static struct ps_internal *__find_path_selector_type(const char *name) | ||
30 | { | ||
31 | struct ps_internal *psi; | ||
32 | |||
33 | list_for_each_entry(psi, &_path_selectors, list) { | ||
34 | if (!strcmp(name, psi->pst.name)) | ||
35 | return psi; | ||
36 | } | ||
37 | |||
38 | return NULL; | ||
39 | } | ||
40 | |||
41 | static struct ps_internal *get_path_selector(const char *name) | ||
42 | { | ||
43 | struct ps_internal *psi; | ||
44 | |||
45 | down_read(&_ps_lock); | ||
46 | psi = __find_path_selector_type(name); | ||
47 | if (psi) { | ||
48 | if ((psi->use == 0) && !try_module_get(psi->pst.module)) | ||
49 | psi = NULL; | ||
50 | else | ||
51 | psi->use++; | ||
52 | } | ||
53 | up_read(&_ps_lock); | ||
54 | |||
55 | return psi; | ||
56 | } | ||
57 | |||
58 | struct path_selector_type *dm_get_path_selector(const char *name) | ||
59 | { | ||
60 | struct ps_internal *psi; | ||
61 | |||
62 | if (!name) | ||
63 | return NULL; | ||
64 | |||
65 | psi = get_path_selector(name); | ||
66 | if (!psi) { | ||
67 | request_module("dm-%s", name); | ||
68 | psi = get_path_selector(name); | ||
69 | } | ||
70 | |||
71 | return psi ? &psi->pst : NULL; | ||
72 | } | ||
73 | |||
74 | void dm_put_path_selector(struct path_selector_type *pst) | ||
75 | { | ||
76 | struct ps_internal *psi; | ||
77 | |||
78 | if (!pst) | ||
79 | return; | ||
80 | |||
81 | down_read(&_ps_lock); | ||
82 | psi = __find_path_selector_type(pst->name); | ||
83 | if (!psi) | ||
84 | goto out; | ||
85 | |||
86 | if (--psi->use == 0) | ||
87 | module_put(psi->pst.module); | ||
88 | |||
89 | if (psi->use < 0) | ||
90 | BUG(); | ||
91 | |||
92 | out: | ||
93 | up_read(&_ps_lock); | ||
94 | } | ||
95 | |||
96 | static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst) | ||
97 | { | ||
98 | struct ps_internal *psi = kmalloc(sizeof(*psi), GFP_KERNEL); | ||
99 | |||
100 | if (psi) { | ||
101 | memset(psi, 0, sizeof(*psi)); | ||
102 | psi->pst = *pst; | ||
103 | } | ||
104 | |||
105 | return psi; | ||
106 | } | ||
107 | |||
108 | int dm_register_path_selector(struct path_selector_type *pst) | ||
109 | { | ||
110 | int r = 0; | ||
111 | struct ps_internal *psi = _alloc_path_selector(pst); | ||
112 | |||
113 | if (!psi) | ||
114 | return -ENOMEM; | ||
115 | |||
116 | down_write(&_ps_lock); | ||
117 | |||
118 | if (__find_path_selector_type(pst->name)) { | ||
119 | kfree(psi); | ||
120 | r = -EEXIST; | ||
121 | } else | ||
122 | list_add(&psi->list, &_path_selectors); | ||
123 | |||
124 | up_write(&_ps_lock); | ||
125 | |||
126 | return r; | ||
127 | } | ||
128 | |||
129 | int dm_unregister_path_selector(struct path_selector_type *pst) | ||
130 | { | ||
131 | struct ps_internal *psi; | ||
132 | |||
133 | down_write(&_ps_lock); | ||
134 | |||
135 | psi = __find_path_selector_type(pst->name); | ||
136 | if (!psi) { | ||
137 | up_write(&_ps_lock); | ||
138 | return -EINVAL; | ||
139 | } | ||
140 | |||
141 | if (psi->use) { | ||
142 | up_write(&_ps_lock); | ||
143 | return -ETXTBSY; | ||
144 | } | ||
145 | |||
146 | list_del(&psi->list); | ||
147 | |||
148 | up_write(&_ps_lock); | ||
149 | |||
150 | kfree(psi); | ||
151 | |||
152 | return 0; | ||
153 | } | ||
154 | |||
155 | EXPORT_SYMBOL_GPL(dm_register_path_selector); | ||
156 | EXPORT_SYMBOL_GPL(dm_unregister_path_selector); | ||
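From a caller's point of view, dm_get_path_selector() above does two jobs: it looks the type up and, if that fails, asks kmod to load a module named dm-<name> before retrying, taking a module reference that dm_put_path_selector() later drops. A rough sketch of the intended call pattern follows; the wrapper function itself is illustrative, with error handling trimmed to the essentials.

#include "dm-path-selector.h"

/* Must run in process context: request_module() may sleep. */
static int example_bind_selector(const char *name)
{
	struct path_selector_type *pst;

	pst = dm_get_path_selector(name);	/* may load dm-<name>.ko */
	if (!pst)
		return -EINVAL;

	/* ... create a selector instance, add paths, route io ... */

	dm_put_path_selector(pst);		/* drop the module reference */
	return 0;
}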
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h new file mode 100644 index 000000000000..732d06a84f85 --- /dev/null +++ b/drivers/md/dm-path-selector.h | |||
@@ -0,0 +1,93 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software. | ||
3 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * Module Author: Heinz Mauelshagen | ||
6 | * | ||
7 | * This file is released under the GPL. | ||
8 | * | ||
9 | * Path-Selector registration. | ||
10 | */ | ||
11 | |||
12 | #ifndef DM_PATH_SELECTOR_H | ||
13 | #define DM_PATH_SELECTOR_H | ||
14 | |||
15 | #include <linux/device-mapper.h> | ||
16 | |||
17 | #include "dm-mpath.h" | ||
18 | |||
19 | /* | ||
20 | * We provide an abstraction for the code that chooses which path | ||
21 | * to send some io down. | ||
22 | */ | ||
23 | struct path_selector_type; | ||
24 | struct path_selector { | ||
25 | struct path_selector_type *type; | ||
26 | void *context; | ||
27 | }; | ||
28 | |||
29 | /* Information about a path selector type */ | ||
30 | struct path_selector_type { | ||
31 | char *name; | ||
32 | struct module *module; | ||
33 | |||
34 | unsigned int table_args; | ||
35 | unsigned int info_args; | ||
36 | |||
37 | /* | ||
38 | * Constructs a path selector object, takes custom arguments | ||
39 | */ | ||
40 | int (*create) (struct path_selector *ps, unsigned argc, char **argv); | ||
41 | void (*destroy) (struct path_selector *ps); | ||
42 | |||
43 | /* | ||
44 | 	 * Add an opaque path object, along with some selector-specific | ||
45 | 	 * path args (e.g. path priority). | ||
46 | */ | ||
47 | int (*add_path) (struct path_selector *ps, struct path *path, | ||
48 | int argc, char **argv, char **error); | ||
49 | |||
50 | /* | ||
51 | 	 * Chooses a path for this io; if no paths are available then | ||
52 | * NULL will be returned. | ||
53 | * | ||
54 | * repeat_count is the number of times to use the path before | ||
55 | * calling the function again. 0 means don't call it again unless | ||
56 | * the path fails. | ||
57 | */ | ||
58 | struct path *(*select_path) (struct path_selector *ps, | ||
59 | unsigned *repeat_count); | ||
60 | |||
61 | /* | ||
62 | * Notify the selector that a path has failed. | ||
63 | */ | ||
64 | void (*fail_path) (struct path_selector *ps, struct path *p); | ||
65 | |||
66 | /* | ||
67 | * Ask selector to reinstate a path. | ||
68 | */ | ||
69 | int (*reinstate_path) (struct path_selector *ps, struct path *p); | ||
70 | |||
71 | /* | ||
72 | 	 * Table content based on parameters added in add_path | ||
73 | 	 * or path selector status. | ||
74 | */ | ||
75 | int (*status) (struct path_selector *ps, struct path *path, | ||
76 | status_type_t type, char *result, unsigned int maxlen); | ||
77 | |||
78 | int (*end_io) (struct path_selector *ps, struct path *path); | ||
79 | }; | ||
80 | |||
81 | /* Register a path selector */ | ||
82 | int dm_register_path_selector(struct path_selector_type *type); | ||
83 | |||
84 | /* Unregister a path selector */ | ||
85 | int dm_unregister_path_selector(struct path_selector_type *type); | ||
86 | |||
87 | /* Returns a registered path selector type */ | ||
88 | struct path_selector_type *dm_get_path_selector(const char *name); | ||
89 | |||
90 | /* Releases a path selector */ | ||
91 | void dm_put_path_selector(struct path_selector_type *pst); | ||
92 | |||
93 | #endif | ||
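The select_path() contract above leaves path caching to the caller: the selector returns a path plus a repeat_count, the core is expected to reuse that path for repeat_count I/Os (or until it fails) before asking again, and a repeat_count of 0 means keep the path until it fails. Here is a hedged sketch of that caller-side bookkeeping; the cache structure is invented for illustration and is not a dm-mpath type.

#include "dm-path-selector.h"

struct cached_path {
	struct path *path;
	unsigned remaining;	/* I/Os left before re-selecting */
};

static struct path *example_next_path(struct path_selector *ps,
				      struct cached_path *c)
{
	if (!c->path || (c->remaining && !--c->remaining)) {
		c->path = ps->type->select_path(ps, &c->remaining);
		/* remaining == 0 now means "keep this path until it fails" */
	}

	return c->path;		/* NULL if no usable path is left */
}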
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c new file mode 100644 index 000000000000..6e3cf7e13451 --- /dev/null +++ b/drivers/md/dm-raid1.c | |||
@@ -0,0 +1,1269 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software Limited. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm.h" | ||
8 | #include "dm-bio-list.h" | ||
9 | #include "dm-io.h" | ||
10 | #include "dm-log.h" | ||
11 | #include "kcopyd.h" | ||
12 | |||
13 | #include <linux/ctype.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/mempool.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/pagemap.h> | ||
18 | #include <linux/slab.h> | ||
19 | #include <linux/time.h> | ||
20 | #include <linux/vmalloc.h> | ||
21 | #include <linux/workqueue.h> | ||
22 | |||
23 | static struct workqueue_struct *_kmirrord_wq; | ||
24 | static struct work_struct _kmirrord_work; | ||
25 | |||
26 | static inline void wake(void) | ||
27 | { | ||
28 | queue_work(_kmirrord_wq, &_kmirrord_work); | ||
29 | } | ||
30 | |||
31 | /*----------------------------------------------------------------- | ||
32 | * Region hash | ||
33 | * | ||
34 | * The mirror splits itself up into discrete regions. Each | ||
35 | * region can be in one of three states: clean, dirty, | ||
36 | * nosync. There is no need to put clean regions in the hash. | ||
37 | * | ||
38 | * In addition to being present in the hash table a region _may_ | ||
39 | * be present on one of three lists. | ||
40 | * | ||
41 | * clean_regions: Regions on this list have no io pending to | ||
42 |  * them; they are in sync; we are no longer interested in them; | ||
43 | * they are dull. rh_update_states() will remove them from the | ||
44 | * hash table. | ||
45 | * | ||
46 | * quiesced_regions: These regions have been spun down, ready | ||
47 | * for recovery. rh_recovery_start() will remove regions from | ||
48 | * this list and hand them to kmirrord, which will schedule the | ||
49 | * recovery io with kcopyd. | ||
50 | * | ||
51 | * recovered_regions: Regions that kcopyd has successfully | ||
52 | * recovered. rh_update_states() will now schedule any delayed | ||
53 | * io, up the recovery_count, and remove the region from the | ||
54 | * hash. | ||
55 | * | ||
56 | * There are 2 locks: | ||
57 |  * A rw spin lock 'hash_lock' protects just the hash table; | ||
58 |  * it is never held in write mode from interrupt context, | ||
59 |  * which means that we only have to disable irqs when | ||
60 |  * taking the write lock. | ||
61 | * | ||
62 | * An ordinary spin lock 'region_lock' that protects the three | ||
63 | * lists in the region_hash, with the 'state', 'list' and | ||
64 | * 'bhs_delayed' fields of the regions. This is used from irq | ||
65 | * context, so all other uses will have to suspend local irqs. | ||
66 | *---------------------------------------------------------------*/ | ||
67 | struct mirror_set; | ||
68 | struct region_hash { | ||
69 | struct mirror_set *ms; | ||
70 | uint32_t region_size; | ||
71 | unsigned region_shift; | ||
72 | |||
73 | /* holds persistent region state */ | ||
74 | struct dirty_log *log; | ||
75 | |||
76 | /* hash table */ | ||
77 | rwlock_t hash_lock; | ||
78 | mempool_t *region_pool; | ||
79 | unsigned int mask; | ||
80 | unsigned int nr_buckets; | ||
81 | struct list_head *buckets; | ||
82 | |||
83 | spinlock_t region_lock; | ||
84 | struct semaphore recovery_count; | ||
85 | struct list_head clean_regions; | ||
86 | struct list_head quiesced_regions; | ||
87 | struct list_head recovered_regions; | ||
88 | }; | ||
89 | |||
90 | enum { | ||
91 | RH_CLEAN, | ||
92 | RH_DIRTY, | ||
93 | RH_NOSYNC, | ||
94 | RH_RECOVERING | ||
95 | }; | ||
96 | |||
97 | struct region { | ||
98 | struct region_hash *rh; /* FIXME: can we get rid of this ? */ | ||
99 | region_t key; | ||
100 | int state; | ||
101 | |||
102 | struct list_head hash_list; | ||
103 | struct list_head list; | ||
104 | |||
105 | atomic_t pending; | ||
106 | struct bio_list delayed_bios; | ||
107 | }; | ||
108 | |||
109 | /* | ||
110 | * Conversion fns | ||
111 | */ | ||
112 | static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio) | ||
113 | { | ||
114 | return bio->bi_sector >> rh->region_shift; | ||
115 | } | ||
116 | |||
117 | static inline sector_t region_to_sector(struct region_hash *rh, region_t region) | ||
118 | { | ||
119 | return region << rh->region_shift; | ||
120 | } | ||
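Since the mirror constructor only accepts power-of-two region sizes, region_shift is simply log2(region_size) (computed as ffs(region_size) - 1 in rh_init() below), and the two conversions above reduce to shifts. A small userspace check of the arithmetic, with illustrative numbers:

#include <assert.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	unsigned region_size = 1024;			/* sectors, i.e. 512 KiB */
	unsigned region_shift = ffs(region_size) - 1;	/* = 10 */

	assert(region_shift == 10);
	assert((5000u >> region_shift) == 4);	/* sector 5000 lies in region 4 */
	assert((4u << region_shift) == 4096);	/* region 4 starts at sector 4096 */
	return 0;
}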
121 | |||
122 | /* FIXME move this */ | ||
123 | static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw); | ||
124 | |||
125 | static void *region_alloc(unsigned int __nocast gfp_mask, void *pool_data) | ||
126 | { | ||
127 | return kmalloc(sizeof(struct region), gfp_mask); | ||
128 | } | ||
129 | |||
130 | static void region_free(void *element, void *pool_data) | ||
131 | { | ||
132 | kfree(element); | ||
133 | } | ||
134 | |||
135 | #define MIN_REGIONS 64 | ||
136 | #define MAX_RECOVERY 1 | ||
137 | static int rh_init(struct region_hash *rh, struct mirror_set *ms, | ||
138 | struct dirty_log *log, uint32_t region_size, | ||
139 | region_t nr_regions) | ||
140 | { | ||
141 | unsigned int nr_buckets, max_buckets; | ||
142 | size_t i; | ||
143 | |||
144 | /* | ||
145 | * Calculate a suitable number of buckets for our hash | ||
146 | * table. | ||
147 | */ | ||
148 | max_buckets = nr_regions >> 6; | ||
149 | for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) | ||
150 | ; | ||
151 | nr_buckets >>= 1; | ||
152 | |||
153 | rh->ms = ms; | ||
154 | rh->log = log; | ||
155 | rh->region_size = region_size; | ||
156 | rh->region_shift = ffs(region_size) - 1; | ||
157 | rwlock_init(&rh->hash_lock); | ||
158 | rh->mask = nr_buckets - 1; | ||
159 | rh->nr_buckets = nr_buckets; | ||
160 | |||
161 | rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); | ||
162 | if (!rh->buckets) { | ||
163 | DMERR("unable to allocate region hash memory"); | ||
164 | return -ENOMEM; | ||
165 | } | ||
166 | |||
167 | for (i = 0; i < nr_buckets; i++) | ||
168 | INIT_LIST_HEAD(rh->buckets + i); | ||
169 | |||
170 | spin_lock_init(&rh->region_lock); | ||
171 | sema_init(&rh->recovery_count, 0); | ||
172 | INIT_LIST_HEAD(&rh->clean_regions); | ||
173 | INIT_LIST_HEAD(&rh->quiesced_regions); | ||
174 | INIT_LIST_HEAD(&rh->recovered_regions); | ||
175 | |||
176 | rh->region_pool = mempool_create(MIN_REGIONS, region_alloc, | ||
177 | region_free, NULL); | ||
178 | if (!rh->region_pool) { | ||
179 | vfree(rh->buckets); | ||
180 | rh->buckets = NULL; | ||
181 | return -ENOMEM; | ||
182 | } | ||
183 | |||
184 | return 0; | ||
185 | } | ||
186 | |||
187 | static void rh_exit(struct region_hash *rh) | ||
188 | { | ||
189 | unsigned int h; | ||
190 | struct region *reg, *nreg; | ||
191 | |||
192 | BUG_ON(!list_empty(&rh->quiesced_regions)); | ||
193 | for (h = 0; h < rh->nr_buckets; h++) { | ||
194 | list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) { | ||
195 | BUG_ON(atomic_read(®->pending)); | ||
196 | mempool_free(reg, rh->region_pool); | ||
197 | } | ||
198 | } | ||
199 | |||
200 | if (rh->log) | ||
201 | dm_destroy_dirty_log(rh->log); | ||
202 | if (rh->region_pool) | ||
203 | mempool_destroy(rh->region_pool); | ||
204 | vfree(rh->buckets); | ||
205 | } | ||
206 | |||
207 | #define RH_HASH_MULT 2654435387U | ||
208 | |||
209 | static inline unsigned int rh_hash(struct region_hash *rh, region_t region) | ||
210 | { | ||
211 | return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask; | ||
212 | } | ||
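rh_hash() spreads region numbers over the buckets with a multiplicative hash: the key is multiplied by the large constant RH_HASH_MULT, shifted right by 12 so the better-mixed upper bits are used, and masked to the power-of-two bucket count chosen in rh_init(). A quick userspace illustration, assuming the minimum of 64 buckets (mask 63):

#include <stdint.h>
#include <stdio.h>

#define RH_HASH_MULT 2654435387U

int main(void)
{
	uint64_t region;
	unsigned mask = 63;	/* nr_buckets = 64 */

	for (region = 0; region < 4; region++)
		printf("region %llu -> bucket %u\n",
		       (unsigned long long)region,
		       (unsigned)((region * RH_HASH_MULT) >> 12) & mask);
	return 0;
}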
213 | |||
214 | static struct region *__rh_lookup(struct region_hash *rh, region_t region) | ||
215 | { | ||
216 | struct region *reg; | ||
217 | |||
218 | list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list) | ||
219 | if (reg->key == region) | ||
220 | return reg; | ||
221 | |||
222 | return NULL; | ||
223 | } | ||
224 | |||
225 | static void __rh_insert(struct region_hash *rh, struct region *reg) | ||
226 | { | ||
227 | unsigned int h = rh_hash(rh, reg->key); | ||
228 | list_add(®->hash_list, rh->buckets + h); | ||
229 | } | ||
230 | |||
231 | static struct region *__rh_alloc(struct region_hash *rh, region_t region) | ||
232 | { | ||
233 | struct region *reg, *nreg; | ||
234 | |||
235 | read_unlock(&rh->hash_lock); | ||
236 | nreg = mempool_alloc(rh->region_pool, GFP_NOIO); | ||
237 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? | ||
238 | RH_CLEAN : RH_NOSYNC; | ||
239 | nreg->rh = rh; | ||
240 | nreg->key = region; | ||
241 | |||
242 | INIT_LIST_HEAD(&nreg->list); | ||
243 | |||
244 | atomic_set(&nreg->pending, 0); | ||
245 | bio_list_init(&nreg->delayed_bios); | ||
246 | write_lock_irq(&rh->hash_lock); | ||
247 | |||
248 | reg = __rh_lookup(rh, region); | ||
249 | if (reg) | ||
250 | /* we lost the race */ | ||
251 | mempool_free(nreg, rh->region_pool); | ||
252 | |||
253 | else { | ||
254 | __rh_insert(rh, nreg); | ||
255 | if (nreg->state == RH_CLEAN) { | ||
256 | spin_lock(&rh->region_lock); | ||
257 | list_add(&nreg->list, &rh->clean_regions); | ||
258 | spin_unlock(&rh->region_lock); | ||
259 | } | ||
260 | reg = nreg; | ||
261 | } | ||
262 | write_unlock_irq(&rh->hash_lock); | ||
263 | read_lock(&rh->hash_lock); | ||
264 | |||
265 | return reg; | ||
266 | } | ||
267 | |||
268 | static inline struct region *__rh_find(struct region_hash *rh, region_t region) | ||
269 | { | ||
270 | struct region *reg; | ||
271 | |||
272 | reg = __rh_lookup(rh, region); | ||
273 | if (!reg) | ||
274 | reg = __rh_alloc(rh, region); | ||
275 | |||
276 | return reg; | ||
277 | } | ||
278 | |||
279 | static int rh_state(struct region_hash *rh, region_t region, int may_block) | ||
280 | { | ||
281 | int r; | ||
282 | struct region *reg; | ||
283 | |||
284 | read_lock(&rh->hash_lock); | ||
285 | reg = __rh_lookup(rh, region); | ||
286 | read_unlock(&rh->hash_lock); | ||
287 | |||
288 | if (reg) | ||
289 | return reg->state; | ||
290 | |||
291 | /* | ||
292 | * The region wasn't in the hash, so we fall back to the | ||
293 | * dirty log. | ||
294 | */ | ||
295 | r = rh->log->type->in_sync(rh->log, region, may_block); | ||
296 | |||
297 | /* | ||
298 | 	 * Any error from the dirty log (e.g. -EWOULDBLOCK) gets | ||
299 | 	 * treated as RH_NOSYNC. | ||
300 | */ | ||
301 | return r == 1 ? RH_CLEAN : RH_NOSYNC; | ||
302 | } | ||
303 | |||
304 | static inline int rh_in_sync(struct region_hash *rh, | ||
305 | region_t region, int may_block) | ||
306 | { | ||
307 | int state = rh_state(rh, region, may_block); | ||
308 | return state == RH_CLEAN || state == RH_DIRTY; | ||
309 | } | ||
310 | |||
311 | static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list) | ||
312 | { | ||
313 | struct bio *bio; | ||
314 | |||
315 | while ((bio = bio_list_pop(bio_list))) { | ||
316 | queue_bio(ms, bio, WRITE); | ||
317 | } | ||
318 | } | ||
319 | |||
320 | static void rh_update_states(struct region_hash *rh) | ||
321 | { | ||
322 | struct region *reg, *next; | ||
323 | |||
324 | LIST_HEAD(clean); | ||
325 | LIST_HEAD(recovered); | ||
326 | |||
327 | /* | ||
328 | * Quickly grab the lists. | ||
329 | */ | ||
330 | write_lock_irq(&rh->hash_lock); | ||
331 | spin_lock(&rh->region_lock); | ||
332 | if (!list_empty(&rh->clean_regions)) { | ||
333 | list_splice(&rh->clean_regions, &clean); | ||
334 | INIT_LIST_HEAD(&rh->clean_regions); | ||
335 | |||
336 | list_for_each_entry (reg, &clean, list) { | ||
337 | rh->log->type->clear_region(rh->log, reg->key); | ||
338 | list_del(®->hash_list); | ||
339 | } | ||
340 | } | ||
341 | |||
342 | if (!list_empty(&rh->recovered_regions)) { | ||
343 | list_splice(&rh->recovered_regions, &recovered); | ||
344 | INIT_LIST_HEAD(&rh->recovered_regions); | ||
345 | |||
346 | list_for_each_entry (reg, &recovered, list) | ||
347 | list_del(®->hash_list); | ||
348 | } | ||
349 | spin_unlock(&rh->region_lock); | ||
350 | write_unlock_irq(&rh->hash_lock); | ||
351 | |||
352 | /* | ||
353 | * All the regions on the recovered and clean lists have | ||
354 | * now been pulled out of the system, so no need to do | ||
355 | * any more locking. | ||
356 | */ | ||
357 | list_for_each_entry_safe (reg, next, &recovered, list) { | ||
358 | rh->log->type->clear_region(rh->log, reg->key); | ||
359 | rh->log->type->complete_resync_work(rh->log, reg->key, 1); | ||
360 | dispatch_bios(rh->ms, ®->delayed_bios); | ||
361 | up(&rh->recovery_count); | ||
362 | mempool_free(reg, rh->region_pool); | ||
363 | } | ||
364 | |||
365 | if (!list_empty(&recovered)) | ||
366 | rh->log->type->flush(rh->log); | ||
367 | |||
368 | list_for_each_entry_safe (reg, next, &clean, list) | ||
369 | mempool_free(reg, rh->region_pool); | ||
370 | } | ||
371 | |||
372 | static void rh_inc(struct region_hash *rh, region_t region) | ||
373 | { | ||
374 | struct region *reg; | ||
375 | |||
376 | read_lock(&rh->hash_lock); | ||
377 | reg = __rh_find(rh, region); | ||
378 | if (reg->state == RH_CLEAN) { | ||
379 | rh->log->type->mark_region(rh->log, reg->key); | ||
380 | |||
381 | spin_lock_irq(&rh->region_lock); | ||
382 | reg->state = RH_DIRTY; | ||
383 | list_del_init(®->list); /* take off the clean list */ | ||
384 | spin_unlock_irq(&rh->region_lock); | ||
385 | } | ||
386 | |||
387 | atomic_inc(®->pending); | ||
388 | read_unlock(&rh->hash_lock); | ||
389 | } | ||
390 | |||
391 | static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios) | ||
392 | { | ||
393 | struct bio *bio; | ||
394 | |||
395 | for (bio = bios->head; bio; bio = bio->bi_next) | ||
396 | rh_inc(rh, bio_to_region(rh, bio)); | ||
397 | } | ||
398 | |||
399 | static void rh_dec(struct region_hash *rh, region_t region) | ||
400 | { | ||
401 | unsigned long flags; | ||
402 | struct region *reg; | ||
403 | int should_wake = 0; | ||
404 | |||
405 | read_lock(&rh->hash_lock); | ||
406 | reg = __rh_lookup(rh, region); | ||
407 | read_unlock(&rh->hash_lock); | ||
408 | |||
409 | if (atomic_dec_and_test(®->pending)) { | ||
410 | spin_lock_irqsave(&rh->region_lock, flags); | ||
411 | if (reg->state == RH_RECOVERING) { | ||
412 | list_add_tail(®->list, &rh->quiesced_regions); | ||
413 | } else { | ||
414 | reg->state = RH_CLEAN; | ||
415 | list_add(®->list, &rh->clean_regions); | ||
416 | } | ||
417 | spin_unlock_irqrestore(&rh->region_lock, flags); | ||
418 | should_wake = 1; | ||
419 | } | ||
420 | |||
421 | if (should_wake) | ||
422 | wake(); | ||
423 | } | ||
424 | |||
425 | /* | ||
426 | * Starts quiescing a region in preparation for recovery. | ||
427 | */ | ||
428 | static int __rh_recovery_prepare(struct region_hash *rh) | ||
429 | { | ||
430 | int r; | ||
431 | struct region *reg; | ||
432 | region_t region; | ||
433 | |||
434 | /* | ||
435 | * Ask the dirty log what's next. | ||
436 | */ | ||
437 | r = rh->log->type->get_resync_work(rh->log, ®ion); | ||
438 | if (r <= 0) | ||
439 | return r; | ||
440 | |||
441 | /* | ||
442 | * Get this region, and start it quiescing by setting the | ||
443 | * recovering flag. | ||
444 | */ | ||
445 | read_lock(&rh->hash_lock); | ||
446 | reg = __rh_find(rh, region); | ||
447 | read_unlock(&rh->hash_lock); | ||
448 | |||
449 | spin_lock_irq(&rh->region_lock); | ||
450 | reg->state = RH_RECOVERING; | ||
451 | |||
452 | /* Already quiesced ? */ | ||
453 | if (atomic_read(®->pending)) | ||
454 | list_del_init(®->list); | ||
455 | |||
456 | else { | ||
457 | list_del_init(®->list); | ||
458 | list_add(®->list, &rh->quiesced_regions); | ||
459 | } | ||
460 | spin_unlock_irq(&rh->region_lock); | ||
461 | |||
462 | return 1; | ||
463 | } | ||
464 | |||
465 | static void rh_recovery_prepare(struct region_hash *rh) | ||
466 | { | ||
467 | while (!down_trylock(&rh->recovery_count)) | ||
468 | if (__rh_recovery_prepare(rh) <= 0) { | ||
469 | up(&rh->recovery_count); | ||
470 | break; | ||
471 | } | ||
472 | } | ||
473 | |||
474 | /* | ||
475 |  * Returns a quiesced region, or NULL if none are ready. | ||
476 | */ | ||
477 | static struct region *rh_recovery_start(struct region_hash *rh) | ||
478 | { | ||
479 | struct region *reg = NULL; | ||
480 | |||
481 | spin_lock_irq(&rh->region_lock); | ||
482 | if (!list_empty(&rh->quiesced_regions)) { | ||
483 | reg = list_entry(rh->quiesced_regions.next, | ||
484 | struct region, list); | ||
485 | list_del_init(®->list); /* remove from the quiesced list */ | ||
486 | } | ||
487 | spin_unlock_irq(&rh->region_lock); | ||
488 | |||
489 | return reg; | ||
490 | } | ||
491 | |||
492 | /* FIXME: success ignored for now */ | ||
493 | static void rh_recovery_end(struct region *reg, int success) | ||
494 | { | ||
495 | struct region_hash *rh = reg->rh; | ||
496 | |||
497 | spin_lock_irq(&rh->region_lock); | ||
498 | list_add(®->list, ®->rh->recovered_regions); | ||
499 | spin_unlock_irq(&rh->region_lock); | ||
500 | |||
501 | wake(); | ||
502 | } | ||
503 | |||
504 | static void rh_flush(struct region_hash *rh) | ||
505 | { | ||
506 | rh->log->type->flush(rh->log); | ||
507 | } | ||
508 | |||
509 | static void rh_delay(struct region_hash *rh, struct bio *bio) | ||
510 | { | ||
511 | struct region *reg; | ||
512 | |||
513 | read_lock(&rh->hash_lock); | ||
514 | reg = __rh_find(rh, bio_to_region(rh, bio)); | ||
515 | bio_list_add(®->delayed_bios, bio); | ||
516 | read_unlock(&rh->hash_lock); | ||
517 | } | ||
518 | |||
519 | static void rh_stop_recovery(struct region_hash *rh) | ||
520 | { | ||
521 | int i; | ||
522 | |||
523 | /* wait for any recovering regions */ | ||
524 | for (i = 0; i < MAX_RECOVERY; i++) | ||
525 | down(&rh->recovery_count); | ||
526 | } | ||
527 | |||
528 | static void rh_start_recovery(struct region_hash *rh) | ||
529 | { | ||
530 | int i; | ||
531 | |||
532 | for (i = 0; i < MAX_RECOVERY; i++) | ||
533 | up(&rh->recovery_count); | ||
534 | |||
535 | wake(); | ||
536 | } | ||
537 | |||
538 | /*----------------------------------------------------------------- | ||
539 | * Mirror set structures. | ||
540 | *---------------------------------------------------------------*/ | ||
541 | struct mirror { | ||
542 | atomic_t error_count; | ||
543 | struct dm_dev *dev; | ||
544 | sector_t offset; | ||
545 | }; | ||
546 | |||
547 | struct mirror_set { | ||
548 | struct dm_target *ti; | ||
549 | struct list_head list; | ||
550 | struct region_hash rh; | ||
551 | struct kcopyd_client *kcopyd_client; | ||
552 | |||
553 | spinlock_t lock; /* protects the next two lists */ | ||
554 | struct bio_list reads; | ||
555 | struct bio_list writes; | ||
556 | |||
557 | /* recovery */ | ||
558 | region_t nr_regions; | ||
559 | int in_sync; | ||
560 | |||
561 | unsigned int nr_mirrors; | ||
562 | struct mirror mirror[0]; | ||
563 | }; | ||
564 | |||
565 | /* | ||
566 | * Every mirror should look like this one. | ||
567 | */ | ||
568 | #define DEFAULT_MIRROR 0 | ||
569 | |||
570 | /* | ||
571 | * This is yucky. We squirrel the mirror_set struct away inside | ||
572 |  * bi_next for write buffers.  This is safe since the bio | ||
573 |  * doesn't get submitted to the lower levels of the block layer. | ||
574 | */ | ||
575 | static struct mirror_set *bio_get_ms(struct bio *bio) | ||
576 | { | ||
577 | return (struct mirror_set *) bio->bi_next; | ||
578 | } | ||
579 | |||
580 | static void bio_set_ms(struct bio *bio, struct mirror_set *ms) | ||
581 | { | ||
582 | bio->bi_next = (struct bio *) ms; | ||
583 | } | ||
584 | |||
585 | /*----------------------------------------------------------------- | ||
586 | * Recovery. | ||
587 | * | ||
588 | * When a mirror is first activated we may find that some regions | ||
589 | * are in the no-sync state. We have to recover these by | ||
590 | * recopying from the default mirror to all the others. | ||
591 | *---------------------------------------------------------------*/ | ||
592 | static void recovery_complete(int read_err, unsigned int write_err, | ||
593 | void *context) | ||
594 | { | ||
595 | struct region *reg = (struct region *) context; | ||
596 | |||
597 | /* FIXME: better error handling */ | ||
598 | rh_recovery_end(reg, read_err || write_err); | ||
599 | } | ||
600 | |||
601 | static int recover(struct mirror_set *ms, struct region *reg) | ||
602 | { | ||
603 | int r; | ||
604 | unsigned int i; | ||
605 | struct io_region from, to[KCOPYD_MAX_REGIONS], *dest; | ||
606 | struct mirror *m; | ||
607 | unsigned long flags = 0; | ||
608 | |||
609 | /* fill in the source */ | ||
610 | m = ms->mirror + DEFAULT_MIRROR; | ||
611 | from.bdev = m->dev->bdev; | ||
612 | from.sector = m->offset + region_to_sector(reg->rh, reg->key); | ||
613 | if (reg->key == (ms->nr_regions - 1)) { | ||
614 | /* | ||
615 | * The final region may be smaller than | ||
616 | * region_size. | ||
617 | */ | ||
618 | from.count = ms->ti->len & (reg->rh->region_size - 1); | ||
619 | if (!from.count) | ||
620 | from.count = reg->rh->region_size; | ||
621 | } else | ||
622 | from.count = reg->rh->region_size; | ||
623 | |||
624 | /* fill in the destinations */ | ||
625 | for (i = 0, dest = to; i < ms->nr_mirrors; i++) { | ||
626 | if (i == DEFAULT_MIRROR) | ||
627 | continue; | ||
628 | |||
629 | m = ms->mirror + i; | ||
630 | dest->bdev = m->dev->bdev; | ||
631 | dest->sector = m->offset + region_to_sector(reg->rh, reg->key); | ||
632 | dest->count = from.count; | ||
633 | dest++; | ||
634 | } | ||
635 | |||
636 | /* hand to kcopyd */ | ||
637 | set_bit(KCOPYD_IGNORE_ERROR, &flags); | ||
638 | r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, | ||
639 | recovery_complete, reg); | ||
640 | |||
641 | return r; | ||
642 | } | ||
643 | |||
644 | static void do_recovery(struct mirror_set *ms) | ||
645 | { | ||
646 | int r; | ||
647 | struct region *reg; | ||
648 | struct dirty_log *log = ms->rh.log; | ||
649 | |||
650 | /* | ||
651 | * Start quiescing some regions. | ||
652 | */ | ||
653 | rh_recovery_prepare(&ms->rh); | ||
654 | |||
655 | /* | ||
656 | * Copy any already quiesced regions. | ||
657 | */ | ||
658 | while ((reg = rh_recovery_start(&ms->rh))) { | ||
659 | r = recover(ms, reg); | ||
660 | if (r) | ||
661 | rh_recovery_end(reg, 0); | ||
662 | } | ||
663 | |||
664 | /* | ||
665 | * Update the in sync flag. | ||
666 | */ | ||
667 | if (!ms->in_sync && | ||
668 | (log->type->get_sync_count(log) == ms->nr_regions)) { | ||
669 | /* the sync is complete */ | ||
670 | dm_table_event(ms->ti->table); | ||
671 | ms->in_sync = 1; | ||
672 | } | ||
673 | } | ||
674 | |||
675 | /*----------------------------------------------------------------- | ||
676 | * Reads | ||
677 | *---------------------------------------------------------------*/ | ||
678 | static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) | ||
679 | { | ||
680 | /* FIXME: add read balancing */ | ||
681 | return ms->mirror + DEFAULT_MIRROR; | ||
682 | } | ||
683 | |||
684 | /* | ||
685 |  * Remap a bio to a particular mirror. | ||
686 | */ | ||
687 | static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio) | ||
688 | { | ||
689 | bio->bi_bdev = m->dev->bdev; | ||
690 | bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin); | ||
691 | } | ||
692 | |||
693 | static void do_reads(struct mirror_set *ms, struct bio_list *reads) | ||
694 | { | ||
695 | region_t region; | ||
696 | struct bio *bio; | ||
697 | struct mirror *m; | ||
698 | |||
699 | while ((bio = bio_list_pop(reads))) { | ||
700 | region = bio_to_region(&ms->rh, bio); | ||
701 | |||
702 | /* | ||
703 | * We can only read balance if the region is in sync. | ||
704 | */ | ||
705 | if (rh_in_sync(&ms->rh, region, 0)) | ||
706 | m = choose_mirror(ms, bio->bi_sector); | ||
707 | else | ||
708 | m = ms->mirror + DEFAULT_MIRROR; | ||
709 | |||
710 | map_bio(ms, m, bio); | ||
711 | generic_make_request(bio); | ||
712 | } | ||
713 | } | ||
714 | |||
715 | /*----------------------------------------------------------------- | ||
716 | * Writes. | ||
717 | * | ||
718 | * We do different things with the write io depending on the | ||
719 | * state of the region that it's in: | ||
720 | * | ||
721 |  * SYNC: 	increment pending, use dm-io to write to *all* mirrors | ||
722 | * RECOVERING: delay the io until recovery completes | ||
723 | * NOSYNC: increment pending, just write to the default mirror | ||
724 | *---------------------------------------------------------------*/ | ||
725 | static void write_callback(unsigned long error, void *context) | ||
726 | { | ||
727 | unsigned int i; | ||
728 | int uptodate = 1; | ||
729 | struct bio *bio = (struct bio *) context; | ||
730 | struct mirror_set *ms; | ||
731 | |||
732 | ms = bio_get_ms(bio); | ||
733 | bio_set_ms(bio, NULL); | ||
734 | |||
735 | /* | ||
736 | * NOTE: We don't decrement the pending count here, | ||
737 | * instead it is done by the targets endio function. | ||
738 | * This way we handle both writes to SYNC and NOSYNC | ||
739 | * regions with the same code. | ||
740 | */ | ||
741 | |||
742 | if (error) { | ||
743 | /* | ||
744 | * only error the io if all mirrors failed. | ||
745 | * FIXME: bogus | ||
746 | */ | ||
747 | uptodate = 0; | ||
748 | for (i = 0; i < ms->nr_mirrors; i++) | ||
749 | if (!test_bit(i, &error)) { | ||
750 | uptodate = 1; | ||
751 | break; | ||
752 | } | ||
753 | } | ||
754 | bio_endio(bio, bio->bi_size, 0); | ||
755 | } | ||
756 | |||
757 | static void do_write(struct mirror_set *ms, struct bio *bio) | ||
758 | { | ||
759 | unsigned int i; | ||
760 | struct io_region io[KCOPYD_MAX_REGIONS+1]; | ||
761 | struct mirror *m; | ||
762 | |||
763 | for (i = 0; i < ms->nr_mirrors; i++) { | ||
764 | m = ms->mirror + i; | ||
765 | |||
766 | io[i].bdev = m->dev->bdev; | ||
767 | io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin); | ||
768 | io[i].count = bio->bi_size >> 9; | ||
769 | } | ||
770 | |||
771 | bio_set_ms(bio, ms); | ||
772 | dm_io_async_bvec(ms->nr_mirrors, io, WRITE, | ||
773 | bio->bi_io_vec + bio->bi_idx, | ||
774 | write_callback, bio); | ||
775 | } | ||
776 | |||
777 | static void do_writes(struct mirror_set *ms, struct bio_list *writes) | ||
778 | { | ||
779 | int state; | ||
780 | struct bio *bio; | ||
781 | struct bio_list sync, nosync, recover, *this_list = NULL; | ||
782 | |||
783 | if (!writes->head) | ||
784 | return; | ||
785 | |||
786 | /* | ||
787 | * Classify each write. | ||
788 | */ | ||
789 | bio_list_init(&sync); | ||
790 | bio_list_init(&nosync); | ||
791 | bio_list_init(&recover); | ||
792 | |||
793 | while ((bio = bio_list_pop(writes))) { | ||
794 | state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1); | ||
795 | switch (state) { | ||
796 | case RH_CLEAN: | ||
797 | case RH_DIRTY: | ||
798 | this_list = &sync; | ||
799 | break; | ||
800 | |||
801 | case RH_NOSYNC: | ||
802 | this_list = &nosync; | ||
803 | break; | ||
804 | |||
805 | case RH_RECOVERING: | ||
806 | this_list = &recover; | ||
807 | break; | ||
808 | } | ||
809 | |||
810 | bio_list_add(this_list, bio); | ||
811 | } | ||
812 | |||
813 | /* | ||
814 | * Increment the pending counts for any regions that will | ||
815 | * be written to (writes to recover regions are going to | ||
816 | * be delayed). | ||
817 | */ | ||
818 | rh_inc_pending(&ms->rh, &sync); | ||
819 | rh_inc_pending(&ms->rh, &nosync); | ||
820 | rh_flush(&ms->rh); | ||
821 | |||
822 | /* | ||
823 | * Dispatch io. | ||
824 | */ | ||
825 | while ((bio = bio_list_pop(&sync))) | ||
826 | do_write(ms, bio); | ||
827 | |||
828 | while ((bio = bio_list_pop(&recover))) | ||
829 | rh_delay(&ms->rh, bio); | ||
830 | |||
831 | while ((bio = bio_list_pop(&nosync))) { | ||
832 | map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio); | ||
833 | generic_make_request(bio); | ||
834 | } | ||
835 | } | ||
836 | |||
837 | /*----------------------------------------------------------------- | ||
838 | * kmirrord | ||
839 | *---------------------------------------------------------------*/ | ||
840 | static LIST_HEAD(_mirror_sets); | ||
841 | static DECLARE_RWSEM(_mirror_sets_lock); | ||
842 | |||
843 | static void do_mirror(struct mirror_set *ms) | ||
844 | { | ||
845 | struct bio_list reads, writes; | ||
846 | |||
847 | spin_lock(&ms->lock); | ||
848 | reads = ms->reads; | ||
849 | writes = ms->writes; | ||
850 | bio_list_init(&ms->reads); | ||
851 | bio_list_init(&ms->writes); | ||
852 | spin_unlock(&ms->lock); | ||
853 | |||
854 | rh_update_states(&ms->rh); | ||
855 | do_recovery(ms); | ||
856 | do_reads(ms, &reads); | ||
857 | do_writes(ms, &writes); | ||
858 | } | ||
859 | |||
860 | static void do_work(void *ignored) | ||
861 | { | ||
862 | struct mirror_set *ms; | ||
863 | |||
864 | down_read(&_mirror_sets_lock); | ||
865 | list_for_each_entry (ms, &_mirror_sets, list) | ||
866 | do_mirror(ms); | ||
867 | up_read(&_mirror_sets_lock); | ||
868 | } | ||
869 | |||
870 | /*----------------------------------------------------------------- | ||
871 | * Target functions | ||
872 | *---------------------------------------------------------------*/ | ||
873 | static struct mirror_set *alloc_context(unsigned int nr_mirrors, | ||
874 | uint32_t region_size, | ||
875 | struct dm_target *ti, | ||
876 | struct dirty_log *dl) | ||
877 | { | ||
878 | size_t len; | ||
879 | struct mirror_set *ms = NULL; | ||
880 | |||
881 | if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors)) | ||
882 | return NULL; | ||
883 | |||
884 | len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); | ||
885 | |||
886 | ms = kmalloc(len, GFP_KERNEL); | ||
887 | if (!ms) { | ||
888 | ti->error = "dm-mirror: Cannot allocate mirror context"; | ||
889 | return NULL; | ||
890 | } | ||
891 | |||
892 | memset(ms, 0, len); | ||
893 | spin_lock_init(&ms->lock); | ||
894 | |||
895 | ms->ti = ti; | ||
896 | ms->nr_mirrors = nr_mirrors; | ||
897 | ms->nr_regions = dm_sector_div_up(ti->len, region_size); | ||
898 | ms->in_sync = 0; | ||
899 | |||
900 | if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { | ||
901 | ti->error = "dm-mirror: Error creating dirty region hash"; | ||
902 | kfree(ms); | ||
903 | return NULL; | ||
904 | } | ||
905 | |||
906 | return ms; | ||
907 | } | ||
908 | |||
909 | static void free_context(struct mirror_set *ms, struct dm_target *ti, | ||
910 | unsigned int m) | ||
911 | { | ||
912 | while (m--) | ||
913 | dm_put_device(ti, ms->mirror[m].dev); | ||
914 | |||
915 | rh_exit(&ms->rh); | ||
916 | kfree(ms); | ||
917 | } | ||
918 | |||
919 | static inline int _check_region_size(struct dm_target *ti, uint32_t size) | ||
920 | { | ||
921 | return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) || | ||
922 | size > ti->len); | ||
923 | } | ||
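_check_region_size() accepts a region size only if it is a whole number of pages expressed in sectors, a power of two, and no larger than the target itself; with 4 KiB pages, PAGE_SIZE >> 9 is 8 sectors, so 8, 16 or 1024 pass while 12 or 1000 do not. A userspace restatement of the same predicate (the 4 KiB page size is an assumption of this example):

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096u		/* assumed for this example */

static int check_region_size(uint64_t ti_len, uint32_t size)
{
	return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) ||
		 size > ti_len);
}

int main(void)
{
	assert(check_region_size(1u << 20, 1024));	/* power of two, page multiple */
	assert(!check_region_size(1u << 20, 1000));	/* not a power of two */
	assert(!check_region_size(1u << 20, 12));	/* not a multiple of 8 sectors */
	assert(!check_region_size(512, 1024));		/* larger than the target */
	return 0;
}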
924 | |||
925 | static int get_mirror(struct mirror_set *ms, struct dm_target *ti, | ||
926 | unsigned int mirror, char **argv) | ||
927 | { | ||
928 | sector_t offset; | ||
929 | |||
930 | if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) { | ||
931 | ti->error = "dm-mirror: Invalid offset"; | ||
932 | return -EINVAL; | ||
933 | } | ||
934 | |||
935 | if (dm_get_device(ti, argv[0], offset, ti->len, | ||
936 | dm_table_get_mode(ti->table), | ||
937 | &ms->mirror[mirror].dev)) { | ||
938 | ti->error = "dm-mirror: Device lookup failure"; | ||
939 | return -ENXIO; | ||
940 | } | ||
941 | |||
942 | ms->mirror[mirror].offset = offset; | ||
943 | |||
944 | return 0; | ||
945 | } | ||
946 | |||
947 | static int add_mirror_set(struct mirror_set *ms) | ||
948 | { | ||
949 | down_write(&_mirror_sets_lock); | ||
950 | list_add_tail(&ms->list, &_mirror_sets); | ||
951 | up_write(&_mirror_sets_lock); | ||
952 | wake(); | ||
953 | |||
954 | return 0; | ||
955 | } | ||
956 | |||
957 | static void del_mirror_set(struct mirror_set *ms) | ||
958 | { | ||
959 | down_write(&_mirror_sets_lock); | ||
960 | list_del(&ms->list); | ||
961 | up_write(&_mirror_sets_lock); | ||
962 | } | ||
963 | |||
964 | /* | ||
965 | * Create dirty log: log_type #log_params <log_params> | ||
966 | */ | ||
967 | static struct dirty_log *create_dirty_log(struct dm_target *ti, | ||
968 | unsigned int argc, char **argv, | ||
969 | unsigned int *args_used) | ||
970 | { | ||
971 | unsigned int param_count; | ||
972 | struct dirty_log *dl; | ||
973 | |||
974 | if (argc < 2) { | ||
975 | ti->error = "dm-mirror: Insufficient mirror log arguments"; | ||
976 | return NULL; | ||
977 | } | ||
978 | |||
979 | if (sscanf(argv[1], "%u", ¶m_count) != 1) { | ||
980 | ti->error = "dm-mirror: Invalid mirror log argument count"; | ||
981 | return NULL; | ||
982 | } | ||
983 | |||
984 | *args_used = 2 + param_count; | ||
985 | |||
986 | if (argc < *args_used) { | ||
987 | ti->error = "dm-mirror: Insufficient mirror log arguments"; | ||
988 | return NULL; | ||
989 | } | ||
990 | |||
991 | dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2); | ||
992 | if (!dl) { | ||
993 | ti->error = "dm-mirror: Error creating mirror dirty log"; | ||
994 | return NULL; | ||
995 | } | ||
996 | |||
997 | if (!_check_region_size(ti, dl->type->get_region_size(dl))) { | ||
998 | ti->error = "dm-mirror: Invalid region size"; | ||
999 | dm_destroy_dirty_log(dl); | ||
1000 | return NULL; | ||
1001 | } | ||
1002 | |||
1003 | return dl; | ||
1004 | } | ||
1005 | |||
1006 | /* | ||
1007 | * Construct a mirror mapping: | ||
1008 | * | ||
1009 | * log_type #log_params <log_params> | ||
1010 | * #mirrors [mirror_path offset]{2,} | ||
1011 | * | ||
1012 | * log_type is "core" or "disk" | ||
1013 | * #log_params is between 1 and 3 | ||
1014 | */ | ||
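Putting the format above together with the log arguments parsed by create_dirty_log(), a complete table line for this target could look like the following (the device names, the 1 GiB length and the 1024-sector region size are all illustrative):

	0 2097152 mirror core 1 1024 2 /dev/sdb1 0 /dev/sdc1 0

Here 0 and 2097152 are the target's start and length in sectors, "core 1 1024" selects the in-memory dirty log with one parameter (the region size), and the trailing "2 /dev/sdb1 0 /dev/sdc1 0" names two mirror legs, each given as a device plus starting offset.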
1015 | #define DM_IO_PAGES 64 | ||
1016 | static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
1017 | { | ||
1018 | int r; | ||
1019 | unsigned int nr_mirrors, m, args_used; | ||
1020 | struct mirror_set *ms; | ||
1021 | struct dirty_log *dl; | ||
1022 | |||
1023 | dl = create_dirty_log(ti, argc, argv, &args_used); | ||
1024 | if (!dl) | ||
1025 | return -EINVAL; | ||
1026 | |||
1027 | argv += args_used; | ||
1028 | argc -= args_used; | ||
1029 | |||
1030 | if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || | ||
1031 | nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) { | ||
1032 | ti->error = "dm-mirror: Invalid number of mirrors"; | ||
1033 | dm_destroy_dirty_log(dl); | ||
1034 | return -EINVAL; | ||
1035 | } | ||
1036 | |||
1037 | argv++, argc--; | ||
1038 | |||
1039 | if (argc != nr_mirrors * 2) { | ||
1040 | ti->error = "dm-mirror: Wrong number of mirror arguments"; | ||
1041 | dm_destroy_dirty_log(dl); | ||
1042 | return -EINVAL; | ||
1043 | } | ||
1044 | |||
1045 | ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl); | ||
1046 | if (!ms) { | ||
1047 | dm_destroy_dirty_log(dl); | ||
1048 | return -ENOMEM; | ||
1049 | } | ||
1050 | |||
1051 | /* Get the mirror parameter sets */ | ||
1052 | for (m = 0; m < nr_mirrors; m++) { | ||
1053 | r = get_mirror(ms, ti, m, argv); | ||
1054 | if (r) { | ||
1055 | free_context(ms, ti, m); | ||
1056 | return r; | ||
1057 | } | ||
1058 | argv += 2; | ||
1059 | argc -= 2; | ||
1060 | } | ||
1061 | |||
1062 | ti->private = ms; | ||
1063 | |||
1064 | r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); | ||
1065 | if (r) { | ||
1066 | free_context(ms, ti, ms->nr_mirrors); | ||
1067 | return r; | ||
1068 | } | ||
1069 | |||
1070 | add_mirror_set(ms); | ||
1071 | return 0; | ||
1072 | } | ||
1073 | |||
1074 | static void mirror_dtr(struct dm_target *ti) | ||
1075 | { | ||
1076 | struct mirror_set *ms = (struct mirror_set *) ti->private; | ||
1077 | |||
1078 | del_mirror_set(ms); | ||
1079 | kcopyd_client_destroy(ms->kcopyd_client); | ||
1080 | free_context(ms, ti, ms->nr_mirrors); | ||
1081 | } | ||
1082 | |||
1083 | static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) | ||
1084 | { | ||
1085 | int should_wake = 0; | ||
1086 | struct bio_list *bl; | ||
1087 | |||
1088 | bl = (rw == WRITE) ? &ms->writes : &ms->reads; | ||
1089 | spin_lock(&ms->lock); | ||
1090 | should_wake = !(bl->head); | ||
1091 | bio_list_add(bl, bio); | ||
1092 | spin_unlock(&ms->lock); | ||
1093 | |||
1094 | if (should_wake) | ||
1095 | wake(); | ||
1096 | } | ||
1097 | |||
1098 | /* | ||
1099 | * Mirror mapping function | ||
1100 | */ | ||
1101 | static int mirror_map(struct dm_target *ti, struct bio *bio, | ||
1102 | union map_info *map_context) | ||
1103 | { | ||
1104 | int r, rw = bio_rw(bio); | ||
1105 | struct mirror *m; | ||
1106 | struct mirror_set *ms = ti->private; | ||
1107 | |||
1108 | map_context->ll = bio->bi_sector >> ms->rh.region_shift; | ||
1109 | |||
1110 | if (rw == WRITE) { | ||
1111 | queue_bio(ms, bio, rw); | ||
1112 | return 0; | ||
1113 | } | ||
1114 | |||
1115 | r = ms->rh.log->type->in_sync(ms->rh.log, | ||
1116 | bio_to_region(&ms->rh, bio), 0); | ||
1117 | if (r < 0 && r != -EWOULDBLOCK) | ||
1118 | return r; | ||
1119 | |||
1120 | if (r == -EWOULDBLOCK) /* FIXME: ugly */ | ||
1121 | r = 0; | ||
1122 | |||
1123 | /* | ||
1124 | * We don't want to fast track a recovery just for a read | ||
1125 | * ahead. So we just let it silently fail. | ||
1126 | * FIXME: get rid of this. | ||
1127 | */ | ||
1128 | if (!r && rw == READA) | ||
1129 | return -EIO; | ||
1130 | |||
1131 | if (!r) { | ||
1132 | /* Pass this io over to the daemon */ | ||
1133 | queue_bio(ms, bio, rw); | ||
1134 | return 0; | ||
1135 | } | ||
1136 | |||
1137 | m = choose_mirror(ms, bio->bi_sector); | ||
1138 | if (!m) | ||
1139 | return -EIO; | ||
1140 | |||
1141 | map_bio(ms, m, bio); | ||
1142 | return 1; | ||
1143 | } | ||
1144 | |||
1145 | static int mirror_end_io(struct dm_target *ti, struct bio *bio, | ||
1146 | int error, union map_info *map_context) | ||
1147 | { | ||
1148 | int rw = bio_rw(bio); | ||
1149 | struct mirror_set *ms = (struct mirror_set *) ti->private; | ||
1150 | region_t region = map_context->ll; | ||
1151 | |||
1152 | /* | ||
1153 | * We need to dec pending if this was a write. | ||
1154 | */ | ||
1155 | if (rw == WRITE) | ||
1156 | rh_dec(&ms->rh, region); | ||
1157 | |||
1158 | return 0; | ||
1159 | } | ||
1160 | |||
1161 | static void mirror_postsuspend(struct dm_target *ti) | ||
1162 | { | ||
1163 | struct mirror_set *ms = (struct mirror_set *) ti->private; | ||
1164 | struct dirty_log *log = ms->rh.log; | ||
1165 | |||
1166 | rh_stop_recovery(&ms->rh); | ||
1167 | if (log->type->suspend && log->type->suspend(log)) | ||
1168 | /* FIXME: need better error handling */ | ||
1169 | DMWARN("log suspend failed"); | ||
1170 | } | ||
1171 | |||
1172 | static void mirror_resume(struct dm_target *ti) | ||
1173 | { | ||
1174 | struct mirror_set *ms = (struct mirror_set *) ti->private; | ||
1175 | struct dirty_log *log = ms->rh.log; | ||
1176 | if (log->type->resume && log->type->resume(log)) | ||
1177 | /* FIXME: need better error handling */ | ||
1178 | DMWARN("log resume failed"); | ||
1179 | rh_start_recovery(&ms->rh); | ||
1180 | } | ||
1181 | |||
1182 | static int mirror_status(struct dm_target *ti, status_type_t type, | ||
1183 | char *result, unsigned int maxlen) | ||
1184 | { | ||
1185 | unsigned int m, sz; | ||
1186 | struct mirror_set *ms = (struct mirror_set *) ti->private; | ||
1187 | |||
1188 | sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen); | ||
1189 | |||
1190 | switch (type) { | ||
1191 | case STATUSTYPE_INFO: | ||
1192 | DMEMIT("%d ", ms->nr_mirrors); | ||
1193 | for (m = 0; m < ms->nr_mirrors; m++) | ||
1194 | DMEMIT("%s ", ms->mirror[m].dev->name); | ||
1195 | |||
1196 | DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT, | ||
1197 | ms->rh.log->type->get_sync_count(ms->rh.log), | ||
1198 | ms->nr_regions); | ||
1199 | break; | ||
1200 | |||
1201 | case STATUSTYPE_TABLE: | ||
1202 | DMEMIT("%d ", ms->nr_mirrors); | ||
1203 | for (m = 0; m < ms->nr_mirrors; m++) | ||
1204 | DMEMIT("%s " SECTOR_FORMAT " ", | ||
1205 | ms->mirror[m].dev->name, ms->mirror[m].offset); | ||
1206 | } | ||
1207 | |||
1208 | return 0; | ||
1209 | } | ||
1210 | |||
1211 | static struct target_type mirror_target = { | ||
1212 | .name = "mirror", | ||
1213 | .version = {1, 0, 1}, | ||
1214 | .module = THIS_MODULE, | ||
1215 | .ctr = mirror_ctr, | ||
1216 | .dtr = mirror_dtr, | ||
1217 | .map = mirror_map, | ||
1218 | .end_io = mirror_end_io, | ||
1219 | .postsuspend = mirror_postsuspend, | ||
1220 | .resume = mirror_resume, | ||
1221 | .status = mirror_status, | ||
1222 | }; | ||
1223 | |||
1224 | static int __init dm_mirror_init(void) | ||
1225 | { | ||
1226 | int r; | ||
1227 | |||
1228 | r = dm_dirty_log_init(); | ||
1229 | if (r) | ||
1230 | return r; | ||
1231 | |||
1232 | _kmirrord_wq = create_workqueue("kmirrord"); | ||
1233 | if (!_kmirrord_wq) { | ||
1234 | DMERR("couldn't start kmirrord"); | ||
1235 | dm_dirty_log_exit(); | ||
1236 | 		return -ENOMEM; | ||
1237 | } | ||
1238 | INIT_WORK(&_kmirrord_work, do_work, NULL); | ||
1239 | |||
1240 | r = dm_register_target(&mirror_target); | ||
1241 | if (r < 0) { | ||
1242 | DMERR("%s: Failed to register mirror target", | ||
1243 | mirror_target.name); | ||
1244 | dm_dirty_log_exit(); | ||
1245 | destroy_workqueue(_kmirrord_wq); | ||
1246 | } | ||
1247 | |||
1248 | return r; | ||
1249 | } | ||
1250 | |||
1251 | static void __exit dm_mirror_exit(void) | ||
1252 | { | ||
1253 | int r; | ||
1254 | |||
1255 | r = dm_unregister_target(&mirror_target); | ||
1256 | if (r < 0) | ||
1257 | DMERR("%s: unregister failed %d", mirror_target.name, r); | ||
1258 | |||
1259 | destroy_workqueue(_kmirrord_wq); | ||
1260 | dm_dirty_log_exit(); | ||
1261 | } | ||
1262 | |||
1263 | /* Module hooks */ | ||
1264 | module_init(dm_mirror_init); | ||
1265 | module_exit(dm_mirror_exit); | ||
1266 | |||
1267 | MODULE_DESCRIPTION(DM_NAME " mirror target"); | ||
1268 | MODULE_AUTHOR("Joe Thornber"); | ||
1269 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c new file mode 100644 index 000000000000..d0024865a789 --- /dev/null +++ b/drivers/md/dm-round-robin.c | |||
@@ -0,0 +1,214 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software. | ||
3 | * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * Module Author: Heinz Mauelshagen | ||
6 | * | ||
7 | * This file is released under the GPL. | ||
8 | * | ||
9 | * Round-robin path selector. | ||
10 | */ | ||
11 | |||
12 | #include "dm.h" | ||
13 | #include "dm-path-selector.h" | ||
14 | |||
15 | #include <linux/slab.h> | ||
16 | |||
17 | /*----------------------------------------------------------------- | ||
18 | * Path-handling code, paths are held in lists | ||
19 | *---------------------------------------------------------------*/ | ||
20 | struct path_info { | ||
21 | struct list_head list; | ||
22 | struct path *path; | ||
23 | unsigned repeat_count; | ||
24 | }; | ||
25 | |||
26 | static void free_paths(struct list_head *paths) | ||
27 | { | ||
28 | struct path_info *pi, *next; | ||
29 | |||
30 | list_for_each_entry_safe(pi, next, paths, list) { | ||
31 | list_del(&pi->list); | ||
32 | kfree(pi); | ||
33 | } | ||
34 | } | ||
35 | |||
36 | /*----------------------------------------------------------------- | ||
37 | * Round-robin selector | ||
38 | *---------------------------------------------------------------*/ | ||
39 | |||
40 | #define RR_MIN_IO 1000 | ||
41 | |||
42 | struct selector { | ||
43 | struct list_head valid_paths; | ||
44 | struct list_head invalid_paths; | ||
45 | }; | ||
46 | |||
47 | static struct selector *alloc_selector(void) | ||
48 | { | ||
49 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
50 | |||
51 | if (s) { | ||
52 | INIT_LIST_HEAD(&s->valid_paths); | ||
53 | INIT_LIST_HEAD(&s->invalid_paths); | ||
54 | } | ||
55 | |||
56 | return s; | ||
57 | } | ||
58 | |||
59 | static int rr_create(struct path_selector *ps, unsigned argc, char **argv) | ||
60 | { | ||
61 | struct selector *s; | ||
62 | |||
63 | s = alloc_selector(); | ||
64 | if (!s) | ||
65 | return -ENOMEM; | ||
66 | |||
67 | ps->context = s; | ||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | static void rr_destroy(struct path_selector *ps) | ||
72 | { | ||
73 | struct selector *s = (struct selector *) ps->context; | ||
74 | |||
75 | free_paths(&s->valid_paths); | ||
76 | free_paths(&s->invalid_paths); | ||
77 | kfree(s); | ||
78 | ps->context = NULL; | ||
79 | } | ||
80 | |||
81 | static int rr_status(struct path_selector *ps, struct path *path, | ||
82 | status_type_t type, char *result, unsigned int maxlen) | ||
83 | { | ||
84 | struct path_info *pi; | ||
85 | int sz = 0; | ||
86 | |||
87 | if (!path) | ||
88 | DMEMIT("0 "); | ||
89 | else { | ||
90 | switch(type) { | ||
91 | case STATUSTYPE_INFO: | ||
92 | break; | ||
93 | case STATUSTYPE_TABLE: | ||
94 | pi = path->pscontext; | ||
95 | DMEMIT("%u ", pi->repeat_count); | ||
96 | break; | ||
97 | } | ||
98 | } | ||
99 | |||
100 | return sz; | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * Called during initialisation to register each path with an | ||
105 | * optional repeat_count. | ||
106 | */ | ||
107 | static int rr_add_path(struct path_selector *ps, struct path *path, | ||
108 | int argc, char **argv, char **error) | ||
109 | { | ||
110 | struct selector *s = (struct selector *) ps->context; | ||
111 | struct path_info *pi; | ||
112 | unsigned repeat_count = RR_MIN_IO; | ||
113 | |||
114 | if (argc > 1) { | ||
115 | *error = "round-robin ps: incorrect number of arguments"; | ||
116 | return -EINVAL; | ||
117 | } | ||
118 | |||
119 | /* First path argument is number of I/Os before switching path */ | ||
120 | if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | ||
121 | *error = "round-robin ps: invalid repeat count"; | ||
122 | return -EINVAL; | ||
123 | } | ||
124 | |||
125 | /* allocate the path */ | ||
126 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | ||
127 | if (!pi) { | ||
128 | *error = "round-robin ps: Error allocating path context"; | ||
129 | return -ENOMEM; | ||
130 | } | ||
131 | |||
132 | pi->path = path; | ||
133 | pi->repeat_count = repeat_count; | ||
134 | |||
135 | path->pscontext = pi; | ||
136 | |||
137 | list_add(&pi->list, &s->valid_paths); | ||
138 | |||
139 | return 0; | ||
140 | } | ||
141 | |||
142 | static void rr_fail_path(struct path_selector *ps, struct path *p) | ||
143 | { | ||
144 | struct selector *s = (struct selector *) ps->context; | ||
145 | struct path_info *pi = p->pscontext; | ||
146 | |||
147 | list_move(&pi->list, &s->invalid_paths); | ||
148 | } | ||
149 | |||
150 | static int rr_reinstate_path(struct path_selector *ps, struct path *p) | ||
151 | { | ||
152 | struct selector *s = (struct selector *) ps->context; | ||
153 | struct path_info *pi = p->pscontext; | ||
154 | |||
155 | list_move(&pi->list, &s->valid_paths); | ||
156 | |||
157 | return 0; | ||
158 | } | ||
159 | |||
160 | static struct path *rr_select_path(struct path_selector *ps, | ||
161 | unsigned *repeat_count) | ||
162 | { | ||
163 | struct selector *s = (struct selector *) ps->context; | ||
164 | struct path_info *pi = NULL; | ||
165 | |||
166 | if (!list_empty(&s->valid_paths)) { | ||
167 | pi = list_entry(s->valid_paths.next, struct path_info, list); | ||
168 | list_move_tail(&pi->list, &s->valid_paths); | ||
169 | *repeat_count = pi->repeat_count; | ||
170 | } | ||
171 | |||
172 | return pi ? pi->path : NULL; | ||
173 | } | ||
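rr_select_path() above rotates the head of valid_paths to the tail on every call, so valid paths are handed out in a fixed cycle and the caller then reuses each one repeat_count times. The effect, reduced to a userspace toy (purely illustrative):

#include <stdio.h>

int main(void)
{
	const char *paths[] = { "sda", "sdb", "sdc" };
	unsigned n = 3, head = 0, i;

	for (i = 0; i < 6; i++) {
		printf("%s ", paths[head]);	/* what select_path() would return */
		head = (head + 1) % n;		/* the list_move_tail() rotation */
	}
	printf("\n");				/* sda sdb sdc sda sdb sdc */
	return 0;
}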
174 | |||
175 | static struct path_selector_type rr_ps = { | ||
176 | .name = "round-robin", | ||
177 | .module = THIS_MODULE, | ||
178 | .table_args = 1, | ||
179 | .info_args = 0, | ||
180 | .create = rr_create, | ||
181 | .destroy = rr_destroy, | ||
182 | .status = rr_status, | ||
183 | .add_path = rr_add_path, | ||
184 | .fail_path = rr_fail_path, | ||
185 | .reinstate_path = rr_reinstate_path, | ||
186 | .select_path = rr_select_path, | ||
187 | }; | ||
188 | |||
189 | static int __init dm_rr_init(void) | ||
190 | { | ||
191 | int r = dm_register_path_selector(&rr_ps); | ||
192 | |||
193 | if (r < 0) | ||
194 | DMERR("round-robin: register failed %d", r); | ||
195 | |||
196 | DMINFO("dm-round-robin version 1.0.0 loaded"); | ||
197 | |||
198 | return r; | ||
199 | } | ||
200 | |||
201 | static void __exit dm_rr_exit(void) | ||
202 | { | ||
203 | int r = dm_unregister_path_selector(&rr_ps); | ||
204 | |||
205 | if (r < 0) | ||
206 | DMERR("round-robin: unregister failed %d", r); | ||
207 | } | ||
208 | |||
209 | module_init(dm_rr_init); | ||
210 | module_exit(dm_rr_exit); | ||
211 | |||
212 | MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector"); | ||
213 | MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); | ||
214 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c new file mode 100644 index 000000000000..7e691ab9a748 --- /dev/null +++ b/drivers/md/dm-snap.c | |||
@@ -0,0 +1,1208 @@ | |||
1 | /* | ||
2 | * dm-snapshot.c | ||
3 | * | ||
4 | * Copyright (C) 2001-2002 Sistina Software (UK) Limited. | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | */ | ||
8 | |||
9 | #include <linux/blkdev.h> | ||
10 | #include <linux/config.h> | ||
11 | #include <linux/ctype.h> | ||
12 | #include <linux/device-mapper.h> | ||
13 | #include <linux/fs.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/kdev_t.h> | ||
16 | #include <linux/list.h> | ||
17 | #include <linux/mempool.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/slab.h> | ||
20 | #include <linux/vmalloc.h> | ||
21 | |||
22 | #include "dm-snap.h" | ||
23 | #include "dm-bio-list.h" | ||
24 | #include "kcopyd.h" | ||
25 | |||
26 | /* | ||
27 | * The percentage increment we will wake up users at | ||
28 | */ | ||
29 | #define WAKE_UP_PERCENT 5 | ||
30 | |||
31 | /* | ||
32 | * kcopyd priority of snapshot operations | ||
33 | */ | ||
34 | #define SNAPSHOT_COPY_PRIORITY 2 | ||
35 | |||
36 | /* | ||
37 | * Each snapshot reserves this many pages for io | ||
38 | */ | ||
39 | #define SNAPSHOT_PAGES 256 | ||
40 | |||
41 | struct pending_exception { | ||
42 | struct exception e; | ||
43 | |||
44 | /* | ||
45 | * Origin buffers waiting for this to complete are held | ||
46 | * in a bio list | ||
47 | */ | ||
48 | struct bio_list origin_bios; | ||
49 | struct bio_list snapshot_bios; | ||
50 | |||
51 | /* | ||
52 | * Other pending_exceptions that are processing this | ||
53 | * chunk. When this list is empty, we know we can | ||
54 | * complete the origins. | ||
55 | */ | ||
56 | struct list_head siblings; | ||
57 | |||
58 | /* Pointer back to snapshot context */ | ||
59 | struct dm_snapshot *snap; | ||
60 | |||
61 | /* | ||
62 | * 1 indicates the exception has already been sent to | ||
63 | * kcopyd. | ||
64 | */ | ||
65 | int started; | ||
66 | }; | ||
67 | |||
68 | /* | ||
69 | * Hash table mapping origin volumes to lists of snapshots and | ||
70 | * a lock to protect it | ||
71 | */ | ||
72 | static kmem_cache_t *exception_cache; | ||
73 | static kmem_cache_t *pending_cache; | ||
74 | static mempool_t *pending_pool; | ||
75 | |||
76 | /* | ||
77 | * One of these per registered origin, held in the snapshot_origins hash | ||
78 | */ | ||
79 | struct origin { | ||
80 | /* The origin device */ | ||
81 | struct block_device *bdev; | ||
82 | |||
83 | struct list_head hash_list; | ||
84 | |||
85 | /* List of snapshots for this origin */ | ||
86 | struct list_head snapshots; | ||
87 | }; | ||
88 | |||
89 | /* | ||
90 | * Size of the hash table for origin volumes. If we make this | ||
91 | * the size of the minors list then it should be nearly perfect | ||
92 | */ | ||
93 | #define ORIGIN_HASH_SIZE 256 | ||
94 | #define ORIGIN_MASK 0xFF | ||
95 | static struct list_head *_origins; | ||
96 | static struct rw_semaphore _origins_lock; | ||
97 | |||
98 | static int init_origin_hash(void) | ||
99 | { | ||
100 | int i; | ||
101 | |||
102 | _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), | ||
103 | GFP_KERNEL); | ||
104 | if (!_origins) { | ||
105 | DMERR("Device mapper: Snapshot: unable to allocate memory"); | ||
106 | return -ENOMEM; | ||
107 | } | ||
108 | |||
109 | for (i = 0; i < ORIGIN_HASH_SIZE; i++) | ||
110 | INIT_LIST_HEAD(_origins + i); | ||
111 | init_rwsem(&_origins_lock); | ||
112 | |||
113 | return 0; | ||
114 | } | ||
115 | |||
116 | static void exit_origin_hash(void) | ||
117 | { | ||
118 | kfree(_origins); | ||
119 | } | ||
120 | |||
121 | static inline unsigned int origin_hash(struct block_device *bdev) | ||
122 | { | ||
123 | return bdev->bd_dev & ORIGIN_MASK; | ||
124 | } | ||
125 | |||
126 | static struct origin *__lookup_origin(struct block_device *origin) | ||
127 | { | ||
128 | struct list_head *ol; | ||
129 | struct origin *o; | ||
130 | |||
131 | ol = &_origins[origin_hash(origin)]; | ||
132 | list_for_each_entry (o, ol, hash_list) | ||
133 | if (bdev_equal(o->bdev, origin)) | ||
134 | return o; | ||
135 | |||
136 | return NULL; | ||
137 | } | ||
138 | |||
139 | static void __insert_origin(struct origin *o) | ||
140 | { | ||
141 | struct list_head *sl = &_origins[origin_hash(o->bdev)]; | ||
142 | list_add_tail(&o->hash_list, sl); | ||
143 | } | ||
144 | |||
145 | /* | ||
146 | * Make a note of the snapshot and its origin so we can look it | ||
147 | * up when the origin has a write on it. | ||
148 | */ | ||
149 | static int register_snapshot(struct dm_snapshot *snap) | ||
150 | { | ||
151 | struct origin *o; | ||
152 | struct block_device *bdev = snap->origin->bdev; | ||
153 | |||
154 | down_write(&_origins_lock); | ||
155 | o = __lookup_origin(bdev); | ||
156 | |||
157 | if (!o) { | ||
158 | /* New origin */ | ||
159 | o = kmalloc(sizeof(*o), GFP_KERNEL); | ||
160 | if (!o) { | ||
161 | up_write(&_origins_lock); | ||
162 | return -ENOMEM; | ||
163 | } | ||
164 | |||
165 | /* Initialise the struct */ | ||
166 | INIT_LIST_HEAD(&o->snapshots); | ||
167 | o->bdev = bdev; | ||
168 | |||
169 | __insert_origin(o); | ||
170 | } | ||
171 | |||
172 | list_add_tail(&snap->list, &o->snapshots); | ||
173 | |||
174 | up_write(&_origins_lock); | ||
175 | return 0; | ||
176 | } | ||
177 | |||
178 | static void unregister_snapshot(struct dm_snapshot *s) | ||
179 | { | ||
180 | struct origin *o; | ||
181 | |||
182 | down_write(&_origins_lock); | ||
183 | o = __lookup_origin(s->origin->bdev); | ||
184 | |||
185 | list_del(&s->list); | ||
186 | if (list_empty(&o->snapshots)) { | ||
187 | list_del(&o->hash_list); | ||
188 | kfree(o); | ||
189 | } | ||
190 | |||
191 | up_write(&_origins_lock); | ||
192 | } | ||
193 | |||
194 | /* | ||
195 | * Implementation of the exception hash tables. | ||
196 | */ | ||
197 | static int init_exception_table(struct exception_table *et, uint32_t size) | ||
198 | { | ||
199 | unsigned int i; | ||
200 | |||
201 | et->hash_mask = size - 1; | ||
202 | et->table = dm_vcalloc(size, sizeof(struct list_head)); | ||
203 | if (!et->table) | ||
204 | return -ENOMEM; | ||
205 | |||
206 | for (i = 0; i < size; i++) | ||
207 | INIT_LIST_HEAD(et->table + i); | ||
208 | |||
209 | return 0; | ||
210 | } | ||
211 | |||
212 | static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem) | ||
213 | { | ||
214 | struct list_head *slot; | ||
215 | struct exception *ex, *next; | ||
216 | int i, size; | ||
217 | |||
218 | size = et->hash_mask + 1; | ||
219 | for (i = 0; i < size; i++) { | ||
220 | slot = et->table + i; | ||
221 | |||
222 | list_for_each_entry_safe (ex, next, slot, hash_list) | ||
223 | kmem_cache_free(mem, ex); | ||
224 | } | ||
225 | |||
226 | vfree(et->table); | ||
227 | } | ||
228 | |||
229 | static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk) | ||
230 | { | ||
231 | return chunk & et->hash_mask; | ||
232 | } | ||
233 | |||
234 | static void insert_exception(struct exception_table *eh, struct exception *e) | ||
235 | { | ||
236 | struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)]; | ||
237 | list_add(&e->hash_list, l); | ||
238 | } | ||
239 | |||
240 | static inline void remove_exception(struct exception *e) | ||
241 | { | ||
242 | list_del(&e->hash_list); | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Return the exception data for a sector, or NULL if not | ||
247 | * remapped. | ||
248 | */ | ||
249 | static struct exception *lookup_exception(struct exception_table *et, | ||
250 | chunk_t chunk) | ||
251 | { | ||
252 | struct list_head *slot; | ||
253 | struct exception *e; | ||
254 | |||
255 | slot = &et->table[exception_hash(et, chunk)]; | ||
256 | list_for_each_entry (e, slot, hash_list) | ||
257 | if (e->old_chunk == chunk) | ||
258 | return e; | ||
259 | |||
260 | return NULL; | ||
261 | } | ||
262 | |||
263 | static inline struct exception *alloc_exception(void) | ||
264 | { | ||
265 | struct exception *e; | ||
266 | |||
267 | e = kmem_cache_alloc(exception_cache, GFP_NOIO); | ||
268 | if (!e) | ||
269 | e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); | ||
270 | |||
271 | return e; | ||
272 | } | ||
273 | |||
274 | static inline void free_exception(struct exception *e) | ||
275 | { | ||
276 | kmem_cache_free(exception_cache, e); | ||
277 | } | ||
278 | |||
279 | static inline struct pending_exception *alloc_pending_exception(void) | ||
280 | { | ||
281 | return mempool_alloc(pending_pool, GFP_NOIO); | ||
282 | } | ||
283 | |||
284 | static inline void free_pending_exception(struct pending_exception *pe) | ||
285 | { | ||
286 | mempool_free(pe, pending_pool); | ||
287 | } | ||
288 | |||
289 | int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) | ||
290 | { | ||
291 | struct exception *e; | ||
292 | |||
293 | e = alloc_exception(); | ||
294 | if (!e) | ||
295 | return -ENOMEM; | ||
296 | |||
297 | e->old_chunk = old; | ||
298 | e->new_chunk = new; | ||
299 | insert_exception(&s->complete, e); | ||
300 | return 0; | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * Hard coded magic. | ||
305 | */ | ||
306 | static int calc_max_buckets(void) | ||
307 | { | ||
308 | /* use a fixed size of 2MB */ | ||
309 | unsigned long mem = 2 * 1024 * 1024; | ||
310 | mem /= sizeof(struct list_head); | ||
311 | |||
312 | return mem; | ||
313 | } | ||
314 | |||
315 | /* | ||
316 | * Rounds a number down to a power of 2. | ||
317 | */ | ||
318 | static inline uint32_t round_down(uint32_t n) | ||
319 | { | ||
320 | while (n & (n - 1)) | ||
321 | n &= (n - 1); | ||
322 | return n; | ||
323 | } | ||
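The loop above relies on the classic n &= (n - 1) trick: each pass clears the lowest set bit, so the loop terminates with only the highest set bit left, i.e. the largest power of two not exceeding n. A minimal userspace check of the same loop (the standalone function and main() are only for illustration):

#include <stdio.h>
#include <stdint.h>

static uint32_t round_down_pow2(uint32_t n)
{
	/* clearing the lowest set bit repeatedly leaves only the highest one */
	while (n & (n - 1))
		n &= (n - 1);
	return n;
}

int main(void)
{
	printf("%u %u %u\n",
	       round_down_pow2(200),	/* 128 */
	       round_down_pow2(4096),	/* 4096: already a power of 2 */
	       round_down_pow2(1));	/* 1 */
	return 0;
}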
324 | |||
325 | /* | ||
326 | * Allocate room for a suitable hash table. | ||
327 | */ | ||
328 | static int init_hash_tables(struct dm_snapshot *s) | ||
329 | { | ||
330 | sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets; | ||
331 | |||
332 | /* | ||
333 | * Calculate based on the size of the original volume or | ||
334 | * the COW volume... | ||
335 | */ | ||
336 | cow_dev_size = get_dev_size(s->cow->bdev); | ||
337 | origin_dev_size = get_dev_size(s->origin->bdev); | ||
338 | max_buckets = calc_max_buckets(); | ||
339 | |||
340 | hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift; | ||
341 | hash_size = min(hash_size, max_buckets); | ||
342 | |||
343 | /* Round it down to a power of 2 */ | ||
344 | hash_size = round_down(hash_size); | ||
345 | if (init_exception_table(&s->complete, hash_size)) | ||
346 | return -ENOMEM; | ||
347 | |||
348 | /* | ||
349 | * Allocate hash table for in-flight exceptions | ||
350 | * Make this smaller than the real hash table | ||
351 | */ | ||
352 | hash_size >>= 3; | ||
353 | if (hash_size < 64) | ||
354 | hash_size = 64; | ||
355 | |||
356 | if (init_exception_table(&s->pending, hash_size)) { | ||
357 | exit_exception_table(&s->complete, exception_cache); | ||
358 | return -ENOMEM; | ||
359 | } | ||
360 | |||
361 | return 0; | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * Round a number up to the nearest 'size' boundary. size must | ||
366 | * be a power of 2. | ||
367 | */ | ||
368 | static inline ulong round_up(ulong n, ulong size) | ||
369 | { | ||
370 | size--; | ||
371 | return (n + size) & ~size; | ||
372 | } | ||
373 | |||
374 | /* | ||
375 | * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> | ||
376 | */ | ||
377 | static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
378 | { | ||
379 | struct dm_snapshot *s; | ||
380 | unsigned long chunk_size; | ||
381 | int r = -EINVAL; | ||
382 | char persistent; | ||
383 | char *origin_path; | ||
384 | char *cow_path; | ||
385 | char *value; | ||
386 | int blocksize; | ||
387 | |||
388 | if (argc < 4) { | ||
389 | ti->error = "dm-snapshot: requires exactly 4 arguments"; | ||
390 | r = -EINVAL; | ||
391 | goto bad1; | ||
392 | } | ||
393 | |||
394 | origin_path = argv[0]; | ||
395 | cow_path = argv[1]; | ||
396 | persistent = toupper(*argv[2]); | ||
397 | |||
398 | if (persistent != 'P' && persistent != 'N') { | ||
399 | ti->error = "Persistent flag is not P or N"; | ||
400 | r = -EINVAL; | ||
401 | goto bad1; | ||
402 | } | ||
403 | |||
404 | chunk_size = simple_strtoul(argv[3], &value, 10); | ||
405 | if (chunk_size == 0 || value == NULL) { | ||
406 | ti->error = "Invalid chunk size"; | ||
407 | r = -EINVAL; | ||
408 | goto bad1; | ||
409 | } | ||
410 | |||
411 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
412 | if (s == NULL) { | ||
413 | ti->error = "Cannot allocate snapshot context private " | ||
414 | "structure"; | ||
415 | r = -ENOMEM; | ||
416 | goto bad1; | ||
417 | } | ||
418 | |||
419 | r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); | ||
420 | if (r) { | ||
421 | ti->error = "Cannot get origin device"; | ||
422 | goto bad2; | ||
423 | } | ||
424 | |||
425 | r = dm_get_device(ti, cow_path, 0, 0, | ||
426 | FMODE_READ | FMODE_WRITE, &s->cow); | ||
427 | if (r) { | ||
428 | dm_put_device(ti, s->origin); | ||
429 | ti->error = "Cannot get COW device"; | ||
430 | goto bad2; | ||
431 | } | ||
432 | |||
433 | /* | ||
434 | * Chunk size must be multiple of page size. Silently | ||
435 | * round up if it's not. | ||
436 | */ | ||
437 | chunk_size = round_up(chunk_size, PAGE_SIZE >> 9); | ||
438 | |||
439 | /* Validate the chunk size against the device block size */ | ||
440 | blocksize = s->cow->bdev->bd_disk->queue->hardsect_size; | ||
441 | if (chunk_size % (blocksize >> 9)) { | ||
442 | ti->error = "Chunk size is not a multiple of device blocksize"; | ||
443 | r = -EINVAL; | ||
444 | goto bad3; | ||
445 | } | ||
446 | |||
447 | /* Check chunk_size is a power of 2 */ | ||
448 | if (chunk_size & (chunk_size - 1)) { | ||
449 | ti->error = "Chunk size is not a power of 2"; | ||
450 | r = -EINVAL; | ||
451 | goto bad3; | ||
452 | } | ||
453 | |||
454 | s->chunk_size = chunk_size; | ||
455 | s->chunk_mask = chunk_size - 1; | ||
456 | s->type = persistent; | ||
457 | s->chunk_shift = ffs(chunk_size) - 1; | ||
458 | |||
459 | s->valid = 1; | ||
460 | s->have_metadata = 0; | ||
461 | s->last_percent = 0; | ||
462 | init_rwsem(&s->lock); | ||
463 | s->table = ti->table; | ||
464 | |||
465 | /* Allocate hash table for COW data */ | ||
466 | if (init_hash_tables(s)) { | ||
467 | ti->error = "Unable to allocate hash table space"; | ||
468 | r = -ENOMEM; | ||
469 | goto bad3; | ||
470 | } | ||
471 | |||
472 | /* | ||
473 | * Check the persistent flag - done here because we need the iobuf | ||
474 | * to check the LV header | ||
475 | */ | ||
476 | s->store.snap = s; | ||
477 | |||
478 | if (persistent == 'P') | ||
479 | r = dm_create_persistent(&s->store, chunk_size); | ||
480 | else | ||
481 | r = dm_create_transient(&s->store, s, blocksize); | ||
482 | |||
483 | if (r) { | ||
484 | ti->error = "Couldn't create exception store"; | ||
485 | r = -EINVAL; | ||
486 | goto bad4; | ||
487 | } | ||
488 | |||
489 | r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); | ||
490 | if (r) { | ||
491 | ti->error = "Could not create kcopyd client"; | ||
492 | goto bad5; | ||
493 | } | ||
494 | |||
495 | /* Add snapshot to the list of snapshots for this origin */ | ||
496 | if (register_snapshot(s)) { | ||
497 | r = -EINVAL; | ||
498 | ti->error = "Cannot register snapshot origin"; | ||
499 | goto bad6; | ||
500 | } | ||
501 | |||
502 | ti->private = s; | ||
503 | ti->split_io = chunk_size; | ||
504 | |||
505 | return 0; | ||
506 | |||
507 | bad6: | ||
508 | kcopyd_client_destroy(s->kcopyd_client); | ||
509 | |||
510 | bad5: | ||
511 | s->store.destroy(&s->store); | ||
512 | |||
513 | bad4: | ||
514 | exit_exception_table(&s->pending, pending_cache); | ||
515 | exit_exception_table(&s->complete, exception_cache); | ||
516 | |||
517 | bad3: | ||
518 | dm_put_device(ti, s->cow); | ||
519 | dm_put_device(ti, s->origin); | ||
520 | |||
521 | bad2: | ||
522 | kfree(s); | ||
523 | |||
524 | bad1: | ||
525 | return r; | ||
526 | } | ||
527 | |||
528 | static void snapshot_dtr(struct dm_target *ti) | ||
529 | { | ||
530 | struct dm_snapshot *s = (struct dm_snapshot *) ti->private; | ||
531 | |||
532 | unregister_snapshot(s); | ||
533 | |||
534 | exit_exception_table(&s->pending, pending_cache); | ||
535 | exit_exception_table(&s->complete, exception_cache); | ||
536 | |||
537 | /* Deallocate memory used */ | ||
538 | s->store.destroy(&s->store); | ||
539 | |||
540 | dm_put_device(ti, s->origin); | ||
541 | dm_put_device(ti, s->cow); | ||
542 | kcopyd_client_destroy(s->kcopyd_client); | ||
543 | kfree(s); | ||
544 | } | ||
545 | |||
546 | /* | ||
547 | * Flush a list of buffers. | ||
548 | */ | ||
549 | static void flush_bios(struct bio *bio) | ||
550 | { | ||
551 | struct bio *n; | ||
552 | |||
553 | while (bio) { | ||
554 | n = bio->bi_next; | ||
555 | bio->bi_next = NULL; | ||
556 | generic_make_request(bio); | ||
557 | bio = n; | ||
558 | } | ||
559 | } | ||
560 | |||
561 | /* | ||
562 | * Error a list of buffers. | ||
563 | */ | ||
564 | static void error_bios(struct bio *bio) | ||
565 | { | ||
566 | struct bio *n; | ||
567 | |||
568 | while (bio) { | ||
569 | n = bio->bi_next; | ||
570 | bio->bi_next = NULL; | ||
571 | bio_io_error(bio, bio->bi_size); | ||
572 | bio = n; | ||
573 | } | ||
574 | } | ||
575 | |||
576 | static struct bio *__flush_bios(struct pending_exception *pe) | ||
577 | { | ||
578 | struct pending_exception *sibling; | ||
579 | |||
580 | if (list_empty(&pe->siblings)) | ||
581 | return bio_list_get(&pe->origin_bios); | ||
582 | |||
583 | sibling = list_entry(pe->siblings.next, | ||
584 | struct pending_exception, siblings); | ||
585 | |||
586 | list_del(&pe->siblings); | ||
587 | |||
588 | /* This is fine as long as kcopyd is single-threaded. If kcopyd | ||
589 | * becomes multi-threaded, we'll need some locking here. | ||
590 | */ | ||
591 | bio_list_merge(&sibling->origin_bios, &pe->origin_bios); | ||
592 | |||
593 | return NULL; | ||
594 | } | ||
595 | |||
596 | static void pending_complete(struct pending_exception *pe, int success) | ||
597 | { | ||
598 | struct exception *e; | ||
599 | struct dm_snapshot *s = pe->snap; | ||
600 | struct bio *flush = NULL; | ||
601 | |||
602 | if (success) { | ||
603 | e = alloc_exception(); | ||
604 | if (!e) { | ||
605 | DMWARN("Unable to allocate exception."); | ||
606 | down_write(&s->lock); | ||
607 | s->store.drop_snapshot(&s->store); | ||
608 | s->valid = 0; | ||
609 | flush = __flush_bios(pe); | ||
610 | up_write(&s->lock); | ||
611 | |||
612 | error_bios(bio_list_get(&pe->snapshot_bios)); | ||
613 | goto out; | ||
614 | } | ||
615 | *e = pe->e; | ||
616 | |||
617 | /* | ||
618 | * Add a proper exception, and remove the | ||
619 | * in-flight exception from the list. | ||
620 | */ | ||
621 | down_write(&s->lock); | ||
622 | insert_exception(&s->complete, e); | ||
623 | remove_exception(&pe->e); | ||
624 | flush = __flush_bios(pe); | ||
625 | |||
626 | /* Submit any pending write bios */ | ||
627 | up_write(&s->lock); | ||
628 | |||
629 | flush_bios(bio_list_get(&pe->snapshot_bios)); | ||
630 | } else { | ||
631 | /* Read/write error - snapshot is unusable */ | ||
632 | down_write(&s->lock); | ||
633 | if (s->valid) | ||
634 | DMERR("Error reading/writing snapshot"); | ||
635 | s->store.drop_snapshot(&s->store); | ||
636 | s->valid = 0; | ||
637 | remove_exception(&pe->e); | ||
638 | flush = __flush_bios(pe); | ||
639 | up_write(&s->lock); | ||
640 | |||
641 | error_bios(bio_list_get(&pe->snapshot_bios)); | ||
642 | |||
643 | dm_table_event(s->table); | ||
644 | } | ||
645 | |||
646 | out: | ||
647 | free_pending_exception(pe); | ||
648 | |||
649 | if (flush) | ||
650 | flush_bios(flush); | ||
651 | } | ||
652 | |||
653 | static void commit_callback(void *context, int success) | ||
654 | { | ||
655 | struct pending_exception *pe = (struct pending_exception *) context; | ||
656 | pending_complete(pe, success); | ||
657 | } | ||
658 | |||
659 | /* | ||
660 | * Called when the copy I/O has finished. kcopyd actually runs | ||
661 | * this code so don't block. | ||
662 | */ | ||
663 | static void copy_callback(int read_err, unsigned int write_err, void *context) | ||
664 | { | ||
665 | struct pending_exception *pe = (struct pending_exception *) context; | ||
666 | struct dm_snapshot *s = pe->snap; | ||
667 | |||
668 | if (read_err || write_err) | ||
669 | pending_complete(pe, 0); | ||
670 | |||
671 | else | ||
672 | /* Update the metadata if we are persistent */ | ||
673 | s->store.commit_exception(&s->store, &pe->e, commit_callback, | ||
674 | pe); | ||
675 | } | ||
676 | |||
677 | /* | ||
678 | * Dispatches the copy operation to kcopyd. | ||
679 | */ | ||
680 | static inline void start_copy(struct pending_exception *pe) | ||
681 | { | ||
682 | struct dm_snapshot *s = pe->snap; | ||
683 | struct io_region src, dest; | ||
684 | struct block_device *bdev = s->origin->bdev; | ||
685 | sector_t dev_size; | ||
686 | |||
687 | dev_size = get_dev_size(bdev); | ||
688 | |||
689 | src.bdev = bdev; | ||
690 | src.sector = chunk_to_sector(s, pe->e.old_chunk); | ||
691 | src.count = min(s->chunk_size, dev_size - src.sector); | ||
692 | |||
693 | dest.bdev = s->cow->bdev; | ||
694 | dest.sector = chunk_to_sector(s, pe->e.new_chunk); | ||
695 | dest.count = src.count; | ||
696 | |||
697 | /* Hand over to kcopyd */ | ||
698 | kcopyd_copy(s->kcopyd_client, | ||
699 | &src, 1, &dest, 0, copy_callback, pe); | ||
700 | } | ||
701 | |||
702 | /* | ||
703 | * Looks to see if this snapshot already has a pending exception | ||
704 | * for this chunk, otherwise it allocates a new one and inserts | ||
705 | * it into the pending table. | ||
706 | * | ||
707 | * NOTE: a write lock must be held on snap->lock before calling | ||
708 | * this. | ||
709 | */ | ||
710 | static struct pending_exception * | ||
711 | __find_pending_exception(struct dm_snapshot *s, struct bio *bio) | ||
712 | { | ||
713 | struct exception *e; | ||
714 | struct pending_exception *pe; | ||
715 | chunk_t chunk = sector_to_chunk(s, bio->bi_sector); | ||
716 | |||
717 | /* | ||
718 | * Is there a pending exception for this already ? | ||
719 | */ | ||
720 | e = lookup_exception(&s->pending, chunk); | ||
721 | if (e) { | ||
722 | /* cast the exception to a pending exception */ | ||
723 | pe = container_of(e, struct pending_exception, e); | ||
724 | |||
725 | } else { | ||
726 | /* | ||
727 | * Create a new pending exception, we don't want | ||
728 | * to hold the lock while we do this. | ||
729 | */ | ||
730 | up_write(&s->lock); | ||
731 | pe = alloc_pending_exception(); | ||
732 | down_write(&s->lock); | ||
733 | |||
734 | e = lookup_exception(&s->pending, chunk); | ||
735 | if (e) { | ||
736 | free_pending_exception(pe); | ||
737 | pe = container_of(e, struct pending_exception, e); | ||
738 | } else { | ||
739 | pe->e.old_chunk = chunk; | ||
740 | bio_list_init(&pe->origin_bios); | ||
741 | bio_list_init(&pe->snapshot_bios); | ||
742 | INIT_LIST_HEAD(&pe->siblings); | ||
743 | pe->snap = s; | ||
744 | pe->started = 0; | ||
745 | |||
746 | if (s->store.prepare_exception(&s->store, &pe->e)) { | ||
747 | free_pending_exception(pe); | ||
748 | s->valid = 0; | ||
749 | return NULL; | ||
750 | } | ||
751 | |||
752 | insert_exception(&s->pending, &pe->e); | ||
753 | } | ||
754 | } | ||
755 | |||
756 | return pe; | ||
757 | } | ||
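The drop-the-lock / allocate / retake / recheck dance above is easy to get wrong, so here is a minimal userspace sketch of the same shape using a pthread rwlock. The table, the chunk keying and every name below are invented for the illustration; only the locking pattern mirrors __find_pending_exception().

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct pending { int chunk; };

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static struct pending *table[16];	/* toy "pending" table, keyed by chunk */

/* Caller holds the write lock on entry and still holds it on return. */
static struct pending *find_pending(int chunk)
{
	struct pending *pe = table[chunk % 16];

	if (pe)
		return pe;		/* already being handled */

	/* The allocation may block, so drop the lock around it. */
	pthread_rwlock_unlock(&lock);
	pe = malloc(sizeof(*pe));
	pthread_rwlock_wrlock(&lock);

	if (!pe)
		return NULL;

	/* A racing writer may have installed one while the lock was dropped. */
	if (table[chunk % 16]) {
		free(pe);
		return table[chunk % 16];
	}

	pe->chunk = chunk;
	table[chunk % 16] = pe;
	return pe;
}

int main(void)
{
	pthread_rwlock_wrlock(&lock);
	printf("first : %p\n", (void *) find_pending(3));
	printf("second: %p\n", (void *) find_pending(3));	/* same pointer */
	pthread_rwlock_unlock(&lock);
	return 0;
}

The second lookup after re-taking the lock is what makes the pattern safe: if another writer installed its own entry while the lock was dropped, the freshly allocated one is simply discarded.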
758 | |||
759 | static inline void remap_exception(struct dm_snapshot *s, struct exception *e, | ||
760 | struct bio *bio) | ||
761 | { | ||
762 | bio->bi_bdev = s->cow->bdev; | ||
763 | bio->bi_sector = chunk_to_sector(s, e->new_chunk) + | ||
764 | (bio->bi_sector & s->chunk_mask); | ||
765 | } | ||
766 | |||
767 | static int snapshot_map(struct dm_target *ti, struct bio *bio, | ||
768 | union map_info *map_context) | ||
769 | { | ||
770 | struct exception *e; | ||
771 | struct dm_snapshot *s = (struct dm_snapshot *) ti->private; | ||
772 | int r = 1; | ||
773 | chunk_t chunk; | ||
774 | struct pending_exception *pe; | ||
775 | |||
776 | chunk = sector_to_chunk(s, bio->bi_sector); | ||
777 | |||
778 | /* Full snapshots are not usable */ | ||
779 | if (!s->valid) | ||
780 | return -1; | ||
781 | |||
782 | /* | ||
783 | * Write to snapshot - higher level takes care of RW/RO | ||
784 | * flags so we should only get this if we are | ||
785 | * writeable. | ||
786 | */ | ||
787 | if (bio_rw(bio) == WRITE) { | ||
788 | |||
789 | /* FIXME: should only take write lock if we need | ||
790 | * to copy an exception */ | ||
791 | down_write(&s->lock); | ||
792 | |||
793 | /* If the block is already remapped - use that, else remap it */ | ||
794 | e = lookup_exception(&s->complete, chunk); | ||
795 | if (e) { | ||
796 | remap_exception(s, e, bio); | ||
797 | up_write(&s->lock); | ||
798 | |||
799 | } else { | ||
800 | pe = __find_pending_exception(s, bio); | ||
801 | |||
802 | if (!pe) { | ||
803 | if (s->store.drop_snapshot) | ||
804 | s->store.drop_snapshot(&s->store); | ||
805 | s->valid = 0; | ||
806 | r = -EIO; | ||
807 | up_write(&s->lock); | ||
808 | } else { | ||
809 | remap_exception(s, &pe->e, bio); | ||
810 | bio_list_add(&pe->snapshot_bios, bio); | ||
811 | |||
812 | if (!pe->started) { | ||
813 | /* this is protected by snap->lock */ | ||
814 | pe->started = 1; | ||
815 | up_write(&s->lock); | ||
816 | start_copy(pe); | ||
817 | } else | ||
818 | up_write(&s->lock); | ||
819 | r = 0; | ||
820 | } | ||
821 | } | ||
822 | |||
823 | } else { | ||
824 | /* | ||
825 | * FIXME: this read path scares me because we | ||
826 | * always use the origin when we have a pending | ||
827 | * exception. However I can't think of a | ||
828 | * situation where this is wrong - ejt. | ||
829 | */ | ||
830 | |||
831 | /* Do reads */ | ||
832 | down_read(&s->lock); | ||
833 | |||
834 | /* See if it has been remapped */ | ||
835 | e = lookup_exception(&s->complete, chunk); | ||
836 | if (e) | ||
837 | remap_exception(s, e, bio); | ||
838 | else | ||
839 | bio->bi_bdev = s->origin->bdev; | ||
840 | |||
841 | up_read(&s->lock); | ||
842 | } | ||
843 | |||
844 | return r; | ||
845 | } | ||
846 | |||
847 | static void snapshot_resume(struct dm_target *ti) | ||
848 | { | ||
849 | struct dm_snapshot *s = (struct dm_snapshot *) ti->private; | ||
850 | |||
851 | if (s->have_metadata) | ||
852 | return; | ||
853 | |||
854 | if (s->store.read_metadata(&s->store)) { | ||
855 | down_write(&s->lock); | ||
856 | s->valid = 0; | ||
857 | up_write(&s->lock); | ||
858 | } | ||
859 | |||
860 | s->have_metadata = 1; | ||
861 | } | ||
862 | |||
863 | static int snapshot_status(struct dm_target *ti, status_type_t type, | ||
864 | char *result, unsigned int maxlen) | ||
865 | { | ||
866 | struct dm_snapshot *snap = (struct dm_snapshot *) ti->private; | ||
867 | |||
868 | switch (type) { | ||
869 | case STATUSTYPE_INFO: | ||
870 | if (!snap->valid) | ||
871 | snprintf(result, maxlen, "Invalid"); | ||
872 | else { | ||
873 | if (snap->store.fraction_full) { | ||
874 | sector_t numerator, denominator; | ||
875 | snap->store.fraction_full(&snap->store, | ||
876 | &numerator, | ||
877 | &denominator); | ||
878 | snprintf(result, maxlen, | ||
879 | SECTOR_FORMAT "/" SECTOR_FORMAT, | ||
880 | numerator, denominator); | ||
881 | } | ||
882 | else | ||
883 | snprintf(result, maxlen, "Unknown"); | ||
884 | } | ||
885 | break; | ||
886 | |||
887 | case STATUSTYPE_TABLE: | ||
888 | /* | ||
889 | * kdevname returns a static pointer so we need | ||
890 | * to make private copies if the output is to | ||
891 | * make sense. | ||
892 | */ | ||
893 | snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT, | ||
894 | snap->origin->name, snap->cow->name, | ||
895 | snap->type, snap->chunk_size); | ||
896 | break; | ||
897 | } | ||
898 | |||
899 | return 0; | ||
900 | } | ||
901 | |||
902 | /*----------------------------------------------------------------- | ||
903 | * Origin methods | ||
904 | *---------------------------------------------------------------*/ | ||
905 | static void list_merge(struct list_head *l1, struct list_head *l2) | ||
906 | { | ||
907 | struct list_head *l1_n, *l2_p; | ||
908 | |||
909 | l1_n = l1->next; | ||
910 | l2_p = l2->prev; | ||
911 | |||
912 | l1->next = l2; | ||
913 | l2->prev = l1; | ||
914 | |||
915 | l2_p->next = l1_n; | ||
916 | l1_n->prev = l2_p; | ||
917 | } | ||
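list_merge() splices two circular doubly-linked lists into a single ring; it is how the sibling pending exceptions for one origin write get chained together in __origin_write() below. A userspace toy with a hand-rolled node type (all names invented) shows the effect of the four pointer assignments:

#include <stdio.h>

struct node { int val; struct node *next, *prev; };

static void ring_init(struct node *n) { n->next = n->prev = n; }

static void ring_add(struct node *ring, struct node *n)
{
	n->next = ring;
	n->prev = ring->prev;
	ring->prev->next = n;
	ring->prev = n;
}

/* Same pointer surgery as list_merge(l1, l2). */
static void ring_merge(struct node *l1, struct node *l2)
{
	struct node *l1_n = l1->next, *l2_p = l2->prev;

	l1->next = l2;
	l2->prev = l1;
	l2_p->next = l1_n;
	l1_n->prev = l2_p;
}

int main(void)
{
	struct node a = { 1 }, b = { 2 }, c = { 3 }, d = { 4 };
	struct node *p;

	ring_init(&a); ring_add(&a, &b);	/* ring 1: 1 <-> 2 */
	ring_init(&c); ring_add(&c, &d);	/* ring 2: 3 <-> 4 */

	ring_merge(&a, &c);

	p = &a;
	do {					/* prints 1 3 4 2 */
		printf("%d ", p->val);
		p = p->next;
	} while (p != &a);
	printf("\n");
	return 0;
}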
918 | |||
919 | static int __origin_write(struct list_head *snapshots, struct bio *bio) | ||
920 | { | ||
921 | int r = 1, first = 1; | ||
922 | struct dm_snapshot *snap; | ||
923 | struct exception *e; | ||
924 | struct pending_exception *pe, *last = NULL; | ||
925 | chunk_t chunk; | ||
926 | |||
927 | /* Do all the snapshots on this origin */ | ||
928 | list_for_each_entry (snap, snapshots, list) { | ||
929 | |||
930 | /* Only deal with valid snapshots */ | ||
931 | if (!snap->valid) | ||
932 | continue; | ||
933 | |||
934 | down_write(&snap->lock); | ||
935 | |||
936 | /* | ||
937 | * Remember, different snapshots can have | ||
938 | * different chunk sizes. | ||
939 | */ | ||
940 | chunk = sector_to_chunk(snap, bio->bi_sector); | ||
941 | |||
942 | /* | ||
943 | * Check exception table to see if block | ||
944 | * is already remapped in this snapshot | ||
945 | * and trigger an exception if not. | ||
946 | */ | ||
947 | e = lookup_exception(&snap->complete, chunk); | ||
948 | if (!e) { | ||
949 | pe = __find_pending_exception(snap, bio); | ||
950 | if (!pe) { | ||
951 | snap->store.drop_snapshot(&snap->store); | ||
952 | snap->valid = 0; | ||
953 | |||
954 | } else { | ||
955 | if (last) | ||
956 | list_merge(&pe->siblings, | ||
957 | &last->siblings); | ||
958 | |||
959 | last = pe; | ||
960 | r = 0; | ||
961 | } | ||
962 | } | ||
963 | |||
964 | up_write(&snap->lock); | ||
965 | } | ||
966 | |||
967 | /* | ||
968 | * Now that we have a complete pe list we can start the copying. | ||
969 | */ | ||
970 | if (last) { | ||
971 | pe = last; | ||
972 | do { | ||
973 | down_write(&pe->snap->lock); | ||
974 | if (first) | ||
975 | bio_list_add(&pe->origin_bios, bio); | ||
976 | if (!pe->started) { | ||
977 | pe->started = 1; | ||
978 | up_write(&pe->snap->lock); | ||
979 | start_copy(pe); | ||
980 | } else | ||
981 | up_write(&pe->snap->lock); | ||
982 | first = 0; | ||
983 | pe = list_entry(pe->siblings.next, | ||
984 | struct pending_exception, siblings); | ||
985 | |||
986 | } while (pe != last); | ||
987 | } | ||
988 | |||
989 | return r; | ||
990 | } | ||
991 | |||
992 | /* | ||
993 | * Called on a write from the origin driver. | ||
994 | */ | ||
995 | static int do_origin(struct dm_dev *origin, struct bio *bio) | ||
996 | { | ||
997 | struct origin *o; | ||
998 | int r = 1; | ||
999 | |||
1000 | down_read(&_origins_lock); | ||
1001 | o = __lookup_origin(origin->bdev); | ||
1002 | if (o) | ||
1003 | r = __origin_write(&o->snapshots, bio); | ||
1004 | up_read(&_origins_lock); | ||
1005 | |||
1006 | return r; | ||
1007 | } | ||
1008 | |||
1009 | /* | ||
1010 | * Origin: maps a linear range of a device, with hooks for snapshotting. | ||
1011 | */ | ||
1012 | |||
1013 | /* | ||
1014 | * Construct an origin mapping: <dev_path> | ||
1015 | * The context for an origin is merely a 'struct dm_dev *' | ||
1016 | * pointing to the real device. | ||
1017 | */ | ||
1018 | static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
1019 | { | ||
1020 | int r; | ||
1021 | struct dm_dev *dev; | ||
1022 | |||
1023 | if (argc != 1) { | ||
1024 | ti->error = "dm-origin: incorrect number of arguments"; | ||
1025 | return -EINVAL; | ||
1026 | } | ||
1027 | |||
1028 | r = dm_get_device(ti, argv[0], 0, ti->len, | ||
1029 | dm_table_get_mode(ti->table), &dev); | ||
1030 | if (r) { | ||
1031 | ti->error = "Cannot get target device"; | ||
1032 | return r; | ||
1033 | } | ||
1034 | |||
1035 | ti->private = dev; | ||
1036 | return 0; | ||
1037 | } | ||
1038 | |||
1039 | static void origin_dtr(struct dm_target *ti) | ||
1040 | { | ||
1041 | struct dm_dev *dev = (struct dm_dev *) ti->private; | ||
1042 | dm_put_device(ti, dev); | ||
1043 | } | ||
1044 | |||
1045 | static int origin_map(struct dm_target *ti, struct bio *bio, | ||
1046 | union map_info *map_context) | ||
1047 | { | ||
1048 | struct dm_dev *dev = (struct dm_dev *) ti->private; | ||
1049 | bio->bi_bdev = dev->bdev; | ||
1050 | |||
1051 | /* Only tell snapshots if this is a write */ | ||
1052 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1; | ||
1053 | } | ||
1054 | |||
1055 | #define min_not_zero(l, r) ((l) == 0 ? (r) : ((r) == 0 ? (l) : min((l), (r)))) | ||
1056 | |||
1057 | /* | ||
1058 | * Set the target "split_io" field to the minimum of all the snapshots' | ||
1059 | * chunk sizes. | ||
1060 | */ | ||
1061 | static void origin_resume(struct dm_target *ti) | ||
1062 | { | ||
1063 | struct dm_dev *dev = (struct dm_dev *) ti->private; | ||
1064 | struct dm_snapshot *snap; | ||
1065 | struct origin *o; | ||
1066 | chunk_t chunk_size = 0; | ||
1067 | |||
1068 | down_read(&_origins_lock); | ||
1069 | o = __lookup_origin(dev->bdev); | ||
1070 | if (o) | ||
1071 | list_for_each_entry (snap, &o->snapshots, list) | ||
1072 | chunk_size = min_not_zero(chunk_size, snap->chunk_size); | ||
1073 | up_read(&_origins_lock); | ||
1074 | |||
1075 | ti->split_io = chunk_size; | ||
1076 | } | ||
1077 | |||
1078 | static int origin_status(struct dm_target *ti, status_type_t type, char *result, | ||
1079 | unsigned int maxlen) | ||
1080 | { | ||
1081 | struct dm_dev *dev = (struct dm_dev *) ti->private; | ||
1082 | |||
1083 | switch (type) { | ||
1084 | case STATUSTYPE_INFO: | ||
1085 | result[0] = '\0'; | ||
1086 | break; | ||
1087 | |||
1088 | case STATUSTYPE_TABLE: | ||
1089 | snprintf(result, maxlen, "%s", dev->name); | ||
1090 | break; | ||
1091 | } | ||
1092 | |||
1093 | return 0; | ||
1094 | } | ||
1095 | |||
1096 | static struct target_type origin_target = { | ||
1097 | .name = "snapshot-origin", | ||
1098 | .version = {1, 0, 1}, | ||
1099 | .module = THIS_MODULE, | ||
1100 | .ctr = origin_ctr, | ||
1101 | .dtr = origin_dtr, | ||
1102 | .map = origin_map, | ||
1103 | .resume = origin_resume, | ||
1104 | .status = origin_status, | ||
1105 | }; | ||
1106 | |||
1107 | static struct target_type snapshot_target = { | ||
1108 | .name = "snapshot", | ||
1109 | .version = {1, 0, 1}, | ||
1110 | .module = THIS_MODULE, | ||
1111 | .ctr = snapshot_ctr, | ||
1112 | .dtr = snapshot_dtr, | ||
1113 | .map = snapshot_map, | ||
1114 | .resume = snapshot_resume, | ||
1115 | .status = snapshot_status, | ||
1116 | }; | ||
1117 | |||
1118 | static int __init dm_snapshot_init(void) | ||
1119 | { | ||
1120 | int r; | ||
1121 | |||
1122 | r = dm_register_target(&snapshot_target); | ||
1123 | if (r) { | ||
1124 | DMERR("snapshot target register failed %d", r); | ||
1125 | return r; | ||
1126 | } | ||
1127 | |||
1128 | r = dm_register_target(&origin_target); | ||
1129 | if (r < 0) { | ||
1130 | DMERR("Device mapper: Origin: register failed %d\n", r); | ||
1131 | goto bad1; | ||
1132 | } | ||
1133 | |||
1134 | r = init_origin_hash(); | ||
1135 | if (r) { | ||
1136 | DMERR("init_origin_hash failed."); | ||
1137 | goto bad2; | ||
1138 | } | ||
1139 | |||
1140 | exception_cache = kmem_cache_create("dm-snapshot-ex", | ||
1141 | sizeof(struct exception), | ||
1142 | __alignof__(struct exception), | ||
1143 | 0, NULL, NULL); | ||
1144 | if (!exception_cache) { | ||
1145 | DMERR("Couldn't create exception cache."); | ||
1146 | r = -ENOMEM; | ||
1147 | goto bad3; | ||
1148 | } | ||
1149 | |||
1150 | pending_cache = | ||
1151 | kmem_cache_create("dm-snapshot-in", | ||
1152 | sizeof(struct pending_exception), | ||
1153 | __alignof__(struct pending_exception), | ||
1154 | 0, NULL, NULL); | ||
1155 | if (!pending_cache) { | ||
1156 | DMERR("Couldn't create pending cache."); | ||
1157 | r = -ENOMEM; | ||
1158 | goto bad4; | ||
1159 | } | ||
1160 | |||
1161 | pending_pool = mempool_create(128, mempool_alloc_slab, | ||
1162 | mempool_free_slab, pending_cache); | ||
1163 | if (!pending_pool) { | ||
1164 | DMERR("Couldn't create pending pool."); | ||
1165 | r = -ENOMEM; | ||
1166 | goto bad5; | ||
1167 | } | ||
1168 | |||
1169 | return 0; | ||
1170 | |||
1171 | bad5: | ||
1172 | kmem_cache_destroy(pending_cache); | ||
1173 | bad4: | ||
1174 | kmem_cache_destroy(exception_cache); | ||
1175 | bad3: | ||
1176 | exit_origin_hash(); | ||
1177 | bad2: | ||
1178 | dm_unregister_target(&origin_target); | ||
1179 | bad1: | ||
1180 | dm_unregister_target(&snapshot_target); | ||
1181 | return r; | ||
1182 | } | ||
1183 | |||
1184 | static void __exit dm_snapshot_exit(void) | ||
1185 | { | ||
1186 | int r; | ||
1187 | |||
1188 | r = dm_unregister_target(&snapshot_target); | ||
1189 | if (r) | ||
1190 | DMERR("snapshot unregister failed %d", r); | ||
1191 | |||
1192 | r = dm_unregister_target(&origin_target); | ||
1193 | if (r) | ||
1194 | DMERR("origin unregister failed %d", r); | ||
1195 | |||
1196 | exit_origin_hash(); | ||
1197 | mempool_destroy(pending_pool); | ||
1198 | kmem_cache_destroy(pending_cache); | ||
1199 | kmem_cache_destroy(exception_cache); | ||
1200 | } | ||
1201 | |||
1202 | /* Module hooks */ | ||
1203 | module_init(dm_snapshot_init); | ||
1204 | module_exit(dm_snapshot_exit); | ||
1205 | |||
1206 | MODULE_DESCRIPTION(DM_NAME " snapshot target"); | ||
1207 | MODULE_AUTHOR("Joe Thornber"); | ||
1208 | MODULE_LICENSE("GPL"); | ||
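Before moving on to the header, here is a self-contained toy of the copy-on-write behaviour the snapshot and origin targets implement together: the first write to an origin chunk copies the old data aside, and snapshot reads prefer the copied chunk when one exists. The chunk size, array sizes and function names below are all made up; the kernel code naturally works on bios and real block devices rather than byte arrays.

#include <stdio.h>
#include <string.h>

#define CHUNK_SIZE 4				/* bytes per chunk in this toy */
#define NCHUNKS    4

static char origin[] = "AAAABBBBCCCCDDDD";	/* 4 chunks of 4 bytes */
static char cow[NCHUNKS][CHUNK_SIZE];		/* the COW store */
static int  copied[NCHUNKS];			/* "completed exception" flags */

static void write_origin(int offset, char byte)
{
	int chunk = offset / CHUNK_SIZE;

	/* First write to this chunk: copy the old data aside first. */
	if (!copied[chunk]) {
		memcpy(cow[chunk], origin + chunk * CHUNK_SIZE, CHUNK_SIZE);
		copied[chunk] = 1;
	}
	origin[offset] = byte;
}

static char read_snapshot(int offset)
{
	int chunk = offset / CHUNK_SIZE;

	/* Remapped chunks come from the COW store, the rest from the origin. */
	return copied[chunk] ? cow[chunk][offset % CHUNK_SIZE] : origin[offset];
}

int main(void)
{
	write_origin(5, 'x');				/* dirties chunk 1 */
	printf("origin[5]   = %c\n", origin[5]);	/* x */
	printf("snapshot[5] = %c\n", read_snapshot(5));	/* B, the old data */
	printf("snapshot[0] = %c\n", read_snapshot(0));	/* A, never copied */
	return 0;
}

write_origin() plays the role of do_origin()/__origin_write() triggering the exception, and read_snapshot() mirrors the read branch of snapshot_map().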
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h new file mode 100644 index 000000000000..375aa24d4d7d --- /dev/null +++ b/drivers/md/dm-snap.h | |||
@@ -0,0 +1,161 @@ | |||
1 | /* | ||
2 | * dm-snap.h | ||
3 | * | ||
4 | * Copyright (C) 2001-2002 Sistina Software (UK) Limited. | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | */ | ||
8 | |||
9 | #ifndef DM_SNAPSHOT_H | ||
10 | #define DM_SNAPSHOT_H | ||
11 | |||
12 | #include "dm.h" | ||
13 | #include <linux/blkdev.h> | ||
14 | |||
15 | struct exception_table { | ||
16 | uint32_t hash_mask; | ||
17 | struct list_head *table; | ||
18 | }; | ||
19 | |||
20 | /* | ||
21 | * The snapshot code deals with largish chunks of the disk at a | ||
22 | * time. Typically 64k - 256k. | ||
23 | */ | ||
24 | /* FIXME: can we get away with limiting these to a uint32_t ? */ | ||
25 | typedef sector_t chunk_t; | ||
26 | |||
27 | /* | ||
28 | * An exception is used where an old chunk of data has been | ||
29 | * replaced by a new one. | ||
30 | */ | ||
31 | struct exception { | ||
32 | struct list_head hash_list; | ||
33 | |||
34 | chunk_t old_chunk; | ||
35 | chunk_t new_chunk; | ||
36 | }; | ||
37 | |||
38 | /* | ||
39 | * Abstraction to handle the meta/layout of exception stores (the | ||
40 | * COW device). | ||
41 | */ | ||
42 | struct exception_store { | ||
43 | |||
44 | /* | ||
45 | * Destroys this object when you've finished with it. | ||
46 | */ | ||
47 | void (*destroy) (struct exception_store *store); | ||
48 | |||
49 | /* | ||
50 | * The target shouldn't read the COW device until this is | ||
51 | * called. | ||
52 | */ | ||
53 | int (*read_metadata) (struct exception_store *store); | ||
54 | |||
55 | /* | ||
56 | * Find somewhere to store the next exception. | ||
57 | */ | ||
58 | int (*prepare_exception) (struct exception_store *store, | ||
59 | struct exception *e); | ||
60 | |||
61 | /* | ||
62 | * Update the metadata with this exception. | ||
63 | */ | ||
64 | void (*commit_exception) (struct exception_store *store, | ||
65 | struct exception *e, | ||
66 | void (*callback) (void *, int success), | ||
67 | void *callback_context); | ||
68 | |||
69 | /* | ||
70 | * The snapshot is invalid, note this in the metadata. | ||
71 | */ | ||
72 | void (*drop_snapshot) (struct exception_store *store); | ||
73 | |||
74 | /* | ||
75 | * Return how full the snapshot is. | ||
76 | */ | ||
77 | void (*fraction_full) (struct exception_store *store, | ||
78 | sector_t *numerator, | ||
79 | sector_t *denominator); | ||
80 | |||
81 | struct dm_snapshot *snap; | ||
82 | void *context; | ||
83 | }; | ||
84 | |||
85 | struct dm_snapshot { | ||
86 | struct rw_semaphore lock; | ||
87 | struct dm_table *table; | ||
88 | |||
89 | struct dm_dev *origin; | ||
90 | struct dm_dev *cow; | ||
91 | |||
92 | /* List of snapshots per Origin */ | ||
93 | struct list_head list; | ||
94 | |||
95 | /* Size of data blocks saved - must be a power of 2 */ | ||
96 | chunk_t chunk_size; | ||
97 | chunk_t chunk_mask; | ||
98 | chunk_t chunk_shift; | ||
99 | |||
100 | /* You can't use a snapshot if this is 0 (e.g. if full) */ | ||
101 | int valid; | ||
102 | int have_metadata; | ||
103 | |||
104 | /* Used for display of table */ | ||
105 | char type; | ||
106 | |||
107 | /* The last percentage we notified */ | ||
108 | int last_percent; | ||
109 | |||
110 | struct exception_table pending; | ||
111 | struct exception_table complete; | ||
112 | |||
113 | /* The on disk metadata handler */ | ||
114 | struct exception_store store; | ||
115 | |||
116 | struct kcopyd_client *kcopyd_client; | ||
117 | }; | ||
118 | |||
119 | /* | ||
120 | * Used by the exception stores to load exceptions when | ||
121 | * initialising. | ||
122 | */ | ||
123 | int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new); | ||
124 | |||
125 | /* | ||
126 | * Constructor and destructor for the default persistent | ||
127 | * store. | ||
128 | */ | ||
129 | int dm_create_persistent(struct exception_store *store, uint32_t chunk_size); | ||
130 | |||
131 | int dm_create_transient(struct exception_store *store, | ||
132 | struct dm_snapshot *s, int blocksize); | ||
133 | |||
134 | /* | ||
135 | * Return the number of sectors in the device. | ||
136 | */ | ||
137 | static inline sector_t get_dev_size(struct block_device *bdev) | ||
138 | { | ||
139 | return bdev->bd_inode->i_size >> SECTOR_SHIFT; | ||
140 | } | ||
141 | |||
142 | static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector) | ||
143 | { | ||
144 | return (sector & ~s->chunk_mask) >> s->chunk_shift; | ||
145 | } | ||
146 | |||
147 | static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk) | ||
148 | { | ||
149 | return chunk << s->chunk_shift; | ||
150 | } | ||
151 | |||
152 | static inline int bdev_equal(struct block_device *lhs, struct block_device *rhs) | ||
153 | { | ||
154 | /* | ||
155 | * There is only ever one instance of a particular block | ||
156 | * device so we can compare pointers safely. | ||
157 | */ | ||
158 | return lhs == rhs; | ||
159 | } | ||
160 | |||
161 | #endif | ||
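A quick round-trip through the chunk arithmetic above, assuming a 16-sector (8 KiB) chunk so that chunk_shift = 4 and chunk_mask = 15; the constants and the standalone main() exist only for the illustration:

#include <stdio.h>

#define CHUNK_SIZE  16ULL		/* sectors; must be a power of 2 */
#define CHUNK_SHIFT 4
#define CHUNK_MASK  (CHUNK_SIZE - 1)

static unsigned long long sector_to_chunk(unsigned long long sector)
{
	return (sector & ~CHUNK_MASK) >> CHUNK_SHIFT;
}

static unsigned long long chunk_to_sector(unsigned long long chunk)
{
	return chunk << CHUNK_SHIFT;
}

int main(void)
{
	unsigned long long sector = 1000003;
	unsigned long long chunk = sector_to_chunk(sector);

	/* remap_exception() rebuilds the final sector the same way:
	 * start of the (new) chunk plus the offset within the chunk. */
	printf("sector %llu -> chunk %llu (starts at %llu, offset %llu)\n",
	       sector, chunk, chunk_to_sector(chunk), sector & CHUNK_MASK);
	return 0;
}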
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c new file mode 100644 index 000000000000..ab89278a56bf --- /dev/null +++ b/drivers/md/dm-stripe.c | |||
@@ -0,0 +1,234 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001-2003 Sistina Software (UK) Limited. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm.h" | ||
8 | |||
9 | #include <linux/module.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/blkdev.h> | ||
12 | #include <linux/bio.h> | ||
13 | #include <linux/slab.h> | ||
14 | |||
15 | struct stripe { | ||
16 | struct dm_dev *dev; | ||
17 | sector_t physical_start; | ||
18 | }; | ||
19 | |||
20 | struct stripe_c { | ||
21 | uint32_t stripes; | ||
22 | |||
23 | /* The size of this target / num. stripes */ | ||
24 | sector_t stripe_width; | ||
25 | |||
26 | /* stripe chunk size */ | ||
27 | uint32_t chunk_shift; | ||
28 | sector_t chunk_mask; | ||
29 | |||
30 | struct stripe stripe[0]; | ||
31 | }; | ||
32 | |||
33 | static inline struct stripe_c *alloc_context(unsigned int stripes) | ||
34 | { | ||
35 | size_t len; | ||
36 | |||
37 | if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), | ||
38 | stripes)) | ||
39 | return NULL; | ||
40 | |||
41 | len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); | ||
42 | |||
43 | return kmalloc(len, GFP_KERNEL); | ||
44 | } | ||
45 | |||
46 | /* | ||
47 | * Parse a single <dev> <sector> pair | ||
48 | */ | ||
49 | static int get_stripe(struct dm_target *ti, struct stripe_c *sc, | ||
50 | unsigned int stripe, char **argv) | ||
51 | { | ||
52 | sector_t start; | ||
53 | |||
54 | if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1) | ||
55 | return -EINVAL; | ||
56 | |||
57 | if (dm_get_device(ti, argv[0], start, sc->stripe_width, | ||
58 | dm_table_get_mode(ti->table), | ||
59 | &sc->stripe[stripe].dev)) | ||
60 | return -ENXIO; | ||
61 | |||
62 | sc->stripe[stripe].physical_start = start; | ||
63 | return 0; | ||
64 | } | ||
65 | |||
66 | /* | ||
67 | * Construct a striped mapping. | ||
68 | * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+ | ||
69 | */ | ||
70 | static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
71 | { | ||
72 | struct stripe_c *sc; | ||
73 | sector_t width; | ||
74 | uint32_t stripes; | ||
75 | uint32_t chunk_size; | ||
76 | char *end; | ||
77 | int r; | ||
78 | unsigned int i; | ||
79 | |||
80 | if (argc < 2) { | ||
81 | ti->error = "dm-stripe: Not enough arguments"; | ||
82 | return -EINVAL; | ||
83 | } | ||
84 | |||
85 | stripes = simple_strtoul(argv[0], &end, 10); | ||
86 | if (*end) { | ||
87 | ti->error = "dm-stripe: Invalid stripe count"; | ||
88 | return -EINVAL; | ||
89 | } | ||
90 | |||
91 | chunk_size = simple_strtoul(argv[1], &end, 10); | ||
92 | if (*end) { | ||
93 | ti->error = "dm-stripe: Invalid chunk_size"; | ||
94 | return -EINVAL; | ||
95 | } | ||
96 | |||
97 | /* | ||
98 | * chunk_size is a power of two | ||
99 | */ | ||
100 | if (!chunk_size || (chunk_size & (chunk_size - 1)) || | ||
101 | (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) { | ||
102 | ti->error = "dm-stripe: Invalid chunk size"; | ||
103 | return -EINVAL; | ||
104 | } | ||
105 | |||
106 | width = ti->len; | ||
107 | if (sector_div(width, stripes)) { | ||
108 | ti->error = "dm-stripe: Target length not divisable by " | ||
109 | "number of stripes"; | ||
110 | return -EINVAL; | ||
111 | } | ||
112 | |||
113 | /* | ||
114 | * Do we have enough arguments for that many stripes ? | ||
115 | */ | ||
116 | if (argc != (2 + 2 * stripes)) { | ||
117 | ti->error = "dm-stripe: Not enough destinations " | ||
118 | "specified"; | ||
119 | return -EINVAL; | ||
120 | } | ||
121 | |||
122 | sc = alloc_context(stripes); | ||
123 | if (!sc) { | ||
124 | ti->error = "dm-stripe: Memory allocation for striped context " | ||
125 | "failed"; | ||
126 | return -ENOMEM; | ||
127 | } | ||
128 | |||
129 | sc->stripes = stripes; | ||
130 | sc->stripe_width = width; | ||
131 | ti->split_io = chunk_size; | ||
132 | |||
133 | sc->chunk_mask = ((sector_t) chunk_size) - 1; | ||
134 | for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) | ||
135 | chunk_size >>= 1; | ||
136 | sc->chunk_shift--; | ||
137 | |||
138 | /* | ||
139 | * Get the stripe destinations. | ||
140 | */ | ||
141 | for (i = 0; i < stripes; i++) { | ||
142 | argv += 2; | ||
143 | |||
144 | r = get_stripe(ti, sc, i, argv); | ||
145 | if (r < 0) { | ||
146 | ti->error = "dm-stripe: Couldn't parse stripe " | ||
147 | "destination"; | ||
148 | while (i--) | ||
149 | dm_put_device(ti, sc->stripe[i].dev); | ||
150 | kfree(sc); | ||
151 | return r; | ||
152 | } | ||
153 | } | ||
154 | |||
155 | ti->private = sc; | ||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | static void stripe_dtr(struct dm_target *ti) | ||
160 | { | ||
161 | unsigned int i; | ||
162 | struct stripe_c *sc = (struct stripe_c *) ti->private; | ||
163 | |||
164 | for (i = 0; i < sc->stripes; i++) | ||
165 | dm_put_device(ti, sc->stripe[i].dev); | ||
166 | |||
167 | kfree(sc); | ||
168 | } | ||
169 | |||
170 | static int stripe_map(struct dm_target *ti, struct bio *bio, | ||
171 | union map_info *map_context) | ||
172 | { | ||
173 | struct stripe_c *sc = (struct stripe_c *) ti->private; | ||
174 | |||
175 | sector_t offset = bio->bi_sector - ti->begin; | ||
176 | sector_t chunk = offset >> sc->chunk_shift; | ||
177 | uint32_t stripe = sector_div(chunk, sc->stripes); | ||
178 | |||
179 | bio->bi_bdev = sc->stripe[stripe].dev->bdev; | ||
180 | bio->bi_sector = sc->stripe[stripe].physical_start + | ||
181 | (chunk << sc->chunk_shift) + (offset & sc->chunk_mask); | ||
182 | return 1; | ||
183 | } | ||
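The address arithmetic in stripe_map() is worth a worked example. The sketch below assumes a 2-stripe table with 128-sector chunks and physical_start = 0 on both devices; sector_div() is replaced by ordinary division and modulo, which yield the same quotient/remainder pair:

#include <stdio.h>

#define STRIPES     2ULL
#define CHUNK_SHIFT 7				/* 128-sector chunks */
#define CHUNK_MASK  ((1ULL << CHUNK_SHIFT) - 1)

int main(void)
{
	unsigned long long offset = 1000;		/* sector within the target */
	unsigned long long chunk = offset >> CHUNK_SHIFT;	/* chunk 7 */
	unsigned long long stripe = chunk % STRIPES;		/* second device */
	unsigned long long chunk_on_dev = chunk / STRIPES;	/* its 4th chunk */
	unsigned long long dev_sector = (chunk_on_dev << CHUNK_SHIFT) +
					(offset & CHUNK_MASK);

	/* sector_div(chunk, sc->stripes) in stripe_map() computes the same
	 * quotient and remainder in one step. */
	printf("target sector %llu -> stripe %llu, device sector %llu\n",
	       offset, stripe, dev_sector);		/* stripe 1, sector 488 */
	return 0;
}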
184 | |||
185 | static int stripe_status(struct dm_target *ti, | ||
186 | status_type_t type, char *result, unsigned int maxlen) | ||
187 | { | ||
188 | struct stripe_c *sc = (struct stripe_c *) ti->private; | ||
189 | unsigned int sz = 0; | ||
190 | unsigned int i; | ||
191 | |||
192 | switch (type) { | ||
193 | case STATUSTYPE_INFO: | ||
194 | result[0] = '\0'; | ||
195 | break; | ||
196 | |||
197 | case STATUSTYPE_TABLE: | ||
198 | DMEMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1); | ||
199 | for (i = 0; i < sc->stripes; i++) | ||
200 | DMEMIT(" %s " SECTOR_FORMAT, sc->stripe[i].dev->name, | ||
201 | sc->stripe[i].physical_start); | ||
202 | break; | ||
203 | } | ||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | static struct target_type stripe_target = { | ||
208 | .name = "striped", | ||
209 | .version = {1, 0, 2}, | ||
210 | .module = THIS_MODULE, | ||
211 | .ctr = stripe_ctr, | ||
212 | .dtr = stripe_dtr, | ||
213 | .map = stripe_map, | ||
214 | .status = stripe_status, | ||
215 | }; | ||
216 | |||
217 | int __init dm_stripe_init(void) | ||
218 | { | ||
219 | int r; | ||
220 | |||
221 | r = dm_register_target(&stripe_target); | ||
222 | if (r < 0) | ||
223 | DMWARN("striped target registration failed"); | ||
224 | |||
225 | return r; | ||
226 | } | ||
227 | |||
228 | void dm_stripe_exit(void) | ||
229 | { | ||
230 | if (dm_unregister_target(&stripe_target)) | ||
231 | DMWARN("striped target unregistration failed"); | ||
232 | |||
233 | return; | ||
234 | } | ||
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c new file mode 100644 index 000000000000..ee175d4906c4 --- /dev/null +++ b/drivers/md/dm-table.c | |||
@@ -0,0 +1,950 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001 Sistina Software (UK) Limited. | ||
3 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | */ | ||
7 | |||
8 | #include "dm.h" | ||
9 | |||
10 | #include <linux/module.h> | ||
11 | #include <linux/vmalloc.h> | ||
12 | #include <linux/blkdev.h> | ||
13 | #include <linux/namei.h> | ||
14 | #include <linux/ctype.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/interrupt.h> | ||
17 | #include <asm/atomic.h> | ||
18 | |||
19 | #define MAX_DEPTH 16 | ||
20 | #define NODE_SIZE L1_CACHE_BYTES | ||
21 | #define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) | ||
22 | #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) | ||
23 | |||
24 | struct dm_table { | ||
25 | atomic_t holders; | ||
26 | |||
27 | /* btree table */ | ||
28 | unsigned int depth; | ||
29 | unsigned int counts[MAX_DEPTH]; /* in nodes */ | ||
30 | sector_t *index[MAX_DEPTH]; | ||
31 | |||
32 | unsigned int num_targets; | ||
33 | unsigned int num_allocated; | ||
34 | sector_t *highs; | ||
35 | struct dm_target *targets; | ||
36 | |||
37 | /* | ||
38 | * Indicates the rw permissions for the new logical | ||
39 | * device. This should be a combination of FMODE_READ | ||
40 | * and FMODE_WRITE. | ||
41 | */ | ||
42 | int mode; | ||
43 | |||
44 | /* a list of devices used by this table */ | ||
45 | struct list_head devices; | ||
46 | |||
47 | /* | ||
48 | * These are optimistic limits taken from all the | ||
49 | * targets, some targets will need smaller limits. | ||
50 | */ | ||
51 | struct io_restrictions limits; | ||
52 | |||
53 | /* events get handed up using this callback */ | ||
54 | void (*event_fn)(void *); | ||
55 | void *event_context; | ||
56 | }; | ||
57 | |||
58 | /* | ||
59 | * Similar to ceiling(log_base(n)) | ||
60 | */ | ||
61 | static unsigned int int_log(unsigned int n, unsigned int base) | ||
62 | { | ||
63 | int result = 0; | ||
64 | |||
65 | while (n > 1) { | ||
66 | n = dm_div_up(n, base); | ||
67 | result++; | ||
68 | } | ||
69 | |||
70 | return result; | ||
71 | } | ||
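int_log(n, base) computes roughly ceil(log_base(n)); the btree code sizes its levels with it. The check below assumes a 64-byte cacheline, so a node holds 8 keys and points at 9 children, and a hypothetical 1000-target table; the depth computation itself happens further down the file, outside the hunk shown here.

#include <stdio.h>

static unsigned div_up(unsigned n, unsigned d) { return (n + d - 1) / d; }

static unsigned int_log(unsigned n, unsigned base)
{
	unsigned result = 0;

	while (n > 1) {
		n = div_up(n, base);
		result++;
	}
	return result;
}

int main(void)
{
	unsigned keys_per_node = 8, children = keys_per_node + 1;
	unsigned targets = 1000;
	unsigned leaf_nodes = div_up(targets, keys_per_node);	/* 125 */

	printf("levels above the leaves for %u targets: %u\n",
	       targets, int_log(leaf_nodes, children));		/* 3 */
	return 0;
}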
72 | |||
73 | /* | ||
74 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
75 | */ | ||
76 | #define min_not_zero(l, r) ((l) == 0 ? (r) : ((r) == 0 ? (l) : min((l), (r)))) | ||
77 | |||
78 | /* | ||
79 | * Combine two io_restrictions, always taking the lower value. | ||
80 | */ | ||
81 | static void combine_restrictions_low(struct io_restrictions *lhs, | ||
82 | struct io_restrictions *rhs) | ||
83 | { | ||
84 | lhs->max_sectors = | ||
85 | min_not_zero(lhs->max_sectors, rhs->max_sectors); | ||
86 | |||
87 | lhs->max_phys_segments = | ||
88 | min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments); | ||
89 | |||
90 | lhs->max_hw_segments = | ||
91 | min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments); | ||
92 | |||
93 | lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size); | ||
94 | |||
95 | lhs->max_segment_size = | ||
96 | min_not_zero(lhs->max_segment_size, rhs->max_segment_size); | ||
97 | |||
98 | lhs->seg_boundary_mask = | ||
99 | min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask); | ||
100 | } | ||
101 | |||
102 | /* | ||
103 | * Calculate the index of the child node of the n'th node k'th key. | ||
104 | */ | ||
105 | static inline unsigned int get_child(unsigned int n, unsigned int k) | ||
106 | { | ||
107 | return (n * CHILDREN_PER_NODE) + k; | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * Return the n'th node of level l from table t. | ||
112 | */ | ||
113 | static inline sector_t *get_node(struct dm_table *t, | ||
114 | unsigned int l, unsigned int n) | ||
115 | { | ||
116 | return t->index[l] + (n * KEYS_PER_NODE); | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * Return the highest key that you could lookup from the n'th | ||
121 | * node on level l of the btree. | ||
122 | */ | ||
123 | static sector_t high(struct dm_table *t, unsigned int l, unsigned int n) | ||
124 | { | ||
125 | for (; l < t->depth - 1; l++) | ||
126 | n = get_child(n, CHILDREN_PER_NODE - 1); | ||
127 | |||
128 | if (n >= t->counts[l]) | ||
129 | return (sector_t) - 1; | ||
130 | |||
131 | return get_node(t, l, n)[KEYS_PER_NODE - 1]; | ||
132 | } | ||
133 | |||
134 | /* | ||
135 | * Fills in a level of the btree based on the highs of the level | ||
136 | * below it. | ||
137 | */ | ||
138 | static int setup_btree_index(unsigned int l, struct dm_table *t) | ||
139 | { | ||
140 | unsigned int n, k; | ||
141 | sector_t *node; | ||
142 | |||
143 | for (n = 0U; n < t->counts[l]; n++) { | ||
144 | node = get_node(t, l, n); | ||
145 | |||
146 | for (k = 0U; k < KEYS_PER_NODE; k++) | ||
147 | node[k] = high(t, l + 1, get_child(n, k)); | ||
148 | } | ||
149 | |||
150 | return 0; | ||
151 | } | ||
152 | |||
153 | void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size) | ||
154 | { | ||
155 | unsigned long size; | ||
156 | void *addr; | ||
157 | |||
158 | /* | ||
159 | * Check that we're not going to overflow. | ||
160 | */ | ||
161 | if (nmemb > (ULONG_MAX / elem_size)) | ||
162 | return NULL; | ||
163 | |||
164 | size = nmemb * elem_size; | ||
165 | addr = vmalloc(size); | ||
166 | if (addr) | ||
167 | memset(addr, 0, size); | ||
168 | |||
169 | return addr; | ||
170 | } | ||
171 | |||
172 | /* | ||
173 | * highs, and targets are managed as dynamic arrays during a | ||
174 | * table load. | ||
175 | */ | ||
176 | static int alloc_targets(struct dm_table *t, unsigned int num) | ||
177 | { | ||
178 | sector_t *n_highs; | ||
179 | struct dm_target *n_targets; | ||
180 | int n = t->num_targets; | ||
181 | |||
182 | /* | ||
183 | * Allocate both the target array and offset array at once. | ||
184 | */ | ||
185 | n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) + | ||
186 | sizeof(sector_t)); | ||
187 | if (!n_highs) | ||
188 | return -ENOMEM; | ||
189 | |||
190 | n_targets = (struct dm_target *) (n_highs + num); | ||
191 | |||
192 | if (n) { | ||
193 | memcpy(n_highs, t->highs, sizeof(*n_highs) * n); | ||
194 | memcpy(n_targets, t->targets, sizeof(*n_targets) * n); | ||
195 | } | ||
196 | |||
197 | memset(n_highs + n, -1, sizeof(*n_highs) * (num - n)); | ||
198 | vfree(t->highs); | ||
199 | |||
200 | t->num_allocated = num; | ||
201 | t->highs = n_highs; | ||
202 | t->targets = n_targets; | ||
203 | |||
204 | return 0; | ||
205 | } | ||
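alloc_targets() carves two parallel arrays (the "highs" keys and the targets themselves) out of one overflow-checked, zeroed allocation, so they always resize and get freed together. A simplified userspace version of the same pattern, with calloc() standing in for dm_vcalloc() and a made-up toy_target type:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <stdint.h>

struct toy_target { int type; };

static void *vcalloc(unsigned long nmemb, unsigned long elem_size)
{
	if (nmemb > ULONG_MAX / elem_size)	/* refuse to overflow */
		return NULL;
	return calloc(nmemb, elem_size);	/* userspace stand-in */
}

int main(void)
{
	unsigned num = 16;
	uint64_t *highs = vcalloc(num, sizeof(*highs) + sizeof(struct toy_target));
	struct toy_target *targets;

	if (!highs)
		return 1;

	targets = (struct toy_target *) (highs + num);	/* second array */
	memset(highs, -1, sizeof(*highs) * num);	/* "unused" markers */
	targets[0].type = 42;

	printf("highs[0]=%llx targets[0].type=%d\n",
	       (unsigned long long) highs[0], targets[0].type);
	free(highs);
	return 0;
}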
206 | |||
207 | int dm_table_create(struct dm_table **result, int mode, unsigned num_targets) | ||
208 | { | ||
209 | struct dm_table *t = kmalloc(sizeof(*t), GFP_KERNEL); | ||
210 | |||
211 | if (!t) | ||
212 | return -ENOMEM; | ||
213 | |||
214 | memset(t, 0, sizeof(*t)); | ||
215 | INIT_LIST_HEAD(&t->devices); | ||
216 | atomic_set(&t->holders, 1); | ||
217 | |||
218 | if (!num_targets) | ||
219 | num_targets = KEYS_PER_NODE; | ||
220 | |||
221 | num_targets = dm_round_up(num_targets, KEYS_PER_NODE); | ||
222 | |||
223 | if (alloc_targets(t, num_targets)) { | ||
224 | kfree(t); | ||
225 | t = NULL; | ||
226 | return -ENOMEM; | ||
227 | } | ||
228 | |||
229 | t->mode = mode; | ||
230 | *result = t; | ||
231 | return 0; | ||
232 | } | ||
233 | |||
234 | static void free_devices(struct list_head *devices) | ||
235 | { | ||
236 | struct list_head *tmp, *next; | ||
237 | |||
238 | for (tmp = devices->next; tmp != devices; tmp = next) { | ||
239 | struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); | ||
240 | next = tmp->next; | ||
241 | kfree(dd); | ||
242 | } | ||
243 | } | ||
244 | |||
245 | void table_destroy(struct dm_table *t) | ||
246 | { | ||
247 | unsigned int i; | ||
248 | |||
249 | /* free the indexes (see dm_table_complete) */ | ||
250 | if (t->depth >= 2) | ||
251 | vfree(t->index[t->depth - 2]); | ||
252 | |||
253 | /* free the targets */ | ||
254 | for (i = 0; i < t->num_targets; i++) { | ||
255 | struct dm_target *tgt = t->targets + i; | ||
256 | |||
257 | if (tgt->type->dtr) | ||
258 | tgt->type->dtr(tgt); | ||
259 | |||
260 | dm_put_target_type(tgt->type); | ||
261 | } | ||
262 | |||
263 | vfree(t->highs); | ||
264 | |||
265 | /* free the device list */ | ||
266 | if (t->devices.next != &t->devices) { | ||
267 | DMWARN("devices still present during destroy: " | ||
268 | "dm_table_remove_device calls missing"); | ||
269 | |||
270 | free_devices(&t->devices); | ||
271 | } | ||
272 | |||
273 | kfree(t); | ||
274 | } | ||
275 | |||
276 | void dm_table_get(struct dm_table *t) | ||
277 | { | ||
278 | atomic_inc(&t->holders); | ||
279 | } | ||
280 | |||
281 | void dm_table_put(struct dm_table *t) | ||
282 | { | ||
283 | if (!t) | ||
284 | return; | ||
285 | |||
286 | if (atomic_dec_and_test(&t->holders)) | ||
287 | table_destroy(t); | ||
288 | } | ||
289 | |||
290 | /* | ||
291 | * Checks to see if we need to extend highs or targets. | ||
292 | */ | ||
293 | static inline int check_space(struct dm_table *t) | ||
294 | { | ||
295 | if (t->num_targets >= t->num_allocated) | ||
296 | return alloc_targets(t, t->num_allocated * 2); | ||
297 | |||
298 | return 0; | ||
299 | } | ||
300 | |||
301 | /* | ||
302 | * Convert a device path to a dev_t. | ||
303 | */ | ||
304 | static int lookup_device(const char *path, dev_t *dev) | ||
305 | { | ||
306 | int r; | ||
307 | struct nameidata nd; | ||
308 | struct inode *inode; | ||
309 | |||
310 | if ((r = path_lookup(path, LOOKUP_FOLLOW, &nd))) | ||
311 | return r; | ||
312 | |||
313 | inode = nd.dentry->d_inode; | ||
314 | if (!inode) { | ||
315 | r = -ENOENT; | ||
316 | goto out; | ||
317 | } | ||
318 | |||
319 | if (!S_ISBLK(inode->i_mode)) { | ||
320 | r = -ENOTBLK; | ||
321 | goto out; | ||
322 | } | ||
323 | |||
324 | *dev = inode->i_rdev; | ||
325 | |||
326 | out: | ||
327 | path_release(&nd); | ||
328 | return r; | ||
329 | } | ||
330 | |||
331 | /* | ||
332 | * See if we've already got a device in the list. | ||
333 | */ | ||
334 | static struct dm_dev *find_device(struct list_head *l, dev_t dev) | ||
335 | { | ||
336 | struct dm_dev *dd; | ||
337 | |||
338 | list_for_each_entry (dd, l, list) | ||
339 | if (dd->bdev->bd_dev == dev) | ||
340 | return dd; | ||
341 | |||
342 | return NULL; | ||
343 | } | ||
344 | |||
345 | /* | ||
346 | * Open a device so we can use it as a map destination. | ||
347 | */ | ||
348 | static int open_dev(struct dm_dev *d, dev_t dev) | ||
349 | { | ||
350 | static char *_claim_ptr = "I belong to device-mapper"; | ||
351 | struct block_device *bdev; | ||
352 | |||
353 | int r; | ||
354 | |||
355 | if (d->bdev) | ||
356 | BUG(); | ||
357 | |||
358 | bdev = open_by_devnum(dev, d->mode); | ||
359 | if (IS_ERR(bdev)) | ||
360 | return PTR_ERR(bdev); | ||
361 | r = bd_claim(bdev, _claim_ptr); | ||
362 | if (r) | ||
363 | blkdev_put(bdev); | ||
364 | else | ||
365 | d->bdev = bdev; | ||
366 | return r; | ||
367 | } | ||
368 | |||
369 | /* | ||
370 | * Close a device that we've been using. | ||
371 | */ | ||
372 | static void close_dev(struct dm_dev *d) | ||
373 | { | ||
374 | if (!d->bdev) | ||
375 | return; | ||
376 | |||
377 | bd_release(d->bdev); | ||
378 | blkdev_put(d->bdev); | ||
379 | d->bdev = NULL; | ||
380 | } | ||
381 | |||
382 | /* | ||
383 |  * Checks that the area (start, len) of the destination device	| ||
384 |  * lies entirely within the device.	| ||
385 | */ | ||
386 | static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len) | ||
387 | { | ||
388 | sector_t dev_size; | ||
389 | dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT; | ||
390 | return ((start < dev_size) && (len <= (dev_size - start))); | ||
391 | } | ||
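The bounds test above is written as len <= dev_size - start rather than start + len <= dev_size, so a huge len cannot wrap sector_t and sneak past the check. A minimal stand-alone sketch of the same idiom with hypothetical sizes (plain userspace C, not part of the patch):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Overflow-safe range check: start + len is never computed. */
static int area_is_valid(sector_t dev_size, sector_t start, sector_t len)
{
	return (start < dev_size) && (len <= (dev_size - start));
}

int main(void)
{
	sector_t dev_size = 2048;	/* hypothetical 1MiB device */

	printf("%d\n", area_is_valid(dev_size, 1024, 1024));	/* 1: fits exactly */
	printf("%d\n", area_is_valid(dev_size, 1024, 1025));	/* 0: one sector too long */
	printf("%d\n", area_is_valid(dev_size, 1024, UINT64_MAX));	/* 0: naive start+len would wrap */
	return 0;
}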
392 | |||
393 | /* | ||
394 |  * Upgrades the mode on an already-open dm_dev, taking care to	| ||
395 |  * leave things as they were if we fail to reopen the	| ||
396 |  * device.	| ||
397 | */ | ||
398 | static int upgrade_mode(struct dm_dev *dd, int new_mode) | ||
399 | { | ||
400 | int r; | ||
401 | struct dm_dev dd_copy; | ||
402 | dev_t dev = dd->bdev->bd_dev; | ||
403 | |||
404 | dd_copy = *dd; | ||
405 | |||
406 | dd->mode |= new_mode; | ||
407 | dd->bdev = NULL; | ||
408 | r = open_dev(dd, dev); | ||
409 | if (!r) | ||
410 | close_dev(&dd_copy); | ||
411 | else | ||
412 | *dd = dd_copy; | ||
413 | |||
414 | return r; | ||
415 | } | ||
416 | |||
417 | /* | ||
418 | * Add a device to the list, or just increment the usage count if | ||
419 | * it's already present. | ||
420 | */ | ||
421 | static int __table_get_device(struct dm_table *t, struct dm_target *ti, | ||
422 | const char *path, sector_t start, sector_t len, | ||
423 | int mode, struct dm_dev **result) | ||
424 | { | ||
425 | int r; | ||
426 | dev_t dev; | ||
427 | struct dm_dev *dd; | ||
428 | unsigned int major, minor; | ||
429 | |||
430 | if (!t) | ||
431 | BUG(); | ||
432 | |||
433 | if (sscanf(path, "%u:%u", &major, &minor) == 2) { | ||
434 | /* Extract the major/minor numbers */ | ||
435 | dev = MKDEV(major, minor); | ||
436 | if (MAJOR(dev) != major || MINOR(dev) != minor) | ||
437 | return -EOVERFLOW; | ||
438 | } else { | ||
439 | /* convert the path to a device */ | ||
440 | if ((r = lookup_device(path, &dev))) | ||
441 | return r; | ||
442 | } | ||
443 | |||
444 | dd = find_device(&t->devices, dev); | ||
445 | if (!dd) { | ||
446 | dd = kmalloc(sizeof(*dd), GFP_KERNEL); | ||
447 | if (!dd) | ||
448 | return -ENOMEM; | ||
449 | |||
450 | dd->mode = mode; | ||
451 | dd->bdev = NULL; | ||
452 | |||
453 | if ((r = open_dev(dd, dev))) { | ||
454 | kfree(dd); | ||
455 | return r; | ||
456 | } | ||
457 | |||
458 | format_dev_t(dd->name, dev); | ||
459 | |||
460 | atomic_set(&dd->count, 0); | ||
461 | list_add(&dd->list, &t->devices); | ||
462 | |||
463 | } else if (dd->mode != (mode | dd->mode)) { | ||
464 | r = upgrade_mode(dd, mode); | ||
465 | if (r) | ||
466 | return r; | ||
467 | } | ||
468 | atomic_inc(&dd->count); | ||
469 | |||
470 | if (!check_device_area(dd, start, len)) { | ||
471 | DMWARN("device %s too small for target", path); | ||
472 | dm_put_device(ti, dd); | ||
473 | return -EINVAL; | ||
474 | } | ||
475 | |||
476 | *result = dd; | ||
477 | |||
478 | return 0; | ||
479 | } | ||
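__table_get_device accepts either a "major:minor" pair or a path; the MKDEV/MAJOR/MINOR round-trip above rejects numbers that do not fit the packed dev_t encoding. A small userspace model of that check, assuming (purely as an illustration) the kernel's 12-bit major / 20-bit minor packing:

#include <stdio.h>

/* Assumed to mirror the kernel's dev_t packing (MINORBITS == 20). */
#define MINORBITS	20
#define MINORMASK	((1U << MINORBITS) - 1)
#define MKDEV(ma, mi)	(((ma) << MINORBITS) | (mi))
#define MAJOR(dev)	((unsigned int)((dev) >> MINORBITS))
#define MINOR(dev)	((unsigned int)((dev) & MINORMASK))

static int parse_devt(const char *str, unsigned int *dev)
{
	unsigned int major, minor;

	if (sscanf(str, "%u:%u", &major, &minor) != 2)
		return -1;		/* not a major:minor pair */

	*dev = MKDEV(major, minor);
	if (MAJOR(*dev) != major || MINOR(*dev) != minor)
		return -2;		/* numbers don't fit: the -EOVERFLOW case */

	return 0;
}

int main(void)
{
	unsigned int dev;

	printf("%d\n", parse_devt("8:1", &dev));	/*  0: e.g. /dev/sda1 */
	printf("%d\n", parse_devt("8:1048577", &dev));	/* -2: minor overflows 20 bits */
	return 0;
}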
480 | |||
481 | |||
482 | int dm_get_device(struct dm_target *ti, const char *path, sector_t start, | ||
483 | sector_t len, int mode, struct dm_dev **result) | ||
484 | { | ||
485 | int r = __table_get_device(ti->table, ti, path, | ||
486 | start, len, mode, result); | ||
487 | if (!r) { | ||
488 | request_queue_t *q = bdev_get_queue((*result)->bdev); | ||
489 | struct io_restrictions *rs = &ti->limits; | ||
490 | |||
491 | /* | ||
492 |  * Combine the device's queue limits into the target's, taking the lower of each.	| ||
493 | * | ||
494 | * FIXME: if we move an io_restriction struct | ||
495 | * into q this would just be a call to | ||
496 | * combine_restrictions_low() | ||
497 | */ | ||
498 | rs->max_sectors = | ||
499 | min_not_zero(rs->max_sectors, q->max_sectors); | ||
500 | |||
501 | /* FIXME: Device-Mapper on top of RAID-0 breaks because DM | ||
502 | * currently doesn't honor MD's merge_bvec_fn routine. | ||
503 | * In this case, we'll force DM to use PAGE_SIZE or | ||
504 | * smaller I/O, just to be safe. A better fix is in the | ||
505 | * works, but add this for the time being so it will at | ||
506 | * least operate correctly. | ||
507 | */ | ||
508 | if (q->merge_bvec_fn) | ||
509 | rs->max_sectors = | ||
510 | min_not_zero(rs->max_sectors, | ||
511 | (unsigned short)(PAGE_SIZE >> 9)); | ||
512 | |||
513 | rs->max_phys_segments = | ||
514 | min_not_zero(rs->max_phys_segments, | ||
515 | q->max_phys_segments); | ||
516 | |||
517 | rs->max_hw_segments = | ||
518 | min_not_zero(rs->max_hw_segments, q->max_hw_segments); | ||
519 | |||
520 | rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); | ||
521 | |||
522 | rs->max_segment_size = | ||
523 | min_not_zero(rs->max_segment_size, q->max_segment_size); | ||
524 | |||
525 | rs->seg_boundary_mask = | ||
526 | min_not_zero(rs->seg_boundary_mask, | ||
527 | q->seg_boundary_mask); | ||
528 | } | ||
529 | |||
530 | return r; | ||
531 | } | ||
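The merging above leans on the convention that a zero limit means "not set yet": min_not_zero() keeps the smaller of two values but ignores zeros, so repeated merging converges on the tightest real limit. A tiny illustrative model of that helper (the kernel's version is a macro defined elsewhere):

#include <stdio.h>

/* Model of min_not_zero(): 0 means "unset", otherwise take the minimum. */
static unsigned int min_not_zero(unsigned int a, unsigned int b)
{
	if (a == 0)
		return b;
	if (b == 0)
		return a;
	return a < b ? a : b;
}

int main(void)
{
	unsigned int max_sectors = 0;	/* target starts with no limit */

	max_sectors = min_not_zero(max_sectors, 256);	/* first device allows 256 */
	max_sectors = min_not_zero(max_sectors, 128);	/* second device allows 128 */
	max_sectors = min_not_zero(max_sectors, 0);	/* third device has no limit */

	printf("combined max_sectors = %u\n", max_sectors);	/* prints 128 */
	return 0;
}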
532 | |||
533 | /* | ||
534 |  * Decrement a device's use count and remove it if necessary.	| ||
535 | */ | ||
536 | void dm_put_device(struct dm_target *ti, struct dm_dev *dd) | ||
537 | { | ||
538 | if (atomic_dec_and_test(&dd->count)) { | ||
539 | close_dev(dd); | ||
540 | list_del(&dd->list); | ||
541 | kfree(dd); | ||
542 | } | ||
543 | } | ||
544 | |||
545 | /* | ||
546 | * Checks to see if the target joins onto the end of the table. | ||
547 | */ | ||
548 | static int adjoin(struct dm_table *table, struct dm_target *ti) | ||
549 | { | ||
550 | struct dm_target *prev; | ||
551 | |||
552 | if (!table->num_targets) | ||
553 | return !ti->begin; | ||
554 | |||
555 | prev = &table->targets[table->num_targets - 1]; | ||
556 | return (ti->begin == (prev->begin + prev->len)); | ||
557 | } | ||
558 | |||
559 | /* | ||
560 | * Used to dynamically allocate the arg array. | ||
561 | */ | ||
562 | static char **realloc_argv(unsigned *array_size, char **old_argv) | ||
563 | { | ||
564 | char **argv; | ||
565 | unsigned new_size; | ||
566 | |||
567 | new_size = *array_size ? *array_size * 2 : 64; | ||
568 | argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); | ||
569 | if (argv) { | ||
570 | memcpy(argv, old_argv, *array_size * sizeof(*argv)); | ||
571 | *array_size = new_size; | ||
572 | } | ||
573 | |||
574 | kfree(old_argv); | ||
575 | return argv; | ||
576 | } | ||
577 | |||
578 | /* | ||
579 | * Destructively splits up the argument list to pass to ctr. | ||
580 | */ | ||
581 | int dm_split_args(int *argc, char ***argvp, char *input) | ||
582 | { | ||
583 | char *start, *end = input, *out, **argv = NULL; | ||
584 | unsigned array_size = 0; | ||
585 | |||
586 | *argc = 0; | ||
587 | argv = realloc_argv(&array_size, argv); | ||
588 | if (!argv) | ||
589 | return -ENOMEM; | ||
590 | |||
591 | while (1) { | ||
592 | start = end; | ||
593 | |||
594 | /* Skip whitespace */ | ||
595 | while (*start && isspace(*start)) | ||
596 | start++; | ||
597 | |||
598 | if (!*start) | ||
599 | break; /* success, we hit the end */ | ||
600 | |||
601 | 			/* 'out' is used to remove any backslash quoting */	| ||
602 | end = out = start; | ||
603 | while (*end) { | ||
604 | /* Everything apart from '\0' can be quoted */ | ||
605 | if (*end == '\\' && *(end + 1)) { | ||
606 | *out++ = *(end + 1); | ||
607 | end += 2; | ||
608 | continue; | ||
609 | } | ||
610 | |||
611 | if (isspace(*end)) | ||
612 | break; /* end of token */ | ||
613 | |||
614 | *out++ = *end++; | ||
615 | } | ||
616 | |||
617 | /* have we already filled the array ? */ | ||
618 | if ((*argc + 1) > array_size) { | ||
619 | argv = realloc_argv(&array_size, argv); | ||
620 | if (!argv) | ||
621 | return -ENOMEM; | ||
622 | } | ||
623 | |||
624 | /* we know this is whitespace */ | ||
625 | if (*end) | ||
626 | end++; | ||
627 | |||
628 | /* terminate the string and put it in the array */ | ||
629 | *out = '\0'; | ||
630 | argv[*argc] = start; | ||
631 | (*argc)++; | ||
632 | } | ||
633 | |||
634 | *argvp = argv; | ||
635 | return 0; | ||
636 | } | ||
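dm_split_args() tokenises the parameter string in place: whitespace separates arguments and a backslash quotes the next character, so an escaped space stays inside a single argument. The kernel version grows argv with realloc_argv(); the stand-alone model below uses a fixed array and a hypothetical table line just to show the splitting rules:

#include <ctype.h>
#include <stdio.h>

/* Userspace model of the splitting rules above (fixed-size argv). */
static int split_args(char *input, char **argv, int max_args)
{
	int argc = 0;
	char *start, *end = input, *out;

	while (argc < max_args) {
		start = end;

		while (*start && isspace((unsigned char) *start))
			start++;		/* skip leading whitespace */

		if (!*start)
			break;			/* hit the end of the string */

		end = out = start;
		while (*end) {
			if (*end == '\\' && *(end + 1)) {
				*out++ = *(end + 1);	/* unquote "\x" -> "x" */
				end += 2;
				continue;
			}
			if (isspace((unsigned char) *end))
				break;			/* end of this token */
			*out++ = *end++;
		}

		if (*end)
			end++;			/* step past the separator */
		*out = '\0';
		argv[argc++] = start;
	}

	return argc;
}

int main(void)
{
	char line[] = "0 409600 linear /dev/my\\ disk 0";	/* hypothetical params */
	char *argv[8];
	int i, argc = split_args(line, argv, 8);

	for (i = 0; i < argc; i++)
		printf("argv[%d] = \"%s\"\n", i, argv[i]);
	/* prints: "0", "409600", "linear", "/dev/my disk", "0" */
	return 0;
}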
637 | |||
638 | static void check_for_valid_limits(struct io_restrictions *rs) | ||
639 | { | ||
640 | if (!rs->max_sectors) | ||
641 | rs->max_sectors = MAX_SECTORS; | ||
642 | if (!rs->max_phys_segments) | ||
643 | rs->max_phys_segments = MAX_PHYS_SEGMENTS; | ||
644 | if (!rs->max_hw_segments) | ||
645 | rs->max_hw_segments = MAX_HW_SEGMENTS; | ||
646 | if (!rs->hardsect_size) | ||
647 | rs->hardsect_size = 1 << SECTOR_SHIFT; | ||
648 | if (!rs->max_segment_size) | ||
649 | rs->max_segment_size = MAX_SEGMENT_SIZE; | ||
650 | if (!rs->seg_boundary_mask) | ||
651 | rs->seg_boundary_mask = -1; | ||
652 | } | ||
653 | |||
654 | int dm_table_add_target(struct dm_table *t, const char *type, | ||
655 | sector_t start, sector_t len, char *params) | ||
656 | { | ||
657 | int r = -EINVAL, argc; | ||
658 | char **argv; | ||
659 | struct dm_target *tgt; | ||
660 | |||
661 | if ((r = check_space(t))) | ||
662 | return r; | ||
663 | |||
664 | tgt = t->targets + t->num_targets; | ||
665 | memset(tgt, 0, sizeof(*tgt)); | ||
666 | |||
667 | if (!len) { | ||
668 | tgt->error = "zero-length target"; | ||
669 | DMERR("%s", tgt->error); | ||
670 | return -EINVAL; | ||
671 | } | ||
672 | |||
673 | tgt->type = dm_get_target_type(type); | ||
674 | if (!tgt->type) { | ||
675 | tgt->error = "unknown target type"; | ||
676 | DMERR("%s", tgt->error); | ||
677 | return -EINVAL; | ||
678 | } | ||
679 | |||
680 | tgt->table = t; | ||
681 | tgt->begin = start; | ||
682 | tgt->len = len; | ||
683 | tgt->error = "Unknown error"; | ||
684 | |||
685 | /* | ||
686 | * Does this target adjoin the previous one ? | ||
687 | */ | ||
688 | if (!adjoin(t, tgt)) { | ||
689 | tgt->error = "Gap in table"; | ||
690 | r = -EINVAL; | ||
691 | goto bad; | ||
692 | } | ||
693 | |||
694 | r = dm_split_args(&argc, &argv, params); | ||
695 | if (r) { | ||
696 | tgt->error = "couldn't split parameters (insufficient memory)"; | ||
697 | goto bad; | ||
698 | } | ||
699 | |||
700 | r = tgt->type->ctr(tgt, argc, argv); | ||
701 | kfree(argv); | ||
702 | if (r) | ||
703 | goto bad; | ||
704 | |||
705 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; | ||
706 | |||
707 | /* FIXME: the plan is to combine high here and then have | ||
708 | * the merge fn apply the target level restrictions. */ | ||
709 | combine_restrictions_low(&t->limits, &tgt->limits); | ||
710 | return 0; | ||
711 | |||
712 | bad: | ||
713 | DMERR("%s", tgt->error); | ||
714 | dm_put_target_type(tgt->type); | ||
715 | return r; | ||
716 | } | ||
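Together with dm_table_create() above and dm_table_complete() below, this is the sequence a caller such as the ioctl layer walks for every table load. A hedged in-kernel sketch of that flow for a single hypothetical 200MiB target ("linear" with "<dev> <offset>" parameters is assumed here; error handling is trimmed):

/* Hypothetical sketch, not part of the patch: build and index a
 * one-target table.  Assumes the declarations from dm.h. */
static int build_example_table(struct dm_table **result)
{
	char params[] = "/dev/sda 0";	/* writable: dm_split_args() edits it in place */
	struct dm_table *t;
	int r;

	r = dm_table_create(&t, FMODE_READ | FMODE_WRITE, 1);
	if (r)
		return r;

	/* one 200MiB (409600-sector) target starting at sector 0 */
	r = dm_table_add_target(t, "linear", 0, 409600, params);
	if (r)
		goto bad;

	r = dm_table_complete(t);	/* builds the btree index over t->highs */
	if (r)
		goto bad;

	*result = t;
	return 0;

bad:
	dm_table_put(t);	/* drops the initial holder; table_destroy() runs */
	return r;
}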
717 | |||
718 | static int setup_indexes(struct dm_table *t) | ||
719 | { | ||
720 | int i; | ||
721 | unsigned int total = 0; | ||
722 | sector_t *indexes; | ||
723 | |||
724 | /* allocate the space for *all* the indexes */ | ||
725 | for (i = t->depth - 2; i >= 0; i--) { | ||
726 | t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE); | ||
727 | total += t->counts[i]; | ||
728 | } | ||
729 | |||
730 | indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE); | ||
731 | if (!indexes) | ||
732 | return -ENOMEM; | ||
733 | |||
734 | /* set up internal nodes, bottom-up */ | ||
735 | for (i = t->depth - 2, total = 0; i >= 0; i--) { | ||
736 | t->index[i] = indexes; | ||
737 | indexes += (KEYS_PER_NODE * t->counts[i]); | ||
738 | setup_btree_index(i, t); | ||
739 | } | ||
740 | |||
741 | return 0; | ||
742 | } | ||
743 | |||
744 | /* | ||
745 | * Builds the btree to index the map. | ||
746 | */ | ||
747 | int dm_table_complete(struct dm_table *t) | ||
748 | { | ||
749 | int r = 0; | ||
750 | unsigned int leaf_nodes; | ||
751 | |||
752 | check_for_valid_limits(&t->limits); | ||
753 | |||
754 | /* how many indexes will the btree have ? */ | ||
755 | leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); | ||
756 | t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); | ||
757 | |||
758 | /* leaf layer has already been set up */ | ||
759 | t->counts[t->depth - 1] = leaf_nodes; | ||
760 | t->index[t->depth - 1] = t->highs; | ||
761 | |||
762 | if (t->depth >= 2) | ||
763 | r = setup_indexes(t); | ||
764 | |||
765 | return r; | ||
766 | } | ||
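The depth computed above is just a ceiling division followed by a rounded-up logarithm: leaf_nodes = ceil(num_targets / KEYS_PER_NODE) and depth = 1 + ceil(log base CHILDREN_PER_NODE of leaf_nodes). A quick stand-alone model, assuming (purely for illustration) a 64-byte node of 8-byte keys, i.e. KEYS_PER_NODE = 7 and CHILDREN_PER_NODE = 8, and assuming int_log() is the usual rounded-up integer logarithm:

#include <stdio.h>

/* Assumed geometry: a 64-byte node of 8-byte keys -> 7 keys, 8 children. */
#define KEYS_PER_NODE		7
#define CHILDREN_PER_NODE	(KEYS_PER_NODE + 1)

static unsigned int div_up(unsigned int n, unsigned int d)
{
	return (n + d - 1) / d;
}

/* Rounded-up integer log of n to the given base. */
static unsigned int int_log(unsigned int n, unsigned int base)
{
	unsigned int result = 0;

	while (n > 1) {
		n = div_up(n, base);
		result++;
	}
	return result;
}

int main(void)
{
	unsigned int num_targets;

	for (num_targets = 1; num_targets <= 1000; num_targets *= 10) {
		unsigned int leaves = div_up(num_targets, KEYS_PER_NODE);
		unsigned int depth = 1 + int_log(leaves, CHILDREN_PER_NODE);

		printf("%4u targets -> %3u leaves, depth %u\n",
		       num_targets, leaves, depth);
	}
	return 0;
}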
767 | |||
768 | static DECLARE_MUTEX(_event_lock); | ||
769 | void dm_table_event_callback(struct dm_table *t, | ||
770 | void (*fn)(void *), void *context) | ||
771 | { | ||
772 | down(&_event_lock); | ||
773 | t->event_fn = fn; | ||
774 | t->event_context = context; | ||
775 | up(&_event_lock); | ||
776 | } | ||
777 | |||
778 | void dm_table_event(struct dm_table *t) | ||
779 | { | ||
780 | /* | ||
781 | * You can no longer call dm_table_event() from interrupt | ||
782 | * context, use a bottom half instead. | ||
783 | */ | ||
784 | BUG_ON(in_interrupt()); | ||
785 | |||
786 | down(&_event_lock); | ||
787 | if (t->event_fn) | ||
788 | t->event_fn(t->event_context); | ||
789 | up(&_event_lock); | ||
790 | } | ||
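dm_table_event_callback() installs a single notification hook on the table and dm_table_event() fires it under _event_lock; dm.c later in this patch uses exactly this pair to bump md->event_nr and wake waiters. A hedged sketch of a hypothetical client wiring its own context through the hook (kernel context assumed):

/* Hypothetical listener, not part of the patch. */
struct my_listener {
	atomic_t seen;
};

static void my_event_fn(void *context)
{
	struct my_listener *l = context;

	atomic_inc(&l->seen);	/* runs every time dm_table_event() fires */
}

static void watch_table(struct dm_table *t, struct my_listener *l)
{
	atomic_set(&l->seen, 0);
	dm_table_event_callback(t, my_event_fn, l);	/* only one hook per table */
}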
791 | |||
792 | sector_t dm_table_get_size(struct dm_table *t) | ||
793 | { | ||
794 | return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; | ||
795 | } | ||
796 | |||
797 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) | ||
798 | { | ||
799 | 	if (index >= t->num_targets)	| ||
800 | return NULL; | ||
801 | |||
802 | return t->targets + index; | ||
803 | } | ||
804 | |||
805 | /* | ||
806 | * Search the btree for the correct target. | ||
807 | */ | ||
808 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) | ||
809 | { | ||
810 | unsigned int l, n = 0, k = 0; | ||
811 | sector_t *node; | ||
812 | |||
813 | for (l = 0; l < t->depth; l++) { | ||
814 | n = get_child(n, k); | ||
815 | node = get_node(t, l, n); | ||
816 | |||
817 | for (k = 0; k < KEYS_PER_NODE; k++) | ||
818 | if (node[k] >= sector) | ||
819 | break; | ||
820 | } | ||
821 | |||
822 | return &t->targets[(KEYS_PER_NODE * n) + k]; | ||
823 | } | ||
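At every level the walk above steps into child n and then scans that node for the first key >= sector; at the leaf level the keys are the 'highs' array (the last sector of each target), so the scan lands on the target whose range contains the sector. A flat, single-level model of that rule with hypothetical target boundaries:

#include <stdio.h>

typedef unsigned long long sector_t;

/* One-level model: highs[i] is the last sector owned by target i. */
static unsigned int find_target(const sector_t *highs, unsigned int n,
				sector_t sector)
{
	unsigned int k;

	for (k = 0; k < n; k++)
		if (highs[k] >= sector)
			break;		/* first high >= sector wins */
	return k;
}

int main(void)
{
	/* Three hypothetical targets covering sectors 0-99, 100-299, 300-999. */
	sector_t highs[] = { 99, 299, 999 };

	printf("%u\n", find_target(highs, 3, 0));	/* 0 */
	printf("%u\n", find_target(highs, 3, 100));	/* 1 */
	printf("%u\n", find_target(highs, 3, 299));	/* 1 */
	printf("%u\n", find_target(highs, 3, 300));	/* 2 */
	return 0;
}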
824 | |||
825 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) | ||
826 | { | ||
827 | /* | ||
828 | 	 * Make sure we obey the restrictions accumulated from the	| ||
829 | 	 * underlying (sub) devices.	| ||
830 | */ | ||
831 | blk_queue_max_sectors(q, t->limits.max_sectors); | ||
832 | q->max_phys_segments = t->limits.max_phys_segments; | ||
833 | q->max_hw_segments = t->limits.max_hw_segments; | ||
834 | q->hardsect_size = t->limits.hardsect_size; | ||
835 | q->max_segment_size = t->limits.max_segment_size; | ||
836 | q->seg_boundary_mask = t->limits.seg_boundary_mask; | ||
837 | } | ||
838 | |||
839 | unsigned int dm_table_get_num_targets(struct dm_table *t) | ||
840 | { | ||
841 | return t->num_targets; | ||
842 | } | ||
843 | |||
844 | struct list_head *dm_table_get_devices(struct dm_table *t) | ||
845 | { | ||
846 | return &t->devices; | ||
847 | } | ||
848 | |||
849 | int dm_table_get_mode(struct dm_table *t) | ||
850 | { | ||
851 | return t->mode; | ||
852 | } | ||
853 | |||
854 | static void suspend_targets(struct dm_table *t, unsigned postsuspend) | ||
855 | { | ||
856 | int i = t->num_targets; | ||
857 | struct dm_target *ti = t->targets; | ||
858 | |||
859 | while (i--) { | ||
860 | if (postsuspend) { | ||
861 | if (ti->type->postsuspend) | ||
862 | ti->type->postsuspend(ti); | ||
863 | } else if (ti->type->presuspend) | ||
864 | ti->type->presuspend(ti); | ||
865 | |||
866 | ti++; | ||
867 | } | ||
868 | } | ||
869 | |||
870 | void dm_table_presuspend_targets(struct dm_table *t) | ||
871 | { | ||
872 | return suspend_targets(t, 0); | ||
873 | } | ||
874 | |||
875 | void dm_table_postsuspend_targets(struct dm_table *t) | ||
876 | { | ||
877 | return suspend_targets(t, 1); | ||
878 | } | ||
879 | |||
880 | void dm_table_resume_targets(struct dm_table *t) | ||
881 | { | ||
882 | int i; | ||
883 | |||
884 | for (i = 0; i < t->num_targets; i++) { | ||
885 | struct dm_target *ti = t->targets + i; | ||
886 | |||
887 | if (ti->type->resume) | ||
888 | ti->type->resume(ti); | ||
889 | } | ||
890 | } | ||
891 | |||
892 | int dm_table_any_congested(struct dm_table *t, int bdi_bits) | ||
893 | { | ||
894 | struct list_head *d, *devices; | ||
895 | int r = 0; | ||
896 | |||
897 | devices = dm_table_get_devices(t); | ||
898 | for (d = devices->next; d != devices; d = d->next) { | ||
899 | struct dm_dev *dd = list_entry(d, struct dm_dev, list); | ||
900 | request_queue_t *q = bdev_get_queue(dd->bdev); | ||
901 | r |= bdi_congested(&q->backing_dev_info, bdi_bits); | ||
902 | } | ||
903 | |||
904 | return r; | ||
905 | } | ||
906 | |||
907 | void dm_table_unplug_all(struct dm_table *t) | ||
908 | { | ||
909 | struct list_head *d, *devices = dm_table_get_devices(t); | ||
910 | |||
911 | for (d = devices->next; d != devices; d = d->next) { | ||
912 | struct dm_dev *dd = list_entry(d, struct dm_dev, list); | ||
913 | request_queue_t *q = bdev_get_queue(dd->bdev); | ||
914 | |||
915 | if (q->unplug_fn) | ||
916 | q->unplug_fn(q); | ||
917 | } | ||
918 | } | ||
919 | |||
920 | int dm_table_flush_all(struct dm_table *t) | ||
921 | { | ||
922 | struct list_head *d, *devices = dm_table_get_devices(t); | ||
923 | int ret = 0; | ||
924 | |||
925 | for (d = devices->next; d != devices; d = d->next) { | ||
926 | struct dm_dev *dd = list_entry(d, struct dm_dev, list); | ||
927 | request_queue_t *q = bdev_get_queue(dd->bdev); | ||
928 | int err; | ||
929 | |||
930 | if (!q->issue_flush_fn) | ||
931 | err = -EOPNOTSUPP; | ||
932 | else | ||
933 | err = q->issue_flush_fn(q, dd->bdev->bd_disk, NULL); | ||
934 | |||
935 | if (!ret) | ||
936 | ret = err; | ||
937 | } | ||
938 | |||
939 | return ret; | ||
940 | } | ||
941 | |||
942 | EXPORT_SYMBOL(dm_vcalloc); | ||
943 | EXPORT_SYMBOL(dm_get_device); | ||
944 | EXPORT_SYMBOL(dm_put_device); | ||
945 | EXPORT_SYMBOL(dm_table_event); | ||
946 | EXPORT_SYMBOL(dm_table_get_mode); | ||
947 | EXPORT_SYMBOL(dm_table_put); | ||
948 | EXPORT_SYMBOL(dm_table_get); | ||
949 | EXPORT_SYMBOL(dm_table_unplug_all); | ||
950 | EXPORT_SYMBOL(dm_table_flush_all); | ||
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c new file mode 100644 index 000000000000..aecd9e0c2616 --- /dev/null +++ b/drivers/md/dm-target.c | |||
@@ -0,0 +1,196 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001 Sistina Software (UK) Limited | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm.h" | ||
8 | |||
9 | #include <linux/module.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/kmod.h> | ||
12 | #include <linux/bio.h> | ||
13 | #include <linux/slab.h> | ||
14 | |||
15 | struct tt_internal { | ||
16 | struct target_type tt; | ||
17 | |||
18 | struct list_head list; | ||
19 | long use; | ||
20 | }; | ||
21 | |||
22 | static LIST_HEAD(_targets); | ||
23 | static DECLARE_RWSEM(_lock); | ||
24 | |||
25 | #define DM_MOD_NAME_SIZE 32 | ||
26 | |||
27 | static inline struct tt_internal *__find_target_type(const char *name) | ||
28 | { | ||
29 | struct tt_internal *ti; | ||
30 | |||
31 | list_for_each_entry (ti, &_targets, list) | ||
32 | if (!strcmp(name, ti->tt.name)) | ||
33 | return ti; | ||
34 | |||
35 | return NULL; | ||
36 | } | ||
37 | |||
38 | static struct tt_internal *get_target_type(const char *name) | ||
39 | { | ||
40 | struct tt_internal *ti; | ||
41 | |||
42 | down_read(&_lock); | ||
43 | |||
44 | ti = __find_target_type(name); | ||
45 | if (ti) { | ||
46 | if ((ti->use == 0) && !try_module_get(ti->tt.module)) | ||
47 | ti = NULL; | ||
48 | else | ||
49 | ti->use++; | ||
50 | } | ||
51 | |||
52 | up_read(&_lock); | ||
53 | return ti; | ||
54 | } | ||
55 | |||
56 | static void load_module(const char *name) | ||
57 | { | ||
58 | request_module("dm-%s", name); | ||
59 | } | ||
60 | |||
61 | struct target_type *dm_get_target_type(const char *name) | ||
62 | { | ||
63 | struct tt_internal *ti = get_target_type(name); | ||
64 | |||
65 | if (!ti) { | ||
66 | load_module(name); | ||
67 | ti = get_target_type(name); | ||
68 | } | ||
69 | |||
70 | return ti ? &ti->tt : NULL; | ||
71 | } | ||
72 | |||
73 | void dm_put_target_type(struct target_type *t) | ||
74 | { | ||
75 | struct tt_internal *ti = (struct tt_internal *) t; | ||
76 | |||
77 | down_read(&_lock); | ||
78 | if (--ti->use == 0) | ||
79 | module_put(ti->tt.module); | ||
80 | |||
81 | if (ti->use < 0) | ||
82 | BUG(); | ||
83 | up_read(&_lock); | ||
84 | |||
85 | return; | ||
86 | } | ||
87 | |||
88 | static struct tt_internal *alloc_target(struct target_type *t) | ||
89 | { | ||
90 | struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL); | ||
91 | |||
92 | if (ti) { | ||
93 | memset(ti, 0, sizeof(*ti)); | ||
94 | ti->tt = *t; | ||
95 | } | ||
96 | |||
97 | return ti; | ||
98 | } | ||
99 | |||
100 | |||
101 | int dm_target_iterate(void (*iter_func)(struct target_type *tt, | ||
102 | void *param), void *param) | ||
103 | { | ||
104 | struct tt_internal *ti; | ||
105 | |||
106 | down_read(&_lock); | ||
107 | list_for_each_entry (ti, &_targets, list) | ||
108 | iter_func(&ti->tt, param); | ||
109 | up_read(&_lock); | ||
110 | |||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | int dm_register_target(struct target_type *t) | ||
115 | { | ||
116 | int rv = 0; | ||
117 | struct tt_internal *ti = alloc_target(t); | ||
118 | |||
119 | if (!ti) | ||
120 | return -ENOMEM; | ||
121 | |||
122 | down_write(&_lock); | ||
123 | if (__find_target_type(t->name)) | ||
124 | rv = -EEXIST; | ||
125 | else | ||
126 | list_add(&ti->list, &_targets); | ||
127 | |||
128 | up_write(&_lock); | ||
129 | if (rv) | ||
130 | kfree(ti); | ||
131 | return rv; | ||
132 | } | ||
133 | |||
134 | int dm_unregister_target(struct target_type *t) | ||
135 | { | ||
136 | struct tt_internal *ti; | ||
137 | |||
138 | down_write(&_lock); | ||
139 | if (!(ti = __find_target_type(t->name))) { | ||
140 | up_write(&_lock); | ||
141 | return -EINVAL; | ||
142 | } | ||
143 | |||
144 | if (ti->use) { | ||
145 | up_write(&_lock); | ||
146 | return -ETXTBSY; | ||
147 | } | ||
148 | |||
149 | list_del(&ti->list); | ||
150 | kfree(ti); | ||
151 | |||
152 | up_write(&_lock); | ||
153 | return 0; | ||
154 | } | ||
155 | |||
156 | /* | ||
157 | * io-err: always fails an io, useful for bringing | ||
158 | * up LVs that have holes in them. | ||
159 | */ | ||
160 | static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args) | ||
161 | { | ||
162 | return 0; | ||
163 | } | ||
164 | |||
165 | static void io_err_dtr(struct dm_target *ti) | ||
166 | { | ||
167 | /* empty */ | ||
168 | } | ||
169 | |||
170 | static int io_err_map(struct dm_target *ti, struct bio *bio, | ||
171 | union map_info *map_context) | ||
172 | { | ||
173 | return -EIO; | ||
174 | } | ||
175 | |||
176 | static struct target_type error_target = { | ||
177 | .name = "error", | ||
178 | .version = {1, 0, 1}, | ||
179 | .ctr = io_err_ctr, | ||
180 | .dtr = io_err_dtr, | ||
181 | .map = io_err_map, | ||
182 | }; | ||
183 | |||
184 | int __init dm_target_init(void) | ||
185 | { | ||
186 | return dm_register_target(&error_target); | ||
187 | } | ||
188 | |||
189 | void dm_target_exit(void) | ||
190 | { | ||
191 | if (dm_unregister_target(&error_target)) | ||
192 | DMWARN("error target unregistration failed"); | ||
193 | } | ||
194 | |||
195 | EXPORT_SYMBOL(dm_register_target); | ||
196 | EXPORT_SYMBOL(dm_unregister_target); | ||
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c new file mode 100644 index 000000000000..7febc2cac73d --- /dev/null +++ b/drivers/md/dm-zero.c | |||
@@ -0,0 +1,81 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Christophe Saout <christophe@saout.de> | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm.h" | ||
8 | |||
9 | #include <linux/module.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/bio.h> | ||
12 | |||
13 | /* | ||
14 | * Construct a dummy mapping that only returns zeros | ||
15 | */ | ||
16 | static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
17 | { | ||
18 | if (argc != 0) { | ||
19 | ti->error = "dm-zero: No arguments required"; | ||
20 | return -EINVAL; | ||
21 | } | ||
22 | |||
23 | return 0; | ||
24 | } | ||
25 | |||
26 | /* | ||
27 | * Return zeros only on reads | ||
28 | */ | ||
29 | static int zero_map(struct dm_target *ti, struct bio *bio, | ||
30 | union map_info *map_context) | ||
31 | { | ||
32 | switch(bio_rw(bio)) { | ||
33 | case READ: | ||
34 | zero_fill_bio(bio); | ||
35 | break; | ||
36 | case READA: | ||
37 | /* readahead of null bytes only wastes buffer cache */ | ||
38 | return -EIO; | ||
39 | case WRITE: | ||
40 | /* writes get silently dropped */ | ||
41 | break; | ||
42 | } | ||
43 | |||
44 | bio_endio(bio, bio->bi_size, 0); | ||
45 | |||
46 | /* accepted bio, don't make new request */ | ||
47 | return 0; | ||
48 | } | ||
49 | |||
50 | static struct target_type zero_target = { | ||
51 | .name = "zero", | ||
52 | .version = {1, 0, 0}, | ||
53 | .module = THIS_MODULE, | ||
54 | .ctr = zero_ctr, | ||
55 | .map = zero_map, | ||
56 | }; | ||
57 | |||
58 | int __init dm_zero_init(void) | ||
59 | { | ||
60 | int r = dm_register_target(&zero_target); | ||
61 | |||
62 | if (r < 0) | ||
63 | DMERR("zero: register failed %d", r); | ||
64 | |||
65 | return r; | ||
66 | } | ||
67 | |||
68 | void __exit dm_zero_exit(void) | ||
69 | { | ||
70 | int r = dm_unregister_target(&zero_target); | ||
71 | |||
72 | if (r < 0) | ||
73 | DMERR("zero: unregister failed %d", r); | ||
74 | } | ||
75 | |||
76 | module_init(dm_zero_init) | ||
77 | module_exit(dm_zero_exit) | ||
78 | |||
79 | MODULE_AUTHOR("Christophe Saout <christophe@saout.de>"); | ||
80 | MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros"); | ||
81 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm.c b/drivers/md/dm.c new file mode 100644 index 000000000000..243ff6884e83 --- /dev/null +++ b/drivers/md/dm.c | |||
@@ -0,0 +1,1194 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. | ||
3 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | */ | ||
7 | |||
8 | #include "dm.h" | ||
9 | #include "dm-bio-list.h" | ||
10 | |||
11 | #include <linux/init.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/moduleparam.h> | ||
14 | #include <linux/blkpg.h> | ||
15 | #include <linux/bio.h> | ||
16 | #include <linux/buffer_head.h> | ||
17 | #include <linux/mempool.h> | ||
18 | #include <linux/slab.h> | ||
19 | #include <linux/idr.h> | ||
20 | |||
21 | static const char *_name = DM_NAME; | ||
22 | |||
23 | static unsigned int major = 0; | ||
24 | static unsigned int _major = 0; | ||
25 | |||
26 | /* | ||
27 | * One of these is allocated per bio. | ||
28 | */ | ||
29 | struct dm_io { | ||
30 | struct mapped_device *md; | ||
31 | int error; | ||
32 | struct bio *bio; | ||
33 | atomic_t io_count; | ||
34 | }; | ||
35 | |||
36 | /* | ||
37 | * One of these is allocated per target within a bio. Hopefully | ||
38 | * this will be simplified out one day. | ||
39 | */ | ||
40 | struct target_io { | ||
41 | struct dm_io *io; | ||
42 | struct dm_target *ti; | ||
43 | union map_info info; | ||
44 | }; | ||
45 | |||
46 | union map_info *dm_get_mapinfo(struct bio *bio) | ||
47 | { | ||
48 | if (bio && bio->bi_private) | ||
49 | return &((struct target_io *)bio->bi_private)->info; | ||
50 | return NULL; | ||
51 | } | ||
52 | |||
53 | /* | ||
54 | * Bits for the md->flags field. | ||
55 | */ | ||
56 | #define DMF_BLOCK_IO 0 | ||
57 | #define DMF_SUSPENDED 1 | ||
58 | #define DMF_FS_LOCKED 2 | ||
59 | |||
60 | struct mapped_device { | ||
61 | struct rw_semaphore lock; | ||
62 | rwlock_t map_lock; | ||
63 | atomic_t holders; | ||
64 | |||
65 | unsigned long flags; | ||
66 | |||
67 | request_queue_t *queue; | ||
68 | struct gendisk *disk; | ||
69 | |||
70 | void *interface_ptr; | ||
71 | |||
72 | /* | ||
73 | * A list of ios that arrived while we were suspended. | ||
74 | */ | ||
75 | atomic_t pending; | ||
76 | wait_queue_head_t wait; | ||
77 | struct bio_list deferred; | ||
78 | |||
79 | /* | ||
80 | * The current mapping. | ||
81 | */ | ||
82 | struct dm_table *map; | ||
83 | |||
84 | /* | ||
85 | * io objects are allocated from here. | ||
86 | */ | ||
87 | mempool_t *io_pool; | ||
88 | mempool_t *tio_pool; | ||
89 | |||
90 | /* | ||
91 | * Event handling. | ||
92 | */ | ||
93 | atomic_t event_nr; | ||
94 | wait_queue_head_t eventq; | ||
95 | |||
96 | /* | ||
97 | 	 * freeze/thaw support requires holding onto a super block	| ||
98 | */ | ||
99 | struct super_block *frozen_sb; | ||
100 | }; | ||
101 | |||
102 | #define MIN_IOS 256 | ||
103 | static kmem_cache_t *_io_cache; | ||
104 | static kmem_cache_t *_tio_cache; | ||
105 | |||
106 | static struct bio_set *dm_set; | ||
107 | |||
108 | static int __init local_init(void) | ||
109 | { | ||
110 | int r; | ||
111 | |||
112 | dm_set = bioset_create(16, 16, 4); | ||
113 | if (!dm_set) | ||
114 | return -ENOMEM; | ||
115 | |||
116 | /* allocate a slab for the dm_ios */ | ||
117 | _io_cache = kmem_cache_create("dm_io", | ||
118 | sizeof(struct dm_io), 0, 0, NULL, NULL); | ||
119 | if (!_io_cache) | ||
120 | return -ENOMEM; | ||
121 | |||
122 | /* allocate a slab for the target ios */ | ||
123 | _tio_cache = kmem_cache_create("dm_tio", sizeof(struct target_io), | ||
124 | 0, 0, NULL, NULL); | ||
125 | if (!_tio_cache) { | ||
126 | kmem_cache_destroy(_io_cache); | ||
127 | return -ENOMEM; | ||
128 | } | ||
129 | |||
130 | _major = major; | ||
131 | r = register_blkdev(_major, _name); | ||
132 | if (r < 0) { | ||
133 | kmem_cache_destroy(_tio_cache); | ||
134 | kmem_cache_destroy(_io_cache); | ||
135 | return r; | ||
136 | } | ||
137 | |||
138 | if (!_major) | ||
139 | _major = r; | ||
140 | |||
141 | return 0; | ||
142 | } | ||
143 | |||
144 | static void local_exit(void) | ||
145 | { | ||
146 | kmem_cache_destroy(_tio_cache); | ||
147 | kmem_cache_destroy(_io_cache); | ||
148 | |||
149 | bioset_free(dm_set); | ||
150 | |||
151 | if (unregister_blkdev(_major, _name) < 0) | ||
152 | DMERR("devfs_unregister_blkdev failed"); | ||
153 | |||
154 | _major = 0; | ||
155 | |||
156 | DMINFO("cleaned up"); | ||
157 | } | ||
158 | |||
159 | int (*_inits[])(void) __initdata = { | ||
160 | local_init, | ||
161 | dm_target_init, | ||
162 | dm_linear_init, | ||
163 | dm_stripe_init, | ||
164 | dm_interface_init, | ||
165 | }; | ||
166 | |||
167 | void (*_exits[])(void) = { | ||
168 | local_exit, | ||
169 | dm_target_exit, | ||
170 | dm_linear_exit, | ||
171 | dm_stripe_exit, | ||
172 | dm_interface_exit, | ||
173 | }; | ||
174 | |||
175 | static int __init dm_init(void) | ||
176 | { | ||
177 | const int count = ARRAY_SIZE(_inits); | ||
178 | |||
179 | int r, i; | ||
180 | |||
181 | for (i = 0; i < count; i++) { | ||
182 | r = _inits[i](); | ||
183 | if (r) | ||
184 | goto bad; | ||
185 | } | ||
186 | |||
187 | return 0; | ||
188 | |||
189 | bad: | ||
190 | while (i--) | ||
191 | _exits[i](); | ||
192 | |||
193 | return r; | ||
194 | } | ||
195 | |||
196 | static void __exit dm_exit(void) | ||
197 | { | ||
198 | int i = ARRAY_SIZE(_exits); | ||
199 | |||
200 | while (i--) | ||
201 | _exits[i](); | ||
202 | } | ||
203 | |||
204 | /* | ||
205 | * Block device functions | ||
206 | */ | ||
207 | static int dm_blk_open(struct inode *inode, struct file *file) | ||
208 | { | ||
209 | struct mapped_device *md; | ||
210 | |||
211 | md = inode->i_bdev->bd_disk->private_data; | ||
212 | dm_get(md); | ||
213 | return 0; | ||
214 | } | ||
215 | |||
216 | static int dm_blk_close(struct inode *inode, struct file *file) | ||
217 | { | ||
218 | struct mapped_device *md; | ||
219 | |||
220 | md = inode->i_bdev->bd_disk->private_data; | ||
221 | dm_put(md); | ||
222 | return 0; | ||
223 | } | ||
224 | |||
225 | static inline struct dm_io *alloc_io(struct mapped_device *md) | ||
226 | { | ||
227 | return mempool_alloc(md->io_pool, GFP_NOIO); | ||
228 | } | ||
229 | |||
230 | static inline void free_io(struct mapped_device *md, struct dm_io *io) | ||
231 | { | ||
232 | mempool_free(io, md->io_pool); | ||
233 | } | ||
234 | |||
235 | static inline struct target_io *alloc_tio(struct mapped_device *md) | ||
236 | { | ||
237 | return mempool_alloc(md->tio_pool, GFP_NOIO); | ||
238 | } | ||
239 | |||
240 | static inline void free_tio(struct mapped_device *md, struct target_io *tio) | ||
241 | { | ||
242 | mempool_free(tio, md->tio_pool); | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Add the bio to the list of deferred io. | ||
247 | */ | ||
248 | static int queue_io(struct mapped_device *md, struct bio *bio) | ||
249 | { | ||
250 | down_write(&md->lock); | ||
251 | |||
252 | if (!test_bit(DMF_BLOCK_IO, &md->flags)) { | ||
253 | up_write(&md->lock); | ||
254 | return 1; | ||
255 | } | ||
256 | |||
257 | bio_list_add(&md->deferred, bio); | ||
258 | |||
259 | up_write(&md->lock); | ||
260 | return 0; /* deferred successfully */ | ||
261 | } | ||
262 | |||
263 | /* | ||
264 | * Everyone (including functions in this file), should use this | ||
265 | * function to access the md->map field, and make sure they call | ||
266 | * dm_table_put() when finished. | ||
267 | */ | ||
268 | struct dm_table *dm_get_table(struct mapped_device *md) | ||
269 | { | ||
270 | struct dm_table *t; | ||
271 | |||
272 | read_lock(&md->map_lock); | ||
273 | t = md->map; | ||
274 | if (t) | ||
275 | dm_table_get(t); | ||
276 | read_unlock(&md->map_lock); | ||
277 | |||
278 | return t; | ||
279 | } | ||
280 | |||
281 | /*----------------------------------------------------------------- | ||
282 | * CRUD START: | ||
283 |  * A more elegant solution that uses the queue merge fn is in	| ||
284 |  * the works; unfortunately it needs a couple of changes to the	| ||
285 |  * block layer that I still want to make. So, in the interests	| ||
286 |  * of getting something for people to use, I give you this	| ||
287 |  * clearly demarcated crap.	| ||
288 | *---------------------------------------------------------------*/ | ||
289 | |||
290 | /* | ||
291 | * Decrements the number of outstanding ios that a bio has been | ||
292 |  * cloned into, completing the original io if necessary.	| ||
293 | */ | ||
294 | static inline void dec_pending(struct dm_io *io, int error) | ||
295 | { | ||
296 | if (error) | ||
297 | io->error = error; | ||
298 | |||
299 | if (atomic_dec_and_test(&io->io_count)) { | ||
300 | if (atomic_dec_and_test(&io->md->pending)) | ||
301 | /* nudge anyone waiting on suspend queue */ | ||
302 | wake_up(&io->md->wait); | ||
303 | |||
304 | bio_endio(io->bio, io->bio->bi_size, io->error); | ||
305 | free_io(io->md, io); | ||
306 | } | ||
307 | } | ||
308 | |||
309 | static int clone_endio(struct bio *bio, unsigned int done, int error) | ||
310 | { | ||
311 | int r = 0; | ||
312 | struct target_io *tio = bio->bi_private; | ||
313 | struct dm_io *io = tio->io; | ||
314 | dm_endio_fn endio = tio->ti->type->end_io; | ||
315 | |||
316 | if (bio->bi_size) | ||
317 | return 1; | ||
318 | |||
319 | if (!bio_flagged(bio, BIO_UPTODATE) && !error) | ||
320 | error = -EIO; | ||
321 | |||
322 | if (endio) { | ||
323 | r = endio(tio->ti, bio, error, &tio->info); | ||
324 | if (r < 0) | ||
325 | error = r; | ||
326 | |||
327 | else if (r > 0) | ||
328 | /* the target wants another shot at the io */ | ||
329 | return 1; | ||
330 | } | ||
331 | |||
332 | free_tio(io->md, tio); | ||
333 | dec_pending(io, error); | ||
334 | bio_put(bio); | ||
335 | return r; | ||
336 | } | ||
337 | |||
338 | static sector_t max_io_len(struct mapped_device *md, | ||
339 | sector_t sector, struct dm_target *ti) | ||
340 | { | ||
341 | sector_t offset = sector - ti->begin; | ||
342 | sector_t len = ti->len - offset; | ||
343 | |||
344 | /* | ||
345 | * Does the target need to split even further ? | ||
346 | */ | ||
347 | if (ti->split_io) { | ||
348 | sector_t boundary; | ||
349 | boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) | ||
350 | - offset; | ||
351 | if (len > boundary) | ||
352 | len = boundary; | ||
353 | } | ||
354 | |||
355 | return len; | ||
356 | } | ||
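When a target sets split_io (a power of two, e.g. a stripe chunk size), the expression above rounds the offset up to the next split_io boundary and caps the io length there, so no clone ever straddles a chunk. A small stand-alone check of that arithmetic with hypothetical numbers:

#include <stdio.h>

typedef unsigned long long sector_t;

/* Length cap used above: stop at the next split_io boundary (power of two). */
static sector_t cap_to_boundary(sector_t offset, sector_t len, sector_t split_io)
{
	sector_t boundary = ((offset + split_io) & ~(split_io - 1)) - offset;

	return len > boundary ? boundary : len;
}

int main(void)
{
	/* Hypothetical striped target with 64-sector chunks. */
	printf("%llu\n", cap_to_boundary(0,  200, 64));	/* 64: full first chunk */
	printf("%llu\n", cap_to_boundary(10, 200, 64));	/* 54: up to sector 64 */
	printf("%llu\n", cap_to_boundary(10,  20, 64));	/* 20: already fits */
	return 0;
}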
357 | |||
358 | static void __map_bio(struct dm_target *ti, struct bio *clone, | ||
359 | struct target_io *tio) | ||
360 | { | ||
361 | int r; | ||
362 | |||
363 | /* | ||
364 | * Sanity checks. | ||
365 | */ | ||
366 | BUG_ON(!clone->bi_size); | ||
367 | |||
368 | clone->bi_end_io = clone_endio; | ||
369 | clone->bi_private = tio; | ||
370 | |||
371 | /* | ||
372 | * Map the clone. If r == 0 we don't need to do | ||
373 | * anything, the target has assumed ownership of | ||
374 | * this io. | ||
375 | */ | ||
376 | atomic_inc(&tio->io->io_count); | ||
377 | r = ti->type->map(ti, clone, &tio->info); | ||
378 | if (r > 0) | ||
379 | /* the bio has been remapped so dispatch it */ | ||
380 | generic_make_request(clone); | ||
381 | |||
382 | else if (r < 0) { | ||
383 | /* error the io and bail out */ | ||
384 | struct dm_io *io = tio->io; | ||
385 | free_tio(tio->io->md, tio); | ||
386 | dec_pending(io, -EIO); | ||
387 | bio_put(clone); | ||
388 | } | ||
389 | } | ||
390 | |||
391 | struct clone_info { | ||
392 | struct mapped_device *md; | ||
393 | struct dm_table *map; | ||
394 | struct bio *bio; | ||
395 | struct dm_io *io; | ||
396 | sector_t sector; | ||
397 | sector_t sector_count; | ||
398 | unsigned short idx; | ||
399 | }; | ||
400 | |||
401 | /* | ||
402 |  * Creates a little bio that just does part of a bvec.	| ||
403 | */ | ||
404 | static struct bio *split_bvec(struct bio *bio, sector_t sector, | ||
405 | unsigned short idx, unsigned int offset, | ||
406 | unsigned int len) | ||
407 | { | ||
408 | struct bio *clone; | ||
409 | struct bio_vec *bv = bio->bi_io_vec + idx; | ||
410 | |||
411 | clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set); | ||
412 | *clone->bi_io_vec = *bv; | ||
413 | |||
414 | clone->bi_sector = sector; | ||
415 | clone->bi_bdev = bio->bi_bdev; | ||
416 | clone->bi_rw = bio->bi_rw; | ||
417 | clone->bi_vcnt = 1; | ||
418 | clone->bi_size = to_bytes(len); | ||
419 | clone->bi_io_vec->bv_offset = offset; | ||
420 | clone->bi_io_vec->bv_len = clone->bi_size; | ||
421 | |||
422 | return clone; | ||
423 | } | ||
424 | |||
425 | /* | ||
426 |  * Creates a bio that consists of a range of complete bvecs.	| ||
427 | */ | ||
428 | static struct bio *clone_bio(struct bio *bio, sector_t sector, | ||
429 | unsigned short idx, unsigned short bv_count, | ||
430 | unsigned int len) | ||
431 | { | ||
432 | struct bio *clone; | ||
433 | |||
434 | clone = bio_clone(bio, GFP_NOIO); | ||
435 | clone->bi_sector = sector; | ||
436 | clone->bi_idx = idx; | ||
437 | clone->bi_vcnt = idx + bv_count; | ||
438 | clone->bi_size = to_bytes(len); | ||
439 | clone->bi_flags &= ~(1 << BIO_SEG_VALID); | ||
440 | |||
441 | return clone; | ||
442 | } | ||
443 | |||
444 | static void __clone_and_map(struct clone_info *ci) | ||
445 | { | ||
446 | struct bio *clone, *bio = ci->bio; | ||
447 | struct dm_target *ti = dm_table_find_target(ci->map, ci->sector); | ||
448 | sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti); | ||
449 | struct target_io *tio; | ||
450 | |||
451 | /* | ||
452 | * Allocate a target io object. | ||
453 | */ | ||
454 | tio = alloc_tio(ci->md); | ||
455 | tio->io = ci->io; | ||
456 | tio->ti = ti; | ||
457 | memset(&tio->info, 0, sizeof(tio->info)); | ||
458 | |||
459 | if (ci->sector_count <= max) { | ||
460 | /* | ||
461 | * Optimise for the simple case where we can do all of | ||
462 | * the remaining io with a single clone. | ||
463 | */ | ||
464 | clone = clone_bio(bio, ci->sector, ci->idx, | ||
465 | bio->bi_vcnt - ci->idx, ci->sector_count); | ||
466 | __map_bio(ti, clone, tio); | ||
467 | ci->sector_count = 0; | ||
468 | |||
469 | } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { | ||
470 | /* | ||
471 | * There are some bvecs that don't span targets. | ||
472 | * Do as many of these as possible. | ||
473 | */ | ||
474 | int i; | ||
475 | sector_t remaining = max; | ||
476 | sector_t bv_len; | ||
477 | |||
478 | for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { | ||
479 | bv_len = to_sector(bio->bi_io_vec[i].bv_len); | ||
480 | |||
481 | if (bv_len > remaining) | ||
482 | break; | ||
483 | |||
484 | remaining -= bv_len; | ||
485 | len += bv_len; | ||
486 | } | ||
487 | |||
488 | clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len); | ||
489 | __map_bio(ti, clone, tio); | ||
490 | |||
491 | ci->sector += len; | ||
492 | ci->sector_count -= len; | ||
493 | ci->idx = i; | ||
494 | |||
495 | } else { | ||
496 | /* | ||
497 | * Create two copy bios to deal with io that has | ||
498 | * been split across a target. | ||
499 | */ | ||
500 | struct bio_vec *bv = bio->bi_io_vec + ci->idx; | ||
501 | |||
502 | clone = split_bvec(bio, ci->sector, ci->idx, | ||
503 | bv->bv_offset, max); | ||
504 | __map_bio(ti, clone, tio); | ||
505 | |||
506 | ci->sector += max; | ||
507 | ci->sector_count -= max; | ||
508 | ti = dm_table_find_target(ci->map, ci->sector); | ||
509 | |||
510 | len = to_sector(bv->bv_len) - max; | ||
511 | clone = split_bvec(bio, ci->sector, ci->idx, | ||
512 | bv->bv_offset + to_bytes(max), len); | ||
513 | tio = alloc_tio(ci->md); | ||
514 | tio->io = ci->io; | ||
515 | tio->ti = ti; | ||
516 | memset(&tio->info, 0, sizeof(tio->info)); | ||
517 | __map_bio(ti, clone, tio); | ||
518 | |||
519 | ci->sector += len; | ||
520 | ci->sector_count -= len; | ||
521 | ci->idx++; | ||
522 | } | ||
523 | } | ||
524 | |||
525 | /* | ||
526 | * Split the bio into several clones. | ||
527 | */ | ||
528 | static void __split_bio(struct mapped_device *md, struct bio *bio) | ||
529 | { | ||
530 | struct clone_info ci; | ||
531 | |||
532 | ci.map = dm_get_table(md); | ||
533 | if (!ci.map) { | ||
534 | bio_io_error(bio, bio->bi_size); | ||
535 | return; | ||
536 | } | ||
537 | |||
538 | ci.md = md; | ||
539 | ci.bio = bio; | ||
540 | ci.io = alloc_io(md); | ||
541 | ci.io->error = 0; | ||
542 | atomic_set(&ci.io->io_count, 1); | ||
543 | ci.io->bio = bio; | ||
544 | ci.io->md = md; | ||
545 | ci.sector = bio->bi_sector; | ||
546 | ci.sector_count = bio_sectors(bio); | ||
547 | ci.idx = bio->bi_idx; | ||
548 | |||
549 | atomic_inc(&md->pending); | ||
550 | while (ci.sector_count) | ||
551 | __clone_and_map(&ci); | ||
552 | |||
553 | /* drop the extra reference count */ | ||
554 | dec_pending(ci.io, 0); | ||
555 | dm_table_put(ci.map); | ||
556 | } | ||
557 | /*----------------------------------------------------------------- | ||
558 | * CRUD END | ||
559 | *---------------------------------------------------------------*/ | ||
560 | |||
561 | /* | ||
562 | * The request function that just remaps the bio built up by | ||
563 | * dm_merge_bvec. | ||
564 | */ | ||
565 | static int dm_request(request_queue_t *q, struct bio *bio) | ||
566 | { | ||
567 | int r; | ||
568 | struct mapped_device *md = q->queuedata; | ||
569 | |||
570 | down_read(&md->lock); | ||
571 | |||
572 | /* | ||
573 | * If we're suspended we have to queue | ||
574 | * this io for later. | ||
575 | */ | ||
576 | while (test_bit(DMF_BLOCK_IO, &md->flags)) { | ||
577 | up_read(&md->lock); | ||
578 | |||
579 | if (bio_rw(bio) == READA) { | ||
580 | bio_io_error(bio, bio->bi_size); | ||
581 | return 0; | ||
582 | } | ||
583 | |||
584 | r = queue_io(md, bio); | ||
585 | if (r < 0) { | ||
586 | bio_io_error(bio, bio->bi_size); | ||
587 | return 0; | ||
588 | |||
589 | } else if (r == 0) | ||
590 | return 0; /* deferred successfully */ | ||
591 | |||
592 | /* | ||
593 | * We're in a while loop, because someone could suspend | ||
594 | * before we get to the following read lock. | ||
595 | */ | ||
596 | down_read(&md->lock); | ||
597 | } | ||
598 | |||
599 | __split_bio(md, bio); | ||
600 | up_read(&md->lock); | ||
601 | return 0; | ||
602 | } | ||
603 | |||
604 | static int dm_flush_all(request_queue_t *q, struct gendisk *disk, | ||
605 | sector_t *error_sector) | ||
606 | { | ||
607 | struct mapped_device *md = q->queuedata; | ||
608 | struct dm_table *map = dm_get_table(md); | ||
609 | int ret = -ENXIO; | ||
610 | |||
611 | if (map) { | ||
612 | ret = dm_table_flush_all(md->map); | ||
613 | dm_table_put(map); | ||
614 | } | ||
615 | |||
616 | return ret; | ||
617 | } | ||
618 | |||
619 | static void dm_unplug_all(request_queue_t *q) | ||
620 | { | ||
621 | struct mapped_device *md = q->queuedata; | ||
622 | struct dm_table *map = dm_get_table(md); | ||
623 | |||
624 | if (map) { | ||
625 | dm_table_unplug_all(map); | ||
626 | dm_table_put(map); | ||
627 | } | ||
628 | } | ||
629 | |||
630 | static int dm_any_congested(void *congested_data, int bdi_bits) | ||
631 | { | ||
632 | int r; | ||
633 | struct mapped_device *md = (struct mapped_device *) congested_data; | ||
634 | struct dm_table *map = dm_get_table(md); | ||
635 | |||
636 | if (!map || test_bit(DMF_BLOCK_IO, &md->flags)) | ||
637 | r = bdi_bits; | ||
638 | else | ||
639 | r = dm_table_any_congested(map, bdi_bits); | ||
640 | |||
641 | dm_table_put(map); | ||
642 | return r; | ||
643 | } | ||
644 | |||
645 | /*----------------------------------------------------------------- | ||
646 | * An IDR is used to keep track of allocated minor numbers. | ||
647 | *---------------------------------------------------------------*/ | ||
648 | static DECLARE_MUTEX(_minor_lock); | ||
649 | static DEFINE_IDR(_minor_idr); | ||
650 | |||
651 | static void free_minor(unsigned int minor) | ||
652 | { | ||
653 | down(&_minor_lock); | ||
654 | idr_remove(&_minor_idr, minor); | ||
655 | up(&_minor_lock); | ||
656 | } | ||
657 | |||
658 | /* | ||
659 | * See if the device with a specific minor # is free. | ||
660 | */ | ||
661 | static int specific_minor(struct mapped_device *md, unsigned int minor) | ||
662 | { | ||
663 | int r, m; | ||
664 | |||
665 | if (minor >= (1 << MINORBITS)) | ||
666 | return -EINVAL; | ||
667 | |||
668 | down(&_minor_lock); | ||
669 | |||
670 | if (idr_find(&_minor_idr, minor)) { | ||
671 | r = -EBUSY; | ||
672 | goto out; | ||
673 | } | ||
674 | |||
675 | r = idr_pre_get(&_minor_idr, GFP_KERNEL); | ||
676 | if (!r) { | ||
677 | r = -ENOMEM; | ||
678 | goto out; | ||
679 | } | ||
680 | |||
681 | r = idr_get_new_above(&_minor_idr, md, minor, &m); | ||
682 | if (r) { | ||
683 | goto out; | ||
684 | } | ||
685 | |||
686 | if (m != minor) { | ||
687 | idr_remove(&_minor_idr, m); | ||
688 | r = -EBUSY; | ||
689 | goto out; | ||
690 | } | ||
691 | |||
692 | out: | ||
693 | up(&_minor_lock); | ||
694 | return r; | ||
695 | } | ||
696 | |||
697 | static int next_free_minor(struct mapped_device *md, unsigned int *minor) | ||
698 | { | ||
699 | int r; | ||
700 | unsigned int m; | ||
701 | |||
702 | down(&_minor_lock); | ||
703 | |||
704 | r = idr_pre_get(&_minor_idr, GFP_KERNEL); | ||
705 | if (!r) { | ||
706 | r = -ENOMEM; | ||
707 | goto out; | ||
708 | } | ||
709 | |||
710 | r = idr_get_new(&_minor_idr, md, &m); | ||
711 | if (r) { | ||
712 | goto out; | ||
713 | } | ||
714 | |||
715 | if (m >= (1 << MINORBITS)) { | ||
716 | idr_remove(&_minor_idr, m); | ||
717 | r = -ENOSPC; | ||
718 | goto out; | ||
719 | } | ||
720 | |||
721 | *minor = m; | ||
722 | |||
723 | out: | ||
724 | up(&_minor_lock); | ||
725 | return r; | ||
726 | } | ||
727 | |||
728 | static struct block_device_operations dm_blk_dops; | ||
729 | |||
730 | /* | ||
731 | * Allocate and initialise a blank device with a given minor. | ||
732 | */ | ||
733 | static struct mapped_device *alloc_dev(unsigned int minor, int persistent) | ||
734 | { | ||
735 | int r; | ||
736 | struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); | ||
737 | |||
738 | if (!md) { | ||
739 | DMWARN("unable to allocate device, out of memory."); | ||
740 | return NULL; | ||
741 | } | ||
742 | |||
743 | /* get a minor number for the dev */ | ||
744 | r = persistent ? specific_minor(md, minor) : next_free_minor(md, &minor); | ||
745 | if (r < 0) | ||
746 | goto bad1; | ||
747 | |||
748 | memset(md, 0, sizeof(*md)); | ||
749 | init_rwsem(&md->lock); | ||
750 | rwlock_init(&md->map_lock); | ||
751 | atomic_set(&md->holders, 1); | ||
752 | atomic_set(&md->event_nr, 0); | ||
753 | |||
754 | md->queue = blk_alloc_queue(GFP_KERNEL); | ||
755 | if (!md->queue) | ||
756 | goto bad1; | ||
757 | |||
758 | md->queue->queuedata = md; | ||
759 | md->queue->backing_dev_info.congested_fn = dm_any_congested; | ||
760 | md->queue->backing_dev_info.congested_data = md; | ||
761 | blk_queue_make_request(md->queue, dm_request); | ||
762 | md->queue->unplug_fn = dm_unplug_all; | ||
763 | md->queue->issue_flush_fn = dm_flush_all; | ||
764 | |||
765 | md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, | ||
766 | mempool_free_slab, _io_cache); | ||
767 | if (!md->io_pool) | ||
768 | goto bad2; | ||
769 | |||
770 | md->tio_pool = mempool_create(MIN_IOS, mempool_alloc_slab, | ||
771 | mempool_free_slab, _tio_cache); | ||
772 | if (!md->tio_pool) | ||
773 | goto bad3; | ||
774 | |||
775 | md->disk = alloc_disk(1); | ||
776 | if (!md->disk) | ||
777 | goto bad4; | ||
778 | |||
779 | md->disk->major = _major; | ||
780 | md->disk->first_minor = minor; | ||
781 | md->disk->fops = &dm_blk_dops; | ||
782 | md->disk->queue = md->queue; | ||
783 | md->disk->private_data = md; | ||
784 | sprintf(md->disk->disk_name, "dm-%d", minor); | ||
785 | add_disk(md->disk); | ||
786 | |||
787 | atomic_set(&md->pending, 0); | ||
788 | init_waitqueue_head(&md->wait); | ||
789 | init_waitqueue_head(&md->eventq); | ||
790 | |||
791 | return md; | ||
792 | |||
793 | bad4: | ||
794 | mempool_destroy(md->tio_pool); | ||
795 | bad3: | ||
796 | mempool_destroy(md->io_pool); | ||
797 | bad2: | ||
798 | blk_put_queue(md->queue); | ||
799 | free_minor(minor); | ||
800 | bad1: | ||
801 | kfree(md); | ||
802 | return NULL; | ||
803 | } | ||
804 | |||
805 | static void free_dev(struct mapped_device *md) | ||
806 | { | ||
807 | free_minor(md->disk->first_minor); | ||
808 | mempool_destroy(md->tio_pool); | ||
809 | mempool_destroy(md->io_pool); | ||
810 | del_gendisk(md->disk); | ||
811 | put_disk(md->disk); | ||
812 | blk_put_queue(md->queue); | ||
813 | kfree(md); | ||
814 | } | ||
815 | |||
816 | /* | ||
817 | * Bind a table to the device. | ||
818 | */ | ||
819 | static void event_callback(void *context) | ||
820 | { | ||
821 | struct mapped_device *md = (struct mapped_device *) context; | ||
822 | |||
823 | atomic_inc(&md->event_nr); | ||
824 | wake_up(&md->eventq); | ||
825 | } | ||
826 | |||
827 | static void __set_size(struct gendisk *disk, sector_t size) | ||
828 | { | ||
829 | struct block_device *bdev; | ||
830 | |||
831 | set_capacity(disk, size); | ||
832 | bdev = bdget_disk(disk, 0); | ||
833 | if (bdev) { | ||
834 | down(&bdev->bd_inode->i_sem); | ||
835 | i_size_write(bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); | ||
836 | up(&bdev->bd_inode->i_sem); | ||
837 | bdput(bdev); | ||
838 | } | ||
839 | } | ||
840 | |||
841 | static int __bind(struct mapped_device *md, struct dm_table *t) | ||
842 | { | ||
843 | request_queue_t *q = md->queue; | ||
844 | sector_t size; | ||
845 | |||
846 | size = dm_table_get_size(t); | ||
847 | __set_size(md->disk, size); | ||
848 | if (size == 0) | ||
849 | return 0; | ||
850 | |||
851 | write_lock(&md->map_lock); | ||
852 | md->map = t; | ||
853 | write_unlock(&md->map_lock); | ||
854 | |||
855 | dm_table_get(t); | ||
856 | dm_table_event_callback(md->map, event_callback, md); | ||
857 | dm_table_set_restrictions(t, q); | ||
858 | return 0; | ||
859 | } | ||
860 | |||
861 | static void __unbind(struct mapped_device *md) | ||
862 | { | ||
863 | struct dm_table *map = md->map; | ||
864 | |||
865 | if (!map) | ||
866 | return; | ||
867 | |||
868 | dm_table_event_callback(map, NULL, NULL); | ||
869 | write_lock(&md->map_lock); | ||
870 | md->map = NULL; | ||
871 | write_unlock(&md->map_lock); | ||
872 | dm_table_put(map); | ||
873 | } | ||
874 | |||
875 | /* | ||
876 | * Constructor for a new device. | ||
877 | */ | ||
878 | static int create_aux(unsigned int minor, int persistent, | ||
879 | struct mapped_device **result) | ||
880 | { | ||
881 | struct mapped_device *md; | ||
882 | |||
883 | md = alloc_dev(minor, persistent); | ||
884 | if (!md) | ||
885 | return -ENXIO; | ||
886 | |||
887 | *result = md; | ||
888 | return 0; | ||
889 | } | ||
890 | |||
891 | int dm_create(struct mapped_device **result) | ||
892 | { | ||
893 | return create_aux(0, 0, result); | ||
894 | } | ||
895 | |||
896 | int dm_create_with_minor(unsigned int minor, struct mapped_device **result) | ||
897 | { | ||
898 | return create_aux(minor, 1, result); | ||
899 | } | ||
900 | |||
901 | void *dm_get_mdptr(dev_t dev) | ||
902 | { | ||
903 | struct mapped_device *md; | ||
904 | void *mdptr = NULL; | ||
905 | unsigned minor = MINOR(dev); | ||
906 | |||
907 | if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) | ||
908 | return NULL; | ||
909 | |||
910 | down(&_minor_lock); | ||
911 | |||
912 | md = idr_find(&_minor_idr, minor); | ||
913 | |||
914 | if (md && (dm_disk(md)->first_minor == minor)) | ||
915 | mdptr = md->interface_ptr; | ||
916 | |||
917 | up(&_minor_lock); | ||
918 | |||
919 | return mdptr; | ||
920 | } | ||
921 | |||
922 | void dm_set_mdptr(struct mapped_device *md, void *ptr) | ||
923 | { | ||
924 | md->interface_ptr = ptr; | ||
925 | } | ||
926 | |||
927 | void dm_get(struct mapped_device *md) | ||
928 | { | ||
929 | atomic_inc(&md->holders); | ||
930 | } | ||
931 | |||
932 | void dm_put(struct mapped_device *md) | ||
933 | { | ||
934 | struct dm_table *map = dm_get_table(md); | ||
935 | |||
936 | if (atomic_dec_and_test(&md->holders)) { | ||
937 | if (!test_bit(DMF_SUSPENDED, &md->flags) && map) { | ||
938 | dm_table_presuspend_targets(map); | ||
939 | dm_table_postsuspend_targets(map); | ||
940 | } | ||
941 | __unbind(md); | ||
942 | free_dev(md); | ||
943 | } | ||
944 | |||
945 | dm_table_put(map); | ||
946 | } | ||
947 | |||
948 | /* | ||
949 | * Process the deferred bios | ||
950 | */ | ||
951 | static void __flush_deferred_io(struct mapped_device *md, struct bio *c) | ||
952 | { | ||
953 | struct bio *n; | ||
954 | |||
955 | while (c) { | ||
956 | n = c->bi_next; | ||
957 | c->bi_next = NULL; | ||
958 | __split_bio(md, c); | ||
959 | c = n; | ||
960 | } | ||
961 | } | ||
962 | |||
963 | /* | ||
964 | * Swap in a new table (destroying old one). | ||
965 | */ | ||
966 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) | ||
967 | { | ||
968 | int r; | ||
969 | |||
970 | down_write(&md->lock); | ||
971 | |||
972 | /* device must be suspended */ | ||
973 | if (!test_bit(DMF_SUSPENDED, &md->flags)) { | ||
974 | up_write(&md->lock); | ||
975 | return -EPERM; | ||
976 | } | ||
977 | |||
978 | __unbind(md); | ||
979 | r = __bind(md, table); | ||
980 | if (r) | ||
981 | return r; | ||
982 | |||
983 | up_write(&md->lock); | ||
984 | return 0; | ||
985 | } | ||
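dm_swap_table() insists on a suspended device, so a table reload is always the same three-step dance: suspend (block and defer new io, wait for in-flight io), swap, resume (replay the deferred io onto the new map). A hedged sketch of a caller driving it, roughly what the ioctl interface does; building new_table is assumed to have happened already, and dm_suspend()/dm_resume() are defined below:

/* Hypothetical sketch, not part of the patch. */
static int reload_table(struct mapped_device *md, struct dm_table *new_table)
{
	int r;

	r = dm_suspend(md);	/* flush in-flight bios, start deferring new ones */
	if (r)
		return r;

	r = dm_swap_table(md, new_table);	/* unbind the old map, bind the new one */
	if (r)
		return r;	/* device is left suspended on failure */

	return dm_resume(md);	/* replay deferred bios onto the new map */
}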
986 | |||
987 | /* | ||
988 | * Functions to lock and unlock any filesystem running on the | ||
989 | * device. | ||
990 | */ | ||
991 | static int __lock_fs(struct mapped_device *md) | ||
992 | { | ||
993 | struct block_device *bdev; | ||
994 | |||
995 | if (test_and_set_bit(DMF_FS_LOCKED, &md->flags)) | ||
996 | return 0; | ||
997 | |||
998 | bdev = bdget_disk(md->disk, 0); | ||
999 | if (!bdev) { | ||
1000 | DMWARN("bdget failed in __lock_fs"); | ||
1001 | return -ENOMEM; | ||
1002 | } | ||
1003 | |||
1004 | WARN_ON(md->frozen_sb); | ||
1005 | md->frozen_sb = freeze_bdev(bdev); | ||
1006 | /* don't bdput right now, we don't want the bdev | ||
1007 | * to go away while it is locked. We'll bdput | ||
1008 | * in __unlock_fs | ||
1009 | */ | ||
1010 | return 0; | ||
1011 | } | ||
1012 | |||
1013 | static int __unlock_fs(struct mapped_device *md) | ||
1014 | { | ||
1015 | struct block_device *bdev; | ||
1016 | |||
1017 | if (!test_and_clear_bit(DMF_FS_LOCKED, &md->flags)) | ||
1018 | return 0; | ||
1019 | |||
1020 | bdev = bdget_disk(md->disk, 0); | ||
1021 | if (!bdev) { | ||
1022 | DMWARN("bdget failed in __unlock_fs"); | ||
1023 | return -ENOMEM; | ||
1024 | } | ||
1025 | |||
1026 | thaw_bdev(bdev, md->frozen_sb); | ||
1027 | md->frozen_sb = NULL; | ||
1028 | bdput(bdev); | ||
1029 | bdput(bdev); | ||
1030 | return 0; | ||
1031 | } | ||
1032 | |||
1033 | /* | ||
1034 | * We need to be able to change a mapping table under a mounted | ||
1035 | * filesystem. For example we might want to move some data in | ||
1036 | * the background. Before the table can be swapped with | ||
1037 |  * dm_swap_table, dm_suspend must be called to flush any in | ||
1038 | * flight bios and ensure that any further io gets deferred. | ||
1039 | */ | ||
1040 | int dm_suspend(struct mapped_device *md) | ||
1041 | { | ||
1042 | struct dm_table *map; | ||
1043 | DECLARE_WAITQUEUE(wait, current); | ||
1044 | |||
1045 | /* Flush I/O to the device. */ | ||
1046 | down_read(&md->lock); | ||
1047 | if (test_bit(DMF_BLOCK_IO, &md->flags)) { | ||
1048 | up_read(&md->lock); | ||
1049 | return -EINVAL; | ||
1050 | } | ||
1051 | |||
1052 | map = dm_get_table(md); | ||
1053 | if (map) | ||
1054 | dm_table_presuspend_targets(map); | ||
1055 | __lock_fs(md); | ||
1056 | |||
1057 | up_read(&md->lock); | ||
1058 | |||
1059 | /* | ||
1060 | * First we set the BLOCK_IO flag so no more ios will be | ||
1061 | * mapped. | ||
1062 | */ | ||
1063 | down_write(&md->lock); | ||
1064 | if (test_bit(DMF_BLOCK_IO, &md->flags)) { | ||
1065 | /* | ||
1066 | * If we get here we know another thread is | ||
1067 | * trying to suspend as well, so we leave the fs | ||
1068 | * locked for this thread. | ||
1069 | */ | ||
1070 | up_write(&md->lock); | ||
1071 | return -EINVAL; | ||
1072 | } | ||
1073 | |||
1074 | set_bit(DMF_BLOCK_IO, &md->flags); | ||
1075 | add_wait_queue(&md->wait, &wait); | ||
1076 | up_write(&md->lock); | ||
1077 | |||
1078 | /* unplug */ | ||
1079 | if (map) { | ||
1080 | dm_table_unplug_all(map); | ||
1081 | dm_table_put(map); | ||
1082 | } | ||
1083 | |||
1084 | /* | ||
1085 | * Then we wait for the already mapped ios to | ||
1086 | * complete. | ||
1087 | */ | ||
1088 | while (1) { | ||
1089 | set_current_state(TASK_INTERRUPTIBLE); | ||
1090 | |||
1091 | if (!atomic_read(&md->pending) || signal_pending(current)) | ||
1092 | break; | ||
1093 | |||
1094 | io_schedule(); | ||
1095 | } | ||
1096 | set_current_state(TASK_RUNNING); | ||
1097 | |||
1098 | down_write(&md->lock); | ||
1099 | remove_wait_queue(&md->wait, &wait); | ||
1100 | |||
1101 | /* were we interrupted ? */ | ||
1102 | if (atomic_read(&md->pending)) { | ||
1103 | __unlock_fs(md); | ||
1104 | clear_bit(DMF_BLOCK_IO, &md->flags); | ||
1105 | up_write(&md->lock); | ||
1106 | return -EINTR; | ||
1107 | } | ||
1108 | |||
1109 | set_bit(DMF_SUSPENDED, &md->flags); | ||
1110 | |||
1111 | map = dm_get_table(md); | ||
1112 | if (map) | ||
1113 | dm_table_postsuspend_targets(map); | ||
1114 | dm_table_put(map); | ||
1115 | up_write(&md->lock); | ||
1116 | |||
1117 | return 0; | ||
1118 | } | ||
1119 | |||
1120 | int dm_resume(struct mapped_device *md) | ||
1121 | { | ||
1122 | struct bio *def; | ||
1123 | struct dm_table *map = dm_get_table(md); | ||
1124 | |||
1125 | down_write(&md->lock); | ||
1126 | if (!map || | ||
1127 | !test_bit(DMF_SUSPENDED, &md->flags) || | ||
1128 | !dm_table_get_size(map)) { | ||
1129 | up_write(&md->lock); | ||
1130 | dm_table_put(map); | ||
1131 | return -EINVAL; | ||
1132 | } | ||
1133 | |||
1134 | dm_table_resume_targets(map); | ||
1135 | clear_bit(DMF_SUSPENDED, &md->flags); | ||
1136 | clear_bit(DMF_BLOCK_IO, &md->flags); | ||
1137 | |||
1138 | def = bio_list_get(&md->deferred); | ||
1139 | __flush_deferred_io(md, def); | ||
1140 | up_write(&md->lock); | ||
1141 | __unlock_fs(md); | ||
1142 | dm_table_unplug_all(map); | ||
1143 | dm_table_put(map); | ||
1144 | |||
1145 | return 0; | ||
1146 | } | ||
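/*
 * Illustrative sketch (kept under "#if 0", never compiled): the order in
 * which a caller such as the ioctl interface is expected to use
 * dm_suspend, dm_swap_table and dm_resume when replacing a table.  The
 * function name and error handling below are hypothetical.
 */
#if 0
static int example_replace_table(struct mapped_device *md,
				 struct dm_table *new_table)
{
	int r;

	r = dm_suspend(md);		/* flush in-flight bios, defer new ones */
	if (r)
		return r;

	r = dm_swap_table(md, new_table);	/* fails unless DMF_SUSPENDED is set */
	if (r)
		return r;			/* caller chooses how to recover */

	return dm_resume(md);		/* replay deferred bios onto the new table */
}
#endif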
1147 | |||
1148 | /*----------------------------------------------------------------- | ||
1149 | * Event notification. | ||
1150 | *---------------------------------------------------------------*/ | ||
1151 | uint32_t dm_get_event_nr(struct mapped_device *md) | ||
1152 | { | ||
1153 | return atomic_read(&md->event_nr); | ||
1154 | } | ||
1155 | |||
1156 | int dm_wait_event(struct mapped_device *md, int event_nr) | ||
1157 | { | ||
1158 | return wait_event_interruptible(md->eventq, | ||
1159 | (event_nr != atomic_read(&md->event_nr))); | ||
1160 | } | ||
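/*
 * Illustrative usage (the reporting helper below is hypothetical): sample
 * the counter, report the current state, then sleep until a table event
 * bumps event_nr.
 *
 *	uint32_t ev = dm_get_event_nr(md);
 *	report_current_state(md);
 *	r = dm_wait_event(md, ev);	(-ERESTARTSYS if a signal arrives first)
 */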
1161 | |||
1162 | /* | ||
1163 | * The gendisk is only valid as long as you have a reference | ||
1164 | * count on 'md'. | ||
1165 | */ | ||
1166 | struct gendisk *dm_disk(struct mapped_device *md) | ||
1167 | { | ||
1168 | return md->disk; | ||
1169 | } | ||
1170 | |||
1171 | int dm_suspended(struct mapped_device *md) | ||
1172 | { | ||
1173 | return test_bit(DMF_SUSPENDED, &md->flags); | ||
1174 | } | ||
1175 | |||
1176 | static struct block_device_operations dm_blk_dops = { | ||
1177 | .open = dm_blk_open, | ||
1178 | .release = dm_blk_close, | ||
1179 | .owner = THIS_MODULE | ||
1180 | }; | ||
1181 | |||
1182 | EXPORT_SYMBOL(dm_get_mapinfo); | ||
1183 | |||
1184 | /* | ||
1185 | * module hooks | ||
1186 | */ | ||
1187 | module_init(dm_init); | ||
1188 | module_exit(dm_exit); | ||
1189 | |||
1190 | module_param(major, uint, 0); | ||
1191 | MODULE_PARM_DESC(major, "The major number of the device mapper"); | ||
1192 | MODULE_DESCRIPTION(DM_NAME " driver"); | ||
1193 | MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); | ||
1194 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm.h b/drivers/md/dm.h new file mode 100644 index 000000000000..e38c3fc1a1db --- /dev/null +++ b/drivers/md/dm.h | |||
@@ -0,0 +1,195 @@ | |||
1 | /* | ||
2 | * Internal header file for device mapper | ||
3 | * | ||
4 | * Copyright (C) 2001, 2002 Sistina Software | ||
5 | * Copyright (C) 2004 Red Hat, Inc. All rights reserved. | ||
6 | * | ||
7 | * This file is released under the LGPL. | ||
8 | */ | ||
9 | |||
10 | #ifndef DM_INTERNAL_H | ||
11 | #define DM_INTERNAL_H | ||
12 | |||
13 | #include <linux/fs.h> | ||
14 | #include <linux/device-mapper.h> | ||
15 | #include <linux/list.h> | ||
16 | #include <linux/blkdev.h> | ||
17 | |||
18 | #define DM_NAME "device-mapper" | ||
19 | #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x) | ||
20 | #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) | ||
21 | #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) | ||
22 | |||
23 | #define DMEMIT(x...) sz += ((sz >= maxlen) ? \ | ||
24 | 0 : scnprintf(result + sz, maxlen - sz, x)) | ||
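/*
 * Illustrative use of DMEMIT, assuming the local variables 'sz', 'result'
 * and 'maxlen' that a target's status routine conventionally declares
 * (the values printed here are hypothetical):
 *
 *	int sz = 0;
 *	DMEMIT("%u %u ", nr_mirrors, nr_regions);
 *	DMEMIT("%s", device_name);
 */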
25 | |||
26 | /* | ||
27 | * FIXME: I think this should be with the definition of sector_t | ||
28 | * in types.h. | ||
29 | */ | ||
30 | #ifdef CONFIG_LBD | ||
31 | #define SECTOR_FORMAT "%Lu" | ||
32 | #else | ||
33 | #define SECTOR_FORMAT "%lu" | ||
34 | #endif | ||
35 | |||
36 | #define SECTOR_SHIFT 9 | ||
37 | |||
38 | /* | ||
39 | * List of devices that a metadevice uses and should open/close. | ||
40 | */ | ||
41 | struct dm_dev { | ||
42 | struct list_head list; | ||
43 | |||
44 | atomic_t count; | ||
45 | int mode; | ||
46 | struct block_device *bdev; | ||
47 | char name[16]; | ||
48 | }; | ||
49 | |||
50 | struct dm_table; | ||
51 | struct mapped_device; | ||
52 | |||
53 | /*----------------------------------------------------------------- | ||
54 | * Functions for manipulating a struct mapped_device. | ||
55 | * Drop the reference with dm_put when you finish with the object. | ||
56 | *---------------------------------------------------------------*/ | ||
57 | int dm_create(struct mapped_device **md); | ||
58 | int dm_create_with_minor(unsigned int minor, struct mapped_device **md); | ||
59 | void dm_set_mdptr(struct mapped_device *md, void *ptr); | ||
60 | void *dm_get_mdptr(dev_t dev); | ||
61 | |||
62 | /* | ||
63 | * Reference counting for md. | ||
64 | */ | ||
65 | void dm_get(struct mapped_device *md); | ||
66 | void dm_put(struct mapped_device *md); | ||
67 | |||
68 | /* | ||
69 | * A device can still be used while suspended, but I/O is deferred. | ||
70 | */ | ||
71 | int dm_suspend(struct mapped_device *md); | ||
72 | int dm_resume(struct mapped_device *md); | ||
73 | |||
74 | /* | ||
75 | * The device must be suspended before calling this method. | ||
76 | */ | ||
77 | int dm_swap_table(struct mapped_device *md, struct dm_table *t); | ||
78 | |||
79 | /* | ||
80 | * Drop a reference on the table when you've finished with the | ||
81 | * result. | ||
82 | */ | ||
83 | struct dm_table *dm_get_table(struct mapped_device *md); | ||
84 | |||
85 | /* | ||
86 | * Event functions. | ||
87 | */ | ||
88 | uint32_t dm_get_event_nr(struct mapped_device *md); | ||
89 | int dm_wait_event(struct mapped_device *md, int event_nr); | ||
90 | |||
91 | /* | ||
92 | * Info functions. | ||
93 | */ | ||
94 | struct gendisk *dm_disk(struct mapped_device *md); | ||
95 | int dm_suspended(struct mapped_device *md); | ||
96 | |||
97 | /*----------------------------------------------------------------- | ||
98 | * Functions for manipulating a table. Tables are also reference | ||
99 | * counted. | ||
100 | *---------------------------------------------------------------*/ | ||
101 | int dm_table_create(struct dm_table **result, int mode, unsigned num_targets); | ||
102 | |||
103 | void dm_table_get(struct dm_table *t); | ||
104 | void dm_table_put(struct dm_table *t); | ||
105 | |||
106 | int dm_table_add_target(struct dm_table *t, const char *type, | ||
107 | sector_t start, sector_t len, char *params); | ||
108 | int dm_table_complete(struct dm_table *t); | ||
109 | void dm_table_event_callback(struct dm_table *t, | ||
110 | void (*fn)(void *), void *context); | ||
111 | void dm_table_event(struct dm_table *t); | ||
112 | sector_t dm_table_get_size(struct dm_table *t); | ||
113 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); | ||
114 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); | ||
115 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); | ||
116 | unsigned int dm_table_get_num_targets(struct dm_table *t); | ||
117 | struct list_head *dm_table_get_devices(struct dm_table *t); | ||
118 | int dm_table_get_mode(struct dm_table *t); | ||
119 | void dm_table_presuspend_targets(struct dm_table *t); | ||
120 | void dm_table_postsuspend_targets(struct dm_table *t); | ||
121 | void dm_table_resume_targets(struct dm_table *t); | ||
122 | int dm_table_any_congested(struct dm_table *t, int bdi_bits); | ||
123 | void dm_table_unplug_all(struct dm_table *t); | ||
124 | int dm_table_flush_all(struct dm_table *t); | ||
125 | |||
126 | /*----------------------------------------------------------------- | ||
127 | * A registry of target types. | ||
128 | *---------------------------------------------------------------*/ | ||
129 | int dm_target_init(void); | ||
130 | void dm_target_exit(void); | ||
131 | struct target_type *dm_get_target_type(const char *name); | ||
132 | void dm_put_target_type(struct target_type *t); | ||
133 | int dm_target_iterate(void (*iter_func)(struct target_type *tt, | ||
134 | void *param), void *param); | ||
135 | |||
136 | |||
137 | /*----------------------------------------------------------------- | ||
138 | * Useful inlines. | ||
139 | *---------------------------------------------------------------*/ | ||
140 | static inline int array_too_big(unsigned long fixed, unsigned long obj, | ||
141 | unsigned long num) | ||
142 | { | ||
143 | return (num > (ULONG_MAX - fixed) / obj); | ||
144 | } | ||
145 | |||
146 | /* | ||
147 | * Ceiling(n / sz) | ||
148 | */ | ||
149 | #define dm_div_up(n, sz) (((n) + (sz) - 1) / (sz)) | ||
150 | |||
151 | #define dm_sector_div_up(n, sz) ( \ | ||
152 | { \ | ||
153 | sector_t _r = ((n) + (sz) - 1); \ | ||
154 | sector_div(_r, (sz)); \ | ||
155 | _r; \ | ||
156 | } \ | ||
157 | ) | ||
158 | |||
159 | /* | ||
160 | * ceiling(n / size) * size | ||
161 | */ | ||
162 | #define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz)) | ||
163 | |||
164 | static inline sector_t to_sector(unsigned long n) | ||
165 | { | ||
166 | return (n >> 9); | ||
167 | } | ||
168 | |||
169 | static inline unsigned long to_bytes(sector_t n) | ||
170 | { | ||
171 | return (n << 9); | ||
172 | } | ||
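/*
 * Worked examples for the helpers above (illustrative):
 *	dm_div_up(1000, 512)        == 2
 *	dm_round_up(1000, 512)      == 1024
 *	dm_sector_div_up(1000, 512) == 2	(same result, 64-bit safe)
 *	to_sector(4096)             == 8
 *	to_bytes(8)                 == 4096
 */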
173 | |||
174 | int dm_split_args(int *argc, char ***argvp, char *input); | ||
175 | |||
176 | /* | ||
177 |  * The device-mapper can be driven through one of two interfaces: | ||
178 |  * ioctl or filesystem, depending on which patch you have applied. | ||
179 | */ | ||
180 | int dm_interface_init(void); | ||
181 | void dm_interface_exit(void); | ||
182 | |||
183 | /* | ||
184 | * Targets for linear and striped mappings | ||
185 | */ | ||
186 | int dm_linear_init(void); | ||
187 | void dm_linear_exit(void); | ||
188 | |||
189 | int dm_stripe_init(void); | ||
190 | void dm_stripe_exit(void); | ||
191 | |||
192 | void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); | ||
193 | union map_info *dm_get_mapinfo(struct bio *bio); | ||
194 | |||
195 | #endif | ||
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c new file mode 100644 index 000000000000..0248f8e7eac0 --- /dev/null +++ b/drivers/md/faulty.c | |||
@@ -0,0 +1,343 @@ | |||
1 | /* | ||
2 | * faulty.c : Multiple Devices driver for Linux | ||
3 | * | ||
4 | * Copyright (C) 2004 Neil Brown | ||
5 | * | ||
6 |  * faulty-device-simulator personality for md | ||
7 | * | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2, or (at your option) | ||
12 | * any later version. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
16 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
17 | */ | ||
18 | |||
19 | |||
20 | /* | ||
21 | * The "faulty" personality causes some requests to fail. | ||
22 | * | ||
23 | * Possible failure modes are: | ||
24 | * reads fail "randomly" but succeed on retry | ||
25 | * writes fail "randomly" but succeed on retry | ||
26 | * reads for some address fail and then persist until a write | ||
27 | * reads for some address fail and then persist irrespective of write | ||
28 | * writes for some address fail and persist | ||
29 | * all writes fail | ||
30 | * | ||
31 |  * Different modes can be active at the same time, but only | ||
32 |  * one can be set at array creation.  Others can be added later. | ||
33 |  * A mode can be one-shot or recurrent, with the recurrence being | ||
34 |  * once in every N requests. | ||
35 |  * The bottom 5 bits of the "layout" indicate the mode.  The | ||
36 |  * remainder indicates a period, or 0 for one-shot. | ||
37 |  * | ||
38 |  * There is an implementation limit on the number of concurrently | ||
39 |  * persisting-faulty blocks.  When a new fault is requested that would | ||
40 |  * exceed the limit, it is ignored. | ||
41 |  * All current faults can be cleared using a layout of "0". | ||
42 |  * | ||
43 |  * Requests are always sent to the device.  If they are to fail, | ||
44 |  * we clone the bio and insert a new bi_end_io into the chain. | ||
45 | */ | ||
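/*
 * For example, with ModeShift == 5 (defined below): transient read
 * failures once in every 10 requests are requested with
 *	layout = (10 << 5) | ReadTransient   ==  321
 * and a one-shot persistent write failure with
 *	layout = (0 << 5) | WritePersistent  ==  2
 */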
46 | |||
47 | #define WriteTransient 0 | ||
48 | #define ReadTransient 1 | ||
49 | #define WritePersistent 2 | ||
50 | #define ReadPersistent 3 | ||
51 | #define WriteAll 4 /* doesn't go to device */ | ||
52 | #define ReadFixable 5 | ||
53 | #define Modes 6 | ||
54 | |||
55 | #define ClearErrors 31 | ||
56 | #define ClearFaults 30 | ||
57 | |||
58 | #define AllPersist 100 /* internal use only */ | ||
59 | #define NoPersist 101 | ||
60 | |||
61 | #define ModeMask 0x1f | ||
62 | #define ModeShift 5 | ||
63 | |||
64 | #define MaxFault 50 | ||
65 | #include <linux/raid/md.h> | ||
66 | |||
67 | |||
68 | static int faulty_fail(struct bio *bio, unsigned int bytes_done, int error) | ||
69 | { | ||
70 | struct bio *b = bio->bi_private; | ||
71 | |||
72 | b->bi_size = bio->bi_size; | ||
73 | b->bi_sector = bio->bi_sector; | ||
74 | |||
75 | if (bio->bi_size == 0) | ||
76 | bio_put(bio); | ||
77 | |||
78 | clear_bit(BIO_UPTODATE, &b->bi_flags); | ||
79 | return (b->bi_end_io)(b, bytes_done, -EIO); | ||
80 | } | ||
81 | |||
82 | typedef struct faulty_conf { | ||
83 | int period[Modes]; | ||
84 | atomic_t counters[Modes]; | ||
85 | sector_t faults[MaxFault]; | ||
86 | int modes[MaxFault]; | ||
87 | int nfaults; | ||
88 | mdk_rdev_t *rdev; | ||
89 | } conf_t; | ||
90 | |||
91 | static int check_mode(conf_t *conf, int mode) | ||
92 | { | ||
93 | if (conf->period[mode] == 0 && | ||
94 | atomic_read(&conf->counters[mode]) <= 0) | ||
95 | return 0; /* no failure, no decrement */ | ||
96 | |||
97 | |||
98 | if (atomic_dec_and_test(&conf->counters[mode])) { | ||
99 | if (conf->period[mode]) | ||
100 | atomic_set(&conf->counters[mode], conf->period[mode]); | ||
101 | return 1; | ||
102 | } | ||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | static int check_sector(conf_t *conf, sector_t start, sector_t end, int dir) | ||
107 | { | ||
108 | /* If we find a ReadFixable sector, we fix it ... */ | ||
109 | int i; | ||
110 | for (i=0; i<conf->nfaults; i++) | ||
111 | if (conf->faults[i] >= start && | ||
112 | conf->faults[i] < end) { | ||
113 | /* found it ... */ | ||
114 | switch (conf->modes[i] * 2 + dir) { | ||
115 | case WritePersistent*2+WRITE: return 1; | ||
116 | case ReadPersistent*2+READ: return 1; | ||
117 | case ReadFixable*2+READ: return 1; | ||
118 | case ReadFixable*2+WRITE: | ||
119 | conf->modes[i] = NoPersist; | ||
120 | return 0; | ||
121 | case AllPersist*2+READ: | ||
122 | case AllPersist*2+WRITE: return 1; | ||
123 | default: | ||
124 | return 0; | ||
125 | } | ||
126 | } | ||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | static void add_sector(conf_t *conf, sector_t start, int mode) | ||
131 | { | ||
132 | int i; | ||
133 | int n = conf->nfaults; | ||
134 | for (i=0; i<conf->nfaults; i++) | ||
135 | if (conf->faults[i] == start) { | ||
136 | switch(mode) { | ||
137 | case NoPersist: conf->modes[i] = mode; return; | ||
138 | case WritePersistent: | ||
139 | if (conf->modes[i] == ReadPersistent || | ||
140 | conf->modes[i] == ReadFixable) | ||
141 | conf->modes[i] = AllPersist; | ||
142 | else | ||
143 | conf->modes[i] = WritePersistent; | ||
144 | return; | ||
145 | case ReadPersistent: | ||
146 | if (conf->modes[i] == WritePersistent) | ||
147 | conf->modes[i] = AllPersist; | ||
148 | else | ||
149 | conf->modes[i] = ReadPersistent; | ||
150 | return; | ||
151 | case ReadFixable: | ||
152 | if (conf->modes[i] == WritePersistent || | ||
153 | conf->modes[i] == ReadPersistent) | ||
154 | conf->modes[i] = AllPersist; | ||
155 | else | ||
156 | conf->modes[i] = ReadFixable; | ||
157 | return; | ||
158 | } | ||
159 | } else if (conf->modes[i] == NoPersist) | ||
160 | n = i; | ||
161 | |||
162 | if (n >= MaxFault) | ||
163 | return; | ||
164 | conf->faults[n] = start; | ||
165 | conf->modes[n] = mode; | ||
166 | if (conf->nfaults == n) | ||
167 | conf->nfaults = n+1; | ||
168 | } | ||
169 | |||
170 | static int make_request(request_queue_t *q, struct bio *bio) | ||
171 | { | ||
172 | mddev_t *mddev = q->queuedata; | ||
173 | conf_t *conf = (conf_t*)mddev->private; | ||
174 | int failit = 0; | ||
175 | |||
176 | if (bio->bi_rw & 1) { | ||
177 | /* write request */ | ||
178 | if (atomic_read(&conf->counters[WriteAll])) { | ||
179 | /* special case - don't decrement, don't generic_make_request, | ||
180 | * just fail immediately | ||
181 | */ | ||
182 | bio_endio(bio, bio->bi_size, -EIO); | ||
183 | return 0; | ||
184 | } | ||
185 | |||
186 | if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9), | ||
187 | WRITE)) | ||
188 | failit = 1; | ||
189 | if (check_mode(conf, WritePersistent)) { | ||
190 | add_sector(conf, bio->bi_sector, WritePersistent); | ||
191 | failit = 1; | ||
192 | } | ||
193 | if (check_mode(conf, WriteTransient)) | ||
194 | failit = 1; | ||
195 | } else { | ||
196 | /* read request */ | ||
197 | if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9), | ||
198 | READ)) | ||
199 | failit = 1; | ||
200 | if (check_mode(conf, ReadTransient)) | ||
201 | failit = 1; | ||
202 | if (check_mode(conf, ReadPersistent)) { | ||
203 | add_sector(conf, bio->bi_sector, ReadPersistent); | ||
204 | failit = 1; | ||
205 | } | ||
206 | if (check_mode(conf, ReadFixable)) { | ||
207 | add_sector(conf, bio->bi_sector, ReadFixable); | ||
208 | failit = 1; | ||
209 | } | ||
210 | } | ||
211 | if (failit) { | ||
212 | struct bio *b = bio_clone(bio, GFP_NOIO); | ||
213 | b->bi_bdev = conf->rdev->bdev; | ||
214 | b->bi_private = bio; | ||
215 | b->bi_end_io = faulty_fail; | ||
216 | generic_make_request(b); | ||
217 | return 0; | ||
218 | } else { | ||
219 | bio->bi_bdev = conf->rdev->bdev; | ||
220 | return 1; | ||
221 | } | ||
222 | } | ||
223 | |||
224 | static void status(struct seq_file *seq, mddev_t *mddev) | ||
225 | { | ||
226 | conf_t *conf = (conf_t*)mddev->private; | ||
227 | int n; | ||
228 | |||
229 | if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) | ||
230 | seq_printf(seq, " WriteTransient=%d(%d)", | ||
231 | n, conf->period[WriteTransient]); | ||
232 | |||
233 | if ((n=atomic_read(&conf->counters[ReadTransient])) != 0) | ||
234 | seq_printf(seq, " ReadTransient=%d(%d)", | ||
235 | n, conf->period[ReadTransient]); | ||
236 | |||
237 | if ((n=atomic_read(&conf->counters[WritePersistent])) != 0) | ||
238 | seq_printf(seq, " WritePersistent=%d(%d)", | ||
239 | n, conf->period[WritePersistent]); | ||
240 | |||
241 | if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0) | ||
242 | seq_printf(seq, " ReadPersistent=%d(%d)", | ||
243 | n, conf->period[ReadPersistent]); | ||
244 | |||
245 | |||
246 | if ((n=atomic_read(&conf->counters[ReadFixable])) != 0) | ||
247 | seq_printf(seq, " ReadFixable=%d(%d)", | ||
248 | n, conf->period[ReadFixable]); | ||
249 | |||
250 | if ((n=atomic_read(&conf->counters[WriteAll])) != 0) | ||
251 | seq_printf(seq, " WriteAll"); | ||
252 | |||
253 | seq_printf(seq, " nfaults=%d", conf->nfaults); | ||
254 | } | ||
255 | |||
256 | |||
257 | static int reconfig(mddev_t *mddev, int layout, int chunk_size) | ||
258 | { | ||
259 | int mode = layout & ModeMask; | ||
260 | int count = layout >> ModeShift; | ||
261 | conf_t *conf = mddev->private; | ||
262 | |||
263 | if (chunk_size != -1) | ||
264 | return -EINVAL; | ||
265 | |||
266 | /* new layout */ | ||
267 | if (mode == ClearFaults) | ||
268 | conf->nfaults = 0; | ||
269 | else if (mode == ClearErrors) { | ||
270 | int i; | ||
271 | for (i=0 ; i < Modes ; i++) { | ||
272 | conf->period[i] = 0; | ||
273 | atomic_set(&conf->counters[i], 0); | ||
274 | } | ||
275 | } else if (mode < Modes) { | ||
276 | conf->period[mode] = count; | ||
277 | if (!count) count++; | ||
278 | atomic_set(&conf->counters[mode], count); | ||
279 | } else | ||
280 | return -EINVAL; | ||
281 | mddev->layout = -1; /* makes sure further changes come through */ | ||
282 | return 0; | ||
283 | } | ||
284 | |||
285 | static int run(mddev_t *mddev) | ||
286 | { | ||
287 | mdk_rdev_t *rdev; | ||
288 | struct list_head *tmp; | ||
289 | int i; | ||
290 | |||
291 | 	conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL); | ||
292 | 	if (!conf) | ||
 | 		return -ENOMEM; | ||
 | |||
293 | for (i=0; i<Modes; i++) { | ||
294 | atomic_set(&conf->counters[i], 0); | ||
295 | conf->period[i] = 0; | ||
296 | } | ||
297 | conf->nfaults = 0; | ||
298 | |||
299 | ITERATE_RDEV(mddev, rdev, tmp) | ||
300 | conf->rdev = rdev; | ||
301 | |||
302 | mddev->array_size = mddev->size; | ||
303 | mddev->private = conf; | ||
304 | |||
305 | reconfig(mddev, mddev->layout, -1); | ||
306 | |||
307 | return 0; | ||
308 | } | ||
309 | |||
310 | static int stop(mddev_t *mddev) | ||
311 | { | ||
312 | conf_t *conf = (conf_t *)mddev->private; | ||
313 | |||
314 | kfree(conf); | ||
315 | mddev->private = NULL; | ||
316 | return 0; | ||
317 | } | ||
318 | |||
319 | static mdk_personality_t faulty_personality = | ||
320 | { | ||
321 | .name = "faulty", | ||
322 | .owner = THIS_MODULE, | ||
323 | .make_request = make_request, | ||
324 | .run = run, | ||
325 | .stop = stop, | ||
326 | .status = status, | ||
327 | .reconfig = reconfig, | ||
328 | }; | ||
329 | |||
330 | static int __init raid_init(void) | ||
331 | { | ||
332 | return register_md_personality(FAULTY, &faulty_personality); | ||
333 | } | ||
334 | |||
335 | static void raid_exit(void) | ||
336 | { | ||
337 | unregister_md_personality(FAULTY); | ||
338 | } | ||
339 | |||
340 | module_init(raid_init); | ||
341 | module_exit(raid_exit); | ||
342 | MODULE_LICENSE("GPL"); | ||
343 | MODULE_ALIAS("md-personality-10"); /* faulty */ | ||
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c new file mode 100644 index 000000000000..eb7036485975 --- /dev/null +++ b/drivers/md/kcopyd.c | |||
@@ -0,0 +1,687 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2002 Sistina Software (UK) Limited. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | * | ||
6 | * Kcopyd provides a simple interface for copying an area of one | ||
7 | * block-device to one or more other block-devices, with an asynchronous | ||
8 | * completion notification. | ||
9 | */ | ||
10 | |||
11 | #include <asm/atomic.h> | ||
12 | |||
13 | #include <linux/blkdev.h> | ||
14 | #include <linux/config.h> | ||
15 | #include <linux/fs.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/list.h> | ||
18 | #include <linux/mempool.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/pagemap.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/vmalloc.h> | ||
23 | #include <linux/workqueue.h> | ||
24 | |||
25 | #include "kcopyd.h" | ||
26 | |||
27 | static struct workqueue_struct *_kcopyd_wq; | ||
28 | static struct work_struct _kcopyd_work; | ||
29 | |||
30 | static inline void wake(void) | ||
31 | { | ||
32 | queue_work(_kcopyd_wq, &_kcopyd_work); | ||
33 | } | ||
34 | |||
35 | /*----------------------------------------------------------------- | ||
36 | * Each kcopyd client has its own little pool of preallocated | ||
37 | * pages for kcopyd io. | ||
38 | *---------------------------------------------------------------*/ | ||
39 | struct kcopyd_client { | ||
40 | struct list_head list; | ||
41 | |||
42 | spinlock_t lock; | ||
43 | struct page_list *pages; | ||
44 | unsigned int nr_pages; | ||
45 | unsigned int nr_free_pages; | ||
46 | }; | ||
47 | |||
48 | static struct page_list *alloc_pl(void) | ||
49 | { | ||
50 | struct page_list *pl; | ||
51 | |||
52 | pl = kmalloc(sizeof(*pl), GFP_KERNEL); | ||
53 | if (!pl) | ||
54 | return NULL; | ||
55 | |||
56 | pl->page = alloc_page(GFP_KERNEL); | ||
57 | if (!pl->page) { | ||
58 | kfree(pl); | ||
59 | return NULL; | ||
60 | } | ||
61 | |||
62 | return pl; | ||
63 | } | ||
64 | |||
65 | static void free_pl(struct page_list *pl) | ||
66 | { | ||
67 | __free_page(pl->page); | ||
68 | kfree(pl); | ||
69 | } | ||
70 | |||
71 | static int kcopyd_get_pages(struct kcopyd_client *kc, | ||
72 | unsigned int nr, struct page_list **pages) | ||
73 | { | ||
74 | struct page_list *pl; | ||
75 | |||
76 | spin_lock(&kc->lock); | ||
77 | if (kc->nr_free_pages < nr) { | ||
78 | spin_unlock(&kc->lock); | ||
79 | return -ENOMEM; | ||
80 | } | ||
81 | |||
82 | kc->nr_free_pages -= nr; | ||
83 | for (*pages = pl = kc->pages; --nr; pl = pl->next) | ||
84 | ; | ||
85 | |||
86 | kc->pages = pl->next; | ||
87 | pl->next = NULL; | ||
88 | |||
89 | spin_unlock(&kc->lock); | ||
90 | |||
91 | return 0; | ||
92 | } | ||
93 | |||
94 | static void kcopyd_put_pages(struct kcopyd_client *kc, struct page_list *pl) | ||
95 | { | ||
96 | struct page_list *cursor; | ||
97 | |||
98 | spin_lock(&kc->lock); | ||
99 | for (cursor = pl; cursor->next; cursor = cursor->next) | ||
100 | kc->nr_free_pages++; | ||
101 | |||
102 | kc->nr_free_pages++; | ||
103 | cursor->next = kc->pages; | ||
104 | kc->pages = pl; | ||
105 | spin_unlock(&kc->lock); | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * These three functions resize the page pool. | ||
110 | */ | ||
111 | static void drop_pages(struct page_list *pl) | ||
112 | { | ||
113 | struct page_list *next; | ||
114 | |||
115 | while (pl) { | ||
116 | next = pl->next; | ||
117 | free_pl(pl); | ||
118 | pl = next; | ||
119 | } | ||
120 | } | ||
121 | |||
122 | static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr) | ||
123 | { | ||
124 | unsigned int i; | ||
125 | struct page_list *pl = NULL, *next; | ||
126 | |||
127 | for (i = 0; i < nr; i++) { | ||
128 | next = alloc_pl(); | ||
129 | if (!next) { | ||
130 | if (pl) | ||
131 | drop_pages(pl); | ||
132 | return -ENOMEM; | ||
133 | } | ||
134 | next->next = pl; | ||
135 | pl = next; | ||
136 | } | ||
137 | |||
138 | kcopyd_put_pages(kc, pl); | ||
139 | kc->nr_pages += nr; | ||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | static void client_free_pages(struct kcopyd_client *kc) | ||
144 | { | ||
145 | BUG_ON(kc->nr_free_pages != kc->nr_pages); | ||
146 | drop_pages(kc->pages); | ||
147 | kc->pages = NULL; | ||
148 | kc->nr_free_pages = kc->nr_pages = 0; | ||
149 | } | ||
150 | |||
151 | /*----------------------------------------------------------------- | ||
152 | * kcopyd_jobs need to be allocated by the *clients* of kcopyd, | ||
153 | * for this reason we use a mempool to prevent the client from | ||
154 | * ever having to do io (which could cause a deadlock). | ||
155 | *---------------------------------------------------------------*/ | ||
156 | struct kcopyd_job { | ||
157 | struct kcopyd_client *kc; | ||
158 | struct list_head list; | ||
159 | unsigned long flags; | ||
160 | |||
161 | /* | ||
162 | * Error state of the job. | ||
163 | */ | ||
164 | int read_err; | ||
165 | unsigned int write_err; | ||
166 | |||
167 | /* | ||
168 | * Either READ or WRITE | ||
169 | */ | ||
170 | int rw; | ||
171 | struct io_region source; | ||
172 | |||
173 | /* | ||
174 | * The destinations for the transfer. | ||
175 | */ | ||
176 | unsigned int num_dests; | ||
177 | struct io_region dests[KCOPYD_MAX_REGIONS]; | ||
178 | |||
179 | sector_t offset; | ||
180 | unsigned int nr_pages; | ||
181 | struct page_list *pages; | ||
182 | |||
183 | /* | ||
184 | * Set this to ensure you are notified when the job has | ||
185 | * completed. 'context' is for callback to use. | ||
186 | */ | ||
187 | kcopyd_notify_fn fn; | ||
188 | void *context; | ||
189 | |||
190 | /* | ||
191 | * These fields are only used if the job has been split | ||
192 | * into more manageable parts. | ||
193 | */ | ||
194 | struct semaphore lock; | ||
195 | atomic_t sub_jobs; | ||
196 | sector_t progress; | ||
197 | }; | ||
198 | |||
199 | /* FIXME: this should scale with the number of pages */ | ||
200 | #define MIN_JOBS 512 | ||
201 | |||
202 | static kmem_cache_t *_job_cache; | ||
203 | static mempool_t *_job_pool; | ||
204 | |||
205 | /* | ||
206 | * We maintain three lists of jobs: | ||
207 | * | ||
208 | * i) jobs waiting for pages | ||
209 | * ii) jobs that have pages, and are waiting for the io to be issued. | ||
210 | * iii) jobs that have completed. | ||
211 | * | ||
212 | * All three of these are protected by job_lock. | ||
213 | */ | ||
214 | static DEFINE_SPINLOCK(_job_lock); | ||
215 | |||
216 | static LIST_HEAD(_complete_jobs); | ||
217 | static LIST_HEAD(_io_jobs); | ||
218 | static LIST_HEAD(_pages_jobs); | ||
219 | |||
220 | static int jobs_init(void) | ||
221 | { | ||
222 | _job_cache = kmem_cache_create("kcopyd-jobs", | ||
223 | sizeof(struct kcopyd_job), | ||
224 | __alignof__(struct kcopyd_job), | ||
225 | 0, NULL, NULL); | ||
226 | if (!_job_cache) | ||
227 | return -ENOMEM; | ||
228 | |||
229 | _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab, | ||
230 | mempool_free_slab, _job_cache); | ||
231 | if (!_job_pool) { | ||
232 | kmem_cache_destroy(_job_cache); | ||
233 | return -ENOMEM; | ||
234 | } | ||
235 | |||
236 | return 0; | ||
237 | } | ||
238 | |||
239 | static void jobs_exit(void) | ||
240 | { | ||
241 | BUG_ON(!list_empty(&_complete_jobs)); | ||
242 | BUG_ON(!list_empty(&_io_jobs)); | ||
243 | BUG_ON(!list_empty(&_pages_jobs)); | ||
244 | |||
245 | mempool_destroy(_job_pool); | ||
246 | kmem_cache_destroy(_job_cache); | ||
247 | _job_pool = NULL; | ||
248 | _job_cache = NULL; | ||
249 | } | ||
250 | |||
251 | /* | ||
252 |  * Functions to push a job onto the tail of a given job list, and | ||
253 |  * to pop one off its head. | ||
254 | */ | ||
255 | static inline struct kcopyd_job *pop(struct list_head *jobs) | ||
256 | { | ||
257 | struct kcopyd_job *job = NULL; | ||
258 | unsigned long flags; | ||
259 | |||
260 | spin_lock_irqsave(&_job_lock, flags); | ||
261 | |||
262 | if (!list_empty(jobs)) { | ||
263 | job = list_entry(jobs->next, struct kcopyd_job, list); | ||
264 | list_del(&job->list); | ||
265 | } | ||
266 | spin_unlock_irqrestore(&_job_lock, flags); | ||
267 | |||
268 | return job; | ||
269 | } | ||
270 | |||
271 | static inline void push(struct list_head *jobs, struct kcopyd_job *job) | ||
272 | { | ||
273 | unsigned long flags; | ||
274 | |||
275 | spin_lock_irqsave(&_job_lock, flags); | ||
276 | list_add_tail(&job->list, jobs); | ||
277 | spin_unlock_irqrestore(&_job_lock, flags); | ||
278 | } | ||
279 | |||
280 | /* | ||
281 | * These three functions process 1 item from the corresponding | ||
282 | * job list. | ||
283 | * | ||
284 | * They return: | ||
285 | * < 0: error | ||
286 | * 0: success | ||
287 | * > 0: can't process yet. | ||
288 | */ | ||
289 | static int run_complete_job(struct kcopyd_job *job) | ||
290 | { | ||
291 | void *context = job->context; | ||
292 | int read_err = job->read_err; | ||
293 | unsigned int write_err = job->write_err; | ||
294 | kcopyd_notify_fn fn = job->fn; | ||
295 | |||
296 | kcopyd_put_pages(job->kc, job->pages); | ||
297 | mempool_free(job, _job_pool); | ||
298 | fn(read_err, write_err, context); | ||
299 | return 0; | ||
300 | } | ||
301 | |||
302 | static void complete_io(unsigned long error, void *context) | ||
303 | { | ||
304 | struct kcopyd_job *job = (struct kcopyd_job *) context; | ||
305 | |||
306 | if (error) { | ||
307 | if (job->rw == WRITE) | ||
308 | 			job->write_err |= error; | ||
309 | else | ||
310 | job->read_err = 1; | ||
311 | |||
312 | if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) { | ||
313 | push(&_complete_jobs, job); | ||
314 | wake(); | ||
315 | return; | ||
316 | } | ||
317 | } | ||
318 | |||
319 | if (job->rw == WRITE) | ||
320 | push(&_complete_jobs, job); | ||
321 | |||
322 | else { | ||
323 | job->rw = WRITE; | ||
324 | push(&_io_jobs, job); | ||
325 | } | ||
326 | |||
327 | wake(); | ||
328 | } | ||
329 | |||
330 | /* | ||
331 |  * Issue the io for a particular job, using the pages that have | ||
332 |  * already been reserved for it. | ||
333 | */ | ||
334 | static int run_io_job(struct kcopyd_job *job) | ||
335 | { | ||
336 | int r; | ||
337 | |||
338 | if (job->rw == READ) | ||
339 | r = dm_io_async(1, &job->source, job->rw, | ||
340 | job->pages, | ||
341 | job->offset, complete_io, job); | ||
342 | |||
343 | else | ||
344 | r = dm_io_async(job->num_dests, job->dests, job->rw, | ||
345 | job->pages, | ||
346 | job->offset, complete_io, job); | ||
347 | |||
348 | return r; | ||
349 | } | ||
350 | |||
351 | static int run_pages_job(struct kcopyd_job *job) | ||
352 | { | ||
353 | int r; | ||
354 | |||
355 | job->nr_pages = dm_div_up(job->dests[0].count + job->offset, | ||
356 | PAGE_SIZE >> 9); | ||
357 | r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages); | ||
358 | if (!r) { | ||
359 | /* this job is ready for io */ | ||
360 | push(&_io_jobs, job); | ||
361 | return 0; | ||
362 | } | ||
363 | |||
364 | if (r == -ENOMEM) | ||
365 | /* can't complete now */ | ||
366 | return 1; | ||
367 | |||
368 | return r; | ||
369 | } | ||
370 | |||
371 | /* | ||
372 | * Run through a list for as long as possible. Returns the count | ||
373 | * of successful jobs. | ||
374 | */ | ||
375 | static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *)) | ||
376 | { | ||
377 | struct kcopyd_job *job; | ||
378 | int r, count = 0; | ||
379 | |||
380 | while ((job = pop(jobs))) { | ||
381 | |||
382 | r = fn(job); | ||
383 | |||
384 | if (r < 0) { | ||
385 | /* error this rogue job */ | ||
386 | if (job->rw == WRITE) | ||
387 | job->write_err = (unsigned int) -1; | ||
388 | else | ||
389 | job->read_err = 1; | ||
390 | push(&_complete_jobs, job); | ||
391 | break; | ||
392 | } | ||
393 | |||
394 | if (r > 0) { | ||
395 | /* | ||
396 | * We couldn't service this job ATM, so | ||
397 | * push this job back onto the list. | ||
398 | */ | ||
399 | push(jobs, job); | ||
400 | break; | ||
401 | } | ||
402 | |||
403 | count++; | ||
404 | } | ||
405 | |||
406 | return count; | ||
407 | } | ||
408 | |||
409 | /* | ||
410 | * kcopyd does this every time it's woken up. | ||
411 | */ | ||
412 | static void do_work(void *ignored) | ||
413 | { | ||
414 | /* | ||
415 | * The order that these are called is *very* important. | ||
416 | * complete jobs can free some pages for pages jobs. | ||
417 | * Pages jobs when successful will jump onto the io jobs | ||
418 | * list. io jobs call wake when they complete and it all | ||
419 | * starts again. | ||
420 | */ | ||
421 | process_jobs(&_complete_jobs, run_complete_job); | ||
422 | process_jobs(&_pages_jobs, run_pages_job); | ||
423 | process_jobs(&_io_jobs, run_io_job); | ||
424 | } | ||
425 | |||
426 | /* | ||
427 | * If we are copying a small region we just dispatch a single job | ||
428 | * to do the copy, otherwise the io has to be split up into many | ||
429 | * jobs. | ||
430 | */ | ||
431 | static void dispatch_job(struct kcopyd_job *job) | ||
432 | { | ||
433 | push(&_pages_jobs, job); | ||
434 | wake(); | ||
435 | } | ||
436 | |||
437 | #define SUB_JOB_SIZE 128 | ||
438 | static void segment_complete(int read_err, | ||
439 | unsigned int write_err, void *context) | ||
440 | { | ||
441 | /* FIXME: tidy this function */ | ||
442 | sector_t progress = 0; | ||
443 | sector_t count = 0; | ||
444 | struct kcopyd_job *job = (struct kcopyd_job *) context; | ||
445 | |||
446 | down(&job->lock); | ||
447 | |||
448 | /* update the error */ | ||
449 | if (read_err) | ||
450 | job->read_err = 1; | ||
451 | |||
452 | if (write_err) | ||
453 | 		job->write_err |= write_err; | ||
454 | |||
455 | /* | ||
456 | * Only dispatch more work if there hasn't been an error. | ||
457 | */ | ||
458 | if ((!job->read_err && !job->write_err) || | ||
459 | test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) { | ||
460 | /* get the next chunk of work */ | ||
461 | progress = job->progress; | ||
462 | count = job->source.count - progress; | ||
463 | if (count) { | ||
464 | if (count > SUB_JOB_SIZE) | ||
465 | count = SUB_JOB_SIZE; | ||
466 | |||
467 | job->progress += count; | ||
468 | } | ||
469 | } | ||
470 | up(&job->lock); | ||
471 | |||
472 | if (count) { | ||
473 | int i; | ||
474 | struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO); | ||
475 | |||
476 | *sub_job = *job; | ||
477 | sub_job->source.sector += progress; | ||
478 | sub_job->source.count = count; | ||
479 | |||
480 | for (i = 0; i < job->num_dests; i++) { | ||
481 | sub_job->dests[i].sector += progress; | ||
482 | sub_job->dests[i].count = count; | ||
483 | } | ||
484 | |||
485 | sub_job->fn = segment_complete; | ||
486 | sub_job->context = job; | ||
487 | dispatch_job(sub_job); | ||
488 | |||
489 | } else if (atomic_dec_and_test(&job->sub_jobs)) { | ||
490 | |||
491 | /* | ||
492 | * To avoid a race we must keep the job around | ||
493 | * until after the notify function has completed. | ||
494 | * Otherwise the client may try and stop the job | ||
495 | * after we've completed. | ||
496 | */ | ||
497 | job->fn(read_err, write_err, job->context); | ||
498 | mempool_free(job, _job_pool); | ||
499 | } | ||
500 | } | ||
501 | |||
502 | /* | ||
503 |  * Split a large copy into smaller sub-jobs that between them | ||
504 |  * cover the whole region. | ||
505 | */ | ||
506 | #define SPLIT_COUNT 8 | ||
507 | static void split_job(struct kcopyd_job *job) | ||
508 | { | ||
509 | int i; | ||
510 | |||
511 | atomic_set(&job->sub_jobs, SPLIT_COUNT); | ||
512 | for (i = 0; i < SPLIT_COUNT; i++) | ||
513 | segment_complete(0, 0u, job); | ||
514 | } | ||
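/*
 * For example, with SUB_JOB_SIZE == 128 sectors (64KiB) and
 * SPLIT_COUNT == 8, a 2048 sector copy is carved into sixteen 128 sector
 * sub-jobs; split_job() primes eight of them and each completion in
 * segment_complete() dispatches the next chunk, so at most eight
 * sub-jobs are in flight for any one large copy.
 */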
515 | |||
516 | int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, | ||
517 | unsigned int num_dests, struct io_region *dests, | ||
518 | unsigned int flags, kcopyd_notify_fn fn, void *context) | ||
519 | { | ||
520 | struct kcopyd_job *job; | ||
521 | |||
522 | /* | ||
523 | * Allocate a new job. | ||
524 | */ | ||
525 | job = mempool_alloc(_job_pool, GFP_NOIO); | ||
526 | |||
527 | /* | ||
528 | * set up for the read. | ||
529 | */ | ||
530 | job->kc = kc; | ||
531 | job->flags = flags; | ||
532 | job->read_err = 0; | ||
533 | job->write_err = 0; | ||
534 | job->rw = READ; | ||
535 | |||
536 | job->source = *from; | ||
537 | |||
538 | job->num_dests = num_dests; | ||
539 | memcpy(&job->dests, dests, sizeof(*dests) * num_dests); | ||
540 | |||
541 | job->offset = 0; | ||
542 | job->nr_pages = 0; | ||
543 | job->pages = NULL; | ||
544 | |||
545 | job->fn = fn; | ||
546 | job->context = context; | ||
547 | |||
548 | if (job->source.count < SUB_JOB_SIZE) | ||
549 | dispatch_job(job); | ||
550 | |||
551 | else { | ||
552 | init_MUTEX(&job->lock); | ||
553 | job->progress = 0; | ||
554 | split_job(job); | ||
555 | } | ||
556 | |||
557 | return 0; | ||
558 | } | ||
559 | |||
560 | /* | ||
561 |  * Cancels a kcopyd job, e.g. someone might be deactivating a | ||
562 | * mirror. | ||
563 | */ | ||
564 | int kcopyd_cancel(struct kcopyd_job *job, int block) | ||
565 | { | ||
566 | /* FIXME: finish */ | ||
567 | return -1; | ||
568 | } | ||
569 | |||
570 | /*----------------------------------------------------------------- | ||
571 | * Unit setup | ||
572 | *---------------------------------------------------------------*/ | ||
573 | static DECLARE_MUTEX(_client_lock); | ||
574 | static LIST_HEAD(_clients); | ||
575 | |||
576 | static void client_add(struct kcopyd_client *kc) | ||
577 | { | ||
578 | down(&_client_lock); | ||
579 | list_add(&kc->list, &_clients); | ||
580 | up(&_client_lock); | ||
581 | } | ||
582 | |||
583 | static void client_del(struct kcopyd_client *kc) | ||
584 | { | ||
585 | down(&_client_lock); | ||
586 | list_del(&kc->list); | ||
587 | up(&_client_lock); | ||
588 | } | ||
589 | |||
590 | static DECLARE_MUTEX(kcopyd_init_lock); | ||
591 | static int kcopyd_clients = 0; | ||
592 | |||
593 | static int kcopyd_init(void) | ||
594 | { | ||
595 | int r; | ||
596 | |||
597 | down(&kcopyd_init_lock); | ||
598 | |||
599 | if (kcopyd_clients) { | ||
600 | /* Already initialized. */ | ||
601 | kcopyd_clients++; | ||
602 | up(&kcopyd_init_lock); | ||
603 | return 0; | ||
604 | } | ||
605 | |||
606 | r = jobs_init(); | ||
607 | if (r) { | ||
608 | up(&kcopyd_init_lock); | ||
609 | return r; | ||
610 | } | ||
611 | |||
612 | _kcopyd_wq = create_singlethread_workqueue("kcopyd"); | ||
613 | if (!_kcopyd_wq) { | ||
614 | jobs_exit(); | ||
615 | up(&kcopyd_init_lock); | ||
616 | return -ENOMEM; | ||
617 | } | ||
618 | |||
619 | kcopyd_clients++; | ||
620 | INIT_WORK(&_kcopyd_work, do_work, NULL); | ||
621 | up(&kcopyd_init_lock); | ||
622 | return 0; | ||
623 | } | ||
624 | |||
625 | static void kcopyd_exit(void) | ||
626 | { | ||
627 | down(&kcopyd_init_lock); | ||
628 | kcopyd_clients--; | ||
629 | if (!kcopyd_clients) { | ||
630 | jobs_exit(); | ||
631 | destroy_workqueue(_kcopyd_wq); | ||
632 | _kcopyd_wq = NULL; | ||
633 | } | ||
634 | up(&kcopyd_init_lock); | ||
635 | } | ||
636 | |||
637 | int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result) | ||
638 | { | ||
639 | int r = 0; | ||
640 | struct kcopyd_client *kc; | ||
641 | |||
642 | r = kcopyd_init(); | ||
643 | if (r) | ||
644 | return r; | ||
645 | |||
646 | kc = kmalloc(sizeof(*kc), GFP_KERNEL); | ||
647 | if (!kc) { | ||
648 | kcopyd_exit(); | ||
649 | return -ENOMEM; | ||
650 | } | ||
651 | |||
652 | spin_lock_init(&kc->lock); | ||
653 | kc->pages = NULL; | ||
654 | kc->nr_pages = kc->nr_free_pages = 0; | ||
655 | r = client_alloc_pages(kc, nr_pages); | ||
656 | if (r) { | ||
657 | kfree(kc); | ||
658 | kcopyd_exit(); | ||
659 | return r; | ||
660 | } | ||
661 | |||
662 | r = dm_io_get(nr_pages); | ||
663 | if (r) { | ||
664 | client_free_pages(kc); | ||
665 | kfree(kc); | ||
666 | kcopyd_exit(); | ||
667 | return r; | ||
668 | } | ||
669 | |||
670 | client_add(kc); | ||
671 | *result = kc; | ||
672 | return 0; | ||
673 | } | ||
674 | |||
675 | void kcopyd_client_destroy(struct kcopyd_client *kc) | ||
676 | { | ||
677 | dm_io_put(kc->nr_pages); | ||
678 | client_free_pages(kc); | ||
679 | client_del(kc); | ||
680 | kfree(kc); | ||
681 | kcopyd_exit(); | ||
682 | } | ||
683 | |||
684 | EXPORT_SYMBOL(kcopyd_client_create); | ||
685 | EXPORT_SYMBOL(kcopyd_client_destroy); | ||
686 | EXPORT_SYMBOL(kcopyd_copy); | ||
687 | EXPORT_SYMBOL(kcopyd_cancel); | ||
diff --git a/drivers/md/kcopyd.h b/drivers/md/kcopyd.h new file mode 100644 index 000000000000..4621ea055c0e --- /dev/null +++ b/drivers/md/kcopyd.h | |||
@@ -0,0 +1,42 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001 Sistina Software | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | * | ||
6 | * Kcopyd provides a simple interface for copying an area of one | ||
7 | * block-device to one or more other block-devices, with an asynchronous | ||
8 | * completion notification. | ||
9 | */ | ||
10 | |||
11 | #ifndef DM_KCOPYD_H | ||
12 | #define DM_KCOPYD_H | ||
13 | |||
14 | #include "dm-io.h" | ||
15 | |||
16 | /* FIXME: make this configurable */ | ||
17 | #define KCOPYD_MAX_REGIONS 8 | ||
18 | |||
19 | #define KCOPYD_IGNORE_ERROR 1 | ||
20 | |||
21 | /* | ||
22 | * To use kcopyd you must first create a kcopyd client object. | ||
23 | */ | ||
24 | struct kcopyd_client; | ||
25 | int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result); | ||
26 | void kcopyd_client_destroy(struct kcopyd_client *kc); | ||
27 | |||
28 | /* | ||
29 | * Submit a copy job to kcopyd. This is built on top of the | ||
30 |  * client functions above. | ||
31 | * | ||
32 | * read_err is a boolean, | ||
33 | * write_err is a bitset, with 1 bit for each destination region | ||
34 | */ | ||
35 | typedef void (*kcopyd_notify_fn)(int read_err, | ||
36 | unsigned int write_err, void *context); | ||
37 | |||
38 | int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, | ||
39 | unsigned int num_dests, struct io_region *dests, | ||
40 | unsigned int flags, kcopyd_notify_fn fn, void *context); | ||
41 | |||
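/*
 * Illustrative sketch only (kept under "#if 0", never compiled): a client
 * reserves pages, submits a single-destination copy and waits for the
 * notify fn.  The function names, the completion-based waiting and the
 * io_region field layout (bdev/sector/count, as declared in dm-io.h) are
 * assumptions made for the example, not part of this interface.
 */
#if 0
#include <linux/completion.h>

static void example_copy_done(int read_err, unsigned int write_err,
			      void *context)
{
	/* read_err is a boolean, write_err has one bit per destination */
	complete((struct completion *) context);
}

static int example_copy(struct block_device *src, struct block_device *dst,
			sector_t nr_sectors)
{
	struct kcopyd_client *kc;
	struct io_region from, to;
	struct completion done;
	int r;

	r = kcopyd_client_create(32, &kc);	/* 32 preallocated pages */
	if (r)
		return r;

	from.bdev = src;  from.sector = 0;  from.count = nr_sectors;
	to.bdev = dst;    to.sector = 0;    to.count = nr_sectors;

	init_completion(&done);
	r = kcopyd_copy(kc, &from, 1, &to, 0, example_copy_done, &done);
	if (!r)
		wait_for_completion(&done);

	kcopyd_client_destroy(kc);
	return r;
}
#endif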
42 | #endif | ||
diff --git a/drivers/md/linear.c b/drivers/md/linear.c new file mode 100644 index 000000000000..161e9aa87291 --- /dev/null +++ b/drivers/md/linear.c | |||
@@ -0,0 +1,343 @@ | |||
1 | /* | ||
2 | linear.c : Multiple Devices driver for Linux | ||
3 | Copyright (C) 1994-96 Marc ZYNGIER | ||
4 | <zyngier@ufr-info-p7.ibp.fr> or | ||
5 | <maz@gloups.fdn.fr> | ||
6 | |||
7 | Linear mode management functions. | ||
8 | |||
9 | This program is free software; you can redistribute it and/or modify | ||
10 | it under the terms of the GNU General Public License as published by | ||
11 | the Free Software Foundation; either version 2, or (at your option) | ||
12 | any later version. | ||
13 | |||
14 | You should have received a copy of the GNU General Public License | ||
15 | (for example /usr/src/linux/COPYING); if not, write to the Free | ||
16 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/module.h> | ||
20 | |||
21 | #include <linux/raid/md.h> | ||
22 | #include <linux/slab.h> | ||
23 | #include <linux/raid/linear.h> | ||
24 | |||
25 | #define MAJOR_NR MD_MAJOR | ||
26 | #define MD_DRIVER | ||
27 | #define MD_PERSONALITY | ||
28 | |||
29 | /* | ||
30 | * find which device holds a particular offset | ||
31 | */ | ||
32 | static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) | ||
33 | { | ||
34 | dev_info_t *hash; | ||
35 | linear_conf_t *conf = mddev_to_conf(mddev); | ||
36 | sector_t block = sector >> 1; | ||
37 | |||
38 | /* | ||
39 | 	 * sector_div(a,b) returns the remainder and sets a to a/b | ||
40 | */ | ||
41 | (void)sector_div(block, conf->smallest->size); | ||
42 | hash = conf->hash_table[block]; | ||
43 | |||
44 | while ((sector>>1) >= (hash->size + hash->offset)) | ||
45 | hash++; | ||
46 | return hash; | ||
47 | } | ||
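/*
 * For example, if conf->smallest->size is 1000 (1K blocks), a request at
 * sector 5000 is block 2500, so the first guess is hash_table[2]; the
 * while loop above then steps forward through the dev_info array if that
 * device actually ends before block 2500.
 */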
48 | |||
49 | /** | ||
50 |  * linear_mergeable_bvec -- tell bio layer if two requests can be merged | ||
51 |  * @q: request queue | ||
52 |  * @bio: the bio that's been built up so far | ||
53 |  * @biovec: the bio_vec that could be merged to it. | ||
54 | * | ||
55 | * Return amount of bytes we can take at this offset | ||
56 | */ | ||
57 | static int linear_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec) | ||
58 | { | ||
59 | mddev_t *mddev = q->queuedata; | ||
60 | dev_info_t *dev0; | ||
61 | unsigned long maxsectors, bio_sectors = bio->bi_size >> 9; | ||
62 | sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); | ||
63 | |||
64 | dev0 = which_dev(mddev, sector); | ||
65 | maxsectors = (dev0->size << 1) - (sector - (dev0->offset<<1)); | ||
66 | |||
67 | if (maxsectors < bio_sectors) | ||
68 | maxsectors = 0; | ||
69 | else | ||
70 | maxsectors -= bio_sectors; | ||
71 | |||
72 | 	if (maxsectors <= (PAGE_SIZE >> 9) && bio_sectors == 0) | ||
73 | return biovec->bv_len; | ||
74 | /* The bytes available at this offset could be really big, | ||
75 | * so we cap at 2^31 to avoid overflow */ | ||
76 | if (maxsectors > (1 << (31-9))) | ||
77 | return 1<<31; | ||
78 | return maxsectors << 9; | ||
79 | } | ||
80 | |||
81 | static void linear_unplug(request_queue_t *q) | ||
82 | { | ||
83 | mddev_t *mddev = q->queuedata; | ||
84 | linear_conf_t *conf = mddev_to_conf(mddev); | ||
85 | int i; | ||
86 | |||
87 | for (i=0; i < mddev->raid_disks; i++) { | ||
88 | request_queue_t *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); | ||
89 | if (r_queue->unplug_fn) | ||
90 | r_queue->unplug_fn(r_queue); | ||
91 | } | ||
92 | } | ||
93 | |||
94 | static int linear_issue_flush(request_queue_t *q, struct gendisk *disk, | ||
95 | sector_t *error_sector) | ||
96 | { | ||
97 | mddev_t *mddev = q->queuedata; | ||
98 | linear_conf_t *conf = mddev_to_conf(mddev); | ||
99 | int i, ret = 0; | ||
100 | |||
101 | for (i=0; i < mddev->raid_disks && ret == 0; i++) { | ||
102 | struct block_device *bdev = conf->disks[i].rdev->bdev; | ||
103 | request_queue_t *r_queue = bdev_get_queue(bdev); | ||
104 | |||
105 | if (!r_queue->issue_flush_fn) | ||
106 | ret = -EOPNOTSUPP; | ||
107 | else | ||
108 | ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); | ||
109 | } | ||
110 | return ret; | ||
111 | } | ||
112 | |||
113 | static int linear_run (mddev_t *mddev) | ||
114 | { | ||
115 | linear_conf_t *conf; | ||
116 | dev_info_t **table; | ||
117 | mdk_rdev_t *rdev; | ||
118 | int i, nb_zone, cnt; | ||
119 | sector_t start; | ||
120 | sector_t curr_offset; | ||
121 | struct list_head *tmp; | ||
122 | |||
123 | conf = kmalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t), | ||
124 | GFP_KERNEL); | ||
125 | if (!conf) | ||
126 | goto out; | ||
127 | memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t)); | ||
128 | mddev->private = conf; | ||
129 | |||
130 | /* | ||
131 | * Find the smallest device. | ||
132 | */ | ||
133 | |||
134 | conf->smallest = NULL; | ||
135 | cnt = 0; | ||
136 | mddev->array_size = 0; | ||
137 | |||
138 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
139 | int j = rdev->raid_disk; | ||
140 | dev_info_t *disk = conf->disks + j; | ||
141 | |||
142 | 		if (j < 0 || j >= mddev->raid_disks || disk->rdev) { | ||
143 | printk("linear: disk numbering problem. Aborting!\n"); | ||
144 | goto out; | ||
145 | } | ||
146 | |||
147 | disk->rdev = rdev; | ||
148 | |||
149 | blk_queue_stack_limits(mddev->queue, | ||
150 | rdev->bdev->bd_disk->queue); | ||
151 | /* as we don't honour merge_bvec_fn, we must never risk | ||
152 | 		 * violating it, so limit ->max_sectors to one PAGE, as | ||
153 | * a one page request is never in violation. | ||
154 | */ | ||
155 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
156 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
157 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | ||
158 | |||
159 | disk->size = rdev->size; | ||
160 | mddev->array_size += rdev->size; | ||
161 | |||
162 | if (!conf->smallest || (disk->size < conf->smallest->size)) | ||
163 | conf->smallest = disk; | ||
164 | cnt++; | ||
165 | } | ||
166 | if (cnt != mddev->raid_disks) { | ||
167 | printk("linear: not enough drives present. Aborting!\n"); | ||
168 | goto out; | ||
169 | } | ||
170 | |||
171 | /* | ||
172 | * This code was restructured to work around a gcc-2.95.3 internal | ||
173 | * compiler error. Alter it with care. | ||
174 | */ | ||
175 | { | ||
176 | sector_t sz; | ||
177 | unsigned round; | ||
178 | unsigned long base; | ||
179 | |||
180 | sz = mddev->array_size; | ||
181 | base = conf->smallest->size; | ||
182 | round = sector_div(sz, base); | ||
183 | nb_zone = conf->nr_zones = sz + (round ? 1 : 0); | ||
184 | } | ||
185 | |||
186 | conf->hash_table = kmalloc (sizeof (dev_info_t*) * nb_zone, | ||
187 | GFP_KERNEL); | ||
188 | if (!conf->hash_table) | ||
189 | goto out; | ||
190 | |||
191 | /* | ||
192 | * Here we generate the linear hash table | ||
193 | */ | ||
194 | table = conf->hash_table; | ||
195 | start = 0; | ||
196 | curr_offset = 0; | ||
197 | for (i = 0; i < cnt; i++) { | ||
198 | dev_info_t *disk = conf->disks + i; | ||
199 | |||
200 | disk->offset = curr_offset; | ||
201 | curr_offset += disk->size; | ||
202 | |||
203 | /* 'curr_offset' is the end of this disk | ||
204 | * 'start' is the start of table | ||
205 | */ | ||
206 | while (start < curr_offset) { | ||
207 | *table++ = disk; | ||
208 | start += conf->smallest->size; | ||
209 | } | ||
210 | } | ||
211 | if (table-conf->hash_table != nb_zone) | ||
212 | BUG(); | ||
213 | |||
214 | blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); | ||
215 | mddev->queue->unplug_fn = linear_unplug; | ||
216 | mddev->queue->issue_flush_fn = linear_issue_flush; | ||
217 | return 0; | ||
218 | |||
219 | out: | ||
220 | if (conf) | ||
221 | kfree(conf); | ||
222 | return 1; | ||
223 | } | ||
224 | |||
225 | static int linear_stop (mddev_t *mddev) | ||
226 | { | ||
227 | linear_conf_t *conf = mddev_to_conf(mddev); | ||
228 | |||
229 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
230 | kfree(conf->hash_table); | ||
231 | kfree(conf); | ||
232 | |||
233 | return 0; | ||
234 | } | ||
235 | |||
236 | static int linear_make_request (request_queue_t *q, struct bio *bio) | ||
237 | { | ||
238 | mddev_t *mddev = q->queuedata; | ||
239 | dev_info_t *tmp_dev; | ||
240 | sector_t block; | ||
241 | |||
242 | if (bio_data_dir(bio)==WRITE) { | ||
243 | disk_stat_inc(mddev->gendisk, writes); | ||
244 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); | ||
245 | } else { | ||
246 | disk_stat_inc(mddev->gendisk, reads); | ||
247 | disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); | ||
248 | } | ||
249 | |||
250 | tmp_dev = which_dev(mddev, bio->bi_sector); | ||
251 | block = bio->bi_sector >> 1; | ||
252 | |||
253 | if (unlikely(block >= (tmp_dev->size + tmp_dev->offset) | ||
254 | || block < tmp_dev->offset)) { | ||
255 | char b[BDEVNAME_SIZE]; | ||
256 | |||
257 | printk("linear_make_request: Block %llu out of bounds on " | ||
258 | "dev %s size %llu offset %llu\n", | ||
259 | (unsigned long long)block, | ||
260 | bdevname(tmp_dev->rdev->bdev, b), | ||
261 | (unsigned long long)tmp_dev->size, | ||
262 | (unsigned long long)tmp_dev->offset); | ||
263 | bio_io_error(bio, bio->bi_size); | ||
264 | return 0; | ||
265 | } | ||
266 | if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > | ||
267 | (tmp_dev->offset + tmp_dev->size)<<1)) { | ||
268 | /* This bio crosses a device boundary, so we have to | ||
269 | * split it. | ||
270 | */ | ||
271 | struct bio_pair *bp; | ||
272 | bp = bio_split(bio, bio_split_pool, | ||
273 | (bio->bi_sector + (bio->bi_size >> 9) - | ||
274 | (tmp_dev->offset + tmp_dev->size))<<1); | ||
275 | if (linear_make_request(q, &bp->bio1)) | ||
276 | generic_make_request(&bp->bio1); | ||
277 | if (linear_make_request(q, &bp->bio2)) | ||
278 | generic_make_request(&bp->bio2); | ||
279 | bio_pair_release(bp); | ||
280 | return 0; | ||
281 | } | ||
282 | |||
283 | bio->bi_bdev = tmp_dev->rdev->bdev; | ||
284 | bio->bi_sector = bio->bi_sector - (tmp_dev->offset << 1) + tmp_dev->rdev->data_offset; | ||
285 | |||
286 | return 1; | ||
287 | } | ||
288 | |||
289 | static void linear_status (struct seq_file *seq, mddev_t *mddev) | ||
290 | { | ||
291 | |||
292 | #undef MD_DEBUG | ||
293 | #ifdef MD_DEBUG | ||
294 | int j; | ||
295 | linear_conf_t *conf = mddev_to_conf(mddev); | ||
296 | sector_t s = 0; | ||
297 | |||
298 | seq_printf(seq, " "); | ||
299 | for (j = 0; j < conf->nr_zones; j++) | ||
300 | { | ||
301 | char b[BDEVNAME_SIZE]; | ||
302 | s += conf->smallest_size; | ||
303 | seq_printf(seq, "[%s", | ||
304 | bdevname(conf->hash_table[j][0].rdev->bdev,b)); | ||
305 | |||
306 | if (s > conf->hash_table[j][0].offset + | ||
307 | conf->hash_table[j][0].size) | ||
308 | seq_printf(seq, "/%s] ", | ||
309 | bdevname(conf->hash_table[j][1].rdev->bdev,b)); | ||
310 | else | ||
311 | seq_printf(seq, "] "); | ||
312 | } | ||
313 | seq_printf(seq, "\n"); | ||
314 | #endif | ||
315 | seq_printf(seq, " %dk rounding", mddev->chunk_size/1024); | ||
316 | } | ||
317 | |||
318 | |||
319 | static mdk_personality_t linear_personality= | ||
320 | { | ||
321 | .name = "linear", | ||
322 | .owner = THIS_MODULE, | ||
323 | .make_request = linear_make_request, | ||
324 | .run = linear_run, | ||
325 | .stop = linear_stop, | ||
326 | .status = linear_status, | ||
327 | }; | ||
328 | |||
329 | static int __init linear_init (void) | ||
330 | { | ||
331 | return register_md_personality (LINEAR, &linear_personality); | ||
332 | } | ||
333 | |||
334 | static void linear_exit (void) | ||
335 | { | ||
336 | unregister_md_personality (LINEAR); | ||
337 | } | ||
338 | |||
339 | |||
340 | module_init(linear_init); | ||
341 | module_exit(linear_exit); | ||
342 | MODULE_LICENSE("GPL"); | ||
343 | MODULE_ALIAS("md-personality-1"); /* LINEAR */ | ||
diff --git a/drivers/md/md.c b/drivers/md/md.c new file mode 100644 index 000000000000..04562add1920 --- /dev/null +++ b/drivers/md/md.c | |||
@@ -0,0 +1,3766 @@ | |||
1 | /* | ||
2 | md.c : Multiple Devices driver for Linux | ||
3 | Copyright (C) 1998, 1999, 2000 Ingo Molnar | ||
4 | |||
5 | completely rewritten, based on the MD driver code from Marc Zyngier | ||
6 | |||
7 | Changes: | ||
8 | |||
9 | - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar | ||
10 | - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> | ||
11 | - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> | ||
12 | - kerneld support by Boris Tobotras <boris@xtalk.msk.su> | ||
13 | - kmod support by: Cyrus Durgin | ||
14 | - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> | ||
15 | - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> | ||
16 | |||
17 | - lots of fixes and improvements to the RAID1/RAID5 and generic | ||
18 | RAID code (such as request based resynchronization): | ||
19 | |||
20 | Neil Brown <neilb@cse.unsw.edu.au>. | ||
21 | |||
22 | This program is free software; you can redistribute it and/or modify | ||
23 | it under the terms of the GNU General Public License as published by | ||
24 | the Free Software Foundation; either version 2, or (at your option) | ||
25 | any later version. | ||
26 | |||
27 | You should have received a copy of the GNU General Public License | ||
28 | (for example /usr/src/linux/COPYING); if not, write to the Free | ||
29 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
30 | */ | ||
31 | |||
32 | #include <linux/module.h> | ||
33 | #include <linux/config.h> | ||
34 | #include <linux/linkage.h> | ||
35 | #include <linux/raid/md.h> | ||
36 | #include <linux/sysctl.h> | ||
37 | #include <linux/devfs_fs_kernel.h> | ||
38 | #include <linux/buffer_head.h> /* for invalidate_bdev */ | ||
39 | #include <linux/suspend.h> | ||
40 | |||
41 | #include <linux/init.h> | ||
42 | |||
43 | #ifdef CONFIG_KMOD | ||
44 | #include <linux/kmod.h> | ||
45 | #endif | ||
46 | |||
47 | #include <asm/unaligned.h> | ||
48 | |||
49 | #define MAJOR_NR MD_MAJOR | ||
50 | #define MD_DRIVER | ||
51 | |||
52 | /* 63 partitions with the alternate major number (mdp) */ | ||
53 | #define MdpMinorShift 6 | ||
54 | |||
55 | #define DEBUG 0 | ||
56 | #define dprintk(x...) ((void)(DEBUG && printk(x))) | ||
57 | |||
58 | |||
59 | #ifndef MODULE | ||
60 | static void autostart_arrays (int part); | ||
61 | #endif | ||
62 | |||
63 | static mdk_personality_t *pers[MAX_PERSONALITY]; | ||
64 | static DEFINE_SPINLOCK(pers_lock); | ||
65 | |||
66 | /* | ||
67 | * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' | ||
68 | * is 1000 KB/sec, so the extra system load does not show up that much. | ||
69 | * Increase it if you want to have more _guaranteed_ speed. Note that | ||
70 | * the RAID driver will use the maximum available bandwidth if the IO | ||
71 | * subsystem is idle. There is also an 'absolute maximum' reconstruction | ||
72 | * speed limit - in case reconstruction slows down your system despite | ||
73 | * idle IO detection. | ||
74 | * | ||
75 | * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. | ||
76 | */ | ||
77 | |||
78 | static int sysctl_speed_limit_min = 1000; | ||
79 | static int sysctl_speed_limit_max = 200000; | ||
80 | |||
81 | static struct ctl_table_header *raid_table_header; | ||
82 | |||
83 | static ctl_table raid_table[] = { | ||
84 | { | ||
85 | .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, | ||
86 | .procname = "speed_limit_min", | ||
87 | .data = &sysctl_speed_limit_min, | ||
88 | .maxlen = sizeof(int), | ||
89 | .mode = 0644, | ||
90 | .proc_handler = &proc_dointvec, | ||
91 | }, | ||
92 | { | ||
93 | .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, | ||
94 | .procname = "speed_limit_max", | ||
95 | .data = &sysctl_speed_limit_max, | ||
96 | .maxlen = sizeof(int), | ||
97 | .mode = 0644, | ||
98 | .proc_handler = &proc_dointvec, | ||
99 | }, | ||
100 | { .ctl_name = 0 } | ||
101 | }; | ||
102 | |||
103 | static ctl_table raid_dir_table[] = { | ||
104 | { | ||
105 | .ctl_name = DEV_RAID, | ||
106 | .procname = "raid", | ||
107 | .maxlen = 0, | ||
108 | .mode = 0555, | ||
109 | .child = raid_table, | ||
110 | }, | ||
111 | { .ctl_name = 0 } | ||
112 | }; | ||
113 | |||
114 | static ctl_table raid_root_table[] = { | ||
115 | { | ||
116 | .ctl_name = CTL_DEV, | ||
117 | .procname = "dev", | ||
118 | .maxlen = 0, | ||
119 | .mode = 0555, | ||
120 | .child = raid_dir_table, | ||
121 | }, | ||
122 | { .ctl_name = 0 } | ||
123 | }; | ||
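The three ctl_table levels above wire sysctl_speed_limit_min and sysctl_speed_limit_max into /proc/sys/dev/raid/, so the limits described in the comment can be tuned at run time, e.g. with "echo 50000 > /proc/sys/dev/raid/speed_limit_min". An illustrative user-space program doing the same (the 50000 KB/sec value is an arbitrary example):

    #include <stdio.h>

    int main(void)
    {
        /* Value is in KB/sec, matching sysctl_speed_limit_min above. */
        FILE *f = fopen("/proc/sys/dev/raid/speed_limit_min", "w");

        if (!f) {
            perror("speed_limit_min");
            return 1;
        }
        fprintf(f, "50000\n");    /* arbitrary example value */
        return fclose(f) ? 1 : 0;
    }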
124 | |||
125 | static struct block_device_operations md_fops; | ||
126 | |||
127 | /* | ||
136 | * Allows iteration over all existing md arrays. | ||
129 | * all_mddevs_lock protects this list. | ||
130 | */ | ||
131 | static LIST_HEAD(all_mddevs); | ||
132 | static DEFINE_SPINLOCK(all_mddevs_lock); | ||
133 | |||
134 | |||
135 | /* | ||
136 | * iterates through all used mddevs in the system. | ||
137 | * We take care to grab the all_mddevs_lock whenever navigating | ||
138 | * the list, and to always hold a refcount when unlocked. | ||
139 | * Any code which breaks out of this loop while owning | ||
140 | * a reference to the current mddev must mddev_put it. | ||
141 | */ | ||
142 | #define ITERATE_MDDEV(mddev,tmp) \ | ||
143 | \ | ||
144 | for (({ spin_lock(&all_mddevs_lock); \ | ||
145 | tmp = all_mddevs.next; \ | ||
146 | mddev = NULL;}); \ | ||
147 | ({ if (tmp != &all_mddevs) \ | ||
148 | mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ | ||
149 | spin_unlock(&all_mddevs_lock); \ | ||
150 | if (mddev) mddev_put(mddev); \ | ||
151 | mddev = list_entry(tmp, mddev_t, all_mddevs); \ | ||
152 | tmp != &all_mddevs;}); \ | ||
153 | ({ spin_lock(&all_mddevs_lock); \ | ||
154 | tmp = tmp->next;}) \ | ||
155 | ) | ||
156 | |||
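A sketch of how the macro is meant to be used; example_walk_arrays is a hypothetical helper, and md_print_devices() further down in this file uses the macro in the same way:

    /* Illustrative only: visit every array and log its name.  Inside the
     * body all_mddevs_lock is not held, but the macro holds a reference
     * on the current mddev and drops it on the next iteration.
     */
    static void example_walk_arrays(void)
    {
        mddev_t *mddev;
        struct list_head *tmp;

        ITERATE_MDDEV(mddev, tmp) {
            printk(KERN_INFO "md: visiting %s\n", mdname(mddev));
        }
    }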
157 | |||
158 | static int md_fail_request (request_queue_t *q, struct bio *bio) | ||
159 | { | ||
160 | bio_io_error(bio, bio->bi_size); | ||
161 | return 0; | ||
162 | } | ||
163 | |||
164 | static inline mddev_t *mddev_get(mddev_t *mddev) | ||
165 | { | ||
166 | atomic_inc(&mddev->active); | ||
167 | return mddev; | ||
168 | } | ||
169 | |||
170 | static void mddev_put(mddev_t *mddev) | ||
171 | { | ||
172 | if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) | ||
173 | return; | ||
174 | if (!mddev->raid_disks && list_empty(&mddev->disks)) { | ||
175 | list_del(&mddev->all_mddevs); | ||
176 | blk_put_queue(mddev->queue); | ||
177 | kfree(mddev); | ||
178 | } | ||
179 | spin_unlock(&all_mddevs_lock); | ||
180 | } | ||
181 | |||
182 | static mddev_t * mddev_find(dev_t unit) | ||
183 | { | ||
184 | mddev_t *mddev, *new = NULL; | ||
185 | |||
186 | retry: | ||
187 | spin_lock(&all_mddevs_lock); | ||
188 | list_for_each_entry(mddev, &all_mddevs, all_mddevs) | ||
189 | if (mddev->unit == unit) { | ||
190 | mddev_get(mddev); | ||
191 | spin_unlock(&all_mddevs_lock); | ||
192 | if (new) | ||
193 | kfree(new); | ||
194 | return mddev; | ||
195 | } | ||
196 | |||
197 | if (new) { | ||
198 | list_add(&new->all_mddevs, &all_mddevs); | ||
199 | spin_unlock(&all_mddevs_lock); | ||
200 | return new; | ||
201 | } | ||
202 | spin_unlock(&all_mddevs_lock); | ||
203 | |||
204 | new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); | ||
205 | if (!new) | ||
206 | return NULL; | ||
207 | |||
208 | memset(new, 0, sizeof(*new)); | ||
209 | |||
210 | new->unit = unit; | ||
211 | if (MAJOR(unit) == MD_MAJOR) | ||
212 | new->md_minor = MINOR(unit); | ||
213 | else | ||
214 | new->md_minor = MINOR(unit) >> MdpMinorShift; | ||
215 | |||
216 | init_MUTEX(&new->reconfig_sem); | ||
217 | INIT_LIST_HEAD(&new->disks); | ||
218 | INIT_LIST_HEAD(&new->all_mddevs); | ||
219 | init_timer(&new->safemode_timer); | ||
220 | atomic_set(&new->active, 1); | ||
221 | |||
222 | new->queue = blk_alloc_queue(GFP_KERNEL); | ||
223 | if (!new->queue) { | ||
224 | kfree(new); | ||
225 | return NULL; | ||
226 | } | ||
227 | |||
228 | blk_queue_make_request(new->queue, md_fail_request); | ||
229 | |||
230 | goto retry; | ||
231 | } | ||
232 | |||
233 | static inline int mddev_lock(mddev_t * mddev) | ||
234 | { | ||
235 | return down_interruptible(&mddev->reconfig_sem); | ||
236 | } | ||
237 | |||
238 | static inline void mddev_lock_uninterruptible(mddev_t * mddev) | ||
239 | { | ||
240 | down(&mddev->reconfig_sem); | ||
241 | } | ||
242 | |||
243 | static inline int mddev_trylock(mddev_t * mddev) | ||
244 | { | ||
245 | return down_trylock(&mddev->reconfig_sem); | ||
246 | } | ||
247 | |||
248 | static inline void mddev_unlock(mddev_t * mddev) | ||
249 | { | ||
250 | up(&mddev->reconfig_sem); | ||
251 | |||
252 | if (mddev->thread) | ||
253 | md_wakeup_thread(mddev->thread); | ||
254 | } | ||
255 | |||
256 | mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) | ||
257 | { | ||
258 | mdk_rdev_t * rdev; | ||
259 | struct list_head *tmp; | ||
260 | |||
261 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
262 | if (rdev->desc_nr == nr) | ||
263 | return rdev; | ||
264 | } | ||
265 | return NULL; | ||
266 | } | ||
267 | |||
268 | static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) | ||
269 | { | ||
270 | struct list_head *tmp; | ||
271 | mdk_rdev_t *rdev; | ||
272 | |||
273 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
274 | if (rdev->bdev->bd_dev == dev) | ||
275 | return rdev; | ||
276 | } | ||
277 | return NULL; | ||
278 | } | ||
279 | |||
280 | inline static sector_t calc_dev_sboffset(struct block_device *bdev) | ||
281 | { | ||
282 | sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; | ||
283 | return MD_NEW_SIZE_BLOCKS(size); | ||
284 | } | ||
285 | |||
286 | static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) | ||
287 | { | ||
288 | sector_t size; | ||
289 | |||
290 | size = rdev->sb_offset; | ||
291 | |||
292 | if (chunk_size) | ||
293 | size &= ~((sector_t)chunk_size/1024 - 1); | ||
294 | return size; | ||
295 | } | ||
296 | |||
297 | static int alloc_disk_sb(mdk_rdev_t * rdev) | ||
298 | { | ||
299 | if (rdev->sb_page) | ||
300 | MD_BUG(); | ||
301 | |||
302 | rdev->sb_page = alloc_page(GFP_KERNEL); | ||
303 | if (!rdev->sb_page) { | ||
304 | printk(KERN_ALERT "md: out of memory.\n"); | ||
305 | return -EINVAL; | ||
306 | } | ||
307 | |||
308 | return 0; | ||
309 | } | ||
310 | |||
311 | static void free_disk_sb(mdk_rdev_t * rdev) | ||
312 | { | ||
313 | if (rdev->sb_page) { | ||
314 | page_cache_release(rdev->sb_page); | ||
315 | rdev->sb_loaded = 0; | ||
316 | rdev->sb_page = NULL; | ||
317 | rdev->sb_offset = 0; | ||
318 | rdev->size = 0; | ||
319 | } | ||
320 | } | ||
321 | |||
322 | |||
323 | static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) | ||
324 | { | ||
325 | if (bio->bi_size) | ||
326 | return 1; | ||
327 | |||
328 | complete((struct completion*)bio->bi_private); | ||
329 | return 0; | ||
330 | } | ||
331 | |||
332 | static int sync_page_io(struct block_device *bdev, sector_t sector, int size, | ||
333 | struct page *page, int rw) | ||
334 | { | ||
335 | struct bio *bio = bio_alloc(GFP_KERNEL, 1); | ||
336 | struct completion event; | ||
337 | int ret; | ||
338 | |||
339 | rw |= (1 << BIO_RW_SYNC); | ||
340 | |||
341 | bio->bi_bdev = bdev; | ||
342 | bio->bi_sector = sector; | ||
343 | bio_add_page(bio, page, size, 0); | ||
344 | init_completion(&event); | ||
345 | bio->bi_private = &event; | ||
346 | bio->bi_end_io = bi_complete; | ||
347 | submit_bio(rw, bio); | ||
348 | wait_for_completion(&event); | ||
349 | |||
350 | ret = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
351 | bio_put(bio); | ||
352 | return ret; | ||
353 | } | ||
354 | |||
355 | static int read_disk_sb(mdk_rdev_t * rdev) | ||
356 | { | ||
357 | char b[BDEVNAME_SIZE]; | ||
358 | if (!rdev->sb_page) { | ||
359 | MD_BUG(); | ||
360 | return -EINVAL; | ||
361 | } | ||
362 | if (rdev->sb_loaded) | ||
363 | return 0; | ||
364 | |||
365 | |||
366 | if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) | ||
367 | goto fail; | ||
368 | rdev->sb_loaded = 1; | ||
369 | return 0; | ||
370 | |||
371 | fail: | ||
372 | printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", | ||
373 | bdevname(rdev->bdev,b)); | ||
374 | return -EINVAL; | ||
375 | } | ||
376 | |||
377 | static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) | ||
378 | { | ||
379 | if ( (sb1->set_uuid0 == sb2->set_uuid0) && | ||
380 | (sb1->set_uuid1 == sb2->set_uuid1) && | ||
381 | (sb1->set_uuid2 == sb2->set_uuid2) && | ||
382 | (sb1->set_uuid3 == sb2->set_uuid3)) | ||
383 | |||
384 | return 1; | ||
385 | |||
386 | return 0; | ||
387 | } | ||
388 | |||
389 | |||
390 | static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) | ||
391 | { | ||
392 | int ret; | ||
393 | mdp_super_t *tmp1, *tmp2; | ||
394 | |||
395 | tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); | ||
396 | tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); | ||
397 | |||
398 | if (!tmp1 || !tmp2) { | ||
399 | ret = 0; | ||
400 | printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); | ||
401 | goto abort; | ||
402 | } | ||
403 | |||
404 | *tmp1 = *sb1; | ||
405 | *tmp2 = *sb2; | ||
406 | |||
407 | /* | ||
408 | * nr_disks is not constant | ||
409 | */ | ||
410 | tmp1->nr_disks = 0; | ||
411 | tmp2->nr_disks = 0; | ||
412 | |||
413 | if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) | ||
414 | ret = 0; | ||
415 | else | ||
416 | ret = 1; | ||
417 | |||
418 | abort: | ||
419 | if (tmp1) | ||
420 | kfree(tmp1); | ||
421 | if (tmp2) | ||
422 | kfree(tmp2); | ||
423 | |||
424 | return ret; | ||
425 | } | ||
426 | |||
427 | static unsigned int calc_sb_csum(mdp_super_t * sb) | ||
428 | { | ||
429 | unsigned int disk_csum, csum; | ||
430 | |||
431 | disk_csum = sb->sb_csum; | ||
432 | sb->sb_csum = 0; | ||
433 | csum = csum_partial((void *)sb, MD_SB_BYTES, 0); | ||
434 | sb->sb_csum = disk_csum; | ||
435 | return csum; | ||
436 | } | ||
437 | |||
438 | |||
439 | /* | ||
440 | * Handle superblock details. | ||
441 | * We want to be able to handle multiple superblock formats | ||
442 | * so we have a common interface to them all, and an array of | ||
443 | * different handlers. | ||
444 | * We rely on user-space to write the initial superblock, and support | ||
445 | * reading and updating of superblocks. | ||
446 | * Interface methods are: | ||
447 | * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) | ||
448 | * loads and validates a superblock on dev. | ||
449 | * if refdev != NULL, compare superblocks on both devices | ||
450 | * Return: | ||
451 | * 0 - dev has a superblock that is compatible with refdev | ||
452 | * 1 - dev has a superblock that is compatible and newer than refdev | ||
453 | * so dev should be used as the refdev in future | ||
454 | * -EINVAL superblock incompatible or invalid | ||
455 | * -othererror e.g. -EIO | ||
456 | * | ||
457 | * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) | ||
458 | * Verify that dev is acceptable into mddev. | ||
459 | * The first time, mddev->raid_disks will be 0, and data from | ||
460 | * dev should be merged in. Subsequent calls check that dev | ||
461 | * is new enough. Return 0 or -EINVAL | ||
462 | * | ||
463 | * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) | ||
464 | * Update the superblock for rdev with data in mddev | ||
465 | * This does not write to disc. | ||
466 | * | ||
467 | */ | ||
468 | |||
469 | struct super_type { | ||
470 | char *name; | ||
471 | struct module *owner; | ||
472 | int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); | ||
473 | int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); | ||
474 | void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); | ||
475 | }; | ||
476 | |||
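The table binding these handlers to on-disk format numbers, super_types[], is defined after the two format implementations below; callers simply index it by the array's major_version, as sync_sbs() and analyze_sbs() do. A minimal sketch of that dispatch (example_validate is a hypothetical wrapper):

    /* Illustrative only: validate 'rdev' against 'mddev' using whichever
     * superblock format the array was assembled with (0 -> "0.90.0",
     * 1 -> "md-1").
     */
    static int example_validate(mddev_t *mddev, mdk_rdev_t *rdev)
    {
        return super_types[mddev->major_version].validate_super(mddev, rdev);
    }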
477 | /* | ||
478 | * load_super for 0.90.0 | ||
479 | */ | ||
480 | static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | ||
481 | { | ||
482 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; | ||
483 | mdp_super_t *sb; | ||
484 | int ret; | ||
485 | sector_t sb_offset; | ||
486 | |||
487 | /* | ||
488 | * Calculate the position of the superblock, | ||
489 | * it's at the end of the disk. | ||
490 | * | ||
491 | * It also happens to be a multiple of 4Kb. | ||
492 | */ | ||
493 | sb_offset = calc_dev_sboffset(rdev->bdev); | ||
494 | rdev->sb_offset = sb_offset; | ||
495 | |||
496 | ret = read_disk_sb(rdev); | ||
497 | if (ret) return ret; | ||
498 | |||
499 | ret = -EINVAL; | ||
500 | |||
501 | bdevname(rdev->bdev, b); | ||
502 | sb = (mdp_super_t*)page_address(rdev->sb_page); | ||
503 | |||
504 | if (sb->md_magic != MD_SB_MAGIC) { | ||
505 | printk(KERN_ERR "md: invalid raid superblock magic on %s\n", | ||
506 | b); | ||
507 | goto abort; | ||
508 | } | ||
509 | |||
510 | if (sb->major_version != 0 || | ||
511 | sb->minor_version != 90) { | ||
512 | printk(KERN_WARNING "Bad version number %d.%d on %s\n", | ||
513 | sb->major_version, sb->minor_version, | ||
514 | b); | ||
515 | goto abort; | ||
516 | } | ||
517 | |||
518 | if (sb->raid_disks <= 0) | ||
519 | goto abort; | ||
520 | |||
521 | if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { | ||
522 | printk(KERN_WARNING "md: invalid superblock checksum on %s\n", | ||
523 | b); | ||
524 | goto abort; | ||
525 | } | ||
526 | |||
527 | rdev->preferred_minor = sb->md_minor; | ||
528 | rdev->data_offset = 0; | ||
529 | |||
530 | if (sb->level == LEVEL_MULTIPATH) | ||
531 | rdev->desc_nr = -1; | ||
532 | else | ||
533 | rdev->desc_nr = sb->this_disk.number; | ||
534 | |||
535 | if (refdev == 0) | ||
536 | ret = 1; | ||
537 | else { | ||
538 | __u64 ev1, ev2; | ||
539 | mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); | ||
540 | if (!uuid_equal(refsb, sb)) { | ||
541 | printk(KERN_WARNING "md: %s has different UUID to %s\n", | ||
542 | b, bdevname(refdev->bdev,b2)); | ||
543 | goto abort; | ||
544 | } | ||
545 | if (!sb_equal(refsb, sb)) { | ||
546 | printk(KERN_WARNING "md: %s has same UUID" | ||
547 | " but different superblock to %s\n", | ||
548 | b, bdevname(refdev->bdev, b2)); | ||
549 | goto abort; | ||
550 | } | ||
551 | ev1 = md_event(sb); | ||
552 | ev2 = md_event(refsb); | ||
553 | if (ev1 > ev2) | ||
554 | ret = 1; | ||
555 | else | ||
556 | ret = 0; | ||
557 | } | ||
558 | rdev->size = calc_dev_size(rdev, sb->chunk_size); | ||
559 | |||
560 | abort: | ||
561 | return ret; | ||
562 | } | ||
563 | |||
564 | /* | ||
565 | * validate_super for 0.90.0 | ||
566 | */ | ||
567 | static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | ||
568 | { | ||
569 | mdp_disk_t *desc; | ||
570 | mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); | ||
571 | |||
572 | if (mddev->raid_disks == 0) { | ||
573 | mddev->major_version = 0; | ||
574 | mddev->minor_version = sb->minor_version; | ||
575 | mddev->patch_version = sb->patch_version; | ||
576 | mddev->persistent = ! sb->not_persistent; | ||
577 | mddev->chunk_size = sb->chunk_size; | ||
578 | mddev->ctime = sb->ctime; | ||
579 | mddev->utime = sb->utime; | ||
580 | mddev->level = sb->level; | ||
581 | mddev->layout = sb->layout; | ||
582 | mddev->raid_disks = sb->raid_disks; | ||
583 | mddev->size = sb->size; | ||
584 | mddev->events = md_event(sb); | ||
585 | |||
586 | if (sb->state & (1<<MD_SB_CLEAN)) | ||
587 | mddev->recovery_cp = MaxSector; | ||
588 | else { | ||
589 | if (sb->events_hi == sb->cp_events_hi && | ||
590 | sb->events_lo == sb->cp_events_lo) { | ||
591 | mddev->recovery_cp = sb->recovery_cp; | ||
592 | } else | ||
593 | mddev->recovery_cp = 0; | ||
594 | } | ||
595 | |||
596 | memcpy(mddev->uuid+0, &sb->set_uuid0, 4); | ||
597 | memcpy(mddev->uuid+4, &sb->set_uuid1, 4); | ||
598 | memcpy(mddev->uuid+8, &sb->set_uuid2, 4); | ||
599 | memcpy(mddev->uuid+12,&sb->set_uuid3, 4); | ||
600 | |||
601 | mddev->max_disks = MD_SB_DISKS; | ||
602 | } else { | ||
603 | __u64 ev1; | ||
604 | ev1 = md_event(sb); | ||
605 | ++ev1; | ||
606 | if (ev1 < mddev->events) | ||
607 | return -EINVAL; | ||
608 | } | ||
609 | if (mddev->level != LEVEL_MULTIPATH) { | ||
610 | rdev->raid_disk = -1; | ||
611 | rdev->in_sync = rdev->faulty = 0; | ||
612 | desc = sb->disks + rdev->desc_nr; | ||
613 | |||
614 | if (desc->state & (1<<MD_DISK_FAULTY)) | ||
615 | rdev->faulty = 1; | ||
616 | else if (desc->state & (1<<MD_DISK_SYNC) && | ||
617 | desc->raid_disk < mddev->raid_disks) { | ||
618 | rdev->in_sync = 1; | ||
619 | rdev->raid_disk = desc->raid_disk; | ||
620 | } | ||
621 | } | ||
622 | return 0; | ||
623 | } | ||
624 | |||
625 | /* | ||
626 | * sync_super for 0.90.0 | ||
627 | */ | ||
628 | static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | ||
629 | { | ||
630 | mdp_super_t *sb; | ||
631 | struct list_head *tmp; | ||
632 | mdk_rdev_t *rdev2; | ||
633 | int next_spare = mddev->raid_disks; | ||
634 | |||
635 | /* make rdev->sb match mddev data.. | ||
636 | * | ||
637 | * 1/ zero out disks | ||
638 | * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); | ||
639 | * 3/ any empty disks < next_spare become removed | ||
640 | * | ||
641 | * disks[0] gets initialised to REMOVED because | ||
642 | * we cannot be sure from other fields if it has | ||
643 | * been initialised or not. | ||
644 | */ | ||
645 | int i; | ||
646 | int active=0, working=0,failed=0,spare=0,nr_disks=0; | ||
647 | |||
648 | sb = (mdp_super_t*)page_address(rdev->sb_page); | ||
649 | |||
650 | memset(sb, 0, sizeof(*sb)); | ||
651 | |||
652 | sb->md_magic = MD_SB_MAGIC; | ||
653 | sb->major_version = mddev->major_version; | ||
654 | sb->minor_version = mddev->minor_version; | ||
655 | sb->patch_version = mddev->patch_version; | ||
656 | sb->gvalid_words = 0; /* ignored */ | ||
657 | memcpy(&sb->set_uuid0, mddev->uuid+0, 4); | ||
658 | memcpy(&sb->set_uuid1, mddev->uuid+4, 4); | ||
659 | memcpy(&sb->set_uuid2, mddev->uuid+8, 4); | ||
660 | memcpy(&sb->set_uuid3, mddev->uuid+12,4); | ||
661 | |||
662 | sb->ctime = mddev->ctime; | ||
663 | sb->level = mddev->level; | ||
664 | sb->size = mddev->size; | ||
665 | sb->raid_disks = mddev->raid_disks; | ||
666 | sb->md_minor = mddev->md_minor; | ||
667 | sb->not_persistent = !mddev->persistent; | ||
668 | sb->utime = mddev->utime; | ||
669 | sb->state = 0; | ||
670 | sb->events_hi = (mddev->events>>32); | ||
671 | sb->events_lo = (u32)mddev->events; | ||
672 | |||
673 | if (mddev->in_sync) | ||
674 | { | ||
675 | sb->recovery_cp = mddev->recovery_cp; | ||
676 | sb->cp_events_hi = (mddev->events>>32); | ||
677 | sb->cp_events_lo = (u32)mddev->events; | ||
678 | if (mddev->recovery_cp == MaxSector) | ||
679 | sb->state = (1<< MD_SB_CLEAN); | ||
680 | } else | ||
681 | sb->recovery_cp = 0; | ||
682 | |||
683 | sb->layout = mddev->layout; | ||
684 | sb->chunk_size = mddev->chunk_size; | ||
685 | |||
686 | sb->disks[0].state = (1<<MD_DISK_REMOVED); | ||
687 | ITERATE_RDEV(mddev,rdev2,tmp) { | ||
688 | mdp_disk_t *d; | ||
689 | if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) | ||
690 | rdev2->desc_nr = rdev2->raid_disk; | ||
691 | else | ||
692 | rdev2->desc_nr = next_spare++; | ||
693 | d = &sb->disks[rdev2->desc_nr]; | ||
694 | nr_disks++; | ||
695 | d->number = rdev2->desc_nr; | ||
696 | d->major = MAJOR(rdev2->bdev->bd_dev); | ||
697 | d->minor = MINOR(rdev2->bdev->bd_dev); | ||
698 | if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) | ||
699 | d->raid_disk = rdev2->raid_disk; | ||
700 | else | ||
701 | d->raid_disk = rdev2->desc_nr; /* compatibility */ | ||
702 | if (rdev2->faulty) { | ||
703 | d->state = (1<<MD_DISK_FAULTY); | ||
704 | failed++; | ||
705 | } else if (rdev2->in_sync) { | ||
706 | d->state = (1<<MD_DISK_ACTIVE); | ||
707 | d->state |= (1<<MD_DISK_SYNC); | ||
708 | active++; | ||
709 | working++; | ||
710 | } else { | ||
711 | d->state = 0; | ||
712 | spare++; | ||
713 | working++; | ||
714 | } | ||
715 | } | ||
716 | |||
717 | /* now set the "removed" and "faulty" bits on any missing devices */ | ||
718 | for (i=0 ; i < mddev->raid_disks ; i++) { | ||
719 | mdp_disk_t *d = &sb->disks[i]; | ||
720 | if (d->state == 0 && d->number == 0) { | ||
721 | d->number = i; | ||
722 | d->raid_disk = i; | ||
723 | d->state = (1<<MD_DISK_REMOVED); | ||
724 | d->state |= (1<<MD_DISK_FAULTY); | ||
725 | failed++; | ||
726 | } | ||
727 | } | ||
728 | sb->nr_disks = nr_disks; | ||
729 | sb->active_disks = active; | ||
730 | sb->working_disks = working; | ||
731 | sb->failed_disks = failed; | ||
732 | sb->spare_disks = spare; | ||
733 | |||
734 | sb->this_disk = sb->disks[rdev->desc_nr]; | ||
735 | sb->sb_csum = calc_sb_csum(sb); | ||
736 | } | ||
737 | |||
738 | /* | ||
739 | * version 1 superblock | ||
740 | */ | ||
741 | |||
742 | static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) | ||
743 | { | ||
744 | unsigned int disk_csum, csum; | ||
745 | unsigned long long newcsum; | ||
746 | int size = 256 + le32_to_cpu(sb->max_dev)*2; | ||
747 | unsigned int *isuper = (unsigned int*)sb; | ||
748 | int i; | ||
749 | |||
750 | disk_csum = sb->sb_csum; | ||
751 | sb->sb_csum = 0; | ||
752 | newcsum = 0; | ||
753 | for (i=0; size>=4; size -= 4 ) | ||
754 | newcsum += le32_to_cpu(*isuper++); | ||
755 | |||
756 | if (size == 2) | ||
757 | newcsum += le16_to_cpu(*(unsigned short*) isuper); | ||
758 | |||
759 | csum = (newcsum & 0xffffffff) + (newcsum >> 32); | ||
760 | sb->sb_csum = disk_csum; | ||
761 | return cpu_to_le32(csum); | ||
762 | } | ||
763 | |||
764 | static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | ||
765 | { | ||
766 | struct mdp_superblock_1 *sb; | ||
767 | int ret; | ||
768 | sector_t sb_offset; | ||
769 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; | ||
770 | |||
771 | /* | ||
772 | * Calculate the position of the superblock. | ||
773 | * It is always aligned to a 4K boundary and | ||
774 | * depending on minor_version, it can be: | ||
775 | * 0: At least 8K, but less than 12K, from end of device | ||
776 | * 1: At start of device | ||
777 | * 2: 4K from start of device. | ||
778 | */ | ||
779 | switch(minor_version) { | ||
780 | case 0: | ||
781 | sb_offset = rdev->bdev->bd_inode->i_size >> 9; | ||
782 | sb_offset -= 8*2; | ||
783 | sb_offset &= ~(4*2-1); | ||
784 | /* convert from sectors to K */ | ||
785 | sb_offset /= 2; | ||
786 | break; | ||
787 | case 1: | ||
788 | sb_offset = 0; | ||
789 | break; | ||
790 | case 2: | ||
791 | sb_offset = 4; | ||
792 | break; | ||
793 | default: | ||
794 | return -EINVAL; | ||
795 | } | ||
796 | rdev->sb_offset = sb_offset; | ||
797 | |||
798 | ret = read_disk_sb(rdev); | ||
799 | if (ret) return ret; | ||
800 | |||
801 | |||
802 | sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | ||
803 | |||
804 | if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || | ||
805 | sb->major_version != cpu_to_le32(1) || | ||
806 | le32_to_cpu(sb->max_dev) > (4096-256)/2 || | ||
807 | le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || | ||
808 | sb->feature_map != 0) | ||
809 | return -EINVAL; | ||
810 | |||
811 | if (calc_sb_1_csum(sb) != sb->sb_csum) { | ||
812 | printk("md: invalid superblock checksum on %s\n", | ||
813 | bdevname(rdev->bdev,b)); | ||
814 | return -EINVAL; | ||
815 | } | ||
816 | if (le64_to_cpu(sb->data_size) < 10) { | ||
817 | printk("md: data_size too small on %s\n", | ||
818 | bdevname(rdev->bdev,b)); | ||
819 | return -EINVAL; | ||
820 | } | ||
821 | rdev->preferred_minor = 0xffff; | ||
822 | rdev->data_offset = le64_to_cpu(sb->data_offset); | ||
823 | |||
824 | if (refdev == 0) | ||
825 | return 1; | ||
826 | else { | ||
827 | __u64 ev1, ev2; | ||
828 | struct mdp_superblock_1 *refsb = | ||
829 | (struct mdp_superblock_1*)page_address(refdev->sb_page); | ||
830 | |||
831 | if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || | ||
832 | sb->level != refsb->level || | ||
833 | sb->layout != refsb->layout || | ||
834 | sb->chunksize != refsb->chunksize) { | ||
835 | printk(KERN_WARNING "md: %s has strangely different" | ||
836 | " superblock to %s\n", | ||
837 | bdevname(rdev->bdev,b), | ||
838 | bdevname(refdev->bdev,b2)); | ||
839 | return -EINVAL; | ||
840 | } | ||
841 | ev1 = le64_to_cpu(sb->events); | ||
842 | ev2 = le64_to_cpu(refsb->events); | ||
843 | |||
844 | if (ev1 > ev2) | ||
845 | return 1; | ||
846 | } | ||
847 | if (minor_version) | ||
848 | rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; | ||
849 | else | ||
850 | rdev->size = rdev->sb_offset; | ||
851 | if (rdev->size < le64_to_cpu(sb->data_size)/2) | ||
852 | return -EINVAL; | ||
853 | rdev->size = le64_to_cpu(sb->data_size)/2; | ||
854 | if (le32_to_cpu(sb->chunksize)) | ||
855 | rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); | ||
856 | return 0; | ||
857 | } | ||
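For minor_version 0 the comment at the top of super_1_load puts the superblock in the last 8-12K of the device, rounded down to a 4K boundary; the arithmetic works in 512-byte sectors and the final division converts the result to the 1K units used for rdev->sb_offset. A stand-alone worked example with a hypothetical 8 GiB device:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long dev_bytes = 8ULL << 30;       /* hypothetical 8 GiB device */
        unsigned long long sb_offset = dev_bytes >> 9;   /* device size in 512-byte sectors */

        sb_offset -= 8 * 2;                              /* step back 8K (16 sectors) */
        sb_offset &= ~(4ULL * 2 - 1);                    /* round down to a 4K boundary */
        sb_offset /= 2;                                  /* convert sectors to K */

        printf("v1.0 superblock at %lluK\n", sb_offset);
        return 0;
    }

This prints 8388600K for an 8388608K device, i.e. the superblock sits 8K from the end, inside the 8-12K window the comment describes.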
858 | |||
859 | static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | ||
860 | { | ||
861 | struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | ||
862 | |||
863 | if (mddev->raid_disks == 0) { | ||
864 | mddev->major_version = 1; | ||
865 | mddev->patch_version = 0; | ||
866 | mddev->persistent = 1; | ||
867 | mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; | ||
868 | mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); | ||
869 | mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); | ||
870 | mddev->level = le32_to_cpu(sb->level); | ||
871 | mddev->layout = le32_to_cpu(sb->layout); | ||
872 | mddev->raid_disks = le32_to_cpu(sb->raid_disks); | ||
873 | mddev->size = le64_to_cpu(sb->size)/2; | ||
874 | mddev->events = le64_to_cpu(sb->events); | ||
875 | |||
876 | mddev->recovery_cp = le64_to_cpu(sb->resync_offset); | ||
877 | memcpy(mddev->uuid, sb->set_uuid, 16); | ||
878 | |||
879 | mddev->max_disks = (4096-256)/2; | ||
880 | } else { | ||
881 | __u64 ev1; | ||
882 | ev1 = le64_to_cpu(sb->events); | ||
883 | ++ev1; | ||
884 | if (ev1 < mddev->events) | ||
885 | return -EINVAL; | ||
886 | } | ||
887 | |||
888 | if (mddev->level != LEVEL_MULTIPATH) { | ||
889 | int role; | ||
890 | rdev->desc_nr = le32_to_cpu(sb->dev_number); | ||
891 | role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); | ||
892 | switch(role) { | ||
893 | case 0xffff: /* spare */ | ||
894 | rdev->in_sync = 0; | ||
895 | rdev->faulty = 0; | ||
896 | rdev->raid_disk = -1; | ||
897 | break; | ||
898 | case 0xfffe: /* faulty */ | ||
899 | rdev->in_sync = 0; | ||
900 | rdev->faulty = 1; | ||
901 | rdev->raid_disk = -1; | ||
902 | break; | ||
903 | default: | ||
904 | rdev->in_sync = 1; | ||
905 | rdev->faulty = 0; | ||
906 | rdev->raid_disk = role; | ||
907 | break; | ||
908 | } | ||
909 | } | ||
910 | return 0; | ||
911 | } | ||
912 | |||
913 | static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | ||
914 | { | ||
915 | struct mdp_superblock_1 *sb; | ||
916 | struct list_head *tmp; | ||
917 | mdk_rdev_t *rdev2; | ||
918 | int max_dev, i; | ||
919 | /* make rdev->sb match mddev and rdev data. */ | ||
920 | |||
921 | sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | ||
922 | |||
923 | sb->feature_map = 0; | ||
924 | sb->pad0 = 0; | ||
925 | memset(sb->pad1, 0, sizeof(sb->pad1)); | ||
926 | memset(sb->pad2, 0, sizeof(sb->pad2)); | ||
927 | memset(sb->pad3, 0, sizeof(sb->pad3)); | ||
928 | |||
929 | sb->utime = cpu_to_le64((__u64)mddev->utime); | ||
930 | sb->events = cpu_to_le64(mddev->events); | ||
931 | if (mddev->in_sync) | ||
932 | sb->resync_offset = cpu_to_le64(mddev->recovery_cp); | ||
933 | else | ||
934 | sb->resync_offset = cpu_to_le64(0); | ||
935 | |||
936 | max_dev = 0; | ||
937 | ITERATE_RDEV(mddev,rdev2,tmp) | ||
938 | if (rdev2->desc_nr+1 > max_dev) | ||
939 | max_dev = rdev2->desc_nr+1; | ||
940 | |||
941 | sb->max_dev = cpu_to_le32(max_dev); | ||
942 | for (i=0; i<max_dev;i++) | ||
943 | sb->dev_roles[i] = cpu_to_le16(0xfffe); | ||
944 | |||
945 | ITERATE_RDEV(mddev,rdev2,tmp) { | ||
946 | i = rdev2->desc_nr; | ||
947 | if (rdev2->faulty) | ||
948 | sb->dev_roles[i] = cpu_to_le16(0xfffe); | ||
949 | else if (rdev2->in_sync) | ||
950 | sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); | ||
951 | else | ||
952 | sb->dev_roles[i] = cpu_to_le16(0xffff); | ||
953 | } | ||
954 | |||
955 | sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ | ||
956 | sb->sb_csum = calc_sb_1_csum(sb); | ||
957 | } | ||
958 | |||
959 | |||
960 | struct super_type super_types[] = { | ||
961 | [0] = { | ||
962 | .name = "0.90.0", | ||
963 | .owner = THIS_MODULE, | ||
964 | .load_super = super_90_load, | ||
965 | .validate_super = super_90_validate, | ||
966 | .sync_super = super_90_sync, | ||
967 | }, | ||
968 | [1] = { | ||
969 | .name = "md-1", | ||
970 | .owner = THIS_MODULE, | ||
971 | .load_super = super_1_load, | ||
972 | .validate_super = super_1_validate, | ||
973 | .sync_super = super_1_sync, | ||
974 | }, | ||
975 | }; | ||
976 | |||
977 | static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) | ||
978 | { | ||
979 | struct list_head *tmp; | ||
980 | mdk_rdev_t *rdev; | ||
981 | |||
982 | ITERATE_RDEV(mddev,rdev,tmp) | ||
983 | if (rdev->bdev->bd_contains == dev->bdev->bd_contains) | ||
984 | return rdev; | ||
985 | |||
986 | return NULL; | ||
987 | } | ||
988 | |||
989 | static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) | ||
990 | { | ||
991 | struct list_head *tmp; | ||
992 | mdk_rdev_t *rdev; | ||
993 | |||
994 | ITERATE_RDEV(mddev1,rdev,tmp) | ||
995 | if (match_dev_unit(mddev2, rdev)) | ||
996 | return 1; | ||
997 | |||
998 | return 0; | ||
999 | } | ||
1000 | |||
1001 | static LIST_HEAD(pending_raid_disks); | ||
1002 | |||
1003 | static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | ||
1004 | { | ||
1005 | mdk_rdev_t *same_pdev; | ||
1006 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; | ||
1007 | |||
1008 | if (rdev->mddev) { | ||
1009 | MD_BUG(); | ||
1010 | return -EINVAL; | ||
1011 | } | ||
1012 | same_pdev = match_dev_unit(mddev, rdev); | ||
1013 | if (same_pdev) | ||
1014 | printk(KERN_WARNING | ||
1015 | "%s: WARNING: %s appears to be on the same physical" | ||
1016 | " disk as %s. True\n protection against single-disk" | ||
1017 | " failure might be compromised.\n", | ||
1018 | mdname(mddev), bdevname(rdev->bdev,b), | ||
1019 | bdevname(same_pdev->bdev,b2)); | ||
1020 | |||
1021 | /* Verify rdev->desc_nr is unique. | ||
1022 | * If it is -1, assign a free number, else | ||
1023 | * check number is not in use | ||
1024 | */ | ||
1025 | if (rdev->desc_nr < 0) { | ||
1026 | int choice = 0; | ||
1027 | if (mddev->pers) choice = mddev->raid_disks; | ||
1028 | while (find_rdev_nr(mddev, choice)) | ||
1029 | choice++; | ||
1030 | rdev->desc_nr = choice; | ||
1031 | } else { | ||
1032 | if (find_rdev_nr(mddev, rdev->desc_nr)) | ||
1033 | return -EBUSY; | ||
1034 | } | ||
1035 | |||
1036 | list_add(&rdev->same_set, &mddev->disks); | ||
1037 | rdev->mddev = mddev; | ||
1038 | printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b)); | ||
1039 | return 0; | ||
1040 | } | ||
1041 | |||
1042 | static void unbind_rdev_from_array(mdk_rdev_t * rdev) | ||
1043 | { | ||
1044 | char b[BDEVNAME_SIZE]; | ||
1045 | if (!rdev->mddev) { | ||
1046 | MD_BUG(); | ||
1047 | return; | ||
1048 | } | ||
1049 | list_del_init(&rdev->same_set); | ||
1050 | printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); | ||
1051 | rdev->mddev = NULL; | ||
1052 | } | ||
1053 | |||
1054 | /* | ||
1055 | * prevent the device from being mounted, repartitioned or | ||
1056 | * otherwise reused by a RAID array (or any other kernel | ||
1057 | * subsystem), by bd_claiming the device. | ||
1058 | */ | ||
1059 | static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) | ||
1060 | { | ||
1061 | int err = 0; | ||
1062 | struct block_device *bdev; | ||
1063 | char b[BDEVNAME_SIZE]; | ||
1064 | |||
1065 | bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); | ||
1066 | if (IS_ERR(bdev)) { | ||
1067 | printk(KERN_ERR "md: could not open %s.\n", | ||
1068 | __bdevname(dev, b)); | ||
1069 | return PTR_ERR(bdev); | ||
1070 | } | ||
1071 | err = bd_claim(bdev, rdev); | ||
1072 | if (err) { | ||
1073 | printk(KERN_ERR "md: could not bd_claim %s.\n", | ||
1074 | bdevname(bdev, b)); | ||
1075 | blkdev_put(bdev); | ||
1076 | return err; | ||
1077 | } | ||
1078 | rdev->bdev = bdev; | ||
1079 | return err; | ||
1080 | } | ||
1081 | |||
1082 | static void unlock_rdev(mdk_rdev_t *rdev) | ||
1083 | { | ||
1084 | struct block_device *bdev = rdev->bdev; | ||
1085 | rdev->bdev = NULL; | ||
1086 | if (!bdev) | ||
1087 | MD_BUG(); | ||
1088 | bd_release(bdev); | ||
1089 | blkdev_put(bdev); | ||
1090 | } | ||
1091 | |||
1092 | void md_autodetect_dev(dev_t dev); | ||
1093 | |||
1094 | static void export_rdev(mdk_rdev_t * rdev) | ||
1095 | { | ||
1096 | char b[BDEVNAME_SIZE]; | ||
1097 | printk(KERN_INFO "md: export_rdev(%s)\n", | ||
1098 | bdevname(rdev->bdev,b)); | ||
1099 | if (rdev->mddev) | ||
1100 | MD_BUG(); | ||
1101 | free_disk_sb(rdev); | ||
1102 | list_del_init(&rdev->same_set); | ||
1103 | #ifndef MODULE | ||
1104 | md_autodetect_dev(rdev->bdev->bd_dev); | ||
1105 | #endif | ||
1106 | unlock_rdev(rdev); | ||
1107 | kfree(rdev); | ||
1108 | } | ||
1109 | |||
1110 | static void kick_rdev_from_array(mdk_rdev_t * rdev) | ||
1111 | { | ||
1112 | unbind_rdev_from_array(rdev); | ||
1113 | export_rdev(rdev); | ||
1114 | } | ||
1115 | |||
1116 | static void export_array(mddev_t *mddev) | ||
1117 | { | ||
1118 | struct list_head *tmp; | ||
1119 | mdk_rdev_t *rdev; | ||
1120 | |||
1121 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1122 | if (!rdev->mddev) { | ||
1123 | MD_BUG(); | ||
1124 | continue; | ||
1125 | } | ||
1126 | kick_rdev_from_array(rdev); | ||
1127 | } | ||
1128 | if (!list_empty(&mddev->disks)) | ||
1129 | MD_BUG(); | ||
1130 | mddev->raid_disks = 0; | ||
1131 | mddev->major_version = 0; | ||
1132 | } | ||
1133 | |||
1134 | static void print_desc(mdp_disk_t *desc) | ||
1135 | { | ||
1136 | printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, | ||
1137 | desc->major,desc->minor,desc->raid_disk,desc->state); | ||
1138 | } | ||
1139 | |||
1140 | static void print_sb(mdp_super_t *sb) | ||
1141 | { | ||
1142 | int i; | ||
1143 | |||
1144 | printk(KERN_INFO | ||
1145 | "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", | ||
1146 | sb->major_version, sb->minor_version, sb->patch_version, | ||
1147 | sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, | ||
1148 | sb->ctime); | ||
1149 | printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", | ||
1150 | sb->level, sb->size, sb->nr_disks, sb->raid_disks, | ||
1151 | sb->md_minor, sb->layout, sb->chunk_size); | ||
1152 | printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" | ||
1153 | " FD:%d SD:%d CSUM:%08x E:%08lx\n", | ||
1154 | sb->utime, sb->state, sb->active_disks, sb->working_disks, | ||
1155 | sb->failed_disks, sb->spare_disks, | ||
1156 | sb->sb_csum, (unsigned long)sb->events_lo); | ||
1157 | |||
1158 | printk(KERN_INFO); | ||
1159 | for (i = 0; i < MD_SB_DISKS; i++) { | ||
1160 | mdp_disk_t *desc; | ||
1161 | |||
1162 | desc = sb->disks + i; | ||
1163 | if (desc->number || desc->major || desc->minor || | ||
1164 | desc->raid_disk || (desc->state && (desc->state != 4))) { | ||
1165 | printk(" D %2d: ", i); | ||
1166 | print_desc(desc); | ||
1167 | } | ||
1168 | } | ||
1169 | printk(KERN_INFO "md: THIS: "); | ||
1170 | print_desc(&sb->this_disk); | ||
1171 | |||
1172 | } | ||
1173 | |||
1174 | static void print_rdev(mdk_rdev_t *rdev) | ||
1175 | { | ||
1176 | char b[BDEVNAME_SIZE]; | ||
1177 | printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", | ||
1178 | bdevname(rdev->bdev,b), (unsigned long long)rdev->size, | ||
1179 | rdev->faulty, rdev->in_sync, rdev->desc_nr); | ||
1180 | if (rdev->sb_loaded) { | ||
1181 | printk(KERN_INFO "md: rdev superblock:\n"); | ||
1182 | print_sb((mdp_super_t*)page_address(rdev->sb_page)); | ||
1183 | } else | ||
1184 | printk(KERN_INFO "md: no rdev superblock!\n"); | ||
1185 | } | ||
1186 | |||
1187 | void md_print_devices(void) | ||
1188 | { | ||
1189 | struct list_head *tmp, *tmp2; | ||
1190 | mdk_rdev_t *rdev; | ||
1191 | mddev_t *mddev; | ||
1192 | char b[BDEVNAME_SIZE]; | ||
1193 | |||
1194 | printk("\n"); | ||
1195 | printk("md: **********************************\n"); | ||
1196 | printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); | ||
1197 | printk("md: **********************************\n"); | ||
1198 | ITERATE_MDDEV(mddev,tmp) { | ||
1199 | printk("%s: ", mdname(mddev)); | ||
1200 | |||
1201 | ITERATE_RDEV(mddev,rdev,tmp2) | ||
1202 | printk("<%s>", bdevname(rdev->bdev,b)); | ||
1203 | printk("\n"); | ||
1204 | |||
1205 | ITERATE_RDEV(mddev,rdev,tmp2) | ||
1206 | print_rdev(rdev); | ||
1207 | } | ||
1208 | printk("md: **********************************\n"); | ||
1209 | printk("\n"); | ||
1210 | } | ||
1211 | |||
1212 | |||
1213 | static int write_disk_sb(mdk_rdev_t * rdev) | ||
1214 | { | ||
1215 | char b[BDEVNAME_SIZE]; | ||
1216 | if (!rdev->sb_loaded) { | ||
1217 | MD_BUG(); | ||
1218 | return 1; | ||
1219 | } | ||
1220 | if (rdev->faulty) { | ||
1221 | MD_BUG(); | ||
1222 | return 1; | ||
1223 | } | ||
1224 | |||
1225 | dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", | ||
1226 | bdevname(rdev->bdev,b), | ||
1227 | (unsigned long long)rdev->sb_offset); | ||
1228 | |||
1229 | if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) | ||
1230 | return 0; | ||
1231 | |||
1232 | printk("md: write_disk_sb failed for device %s\n", | ||
1233 | bdevname(rdev->bdev,b)); | ||
1234 | return 1; | ||
1235 | } | ||
1236 | |||
1237 | static void sync_sbs(mddev_t * mddev) | ||
1238 | { | ||
1239 | mdk_rdev_t *rdev; | ||
1240 | struct list_head *tmp; | ||
1241 | |||
1242 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1243 | super_types[mddev->major_version]. | ||
1244 | sync_super(mddev, rdev); | ||
1245 | rdev->sb_loaded = 1; | ||
1246 | } | ||
1247 | } | ||
1248 | |||
1249 | static void md_update_sb(mddev_t * mddev) | ||
1250 | { | ||
1251 | int err, count = 100; | ||
1252 | struct list_head *tmp; | ||
1253 | mdk_rdev_t *rdev; | ||
1254 | |||
1255 | mddev->sb_dirty = 0; | ||
1256 | repeat: | ||
1257 | mddev->utime = get_seconds(); | ||
1258 | mddev->events ++; | ||
1259 | |||
1260 | if (!mddev->events) { | ||
1261 | /* | ||
1262 | * oops, this 64-bit counter should never wrap. | ||
1263 | * Either we are in around ~1 trillion A.C., assuming | ||
1264 | * 1 reboot per second, or we have a bug: | ||
1265 | */ | ||
1266 | MD_BUG(); | ||
1267 | mddev->events --; | ||
1268 | } | ||
1269 | sync_sbs(mddev); | ||
1270 | |||
1271 | /* | ||
1272 | * do not write anything to disk if using | ||
1273 | * nonpersistent superblocks | ||
1274 | */ | ||
1275 | if (!mddev->persistent) | ||
1276 | return; | ||
1277 | |||
1278 | dprintk(KERN_INFO | ||
1279 | "md: updating %s RAID superblock on device (in sync %d)\n", | ||
1280 | mdname(mddev),mddev->in_sync); | ||
1281 | |||
1282 | err = 0; | ||
1283 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1284 | char b[BDEVNAME_SIZE]; | ||
1285 | dprintk(KERN_INFO "md: "); | ||
1286 | if (rdev->faulty) | ||
1287 | dprintk("(skipping faulty "); | ||
1288 | |||
1289 | dprintk("%s ", bdevname(rdev->bdev,b)); | ||
1290 | if (!rdev->faulty) { | ||
1291 | err += write_disk_sb(rdev); | ||
1292 | } else | ||
1293 | dprintk(")\n"); | ||
1294 | if (!err && mddev->level == LEVEL_MULTIPATH) | ||
1295 | /* only need to write one superblock... */ | ||
1296 | break; | ||
1297 | } | ||
1298 | if (err) { | ||
1299 | if (--count) { | ||
1300 | printk(KERN_ERR "md: errors occurred during superblock" | ||
1301 | " update, repeating\n"); | ||
1302 | goto repeat; | ||
1303 | } | ||
1304 | printk(KERN_ERR \ | ||
1305 | "md: excessive errors occurred during superblock update, exiting\n"); | ||
1306 | } | ||
1307 | } | ||
1308 | |||
1309 | /* | ||
1310 | * Import a device. If 'super_format' >= 0, then sanity check the superblock | ||
1311 | * | ||
1312 | * mark the device faulty if: | ||
1313 | * | ||
1314 | * - the device is nonexistent (zero size) | ||
1315 | * - the device has no valid superblock | ||
1316 | * | ||
1317 | * a faulty rdev _never_ has rdev->sb set. | ||
1318 | */ | ||
1319 | static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) | ||
1320 | { | ||
1321 | char b[BDEVNAME_SIZE]; | ||
1322 | int err; | ||
1323 | mdk_rdev_t *rdev; | ||
1324 | sector_t size; | ||
1325 | |||
1326 | rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); | ||
1327 | if (!rdev) { | ||
1328 | printk(KERN_ERR "md: could not alloc mem for new device!\n"); | ||
1329 | return ERR_PTR(-ENOMEM); | ||
1330 | } | ||
1331 | memset(rdev, 0, sizeof(*rdev)); | ||
1332 | |||
1333 | if ((err = alloc_disk_sb(rdev))) | ||
1334 | goto abort_free; | ||
1335 | |||
1336 | err = lock_rdev(rdev, newdev); | ||
1337 | if (err) | ||
1338 | goto abort_free; | ||
1339 | |||
1340 | rdev->desc_nr = -1; | ||
1341 | rdev->faulty = 0; | ||
1342 | rdev->in_sync = 0; | ||
1343 | rdev->data_offset = 0; | ||
1344 | atomic_set(&rdev->nr_pending, 0); | ||
1345 | |||
1346 | size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; | ||
1347 | if (!size) { | ||
1348 | printk(KERN_WARNING | ||
1349 | "md: %s has zero or unknown size, marking faulty!\n", | ||
1350 | bdevname(rdev->bdev,b)); | ||
1351 | err = -EINVAL; | ||
1352 | goto abort_free; | ||
1353 | } | ||
1354 | |||
1355 | if (super_format >= 0) { | ||
1356 | err = super_types[super_format]. | ||
1357 | load_super(rdev, NULL, super_minor); | ||
1358 | if (err == -EINVAL) { | ||
1359 | printk(KERN_WARNING | ||
1360 | "md: %s has invalid sb, not importing!\n", | ||
1361 | bdevname(rdev->bdev,b)); | ||
1362 | goto abort_free; | ||
1363 | } | ||
1364 | if (err < 0) { | ||
1365 | printk(KERN_WARNING | ||
1366 | "md: could not read %s's sb, not importing!\n", | ||
1367 | bdevname(rdev->bdev,b)); | ||
1368 | goto abort_free; | ||
1369 | } | ||
1370 | } | ||
1371 | INIT_LIST_HEAD(&rdev->same_set); | ||
1372 | |||
1373 | return rdev; | ||
1374 | |||
1375 | abort_free: | ||
1376 | if (rdev->sb_page) { | ||
1377 | if (rdev->bdev) | ||
1378 | unlock_rdev(rdev); | ||
1379 | free_disk_sb(rdev); | ||
1380 | } | ||
1381 | kfree(rdev); | ||
1382 | return ERR_PTR(err); | ||
1383 | } | ||
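md_import_device() never returns NULL: callers get either a locked, superblock-loaded rdev or an ERR_PTR-encoded errno, so the usual pattern is IS_ERR()/PTR_ERR(). A hedged sketch of a caller (example_import is hypothetical; super_format 0 selects the 0.90 handler, which ignores the minor argument):

    /* Illustrative only: import one component device and report failure.
     * Real callers go on to bind_rdev_to_array() or add the rdev to
     * pending_raid_disks.
     */
    static int example_import(dev_t dev)
    {
        mdk_rdev_t *rdev = md_import_device(dev, 0, 0);

        if (IS_ERR(rdev)) {
            printk(KERN_WARNING "md: import of %u:%u failed (%ld)\n",
                   MAJOR(dev), MINOR(dev), PTR_ERR(rdev));
            return PTR_ERR(rdev);
        }
        /* ... use rdev ... */
        return 0;
    }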
1384 | |||
1385 | /* | ||
1386 | * Check a full RAID array for plausibility | ||
1387 | */ | ||
1388 | |||
1389 | |||
1390 | static int analyze_sbs(mddev_t * mddev) | ||
1391 | { | ||
1392 | int i; | ||
1393 | struct list_head *tmp; | ||
1394 | mdk_rdev_t *rdev, *freshest; | ||
1395 | char b[BDEVNAME_SIZE]; | ||
1396 | |||
1397 | freshest = NULL; | ||
1398 | ITERATE_RDEV(mddev,rdev,tmp) | ||
1399 | switch (super_types[mddev->major_version]. | ||
1400 | load_super(rdev, freshest, mddev->minor_version)) { | ||
1401 | case 1: | ||
1402 | freshest = rdev; | ||
1403 | break; | ||
1404 | case 0: | ||
1405 | break; | ||
1406 | default: | ||
1407 | printk( KERN_ERR \ | ||
1408 | "md: fatal superblock inconsistency in %s" | ||
1409 | " -- removing from array\n", | ||
1410 | bdevname(rdev->bdev,b)); | ||
1411 | kick_rdev_from_array(rdev); | ||
1412 | } | ||
1413 | |||
1414 | |||
1415 | super_types[mddev->major_version]. | ||
1416 | validate_super(mddev, freshest); | ||
1417 | |||
1418 | i = 0; | ||
1419 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1420 | if (rdev != freshest) | ||
1421 | if (super_types[mddev->major_version]. | ||
1422 | validate_super(mddev, rdev)) { | ||
1423 | printk(KERN_WARNING "md: kicking non-fresh %s" | ||
1424 | " from array!\n", | ||
1425 | bdevname(rdev->bdev,b)); | ||
1426 | kick_rdev_from_array(rdev); | ||
1427 | continue; | ||
1428 | } | ||
1429 | if (mddev->level == LEVEL_MULTIPATH) { | ||
1430 | rdev->desc_nr = i++; | ||
1431 | rdev->raid_disk = rdev->desc_nr; | ||
1432 | rdev->in_sync = 1; | ||
1433 | } | ||
1434 | } | ||
1435 | |||
1436 | |||
1437 | |||
1438 | if (mddev->recovery_cp != MaxSector && | ||
1439 | mddev->level >= 1) | ||
1440 | printk(KERN_ERR "md: %s: raid array is not clean" | ||
1441 | " -- starting background reconstruction\n", | ||
1442 | mdname(mddev)); | ||
1443 | |||
1444 | return 0; | ||
1445 | } | ||
1446 | |||
1447 | int mdp_major = 0; | ||
1448 | |||
1449 | static struct kobject *md_probe(dev_t dev, int *part, void *data) | ||
1450 | { | ||
1451 | static DECLARE_MUTEX(disks_sem); | ||
1452 | mddev_t *mddev = mddev_find(dev); | ||
1453 | struct gendisk *disk; | ||
1454 | int partitioned = (MAJOR(dev) != MD_MAJOR); | ||
1455 | int shift = partitioned ? MdpMinorShift : 0; | ||
1456 | int unit = MINOR(dev) >> shift; | ||
1457 | |||
1458 | if (!mddev) | ||
1459 | return NULL; | ||
1460 | |||
1461 | down(&disks_sem); | ||
1462 | if (mddev->gendisk) { | ||
1463 | up(&disks_sem); | ||
1464 | mddev_put(mddev); | ||
1465 | return NULL; | ||
1466 | } | ||
1467 | disk = alloc_disk(1 << shift); | ||
1468 | if (!disk) { | ||
1469 | up(&disks_sem); | ||
1470 | mddev_put(mddev); | ||
1471 | return NULL; | ||
1472 | } | ||
1473 | disk->major = MAJOR(dev); | ||
1474 | disk->first_minor = unit << shift; | ||
1475 | if (partitioned) { | ||
1476 | sprintf(disk->disk_name, "md_d%d", unit); | ||
1477 | sprintf(disk->devfs_name, "md/d%d", unit); | ||
1478 | } else { | ||
1479 | sprintf(disk->disk_name, "md%d", unit); | ||
1480 | sprintf(disk->devfs_name, "md/%d", unit); | ||
1481 | } | ||
1482 | disk->fops = &md_fops; | ||
1483 | disk->private_data = mddev; | ||
1484 | disk->queue = mddev->queue; | ||
1485 | add_disk(disk); | ||
1486 | mddev->gendisk = disk; | ||
1487 | up(&disks_sem); | ||
1488 | return NULL; | ||
1489 | } | ||
1490 | |||
1491 | void md_wakeup_thread(mdk_thread_t *thread); | ||
1492 | |||
1493 | static void md_safemode_timeout(unsigned long data) | ||
1494 | { | ||
1495 | mddev_t *mddev = (mddev_t *) data; | ||
1496 | |||
1497 | mddev->safemode = 1; | ||
1498 | md_wakeup_thread(mddev->thread); | ||
1499 | } | ||
1500 | |||
1501 | |||
1502 | static int do_md_run(mddev_t * mddev) | ||
1503 | { | ||
1504 | int pnum, err; | ||
1505 | int chunk_size; | ||
1506 | struct list_head *tmp; | ||
1507 | mdk_rdev_t *rdev; | ||
1508 | struct gendisk *disk; | ||
1509 | char b[BDEVNAME_SIZE]; | ||
1510 | |||
1511 | if (list_empty(&mddev->disks)) { | ||
1512 | MD_BUG(); | ||
1513 | return -EINVAL; | ||
1514 | } | ||
1515 | |||
1516 | if (mddev->pers) | ||
1517 | return -EBUSY; | ||
1518 | |||
1519 | /* | ||
1520 | * Analyze all RAID superblock(s) | ||
1521 | */ | ||
1522 | if (!mddev->raid_disks && analyze_sbs(mddev)) { | ||
1523 | MD_BUG(); | ||
1524 | return -EINVAL; | ||
1525 | } | ||
1526 | |||
1527 | chunk_size = mddev->chunk_size; | ||
1528 | pnum = level_to_pers(mddev->level); | ||
1529 | |||
1530 | if ((pnum != MULTIPATH) && (pnum != RAID1)) { | ||
1531 | if (!chunk_size) { | ||
1532 | /* | ||
1533 | * 'default chunksize' in the old md code used to | ||
1534 | * be PAGE_SIZE, baaad. | ||
1535 | * we abort here to be on the safe side. We don't | ||
1536 | * want to continue the bad practice. | ||
1537 | */ | ||
1538 | printk(KERN_ERR | ||
1539 | "no chunksize specified, see 'man raidtab'\n"); | ||
1540 | return -EINVAL; | ||
1541 | } | ||
1542 | if (chunk_size > MAX_CHUNK_SIZE) { | ||
1543 | printk(KERN_ERR "too big chunk_size: %d > %d\n", | ||
1544 | chunk_size, MAX_CHUNK_SIZE); | ||
1545 | return -EINVAL; | ||
1546 | } | ||
1547 | /* | ||
1548 | * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE | ||
1549 | */ | ||
1550 | if ( (1 << ffz(~chunk_size)) != chunk_size) { | ||
1551 | MD_BUG(); | ||
1552 | return -EINVAL; | ||
1553 | } | ||
1554 | if (chunk_size < PAGE_SIZE) { | ||
1555 | printk(KERN_ERR "too small chunk_size: %d < %ld\n", | ||
1556 | chunk_size, PAGE_SIZE); | ||
1557 | return -EINVAL; | ||
1558 | } | ||
1559 | |||
1560 | /* devices must have minimum size of one chunk */ | ||
1561 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1562 | if (rdev->faulty) | ||
1563 | continue; | ||
1564 | if (rdev->size < chunk_size / 1024) { | ||
1565 | printk(KERN_WARNING | ||
1566 | "md: Dev %s smaller than chunk_size:" | ||
1567 | " %lluk < %dk\n", | ||
1568 | bdevname(rdev->bdev,b), | ||
1569 | (unsigned long long)rdev->size, | ||
1570 | chunk_size / 1024); | ||
1571 | return -EINVAL; | ||
1572 | } | ||
1573 | } | ||
1574 | } | ||
1575 | |||
1576 | if (pnum >= MAX_PERSONALITY) { | ||
1577 | MD_BUG(); | ||
1578 | return -EINVAL; | ||
1579 | } | ||
1580 | |||
1581 | #ifdef CONFIG_KMOD | ||
1582 | if (!pers[pnum]) | ||
1583 | { | ||
1584 | request_module("md-personality-%d", pnum); | ||
1585 | } | ||
1586 | #endif | ||
1587 | |||
1588 | /* | ||
1589 | * Drop all container device buffers, from now on | ||
1590 | * the only valid external interface is through the md | ||
1591 | * device. | ||
1592 | * Also find largest hardsector size | ||
1593 | */ | ||
1594 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1595 | if (rdev->faulty) | ||
1596 | continue; | ||
1597 | sync_blockdev(rdev->bdev); | ||
1598 | invalidate_bdev(rdev->bdev, 0); | ||
1599 | } | ||
1600 | |||
1601 | md_probe(mddev->unit, NULL, NULL); | ||
1602 | disk = mddev->gendisk; | ||
1603 | if (!disk) | ||
1604 | return -ENOMEM; | ||
1605 | |||
1606 | spin_lock(&pers_lock); | ||
1607 | if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { | ||
1608 | spin_unlock(&pers_lock); | ||
1609 | printk(KERN_WARNING "md: personality %d is not loaded!\n", | ||
1610 | pnum); | ||
1611 | return -EINVAL; | ||
1612 | } | ||
1613 | |||
1614 | mddev->pers = pers[pnum]; | ||
1615 | spin_unlock(&pers_lock); | ||
1616 | |||
1617 | mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ | ||
1618 | |||
1619 | err = mddev->pers->run(mddev); | ||
1620 | if (err) { | ||
1621 | printk(KERN_ERR "md: pers->run() failed ...\n"); | ||
1622 | module_put(mddev->pers->owner); | ||
1623 | mddev->pers = NULL; | ||
1624 | return -EINVAL; | ||
1625 | } | ||
1626 | atomic_set(&mddev->writes_pending,0); | ||
1627 | mddev->safemode = 0; | ||
1628 | mddev->safemode_timer.function = md_safemode_timeout; | ||
1629 | mddev->safemode_timer.data = (unsigned long) mddev; | ||
1630 | mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ | ||
1631 | mddev->in_sync = 1; | ||
1632 | |||
1633 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
1634 | |||
1635 | if (mddev->sb_dirty) | ||
1636 | md_update_sb(mddev); | ||
1637 | |||
1638 | set_capacity(disk, mddev->array_size<<1); | ||
1639 | |||
1640 | /* If we call blk_queue_make_request here, it will | ||
1641 | * re-initialise max_sectors etc which may have been | ||
1642 | * refined inside ->run(). So just set the bits we need to set. | ||
1643 | * Most initialisation happened when we called | ||
1644 | * blk_queue_make_request(..., md_fail_request) | ||
1645 | * earlier. | ||
1646 | */ | ||
1647 | mddev->queue->queuedata = mddev; | ||
1648 | mddev->queue->make_request_fn = mddev->pers->make_request; | ||
1649 | |||
1650 | mddev->changed = 1; | ||
1651 | return 0; | ||
1652 | } | ||
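The power-of-two check in do_md_run above, (1 << ffz(~chunk_size)) != chunk_size, works because ffz(~x) is the index of the lowest set bit of x: shifting 1 by that index reconstructs the lowest set bit, and that equals x only when x is a power of two. A stand-alone sketch of the same test, using GCC's __builtin_ctz in place of the kernel's ffz(~x):

    #include <stdio.h>

    /* x is a power of two iff its lowest set bit equals x itself. */
    static int chunk_size_ok(unsigned int x)
    {
        return x && (1U << __builtin_ctz(x)) == x;
    }

    int main(void)
    {
        unsigned int sizes[] = { 4096, 65536, 96 * 1024 };
        int i;

        for (i = 0; i < 3; i++)
            printf("%u -> %s\n", sizes[i],
                   chunk_size_ok(sizes[i]) ? "ok" : "rejected");
        return 0;
    }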
1653 | |||
1654 | static int restart_array(mddev_t *mddev) | ||
1655 | { | ||
1656 | struct gendisk *disk = mddev->gendisk; | ||
1657 | int err; | ||
1658 | |||
1659 | /* | ||
1660 | * Complain if it has no devices | ||
1661 | */ | ||
1662 | err = -ENXIO; | ||
1663 | if (list_empty(&mddev->disks)) | ||
1664 | goto out; | ||
1665 | |||
1666 | if (mddev->pers) { | ||
1667 | err = -EBUSY; | ||
1668 | if (!mddev->ro) | ||
1669 | goto out; | ||
1670 | |||
1671 | mddev->safemode = 0; | ||
1672 | mddev->ro = 0; | ||
1673 | set_disk_ro(disk, 0); | ||
1674 | |||
1675 | printk(KERN_INFO "md: %s switched to read-write mode.\n", | ||
1676 | mdname(mddev)); | ||
1677 | /* | ||
1678 | * Kick recovery or resync if necessary | ||
1679 | */ | ||
1680 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
1681 | md_wakeup_thread(mddev->thread); | ||
1682 | err = 0; | ||
1683 | } else { | ||
1684 | printk(KERN_ERR "md: %s has no personality assigned.\n", | ||
1685 | mdname(mddev)); | ||
1686 | err = -EINVAL; | ||
1687 | } | ||
1688 | |||
1689 | out: | ||
1690 | return err; | ||
1691 | } | ||
1692 | |||
1693 | static int do_md_stop(mddev_t * mddev, int ro) | ||
1694 | { | ||
1695 | int err = 0; | ||
1696 | struct gendisk *disk = mddev->gendisk; | ||
1697 | |||
1698 | if (mddev->pers) { | ||
1699 | if (atomic_read(&mddev->active)>2) { | ||
1700 | printk("md: %s still in use.\n",mdname(mddev)); | ||
1701 | return -EBUSY; | ||
1702 | } | ||
1703 | |||
1704 | if (mddev->sync_thread) { | ||
1705 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
1706 | md_unregister_thread(mddev->sync_thread); | ||
1707 | mddev->sync_thread = NULL; | ||
1708 | } | ||
1709 | |||
1710 | del_timer_sync(&mddev->safemode_timer); | ||
1711 | |||
1712 | invalidate_partition(disk, 0); | ||
1713 | |||
1714 | if (ro) { | ||
1715 | err = -ENXIO; | ||
1716 | if (mddev->ro) | ||
1717 | goto out; | ||
1718 | mddev->ro = 1; | ||
1719 | } else { | ||
1720 | if (mddev->ro) | ||
1721 | set_disk_ro(disk, 0); | ||
1722 | blk_queue_make_request(mddev->queue, md_fail_request); | ||
1723 | mddev->pers->stop(mddev); | ||
1724 | module_put(mddev->pers->owner); | ||
1725 | mddev->pers = NULL; | ||
1726 | if (mddev->ro) | ||
1727 | mddev->ro = 0; | ||
1728 | } | ||
1729 | if (!mddev->in_sync) { | ||
1730 | /* mark array as shutdown cleanly */ | ||
1731 | mddev->in_sync = 1; | ||
1732 | md_update_sb(mddev); | ||
1733 | } | ||
1734 | if (ro) | ||
1735 | set_disk_ro(disk, 1); | ||
1736 | } | ||
1737 | /* | ||
1738 | * Free resources if final stop | ||
1739 | */ | ||
1740 | if (!ro) { | ||
1741 | struct gendisk *disk; | ||
1742 | printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); | ||
1743 | |||
1744 | export_array(mddev); | ||
1745 | |||
1746 | mddev->array_size = 0; | ||
1747 | disk = mddev->gendisk; | ||
1748 | if (disk) | ||
1749 | set_capacity(disk, 0); | ||
1750 | mddev->changed = 1; | ||
1751 | } else | ||
1752 | printk(KERN_INFO "md: %s switched to read-only mode.\n", | ||
1753 | mdname(mddev)); | ||
1754 | err = 0; | ||
1755 | out: | ||
1756 | return err; | ||
1757 | } | ||
1758 | |||
1759 | static void autorun_array(mddev_t *mddev) | ||
1760 | { | ||
1761 | mdk_rdev_t *rdev; | ||
1762 | struct list_head *tmp; | ||
1763 | int err; | ||
1764 | |||
1765 | if (list_empty(&mddev->disks)) { | ||
1766 | MD_BUG(); | ||
1767 | return; | ||
1768 | } | ||
1769 | |||
1770 | printk(KERN_INFO "md: running: "); | ||
1771 | |||
1772 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1773 | char b[BDEVNAME_SIZE]; | ||
1774 | printk("<%s>", bdevname(rdev->bdev,b)); | ||
1775 | } | ||
1776 | printk("\n"); | ||
1777 | |||
1778 | err = do_md_run (mddev); | ||
1779 | if (err) { | ||
1780 | printk(KERN_WARNING "md: do_md_run() returned %d\n", err); | ||
1781 | do_md_stop (mddev, 0); | ||
1782 | } | ||
1783 | } | ||
1784 | |||
1785 | /* | ||
1786 | * let's try to run arrays based on all disks that have arrived | ||
1787 | * until now. (those are in pending_raid_disks) | ||
1788 | * | ||
1789 | * the method: pick the first pending disk, collect all disks with | ||
1790 | * the same UUID, remove all from the pending list and put them into | ||
1791 | * the 'same_array' list. Then order this list based on superblock | ||
1792 | * update time (freshest comes first), kick out 'old' disks and | ||
1793 | * compare superblocks. If everything's fine then run it. | ||
1794 | * | ||
1795 | * If "unit" is allocated, then bump its reference count | ||
1796 | */ | ||
1797 | static void autorun_devices(int part) | ||
1798 | { | ||
1799 | struct list_head candidates; | ||
1800 | struct list_head *tmp; | ||
1801 | mdk_rdev_t *rdev0, *rdev; | ||
1802 | mddev_t *mddev; | ||
1803 | char b[BDEVNAME_SIZE]; | ||
1804 | |||
1805 | printk(KERN_INFO "md: autorun ...\n"); | ||
1806 | while (!list_empty(&pending_raid_disks)) { | ||
1807 | dev_t dev; | ||
1808 | rdev0 = list_entry(pending_raid_disks.next, | ||
1809 | mdk_rdev_t, same_set); | ||
1810 | |||
1811 | printk(KERN_INFO "md: considering %s ...\n", | ||
1812 | bdevname(rdev0->bdev,b)); | ||
1813 | INIT_LIST_HEAD(&candidates); | ||
1814 | ITERATE_RDEV_PENDING(rdev,tmp) | ||
1815 | if (super_90_load(rdev, rdev0, 0) >= 0) { | ||
1816 | printk(KERN_INFO "md: adding %s ...\n", | ||
1817 | bdevname(rdev->bdev,b)); | ||
1818 | list_move(&rdev->same_set, &candidates); | ||
1819 | } | ||
1820 | /* | ||
1821 | * now we have a set of devices, with all of them having | ||
1822 | * mostly sane superblocks. It's time to allocate the | ||
1823 | * mddev. | ||
1824 | */ | ||
1825 | if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { | ||
1826 | printk(KERN_INFO "md: unit number in %s is bad: %d\n", | ||
1827 | bdevname(rdev0->bdev, b), rdev0->preferred_minor); | ||
1828 | break; | ||
1829 | } | ||
1830 | if (part) | ||
1831 | dev = MKDEV(mdp_major, | ||
1832 | rdev0->preferred_minor << MdpMinorShift); | ||
1833 | else | ||
1834 | dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); | ||
1835 | |||
1836 | md_probe(dev, NULL, NULL); | ||
1837 | mddev = mddev_find(dev); | ||
1838 | if (!mddev) { | ||
1839 | printk(KERN_ERR | ||
1840 | "md: cannot allocate memory for md drive.\n"); | ||
1841 | break; | ||
1842 | } | ||
1843 | if (mddev_lock(mddev)) | ||
1844 | printk(KERN_WARNING "md: %s locked, cannot run\n", | ||
1845 | mdname(mddev)); | ||
1846 | else if (mddev->raid_disks || mddev->major_version | ||
1847 | || !list_empty(&mddev->disks)) { | ||
1848 | printk(KERN_WARNING | ||
1849 | "md: %s already running, cannot run %s\n", | ||
1850 | mdname(mddev), bdevname(rdev0->bdev,b)); | ||
1851 | mddev_unlock(mddev); | ||
1852 | } else { | ||
1853 | printk(KERN_INFO "md: created %s\n", mdname(mddev)); | ||
1854 | ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { | ||
1855 | list_del_init(&rdev->same_set); | ||
1856 | if (bind_rdev_to_array(rdev, mddev)) | ||
1857 | export_rdev(rdev); | ||
1858 | } | ||
1859 | autorun_array(mddev); | ||
1860 | mddev_unlock(mddev); | ||
1861 | } | ||
1862 | /* on success, candidates will be empty; on error | ||
1863 | * it won't be... | ||
1864 | */ | ||
1865 | ITERATE_RDEV_GENERIC(candidates,rdev,tmp) | ||
1866 | export_rdev(rdev); | ||
1867 | mddev_put(mddev); | ||
1868 | } | ||
1869 | printk(KERN_INFO "md: ... autorun DONE.\n"); | ||
1870 | } | ||
1871 | |||
1872 | /* | ||
1873 | * import RAID devices based on one partition | ||
1874 | * if possible, the array gets run as well. | ||
1875 | */ | ||
1876 | |||
1877 | static int autostart_array(dev_t startdev) | ||
1878 | { | ||
1879 | char b[BDEVNAME_SIZE]; | ||
1880 | int err = -EINVAL, i; | ||
1881 | mdp_super_t *sb = NULL; | ||
1882 | mdk_rdev_t *start_rdev = NULL, *rdev; | ||
1883 | |||
1884 | start_rdev = md_import_device(startdev, 0, 0); | ||
1885 | if (IS_ERR(start_rdev)) | ||
1886 | return err; | ||
1887 | |||
1888 | |||
1889 | /* NOTE: this can only work for 0.90.0 superblocks */ | ||
1890 | sb = (mdp_super_t*)page_address(start_rdev->sb_page); | ||
1891 | if (sb->major_version != 0 || | ||
1892 | sb->minor_version != 90 ) { | ||
1893 | printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); | ||
1894 | export_rdev(start_rdev); | ||
1895 | return err; | ||
1896 | } | ||
1897 | |||
1898 | if (start_rdev->faulty) { | ||
1899 | printk(KERN_WARNING | ||
1900 | "md: can not autostart based on faulty %s!\n", | ||
1901 | bdevname(start_rdev->bdev,b)); | ||
1902 | export_rdev(start_rdev); | ||
1903 | return err; | ||
1904 | } | ||
1905 | list_add(&start_rdev->same_set, &pending_raid_disks); | ||
1906 | |||
1907 | for (i = 0; i < MD_SB_DISKS; i++) { | ||
1908 | mdp_disk_t *desc = sb->disks + i; | ||
1909 | dev_t dev = MKDEV(desc->major, desc->minor); | ||
1910 | |||
1911 | if (!dev) | ||
1912 | continue; | ||
1913 | if (dev == startdev) | ||
1914 | continue; | ||
1915 | if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) | ||
1916 | continue; | ||
1917 | rdev = md_import_device(dev, 0, 0); | ||
1918 | if (IS_ERR(rdev)) | ||
1919 | continue; | ||
1920 | |||
1921 | list_add(&rdev->same_set, &pending_raid_disks); | ||
1922 | } | ||
1923 | |||
1924 | /* | ||
1925 | * autorun_devices() reports nothing back, so just return success | ||
1926 | */ | ||
1927 | autorun_devices(0); | ||
1928 | return 0; | ||
1929 | |||
1930 | } | ||
1931 | |||
1932 | |||
1933 | static int get_version(void __user * arg) | ||
1934 | { | ||
1935 | mdu_version_t ver; | ||
1936 | |||
1937 | ver.major = MD_MAJOR_VERSION; | ||
1938 | ver.minor = MD_MINOR_VERSION; | ||
1939 | ver.patchlevel = MD_PATCHLEVEL_VERSION; | ||
1940 | |||
1941 | if (copy_to_user(arg, &ver, sizeof(ver))) | ||
1942 | return -EFAULT; | ||
1943 | |||
1944 | return 0; | ||
1945 | } | ||
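From user space this is the simplest of the md ioctls. A minimal sketch of driving it from the other side, assuming the usual <linux/raid/md_u.h> definitions are installed; the device node name is illustrative and the caller needs CAP_SYS_ADMIN:

#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/major.h>
#include <linux/raid/md_u.h>	/* mdu_version_t, RAID_VERSION */

int main(void)
{
	mdu_version_t ver;
	int fd = open("/dev/md0", O_RDONLY);	/* hypothetical node */

	if (fd < 0 || ioctl(fd, RAID_VERSION, &ver) < 0) {
		perror("RAID_VERSION");
		return 1;
	}
	printf("md driver %d.%d.%d\n", ver.major, ver.minor, ver.patchlevel);
	return 0;
}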
1946 | |||
1947 | static int get_array_info(mddev_t * mddev, void __user * arg) | ||
1948 | { | ||
1949 | mdu_array_info_t info; | ||
1950 | int nr,working,active,failed,spare; | ||
1951 | mdk_rdev_t *rdev; | ||
1952 | struct list_head *tmp; | ||
1953 | |||
1954 | nr=working=active=failed=spare=0; | ||
1955 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1956 | nr++; | ||
1957 | if (rdev->faulty) | ||
1958 | failed++; | ||
1959 | else { | ||
1960 | working++; | ||
1961 | if (rdev->in_sync) | ||
1962 | active++; | ||
1963 | else | ||
1964 | spare++; | ||
1965 | } | ||
1966 | } | ||
1967 | |||
1968 | info.major_version = mddev->major_version; | ||
1969 | info.minor_version = mddev->minor_version; | ||
1970 | info.patch_version = MD_PATCHLEVEL_VERSION; | ||
1971 | info.ctime = mddev->ctime; | ||
1972 | info.level = mddev->level; | ||
1973 | info.size = mddev->size; | ||
1974 | info.nr_disks = nr; | ||
1975 | info.raid_disks = mddev->raid_disks; | ||
1976 | info.md_minor = mddev->md_minor; | ||
1977 | info.not_persistent= !mddev->persistent; | ||
1978 | |||
1979 | info.utime = mddev->utime; | ||
1980 | info.state = 0; | ||
1981 | if (mddev->in_sync) | ||
1982 | info.state = (1<<MD_SB_CLEAN); | ||
1983 | info.active_disks = active; | ||
1984 | info.working_disks = working; | ||
1985 | info.failed_disks = failed; | ||
1986 | info.spare_disks = spare; | ||
1987 | |||
1988 | info.layout = mddev->layout; | ||
1989 | info.chunk_size = mddev->chunk_size; | ||
1990 | |||
1991 | if (copy_to_user(arg, &info, sizeof(info))) | ||
1992 | return -EFAULT; | ||
1993 | |||
1994 | return 0; | ||
1995 | } | ||
1996 | |||
1997 | static int get_disk_info(mddev_t * mddev, void __user * arg) | ||
1998 | { | ||
1999 | mdu_disk_info_t info; | ||
2000 | unsigned int nr; | ||
2001 | mdk_rdev_t *rdev; | ||
2002 | |||
2003 | if (copy_from_user(&info, arg, sizeof(info))) | ||
2004 | return -EFAULT; | ||
2005 | |||
2006 | nr = info.number; | ||
2007 | |||
2008 | rdev = find_rdev_nr(mddev, nr); | ||
2009 | if (rdev) { | ||
2010 | info.major = MAJOR(rdev->bdev->bd_dev); | ||
2011 | info.minor = MINOR(rdev->bdev->bd_dev); | ||
2012 | info.raid_disk = rdev->raid_disk; | ||
2013 | info.state = 0; | ||
2014 | if (rdev->faulty) | ||
2015 | info.state |= (1<<MD_DISK_FAULTY); | ||
2016 | else if (rdev->in_sync) { | ||
2017 | info.state |= (1<<MD_DISK_ACTIVE); | ||
2018 | info.state |= (1<<MD_DISK_SYNC); | ||
2019 | } | ||
2020 | } else { | ||
2021 | info.major = info.minor = 0; | ||
2022 | info.raid_disk = -1; | ||
2023 | info.state = (1<<MD_DISK_REMOVED); | ||
2024 | } | ||
2025 | |||
2026 | if (copy_to_user(arg, &info, sizeof(info))) | ||
2027 | return -EFAULT; | ||
2028 | |||
2029 | return 0; | ||
2030 | } | ||
2031 | |||
2032 | static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | ||
2033 | { | ||
2034 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; | ||
2035 | mdk_rdev_t *rdev; | ||
2036 | dev_t dev = MKDEV(info->major,info->minor); | ||
2037 | |||
2038 | if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) | ||
2039 | return -EOVERFLOW; | ||
2040 | |||
2041 | if (!mddev->raid_disks) { | ||
2042 | int err; | ||
2043 | /* expecting a device which has a superblock */ | ||
2044 | rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); | ||
2045 | if (IS_ERR(rdev)) { | ||
2046 | printk(KERN_WARNING | ||
2047 | "md: md_import_device returned %ld\n", | ||
2048 | PTR_ERR(rdev)); | ||
2049 | return PTR_ERR(rdev); | ||
2050 | } | ||
2051 | if (!list_empty(&mddev->disks)) { | ||
2052 | mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, | ||
2053 | mdk_rdev_t, same_set); | ||
2054 | int err = super_types[mddev->major_version] | ||
2055 | .load_super(rdev, rdev0, mddev->minor_version); | ||
2056 | if (err < 0) { | ||
2057 | printk(KERN_WARNING | ||
2058 | "md: %s has different UUID to %s\n", | ||
2059 | bdevname(rdev->bdev,b), | ||
2060 | bdevname(rdev0->bdev,b2)); | ||
2061 | export_rdev(rdev); | ||
2062 | return -EINVAL; | ||
2063 | } | ||
2064 | } | ||
2065 | err = bind_rdev_to_array(rdev, mddev); | ||
2066 | if (err) | ||
2067 | export_rdev(rdev); | ||
2068 | return err; | ||
2069 | } | ||
2070 | |||
2071 | /* | ||
2072 | * add_new_disk can be used once the array is assembled | ||
2073 | * to add "hot spares". They must already have a superblock | ||
2074 | * written | ||
2075 | */ | ||
2076 | if (mddev->pers) { | ||
2077 | int err; | ||
2078 | if (!mddev->pers->hot_add_disk) { | ||
2079 | printk(KERN_WARNING | ||
2080 | "%s: personality does not support diskops!\n", | ||
2081 | mdname(mddev)); | ||
2082 | return -EINVAL; | ||
2083 | } | ||
2084 | rdev = md_import_device(dev, mddev->major_version, | ||
2085 | mddev->minor_version); | ||
2086 | if (IS_ERR(rdev)) { | ||
2087 | printk(KERN_WARNING | ||
2088 | "md: md_import_device returned %ld\n", | ||
2089 | PTR_ERR(rdev)); | ||
2090 | return PTR_ERR(rdev); | ||
2091 | } | ||
2092 | rdev->in_sync = 0; /* just to be sure */ | ||
2093 | rdev->raid_disk = -1; | ||
2094 | err = bind_rdev_to_array(rdev, mddev); | ||
2095 | if (err) | ||
2096 | export_rdev(rdev); | ||
2097 | if (mddev->thread) | ||
2098 | md_wakeup_thread(mddev->thread); | ||
2099 | return err; | ||
2100 | } | ||
2101 | |||
2102 | /* otherwise, add_new_disk is only allowed | ||
2103 | * for major_version==0 superblocks | ||
2104 | */ | ||
2105 | if (mddev->major_version != 0) { | ||
2106 | printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", | ||
2107 | mdname(mddev)); | ||
2108 | return -EINVAL; | ||
2109 | } | ||
2110 | |||
2111 | if (!(info->state & (1<<MD_DISK_FAULTY))) { | ||
2112 | int err; | ||
2113 | rdev = md_import_device (dev, -1, 0); | ||
2114 | if (IS_ERR(rdev)) { | ||
2115 | printk(KERN_WARNING | ||
2116 | "md: error, md_import_device() returned %ld\n", | ||
2117 | PTR_ERR(rdev)); | ||
2118 | return PTR_ERR(rdev); | ||
2119 | } | ||
2120 | rdev->desc_nr = info->number; | ||
2121 | if (info->raid_disk < mddev->raid_disks) | ||
2122 | rdev->raid_disk = info->raid_disk; | ||
2123 | else | ||
2124 | rdev->raid_disk = -1; | ||
2125 | |||
2126 | rdev->faulty = 0; | ||
2127 | if (rdev->raid_disk < mddev->raid_disks) | ||
2128 | rdev->in_sync = (info->state & (1<<MD_DISK_SYNC)); | ||
2129 | else | ||
2130 | rdev->in_sync = 0; | ||
2131 | |||
2132 | err = bind_rdev_to_array(rdev, mddev); | ||
2133 | if (err) { | ||
2134 | export_rdev(rdev); | ||
2135 | return err; | ||
2136 | } | ||
2137 | |||
2138 | if (!mddev->persistent) { | ||
2139 | printk(KERN_INFO "md: nonpersistent superblock ...\n"); | ||
2140 | rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; | ||
2141 | } else | ||
2142 | rdev->sb_offset = calc_dev_sboffset(rdev->bdev); | ||
2143 | rdev->size = calc_dev_size(rdev, mddev->chunk_size); | ||
2144 | |||
2145 | if (!mddev->size || (mddev->size > rdev->size)) | ||
2146 | mddev->size = rdev->size; | ||
2147 | } | ||
2148 | |||
2149 | return 0; | ||
2150 | } | ||
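To illustrate the "hot spare" path described in the comment above, a user-space sketch might hand a component to a running array as follows. The paths are hypothetical, the component must already carry a superblock, and on this path only the major/minor fields of the request matter:

#include <fcntl.h>
#include <string.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <linux/major.h>
#include <linux/raid/md_u.h>	/* mdu_disk_info_t, ADD_NEW_DISK */

static int add_spare(const char *md_node, const char *component)
{
	mdu_disk_info_t info;
	struct stat st;
	int fd = open(md_node, O_RDONLY);

	if (fd < 0 || stat(component, &st) < 0)
		return -1;

	memset(&info, 0, sizeof(info));
	info.major = major(st.st_rdev);		/* identify the component ... */
	info.minor = minor(st.st_rdev);		/* ... by its device number   */

	return ioctl(fd, ADD_NEW_DISK, &info);	/* kernel binds it as a spare */
}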
2151 | |||
2152 | static int hot_remove_disk(mddev_t * mddev, dev_t dev) | ||
2153 | { | ||
2154 | char b[BDEVNAME_SIZE]; | ||
2155 | mdk_rdev_t *rdev; | ||
2156 | |||
2157 | if (!mddev->pers) | ||
2158 | return -ENODEV; | ||
2159 | |||
2160 | rdev = find_rdev(mddev, dev); | ||
2161 | if (!rdev) | ||
2162 | return -ENXIO; | ||
2163 | |||
2164 | if (rdev->raid_disk >= 0) | ||
2165 | goto busy; | ||
2166 | |||
2167 | kick_rdev_from_array(rdev); | ||
2168 | md_update_sb(mddev); | ||
2169 | |||
2170 | return 0; | ||
2171 | busy: | ||
2172 | printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n", | ||
2173 | bdevname(rdev->bdev,b), mdname(mddev)); | ||
2174 | return -EBUSY; | ||
2175 | } | ||
2176 | |||
2177 | static int hot_add_disk(mddev_t * mddev, dev_t dev) | ||
2178 | { | ||
2179 | char b[BDEVNAME_SIZE]; | ||
2180 | int err; | ||
2181 | unsigned int size; | ||
2182 | mdk_rdev_t *rdev; | ||
2183 | |||
2184 | if (!mddev->pers) | ||
2185 | return -ENODEV; | ||
2186 | |||
2187 | if (mddev->major_version != 0) { | ||
2188 | printk(KERN_WARNING "%s: HOT_ADD may only be used with" | ||
2189 | " version-0 superblocks.\n", | ||
2190 | mdname(mddev)); | ||
2191 | return -EINVAL; | ||
2192 | } | ||
2193 | if (!mddev->pers->hot_add_disk) { | ||
2194 | printk(KERN_WARNING | ||
2195 | "%s: personality does not support diskops!\n", | ||
2196 | mdname(mddev)); | ||
2197 | return -EINVAL; | ||
2198 | } | ||
2199 | |||
2200 | rdev = md_import_device (dev, -1, 0); | ||
2201 | if (IS_ERR(rdev)) { | ||
2202 | printk(KERN_WARNING | ||
2203 | "md: error, md_import_device() returned %ld\n", | ||
2204 | PTR_ERR(rdev)); | ||
2205 | return -EINVAL; | ||
2206 | } | ||
2207 | |||
2208 | if (mddev->persistent) | ||
2209 | rdev->sb_offset = calc_dev_sboffset(rdev->bdev); | ||
2210 | else | ||
2211 | rdev->sb_offset = | ||
2212 | rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; | ||
2213 | |||
2214 | size = calc_dev_size(rdev, mddev->chunk_size); | ||
2215 | rdev->size = size; | ||
2216 | |||
2217 | if (size < mddev->size) { | ||
2218 | printk(KERN_WARNING | ||
2219 | "%s: disk size %llu blocks < array size %llu\n", | ||
2220 | mdname(mddev), (unsigned long long)size, | ||
2221 | (unsigned long long)mddev->size); | ||
2222 | err = -ENOSPC; | ||
2223 | goto abort_export; | ||
2224 | } | ||
2225 | |||
2226 | if (rdev->faulty) { | ||
2227 | printk(KERN_WARNING | ||
2228 | "md: can not hot-add faulty %s disk to %s!\n", | ||
2229 | bdevname(rdev->bdev,b), mdname(mddev)); | ||
2230 | err = -EINVAL; | ||
2231 | goto abort_export; | ||
2232 | } | ||
2233 | rdev->in_sync = 0; | ||
2234 | rdev->desc_nr = -1; | ||
2235 | bind_rdev_to_array(rdev, mddev); | ||
2236 | |||
2237 | /* | ||
2238 | * The rest had better be atomic; we can have disk failures | ||
2239 | * noticed in interrupt context ... | ||
2240 | */ | ||
2241 | |||
2242 | if (rdev->desc_nr == mddev->max_disks) { | ||
2243 | printk(KERN_WARNING "%s: can not hot-add to full array!\n", | ||
2244 | mdname(mddev)); | ||
2245 | err = -EBUSY; | ||
2246 | goto abort_unbind_export; | ||
2247 | } | ||
2248 | |||
2249 | rdev->raid_disk = -1; | ||
2250 | |||
2251 | md_update_sb(mddev); | ||
2252 | |||
2253 | /* | ||
2254 | * Kick recovery, maybe this spare has to be added to the | ||
2255 | * array immediately. | ||
2256 | */ | ||
2257 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
2258 | md_wakeup_thread(mddev->thread); | ||
2259 | |||
2260 | return 0; | ||
2261 | |||
2262 | abort_unbind_export: | ||
2263 | unbind_rdev_from_array(rdev); | ||
2264 | |||
2265 | abort_export: | ||
2266 | export_rdev(rdev); | ||
2267 | return err; | ||
2268 | } | ||
2269 | |||
2270 | /* | ||
2271 | * set_array_info is used two different ways | ||
2272 | * The original usage is when creating a new array. | ||
2273 | * In this usage, raid_disks is > 0 and it together with | ||
2274 | * level, size, not_persistent, layout, chunksize determine the | ||
2275 | * shape of the array. | ||
2276 | * This will always create an array with a type-0.90.0 superblock. | ||
2277 | * The newer usage is when assembling an array. | ||
2278 | * In this case raid_disks will be 0, and the major_version field is | ||
2279 | * used to determine which style super-blocks are to be found on the devices. | ||
2280 | * The minor and patch _version numbers are also kept in case the | ||
2281 | * super_block handler wishes to interpret them. | ||
2282 | */ | ||
2283 | static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | ||
2284 | { | ||
2285 | |||
2286 | if (info->raid_disks == 0) { | ||
2287 | /* just setting version number for superblock loading */ | ||
2288 | if (info->major_version < 0 || | ||
2289 | info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || | ||
2290 | super_types[info->major_version].name == NULL) { | ||
2291 | /* maybe try to auto-load a module? */ | ||
2292 | printk(KERN_INFO | ||
2293 | "md: superblock version %d not known\n", | ||
2294 | info->major_version); | ||
2295 | return -EINVAL; | ||
2296 | } | ||
2297 | mddev->major_version = info->major_version; | ||
2298 | mddev->minor_version = info->minor_version; | ||
2299 | mddev->patch_version = info->patch_version; | ||
2300 | return 0; | ||
2301 | } | ||
2302 | mddev->major_version = MD_MAJOR_VERSION; | ||
2303 | mddev->minor_version = MD_MINOR_VERSION; | ||
2304 | mddev->patch_version = MD_PATCHLEVEL_VERSION; | ||
2305 | mddev->ctime = get_seconds(); | ||
2306 | |||
2307 | mddev->level = info->level; | ||
2308 | mddev->size = info->size; | ||
2309 | mddev->raid_disks = info->raid_disks; | ||
2310 | /* don't set md_minor, it is determined by which /dev/md* was | ||
2311 | * opened | ||
2312 | */ | ||
2313 | if (info->state & (1<<MD_SB_CLEAN)) | ||
2314 | mddev->recovery_cp = MaxSector; | ||
2315 | else | ||
2316 | mddev->recovery_cp = 0; | ||
2317 | mddev->persistent = ! info->not_persistent; | ||
2318 | |||
2319 | mddev->layout = info->layout; | ||
2320 | mddev->chunk_size = info->chunk_size; | ||
2321 | |||
2322 | mddev->max_disks = MD_SB_DISKS; | ||
2323 | |||
2324 | mddev->sb_dirty = 1; | ||
2325 | |||
2326 | /* | ||
2327 | * Generate a 128 bit UUID | ||
2328 | */ | ||
2329 | get_random_bytes(mddev->uuid, 16); | ||
2330 | |||
2331 | return 0; | ||
2332 | } | ||
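As a sketch of the "assemble" usage described in the comment above (raid_disks == 0, only the superblock version supplied), user space would typically do something like the following before adding members and starting the array. Function name, descriptor handling and the 0.90 version are illustrative only:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/major.h>
#include <linux/raid/md_u.h>	/* mdu_array_info_t, SET_ARRAY_INFO, RUN_ARRAY */

/* md_fd is an open descriptor on e.g. /dev/md1 (hypothetical) */
static int assemble_v090(int md_fd)
{
	mdu_array_info_t info;

	memset(&info, 0, sizeof(info));
	info.major_version = 0;		/* look for 0.90.x superblocks */
	info.minor_version = 90;	/* raid_disks stays 0: assemble, not create */

	if (ioctl(md_fd, SET_ARRAY_INFO, &info) < 0)
		return -1;

	/* ... one ADD_NEW_DISK per member device goes here ... */

	return ioctl(md_fd, RUN_ARRAY, NULL);
}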
2333 | |||
2334 | /* | ||
2335 | * update_array_info is used to change the configuration of an | ||
2336 | * on-line array. | ||
2337 | * The version, ctime, level, size, raid_disks, not_persistent, layout, chunk_size | ||
2338 | * fields in the info are checked against the array. | ||
2339 | * Any differences that cannot be handled will cause an error. | ||
2340 | * Normally, only one change can be managed at a time. | ||
2341 | */ | ||
2342 | static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | ||
2343 | { | ||
2344 | int rv = 0; | ||
2345 | int cnt = 0; | ||
2346 | |||
2347 | if (mddev->major_version != info->major_version || | ||
2348 | mddev->minor_version != info->minor_version || | ||
2349 | /* mddev->patch_version != info->patch_version || */ | ||
2350 | mddev->ctime != info->ctime || | ||
2351 | mddev->level != info->level || | ||
2352 | /* mddev->layout != info->layout || */ | ||
2353 | !mddev->persistent != info->not_persistent|| | ||
2354 | mddev->chunk_size != info->chunk_size ) | ||
2355 | return -EINVAL; | ||
2356 | /* Check there is only one change */ | ||
2357 | if (mddev->size != info->size) cnt++; | ||
2358 | if (mddev->raid_disks != info->raid_disks) cnt++; | ||
2359 | if (mddev->layout != info->layout) cnt++; | ||
2360 | if (cnt == 0) return 0; | ||
2361 | if (cnt > 1) return -EINVAL; | ||
2362 | |||
2363 | if (mddev->layout != info->layout) { | ||
2364 | /* Change layout | ||
2365 | * we don't need to do anything at the md level, the | ||
2366 | * personality will take care of it all. | ||
2367 | */ | ||
2368 | if (mddev->pers->reconfig == NULL) | ||
2369 | return -EINVAL; | ||
2370 | else | ||
2371 | return mddev->pers->reconfig(mddev, info->layout, -1); | ||
2372 | } | ||
2373 | if (mddev->size != info->size) { | ||
2374 | mdk_rdev_t * rdev; | ||
2375 | struct list_head *tmp; | ||
2376 | if (mddev->pers->resize == NULL) | ||
2377 | return -EINVAL; | ||
2378 | /* The "size" is the amount of each device that is used. | ||
2379 | * This can only make sense for arrays with redundancy. | ||
2380 | * linear and raid0 always use whatever space is available | ||
2381 | * We can only consider changing the size if no resync | ||
2382 | * or reconstruction is happening, and if the new size | ||
2383 | * is acceptable. It must fit before the sb_offset or, | ||
2384 | * if that is <data_offset, it must fit before the | ||
2385 | * size of each device. | ||
2386 | * If size is zero, we find the largest size that fits. | ||
2387 | */ | ||
2388 | if (mddev->sync_thread) | ||
2389 | return -EBUSY; | ||
2390 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
2391 | sector_t avail; | ||
2392 | int fit = (info->size == 0); | ||
2393 | if (rdev->sb_offset > rdev->data_offset) | ||
2394 | avail = (rdev->sb_offset*2) - rdev->data_offset; | ||
2395 | else | ||
2396 | avail = get_capacity(rdev->bdev->bd_disk) | ||
2397 | - rdev->data_offset; | ||
2398 | if (fit && (info->size == 0 || info->size > avail/2)) | ||
2399 | info->size = avail/2; | ||
2400 | if (avail < ((sector_t)info->size << 1)) | ||
2401 | return -ENOSPC; | ||
2402 | } | ||
2403 | rv = mddev->pers->resize(mddev, (sector_t)info->size *2); | ||
2404 | if (!rv) { | ||
2405 | struct block_device *bdev; | ||
2406 | |||
2407 | bdev = bdget_disk(mddev->gendisk, 0); | ||
2408 | if (bdev) { | ||
2409 | down(&bdev->bd_inode->i_sem); | ||
2410 | i_size_write(bdev->bd_inode, mddev->array_size << 10); | ||
2411 | up(&bdev->bd_inode->i_sem); | ||
2412 | bdput(bdev); | ||
2413 | } | ||
2414 | } | ||
2415 | } | ||
2416 | if (mddev->raid_disks != info->raid_disks) { | ||
2417 | /* change the number of raid disks */ | ||
2418 | if (mddev->pers->reshape == NULL) | ||
2419 | return -EINVAL; | ||
2420 | if (info->raid_disks <= 0 || | ||
2421 | info->raid_disks >= mddev->max_disks) | ||
2422 | return -EINVAL; | ||
2423 | if (mddev->sync_thread) | ||
2424 | return -EBUSY; | ||
2425 | rv = mddev->pers->reshape(mddev, info->raid_disks); | ||
2426 | if (!rv) { | ||
2427 | struct block_device *bdev; | ||
2428 | |||
2429 | bdev = bdget_disk(mddev->gendisk, 0); | ||
2430 | if (bdev) { | ||
2431 | down(&bdev->bd_inode->i_sem); | ||
2432 | i_size_write(bdev->bd_inode, mddev->array_size << 10); | ||
2433 | up(&bdev->bd_inode->i_sem); | ||
2434 | bdput(bdev); | ||
2435 | } | ||
2436 | } | ||
2437 | } | ||
2438 | md_update_sb(mddev); | ||
2439 | return rv; | ||
2440 | } | ||
2441 | |||
2442 | static int set_disk_faulty(mddev_t *mddev, dev_t dev) | ||
2443 | { | ||
2444 | mdk_rdev_t *rdev; | ||
2445 | |||
2446 | if (mddev->pers == NULL) | ||
2447 | return -ENODEV; | ||
2448 | |||
2449 | rdev = find_rdev(mddev, dev); | ||
2450 | if (!rdev) | ||
2451 | return -ENODEV; | ||
2452 | |||
2453 | md_error(mddev, rdev); | ||
2454 | return 0; | ||
2455 | } | ||
2456 | |||
2457 | static int md_ioctl(struct inode *inode, struct file *file, | ||
2458 | unsigned int cmd, unsigned long arg) | ||
2459 | { | ||
2460 | int err = 0; | ||
2461 | void __user *argp = (void __user *)arg; | ||
2462 | struct hd_geometry __user *loc = argp; | ||
2463 | mddev_t *mddev = NULL; | ||
2464 | |||
2465 | if (!capable(CAP_SYS_ADMIN)) | ||
2466 | return -EACCES; | ||
2467 | |||
2468 | /* | ||
2469 | * Commands dealing with the RAID driver but not any | ||
2470 | * particular array: | ||
2471 | */ | ||
2472 | switch (cmd) | ||
2473 | { | ||
2474 | case RAID_VERSION: | ||
2475 | err = get_version(argp); | ||
2476 | goto done; | ||
2477 | |||
2478 | case PRINT_RAID_DEBUG: | ||
2479 | err = 0; | ||
2480 | md_print_devices(); | ||
2481 | goto done; | ||
2482 | |||
2483 | #ifndef MODULE | ||
2484 | case RAID_AUTORUN: | ||
2485 | err = 0; | ||
2486 | autostart_arrays(arg); | ||
2487 | goto done; | ||
2488 | #endif | ||
2489 | default:; | ||
2490 | } | ||
2491 | |||
2492 | /* | ||
2493 | * Commands creating/starting a new array: | ||
2494 | */ | ||
2495 | |||
2496 | mddev = inode->i_bdev->bd_disk->private_data; | ||
2497 | |||
2498 | if (!mddev) { | ||
2499 | BUG(); | ||
2500 | goto abort; | ||
2501 | } | ||
2502 | |||
2503 | |||
2504 | if (cmd == START_ARRAY) { | ||
2505 | /* START_ARRAY doesn't need to lock the array as autostart_array | ||
2506 | * does the locking, and it could even be a different array | ||
2507 | */ | ||
2508 | static int cnt = 3; | ||
2509 | if (cnt > 0 ) { | ||
2510 | printk(KERN_WARNING | ||
2511 | "md: %s(pid %d) used deprecated START_ARRAY ioctl. " | ||
2512 | "This will not be supported beyond 2.6\n", | ||
2513 | current->comm, current->pid); | ||
2514 | cnt--; | ||
2515 | } | ||
2516 | err = autostart_array(new_decode_dev(arg)); | ||
2517 | if (err) { | ||
2518 | printk(KERN_WARNING "md: autostart failed!\n"); | ||
2519 | goto abort; | ||
2520 | } | ||
2521 | goto done; | ||
2522 | } | ||
2523 | |||
2524 | err = mddev_lock(mddev); | ||
2525 | if (err) { | ||
2526 | printk(KERN_INFO | ||
2527 | "md: ioctl lock interrupted, reason %d, cmd %d\n", | ||
2528 | err, cmd); | ||
2529 | goto abort; | ||
2530 | } | ||
2531 | |||
2532 | switch (cmd) | ||
2533 | { | ||
2534 | case SET_ARRAY_INFO: | ||
2535 | { | ||
2536 | mdu_array_info_t info; | ||
2537 | if (!arg) | ||
2538 | memset(&info, 0, sizeof(info)); | ||
2539 | else if (copy_from_user(&info, argp, sizeof(info))) { | ||
2540 | err = -EFAULT; | ||
2541 | goto abort_unlock; | ||
2542 | } | ||
2543 | if (mddev->pers) { | ||
2544 | err = update_array_info(mddev, &info); | ||
2545 | if (err) { | ||
2546 | printk(KERN_WARNING "md: couldn't update" | ||
2547 | " array info. %d\n", err); | ||
2548 | goto abort_unlock; | ||
2549 | } | ||
2550 | goto done_unlock; | ||
2551 | } | ||
2552 | if (!list_empty(&mddev->disks)) { | ||
2553 | printk(KERN_WARNING | ||
2554 | "md: array %s already has disks!\n", | ||
2555 | mdname(mddev)); | ||
2556 | err = -EBUSY; | ||
2557 | goto abort_unlock; | ||
2558 | } | ||
2559 | if (mddev->raid_disks) { | ||
2560 | printk(KERN_WARNING | ||
2561 | "md: array %s already initialised!\n", | ||
2562 | mdname(mddev)); | ||
2563 | err = -EBUSY; | ||
2564 | goto abort_unlock; | ||
2565 | } | ||
2566 | err = set_array_info(mddev, &info); | ||
2567 | if (err) { | ||
2568 | printk(KERN_WARNING "md: couldn't set" | ||
2569 | " array info. %d\n", err); | ||
2570 | goto abort_unlock; | ||
2571 | } | ||
2572 | } | ||
2573 | goto done_unlock; | ||
2574 | |||
2575 | default:; | ||
2576 | } | ||
2577 | |||
2578 | /* | ||
2579 | * Commands querying/configuring an existing array: | ||
2580 | */ | ||
2581 | /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */ | ||
2582 | if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { | ||
2583 | err = -ENODEV; | ||
2584 | goto abort_unlock; | ||
2585 | } | ||
2586 | |||
2587 | /* | ||
2588 | * Commands even a read-only array can execute: | ||
2589 | */ | ||
2590 | switch (cmd) | ||
2591 | { | ||
2592 | case GET_ARRAY_INFO: | ||
2593 | err = get_array_info(mddev, argp); | ||
2594 | goto done_unlock; | ||
2595 | |||
2596 | case GET_DISK_INFO: | ||
2597 | err = get_disk_info(mddev, argp); | ||
2598 | goto done_unlock; | ||
2599 | |||
2600 | case RESTART_ARRAY_RW: | ||
2601 | err = restart_array(mddev); | ||
2602 | goto done_unlock; | ||
2603 | |||
2604 | case STOP_ARRAY: | ||
2605 | err = do_md_stop (mddev, 0); | ||
2606 | goto done_unlock; | ||
2607 | |||
2608 | case STOP_ARRAY_RO: | ||
2609 | err = do_md_stop (mddev, 1); | ||
2610 | goto done_unlock; | ||
2611 | |||
2612 | /* | ||
2613 | * We have a problem here: there is no easy way to give a CHS | ||
2614 | * virtual geometry. We currently pretend that we have 2 heads, | ||
2615 | * 4 sectors (with a BIG number of cylinders...). This drives | ||
2616 | * dosfs just mad... ;-) | ||
2617 | */ | ||
2618 | case HDIO_GETGEO: | ||
2619 | if (!loc) { | ||
2620 | err = -EINVAL; | ||
2621 | goto abort_unlock; | ||
2622 | } | ||
2623 | err = put_user (2, (char __user *) &loc->heads); | ||
2624 | if (err) | ||
2625 | goto abort_unlock; | ||
2626 | err = put_user (4, (char __user *) &loc->sectors); | ||
2627 | if (err) | ||
2628 | goto abort_unlock; | ||
2629 | err = put_user(get_capacity(mddev->gendisk)/8, | ||
2630 | (short __user *) &loc->cylinders); | ||
2631 | if (err) | ||
2632 | goto abort_unlock; | ||
2633 | err = put_user (get_start_sect(inode->i_bdev), | ||
2634 | (long __user *) &loc->start); | ||
2635 | goto done_unlock; | ||
2636 | } | ||
2637 | |||
2638 | /* | ||
2639 | * The remaining ioctls are changing the state of the | ||
2640 | * superblock, so we do not allow read-only arrays | ||
2641 | * here: | ||
2642 | */ | ||
2643 | if (mddev->ro) { | ||
2644 | err = -EROFS; | ||
2645 | goto abort_unlock; | ||
2646 | } | ||
2647 | |||
2648 | switch (cmd) | ||
2649 | { | ||
2650 | case ADD_NEW_DISK: | ||
2651 | { | ||
2652 | mdu_disk_info_t info; | ||
2653 | if (copy_from_user(&info, argp, sizeof(info))) | ||
2654 | err = -EFAULT; | ||
2655 | else | ||
2656 | err = add_new_disk(mddev, &info); | ||
2657 | goto done_unlock; | ||
2658 | } | ||
2659 | |||
2660 | case HOT_REMOVE_DISK: | ||
2661 | err = hot_remove_disk(mddev, new_decode_dev(arg)); | ||
2662 | goto done_unlock; | ||
2663 | |||
2664 | case HOT_ADD_DISK: | ||
2665 | err = hot_add_disk(mddev, new_decode_dev(arg)); | ||
2666 | goto done_unlock; | ||
2667 | |||
2668 | case SET_DISK_FAULTY: | ||
2669 | err = set_disk_faulty(mddev, new_decode_dev(arg)); | ||
2670 | goto done_unlock; | ||
2671 | |||
2672 | case RUN_ARRAY: | ||
2673 | err = do_md_run (mddev); | ||
2674 | goto done_unlock; | ||
2675 | |||
2676 | default: | ||
2677 | if (_IOC_TYPE(cmd) == MD_MAJOR) | ||
2678 | printk(KERN_WARNING "md: %s(pid %d) used" | ||
2679 | " obsolete MD ioctl, upgrade your" | ||
2680 | " software to use new ictls.\n", | ||
2681 | current->comm, current->pid); | ||
2682 | err = -EINVAL; | ||
2683 | goto abort_unlock; | ||
2684 | } | ||
2685 | |||
2686 | done_unlock: | ||
2687 | abort_unlock: | ||
2688 | mddev_unlock(mddev); | ||
2689 | |||
2690 | return err; | ||
2691 | done: | ||
2692 | if (err) | ||
2693 | MD_BUG(); | ||
2694 | abort: | ||
2695 | return err; | ||
2696 | } | ||
2697 | |||
2698 | static int md_open(struct inode *inode, struct file *file) | ||
2699 | { | ||
2700 | /* | ||
2701 | * Succeed if we can lock the mddev, which confirms that | ||
2702 | * it isn't being stopped right now. | ||
2703 | */ | ||
2704 | mddev_t *mddev = inode->i_bdev->bd_disk->private_data; | ||
2705 | int err; | ||
2706 | |||
2707 | if ((err = mddev_lock(mddev))) | ||
2708 | goto out; | ||
2709 | |||
2710 | err = 0; | ||
2711 | mddev_get(mddev); | ||
2712 | mddev_unlock(mddev); | ||
2713 | |||
2714 | check_disk_change(inode->i_bdev); | ||
2715 | out: | ||
2716 | return err; | ||
2717 | } | ||
2718 | |||
2719 | static int md_release(struct inode *inode, struct file * file) | ||
2720 | { | ||
2721 | mddev_t *mddev = inode->i_bdev->bd_disk->private_data; | ||
2722 | |||
2723 | if (!mddev) | ||
2724 | BUG(); | ||
2725 | mddev_put(mddev); | ||
2726 | |||
2727 | return 0; | ||
2728 | } | ||
2729 | |||
2730 | static int md_media_changed(struct gendisk *disk) | ||
2731 | { | ||
2732 | mddev_t *mddev = disk->private_data; | ||
2733 | |||
2734 | return mddev->changed; | ||
2735 | } | ||
2736 | |||
2737 | static int md_revalidate(struct gendisk *disk) | ||
2738 | { | ||
2739 | mddev_t *mddev = disk->private_data; | ||
2740 | |||
2741 | mddev->changed = 0; | ||
2742 | return 0; | ||
2743 | } | ||
2744 | static struct block_device_operations md_fops = | ||
2745 | { | ||
2746 | .owner = THIS_MODULE, | ||
2747 | .open = md_open, | ||
2748 | .release = md_release, | ||
2749 | .ioctl = md_ioctl, | ||
2750 | .media_changed = md_media_changed, | ||
2751 | .revalidate_disk= md_revalidate, | ||
2752 | }; | ||
2753 | |||
2754 | int md_thread(void * arg) | ||
2755 | { | ||
2756 | mdk_thread_t *thread = arg; | ||
2757 | |||
2758 | lock_kernel(); | ||
2759 | |||
2760 | /* | ||
2761 | * Detach thread | ||
2762 | */ | ||
2763 | |||
2764 | daemonize(thread->name, mdname(thread->mddev)); | ||
2765 | |||
2766 | current->exit_signal = SIGCHLD; | ||
2767 | allow_signal(SIGKILL); | ||
2768 | thread->tsk = current; | ||
2769 | |||
2770 | /* | ||
2771 | * md_thread is a 'system-thread', its priority should be very | ||
2772 | * high. We avoid resource deadlocks individually in each | ||
2773 | * raid personality. (RAID5 does preallocation) We also use RR and | ||
2774 | * the very same RT priority as kswapd, thus we will never get | ||
2775 | * into a priority inversion deadlock. | ||
2776 | * | ||
2777 | * we definitely have to have equal or higher priority than | ||
2778 | * bdflush, otherwise bdflush will deadlock if there are too | ||
2779 | * many dirty RAID5 blocks. | ||
2780 | */ | ||
2781 | unlock_kernel(); | ||
2782 | |||
2783 | complete(thread->event); | ||
2784 | while (thread->run) { | ||
2785 | void (*run)(mddev_t *); | ||
2786 | |||
2787 | wait_event_interruptible(thread->wqueue, | ||
2788 | test_bit(THREAD_WAKEUP, &thread->flags)); | ||
2789 | if (current->flags & PF_FREEZE) | ||
2790 | refrigerator(PF_FREEZE); | ||
2791 | |||
2792 | clear_bit(THREAD_WAKEUP, &thread->flags); | ||
2793 | |||
2794 | run = thread->run; | ||
2795 | if (run) | ||
2796 | run(thread->mddev); | ||
2797 | |||
2798 | if (signal_pending(current)) | ||
2799 | flush_signals(current); | ||
2800 | } | ||
2801 | complete(thread->event); | ||
2802 | return 0; | ||
2803 | } | ||
2804 | |||
2805 | void md_wakeup_thread(mdk_thread_t *thread) | ||
2806 | { | ||
2807 | if (thread) { | ||
2808 | dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); | ||
2809 | set_bit(THREAD_WAKEUP, &thread->flags); | ||
2810 | wake_up(&thread->wqueue); | ||
2811 | } | ||
2812 | } | ||
2813 | |||
2814 | mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, | ||
2815 | const char *name) | ||
2816 | { | ||
2817 | mdk_thread_t *thread; | ||
2818 | int ret; | ||
2819 | struct completion event; | ||
2820 | |||
2821 | thread = (mdk_thread_t *) kmalloc | ||
2822 | (sizeof(mdk_thread_t), GFP_KERNEL); | ||
2823 | if (!thread) | ||
2824 | return NULL; | ||
2825 | |||
2826 | memset(thread, 0, sizeof(mdk_thread_t)); | ||
2827 | init_waitqueue_head(&thread->wqueue); | ||
2828 | |||
2829 | init_completion(&event); | ||
2830 | thread->event = &event; | ||
2831 | thread->run = run; | ||
2832 | thread->mddev = mddev; | ||
2833 | thread->name = name; | ||
2834 | ret = kernel_thread(md_thread, thread, 0); | ||
2835 | if (ret < 0) { | ||
2836 | kfree(thread); | ||
2837 | return NULL; | ||
2838 | } | ||
2839 | wait_for_completion(&event); | ||
2840 | return thread; | ||
2841 | } | ||
2842 | |||
2843 | static void md_interrupt_thread(mdk_thread_t *thread) | ||
2844 | { | ||
2845 | if (!thread->tsk) { | ||
2846 | MD_BUG(); | ||
2847 | return; | ||
2848 | } | ||
2849 | dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); | ||
2850 | send_sig(SIGKILL, thread->tsk, 1); | ||
2851 | } | ||
2852 | |||
2853 | void md_unregister_thread(mdk_thread_t *thread) | ||
2854 | { | ||
2855 | struct completion event; | ||
2856 | |||
2857 | init_completion(&event); | ||
2858 | |||
2859 | thread->event = &event; | ||
2860 | thread->run = NULL; | ||
2861 | thread->name = NULL; | ||
2862 | md_interrupt_thread(thread); | ||
2863 | wait_for_completion(&event); | ||
2864 | kfree(thread); | ||
2865 | } | ||
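Taken together, md_register_thread(), md_wakeup_thread() and md_unregister_thread() give each personality a simple per-array service thread. A hedged sketch of how a personality module might use them, with all names hypothetical and the fragment meant to live inside such a module rather than stand alone:

/* Hypothetical personality glue, sketching only the thread lifecycle. */
static void myraid_daemon(mddev_t *mddev)
{
	/* runs each time someone does md_wakeup_thread(mddev->thread) */
}

static int myraid_run(mddev_t *mddev)
{
	mddev->thread = md_register_thread(myraid_daemon, mddev, "%s_myraid");
	if (!mddev->thread) {
		printk(KERN_ERR "myraid: couldn't allocate thread for %s\n",
		       mdname(mddev));
		return -ENOMEM;
	}
	return 0;
}

static int myraid_stop(mddev_t *mddev)
{
	md_unregister_thread(mddev->thread);	/* blocks until the thread exits */
	mddev->thread = NULL;
	return 0;
}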
2866 | |||
2867 | void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | ||
2868 | { | ||
2869 | if (!mddev) { | ||
2870 | MD_BUG(); | ||
2871 | return; | ||
2872 | } | ||
2873 | |||
2874 | if (!rdev || rdev->faulty) | ||
2875 | return; | ||
2876 | |||
2877 | dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", | ||
2878 | mdname(mddev), | ||
2879 | MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), | ||
2880 | __builtin_return_address(0),__builtin_return_address(1), | ||
2881 | __builtin_return_address(2),__builtin_return_address(3)); | ||
2882 | |||
2883 | if (!mddev->pers->error_handler) | ||
2884 | return; | ||
2885 | mddev->pers->error_handler(mddev,rdev); | ||
2886 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
2887 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
2888 | md_wakeup_thread(mddev->thread); | ||
2889 | } | ||
2890 | |||
2891 | /* seq_file implementation for /proc/mdstat */ | ||
2892 | |||
2893 | static void status_unused(struct seq_file *seq) | ||
2894 | { | ||
2895 | int i = 0; | ||
2896 | mdk_rdev_t *rdev; | ||
2897 | struct list_head *tmp; | ||
2898 | |||
2899 | seq_printf(seq, "unused devices: "); | ||
2900 | |||
2901 | ITERATE_RDEV_PENDING(rdev,tmp) { | ||
2902 | char b[BDEVNAME_SIZE]; | ||
2903 | i++; | ||
2904 | seq_printf(seq, "%s ", | ||
2905 | bdevname(rdev->bdev,b)); | ||
2906 | } | ||
2907 | if (!i) | ||
2908 | seq_printf(seq, "<none>"); | ||
2909 | |||
2910 | seq_printf(seq, "\n"); | ||
2911 | } | ||
2912 | |||
2913 | |||
2914 | static void status_resync(struct seq_file *seq, mddev_t * mddev) | ||
2915 | { | ||
2916 | unsigned long max_blocks, resync, res, dt, db, rt; | ||
2917 | |||
2918 | resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; | ||
2919 | |||
2920 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | ||
2921 | max_blocks = mddev->resync_max_sectors >> 1; | ||
2922 | else | ||
2923 | max_blocks = mddev->size; | ||
2924 | |||
2925 | /* | ||
2926 | * Should not happen. | ||
2927 | */ | ||
2928 | if (!max_blocks) { | ||
2929 | MD_BUG(); | ||
2930 | return; | ||
2931 | } | ||
2932 | res = (resync/1024)*1000/(max_blocks/1024 + 1); | ||
2933 | { | ||
2934 | int i, x = res/50, y = 20-x; | ||
2935 | seq_printf(seq, "["); | ||
2936 | for (i = 0; i < x; i++) | ||
2937 | seq_printf(seq, "="); | ||
2938 | seq_printf(seq, ">"); | ||
2939 | for (i = 0; i < y; i++) | ||
2940 | seq_printf(seq, "."); | ||
2941 | seq_printf(seq, "] "); | ||
2942 | } | ||
2943 | seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", | ||
2944 | (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? | ||
2945 | "resync" : "recovery"), | ||
2946 | res/10, res % 10, resync, max_blocks); | ||
2947 | |||
2948 | /* | ||
2949 | * We do not want to overflow, so the order of operands and | ||
2950 | * the * 100 / 100 trick are important. We do a +1 to be | ||
2951 | * safe against division by zero. We only estimate anyway. | ||
2952 | * | ||
2953 | * dt: time from mark until now | ||
2954 | * db: blocks written from mark until now | ||
2955 | * rt: remaining time | ||
2956 | */ | ||
2957 | dt = ((jiffies - mddev->resync_mark) / HZ); | ||
2958 | if (!dt) dt++; | ||
2959 | db = resync - (mddev->resync_mark_cnt/2); | ||
2960 | rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; | ||
2961 | |||
2962 | seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); | ||
2963 | |||
2964 | seq_printf(seq, " speed=%ldK/sec", db/dt); | ||
2965 | } | ||
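Plugging hypothetical numbers into the dt/db/rt estimate above: with 500,000 1K blocks written in the 100 seconds since the oldest mark and 9,000,000 blocks still to go, the same arithmetic in a stand-alone C snippet prints "finish=29.9min speed=5000K/sec":

#include <stdio.h>

int main(void)
{
	/* made-up figures, mirroring the formula used above */
	unsigned long dt = 100;			/* seconds since the mark     */
	unsigned long db = 500000;		/* 1K blocks done since then  */
	unsigned long remaining = 9000000;	/* 1K blocks left to sync     */
	unsigned long rt = (dt * (remaining / (db / 100 + 1))) / 100;

	printf("finish=%lu.%lumin speed=%luK/sec\n",
	       rt / 60, (rt % 60) / 6, db / dt);
	return 0;
}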
2966 | |||
2967 | static void *md_seq_start(struct seq_file *seq, loff_t *pos) | ||
2968 | { | ||
2969 | struct list_head *tmp; | ||
2970 | loff_t l = *pos; | ||
2971 | mddev_t *mddev; | ||
2972 | |||
2973 | if (l >= 0x10000) | ||
2974 | return NULL; | ||
2975 | if (!l--) | ||
2976 | /* header */ | ||
2977 | return (void*)1; | ||
2978 | |||
2979 | spin_lock(&all_mddevs_lock); | ||
2980 | list_for_each(tmp,&all_mddevs) | ||
2981 | if (!l--) { | ||
2982 | mddev = list_entry(tmp, mddev_t, all_mddevs); | ||
2983 | mddev_get(mddev); | ||
2984 | spin_unlock(&all_mddevs_lock); | ||
2985 | return mddev; | ||
2986 | } | ||
2987 | spin_unlock(&all_mddevs_lock); | ||
2988 | if (!l--) | ||
2989 | return (void*)2;/* tail */ | ||
2990 | return NULL; | ||
2991 | } | ||
2992 | |||
2993 | static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
2994 | { | ||
2995 | struct list_head *tmp; | ||
2996 | mddev_t *next_mddev, *mddev = v; | ||
2997 | |||
2998 | ++*pos; | ||
2999 | if (v == (void*)2) | ||
3000 | return NULL; | ||
3001 | |||
3002 | spin_lock(&all_mddevs_lock); | ||
3003 | if (v == (void*)1) | ||
3004 | tmp = all_mddevs.next; | ||
3005 | else | ||
3006 | tmp = mddev->all_mddevs.next; | ||
3007 | if (tmp != &all_mddevs) | ||
3008 | next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); | ||
3009 | else { | ||
3010 | next_mddev = (void*)2; | ||
3011 | *pos = 0x10000; | ||
3012 | } | ||
3013 | spin_unlock(&all_mddevs_lock); | ||
3014 | |||
3015 | if (v != (void*)1) | ||
3016 | mddev_put(mddev); | ||
3017 | return next_mddev; | ||
3018 | |||
3019 | } | ||
3020 | |||
3021 | static void md_seq_stop(struct seq_file *seq, void *v) | ||
3022 | { | ||
3023 | mddev_t *mddev = v; | ||
3024 | |||
3025 | if (mddev && v != (void*)1 && v != (void*)2) | ||
3026 | mddev_put(mddev); | ||
3027 | } | ||
3028 | |||
3029 | static int md_seq_show(struct seq_file *seq, void *v) | ||
3030 | { | ||
3031 | mddev_t *mddev = v; | ||
3032 | sector_t size; | ||
3033 | struct list_head *tmp2; | ||
3034 | mdk_rdev_t *rdev; | ||
3035 | int i; | ||
3036 | |||
3037 | if (v == (void*)1) { | ||
3038 | seq_printf(seq, "Personalities : "); | ||
3039 | spin_lock(&pers_lock); | ||
3040 | for (i = 0; i < MAX_PERSONALITY; i++) | ||
3041 | if (pers[i]) | ||
3042 | seq_printf(seq, "[%s] ", pers[i]->name); | ||
3043 | |||
3044 | spin_unlock(&pers_lock); | ||
3045 | seq_printf(seq, "\n"); | ||
3046 | return 0; | ||
3047 | } | ||
3048 | if (v == (void*)2) { | ||
3049 | status_unused(seq); | ||
3050 | return 0; | ||
3051 | } | ||
3052 | |||
3053 | if (mddev_lock(mddev)!=0) | ||
3054 | return -EINTR; | ||
3055 | if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { | ||
3056 | seq_printf(seq, "%s : %sactive", mdname(mddev), | ||
3057 | mddev->pers ? "" : "in"); | ||
3058 | if (mddev->pers) { | ||
3059 | if (mddev->ro) | ||
3060 | seq_printf(seq, " (read-only)"); | ||
3061 | seq_printf(seq, " %s", mddev->pers->name); | ||
3062 | } | ||
3063 | |||
3064 | size = 0; | ||
3065 | ITERATE_RDEV(mddev,rdev,tmp2) { | ||
3066 | char b[BDEVNAME_SIZE]; | ||
3067 | seq_printf(seq, " %s[%d]", | ||
3068 | bdevname(rdev->bdev,b), rdev->desc_nr); | ||
3069 | if (rdev->faulty) { | ||
3070 | seq_printf(seq, "(F)"); | ||
3071 | continue; | ||
3072 | } | ||
3073 | size += rdev->size; | ||
3074 | } | ||
3075 | |||
3076 | if (!list_empty(&mddev->disks)) { | ||
3077 | if (mddev->pers) | ||
3078 | seq_printf(seq, "\n %llu blocks", | ||
3079 | (unsigned long long)mddev->array_size); | ||
3080 | else | ||
3081 | seq_printf(seq, "\n %llu blocks", | ||
3082 | (unsigned long long)size); | ||
3083 | } | ||
3084 | |||
3085 | if (mddev->pers) { | ||
3086 | mddev->pers->status (seq, mddev); | ||
3087 | seq_printf(seq, "\n "); | ||
3088 | if (mddev->curr_resync > 2) | ||
3089 | status_resync (seq, mddev); | ||
3090 | else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) | ||
3091 | seq_printf(seq, " resync=DELAYED"); | ||
3092 | } | ||
3093 | |||
3094 | seq_printf(seq, "\n"); | ||
3095 | } | ||
3096 | mddev_unlock(mddev); | ||
3097 | |||
3098 | return 0; | ||
3099 | } | ||
3100 | |||
3101 | static struct seq_operations md_seq_ops = { | ||
3102 | .start = md_seq_start, | ||
3103 | .next = md_seq_next, | ||
3104 | .stop = md_seq_stop, | ||
3105 | .show = md_seq_show, | ||
3106 | }; | ||
3107 | |||
3108 | static int md_seq_open(struct inode *inode, struct file *file) | ||
3109 | { | ||
3110 | int error; | ||
3111 | |||
3112 | error = seq_open(file, &md_seq_ops); | ||
3113 | return error; | ||
3114 | } | ||
3115 | |||
3116 | static struct file_operations md_seq_fops = { | ||
3117 | .open = md_seq_open, | ||
3118 | .read = seq_read, | ||
3119 | .llseek = seq_lseek, | ||
3120 | .release = seq_release, | ||
3121 | }; | ||
3122 | |||
3123 | int register_md_personality(int pnum, mdk_personality_t *p) | ||
3124 | { | ||
3125 | if (pnum >= MAX_PERSONALITY) { | ||
3126 | printk(KERN_ERR | ||
3127 | "md: tried to install personality %s as nr %d, but max is %lu\n", | ||
3128 | p->name, pnum, MAX_PERSONALITY-1); | ||
3129 | return -EINVAL; | ||
3130 | } | ||
3131 | |||
3132 | spin_lock(&pers_lock); | ||
3133 | if (pers[pnum]) { | ||
3134 | spin_unlock(&pers_lock); | ||
3135 | MD_BUG(); | ||
3136 | return -EBUSY; | ||
3137 | } | ||
3138 | |||
3139 | pers[pnum] = p; | ||
3140 | printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); | ||
3141 | spin_unlock(&pers_lock); | ||
3142 | return 0; | ||
3143 | } | ||
3144 | |||
3145 | int unregister_md_personality(int pnum) | ||
3146 | { | ||
3147 | if (pnum >= MAX_PERSONALITY) { | ||
3148 | MD_BUG(); | ||
3149 | return -EINVAL; | ||
3150 | } | ||
3151 | |||
3152 | printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); | ||
3153 | spin_lock(&pers_lock); | ||
3154 | pers[pnum] = NULL; | ||
3155 | spin_unlock(&pers_lock); | ||
3156 | return 0; | ||
3157 | } | ||
3158 | |||
3159 | static int is_mddev_idle(mddev_t *mddev) | ||
3160 | { | ||
3161 | mdk_rdev_t * rdev; | ||
3162 | struct list_head *tmp; | ||
3163 | int idle; | ||
3164 | unsigned long curr_events; | ||
3165 | |||
3166 | idle = 1; | ||
3167 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
3168 | struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; | ||
3169 | curr_events = disk_stat_read(disk, read_sectors) + | ||
3170 | disk_stat_read(disk, write_sectors) - | ||
3171 | atomic_read(&disk->sync_io); | ||
3172 | /* Allow some slack between the value of curr_events and last_events, | ||
3173 | * as there are some uninteresting races. | ||
3174 | * Note: the following is an unsigned comparison. | ||
3175 | */ | ||
3176 | if ((curr_events - rdev->last_events + 32) > 64) { | ||
3177 | rdev->last_events = curr_events; | ||
3178 | idle = 0; | ||
3179 | } | ||
3180 | } | ||
3181 | return idle; | ||
3182 | } | ||
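The unsigned comparison noted in the comment above is a compact way of asking whether curr_events has drifted by more than 32 sectors in either direction since last_events (wrap-around makes negative drifts beyond -32 look huge). A small stand-alone demonstration with made-up sample values:

#include <stdio.h>

int main(void)
{
	unsigned long last = 1000;
	unsigned long samples[] = { 1000, 1020, 1040, 980, 960 };
	unsigned int i;

	for (i = 0; i < sizeof(samples)/sizeof(samples[0]); i++) {
		unsigned long curr = samples[i];
		int busy = (curr - last + 32) > 64;	/* same test as above */
		printf("curr=%lu -> %s\n", curr, busy ? "busy" : "idle");
	}
	return 0;
}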
3183 | |||
3184 | void md_done_sync(mddev_t *mddev, int blocks, int ok) | ||
3185 | { | ||
3186 | /* another "blocks" (512-byte) blocks have been synced */ | ||
3187 | atomic_sub(blocks, &mddev->recovery_active); | ||
3188 | wake_up(&mddev->recovery_wait); | ||
3189 | if (!ok) { | ||
3190 | set_bit(MD_RECOVERY_ERR, &mddev->recovery); | ||
3191 | md_wakeup_thread(mddev->thread); | ||
3192 | // stop recovery, signal do_sync .... | ||
3193 | } | ||
3194 | } | ||
3195 | |||
3196 | |||
3197 | void md_write_start(mddev_t *mddev) | ||
3198 | { | ||
3199 | if (!atomic_read(&mddev->writes_pending)) { | ||
3200 | mddev_lock_uninterruptible(mddev); | ||
3201 | if (mddev->in_sync) { | ||
3202 | mddev->in_sync = 0; | ||
3203 | del_timer(&mddev->safemode_timer); | ||
3204 | md_update_sb(mddev); | ||
3205 | } | ||
3206 | atomic_inc(&mddev->writes_pending); | ||
3207 | mddev_unlock(mddev); | ||
3208 | } else | ||
3209 | atomic_inc(&mddev->writes_pending); | ||
3210 | } | ||
3211 | |||
3212 | void md_write_end(mddev_t *mddev) | ||
3213 | { | ||
3214 | if (atomic_dec_and_test(&mddev->writes_pending)) { | ||
3215 | if (mddev->safemode == 2) | ||
3216 | md_wakeup_thread(mddev->thread); | ||
3217 | else | ||
3218 | mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); | ||
3219 | } | ||
3220 | } | ||
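The intended usage of this pair, sketched here rather than taken from any particular personality, is that application writes are bracketed by the two calls: md_write_start() before the first outstanding write pulls the array out of the in_sync state, and md_write_end() on completion lets the safemode timer mark it clean again once I/O goes quiet. A hypothetical make_request fragment showing only that bookkeeping:

/* Hypothetical personality make_request, sketching the safemode hooks only. */
static int myraid_make_request(request_queue_t *q, struct bio *bio)
{
	mddev_t *mddev = q->queuedata;

	if (bio_data_dir(bio) == WRITE)
		md_write_start(mddev);	/* clears in_sync, marks the sb dirty */

	/* ... map and submit the bio to the member devices here; the
	 * personality's completion handler calls md_write_end(mddev)
	 * once the write has really finished, (re)arming safemode_timer.
	 */
	return 0;
}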
3221 | |||
3222 | static inline void md_enter_safemode(mddev_t *mddev) | ||
3223 | { | ||
3224 | if (!mddev->safemode) return; | ||
3225 | if (mddev->safemode == 2 && | ||
3226 | (atomic_read(&mddev->writes_pending) || mddev->in_sync || | ||
3227 | mddev->recovery_cp != MaxSector)) | ||
3228 | return; /* avoid the lock */ | ||
3229 | mddev_lock_uninterruptible(mddev); | ||
3230 | if (mddev->safemode && !atomic_read(&mddev->writes_pending) && | ||
3231 | !mddev->in_sync && mddev->recovery_cp == MaxSector) { | ||
3232 | mddev->in_sync = 1; | ||
3233 | md_update_sb(mddev); | ||
3234 | } | ||
3235 | mddev_unlock(mddev); | ||
3236 | |||
3237 | if (mddev->safemode == 1) | ||
3238 | mddev->safemode = 0; | ||
3239 | } | ||
3240 | |||
3241 | void md_handle_safemode(mddev_t *mddev) | ||
3242 | { | ||
3243 | if (signal_pending(current)) { | ||
3244 | printk(KERN_INFO "md: %s in immediate safe mode\n", | ||
3245 | mdname(mddev)); | ||
3246 | mddev->safemode = 2; | ||
3247 | flush_signals(current); | ||
3248 | } | ||
3249 | md_enter_safemode(mddev); | ||
3250 | } | ||
3251 | |||
3252 | |||
3253 | DECLARE_WAIT_QUEUE_HEAD(resync_wait); | ||
3254 | |||
3255 | #define SYNC_MARKS 10 | ||
3256 | #define SYNC_MARK_STEP (3*HZ) | ||
3257 | static void md_do_sync(mddev_t *mddev) | ||
3258 | { | ||
3259 | mddev_t *mddev2; | ||
3260 | unsigned int currspeed = 0, | ||
3261 | window; | ||
3262 | sector_t max_sectors,j; | ||
3263 | unsigned long mark[SYNC_MARKS]; | ||
3264 | sector_t mark_cnt[SYNC_MARKS]; | ||
3265 | int last_mark,m; | ||
3266 | struct list_head *tmp; | ||
3267 | sector_t last_check; | ||
3268 | |||
3269 | /* just in case the thread restarts... */ | ||
3270 | if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) | ||
3271 | return; | ||
3272 | |||
3273 | /* we overload curr_resync somewhat here. | ||
3274 | * 0 == not engaged in resync at all | ||
3275 | * 2 == checking that there is no conflict with another sync | ||
3276 | * 1 == like 2, but have yielded to allow conflicting resync to | ||
3277 | * commence | ||
3278 | * other == active in resync - this many blocks | ||
3279 | * | ||
3280 | * Before starting a resync we must have set curr_resync to | ||
3281 | * 2, and then checked that every "conflicting" array has curr_resync | ||
3282 | * less than ours. When we find one that is the same or higher | ||
3283 | * we wait on resync_wait. To avoid deadlock, we reduce curr_resync | ||
3284 | * to 1 if we choose to yield (based arbitrarily on address of mddev structure). | ||
3285 | * This will mean we have to start checking from the beginning again. | ||
3286 | * | ||
3287 | */ | ||
3288 | |||
3289 | do { | ||
3290 | mddev->curr_resync = 2; | ||
3291 | |||
3292 | try_again: | ||
3293 | if (signal_pending(current)) { | ||
3294 | flush_signals(current); | ||
3295 | goto skip; | ||
3296 | } | ||
3297 | ITERATE_MDDEV(mddev2,tmp) { | ||
3298 | printk("."); | ||
3299 | if (mddev2 == mddev) | ||
3300 | continue; | ||
3301 | if (mddev2->curr_resync && | ||
3302 | match_mddev_units(mddev,mddev2)) { | ||
3303 | DEFINE_WAIT(wq); | ||
3304 | if (mddev < mddev2 && mddev->curr_resync == 2) { | ||
3305 | /* arbitrarily yield */ | ||
3306 | mddev->curr_resync = 1; | ||
3307 | wake_up(&resync_wait); | ||
3308 | } | ||
3309 | if (mddev > mddev2 && mddev->curr_resync == 1) | ||
3310 | /* no need to wait here, we can wait the next | ||
3311 | * time 'round when curr_resync == 2 | ||
3312 | */ | ||
3313 | continue; | ||
3314 | prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); | ||
3315 | if (!signal_pending(current) | ||
3316 | && mddev2->curr_resync >= mddev->curr_resync) { | ||
3317 | printk(KERN_INFO "md: delaying resync of %s" | ||
3318 | " until %s has finished resync (they" | ||
3319 | " share one or more physical units)\n", | ||
3320 | mdname(mddev), mdname(mddev2)); | ||
3321 | mddev_put(mddev2); | ||
3322 | schedule(); | ||
3323 | finish_wait(&resync_wait, &wq); | ||
3324 | goto try_again; | ||
3325 | } | ||
3326 | finish_wait(&resync_wait, &wq); | ||
3327 | } | ||
3328 | } | ||
3329 | } while (mddev->curr_resync < 2); | ||
3330 | |||
3331 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | ||
3332 | /* resync follows the size requested by the personality, | ||
3333 | * which defaults to the physical size, but can be a virtual size | ||
3334 | */ | ||
3335 | max_sectors = mddev->resync_max_sectors; | ||
3336 | else | ||
3337 | /* recovery follows the physical size of devices */ | ||
3338 | max_sectors = mddev->size << 1; | ||
3339 | |||
3340 | printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); | ||
3341 | printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" | ||
3342 | " %d KB/sec/disc.\n", sysctl_speed_limit_min); | ||
3343 | printk(KERN_INFO "md: using maximum available idle IO bandwidth " | ||
3344 | "(but not more than %d KB/sec) for reconstruction.\n", | ||
3345 | sysctl_speed_limit_max); | ||
3346 | |||
3347 | is_mddev_idle(mddev); /* this also initializes IO event counters */ | ||
3348 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | ||
3349 | j = mddev->recovery_cp; | ||
3350 | else | ||
3351 | j = 0; | ||
3352 | for (m = 0; m < SYNC_MARKS; m++) { | ||
3353 | mark[m] = jiffies; | ||
3354 | mark_cnt[m] = j; | ||
3355 | } | ||
3356 | last_mark = 0; | ||
3357 | mddev->resync_mark = mark[last_mark]; | ||
3358 | mddev->resync_mark_cnt = mark_cnt[last_mark]; | ||
3359 | |||
3360 | /* | ||
3361 | * Tune reconstruction: | ||
3362 | */ | ||
3363 | window = 32*(PAGE_SIZE/512); | ||
3364 | printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", | ||
3365 | window/2,(unsigned long long) max_sectors/2); | ||
3366 | |||
3367 | atomic_set(&mddev->recovery_active, 0); | ||
3368 | init_waitqueue_head(&mddev->recovery_wait); | ||
3369 | last_check = 0; | ||
3370 | |||
3371 | if (j>2) { | ||
3372 | printk(KERN_INFO | ||
3373 | "md: resuming recovery of %s from checkpoint.\n", | ||
3374 | mdname(mddev)); | ||
3375 | mddev->curr_resync = j; | ||
3376 | } | ||
3377 | |||
3378 | while (j < max_sectors) { | ||
3379 | int sectors; | ||
3380 | |||
3381 | sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); | ||
3382 | if (sectors < 0) { | ||
3383 | set_bit(MD_RECOVERY_ERR, &mddev->recovery); | ||
3384 | goto out; | ||
3385 | } | ||
3386 | atomic_add(sectors, &mddev->recovery_active); | ||
3387 | j += sectors; | ||
3388 | if (j>1) mddev->curr_resync = j; | ||
3389 | |||
3390 | if (last_check + window > j || j == max_sectors) | ||
3391 | continue; | ||
3392 | |||
3393 | last_check = j; | ||
3394 | |||
3395 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || | ||
3396 | test_bit(MD_RECOVERY_ERR, &mddev->recovery)) | ||
3397 | break; | ||
3398 | |||
3399 | repeat: | ||
3400 | if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { | ||
3401 | /* step marks */ | ||
3402 | int next = (last_mark+1) % SYNC_MARKS; | ||
3403 | |||
3404 | mddev->resync_mark = mark[next]; | ||
3405 | mddev->resync_mark_cnt = mark_cnt[next]; | ||
3406 | mark[next] = jiffies; | ||
3407 | mark_cnt[next] = j - atomic_read(&mddev->recovery_active); | ||
3408 | last_mark = next; | ||
3409 | } | ||
3410 | |||
3411 | |||
3412 | if (signal_pending(current)) { | ||
3413 | /* | ||
3414 | * got a signal, exit. | ||
3415 | */ | ||
3416 | printk(KERN_INFO | ||
3417 | "md: md_do_sync() got signal ... exiting\n"); | ||
3418 | flush_signals(current); | ||
3419 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
3420 | goto out; | ||
3421 | } | ||
3422 | |||
3423 | /* | ||
3424 | * this loop exits only when either we are slower than | ||
3425 | * the 'hard' speed limit, or the system was IO-idle for | ||
3426 | * a jiffy. | ||
3427 | * the system might be non-idle CPU-wise, but we only care | ||
3428 | * about not overloading the IO subsystem. (things like an | ||
3429 | * e2fsck being done on the RAID array should execute fast) | ||
3430 | */ | ||
3431 | mddev->queue->unplug_fn(mddev->queue); | ||
3432 | cond_resched(); | ||
3433 | |||
3434 | currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1; | ||
3435 | |||
3436 | if (currspeed > sysctl_speed_limit_min) { | ||
3437 | if ((currspeed > sysctl_speed_limit_max) || | ||
3438 | !is_mddev_idle(mddev)) { | ||
3439 | msleep_interruptible(250); | ||
3440 | goto repeat; | ||
3441 | } | ||
3442 | } | ||
3443 | } | ||
3444 | printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev)); | ||
3445 | /* | ||
3446 | * this also signals 'finished resyncing' to md_stop | ||
3447 | */ | ||
3448 | out: | ||
3449 | mddev->queue->unplug_fn(mddev->queue); | ||
3450 | |||
3451 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); | ||
3452 | |||
3453 | /* tell personality that we are finished */ | ||
3454 | mddev->pers->sync_request(mddev, max_sectors, 1); | ||
3455 | |||
3456 | if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && | ||
3457 | mddev->curr_resync > 2 && | ||
3458 | mddev->curr_resync >= mddev->recovery_cp) { | ||
3459 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | ||
3460 | printk(KERN_INFO | ||
3461 | "md: checkpointing recovery of %s.\n", | ||
3462 | mdname(mddev)); | ||
3463 | mddev->recovery_cp = mddev->curr_resync; | ||
3464 | } else | ||
3465 | mddev->recovery_cp = MaxSector; | ||
3466 | } | ||
3467 | |||
3468 | md_enter_safemode(mddev); | ||
3469 | skip: | ||
3470 | mddev->curr_resync = 0; | ||
3471 | wake_up(&resync_wait); | ||
3472 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); | ||
3473 | md_wakeup_thread(mddev->thread); | ||
3474 | } | ||
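The throttle at the bottom of md_do_sync() works in KB/sec derived from sector and jiffy deltas: sectors/2 gives KB (512-byte sectors), and the two "+1" terms guard against division by zero and a zero rate on the first pass. The user-space sketch below reproduces that arithmetic with an assumed HZ and hypothetical sample values; it is an illustration only, not part of the patch.

#include <stdio.h>

#define HZ 1000                                 /* assumed tick rate for this example */

/* KB/sec, mirroring the driver's arithmetic: sectors/2 = KB, "+1" avoids div-by-zero */
static unsigned long resync_speed_kbs(unsigned long done_sectors,
                                      unsigned long mark_sectors,
                                      unsigned long now_jiffies,
                                      unsigned long mark_jiffies)
{
        return (done_sectors - mark_sectors) / 2 /
               ((now_jiffies - mark_jiffies) / HZ + 1) + 1;
}

int main(void)
{
        unsigned long speed_min = 1000, speed_max = 200000;     /* KB/sec limits */
        /* pretend 409600 sectors (200 MB) were synced in two seconds */
        unsigned long speed = resync_speed_kbs(409600, 0, 2 * HZ, 0);
        int io_idle = 0;                                        /* array is busy */

        printf("current speed: %lu KB/sec\n", speed);
        if (speed > speed_min && (speed > speed_max || !io_idle))
                printf("-> would sleep 250ms and re-check (throttled)\n");
        else
                printf("-> would keep syncing at full rate\n");
        return 0;
}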
3475 | |||
3476 | |||
3477 | /* | ||
3478 | * This routine is regularly called by all per-raid-array threads to | ||
3479 | * deal with generic issues like resync and super-block update. | ||
3480 | * Raid personalities that don't have a thread (linear/raid0) do not | ||
3481 | * need this as they never do any recovery or update the superblock. | ||
3482 | * | ||
3483 | * It does not do any resync itself, but rather "forks" off other threads | ||
3484 | * to do that as needed. | ||
3485 | * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in | ||
3486 | * "->recovery" and create a thread at ->sync_thread. | ||
3487 | * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) | ||
3488 | * and wakes up this thread, which will reap the thread and finish up. | ||
3489 | * This thread also removes any faulty devices (with nr_pending == 0). | ||
3490 | * | ||
3491 | * The overall approach is: | ||
3492 | * 1/ if the superblock needs updating, update it. | ||
3493 | * 2/ If a recovery thread is running, don't do anything else. | ||
3494 | * 3/ If recovery has finished, clean up, possibly marking spares active. | ||
3495 | * 4/ If there are any faulty devices, remove them. | ||
3496 | * 5/ If array is degraded, try to add spare devices | ||
3497 | * 6/ If array has spares or is not in-sync, start a resync thread. | ||
3498 | */ | ||
3499 | void md_check_recovery(mddev_t *mddev) | ||
3500 | { | ||
3501 | mdk_rdev_t *rdev; | ||
3502 | struct list_head *rtmp; | ||
3503 | |||
3504 | |||
3505 | dprintk(KERN_INFO "md: recovery thread got woken up ...\n"); | ||
3506 | |||
3507 | if (mddev->ro) | ||
3508 | return; | ||
3509 | if ( ! ( | ||
3510 | mddev->sb_dirty || | ||
3511 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || | ||
3512 | test_bit(MD_RECOVERY_DONE, &mddev->recovery) | ||
3513 | )) | ||
3514 | return; | ||
3515 | if (mddev_trylock(mddev)==0) { | ||
3516 | int spares =0; | ||
3517 | if (mddev->sb_dirty) | ||
3518 | md_update_sb(mddev); | ||
3519 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && | ||
3520 | !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { | ||
3521 | /* resync/recovery still happening */ | ||
3522 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
3523 | goto unlock; | ||
3524 | } | ||
3525 | if (mddev->sync_thread) { | ||
3526 | /* resync has finished, collect result */ | ||
3527 | md_unregister_thread(mddev->sync_thread); | ||
3528 | mddev->sync_thread = NULL; | ||
3529 | if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && | ||
3530 | !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | ||
3531 | /* success...*/ | ||
3532 | /* activate any spares */ | ||
3533 | mddev->pers->spare_active(mddev); | ||
3534 | } | ||
3535 | md_update_sb(mddev); | ||
3536 | mddev->recovery = 0; | ||
3537 | /* flag recovery needed just to double check */ | ||
3538 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
3539 | goto unlock; | ||
3540 | } | ||
3541 | if (mddev->recovery) | ||
3542 | /* probably just the RECOVERY_NEEDED flag */ | ||
3543 | mddev->recovery = 0; | ||
3544 | |||
3545 | /* no recovery is running. | ||
3546 | * remove any failed drives, then | ||
3547 | * add spares if possible. | ||
3548 | * Spares are also removed and re-added, to allow | ||
3549 | * the personality to fail the re-add. | ||
3550 | */ | ||
3551 | ITERATE_RDEV(mddev,rdev,rtmp) | ||
3552 | if (rdev->raid_disk >= 0 && | ||
3553 | (rdev->faulty || ! rdev->in_sync) && | ||
3554 | atomic_read(&rdev->nr_pending)==0) { | ||
3555 | if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) | ||
3556 | rdev->raid_disk = -1; | ||
3557 | } | ||
3558 | |||
3559 | if (mddev->degraded) { | ||
3560 | ITERATE_RDEV(mddev,rdev,rtmp) | ||
3561 | if (rdev->raid_disk < 0 | ||
3562 | && !rdev->faulty) { | ||
3563 | if (mddev->pers->hot_add_disk(mddev,rdev)) | ||
3564 | spares++; | ||
3565 | else | ||
3566 | break; | ||
3567 | } | ||
3568 | } | ||
3569 | |||
3570 | if (!spares && (mddev->recovery_cp == MaxSector )) { | ||
3571 | /* nothing we can do ... */ | ||
3572 | goto unlock; | ||
3573 | } | ||
3574 | if (mddev->pers->sync_request) { | ||
3575 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
3576 | if (!spares) | ||
3577 | set_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
3578 | mddev->sync_thread = md_register_thread(md_do_sync, | ||
3579 | mddev, | ||
3580 | "%s_resync"); | ||
3581 | if (!mddev->sync_thread) { | ||
3582 | printk(KERN_ERR "%s: could not start resync" | ||
3583 | " thread...\n", | ||
3584 | mdname(mddev)); | ||
3585 | /* leave the spares where they are, it shouldn't hurt */ | ||
3586 | mddev->recovery = 0; | ||
3587 | } else { | ||
3588 | md_wakeup_thread(mddev->sync_thread); | ||
3589 | } | ||
3590 | } | ||
3591 | unlock: | ||
3592 | mddev_unlock(mddev); | ||
3593 | } | ||
3594 | } | ||
3595 | |||
3596 | int md_notify_reboot(struct notifier_block *this, | ||
3597 | unsigned long code, void *x) | ||
3598 | { | ||
3599 | struct list_head *tmp; | ||
3600 | mddev_t *mddev; | ||
3601 | |||
3602 | if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { | ||
3603 | |||
3604 | printk(KERN_INFO "md: stopping all md devices.\n"); | ||
3605 | |||
3606 | ITERATE_MDDEV(mddev,tmp) | ||
3607 | if (mddev_trylock(mddev)==0) | ||
3608 | do_md_stop (mddev, 1); | ||
3609 | /* | ||
3610 | * certain more exotic SCSI devices are known to be | ||
3611 | * volatile wrt too early system reboots. While the | ||
3612 | * right place to handle this issue is the given | ||
3613 | * driver, we do want to have a safe RAID driver ... | ||
3614 | */ | ||
3615 | mdelay(1000*1); | ||
3616 | } | ||
3617 | return NOTIFY_DONE; | ||
3618 | } | ||
3619 | |||
3620 | struct notifier_block md_notifier = { | ||
3621 | .notifier_call = md_notify_reboot, | ||
3622 | .next = NULL, | ||
3623 | .priority = INT_MAX, /* before any real devices */ | ||
3624 | }; | ||
3625 | |||
3626 | static void md_geninit(void) | ||
3627 | { | ||
3628 | struct proc_dir_entry *p; | ||
3629 | |||
3630 | dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); | ||
3631 | |||
3632 | p = create_proc_entry("mdstat", S_IRUGO, NULL); | ||
3633 | if (p) | ||
3634 | p->proc_fops = &md_seq_fops; | ||
3635 | } | ||
3636 | |||
3637 | int __init md_init(void) | ||
3638 | { | ||
3639 | int minor; | ||
3640 | |||
3641 | printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," | ||
3642 | " MD_SB_DISKS=%d\n", | ||
3643 | MD_MAJOR_VERSION, MD_MINOR_VERSION, | ||
3644 | MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); | ||
3645 | |||
3646 | if (register_blkdev(MAJOR_NR, "md")) | ||
3647 | return -1; | ||
3648 | if ((mdp_major=register_blkdev(0, "mdp"))<=0) { | ||
3649 | unregister_blkdev(MAJOR_NR, "md"); | ||
3650 | return -1; | ||
3651 | } | ||
3652 | devfs_mk_dir("md"); | ||
3653 | blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, | ||
3654 | md_probe, NULL, NULL); | ||
3655 | blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, | ||
3656 | md_probe, NULL, NULL); | ||
3657 | |||
3658 | for (minor=0; minor < MAX_MD_DEVS; ++minor) | ||
3659 | devfs_mk_bdev(MKDEV(MAJOR_NR, minor), | ||
3660 | S_IFBLK|S_IRUSR|S_IWUSR, | ||
3661 | "md/%d", minor); | ||
3662 | |||
3663 | for (minor=0; minor < MAX_MD_DEVS; ++minor) | ||
3664 | devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift), | ||
3665 | S_IFBLK|S_IRUSR|S_IWUSR, | ||
3666 | "md/mdp%d", minor); | ||
3667 | |||
3668 | |||
3669 | register_reboot_notifier(&md_notifier); | ||
3670 | raid_table_header = register_sysctl_table(raid_root_table, 1); | ||
3671 | |||
3672 | md_geninit(); | ||
3673 | return (0); | ||
3674 | } | ||
3675 | |||
3676 | |||
3677 | #ifndef MODULE | ||
3678 | |||
3679 | /* | ||
3680 | * Searches all registered partitions for autorun RAID arrays | ||
3681 | * at boot time. | ||
3682 | */ | ||
3683 | static dev_t detected_devices[128]; | ||
3684 | static int dev_cnt; | ||
3685 | |||
3686 | void md_autodetect_dev(dev_t dev) | ||
3687 | { | ||
3688 | if (dev_cnt >= 0 && dev_cnt < 127) | ||
3689 | detected_devices[dev_cnt++] = dev; | ||
3690 | } | ||
3691 | |||
3692 | |||
3693 | static void autostart_arrays(int part) | ||
3694 | { | ||
3695 | mdk_rdev_t *rdev; | ||
3696 | int i; | ||
3697 | |||
3698 | printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); | ||
3699 | |||
3700 | for (i = 0; i < dev_cnt; i++) { | ||
3701 | dev_t dev = detected_devices[i]; | ||
3702 | |||
3703 | rdev = md_import_device(dev,0, 0); | ||
3704 | if (IS_ERR(rdev)) | ||
3705 | continue; | ||
3706 | |||
3707 | if (rdev->faulty) { | ||
3708 | MD_BUG(); | ||
3709 | continue; | ||
3710 | } | ||
3711 | list_add(&rdev->same_set, &pending_raid_disks); | ||
3712 | } | ||
3713 | dev_cnt = 0; | ||
3714 | |||
3715 | autorun_devices(part); | ||
3716 | } | ||
3717 | |||
3718 | #endif | ||
3719 | |||
3720 | static __exit void md_exit(void) | ||
3721 | { | ||
3722 | mddev_t *mddev; | ||
3723 | struct list_head *tmp; | ||
3724 | int i; | ||
3725 | blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); | ||
3726 | blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); | ||
3727 | for (i=0; i < MAX_MD_DEVS; i++) | ||
3728 | devfs_remove("md/%d", i); | ||
3729 | for (i=0; i < MAX_MD_DEVS; i++) | ||
3730 | devfs_remove("md/d%d", i); | ||
3731 | |||
3732 | devfs_remove("md"); | ||
3733 | |||
3734 | unregister_blkdev(MAJOR_NR,"md"); | ||
3735 | unregister_blkdev(mdp_major, "mdp"); | ||
3736 | unregister_reboot_notifier(&md_notifier); | ||
3737 | unregister_sysctl_table(raid_table_header); | ||
3738 | remove_proc_entry("mdstat", NULL); | ||
3739 | ITERATE_MDDEV(mddev,tmp) { | ||
3740 | struct gendisk *disk = mddev->gendisk; | ||
3741 | if (!disk) | ||
3742 | continue; | ||
3743 | export_array(mddev); | ||
3744 | del_gendisk(disk); | ||
3745 | put_disk(disk); | ||
3746 | mddev->gendisk = NULL; | ||
3747 | mddev_put(mddev); | ||
3748 | } | ||
3749 | } | ||
3750 | |||
3751 | module_init(md_init) | ||
3752 | module_exit(md_exit) | ||
3753 | |||
3754 | EXPORT_SYMBOL(register_md_personality); | ||
3755 | EXPORT_SYMBOL(unregister_md_personality); | ||
3756 | EXPORT_SYMBOL(md_error); | ||
3757 | EXPORT_SYMBOL(md_done_sync); | ||
3758 | EXPORT_SYMBOL(md_write_start); | ||
3759 | EXPORT_SYMBOL(md_write_end); | ||
3760 | EXPORT_SYMBOL(md_handle_safemode); | ||
3761 | EXPORT_SYMBOL(md_register_thread); | ||
3762 | EXPORT_SYMBOL(md_unregister_thread); | ||
3763 | EXPORT_SYMBOL(md_wakeup_thread); | ||
3764 | EXPORT_SYMBOL(md_print_devices); | ||
3765 | EXPORT_SYMBOL(md_check_recovery); | ||
3766 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c new file mode 100644 index 000000000000..adef299908cf --- /dev/null +++ b/drivers/md/mktables.c | |||
@@ -0,0 +1,125 @@ | |||
1 | #ident "$Id: mktables.c,v 1.2 2002/12/12 22:41:27 hpa Exp $" | ||
2 | /* ----------------------------------------------------------------------- * | ||
3 | * | ||
4 | * Copyright 2002 H. Peter Anvin - All Rights Reserved | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
9 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
10 | * (at your option) any later version; incorporated herein by reference. | ||
11 | * | ||
12 | * ----------------------------------------------------------------------- */ | ||
13 | |||
14 | /* | ||
15 | * mktables.c | ||
16 | * | ||
17 | * Make RAID-6 tables. This is a host user space program to be run at | ||
18 | * compile time. | ||
19 | */ | ||
20 | |||
21 | #include <stdio.h> | ||
22 | #include <string.h> | ||
23 | #include <inttypes.h> | ||
24 | #include <stdlib.h> | ||
25 | #include <time.h> | ||
26 | |||
27 | static uint8_t gfmul(uint8_t a, uint8_t b) | ||
28 | { | ||
29 | uint8_t v = 0; | ||
30 | |||
31 | while ( b ) { | ||
32 | if ( b & 1 ) v ^= a; | ||
33 | a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); | ||
34 | b >>= 1; | ||
35 | } | ||
36 | return v; | ||
37 | } | ||
38 | |||
39 | static uint8_t gfpow(uint8_t a, int b) | ||
40 | { | ||
41 | uint8_t v = 1; | ||
42 | |||
43 | b %= 255; | ||
44 | if ( b < 0 ) | ||
45 | b += 255; | ||
46 | |||
47 | while ( b ) { | ||
48 | if ( b & 1 ) v = gfmul(v,a); | ||
49 | a = gfmul(a,a); | ||
50 | b >>= 1; | ||
51 | } | ||
52 | return v; | ||
53 | } | ||
54 | |||
55 | int main(int argc, char *argv[]) | ||
56 | { | ||
57 | int i, j, k; | ||
58 | uint8_t v; | ||
59 | uint8_t exptbl[256], invtbl[256]; | ||
60 | |||
61 | printf("#include \"raid6.h\"\n"); | ||
62 | |||
63 | /* Compute multiplication table */ | ||
64 | printf("\nconst u8 __attribute__((aligned(256)))\n" | ||
65 | "raid6_gfmul[256][256] =\n" | ||
66 | "{\n"); | ||
67 | for ( i = 0 ; i < 256 ; i++ ) { | ||
68 | printf("\t{\n"); | ||
69 | for ( j = 0 ; j < 256 ; j += 8 ) { | ||
70 | printf("\t\t"); | ||
71 | for ( k = 0 ; k < 8 ; k++ ) { | ||
72 | printf("0x%02x, ", gfmul(i,j+k)); | ||
73 | } | ||
74 | printf("\n"); | ||
75 | } | ||
76 | printf("\t},\n"); | ||
77 | } | ||
78 | printf("};\n"); | ||
79 | |||
80 | /* Compute power-of-2 table (exponent) */ | ||
81 | v = 1; | ||
82 | printf("\nconst u8 __attribute__((aligned(256)))\n" | ||
83 | "raid6_gfexp[256] =\n" | ||
84 | "{\n"); | ||
85 | for ( i = 0 ; i < 256 ; i += 8 ) { | ||
86 | printf("\t"); | ||
87 | for ( j = 0 ; j < 8 ; j++ ) { | ||
88 | exptbl[i+j] = v; | ||
89 | printf("0x%02x, ", v); | ||
90 | v = gfmul(v,2); | ||
91 | if ( v == 1 ) v = 0; /* For entry 255, not a real entry */ | ||
92 | } | ||
93 | printf("\n"); | ||
94 | } | ||
95 | printf("};\n"); | ||
96 | |||
97 | /* Compute inverse table x^-1 == x^254 */ | ||
98 | printf("\nconst u8 __attribute__((aligned(256)))\n" | ||
99 | "raid6_gfinv[256] =\n" | ||
100 | "{\n"); | ||
101 | for ( i = 0 ; i < 256 ; i += 8 ) { | ||
102 | printf("\t"); | ||
103 | for ( j = 0 ; j < 8 ; j++ ) { | ||
104 | invtbl[i+j] = v = gfpow(i+j,254); | ||
105 | printf("0x%02x, ", v); | ||
106 | } | ||
107 | printf("\n"); | ||
108 | } | ||
109 | printf("};\n"); | ||
110 | |||
111 | /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ | ||
112 | printf("\nconst u8 __attribute__((aligned(256)))\n" | ||
113 | "raid6_gfexi[256] =\n" | ||
114 | "{\n"); | ||
115 | for ( i = 0 ; i < 256 ; i += 8 ) { | ||
116 | printf("\t"); | ||
117 | for ( j = 0 ; j < 8 ; j++ ) { | ||
118 | printf("0x%02x, ", invtbl[exptbl[i+j]^1]); | ||
119 | } | ||
120 | printf("\n"); | ||
121 | } | ||
122 | printf("};\n\n"); | ||
123 | |||
124 | return 0; | ||
125 | } | ||
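The inverse table generated above relies on the fact that the 255 non-zero elements of GF(2^8) form a cyclic multiplicative group, so a^255 == 1 and hence a^254 == a^-1 for every non-zero a; the field uses the RAID-6 polynomial 0x11d, reduced by xoring 0x1d on overflow. The small standalone check below, reusing the same multiply/power helpers, verifies that identity; it is a sketch for illustration, not part of the patch.

#include <stdio.h>
#include <stdint.h>

/* multiply in GF(2^8), reducing by the RAID-6 polynomial 0x11d */
static uint8_t gfmul(uint8_t a, uint8_t b)
{
        uint8_t v = 0;

        while (b) {
                if (b & 1)
                        v ^= a;
                a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
                b >>= 1;
        }
        return v;
}

/* a^b by square-and-multiply (b assumed non-negative here) */
static uint8_t gfpow(uint8_t a, int b)
{
        uint8_t v = 1;

        while (b) {
                if (b & 1)
                        v = gfmul(v, a);
                a = gfmul(a, a);
                b >>= 1;
        }
        return v;
}

int main(void)
{
        int a, bad = 0;

        for (a = 1; a < 256; a++)
                if (gfmul((uint8_t)a, gfpow((uint8_t)a, 254)) != 1)
                        bad++;
        printf("%d non-zero elements failed the a * a^254 == 1 check\n", bad);
        return bad ? 1 : 0;
}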
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c new file mode 100644 index 000000000000..c9b134cd1532 --- /dev/null +++ b/drivers/md/multipath.c | |||
@@ -0,0 +1,584 @@ | |||
1 | /* | ||
2 | * multipath.c : Multiple Devices driver for Linux | ||
3 | * | ||
4 | * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat | ||
5 | * | ||
6 | * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman | ||
7 | * | ||
8 | * MULTIPATH management functions. | ||
9 | * | ||
10 | * derived from raid1.c. | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License as published by | ||
14 | * the Free Software Foundation; either version 2, or (at your option) | ||
15 | * any later version. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
19 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
20 | */ | ||
21 | |||
22 | #include <linux/module.h> | ||
23 | #include <linux/slab.h> | ||
24 | #include <linux/spinlock.h> | ||
25 | #include <linux/raid/multipath.h> | ||
26 | #include <linux/buffer_head.h> | ||
27 | #include <asm/atomic.h> | ||
28 | |||
29 | #define MAJOR_NR MD_MAJOR | ||
30 | #define MD_DRIVER | ||
31 | #define MD_PERSONALITY | ||
32 | |||
33 | #define MAX_WORK_PER_DISK 128 | ||
34 | |||
35 | #define NR_RESERVED_BUFS 32 | ||
36 | |||
37 | |||
38 | static mdk_personality_t multipath_personality; | ||
39 | |||
40 | |||
41 | static void *mp_pool_alloc(unsigned int __nocast gfp_flags, void *data) | ||
42 | { | ||
43 | struct multipath_bh *mpb; | ||
44 | mpb = kmalloc(sizeof(*mpb), gfp_flags); | ||
45 | if (mpb) | ||
46 | memset(mpb, 0, sizeof(*mpb)); | ||
47 | return mpb; | ||
48 | } | ||
49 | |||
50 | static void mp_pool_free(void *mpb, void *data) | ||
51 | { | ||
52 | kfree(mpb); | ||
53 | } | ||
54 | |||
55 | static int multipath_map (multipath_conf_t *conf) | ||
56 | { | ||
57 | int i, disks = conf->raid_disks; | ||
58 | |||
59 | /* | ||
60 | * Later we do read balancing on the read side | ||
61 | * now we use the first available disk. | ||
62 | */ | ||
63 | |||
64 | rcu_read_lock(); | ||
65 | for (i = 0; i < disks; i++) { | ||
66 | mdk_rdev_t *rdev = conf->multipaths[i].rdev; | ||
67 | if (rdev && rdev->in_sync) { | ||
68 | atomic_inc(&rdev->nr_pending); | ||
69 | rcu_read_unlock(); | ||
70 | return i; | ||
71 | } | ||
72 | } | ||
73 | rcu_read_unlock(); | ||
74 | |||
75 | printk(KERN_ERR "multipath_map(): no more operational IO paths?\n"); | ||
76 | return (-1); | ||
77 | } | ||
78 | |||
79 | static void multipath_reschedule_retry (struct multipath_bh *mp_bh) | ||
80 | { | ||
81 | unsigned long flags; | ||
82 | mddev_t *mddev = mp_bh->mddev; | ||
83 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
84 | |||
85 | spin_lock_irqsave(&conf->device_lock, flags); | ||
86 | list_add(&mp_bh->retry_list, &conf->retry_list); | ||
87 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
88 | md_wakeup_thread(mddev->thread); | ||
89 | } | ||
90 | |||
91 | |||
92 | /* | ||
93 | * multipath_end_bh_io() is called when we have finished servicing a multipathed | ||
94 | * operation and are ready to return a success/failure code to the buffer | ||
95 | * cache layer. | ||
96 | */ | ||
97 | static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) | ||
98 | { | ||
99 | struct bio *bio = mp_bh->master_bio; | ||
100 | multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); | ||
101 | |||
102 | bio_endio(bio, bio->bi_size, err); | ||
103 | mempool_free(mp_bh, conf->pool); | ||
104 | } | ||
105 | |||
106 | int multipath_end_request(struct bio *bio, unsigned int bytes_done, int error) | ||
107 | { | ||
108 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
109 | struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); | ||
110 | multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); | ||
111 | mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; | ||
112 | |||
113 | if (bio->bi_size) | ||
114 | return 1; | ||
115 | |||
116 | if (uptodate) | ||
117 | multipath_end_bh_io(mp_bh, 0); | ||
118 | else if (!bio_rw_ahead(bio)) { | ||
119 | /* | ||
120 | * oops, IO error: | ||
121 | */ | ||
122 | char b[BDEVNAME_SIZE]; | ||
123 | md_error (mp_bh->mddev, rdev); | ||
124 | printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", | ||
125 | bdevname(rdev->bdev,b), | ||
126 | (unsigned long long)bio->bi_sector); | ||
127 | multipath_reschedule_retry(mp_bh); | ||
128 | } else | ||
129 | multipath_end_bh_io(mp_bh, error); | ||
130 | rdev_dec_pending(rdev, conf->mddev); | ||
131 | return 0; | ||
132 | } | ||
133 | |||
134 | static void unplug_slaves(mddev_t *mddev) | ||
135 | { | ||
136 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
137 | int i; | ||
138 | |||
139 | rcu_read_lock(); | ||
140 | for (i=0; i<mddev->raid_disks; i++) { | ||
141 | mdk_rdev_t *rdev = conf->multipaths[i].rdev; | ||
142 | if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { | ||
143 | request_queue_t *r_queue = bdev_get_queue(rdev->bdev); | ||
144 | |||
145 | atomic_inc(&rdev->nr_pending); | ||
146 | rcu_read_unlock(); | ||
147 | |||
148 | if (r_queue->unplug_fn) | ||
149 | r_queue->unplug_fn(r_queue); | ||
150 | |||
151 | rdev_dec_pending(rdev, mddev); | ||
152 | rcu_read_lock(); | ||
153 | } | ||
154 | } | ||
155 | rcu_read_unlock(); | ||
156 | } | ||
157 | |||
158 | static void multipath_unplug(request_queue_t *q) | ||
159 | { | ||
160 | unplug_slaves(q->queuedata); | ||
161 | } | ||
162 | |||
163 | |||
164 | static int multipath_make_request (request_queue_t *q, struct bio * bio) | ||
165 | { | ||
166 | mddev_t *mddev = q->queuedata; | ||
167 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
168 | struct multipath_bh * mp_bh; | ||
169 | struct multipath_info *multipath; | ||
170 | |||
171 | mp_bh = mempool_alloc(conf->pool, GFP_NOIO); | ||
172 | |||
173 | mp_bh->master_bio = bio; | ||
174 | mp_bh->mddev = mddev; | ||
175 | |||
176 | if (bio_data_dir(bio)==WRITE) { | ||
177 | disk_stat_inc(mddev->gendisk, writes); | ||
178 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); | ||
179 | } else { | ||
180 | disk_stat_inc(mddev->gendisk, reads); | ||
181 | disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); | ||
182 | } | ||
183 | |||
184 | mp_bh->path = multipath_map(conf); | ||
185 | if (mp_bh->path < 0) { | ||
186 | bio_endio(bio, bio->bi_size, -EIO); | ||
187 | mempool_free(mp_bh, conf->pool); | ||
188 | return 0; | ||
189 | } | ||
190 | multipath = conf->multipaths + mp_bh->path; | ||
191 | |||
192 | mp_bh->bio = *bio; | ||
193 | mp_bh->bio.bi_sector += multipath->rdev->data_offset; | ||
194 | mp_bh->bio.bi_bdev = multipath->rdev->bdev; | ||
195 | mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST); | ||
196 | mp_bh->bio.bi_end_io = multipath_end_request; | ||
197 | mp_bh->bio.bi_private = mp_bh; | ||
198 | generic_make_request(&mp_bh->bio); | ||
199 | return 0; | ||
200 | } | ||
201 | |||
202 | static void multipath_status (struct seq_file *seq, mddev_t *mddev) | ||
203 | { | ||
204 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
205 | int i; | ||
206 | |||
207 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, | ||
208 | conf->working_disks); | ||
209 | for (i = 0; i < conf->raid_disks; i++) | ||
210 | seq_printf (seq, "%s", | ||
211 | conf->multipaths[i].rdev && | ||
212 | conf->multipaths[i].rdev->in_sync ? "U" : "_"); | ||
213 | seq_printf (seq, "]"); | ||
214 | } | ||
215 | |||
216 | static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, | ||
217 | sector_t *error_sector) | ||
218 | { | ||
219 | mddev_t *mddev = q->queuedata; | ||
220 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
221 | int i, ret = 0; | ||
222 | |||
223 | rcu_read_lock(); | ||
224 | for (i=0; i<mddev->raid_disks && ret == 0; i++) { | ||
225 | mdk_rdev_t *rdev = conf->multipaths[i].rdev; | ||
226 | if (rdev && !rdev->faulty) { | ||
227 | struct block_device *bdev = rdev->bdev; | ||
228 | request_queue_t *r_queue = bdev_get_queue(bdev); | ||
229 | |||
230 | if (!r_queue->issue_flush_fn) | ||
231 | ret = -EOPNOTSUPP; | ||
232 | else { | ||
233 | atomic_inc(&rdev->nr_pending); | ||
234 | rcu_read_unlock(); | ||
235 | ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, | ||
236 | error_sector); | ||
237 | rdev_dec_pending(rdev, mddev); | ||
238 | rcu_read_lock(); | ||
239 | } | ||
240 | } | ||
241 | } | ||
242 | rcu_read_unlock(); | ||
243 | return ret; | ||
244 | } | ||
245 | |||
246 | /* | ||
247 | * Careful, this can execute in IRQ contexts as well! | ||
248 | */ | ||
249 | static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) | ||
250 | { | ||
251 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
252 | |||
253 | if (conf->working_disks <= 1) { | ||
254 | /* | ||
255 | * Uh oh, we can do nothing if this is our last path, but | ||
256 | * first check if this is a queued request for a device | ||
257 | * which has just failed. | ||
258 | */ | ||
259 | printk(KERN_ALERT | ||
260 | "multipath: only one IO path left and IO error.\n"); | ||
261 | /* leave it active... it's all we have */ | ||
262 | } else { | ||
263 | /* | ||
264 | * Mark disk as unusable | ||
265 | */ | ||
266 | if (!rdev->faulty) { | ||
267 | char b[BDEVNAME_SIZE]; | ||
268 | rdev->in_sync = 0; | ||
269 | rdev->faulty = 1; | ||
270 | mddev->sb_dirty = 1; | ||
271 | conf->working_disks--; | ||
272 | printk(KERN_ALERT "multipath: IO failure on %s," | ||
273 | " disabling IO path. \n Operation continuing" | ||
274 | " on %d IO paths.\n", | ||
275 | bdevname (rdev->bdev,b), | ||
276 | conf->working_disks); | ||
277 | } | ||
278 | } | ||
279 | } | ||
280 | |||
281 | static void print_multipath_conf (multipath_conf_t *conf) | ||
282 | { | ||
283 | int i; | ||
284 | struct multipath_info *tmp; | ||
285 | |||
286 | printk("MULTIPATH conf printout:\n"); | ||
287 | if (!conf) { | ||
288 | printk("(conf==NULL)\n"); | ||
289 | return; | ||
290 | } | ||
291 | printk(" --- wd:%d rd:%d\n", conf->working_disks, | ||
292 | conf->raid_disks); | ||
293 | |||
294 | for (i = 0; i < conf->raid_disks; i++) { | ||
295 | char b[BDEVNAME_SIZE]; | ||
296 | tmp = conf->multipaths + i; | ||
297 | if (tmp->rdev) | ||
298 | printk(" disk%d, o:%d, dev:%s\n", | ||
299 | i,!tmp->rdev->faulty, | ||
300 | bdevname(tmp->rdev->bdev,b)); | ||
301 | } | ||
302 | } | ||
303 | |||
304 | |||
305 | static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | ||
306 | { | ||
307 | multipath_conf_t *conf = mddev->private; | ||
308 | int found = 0; | ||
309 | int path; | ||
310 | struct multipath_info *p; | ||
311 | |||
312 | print_multipath_conf(conf); | ||
313 | |||
314 | for (path=0; path<mddev->raid_disks; path++) | ||
315 | if ((p=conf->multipaths+path)->rdev == NULL) { | ||
316 | blk_queue_stack_limits(mddev->queue, | ||
317 | rdev->bdev->bd_disk->queue); | ||
318 | |||
319 | /* as we don't honour merge_bvec_fn, we must never risk | ||
320 | * violating it, so limit ->max_sector to one PAGE, as | ||
321 | * a one page request is never in violation. | ||
322 | * (Note: it is very unlikely that a device with | ||
323 | * merge_bvec_fn will be involved in multipath.) | ||
324 | */ | ||
325 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
326 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
327 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | ||
328 | |||
329 | conf->working_disks++; | ||
330 | rdev->raid_disk = path; | ||
331 | rdev->in_sync = 1; | ||
332 | p->rdev = rdev; | ||
333 | found = 1; | ||
334 | } | ||
335 | |||
336 | print_multipath_conf(conf); | ||
337 | return found; | ||
338 | } | ||
339 | |||
340 | static int multipath_remove_disk(mddev_t *mddev, int number) | ||
341 | { | ||
342 | multipath_conf_t *conf = mddev->private; | ||
343 | int err = 0; | ||
344 | mdk_rdev_t *rdev; | ||
345 | struct multipath_info *p = conf->multipaths + number; | ||
346 | |||
347 | print_multipath_conf(conf); | ||
348 | |||
349 | rdev = p->rdev; | ||
350 | if (rdev) { | ||
351 | if (rdev->in_sync || | ||
352 | atomic_read(&rdev->nr_pending)) { | ||
353 | printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number); | ||
354 | err = -EBUSY; | ||
355 | goto abort; | ||
356 | } | ||
357 | p->rdev = NULL; | ||
358 | synchronize_kernel(); | ||
359 | if (atomic_read(&rdev->nr_pending)) { | ||
360 | /* lost the race, try later */ | ||
361 | err = -EBUSY; | ||
362 | p->rdev = rdev; | ||
363 | } | ||
364 | } | ||
365 | abort: | ||
366 | |||
367 | print_multipath_conf(conf); | ||
368 | return err; | ||
369 | } | ||
370 | |||
371 | |||
372 | |||
373 | /* | ||
374 | * This is a kernel thread which: | ||
375 | * | ||
376 | * 1. Retries failed read operations on working multipaths. | ||
377 | * 2. Updates the raid superblock when problems are encountered. | ||
378 | * 3. Performs writes following reads for array synchronising. | ||
379 | */ | ||
380 | |||
381 | static void multipathd (mddev_t *mddev) | ||
382 | { | ||
383 | struct multipath_bh *mp_bh; | ||
384 | struct bio *bio; | ||
385 | unsigned long flags; | ||
386 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
387 | struct list_head *head = &conf->retry_list; | ||
388 | |||
389 | md_check_recovery(mddev); | ||
390 | for (;;) { | ||
391 | char b[BDEVNAME_SIZE]; | ||
392 | spin_lock_irqsave(&conf->device_lock, flags); | ||
393 | if (list_empty(head)) | ||
394 | break; | ||
395 | mp_bh = list_entry(head->prev, struct multipath_bh, retry_list); | ||
396 | list_del(head->prev); | ||
397 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
398 | |||
399 | bio = &mp_bh->bio; | ||
400 | bio->bi_sector = mp_bh->master_bio->bi_sector; | ||
401 | |||
402 | if ((mp_bh->path = multipath_map (conf))<0) { | ||
403 | printk(KERN_ALERT "multipath: %s: unrecoverable IO read" | ||
404 | " error for block %llu\n", | ||
405 | bdevname(bio->bi_bdev,b), | ||
406 | (unsigned long long)bio->bi_sector); | ||
407 | multipath_end_bh_io(mp_bh, -EIO); | ||
408 | } else { | ||
409 | printk(KERN_ERR "multipath: %s: redirecting sector %llu" | ||
410 | " to another IO path\n", | ||
411 | bdevname(bio->bi_bdev,b), | ||
412 | (unsigned long long)bio->bi_sector); | ||
413 | *bio = *(mp_bh->master_bio); | ||
414 | bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; | ||
415 | bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; | ||
416 | bio->bi_rw |= (1 << BIO_RW_FAILFAST); | ||
417 | bio->bi_end_io = multipath_end_request; | ||
418 | bio->bi_private = mp_bh; | ||
419 | generic_make_request(bio); | ||
420 | } | ||
421 | } | ||
422 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
423 | } | ||
424 | |||
425 | static int multipath_run (mddev_t *mddev) | ||
426 | { | ||
427 | multipath_conf_t *conf; | ||
428 | int disk_idx; | ||
429 | struct multipath_info *disk; | ||
430 | mdk_rdev_t *rdev; | ||
431 | struct list_head *tmp; | ||
432 | |||
433 | if (mddev->level != LEVEL_MULTIPATH) { | ||
434 | printk("multipath: %s: raid level not set to multipath IO (%d)\n", | ||
435 | mdname(mddev), mddev->level); | ||
436 | goto out; | ||
437 | } | ||
438 | /* | ||
439 | * copy the already verified devices into our private MULTIPATH | ||
440 | * bookkeeping area. [whatever we allocate in multipath_run(), | ||
441 | * should be freed in multipath_stop()] | ||
442 | */ | ||
443 | |||
444 | conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL); | ||
445 | mddev->private = conf; | ||
446 | if (!conf) { | ||
447 | printk(KERN_ERR | ||
448 | "multipath: couldn't allocate memory for %s\n", | ||
449 | mdname(mddev)); | ||
450 | goto out; | ||
451 | } | ||
452 | memset(conf, 0, sizeof(*conf)); | ||
453 | |||
454 | conf->multipaths = kmalloc(sizeof(struct multipath_info)*mddev->raid_disks, | ||
455 | GFP_KERNEL); | ||
456 | if (!conf->multipaths) { | ||
457 | printk(KERN_ERR | ||
458 | "multipath: couldn't allocate memory for %s\n", | ||
459 | mdname(mddev)); | ||
460 | goto out_free_conf; | ||
461 | } | ||
462 | memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks); | ||
463 | |||
464 | mddev->queue->unplug_fn = multipath_unplug; | ||
465 | |||
466 | mddev->queue->issue_flush_fn = multipath_issue_flush; | ||
467 | |||
468 | conf->working_disks = 0; | ||
469 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
470 | disk_idx = rdev->raid_disk; | ||
471 | if (disk_idx < 0 || | ||
472 | disk_idx >= mddev->raid_disks) | ||
473 | continue; | ||
474 | |||
475 | disk = conf->multipaths + disk_idx; | ||
476 | disk->rdev = rdev; | ||
477 | |||
478 | blk_queue_stack_limits(mddev->queue, | ||
479 | rdev->bdev->bd_disk->queue); | ||
480 | /* as we don't honour merge_bvec_fn, we must never risk | ||
481 | * violating it, not that we ever expect a device with | ||
482 | * a merge_bvec_fn to be involved in multipath */ | ||
483 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
484 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
485 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | ||
486 | |||
487 | if (!rdev->faulty) | ||
488 | conf->working_disks++; | ||
489 | } | ||
490 | |||
491 | conf->raid_disks = mddev->raid_disks; | ||
492 | mddev->sb_dirty = 1; | ||
493 | conf->mddev = mddev; | ||
494 | spin_lock_init(&conf->device_lock); | ||
495 | INIT_LIST_HEAD(&conf->retry_list); | ||
496 | |||
497 | if (!conf->working_disks) { | ||
498 | printk(KERN_ERR "multipath: no operational IO paths for %s\n", | ||
499 | mdname(mddev)); | ||
500 | goto out_free_conf; | ||
501 | } | ||
502 | mddev->degraded = conf->raid_disks - conf->working_disks; | ||
503 | |||
504 | conf->pool = mempool_create(NR_RESERVED_BUFS, | ||
505 | mp_pool_alloc, mp_pool_free, | ||
506 | NULL); | ||
507 | if (conf->pool == NULL) { | ||
508 | printk(KERN_ERR | ||
509 | "multipath: couldn't allocate memory for %s\n", | ||
510 | mdname(mddev)); | ||
511 | goto out_free_conf; | ||
512 | } | ||
513 | |||
514 | { | ||
515 | mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath"); | ||
516 | if (!mddev->thread) { | ||
517 | printk(KERN_ERR "multipath: couldn't allocate thread" | ||
518 | " for %s\n", mdname(mddev)); | ||
519 | goto out_free_conf; | ||
520 | } | ||
521 | } | ||
522 | |||
523 | printk(KERN_INFO | ||
524 | "multipath: array %s active with %d out of %d IO paths\n", | ||
525 | mdname(mddev), conf->working_disks, mddev->raid_disks); | ||
526 | /* | ||
527 | * Ok, everything is just fine now | ||
528 | */ | ||
529 | mddev->array_size = mddev->size; | ||
530 | return 0; | ||
531 | |||
532 | out_free_conf: | ||
533 | if (conf->pool) | ||
534 | mempool_destroy(conf->pool); | ||
535 | if (conf->multipaths) | ||
536 | kfree(conf->multipaths); | ||
537 | kfree(conf); | ||
538 | mddev->private = NULL; | ||
539 | out: | ||
540 | return -EIO; | ||
541 | } | ||
542 | |||
543 | |||
544 | static int multipath_stop (mddev_t *mddev) | ||
545 | { | ||
546 | multipath_conf_t *conf = mddev_to_conf(mddev); | ||
547 | |||
548 | md_unregister_thread(mddev->thread); | ||
549 | mddev->thread = NULL; | ||
550 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
551 | mempool_destroy(conf->pool); | ||
552 | kfree(conf->multipaths); | ||
553 | kfree(conf); | ||
554 | mddev->private = NULL; | ||
555 | return 0; | ||
556 | } | ||
557 | |||
558 | static mdk_personality_t multipath_personality= | ||
559 | { | ||
560 | .name = "multipath", | ||
561 | .owner = THIS_MODULE, | ||
562 | .make_request = multipath_make_request, | ||
563 | .run = multipath_run, | ||
564 | .stop = multipath_stop, | ||
565 | .status = multipath_status, | ||
566 | .error_handler = multipath_error, | ||
567 | .hot_add_disk = multipath_add_disk, | ||
568 | .hot_remove_disk= multipath_remove_disk, | ||
569 | }; | ||
570 | |||
571 | static int __init multipath_init (void) | ||
572 | { | ||
573 | return register_md_personality (MULTIPATH, &multipath_personality); | ||
574 | } | ||
575 | |||
576 | static void __exit multipath_exit (void) | ||
577 | { | ||
578 | unregister_md_personality (MULTIPATH); | ||
579 | } | ||
580 | |||
581 | module_init(multipath_init); | ||
582 | module_exit(multipath_exit); | ||
583 | MODULE_LICENSE("GPL"); | ||
584 | MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ | ||
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c new file mode 100644 index 000000000000..e7d934eca06f --- /dev/null +++ b/drivers/md/raid0.c | |||
@@ -0,0 +1,539 @@ | |||
1 | /* | ||
2 | raid0.c : Multiple Devices driver for Linux | ||
3 | Copyright (C) 1994-96 Marc ZYNGIER | ||
4 | <zyngier@ufr-info-p7.ibp.fr> or | ||
5 | <maz@gloups.fdn.fr> | ||
6 | Copyright (C) 1999, 2000 Ingo Molnar, Red Hat | ||
7 | |||
8 | |||
9 | RAID-0 management functions. | ||
10 | |||
11 | This program is free software; you can redistribute it and/or modify | ||
12 | it under the terms of the GNU General Public License as published by | ||
13 | the Free Software Foundation; either version 2, or (at your option) | ||
14 | any later version. | ||
15 | |||
16 | You should have received a copy of the GNU General Public License | ||
17 | (for example /usr/src/linux/COPYING); if not, write to the Free | ||
18 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
19 | */ | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/raid/raid0.h> | ||
23 | |||
24 | #define MAJOR_NR MD_MAJOR | ||
25 | #define MD_DRIVER | ||
26 | #define MD_PERSONALITY | ||
27 | |||
28 | static void raid0_unplug(request_queue_t *q) | ||
29 | { | ||
30 | mddev_t *mddev = q->queuedata; | ||
31 | raid0_conf_t *conf = mddev_to_conf(mddev); | ||
32 | mdk_rdev_t **devlist = conf->strip_zone[0].dev; | ||
33 | int i; | ||
34 | |||
35 | for (i=0; i<mddev->raid_disks; i++) { | ||
36 | request_queue_t *r_queue = bdev_get_queue(devlist[i]->bdev); | ||
37 | |||
38 | if (r_queue->unplug_fn) | ||
39 | r_queue->unplug_fn(r_queue); | ||
40 | } | ||
41 | } | ||
42 | |||
43 | static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk, | ||
44 | sector_t *error_sector) | ||
45 | { | ||
46 | mddev_t *mddev = q->queuedata; | ||
47 | raid0_conf_t *conf = mddev_to_conf(mddev); | ||
48 | mdk_rdev_t **devlist = conf->strip_zone[0].dev; | ||
49 | int i, ret = 0; | ||
50 | |||
51 | for (i=0; i<mddev->raid_disks && ret == 0; i++) { | ||
52 | struct block_device *bdev = devlist[i]->bdev; | ||
53 | request_queue_t *r_queue = bdev_get_queue(bdev); | ||
54 | |||
55 | if (!r_queue->issue_flush_fn) | ||
56 | ret = -EOPNOTSUPP; | ||
57 | else | ||
58 | ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); | ||
59 | } | ||
60 | return ret; | ||
61 | } | ||
62 | |||
63 | |||
64 | static int create_strip_zones (mddev_t *mddev) | ||
65 | { | ||
66 | int i, c, j; | ||
67 | sector_t current_offset, curr_zone_offset; | ||
68 | sector_t min_spacing; | ||
69 | raid0_conf_t *conf = mddev_to_conf(mddev); | ||
70 | mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; | ||
71 | struct list_head *tmp1, *tmp2; | ||
72 | struct strip_zone *zone; | ||
73 | int cnt; | ||
74 | char b[BDEVNAME_SIZE]; | ||
75 | |||
76 | /* | ||
77 | * The number of 'same size groups' | ||
78 | */ | ||
79 | conf->nr_strip_zones = 0; | ||
80 | |||
81 | ITERATE_RDEV(mddev,rdev1,tmp1) { | ||
82 | printk("raid0: looking at %s\n", | ||
83 | bdevname(rdev1->bdev,b)); | ||
84 | c = 0; | ||
85 | ITERATE_RDEV(mddev,rdev2,tmp2) { | ||
86 | printk("raid0: comparing %s(%llu)", | ||
87 | bdevname(rdev1->bdev,b), | ||
88 | (unsigned long long)rdev1->size); | ||
89 | printk(" with %s(%llu)\n", | ||
90 | bdevname(rdev2->bdev,b), | ||
91 | (unsigned long long)rdev2->size); | ||
92 | if (rdev2 == rdev1) { | ||
93 | printk("raid0: END\n"); | ||
94 | break; | ||
95 | } | ||
96 | if (rdev2->size == rdev1->size) | ||
97 | { | ||
98 | /* | ||
99 | * Not unique, don't count it as a new | ||
100 | * group | ||
101 | */ | ||
102 | printk("raid0: EQUAL\n"); | ||
103 | c = 1; | ||
104 | break; | ||
105 | } | ||
106 | printk("raid0: NOT EQUAL\n"); | ||
107 | } | ||
108 | if (!c) { | ||
109 | printk("raid0: ==> UNIQUE\n"); | ||
110 | conf->nr_strip_zones++; | ||
111 | printk("raid0: %d zones\n", conf->nr_strip_zones); | ||
112 | } | ||
113 | } | ||
114 | printk("raid0: FINAL %d zones\n", conf->nr_strip_zones); | ||
115 | |||
116 | conf->strip_zone = kmalloc(sizeof(struct strip_zone)* | ||
117 | conf->nr_strip_zones, GFP_KERNEL); | ||
118 | if (!conf->strip_zone) | ||
119 | return 1; | ||
120 | conf->devlist = kmalloc(sizeof(mdk_rdev_t*)* | ||
121 | conf->nr_strip_zones*mddev->raid_disks, | ||
122 | GFP_KERNEL); | ||
123 | if (!conf->devlist) | ||
124 | return 1; | ||
125 | |||
126 | memset(conf->strip_zone, 0,sizeof(struct strip_zone)* | ||
127 | conf->nr_strip_zones); | ||
128 | memset(conf->devlist, 0, | ||
129 | sizeof(mdk_rdev_t*) * conf->nr_strip_zones * mddev->raid_disks); | ||
130 | |||
131 | /* The first zone must contain all devices, so here we check that | ||
132 | * there is a proper alignment of slots to devices and find them all | ||
133 | */ | ||
134 | zone = &conf->strip_zone[0]; | ||
135 | cnt = 0; | ||
136 | smallest = NULL; | ||
137 | zone->dev = conf->devlist; | ||
138 | ITERATE_RDEV(mddev, rdev1, tmp1) { | ||
139 | int j = rdev1->raid_disk; | ||
140 | |||
141 | if (j < 0 || j >= mddev->raid_disks) { | ||
142 | printk("raid0: bad disk number %d - aborting!\n", j); | ||
143 | goto abort; | ||
144 | } | ||
145 | if (zone->dev[j]) { | ||
146 | printk("raid0: multiple devices for %d - aborting!\n", | ||
147 | j); | ||
148 | goto abort; | ||
149 | } | ||
150 | zone->dev[j] = rdev1; | ||
151 | |||
152 | blk_queue_stack_limits(mddev->queue, | ||
153 | rdev1->bdev->bd_disk->queue); | ||
154 | /* as we don't honour merge_bvec_fn, we must never risk | ||
155 | * violating it, so limit ->max_sector to one PAGE, as | ||
156 | * a one page request is never in violation. | ||
157 | */ | ||
158 | |||
159 | if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && | ||
160 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
161 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | ||
162 | |||
163 | if (!smallest || (rdev1->size <smallest->size)) | ||
164 | smallest = rdev1; | ||
165 | cnt++; | ||
166 | } | ||
167 | if (cnt != mddev->raid_disks) { | ||
168 | printk("raid0: too few disks (%d of %d) - aborting!\n", | ||
169 | cnt, mddev->raid_disks); | ||
170 | goto abort; | ||
171 | } | ||
172 | zone->nb_dev = cnt; | ||
173 | zone->size = smallest->size * cnt; | ||
174 | zone->zone_offset = 0; | ||
175 | |||
176 | current_offset = smallest->size; | ||
177 | curr_zone_offset = zone->size; | ||
178 | |||
179 | /* now do the other zones */ | ||
180 | for (i = 1; i < conf->nr_strip_zones; i++) | ||
181 | { | ||
182 | zone = conf->strip_zone + i; | ||
183 | zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks; | ||
184 | |||
185 | printk("raid0: zone %d\n", i); | ||
186 | zone->dev_offset = current_offset; | ||
187 | smallest = NULL; | ||
188 | c = 0; | ||
189 | |||
190 | for (j=0; j<cnt; j++) { | ||
191 | char b[BDEVNAME_SIZE]; | ||
192 | rdev = conf->strip_zone[0].dev[j]; | ||
193 | printk("raid0: checking %s ...", bdevname(rdev->bdev,b)); | ||
194 | if (rdev->size > current_offset) | ||
195 | { | ||
196 | printk(" contained as device %d\n", c); | ||
197 | zone->dev[c] = rdev; | ||
198 | c++; | ||
199 | if (!smallest || (rdev->size <smallest->size)) { | ||
200 | smallest = rdev; | ||
201 | printk(" (%llu) is smallest!.\n", | ||
202 | (unsigned long long)rdev->size); | ||
203 | } | ||
204 | } else | ||
205 | printk(" nope.\n"); | ||
206 | } | ||
207 | |||
208 | zone->nb_dev = c; | ||
209 | zone->size = (smallest->size - current_offset) * c; | ||
210 | printk("raid0: zone->nb_dev: %d, size: %llu\n", | ||
211 | zone->nb_dev, (unsigned long long)zone->size); | ||
212 | |||
213 | zone->zone_offset = curr_zone_offset; | ||
214 | curr_zone_offset += zone->size; | ||
215 | |||
216 | current_offset = smallest->size; | ||
217 | printk("raid0: current zone offset: %llu\n", | ||
218 | (unsigned long long)current_offset); | ||
219 | } | ||
220 | |||
221 | /* Now find appropriate hash spacing. | ||
222 | * We want a number which causes most hash entries to cover | ||
223 | * at most two strips, but the hash table must be at most | ||
224 | * 1 PAGE. We choose the smallest strip, or contiguous collection | ||
225 | * of strips, that is big enough. We never consider the last | ||
226 | * strip though, as its size has no bearing on the efficacy of the hash | ||
227 | * table. | ||
228 | */ | ||
229 | conf->hash_spacing = curr_zone_offset; | ||
230 | min_spacing = curr_zone_offset; | ||
231 | sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*)); | ||
232 | for (i=0; i < conf->nr_strip_zones-1; i++) { | ||
233 | sector_t sz = 0; | ||
234 | for (j=i; j<conf->nr_strip_zones-1 && | ||
235 | sz < min_spacing ; j++) | ||
236 | sz += conf->strip_zone[j].size; | ||
237 | if (sz >= min_spacing && sz < conf->hash_spacing) | ||
238 | conf->hash_spacing = sz; | ||
239 | } | ||
240 | |||
241 | mddev->queue->unplug_fn = raid0_unplug; | ||
242 | |||
243 | mddev->queue->issue_flush_fn = raid0_issue_flush; | ||
244 | |||
245 | printk("raid0: done.\n"); | ||
246 | return 0; | ||
247 | abort: | ||
248 | return 1; | ||
249 | } | ||
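The hash-spacing search at the end of create_strip_zones() picks the smallest contiguous run of zone sizes that is still at least total_size / (PAGE_SIZE / sizeof(pointer)), so the hash table fits in a single page while most entries cover at most two strips. The user-space sketch below mirrors that selection with hypothetical zone sizes (in 1K blocks) and an assumed 4K page; it is an illustration, not part of the patch.

#include <stdio.h>

int main(void)
{
        /* hypothetical zone sizes in 1K blocks; the last zone is never considered */
        unsigned long long zone_size[] = { 400000, 200000, 100000, 50000 };
        int nr_zones = 4, i, j;
        unsigned long long total = 0, min_spacing, spacing;
        unsigned long page_size = 4096, ptr_size = sizeof(void *);

        for (i = 0; i < nr_zones; i++)
                total += zone_size[i];

        spacing = total;                                /* worst case: one entry */
        min_spacing = total / (page_size / ptr_size);   /* keep the table within one page */

        for (i = 0; i < nr_zones - 1; i++) {
                unsigned long long sz = 0;
                for (j = i; j < nr_zones - 1 && sz < min_spacing; j++)
                        sz += zone_size[j];
                if (sz >= min_spacing && sz < spacing)
                        spacing = sz;
        }
        printf("total=%llu min_spacing=%llu chosen hash_spacing=%llu\n",
               total, min_spacing, spacing);
        return 0;
}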
250 | |||
251 | /** | ||
252 | * raid0_mergeable_bvec -- tell bio layer if two requests can be merged | ||
253 | * @q: request queue | ||
254 | * @bio: the buffer head that's been built up so far | ||
255 | * @biovec: the request that could be merged to it. | ||
256 | * | ||
257 | * Return amount of bytes we can accept at this offset | ||
258 | */ | ||
259 | static int raid0_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec) | ||
260 | { | ||
261 | mddev_t *mddev = q->queuedata; | ||
262 | sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); | ||
263 | int max; | ||
264 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | ||
265 | unsigned int bio_sectors = bio->bi_size >> 9; | ||
266 | |||
267 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; | ||
268 | if (max < 0) max = 0; /* bio_add cannot handle a negative return */ | ||
269 | if (max <= biovec->bv_len && bio_sectors == 0) | ||
270 | return biovec->bv_len; | ||
271 | else | ||
272 | return max; | ||
273 | } | ||
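raid0_mergeable_bvec() answers "how many more bytes fit before this bio would cross a chunk boundary": the sector offset inside the chunk plus the sectors already in the bio is subtracted from the chunk size and converted to bytes. The worked example below uses hypothetical values (64K chunks, a bio 3 sectors long starting 120 sectors into its chunk) and is not part of the patch.

#include <stdio.h>

int main(void)
{
        unsigned int chunk_sectors = 128;       /* 64K chunk = 128 x 512-byte sectors */
        unsigned long long sector = 128120;     /* 128120 & 127 == 120: offset in chunk */
        unsigned int bio_sectors = 3;           /* sectors already in the bio */

        int in_chunk = (int)(sector & (chunk_sectors - 1));
        int room = (int)chunk_sectors - (in_chunk + (int)bio_sectors);
        int max = room > 0 ? room << 9 : 0;     /* bytes that still fit in this chunk */

        printf("can still accept %d bytes (%d sectors)\n", max, max >> 9);
        return 0;
}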
274 | |||
275 | static int raid0_run (mddev_t *mddev) | ||
276 | { | ||
277 | unsigned cur=0, i=0, nb_zone; | ||
278 | s64 size; | ||
279 | raid0_conf_t *conf; | ||
280 | mdk_rdev_t *rdev; | ||
281 | struct list_head *tmp; | ||
282 | |||
283 | printk("%s: setting max_sectors to %d, segment boundary to %d\n", | ||
284 | mdname(mddev), | ||
285 | mddev->chunk_size >> 9, | ||
286 | (mddev->chunk_size>>1)-1); | ||
287 | blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9); | ||
288 | blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1); | ||
289 | |||
290 | conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL); | ||
291 | if (!conf) | ||
292 | goto out; | ||
293 | mddev->private = (void *)conf; | ||
294 | |||
295 | conf->strip_zone = NULL; | ||
296 | conf->devlist = NULL; | ||
297 | if (create_strip_zones (mddev)) | ||
298 | goto out_free_conf; | ||
299 | |||
300 | /* calculate array device size */ | ||
301 | mddev->array_size = 0; | ||
302 | ITERATE_RDEV(mddev,rdev,tmp) | ||
303 | mddev->array_size += rdev->size; | ||
304 | |||
305 | printk("raid0 : md_size is %llu blocks.\n", | ||
306 | (unsigned long long)mddev->array_size); | ||
307 | printk("raid0 : conf->hash_spacing is %llu blocks.\n", | ||
308 | (unsigned long long)conf->hash_spacing); | ||
309 | { | ||
310 | #if __GNUC__ < 3 | ||
311 | volatile | ||
312 | #endif | ||
313 | sector_t s = mddev->array_size; | ||
314 | sector_t space = conf->hash_spacing; | ||
315 | int round; | ||
316 | conf->preshift = 0; | ||
317 | if (sizeof(sector_t) > sizeof(unsigned long)) { | ||
318 | /*shift down space and s so that sector_div will work */ | ||
319 | while (space > (sector_t) (~(unsigned long)0)) { | ||
320 | s >>= 1; | ||
321 | space >>= 1; | ||
322 | s += 1; /* force round-up */ | ||
323 | conf->preshift++; | ||
324 | } | ||
325 | } | ||
326 | round = sector_div(s, (unsigned long)space) ? 1 : 0; | ||
327 | nb_zone = s + round; | ||
328 | } | ||
329 | printk("raid0 : nb_zone is %d.\n", nb_zone); | ||
330 | |||
331 | printk("raid0 : Allocating %Zd bytes for hash.\n", | ||
332 | nb_zone*sizeof(struct strip_zone*)); | ||
333 | conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL); | ||
334 | if (!conf->hash_table) | ||
335 | goto out_free_conf; | ||
336 | size = conf->strip_zone[cur].size; | ||
337 | |||
338 | for (i=0; i< nb_zone; i++) { | ||
339 | conf->hash_table[i] = conf->strip_zone + cur; | ||
340 | while (size <= conf->hash_spacing) { | ||
341 | cur++; | ||
342 | size += conf->strip_zone[cur].size; | ||
343 | } | ||
344 | size -= conf->hash_spacing; | ||
345 | } | ||
346 | if (conf->preshift) { | ||
347 | conf->hash_spacing >>= conf->preshift; | ||
348 | /* round hash_spacing up so when we divide by it, we | ||
349 | * err on the side of too-low, which is safest | ||
350 | */ | ||
351 | conf->hash_spacing++; | ||
352 | } | ||
353 | |||
354 | /* calculate the max read-ahead size. | ||
355 | * For read-ahead of large files to be effective, we need to | ||
356 | * readahead at least twice a whole stripe. i.e. number of devices | ||
357 | * multiplied by chunk size times 2. | ||
358 | * If an individual device has an ra_pages greater than the | ||
359 | * chunk size, then we will not drive that device as hard as it | ||
360 | * wants. We consider this a configuration error: a larger | ||
361 | * chunksize should be used in that case. | ||
362 | */ | ||
363 | { | ||
364 | int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; | ||
365 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) | ||
366 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; | ||
367 | } | ||
368 | |||
369 | |||
370 | blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); | ||
371 | return 0; | ||
372 | |||
373 | out_free_conf: | ||
374 | if (conf->strip_zone) | ||
375 | kfree(conf->strip_zone); | ||
376 | if (conf->devlist) | ||
377 | kfree (conf->devlist); | ||
378 | kfree(conf); | ||
379 | mddev->private = NULL; | ||
380 | out: | ||
381 | return 1; | ||
382 | } | ||
383 | |||
384 | static int raid0_stop (mddev_t *mddev) | ||
385 | { | ||
386 | raid0_conf_t *conf = mddev_to_conf(mddev); | ||
387 | |||
388 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
389 | kfree (conf->hash_table); | ||
390 | conf->hash_table = NULL; | ||
391 | kfree (conf->strip_zone); | ||
392 | conf->strip_zone = NULL; | ||
393 | kfree (conf); | ||
394 | mddev->private = NULL; | ||
395 | |||
396 | return 0; | ||
397 | } | ||
398 | |||
399 | static int raid0_make_request (request_queue_t *q, struct bio *bio) | ||
400 | { | ||
401 | mddev_t *mddev = q->queuedata; | ||
402 | unsigned int sect_in_chunk, chunksize_bits, chunk_size, chunk_sects; | ||
403 | raid0_conf_t *conf = mddev_to_conf(mddev); | ||
404 | struct strip_zone *zone; | ||
405 | mdk_rdev_t *tmp_dev; | ||
406 | unsigned long chunk; | ||
407 | sector_t block, rsect; | ||
408 | |||
409 | if (bio_data_dir(bio)==WRITE) { | ||
410 | disk_stat_inc(mddev->gendisk, writes); | ||
411 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); | ||
412 | } else { | ||
413 | disk_stat_inc(mddev->gendisk, reads); | ||
414 | disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); | ||
415 | } | ||
416 | |||
417 | chunk_size = mddev->chunk_size >> 10; | ||
418 | chunk_sects = mddev->chunk_size >> 9; | ||
419 | chunksize_bits = ffz(~chunk_size); | ||
420 | block = bio->bi_sector >> 1; | ||
421 | |||
422 | |||
423 | if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) { | ||
424 | struct bio_pair *bp; | ||
425 | /* Sanity check -- queue functions should prevent this happening */ | ||
426 | if (bio->bi_vcnt != 1 || | ||
427 | bio->bi_idx != 0) | ||
428 | goto bad_map; | ||
429 | /* This is a one page bio that upper layers | ||
430 | * refuse to split for us, so we need to split it. | ||
431 | */ | ||
432 | bp = bio_split(bio, bio_split_pool, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); | ||
433 | if (raid0_make_request(q, &bp->bio1)) | ||
434 | generic_make_request(&bp->bio1); | ||
435 | if (raid0_make_request(q, &bp->bio2)) | ||
436 | generic_make_request(&bp->bio2); | ||
437 | |||
438 | bio_pair_release(bp); | ||
439 | return 0; | ||
440 | } | ||
441 | |||
442 | |||
443 | { | ||
444 | #if __GNUC__ < 3 | ||
445 | volatile | ||
446 | #endif | ||
447 | sector_t x = block >> conf->preshift; | ||
448 | sector_div(x, (unsigned long)conf->hash_spacing); | ||
449 | zone = conf->hash_table[x]; | ||
450 | } | ||
451 | |||
452 | while (block >= (zone->zone_offset + zone->size)) | ||
453 | zone++; | ||
454 | |||
455 | sect_in_chunk = bio->bi_sector & ((chunk_size<<1) -1); | ||
456 | |||
457 | |||
458 | { | ||
459 | sector_t x = (block - zone->zone_offset) >> chunksize_bits; | ||
460 | |||
461 | sector_div(x, zone->nb_dev); | ||
462 | chunk = x; | ||
463 | BUG_ON(x != (sector_t)chunk); | ||
464 | |||
465 | x = block >> chunksize_bits; | ||
466 | tmp_dev = zone->dev[sector_div(x, zone->nb_dev)]; | ||
467 | } | ||
468 | rsect = (((chunk << chunksize_bits) + zone->dev_offset)<<1) | ||
469 | + sect_in_chunk; | ||
470 | |||
471 | bio->bi_bdev = tmp_dev->bdev; | ||
472 | bio->bi_sector = rsect + tmp_dev->data_offset; | ||
473 | |||
474 | /* | ||
475 | * Let the main block layer submit the IO and resolve recursion: | ||
476 | */ | ||
477 | return 1; | ||
478 | |||
479 | bad_map: | ||
480 | printk("raid0_make_request bug: can't convert block across chunks" | ||
481 | " or bigger than %dk %llu %d\n", chunk_size, | ||
482 | (unsigned long long)bio->bi_sector, bio->bi_size >> 10); | ||
483 | |||
484 | bio_io_error(bio, bio->bi_size); | ||
485 | return 0; | ||
486 | } | ||
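For the common single-zone layout (all member devices the same size), the mapping done by raid0_make_request() reduces to plain striping: the global chunk index picks the device round-robin, and the chunk index divided by the device count gives the chunk's position on that device. The simplified user-space sketch below shows that arithmetic with hypothetical values; it folds away the zone lookup, the 1K-block conversion and the dev_offset/zone_offset handling, so it is an illustration rather than the driver's exact code.

#include <stdio.h>

struct target {
        int dev;                        /* member device index */
        unsigned long long dev_sector;  /* sector offset on that device */
};

/* single-zone striping: identical-size members, no dev_offset/zone_offset */
static struct target raid0_map(unsigned long long sector,
                               unsigned int chunk_sects, int nb_dev)
{
        struct target t;
        unsigned long long chunk_nr = sector / chunk_sects;     /* global chunk index */
        unsigned long long in_chunk = sector % chunk_sects;     /* offset inside chunk */

        t.dev = (int)(chunk_nr % nb_dev);                       /* round-robin device */
        t.dev_sector = (chunk_nr / nb_dev) * chunk_sects + in_chunk;
        return t;
}

int main(void)
{
        unsigned int chunk_sects = 128;         /* 64K chunks */
        int nb_dev = 3;                         /* three member disks */
        unsigned long long s;

        for (s = 0; s < 1024; s += 200) {
                struct target t = raid0_map(s, chunk_sects, nb_dev);
                printf("array sector %4llu -> disk %d, sector %llu\n",
                       s, t.dev, t.dev_sector);
        }
        return 0;
}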
487 | |||
488 | static void raid0_status (struct seq_file *seq, mddev_t *mddev) | ||
489 | { | ||
490 | #undef MD_DEBUG | ||
491 | #ifdef MD_DEBUG | ||
492 | int j, k, h; | ||
493 | char b[BDEVNAME_SIZE]; | ||
494 | raid0_conf_t *conf = mddev_to_conf(mddev); | ||
495 | |||
496 | h = 0; | ||
497 | for (j = 0; j < conf->nr_strip_zones; j++) { | ||
498 | seq_printf(seq, " z%d", j); | ||
499 | if (conf->hash_table[h] == conf->strip_zone+j) | ||
500 | seq_printf("(h%d)", h++); | ||
501 | seq_printf(seq, "=["); | ||
502 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) | ||
503 | seq_printf (seq, "%s/", bdevname( | ||
504 | conf->strip_zone[j].dev[k]->bdev,b)); | ||
505 | |||
506 | seq_printf (seq, "] zo=%d do=%d s=%d\n", | ||
507 | conf->strip_zone[j].zone_offset, | ||
508 | conf->strip_zone[j].dev_offset, | ||
509 | conf->strip_zone[j].size); | ||
510 | } | ||
511 | #endif | ||
512 | seq_printf(seq, " %dk chunks", mddev->chunk_size/1024); | ||
513 | return; | ||
514 | } | ||
515 | |||
516 | static mdk_personality_t raid0_personality= | ||
517 | { | ||
518 | .name = "raid0", | ||
519 | .owner = THIS_MODULE, | ||
520 | .make_request = raid0_make_request, | ||
521 | .run = raid0_run, | ||
522 | .stop = raid0_stop, | ||
523 | .status = raid0_status, | ||
524 | }; | ||
525 | |||
526 | static int __init raid0_init (void) | ||
527 | { | ||
528 | return register_md_personality (RAID0, &raid0_personality); | ||
529 | } | ||
530 | |||
531 | static void raid0_exit (void) | ||
532 | { | ||
533 | unregister_md_personality (RAID0); | ||
534 | } | ||
535 | |||
536 | module_init(raid0_init); | ||
537 | module_exit(raid0_exit); | ||
538 | MODULE_LICENSE("GPL"); | ||
539 | MODULE_ALIAS("md-personality-2"); /* RAID0 */ | ||
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c new file mode 100644 index 000000000000..a389394b52f6 --- /dev/null +++ b/drivers/md/raid1.c | |||
@@ -0,0 +1,1449 @@ | |||
1 | /* | ||
2 | * raid1.c : Multiple Devices driver for Linux | ||
3 | * | ||
4 | * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat | ||
5 | * | ||
6 | * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman | ||
7 | * | ||
8 | * RAID-1 management functions. | ||
9 | * | ||
10 | * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000 | ||
11 | * | ||
12 | * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk> | ||
13 | * Various fixes by Neil Brown <neilb@cse.unsw.edu.au> | ||
14 | * | ||
15 | * This program is free software; you can redistribute it and/or modify | ||
16 | * it under the terms of the GNU General Public License as published by | ||
17 | * the Free Software Foundation; either version 2, or (at your option) | ||
18 | * any later version. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
22 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | */ | ||
24 | |||
25 | #include <linux/raid/raid1.h> | ||
26 | |||
27 | /* | ||
28 | * Number of guaranteed r1bios in case of extreme VM load: | ||
29 | */ | ||
30 | #define NR_RAID1_BIOS 256 | ||
31 | |||
32 | static mdk_personality_t raid1_personality; | ||
33 | |||
34 | static void unplug_slaves(mddev_t *mddev); | ||
35 | |||
36 | |||
37 | static void * r1bio_pool_alloc(unsigned int __nocast gfp_flags, void *data) | ||
38 | { | ||
39 | struct pool_info *pi = data; | ||
40 | r1bio_t *r1_bio; | ||
41 | int size = offsetof(r1bio_t, bios[pi->raid_disks]); | ||
42 | |||
43 | /* allocate a r1bio with room for raid_disks entries in the bios array */ | ||
44 | r1_bio = kmalloc(size, gfp_flags); | ||
45 | if (r1_bio) | ||
46 | memset(r1_bio, 0, size); | ||
47 | else | ||
48 | unplug_slaves(pi->mddev); | ||
49 | |||
50 | return r1_bio; | ||
51 | } | ||
52 | |||
53 | static void r1bio_pool_free(void *r1_bio, void *data) | ||
54 | { | ||
55 | kfree(r1_bio); | ||
56 | } | ||
57 | |||
58 | #define RESYNC_BLOCK_SIZE (64*1024) | ||
59 | //#define RESYNC_BLOCK_SIZE PAGE_SIZE | ||
60 | #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) | ||
61 | #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) | ||
62 | #define RESYNC_WINDOW (2048*1024) | ||
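For concreteness, on a machine with 4 KiB pages these constants work out to RESYNC_SECTORS = 128 and RESYNC_PAGES = 16, and init_resync() below sizes the r1buf pool at RESYNC_WINDOW / RESYNC_BLOCK_SIZE = 32 buffers, i.e. a 2 MiB resync window.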
63 | |||
64 | static void * r1buf_pool_alloc(unsigned int __nocast gfp_flags, void *data) | ||
65 | { | ||
66 | struct pool_info *pi = data; | ||
67 | struct page *page; | ||
68 | r1bio_t *r1_bio; | ||
69 | struct bio *bio; | ||
70 | int i, j; | ||
71 | |||
72 | r1_bio = r1bio_pool_alloc(gfp_flags, pi); | ||
73 | if (!r1_bio) { | ||
74 | unplug_slaves(pi->mddev); | ||
75 | return NULL; | ||
76 | } | ||
77 | |||
78 | /* | ||
79 | * Allocate bios : 1 for reading, n-1 for writing | ||
80 | */ | ||
81 | for (j = pi->raid_disks ; j-- ; ) { | ||
82 | bio = bio_alloc(gfp_flags, RESYNC_PAGES); | ||
83 | if (!bio) | ||
84 | goto out_free_bio; | ||
85 | r1_bio->bios[j] = bio; | ||
86 | } | ||
87 | /* | ||
88 | * Allocate RESYNC_PAGES data pages and attach them to | ||
89 | * the first bio; | ||
90 | */ | ||
91 | bio = r1_bio->bios[0]; | ||
92 | for (i = 0; i < RESYNC_PAGES; i++) { | ||
93 | page = alloc_page(gfp_flags); | ||
94 | if (unlikely(!page)) | ||
95 | goto out_free_pages; | ||
96 | |||
97 | bio->bi_io_vec[i].bv_page = page; | ||
98 | } | ||
99 | |||
100 | r1_bio->master_bio = NULL; | ||
101 | |||
102 | return r1_bio; | ||
103 | |||
104 | out_free_pages: | ||
105 | for ( ; i > 0 ; i--) | ||
106 | __free_page(bio->bi_io_vec[i-1].bv_page); | ||
107 | out_free_bio: | ||
108 | while ( ++j < pi->raid_disks ) | ||
109 | bio_put(r1_bio->bios[j]); | ||
110 | r1bio_pool_free(r1_bio, data); | ||
111 | return NULL; | ||
112 | } | ||
113 | |||
114 | static void r1buf_pool_free(void *__r1_bio, void *data) | ||
115 | { | ||
116 | struct pool_info *pi = data; | ||
117 | int i; | ||
118 | r1bio_t *r1bio = __r1_bio; | ||
119 | struct bio *bio = r1bio->bios[0]; | ||
120 | |||
121 | for (i = 0; i < RESYNC_PAGES; i++) { | ||
122 | __free_page(bio->bi_io_vec[i].bv_page); | ||
123 | bio->bi_io_vec[i].bv_page = NULL; | ||
124 | } | ||
125 | for (i=0 ; i < pi->raid_disks; i++) | ||
126 | bio_put(r1bio->bios[i]); | ||
127 | |||
128 | r1bio_pool_free(r1bio, data); | ||
129 | } | ||
130 | |||
131 | static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) | ||
132 | { | ||
133 | int i; | ||
134 | |||
135 | for (i = 0; i < conf->raid_disks; i++) { | ||
136 | struct bio **bio = r1_bio->bios + i; | ||
137 | if (*bio) | ||
138 | bio_put(*bio); | ||
139 | *bio = NULL; | ||
140 | } | ||
141 | } | ||
142 | |||
143 | static inline void free_r1bio(r1bio_t *r1_bio) | ||
144 | { | ||
145 | unsigned long flags; | ||
146 | |||
147 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | ||
148 | |||
149 | /* | ||
150 | * Wake up any possible resync thread that waits for the device | ||
151 | * to go idle. | ||
152 | */ | ||
153 | spin_lock_irqsave(&conf->resync_lock, flags); | ||
154 | if (!--conf->nr_pending) { | ||
155 | wake_up(&conf->wait_idle); | ||
156 | wake_up(&conf->wait_resume); | ||
157 | } | ||
158 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
159 | |||
160 | put_all_bios(conf, r1_bio); | ||
161 | mempool_free(r1_bio, conf->r1bio_pool); | ||
162 | } | ||
163 | |||
164 | static inline void put_buf(r1bio_t *r1_bio) | ||
165 | { | ||
166 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | ||
167 | unsigned long flags; | ||
168 | |||
169 | mempool_free(r1_bio, conf->r1buf_pool); | ||
170 | |||
171 | spin_lock_irqsave(&conf->resync_lock, flags); | ||
172 | if (!conf->barrier) | ||
173 | BUG(); | ||
174 | --conf->barrier; | ||
175 | wake_up(&conf->wait_resume); | ||
176 | wake_up(&conf->wait_idle); | ||
177 | |||
178 | if (!--conf->nr_pending) { | ||
179 | wake_up(&conf->wait_idle); | ||
180 | wake_up(&conf->wait_resume); | ||
181 | } | ||
182 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
183 | } | ||
184 | |||
185 | static void reschedule_retry(r1bio_t *r1_bio) | ||
186 | { | ||
187 | unsigned long flags; | ||
188 | mddev_t *mddev = r1_bio->mddev; | ||
189 | conf_t *conf = mddev_to_conf(mddev); | ||
190 | |||
191 | spin_lock_irqsave(&conf->device_lock, flags); | ||
192 | list_add(&r1_bio->retry_list, &conf->retry_list); | ||
193 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
194 | |||
195 | md_wakeup_thread(mddev->thread); | ||
196 | } | ||
197 | |||
198 | /* | ||
199 | * raid_end_bio_io() is called when we have finished servicing a mirrored | ||
200 | * operation and are ready to return a success/failure code to the buffer | ||
201 | * cache layer. | ||
202 | */ | ||
203 | static void raid_end_bio_io(r1bio_t *r1_bio) | ||
204 | { | ||
205 | struct bio *bio = r1_bio->master_bio; | ||
206 | |||
207 | bio_endio(bio, bio->bi_size, | ||
208 | test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); | ||
209 | free_r1bio(r1_bio); | ||
210 | } | ||
211 | |||
212 | /* | ||
213 | * Update disk head position estimator based on IRQ completion info. | ||
214 | */ | ||
215 | static inline void update_head_pos(int disk, r1bio_t *r1_bio) | ||
216 | { | ||
217 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | ||
218 | |||
219 | conf->mirrors[disk].head_position = | ||
220 | r1_bio->sector + (r1_bio->sectors); | ||
221 | } | ||
222 | |||
223 | static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error) | ||
224 | { | ||
225 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
226 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | ||
227 | int mirror; | ||
228 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | ||
229 | |||
230 | if (bio->bi_size) | ||
231 | return 1; | ||
232 | |||
233 | mirror = r1_bio->read_disk; | ||
234 | /* | ||
235 | * this branch is our 'one mirror IO has finished' event handler: | ||
236 | */ | ||
237 | if (!uptodate) | ||
238 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | ||
239 | else | ||
240 | /* | ||
241 | * Set R1BIO_Uptodate in our master bio, so that | ||
242 | * we will return a good error code to the higher | ||
243 | * levels even if IO on some other mirrored buffer fails. | ||
244 | * | ||
245 | * The 'master' represents the composite IO operation to | ||
246 | * user-side. So if something waits for IO, then it will | ||
247 | * wait for the 'master' bio. | ||
248 | */ | ||
249 | set_bit(R1BIO_Uptodate, &r1_bio->state); | ||
250 | |||
251 | update_head_pos(mirror, r1_bio); | ||
252 | |||
253 | /* | ||
254 | * we have only one bio on the read side | ||
255 | */ | ||
256 | if (uptodate) | ||
257 | raid_end_bio_io(r1_bio); | ||
258 | else { | ||
259 | /* | ||
260 | * oops, read error: | ||
261 | */ | ||
262 | char b[BDEVNAME_SIZE]; | ||
263 | if (printk_ratelimit()) | ||
264 | printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", | ||
265 | bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); | ||
266 | reschedule_retry(r1_bio); | ||
267 | } | ||
268 | |||
269 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | ||
270 | return 0; | ||
271 | } | ||
272 | |||
273 | static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error) | ||
274 | { | ||
275 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
276 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | ||
277 | int mirror; | ||
278 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | ||
279 | |||
280 | if (bio->bi_size) | ||
281 | return 1; | ||
282 | |||
283 | for (mirror = 0; mirror < conf->raid_disks; mirror++) | ||
284 | if (r1_bio->bios[mirror] == bio) | ||
285 | break; | ||
286 | |||
287 | /* | ||
288 | * this branch is our 'one mirror IO has finished' event handler: | ||
289 | */ | ||
290 | if (!uptodate) | ||
291 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | ||
292 | else | ||
293 | /* | ||
294 | * Set R1BIO_Uptodate in our master bio, so that | ||
295 | * we will return a good error code to the higher | ||
296 | * levels even if IO on some other mirrored buffer fails. | ||
297 | * | ||
298 | * The 'master' represents the composite IO operation to | ||
299 | * user-side. So if something waits for IO, then it will | ||
300 | * wait for the 'master' bio. | ||
301 | */ | ||
302 | set_bit(R1BIO_Uptodate, &r1_bio->state); | ||
303 | |||
304 | update_head_pos(mirror, r1_bio); | ||
305 | |||
306 | /* | ||
307 | * | ||
308 | * Let's see if all mirrored write operations have finished | ||
309 | * already. | ||
310 | */ | ||
311 | if (atomic_dec_and_test(&r1_bio->remaining)) { | ||
312 | md_write_end(r1_bio->mddev); | ||
313 | raid_end_bio_io(r1_bio); | ||
314 | } | ||
315 | |||
316 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | ||
317 | return 0; | ||
318 | } | ||
319 | |||
320 | |||
321 | /* | ||
322 | * This routine returns the disk from which the requested read should | ||
323 | * be done. There is a per-array 'next expected sequential IO' sector | ||
324 | * number - if this matches on the next IO then we use the last disk. | ||
325 | * There is also a per-disk 'last known head position' sector that is | ||
326 | * maintained from IRQ contexts, both the normal and the resync IO | ||
327 | * completion handlers update this position correctly. If there is no | ||
328 | * perfect sequential match then we pick the disk whose head is closest. | ||
329 | * | ||
330 | * If there are 2 mirrors in the same 2 devices, performance degrades | ||
331 | * because position is mirror, not device based. | ||
332 | * | ||
333 | * The rdev for the device selected will have nr_pending incremented. | ||
334 | */ | ||
335 | static int read_balance(conf_t *conf, r1bio_t *r1_bio) | ||
336 | { | ||
337 | const unsigned long this_sector = r1_bio->sector; | ||
338 | int new_disk = conf->last_used, disk = new_disk; | ||
339 | const int sectors = r1_bio->sectors; | ||
340 | sector_t new_distance, current_distance; | ||
341 | mdk_rdev_t *new_rdev, *rdev; | ||
342 | |||
343 | rcu_read_lock(); | ||
344 | /* | ||
345 | * Check if we can balance. We can balance on the whole | ||
346 | * device if no resync is going on, or below the resync window. | ||
347 | * We take the first readable disk when above the resync window. | ||
348 | */ | ||
349 | retry: | ||
350 | if (conf->mddev->recovery_cp < MaxSector && | ||
351 | (this_sector + sectors >= conf->next_resync)) { | ||
352 | /* Choose the first operational device, for consistency */ | ||
353 | new_disk = 0; | ||
354 | |||
355 | while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || | ||
356 | !new_rdev->in_sync) { | ||
357 | new_disk++; | ||
358 | if (new_disk == conf->raid_disks) { | ||
359 | new_disk = -1; | ||
360 | break; | ||
361 | } | ||
362 | } | ||
363 | goto rb_out; | ||
364 | } | ||
365 | |||
366 | |||
367 | /* make sure the disk is operational */ | ||
368 | while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || | ||
369 | !new_rdev->in_sync) { | ||
370 | if (new_disk <= 0) | ||
371 | new_disk = conf->raid_disks; | ||
372 | new_disk--; | ||
373 | if (new_disk == disk) { | ||
374 | new_disk = -1; | ||
375 | goto rb_out; | ||
376 | } | ||
377 | } | ||
378 | disk = new_disk; | ||
379 | /* now disk == new_disk == starting point for search */ | ||
380 | |||
381 | /* | ||
382 | * Don't change to another disk for sequential reads: | ||
383 | */ | ||
384 | if (conf->next_seq_sect == this_sector) | ||
385 | goto rb_out; | ||
386 | if (this_sector == conf->mirrors[new_disk].head_position) | ||
387 | goto rb_out; | ||
388 | |||
389 | current_distance = abs(this_sector - conf->mirrors[disk].head_position); | ||
390 | |||
391 | /* Find the disk whose head is closest */ | ||
392 | |||
393 | do { | ||
394 | if (disk <= 0) | ||
395 | disk = conf->raid_disks; | ||
396 | disk--; | ||
397 | |||
398 | if ((rdev=conf->mirrors[disk].rdev) == NULL || | ||
399 | !rdev->in_sync) | ||
400 | continue; | ||
401 | |||
402 | if (!atomic_read(&rdev->nr_pending)) { | ||
403 | new_disk = disk; | ||
404 | new_rdev = rdev; | ||
405 | break; | ||
406 | } | ||
407 | new_distance = abs(this_sector - conf->mirrors[disk].head_position); | ||
408 | if (new_distance < current_distance) { | ||
409 | current_distance = new_distance; | ||
410 | new_disk = disk; | ||
411 | new_rdev = rdev; | ||
412 | } | ||
413 | } while (disk != conf->last_used); | ||
414 | |||
415 | rb_out: | ||
416 | |||
417 | |||
418 | if (new_disk >= 0) { | ||
419 | conf->next_seq_sect = this_sector + sectors; | ||
420 | conf->last_used = new_disk; | ||
421 | atomic_inc(&new_rdev->nr_pending); | ||
422 | if (!new_rdev->in_sync) { | ||
423 | /* cannot risk returning a device that failed | ||
424 | * before we inc'ed nr_pending | ||
425 | */ | ||
426 | atomic_dec(&new_rdev->nr_pending); | ||
427 | goto retry; | ||
428 | } | ||
429 | } | ||
430 | rcu_read_unlock(); | ||
431 | |||
432 | return new_disk; | ||
433 | } | ||
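As a rough illustration of the policy implemented above, here is a hypothetical userspace model (not the driver's code) of the selection rule: among in-sync mirrors, take an idle one immediately, otherwise the one whose last known head position is nearest the requested sector.

#include <stdio.h>
#include <stdlib.h>

struct mirror_model {
	long long head_position;	/* last known head position (sectors) */
	int in_sync;
	int nr_pending;			/* outstanding I/O on this mirror */
};

int pick_read_disk(struct mirror_model *m, int ndisks, long long sector)
{
	int best = -1;
	long long best_dist = -1;
	int i;

	for (i = 0; i < ndisks; i++) {
		long long dist;

		if (!m[i].in_sync)
			continue;
		if (m[i].nr_pending == 0)
			return i;		/* idle mirror: take it immediately */
		dist = llabs(sector - m[i].head_position);
		if (best < 0 || dist < best_dist) {
			best = i;
			best_dist = dist;
		}
	}
	return best;				/* -1 if no operational mirror */
}

int main(void)
{
	struct mirror_model m[3] = {
		{ .head_position = 1000, .in_sync = 1, .nr_pending = 2 },
		{ .head_position = 5000, .in_sync = 1, .nr_pending = 1 },
		{ .head_position =  900, .in_sync = 0, .nr_pending = 0 },
	};

	printf("read sector 1100 from mirror %d\n", pick_read_disk(m, 3, 1100));
	return 0;
}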
434 | |||
435 | static void unplug_slaves(mddev_t *mddev) | ||
436 | { | ||
437 | conf_t *conf = mddev_to_conf(mddev); | ||
438 | int i; | ||
439 | |||
440 | rcu_read_lock(); | ||
441 | for (i=0; i<mddev->raid_disks; i++) { | ||
442 | mdk_rdev_t *rdev = conf->mirrors[i].rdev; | ||
443 | if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { | ||
444 | request_queue_t *r_queue = bdev_get_queue(rdev->bdev); | ||
445 | |||
446 | atomic_inc(&rdev->nr_pending); | ||
447 | rcu_read_unlock(); | ||
448 | |||
449 | if (r_queue->unplug_fn) | ||
450 | r_queue->unplug_fn(r_queue); | ||
451 | |||
452 | rdev_dec_pending(rdev, mddev); | ||
453 | rcu_read_lock(); | ||
454 | } | ||
455 | } | ||
456 | rcu_read_unlock(); | ||
457 | } | ||
458 | |||
459 | static void raid1_unplug(request_queue_t *q) | ||
460 | { | ||
461 | unplug_slaves(q->queuedata); | ||
462 | } | ||
463 | |||
464 | static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, | ||
465 | sector_t *error_sector) | ||
466 | { | ||
467 | mddev_t *mddev = q->queuedata; | ||
468 | conf_t *conf = mddev_to_conf(mddev); | ||
469 | int i, ret = 0; | ||
470 | |||
471 | rcu_read_lock(); | ||
472 | for (i=0; i<mddev->raid_disks && ret == 0; i++) { | ||
473 | mdk_rdev_t *rdev = conf->mirrors[i].rdev; | ||
474 | if (rdev && !rdev->faulty) { | ||
475 | struct block_device *bdev = rdev->bdev; | ||
476 | request_queue_t *r_queue = bdev_get_queue(bdev); | ||
477 | |||
478 | if (!r_queue->issue_flush_fn) | ||
479 | ret = -EOPNOTSUPP; | ||
480 | else { | ||
481 | atomic_inc(&rdev->nr_pending); | ||
482 | rcu_read_unlock(); | ||
483 | ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, | ||
484 | error_sector); | ||
485 | rdev_dec_pending(rdev, mddev); | ||
486 | rcu_read_lock(); | ||
487 | } | ||
488 | } | ||
489 | } | ||
490 | rcu_read_unlock(); | ||
491 | return ret; | ||
492 | } | ||
493 | |||
494 | /* | ||
495 | * Throttle resync depth, so that we both get proper overlapping of | ||
496 | * requests and are still able to handle normal requests quickly. | ||
497 | */ | ||
498 | #define RESYNC_DEPTH 32 | ||
499 | |||
500 | static void device_barrier(conf_t *conf, sector_t sect) | ||
501 | { | ||
502 | spin_lock_irq(&conf->resync_lock); | ||
503 | wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), | ||
504 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
505 | |||
506 | if (!conf->barrier++) { | ||
507 | wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, | ||
508 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
509 | if (conf->nr_pending) | ||
510 | BUG(); | ||
511 | } | ||
512 | wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, | ||
513 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
514 | conf->next_resync = sect; | ||
515 | spin_unlock_irq(&conf->resync_lock); | ||
516 | } | ||
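The interplay of nr_pending, barrier and the two wait queues can be modelled, very loosely, with ordinary condition variables; the sketch below is only an illustration of the idea (normal I/O waits while a barrier is up, resync waits for in-flight I/O to drain) and deliberately ignores the RESYNC_DEPTH throttling and queue unplugging that the real code does.

#include <pthread.h>

struct resync_gate {
	pthread_mutex_t lock;
	pthread_cond_t  idle;	/* signalled when nr_pending drops to zero */
	pthread_cond_t  resume;	/* signalled when the barrier is lowered   */
	int nr_pending;		/* normal I/O currently in flight          */
	int barrier;		/* raised while a resync window is active  */
};

/* normal I/O path: wait for any barrier, then register ourselves */
void io_start(struct resync_gate *g)
{
	pthread_mutex_lock(&g->lock);
	while (g->barrier)
		pthread_cond_wait(&g->resume, &g->lock);
	g->nr_pending++;
	pthread_mutex_unlock(&g->lock);
}

void io_end(struct resync_gate *g)
{
	pthread_mutex_lock(&g->lock);
	if (--g->nr_pending == 0)
		pthread_cond_broadcast(&g->idle);
	pthread_mutex_unlock(&g->lock);
}

/* resync path: raise the barrier and drain in-flight normal I/O */
void resync_barrier(struct resync_gate *g)
{
	pthread_mutex_lock(&g->lock);
	g->barrier++;
	while (g->nr_pending)
		pthread_cond_wait(&g->idle, &g->lock);
	pthread_mutex_unlock(&g->lock);
}

void resync_done(struct resync_gate *g)
{
	pthread_mutex_lock(&g->lock);
	g->barrier--;
	pthread_cond_broadcast(&g->resume);
	pthread_mutex_unlock(&g->lock);
}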
517 | |||
518 | static int make_request(request_queue_t *q, struct bio * bio) | ||
519 | { | ||
520 | mddev_t *mddev = q->queuedata; | ||
521 | conf_t *conf = mddev_to_conf(mddev); | ||
522 | mirror_info_t *mirror; | ||
523 | r1bio_t *r1_bio; | ||
524 | struct bio *read_bio; | ||
525 | int i, disks; | ||
526 | mdk_rdev_t *rdev; | ||
527 | |||
528 | /* | ||
529 | * Register the new request and wait if the reconstruction | ||
530 | * thread has put up a barrier for new requests. | ||
531 | * Continue immediately if no resync is active currently. | ||
532 | */ | ||
533 | spin_lock_irq(&conf->resync_lock); | ||
534 | wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); | ||
535 | conf->nr_pending++; | ||
536 | spin_unlock_irq(&conf->resync_lock); | ||
537 | |||
538 | if (bio_data_dir(bio)==WRITE) { | ||
539 | disk_stat_inc(mddev->gendisk, writes); | ||
540 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); | ||
541 | } else { | ||
542 | disk_stat_inc(mddev->gendisk, reads); | ||
543 | disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); | ||
544 | } | ||
545 | |||
546 | /* | ||
547 | * make_request() can abort the operation when READA is being | ||
548 | * used and no empty request is available. | ||
549 | * | ||
550 | */ | ||
551 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
552 | |||
553 | r1_bio->master_bio = bio; | ||
554 | r1_bio->sectors = bio->bi_size >> 9; | ||
555 | |||
556 | r1_bio->mddev = mddev; | ||
557 | r1_bio->sector = bio->bi_sector; | ||
558 | |||
559 | r1_bio->state = 0; | ||
560 | |||
561 | if (bio_data_dir(bio) == READ) { | ||
562 | /* | ||
563 | * read balancing logic: | ||
564 | */ | ||
565 | int rdisk = read_balance(conf, r1_bio); | ||
566 | |||
567 | if (rdisk < 0) { | ||
568 | /* couldn't find anywhere to read from */ | ||
569 | raid_end_bio_io(r1_bio); | ||
570 | return 0; | ||
571 | } | ||
572 | mirror = conf->mirrors + rdisk; | ||
573 | |||
574 | r1_bio->read_disk = rdisk; | ||
575 | |||
576 | read_bio = bio_clone(bio, GFP_NOIO); | ||
577 | |||
578 | r1_bio->bios[rdisk] = read_bio; | ||
579 | |||
580 | read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; | ||
581 | read_bio->bi_bdev = mirror->rdev->bdev; | ||
582 | read_bio->bi_end_io = raid1_end_read_request; | ||
583 | read_bio->bi_rw = READ; | ||
584 | read_bio->bi_private = r1_bio; | ||
585 | |||
586 | generic_make_request(read_bio); | ||
587 | return 0; | ||
588 | } | ||
589 | |||
590 | /* | ||
591 | * WRITE: | ||
592 | */ | ||
593 | /* first select target devices under spinlock and | ||
594 | * inc refcount on their rdev. Record them by setting | ||
595 | * bios[x] to bio | ||
596 | */ | ||
597 | disks = conf->raid_disks; | ||
598 | rcu_read_lock(); | ||
599 | for (i = 0; i < disks; i++) { | ||
600 | if ((rdev=conf->mirrors[i].rdev) != NULL && | ||
601 | !rdev->faulty) { | ||
602 | atomic_inc(&rdev->nr_pending); | ||
603 | if (rdev->faulty) { | ||
604 | atomic_dec(&rdev->nr_pending); | ||
605 | r1_bio->bios[i] = NULL; | ||
606 | } else | ||
607 | r1_bio->bios[i] = bio; | ||
608 | } else | ||
609 | r1_bio->bios[i] = NULL; | ||
610 | } | ||
611 | rcu_read_unlock(); | ||
612 | |||
613 | atomic_set(&r1_bio->remaining, 1); | ||
614 | md_write_start(mddev); | ||
615 | for (i = 0; i < disks; i++) { | ||
616 | struct bio *mbio; | ||
617 | if (!r1_bio->bios[i]) | ||
618 | continue; | ||
619 | |||
620 | mbio = bio_clone(bio, GFP_NOIO); | ||
621 | r1_bio->bios[i] = mbio; | ||
622 | |||
623 | mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; | ||
624 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
625 | mbio->bi_end_io = raid1_end_write_request; | ||
626 | mbio->bi_rw = WRITE; | ||
627 | mbio->bi_private = r1_bio; | ||
628 | |||
629 | atomic_inc(&r1_bio->remaining); | ||
630 | generic_make_request(mbio); | ||
631 | } | ||
632 | |||
633 | if (atomic_dec_and_test(&r1_bio->remaining)) { | ||
634 | md_write_end(mddev); | ||
635 | raid_end_bio_io(r1_bio); | ||
636 | } | ||
637 | |||
638 | return 0; | ||
639 | } | ||
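The write path above relies on the r1_bio->remaining reference count: it starts at 1 for the submitter, each cloned mirror write adds one, and whichever path drops it to zero completes the master bio. A hypothetical standalone model of that pattern (names invented for illustration):

#include <stdatomic.h>
#include <stdio.h>

struct master_req {
	atomic_int remaining;
};

void complete_master(struct master_req *r)
{
	(void)r;
	printf("master request completed\n");
}

void copy_submitted(struct master_req *r)
{
	atomic_fetch_add(&r->remaining, 1);		/* one reference per mirror write */
}

void copy_done(struct master_req *r)
{
	if (atomic_fetch_sub(&r->remaining, 1) == 1)	/* we dropped the last reference */
		complete_master(r);
}

int main(void)
{
	struct master_req r;
	int i;

	atomic_init(&r.remaining, 1);	/* the submitter's own reference */

	for (i = 0; i < 3; i++)
		copy_submitted(&r);	/* fan out to three mirrors */
	for (i = 0; i < 3; i++)
		copy_done(&r);		/* completions may arrive in any order */

	copy_done(&r);			/* submitter drops its reference last */
	return 0;
}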
640 | |||
641 | static void status(struct seq_file *seq, mddev_t *mddev) | ||
642 | { | ||
643 | conf_t *conf = mddev_to_conf(mddev); | ||
644 | int i; | ||
645 | |||
646 | seq_printf(seq, " [%d/%d] [", conf->raid_disks, | ||
647 | conf->working_disks); | ||
648 | for (i = 0; i < conf->raid_disks; i++) | ||
649 | seq_printf(seq, "%s", | ||
650 | conf->mirrors[i].rdev && | ||
651 | conf->mirrors[i].rdev->in_sync ? "U" : "_"); | ||
652 | seq_printf(seq, "]"); | ||
653 | } | ||
654 | |||
655 | |||
656 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | ||
657 | { | ||
658 | char b[BDEVNAME_SIZE]; | ||
659 | conf_t *conf = mddev_to_conf(mddev); | ||
660 | |||
661 | /* | ||
662 | * If it is not operational, then we have already marked it as dead | ||
663 | * else if it is the last working disk, ignore the error, let the | ||
664 | * next level up know. | ||
665 | * else mark the drive as failed | ||
666 | */ | ||
667 | if (rdev->in_sync | ||
668 | && conf->working_disks == 1) | ||
669 | /* | ||
670 | * Don't fail the drive, act as though we were just a | ||
671 | * normal single drive | ||
672 | */ | ||
673 | return; | ||
674 | if (rdev->in_sync) { | ||
675 | mddev->degraded++; | ||
676 | conf->working_disks--; | ||
677 | /* | ||
678 | * if recovery is running, make sure it aborts. | ||
679 | */ | ||
680 | set_bit(MD_RECOVERY_ERR, &mddev->recovery); | ||
681 | } | ||
682 | rdev->in_sync = 0; | ||
683 | rdev->faulty = 1; | ||
684 | mddev->sb_dirty = 1; | ||
685 | printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" | ||
686 | " Operation continuing on %d devices\n", | ||
687 | bdevname(rdev->bdev,b), conf->working_disks); | ||
688 | } | ||
689 | |||
690 | static void print_conf(conf_t *conf) | ||
691 | { | ||
692 | int i; | ||
693 | mirror_info_t *tmp; | ||
694 | |||
695 | printk("RAID1 conf printout:\n"); | ||
696 | if (!conf) { | ||
697 | printk("(!conf)\n"); | ||
698 | return; | ||
699 | } | ||
700 | printk(" --- wd:%d rd:%d\n", conf->working_disks, | ||
701 | conf->raid_disks); | ||
702 | |||
703 | for (i = 0; i < conf->raid_disks; i++) { | ||
704 | char b[BDEVNAME_SIZE]; | ||
705 | tmp = conf->mirrors + i; | ||
706 | if (tmp->rdev) | ||
707 | printk(" disk %d, wo:%d, o:%d, dev:%s\n", | ||
708 | i, !tmp->rdev->in_sync, !tmp->rdev->faulty, | ||
709 | bdevname(tmp->rdev->bdev,b)); | ||
710 | } | ||
711 | } | ||
712 | |||
713 | static void close_sync(conf_t *conf) | ||
714 | { | ||
715 | spin_lock_irq(&conf->resync_lock); | ||
716 | wait_event_lock_irq(conf->wait_resume, !conf->barrier, | ||
717 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
718 | spin_unlock_irq(&conf->resync_lock); | ||
719 | |||
720 | if (conf->barrier) BUG(); | ||
721 | if (waitqueue_active(&conf->wait_idle)) BUG(); | ||
722 | |||
723 | mempool_destroy(conf->r1buf_pool); | ||
724 | conf->r1buf_pool = NULL; | ||
725 | } | ||
726 | |||
727 | static int raid1_spare_active(mddev_t *mddev) | ||
728 | { | ||
729 | int i; | ||
730 | conf_t *conf = mddev->private; | ||
731 | mirror_info_t *tmp; | ||
732 | |||
733 | /* | ||
734 | * Find all newly recovered (non-faulty, not yet in-sync) disks within | ||
735 | * the RAID1 configuration and mark them in-sync | ||
736 | */ | ||
737 | for (i = 0; i < conf->raid_disks; i++) { | ||
738 | tmp = conf->mirrors + i; | ||
739 | if (tmp->rdev | ||
740 | && !tmp->rdev->faulty | ||
741 | && !tmp->rdev->in_sync) { | ||
742 | conf->working_disks++; | ||
743 | mddev->degraded--; | ||
744 | tmp->rdev->in_sync = 1; | ||
745 | } | ||
746 | } | ||
747 | |||
748 | print_conf(conf); | ||
749 | return 0; | ||
750 | } | ||
751 | |||
752 | |||
753 | static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | ||
754 | { | ||
755 | conf_t *conf = mddev->private; | ||
756 | int found = 0; | ||
757 | int mirror; | ||
758 | mirror_info_t *p; | ||
759 | |||
760 | for (mirror=0; mirror < mddev->raid_disks; mirror++) | ||
761 | if ( !(p=conf->mirrors+mirror)->rdev) { | ||
762 | |||
763 | blk_queue_stack_limits(mddev->queue, | ||
764 | rdev->bdev->bd_disk->queue); | ||
765 | /* as we don't honour merge_bvec_fn, we must never risk | ||
766 | * violating it, so limit ->max_sectors to one PAGE, as | ||
767 | * a one page request is never in violation. | ||
768 | */ | ||
769 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
770 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
771 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | ||
772 | |||
773 | p->head_position = 0; | ||
774 | rdev->raid_disk = mirror; | ||
775 | found = 1; | ||
776 | p->rdev = rdev; | ||
777 | break; | ||
778 | } | ||
779 | |||
780 | print_conf(conf); | ||
781 | return found; | ||
782 | } | ||
783 | |||
784 | static int raid1_remove_disk(mddev_t *mddev, int number) | ||
785 | { | ||
786 | conf_t *conf = mddev->private; | ||
787 | int err = 0; | ||
788 | mdk_rdev_t *rdev; | ||
789 | mirror_info_t *p = conf->mirrors+ number; | ||
790 | |||
791 | print_conf(conf); | ||
792 | rdev = p->rdev; | ||
793 | if (rdev) { | ||
794 | if (rdev->in_sync || | ||
795 | atomic_read(&rdev->nr_pending)) { | ||
796 | err = -EBUSY; | ||
797 | goto abort; | ||
798 | } | ||
799 | p->rdev = NULL; | ||
800 | synchronize_kernel(); | ||
801 | if (atomic_read(&rdev->nr_pending)) { | ||
802 | /* lost the race, try later */ | ||
803 | err = -EBUSY; | ||
804 | p->rdev = rdev; | ||
805 | } | ||
806 | } | ||
807 | abort: | ||
808 | |||
809 | print_conf(conf); | ||
810 | return err; | ||
811 | } | ||
812 | |||
813 | |||
814 | static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) | ||
815 | { | ||
816 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
817 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | ||
818 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | ||
819 | |||
820 | if (bio->bi_size) | ||
821 | return 1; | ||
822 | |||
823 | if (r1_bio->bios[r1_bio->read_disk] != bio) | ||
824 | BUG(); | ||
825 | update_head_pos(r1_bio->read_disk, r1_bio); | ||
826 | /* | ||
827 | * we have read a block, now it needs to be re-written, | ||
828 | * or re-read if the read failed. | ||
829 | * We don't do much here, just schedule handling by raid1d | ||
830 | */ | ||
831 | if (!uptodate) | ||
832 | md_error(r1_bio->mddev, | ||
833 | conf->mirrors[r1_bio->read_disk].rdev); | ||
834 | else | ||
835 | set_bit(R1BIO_Uptodate, &r1_bio->state); | ||
836 | rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); | ||
837 | reschedule_retry(r1_bio); | ||
838 | return 0; | ||
839 | } | ||
840 | |||
841 | static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) | ||
842 | { | ||
843 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
844 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | ||
845 | mddev_t *mddev = r1_bio->mddev; | ||
846 | conf_t *conf = mddev_to_conf(mddev); | ||
847 | int i; | ||
848 | int mirror=0; | ||
849 | |||
850 | if (bio->bi_size) | ||
851 | return 1; | ||
852 | |||
853 | for (i = 0; i < conf->raid_disks; i++) | ||
854 | if (r1_bio->bios[i] == bio) { | ||
855 | mirror = i; | ||
856 | break; | ||
857 | } | ||
858 | if (!uptodate) | ||
859 | md_error(mddev, conf->mirrors[mirror].rdev); | ||
860 | update_head_pos(mirror, r1_bio); | ||
861 | |||
862 | if (atomic_dec_and_test(&r1_bio->remaining)) { | ||
863 | md_done_sync(mddev, r1_bio->sectors, uptodate); | ||
864 | put_buf(r1_bio); | ||
865 | } | ||
866 | rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); | ||
867 | return 0; | ||
868 | } | ||
869 | |||
870 | static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | ||
871 | { | ||
872 | conf_t *conf = mddev_to_conf(mddev); | ||
873 | int i; | ||
874 | int disks = conf->raid_disks; | ||
875 | struct bio *bio, *wbio; | ||
876 | |||
877 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
878 | |||
879 | /* | ||
880 | * schedule writes | ||
881 | */ | ||
882 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { | ||
883 | /* | ||
884 | * There is no point trying a read-for-reconstruct as | ||
885 | * reconstruct is about to be aborted | ||
886 | */ | ||
887 | char b[BDEVNAME_SIZE]; | ||
888 | printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" | ||
889 | " for block %llu\n", | ||
890 | bdevname(bio->bi_bdev,b), | ||
891 | (unsigned long long)r1_bio->sector); | ||
892 | md_done_sync(mddev, r1_bio->sectors, 0); | ||
893 | put_buf(r1_bio); | ||
894 | return; | ||
895 | } | ||
896 | |||
897 | atomic_set(&r1_bio->remaining, 1); | ||
898 | for (i = 0; i < disks ; i++) { | ||
899 | wbio = r1_bio->bios[i]; | ||
900 | if (wbio->bi_end_io != end_sync_write) | ||
901 | continue; | ||
902 | |||
903 | atomic_inc(&conf->mirrors[i].rdev->nr_pending); | ||
904 | atomic_inc(&r1_bio->remaining); | ||
905 | md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); | ||
906 | generic_make_request(wbio); | ||
907 | } | ||
908 | |||
909 | if (atomic_dec_and_test(&r1_bio->remaining)) { | ||
910 | md_done_sync(mddev, r1_bio->sectors, 1); | ||
911 | put_buf(r1_bio); | ||
912 | } | ||
913 | } | ||
914 | |||
915 | /* | ||
916 | * This is a kernel thread which: | ||
917 | * | ||
918 | * 1. Retries failed read operations on working mirrors. | ||
919 | * 2. Updates the raid superblock when problems are encountered. | ||
920 | * 3. Performs writes following reads for array synchronising. | ||
921 | */ | ||
922 | |||
923 | static void raid1d(mddev_t *mddev) | ||
924 | { | ||
925 | r1bio_t *r1_bio; | ||
926 | struct bio *bio; | ||
927 | unsigned long flags; | ||
928 | conf_t *conf = mddev_to_conf(mddev); | ||
929 | struct list_head *head = &conf->retry_list; | ||
930 | int unplug=0; | ||
931 | mdk_rdev_t *rdev; | ||
932 | |||
933 | md_check_recovery(mddev); | ||
934 | md_handle_safemode(mddev); | ||
935 | |||
936 | for (;;) { | ||
937 | char b[BDEVNAME_SIZE]; | ||
938 | spin_lock_irqsave(&conf->device_lock, flags); | ||
939 | if (list_empty(head)) | ||
940 | break; | ||
941 | r1_bio = list_entry(head->prev, r1bio_t, retry_list); | ||
942 | list_del(head->prev); | ||
943 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
944 | |||
945 | mddev = r1_bio->mddev; | ||
946 | conf = mddev_to_conf(mddev); | ||
947 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { | ||
948 | sync_request_write(mddev, r1_bio); | ||
949 | unplug = 1; | ||
950 | } else { | ||
951 | int disk; | ||
952 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
953 | if ((disk=read_balance(conf, r1_bio)) == -1) { | ||
954 | printk(KERN_ALERT "raid1: %s: unrecoverable I/O" | ||
955 | " read error for block %llu\n", | ||
956 | bdevname(bio->bi_bdev,b), | ||
957 | (unsigned long long)r1_bio->sector); | ||
958 | raid_end_bio_io(r1_bio); | ||
959 | } else { | ||
960 | r1_bio->bios[r1_bio->read_disk] = NULL; | ||
961 | r1_bio->read_disk = disk; | ||
962 | bio_put(bio); | ||
963 | bio = bio_clone(r1_bio->master_bio, GFP_NOIO); | ||
964 | r1_bio->bios[r1_bio->read_disk] = bio; | ||
965 | rdev = conf->mirrors[disk].rdev; | ||
966 | if (printk_ratelimit()) | ||
967 | printk(KERN_ERR "raid1: %s: redirecting sector %llu to" | ||
968 | " another mirror\n", | ||
969 | bdevname(rdev->bdev,b), | ||
970 | (unsigned long long)r1_bio->sector); | ||
971 | bio->bi_sector = r1_bio->sector + rdev->data_offset; | ||
972 | bio->bi_bdev = rdev->bdev; | ||
973 | bio->bi_end_io = raid1_end_read_request; | ||
974 | bio->bi_rw = READ; | ||
975 | bio->bi_private = r1_bio; | ||
976 | unplug = 1; | ||
977 | generic_make_request(bio); | ||
978 | } | ||
979 | } | ||
980 | } | ||
981 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
982 | if (unplug) | ||
983 | unplug_slaves(mddev); | ||
984 | } | ||
985 | |||
986 | |||
987 | static int init_resync(conf_t *conf) | ||
988 | { | ||
989 | int buffs; | ||
990 | |||
991 | buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; | ||
992 | if (conf->r1buf_pool) | ||
993 | BUG(); | ||
994 | conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free, | ||
995 | conf->poolinfo); | ||
996 | if (!conf->r1buf_pool) | ||
997 | return -ENOMEM; | ||
998 | conf->next_resync = 0; | ||
999 | return 0; | ||
1000 | } | ||
1001 | |||
1002 | /* | ||
1003 | * perform a "sync" on one "block" | ||
1004 | * | ||
1005 | * We need to make sure that no normal I/O request - particularly write | ||
1006 | * requests - conflict with active sync requests. | ||
1007 | * | ||
1008 | * This is achieved by tracking pending requests and a 'barrier' concept | ||
1009 | * that can be installed to exclude normal IO requests. | ||
1010 | */ | ||
1011 | |||
1012 | static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) | ||
1013 | { | ||
1014 | conf_t *conf = mddev_to_conf(mddev); | ||
1015 | mirror_info_t *mirror; | ||
1016 | r1bio_t *r1_bio; | ||
1017 | struct bio *bio; | ||
1018 | sector_t max_sector, nr_sectors; | ||
1019 | int disk; | ||
1020 | int i; | ||
1021 | int write_targets = 0; | ||
1022 | |||
1023 | if (!conf->r1buf_pool) | ||
1024 | if (init_resync(conf)) | ||
1025 | return -ENOMEM; | ||
1026 | |||
1027 | max_sector = mddev->size << 1; | ||
1028 | if (sector_nr >= max_sector) { | ||
1029 | close_sync(conf); | ||
1030 | return 0; | ||
1031 | } | ||
1032 | |||
1033 | /* | ||
1034 | * If there is non-resync activity waiting for us then | ||
1035 | * put in a delay to throttle resync. | ||
1036 | */ | ||
1037 | if (!go_faster && waitqueue_active(&conf->wait_resume)) | ||
1038 | msleep_interruptible(1000); | ||
1039 | device_barrier(conf, sector_nr + RESYNC_SECTORS); | ||
1040 | |||
1041 | /* | ||
1042 | * If reconstructing, and >1 working disk, | ||
1043 | * could dedicate one to rebuild and others to | ||
1044 | * service read requests .. | ||
1045 | */ | ||
1046 | disk = conf->last_used; | ||
1047 | /* make sure disk is operational */ | ||
1048 | |||
1049 | while (conf->mirrors[disk].rdev == NULL || | ||
1050 | !conf->mirrors[disk].rdev->in_sync) { | ||
1051 | if (disk <= 0) | ||
1052 | disk = conf->raid_disks; | ||
1053 | disk--; | ||
1054 | if (disk == conf->last_used) | ||
1055 | break; | ||
1056 | } | ||
1057 | conf->last_used = disk; | ||
1058 | atomic_inc(&conf->mirrors[disk].rdev->nr_pending); | ||
1059 | |||
1060 | |||
1061 | mirror = conf->mirrors + disk; | ||
1062 | |||
1063 | r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); | ||
1064 | |||
1065 | spin_lock_irq(&conf->resync_lock); | ||
1066 | conf->nr_pending++; | ||
1067 | spin_unlock_irq(&conf->resync_lock); | ||
1068 | |||
1069 | r1_bio->mddev = mddev; | ||
1070 | r1_bio->sector = sector_nr; | ||
1071 | set_bit(R1BIO_IsSync, &r1_bio->state); | ||
1072 | r1_bio->read_disk = disk; | ||
1073 | |||
1074 | for (i=0; i < conf->raid_disks; i++) { | ||
1075 | bio = r1_bio->bios[i]; | ||
1076 | |||
1077 | /* take from bio_init */ | ||
1078 | bio->bi_next = NULL; | ||
1079 | bio->bi_flags |= 1 << BIO_UPTODATE; | ||
1080 | bio->bi_rw = 0; | ||
1081 | bio->bi_vcnt = 0; | ||
1082 | bio->bi_idx = 0; | ||
1083 | bio->bi_phys_segments = 0; | ||
1084 | bio->bi_hw_segments = 0; | ||
1085 | bio->bi_size = 0; | ||
1086 | bio->bi_end_io = NULL; | ||
1087 | bio->bi_private = NULL; | ||
1088 | |||
1089 | if (i == disk) { | ||
1090 | bio->bi_rw = READ; | ||
1091 | bio->bi_end_io = end_sync_read; | ||
1092 | } else if (conf->mirrors[i].rdev && | ||
1093 | !conf->mirrors[i].rdev->faulty && | ||
1094 | (!conf->mirrors[i].rdev->in_sync || | ||
1095 | sector_nr + RESYNC_SECTORS > mddev->recovery_cp)) { | ||
1096 | bio->bi_rw = WRITE; | ||
1097 | bio->bi_end_io = end_sync_write; | ||
1098 | write_targets ++; | ||
1099 | } else | ||
1100 | continue; | ||
1101 | bio->bi_sector = sector_nr + conf->mirrors[i].rdev->data_offset; | ||
1102 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1103 | bio->bi_private = r1_bio; | ||
1104 | } | ||
1105 | if (write_targets == 0) { | ||
1106 | /* There is nowhere to write, so all non-sync | ||
1107 | * drives must be failed - so we are finished | ||
1108 | */ | ||
1109 | int rv = max_sector - sector_nr; | ||
1110 | md_done_sync(mddev, rv, 1); | ||
1111 | put_buf(r1_bio); | ||
1112 | rdev_dec_pending(conf->mirrors[disk].rdev, mddev); | ||
1113 | return rv; | ||
1114 | } | ||
1115 | |||
1116 | nr_sectors = 0; | ||
1117 | do { | ||
1118 | struct page *page; | ||
1119 | int len = PAGE_SIZE; | ||
1120 | if (sector_nr + (len>>9) > max_sector) | ||
1121 | len = (max_sector - sector_nr) << 9; | ||
1122 | if (len == 0) | ||
1123 | break; | ||
1124 | for (i=0 ; i < conf->raid_disks; i++) { | ||
1125 | bio = r1_bio->bios[i]; | ||
1126 | if (bio->bi_end_io) { | ||
1127 | page = r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page; | ||
1128 | if (bio_add_page(bio, page, len, 0) == 0) { | ||
1129 | /* stop here */ | ||
1130 | r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page = page; | ||
1131 | while (i > 0) { | ||
1132 | i--; | ||
1133 | bio = r1_bio->bios[i]; | ||
1134 | if (bio->bi_end_io==NULL) continue; | ||
1135 | /* remove last page from this bio */ | ||
1136 | bio->bi_vcnt--; | ||
1137 | bio->bi_size -= len; | ||
1138 | bio->bi_flags &= ~(1<< BIO_SEG_VALID); | ||
1139 | } | ||
1140 | goto bio_full; | ||
1141 | } | ||
1142 | } | ||
1143 | } | ||
1144 | nr_sectors += len>>9; | ||
1145 | sector_nr += len>>9; | ||
1146 | } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); | ||
1147 | bio_full: | ||
1148 | bio = r1_bio->bios[disk]; | ||
1149 | r1_bio->sectors = nr_sectors; | ||
1150 | |||
1151 | md_sync_acct(mirror->rdev->bdev, nr_sectors); | ||
1152 | |||
1153 | generic_make_request(bio); | ||
1154 | |||
1155 | return nr_sectors; | ||
1156 | } | ||
1157 | |||
1158 | static int run(mddev_t *mddev) | ||
1159 | { | ||
1160 | conf_t *conf; | ||
1161 | int i, j, disk_idx; | ||
1162 | mirror_info_t *disk; | ||
1163 | mdk_rdev_t *rdev; | ||
1164 | struct list_head *tmp; | ||
1165 | |||
1166 | if (mddev->level != 1) { | ||
1167 | printk("raid1: %s: raid level not set to mirroring (%d)\n", | ||
1168 | mdname(mddev), mddev->level); | ||
1169 | goto out; | ||
1170 | } | ||
1171 | /* | ||
1172 | * copy the already verified devices into our private RAID1 | ||
1173 | * bookkeeping area. [whatever we allocate in run(), | ||
1174 | * should be freed in stop()] | ||
1175 | */ | ||
1176 | conf = kmalloc(sizeof(conf_t), GFP_KERNEL); | ||
1177 | mddev->private = conf; | ||
1178 | if (!conf) | ||
1179 | goto out_no_mem; | ||
1180 | |||
1181 | memset(conf, 0, sizeof(*conf)); | ||
1182 | conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks, | ||
1183 | GFP_KERNEL); | ||
1184 | if (!conf->mirrors) | ||
1185 | goto out_no_mem; | ||
1186 | |||
1187 | memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); | ||
1188 | |||
1189 | conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); | ||
1190 | if (!conf->poolinfo) | ||
1191 | goto out_no_mem; | ||
1192 | conf->poolinfo->mddev = mddev; | ||
1193 | conf->poolinfo->raid_disks = mddev->raid_disks; | ||
1194 | conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, | ||
1195 | r1bio_pool_free, | ||
1196 | conf->poolinfo); | ||
1197 | if (!conf->r1bio_pool) | ||
1198 | goto out_no_mem; | ||
1199 | |||
1200 | mddev->queue->unplug_fn = raid1_unplug; | ||
1201 | |||
1202 | mddev->queue->issue_flush_fn = raid1_issue_flush; | ||
1203 | |||
1204 | ITERATE_RDEV(mddev, rdev, tmp) { | ||
1205 | disk_idx = rdev->raid_disk; | ||
1206 | if (disk_idx >= mddev->raid_disks | ||
1207 | || disk_idx < 0) | ||
1208 | continue; | ||
1209 | disk = conf->mirrors + disk_idx; | ||
1210 | |||
1211 | disk->rdev = rdev; | ||
1212 | |||
1213 | blk_queue_stack_limits(mddev->queue, | ||
1214 | rdev->bdev->bd_disk->queue); | ||
1215 | /* as we don't honour merge_bvec_fn, we must never risk | ||
1216 | * violating it, so limit ->max_sectors to one PAGE, as | ||
1217 | * a one page request is never in violation. | ||
1218 | */ | ||
1219 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
1220 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
1221 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | ||
1222 | |||
1223 | disk->head_position = 0; | ||
1224 | if (!rdev->faulty && rdev->in_sync) | ||
1225 | conf->working_disks++; | ||
1226 | } | ||
1227 | conf->raid_disks = mddev->raid_disks; | ||
1228 | conf->mddev = mddev; | ||
1229 | spin_lock_init(&conf->device_lock); | ||
1230 | INIT_LIST_HEAD(&conf->retry_list); | ||
1231 | if (conf->working_disks == 1) | ||
1232 | mddev->recovery_cp = MaxSector; | ||
1233 | |||
1234 | spin_lock_init(&conf->resync_lock); | ||
1235 | init_waitqueue_head(&conf->wait_idle); | ||
1236 | init_waitqueue_head(&conf->wait_resume); | ||
1237 | |||
1238 | if (!conf->working_disks) { | ||
1239 | printk(KERN_ERR "raid1: no operational mirrors for %s\n", | ||
1240 | mdname(mddev)); | ||
1241 | goto out_free_conf; | ||
1242 | } | ||
1243 | |||
1244 | mddev->degraded = 0; | ||
1245 | for (i = 0; i < conf->raid_disks; i++) { | ||
1246 | |||
1247 | disk = conf->mirrors + i; | ||
1248 | |||
1249 | if (!disk->rdev) { | ||
1250 | disk->head_position = 0; | ||
1251 | mddev->degraded++; | ||
1252 | } | ||
1253 | } | ||
1254 | |||
1255 | /* | ||
1256 | * find the first working one and use it as a starting point | ||
1257 | * for read balancing. | ||
1258 | */ | ||
1259 | for (j = 0; j < conf->raid_disks && | ||
1260 | (!conf->mirrors[j].rdev || | ||
1261 | !conf->mirrors[j].rdev->in_sync) ; j++) | ||
1262 | /* nothing */; | ||
1263 | conf->last_used = j; | ||
1264 | |||
1265 | |||
1266 | |||
1267 | { | ||
1268 | mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); | ||
1269 | if (!mddev->thread) { | ||
1270 | printk(KERN_ERR | ||
1271 | "raid1: couldn't allocate thread for %s\n", | ||
1272 | mdname(mddev)); | ||
1273 | goto out_free_conf; | ||
1274 | } | ||
1275 | } | ||
1276 | printk(KERN_INFO | ||
1277 | "raid1: raid set %s active with %d out of %d mirrors\n", | ||
1278 | mdname(mddev), mddev->raid_disks - mddev->degraded, | ||
1279 | mddev->raid_disks); | ||
1280 | /* | ||
1281 | * Ok, everything is just fine now | ||
1282 | */ | ||
1283 | mddev->array_size = mddev->size; | ||
1284 | |||
1285 | return 0; | ||
1286 | |||
1287 | out_no_mem: | ||
1288 | printk(KERN_ERR "raid1: couldn't allocate memory for %s\n", | ||
1289 | mdname(mddev)); | ||
1290 | |||
1291 | out_free_conf: | ||
1292 | if (conf) { | ||
1293 | if (conf->r1bio_pool) | ||
1294 | mempool_destroy(conf->r1bio_pool); | ||
1295 | if (conf->mirrors) | ||
1296 | kfree(conf->mirrors); | ||
1297 | if (conf->poolinfo) | ||
1298 | kfree(conf->poolinfo); | ||
1299 | kfree(conf); | ||
1300 | mddev->private = NULL; | ||
1301 | } | ||
1302 | out: | ||
1303 | return -EIO; | ||
1304 | } | ||
1305 | |||
1306 | static int stop(mddev_t *mddev) | ||
1307 | { | ||
1308 | conf_t *conf = mddev_to_conf(mddev); | ||
1309 | |||
1310 | md_unregister_thread(mddev->thread); | ||
1311 | mddev->thread = NULL; | ||
1312 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
1313 | if (conf->r1bio_pool) | ||
1314 | mempool_destroy(conf->r1bio_pool); | ||
1315 | if (conf->mirrors) | ||
1316 | kfree(conf->mirrors); | ||
1317 | if (conf->poolinfo) | ||
1318 | kfree(conf->poolinfo); | ||
1319 | kfree(conf); | ||
1320 | mddev->private = NULL; | ||
1321 | return 0; | ||
1322 | } | ||
1323 | |||
1324 | static int raid1_resize(mddev_t *mddev, sector_t sectors) | ||
1325 | { | ||
1326 | /* no resync is happening, and there is enough space | ||
1327 | * on all devices, so we can resize. | ||
1328 | * We need to make sure resync covers any new space. | ||
1329 | * If the array is shrinking we should possibly wait until | ||
1330 | * any io in the removed space completes, but it hardly seems | ||
1331 | * worth it. | ||
1332 | */ | ||
1333 | mddev->array_size = sectors>>1; | ||
1334 | set_capacity(mddev->gendisk, mddev->array_size << 1); | ||
1335 | mddev->changed = 1; | ||
1336 | if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) { | ||
1337 | mddev->recovery_cp = mddev->size << 1; | ||
1338 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
1339 | } | ||
1340 | mddev->size = mddev->array_size; | ||
1341 | return 0; | ||
1342 | } | ||
1343 | |||
1344 | static int raid1_reshape(mddev_t *mddev, int raid_disks) | ||
1345 | { | ||
1346 | /* We need to: | ||
1347 | * 1/ resize the r1bio_pool | ||
1348 | * 2/ resize conf->mirrors | ||
1349 | * | ||
1350 | * We allocate a new r1bio_pool if we can. | ||
1351 | * Then raise a device barrier and wait until all IO stops. | ||
1352 | * Then resize conf->mirrors and swap in the new r1bio pool. | ||
1353 | */ | ||
1354 | mempool_t *newpool, *oldpool; | ||
1355 | struct pool_info *newpoolinfo; | ||
1356 | mirror_info_t *newmirrors; | ||
1357 | conf_t *conf = mddev_to_conf(mddev); | ||
1358 | |||
1359 | int d; | ||
1360 | |||
1361 | for (d= raid_disks; d < conf->raid_disks; d++) | ||
1362 | if (conf->mirrors[d].rdev) | ||
1363 | return -EBUSY; | ||
1364 | |||
1365 | newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); | ||
1366 | if (!newpoolinfo) | ||
1367 | return -ENOMEM; | ||
1368 | newpoolinfo->mddev = mddev; | ||
1369 | newpoolinfo->raid_disks = raid_disks; | ||
1370 | |||
1371 | newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, | ||
1372 | r1bio_pool_free, newpoolinfo); | ||
1373 | if (!newpool) { | ||
1374 | kfree(newpoolinfo); | ||
1375 | return -ENOMEM; | ||
1376 | } | ||
1377 | newmirrors = kmalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL); | ||
1378 | if (!newmirrors) { | ||
1379 | kfree(newpoolinfo); | ||
1380 | mempool_destroy(newpool); | ||
1381 | return -ENOMEM; | ||
1382 | } | ||
1383 | memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks); | ||
1384 | |||
1385 | spin_lock_irq(&conf->resync_lock); | ||
1386 | conf->barrier++; | ||
1387 | wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, | ||
1388 | conf->resync_lock, unplug_slaves(mddev)); | ||
1389 | spin_unlock_irq(&conf->resync_lock); | ||
1390 | |||
1391 | /* ok, everything is stopped */ | ||
1392 | oldpool = conf->r1bio_pool; | ||
1393 | conf->r1bio_pool = newpool; | ||
1394 | for (d=0; d < raid_disks && d < conf->raid_disks; d++) | ||
1395 | newmirrors[d] = conf->mirrors[d]; | ||
1396 | kfree(conf->mirrors); | ||
1397 | conf->mirrors = newmirrors; | ||
1398 | kfree(conf->poolinfo); | ||
1399 | conf->poolinfo = newpoolinfo; | ||
1400 | |||
1401 | mddev->degraded += (raid_disks - conf->raid_disks); | ||
1402 | conf->raid_disks = mddev->raid_disks = raid_disks; | ||
1403 | |||
1404 | spin_lock_irq(&conf->resync_lock); | ||
1405 | conf->barrier--; | ||
1406 | spin_unlock_irq(&conf->resync_lock); | ||
1407 | wake_up(&conf->wait_resume); | ||
1408 | wake_up(&conf->wait_idle); | ||
1409 | |||
1410 | |||
1411 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
1412 | md_wakeup_thread(mddev->thread); | ||
1413 | |||
1414 | mempool_destroy(oldpool); | ||
1415 | return 0; | ||
1416 | } | ||
1417 | |||
1418 | |||
1419 | static mdk_personality_t raid1_personality = | ||
1420 | { | ||
1421 | .name = "raid1", | ||
1422 | .owner = THIS_MODULE, | ||
1423 | .make_request = make_request, | ||
1424 | .run = run, | ||
1425 | .stop = stop, | ||
1426 | .status = status, | ||
1427 | .error_handler = error, | ||
1428 | .hot_add_disk = raid1_add_disk, | ||
1429 | .hot_remove_disk= raid1_remove_disk, | ||
1430 | .spare_active = raid1_spare_active, | ||
1431 | .sync_request = sync_request, | ||
1432 | .resize = raid1_resize, | ||
1433 | .reshape = raid1_reshape, | ||
1434 | }; | ||
1435 | |||
1436 | static int __init raid_init(void) | ||
1437 | { | ||
1438 | return register_md_personality(RAID1, &raid1_personality); | ||
1439 | } | ||
1440 | |||
1441 | static void raid_exit(void) | ||
1442 | { | ||
1443 | unregister_md_personality(RAID1); | ||
1444 | } | ||
1445 | |||
1446 | module_init(raid_init); | ||
1447 | module_exit(raid_exit); | ||
1448 | MODULE_LICENSE("GPL"); | ||
1449 | MODULE_ALIAS("md-personality-3"); /* RAID1 */ | ||
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c new file mode 100644 index 000000000000..b100bfe4fdca --- /dev/null +++ b/drivers/md/raid10.c | |||
@@ -0,0 +1,1787 @@ | |||
1 | /* | ||
2 | * raid10.c : Multiple Devices driver for Linux | ||
3 | * | ||
4 | * Copyright (C) 2000-2004 Neil Brown | ||
5 | * | ||
6 | * RAID-10 support for md. | ||
7 | * | ||
8 | * Based on code in raid1.c. See raid1.c for further copyright information. | ||
9 | * | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License as published by | ||
13 | * the Free Software Foundation; either version 2, or (at your option) | ||
14 | * any later version. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
18 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
19 | */ | ||
20 | |||
21 | #include <linux/raid/raid10.h> | ||
22 | |||
23 | /* | ||
24 | * RAID10 provides a combination of RAID0 and RAID1 functionality. | ||
25 | * The layout of data is defined by | ||
26 | * chunk_size | ||
27 | * raid_disks | ||
28 | * near_copies (stored in low byte of layout) | ||
29 | * far_copies (stored in second byte of layout) | ||
30 | * | ||
31 | * The data to be stored is divided into chunks using chunksize. | ||
32 | * Each device is divided into far_copies sections. | ||
33 | * In each section, chunks are laid out in a style similar to raid0, but | ||
34 | * near_copies copies of each chunk are stored (each on a different drive). | ||
35 | * The starting device for each section is offset by near_copies from the starting | ||
36 | * device of the previous section. | ||
37 | * Thus there are (near_copies*far_copies) copies of each chunk, and each is on a different | ||
38 | * drive. | ||
39 | * near_copies and far_copies must be at least one, and their product is at most | ||
40 | * raid_disks. | ||
41 | */ | ||
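To make the layout description concrete, the sketch below (illustrative only, not the driver's mapping code) enumerates where the copies of the first few array chunks land for the common "n2" arrangement, assuming raid_disks = 4, near_copies = 2 and far_copies = 1 so that only the near-copy placement matters.

#include <stdio.h>

int main(void)
{
	int raid_disks = 4, near_copies = 2;

	for (int chunk = 0; chunk < 6; chunk++) {
		int first     = (chunk * near_copies) % raid_disks;	/* first device holding a copy */
		int dev_chunk = (chunk * near_copies) / raid_disks;	/* chunk offset on that device */

		printf("array chunk %d ->", chunk);
		for (int k = 0; k < near_copies; k++)
			printf(" disk %d (chunk %d)",
			       (first + k) % raid_disks, dev_chunk);
		printf("\n");
	}
	return 0;
}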
42 | |||
43 | /* | ||
44 | * Number of guaranteed r10bios in case of extreme VM load: | ||
45 | */ | ||
46 | #define NR_RAID10_BIOS 256 | ||
47 | |||
48 | static void unplug_slaves(mddev_t *mddev); | ||
49 | |||
50 | static void * r10bio_pool_alloc(unsigned int __nocast gfp_flags, void *data) | ||
51 | { | ||
52 | conf_t *conf = data; | ||
53 | r10bio_t *r10_bio; | ||
54 | int size = offsetof(struct r10bio_s, devs[conf->copies]); | ||
55 | |||
56 | /* allocate a r10bio with room for 'copies' entries in the devs array */ | ||
57 | r10_bio = kmalloc(size, gfp_flags); | ||
58 | if (r10_bio) | ||
59 | memset(r10_bio, 0, size); | ||
60 | else | ||
61 | unplug_slaves(conf->mddev); | ||
62 | |||
63 | return r10_bio; | ||
64 | } | ||
65 | |||
66 | static void r10bio_pool_free(void *r10_bio, void *data) | ||
67 | { | ||
68 | kfree(r10_bio); | ||
69 | } | ||
70 | |||
71 | #define RESYNC_BLOCK_SIZE (64*1024) | ||
72 | //#define RESYNC_BLOCK_SIZE PAGE_SIZE | ||
73 | #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) | ||
74 | #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) | ||
75 | #define RESYNC_WINDOW (2048*1024) | ||
76 | |||
77 | /* | ||
78 | * When performing a resync, we need to read and compare, so | ||
79 | * we need as many pages as there are copies. | ||
80 | * When performing a recovery, we need 2 bios, one for read, | ||
81 | * one for write (we recover only one drive per r10buf) | ||
82 | * | ||
83 | */ | ||
84 | static void * r10buf_pool_alloc(unsigned int __nocast gfp_flags, void *data) | ||
85 | { | ||
86 | conf_t *conf = data; | ||
87 | struct page *page; | ||
88 | r10bio_t *r10_bio; | ||
89 | struct bio *bio; | ||
90 | int i, j; | ||
91 | int nalloc; | ||
92 | |||
93 | r10_bio = r10bio_pool_alloc(gfp_flags, conf); | ||
94 | if (!r10_bio) { | ||
95 | unplug_slaves(conf->mddev); | ||
96 | return NULL; | ||
97 | } | ||
98 | |||
99 | if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) | ||
100 | nalloc = conf->copies; /* resync */ | ||
101 | else | ||
102 | nalloc = 2; /* recovery */ | ||
103 | |||
104 | /* | ||
105 | * Allocate bios. | ||
106 | */ | ||
107 | for (j = nalloc ; j-- ; ) { | ||
108 | bio = bio_alloc(gfp_flags, RESYNC_PAGES); | ||
109 | if (!bio) | ||
110 | goto out_free_bio; | ||
111 | r10_bio->devs[j].bio = bio; | ||
112 | } | ||
113 | /* | ||
114 | * Allocate RESYNC_PAGES data pages and attach them | ||
115 | * where needed. | ||
116 | */ | ||
117 | for (j = 0 ; j < nalloc; j++) { | ||
118 | bio = r10_bio->devs[j].bio; | ||
119 | for (i = 0; i < RESYNC_PAGES; i++) { | ||
120 | page = alloc_page(gfp_flags); | ||
121 | if (unlikely(!page)) | ||
122 | goto out_free_pages; | ||
123 | |||
124 | bio->bi_io_vec[i].bv_page = page; | ||
125 | } | ||
126 | } | ||
127 | |||
128 | return r10_bio; | ||
129 | |||
130 | out_free_pages: | ||
131 | for ( ; i > 0 ; i--) | ||
132 | __free_page(bio->bi_io_vec[i-1].bv_page); | ||
133 | while (j--) | ||
134 | for (i = 0; i < RESYNC_PAGES ; i++) | ||
135 | __free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); | ||
136 | j = -1; | ||
137 | out_free_bio: | ||
138 | while ( ++j < nalloc ) | ||
139 | bio_put(r10_bio->devs[j].bio); | ||
140 | r10bio_pool_free(r10_bio, conf); | ||
141 | return NULL; | ||
142 | } | ||
143 | |||
144 | static void r10buf_pool_free(void *__r10_bio, void *data) | ||
145 | { | ||
146 | int i; | ||
147 | conf_t *conf = data; | ||
148 | r10bio_t *r10bio = __r10_bio; | ||
149 | int j; | ||
150 | |||
151 | for (j=0; j < conf->copies; j++) { | ||
152 | struct bio *bio = r10bio->devs[j].bio; | ||
153 | if (bio) { | ||
154 | for (i = 0; i < RESYNC_PAGES; i++) { | ||
155 | __free_page(bio->bi_io_vec[i].bv_page); | ||
156 | bio->bi_io_vec[i].bv_page = NULL; | ||
157 | } | ||
158 | bio_put(bio); | ||
159 | } | ||
160 | } | ||
161 | r10bio_pool_free(r10bio, conf); | ||
162 | } | ||
163 | |||
164 | static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) | ||
165 | { | ||
166 | int i; | ||
167 | |||
168 | for (i = 0; i < conf->copies; i++) { | ||
169 | struct bio **bio = & r10_bio->devs[i].bio; | ||
170 | if (*bio) | ||
171 | bio_put(*bio); | ||
172 | *bio = NULL; | ||
173 | } | ||
174 | } | ||
175 | |||
176 | static inline void free_r10bio(r10bio_t *r10_bio) | ||
177 | { | ||
178 | unsigned long flags; | ||
179 | |||
180 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | ||
181 | |||
182 | /* | ||
183 | * Wake up any possible resync thread that waits for the device | ||
184 | * to go idle. | ||
185 | */ | ||
186 | spin_lock_irqsave(&conf->resync_lock, flags); | ||
187 | if (!--conf->nr_pending) { | ||
188 | wake_up(&conf->wait_idle); | ||
189 | wake_up(&conf->wait_resume); | ||
190 | } | ||
191 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
192 | |||
193 | put_all_bios(conf, r10_bio); | ||
194 | mempool_free(r10_bio, conf->r10bio_pool); | ||
195 | } | ||
196 | |||
197 | static inline void put_buf(r10bio_t *r10_bio) | ||
198 | { | ||
199 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | ||
200 | unsigned long flags; | ||
201 | |||
202 | mempool_free(r10_bio, conf->r10buf_pool); | ||
203 | |||
204 | spin_lock_irqsave(&conf->resync_lock, flags); | ||
205 | if (!conf->barrier) | ||
206 | BUG(); | ||
207 | --conf->barrier; | ||
208 | wake_up(&conf->wait_resume); | ||
209 | wake_up(&conf->wait_idle); | ||
210 | |||
211 | if (!--conf->nr_pending) { | ||
212 | wake_up(&conf->wait_idle); | ||
213 | wake_up(&conf->wait_resume); | ||
214 | } | ||
215 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
216 | } | ||
217 | |||
218 | static void reschedule_retry(r10bio_t *r10_bio) | ||
219 | { | ||
220 | unsigned long flags; | ||
221 | mddev_t *mddev = r10_bio->mddev; | ||
222 | conf_t *conf = mddev_to_conf(mddev); | ||
223 | |||
224 | spin_lock_irqsave(&conf->device_lock, flags); | ||
225 | list_add(&r10_bio->retry_list, &conf->retry_list); | ||
226 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
227 | |||
228 | md_wakeup_thread(mddev->thread); | ||
229 | } | ||
230 | |||
231 | /* | ||
232 | * raid_end_bio_io() is called when we have finished servicing a mirrored | ||
233 | * operation and are ready to return a success/failure code to the buffer | ||
234 | * cache layer. | ||
235 | */ | ||
236 | static void raid_end_bio_io(r10bio_t *r10_bio) | ||
237 | { | ||
238 | struct bio *bio = r10_bio->master_bio; | ||
239 | |||
240 | bio_endio(bio, bio->bi_size, | ||
241 | test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); | ||
242 | free_r10bio(r10_bio); | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Update disk head position estimator based on IRQ completion info. | ||
247 | */ | ||
248 | static inline void update_head_pos(int slot, r10bio_t *r10_bio) | ||
249 | { | ||
250 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | ||
251 | |||
252 | conf->mirrors[r10_bio->devs[slot].devnum].head_position = | ||
253 | r10_bio->devs[slot].addr + (r10_bio->sectors); | ||
254 | } | ||
255 | |||
256 | static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error) | ||
257 | { | ||
258 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
259 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | ||
260 | int slot, dev; | ||
261 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | ||
262 | |||
263 | if (bio->bi_size) | ||
264 | return 1; | ||
265 | |||
266 | slot = r10_bio->read_slot; | ||
267 | dev = r10_bio->devs[slot].devnum; | ||
268 | /* | ||
269 | * this branch is our 'one mirror IO has finished' event handler: | ||
270 | */ | ||
271 | if (!uptodate) | ||
272 | md_error(r10_bio->mddev, conf->mirrors[dev].rdev); | ||
273 | else | ||
274 | /* | ||
275 | * Set R10BIO_Uptodate in our master bio, so that | ||
276 | * we will return a good error code to the higher | ||
277 | * levels even if IO on some other mirrored buffer fails. | ||
278 | * | ||
279 | * The 'master' represents the composite IO operation to | ||
280 | * user-side. So if something waits for IO, then it will | ||
281 | * wait for the 'master' bio. | ||
282 | */ | ||
283 | set_bit(R10BIO_Uptodate, &r10_bio->state); | ||
284 | |||
285 | update_head_pos(slot, r10_bio); | ||
286 | |||
287 | /* | ||
288 | * we have only one bio on the read side | ||
289 | */ | ||
290 | if (uptodate) | ||
291 | raid_end_bio_io(r10_bio); | ||
292 | else { | ||
293 | /* | ||
294 | * oops, read error: | ||
295 | */ | ||
296 | char b[BDEVNAME_SIZE]; | ||
297 | if (printk_ratelimit()) | ||
298 | printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n", | ||
299 | bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); | ||
300 | reschedule_retry(r10_bio); | ||
301 | } | ||
302 | |||
303 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); | ||
304 | return 0; | ||
305 | } | ||
306 | |||
307 | static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error) | ||
308 | { | ||
309 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
310 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | ||
311 | int slot, dev; | ||
312 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | ||
313 | |||
314 | if (bio->bi_size) | ||
315 | return 1; | ||
316 | |||
317 | for (slot = 0; slot < conf->copies; slot++) | ||
318 | if (r10_bio->devs[slot].bio == bio) | ||
319 | break; | ||
320 | dev = r10_bio->devs[slot].devnum; | ||
321 | |||
322 | /* | ||
323 | * this branch is our 'one mirror IO has finished' event handler: | ||
324 | */ | ||
325 | if (!uptodate) | ||
326 | md_error(r10_bio->mddev, conf->mirrors[dev].rdev); | ||
327 | else | ||
328 | /* | ||
329 | * Set R10BIO_Uptodate in our master bio, so that | ||
330 | * we will return a good error code to the higher | ||
331 | * levels even if IO on some other mirrored buffer fails. | ||
332 | * | ||
333 | * The 'master' represents the composite IO operation to | ||
334 | * user-side. So if something waits for IO, then it will | ||
335 | * wait for the 'master' bio. | ||
336 | */ | ||
337 | set_bit(R10BIO_Uptodate, &r10_bio->state); | ||
338 | |||
339 | update_head_pos(slot, r10_bio); | ||
340 | |||
341 | /* | ||
342 | * | ||
343 | * Let's see if all mirrored write operations have finished | ||
344 | * already. | ||
345 | */ | ||
346 | if (atomic_dec_and_test(&r10_bio->remaining)) { | ||
347 | md_write_end(r10_bio->mddev); | ||
348 | raid_end_bio_io(r10_bio); | ||
349 | } | ||
350 | |||
351 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); | ||
352 | return 0; | ||
353 | } | ||
354 | |||
355 | |||
356 | /* | ||
357 | * RAID10 layout manager | ||
358 | * As well as the chunk size and raid_disks count, there are two | ||
359 | * parameters: near_copies and far_copies. | ||
360 | * near_copies * far_copies must be <= raid_disks. | ||
361 | * Normally one of these will be 1. | ||
362 | * If both are 1, we get raid0. | ||
363 | * If near_copies == raid_disks, we get raid1. | ||
364 | * | ||
365 | * Chunks are laid out in raid0 style with near_copies copies of the | ||
366 | * first chunk, followed by near_copies copies of the next chunk and | ||
367 | * so on. | ||
368 | * If far_copies > 1, then after 1/far_copies of the array has been assigned | ||
369 | * as described above, we start again with a device offset of near_copies. | ||
370 | * So we effectively have another copy of the whole array further down all | ||
371 | * the drives, but with blocks on different drives. | ||
372 | * With this layout, a block is never stored twice on the same device. | ||
373 | * | ||
374 | * raid10_find_phys finds the sector offset of a given virtual sector | ||
375 | * on each device that it is on. If a block isn't on a device, | ||
376 | * that entry in the array is set to MaxSector. | ||
377 | * | ||
378 | * raid10_find_virt does the reverse mapping, from a device and a | ||
379 | * sector offset to a virtual address | ||
380 | */ | ||
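/*
 * Illustrative sketch (an addition for this write-up, not part of the
 * driver): the raid10_find_phys() arithmetic below, modelled in user-space
 * C for a hypothetical geometry of raid_disks=4, near_copies=2,
 * far_copies=1 and 64K chunks (128 sectors).  For virtual sector 1000 it
 * prints "copy 0: disk 2, sector 488" and "copy 1: disk 3, sector 488";
 * mapping sector 488 on disk 2 back through the raid10_find_virt()
 * arithmetic returns 1000.
 */
#include <stdio.h>

int main(void)
{
	const int raid_disks = 4, near_copies = 2;
	const int chunk_shift = 7;		/* 64K chunks = 128 sectors */
	const unsigned long chunk_mask = (1UL << chunk_shift) - 1;
	unsigned long virt = 1000;		/* virtual sector to map */
	unsigned long chunk, offset, stripe, sector, vchunk;
	int dev, first_dev, n;

	chunk  = virt >> chunk_shift;
	offset = virt & chunk_mask;
	stripe = chunk * near_copies;
	dev    = stripe % raid_disks;
	stripe /= raid_disks;
	sector = offset + (stripe << chunk_shift);
	first_dev = dev;

	for (n = 0; n < near_copies; n++) {
		printf("copy %d: disk %d, sector %lu\n", n, dev, sector);
		if (++dev >= raid_disks) {
			dev = 0;
			sector += chunk_mask + 1;
		}
	}

	/* reverse mapping, as raid10_find_virt() does it (far_copies == 1) */
	offset = 488 & chunk_mask;
	chunk  = 488 >> chunk_shift;
	vchunk = (chunk * raid_disks + first_dev) / near_copies;
	printf("virtual sector: %lu\n", (vchunk << chunk_shift) + offset);
	return 0;
}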
381 | |||
382 | static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio) | ||
383 | { | ||
384 | int n,f; | ||
385 | sector_t sector; | ||
386 | sector_t chunk; | ||
387 | sector_t stripe; | ||
388 | int dev; | ||
389 | |||
390 | int slot = 0; | ||
391 | |||
392 | /* now calculate first sector/dev */ | ||
393 | chunk = r10bio->sector >> conf->chunk_shift; | ||
394 | sector = r10bio->sector & conf->chunk_mask; | ||
395 | |||
396 | chunk *= conf->near_copies; | ||
397 | stripe = chunk; | ||
398 | dev = sector_div(stripe, conf->raid_disks); | ||
399 | |||
400 | sector += stripe << conf->chunk_shift; | ||
401 | |||
402 | /* and calculate all the others */ | ||
403 | for (n=0; n < conf->near_copies; n++) { | ||
404 | int d = dev; | ||
405 | sector_t s = sector; | ||
406 | r10bio->devs[slot].addr = sector; | ||
407 | r10bio->devs[slot].devnum = d; | ||
408 | slot++; | ||
409 | |||
410 | for (f = 1; f < conf->far_copies; f++) { | ||
411 | d += conf->near_copies; | ||
412 | if (d >= conf->raid_disks) | ||
413 | d -= conf->raid_disks; | ||
414 | s += conf->stride; | ||
415 | r10bio->devs[slot].devnum = d; | ||
416 | r10bio->devs[slot].addr = s; | ||
417 | slot++; | ||
418 | } | ||
419 | dev++; | ||
420 | if (dev >= conf->raid_disks) { | ||
421 | dev = 0; | ||
422 | sector += (conf->chunk_mask + 1); | ||
423 | } | ||
424 | } | ||
425 | BUG_ON(slot != conf->copies); | ||
426 | } | ||
427 | |||
428 | static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) | ||
429 | { | ||
430 | sector_t offset, chunk, vchunk; | ||
431 | |||
432 | while (sector > conf->stride) { | ||
433 | sector -= conf->stride; | ||
434 | if (dev < conf->near_copies) | ||
435 | dev += conf->raid_disks - conf->near_copies; | ||
436 | else | ||
437 | dev -= conf->near_copies; | ||
438 | } | ||
439 | |||
440 | offset = sector & conf->chunk_mask; | ||
441 | chunk = sector >> conf->chunk_shift; | ||
442 | vchunk = chunk * conf->raid_disks + dev; | ||
443 | sector_div(vchunk, conf->near_copies); | ||
444 | return (vchunk << conf->chunk_shift) + offset; | ||
445 | } | ||
446 | |||
447 | /** | ||
448 | * raid10_mergeable_bvec -- tell bio layer if two requests can be merged | ||
449 | * @q: request queue | ||
450 | * @bio: the bio that has been built up so far | ||
451 | * @bio_vec: the request that could be merged to it. | ||
452 | * | ||
453 | * Return amount of bytes we can accept at this offset | ||
454 | * If near_copies == raid_disks, there are no striping issues, | ||
455 | * but in that case, the function isn't called at all. | ||
456 | */ | ||
457 | static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio, | ||
458 | struct bio_vec *bio_vec) | ||
459 | { | ||
460 | mddev_t *mddev = q->queuedata; | ||
461 | sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); | ||
462 | int max; | ||
463 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | ||
464 | unsigned int bio_sectors = bio->bi_size >> 9; | ||
465 | |||
466 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; | ||
467 | if (max < 0) max = 0; /* bio_add cannot handle a negative return */ | ||
468 | if (max <= bio_vec->bv_len && bio_sectors == 0) | ||
469 | return bio_vec->bv_len; | ||
470 | else | ||
471 | return max; | ||
472 | } | ||
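/*
 * Worked example (an illustrative note added for this write-up, not in the
 * original source): with 64K chunks, chunk_sectors = 128.  A bio that
 * currently ends 120 sectors into a chunk may grow by at most
 * (128 - 120) << 9 = 4096 bytes before crossing the chunk boundary.  An
 * empty bio is always granted at least its first bio_vec (the
 * "max <= bv_len && bio_sectors == 0" case), so a single-page request is
 * never refused outright.
 */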
473 | |||
474 | /* | ||
475 | * This routine returns the disk from which the requested read should | ||
476 | * be done. There is a per-array 'next expected sequential IO' sector | ||
477 | * number - if this matches on the next IO then we use the last disk. | ||
478 | * There is also a per-disk 'last known head position' sector that is | ||
479 | * maintained from IRQ contexts; both the normal and the resync IO | ||
480 | * completion handlers update this position correctly. If there is no | ||
481 | * perfect sequential match then we pick the disk whose head is closest. | ||
482 | * | ||
483 | * If there are 2 mirrors in the same 2 devices, performance degrades | ||
484 | * because the head position is tracked per mirror, not per device. | ||
485 | * | ||
486 | * The rdev for the device selected will have nr_pending incremented. | ||
487 | */ | ||
488 | |||
489 | /* | ||
490 | * FIXME: possibly should rethink readbalancing and do it differently | ||
491 | * depending on near_copies / far_copies geometry. | ||
492 | */ | ||
493 | static int read_balance(conf_t *conf, r10bio_t *r10_bio) | ||
494 | { | ||
495 | const unsigned long this_sector = r10_bio->sector; | ||
496 | int disk, slot, nslot; | ||
497 | const int sectors = r10_bio->sectors; | ||
498 | sector_t new_distance, current_distance; | ||
499 | |||
500 | raid10_find_phys(conf, r10_bio); | ||
501 | rcu_read_lock(); | ||
502 | /* | ||
503 | * Check if we can balance. We can balance on the whole | ||
504 | * device if no resync is going on, or below the resync window. | ||
505 | * We take the first readable disk when above the resync window. | ||
506 | */ | ||
507 | if (conf->mddev->recovery_cp < MaxSector | ||
508 | && (this_sector + sectors >= conf->next_resync)) { | ||
509 | /* make sure that disk is operational */ | ||
510 | slot = 0; | ||
511 | disk = r10_bio->devs[slot].devnum; | ||
512 | |||
513 | while (!conf->mirrors[disk].rdev || | ||
514 | !conf->mirrors[disk].rdev->in_sync) { | ||
515 | slot++; | ||
516 | if (slot == conf->copies) { | ||
517 | slot = 0; | ||
518 | disk = -1; | ||
519 | break; | ||
520 | } | ||
521 | disk = r10_bio->devs[slot].devnum; | ||
522 | } | ||
523 | goto rb_out; | ||
524 | } | ||
525 | |||
526 | |||
527 | /* make sure the disk is operational */ | ||
528 | slot = 0; | ||
529 | disk = r10_bio->devs[slot].devnum; | ||
530 | while (!conf->mirrors[disk].rdev || | ||
531 | !conf->mirrors[disk].rdev->in_sync) { | ||
532 | slot ++; | ||
533 | if (slot == conf->copies) { | ||
534 | disk = -1; | ||
535 | goto rb_out; | ||
536 | } | ||
537 | disk = r10_bio->devs[slot].devnum; | ||
538 | } | ||
539 | |||
540 | |||
541 | current_distance = abs(this_sector - conf->mirrors[disk].head_position); | ||
542 | |||
543 | /* Find the disk whose head is closest */ | ||
544 | |||
545 | for (nslot = slot; nslot < conf->copies; nslot++) { | ||
546 | int ndisk = r10_bio->devs[nslot].devnum; | ||
547 | |||
548 | |||
549 | if (!conf->mirrors[ndisk].rdev || | ||
550 | !conf->mirrors[ndisk].rdev->in_sync) | ||
551 | continue; | ||
552 | |||
553 | if (!atomic_read(&conf->mirrors[ndisk].rdev->nr_pending)) { | ||
554 | disk = ndisk; | ||
555 | slot = nslot; | ||
556 | break; | ||
557 | } | ||
558 | new_distance = abs(r10_bio->devs[nslot].addr - | ||
559 | conf->mirrors[ndisk].head_position); | ||
560 | if (new_distance < current_distance) { | ||
561 | current_distance = new_distance; | ||
562 | disk = ndisk; | ||
563 | slot = nslot; | ||
564 | } | ||
565 | } | ||
566 | |||
567 | rb_out: | ||
568 | r10_bio->read_slot = slot; | ||
569 | /* conf->next_seq_sect = this_sector + sectors;*/ | ||
570 | |||
571 | if (disk >= 0 && conf->mirrors[disk].rdev) | ||
572 | atomic_inc(&conf->mirrors[disk].rdev->nr_pending); | ||
573 | rcu_read_unlock(); | ||
574 | |||
575 | return disk; | ||
576 | } | ||
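/*
 * Illustrative sketch (an addition for this write-up, not part of the
 * driver): the core selection rule of read_balance() above in isolation.
 * The struct and helper are hypothetical; the real code works on
 * conf->mirrors[] and r10_bio->devs[].  Among in-sync copies, an idle
 * disk wins immediately; otherwise the copy whose recorded head position
 * is closest to the target sector is chosen.
 */
struct copy_choice {
	int		in_sync;	/* copy lives on a working disk */
	int		pending;	/* outstanding requests on that disk */
	long long	head_position;	/* last known head position */
	long long	addr;		/* target sector on this copy */
};

static int pick_read_copy(const struct copy_choice *c, int copies)
{
	int i, best = -1;
	long long best_dist = 0;

	for (i = 0; i < copies; i++) {
		long long dist;

		if (!c[i].in_sync)
			continue;
		if (!c[i].pending)
			return i;	/* idle disk: use it */
		dist = c[i].addr - c[i].head_position;
		if (dist < 0)
			dist = -dist;
		if (best < 0 || dist < best_dist) {
			best = i;
			best_dist = dist;
		}
	}
	return best;			/* -1 if no copy is readable */
}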
577 | |||
578 | static void unplug_slaves(mddev_t *mddev) | ||
579 | { | ||
580 | conf_t *conf = mddev_to_conf(mddev); | ||
581 | int i; | ||
582 | |||
583 | rcu_read_lock(); | ||
584 | for (i=0; i<mddev->raid_disks; i++) { | ||
585 | mdk_rdev_t *rdev = conf->mirrors[i].rdev; | ||
586 | if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { | ||
587 | request_queue_t *r_queue = bdev_get_queue(rdev->bdev); | ||
588 | |||
589 | atomic_inc(&rdev->nr_pending); | ||
590 | rcu_read_unlock(); | ||
591 | |||
592 | if (r_queue->unplug_fn) | ||
593 | r_queue->unplug_fn(r_queue); | ||
594 | |||
595 | rdev_dec_pending(rdev, mddev); | ||
596 | rcu_read_lock(); | ||
597 | } | ||
598 | } | ||
599 | rcu_read_unlock(); | ||
600 | } | ||
601 | |||
602 | static void raid10_unplug(request_queue_t *q) | ||
603 | { | ||
604 | unplug_slaves(q->queuedata); | ||
605 | } | ||
606 | |||
607 | static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, | ||
608 | sector_t *error_sector) | ||
609 | { | ||
610 | mddev_t *mddev = q->queuedata; | ||
611 | conf_t *conf = mddev_to_conf(mddev); | ||
612 | int i, ret = 0; | ||
613 | |||
614 | rcu_read_lock(); | ||
615 | for (i=0; i<mddev->raid_disks && ret == 0; i++) { | ||
616 | mdk_rdev_t *rdev = conf->mirrors[i].rdev; | ||
617 | if (rdev && !rdev->faulty) { | ||
618 | struct block_device *bdev = rdev->bdev; | ||
619 | request_queue_t *r_queue = bdev_get_queue(bdev); | ||
620 | |||
621 | if (!r_queue->issue_flush_fn) | ||
622 | ret = -EOPNOTSUPP; | ||
623 | else { | ||
624 | atomic_inc(&rdev->nr_pending); | ||
625 | rcu_read_unlock(); | ||
626 | ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, | ||
627 | error_sector); | ||
628 | rdev_dec_pending(rdev, mddev); | ||
629 | rcu_read_lock(); | ||
630 | } | ||
631 | } | ||
632 | } | ||
633 | rcu_read_unlock(); | ||
634 | return ret; | ||
635 | } | ||
636 | |||
637 | /* | ||
638 | * Throttle resync depth, so that we can both get proper overlapping of | ||
639 | * requests, but are still able to handle normal requests quickly. | ||
640 | */ | ||
641 | #define RESYNC_DEPTH 32 | ||
642 | |||
643 | static void device_barrier(conf_t *conf, sector_t sect) | ||
644 | { | ||
645 | spin_lock_irq(&conf->resync_lock); | ||
646 | wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), | ||
647 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
648 | |||
649 | if (!conf->barrier++) { | ||
650 | wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, | ||
651 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
652 | if (conf->nr_pending) | ||
653 | BUG(); | ||
654 | } | ||
655 | wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, | ||
656 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
657 | conf->next_resync = sect; | ||
658 | spin_unlock_irq(&conf->resync_lock); | ||
659 | } | ||
660 | |||
661 | static int make_request(request_queue_t *q, struct bio * bio) | ||
662 | { | ||
663 | mddev_t *mddev = q->queuedata; | ||
664 | conf_t *conf = mddev_to_conf(mddev); | ||
665 | mirror_info_t *mirror; | ||
666 | r10bio_t *r10_bio; | ||
667 | struct bio *read_bio; | ||
668 | int i; | ||
669 | int chunk_sects = conf->chunk_mask + 1; | ||
670 | |||
671 | /* If this request crosses a chunk boundary, we need to | ||
672 | * split it. This will only happen for 1 PAGE (or less) requests. | ||
673 | */ | ||
674 | if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) | ||
675 | > chunk_sects && | ||
676 | conf->near_copies < conf->raid_disks)) { | ||
677 | struct bio_pair *bp; | ||
678 | /* Sanity check -- queue functions should prevent this happening */ | ||
679 | if (bio->bi_vcnt != 1 || | ||
680 | bio->bi_idx != 0) | ||
681 | goto bad_map; | ||
682 | /* This is a one page bio that upper layers | ||
683 | * refuse to split for us, so we need to split it. | ||
684 | */ | ||
685 | bp = bio_split(bio, bio_split_pool, | ||
686 | chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); | ||
687 | if (make_request(q, &bp->bio1)) | ||
688 | generic_make_request(&bp->bio1); | ||
689 | if (make_request(q, &bp->bio2)) | ||
690 | generic_make_request(&bp->bio2); | ||
691 | |||
692 | bio_pair_release(bp); | ||
693 | return 0; | ||
694 | bad_map: | ||
695 | printk("raid10_make_request bug: can't convert block across chunks" | ||
696 | " or bigger than %dk %llu %d\n", chunk_sects/2, | ||
697 | (unsigned long long)bio->bi_sector, bio->bi_size >> 10); | ||
698 | |||
699 | bio_io_error(bio, bio->bi_size); | ||
700 | return 0; | ||
701 | } | ||
702 | |||
703 | /* | ||
704 | * Register the new request and wait if the reconstruction | ||
705 | * thread has put up a bar for new requests. | ||
706 | * Continue immediately if no resync is active currently. | ||
707 | */ | ||
708 | spin_lock_irq(&conf->resync_lock); | ||
709 | wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); | ||
710 | conf->nr_pending++; | ||
711 | spin_unlock_irq(&conf->resync_lock); | ||
712 | |||
713 | if (bio_data_dir(bio)==WRITE) { | ||
714 | disk_stat_inc(mddev->gendisk, writes); | ||
715 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); | ||
716 | } else { | ||
717 | disk_stat_inc(mddev->gendisk, reads); | ||
718 | disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); | ||
719 | } | ||
720 | |||
721 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | ||
722 | |||
723 | r10_bio->master_bio = bio; | ||
724 | r10_bio->sectors = bio->bi_size >> 9; | ||
725 | |||
726 | r10_bio->mddev = mddev; | ||
727 | r10_bio->sector = bio->bi_sector; | ||
728 | |||
729 | if (bio_data_dir(bio) == READ) { | ||
730 | /* | ||
731 | * read balancing logic: | ||
732 | */ | ||
733 | int disk = read_balance(conf, r10_bio); | ||
734 | int slot = r10_bio->read_slot; | ||
735 | if (disk < 0) { | ||
736 | raid_end_bio_io(r10_bio); | ||
737 | return 0; | ||
738 | } | ||
739 | mirror = conf->mirrors + disk; | ||
740 | |||
741 | read_bio = bio_clone(bio, GFP_NOIO); | ||
742 | |||
743 | r10_bio->devs[slot].bio = read_bio; | ||
744 | |||
745 | read_bio->bi_sector = r10_bio->devs[slot].addr + | ||
746 | mirror->rdev->data_offset; | ||
747 | read_bio->bi_bdev = mirror->rdev->bdev; | ||
748 | read_bio->bi_end_io = raid10_end_read_request; | ||
749 | read_bio->bi_rw = READ; | ||
750 | read_bio->bi_private = r10_bio; | ||
751 | |||
752 | generic_make_request(read_bio); | ||
753 | return 0; | ||
754 | } | ||
755 | |||
756 | /* | ||
757 | * WRITE: | ||
758 | */ | ||
759 | /* first select target devices under spinlock and | ||
760 | * inc refcount on their rdev. Record them by setting | ||
761 | * bios[x] to bio | ||
762 | */ | ||
763 | raid10_find_phys(conf, r10_bio); | ||
764 | rcu_read_lock(); | ||
765 | for (i = 0; i < conf->copies; i++) { | ||
766 | int d = r10_bio->devs[i].devnum; | ||
767 | if (conf->mirrors[d].rdev && | ||
768 | !conf->mirrors[d].rdev->faulty) { | ||
769 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
770 | r10_bio->devs[i].bio = bio; | ||
771 | } else | ||
772 | r10_bio->devs[i].bio = NULL; | ||
773 | } | ||
774 | rcu_read_unlock(); | ||
775 | |||
776 | atomic_set(&r10_bio->remaining, 1); | ||
777 | md_write_start(mddev); | ||
778 | for (i = 0; i < conf->copies; i++) { | ||
779 | struct bio *mbio; | ||
780 | int d = r10_bio->devs[i].devnum; | ||
781 | if (!r10_bio->devs[i].bio) | ||
782 | continue; | ||
783 | |||
784 | mbio = bio_clone(bio, GFP_NOIO); | ||
785 | r10_bio->devs[i].bio = mbio; | ||
786 | |||
787 | mbio->bi_sector = r10_bio->devs[i].addr+ | ||
788 | conf->mirrors[d].rdev->data_offset; | ||
789 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; | ||
790 | mbio->bi_end_io = raid10_end_write_request; | ||
791 | mbio->bi_rw = WRITE; | ||
792 | mbio->bi_private = r10_bio; | ||
793 | |||
794 | atomic_inc(&r10_bio->remaining); | ||
795 | generic_make_request(mbio); | ||
796 | } | ||
797 | |||
798 | if (atomic_dec_and_test(&r10_bio->remaining)) { | ||
799 | md_write_end(mddev); | ||
800 | raid_end_bio_io(r10_bio); | ||
801 | } | ||
802 | |||
803 | return 0; | ||
804 | } | ||
805 | |||
806 | static void status(struct seq_file *seq, mddev_t *mddev) | ||
807 | { | ||
808 | conf_t *conf = mddev_to_conf(mddev); | ||
809 | int i; | ||
810 | |||
811 | if (conf->near_copies < conf->raid_disks) | ||
812 | seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); | ||
813 | if (conf->near_copies > 1) | ||
814 | seq_printf(seq, " %d near-copies", conf->near_copies); | ||
815 | if (conf->far_copies > 1) | ||
816 | seq_printf(seq, " %d far-copies", conf->far_copies); | ||
817 | |||
818 | seq_printf(seq, " [%d/%d] [", conf->raid_disks, | ||
819 | conf->working_disks); | ||
820 | for (i = 0; i < conf->raid_disks; i++) | ||
821 | seq_printf(seq, "%s", | ||
822 | conf->mirrors[i].rdev && | ||
823 | conf->mirrors[i].rdev->in_sync ? "U" : "_"); | ||
824 | seq_printf(seq, "]"); | ||
825 | } | ||
826 | |||
827 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | ||
828 | { | ||
829 | char b[BDEVNAME_SIZE]; | ||
830 | conf_t *conf = mddev_to_conf(mddev); | ||
831 | |||
832 | /* | ||
833 | * If it is not operational, then we have already marked it as dead | ||
834 | * else if it is the last working disk, ignore the error and let the | ||
835 | * next level up know. | ||
836 | * else mark the drive as failed | ||
837 | */ | ||
838 | if (rdev->in_sync | ||
839 | && conf->working_disks == 1) | ||
840 | /* | ||
841 | * Don't fail the drive, just return an IO error. | ||
842 | * The test should really be more sophisticated than | ||
843 | * "working_disks == 1", but it isn't critical, and | ||
844 | * can wait until we do more sophisticated "is the drive | ||
845 | * really dead" tests... | ||
846 | */ | ||
847 | return; | ||
848 | if (rdev->in_sync) { | ||
849 | mddev->degraded++; | ||
850 | conf->working_disks--; | ||
851 | /* | ||
852 | * if recovery is running, make sure it aborts. | ||
853 | */ | ||
854 | set_bit(MD_RECOVERY_ERR, &mddev->recovery); | ||
855 | } | ||
856 | rdev->in_sync = 0; | ||
857 | rdev->faulty = 1; | ||
858 | mddev->sb_dirty = 1; | ||
859 | printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" | ||
860 | " Operation continuing on %d devices\n", | ||
861 | bdevname(rdev->bdev,b), conf->working_disks); | ||
862 | } | ||
863 | |||
864 | static void print_conf(conf_t *conf) | ||
865 | { | ||
866 | int i; | ||
867 | mirror_info_t *tmp; | ||
868 | |||
869 | printk("RAID10 conf printout:\n"); | ||
870 | if (!conf) { | ||
871 | printk("(!conf)\n"); | ||
872 | return; | ||
873 | } | ||
874 | printk(" --- wd:%d rd:%d\n", conf->working_disks, | ||
875 | conf->raid_disks); | ||
876 | |||
877 | for (i = 0; i < conf->raid_disks; i++) { | ||
878 | char b[BDEVNAME_SIZE]; | ||
879 | tmp = conf->mirrors + i; | ||
880 | if (tmp->rdev) | ||
881 | printk(" disk %d, wo:%d, o:%d, dev:%s\n", | ||
882 | i, !tmp->rdev->in_sync, !tmp->rdev->faulty, | ||
883 | bdevname(tmp->rdev->bdev,b)); | ||
884 | } | ||
885 | } | ||
886 | |||
887 | static void close_sync(conf_t *conf) | ||
888 | { | ||
889 | spin_lock_irq(&conf->resync_lock); | ||
890 | wait_event_lock_irq(conf->wait_resume, !conf->barrier, | ||
891 | conf->resync_lock, unplug_slaves(conf->mddev)); | ||
892 | spin_unlock_irq(&conf->resync_lock); | ||
893 | |||
894 | if (conf->barrier) BUG(); | ||
895 | if (waitqueue_active(&conf->wait_idle)) BUG(); | ||
896 | |||
897 | mempool_destroy(conf->r10buf_pool); | ||
898 | conf->r10buf_pool = NULL; | ||
899 | } | ||
900 | |||
901 | static int raid10_spare_active(mddev_t *mddev) | ||
902 | { | ||
903 | int i; | ||
904 | conf_t *conf = mddev->private; | ||
905 | mirror_info_t *tmp; | ||
906 | |||
907 | /* | ||
908 | * Find all non-in_sync disks within the RAID10 configuration | ||
909 | * and mark them in_sync | ||
910 | */ | ||
911 | for (i = 0; i < conf->raid_disks; i++) { | ||
912 | tmp = conf->mirrors + i; | ||
913 | if (tmp->rdev | ||
914 | && !tmp->rdev->faulty | ||
915 | && !tmp->rdev->in_sync) { | ||
916 | conf->working_disks++; | ||
917 | mddev->degraded--; | ||
918 | tmp->rdev->in_sync = 1; | ||
919 | } | ||
920 | } | ||
921 | |||
922 | print_conf(conf); | ||
923 | return 0; | ||
924 | } | ||
925 | |||
926 | |||
927 | static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | ||
928 | { | ||
929 | conf_t *conf = mddev->private; | ||
930 | int found = 0; | ||
931 | int mirror; | ||
932 | mirror_info_t *p; | ||
933 | |||
934 | if (mddev->recovery_cp < MaxSector) | ||
935 | /* only hot-add to in-sync arrays, as recovery is | ||
936 | * very different from resync | ||
937 | */ | ||
938 | return 0; | ||
939 | |||
940 | for (mirror=0; mirror < mddev->raid_disks; mirror++) | ||
941 | if ( !(p=conf->mirrors+mirror)->rdev) { | ||
942 | |||
943 | blk_queue_stack_limits(mddev->queue, | ||
944 | rdev->bdev->bd_disk->queue); | ||
945 | /* as we don't honour merge_bvec_fn, we must never risk | ||
946 | * violating it, so limit ->max_sector to one PAGE, as | ||
947 | * a one page request is never in violation. | ||
948 | */ | ||
949 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
950 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
951 | mddev->queue->max_sectors = (PAGE_SIZE>>9); | ||
952 | |||
953 | p->head_position = 0; | ||
954 | rdev->raid_disk = mirror; | ||
955 | found = 1; | ||
956 | p->rdev = rdev; | ||
957 | break; | ||
958 | } | ||
959 | |||
960 | print_conf(conf); | ||
961 | return found; | ||
962 | } | ||
963 | |||
964 | static int raid10_remove_disk(mddev_t *mddev, int number) | ||
965 | { | ||
966 | conf_t *conf = mddev->private; | ||
967 | int err = 0; | ||
968 | mdk_rdev_t *rdev; | ||
969 | mirror_info_t *p = conf->mirrors+ number; | ||
970 | |||
971 | print_conf(conf); | ||
972 | rdev = p->rdev; | ||
973 | if (rdev) { | ||
974 | if (rdev->in_sync || | ||
975 | atomic_read(&rdev->nr_pending)) { | ||
976 | err = -EBUSY; | ||
977 | goto abort; | ||
978 | } | ||
979 | p->rdev = NULL; | ||
980 | synchronize_kernel(); | ||
981 | if (atomic_read(&rdev->nr_pending)) { | ||
982 | /* lost the race, try later */ | ||
983 | err = -EBUSY; | ||
984 | p->rdev = rdev; | ||
985 | } | ||
986 | } | ||
987 | abort: | ||
988 | |||
989 | print_conf(conf); | ||
990 | return err; | ||
991 | } | ||
992 | |||
993 | |||
994 | static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) | ||
995 | { | ||
996 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
997 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | ||
998 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | ||
999 | int i,d; | ||
1000 | |||
1001 | if (bio->bi_size) | ||
1002 | return 1; | ||
1003 | |||
1004 | for (i=0; i<conf->copies; i++) | ||
1005 | if (r10_bio->devs[i].bio == bio) | ||
1006 | break; | ||
1007 | if (i == conf->copies) | ||
1008 | BUG(); | ||
1009 | update_head_pos(i, r10_bio); | ||
1010 | d = r10_bio->devs[i].devnum; | ||
1011 | if (!uptodate) | ||
1012 | md_error(r10_bio->mddev, | ||
1013 | conf->mirrors[d].rdev); | ||
1014 | |||
1015 | /* for reconstruct, we always reschedule after a read. | ||
1016 | * for resync, only after all reads | ||
1017 | */ | ||
1018 | if (test_bit(R10BIO_IsRecover, &r10_bio->state) || | ||
1019 | atomic_dec_and_test(&r10_bio->remaining)) { | ||
1020 | /* we have read all the blocks, | ||
1021 | * do the comparison in process context in raid10d | ||
1022 | */ | ||
1023 | reschedule_retry(r10_bio); | ||
1024 | } | ||
1025 | rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); | ||
1026 | return 0; | ||
1027 | } | ||
1028 | |||
1029 | static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) | ||
1030 | { | ||
1031 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1032 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | ||
1033 | mddev_t *mddev = r10_bio->mddev; | ||
1034 | conf_t *conf = mddev_to_conf(mddev); | ||
1035 | int i,d; | ||
1036 | |||
1037 | if (bio->bi_size) | ||
1038 | return 1; | ||
1039 | |||
1040 | for (i = 0; i < conf->copies; i++) | ||
1041 | if (r10_bio->devs[i].bio == bio) | ||
1042 | break; | ||
1043 | d = r10_bio->devs[i].devnum; | ||
1044 | |||
1045 | if (!uptodate) | ||
1046 | md_error(mddev, conf->mirrors[d].rdev); | ||
1047 | update_head_pos(i, r10_bio); | ||
1048 | |||
1049 | while (atomic_dec_and_test(&r10_bio->remaining)) { | ||
1050 | if (r10_bio->master_bio == NULL) { | ||
1051 | /* the primary of several recovery bios */ | ||
1052 | md_done_sync(mddev, r10_bio->sectors, 1); | ||
1053 | put_buf(r10_bio); | ||
1054 | break; | ||
1055 | } else { | ||
1056 | r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; | ||
1057 | put_buf(r10_bio); | ||
1058 | r10_bio = r10_bio2; | ||
1059 | } | ||
1060 | } | ||
1061 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | ||
1062 | return 0; | ||
1063 | } | ||
1064 | |||
1065 | /* | ||
1066 | * Note: sync and recovery are handled very differently for raid10. | ||
1067 | * This code is for resync. | ||
1068 | * For resync, we read through virtual addresses and read all blocks. | ||
1069 | * If there is any error, we schedule a write. The lowest numbered | ||
1070 | * drive is authoritative. | ||
1071 | * However requests come in terms of physical addresses, so we need to map. | ||
1072 | * For every physical address there are raid_disks/copies virtual addresses, | ||
1073 | * which is always at least one, but is not necessarily an integer. | ||
1074 | * This means that a physical address can span multiple chunks, so we may | ||
1075 | * have to submit multiple io requests for a single sync request. | ||
1076 | */ | ||
1077 | /* | ||
1078 | * We check if all blocks are in-sync and only write to blocks that | ||
1079 | * aren't in sync | ||
1080 | */ | ||
1081 | static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) | ||
1082 | { | ||
1083 | conf_t *conf = mddev_to_conf(mddev); | ||
1084 | int i, first; | ||
1085 | struct bio *tbio, *fbio; | ||
1086 | |||
1087 | atomic_set(&r10_bio->remaining, 1); | ||
1088 | |||
1089 | /* find the first device with a block */ | ||
1090 | for (i=0; i<conf->copies; i++) | ||
1091 | if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) | ||
1092 | break; | ||
1093 | |||
1094 | if (i == conf->copies) | ||
1095 | goto done; | ||
1096 | |||
1097 | first = i; | ||
1098 | fbio = r10_bio->devs[i].bio; | ||
1099 | |||
1100 | /* now find blocks with errors */ | ||
1101 | for (i=first+1 ; i < conf->copies ; i++) { | ||
1102 | int vcnt, j, d; | ||
1103 | |||
1104 | if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) | ||
1105 | continue; | ||
1106 | /* We know that the bi_io_vec layout is the same for | ||
1107 | * both 'first' and 'i', so we just compare them. | ||
1108 | * All vec entries are PAGE_SIZE; | ||
1109 | */ | ||
1110 | tbio = r10_bio->devs[i].bio; | ||
1111 | vcnt = r10_bio->sectors >> (PAGE_SHIFT-9); | ||
1112 | for (j = 0; j < vcnt; j++) | ||
1113 | if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), | ||
1114 | page_address(tbio->bi_io_vec[j].bv_page), | ||
1115 | PAGE_SIZE)) | ||
1116 | break; | ||
1117 | if (j == vcnt) | ||
1118 | continue; | ||
1119 | /* Ok, we need to write this bio | ||
1120 | * First we need to fixup bv_offset, bv_len and | ||
1121 | * bi_vecs, as the read request might have corrupted these | ||
1122 | */ | ||
1123 | tbio->bi_vcnt = vcnt; | ||
1124 | tbio->bi_size = r10_bio->sectors << 9; | ||
1125 | tbio->bi_idx = 0; | ||
1126 | tbio->bi_phys_segments = 0; | ||
1127 | tbio->bi_hw_segments = 0; | ||
1128 | tbio->bi_hw_front_size = 0; | ||
1129 | tbio->bi_hw_back_size = 0; | ||
1130 | tbio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1131 | tbio->bi_flags |= 1 << BIO_UPTODATE; | ||
1132 | tbio->bi_next = NULL; | ||
1133 | tbio->bi_rw = WRITE; | ||
1134 | tbio->bi_private = r10_bio; | ||
1135 | tbio->bi_sector = r10_bio->devs[i].addr; | ||
1136 | |||
1137 | for (j=0; j < vcnt ; j++) { | ||
1138 | tbio->bi_io_vec[j].bv_offset = 0; | ||
1139 | tbio->bi_io_vec[j].bv_len = PAGE_SIZE; | ||
1140 | |||
1141 | memcpy(page_address(tbio->bi_io_vec[j].bv_page), | ||
1142 | page_address(fbio->bi_io_vec[j].bv_page), | ||
1143 | PAGE_SIZE); | ||
1144 | } | ||
1145 | tbio->bi_end_io = end_sync_write; | ||
1146 | |||
1147 | d = r10_bio->devs[i].devnum; | ||
1148 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
1149 | atomic_inc(&r10_bio->remaining); | ||
1150 | md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9); | ||
1151 | |||
1152 | tbio->bi_sector += conf->mirrors[d].rdev->data_offset; | ||
1153 | tbio->bi_bdev = conf->mirrors[d].rdev->bdev; | ||
1154 | generic_make_request(tbio); | ||
1155 | } | ||
1156 | |||
1157 | done: | ||
1158 | if (atomic_dec_and_test(&r10_bio->remaining)) { | ||
1159 | md_done_sync(mddev, r10_bio->sectors, 1); | ||
1160 | put_buf(r10_bio); | ||
1161 | } | ||
1162 | } | ||
1163 | |||
1164 | /* | ||
1165 | * Now for the recovery code. | ||
1166 | * Recovery happens across physical sectors. | ||
1167 | * We recover all non-in_sync drives by finding the virtual address of | ||
1168 | * each, and then choose a working drive that also has that virt address. | ||
1169 | * There is a separate r10_bio for each non-in_sync drive. | ||
1170 | * Only the first two slots are in use. The first for reading, | ||
1171 | * The second for writing. | ||
1172 | * | ||
1173 | */ | ||
1174 | |||
1175 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) | ||
1176 | { | ||
1177 | conf_t *conf = mddev_to_conf(mddev); | ||
1178 | int i, d; | ||
1179 | struct bio *bio, *wbio; | ||
1180 | |||
1181 | |||
1182 | /* move the pages across to the second bio | ||
1183 | * and submit the write request | ||
1184 | */ | ||
1185 | bio = r10_bio->devs[0].bio; | ||
1186 | wbio = r10_bio->devs[1].bio; | ||
1187 | for (i=0; i < wbio->bi_vcnt; i++) { | ||
1188 | struct page *p = bio->bi_io_vec[i].bv_page; | ||
1189 | bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page; | ||
1190 | wbio->bi_io_vec[i].bv_page = p; | ||
1191 | } | ||
1192 | d = r10_bio->devs[1].devnum; | ||
1193 | |||
1194 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
1195 | md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); | ||
1196 | generic_make_request(wbio); | ||
1197 | } | ||
1198 | |||
1199 | |||
1200 | /* | ||
1201 | * This is a kernel thread which: | ||
1202 | * | ||
1203 | * 1. Retries failed read operations on working mirrors. | ||
1204 | * 2. Updates the raid superblock when problems are encountered. | ||
1205 | * 3. Performs writes following reads for array synchronising. | ||
1206 | */ | ||
1207 | |||
1208 | static void raid10d(mddev_t *mddev) | ||
1209 | { | ||
1210 | r10bio_t *r10_bio; | ||
1211 | struct bio *bio; | ||
1212 | unsigned long flags; | ||
1213 | conf_t *conf = mddev_to_conf(mddev); | ||
1214 | struct list_head *head = &conf->retry_list; | ||
1215 | int unplug=0; | ||
1216 | mdk_rdev_t *rdev; | ||
1217 | |||
1218 | md_check_recovery(mddev); | ||
1219 | md_handle_safemode(mddev); | ||
1220 | |||
1221 | for (;;) { | ||
1222 | char b[BDEVNAME_SIZE]; | ||
1223 | spin_lock_irqsave(&conf->device_lock, flags); | ||
1224 | if (list_empty(head)) | ||
1225 | break; | ||
1226 | r10_bio = list_entry(head->prev, r10bio_t, retry_list); | ||
1227 | list_del(head->prev); | ||
1228 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1229 | |||
1230 | mddev = r10_bio->mddev; | ||
1231 | conf = mddev_to_conf(mddev); | ||
1232 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) { | ||
1233 | sync_request_write(mddev, r10_bio); | ||
1234 | unplug = 1; | ||
1235 | } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) { | ||
1236 | recovery_request_write(mddev, r10_bio); | ||
1237 | unplug = 1; | ||
1238 | } else { | ||
1239 | int mirror; | ||
1240 | bio = r10_bio->devs[r10_bio->read_slot].bio; | ||
1241 | r10_bio->devs[r10_bio->read_slot].bio = NULL; | ||
1242 | bio_put(bio); | ||
1243 | mirror = read_balance(conf, r10_bio); | ||
1244 | if (mirror == -1) { | ||
1245 | printk(KERN_ALERT "raid10: %s: unrecoverable I/O" | ||
1246 | " read error for block %llu\n", | ||
1247 | bdevname(bio->bi_bdev,b), | ||
1248 | (unsigned long long)r10_bio->sector); | ||
1249 | raid_end_bio_io(r10_bio); | ||
1250 | } else { | ||
1251 | rdev = conf->mirrors[mirror].rdev; | ||
1252 | if (printk_ratelimit()) | ||
1253 | printk(KERN_ERR "raid10: %s: redirecting sector %llu to" | ||
1254 | " another mirror\n", | ||
1255 | bdevname(rdev->bdev,b), | ||
1256 | (unsigned long long)r10_bio->sector); | ||
1257 | bio = bio_clone(r10_bio->master_bio, GFP_NOIO); | ||
1258 | r10_bio->devs[r10_bio->read_slot].bio = bio; | ||
1259 | bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr | ||
1260 | + rdev->data_offset; | ||
1261 | bio->bi_bdev = rdev->bdev; | ||
1262 | bio->bi_rw = READ; | ||
1263 | bio->bi_private = r10_bio; | ||
1264 | bio->bi_end_io = raid10_end_read_request; | ||
1265 | unplug = 1; | ||
1266 | generic_make_request(bio); | ||
1267 | } | ||
1268 | } | ||
1269 | } | ||
1270 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1271 | if (unplug) | ||
1272 | unplug_slaves(mddev); | ||
1273 | } | ||
1274 | |||
1275 | |||
1276 | static int init_resync(conf_t *conf) | ||
1277 | { | ||
1278 | int buffs; | ||
1279 | |||
1280 | buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; | ||
1281 | if (conf->r10buf_pool) | ||
1282 | BUG(); | ||
1283 | conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); | ||
1284 | if (!conf->r10buf_pool) | ||
1285 | return -ENOMEM; | ||
1286 | conf->next_resync = 0; | ||
1287 | return 0; | ||
1288 | } | ||
1289 | |||
1290 | /* | ||
1291 | * perform a "sync" on one "block" | ||
1292 | * | ||
1293 | * We need to make sure that no normal I/O request - particularly write | ||
1294 | * requests - conflict with active sync requests. | ||
1295 | * | ||
1296 | * This is achieved by tracking pending requests and a 'barrier' concept | ||
1297 | * that can be installed to exclude normal IO requests. | ||
1298 | * | ||
1299 | * Resync and recovery are handled very differently. | ||
1300 | * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery. | ||
1301 | * | ||
1302 | * For resync, we iterate over virtual addresses, read all copies, | ||
1303 | * and update if there are differences. If only one copy is live, | ||
1304 | * skip it. | ||
1305 | * For recovery, we iterate over physical addresses, read a good | ||
1306 | * value for each non-in_sync drive, and over-write. | ||
1307 | * | ||
1308 | * So, for recovery we may have several outstanding complex requests for a | ||
1309 | * given address, one for each out-of-sync device. We model this by allocating | ||
1310 | * a number of r10_bio structures, one for each out-of-sync device. | ||
1311 | * As we setup these structures, we collect all bio's together into a list | ||
1312 | * which we then process collectively to add pages, and then process again | ||
1313 | * to pass to generic_make_request. | ||
1314 | * | ||
1315 | * The r10_bio structures are linked using a borrowed master_bio pointer. | ||
1316 | * This link is counted in ->remaining. When the r10_bio that points to NULL | ||
1317 | * has its remaining count decremented to 0, the whole complex operation | ||
1318 | * is complete. | ||
1319 | * | ||
1320 | */ | ||
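/*
 * Illustrative walk-through (an addition for this write-up, not part of
 * the original comment), assuming two out-of-sync devices at one virtual
 * address: the first r10_bio allocated (call it A) keeps
 * master_bio == NULL; the second (B) stores A in master_bio and bumps
 * A->remaining.  A->remaining therefore holds one count for its own
 * read-then-write pass plus one reference from B, while B->remaining
 * holds only its own count.  When B's recovery write completes,
 * end_sync_write() frees B and drops A->remaining; only when A's own
 * write also completes does A->remaining reach zero, at which point
 * md_done_sync() is called for the range.
 */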
1321 | |||
1322 | static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) | ||
1323 | { | ||
1324 | conf_t *conf = mddev_to_conf(mddev); | ||
1325 | r10bio_t *r10_bio; | ||
1326 | struct bio *biolist = NULL, *bio; | ||
1327 | sector_t max_sector, nr_sectors; | ||
1328 | int disk; | ||
1329 | int i; | ||
1330 | |||
1331 | sector_t sectors_skipped = 0; | ||
1332 | int chunks_skipped = 0; | ||
1333 | |||
1334 | if (!conf->r10buf_pool) | ||
1335 | if (init_resync(conf)) | ||
1336 | return -ENOMEM; | ||
1337 | |||
1338 | skipped: | ||
1339 | max_sector = mddev->size << 1; | ||
1340 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | ||
1341 | max_sector = mddev->resync_max_sectors; | ||
1342 | if (sector_nr >= max_sector) { | ||
1343 | close_sync(conf); | ||
1344 | return sectors_skipped; | ||
1345 | } | ||
1346 | if (chunks_skipped >= conf->raid_disks) { | ||
1347 | /* if there has been nothing to do on any drive, | ||
1348 | * then there is nothing to do at all.. | ||
1349 | */ | ||
1350 | sector_t sec = max_sector - sector_nr; | ||
1351 | md_done_sync(mddev, sec, 1); | ||
1352 | return sec + sectors_skipped; | ||
1353 | } | ||
1354 | |||
1355 | /* make sure whole request will fit in a chunk - if chunks | ||
1356 | * are meaningful | ||
1357 | */ | ||
1358 | if (conf->near_copies < conf->raid_disks && | ||
1359 | max_sector > (sector_nr | conf->chunk_mask)) | ||
1360 | max_sector = (sector_nr | conf->chunk_mask) + 1; | ||
1361 | /* | ||
1362 | * If there is non-resync activity waiting for us then | ||
1363 | * put in a delay to throttle resync. | ||
1364 | */ | ||
1365 | if (!go_faster && waitqueue_active(&conf->wait_resume)) | ||
1366 | msleep_interruptible(1000); | ||
1367 | device_barrier(conf, sector_nr + RESYNC_SECTORS); | ||
1368 | |||
1369 | /* Again, very different code for resync and recovery. | ||
1370 | * Both must result in an r10bio with a list of bios that | ||
1371 | * have bi_end_io, bi_sector, bi_bdev set, | ||
1372 | * and bi_private set to the r10bio. | ||
1373 | * For recovery, we may actually create several r10bios | ||
1374 | * with 2 bios in each, that correspond to the bios in the main one. | ||
1375 | * In this case, the subordinate r10bios link back through a | ||
1376 | * borrowed master_bio pointer, and the counter in the master | ||
1377 | * includes a ref from each subordinate. | ||
1378 | */ | ||
1379 | /* First, we decide what to do and set ->bi_end_io | ||
1380 | * To end_sync_read if we want to read, and | ||
1381 | * end_sync_write if we will want to write. | ||
1382 | */ | ||
1383 | |||
1384 | if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | ||
1385 | /* recovery... the complicated one */ | ||
1386 | int i, j, k; | ||
1387 | r10_bio = NULL; | ||
1388 | |||
1389 | for (i=0 ; i<conf->raid_disks; i++) | ||
1390 | if (conf->mirrors[i].rdev && | ||
1391 | !conf->mirrors[i].rdev->in_sync) { | ||
1392 | /* want to reconstruct this device */ | ||
1393 | r10bio_t *rb2 = r10_bio; | ||
1394 | |||
1395 | r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); | ||
1396 | spin_lock_irq(&conf->resync_lock); | ||
1397 | conf->nr_pending++; | ||
1398 | if (rb2) conf->barrier++; | ||
1399 | spin_unlock_irq(&conf->resync_lock); | ||
1400 | atomic_set(&r10_bio->remaining, 0); | ||
1401 | |||
1402 | r10_bio->master_bio = (struct bio*)rb2; | ||
1403 | if (rb2) | ||
1404 | atomic_inc(&rb2->remaining); | ||
1405 | r10_bio->mddev = mddev; | ||
1406 | set_bit(R10BIO_IsRecover, &r10_bio->state); | ||
1407 | r10_bio->sector = raid10_find_virt(conf, sector_nr, i); | ||
1408 | raid10_find_phys(conf, r10_bio); | ||
1409 | for (j=0; j<conf->copies;j++) { | ||
1410 | int d = r10_bio->devs[j].devnum; | ||
1411 | if (conf->mirrors[d].rdev && | ||
1412 | conf->mirrors[d].rdev->in_sync) { | ||
1413 | /* This is where we read from */ | ||
1414 | bio = r10_bio->devs[0].bio; | ||
1415 | bio->bi_next = biolist; | ||
1416 | biolist = bio; | ||
1417 | bio->bi_private = r10_bio; | ||
1418 | bio->bi_end_io = end_sync_read; | ||
1419 | bio->bi_rw = 0; | ||
1420 | bio->bi_sector = r10_bio->devs[j].addr + | ||
1421 | conf->mirrors[d].rdev->data_offset; | ||
1422 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | ||
1423 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
1424 | atomic_inc(&r10_bio->remaining); | ||
1425 | /* and we write to 'i' */ | ||
1426 | |||
1427 | for (k=0; k<conf->copies; k++) | ||
1428 | if (r10_bio->devs[k].devnum == i) | ||
1429 | break; | ||
1430 | bio = r10_bio->devs[1].bio; | ||
1431 | bio->bi_next = biolist; | ||
1432 | biolist = bio; | ||
1433 | bio->bi_private = r10_bio; | ||
1434 | bio->bi_end_io = end_sync_write; | ||
1435 | bio->bi_rw = 1; | ||
1436 | bio->bi_sector = r10_bio->devs[k].addr + | ||
1437 | conf->mirrors[i].rdev->data_offset; | ||
1438 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1439 | |||
1440 | r10_bio->devs[0].devnum = d; | ||
1441 | r10_bio->devs[1].devnum = i; | ||
1442 | |||
1443 | break; | ||
1444 | } | ||
1445 | } | ||
1446 | if (j == conf->copies) { | ||
1447 | BUG(); | ||
1448 | } | ||
1449 | } | ||
1450 | if (biolist == NULL) { | ||
1451 | while (r10_bio) { | ||
1452 | r10bio_t *rb2 = r10_bio; | ||
1453 | r10_bio = (r10bio_t*) rb2->master_bio; | ||
1454 | rb2->master_bio = NULL; | ||
1455 | put_buf(rb2); | ||
1456 | } | ||
1457 | goto giveup; | ||
1458 | } | ||
1459 | } else { | ||
1460 | /* resync. Schedule a read for every block at this virt offset */ | ||
1461 | int count = 0; | ||
1462 | r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); | ||
1463 | |||
1464 | spin_lock_irq(&conf->resync_lock); | ||
1465 | conf->nr_pending++; | ||
1466 | spin_unlock_irq(&conf->resync_lock); | ||
1467 | |||
1468 | r10_bio->mddev = mddev; | ||
1469 | atomic_set(&r10_bio->remaining, 0); | ||
1470 | |||
1471 | r10_bio->master_bio = NULL; | ||
1472 | r10_bio->sector = sector_nr; | ||
1473 | set_bit(R10BIO_IsSync, &r10_bio->state); | ||
1474 | raid10_find_phys(conf, r10_bio); | ||
1475 | r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; | ||
1476 | |||
1477 | for (i=0; i<conf->copies; i++) { | ||
1478 | int d = r10_bio->devs[i].devnum; | ||
1479 | bio = r10_bio->devs[i].bio; | ||
1480 | bio->bi_end_io = NULL; | ||
1481 | if (conf->mirrors[d].rdev == NULL || | ||
1482 | conf->mirrors[d].rdev->faulty) | ||
1483 | continue; | ||
1484 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
1485 | atomic_inc(&r10_bio->remaining); | ||
1486 | bio->bi_next = biolist; | ||
1487 | biolist = bio; | ||
1488 | bio->bi_private = r10_bio; | ||
1489 | bio->bi_end_io = end_sync_read; | ||
1490 | bio->bi_rw = 0; | ||
1491 | bio->bi_sector = r10_bio->devs[i].addr + | ||
1492 | conf->mirrors[d].rdev->data_offset; | ||
1493 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | ||
1494 | count++; | ||
1495 | } | ||
1496 | |||
1497 | if (count < 2) { | ||
1498 | for (i=0; i<conf->copies; i++) { | ||
1499 | int d = r10_bio->devs[i].devnum; | ||
1500 | if (r10_bio->devs[i].bio->bi_end_io) | ||
1501 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | ||
1502 | } | ||
1503 | put_buf(r10_bio); | ||
1504 | biolist = NULL; | ||
1505 | goto giveup; | ||
1506 | } | ||
1507 | } | ||
1508 | |||
1509 | for (bio = biolist; bio ; bio=bio->bi_next) { | ||
1510 | |||
1511 | bio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1512 | if (bio->bi_end_io) | ||
1513 | bio->bi_flags |= 1 << BIO_UPTODATE; | ||
1514 | bio->bi_vcnt = 0; | ||
1515 | bio->bi_idx = 0; | ||
1516 | bio->bi_phys_segments = 0; | ||
1517 | bio->bi_hw_segments = 0; | ||
1518 | bio->bi_size = 0; | ||
1519 | } | ||
1520 | |||
1521 | nr_sectors = 0; | ||
1522 | do { | ||
1523 | struct page *page; | ||
1524 | int len = PAGE_SIZE; | ||
1525 | disk = 0; | ||
1526 | if (sector_nr + (len>>9) > max_sector) | ||
1527 | len = (max_sector - sector_nr) << 9; | ||
1528 | if (len == 0) | ||
1529 | break; | ||
1530 | for (bio= biolist ; bio ; bio=bio->bi_next) { | ||
1531 | page = bio->bi_io_vec[bio->bi_vcnt].bv_page; | ||
1532 | if (bio_add_page(bio, page, len, 0) == 0) { | ||
1533 | /* stop here */ | ||
1534 | struct bio *bio2; | ||
1535 | bio->bi_io_vec[bio->bi_vcnt].bv_page = page; | ||
1536 | for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { | ||
1537 | /* remove last page from this bio */ | ||
1538 | bio2->bi_vcnt--; | ||
1539 | bio2->bi_size -= len; | ||
1540 | bio2->bi_flags &= ~(1<< BIO_SEG_VALID); | ||
1541 | } | ||
1542 | goto bio_full; | ||
1543 | } | ||
1544 | disk = i; | ||
1545 | } | ||
1546 | nr_sectors += len>>9; | ||
1547 | sector_nr += len>>9; | ||
1548 | } while (biolist->bi_vcnt < RESYNC_PAGES); | ||
1549 | bio_full: | ||
1550 | r10_bio->sectors = nr_sectors; | ||
1551 | |||
1552 | while (biolist) { | ||
1553 | bio = biolist; | ||
1554 | biolist = biolist->bi_next; | ||
1555 | |||
1556 | bio->bi_next = NULL; | ||
1557 | r10_bio = bio->bi_private; | ||
1558 | r10_bio->sectors = nr_sectors; | ||
1559 | |||
1560 | if (bio->bi_end_io == end_sync_read) { | ||
1561 | md_sync_acct(bio->bi_bdev, nr_sectors); | ||
1562 | generic_make_request(bio); | ||
1563 | } | ||
1564 | } | ||
1565 | |||
1566 | return sectors_skipped + nr_sectors; | ||
1567 | giveup: | ||
1568 | /* There is nowhere to write, so all non-sync | ||
1569 | * drives must be failed, so try the next chunk... | ||
1570 | */ | ||
1571 | { | ||
1572 | int sec = max_sector - sector_nr; | ||
1573 | sectors_skipped += sec; | ||
1574 | chunks_skipped ++; | ||
1575 | sector_nr = max_sector; | ||
1576 | md_done_sync(mddev, sec, 1); | ||
1577 | goto skipped; | ||
1578 | } | ||
1579 | } | ||
1580 | |||
1581 | static int run(mddev_t *mddev) | ||
1582 | { | ||
1583 | conf_t *conf; | ||
1584 | int i, disk_idx; | ||
1585 | mirror_info_t *disk; | ||
1586 | mdk_rdev_t *rdev; | ||
1587 | struct list_head *tmp; | ||
1588 | int nc, fc; | ||
1589 | sector_t stride, size; | ||
1590 | |||
1591 | if (mddev->level != 10) { | ||
1592 | printk(KERN_ERR "raid10: %s: raid level not set correctly... (%d)\n", | ||
1593 | mdname(mddev), mddev->level); | ||
1594 | goto out; | ||
1595 | } | ||
1596 | nc = mddev->layout & 255; | ||
1597 | fc = (mddev->layout >> 8) & 255; | ||
1598 | if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || | ||
1599 | (mddev->layout >> 16)) { | ||
1600 | printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", | ||
1601 | mdname(mddev), mddev->layout); | ||
1602 | goto out; | ||
1603 | } | ||
1604 | /* | ||
1605 | * copy the already verified devices into our private RAID10 | ||
1606 | * bookkeeping area. [whatever we allocate in run(), | ||
1607 | * should be freed in stop()] | ||
1608 | */ | ||
1609 | conf = kmalloc(sizeof(conf_t), GFP_KERNEL); | ||
1610 | mddev->private = conf; | ||
1611 | if (!conf) { | ||
1612 | printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", | ||
1613 | mdname(mddev)); | ||
1614 | goto out; | ||
1615 | } | ||
1616 | memset(conf, 0, sizeof(*conf)); | ||
1617 | conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks, | ||
1618 | GFP_KERNEL); | ||
1619 | if (!conf->mirrors) { | ||
1620 | printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", | ||
1621 | mdname(mddev)); | ||
1622 | goto out_free_conf; | ||
1623 | } | ||
1624 | memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); | ||
1625 | |||
1626 | conf->near_copies = nc; | ||
1627 | conf->far_copies = fc; | ||
1628 | conf->copies = nc*fc; | ||
1629 | conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; | ||
1630 | conf->chunk_shift = ffz(~mddev->chunk_size) - 9; | ||
1631 | stride = mddev->size >> (conf->chunk_shift-1); | ||
1632 | sector_div(stride, fc); | ||
1633 | conf->stride = stride << conf->chunk_shift; | ||
1634 | |||
1635 | conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, | ||
1636 | r10bio_pool_free, conf); | ||
1637 | if (!conf->r10bio_pool) { | ||
1638 | printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", | ||
1639 | mdname(mddev)); | ||
1640 | goto out_free_conf; | ||
1641 | } | ||
1642 | mddev->queue->unplug_fn = raid10_unplug; | ||
1643 | |||
1644 | mddev->queue->issue_flush_fn = raid10_issue_flush; | ||
1645 | |||
1646 | ITERATE_RDEV(mddev, rdev, tmp) { | ||
1647 | disk_idx = rdev->raid_disk; | ||
1648 | if (disk_idx >= mddev->raid_disks | ||
1649 | || disk_idx < 0) | ||
1650 | continue; | ||
1651 | disk = conf->mirrors + disk_idx; | ||
1652 | |||
1653 | disk->rdev = rdev; | ||
1654 | |||
1655 | blk_queue_stack_limits(mddev->queue, | ||
1656 | rdev->bdev->bd_disk->queue); | ||
1657 | /* as we don't honour merge_bvec_fn, we must never risk | ||
1658 | * violating it, so limit ->max_sector to one PAGE, as | ||
1659 | * a one page request is never in violation. | ||
1660 | */ | ||
1661 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
1662 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | ||
1663 | mddev->queue->max_sectors = (PAGE_SIZE>>9); | ||
1664 | |||
1665 | disk->head_position = 0; | ||
1666 | if (!rdev->faulty && rdev->in_sync) | ||
1667 | conf->working_disks++; | ||
1668 | } | ||
1669 | conf->raid_disks = mddev->raid_disks; | ||
1670 | conf->mddev = mddev; | ||
1671 | spin_lock_init(&conf->device_lock); | ||
1672 | INIT_LIST_HEAD(&conf->retry_list); | ||
1673 | |||
1674 | spin_lock_init(&conf->resync_lock); | ||
1675 | init_waitqueue_head(&conf->wait_idle); | ||
1676 | init_waitqueue_head(&conf->wait_resume); | ||
1677 | |||
1678 | if (!conf->working_disks) { | ||
1679 | printk(KERN_ERR "raid10: no operational mirrors for %s\n", | ||
1680 | mdname(mddev)); | ||
1681 | goto out_free_conf; | ||
1682 | } | ||
1683 | |||
1684 | mddev->degraded = 0; | ||
1685 | for (i = 0; i < conf->raid_disks; i++) { | ||
1686 | |||
1687 | disk = conf->mirrors + i; | ||
1688 | |||
1689 | if (!disk->rdev) { | ||
1690 | disk->head_position = 0; | ||
1691 | mddev->degraded++; | ||
1692 | } | ||
1693 | } | ||
1694 | |||
1695 | |||
1696 | mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10"); | ||
1697 | if (!mddev->thread) { | ||
1698 | printk(KERN_ERR | ||
1699 | "raid10: couldn't allocate thread for %s\n", | ||
1700 | mdname(mddev)); | ||
1701 | goto out_free_conf; | ||
1702 | } | ||
1703 | |||
1704 | printk(KERN_INFO | ||
1705 | "raid10: raid set %s active with %d out of %d devices\n", | ||
1706 | mdname(mddev), mddev->raid_disks - mddev->degraded, | ||
1707 | mddev->raid_disks); | ||
1708 | /* | ||
1709 | * Ok, everything is just fine now | ||
1710 | */ | ||
1711 | size = conf->stride * conf->raid_disks; | ||
1712 | sector_div(size, conf->near_copies); | ||
1713 | mddev->array_size = size/2; | ||
1714 | mddev->resync_max_sectors = size; | ||
1715 | |||
1716 | /* Calculate max read-ahead size. | ||
1717 | * We need to readahead at least twice a whole stripe.... | ||
1718 | * maybe... | ||
1719 | */ | ||
1720 | { | ||
1721 | int stripe = conf->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; | ||
1722 | stripe /= conf->near_copies; | ||
1723 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) | ||
1724 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; | ||
1725 | } | ||
1726 | |||
1727 | if (conf->near_copies < mddev->raid_disks) | ||
1728 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); | ||
1729 | return 0; | ||
1730 | |||
1731 | out_free_conf: | ||
1732 | if (conf->r10bio_pool) | ||
1733 | mempool_destroy(conf->r10bio_pool); | ||
1734 | if (conf->mirrors) | ||
1735 | kfree(conf->mirrors); | ||
1736 | kfree(conf); | ||
1737 | mddev->private = NULL; | ||
1738 | out: | ||
1739 | return -EIO; | ||
1740 | } | ||
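The array-size and read-ahead sums carried out by run() above are easy to sanity-check by hand. The snippet below is a small standalone userspace sketch that repeats the same arithmetic; the parameter values (a per-drive stride of 2048*1024 sectors, 4 drives, 2 near copies, 64 KiB chunks, 4 KiB pages) are illustrative assumptions, not values taken from this patch.

#include <stdio.h>

int main(void)
{
	/* illustrative parameters (assumptions, not from the patch) */
	unsigned long long stride = 2048ULL * 1024;  /* per-drive sectors used by the array */
	int raid_disks  = 4;
	int near_copies = 2;
	int chunk_size  = 64 * 1024;                 /* bytes */
	int page_size   = 4096;

	/* mirrors: size = stride * raid_disks / near_copies; array_size = size/2 (KiB) */
	unsigned long long size = stride * raid_disks / near_copies;
	printf("array_size  = %llu KiB\n", size / 2);

	/* mirrors the read-ahead block: at least two whole stripes worth of pages */
	int stripe = raid_disks * chunk_size / page_size / near_copies;
	printf("ra_pages   >= %d pages\n", 2 * stripe);
	return 0;
}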
1741 | |||
1742 | static int stop(mddev_t *mddev) | ||
1743 | { | ||
1744 | conf_t *conf = mddev_to_conf(mddev); | ||
1745 | |||
1746 | md_unregister_thread(mddev->thread); | ||
1747 | mddev->thread = NULL; | ||
1748 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
1749 | if (conf->r10bio_pool) | ||
1750 | mempool_destroy(conf->r10bio_pool); | ||
1751 | if (conf->mirrors) | ||
1752 | kfree(conf->mirrors); | ||
1753 | kfree(conf); | ||
1754 | mddev->private = NULL; | ||
1755 | return 0; | ||
1756 | } | ||
1757 | |||
1758 | |||
1759 | static mdk_personality_t raid10_personality = | ||
1760 | { | ||
1761 | .name = "raid10", | ||
1762 | .owner = THIS_MODULE, | ||
1763 | .make_request = make_request, | ||
1764 | .run = run, | ||
1765 | .stop = stop, | ||
1766 | .status = status, | ||
1767 | .error_handler = error, | ||
1768 | .hot_add_disk = raid10_add_disk, | ||
1769 | .hot_remove_disk= raid10_remove_disk, | ||
1770 | .spare_active = raid10_spare_active, | ||
1771 | .sync_request = sync_request, | ||
1772 | }; | ||
1773 | |||
1774 | static int __init raid_init(void) | ||
1775 | { | ||
1776 | return register_md_personality(RAID10, &raid10_personality); | ||
1777 | } | ||
1778 | |||
1779 | static void raid_exit(void) | ||
1780 | { | ||
1781 | unregister_md_personality(RAID10); | ||
1782 | } | ||
1783 | |||
1784 | module_init(raid_init); | ||
1785 | module_exit(raid_exit); | ||
1786 | MODULE_LICENSE("GPL"); | ||
1787 | MODULE_ALIAS("md-personality-9"); /* RAID10 */ | ||
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c new file mode 100644 index 000000000000..52c3a81c4aa7 --- /dev/null +++ b/drivers/md/raid5.c | |||
@@ -0,0 +1,1965 @@ | |||
1 | /* | ||
2 | * raid5.c : Multiple Devices driver for Linux | ||
3 | * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman | ||
4 | * Copyright (C) 1999, 2000 Ingo Molnar | ||
5 | * | ||
6 | * RAID-5 management functions. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License as published by | ||
10 | * the Free Software Foundation; either version 2, or (at your option) | ||
11 | * any later version. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
15 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
16 | */ | ||
17 | |||
18 | |||
19 | #include <linux/config.h> | ||
20 | #include <linux/module.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/raid/raid5.h> | ||
23 | #include <linux/highmem.h> | ||
24 | #include <linux/bitops.h> | ||
25 | #include <asm/atomic.h> | ||
26 | |||
27 | /* | ||
28 | * Stripe cache | ||
29 | */ | ||
30 | |||
31 | #define NR_STRIPES 256 | ||
32 | #define STRIPE_SIZE PAGE_SIZE | ||
33 | #define STRIPE_SHIFT (PAGE_SHIFT - 9) | ||
34 | #define STRIPE_SECTORS (STRIPE_SIZE>>9) | ||
35 | #define IO_THRESHOLD 1 | ||
36 | #define HASH_PAGES 1 | ||
37 | #define HASH_PAGES_ORDER 0 | ||
38 | #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) | ||
39 | #define HASH_MASK (NR_HASH - 1) | ||
40 | |||
41 | #define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]) | ||
42 | |||
43 | /* bios attached to a stripe+device for I/O are linked together in bi_sector | ||
44 | * order without overlap. There may be several bios per stripe+device, and | ||
45 | * a bio could span several devices. | ||
46 | * When walking this list for a particular stripe+device, we must never proceed | ||
47 | * beyond a bio that extends past this device, as the next bio might no longer | ||
48 | * be valid. | ||
49 | * This macro is used to determine the 'next' bio in the list, given the sector | ||
50 | * of the current stripe+device | ||
51 | */ | ||
52 | #define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL) | ||
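The r5_next_bio() rule is easiest to see in isolation. The following is an illustrative userspace sketch, not kernel code: it models struct bio with only the fields the macro needs and walks a per-device chain the way handle_stripe() later does, stopping at the first bio that extends beyond this stripe+device's window.

#include <stdio.h>
#include <stddef.h>

#define STRIPE_SECTORS 8              /* assumed: 4 KiB stripe, 512-byte sectors */

struct bio {                          /* minimal stand-in for the kernel's struct bio */
	unsigned long long bi_sector; /* first sector covered by this bio */
	unsigned int bi_size;         /* length in bytes */
	struct bio *bi_next;          /* next bio queued for this stripe+device */
};

/* same rule as the r5_next_bio() macro: only follow bi_next while the
 * current bio ends inside this stripe+device's STRIPE_SECTORS window */
static struct bio *r5_next_bio(struct bio *bio, unsigned long long sect)
{
	if (bio->bi_sector + (bio->bi_size >> 9) < sect + STRIPE_SECTORS)
		return bio->bi_next;
	return NULL;
}

int main(void)
{
	/* four bios covering [100,102), [102,106), [106,112), [112,116) */
	struct bio d = { 112, 4 * 512, NULL };
	struct bio c = { 106, 6 * 512, &d };
	struct bio b = { 102, 4 * 512, &c };
	struct bio a = { 100, 2 * 512, &b };
	unsigned long long dev_sector = 100;   /* this device's window is [100,108) */

	for (struct bio *bi = &a; bi; bi = r5_next_bio(bi, dev_sector))
		printf("bio at sector %llu, %u sectors\n",
		       bi->bi_sector, bi->bi_size >> 9);
	/* the walk visits the first three bios and stops: the bio at 106
	 * extends to 112, past the window end at 108, so the one at 112
	 * (which might belong to the next stripe) is never followed */
	return 0;
}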
53 | /* | ||
54 | * The following can be used to debug the driver | ||
55 | */ | ||
56 | #define RAID5_DEBUG 0 | ||
57 | #define RAID5_PARANOIA 1 | ||
58 | #if RAID5_PARANOIA && defined(CONFIG_SMP) | ||
59 | # define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock) | ||
60 | #else | ||
61 | # define CHECK_DEVLOCK() | ||
62 | #endif | ||
63 | |||
64 | #define PRINTK(x...) ((void)(RAID5_DEBUG && printk(x))) | ||
65 | #if RAID5_DEBUG | ||
66 | #define inline | ||
67 | #define __inline__ | ||
68 | #endif | ||
69 | |||
70 | static void print_raid5_conf (raid5_conf_t *conf); | ||
71 | |||
72 | static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | ||
73 | { | ||
74 | if (atomic_dec_and_test(&sh->count)) { | ||
75 | if (!list_empty(&sh->lru)) | ||
76 | BUG(); | ||
77 | if (atomic_read(&conf->active_stripes)==0) | ||
78 | BUG(); | ||
79 | if (test_bit(STRIPE_HANDLE, &sh->state)) { | ||
80 | if (test_bit(STRIPE_DELAYED, &sh->state)) | ||
81 | list_add_tail(&sh->lru, &conf->delayed_list); | ||
82 | else | ||
83 | list_add_tail(&sh->lru, &conf->handle_list); | ||
84 | md_wakeup_thread(conf->mddev->thread); | ||
85 | } else { | ||
86 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
87 | atomic_dec(&conf->preread_active_stripes); | ||
88 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | ||
89 | md_wakeup_thread(conf->mddev->thread); | ||
90 | } | ||
91 | list_add_tail(&sh->lru, &conf->inactive_list); | ||
92 | atomic_dec(&conf->active_stripes); | ||
93 | if (!conf->inactive_blocked || | ||
94 | atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4)) | ||
95 | wake_up(&conf->wait_for_stripe); | ||
96 | } | ||
97 | } | ||
98 | } | ||
99 | static void release_stripe(struct stripe_head *sh) | ||
100 | { | ||
101 | raid5_conf_t *conf = sh->raid_conf; | ||
102 | unsigned long flags; | ||
103 | |||
104 | spin_lock_irqsave(&conf->device_lock, flags); | ||
105 | __release_stripe(conf, sh); | ||
106 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
107 | } | ||
108 | |||
109 | static void remove_hash(struct stripe_head *sh) | ||
110 | { | ||
111 | PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); | ||
112 | |||
113 | if (sh->hash_pprev) { | ||
114 | if (sh->hash_next) | ||
115 | sh->hash_next->hash_pprev = sh->hash_pprev; | ||
116 | *sh->hash_pprev = sh->hash_next; | ||
117 | sh->hash_pprev = NULL; | ||
118 | } | ||
119 | } | ||
120 | |||
121 | static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) | ||
122 | { | ||
123 | struct stripe_head **shp = &stripe_hash(conf, sh->sector); | ||
124 | |||
125 | PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); | ||
126 | |||
127 | CHECK_DEVLOCK(); | ||
128 | if ((sh->hash_next = *shp) != NULL) | ||
129 | (*shp)->hash_pprev = &sh->hash_next; | ||
130 | *shp = sh; | ||
131 | sh->hash_pprev = shp; | ||
132 | } | ||
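insert_hash() and remove_hash() above implement the classic intrusive chain in which every element stores the address of the pointer that points at it, so unlinking never walks the bucket. A compact userspace rendering of the same pattern (simplified node type, illustrative keys):

#include <stdio.h>
#include <stddef.h>

struct node {
	int key;
	struct node *next;    /* next element in this hash bucket        */
	struct node **pprev;  /* address of the pointer that points here */
};

static void insert_head(struct node **bucket, struct node *n)
{
	if ((n->next = *bucket) != NULL)      /* old head's back-pointer  */
		(*bucket)->pprev = &n->next;  /* now lives in n->next     */
	*bucket = n;
	n->pprev = bucket;
}

static void remove_node(struct node *n)
{
	if (n->pprev) {                       /* only if currently hashed */
		if (n->next)
			n->next->pprev = n->pprev;
		*n->pprev = n->next;
		n->pprev = NULL;
	}
}

int main(void)
{
	struct node *bucket = NULL;
	struct node a = { 1, NULL, NULL }, b = { 2, NULL, NULL };

	insert_head(&bucket, &a);
	insert_head(&bucket, &b);            /* chain is now: b -> a      */
	remove_node(&b);                     /* unlink without any walk   */
	for (struct node *p = bucket; p; p = p->next)
		printf("key %d\n", p->key);  /* prints only key 1         */
	return 0;
}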
133 | |||
134 | |||
135 | /* find an idle stripe, make sure it is unhashed, and return it. */ | ||
136 | static struct stripe_head *get_free_stripe(raid5_conf_t *conf) | ||
137 | { | ||
138 | struct stripe_head *sh = NULL; | ||
139 | struct list_head *first; | ||
140 | |||
141 | CHECK_DEVLOCK(); | ||
142 | if (list_empty(&conf->inactive_list)) | ||
143 | goto out; | ||
144 | first = conf->inactive_list.next; | ||
145 | sh = list_entry(first, struct stripe_head, lru); | ||
146 | list_del_init(first); | ||
147 | remove_hash(sh); | ||
148 | atomic_inc(&conf->active_stripes); | ||
149 | out: | ||
150 | return sh; | ||
151 | } | ||
152 | |||
153 | static void shrink_buffers(struct stripe_head *sh, int num) | ||
154 | { | ||
155 | struct page *p; | ||
156 | int i; | ||
157 | |||
158 | for (i=0; i<num ; i++) { | ||
159 | p = sh->dev[i].page; | ||
160 | if (!p) | ||
161 | continue; | ||
162 | sh->dev[i].page = NULL; | ||
163 | page_cache_release(p); | ||
164 | } | ||
165 | } | ||
166 | |||
167 | static int grow_buffers(struct stripe_head *sh, int num) | ||
168 | { | ||
169 | int i; | ||
170 | |||
171 | for (i=0; i<num; i++) { | ||
172 | struct page *page; | ||
173 | |||
174 | if (!(page = alloc_page(GFP_KERNEL))) { | ||
175 | return 1; | ||
176 | } | ||
177 | sh->dev[i].page = page; | ||
178 | } | ||
179 | return 0; | ||
180 | } | ||
181 | |||
182 | static void raid5_build_block (struct stripe_head *sh, int i); | ||
183 | |||
184 | static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) | ||
185 | { | ||
186 | raid5_conf_t *conf = sh->raid_conf; | ||
187 | int disks = conf->raid_disks, i; | ||
188 | |||
189 | if (atomic_read(&sh->count) != 0) | ||
190 | BUG(); | ||
191 | if (test_bit(STRIPE_HANDLE, &sh->state)) | ||
192 | BUG(); | ||
193 | |||
194 | CHECK_DEVLOCK(); | ||
195 | PRINTK("init_stripe called, stripe %llu\n", | ||
196 | (unsigned long long)sh->sector); | ||
197 | |||
198 | remove_hash(sh); | ||
199 | |||
200 | sh->sector = sector; | ||
201 | sh->pd_idx = pd_idx; | ||
202 | sh->state = 0; | ||
203 | |||
204 | for (i=disks; i--; ) { | ||
205 | struct r5dev *dev = &sh->dev[i]; | ||
206 | |||
207 | if (dev->toread || dev->towrite || dev->written || | ||
208 | test_bit(R5_LOCKED, &dev->flags)) { | ||
209 | printk("sector=%llx i=%d %p %p %p %d\n", | ||
210 | (unsigned long long)sh->sector, i, dev->toread, | ||
211 | dev->towrite, dev->written, | ||
212 | test_bit(R5_LOCKED, &dev->flags)); | ||
213 | BUG(); | ||
214 | } | ||
215 | dev->flags = 0; | ||
216 | raid5_build_block(sh, i); | ||
217 | } | ||
218 | insert_hash(conf, sh); | ||
219 | } | ||
220 | |||
221 | static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector) | ||
222 | { | ||
223 | struct stripe_head *sh; | ||
224 | |||
225 | CHECK_DEVLOCK(); | ||
226 | PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); | ||
227 | for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) | ||
228 | if (sh->sector == sector) | ||
229 | return sh; | ||
230 | PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); | ||
231 | return NULL; | ||
232 | } | ||
233 | |||
234 | static void unplug_slaves(mddev_t *mddev); | ||
235 | static void raid5_unplug_device(request_queue_t *q); | ||
236 | |||
237 | static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, | ||
238 | int pd_idx, int noblock) | ||
239 | { | ||
240 | struct stripe_head *sh; | ||
241 | |||
242 | PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector); | ||
243 | |||
244 | spin_lock_irq(&conf->device_lock); | ||
245 | |||
246 | do { | ||
247 | sh = __find_stripe(conf, sector); | ||
248 | if (!sh) { | ||
249 | if (!conf->inactive_blocked) | ||
250 | sh = get_free_stripe(conf); | ||
251 | if (noblock && sh == NULL) | ||
252 | break; | ||
253 | if (!sh) { | ||
254 | conf->inactive_blocked = 1; | ||
255 | wait_event_lock_irq(conf->wait_for_stripe, | ||
256 | !list_empty(&conf->inactive_list) && | ||
257 | (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4) | ||
258 | || !conf->inactive_blocked), | ||
259 | conf->device_lock, | ||
260 | unplug_slaves(conf->mddev); | ||
261 | ); | ||
262 | conf->inactive_blocked = 0; | ||
263 | } else | ||
264 | init_stripe(sh, sector, pd_idx); | ||
265 | } else { | ||
266 | if (atomic_read(&sh->count)) { | ||
267 | if (!list_empty(&sh->lru)) | ||
268 | BUG(); | ||
269 | } else { | ||
270 | if (!test_bit(STRIPE_HANDLE, &sh->state)) | ||
271 | atomic_inc(&conf->active_stripes); | ||
272 | if (list_empty(&sh->lru)) | ||
273 | BUG(); | ||
274 | list_del_init(&sh->lru); | ||
275 | } | ||
276 | } | ||
277 | } while (sh == NULL); | ||
278 | |||
279 | if (sh) | ||
280 | atomic_inc(&sh->count); | ||
281 | |||
282 | spin_unlock_irq(&conf->device_lock); | ||
283 | return sh; | ||
284 | } | ||
285 | |||
286 | static int grow_stripes(raid5_conf_t *conf, int num) | ||
287 | { | ||
288 | struct stripe_head *sh; | ||
289 | kmem_cache_t *sc; | ||
290 | int devs = conf->raid_disks; | ||
291 | |||
292 | sprintf(conf->cache_name, "raid5/%s", mdname(conf->mddev)); | ||
293 | |||
294 | sc = kmem_cache_create(conf->cache_name, | ||
295 | sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), | ||
296 | 0, 0, NULL, NULL); | ||
297 | if (!sc) | ||
298 | return 1; | ||
299 | conf->slab_cache = sc; | ||
300 | while (num--) { | ||
301 | sh = kmem_cache_alloc(sc, GFP_KERNEL); | ||
302 | if (!sh) | ||
303 | return 1; | ||
304 | memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev)); | ||
305 | sh->raid_conf = conf; | ||
306 | spin_lock_init(&sh->lock); | ||
307 | |||
308 | if (grow_buffers(sh, conf->raid_disks)) { | ||
309 | shrink_buffers(sh, conf->raid_disks); | ||
310 | kmem_cache_free(sc, sh); | ||
311 | return 1; | ||
312 | } | ||
313 | /* we just created an active stripe so... */ | ||
314 | atomic_set(&sh->count, 1); | ||
315 | atomic_inc(&conf->active_stripes); | ||
316 | INIT_LIST_HEAD(&sh->lru); | ||
317 | release_stripe(sh); | ||
318 | } | ||
319 | return 0; | ||
320 | } | ||
321 | |||
322 | static void shrink_stripes(raid5_conf_t *conf) | ||
323 | { | ||
324 | struct stripe_head *sh; | ||
325 | |||
326 | while (1) { | ||
327 | spin_lock_irq(&conf->device_lock); | ||
328 | sh = get_free_stripe(conf); | ||
329 | spin_unlock_irq(&conf->device_lock); | ||
330 | if (!sh) | ||
331 | break; | ||
332 | if (atomic_read(&sh->count)) | ||
333 | BUG(); | ||
334 | shrink_buffers(sh, conf->raid_disks); | ||
335 | kmem_cache_free(conf->slab_cache, sh); | ||
336 | atomic_dec(&conf->active_stripes); | ||
337 | } | ||
338 | kmem_cache_destroy(conf->slab_cache); | ||
339 | conf->slab_cache = NULL; | ||
340 | } | ||
341 | |||
342 | static int raid5_end_read_request (struct bio * bi, unsigned int bytes_done, | ||
343 | int error) | ||
344 | { | ||
345 | struct stripe_head *sh = bi->bi_private; | ||
346 | raid5_conf_t *conf = sh->raid_conf; | ||
347 | int disks = conf->raid_disks, i; | ||
348 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | ||
349 | |||
350 | if (bi->bi_size) | ||
351 | return 1; | ||
352 | |||
353 | for (i=0 ; i<disks; i++) | ||
354 | if (bi == &sh->dev[i].req) | ||
355 | break; | ||
356 | |||
357 | PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n", | ||
358 | (unsigned long long)sh->sector, i, atomic_read(&sh->count), | ||
359 | uptodate); | ||
360 | if (i == disks) { | ||
361 | BUG(); | ||
362 | return 0; | ||
363 | } | ||
364 | |||
365 | if (uptodate) { | ||
366 | #if 0 | ||
367 | struct bio *bio; | ||
368 | unsigned long flags; | ||
369 | spin_lock_irqsave(&conf->device_lock, flags); | ||
370 | /* we can return a buffer if we bypassed the cache or | ||
371 | * if the top buffer is not in highmem. If there are | ||
372 | * multiple buffers, leave the extra work to | ||
373 | * handle_stripe | ||
374 | */ | ||
375 | buffer = sh->bh_read[i]; | ||
376 | if (buffer && | ||
377 | (!PageHighMem(buffer->b_page) | ||
378 | || buffer->b_page == bh->b_page ) | ||
379 | ) { | ||
380 | sh->bh_read[i] = buffer->b_reqnext; | ||
381 | buffer->b_reqnext = NULL; | ||
382 | } else | ||
383 | buffer = NULL; | ||
384 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
385 | if (sh->bh_page[i]==bh->b_page) | ||
386 | set_buffer_uptodate(bh); | ||
387 | if (buffer) { | ||
388 | if (buffer->b_page != bh->b_page) | ||
389 | memcpy(buffer->b_data, bh->b_data, bh->b_size); | ||
390 | buffer->b_end_io(buffer, 1); | ||
391 | } | ||
392 | #else | ||
393 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
394 | #endif | ||
395 | } else { | ||
396 | md_error(conf->mddev, conf->disks[i].rdev); | ||
397 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
398 | } | ||
399 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | ||
400 | #if 0 | ||
401 | /* must restore b_page before unlocking buffer... */ | ||
402 | if (sh->bh_page[i] != bh->b_page) { | ||
403 | bh->b_page = sh->bh_page[i]; | ||
404 | bh->b_data = page_address(bh->b_page); | ||
405 | clear_buffer_uptodate(bh); | ||
406 | } | ||
407 | #endif | ||
408 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
409 | set_bit(STRIPE_HANDLE, &sh->state); | ||
410 | release_stripe(sh); | ||
411 | return 0; | ||
412 | } | ||
413 | |||
414 | static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, | ||
415 | int error) | ||
416 | { | ||
417 | struct stripe_head *sh = bi->bi_private; | ||
418 | raid5_conf_t *conf = sh->raid_conf; | ||
419 | int disks = conf->raid_disks, i; | ||
420 | unsigned long flags; | ||
421 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | ||
422 | |||
423 | if (bi->bi_size) | ||
424 | return 1; | ||
425 | |||
426 | for (i=0 ; i<disks; i++) | ||
427 | if (bi == &sh->dev[i].req) | ||
428 | break; | ||
429 | |||
430 | PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n", | ||
431 | (unsigned long long)sh->sector, i, atomic_read(&sh->count), | ||
432 | uptodate); | ||
433 | if (i == disks) { | ||
434 | BUG(); | ||
435 | return 0; | ||
436 | } | ||
437 | |||
438 | spin_lock_irqsave(&conf->device_lock, flags); | ||
439 | if (!uptodate) | ||
440 | md_error(conf->mddev, conf->disks[i].rdev); | ||
441 | |||
442 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | ||
443 | |||
444 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
445 | set_bit(STRIPE_HANDLE, &sh->state); | ||
446 | __release_stripe(conf, sh); | ||
447 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
448 | return 0; | ||
449 | } | ||
450 | |||
451 | |||
452 | static sector_t compute_blocknr(struct stripe_head *sh, int i); | ||
453 | |||
454 | static void raid5_build_block (struct stripe_head *sh, int i) | ||
455 | { | ||
456 | struct r5dev *dev = &sh->dev[i]; | ||
457 | |||
458 | bio_init(&dev->req); | ||
459 | dev->req.bi_io_vec = &dev->vec; | ||
460 | dev->req.bi_vcnt++; | ||
461 | dev->req.bi_max_vecs++; | ||
462 | dev->vec.bv_page = dev->page; | ||
463 | dev->vec.bv_len = STRIPE_SIZE; | ||
464 | dev->vec.bv_offset = 0; | ||
465 | |||
466 | dev->req.bi_sector = sh->sector; | ||
467 | dev->req.bi_private = sh; | ||
468 | |||
469 | dev->flags = 0; | ||
470 | if (i != sh->pd_idx) | ||
471 | dev->sector = compute_blocknr(sh, i); | ||
472 | } | ||
473 | |||
474 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | ||
475 | { | ||
476 | char b[BDEVNAME_SIZE]; | ||
477 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | ||
478 | PRINTK("raid5: error called\n"); | ||
479 | |||
480 | if (!rdev->faulty) { | ||
481 | mddev->sb_dirty = 1; | ||
482 | if (rdev->in_sync) { | ||
483 | conf->working_disks--; | ||
484 | mddev->degraded++; | ||
485 | conf->failed_disks++; | ||
486 | rdev->in_sync = 0; | ||
487 | /* | ||
488 | * if recovery was running, make sure it aborts. | ||
489 | */ | ||
490 | set_bit(MD_RECOVERY_ERR, &mddev->recovery); | ||
491 | } | ||
492 | rdev->faulty = 1; | ||
493 | printk (KERN_ALERT | ||
494 | "raid5: Disk failure on %s, disabling device." | ||
495 | " Operation continuing on %d devices\n", | ||
496 | bdevname(rdev->bdev,b), conf->working_disks); | ||
497 | } | ||
498 | } | ||
499 | |||
500 | /* | ||
501 | * Input: a 'big' sector number, | ||
502 | * Output: index of the data and parity disk, and the sector # in them. | ||
503 | */ | ||
504 | static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, | ||
505 | unsigned int data_disks, unsigned int * dd_idx, | ||
506 | unsigned int * pd_idx, raid5_conf_t *conf) | ||
507 | { | ||
508 | long stripe; | ||
509 | unsigned long chunk_number; | ||
510 | unsigned int chunk_offset; | ||
511 | sector_t new_sector; | ||
512 | int sectors_per_chunk = conf->chunk_size >> 9; | ||
513 | |||
514 | /* First compute the information on this sector */ | ||
515 | |||
516 | /* | ||
517 | * Compute the chunk number and the sector offset inside the chunk | ||
518 | */ | ||
519 | chunk_offset = sector_div(r_sector, sectors_per_chunk); | ||
520 | chunk_number = r_sector; | ||
521 | BUG_ON(r_sector != chunk_number); | ||
522 | |||
523 | /* | ||
524 | * Compute the stripe number | ||
525 | */ | ||
526 | stripe = chunk_number / data_disks; | ||
527 | |||
528 | /* | ||
529 | * Compute the data disk and parity disk indexes inside the stripe | ||
530 | */ | ||
531 | *dd_idx = chunk_number % data_disks; | ||
532 | |||
533 | /* | ||
534 | * Select the parity disk based on the user selected algorithm. | ||
535 | */ | ||
536 | if (conf->level == 4) | ||
537 | *pd_idx = data_disks; | ||
538 | else switch (conf->algorithm) { | ||
539 | case ALGORITHM_LEFT_ASYMMETRIC: | ||
540 | *pd_idx = data_disks - stripe % raid_disks; | ||
541 | if (*dd_idx >= *pd_idx) | ||
542 | (*dd_idx)++; | ||
543 | break; | ||
544 | case ALGORITHM_RIGHT_ASYMMETRIC: | ||
545 | *pd_idx = stripe % raid_disks; | ||
546 | if (*dd_idx >= *pd_idx) | ||
547 | (*dd_idx)++; | ||
548 | break; | ||
549 | case ALGORITHM_LEFT_SYMMETRIC: | ||
550 | *pd_idx = data_disks - stripe % raid_disks; | ||
551 | *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; | ||
552 | break; | ||
553 | case ALGORITHM_RIGHT_SYMMETRIC: | ||
554 | *pd_idx = stripe % raid_disks; | ||
555 | *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; | ||
556 | break; | ||
557 | default: | ||
558 | printk("raid5: unsupported algorithm %d\n", | ||
559 | conf->algorithm); | ||
560 | } | ||
561 | |||
562 | /* | ||
563 | * Finally, compute the new sector number | ||
564 | */ | ||
565 | new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; | ||
566 | return new_sector; | ||
567 | } | ||
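To follow the mapping end to end, the sketch below re-derives the data-disk index, parity-disk index and per-device sector for one logical sector using the same steps as raid5_compute_sector() in the ALGORITHM_LEFT_SYMMETRIC case. The inputs (5 drives, 64 KiB chunks, logical sector 1000000) are arbitrary illustration values.

#include <stdio.h>

int main(void)
{
	/* illustrative parameters (assumptions) */
	unsigned long long r_sector = 1000000;  /* logical array sector */
	unsigned int raid_disks = 5;
	unsigned int data_disks = raid_disks - 1;
	unsigned int sectors_per_chunk = (64 * 1024) >> 9;   /* 128 */

	/* same steps as raid5_compute_sector(), left-symmetric layout */
	unsigned int chunk_offset = r_sector % sectors_per_chunk;
	unsigned long long chunk_number = r_sector / sectors_per_chunk;
	unsigned long long stripe = chunk_number / data_disks;
	unsigned int dd = chunk_number % data_disks;

	unsigned int pd_idx = data_disks - stripe % raid_disks;
	unsigned int dd_idx = (pd_idx + 1 + dd) % raid_disks;
	unsigned long long new_sector = stripe * sectors_per_chunk + chunk_offset;

	printf("data disk %u, parity disk %u, device sector %llu\n",
	       dd_idx, pd_idx, new_sector);
	return 0;
}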
568 | |||
569 | |||
570 | static sector_t compute_blocknr(struct stripe_head *sh, int i) | ||
571 | { | ||
572 | raid5_conf_t *conf = sh->raid_conf; | ||
573 | int raid_disks = conf->raid_disks, data_disks = raid_disks - 1; | ||
574 | sector_t new_sector = sh->sector, check; | ||
575 | int sectors_per_chunk = conf->chunk_size >> 9; | ||
576 | sector_t stripe; | ||
577 | int chunk_offset; | ||
578 | int chunk_number, dummy1, dummy2, dd_idx = i; | ||
579 | sector_t r_sector; | ||
580 | |||
581 | chunk_offset = sector_div(new_sector, sectors_per_chunk); | ||
582 | stripe = new_sector; | ||
583 | BUG_ON(new_sector != stripe); | ||
584 | |||
585 | |||
586 | switch (conf->algorithm) { | ||
587 | case ALGORITHM_LEFT_ASYMMETRIC: | ||
588 | case ALGORITHM_RIGHT_ASYMMETRIC: | ||
589 | if (i > sh->pd_idx) | ||
590 | i--; | ||
591 | break; | ||
592 | case ALGORITHM_LEFT_SYMMETRIC: | ||
593 | case ALGORITHM_RIGHT_SYMMETRIC: | ||
594 | if (i < sh->pd_idx) | ||
595 | i += raid_disks; | ||
596 | i -= (sh->pd_idx + 1); | ||
597 | break; | ||
598 | default: | ||
599 | printk("raid5: unsupported algorithm %d\n", | ||
600 | conf->algorithm); | ||
601 | } | ||
602 | |||
603 | chunk_number = stripe * data_disks + i; | ||
604 | r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; | ||
605 | |||
606 | check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); | ||
607 | if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { | ||
608 | printk("compute_blocknr: map not correct\n"); | ||
609 | return 0; | ||
610 | } | ||
611 | return r_sector; | ||
612 | } | ||
613 | |||
614 | |||
615 | |||
616 | /* | ||
617 | * Copy data between a page in the stripe cache, and a bio. | ||
618 | * There are no alignment or size guarantees between the page or the | ||
619 | * bio except that there is some overlap. | ||
620 | * All iovecs in the bio must be considered. | ||
621 | */ | ||
622 | static void copy_data(int frombio, struct bio *bio, | ||
623 | struct page *page, | ||
624 | sector_t sector) | ||
625 | { | ||
626 | char *pa = page_address(page); | ||
627 | struct bio_vec *bvl; | ||
628 | int i; | ||
629 | int page_offset; | ||
630 | |||
631 | if (bio->bi_sector >= sector) | ||
632 | page_offset = (signed)(bio->bi_sector - sector) * 512; | ||
633 | else | ||
634 | page_offset = (signed)(sector - bio->bi_sector) * -512; | ||
635 | bio_for_each_segment(bvl, bio, i) { | ||
636 | int len = bio_iovec_idx(bio,i)->bv_len; | ||
637 | int clen; | ||
638 | int b_offset = 0; | ||
639 | |||
640 | if (page_offset < 0) { | ||
641 | b_offset = -page_offset; | ||
642 | page_offset += b_offset; | ||
643 | len -= b_offset; | ||
644 | } | ||
645 | |||
646 | if (len > 0 && page_offset + len > STRIPE_SIZE) | ||
647 | clen = STRIPE_SIZE - page_offset; | ||
648 | else clen = len; | ||
649 | |||
650 | if (clen > 0) { | ||
651 | char *ba = __bio_kmap_atomic(bio, i, KM_USER0); | ||
652 | if (frombio) | ||
653 | memcpy(pa+page_offset, ba+b_offset, clen); | ||
654 | else | ||
655 | memcpy(ba+b_offset, pa+page_offset, clen); | ||
656 | __bio_kunmap_atomic(ba, KM_USER0); | ||
657 | } | ||
658 | if (clen < len) /* hit end of page */ | ||
659 | break; | ||
660 | page_offset += len; | ||
661 | } | ||
662 | } | ||
663 | |||
664 | #define check_xor() do { \ | ||
665 | if (count == MAX_XOR_BLOCKS) { \ | ||
666 | xor_block(count, STRIPE_SIZE, ptr); \ | ||
667 | count = 1; \ | ||
668 | } \ | ||
669 | } while(0) | ||
670 | |||
671 | |||
672 | static void compute_block(struct stripe_head *sh, int dd_idx) | ||
673 | { | ||
674 | raid5_conf_t *conf = sh->raid_conf; | ||
675 | int i, count, disks = conf->raid_disks; | ||
676 | void *ptr[MAX_XOR_BLOCKS], *p; | ||
677 | |||
678 | PRINTK("compute_block, stripe %llu, idx %d\n", | ||
679 | (unsigned long long)sh->sector, dd_idx); | ||
680 | |||
681 | ptr[0] = page_address(sh->dev[dd_idx].page); | ||
682 | memset(ptr[0], 0, STRIPE_SIZE); | ||
683 | count = 1; | ||
684 | for (i = disks ; i--; ) { | ||
685 | if (i == dd_idx) | ||
686 | continue; | ||
687 | p = page_address(sh->dev[i].page); | ||
688 | if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) | ||
689 | ptr[count++] = p; | ||
690 | else | ||
691 | printk("compute_block() %d, stripe %llu, %d" | ||
692 | " not present\n", dd_idx, | ||
693 | (unsigned long long)sh->sector, i); | ||
694 | |||
695 | check_xor(); | ||
696 | } | ||
697 | if (count != 1) | ||
698 | xor_block(count, STRIPE_SIZE, ptr); | ||
699 | set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | ||
700 | } | ||
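compute_block() above rebuilds a missing block purely by XOR: the lost block equals the XOR of the parity block and every surviving data block. A self-contained userspace demonstration (block count, block size and contents are illustrative assumptions):

#include <stdio.h>
#include <string.h>

#define NDISKS      5     /* illustrative: 4 data blocks + 1 parity */
#define BLOCK_SIZE 16     /* stand-in for STRIPE_SIZE               */

int main(void)
{
	unsigned char blk[NDISKS][BLOCK_SIZE];
	unsigned char rebuilt[BLOCK_SIZE];
	int pd = NDISKS - 1, lost = 2, i, j;

	/* fill the data blocks with arbitrary bytes and compute parity */
	memset(blk[pd], 0, BLOCK_SIZE);
	for (i = 0; i < pd; i++)
		for (j = 0; j < BLOCK_SIZE; j++) {
			blk[i][j] = (unsigned char)(i * 37 + j);
			blk[pd][j] ^= blk[i][j];
		}

	/* "lose" one data block, then recompute it as compute_block() does:
	 * start from zero and XOR in every other block, parity included */
	memset(rebuilt, 0, BLOCK_SIZE);
	for (i = 0; i < NDISKS; i++) {
		if (i == lost)
			continue;
		for (j = 0; j < BLOCK_SIZE; j++)
			rebuilt[j] ^= blk[i][j];
	}

	printf("rebuilt block %s the original\n",
	       memcmp(rebuilt, blk[lost], BLOCK_SIZE) ? "differs from" : "matches");
	return 0;
}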
701 | |||
702 | static void compute_parity(struct stripe_head *sh, int method) | ||
703 | { | ||
704 | raid5_conf_t *conf = sh->raid_conf; | ||
705 | int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; | ||
706 | void *ptr[MAX_XOR_BLOCKS]; | ||
707 | struct bio *chosen; | ||
708 | |||
709 | PRINTK("compute_parity, stripe %llu, method %d\n", | ||
710 | (unsigned long long)sh->sector, method); | ||
711 | |||
712 | count = 1; | ||
713 | ptr[0] = page_address(sh->dev[pd_idx].page); | ||
714 | switch(method) { | ||
715 | case READ_MODIFY_WRITE: | ||
716 | if (!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags)) | ||
717 | BUG(); | ||
718 | for (i=disks ; i-- ;) { | ||
719 | if (i==pd_idx) | ||
720 | continue; | ||
721 | if (sh->dev[i].towrite && | ||
722 | test_bit(R5_UPTODATE, &sh->dev[i].flags)) { | ||
723 | ptr[count++] = page_address(sh->dev[i].page); | ||
724 | chosen = sh->dev[i].towrite; | ||
725 | sh->dev[i].towrite = NULL; | ||
726 | |||
727 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
728 | wake_up(&conf->wait_for_overlap); | ||
729 | |||
730 | if (sh->dev[i].written) BUG(); | ||
731 | sh->dev[i].written = chosen; | ||
732 | check_xor(); | ||
733 | } | ||
734 | } | ||
735 | break; | ||
736 | case RECONSTRUCT_WRITE: | ||
737 | memset(ptr[0], 0, STRIPE_SIZE); | ||
738 | for (i= disks; i-- ;) | ||
739 | if (i!=pd_idx && sh->dev[i].towrite) { | ||
740 | chosen = sh->dev[i].towrite; | ||
741 | sh->dev[i].towrite = NULL; | ||
742 | |||
743 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
744 | wake_up(&conf->wait_for_overlap); | ||
745 | |||
746 | if (sh->dev[i].written) BUG(); | ||
747 | sh->dev[i].written = chosen; | ||
748 | } | ||
749 | break; | ||
750 | case CHECK_PARITY: | ||
751 | break; | ||
752 | } | ||
753 | if (count>1) { | ||
754 | xor_block(count, STRIPE_SIZE, ptr); | ||
755 | count = 1; | ||
756 | } | ||
757 | |||
758 | for (i = disks; i--;) | ||
759 | if (sh->dev[i].written) { | ||
760 | sector_t sector = sh->dev[i].sector; | ||
761 | struct bio *wbi = sh->dev[i].written; | ||
762 | while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { | ||
763 | copy_data(1, wbi, sh->dev[i].page, sector); | ||
764 | wbi = r5_next_bio(wbi, sector); | ||
765 | } | ||
766 | |||
767 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
768 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
769 | } | ||
770 | |||
771 | switch(method) { | ||
772 | case RECONSTRUCT_WRITE: | ||
773 | case CHECK_PARITY: | ||
774 | for (i=disks; i--;) | ||
775 | if (i != pd_idx) { | ||
776 | ptr[count++] = page_address(sh->dev[i].page); | ||
777 | check_xor(); | ||
778 | } | ||
779 | break; | ||
780 | case READ_MODIFY_WRITE: | ||
781 | for (i = disks; i--;) | ||
782 | if (sh->dev[i].written) { | ||
783 | ptr[count++] = page_address(sh->dev[i].page); | ||
784 | check_xor(); | ||
785 | } | ||
786 | } | ||
787 | if (count != 1) | ||
788 | xor_block(count, STRIPE_SIZE, ptr); | ||
789 | |||
790 | if (method != CHECK_PARITY) { | ||
791 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
792 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); | ||
793 | } else | ||
794 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
795 | } | ||
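compute_parity() reaches the same parity block by two routes: READ_MODIFY_WRITE starts from the old parity and XORs in both the old and the new contents of each rewritten block, while RECONSTRUCT_WRITE zeroes the parity and XORs in every data block afresh. The sketch below, using single-byte "blocks" with made-up values, checks that the two agree:

#include <stdio.h>

int main(void)
{
	/* four one-byte data "blocks" (illustrative) and their parity */
	unsigned char d[4] = { 0x11, 0x22, 0x33, 0x44 };
	unsigned char parity = d[0] ^ d[1] ^ d[2] ^ d[3];

	unsigned char new_d1 = 0xA5;          /* block 1 is being rewritten */

	/* READ_MODIFY_WRITE: xor out the old data, xor in the new data */
	unsigned char rmw_parity = parity ^ d[1] ^ new_d1;

	/* RECONSTRUCT_WRITE: recompute from all (updated) data blocks */
	d[1] = new_d1;
	unsigned char rcw_parity = d[0] ^ d[1] ^ d[2] ^ d[3];

	printf("rmw=0x%02x rcw=0x%02x -> %s\n",
	       (unsigned)rmw_parity, (unsigned)rcw_parity,
	       rmw_parity == rcw_parity ? "identical" : "mismatch");
	return 0;
}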
796 | |||
797 | /* | ||
798 | * Each stripe/dev can have one or more bion attached. | ||
799 | * toread/towrite point to the first in a chain. | ||
800 | * The bi_next chain must be in order. | ||
801 | */ | ||
802 | static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) | ||
803 | { | ||
804 | struct bio **bip; | ||
805 | raid5_conf_t *conf = sh->raid_conf; | ||
806 | |||
807 | PRINTK("adding bh b#%llu to stripe s#%llu\n", | ||
808 | (unsigned long long)bi->bi_sector, | ||
809 | (unsigned long long)sh->sector); | ||
810 | |||
811 | |||
812 | spin_lock(&sh->lock); | ||
813 | spin_lock_irq(&conf->device_lock); | ||
814 | if (forwrite) | ||
815 | bip = &sh->dev[dd_idx].towrite; | ||
816 | else | ||
817 | bip = &sh->dev[dd_idx].toread; | ||
818 | while (*bip && (*bip)->bi_sector < bi->bi_sector) { | ||
819 | if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) | ||
820 | goto overlap; | ||
821 | bip = & (*bip)->bi_next; | ||
822 | } | ||
823 | if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) | ||
824 | goto overlap; | ||
825 | |||
826 | if (*bip && bi->bi_next && (*bip) != bi->bi_next) | ||
827 | BUG(); | ||
828 | if (*bip) | ||
829 | bi->bi_next = *bip; | ||
830 | *bip = bi; | ||
831 | bi->bi_phys_segments ++; | ||
832 | spin_unlock_irq(&conf->device_lock); | ||
833 | spin_unlock(&sh->lock); | ||
834 | |||
835 | PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n", | ||
836 | (unsigned long long)bi->bi_sector, | ||
837 | (unsigned long long)sh->sector, dd_idx); | ||
838 | |||
839 | if (forwrite) { | ||
840 | /* check if page is covered */ | ||
841 | sector_t sector = sh->dev[dd_idx].sector; | ||
842 | for (bi=sh->dev[dd_idx].towrite; | ||
843 | sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && | ||
844 | bi && bi->bi_sector <= sector; | ||
845 | bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { | ||
846 | if (bi->bi_sector + (bi->bi_size>>9) >= sector) | ||
847 | sector = bi->bi_sector + (bi->bi_size>>9); | ||
848 | } | ||
849 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) | ||
850 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); | ||
851 | } | ||
852 | return 1; | ||
853 | |||
854 | overlap: | ||
855 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); | ||
856 | spin_unlock_irq(&conf->device_lock); | ||
857 | spin_unlock(&sh->lock); | ||
858 | return 0; | ||
859 | } | ||
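The loop at the end of add_stripe_bio() checks whether the queued writes completely cover this device's STRIPE_SECTORS window, in which case R5_OVERWRITE is set and the old block never needs to be read. A standalone userspace version of that coverage walk (simplified bio type, illustrative sectors; the early-exit bound and the r5_next_bio() window check are omitted for brevity):

#include <stdio.h>

#define STRIPE_SECTORS 8   /* assumed: 4 KiB stripe, 512-byte sectors */

struct bio {
	unsigned long long bi_sector;   /* first sector written            */
	unsigned int sectors;           /* length (bi_size >> 9 in kernel) */
	struct bio *bi_next;            /* sorted, non-overlapping chain   */
};

/* returns 1 if the sorted chain covers [dev_sector, dev_sector + STRIPE_SECTORS) */
static int covers_whole_stripe(struct bio *towrite, unsigned long long dev_sector)
{
	unsigned long long sector = dev_sector;
	struct bio *bi;

	/* advance 'sector' across adjacent or overlapping writes only */
	for (bi = towrite; bi && bi->bi_sector <= sector; bi = bi->bi_next)
		if (bi->bi_sector + bi->sectors >= sector)
			sector = bi->bi_sector + bi->sectors;
	return sector >= dev_sector + STRIPE_SECTORS;
}

int main(void)
{
	/* two writes: [100,104) and [104,108) together cover the window [100,108) */
	struct bio b = { 104, 4, NULL };
	struct bio a = { 100, 4, &b };

	printf("fully overwritten: %s\n",
	       covers_whole_stripe(&a, 100) ? "yes (set R5_OVERWRITE)" : "no");
	return 0;
}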
860 | |||
861 | |||
862 | /* | ||
863 | * handle_stripe - do things to a stripe. | ||
864 | * | ||
865 | * We lock the stripe and then examine the state of various bits | ||
866 | * to see what needs to be done. | ||
867 | * Possible results: | ||
868 | * return some read requests which now have data | ||
869 | * return some write requests which are safely on disc | ||
870 | * schedule a read on some buffers | ||
871 | * schedule a write of some buffers | ||
872 | * return confirmation of parity correctness | ||
873 | * | ||
874 | * Parity calculations are done inside the stripe lock; | ||
875 | * buffers are taken off read_list or write_list, and bh_cache buffers | ||
876 | * get BH_Lock set before the stripe lock is released. | ||
877 | * | ||
878 | */ | ||
879 | |||
880 | static void handle_stripe(struct stripe_head *sh) | ||
881 | { | ||
882 | raid5_conf_t *conf = sh->raid_conf; | ||
883 | int disks = conf->raid_disks; | ||
884 | struct bio *return_bi= NULL; | ||
885 | struct bio *bi; | ||
886 | int i; | ||
887 | int syncing; | ||
888 | int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; | ||
889 | int non_overwrite = 0; | ||
890 | int failed_num=0; | ||
891 | struct r5dev *dev; | ||
892 | |||
893 | PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n", | ||
894 | (unsigned long long)sh->sector, atomic_read(&sh->count), | ||
895 | sh->pd_idx); | ||
896 | |||
897 | spin_lock(&sh->lock); | ||
898 | clear_bit(STRIPE_HANDLE, &sh->state); | ||
899 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
900 | |||
901 | syncing = test_bit(STRIPE_SYNCING, &sh->state); | ||
902 | /* Now to look around and see what can be done */ | ||
903 | |||
904 | for (i=disks; i--; ) { | ||
905 | mdk_rdev_t *rdev; | ||
906 | dev = &sh->dev[i]; | ||
907 | clear_bit(R5_Insync, &dev->flags); | ||
908 | clear_bit(R5_Syncio, &dev->flags); | ||
909 | |||
910 | PRINTK("check %d: state 0x%lx read %p write %p written %p\n", | ||
911 | i, dev->flags, dev->toread, dev->towrite, dev->written); | ||
912 | /* maybe we can reply to a read */ | ||
913 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { | ||
914 | struct bio *rbi, *rbi2; | ||
915 | PRINTK("Return read for disc %d\n", i); | ||
916 | spin_lock_irq(&conf->device_lock); | ||
917 | rbi = dev->toread; | ||
918 | dev->toread = NULL; | ||
919 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) | ||
920 | wake_up(&conf->wait_for_overlap); | ||
921 | spin_unlock_irq(&conf->device_lock); | ||
922 | while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { | ||
923 | copy_data(0, rbi, dev->page, dev->sector); | ||
924 | rbi2 = r5_next_bio(rbi, dev->sector); | ||
925 | spin_lock_irq(&conf->device_lock); | ||
926 | if (--rbi->bi_phys_segments == 0) { | ||
927 | rbi->bi_next = return_bi; | ||
928 | return_bi = rbi; | ||
929 | } | ||
930 | spin_unlock_irq(&conf->device_lock); | ||
931 | rbi = rbi2; | ||
932 | } | ||
933 | } | ||
934 | |||
935 | /* now count some things */ | ||
936 | if (test_bit(R5_LOCKED, &dev->flags)) locked++; | ||
937 | if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; | ||
938 | |||
939 | |||
940 | if (dev->toread) to_read++; | ||
941 | if (dev->towrite) { | ||
942 | to_write++; | ||
943 | if (!test_bit(R5_OVERWRITE, &dev->flags)) | ||
944 | non_overwrite++; | ||
945 | } | ||
946 | if (dev->written) written++; | ||
947 | rdev = conf->disks[i].rdev; /* FIXME, should I be looking at rdev? */ | ||
948 | if (!rdev || !rdev->in_sync) { | ||
949 | failed++; | ||
950 | failed_num = i; | ||
951 | } else | ||
952 | set_bit(R5_Insync, &dev->flags); | ||
953 | } | ||
954 | PRINTK("locked=%d uptodate=%d to_read=%d" | ||
955 | " to_write=%d failed=%d failed_num=%d\n", | ||
956 | locked, uptodate, to_read, to_write, failed, failed_num); | ||
957 | /* check if the array has lost two devices and, if so, some requests might | ||
958 | * need to be failed | ||
959 | */ | ||
960 | if (failed > 1 && to_read+to_write+written) { | ||
961 | spin_lock_irq(&conf->device_lock); | ||
962 | for (i=disks; i--; ) { | ||
963 | /* fail all writes first */ | ||
964 | bi = sh->dev[i].towrite; | ||
965 | sh->dev[i].towrite = NULL; | ||
966 | if (bi) to_write--; | ||
967 | |||
968 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
969 | wake_up(&conf->wait_for_overlap); | ||
970 | |||
971 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ | ||
972 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | ||
973 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
974 | if (--bi->bi_phys_segments == 0) { | ||
975 | md_write_end(conf->mddev); | ||
976 | bi->bi_next = return_bi; | ||
977 | return_bi = bi; | ||
978 | } | ||
979 | bi = nextbi; | ||
980 | } | ||
981 | /* and fail all 'written' */ | ||
982 | bi = sh->dev[i].written; | ||
983 | sh->dev[i].written = NULL; | ||
984 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { | ||
985 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); | ||
986 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
987 | if (--bi->bi_phys_segments == 0) { | ||
988 | md_write_end(conf->mddev); | ||
989 | bi->bi_next = return_bi; | ||
990 | return_bi = bi; | ||
991 | } | ||
992 | bi = bi2; | ||
993 | } | ||
994 | |||
995 | /* fail any reads if this device is non-operational */ | ||
996 | if (!test_bit(R5_Insync, &sh->dev[i].flags)) { | ||
997 | bi = sh->dev[i].toread; | ||
998 | sh->dev[i].toread = NULL; | ||
999 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
1000 | wake_up(&conf->wait_for_overlap); | ||
1001 | if (bi) to_read--; | ||
1002 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ | ||
1003 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | ||
1004 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1005 | if (--bi->bi_phys_segments == 0) { | ||
1006 | bi->bi_next = return_bi; | ||
1007 | return_bi = bi; | ||
1008 | } | ||
1009 | bi = nextbi; | ||
1010 | } | ||
1011 | } | ||
1012 | } | ||
1013 | spin_unlock_irq(&conf->device_lock); | ||
1014 | } | ||
1015 | if (failed > 1 && syncing) { | ||
1016 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | ||
1017 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
1018 | syncing = 0; | ||
1019 | } | ||
1020 | |||
1021 | /* might be able to return some write requests if the parity block | ||
1022 | * is safe, or on a failed drive | ||
1023 | */ | ||
1024 | dev = &sh->dev[sh->pd_idx]; | ||
1025 | if ( written && | ||
1026 | ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) && | ||
1027 | test_bit(R5_UPTODATE, &dev->flags)) | ||
1028 | || (failed == 1 && failed_num == sh->pd_idx)) | ||
1029 | ) { | ||
1030 | /* any written block on an uptodate or failed drive can be returned. | ||
1031 | * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but | ||
1032 | * never LOCKED, so we don't need to test 'failed' directly. | ||
1033 | */ | ||
1034 | for (i=disks; i--; ) | ||
1035 | if (sh->dev[i].written) { | ||
1036 | dev = &sh->dev[i]; | ||
1037 | if (!test_bit(R5_LOCKED, &dev->flags) && | ||
1038 | test_bit(R5_UPTODATE, &dev->flags) ) { | ||
1039 | /* We can return any write requests */ | ||
1040 | struct bio *wbi, *wbi2; | ||
1041 | PRINTK("Return write for disc %d\n", i); | ||
1042 | spin_lock_irq(&conf->device_lock); | ||
1043 | wbi = dev->written; | ||
1044 | dev->written = NULL; | ||
1045 | while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { | ||
1046 | wbi2 = r5_next_bio(wbi, dev->sector); | ||
1047 | if (--wbi->bi_phys_segments == 0) { | ||
1048 | md_write_end(conf->mddev); | ||
1049 | wbi->bi_next = return_bi; | ||
1050 | return_bi = wbi; | ||
1051 | } | ||
1052 | wbi = wbi2; | ||
1053 | } | ||
1054 | spin_unlock_irq(&conf->device_lock); | ||
1055 | } | ||
1056 | } | ||
1057 | } | ||
1058 | |||
1059 | /* Now we might consider reading some blocks, either to check/generate | ||
1060 | * parity, or to satisfy requests | ||
1061 | * or to load a block that is being partially written. | ||
1062 | */ | ||
1063 | if (to_read || non_overwrite || (syncing && (uptodate < disks))) { | ||
1064 | for (i=disks; i--;) { | ||
1065 | dev = &sh->dev[i]; | ||
1066 | if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | ||
1067 | (dev->toread || | ||
1068 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | ||
1069 | syncing || | ||
1070 | (failed && (sh->dev[failed_num].toread || | ||
1071 | (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))) | ||
1072 | ) | ||
1073 | ) { | ||
1074 | /* we would like to get this block, possibly | ||
1075 | * by computing it, but we might not be able to | ||
1076 | */ | ||
1077 | if (uptodate == disks-1) { | ||
1078 | PRINTK("Computing block %d\n", i); | ||
1079 | compute_block(sh, i); | ||
1080 | uptodate++; | ||
1081 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
1082 | set_bit(R5_LOCKED, &dev->flags); | ||
1083 | set_bit(R5_Wantread, &dev->flags); | ||
1084 | #if 0 | ||
1085 | /* if I am just reading this block and we don't have | ||
1086 | a failed drive, or any pending writes then sidestep the cache */ | ||
1087 | if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && | ||
1088 | ! syncing && !failed && !to_write) { | ||
1089 | sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; | ||
1090 | sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; | ||
1091 | } | ||
1092 | #endif | ||
1093 | locked++; | ||
1094 | PRINTK("Reading block %d (sync=%d)\n", | ||
1095 | i, syncing); | ||
1096 | if (syncing) | ||
1097 | md_sync_acct(conf->disks[i].rdev->bdev, | ||
1098 | STRIPE_SECTORS); | ||
1099 | } | ||
1100 | } | ||
1101 | } | ||
1102 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1103 | } | ||
1104 | |||
1105 | /* now to consider writing and what else, if anything should be read */ | ||
1106 | if (to_write) { | ||
1107 | int rmw=0, rcw=0; | ||
1108 | for (i=disks ; i--;) { | ||
1109 | /* would I have to read this buffer for read_modify_write */ | ||
1110 | dev = &sh->dev[i]; | ||
1111 | if ((dev->towrite || i == sh->pd_idx) && | ||
1112 | (!test_bit(R5_LOCKED, &dev->flags) | ||
1113 | #if 0 | ||
1114 | || sh->bh_page[i]!=bh->b_page | ||
1115 | #endif | ||
1116 | ) && | ||
1117 | !test_bit(R5_UPTODATE, &dev->flags)) { | ||
1118 | if (test_bit(R5_Insync, &dev->flags) | ||
1119 | /* && !(!mddev->insync && i == sh->pd_idx) */ | ||
1120 | ) | ||
1121 | rmw++; | ||
1122 | else rmw += 2*disks; /* cannot read it */ | ||
1123 | } | ||
1124 | /* Would I have to read this buffer for reconstruct_write */ | ||
1125 | if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && | ||
1126 | (!test_bit(R5_LOCKED, &dev->flags) | ||
1127 | #if 0 | ||
1128 | || sh->bh_page[i] != bh->b_page | ||
1129 | #endif | ||
1130 | ) && | ||
1131 | !test_bit(R5_UPTODATE, &dev->flags)) { | ||
1132 | if (test_bit(R5_Insync, &dev->flags)) rcw++; | ||
1133 | else rcw += 2*disks; | ||
1134 | } | ||
1135 | } | ||
1136 | PRINTK("for sector %llu, rmw=%d rcw=%d\n", | ||
1137 | (unsigned long long)sh->sector, rmw, rcw); | ||
1138 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1139 | if (rmw < rcw && rmw > 0) | ||
1140 | /* prefer read-modify-write, but need to get some data */ | ||
1141 | for (i=disks; i--;) { | ||
1142 | dev = &sh->dev[i]; | ||
1143 | if ((dev->towrite || i == sh->pd_idx) && | ||
1144 | !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | ||
1145 | test_bit(R5_Insync, &dev->flags)) { | ||
1146 | if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
1147 | { | ||
1148 | PRINTK("Read_old block %d for r-m-w\n", i); | ||
1149 | set_bit(R5_LOCKED, &dev->flags); | ||
1150 | set_bit(R5_Wantread, &dev->flags); | ||
1151 | locked++; | ||
1152 | } else { | ||
1153 | set_bit(STRIPE_DELAYED, &sh->state); | ||
1154 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1155 | } | ||
1156 | } | ||
1157 | } | ||
1158 | if (rcw <= rmw && rcw > 0) | ||
1159 | /* want reconstruct write, but need to get some data */ | ||
1160 | for (i=disks; i--;) { | ||
1161 | dev = &sh->dev[i]; | ||
1162 | if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && | ||
1163 | !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | ||
1164 | test_bit(R5_Insync, &dev->flags)) { | ||
1165 | if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
1166 | { | ||
1167 | PRINTK("Read_old block %d for Reconstruct\n", i); | ||
1168 | set_bit(R5_LOCKED, &dev->flags); | ||
1169 | set_bit(R5_Wantread, &dev->flags); | ||
1170 | locked++; | ||
1171 | } else { | ||
1172 | set_bit(STRIPE_DELAYED, &sh->state); | ||
1173 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1174 | } | ||
1175 | } | ||
1176 | } | ||
1177 | /* now if nothing is locked, and if we have enough data, we can start a write request */ | ||
1178 | if (locked == 0 && (rcw == 0 ||rmw == 0)) { | ||
1179 | PRINTK("Computing parity...\n"); | ||
1180 | compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); | ||
1181 | /* now every locked buffer is ready to be written */ | ||
1182 | for (i=disks; i--;) | ||
1183 | if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { | ||
1184 | PRINTK("Writing block %d\n", i); | ||
1185 | locked++; | ||
1186 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
1187 | if (!test_bit(R5_Insync, &sh->dev[i].flags) | ||
1188 | || (i==sh->pd_idx && failed == 0)) | ||
1189 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1190 | } | ||
1191 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
1192 | atomic_dec(&conf->preread_active_stripes); | ||
1193 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | ||
1194 | md_wakeup_thread(conf->mddev->thread); | ||
1195 | } | ||
1196 | } | ||
1197 | } | ||
1198 | |||
1199 | /* maybe we need to check and possibly fix the parity for this stripe | ||
1200 | * Any reads will already have been scheduled, so we just see if enough data | ||
1201 | * is available | ||
1202 | */ | ||
1203 | if (syncing && locked == 0 && | ||
1204 | !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) { | ||
1205 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1206 | if (failed == 0) { | ||
1207 | char *pagea; | ||
1208 | if (uptodate != disks) | ||
1209 | BUG(); | ||
1210 | compute_parity(sh, CHECK_PARITY); | ||
1211 | uptodate--; | ||
1212 | pagea = page_address(sh->dev[sh->pd_idx].page); | ||
1213 | if ((*(u32*)pagea) == 0 && | ||
1214 | !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) { | ||
1215 | /* parity is correct (on disc, not in buffer any more) */ | ||
1216 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1217 | } | ||
1218 | } | ||
1219 | if (!test_bit(STRIPE_INSYNC, &sh->state)) { | ||
1220 | if (failed==0) | ||
1221 | failed_num = sh->pd_idx; | ||
1222 | /* should be able to compute the missing block and write it to spare */ | ||
1223 | if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) { | ||
1224 | if (uptodate+1 != disks) | ||
1225 | BUG(); | ||
1226 | compute_block(sh, failed_num); | ||
1227 | uptodate++; | ||
1228 | } | ||
1229 | if (uptodate != disks) | ||
1230 | BUG(); | ||
1231 | dev = &sh->dev[failed_num]; | ||
1232 | set_bit(R5_LOCKED, &dev->flags); | ||
1233 | set_bit(R5_Wantwrite, &dev->flags); | ||
1234 | locked++; | ||
1235 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1236 | set_bit(R5_Syncio, &dev->flags); | ||
1237 | } | ||
1238 | } | ||
1239 | if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | ||
1240 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | ||
1241 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
1242 | } | ||
1243 | |||
1244 | spin_unlock(&sh->lock); | ||
1245 | |||
1246 | while ((bi=return_bi)) { | ||
1247 | int bytes = bi->bi_size; | ||
1248 | |||
1249 | return_bi = bi->bi_next; | ||
1250 | bi->bi_next = NULL; | ||
1251 | bi->bi_size = 0; | ||
1252 | bi->bi_end_io(bi, bytes, 0); | ||
1253 | } | ||
1254 | for (i=disks; i-- ;) { | ||
1255 | int rw; | ||
1256 | struct bio *bi; | ||
1257 | mdk_rdev_t *rdev; | ||
1258 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) | ||
1259 | rw = 1; | ||
1260 | else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) | ||
1261 | rw = 0; | ||
1262 | else | ||
1263 | continue; | ||
1264 | |||
1265 | bi = &sh->dev[i].req; | ||
1266 | |||
1267 | bi->bi_rw = rw; | ||
1268 | if (rw) | ||
1269 | bi->bi_end_io = raid5_end_write_request; | ||
1270 | else | ||
1271 | bi->bi_end_io = raid5_end_read_request; | ||
1272 | |||
1273 | rcu_read_lock(); | ||
1274 | rdev = conf->disks[i].rdev; | ||
1275 | if (rdev && rdev->faulty) | ||
1276 | rdev = NULL; | ||
1277 | if (rdev) | ||
1278 | atomic_inc(&rdev->nr_pending); | ||
1279 | rcu_read_unlock(); | ||
1280 | |||
1281 | if (rdev) { | ||
1282 | if (test_bit(R5_Syncio, &sh->dev[i].flags)) | ||
1283 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | ||
1284 | |||
1285 | bi->bi_bdev = rdev->bdev; | ||
1286 | PRINTK("for %llu schedule op %ld on disc %d\n", | ||
1287 | (unsigned long long)sh->sector, bi->bi_rw, i); | ||
1288 | atomic_inc(&sh->count); | ||
1289 | bi->bi_sector = sh->sector + rdev->data_offset; | ||
1290 | bi->bi_flags = 1 << BIO_UPTODATE; | ||
1291 | bi->bi_vcnt = 1; | ||
1292 | bi->bi_max_vecs = 1; | ||
1293 | bi->bi_idx = 0; | ||
1294 | bi->bi_io_vec = &sh->dev[i].vec; | ||
1295 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | ||
1296 | bi->bi_io_vec[0].bv_offset = 0; | ||
1297 | bi->bi_size = STRIPE_SIZE; | ||
1298 | bi->bi_next = NULL; | ||
1299 | generic_make_request(bi); | ||
1300 | } else { | ||
1301 | PRINTK("skip op %ld on disc %d for sector %llu\n", | ||
1302 | bi->bi_rw, i, (unsigned long long)sh->sector); | ||
1303 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
1304 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1305 | } | ||
1306 | } | ||
1307 | } | ||
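handle_stripe() picks between read-modify-write and reconstruct-write by counting how many blocks each strategy would first have to read (the rmw/rcw tallies above). The following is a deliberately stripped-down userspace model of that costing: it ignores R5_LOCKED, R5_Insync, the 2*disks penalty for unreadable members and the partial-write (non-R5_OVERWRITE) case, and the disk states shown are invented for illustration.

#include <stdio.h>

#define NDISKS 6

int main(void)
{
	int pd_idx = 5;                               /* parity disk for this stripe */
	/* illustrative state: blocks 0 and 1 have pending full-block writes;
	 * blocks 3, 4 and the parity block are already up to date in cache */
	int towrite[NDISKS]  = { 1, 1, 0, 0, 0, 0 };
	int uptodate[NDISKS] = { 0, 0, 0, 1, 1, 1 };
	int rmw = 0, rcw = 0, i;

	for (i = 0; i < NDISKS; i++) {
		/* read-modify-write must read the old contents of every block
		 * being written, plus the old parity, unless already cached */
		if ((towrite[i] || i == pd_idx) && !uptodate[i])
			rmw++;
		/* reconstruct-write must read every data block that is not
		 * being rewritten, unless already cached */
		if (i != pd_idx && !towrite[i] && !uptodate[i])
			rcw++;
	}
	printf("rmw would read %d block(s), rcw %d -> schedule reads for %s\n",
	       rmw, rcw,
	       (rmw < rcw && rmw > 0) ? "read-modify-write" : "reconstruct-write");
	return 0;
}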
1308 | |||
1309 | static inline void raid5_activate_delayed(raid5_conf_t *conf) | ||
1310 | { | ||
1311 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { | ||
1312 | while (!list_empty(&conf->delayed_list)) { | ||
1313 | struct list_head *l = conf->delayed_list.next; | ||
1314 | struct stripe_head *sh; | ||
1315 | sh = list_entry(l, struct stripe_head, lru); | ||
1316 | list_del_init(l); | ||
1317 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
1318 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
1319 | atomic_inc(&conf->preread_active_stripes); | ||
1320 | list_add_tail(&sh->lru, &conf->handle_list); | ||
1321 | } | ||
1322 | } | ||
1323 | } | ||
1324 | |||
1325 | static void unplug_slaves(mddev_t *mddev) | ||
1326 | { | ||
1327 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
1328 | int i; | ||
1329 | |||
1330 | rcu_read_lock(); | ||
1331 | for (i=0; i<mddev->raid_disks; i++) { | ||
1332 | mdk_rdev_t *rdev = conf->disks[i].rdev; | ||
1333 | if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { | ||
1334 | request_queue_t *r_queue = bdev_get_queue(rdev->bdev); | ||
1335 | |||
1336 | atomic_inc(&rdev->nr_pending); | ||
1337 | rcu_read_unlock(); | ||
1338 | |||
1339 | if (r_queue->unplug_fn) | ||
1340 | r_queue->unplug_fn(r_queue); | ||
1341 | |||
1342 | rdev_dec_pending(rdev, mddev); | ||
1343 | rcu_read_lock(); | ||
1344 | } | ||
1345 | } | ||
1346 | rcu_read_unlock(); | ||
1347 | } | ||
1348 | |||
1349 | static void raid5_unplug_device(request_queue_t *q) | ||
1350 | { | ||
1351 | mddev_t *mddev = q->queuedata; | ||
1352 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
1353 | unsigned long flags; | ||
1354 | |||
1355 | spin_lock_irqsave(&conf->device_lock, flags); | ||
1356 | |||
1357 | if (blk_remove_plug(q)) | ||
1358 | raid5_activate_delayed(conf); | ||
1359 | md_wakeup_thread(mddev->thread); | ||
1360 | |||
1361 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1362 | |||
1363 | unplug_slaves(mddev); | ||
1364 | } | ||
1365 | |||
1366 | static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk, | ||
1367 | sector_t *error_sector) | ||
1368 | { | ||
1369 | mddev_t *mddev = q->queuedata; | ||
1370 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
1371 | int i, ret = 0; | ||
1372 | |||
1373 | rcu_read_lock(); | ||
1374 | for (i=0; i<mddev->raid_disks && ret == 0; i++) { | ||
1375 | mdk_rdev_t *rdev = conf->disks[i].rdev; | ||
1376 | if (rdev && !rdev->faulty) { | ||
1377 | struct block_device *bdev = rdev->bdev; | ||
1378 | request_queue_t *r_queue = bdev_get_queue(bdev); | ||
1379 | |||
1380 | if (!r_queue->issue_flush_fn) | ||
1381 | ret = -EOPNOTSUPP; | ||
1382 | else { | ||
1383 | atomic_inc(&rdev->nr_pending); | ||
1384 | rcu_read_unlock(); | ||
1385 | ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, | ||
1386 | error_sector); | ||
1387 | rdev_dec_pending(rdev, mddev); | ||
1388 | rcu_read_lock(); | ||
1389 | } | ||
1390 | } | ||
1391 | } | ||
1392 | rcu_read_unlock(); | ||
1393 | return ret; | ||
1394 | } | ||
1395 | |||
1396 | static inline void raid5_plug_device(raid5_conf_t *conf) | ||
1397 | { | ||
1398 | spin_lock_irq(&conf->device_lock); | ||
1399 | blk_plug_device(conf->mddev->queue); | ||
1400 | spin_unlock_irq(&conf->device_lock); | ||
1401 | } | ||
1402 | |||
1403 | static int make_request (request_queue_t *q, struct bio * bi) | ||
1404 | { | ||
1405 | mddev_t *mddev = q->queuedata; | ||
1406 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
1407 | const unsigned int raid_disks = conf->raid_disks; | ||
1408 | const unsigned int data_disks = raid_disks - 1; | ||
1409 | unsigned int dd_idx, pd_idx; | ||
1410 | sector_t new_sector; | ||
1411 | sector_t logical_sector, last_sector; | ||
1412 | struct stripe_head *sh; | ||
1413 | |||
1414 | if (bio_data_dir(bi)==WRITE) { | ||
1415 | disk_stat_inc(mddev->gendisk, writes); | ||
1416 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi)); | ||
1417 | } else { | ||
1418 | disk_stat_inc(mddev->gendisk, reads); | ||
1419 | disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi)); | ||
1420 | } | ||
1421 | |||
1422 | logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); | ||
1423 | last_sector = bi->bi_sector + (bi->bi_size>>9); | ||
1424 | bi->bi_next = NULL; | ||
1425 | bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ | ||
1426 | if ( bio_data_dir(bi) == WRITE ) | ||
1427 | md_write_start(mddev); | ||
1428 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { | ||
1429 | DEFINE_WAIT(w); | ||
1430 | |||
1431 | new_sector = raid5_compute_sector(logical_sector, | ||
1432 | raid_disks, data_disks, &dd_idx, &pd_idx, conf); | ||
1433 | |||
1434 | PRINTK("raid5: make_request, sector %llu logical %llu\n", | ||
1435 | (unsigned long long)new_sector, | ||
1436 | (unsigned long long)logical_sector); | ||
1437 | |||
1438 | retry: | ||
1439 | prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); | ||
1440 | sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK)); | ||
1441 | if (sh) { | ||
1442 | if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { | ||
1443 | /* Add failed due to overlap. Flush everything | ||
1444 | * and wait a while | ||
1445 | */ | ||
1446 | raid5_unplug_device(mddev->queue); | ||
1447 | release_stripe(sh); | ||
1448 | schedule(); | ||
1449 | goto retry; | ||
1450 | } | ||
1451 | finish_wait(&conf->wait_for_overlap, &w); | ||
1452 | raid5_plug_device(conf); | ||
1453 | handle_stripe(sh); | ||
1454 | release_stripe(sh); | ||
1455 | |||
1456 | } else { | ||
1457 | /* cannot get stripe for read-ahead, just give-up */ | ||
1458 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1459 | finish_wait(&conf->wait_for_overlap, &w); | ||
1460 | break; | ||
1461 | } | ||
1462 | |||
1463 | } | ||
1464 | spin_lock_irq(&conf->device_lock); | ||
1465 | if (--bi->bi_phys_segments == 0) { | ||
1466 | int bytes = bi->bi_size; | ||
1467 | |||
1468 | if ( bio_data_dir(bi) == WRITE ) | ||
1469 | md_write_end(mddev); | ||
1470 | bi->bi_size = 0; | ||
1471 | bi->bi_end_io(bi, bytes, 0); | ||
1472 | } | ||
1473 | spin_unlock_irq(&conf->device_lock); | ||
1474 | return 0; | ||
1475 | } | ||
1476 | |||
1477 | /* FIXME go_faster isn't used */ | ||
1478 | static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) | ||
1479 | { | ||
1480 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | ||
1481 | struct stripe_head *sh; | ||
1482 | int sectors_per_chunk = conf->chunk_size >> 9; | ||
1483 | sector_t x; | ||
1484 | unsigned long stripe; | ||
1485 | int chunk_offset; | ||
1486 | int dd_idx, pd_idx; | ||
1487 | sector_t first_sector; | ||
1488 | int raid_disks = conf->raid_disks; | ||
1489 | int data_disks = raid_disks-1; | ||
1490 | |||
1491 | if (sector_nr >= mddev->size <<1) { | ||
1492 | /* just being told to finish up .. nothing much to do */ | ||
1493 | unplug_slaves(mddev); | ||
1494 | return 0; | ||
1495 | } | ||
1496 | /* if there is 1 or more failed drives and we are trying | ||
1497 | * to resync, then assert that we are finished, because there is | ||
1498 | * nothing we can do. | ||
1499 | */ | ||
1500 | if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | ||
1501 | int rv = (mddev->size << 1) - sector_nr; | ||
1502 | md_done_sync(mddev, rv, 1); | ||
1503 | return rv; | ||
1504 | } | ||
1505 | |||
1506 | x = sector_nr; | ||
1507 | chunk_offset = sector_div(x, sectors_per_chunk); | ||
1508 | stripe = x; | ||
1509 | BUG_ON(x != stripe); | ||
1510 | |||
1511 | first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk | ||
1512 | + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); | ||
1513 | sh = get_active_stripe(conf, sector_nr, pd_idx, 1); | ||
1514 | if (sh == NULL) { | ||
1515 | sh = get_active_stripe(conf, sector_nr, pd_idx, 0); | ||
1516 | /* make sure we don't swamp the stripe cache if someone else | ||
1517 | * is trying to get access | ||
1518 | */ | ||
1519 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1520 | schedule_timeout(1); | ||
1521 | } | ||
1522 | spin_lock(&sh->lock); | ||
1523 | set_bit(STRIPE_SYNCING, &sh->state); | ||
1524 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
1525 | spin_unlock(&sh->lock); | ||
1526 | |||
1527 | handle_stripe(sh); | ||
1528 | release_stripe(sh); | ||
1529 | |||
1530 | return STRIPE_SECTORS; | ||
1531 | } | ||
1532 | |||
1533 | /* | ||
1534 | * This is our raid5 kernel thread. | ||
1535 | * | ||
1536 | * We scan the hash table for stripes which can be handled now. | ||
1537 | * During the scan, completed stripes are saved for us by the interrupt | ||
1538 | * handler, so that they will not have to wait for our next wakeup. | ||
1539 | */ | ||
1540 | static void raid5d (mddev_t *mddev) | ||
1541 | { | ||
1542 | struct stripe_head *sh; | ||
1543 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
1544 | int handled; | ||
1545 | |||
1546 | PRINTK("+++ raid5d active\n"); | ||
1547 | |||
1548 | md_check_recovery(mddev); | ||
1549 | md_handle_safemode(mddev); | ||
1550 | |||
1551 | handled = 0; | ||
1552 | spin_lock_irq(&conf->device_lock); | ||
1553 | while (1) { | ||
1554 | struct list_head *first; | ||
1555 | |||
1556 | if (list_empty(&conf->handle_list) && | ||
1557 | atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && | ||
1558 | !blk_queue_plugged(mddev->queue) && | ||
1559 | !list_empty(&conf->delayed_list)) | ||
1560 | raid5_activate_delayed(conf); | ||
1561 | |||
1562 | if (list_empty(&conf->handle_list)) | ||
1563 | break; | ||
1564 | |||
1565 | first = conf->handle_list.next; | ||
1566 | sh = list_entry(first, struct stripe_head, lru); | ||
1567 | |||
1568 | list_del_init(first); | ||
1569 | atomic_inc(&sh->count); | ||
1570 | if (atomic_read(&sh->count)!= 1) | ||
1571 | BUG(); | ||
1572 | spin_unlock_irq(&conf->device_lock); | ||
1573 | |||
1574 | handled++; | ||
1575 | handle_stripe(sh); | ||
1576 | release_stripe(sh); | ||
1577 | |||
1578 | spin_lock_irq(&conf->device_lock); | ||
1579 | } | ||
1580 | PRINTK("%d stripes handled\n", handled); | ||
1581 | |||
1582 | spin_unlock_irq(&conf->device_lock); | ||
1583 | |||
1584 | unplug_slaves(mddev); | ||
1585 | |||
1586 | PRINTK("--- raid5d inactive\n"); | ||
1587 | } | ||
1588 | |||
1589 | static int run (mddev_t *mddev) | ||
1590 | { | ||
1591 | raid5_conf_t *conf; | ||
1592 | int raid_disk, memory; | ||
1593 | mdk_rdev_t *rdev; | ||
1594 | struct disk_info *disk; | ||
1595 | struct list_head *tmp; | ||
1596 | |||
1597 | if (mddev->level != 5 && mddev->level != 4) { | ||
1598 | printk("raid5: %s: raid level not set to 4/5 (%d)\n", mdname(mddev), mddev->level); | ||
1599 | return -EIO; | ||
1600 | } | ||
1601 | |||
1602 | mddev->private = kmalloc (sizeof (raid5_conf_t) | ||
1603 | + mddev->raid_disks * sizeof(struct disk_info), | ||
1604 | GFP_KERNEL); | ||
1605 | if ((conf = mddev->private) == NULL) | ||
1606 | goto abort; | ||
1607 | memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) ); | ||
1608 | conf->mddev = mddev; | ||
1609 | |||
1610 | if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) | ||
1611 | goto abort; | ||
1612 | memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); | ||
1613 | |||
1614 | spin_lock_init(&conf->device_lock); | ||
1615 | init_waitqueue_head(&conf->wait_for_stripe); | ||
1616 | init_waitqueue_head(&conf->wait_for_overlap); | ||
1617 | INIT_LIST_HEAD(&conf->handle_list); | ||
1618 | INIT_LIST_HEAD(&conf->delayed_list); | ||
1619 | INIT_LIST_HEAD(&conf->inactive_list); | ||
1620 | atomic_set(&conf->active_stripes, 0); | ||
1621 | atomic_set(&conf->preread_active_stripes, 0); | ||
1622 | |||
1623 | mddev->queue->unplug_fn = raid5_unplug_device; | ||
1624 | mddev->queue->issue_flush_fn = raid5_issue_flush; | ||
1625 | |||
1626 | PRINTK("raid5: run(%s) called.\n", mdname(mddev)); | ||
1627 | |||
1628 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1629 | raid_disk = rdev->raid_disk; | ||
1630 | if (raid_disk >= mddev->raid_disks | ||
1631 | || raid_disk < 0) | ||
1632 | continue; | ||
1633 | disk = conf->disks + raid_disk; | ||
1634 | |||
1635 | disk->rdev = rdev; | ||
1636 | |||
1637 | if (rdev->in_sync) { | ||
1638 | char b[BDEVNAME_SIZE]; | ||
1639 | printk(KERN_INFO "raid5: device %s operational as raid" | ||
1640 | " disk %d\n", bdevname(rdev->bdev,b), | ||
1641 | raid_disk); | ||
1642 | conf->working_disks++; | ||
1643 | } | ||
1644 | } | ||
1645 | |||
1646 | conf->raid_disks = mddev->raid_disks; | ||
1647 | /* | ||
1648 | * 0 for a fully functional array, 1 for a degraded array. | ||
1649 | */ | ||
1650 | mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; | ||
1651 | conf->mddev = mddev; | ||
1652 | conf->chunk_size = mddev->chunk_size; | ||
1653 | conf->level = mddev->level; | ||
1654 | conf->algorithm = mddev->layout; | ||
1655 | conf->max_nr_stripes = NR_STRIPES; | ||
1656 | |||
1657 | /* device size must be a multiple of chunk size */ | ||
1658 | mddev->size &= ~(mddev->chunk_size/1024 -1); | ||
1659 | |||
1660 | if (!conf->chunk_size || conf->chunk_size % 4) { | ||
1661 | printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", | ||
1662 | conf->chunk_size, mdname(mddev)); | ||
1663 | goto abort; | ||
1664 | } | ||
1665 | if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { | ||
1666 | printk(KERN_ERR | ||
1667 | "raid5: unsupported parity algorithm %d for %s\n", | ||
1668 | conf->algorithm, mdname(mddev)); | ||
1669 | goto abort; | ||
1670 | } | ||
1671 | if (mddev->degraded > 1) { | ||
1672 | printk(KERN_ERR "raid5: not enough operational devices for %s" | ||
1673 | " (%d/%d failed)\n", | ||
1674 | mdname(mddev), conf->failed_disks, conf->raid_disks); | ||
1675 | goto abort; | ||
1676 | } | ||
1677 | |||
1678 | if (mddev->degraded == 1 && | ||
1679 | mddev->recovery_cp != MaxSector) { | ||
1680 | printk(KERN_ERR | ||
1681 | "raid5: cannot start dirty degraded array for %s\n", | ||
1682 | mdname(mddev)); | ||
1683 | goto abort; | ||
1684 | } | ||
1685 | |||
1686 | { | ||
1687 | mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5"); | ||
1688 | if (!mddev->thread) { | ||
1689 | printk(KERN_ERR | ||
1690 | "raid5: couldn't allocate thread for %s\n", | ||
1691 | mdname(mddev)); | ||
1692 | goto abort; | ||
1693 | } | ||
1694 | } | ||
1695 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + | ||
1696 | conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; | ||
1697 | if (grow_stripes(conf, conf->max_nr_stripes)) { | ||
1698 | printk(KERN_ERR | ||
1699 | "raid5: couldn't allocate %dkB for buffers\n", memory); | ||
1700 | shrink_stripes(conf); | ||
1701 | md_unregister_thread(mddev->thread); | ||
1702 | goto abort; | ||
1703 | } else | ||
1704 | printk(KERN_INFO "raid5: allocated %dkB for %s\n", | ||
1705 | memory, mdname(mddev)); | ||
1706 | |||
1707 | if (mddev->degraded == 0) | ||
1708 | printk("raid5: raid level %d set %s active with %d out of %d" | ||
1709 | " devices, algorithm %d\n", conf->level, mdname(mddev), | ||
1710 | mddev->raid_disks-mddev->degraded, mddev->raid_disks, | ||
1711 | conf->algorithm); | ||
1712 | else | ||
1713 | printk(KERN_ALERT "raid5: raid level %d set %s active with %d" | ||
1714 | " out of %d devices, algorithm %d\n", conf->level, | ||
1715 | mdname(mddev), mddev->raid_disks - mddev->degraded, | ||
1716 | mddev->raid_disks, conf->algorithm); | ||
1717 | |||
1718 | print_raid5_conf(conf); | ||
1719 | |||
1720 | /* read-ahead size must cover two whole stripes, which is | ||
1721 | * 2 * (n-1) * chunksize where 'n' is the number of raid devices | ||
1722 | */ | ||
1723 | { | ||
1724 | int stripe = (mddev->raid_disks-1) * mddev->chunk_size | ||
1725 | / PAGE_CACHE_SIZE; | ||
1726 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | ||
1727 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | ||
1728 | } | ||
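As a worked example of the sizing above (assuming 4 KiB pages and a 64 KiB chunk, neither of which is fixed by this file): with 8 raid disks a full stripe covers (8-1) * 64 KiB / 4 KiB = 112 pages, so ra_pages is raised to at least 224 pages, i.e. 896 KiB of read-ahead.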
1729 | |||
1730 | /* Ok, everything is just fine now */ | ||
1731 | mddev->array_size = mddev->size * (mddev->raid_disks - 1); | ||
1732 | return 0; | ||
1733 | abort: | ||
1734 | if (conf) { | ||
1735 | print_raid5_conf(conf); | ||
1736 | if (conf->stripe_hashtbl) | ||
1737 | free_pages((unsigned long) conf->stripe_hashtbl, | ||
1738 | HASH_PAGES_ORDER); | ||
1739 | kfree(conf); | ||
1740 | } | ||
1741 | mddev->private = NULL; | ||
1742 | printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); | ||
1743 | return -EIO; | ||
1744 | } | ||
1745 | |||
1746 | |||
1747 | |||
1748 | static int stop (mddev_t *mddev) | ||
1749 | { | ||
1750 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | ||
1751 | |||
1752 | md_unregister_thread(mddev->thread); | ||
1753 | mddev->thread = NULL; | ||
1754 | shrink_stripes(conf); | ||
1755 | free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); | ||
1756 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
1757 | kfree(conf); | ||
1758 | mddev->private = NULL; | ||
1759 | return 0; | ||
1760 | } | ||
1761 | |||
1762 | #if RAID5_DEBUG | ||
1763 | static void print_sh (struct stripe_head *sh) | ||
1764 | { | ||
1765 | int i; | ||
1766 | |||
1767 | printk("sh %llu, pd_idx %d, state %ld.\n", | ||
1768 | (unsigned long long)sh->sector, sh->pd_idx, sh->state); | ||
1769 | printk("sh %llu, count %d.\n", | ||
1770 | (unsigned long long)sh->sector, atomic_read(&sh->count)); | ||
1771 | printk("sh %llu, ", (unsigned long long)sh->sector); | ||
1772 | for (i = 0; i < sh->raid_conf->raid_disks; i++) { | ||
1773 | printk("(cache%d: %p %ld) ", | ||
1774 | i, sh->dev[i].page, sh->dev[i].flags); | ||
1775 | } | ||
1776 | printk("\n"); | ||
1777 | } | ||
1778 | |||
1779 | static void printall (raid5_conf_t *conf) | ||
1780 | { | ||
1781 | struct stripe_head *sh; | ||
1782 | int i; | ||
1783 | |||
1784 | spin_lock_irq(&conf->device_lock); | ||
1785 | for (i = 0; i < NR_HASH; i++) { | ||
1786 | sh = conf->stripe_hashtbl[i]; | ||
1787 | for (; sh; sh = sh->hash_next) { | ||
1788 | if (sh->raid_conf != conf) | ||
1789 | continue; | ||
1790 | print_sh(sh); | ||
1791 | } | ||
1792 | } | ||
1793 | spin_unlock_irq(&conf->device_lock); | ||
1794 | } | ||
1795 | #endif | ||
1796 | |||
1797 | static void status (struct seq_file *seq, mddev_t *mddev) | ||
1798 | { | ||
1799 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | ||
1800 | int i; | ||
1801 | |||
1802 | seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); | ||
1803 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks); | ||
1804 | for (i = 0; i < conf->raid_disks; i++) | ||
1805 | seq_printf (seq, "%s", | ||
1806 | conf->disks[i].rdev && | ||
1807 | conf->disks[i].rdev->in_sync ? "U" : "_"); | ||
1808 | seq_printf (seq, "]"); | ||
1809 | #if RAID5_DEBUG | ||
1810 | #define D(x) \ | ||
1811 | seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x)) | ||
1812 | printall(conf); | ||
1813 | #endif | ||
1814 | } | ||
1815 | |||
1816 | static void print_raid5_conf (raid5_conf_t *conf) | ||
1817 | { | ||
1818 | int i; | ||
1819 | struct disk_info *tmp; | ||
1820 | |||
1821 | printk("RAID5 conf printout:\n"); | ||
1822 | if (!conf) { | ||
1823 | printk("(conf==NULL)\n"); | ||
1824 | return; | ||
1825 | } | ||
1826 | printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, | ||
1827 | conf->working_disks, conf->failed_disks); | ||
1828 | |||
1829 | for (i = 0; i < conf->raid_disks; i++) { | ||
1830 | char b[BDEVNAME_SIZE]; | ||
1831 | tmp = conf->disks + i; | ||
1832 | if (tmp->rdev) | ||
1833 | printk(" disk %d, o:%d, dev:%s\n", | ||
1834 | i, !tmp->rdev->faulty, | ||
1835 | bdevname(tmp->rdev->bdev,b)); | ||
1836 | } | ||
1837 | } | ||
1838 | |||
1839 | static int raid5_spare_active(mddev_t *mddev) | ||
1840 | { | ||
1841 | int i; | ||
1842 | raid5_conf_t *conf = mddev->private; | ||
1843 | struct disk_info *tmp; | ||
1844 | |||
1845 | for (i = 0; i < conf->raid_disks; i++) { | ||
1846 | tmp = conf->disks + i; | ||
1847 | if (tmp->rdev | ||
1848 | && !tmp->rdev->faulty | ||
1849 | && !tmp->rdev->in_sync) { | ||
1850 | mddev->degraded--; | ||
1851 | conf->failed_disks--; | ||
1852 | conf->working_disks++; | ||
1853 | tmp->rdev->in_sync = 1; | ||
1854 | } | ||
1855 | } | ||
1856 | print_raid5_conf(conf); | ||
1857 | return 0; | ||
1858 | } | ||
1859 | |||
1860 | static int raid5_remove_disk(mddev_t *mddev, int number) | ||
1861 | { | ||
1862 | raid5_conf_t *conf = mddev->private; | ||
1863 | int err = 0; | ||
1864 | mdk_rdev_t *rdev; | ||
1865 | struct disk_info *p = conf->disks + number; | ||
1866 | |||
1867 | print_raid5_conf(conf); | ||
1868 | rdev = p->rdev; | ||
1869 | if (rdev) { | ||
1870 | if (rdev->in_sync || | ||
1871 | atomic_read(&rdev->nr_pending)) { | ||
1872 | err = -EBUSY; | ||
1873 | goto abort; | ||
1874 | } | ||
1875 | p->rdev = NULL; | ||
1876 | synchronize_kernel(); | ||
1877 | if (atomic_read(&rdev->nr_pending)) { | ||
1878 | /* lost the race, try later */ | ||
1879 | err = -EBUSY; | ||
1880 | p->rdev = rdev; | ||
1881 | } | ||
1882 | } | ||
1883 | abort: | ||
1884 | |||
1885 | print_raid5_conf(conf); | ||
1886 | return err; | ||
1887 | } | ||
1888 | |||
1889 | static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | ||
1890 | { | ||
1891 | raid5_conf_t *conf = mddev->private; | ||
1892 | int found = 0; | ||
1893 | int disk; | ||
1894 | struct disk_info *p; | ||
1895 | |||
1896 | if (mddev->degraded > 1) | ||
1897 | /* no point adding a device */ | ||
1898 | return 0; | ||
1899 | |||
1900 | /* | ||
1901 | * find the disk ... | ||
1902 | */ | ||
1903 | for (disk=0; disk < mddev->raid_disks; disk++) | ||
1904 | if ((p=conf->disks + disk)->rdev == NULL) { | ||
1905 | rdev->in_sync = 0; | ||
1906 | rdev->raid_disk = disk; | ||
1907 | found = 1; | ||
1908 | p->rdev = rdev; | ||
1909 | break; | ||
1910 | } | ||
1911 | print_raid5_conf(conf); | ||
1912 | return found; | ||
1913 | } | ||
1914 | |||
1915 | static int raid5_resize(mddev_t *mddev, sector_t sectors) | ||
1916 | { | ||
1917 | /* no resync is happening, and there is enough space | ||
1918 | * on all devices, so we can resize. | ||
1919 | * We need to make sure resync covers any new space. | ||
1920 | * If the array is shrinking we should possibly wait until | ||
1921 | * any io in the removed space completes, but it hardly seems | ||
1922 | * worth it. | ||
1923 | */ | ||
1924 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); | ||
1925 | mddev->array_size = (sectors * (mddev->raid_disks-1))>>1; | ||
1926 | set_capacity(mddev->gendisk, mddev->array_size << 1); | ||
1927 | mddev->changed = 1; | ||
1928 | if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { | ||
1929 | mddev->recovery_cp = mddev->size << 1; | ||
1930 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
1931 | } | ||
1932 | mddev->size = sectors /2; | ||
1933 | return 0; | ||
1934 | } | ||
1935 | |||
1936 | static mdk_personality_t raid5_personality= | ||
1937 | { | ||
1938 | .name = "raid5", | ||
1939 | .owner = THIS_MODULE, | ||
1940 | .make_request = make_request, | ||
1941 | .run = run, | ||
1942 | .stop = stop, | ||
1943 | .status = status, | ||
1944 | .error_handler = error, | ||
1945 | .hot_add_disk = raid5_add_disk, | ||
1946 | .hot_remove_disk= raid5_remove_disk, | ||
1947 | .spare_active = raid5_spare_active, | ||
1948 | .sync_request = sync_request, | ||
1949 | .resize = raid5_resize, | ||
1950 | }; | ||
1951 | |||
1952 | static int __init raid5_init (void) | ||
1953 | { | ||
1954 | return register_md_personality (RAID5, &raid5_personality); | ||
1955 | } | ||
1956 | |||
1957 | static void raid5_exit (void) | ||
1958 | { | ||
1959 | unregister_md_personality (RAID5); | ||
1960 | } | ||
1961 | |||
1962 | module_init(raid5_init); | ||
1963 | module_exit(raid5_exit); | ||
1964 | MODULE_LICENSE("GPL"); | ||
1965 | MODULE_ALIAS("md-personality-4"); /* RAID5 */ | ||
diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h new file mode 100644 index 000000000000..f80ee6350edf --- /dev/null +++ b/drivers/md/raid6.h | |||
@@ -0,0 +1,135 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2003 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
8 | * Boston, MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | #ifndef LINUX_RAID_RAID6_H | ||
14 | #define LINUX_RAID_RAID6_H | ||
15 | |||
16 | #ifdef __KERNEL__ | ||
17 | |||
18 | /* Set to 1 to use kernel-wide empty_zero_page */ | ||
19 | #define RAID6_USE_EMPTY_ZERO_PAGE 0 | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/stddef.h> | ||
23 | #include <linux/compiler.h> | ||
24 | #include <linux/types.h> | ||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/errno.h> | ||
27 | #include <linux/mempool.h> | ||
28 | #include <linux/list.h> | ||
29 | #include <linux/vmalloc.h> | ||
30 | #include <linux/raid/md.h> | ||
31 | #include <linux/raid/raid5.h> | ||
32 | |||
33 | typedef raid5_conf_t raid6_conf_t; /* Same configuration */ | ||
34 | |||
35 | /* Additional compute_parity mode -- updates the parity w/o LOCKING */ | ||
36 | #define UPDATE_PARITY 4 | ||
37 | |||
38 | /* We need a pre-zeroed page... if we don't want to use the kernel-provided | ||
39 | one define it here */ | ||
40 | #if RAID6_USE_EMPTY_ZERO_PAGE | ||
41 | # define raid6_empty_zero_page empty_zero_page | ||
42 | #else | ||
43 | extern const char raid6_empty_zero_page[PAGE_SIZE]; | ||
44 | #endif | ||
45 | |||
46 | #else /* ! __KERNEL__ */ | ||
47 | /* Used for testing in user space */ | ||
48 | |||
49 | #include <errno.h> | ||
50 | #include <inttypes.h> | ||
51 | #include <limits.h> | ||
52 | #include <stddef.h> | ||
53 | #include <sys/mman.h> | ||
54 | #include <sys/types.h> | ||
55 | |||
56 | /* Not standard, but glibc defines it */ | ||
57 | #define BITS_PER_LONG __WORDSIZE | ||
58 | |||
59 | typedef uint8_t u8; | ||
60 | typedef uint16_t u16; | ||
61 | typedef uint32_t u32; | ||
62 | typedef uint64_t u64; | ||
63 | |||
64 | #ifndef PAGE_SIZE | ||
65 | # define PAGE_SIZE 4096 | ||
66 | #endif | ||
67 | extern const char raid6_empty_zero_page[PAGE_SIZE]; | ||
68 | |||
69 | #define __init | ||
70 | #define __exit | ||
71 | #define __attribute_const__ __attribute__((const)) | ||
72 | |||
73 | #define preempt_enable() | ||
74 | #define preempt_disable() | ||
75 | |||
76 | #endif /* __KERNEL__ */ | ||
77 | |||
78 | /* Routine choices */ | ||
79 | struct raid6_calls { | ||
80 | void (*gen_syndrome)(int, size_t, void **); | ||
81 | int (*valid)(void); /* Returns 1 if this routine set is usable */ | ||
82 | const char *name; /* Name of this routine set */ | ||
83 | int prefer; /* Has special performance attribute */ | ||
84 | }; | ||
85 | |||
86 | /* Selected algorithm */ | ||
87 | extern struct raid6_calls raid6_call; | ||
88 | |||
89 | /* Algorithm list */ | ||
90 | extern const struct raid6_calls * const raid6_algos[]; | ||
91 | int raid6_select_algo(void); | ||
92 | |||
93 | /* Return values from chk_syndrome */ | ||
94 | #define RAID6_OK 0 | ||
95 | #define RAID6_P_BAD 1 | ||
96 | #define RAID6_Q_BAD 2 | ||
97 | #define RAID6_PQ_BAD 3 | ||
98 | |||
99 | /* Galois field tables */ | ||
100 | extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); | ||
101 | extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); | ||
102 | extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); | ||
103 | extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); | ||
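A short note on what these tables hold (they are generated at build time by the mktables helper, so the values here are quoted for illustration): raid6_gfmul[a][b] is the product a*b in GF(2^8) over the polynomial 0x11d, raid6_gfexp[k] is {02}^k, raid6_gfinv[a] is the multiplicative inverse of a, and raid6_gfexi[k] is the inverse of {02}^k. For example, raid6_gfmul[0x02][0x80] == 0x1d, and raid6_gfmul[a][raid6_gfinv[a]] == 0x01 for any nonzero a.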
104 | |||
105 | /* Recovery routines */ | ||
106 | void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); | ||
107 | void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); | ||
108 | void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); | ||
109 | |||
110 | /* Some definitions to allow code to be compiled for testing in userspace */ | ||
111 | #ifndef __KERNEL__ | ||
112 | |||
113 | # define jiffies raid6_jiffies() | ||
114 | # define printk printf | ||
115 | # define GFP_KERNEL 0 | ||
116 | # define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0)) | ||
117 | # define free_pages(x,y) munmap((void *)(x), PAGE_SIZE << (y)) | ||
118 | |||
119 | static inline void cpu_relax(void) | ||
120 | { | ||
121 | /* Nothing */ | ||
122 | } | ||
123 | |||
124 | #undef HZ | ||
125 | #define HZ 1000 | ||
126 | static inline uint32_t raid6_jiffies(void) | ||
127 | { | ||
128 | struct timeval tv; | ||
129 | gettimeofday(&tv, NULL); | ||
130 | return tv.tv_sec*1000 + tv.tv_usec/1000; | ||
131 | } | ||
132 | |||
133 | #endif /* ! __KERNEL__ */ | ||
134 | |||
135 | #endif /* LINUX_RAID_RAID6_H */ | ||
diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c new file mode 100644 index 000000000000..acf386fc4b4f --- /dev/null +++ b/drivers/md/raid6algos.c | |||
@@ -0,0 +1,153 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
8 | * Boston, MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6algos.c | ||
15 | * | ||
16 | * Algorithm list and algorithm selection for RAID-6 | ||
17 | */ | ||
18 | |||
19 | #include "raid6.h" | ||
20 | #ifndef __KERNEL__ | ||
21 | #include <sys/mman.h> | ||
22 | #endif | ||
23 | |||
24 | struct raid6_calls raid6_call; | ||
25 | |||
26 | /* Various routine sets */ | ||
27 | extern const struct raid6_calls raid6_intx1; | ||
28 | extern const struct raid6_calls raid6_intx2; | ||
29 | extern const struct raid6_calls raid6_intx4; | ||
30 | extern const struct raid6_calls raid6_intx8; | ||
31 | extern const struct raid6_calls raid6_intx16; | ||
32 | extern const struct raid6_calls raid6_intx32; | ||
33 | extern const struct raid6_calls raid6_mmxx1; | ||
34 | extern const struct raid6_calls raid6_mmxx2; | ||
35 | extern const struct raid6_calls raid6_sse1x1; | ||
36 | extern const struct raid6_calls raid6_sse1x2; | ||
37 | extern const struct raid6_calls raid6_sse2x1; | ||
38 | extern const struct raid6_calls raid6_sse2x2; | ||
39 | extern const struct raid6_calls raid6_sse2x4; | ||
40 | extern const struct raid6_calls raid6_altivec1; | ||
41 | extern const struct raid6_calls raid6_altivec2; | ||
42 | extern const struct raid6_calls raid6_altivec4; | ||
43 | extern const struct raid6_calls raid6_altivec8; | ||
44 | |||
45 | const struct raid6_calls * const raid6_algos[] = { | ||
46 | &raid6_intx1, | ||
47 | &raid6_intx2, | ||
48 | &raid6_intx4, | ||
49 | &raid6_intx8, | ||
50 | #if defined(__ia64__) | ||
51 | &raid6_intx16, | ||
52 | &raid6_intx32, | ||
53 | #endif | ||
54 | #if defined(__i386__) | ||
55 | &raid6_mmxx1, | ||
56 | &raid6_mmxx2, | ||
57 | &raid6_sse1x1, | ||
58 | &raid6_sse1x2, | ||
59 | &raid6_sse2x1, | ||
60 | &raid6_sse2x2, | ||
61 | #endif | ||
62 | #if defined(__x86_64__) | ||
63 | &raid6_sse2x1, | ||
64 | &raid6_sse2x2, | ||
65 | &raid6_sse2x4, | ||
66 | #endif | ||
67 | #ifdef CONFIG_ALTIVEC | ||
68 | &raid6_altivec1, | ||
69 | &raid6_altivec2, | ||
70 | &raid6_altivec4, | ||
71 | &raid6_altivec8, | ||
72 | #endif | ||
73 | NULL | ||
74 | }; | ||
75 | |||
76 | #ifdef __KERNEL__ | ||
77 | #define RAID6_TIME_JIFFIES_LG2 4 | ||
78 | #else | ||
79 | /* Need more time to be stable in userspace */ | ||
80 | #define RAID6_TIME_JIFFIES_LG2 9 | ||
81 | #endif | ||
82 | |||
83 | /* Try to pick the best algorithm */ | ||
84 | /* This code uses the gfmul table as a convenient data set to abuse */ | ||
85 | |||
86 | int __init raid6_select_algo(void) | ||
87 | { | ||
88 | const struct raid6_calls * const * algo; | ||
89 | const struct raid6_calls * best; | ||
90 | char *syndromes; | ||
91 | void *dptrs[(65536/PAGE_SIZE)+2]; | ||
92 | int i, disks; | ||
93 | unsigned long perf, bestperf; | ||
94 | int bestprefer; | ||
95 | unsigned long j0, j1; | ||
96 | |||
97 | disks = (65536/PAGE_SIZE)+2; | ||
98 | for ( i = 0 ; i < disks-2 ; i++ ) { | ||
99 | dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; | ||
100 | } | ||
101 | |||
102 | /* Normal code - use a 2-page allocation to avoid D$ conflict */ | ||
103 | syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); | ||
104 | |||
105 | if ( !syndromes ) { | ||
106 | printk("raid6: Yikes! No memory available.\n"); | ||
107 | return -ENOMEM; | ||
108 | } | ||
109 | |||
110 | dptrs[disks-2] = syndromes; | ||
111 | dptrs[disks-1] = syndromes + PAGE_SIZE; | ||
112 | |||
113 | bestperf = 0; bestprefer = 0; best = NULL; | ||
114 | |||
115 | for ( algo = raid6_algos ; *algo ; algo++ ) { | ||
116 | if ( !(*algo)->valid || (*algo)->valid() ) { | ||
117 | perf = 0; | ||
118 | |||
119 | preempt_disable(); | ||
120 | j0 = jiffies; | ||
121 | while ( (j1 = jiffies) == j0 ) | ||
122 | cpu_relax(); | ||
123 | while ( (jiffies-j1) < (1 << RAID6_TIME_JIFFIES_LG2) ) { | ||
124 | (*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); | ||
125 | perf++; | ||
126 | } | ||
127 | preempt_enable(); | ||
128 | |||
129 | if ( (*algo)->prefer > bestprefer || | ||
130 | ((*algo)->prefer == bestprefer && | ||
131 | perf > bestperf) ) { | ||
132 | best = *algo; | ||
133 | bestprefer = best->prefer; | ||
134 | bestperf = perf; | ||
135 | } | ||
136 | printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, | ||
137 | (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); | ||
138 | } | ||
139 | } | ||
140 | |||
141 | if ( best ) { | ||
142 | printk("raid6: using algorithm %s (%ld MB/s)\n", | ||
143 | best->name, | ||
144 | (bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); | ||
145 | raid6_call = *best; /* only install a routine set if one was found */ | ||
146 | } else | ||
147 | printk("raid6: Yikes! No algorithm found!\n"); | ||
148 | |||
149 | |||
150 | free_pages((unsigned long)syndromes, 1); | ||
151 | |||
152 | return best ? 0 : -EINVAL; | ||
153 | } | ||
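For reference, the MB/s figure printed above follows directly from the benchmark loop: each gen_syndrome() call walks (disks-2) data pages, which by construction is 65536 bytes regardless of PAGE_SIZE, and the timed window is 2^RAID6_TIME_JIFFIES_LG2 jiffies, i.e. 2^LG2 / HZ seconds. Throughput in MB/s is therefore perf * 2^16 / ((2^LG2 / HZ) * 2^20) = (perf * HZ) >> (20 - 16 + RAID6_TIME_JIFFIES_LG2), which is exactly the shift used in the printk.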
diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc new file mode 100644 index 000000000000..1de8f030eee0 --- /dev/null +++ b/drivers/md/raid6altivec.uc | |||
@@ -0,0 +1,122 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
8 | * Boston, MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6altivec$#.c | ||
15 | * | ||
16 | * $#-way unrolled portable integer math RAID-6 instruction set | ||
17 | * | ||
18 | * This file is postprocessed using unroll.pl | ||
19 | * | ||
20 | * <benh> hpa: in process, | ||
21 | * you can just "steal" the vec unit with enable_kernel_altivec() (but | ||
23 | * bracket this with preempt_disable/enable or in a lock) | ||
23 | */ | ||
24 | |||
25 | #include "raid6.h" | ||
26 | |||
27 | #ifdef CONFIG_ALTIVEC | ||
28 | |||
29 | #include <altivec.h> | ||
30 | #include <asm/system.h> | ||
31 | #include <asm/cputable.h> | ||
32 | |||
33 | /* | ||
34 | * This is the C data type to use | ||
35 | */ | ||
36 | |||
37 | typedef vector unsigned char unative_t; | ||
38 | |||
39 | #define NBYTES(x) ((vector unsigned char) {x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) | ||
40 | #define NSIZE sizeof(unative_t) | ||
41 | |||
42 | /* | ||
43 | * The SHLBYTE() operation shifts each byte left by 1, *not* | ||
44 | * rolling over into the next byte | ||
45 | */ | ||
46 | static inline __attribute_const__ unative_t SHLBYTE(unative_t v) | ||
47 | { | ||
48 | return vec_add(v,v); | ||
49 | } | ||
50 | |||
51 | /* | ||
52 | * The MASK() operation returns 0xFF in any byte for which the high | ||
53 | * bit is 1, 0x00 for any byte for which the high bit is 0. | ||
54 | */ | ||
55 | static inline __attribute_const__ unative_t MASK(unative_t v) | ||
56 | { | ||
57 | unative_t zv = NBYTES(0); | ||
58 | |||
59 | /* vec_cmpgt returns a vector bool char; thus the need for the cast */ | ||
60 | return (unative_t)vec_cmpgt(zv, v); | ||
61 | } | ||
62 | |||
63 | |||
64 | /* This is noinline to make damned sure that gcc doesn't move any of the | ||
65 | Altivec code around the enable/disable code */ | ||
66 | static void noinline | ||
67 | raid6_altivec$#_gen_syndrome_real(int disks, size_t bytes, void **ptrs) | ||
68 | { | ||
69 | u8 **dptr = (u8 **)ptrs; | ||
70 | u8 *p, *q; | ||
71 | int d, z, z0; | ||
72 | |||
73 | unative_t wd$$, wq$$, wp$$, w1$$, w2$$; | ||
74 | unative_t x1d = NBYTES(0x1d); | ||
75 | |||
76 | z0 = disks - 3; /* Highest data disk */ | ||
77 | p = dptr[z0+1]; /* XOR parity */ | ||
78 | q = dptr[z0+2]; /* RS syndrome */ | ||
79 | |||
80 | for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { | ||
81 | wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; | ||
82 | for ( z = z0-1 ; z >= 0 ; z-- ) { | ||
83 | wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; | ||
84 | wp$$ = vec_xor(wp$$, wd$$); | ||
85 | w2$$ = MASK(wq$$); | ||
86 | w1$$ = SHLBYTE(wq$$); | ||
87 | w2$$ = vec_and(w2$$, x1d); | ||
88 | w1$$ = vec_xor(w1$$, w2$$); | ||
89 | wq$$ = vec_xor(w1$$, wd$$); | ||
90 | } | ||
91 | *(unative_t *)&p[d+NSIZE*$$] = wp$$; | ||
92 | *(unative_t *)&q[d+NSIZE*$$] = wq$$; | ||
93 | } | ||
94 | } | ||
95 | |||
96 | static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
97 | { | ||
98 | preempt_disable(); | ||
99 | enable_kernel_altivec(); | ||
100 | |||
101 | raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs); | ||
102 | |||
103 | preempt_enable(); | ||
104 | } | ||
105 | |||
106 | int raid6_have_altivec(void); | ||
107 | #if $# == 1 | ||
108 | int raid6_have_altivec(void) | ||
109 | { | ||
110 | /* This assumes either all CPUs have Altivec or none does */ | ||
111 | return cpu_has_feature(CPU_FTR_ALTIVEC); | ||
112 | } | ||
113 | #endif | ||
114 | |||
115 | const struct raid6_calls raid6_altivec$# = { | ||
116 | raid6_altivec$#_gen_syndrome, | ||
117 | raid6_have_altivec, | ||
118 | "altivecx$#", | ||
119 | 0 | ||
120 | }; | ||
121 | |||
122 | #endif /* CONFIG_ALTIVEC */ | ||
diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc new file mode 100644 index 000000000000..ad004cee0e26 --- /dev/null +++ b/drivers/md/raid6int.uc | |||
@@ -0,0 +1,117 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 59 Temple Place Ste 330, | ||
8 | * Boston, MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6int$#.c | ||
15 | * | ||
16 | * $#-way unrolled portable integer math RAID-6 instruction set | ||
17 | * | ||
18 | * This file is postprocessed using unroll.pl | ||
19 | */ | ||
20 | |||
21 | #include "raid6.h" | ||
22 | |||
23 | /* | ||
24 | * This is the C data type to use | ||
25 | */ | ||
26 | |||
27 | /* Change this from BITS_PER_LONG if there is something better... */ | ||
28 | #if BITS_PER_LONG == 64 | ||
29 | # define NBYTES(x) ((x) * 0x0101010101010101UL) | ||
30 | # define NSIZE 8 | ||
31 | # define NSHIFT 3 | ||
32 | # define NSTRING "64" | ||
33 | typedef u64 unative_t; | ||
34 | #else | ||
35 | # define NBYTES(x) ((x) * 0x01010101U) | ||
36 | # define NSIZE 4 | ||
37 | # define NSHIFT 2 | ||
38 | # define NSTRING "32" | ||
39 | typedef u32 unative_t; | ||
40 | #endif | ||
41 | |||
42 | |||
43 | |||
44 | /* | ||
45 | * IA-64 wants insane amounts of unrolling. On other architectures that | ||
46 | * is just a waste of space. | ||
47 | */ | ||
48 | #if ($# <= 8) || defined(__ia64__) | ||
49 | |||
50 | |||
51 | /* | ||
52 | * These sub-operations are separate inlines since they can sometimes be | ||
53 | * specially optimized using architecture-specific hacks. | ||
54 | */ | ||
55 | |||
56 | /* | ||
57 | * The SHLBYTE() operation shifts each byte left by 1, *not* | ||
58 | * rolling over into the next byte | ||
59 | */ | ||
60 | static inline __attribute_const__ unative_t SHLBYTE(unative_t v) | ||
61 | { | ||
62 | unative_t vv; | ||
63 | |||
64 | vv = (v << 1) & NBYTES(0xfe); | ||
65 | return vv; | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * The MASK() operation returns 0xFF in any byte for which the high | ||
70 | * bit is 1, 0x00 for any byte for which the high bit is 0. | ||
71 | */ | ||
72 | static inline __attribute_const__ unative_t MASK(unative_t v) | ||
73 | { | ||
74 | unative_t vv; | ||
75 | |||
76 | vv = v & NBYTES(0x80); | ||
77 | vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */ | ||
78 | return vv; | ||
79 | } | ||
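Taken together, MASK() and SHLBYTE() implement a byte-wise multiply-by-2 in GF(2^8) over the RAID-6 polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11d): shift every byte left, then fold the constant 0x1d back into exactly those bytes whose high bit was set, which is what the syndrome loop below does with w2$$ &= NBYTES(0x1d). A minimal single-byte sketch of the same step (gf256_mul2 is a hypothetical helper, not part of the driver):

	#include <stdint.h>

	/* Multiply one GF(2^8) element by {02} modulo 0x11d -- the scalar
	 * equivalent of the SHLBYTE/MASK combination used in the loop below. */
	static inline uint8_t gf256_mul2(uint8_t v)
	{
		return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0x00));
	}

	/* e.g. gf256_mul2(0x80) == 0x1d and gf256_mul2(0x1d) == 0x3a */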
80 | |||
81 | |||
82 | static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
83 | { | ||
84 | u8 **dptr = (u8 **)ptrs; | ||
85 | u8 *p, *q; | ||
86 | int d, z, z0; | ||
87 | |||
88 | unative_t wd$$, wq$$, wp$$, w1$$, w2$$; | ||
89 | |||
90 | z0 = disks - 3; /* Highest data disk */ | ||
91 | p = dptr[z0+1]; /* XOR parity */ | ||
92 | q = dptr[z0+2]; /* RS syndrome */ | ||
93 | |||
94 | for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { | ||
95 | wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; | ||
96 | for ( z = z0-1 ; z >= 0 ; z-- ) { | ||
97 | wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; | ||
98 | wp$$ ^= wd$$; | ||
99 | w2$$ = MASK(wq$$); | ||
100 | w1$$ = SHLBYTE(wq$$); | ||
101 | w2$$ &= NBYTES(0x1d); | ||
102 | w1$$ ^= w2$$; | ||
103 | wq$$ = w1$$ ^ wd$$; | ||
104 | } | ||
105 | *(unative_t *)&p[d+NSIZE*$$] = wp$$; | ||
106 | *(unative_t *)&q[d+NSIZE*$$] = wq$$; | ||
107 | } | ||
108 | } | ||
109 | |||
110 | const struct raid6_calls raid6_intx$# = { | ||
111 | raid6_int$#_gen_syndrome, | ||
112 | NULL, /* always valid */ | ||
113 | "int" NSTRING "x$#", | ||
114 | 0 | ||
115 | }; | ||
116 | |||
117 | #endif | ||
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c new file mode 100644 index 000000000000..7e30ab29691a --- /dev/null +++ b/drivers/md/raid6main.c | |||
@@ -0,0 +1,2136 @@ | |||
1 | /* | ||
2 | * raid6main.c : Multiple Devices driver for Linux | ||
3 | * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman | ||
4 | * Copyright (C) 1999, 2000 Ingo Molnar | ||
5 | * Copyright (C) 2002, 2003 H. Peter Anvin | ||
6 | * | ||
7 | * RAID-6 management functions. This code is derived from raid5.c. | ||
8 | * Last merge from raid5.c bkcvs version 1.79 (kernel 2.6.1). | ||
9 | * | ||
10 | * Thanks to Penguin Computing for making the RAID-6 development possible | ||
11 | * by donating a test server! | ||
12 | * | ||
13 | * This program is free software; you can redistribute it and/or modify | ||
14 | * it under the terms of the GNU General Public License as published by | ||
15 | * the Free Software Foundation; either version 2, or (at your option) | ||
16 | * any later version. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
20 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
21 | */ | ||
22 | |||
23 | |||
24 | #include <linux/config.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/slab.h> | ||
27 | #include <linux/highmem.h> | ||
28 | #include <linux/bitops.h> | ||
29 | #include <asm/atomic.h> | ||
30 | #include "raid6.h" | ||
31 | |||
32 | /* | ||
33 | * Stripe cache | ||
34 | */ | ||
35 | |||
36 | #define NR_STRIPES 256 | ||
37 | #define STRIPE_SIZE PAGE_SIZE | ||
38 | #define STRIPE_SHIFT (PAGE_SHIFT - 9) | ||
39 | #define STRIPE_SECTORS (STRIPE_SIZE>>9) | ||
40 | #define IO_THRESHOLD 1 | ||
41 | #define HASH_PAGES 1 | ||
42 | #define HASH_PAGES_ORDER 0 | ||
43 | #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) | ||
44 | #define HASH_MASK (NR_HASH - 1) | ||
45 | |||
46 | #define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]) | ||
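For concreteness (assuming 4 KiB pages and 8-byte pointers, neither of which this file fixes): HASH_PAGES is a single page, so NR_HASH is 4096 / 8 = 512 buckets and HASH_MASK is 0x1ff; STRIPE_SHIFT is 12 - 9 = 3, so stripe_hash() discards the three low sector bits and consecutive stripes, which begin STRIPE_SECTORS = 8 sectors apart, land in consecutive buckets.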
47 | |||
48 | /* bio's attached to a stripe+device for I/O are linked together in bi_sector | ||
49 | * order without overlap. There may be several bio's per stripe+device, and | ||
50 | * a bio could span several devices. | ||
51 | * When walking this list for a particular stripe+device, we must never proceed | ||
52 | * beyond a bio that extends past this device, as the next bio might no longer | ||
53 | * be valid. | ||
54 | * This macro is used to determine the 'next' bio in the list, given the sector | ||
55 | * of the current stripe+device | ||
56 | */ | ||
57 | #define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL) | ||
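A small usage sketch for the macro above (dev here is a hypothetical struct r5dev whose ->toread chain is being drained; the real driver does the equivalent inside handle_stripe, under conf->device_lock):

	struct bio *rbi;

	/* visit every queued read bio that overlaps this stripe+device;
	 * r5_next_bio() returns NULL at the first bio extending past it */
	for (rbi = dev->toread; rbi; rbi = r5_next_bio(rbi, dev->sector))
		copy_data(0, rbi, dev->page, dev->sector);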
58 | /* | ||
59 | * The following can be used to debug the driver | ||
60 | */ | ||
61 | #define RAID6_DEBUG 0 /* Extremely verbose printk */ | ||
62 | #define RAID6_PARANOIA 1 /* Check spinlocks */ | ||
63 | #define RAID6_DUMPSTATE 0 /* Include stripe cache state in /proc/mdstat */ | ||
64 | #if RAID6_PARANOIA && defined(CONFIG_SMP) | ||
65 | # define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock) | ||
66 | #else | ||
67 | # define CHECK_DEVLOCK() | ||
68 | #endif | ||
69 | |||
70 | #define PRINTK(x...) ((void)(RAID6_DEBUG && printk(KERN_DEBUG x))) | ||
71 | #if RAID6_DEBUG | ||
72 | #undef inline | ||
73 | #undef __inline__ | ||
74 | #define inline | ||
75 | #define __inline__ | ||
76 | #endif | ||
77 | |||
78 | #if !RAID6_USE_EMPTY_ZERO_PAGE | ||
79 | /* In .bss so it's zeroed */ | ||
80 | const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); | ||
81 | #endif | ||
82 | |||
83 | static inline int raid6_next_disk(int disk, int raid_disks) | ||
84 | { | ||
85 | disk++; | ||
86 | return (disk < raid_disks) ? disk : 0; | ||
87 | } | ||
88 | |||
89 | static void print_raid6_conf (raid6_conf_t *conf); | ||
90 | |||
91 | static inline void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh) | ||
92 | { | ||
93 | if (atomic_dec_and_test(&sh->count)) { | ||
94 | if (!list_empty(&sh->lru)) | ||
95 | BUG(); | ||
96 | if (atomic_read(&conf->active_stripes)==0) | ||
97 | BUG(); | ||
98 | if (test_bit(STRIPE_HANDLE, &sh->state)) { | ||
99 | if (test_bit(STRIPE_DELAYED, &sh->state)) | ||
100 | list_add_tail(&sh->lru, &conf->delayed_list); | ||
101 | else | ||
102 | list_add_tail(&sh->lru, &conf->handle_list); | ||
103 | md_wakeup_thread(conf->mddev->thread); | ||
104 | } else { | ||
105 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
106 | atomic_dec(&conf->preread_active_stripes); | ||
107 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | ||
108 | md_wakeup_thread(conf->mddev->thread); | ||
109 | } | ||
110 | list_add_tail(&sh->lru, &conf->inactive_list); | ||
111 | atomic_dec(&conf->active_stripes); | ||
112 | if (!conf->inactive_blocked || | ||
113 | atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4)) | ||
114 | wake_up(&conf->wait_for_stripe); | ||
115 | } | ||
116 | } | ||
117 | } | ||
118 | static void release_stripe(struct stripe_head *sh) | ||
119 | { | ||
120 | raid6_conf_t *conf = sh->raid_conf; | ||
121 | unsigned long flags; | ||
122 | |||
123 | spin_lock_irqsave(&conf->device_lock, flags); | ||
124 | __release_stripe(conf, sh); | ||
125 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
126 | } | ||
127 | |||
128 | static void remove_hash(struct stripe_head *sh) | ||
129 | { | ||
130 | PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); | ||
131 | |||
132 | if (sh->hash_pprev) { | ||
133 | if (sh->hash_next) | ||
134 | sh->hash_next->hash_pprev = sh->hash_pprev; | ||
135 | *sh->hash_pprev = sh->hash_next; | ||
136 | sh->hash_pprev = NULL; | ||
137 | } | ||
138 | } | ||
139 | |||
140 | static __inline__ void insert_hash(raid6_conf_t *conf, struct stripe_head *sh) | ||
141 | { | ||
142 | struct stripe_head **shp = &stripe_hash(conf, sh->sector); | ||
143 | |||
144 | PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); | ||
145 | |||
146 | CHECK_DEVLOCK(); | ||
147 | if ((sh->hash_next = *shp) != NULL) | ||
148 | (*shp)->hash_pprev = &sh->hash_next; | ||
149 | *shp = sh; | ||
150 | sh->hash_pprev = shp; | ||
151 | } | ||
152 | |||
153 | |||
154 | /* find an idle stripe, make sure it is unhashed, and return it. */ | ||
155 | static struct stripe_head *get_free_stripe(raid6_conf_t *conf) | ||
156 | { | ||
157 | struct stripe_head *sh = NULL; | ||
158 | struct list_head *first; | ||
159 | |||
160 | CHECK_DEVLOCK(); | ||
161 | if (list_empty(&conf->inactive_list)) | ||
162 | goto out; | ||
163 | first = conf->inactive_list.next; | ||
164 | sh = list_entry(first, struct stripe_head, lru); | ||
165 | list_del_init(first); | ||
166 | remove_hash(sh); | ||
167 | atomic_inc(&conf->active_stripes); | ||
168 | out: | ||
169 | return sh; | ||
170 | } | ||
171 | |||
172 | static void shrink_buffers(struct stripe_head *sh, int num) | ||
173 | { | ||
174 | struct page *p; | ||
175 | int i; | ||
176 | |||
177 | for (i=0; i<num ; i++) { | ||
178 | p = sh->dev[i].page; | ||
179 | if (!p) | ||
180 | continue; | ||
181 | sh->dev[i].page = NULL; | ||
182 | page_cache_release(p); | ||
183 | } | ||
184 | } | ||
185 | |||
186 | static int grow_buffers(struct stripe_head *sh, int num) | ||
187 | { | ||
188 | int i; | ||
189 | |||
190 | for (i=0; i<num; i++) { | ||
191 | struct page *page; | ||
192 | |||
193 | if (!(page = alloc_page(GFP_KERNEL))) { | ||
194 | return 1; | ||
195 | } | ||
196 | sh->dev[i].page = page; | ||
197 | } | ||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static void raid6_build_block (struct stripe_head *sh, int i); | ||
202 | |||
203 | static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) | ||
204 | { | ||
205 | raid6_conf_t *conf = sh->raid_conf; | ||
206 | int disks = conf->raid_disks, i; | ||
207 | |||
208 | if (atomic_read(&sh->count) != 0) | ||
209 | BUG(); | ||
210 | if (test_bit(STRIPE_HANDLE, &sh->state)) | ||
211 | BUG(); | ||
212 | |||
213 | CHECK_DEVLOCK(); | ||
214 | PRINTK("init_stripe called, stripe %llu\n", | ||
215 | (unsigned long long)sh->sector); | ||
216 | |||
217 | remove_hash(sh); | ||
218 | |||
219 | sh->sector = sector; | ||
220 | sh->pd_idx = pd_idx; | ||
221 | sh->state = 0; | ||
222 | |||
223 | for (i=disks; i--; ) { | ||
224 | struct r5dev *dev = &sh->dev[i]; | ||
225 | |||
226 | if (dev->toread || dev->towrite || dev->written || | ||
227 | test_bit(R5_LOCKED, &dev->flags)) { | ||
228 | PRINTK("sector=%llx i=%d %p %p %p %d\n", | ||
229 | (unsigned long long)sh->sector, i, dev->toread, | ||
230 | dev->towrite, dev->written, | ||
231 | test_bit(R5_LOCKED, &dev->flags)); | ||
232 | BUG(); | ||
233 | } | ||
234 | dev->flags = 0; | ||
235 | raid6_build_block(sh, i); | ||
236 | } | ||
237 | insert_hash(conf, sh); | ||
238 | } | ||
239 | |||
240 | static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector) | ||
241 | { | ||
242 | struct stripe_head *sh; | ||
243 | |||
244 | CHECK_DEVLOCK(); | ||
245 | PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); | ||
246 | for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) | ||
247 | if (sh->sector == sector) | ||
248 | return sh; | ||
249 | PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); | ||
250 | return NULL; | ||
251 | } | ||
252 | |||
253 | static void unplug_slaves(mddev_t *mddev); | ||
254 | |||
255 | static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector, | ||
256 | int pd_idx, int noblock) | ||
257 | { | ||
258 | struct stripe_head *sh; | ||
259 | |||
260 | PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector); | ||
261 | |||
262 | spin_lock_irq(&conf->device_lock); | ||
263 | |||
264 | do { | ||
265 | sh = __find_stripe(conf, sector); | ||
266 | if (!sh) { | ||
267 | if (!conf->inactive_blocked) | ||
268 | sh = get_free_stripe(conf); | ||
269 | if (noblock && sh == NULL) | ||
270 | break; | ||
271 | if (!sh) { | ||
272 | conf->inactive_blocked = 1; | ||
273 | wait_event_lock_irq(conf->wait_for_stripe, | ||
274 | !list_empty(&conf->inactive_list) && | ||
275 | (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4) | ||
276 | || !conf->inactive_blocked), | ||
277 | conf->device_lock, | ||
278 | unplug_slaves(conf->mddev); | ||
279 | ); | ||
280 | conf->inactive_blocked = 0; | ||
281 | } else | ||
282 | init_stripe(sh, sector, pd_idx); | ||
283 | } else { | ||
284 | if (atomic_read(&sh->count)) { | ||
285 | if (!list_empty(&sh->lru)) | ||
286 | BUG(); | ||
287 | } else { | ||
288 | if (!test_bit(STRIPE_HANDLE, &sh->state)) | ||
289 | atomic_inc(&conf->active_stripes); | ||
290 | if (list_empty(&sh->lru)) | ||
291 | BUG(); | ||
292 | list_del_init(&sh->lru); | ||
293 | } | ||
294 | } | ||
295 | } while (sh == NULL); | ||
296 | |||
297 | if (sh) | ||
298 | atomic_inc(&sh->count); | ||
299 | |||
300 | spin_unlock_irq(&conf->device_lock); | ||
301 | return sh; | ||
302 | } | ||
303 | |||
304 | static int grow_stripes(raid6_conf_t *conf, int num) | ||
305 | { | ||
306 | struct stripe_head *sh; | ||
307 | kmem_cache_t *sc; | ||
308 | int devs = conf->raid_disks; | ||
309 | |||
310 | sprintf(conf->cache_name, "raid6/%s", mdname(conf->mddev)); | ||
311 | |||
312 | sc = kmem_cache_create(conf->cache_name, | ||
313 | sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), | ||
314 | 0, 0, NULL, NULL); | ||
315 | if (!sc) | ||
316 | return 1; | ||
317 | conf->slab_cache = sc; | ||
318 | while (num--) { | ||
319 | sh = kmem_cache_alloc(sc, GFP_KERNEL); | ||
320 | if (!sh) | ||
321 | return 1; | ||
322 | memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev)); | ||
323 | sh->raid_conf = conf; | ||
324 | spin_lock_init(&sh->lock); | ||
325 | |||
326 | if (grow_buffers(sh, conf->raid_disks)) { | ||
327 | shrink_buffers(sh, conf->raid_disks); | ||
328 | kmem_cache_free(sc, sh); | ||
329 | return 1; | ||
330 | } | ||
331 | /* we just created an active stripe so... */ | ||
332 | atomic_set(&sh->count, 1); | ||
333 | atomic_inc(&conf->active_stripes); | ||
334 | INIT_LIST_HEAD(&sh->lru); | ||
335 | release_stripe(sh); | ||
336 | } | ||
337 | return 0; | ||
338 | } | ||
339 | |||
340 | static void shrink_stripes(raid6_conf_t *conf) | ||
341 | { | ||
342 | struct stripe_head *sh; | ||
343 | |||
344 | while (1) { | ||
345 | spin_lock_irq(&conf->device_lock); | ||
346 | sh = get_free_stripe(conf); | ||
347 | spin_unlock_irq(&conf->device_lock); | ||
348 | if (!sh) | ||
349 | break; | ||
350 | if (atomic_read(&sh->count)) | ||
351 | BUG(); | ||
352 | shrink_buffers(sh, conf->raid_disks); | ||
353 | kmem_cache_free(conf->slab_cache, sh); | ||
354 | atomic_dec(&conf->active_stripes); | ||
355 | } | ||
356 | kmem_cache_destroy(conf->slab_cache); | ||
357 | conf->slab_cache = NULL; | ||
358 | } | ||
359 | |||
360 | static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done, | ||
361 | int error) | ||
362 | { | ||
363 | struct stripe_head *sh = bi->bi_private; | ||
364 | raid6_conf_t *conf = sh->raid_conf; | ||
365 | int disks = conf->raid_disks, i; | ||
366 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | ||
367 | |||
368 | if (bi->bi_size) | ||
369 | return 1; | ||
370 | |||
371 | for (i=0 ; i<disks; i++) | ||
372 | if (bi == &sh->dev[i].req) | ||
373 | break; | ||
374 | |||
375 | PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n", | ||
376 | (unsigned long long)sh->sector, i, atomic_read(&sh->count), | ||
377 | uptodate); | ||
378 | if (i == disks) { | ||
379 | BUG(); | ||
380 | return 0; | ||
381 | } | ||
382 | |||
383 | if (uptodate) { | ||
384 | #if 0 | ||
385 | struct bio *bio; | ||
386 | unsigned long flags; | ||
387 | spin_lock_irqsave(&conf->device_lock, flags); | ||
388 | /* we can return a buffer if we bypassed the cache or | ||
389 | * if the top buffer is not in highmem. If there are | ||
390 | * multiple buffers, leave the extra work to | ||
391 | * handle_stripe | ||
392 | */ | ||
393 | buffer = sh->bh_read[i]; | ||
394 | if (buffer && | ||
395 | (!PageHighMem(buffer->b_page) | ||
396 | || buffer->b_page == bh->b_page ) | ||
397 | ) { | ||
398 | sh->bh_read[i] = buffer->b_reqnext; | ||
399 | buffer->b_reqnext = NULL; | ||
400 | } else | ||
401 | buffer = NULL; | ||
402 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
403 | if (sh->bh_page[i]==bh->b_page) | ||
404 | set_buffer_uptodate(bh); | ||
405 | if (buffer) { | ||
406 | if (buffer->b_page != bh->b_page) | ||
407 | memcpy(buffer->b_data, bh->b_data, bh->b_size); | ||
408 | buffer->b_end_io(buffer, 1); | ||
409 | } | ||
410 | #else | ||
411 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
412 | #endif | ||
413 | } else { | ||
414 | md_error(conf->mddev, conf->disks[i].rdev); | ||
415 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
416 | } | ||
417 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | ||
418 | #if 0 | ||
419 | /* must restore b_page before unlocking buffer... */ | ||
420 | if (sh->bh_page[i] != bh->b_page) { | ||
421 | bh->b_page = sh->bh_page[i]; | ||
422 | bh->b_data = page_address(bh->b_page); | ||
423 | clear_buffer_uptodate(bh); | ||
424 | } | ||
425 | #endif | ||
426 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
427 | set_bit(STRIPE_HANDLE, &sh->state); | ||
428 | release_stripe(sh); | ||
429 | return 0; | ||
430 | } | ||
431 | |||
432 | static int raid6_end_write_request (struct bio *bi, unsigned int bytes_done, | ||
433 | int error) | ||
434 | { | ||
435 | struct stripe_head *sh = bi->bi_private; | ||
436 | raid6_conf_t *conf = sh->raid_conf; | ||
437 | int disks = conf->raid_disks, i; | ||
438 | unsigned long flags; | ||
439 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | ||
440 | |||
441 | if (bi->bi_size) | ||
442 | return 1; | ||
443 | |||
444 | for (i=0 ; i<disks; i++) | ||
445 | if (bi == &sh->dev[i].req) | ||
446 | break; | ||
447 | |||
448 | PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n", | ||
449 | (unsigned long long)sh->sector, i, atomic_read(&sh->count), | ||
450 | uptodate); | ||
451 | if (i == disks) { | ||
452 | BUG(); | ||
453 | return 0; | ||
454 | } | ||
455 | |||
456 | spin_lock_irqsave(&conf->device_lock, flags); | ||
457 | if (!uptodate) | ||
458 | md_error(conf->mddev, conf->disks[i].rdev); | ||
459 | |||
460 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | ||
461 | |||
462 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
463 | set_bit(STRIPE_HANDLE, &sh->state); | ||
464 | __release_stripe(conf, sh); | ||
465 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
466 | return 0; | ||
467 | } | ||
468 | |||
469 | |||
470 | static sector_t compute_blocknr(struct stripe_head *sh, int i); | ||
471 | |||
472 | static void raid6_build_block (struct stripe_head *sh, int i) | ||
473 | { | ||
474 | struct r5dev *dev = &sh->dev[i]; | ||
475 | int pd_idx = sh->pd_idx; | ||
476 | int qd_idx = raid6_next_disk(pd_idx, sh->raid_conf->raid_disks); | ||
477 | |||
478 | bio_init(&dev->req); | ||
479 | dev->req.bi_io_vec = &dev->vec; | ||
480 | dev->req.bi_vcnt++; | ||
481 | dev->req.bi_max_vecs++; | ||
482 | dev->vec.bv_page = dev->page; | ||
483 | dev->vec.bv_len = STRIPE_SIZE; | ||
484 | dev->vec.bv_offset = 0; | ||
485 | |||
486 | dev->req.bi_sector = sh->sector; | ||
487 | dev->req.bi_private = sh; | ||
488 | |||
489 | dev->flags = 0; | ||
490 | if (i != pd_idx && i != qd_idx) | ||
491 | dev->sector = compute_blocknr(sh, i); | ||
492 | } | ||
493 | |||
494 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | ||
495 | { | ||
496 | char b[BDEVNAME_SIZE]; | ||
497 | raid6_conf_t *conf = (raid6_conf_t *) mddev->private; | ||
498 | PRINTK("raid6: error called\n"); | ||
499 | |||
500 | if (!rdev->faulty) { | ||
501 | mddev->sb_dirty = 1; | ||
502 | if (rdev->in_sync) { | ||
503 | conf->working_disks--; | ||
504 | mddev->degraded++; | ||
505 | conf->failed_disks++; | ||
506 | rdev->in_sync = 0; | ||
507 | /* | ||
508 | * if recovery was running, make sure it aborts. | ||
509 | */ | ||
510 | set_bit(MD_RECOVERY_ERR, &mddev->recovery); | ||
511 | } | ||
512 | rdev->faulty = 1; | ||
513 | printk (KERN_ALERT | ||
514 | "raid6: Disk failure on %s, disabling device." | ||
515 | " Operation continuing on %d devices\n", | ||
516 | bdevname(rdev->bdev,b), conf->working_disks); | ||
517 | } | ||
518 | } | ||
519 | |||
520 | /* | ||
521 | * Input: a 'big' sector number, | ||
522 | * Output: index of the data and parity disk, and the sector # in them. | ||
523 | */ | ||
524 | static sector_t raid6_compute_sector(sector_t r_sector, unsigned int raid_disks, | ||
525 | unsigned int data_disks, unsigned int * dd_idx, | ||
526 | unsigned int * pd_idx, raid6_conf_t *conf) | ||
527 | { | ||
528 | long stripe; | ||
529 | unsigned long chunk_number; | ||
530 | unsigned int chunk_offset; | ||
531 | sector_t new_sector; | ||
532 | int sectors_per_chunk = conf->chunk_size >> 9; | ||
533 | |||
534 | /* First compute the information on this sector */ | ||
535 | |||
536 | /* | ||
537 | * Compute the chunk number and the sector offset inside the chunk | ||
538 | */ | ||
539 | chunk_offset = sector_div(r_sector, sectors_per_chunk); | ||
540 | chunk_number = r_sector; | ||
541 | if ( r_sector != chunk_number ) { | ||
542 | printk(KERN_CRIT "raid6: ERROR: r_sector = %llu, chunk_number = %lu\n", | ||
543 | (unsigned long long)r_sector, (unsigned long)chunk_number); | ||
544 | BUG(); | ||
545 | } | ||
546 | |||
547 | /* | ||
548 | * Compute the stripe number | ||
549 | */ | ||
550 | stripe = chunk_number / data_disks; | ||
551 | |||
552 | /* | ||
553 | * Compute the data disk and parity disk indexes inside the stripe | ||
554 | */ | ||
555 | *dd_idx = chunk_number % data_disks; | ||
556 | |||
557 | /* | ||
558 | * Select the parity disk based on the user selected algorithm. | ||
559 | */ | ||
560 | |||
561 | /**** FIX THIS ****/ | ||
562 | switch (conf->algorithm) { | ||
563 | case ALGORITHM_LEFT_ASYMMETRIC: | ||
564 | *pd_idx = raid_disks - 1 - (stripe % raid_disks); | ||
565 | if (*pd_idx == raid_disks-1) | ||
566 | (*dd_idx)++; /* Q D D D P */ | ||
567 | else if (*dd_idx >= *pd_idx) | ||
568 | (*dd_idx) += 2; /* D D P Q D */ | ||
569 | break; | ||
570 | case ALGORITHM_RIGHT_ASYMMETRIC: | ||
571 | *pd_idx = stripe % raid_disks; | ||
572 | if (*pd_idx == raid_disks-1) | ||
573 | (*dd_idx)++; /* Q D D D P */ | ||
574 | else if (*dd_idx >= *pd_idx) | ||
575 | (*dd_idx) += 2; /* D D P Q D */ | ||
576 | break; | ||
577 | case ALGORITHM_LEFT_SYMMETRIC: | ||
578 | *pd_idx = raid_disks - 1 - (stripe % raid_disks); | ||
579 | *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; | ||
580 | break; | ||
581 | case ALGORITHM_RIGHT_SYMMETRIC: | ||
582 | *pd_idx = stripe % raid_disks; | ||
583 | *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; | ||
584 | break; | ||
585 | default: | ||
586 | printk (KERN_CRIT "raid6: unsupported algorithm %d\n", | ||
587 | conf->algorithm); | ||
588 | } | ||
589 | |||
590 | PRINTK("raid6: chunk_number = %lu, pd_idx = %u, dd_idx = %u\n", | ||
591 | chunk_number, *pd_idx, *dd_idx); | ||
592 | |||
593 | /* | ||
594 | * Finally, compute the new sector number | ||
595 | */ | ||
596 | new_sector = (sector_t) stripe * sectors_per_chunk + chunk_offset; | ||
597 | return new_sector; | ||
598 | } | ||
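A worked example of the mapping above (illustrative numbers only): with 6 raid disks (4 data + P + Q) and the left-symmetric layout, stripe 0 gives pd_idx = 6-1-0 = 5, so P lands on disk 5 and Q, the next disk with wrap-around, on disk 0; the four data chunks map to disks (5+2+d) % 6, i.e. 1 through 4, for a per-disk order of Q D D D D P. On stripe 1, pd_idx drops to 4 and the order becomes D D D D P Q, so P rotates one disk to the left per stripe with Q always on the disk that follows it.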
599 | |||
600 | |||
601 | static sector_t compute_blocknr(struct stripe_head *sh, int i) | ||
602 | { | ||
603 | raid6_conf_t *conf = sh->raid_conf; | ||
604 | int raid_disks = conf->raid_disks, data_disks = raid_disks - 2; | ||
605 | sector_t new_sector = sh->sector, check; | ||
606 | int sectors_per_chunk = conf->chunk_size >> 9; | ||
607 | sector_t stripe; | ||
608 | int chunk_offset; | ||
609 | int chunk_number, dummy1, dummy2, dd_idx = i; | ||
610 | sector_t r_sector; | ||
611 | int i0 = i; | ||
612 | |||
613 | chunk_offset = sector_div(new_sector, sectors_per_chunk); | ||
614 | stripe = new_sector; | ||
615 | if ( new_sector != stripe ) { | ||
616 | printk(KERN_CRIT "raid6: ERROR: new_sector = %llu, stripe = %lu\n", | ||
617 | (unsigned long long)new_sector, (unsigned long)stripe); | ||
618 | BUG(); | ||
619 | } | ||
620 | |||
621 | switch (conf->algorithm) { | ||
622 | case ALGORITHM_LEFT_ASYMMETRIC: | ||
623 | case ALGORITHM_RIGHT_ASYMMETRIC: | ||
624 | if (sh->pd_idx == raid_disks-1) | ||
625 | i--; /* Q D D D P */ | ||
626 | else if (i > sh->pd_idx) | ||
627 | i -= 2; /* D D P Q D */ | ||
628 | break; | ||
629 | case ALGORITHM_LEFT_SYMMETRIC: | ||
630 | case ALGORITHM_RIGHT_SYMMETRIC: | ||
631 | if (sh->pd_idx == raid_disks-1) | ||
632 | i--; /* Q D D D P */ | ||
633 | else { | ||
634 | /* D D P Q D */ | ||
635 | if (i < sh->pd_idx) | ||
636 | i += raid_disks; | ||
637 | i -= (sh->pd_idx + 2); | ||
638 | } | ||
639 | break; | ||
640 | default: | ||
641 | printk (KERN_CRIT "raid6: unsupported algorithm %d\n", | ||
642 | conf->algorithm); | ||
643 | } | ||
644 | |||
645 | PRINTK("raid6: compute_blocknr: pd_idx = %u, i0 = %u, i = %u\n", sh->pd_idx, i0, i); | ||
646 | |||
647 | chunk_number = stripe * data_disks + i; | ||
648 | r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; | ||
649 | |||
650 | check = raid6_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); | ||
651 | if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { | ||
652 | printk(KERN_CRIT "raid6: compute_blocknr: map not correct\n"); | ||
653 | return 0; | ||
654 | } | ||
655 | return r_sector; | ||
656 | } | ||
657 | |||
658 | |||
659 | |||
660 | /* | ||
661 | * Copy data between a page in the stripe cache and one or more bion. | ||
662 | * The page could align with the middle of the bio, or there could be | ||
663 | * several bion, each with several bio_vecs, which cover part of the page. | ||
664 | * Multiple bion are linked together on bi_next. There may be extras | ||
665 | * at the end of this list. We ignore them. | ||
666 | */ | ||
667 | static void copy_data(int frombio, struct bio *bio, | ||
668 | struct page *page, | ||
669 | sector_t sector) | ||
670 | { | ||
671 | char *pa = page_address(page); | ||
672 | struct bio_vec *bvl; | ||
673 | int i; | ||
674 | int page_offset; | ||
675 | |||
676 | if (bio->bi_sector >= sector) | ||
677 | page_offset = (signed)(bio->bi_sector - sector) * 512; | ||
678 | else | ||
679 | page_offset = (signed)(sector - bio->bi_sector) * -512; | ||
680 | bio_for_each_segment(bvl, bio, i) { | ||
681 | int len = bio_iovec_idx(bio,i)->bv_len; | ||
682 | int clen; | ||
683 | int b_offset = 0; | ||
684 | |||
685 | if (page_offset < 0) { | ||
686 | b_offset = -page_offset; | ||
687 | page_offset += b_offset; | ||
688 | len -= b_offset; | ||
689 | } | ||
690 | |||
691 | if (len > 0 && page_offset + len > STRIPE_SIZE) | ||
692 | clen = STRIPE_SIZE - page_offset; | ||
693 | else clen = len; | ||
694 | |||
695 | if (clen > 0) { | ||
696 | char *ba = __bio_kmap_atomic(bio, i, KM_USER0); | ||
697 | if (frombio) | ||
698 | memcpy(pa+page_offset, ba+b_offset, clen); | ||
699 | else | ||
700 | memcpy(ba+b_offset, pa+page_offset, clen); | ||
701 | __bio_kunmap_atomic(ba, KM_USER0); | ||
702 | } | ||
703 | if (clen < len) /* hit end of page */ | ||
704 | break; | ||
705 | page_offset += len; | ||
706 | } | ||
707 | } | ||
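/*
 * Editor's note: illustrative user-space sketch, not part of the driver.
 * It reproduces only the offset/clipping arithmetic of copy_data() above for
 * a single segment; the buffer sizes and the function name are invented.
 */
#include <stdio.h>
#include <string.h>

#define PAGE_BYTES 4096                 /* stands in for STRIPE_SIZE */

/* Copy the part of a byte segment starting at seg_sector that overlaps the
 * page window starting at page_sector.  Returns the number of bytes copied. */
static int copy_overlap(int frombuf, char *page, unsigned long long page_sector,
                        char *seg, unsigned long long seg_sector, int seg_len)
{
        long page_offset;
        int b_offset = 0, clen;

        if (seg_sector >= page_sector)
                page_offset = (long)(seg_sector - page_sector) * 512;
        else
                page_offset = -(long)(page_sector - seg_sector) * 512;

        if (page_offset < 0) {          /* segment starts before the page */
                b_offset = -page_offset;
                page_offset = 0;
                seg_len -= b_offset;
        }
        if (seg_len <= 0 || page_offset >= PAGE_BYTES)
                return 0;               /* no overlap at all */
        clen = seg_len;
        if (page_offset + clen > PAGE_BYTES)    /* clip at the end of the page */
                clen = PAGE_BYTES - page_offset;

        if (frombuf)
                memcpy(page + page_offset, seg + b_offset, clen);
        else
                memcpy(seg + b_offset, page + page_offset, clen);
        return clen;
}

int main(void)
{
        static char page[PAGE_BYTES], seg[8192];

        /* segment starts two sectors (1KiB) before the page window */
        printf("copied %d bytes\n", copy_overlap(1, page, 10, seg, 8, sizeof(seg)));
        return 0;
}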
708 | |||
709 | #define check_xor() do { \ | ||
710 | if (count == MAX_XOR_BLOCKS) { \ | ||
711 | xor_block(count, STRIPE_SIZE, ptr); \ | ||
712 | count = 1; \ | ||
713 | } \ | ||
714 | } while(0) | ||
715 | |||
716 | /* Compute P and Q syndromes */ | ||
717 | static void compute_parity(struct stripe_head *sh, int method) | ||
718 | { | ||
719 | raid6_conf_t *conf = sh->raid_conf; | ||
720 | int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count; | ||
721 | struct bio *chosen; | ||
722 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ | ||
723 | void *ptrs[disks]; | ||
724 | |||
725 | qd_idx = raid6_next_disk(pd_idx, disks); | ||
726 | d0_idx = raid6_next_disk(qd_idx, disks); | ||
727 | |||
728 | PRINTK("compute_parity, stripe %llu, method %d\n", | ||
729 | (unsigned long long)sh->sector, method); | ||
730 | |||
731 | switch(method) { | ||
732 | case READ_MODIFY_WRITE: | ||
733 | BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */ | ||
734 | case RECONSTRUCT_WRITE: | ||
735 | for (i= disks; i-- ;) | ||
736 | if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) { | ||
737 | chosen = sh->dev[i].towrite; | ||
738 | sh->dev[i].towrite = NULL; | ||
739 | |||
740 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
741 | wake_up(&conf->wait_for_overlap); | ||
742 | |||
743 | if (sh->dev[i].written) BUG(); | ||
744 | sh->dev[i].written = chosen; | ||
745 | } | ||
746 | break; | ||
747 | case CHECK_PARITY: | ||
748 | BUG(); /* Not implemented yet */ | ||
749 | } | ||
750 | |||
751 | for (i = disks; i--;) | ||
752 | if (sh->dev[i].written) { | ||
753 | sector_t sector = sh->dev[i].sector; | ||
754 | struct bio *wbi = sh->dev[i].written; | ||
755 | while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { | ||
756 | copy_data(1, wbi, sh->dev[i].page, sector); | ||
757 | wbi = r5_next_bio(wbi, sector); | ||
758 | } | ||
759 | |||
760 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
761 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
762 | } | ||
763 | |||
764 | // switch(method) { | ||
765 | // case RECONSTRUCT_WRITE: | ||
766 | // case CHECK_PARITY: | ||
767 | // case UPDATE_PARITY: | ||
768 | /* Note that unlike RAID-5, the ordering of the disks matters greatly. */ | ||
769 | /* FIX: Is this ordering of drives even remotely optimal? */ | ||
770 | count = 0; | ||
771 | i = d0_idx; | ||
772 | do { | ||
773 | ptrs[count++] = page_address(sh->dev[i].page); | ||
774 | if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) | ||
775 | printk("block %d/%d not uptodate on parity calc\n", i,count); | ||
776 | i = raid6_next_disk(i, disks); | ||
777 | } while ( i != d0_idx ); | ||
778 | // break; | ||
779 | // } | ||
780 | |||
781 | raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); | ||
782 | |||
783 | switch(method) { | ||
784 | case RECONSTRUCT_WRITE: | ||
785 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
786 | set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); | ||
787 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); | ||
788 | set_bit(R5_LOCKED, &sh->dev[qd_idx].flags); | ||
789 | break; | ||
790 | case UPDATE_PARITY: | ||
791 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
792 | set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); | ||
793 | break; | ||
794 | } | ||
795 | } | ||
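/*
 * Editor's note: illustrative user-space sketch, not part of the driver.
 * It computes P and Q with the same definition the raid6 library's
 * gen_syndrome() uses: P is the XOR of the data blocks, Q is the sum of
 * 2^z * D_z over GF(2^8) with the 0x11d polynomial, accumulated by Horner's
 * rule from the highest data disk down.  The block count and sizes below
 * are invented for the example.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* multiply a GF(2^8) element by 2 (polynomial x^8+x^4+x^3+x^2+1) */
static uint8_t gf_mul2(uint8_t v)
{
        return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

/* data[0..ndata-1] are the data blocks; p and q receive the syndromes */
static void gen_syndrome(int ndata, size_t bytes,
                         const uint8_t **data, uint8_t *p, uint8_t *q)
{
        for (size_t i = 0; i < bytes; i++) {
                uint8_t wp = data[ndata - 1][i];
                uint8_t wq = wp;
                for (int z = ndata - 2; z >= 0; z--) {
                        wp ^= data[z][i];               /* running XOR -> P */
                        wq = gf_mul2(wq) ^ data[z][i];  /* Horner step -> Q */
                }
                p[i] = wp;
                q[i] = wq;
        }
}

int main(void)
{
        uint8_t d0[4] = {1, 2, 3, 4}, d1[4] = {5, 6, 7, 8}, d2[4] = {9, 10, 11, 12};
        const uint8_t *data[3] = {d0, d1, d2};
        uint8_t p[4], q[4];

        gen_syndrome(3, sizeof(p), data, p, q);
        printf("P[0]=%02x Q[0]=%02x\n", p[0], q[0]);
        return 0;
}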
796 | |||
797 | /* Compute one missing block */ | ||
798 | static void compute_block_1(struct stripe_head *sh, int dd_idx) | ||
799 | { | ||
800 | raid6_conf_t *conf = sh->raid_conf; | ||
801 | int i, count, disks = conf->raid_disks; | ||
802 | void *ptr[MAX_XOR_BLOCKS], *p; | ||
803 | int pd_idx = sh->pd_idx; | ||
804 | int qd_idx = raid6_next_disk(pd_idx, disks); | ||
805 | |||
806 | PRINTK("compute_block_1, stripe %llu, idx %d\n", | ||
807 | (unsigned long long)sh->sector, dd_idx); | ||
808 | |||
809 | if ( dd_idx == qd_idx ) { | ||
810 | /* We're actually computing the Q drive */ | ||
811 | compute_parity(sh, UPDATE_PARITY); | ||
812 | } else { | ||
813 | ptr[0] = page_address(sh->dev[dd_idx].page); | ||
814 | memset(ptr[0], 0, STRIPE_SIZE); | ||
815 | count = 1; | ||
816 | for (i = disks ; i--; ) { | ||
817 | if (i == dd_idx || i == qd_idx) | ||
818 | continue; | ||
819 | p = page_address(sh->dev[i].page); | ||
820 | if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) | ||
821 | ptr[count++] = p; | ||
822 | else | ||
823 | printk("compute_block() %d, stripe %llu, %d" | ||
824 | " not present\n", dd_idx, | ||
825 | (unsigned long long)sh->sector, i); | ||
826 | |||
827 | check_xor(); | ||
828 | } | ||
829 | if (count != 1) | ||
830 | xor_block(count, STRIPE_SIZE, ptr); | ||
831 | set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | ||
832 | } | ||
833 | } | ||
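/*
 * Editor's note: illustrative user-space sketch, not part of the driver.
 * compute_block_1() above rebuilds one missing data block as the XOR of P
 * and all surviving data blocks (Q is skipped); this is the same idea on
 * small byte arrays with invented sizes and values.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static void xor_rebuild(uint8_t *dst, const uint8_t **others, int n, size_t bytes)
{
        for (size_t i = 0; i < bytes; i++) {
                uint8_t v = 0;
                for (int k = 0; k < n; k++)
                        v ^= others[k][i];      /* P ^ surviving data blocks */
                dst[i] = v;
        }
}

int main(void)
{
        uint8_t d0[2] = {1, 2}, d1[2] = {3, 4}, p[2] = {1 ^ 3 ^ 5, 2 ^ 4 ^ 6};
        uint8_t lost[2];                        /* the missing block was {5, 6} */
        const uint8_t *others[3] = {d0, d1, p};

        xor_rebuild(lost, others, 3, sizeof(lost));
        printf("recovered %u %u\n", lost[0], lost[1]);
        return 0;
}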
834 | |||
835 | /* Compute two missing blocks */ | ||
836 | static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) | ||
837 | { | ||
838 | raid6_conf_t *conf = sh->raid_conf; | ||
839 | int i, count, disks = conf->raid_disks; | ||
840 | int pd_idx = sh->pd_idx; | ||
841 | int qd_idx = raid6_next_disk(pd_idx, disks); | ||
842 | int d0_idx = raid6_next_disk(qd_idx, disks); | ||
843 | int faila, failb; | ||
844 | |||
845 | /* faila and failb are disk numbers relative to d0_idx */ | ||
846 | /* pd_idx becomes disks-2 and qd_idx becomes disks-1 */ | ||
847 | faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx; | ||
848 | failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx; | ||
849 | |||
850 | BUG_ON(faila == failb); | ||
851 | if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } | ||
852 | |||
853 | PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", | ||
854 | (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); | ||
855 | |||
856 | if ( failb == disks-1 ) { | ||
857 | /* Q disk is one of the missing disks */ | ||
858 | if ( faila == disks-2 ) { | ||
859 | /* Missing P+Q, just recompute */ | ||
860 | compute_parity(sh, UPDATE_PARITY); | ||
861 | return; | ||
862 | } else { | ||
863 | /* We're missing D+Q; recompute D from P */ | ||
864 | compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1); | ||
865 | compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */ | ||
866 | return; | ||
867 | } | ||
868 | } | ||
869 | |||
870 | /* We're missing D+P or D+D; build pointer table */ | ||
871 | { | ||
872 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ | ||
873 | void *ptrs[disks]; | ||
874 | |||
875 | count = 0; | ||
876 | i = d0_idx; | ||
877 | do { | ||
878 | ptrs[count++] = page_address(sh->dev[i].page); | ||
879 | i = raid6_next_disk(i, disks); | ||
880 | if (i != dd_idx1 && i != dd_idx2 && | ||
881 | !test_bit(R5_UPTODATE, &sh->dev[i].flags)) | ||
882 | printk("compute_2 with missing block %d/%d\n", count, i); | ||
883 | } while ( i != d0_idx ); | ||
884 | |||
885 | if ( failb == disks-2 ) { | ||
886 | /* We're missing D+P. */ | ||
887 | raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs); | ||
888 | } else { | ||
889 | /* We're missing D+D. */ | ||
890 | raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs); | ||
891 | } | ||
892 | |||
893 | /* Both the above update both missing blocks */ | ||
894 | set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); | ||
895 | set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); | ||
896 | } | ||
897 | } | ||
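/*
 * Editor's note: illustrative user-space sketch, not part of the driver.
 * It shows only the index normalisation used by compute_block_2() above:
 * absolute device numbers are rotated so that the first data disk (d0_idx)
 * becomes 0, which leaves P at position disks-2 and Q at position disks-1,
 * the ordering the recovery routines expect.  The disk counts are invented.
 */
#include <stdio.h>

/* rotate an absolute device index so that d0_idx maps to 0 */
static int relative_to_d0(int idx, int d0_idx, int disks)
{
        return (idx < d0_idx) ? idx + (disks - d0_idx) : idx - d0_idx;
}

int main(void)
{
        int disks = 6, pd_idx = 3;
        int qd_idx = (pd_idx + 1) % disks;      /* Q follows P */
        int d0_idx = (qd_idx + 1) % disks;      /* first data disk follows Q */

        printf("P -> %d (expect %d)\n", relative_to_d0(pd_idx, d0_idx, disks), disks - 2);
        printf("Q -> %d (expect %d)\n", relative_to_d0(qd_idx, d0_idx, disks), disks - 1);
        printf("failed disk 1 -> %d\n", relative_to_d0(1, d0_idx, disks));
        return 0;
}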
898 | |||
899 | |||
900 | /* | ||
901 | * Each stripe/dev can have one or more bion attached. | ||
902 | * toread/towrite point to the first in a chain. | ||
903 | * The bi_next chain must be in order. | ||
904 | */ | ||
905 | static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) | ||
906 | { | ||
907 | struct bio **bip; | ||
908 | raid6_conf_t *conf = sh->raid_conf; | ||
909 | |||
910 | PRINTK("adding bh b#%llu to stripe s#%llu\n", | ||
911 | (unsigned long long)bi->bi_sector, | ||
912 | (unsigned long long)sh->sector); | ||
913 | |||
914 | |||
915 | spin_lock(&sh->lock); | ||
916 | spin_lock_irq(&conf->device_lock); | ||
917 | if (forwrite) | ||
918 | bip = &sh->dev[dd_idx].towrite; | ||
919 | else | ||
920 | bip = &sh->dev[dd_idx].toread; | ||
921 | while (*bip && (*bip)->bi_sector < bi->bi_sector) { | ||
922 | if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) | ||
923 | goto overlap; | ||
924 | bip = &(*bip)->bi_next; | ||
925 | } | ||
926 | if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) | ||
927 | goto overlap; | ||
928 | |||
929 | if (*bip && bi->bi_next && (*bip) != bi->bi_next) | ||
930 | BUG(); | ||
931 | if (*bip) | ||
932 | bi->bi_next = *bip; | ||
933 | *bip = bi; | ||
934 | bi->bi_phys_segments ++; | ||
935 | spin_unlock_irq(&conf->device_lock); | ||
936 | spin_unlock(&sh->lock); | ||
937 | |||
938 | PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n", | ||
939 | (unsigned long long)bi->bi_sector, | ||
940 | (unsigned long long)sh->sector, dd_idx); | ||
941 | |||
942 | if (forwrite) { | ||
943 | /* check if page is covered */ | ||
944 | sector_t sector = sh->dev[dd_idx].sector; | ||
945 | for (bi=sh->dev[dd_idx].towrite; | ||
946 | sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && | ||
947 | bi && bi->bi_sector <= sector; | ||
948 | bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { | ||
949 | if (bi->bi_sector + (bi->bi_size>>9) >= sector) | ||
950 | sector = bi->bi_sector + (bi->bi_size>>9); | ||
951 | } | ||
952 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) | ||
953 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); | ||
954 | } | ||
955 | return 1; | ||
956 | |||
957 | overlap: | ||
958 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); | ||
959 | spin_unlock_irq(&conf->device_lock); | ||
960 | spin_unlock(&sh->lock); | ||
961 | return 0; | ||
962 | } | ||
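/*
 * Editor's note: illustrative user-space sketch, not part of the driver.
 * It mirrors the list walk in add_stripe_bio() above: requests are kept on a
 * singly linked list sorted by start sector, and an insert is refused when it
 * overlaps either neighbour.  The struct and field names are invented.
 */
#include <stdio.h>
#include <stddef.h>

struct req {
        unsigned long long sector;      /* start sector */
        unsigned int sectors;           /* length in sectors */
        struct req *next;
};

/* Insert 'r' keeping the list sorted; return 0 on overlap, 1 on success. */
static int insert_sorted(struct req **head, struct req *r)
{
        struct req **bip = head;

        while (*bip && (*bip)->sector < r->sector) {
                if ((*bip)->sector + (*bip)->sectors > r->sector)
                        return 0;               /* predecessor overlaps us */
                bip = &(*bip)->next;
        }
        if (*bip && (*bip)->sector < r->sector + r->sectors)
                return 0;                       /* we overlap the successor */

        r->next = *bip;
        *bip = r;
        return 1;
}

int main(void)
{
        struct req a = {0, 8, NULL}, b = {16, 8, NULL}, c = {4, 8, NULL};
        struct req *head = NULL;

        printf("%d %d %d\n", insert_sorted(&head, &a),
               insert_sorted(&head, &b), insert_sorted(&head, &c));
        return 0;
}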
963 | |||
964 | |||
965 | /* | ||
966 | * handle_stripe - do things to a stripe. | ||
967 | * | ||
968 | * We lock the stripe and then examine the state of various bits | ||
969 | * to see what needs to be done. | ||
970 | * Possible results: | ||
971 | * return some read requests which now have data | ||
972 | * return some write requests which are safely on disc | ||
973 | * schedule a read on some buffers | ||
974 | * schedule a write of some buffers | ||
975 | * return confirmation of parity correctness | ||
976 | * | ||
977 | * Parity calculations are done inside the stripe lock; | ||
978 | * buffers are taken off the toread or towrite lists, and stripe cache | ||
979 | * buffers get R5_LOCKED set before the stripe lock is released. | ||
980 | * | ||
981 | */ | ||
982 | |||
983 | static void handle_stripe(struct stripe_head *sh) | ||
984 | { | ||
985 | raid6_conf_t *conf = sh->raid_conf; | ||
986 | int disks = conf->raid_disks; | ||
987 | struct bio *return_bi= NULL; | ||
988 | struct bio *bi; | ||
989 | int i; | ||
990 | int syncing; | ||
991 | int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; | ||
992 | int non_overwrite = 0; | ||
993 | int failed_num[2] = {0, 0}; | ||
994 | struct r5dev *dev, *pdev, *qdev; | ||
995 | int pd_idx = sh->pd_idx; | ||
996 | int qd_idx = raid6_next_disk(pd_idx, disks); | ||
997 | int p_failed, q_failed; | ||
998 | |||
999 | PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n", | ||
1000 | (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count), | ||
1001 | pd_idx, qd_idx); | ||
1002 | |||
1003 | spin_lock(&sh->lock); | ||
1004 | clear_bit(STRIPE_HANDLE, &sh->state); | ||
1005 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
1006 | |||
1007 | syncing = test_bit(STRIPE_SYNCING, &sh->state); | ||
1008 | /* Now to look around and see what can be done */ | ||
1009 | |||
1010 | for (i=disks; i--; ) { | ||
1011 | mdk_rdev_t *rdev; | ||
1012 | dev = &sh->dev[i]; | ||
1013 | clear_bit(R5_Insync, &dev->flags); | ||
1014 | clear_bit(R5_Syncio, &dev->flags); | ||
1015 | |||
1016 | PRINTK("check %d: state 0x%lx read %p write %p written %p\n", | ||
1017 | i, dev->flags, dev->toread, dev->towrite, dev->written); | ||
1018 | /* maybe we can reply to a read */ | ||
1019 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { | ||
1020 | struct bio *rbi, *rbi2; | ||
1021 | PRINTK("Return read for disc %d\n", i); | ||
1022 | spin_lock_irq(&conf->device_lock); | ||
1023 | rbi = dev->toread; | ||
1024 | dev->toread = NULL; | ||
1025 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) | ||
1026 | wake_up(&conf->wait_for_overlap); | ||
1027 | spin_unlock_irq(&conf->device_lock); | ||
1028 | while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { | ||
1029 | copy_data(0, rbi, dev->page, dev->sector); | ||
1030 | rbi2 = r5_next_bio(rbi, dev->sector); | ||
1031 | spin_lock_irq(&conf->device_lock); | ||
1032 | if (--rbi->bi_phys_segments == 0) { | ||
1033 | rbi->bi_next = return_bi; | ||
1034 | return_bi = rbi; | ||
1035 | } | ||
1036 | spin_unlock_irq(&conf->device_lock); | ||
1037 | rbi = rbi2; | ||
1038 | } | ||
1039 | } | ||
1040 | |||
1041 | /* now count some things */ | ||
1042 | if (test_bit(R5_LOCKED, &dev->flags)) locked++; | ||
1043 | if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; | ||
1044 | |||
1045 | |||
1046 | if (dev->toread) to_read++; | ||
1047 | if (dev->towrite) { | ||
1048 | to_write++; | ||
1049 | if (!test_bit(R5_OVERWRITE, &dev->flags)) | ||
1050 | non_overwrite++; | ||
1051 | } | ||
1052 | if (dev->written) written++; | ||
1053 | rdev = conf->disks[i].rdev; /* FIXME: should rdev be accessed under a lock here? */ | ||
1054 | if (!rdev || !rdev->in_sync) { | ||
1055 | if ( failed < 2 ) | ||
1056 | failed_num[failed] = i; | ||
1057 | failed++; | ||
1058 | } else | ||
1059 | set_bit(R5_Insync, &dev->flags); | ||
1060 | } | ||
1061 | PRINTK("locked=%d uptodate=%d to_read=%d" | ||
1062 | " to_write=%d failed=%d failed_num=%d,%d\n", | ||
1063 | locked, uptodate, to_read, to_write, failed, | ||
1064 | failed_num[0], failed_num[1]); | ||
1065 | /* check if the array has lost >2 devices and, if so, some requests might | ||
1066 | * need to be failed | ||
1067 | */ | ||
1068 | if (failed > 2 && to_read+to_write+written) { | ||
1069 | spin_lock_irq(&conf->device_lock); | ||
1070 | for (i=disks; i--; ) { | ||
1071 | /* fail all writes first */ | ||
1072 | bi = sh->dev[i].towrite; | ||
1073 | sh->dev[i].towrite = NULL; | ||
1074 | if (bi) to_write--; | ||
1075 | |||
1076 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
1077 | wake_up(&conf->wait_for_overlap); | ||
1078 | |||
1079 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ | ||
1080 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | ||
1081 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1082 | if (--bi->bi_phys_segments == 0) { | ||
1083 | md_write_end(conf->mddev); | ||
1084 | bi->bi_next = return_bi; | ||
1085 | return_bi = bi; | ||
1086 | } | ||
1087 | bi = nextbi; | ||
1088 | } | ||
1089 | /* and fail all 'written' */ | ||
1090 | bi = sh->dev[i].written; | ||
1091 | sh->dev[i].written = NULL; | ||
1092 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { | ||
1093 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); | ||
1094 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1095 | if (--bi->bi_phys_segments == 0) { | ||
1096 | md_write_end(conf->mddev); | ||
1097 | bi->bi_next = return_bi; | ||
1098 | return_bi = bi; | ||
1099 | } | ||
1100 | bi = bi2; | ||
1101 | } | ||
1102 | |||
1103 | /* fail any reads if this device is non-operational */ | ||
1104 | if (!test_bit(R5_Insync, &sh->dev[i].flags)) { | ||
1105 | bi = sh->dev[i].toread; | ||
1106 | sh->dev[i].toread = NULL; | ||
1107 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
1108 | wake_up(&conf->wait_for_overlap); | ||
1109 | if (bi) to_read--; | ||
1110 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ | ||
1111 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | ||
1112 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1113 | if (--bi->bi_phys_segments == 0) { | ||
1114 | bi->bi_next = return_bi; | ||
1115 | return_bi = bi; | ||
1116 | } | ||
1117 | bi = nextbi; | ||
1118 | } | ||
1119 | } | ||
1120 | } | ||
1121 | spin_unlock_irq(&conf->device_lock); | ||
1122 | } | ||
1123 | if (failed > 2 && syncing) { | ||
1124 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | ||
1125 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
1126 | syncing = 0; | ||
1127 | } | ||
1128 | |||
1129 | /* | ||
1130 | * might be able to return some write requests if the parity blocks | ||
1131 | * are safe, or on a failed drive | ||
1132 | */ | ||
1133 | pdev = &sh->dev[pd_idx]; | ||
1134 | p_failed = (failed >= 1 && failed_num[0] == pd_idx) | ||
1135 | || (failed >= 2 && failed_num[1] == pd_idx); | ||
1136 | qdev = &sh->dev[qd_idx]; | ||
1137 | q_failed = (failed >= 1 && failed_num[0] == qd_idx) | ||
1138 | || (failed >= 2 && failed_num[1] == qd_idx); | ||
1139 | |||
1140 | if ( written && | ||
1141 | ( p_failed || ((test_bit(R5_Insync, &pdev->flags) | ||
1142 | && !test_bit(R5_LOCKED, &pdev->flags) | ||
1143 | && test_bit(R5_UPTODATE, &pdev->flags))) ) && | ||
1144 | ( q_failed || ((test_bit(R5_Insync, &qdev->flags) | ||
1145 | && !test_bit(R5_LOCKED, &qdev->flags) | ||
1146 | && test_bit(R5_UPTODATE, &qdev->flags))) ) ) { | ||
1147 | /* any written block on an uptodate or failed drive can be | ||
1148 | * returned. Note that if we 'wrote' to a failed drive, | ||
1149 | * it will be UPTODATE, but never LOCKED, so we don't need | ||
1150 | * to test 'failed' directly. | ||
1151 | */ | ||
1152 | for (i=disks; i--; ) | ||
1153 | if (sh->dev[i].written) { | ||
1154 | dev = &sh->dev[i]; | ||
1155 | if (!test_bit(R5_LOCKED, &dev->flags) && | ||
1156 | test_bit(R5_UPTODATE, &dev->flags) ) { | ||
1157 | /* We can return any write requests */ | ||
1158 | struct bio *wbi, *wbi2; | ||
1159 | PRINTK("Return write for stripe %llu disc %d\n", | ||
1160 | (unsigned long long)sh->sector, i); | ||
1161 | spin_lock_irq(&conf->device_lock); | ||
1162 | wbi = dev->written; | ||
1163 | dev->written = NULL; | ||
1164 | while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { | ||
1165 | wbi2 = r5_next_bio(wbi, dev->sector); | ||
1166 | if (--wbi->bi_phys_segments == 0) { | ||
1167 | md_write_end(conf->mddev); | ||
1168 | wbi->bi_next = return_bi; | ||
1169 | return_bi = wbi; | ||
1170 | } | ||
1171 | wbi = wbi2; | ||
1172 | } | ||
1173 | spin_unlock_irq(&conf->device_lock); | ||
1174 | } | ||
1175 | } | ||
1176 | } | ||
1177 | |||
1178 | /* Now we might consider reading some blocks, either to check/generate | ||
1179 | * parity, or to satisfy requests | ||
1180 | * or to load a block that is being partially written. | ||
1181 | */ | ||
1182 | if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) { | ||
1183 | for (i=disks; i--;) { | ||
1184 | dev = &sh->dev[i]; | ||
1185 | if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | ||
1186 | (dev->toread || | ||
1187 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | ||
1188 | syncing || | ||
1189 | (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) || | ||
1190 | (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write)) | ||
1191 | ) | ||
1192 | ) { | ||
1193 | /* we would like to get this block, possibly | ||
1194 | * by computing it, but we might not be able to | ||
1195 | */ | ||
1196 | if (uptodate == disks-1) { | ||
1197 | PRINTK("Computing stripe %llu block %d\n", | ||
1198 | (unsigned long long)sh->sector, i); | ||
1199 | compute_block_1(sh, i); | ||
1200 | uptodate++; | ||
1201 | } else if ( uptodate == disks-2 && failed >= 2 ) { | ||
1202 | /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ | ||
1203 | int other; | ||
1204 | for (other=disks; other--;) { | ||
1205 | if ( other == i ) | ||
1206 | continue; | ||
1207 | if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) ) | ||
1208 | break; | ||
1209 | } | ||
1210 | BUG_ON(other < 0); | ||
1211 | PRINTK("Computing stripe %llu blocks %d,%d\n", | ||
1212 | (unsigned long long)sh->sector, i, other); | ||
1213 | compute_block_2(sh, i, other); | ||
1214 | uptodate += 2; | ||
1215 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
1216 | set_bit(R5_LOCKED, &dev->flags); | ||
1217 | set_bit(R5_Wantread, &dev->flags); | ||
1218 | #if 0 | ||
1219 | /* if I am just reading this block and we don't have | ||
1220 | a failed drive, or any pending writes then sidestep the cache */ | ||
1221 | if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && | ||
1222 | ! syncing && !failed && !to_write) { | ||
1223 | sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; | ||
1224 | sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; | ||
1225 | } | ||
1226 | #endif | ||
1227 | locked++; | ||
1228 | PRINTK("Reading block %d (sync=%d)\n", | ||
1229 | i, syncing); | ||
1230 | if (syncing) | ||
1231 | md_sync_acct(conf->disks[i].rdev->bdev, | ||
1232 | STRIPE_SECTORS); | ||
1233 | } | ||
1234 | } | ||
1235 | } | ||
1236 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1237 | } | ||
1238 | |||
1239 | /* now to consider writing and what else, if anything, should be read */ | ||
1240 | if (to_write) { | ||
1241 | int rcw=0, must_compute=0; | ||
1242 | for (i=disks ; i--;) { | ||
1243 | dev = &sh->dev[i]; | ||
1244 | /* Would I have to read this buffer for reconstruct_write */ | ||
1245 | if (!test_bit(R5_OVERWRITE, &dev->flags) | ||
1246 | && i != pd_idx && i != qd_idx | ||
1247 | && (!test_bit(R5_LOCKED, &dev->flags) | ||
1248 | #if 0 | ||
1249 | || sh->bh_page[i] != bh->b_page | ||
1250 | #endif | ||
1251 | ) && | ||
1252 | !test_bit(R5_UPTODATE, &dev->flags)) { | ||
1253 | if (test_bit(R5_Insync, &dev->flags)) rcw++; | ||
1254 | else { | ||
1255 | PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags); | ||
1256 | must_compute++; | ||
1257 | } | ||
1258 | } | ||
1259 | } | ||
1260 | PRINTK("for sector %llu, rcw=%d, must_compute=%d\n", | ||
1261 | (unsigned long long)sh->sector, rcw, must_compute); | ||
1262 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1263 | |||
1264 | if (rcw > 0) | ||
1265 | /* want reconstruct write, but need to get some data */ | ||
1266 | for (i=disks; i--;) { | ||
1267 | dev = &sh->dev[i]; | ||
1268 | if (!test_bit(R5_OVERWRITE, &dev->flags) | ||
1269 | && !(failed == 0 && (i == pd_idx || i == qd_idx)) | ||
1270 | && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | ||
1271 | test_bit(R5_Insync, &dev->flags)) { | ||
1272 | if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
1273 | { | ||
1274 | PRINTK("Read_old stripe %llu block %d for Reconstruct\n", | ||
1275 | (unsigned long long)sh->sector, i); | ||
1276 | set_bit(R5_LOCKED, &dev->flags); | ||
1277 | set_bit(R5_Wantread, &dev->flags); | ||
1278 | locked++; | ||
1279 | } else { | ||
1280 | PRINTK("Request delayed stripe %llu block %d for Reconstruct\n", | ||
1281 | (unsigned long long)sh->sector, i); | ||
1282 | set_bit(STRIPE_DELAYED, &sh->state); | ||
1283 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1284 | } | ||
1285 | } | ||
1286 | } | ||
1287 | /* now if nothing is locked, and if we have enough data, we can start a write request */ | ||
1288 | if (locked == 0 && rcw == 0) { | ||
1289 | if ( must_compute > 0 ) { | ||
1290 | /* We have failed blocks and need to compute them */ | ||
1291 | switch ( failed ) { | ||
1292 | case 0: BUG(); | ||
1293 | case 1: compute_block_1(sh, failed_num[0]); break; | ||
1294 | case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; | ||
1295 | default: BUG(); /* This request should have been failed? */ | ||
1296 | } | ||
1297 | } | ||
1298 | |||
1299 | PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector); | ||
1300 | compute_parity(sh, RECONSTRUCT_WRITE); | ||
1301 | /* now every locked buffer is ready to be written */ | ||
1302 | for (i=disks; i--;) | ||
1303 | if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { | ||
1304 | PRINTK("Writing stripe %llu block %d\n", | ||
1305 | (unsigned long long)sh->sector, i); | ||
1306 | locked++; | ||
1307 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
1308 | #if 0 /**** FIX: I don't understand the logic here... ****/ | ||
1309 | if (!test_bit(R5_Insync, &sh->dev[i].flags) | ||
1310 | || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */ | ||
1311 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1312 | #endif | ||
1313 | } | ||
1314 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
1315 | atomic_dec(&conf->preread_active_stripes); | ||
1316 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | ||
1317 | md_wakeup_thread(conf->mddev->thread); | ||
1318 | } | ||
1319 | } | ||
1320 | } | ||
1321 | |||
1322 | /* maybe we need to check and possibly fix the parity for this stripe | ||
1323 | * Any reads will already have been scheduled, so we just see if enough data | ||
1324 | * is available | ||
1325 | */ | ||
1326 | if (syncing && locked == 0 && | ||
1327 | !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) { | ||
1328 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1329 | #if 0 /* RAID-6: Don't support CHECK PARITY yet */ | ||
1330 | if (failed == 0) { | ||
1331 | char *pagea; | ||
1332 | if (uptodate != disks) | ||
1333 | BUG(); | ||
1334 | compute_parity(sh, CHECK_PARITY); | ||
1335 | uptodate--; | ||
1336 | pagea = page_address(sh->dev[pd_idx].page); | ||
1337 | if ((*(u32*)pagea) == 0 && | ||
1338 | !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) { | ||
1339 | /* parity is correct (on disc, not in buffer any more) */ | ||
1340 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1341 | } | ||
1342 | } | ||
1343 | #endif | ||
1344 | if (!test_bit(STRIPE_INSYNC, &sh->state)) { | ||
1345 | int failed_needupdate[2]; | ||
1346 | struct r5dev *adev, *bdev; | ||
1347 | |||
1348 | if ( failed < 1 ) | ||
1349 | failed_num[0] = pd_idx; | ||
1350 | if ( failed < 2 ) | ||
1351 | failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx; | ||
1352 | |||
1353 | failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags); | ||
1354 | failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags); | ||
1355 | |||
1356 | PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n", | ||
1357 | failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]); | ||
1358 | |||
1359 | #if 0 /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */ | ||
1360 | /* should be able to compute the missing block(s) and write to spare */ | ||
1361 | if ( failed_needupdate[0] ^ failed_needupdate[1] ) { | ||
1362 | if (uptodate+1 != disks) | ||
1363 | BUG(); | ||
1364 | compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]); | ||
1365 | uptodate++; | ||
1366 | } else if ( failed_needupdate[0] & failed_needupdate[1] ) { | ||
1367 | if (uptodate+2 != disks) | ||
1368 | BUG(); | ||
1369 | compute_block_2(sh, failed_num[0], failed_num[1]); | ||
1370 | uptodate += 2; | ||
1371 | } | ||
1372 | #else | ||
1373 | compute_block_2(sh, failed_num[0], failed_num[1]); | ||
1374 | uptodate += failed_needupdate[0] + failed_needupdate[1]; | ||
1375 | #endif | ||
1376 | |||
1377 | if (uptodate != disks) | ||
1378 | BUG(); | ||
1379 | |||
1380 | PRINTK("Marking for sync stripe %llu blocks %d,%d\n", | ||
1381 | (unsigned long long)sh->sector, failed_num[0], failed_num[1]); | ||
1382 | |||
1383 | /**** FIX: Should we really do both of these unconditionally? ****/ | ||
1384 | adev = &sh->dev[failed_num[0]]; | ||
1385 | locked += !test_bit(R5_LOCKED, &adev->flags); | ||
1386 | set_bit(R5_LOCKED, &adev->flags); | ||
1387 | set_bit(R5_Wantwrite, &adev->flags); | ||
1388 | bdev = &sh->dev[failed_num[1]]; | ||
1389 | locked += !test_bit(R5_LOCKED, &bdev->flags); | ||
1390 | set_bit(R5_LOCKED, &bdev->flags); | ||
1391 | set_bit(R5_Wantwrite, &bdev->flags); | ||
1392 | |||
1393 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1394 | set_bit(R5_Syncio, &adev->flags); | ||
1395 | set_bit(R5_Syncio, &bdev->flags); | ||
1396 | } | ||
1397 | } | ||
1398 | if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | ||
1399 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | ||
1400 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
1401 | } | ||
1402 | |||
1403 | spin_unlock(&sh->lock); | ||
1404 | |||
1405 | while ((bi=return_bi)) { | ||
1406 | int bytes = bi->bi_size; | ||
1407 | |||
1408 | return_bi = bi->bi_next; | ||
1409 | bi->bi_next = NULL; | ||
1410 | bi->bi_size = 0; | ||
1411 | bi->bi_end_io(bi, bytes, 0); | ||
1412 | } | ||
1413 | for (i=disks; i-- ;) { | ||
1414 | int rw; | ||
1415 | struct bio *bi; | ||
1416 | mdk_rdev_t *rdev; | ||
1417 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) | ||
1418 | rw = 1; | ||
1419 | else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) | ||
1420 | rw = 0; | ||
1421 | else | ||
1422 | continue; | ||
1423 | |||
1424 | bi = &sh->dev[i].req; | ||
1425 | |||
1426 | bi->bi_rw = rw; | ||
1427 | if (rw) | ||
1428 | bi->bi_end_io = raid6_end_write_request; | ||
1429 | else | ||
1430 | bi->bi_end_io = raid6_end_read_request; | ||
1431 | |||
1432 | rcu_read_lock(); | ||
1433 | rdev = conf->disks[i].rdev; | ||
1434 | if (rdev && rdev->faulty) | ||
1435 | rdev = NULL; | ||
1436 | if (rdev) | ||
1437 | atomic_inc(&rdev->nr_pending); | ||
1438 | rcu_read_unlock(); | ||
1439 | |||
1440 | if (rdev) { | ||
1441 | if (test_bit(R5_Syncio, &sh->dev[i].flags)) | ||
1442 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | ||
1443 | |||
1444 | bi->bi_bdev = rdev->bdev; | ||
1445 | PRINTK("for %llu schedule op %ld on disc %d\n", | ||
1446 | (unsigned long long)sh->sector, bi->bi_rw, i); | ||
1447 | atomic_inc(&sh->count); | ||
1448 | bi->bi_sector = sh->sector + rdev->data_offset; | ||
1449 | bi->bi_flags = 1 << BIO_UPTODATE; | ||
1450 | bi->bi_vcnt = 1; | ||
1451 | bi->bi_max_vecs = 1; | ||
1452 | bi->bi_idx = 0; | ||
1453 | bi->bi_io_vec = &sh->dev[i].vec; | ||
1454 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | ||
1455 | bi->bi_io_vec[0].bv_offset = 0; | ||
1456 | bi->bi_size = STRIPE_SIZE; | ||
1457 | bi->bi_next = NULL; | ||
1458 | generic_make_request(bi); | ||
1459 | } else { | ||
1460 | PRINTK("skip op %ld on disc %d for sector %llu\n", | ||
1461 | bi->bi_rw, i, (unsigned long long)sh->sector); | ||
1462 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
1463 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1464 | } | ||
1465 | } | ||
1466 | } | ||
1467 | |||
1468 | static inline void raid6_activate_delayed(raid6_conf_t *conf) | ||
1469 | { | ||
1470 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { | ||
1471 | while (!list_empty(&conf->delayed_list)) { | ||
1472 | struct list_head *l = conf->delayed_list.next; | ||
1473 | struct stripe_head *sh; | ||
1474 | sh = list_entry(l, struct stripe_head, lru); | ||
1475 | list_del_init(l); | ||
1476 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
1477 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
1478 | atomic_inc(&conf->preread_active_stripes); | ||
1479 | list_add_tail(&sh->lru, &conf->handle_list); | ||
1480 | } | ||
1481 | } | ||
1482 | } | ||
1483 | |||
1484 | static void unplug_slaves(mddev_t *mddev) | ||
1485 | { | ||
1486 | raid6_conf_t *conf = mddev_to_conf(mddev); | ||
1487 | int i; | ||
1488 | |||
1489 | rcu_read_lock(); | ||
1490 | for (i=0; i<mddev->raid_disks; i++) { | ||
1491 | mdk_rdev_t *rdev = conf->disks[i].rdev; | ||
1492 | if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { | ||
1493 | request_queue_t *r_queue = bdev_get_queue(rdev->bdev); | ||
1494 | |||
1495 | atomic_inc(&rdev->nr_pending); | ||
1496 | rcu_read_unlock(); | ||
1497 | |||
1498 | if (r_queue->unplug_fn) | ||
1499 | r_queue->unplug_fn(r_queue); | ||
1500 | |||
1501 | rdev_dec_pending(rdev, mddev); | ||
1502 | rcu_read_lock(); | ||
1503 | } | ||
1504 | } | ||
1505 | rcu_read_unlock(); | ||
1506 | } | ||
1507 | |||
1508 | static void raid6_unplug_device(request_queue_t *q) | ||
1509 | { | ||
1510 | mddev_t *mddev = q->queuedata; | ||
1511 | raid6_conf_t *conf = mddev_to_conf(mddev); | ||
1512 | unsigned long flags; | ||
1513 | |||
1514 | spin_lock_irqsave(&conf->device_lock, flags); | ||
1515 | |||
1516 | if (blk_remove_plug(q)) | ||
1517 | raid6_activate_delayed(conf); | ||
1518 | md_wakeup_thread(mddev->thread); | ||
1519 | |||
1520 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1521 | |||
1522 | unplug_slaves(mddev); | ||
1523 | } | ||
1524 | |||
1525 | static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk, | ||
1526 | sector_t *error_sector) | ||
1527 | { | ||
1528 | mddev_t *mddev = q->queuedata; | ||
1529 | raid6_conf_t *conf = mddev_to_conf(mddev); | ||
1530 | int i, ret = 0; | ||
1531 | |||
1532 | rcu_read_lock(); | ||
1533 | for (i=0; i<mddev->raid_disks && ret == 0; i++) { | ||
1534 | mdk_rdev_t *rdev = conf->disks[i].rdev; | ||
1535 | if (rdev && !rdev->faulty) { | ||
1536 | struct block_device *bdev = rdev->bdev; | ||
1537 | request_queue_t *r_queue = bdev_get_queue(bdev); | ||
1538 | |||
1539 | if (!r_queue->issue_flush_fn) | ||
1540 | ret = -EOPNOTSUPP; | ||
1541 | else { | ||
1542 | atomic_inc(&rdev->nr_pending); | ||
1543 | rcu_read_unlock(); | ||
1544 | ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, | ||
1545 | error_sector); | ||
1546 | rdev_dec_pending(rdev, mddev); | ||
1547 | rcu_read_lock(); | ||
1548 | } | ||
1549 | } | ||
1550 | } | ||
1551 | rcu_read_unlock(); | ||
1552 | return ret; | ||
1553 | } | ||
1554 | |||
1555 | static inline void raid6_plug_device(raid6_conf_t *conf) | ||
1556 | { | ||
1557 | spin_lock_irq(&conf->device_lock); | ||
1558 | blk_plug_device(conf->mddev->queue); | ||
1559 | spin_unlock_irq(&conf->device_lock); | ||
1560 | } | ||
1561 | |||
1562 | static int make_request (request_queue_t *q, struct bio * bi) | ||
1563 | { | ||
1564 | mddev_t *mddev = q->queuedata; | ||
1565 | raid6_conf_t *conf = mddev_to_conf(mddev); | ||
1566 | const unsigned int raid_disks = conf->raid_disks; | ||
1567 | const unsigned int data_disks = raid_disks - 2; | ||
1568 | unsigned int dd_idx, pd_idx; | ||
1569 | sector_t new_sector; | ||
1570 | sector_t logical_sector, last_sector; | ||
1571 | struct stripe_head *sh; | ||
1572 | |||
1573 | if (bio_data_dir(bi)==WRITE) { | ||
1574 | disk_stat_inc(mddev->gendisk, writes); | ||
1575 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi)); | ||
1576 | } else { | ||
1577 | disk_stat_inc(mddev->gendisk, reads); | ||
1578 | disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi)); | ||
1579 | } | ||
1580 | |||
1581 | logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); | ||
1582 | last_sector = bi->bi_sector + (bi->bi_size>>9); | ||
1583 | |||
1584 | bi->bi_next = NULL; | ||
1585 | bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ | ||
1586 | if ( bio_data_dir(bi) == WRITE ) | ||
1587 | md_write_start(mddev); | ||
1588 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { | ||
1589 | DEFINE_WAIT(w); | ||
1590 | |||
1591 | new_sector = raid6_compute_sector(logical_sector, | ||
1592 | raid_disks, data_disks, &dd_idx, &pd_idx, conf); | ||
1593 | |||
1594 | PRINTK("raid6: make_request, sector %llu logical %llu\n", | ||
1595 | (unsigned long long)new_sector, | ||
1596 | (unsigned long long)logical_sector); | ||
1597 | |||
1598 | retry: | ||
1599 | prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); | ||
1600 | sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK)); | ||
1601 | if (sh) { | ||
1602 | if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { | ||
1603 | /* Add failed due to overlap. Flush everything | ||
1604 | * and wait a while | ||
1605 | */ | ||
1606 | raid6_unplug_device(mddev->queue); | ||
1607 | release_stripe(sh); | ||
1608 | schedule(); | ||
1609 | goto retry; | ||
1610 | } | ||
1611 | finish_wait(&conf->wait_for_overlap, &w); | ||
1612 | raid6_plug_device(conf); | ||
1613 | handle_stripe(sh); | ||
1614 | release_stripe(sh); | ||
1615 | } else { | ||
1616 | /* cannot get stripe for read-ahead, just give up */ | ||
1617 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | ||
1618 | finish_wait(&conf->wait_for_overlap, &w); | ||
1619 | break; | ||
1620 | } | ||
1621 | |||
1622 | } | ||
1623 | spin_lock_irq(&conf->device_lock); | ||
1624 | if (--bi->bi_phys_segments == 0) { | ||
1625 | int bytes = bi->bi_size; | ||
1626 | |||
1627 | if ( bio_data_dir(bi) == WRITE ) | ||
1628 | md_write_end(mddev); | ||
1629 | bi->bi_size = 0; | ||
1630 | bi->bi_end_io(bi, bytes, 0); | ||
1631 | } | ||
1632 | spin_unlock_irq(&conf->device_lock); | ||
1633 | return 0; | ||
1634 | } | ||
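/*
 * Editor's note: illustrative user-space sketch, not part of the driver.
 * It shows only the stripe walk from make_request() above: the request is
 * rounded down to a STRIPE_SECTORS boundary and then visited one stripe unit
 * at a time.  The STRIPE_SECTORS value and request numbers are assumptions.
 */
#include <stdio.h>

#define STRIPE_SECTORS 8ULL             /* 4KiB stripe unit in 512B sectors */

static void walk_stripes(unsigned long long bi_sector, unsigned long long sectors)
{
        unsigned long long logical = bi_sector & ~(STRIPE_SECTORS - 1);
        unsigned long long last = bi_sector + sectors;

        for (; logical < last; logical += STRIPE_SECTORS)
                printf("handle stripe unit starting at sector %llu\n", logical);
}

int main(void)
{
        walk_stripes(21, 20);           /* spans several stripe units */
        return 0;
}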
1635 | |||
1636 | /* FIXME go_faster isn't used */ | ||
1637 | static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) | ||
1638 | { | ||
1639 | raid6_conf_t *conf = (raid6_conf_t *) mddev->private; | ||
1640 | struct stripe_head *sh; | ||
1641 | int sectors_per_chunk = conf->chunk_size >> 9; | ||
1642 | sector_t x; | ||
1643 | unsigned long stripe; | ||
1644 | int chunk_offset; | ||
1645 | int dd_idx, pd_idx; | ||
1646 | sector_t first_sector; | ||
1647 | int raid_disks = conf->raid_disks; | ||
1648 | int data_disks = raid_disks - 2; | ||
1649 | |||
1650 | if (sector_nr >= mddev->size <<1) { | ||
1651 | /* just being told to finish up .. nothing much to do */ | ||
1652 | unplug_slaves(mddev); | ||
1653 | return 0; | ||
1654 | } | ||
1655 | /* if there are 2 or more failed drives and we are trying | ||
1656 | * to resync, then assert that we are finished, because there is | ||
1657 | * nothing we can do. | ||
1658 | */ | ||
1659 | if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | ||
1660 | int rv = (mddev->size << 1) - sector_nr; | ||
1661 | md_done_sync(mddev, rv, 1); | ||
1662 | return rv; | ||
1663 | } | ||
1664 | |||
1665 | x = sector_nr; | ||
1666 | chunk_offset = sector_div(x, sectors_per_chunk); | ||
1667 | stripe = x; | ||
1668 | BUG_ON(x != stripe); | ||
1669 | |||
1670 | first_sector = raid6_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk | ||
1671 | + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); | ||
1672 | sh = get_active_stripe(conf, sector_nr, pd_idx, 1); | ||
1673 | if (sh == NULL) { | ||
1674 | sh = get_active_stripe(conf, sector_nr, pd_idx, 0); | ||
1675 | /* make sure we don't swamp the stripe cache if someone else | ||
1676 | * is trying to get access | ||
1677 | */ | ||
1678 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1679 | schedule_timeout(1); | ||
1680 | } | ||
1681 | spin_lock(&sh->lock); | ||
1682 | set_bit(STRIPE_SYNCING, &sh->state); | ||
1683 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
1684 | spin_unlock(&sh->lock); | ||
1685 | |||
1686 | handle_stripe(sh); | ||
1687 | release_stripe(sh); | ||
1688 | |||
1689 | return STRIPE_SECTORS; | ||
1690 | } | ||
1691 | |||
1692 | /* | ||
1693 | * This is our raid6 kernel thread. | ||
1694 | * | ||
1695 | * We scan the hash table for stripes which can be handled now. | ||
1696 | * During the scan, completed stripes are saved for us by the interrupt | ||
1697 | * handler, so that they will not have to wait for our next wakeup. | ||
1698 | */ | ||
1699 | static void raid6d (mddev_t *mddev) | ||
1700 | { | ||
1701 | struct stripe_head *sh; | ||
1702 | raid6_conf_t *conf = mddev_to_conf(mddev); | ||
1703 | int handled; | ||
1704 | |||
1705 | PRINTK("+++ raid6d active\n"); | ||
1706 | |||
1707 | md_check_recovery(mddev); | ||
1708 | md_handle_safemode(mddev); | ||
1709 | |||
1710 | handled = 0; | ||
1711 | spin_lock_irq(&conf->device_lock); | ||
1712 | while (1) { | ||
1713 | struct list_head *first; | ||
1714 | |||
1715 | if (list_empty(&conf->handle_list) && | ||
1716 | atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && | ||
1717 | !blk_queue_plugged(mddev->queue) && | ||
1718 | !list_empty(&conf->delayed_list)) | ||
1719 | raid6_activate_delayed(conf); | ||
1720 | |||
1721 | if (list_empty(&conf->handle_list)) | ||
1722 | break; | ||
1723 | |||
1724 | first = conf->handle_list.next; | ||
1725 | sh = list_entry(first, struct stripe_head, lru); | ||
1726 | |||
1727 | list_del_init(first); | ||
1728 | atomic_inc(&sh->count); | ||
1729 | if (atomic_read(&sh->count)!= 1) | ||
1730 | BUG(); | ||
1731 | spin_unlock_irq(&conf->device_lock); | ||
1732 | |||
1733 | handled++; | ||
1734 | handle_stripe(sh); | ||
1735 | release_stripe(sh); | ||
1736 | |||
1737 | spin_lock_irq(&conf->device_lock); | ||
1738 | } | ||
1739 | PRINTK("%d stripes handled\n", handled); | ||
1740 | |||
1741 | spin_unlock_irq(&conf->device_lock); | ||
1742 | |||
1743 | unplug_slaves(mddev); | ||
1744 | |||
1745 | PRINTK("--- raid6d inactive\n"); | ||
1746 | } | ||
1747 | |||
1748 | static int run (mddev_t *mddev) | ||
1749 | { | ||
1750 | raid6_conf_t *conf; | ||
1751 | int raid_disk, memory; | ||
1752 | mdk_rdev_t *rdev; | ||
1753 | struct disk_info *disk; | ||
1754 | struct list_head *tmp; | ||
1755 | |||
1756 | if (mddev->level != 6) { | ||
1757 | PRINTK("raid6: %s: raid level not set to 6 (%d)\n", mdname(mddev), mddev->level); | ||
1758 | return -EIO; | ||
1759 | } | ||
1760 | |||
1761 | mddev->private = kmalloc (sizeof (raid6_conf_t) | ||
1762 | + mddev->raid_disks * sizeof(struct disk_info), | ||
1763 | GFP_KERNEL); | ||
1764 | if ((conf = mddev->private) == NULL) | ||
1765 | goto abort; | ||
1766 | memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) ); | ||
1767 | conf->mddev = mddev; | ||
1768 | |||
1769 | if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) | ||
1770 | goto abort; | ||
1771 | memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); | ||
1772 | |||
1773 | spin_lock_init(&conf->device_lock); | ||
1774 | init_waitqueue_head(&conf->wait_for_stripe); | ||
1775 | init_waitqueue_head(&conf->wait_for_overlap); | ||
1776 | INIT_LIST_HEAD(&conf->handle_list); | ||
1777 | INIT_LIST_HEAD(&conf->delayed_list); | ||
1778 | INIT_LIST_HEAD(&conf->inactive_list); | ||
1779 | atomic_set(&conf->active_stripes, 0); | ||
1780 | atomic_set(&conf->preread_active_stripes, 0); | ||
1781 | |||
1782 | mddev->queue->unplug_fn = raid6_unplug_device; | ||
1783 | mddev->queue->issue_flush_fn = raid6_issue_flush; | ||
1784 | |||
1785 | PRINTK("raid6: run(%s) called.\n", mdname(mddev)); | ||
1786 | |||
1787 | ITERATE_RDEV(mddev,rdev,tmp) { | ||
1788 | raid_disk = rdev->raid_disk; | ||
1789 | if (raid_disk >= mddev->raid_disks | ||
1790 | || raid_disk < 0) | ||
1791 | continue; | ||
1792 | disk = conf->disks + raid_disk; | ||
1793 | |||
1794 | disk->rdev = rdev; | ||
1795 | |||
1796 | if (rdev->in_sync) { | ||
1797 | char b[BDEVNAME_SIZE]; | ||
1798 | printk(KERN_INFO "raid6: device %s operational as raid" | ||
1799 | " disk %d\n", bdevname(rdev->bdev,b), | ||
1800 | raid_disk); | ||
1801 | conf->working_disks++; | ||
1802 | } | ||
1803 | } | ||
1804 | |||
1805 | conf->raid_disks = mddev->raid_disks; | ||
1806 | |||
1807 | /* | ||
1808 | * 0 for a fully functional array, 1 or 2 for a degraded array. | ||
1809 | */ | ||
1810 | mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; | ||
1811 | conf->mddev = mddev; | ||
1812 | conf->chunk_size = mddev->chunk_size; | ||
1813 | conf->level = mddev->level; | ||
1814 | conf->algorithm = mddev->layout; | ||
1815 | conf->max_nr_stripes = NR_STRIPES; | ||
1816 | |||
1817 | /* device size must be a multiple of chunk size */ | ||
1818 | mddev->size &= ~(mddev->chunk_size/1024 -1); | ||
1819 | |||
1820 | if (conf->raid_disks < 4) { | ||
1821 | printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", | ||
1822 | mdname(mddev), conf->raid_disks); | ||
1823 | goto abort; | ||
1824 | } | ||
1825 | if (!conf->chunk_size || conf->chunk_size % 4) { | ||
1826 | printk(KERN_ERR "raid6: invalid chunk size %d for %s\n", | ||
1827 | conf->chunk_size, mdname(mddev)); | ||
1828 | goto abort; | ||
1829 | } | ||
1830 | if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { | ||
1831 | printk(KERN_ERR | ||
1832 | "raid6: unsupported parity algorithm %d for %s\n", | ||
1833 | conf->algorithm, mdname(mddev)); | ||
1834 | goto abort; | ||
1835 | } | ||
1836 | if (mddev->degraded > 2) { | ||
1837 | printk(KERN_ERR "raid6: not enough operational devices for %s" | ||
1838 | " (%d/%d failed)\n", | ||
1839 | mdname(mddev), conf->failed_disks, conf->raid_disks); | ||
1840 | goto abort; | ||
1841 | } | ||
1842 | |||
1843 | #if 0 /* FIX: For now */ | ||
1844 | if (mddev->degraded > 0 && | ||
1845 | mddev->recovery_cp != MaxSector) { | ||
1846 | printk(KERN_ERR "raid6: cannot start dirty degraded array for %s\n", mdname(mddev)); | ||
1847 | goto abort; | ||
1848 | } | ||
1849 | #endif | ||
1850 | |||
1851 | { | ||
1852 | mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6"); | ||
1853 | if (!mddev->thread) { | ||
1854 | printk(KERN_ERR | ||
1855 | "raid6: couldn't allocate thread for %s\n", | ||
1856 | mdname(mddev)); | ||
1857 | goto abort; | ||
1858 | } | ||
1859 | } | ||
1860 | |||
1861 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + | ||
1862 | conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; | ||
1863 | if (grow_stripes(conf, conf->max_nr_stripes)) { | ||
1864 | printk(KERN_ERR | ||
1865 | "raid6: couldn't allocate %dkB for buffers\n", memory); | ||
1866 | shrink_stripes(conf); | ||
1867 | md_unregister_thread(mddev->thread); | ||
1868 | goto abort; | ||
1869 | } else | ||
1870 | printk(KERN_INFO "raid6: allocated %dkB for %s\n", | ||
1871 | memory, mdname(mddev)); | ||
1872 | |||
1873 | if (mddev->degraded == 0) | ||
1874 | printk(KERN_INFO "raid6: raid level %d set %s active with %d out of %d" | ||
1875 | " devices, algorithm %d\n", conf->level, mdname(mddev), | ||
1876 | mddev->raid_disks-mddev->degraded, mddev->raid_disks, | ||
1877 | conf->algorithm); | ||
1878 | else | ||
1879 | printk(KERN_ALERT "raid6: raid level %d set %s active with %d" | ||
1880 | " out of %d devices, algorithm %d\n", conf->level, | ||
1881 | mdname(mddev), mddev->raid_disks - mddev->degraded, | ||
1882 | mddev->raid_disks, conf->algorithm); | ||
1883 | |||
1884 | print_raid6_conf(conf); | ||
1885 | |||
1886 | /* read-ahead size must cover two whole stripes, which is | ||
1887 | * 2 * (n-2) * chunksize where 'n' is the number of raid devices | ||
1888 | */ | ||
1889 | { | ||
1890 | int stripe = (mddev->raid_disks-2) * mddev->chunk_size | ||
1891 | / PAGE_CACHE_SIZE; | ||
1892 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | ||
1893 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | ||
1894 | } | ||
1895 | |||
1896 | /* Ok, everything is just fine now */ | ||
1897 | mddev->array_size = mddev->size * (mddev->raid_disks - 2); | ||
1898 | return 0; | ||
1899 | abort: | ||
1900 | if (conf) { | ||
1901 | print_raid6_conf(conf); | ||
1902 | if (conf->stripe_hashtbl) | ||
1903 | free_pages((unsigned long) conf->stripe_hashtbl, | ||
1904 | HASH_PAGES_ORDER); | ||
1905 | kfree(conf); | ||
1906 | } | ||
1907 | mddev->private = NULL; | ||
1908 | printk(KERN_ALERT "raid6: failed to run raid set %s\n", mdname(mddev)); | ||
1909 | return -EIO; | ||
1910 | } | ||
1911 | |||
1912 | |||
1913 | |||
1914 | static int stop (mddev_t *mddev) | ||
1915 | { | ||
1916 | raid6_conf_t *conf = (raid6_conf_t *) mddev->private; | ||
1917 | |||
1918 | md_unregister_thread(mddev->thread); | ||
1919 | mddev->thread = NULL; | ||
1920 | shrink_stripes(conf); | ||
1921 | free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); | ||
1922 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
1923 | kfree(conf); | ||
1924 | mddev->private = NULL; | ||
1925 | return 0; | ||
1926 | } | ||
1927 | |||
1928 | #if RAID6_DUMPSTATE | ||
1929 | static void print_sh (struct seq_file *seq, struct stripe_head *sh) | ||
1930 | { | ||
1931 | int i; | ||
1932 | |||
1933 | seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n", | ||
1934 | (unsigned long long)sh->sector, sh->pd_idx, sh->state); | ||
1935 | seq_printf(seq, "sh %llu, count %d.\n", | ||
1936 | (unsigned long long)sh->sector, atomic_read(&sh->count)); | ||
1937 | seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector); | ||
1938 | for (i = 0; i < sh->raid_conf->raid_disks; i++) { | ||
1939 | seq_printf(seq, "(cache%d: %p %ld) ", | ||
1940 | i, sh->dev[i].page, sh->dev[i].flags); | ||
1941 | } | ||
1942 | seq_printf(seq, "\n"); | ||
1943 | } | ||
1944 | |||
1945 | static void printall (struct seq_file *seq, raid6_conf_t *conf) | ||
1946 | { | ||
1947 | struct stripe_head *sh; | ||
1948 | int i; | ||
1949 | |||
1950 | spin_lock_irq(&conf->device_lock); | ||
1951 | for (i = 0; i < NR_HASH; i++) { | ||
1952 | sh = conf->stripe_hashtbl[i]; | ||
1953 | for (; sh; sh = sh->hash_next) { | ||
1954 | if (sh->raid_conf != conf) | ||
1955 | continue; | ||
1956 | print_sh(seq, sh); | ||
1957 | } | ||
1958 | } | ||
1959 | spin_unlock_irq(&conf->device_lock); | ||
1960 | } | ||
1961 | #endif | ||
1962 | |||
1963 | static void status (struct seq_file *seq, mddev_t *mddev) | ||
1964 | { | ||
1965 | raid6_conf_t *conf = (raid6_conf_t *) mddev->private; | ||
1966 | int i; | ||
1967 | |||
1968 | seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); | ||
1969 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks); | ||
1970 | for (i = 0; i < conf->raid_disks; i++) | ||
1971 | seq_printf (seq, "%s", | ||
1972 | conf->disks[i].rdev && | ||
1973 | conf->disks[i].rdev->in_sync ? "U" : "_"); | ||
1974 | seq_printf (seq, "]"); | ||
1975 | #if RAID6_DUMPSTATE | ||
1976 | seq_printf (seq, "\n"); | ||
1977 | printall(seq, conf); | ||
1978 | #endif | ||
1979 | } | ||
1980 | |||
1981 | static void print_raid6_conf (raid6_conf_t *conf) | ||
1982 | { | ||
1983 | int i; | ||
1984 | struct disk_info *tmp; | ||
1985 | |||
1986 | printk("RAID6 conf printout:\n"); | ||
1987 | if (!conf) { | ||
1988 | printk("(conf==NULL)\n"); | ||
1989 | return; | ||
1990 | } | ||
1991 | printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, | ||
1992 | conf->working_disks, conf->failed_disks); | ||
1993 | |||
1994 | for (i = 0; i < conf->raid_disks; i++) { | ||
1995 | char b[BDEVNAME_SIZE]; | ||
1996 | tmp = conf->disks + i; | ||
1997 | if (tmp->rdev) | ||
1998 | printk(" disk %d, o:%d, dev:%s\n", | ||
1999 | i, !tmp->rdev->faulty, | ||
2000 | bdevname(tmp->rdev->bdev,b)); | ||
2001 | } | ||
2002 | } | ||
2003 | |||
2004 | static int raid6_spare_active(mddev_t *mddev) | ||
2005 | { | ||
2006 | int i; | ||
2007 | raid6_conf_t *conf = mddev->private; | ||
2008 | struct disk_info *tmp; | ||
2009 | |||
2010 | for (i = 0; i < conf->raid_disks; i++) { | ||
2011 | tmp = conf->disks + i; | ||
2012 | if (tmp->rdev | ||
2013 | && !tmp->rdev->faulty | ||
2014 | && !tmp->rdev->in_sync) { | ||
2015 | mddev->degraded--; | ||
2016 | conf->failed_disks--; | ||
2017 | conf->working_disks++; | ||
2018 | tmp->rdev->in_sync = 1; | ||
2019 | } | ||
2020 | } | ||
2021 | print_raid6_conf(conf); | ||
2022 | return 0; | ||
2023 | } | ||
2024 | |||
2025 | static int raid6_remove_disk(mddev_t *mddev, int number) | ||
2026 | { | ||
2027 | raid6_conf_t *conf = mddev->private; | ||
2028 | int err = 0; | ||
2029 | mdk_rdev_t *rdev; | ||
2030 | struct disk_info *p = conf->disks + number; | ||
2031 | |||
2032 | print_raid6_conf(conf); | ||
2033 | rdev = p->rdev; | ||
2034 | if (rdev) { | ||
2035 | if (rdev->in_sync || | ||
2036 | atomic_read(&rdev->nr_pending)) { | ||
2037 | err = -EBUSY; | ||
2038 | goto abort; | ||
2039 | } | ||
2040 | p->rdev = NULL; | ||
2041 | synchronize_kernel(); | ||
2042 | if (atomic_read(&rdev->nr_pending)) { | ||
2043 | /* lost the race, try later */ | ||
2044 | err = -EBUSY; | ||
2045 | p->rdev = rdev; | ||
2046 | } | ||
2047 | } | ||
2048 | |||
2049 | abort: | ||
2050 | |||
2051 | print_raid6_conf(conf); | ||
2052 | return err; | ||
2053 | } | ||
2054 | |||
2055 | static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | ||
2056 | { | ||
2057 | raid6_conf_t *conf = mddev->private; | ||
2058 | int found = 0; | ||
2059 | int disk; | ||
2060 | struct disk_info *p; | ||
2061 | |||
2062 | if (mddev->degraded > 2) | ||
2063 | /* no point adding a device */ | ||
2064 | return 0; | ||
2065 | /* | ||
2066 | * find the disk ... | ||
2067 | */ | ||
2068 | for (disk=0; disk < mddev->raid_disks; disk++) | ||
2069 | if ((p=conf->disks + disk)->rdev == NULL) { | ||
2070 | rdev->in_sync = 0; | ||
2071 | rdev->raid_disk = disk; | ||
2072 | found = 1; | ||
2073 | p->rdev = rdev; | ||
2074 | break; | ||
2075 | } | ||
2076 | print_raid6_conf(conf); | ||
2077 | return found; | ||
2078 | } | ||
2079 | |||
2080 | static int raid6_resize(mddev_t *mddev, sector_t sectors) | ||
2081 | { | ||
2082 | /* no resync is happening, and there is enough space | ||
2083 | * on all devices, so we can resize. | ||
2084 | * We need to make sure resync covers any new space. | ||
2085 | * If the array is shrinking we should possibly wait until | ||
2086 | * any io in the removed space completes, but it hardly seems | ||
2087 | * worth it. | ||
2088 | */ | ||
2089 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); | ||
2090 | mddev->array_size = (sectors * (mddev->raid_disks-2))>>1; | ||
2091 | set_capacity(mddev->gendisk, mddev->array_size << 1); | ||
2092 | mddev->changed = 1; | ||
2093 | if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { | ||
2094 | mddev->recovery_cp = mddev->size << 1; | ||
2095 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
2096 | } | ||
2097 | mddev->size = sectors /2; | ||
2098 | return 0; | ||
2099 | } | ||
2100 | |||
2101 | static mdk_personality_t raid6_personality= | ||
2102 | { | ||
2103 | .name = "raid6", | ||
2104 | .owner = THIS_MODULE, | ||
2105 | .make_request = make_request, | ||
2106 | .run = run, | ||
2107 | .stop = stop, | ||
2108 | .status = status, | ||
2109 | .error_handler = error, | ||
2110 | .hot_add_disk = raid6_add_disk, | ||
2111 | .hot_remove_disk= raid6_remove_disk, | ||
2112 | .spare_active = raid6_spare_active, | ||
2113 | .sync_request = sync_request, | ||
2114 | .resize = raid6_resize, | ||
2115 | }; | ||
2116 | |||
2117 | static int __init raid6_init (void) | ||
2118 | { | ||
2119 | int e; | ||
2120 | |||
2121 | e = raid6_select_algo(); | ||
2122 | if ( e ) | ||
2123 | return e; | ||
2124 | |||
2125 | return register_md_personality (RAID6, &raid6_personality); | ||
2126 | } | ||
2127 | |||
2128 | static void raid6_exit (void) | ||
2129 | { | ||
2130 | unregister_md_personality (RAID6); | ||
2131 | } | ||
2132 | |||
2133 | module_init(raid6_init); | ||
2134 | module_exit(raid6_exit); | ||
2135 | MODULE_LICENSE("GPL"); | ||
2136 | MODULE_ALIAS("md-personality-8"); /* RAID6 */ | ||
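For reference, the capacity arithmetic in raid6_resize() above (round the per-device size down to a whole number of chunks, keep raid_disks-2 data disks, convert 512-byte sectors to 1K blocks) can be checked with a minimal userspace sketch; the device size, chunk size and disk count below are made-up example values, not anything taken from the driver.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long dev_sectors = 312581808ULL; /* hypothetical ~160 GB device */
		unsigned int chunk_size = 64 * 1024;           /* 64 KiB chunk, in bytes */
		unsigned int raid_disks = 6;                   /* 4 data disks + P + Q */

		/* sectors &= ~(chunk_size/512 - 1): drop any partial trailing chunk */
		unsigned long long sectors =
			dev_sectors & ~(unsigned long long)(chunk_size / 512 - 1);
		/* (sectors * (raid_disks-2)) >> 1: data capacity in 1K blocks */
		unsigned long long array_kb = (sectors * (raid_disks - 2)) >> 1;

		printf("per-device sectors used: %llu\n", sectors);
		printf("array size: %llu KB (~%llu MB)\n", array_kb, array_kb >> 10);
		return 0;
	}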
diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c new file mode 100644 index 000000000000..359157aaf9e0 --- /dev/null +++ b/drivers/md/raid6mmx.c | |||
@@ -0,0 +1,150 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | ||
8 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6mmx.c | ||
15 | * | ||
16 | * MMX implementation of RAID-6 syndrome functions | ||
17 | */ | ||
18 | |||
19 | #if defined(__i386__) | ||
20 | |||
21 | #include "raid6.h" | ||
22 | #include "raid6x86.h" | ||
23 | |||
24 | /* Shared with raid6sse1.c */ | ||
25 | const struct raid6_mmx_constants { | ||
26 | u64 x1d; | ||
27 | } raid6_mmx_constants = { | ||
28 | 0x1d1d1d1d1d1d1d1dULL, | ||
29 | }; | ||
30 | |||
31 | static int raid6_have_mmx(void) | ||
32 | { | ||
33 | #ifdef __KERNEL__ | ||
34 | /* Not really "boot_cpu" but "all_cpus" */ | ||
35 | return boot_cpu_has(X86_FEATURE_MMX); | ||
36 | #else | ||
37 | /* User space test code */ | ||
38 | u32 features = cpuid_features(); | ||
39 | return ( (features & (1<<23)) == (1<<23) ); | ||
40 | #endif | ||
41 | } | ||
42 | |||
43 | /* | ||
44 | * Plain MMX implementation | ||
45 | */ | ||
46 | static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
47 | { | ||
48 | u8 **dptr = (u8 **)ptrs; | ||
49 | u8 *p, *q; | ||
50 | int d, z, z0; | ||
51 | raid6_mmx_save_t sa; | ||
52 | |||
53 | z0 = disks - 3; /* Highest data disk */ | ||
54 | p = dptr[z0+1]; /* XOR parity */ | ||
55 | q = dptr[z0+2]; /* RS syndrome */ | ||
56 | |||
57 | raid6_before_mmx(&sa); | ||
58 | |||
59 | asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); | ||
60 | asm volatile("pxor %mm5,%mm5"); /* Zero temp */ | ||
61 | |||
62 | for ( d = 0 ; d < bytes ; d += 8 ) { | ||
63 | asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ | ||
64 | asm volatile("movq %mm2,%mm4"); /* Q[0] */ | ||
65 | for ( z = z0-1 ; z >= 0 ; z-- ) { | ||
66 | asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); | ||
67 | asm volatile("pcmpgtb %mm4,%mm5"); | ||
68 | asm volatile("paddb %mm4,%mm4"); | ||
69 | asm volatile("pand %mm0,%mm5"); | ||
70 | asm volatile("pxor %mm5,%mm4"); | ||
71 | asm volatile("pxor %mm5,%mm5"); | ||
72 | asm volatile("pxor %mm6,%mm2"); | ||
73 | asm volatile("pxor %mm6,%mm4"); | ||
74 | } | ||
75 | asm volatile("movq %%mm2,%0" : "=m" (p[d])); | ||
76 | asm volatile("pxor %mm2,%mm2"); | ||
77 | asm volatile("movq %%mm4,%0" : "=m" (q[d])); | ||
78 | asm volatile("pxor %mm4,%mm4"); | ||
79 | } | ||
80 | |||
81 | raid6_after_mmx(&sa); | ||
82 | } | ||
83 | |||
84 | const struct raid6_calls raid6_mmxx1 = { | ||
85 | raid6_mmx1_gen_syndrome, | ||
86 | raid6_have_mmx, | ||
87 | "mmxx1", | ||
88 | 0 | ||
89 | }; | ||
90 | |||
91 | /* | ||
92 | * Unrolled-by-2 MMX implementation | ||
93 | */ | ||
94 | static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
95 | { | ||
96 | u8 **dptr = (u8 **)ptrs; | ||
97 | u8 *p, *q; | ||
98 | int d, z, z0; | ||
99 | raid6_mmx_save_t sa; | ||
100 | |||
101 | z0 = disks - 3; /* Highest data disk */ | ||
102 | p = dptr[z0+1]; /* XOR parity */ | ||
103 | q = dptr[z0+2]; /* RS syndrome */ | ||
104 | |||
105 | raid6_before_mmx(&sa); | ||
106 | |||
107 | asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); | ||
108 | asm volatile("pxor %mm5,%mm5"); /* Zero temp */ | ||
109 | asm volatile("pxor %mm7,%mm7"); /* Zero temp */ | ||
110 | |||
111 | for ( d = 0 ; d < bytes ; d += 16 ) { | ||
112 | asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ | ||
113 | asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); | ||
114 | asm volatile("movq %mm2,%mm4"); /* Q[0] */ | ||
115 | asm volatile("movq %mm3,%mm6"); /* Q[1] */ | ||
116 | for ( z = z0-1 ; z >= 0 ; z-- ) { | ||
117 | asm volatile("pcmpgtb %mm4,%mm5"); | ||
118 | asm volatile("pcmpgtb %mm6,%mm7"); | ||
119 | asm volatile("paddb %mm4,%mm4"); | ||
120 | asm volatile("paddb %mm6,%mm6"); | ||
121 | asm volatile("pand %mm0,%mm5"); | ||
122 | asm volatile("pand %mm0,%mm7"); | ||
123 | asm volatile("pxor %mm5,%mm4"); | ||
124 | asm volatile("pxor %mm7,%mm6"); | ||
125 | asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); | ||
126 | asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); | ||
127 | asm volatile("pxor %mm5,%mm2"); | ||
128 | asm volatile("pxor %mm7,%mm3"); | ||
129 | asm volatile("pxor %mm5,%mm4"); | ||
130 | asm volatile("pxor %mm7,%mm6"); | ||
131 | asm volatile("pxor %mm5,%mm5"); | ||
132 | asm volatile("pxor %mm7,%mm7"); | ||
133 | } | ||
134 | asm volatile("movq %%mm2,%0" : "=m" (p[d])); | ||
135 | asm volatile("movq %%mm3,%0" : "=m" (p[d+8])); | ||
136 | asm volatile("movq %%mm4,%0" : "=m" (q[d])); | ||
137 | asm volatile("movq %%mm6,%0" : "=m" (q[d+8])); | ||
138 | } | ||
139 | |||
140 | raid6_after_mmx(&sa); | ||
141 | } | ||
142 | |||
143 | const struct raid6_calls raid6_mmxx2 = { | ||
144 | raid6_mmx2_gen_syndrome, | ||
145 | raid6_have_mmx, | ||
146 | "mmxx2", | ||
147 | 0 | ||
148 | }; | ||
149 | |||
150 | #endif | ||
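The pcmpgtb/paddb/pand/pxor sequence in both loops above is the SIMD form of multiplying every byte of the running Q accumulator by 2 in GF(2^8) with the RAID-6 polynomial 0x11d: shift left, and XOR in 0x1d wherever the top bit was set. A minimal scalar sketch of the same per-byte recurrence, written from scratch for illustration (these helper names do not exist in the driver):

	/* Multiply one byte by 2 in GF(2^8) modulo x^8+x^4+x^3+x^2+1. */
	static unsigned char gf2_mul2(unsigned char v)
	{
		return (unsigned char)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
	}

	/*
	 * One byte column across ndata data disks, mirroring the structure of
	 * raid6_mmx1_gen_syndrome(): P is a plain XOR, Q accumulates each byte
	 * weighted by a power of the generator 0x02.
	 */
	static void gen_syndrome_column(int ndata, const unsigned char *d,
					unsigned char *p, unsigned char *q)
	{
		unsigned char wp = d[ndata - 1];	/* highest data disk */
		unsigned char wq = wp;
		int z;

		for (z = ndata - 2; z >= 0; z--) {
			wq = gf2_mul2(wq) ^ d[z];	/* Q = 2*Q + d[z] in GF(2^8) */
			wp ^= d[z];			/* P = P xor d[z] */
		}
		*p = wp;
		*q = wq;
	}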
diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c new file mode 100644 index 000000000000..a8c4d9451bd9 --- /dev/null +++ b/drivers/md/raid6recov.c | |||
@@ -0,0 +1,133 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | ||
8 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6recov.c | ||
15 | * | ||
16 | * RAID-6 data recovery in dual failure mode. In single failure mode, | ||
17 | * use the RAID-5 algorithm (or, in the case of Q failure, just reconstruct | ||
18 | * the syndrome.) | ||
19 | */ | ||
20 | |||
21 | #include "raid6.h" | ||
22 | |||
23 | /* Recover two failed data blocks. */ | ||
24 | void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, | ||
25 | void **ptrs) | ||
26 | { | ||
27 | u8 *p, *q, *dp, *dq; | ||
28 | u8 px, qx, db; | ||
29 | const u8 *pbmul; /* P multiplier table for B data */ | ||
30 | const u8 *qmul; /* Q multiplier table (for both) */ | ||
31 | |||
32 | p = (u8 *)ptrs[disks-2]; | ||
33 | q = (u8 *)ptrs[disks-1]; | ||
34 | |||
35 | /* Compute syndrome with zero for the missing data pages | ||
36 | Use the dead data pages as temporary storage for | ||
37 | delta p and delta q */ | ||
38 | dp = (u8 *)ptrs[faila]; | ||
39 | ptrs[faila] = (void *)raid6_empty_zero_page; | ||
40 | ptrs[disks-2] = dp; | ||
41 | dq = (u8 *)ptrs[failb]; | ||
42 | ptrs[failb] = (void *)raid6_empty_zero_page; | ||
43 | ptrs[disks-1] = dq; | ||
44 | |||
45 | raid6_call.gen_syndrome(disks, bytes, ptrs); | ||
46 | |||
47 | /* Restore pointer table */ | ||
48 | ptrs[faila] = dp; | ||
49 | ptrs[failb] = dq; | ||
50 | ptrs[disks-2] = p; | ||
51 | ptrs[disks-1] = q; | ||
52 | |||
53 | /* Now, pick the proper data tables */ | ||
54 | pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; | ||
55 | qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; | ||
56 | |||
57 | /* Now do it... */ | ||
58 | while ( bytes-- ) { | ||
59 | px = *p ^ *dp; | ||
60 | qx = qmul[*q ^ *dq]; | ||
61 | *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */ | ||
62 | *dp++ = db ^ px; /* Reconstructed A */ | ||
63 | p++; q++; | ||
64 | } | ||
65 | } | ||
66 | |||
67 | |||
68 | |||
69 | |||
70 | /* Recover failure of one data block plus the P block */ | ||
71 | void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) | ||
72 | { | ||
73 | u8 *p, *q, *dq; | ||
74 | const u8 *qmul; /* Q multiplier table */ | ||
75 | |||
76 | p = (u8 *)ptrs[disks-2]; | ||
77 | q = (u8 *)ptrs[disks-1]; | ||
78 | |||
79 | /* Compute syndrome with zero for the missing data page | ||
80 | Use the dead data page as temporary storage for delta q */ | ||
81 | dq = (u8 *)ptrs[faila]; | ||
82 | ptrs[faila] = (void *)raid6_empty_zero_page; | ||
83 | ptrs[disks-1] = dq; | ||
84 | |||
85 | raid6_call.gen_syndrome(disks, bytes, ptrs); | ||
86 | |||
87 | /* Restore pointer table */ | ||
88 | ptrs[faila] = dq; | ||
89 | ptrs[disks-1] = q; | ||
90 | |||
91 | /* Now, pick the proper data tables */ | ||
92 | qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; | ||
93 | |||
94 | /* Now do it... */ | ||
95 | while ( bytes-- ) { | ||
96 | *p++ ^= *dq = qmul[*q ^ *dq]; | ||
97 | q++; dq++; | ||
98 | } | ||
99 | } | ||
100 | |||
101 | |||
102 | #ifndef __KERNEL__ /* Testing only */ | ||
103 | |||
104 | /* Recover two failed blocks. */ | ||
105 | void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) | ||
106 | { | ||
107 | if ( faila > failb ) { | ||
108 | int tmp = faila; | ||
109 | faila = failb; | ||
110 | failb = tmp; | ||
111 | } | ||
112 | |||
113 | if ( failb == disks-1 ) { | ||
114 | if ( faila == disks-2 ) { | ||
115 | /* P+Q failure. Just rebuild the syndrome. */ | ||
116 | raid6_call.gen_syndrome(disks, bytes, ptrs); | ||
117 | } else { | ||
118 | /* data+Q failure. Reconstruct data from P, | ||
119 | then rebuild syndrome. */ | ||
120 | /* NOT IMPLEMENTED - equivalent to RAID-5 */ | ||
121 | } | ||
122 | } else { | ||
123 | if ( failb == disks-2 ) { | ||
124 | /* data+P failure. */ | ||
125 | raid6_datap_recov(disks, bytes, faila, ptrs); | ||
126 | } else { | ||
127 | /* data+data failure. */ | ||
128 | raid6_2data_recov(disks, bytes, faila, failb, ptrs); | ||
129 | } | ||
130 | } | ||
131 | } | ||
132 | |||
133 | #endif | ||
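The data+P recovery above relies on the identity d[a] = (Q ^ Q') * (2^a)^-1, where Q' is the syndrome recomputed with the failed block treated as zero; P is then simply the XOR of the repaired data. A from-scratch sketch of that algebra using naive GF(2^8) helpers (the kernel uses the precomputed raid6_gfmul/raid6_gfinv/raid6_gfexp tables instead; none of the names below exist in the driver):

	#include <stdint.h>

	#define RAID6_POLY 0x1d	/* x^8 + x^4 + x^3 + x^2 + 1, reduced */

	static uint8_t gf_mul(uint8_t a, uint8_t b)
	{
		uint8_t r = 0;
		while (b) {
			if (b & 1)
				r ^= a;
			a = (uint8_t)((a << 1) ^ ((a & 0x80) ? RAID6_POLY : 0));
			b >>= 1;
		}
		return r;
	}

	static uint8_t gf_pow(uint8_t a, int n)
	{
		uint8_t r = 1;
		while (n--)
			r = gf_mul(r, a);
		return r;
	}

	static uint8_t gf_inv(uint8_t a)
	{
		return gf_pow(a, 254);	/* a^255 == 1, so a^254 == a^-1 */
	}

	/* Recover data disk `a` plus P for a single byte column d[0..n-1]. */
	static void datap_recov_byte(int n, uint8_t *d, int a, uint8_t *p, uint8_t q)
	{
		uint8_t qprime = 0, pnew = 0;
		int z;

		for (z = n - 1; z >= 0; z--)		/* Q' with d[a] zeroed */
			qprime = gf_mul(qprime, 2) ^ (z == a ? 0 : d[z]);

		d[a] = gf_mul(q ^ qprime, gf_inv(gf_pow(2, a)));

		for (z = 0; z < n; z++)			/* rebuild P from data */
			pnew ^= d[z];
		*p = pnew;
	}

raid6_2data_recov() solves the analogous system with two unknown data blocks, which is why it needs both the pbmul and qmul multiplier tables.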
diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c new file mode 100644 index 000000000000..f7e7859f71aa --- /dev/null +++ b/drivers/md/raid6sse1.c | |||
@@ -0,0 +1,171 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | ||
8 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6sse1.c | ||
15 | * | ||
16 | * SSE-1/MMXEXT implementation of RAID-6 syndrome functions | ||
17 | * | ||
18 | * This is really an MMX implementation, but it requires SSE-1 or | ||
19 | * AMD MMXEXT for prefetch support and a few other features. The | ||
20 | * support for nontemporal memory accesses is enough to make this | ||
21 | * worthwhile as a separate implementation. | ||
22 | */ | ||
23 | |||
24 | #if defined(__i386__) | ||
25 | |||
26 | #include "raid6.h" | ||
27 | #include "raid6x86.h" | ||
28 | |||
29 | /* Defined in raid6mmx.c */ | ||
30 | extern const struct raid6_mmx_constants { | ||
31 | u64 x1d; | ||
32 | } raid6_mmx_constants; | ||
33 | |||
34 | static int raid6_have_sse1_or_mmxext(void) | ||
35 | { | ||
36 | #ifdef __KERNEL__ | ||
37 | /* Not really boot_cpu but "all_cpus" */ | ||
38 | return boot_cpu_has(X86_FEATURE_MMX) && | ||
39 | (boot_cpu_has(X86_FEATURE_XMM) || | ||
40 | boot_cpu_has(X86_FEATURE_MMXEXT)); | ||
41 | #else | ||
42 | /* User space test code - this check wrongly rejects some Athlons (MMXEXT but no SSE) */ | ||
43 | u32 features = cpuid_features(); | ||
44 | return ( (features & (5<<23)) == (5<<23) ); | ||
45 | #endif | ||
46 | } | ||
47 | |||
48 | /* | ||
49 | * Plain SSE1 implementation | ||
50 | */ | ||
51 | static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
52 | { | ||
53 | u8 **dptr = (u8 **)ptrs; | ||
54 | u8 *p, *q; | ||
55 | int d, z, z0; | ||
56 | raid6_mmx_save_t sa; | ||
57 | |||
58 | z0 = disks - 3; /* Highest data disk */ | ||
59 | p = dptr[z0+1]; /* XOR parity */ | ||
60 | q = dptr[z0+2]; /* RS syndrome */ | ||
61 | |||
62 | /* This is really MMX code, not SSE */ | ||
63 | raid6_before_mmx(&sa); | ||
64 | |||
65 | asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); | ||
66 | asm volatile("pxor %mm5,%mm5"); /* Zero temp */ | ||
67 | |||
68 | for ( d = 0 ; d < bytes ; d += 8 ) { | ||
69 | asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); | ||
70 | asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ | ||
71 | asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); | ||
72 | asm volatile("movq %mm2,%mm4"); /* Q[0] */ | ||
73 | asm volatile("movq %0,%%mm6" : : "m" (dptr[z0-1][d])); | ||
74 | for ( z = z0-2 ; z >= 0 ; z-- ) { | ||
75 | asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); | ||
76 | asm volatile("pcmpgtb %mm4,%mm5"); | ||
77 | asm volatile("paddb %mm4,%mm4"); | ||
78 | asm volatile("pand %mm0,%mm5"); | ||
79 | asm volatile("pxor %mm5,%mm4"); | ||
80 | asm volatile("pxor %mm5,%mm5"); | ||
81 | asm volatile("pxor %mm6,%mm2"); | ||
82 | asm volatile("pxor %mm6,%mm4"); | ||
83 | asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); | ||
84 | } | ||
85 | asm volatile("pcmpgtb %mm4,%mm5"); | ||
86 | asm volatile("paddb %mm4,%mm4"); | ||
87 | asm volatile("pand %mm0,%mm5"); | ||
88 | asm volatile("pxor %mm5,%mm4"); | ||
89 | asm volatile("pxor %mm5,%mm5"); | ||
90 | asm volatile("pxor %mm6,%mm2"); | ||
91 | asm volatile("pxor %mm6,%mm4"); | ||
92 | |||
93 | asm volatile("movntq %%mm2,%0" : "=m" (p[d])); | ||
94 | asm volatile("movntq %%mm4,%0" : "=m" (q[d])); | ||
95 | } | ||
96 | |||
97 | raid6_after_mmx(&sa); | ||
98 | asm volatile("sfence" : : : "memory"); | ||
99 | } | ||
100 | |||
101 | const struct raid6_calls raid6_sse1x1 = { | ||
102 | raid6_sse11_gen_syndrome, | ||
103 | raid6_have_sse1_or_mmxext, | ||
104 | "sse1x1", | ||
105 | 1 /* Has cache hints */ | ||
106 | }; | ||
107 | |||
108 | /* | ||
109 | * Unrolled-by-2 SSE1 implementation | ||
110 | */ | ||
111 | static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
112 | { | ||
113 | u8 **dptr = (u8 **)ptrs; | ||
114 | u8 *p, *q; | ||
115 | int d, z, z0; | ||
116 | raid6_mmx_save_t sa; | ||
117 | |||
118 | z0 = disks - 3; /* Highest data disk */ | ||
119 | p = dptr[z0+1]; /* XOR parity */ | ||
120 | q = dptr[z0+2]; /* RS syndrome */ | ||
121 | |||
122 | raid6_before_mmx(&sa); | ||
123 | |||
124 | asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); | ||
125 | asm volatile("pxor %mm5,%mm5"); /* Zero temp */ | ||
126 | asm volatile("pxor %mm7,%mm7"); /* Zero temp */ | ||
127 | |||
128 | /* We uniformly assume a single prefetch covers at least 16 bytes */ | ||
129 | for ( d = 0 ; d < bytes ; d += 16 ) { | ||
130 | asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); | ||
131 | asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ | ||
132 | asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); /* P[1] */ | ||
133 | asm volatile("movq %mm2,%mm4"); /* Q[0] */ | ||
134 | asm volatile("movq %mm3,%mm6"); /* Q[1] */ | ||
135 | for ( z = z0-1 ; z >= 0 ; z-- ) { | ||
136 | asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); | ||
137 | asm volatile("pcmpgtb %mm4,%mm5"); | ||
138 | asm volatile("pcmpgtb %mm6,%mm7"); | ||
139 | asm volatile("paddb %mm4,%mm4"); | ||
140 | asm volatile("paddb %mm6,%mm6"); | ||
141 | asm volatile("pand %mm0,%mm5"); | ||
142 | asm volatile("pand %mm0,%mm7"); | ||
143 | asm volatile("pxor %mm5,%mm4"); | ||
144 | asm volatile("pxor %mm7,%mm6"); | ||
145 | asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); | ||
146 | asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); | ||
147 | asm volatile("pxor %mm5,%mm2"); | ||
148 | asm volatile("pxor %mm7,%mm3"); | ||
149 | asm volatile("pxor %mm5,%mm4"); | ||
150 | asm volatile("pxor %mm7,%mm6"); | ||
151 | asm volatile("pxor %mm5,%mm5"); | ||
152 | asm volatile("pxor %mm7,%mm7"); | ||
153 | } | ||
154 | asm volatile("movntq %%mm2,%0" : "=m" (p[d])); | ||
155 | asm volatile("movntq %%mm3,%0" : "=m" (p[d+8])); | ||
156 | asm volatile("movntq %%mm4,%0" : "=m" (q[d])); | ||
157 | asm volatile("movntq %%mm6,%0" : "=m" (q[d+8])); | ||
158 | } | ||
159 | |||
160 | raid6_after_mmx(&sa); | ||
161 | asm volatile("sfence" : :: "memory"); | ||
162 | } | ||
163 | |||
164 | const struct raid6_calls raid6_sse1x2 = { | ||
165 | raid6_sse12_gen_syndrome, | ||
166 | raid6_have_sse1_or_mmxext, | ||
167 | "sse1x2", | ||
168 | 1 /* Has cache hints */ | ||
169 | }; | ||
170 | |||
171 | #endif | ||
diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c new file mode 100644 index 000000000000..b3aa7fe0877e --- /dev/null +++ b/drivers/md/raid6sse2.c | |||
@@ -0,0 +1,270 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | ||
8 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6sse2.c | ||
15 | * | ||
16 | * SSE-2 implementation of RAID-6 syndrome functions | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | #if defined(__i386__) || defined(__x86_64__) | ||
21 | |||
22 | #include "raid6.h" | ||
23 | #include "raid6x86.h" | ||
24 | |||
25 | static const struct raid6_sse_constants { | ||
26 | u64 x1d[2]; | ||
27 | } raid6_sse_constants __attribute__((aligned(16))) = { | ||
28 | { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL }, | ||
29 | }; | ||
30 | |||
31 | static int raid6_have_sse2(void) | ||
32 | { | ||
33 | #ifdef __KERNEL__ | ||
34 | /* Not really boot_cpu but "all_cpus" */ | ||
35 | return boot_cpu_has(X86_FEATURE_MMX) && | ||
36 | boot_cpu_has(X86_FEATURE_FXSR) && | ||
37 | boot_cpu_has(X86_FEATURE_XMM) && | ||
38 | boot_cpu_has(X86_FEATURE_XMM2); | ||
39 | #else | ||
40 | /* User space test code */ | ||
41 | u32 features = cpuid_features(); | ||
42 | return ( (features & (15<<23)) == (15<<23) ); | ||
43 | #endif | ||
44 | } | ||
45 | |||
46 | /* | ||
47 | * Plain SSE2 implementation | ||
48 | */ | ||
49 | static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
50 | { | ||
51 | u8 **dptr = (u8 **)ptrs; | ||
52 | u8 *p, *q; | ||
53 | int d, z, z0; | ||
54 | raid6_sse_save_t sa; | ||
55 | |||
56 | z0 = disks - 3; /* Highest data disk */ | ||
57 | p = dptr[z0+1]; /* XOR parity */ | ||
58 | q = dptr[z0+2]; /* RS syndrome */ | ||
59 | |||
60 | raid6_before_sse2(&sa); | ||
61 | |||
62 | asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); | ||
63 | asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ | ||
64 | |||
65 | for ( d = 0 ; d < bytes ; d += 16 ) { | ||
66 | asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); | ||
67 | asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ | ||
68 | asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); | ||
69 | asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ | ||
70 | asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d])); | ||
71 | for ( z = z0-2 ; z >= 0 ; z-- ) { | ||
72 | asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); | ||
73 | asm volatile("pcmpgtb %xmm4,%xmm5"); | ||
74 | asm volatile("paddb %xmm4,%xmm4"); | ||
75 | asm volatile("pand %xmm0,%xmm5"); | ||
76 | asm volatile("pxor %xmm5,%xmm4"); | ||
77 | asm volatile("pxor %xmm5,%xmm5"); | ||
78 | asm volatile("pxor %xmm6,%xmm2"); | ||
79 | asm volatile("pxor %xmm6,%xmm4"); | ||
80 | asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d])); | ||
81 | } | ||
82 | asm volatile("pcmpgtb %xmm4,%xmm5"); | ||
83 | asm volatile("paddb %xmm4,%xmm4"); | ||
84 | asm volatile("pand %xmm0,%xmm5"); | ||
85 | asm volatile("pxor %xmm5,%xmm4"); | ||
86 | asm volatile("pxor %xmm5,%xmm5"); | ||
87 | asm volatile("pxor %xmm6,%xmm2"); | ||
88 | asm volatile("pxor %xmm6,%xmm4"); | ||
89 | |||
90 | asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); | ||
91 | asm volatile("pxor %xmm2,%xmm2"); | ||
92 | asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); | ||
93 | asm volatile("pxor %xmm4,%xmm4"); | ||
94 | } | ||
95 | |||
96 | raid6_after_sse2(&sa); | ||
97 | asm volatile("sfence" : : : "memory"); | ||
98 | } | ||
99 | |||
100 | const struct raid6_calls raid6_sse2x1 = { | ||
101 | raid6_sse21_gen_syndrome, | ||
102 | raid6_have_sse2, | ||
103 | "sse2x1", | ||
104 | 1 /* Has cache hints */ | ||
105 | }; | ||
106 | |||
107 | /* | ||
108 | * Unrolled-by-2 SSE2 implementation | ||
109 | */ | ||
110 | static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
111 | { | ||
112 | u8 **dptr = (u8 **)ptrs; | ||
113 | u8 *p, *q; | ||
114 | int d, z, z0; | ||
115 | raid6_sse_save_t sa; | ||
116 | |||
117 | z0 = disks - 3; /* Highest data disk */ | ||
118 | p = dptr[z0+1]; /* XOR parity */ | ||
119 | q = dptr[z0+2]; /* RS syndrome */ | ||
120 | |||
121 | raid6_before_sse2(&sa); | ||
122 | |||
123 | asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); | ||
124 | asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ | ||
125 | asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ | ||
126 | |||
127 | /* We uniformly assume a single prefetch covers at least 32 bytes */ | ||
128 | for ( d = 0 ; d < bytes ; d += 32 ) { | ||
129 | asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); | ||
130 | asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ | ||
131 | asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */ | ||
132 | asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ | ||
133 | asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */ | ||
134 | for ( z = z0-1 ; z >= 0 ; z-- ) { | ||
135 | asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); | ||
136 | asm volatile("pcmpgtb %xmm4,%xmm5"); | ||
137 | asm volatile("pcmpgtb %xmm6,%xmm7"); | ||
138 | asm volatile("paddb %xmm4,%xmm4"); | ||
139 | asm volatile("paddb %xmm6,%xmm6"); | ||
140 | asm volatile("pand %xmm0,%xmm5"); | ||
141 | asm volatile("pand %xmm0,%xmm7"); | ||
142 | asm volatile("pxor %xmm5,%xmm4"); | ||
143 | asm volatile("pxor %xmm7,%xmm6"); | ||
144 | asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d])); | ||
145 | asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16])); | ||
146 | asm volatile("pxor %xmm5,%xmm2"); | ||
147 | asm volatile("pxor %xmm7,%xmm3"); | ||
148 | asm volatile("pxor %xmm5,%xmm4"); | ||
149 | asm volatile("pxor %xmm7,%xmm6"); | ||
150 | asm volatile("pxor %xmm5,%xmm5"); | ||
151 | asm volatile("pxor %xmm7,%xmm7"); | ||
152 | } | ||
153 | asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); | ||
154 | asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); | ||
155 | asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); | ||
156 | asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); | ||
157 | } | ||
158 | |||
159 | raid6_after_sse2(&sa); | ||
160 | asm volatile("sfence" : : : "memory"); | ||
161 | } | ||
162 | |||
163 | const struct raid6_calls raid6_sse2x2 = { | ||
164 | raid6_sse22_gen_syndrome, | ||
165 | raid6_have_sse2, | ||
166 | "sse2x2", | ||
167 | 1 /* Has cache hints */ | ||
168 | }; | ||
169 | |||
170 | #endif | ||
171 | |||
172 | #ifdef __x86_64__ | ||
173 | |||
174 | /* | ||
175 | * Unrolled-by-4 SSE2 implementation | ||
176 | */ | ||
177 | static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) | ||
178 | { | ||
179 | u8 **dptr = (u8 **)ptrs; | ||
180 | u8 *p, *q; | ||
181 | int d, z, z0; | ||
182 | raid6_sse16_save_t sa; | ||
183 | |||
184 | z0 = disks - 3; /* Highest data disk */ | ||
185 | p = dptr[z0+1]; /* XOR parity */ | ||
186 | q = dptr[z0+2]; /* RS syndrome */ | ||
187 | |||
188 | raid6_before_sse16(&sa); | ||
189 | |||
190 | asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); | ||
191 | asm volatile("pxor %xmm2,%xmm2"); /* P[0] */ | ||
192 | asm volatile("pxor %xmm3,%xmm3"); /* P[1] */ | ||
193 | asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */ | ||
194 | asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ | ||
195 | asm volatile("pxor %xmm6,%xmm6"); /* Q[1] */ | ||
196 | asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ | ||
197 | asm volatile("pxor %xmm10,%xmm10"); /* P[2] */ | ||
198 | asm volatile("pxor %xmm11,%xmm11"); /* P[3] */ | ||
199 | asm volatile("pxor %xmm12,%xmm12"); /* Q[2] */ | ||
200 | asm volatile("pxor %xmm13,%xmm13"); /* Zero temp */ | ||
201 | asm volatile("pxor %xmm14,%xmm14"); /* Q[3] */ | ||
202 | asm volatile("pxor %xmm15,%xmm15"); /* Zero temp */ | ||
203 | |||
204 | for ( d = 0 ; d < bytes ; d += 64 ) { | ||
205 | for ( z = z0 ; z >= 0 ; z-- ) { | ||
206 | /* The second prefetch seems to improve performance... */ | ||
207 | asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); | ||
208 | asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); | ||
209 | asm volatile("pcmpgtb %xmm4,%xmm5"); | ||
210 | asm volatile("pcmpgtb %xmm6,%xmm7"); | ||
211 | asm volatile("pcmpgtb %xmm12,%xmm13"); | ||
212 | asm volatile("pcmpgtb %xmm14,%xmm15"); | ||
213 | asm volatile("paddb %xmm4,%xmm4"); | ||
214 | asm volatile("paddb %xmm6,%xmm6"); | ||
215 | asm volatile("paddb %xmm12,%xmm12"); | ||
216 | asm volatile("paddb %xmm14,%xmm14"); | ||
217 | asm volatile("pand %xmm0,%xmm5"); | ||
218 | asm volatile("pand %xmm0,%xmm7"); | ||
219 | asm volatile("pand %xmm0,%xmm13"); | ||
220 | asm volatile("pand %xmm0,%xmm15"); | ||
221 | asm volatile("pxor %xmm5,%xmm4"); | ||
222 | asm volatile("pxor %xmm7,%xmm6"); | ||
223 | asm volatile("pxor %xmm13,%xmm12"); | ||
224 | asm volatile("pxor %xmm15,%xmm14"); | ||
225 | asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); | ||
226 | asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); | ||
227 | asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); | ||
228 | asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); | ||
229 | asm volatile("pxor %xmm5,%xmm2"); | ||
230 | asm volatile("pxor %xmm7,%xmm3"); | ||
231 | asm volatile("pxor %xmm13,%xmm10"); | ||
232 | asm volatile("pxor %xmm15,%xmm11"); | ||
233 | asm volatile("pxor %xmm5,%xmm4"); | ||
234 | asm volatile("pxor %xmm7,%xmm6"); | ||
235 | asm volatile("pxor %xmm13,%xmm12"); | ||
236 | asm volatile("pxor %xmm15,%xmm14"); | ||
237 | asm volatile("pxor %xmm5,%xmm5"); | ||
238 | asm volatile("pxor %xmm7,%xmm7"); | ||
239 | asm volatile("pxor %xmm13,%xmm13"); | ||
240 | asm volatile("pxor %xmm15,%xmm15"); | ||
241 | } | ||
242 | asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); | ||
243 | asm volatile("pxor %xmm2,%xmm2"); | ||
244 | asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); | ||
245 | asm volatile("pxor %xmm3,%xmm3"); | ||
246 | asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); | ||
247 | asm volatile("pxor %xmm10,%xmm10"); | ||
248 | asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); | ||
249 | asm volatile("pxor %xmm11,%xmm11"); | ||
250 | asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); | ||
251 | asm volatile("pxor %xmm4,%xmm4"); | ||
252 | asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); | ||
253 | asm volatile("pxor %xmm6,%xmm6"); | ||
254 | asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); | ||
255 | asm volatile("pxor %xmm12,%xmm12"); | ||
256 | asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); | ||
257 | asm volatile("pxor %xmm14,%xmm14"); | ||
258 | } | ||
259 | asm volatile("sfence" : : : "memory"); | ||
260 | raid6_after_sse16(&sa); | ||
261 | } | ||
262 | |||
263 | const struct raid6_calls raid6_sse2x4 = { | ||
264 | raid6_sse24_gen_syndrome, | ||
265 | raid6_have_sse2, | ||
266 | "sse2x4", | ||
267 | 1 /* Has cache hints */ | ||
268 | }; | ||
269 | |||
270 | #endif | ||
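The userspace fallbacks in raid6sse1.c and raid6sse2.c test raw CPUID leaf 1 EDX bits: bit 23 is MMX, bit 24 FXSR, bit 25 SSE and bit 26 SSE2, so (15<<23) requires all four while (5<<23) requires only MMX and SSE; that is why the SSE-1 check rejects Athlons that expose MMXEXT but not the SSE bit, as the comment there notes. A small sketch decoding those masks (the feature values passed in main() are hypothetical, for illustration only):

	#include <stdio.h>

	static void decode_features(unsigned int edx)
	{
		printf("MMX:%d FXSR:%d SSE:%d SSE2:%d  sse1-ok:%d sse2-ok:%d\n",
		       !!(edx & (1u << 23)), !!(edx & (1u << 24)),
		       !!(edx & (1u << 25)), !!(edx & (1u << 26)),
		       (edx & (5u << 23)) == (5u << 23),
		       (edx & (15u << 23)) == (15u << 23));
	}

	int main(void)
	{
		decode_features(1u << 23);	/* hypothetical MMX-only CPU (no SSE bit) */
		decode_features(5u << 23);	/* hypothetical MMX+SSE CPU */
		decode_features(15u << 23);	/* hypothetical MMX+FXSR+SSE+SSE2 CPU */
		return 0;
	}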
diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile new file mode 100644 index 000000000000..557806728609 --- /dev/null +++ b/drivers/md/raid6test/Makefile | |||
@@ -0,0 +1,58 @@ | |||
1 | # | ||
2 | # This is a simple Makefile to test some of the RAID-6 code | ||
3 | # from userspace. | ||
4 | # | ||
5 | |||
6 | CC = gcc | ||
7 | OPTFLAGS = -O2 # Adjust as desired | ||
8 | CFLAGS = -I.. -g $(OPTFLAGS) | ||
9 | LD = ld | ||
10 | PERL = perl | ||
11 | |||
12 | .c.o: | ||
13 | $(CC) $(CFLAGS) -c -o $@ $< | ||
14 | |||
15 | %.c: ../%.c | ||
16 | cp -f $< $@ | ||
17 | |||
18 | %.uc: ../%.uc | ||
19 | cp -f $< $@ | ||
20 | |||
21 | all: raid6.o raid6test | ||
22 | |||
23 | raid6.o: raid6int1.o raid6int2.o raid6int4.o raid6int8.o raid6int16.o \ | ||
24 | raid6int32.o \ | ||
25 | raid6mmx.o raid6sse1.o raid6sse2.o \ | ||
26 | raid6recov.o raid6algos.o \ | ||
27 | raid6tables.o | ||
28 | $(LD) -r -o $@ $^ | ||
29 | |||
30 | raid6test: raid6.o test.c | ||
31 | $(CC) $(CFLAGS) -o raid6test $^ | ||
32 | |||
33 | raid6int1.c: raid6int.uc ../unroll.pl | ||
34 | $(PERL) ../unroll.pl 1 < raid6int.uc > $@ | ||
35 | |||
36 | raid6int2.c: raid6int.uc ../unroll.pl | ||
37 | $(PERL) ../unroll.pl 2 < raid6int.uc > $@ | ||
38 | |||
39 | raid6int4.c: raid6int.uc ../unroll.pl | ||
40 | $(PERL) ../unroll.pl 4 < raid6int.uc > $@ | ||
41 | |||
42 | raid6int8.c: raid6int.uc ../unroll.pl | ||
43 | $(PERL) ../unroll.pl 8 < raid6int.uc > $@ | ||
44 | |||
45 | raid6int16.c: raid6int.uc ../unroll.pl | ||
46 | $(PERL) ../unroll.pl 16 < raid6int.uc > $@ | ||
47 | |||
48 | raid6int32.c: raid6int.uc ../unroll.pl | ||
49 | $(PERL) ../unroll.pl 32 < raid6int.uc > $@ | ||
50 | |||
51 | raid6tables.c: mktables | ||
52 | ./mktables > raid6tables.c | ||
53 | |||
54 | clean: | ||
55 | rm -f *.o mktables mktables.c raid6int.uc raid6*.c raid6test | ||
56 | |||
57 | spotless: clean | ||
58 | rm -f *~ | ||
diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c new file mode 100644 index 000000000000..0d5cd57accd7 --- /dev/null +++ b/drivers/md/raid6test/test.c | |||
@@ -0,0 +1,103 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | ||
8 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6test.c | ||
15 | * | ||
16 | * Test RAID-6 recovery with various algorithms | ||
17 | */ | ||
18 | |||
19 | #include <stdlib.h> | ||
20 | #include <stdio.h> | ||
21 | #include <string.h> | ||
22 | #include "raid6.h" | ||
23 | |||
24 | #define NDISKS 16 /* Including P and Q */ | ||
25 | |||
26 | const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); | ||
27 | struct raid6_calls raid6_call; | ||
28 | |||
29 | char *dataptrs[NDISKS]; | ||
30 | char data[NDISKS][PAGE_SIZE]; | ||
31 | char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; | ||
32 | |||
33 | void makedata(void) | ||
34 | { | ||
35 | int i, j; | ||
36 | |||
37 | for ( i = 0 ; i < NDISKS ; i++ ) { | ||
38 | for ( j = 0 ; j < PAGE_SIZE ; j++ ) { | ||
39 | data[i][j] = rand(); | ||
40 | } | ||
41 | dataptrs[i] = data[i]; | ||
42 | } | ||
43 | } | ||
44 | |||
45 | int main(int argc, char *argv[]) | ||
46 | { | ||
47 | const struct raid6_calls * const * algo; | ||
48 | int i, j; | ||
49 | int erra, errb; | ||
50 | |||
51 | makedata(); | ||
52 | |||
53 | for ( algo = raid6_algos ; *algo ; algo++ ) { | ||
54 | if ( !(*algo)->valid || (*algo)->valid() ) { | ||
55 | raid6_call = **algo; | ||
56 | |||
57 | /* Nuke syndromes */ | ||
58 | memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); | ||
59 | |||
60 | /* Generate assumed good syndrome */ | ||
61 | raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, (void **)&dataptrs); | ||
62 | |||
63 | for ( i = 0 ; i < NDISKS-1 ; i++ ) { | ||
64 | for ( j = i+1 ; j < NDISKS ; j++ ) { | ||
65 | memset(recovi, 0xf0, PAGE_SIZE); | ||
66 | memset(recovj, 0xba, PAGE_SIZE); | ||
67 | |||
68 | dataptrs[i] = recovi; | ||
69 | dataptrs[j] = recovj; | ||
70 | |||
71 | raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs); | ||
72 | |||
73 | erra = memcmp(data[i], recovi, PAGE_SIZE); | ||
74 | errb = memcmp(data[j], recovj, PAGE_SIZE); | ||
75 | |||
76 | if ( i < NDISKS-2 && j == NDISKS-1 ) { | ||
77 | /* We don't implement the DQ failure scenario, since it's | ||
78 | equivalent to a RAID-5 failure (XOR, then recompute Q) */ | ||
79 | } else { | ||
80 | printf("algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n", | ||
81 | raid6_call.name, | ||
82 | i, (i==NDISKS-2)?'P':'D', | ||
83 | j, (j==NDISKS-1)?'Q':(j==NDISKS-2)?'P':'D', | ||
84 | (!erra && !errb) ? "OK" : | ||
85 | !erra ? "ERRB" : | ||
86 | !errb ? "ERRA" : | ||
87 | "ERRAB"); | ||
88 | } | ||
89 | |||
90 | dataptrs[i] = data[i]; | ||
91 | dataptrs[j] = data[j]; | ||
92 | } | ||
93 | } | ||
94 | } | ||
95 | printf("\n"); | ||
96 | } | ||
97 | |||
98 | printf("\n"); | ||
99 | /* Pick the best algorithm test */ | ||
100 | raid6_select_algo(); | ||
101 | |||
102 | return 0; | ||
103 | } | ||
diff --git a/drivers/md/raid6x86.h b/drivers/md/raid6x86.h new file mode 100644 index 000000000000..4cf20534fe44 --- /dev/null +++ b/drivers/md/raid6x86.h | |||
@@ -0,0 +1,245 @@ | |||
1 | /* ----------------------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | ||
8 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | /* | ||
14 | * raid6x86.h | ||
15 | * | ||
16 | * Definitions common to x86 and x86-64 RAID-6 code only | ||
17 | */ | ||
18 | |||
19 | #ifndef LINUX_RAID_RAID6X86_H | ||
20 | #define LINUX_RAID_RAID6X86_H | ||
21 | |||
22 | #if defined(__i386__) || defined(__x86_64__) | ||
23 | |||
24 | #ifdef __x86_64__ | ||
25 | |||
26 | typedef struct { | ||
27 | unsigned int fsave[27]; | ||
28 | unsigned long cr0; | ||
29 | } raid6_mmx_save_t __attribute__((aligned(16))); | ||
30 | |||
31 | /* N.B.: For SSE we only save %xmm0-%xmm7 even for x86-64, since | ||
32 | the code doesn't know about the additional x86-64 registers */ | ||
33 | typedef struct { | ||
34 | unsigned int sarea[8*4+2]; | ||
35 | unsigned long cr0; | ||
36 | } raid6_sse_save_t __attribute__((aligned(16))); | ||
37 | |||
38 | /* This is for x86-64-specific code which uses all 16 XMM registers */ | ||
39 | typedef struct { | ||
40 | unsigned int sarea[16*4+2]; | ||
41 | unsigned long cr0; | ||
42 | } raid6_sse16_save_t __attribute__((aligned(16))); | ||
43 | |||
44 | /* On x86-64 the stack *SHOULD* be 16-byte aligned, but currently this | ||
45 | is buggy in the kernel and it's only 8-byte aligned in places, so | ||
46 | we need to do this anyway. Sigh. */ | ||
47 | #define SAREA(x) ((unsigned int *)((((unsigned long)&(x)->sarea)+15) & ~15)) | ||
48 | |||
49 | #else /* __i386__ */ | ||
50 | |||
51 | typedef struct { | ||
52 | unsigned int fsave[27]; | ||
53 | unsigned long cr0; | ||
54 | } raid6_mmx_save_t; | ||
55 | |||
56 | /* On i386, the stack is only 8-byte aligned, but SSE requires 16-byte | ||
57 | alignment. The +3 is so we have the slack space to manually align | ||
58 | a properly-sized area correctly. */ | ||
59 | typedef struct { | ||
60 | unsigned int sarea[8*4+3]; | ||
61 | unsigned long cr0; | ||
62 | } raid6_sse_save_t; | ||
63 | |||
64 | /* Find the 16-byte aligned save area */ | ||
65 | #define SAREA(x) ((unsigned int *)((((unsigned long)&(x)->sarea)+15) & ~15)) | ||
66 | |||
67 | #endif | ||
68 | |||
69 | #ifdef __KERNEL__ /* Real code */ | ||
70 | |||
71 | /* Note: %cr0 is 32 bits on i386 and 64 bits on x86-64 */ | ||
72 | |||
73 | static inline unsigned long raid6_get_fpu(void) | ||
74 | { | ||
75 | unsigned long cr0; | ||
76 | |||
77 | preempt_disable(); | ||
78 | asm volatile("mov %%cr0,%0 ; clts" : "=r" (cr0)); | ||
79 | return cr0; | ||
80 | } | ||
81 | |||
82 | static inline void raid6_put_fpu(unsigned long cr0) | ||
83 | { | ||
84 | asm volatile("mov %0,%%cr0" : : "r" (cr0)); | ||
85 | preempt_enable(); | ||
86 | } | ||
87 | |||
88 | #else /* Dummy code for user space testing */ | ||
89 | |||
90 | static inline unsigned long raid6_get_fpu(void) | ||
91 | { | ||
92 | return 0xf00ba6; | ||
93 | } | ||
94 | |||
95 | static inline void raid6_put_fpu(unsigned long cr0) | ||
96 | { | ||
97 | (void)cr0; | ||
98 | } | ||
99 | |||
100 | #endif | ||
101 | |||
102 | static inline void raid6_before_mmx(raid6_mmx_save_t *s) | ||
103 | { | ||
104 | s->cr0 = raid6_get_fpu(); | ||
105 | asm volatile("fsave %0 ; fwait" : "=m" (s->fsave[0])); | ||
106 | } | ||
107 | |||
108 | static inline void raid6_after_mmx(raid6_mmx_save_t *s) | ||
109 | { | ||
110 | asm volatile("frstor %0" : : "m" (s->fsave[0])); | ||
111 | raid6_put_fpu(s->cr0); | ||
112 | } | ||
113 | |||
114 | static inline void raid6_before_sse(raid6_sse_save_t *s) | ||
115 | { | ||
116 | unsigned int *rsa = SAREA(s); | ||
117 | |||
118 | s->cr0 = raid6_get_fpu(); | ||
119 | |||
120 | asm volatile("movaps %%xmm0,%0" : "=m" (rsa[0])); | ||
121 | asm volatile("movaps %%xmm1,%0" : "=m" (rsa[4])); | ||
122 | asm volatile("movaps %%xmm2,%0" : "=m" (rsa[8])); | ||
123 | asm volatile("movaps %%xmm3,%0" : "=m" (rsa[12])); | ||
124 | asm volatile("movaps %%xmm4,%0" : "=m" (rsa[16])); | ||
125 | asm volatile("movaps %%xmm5,%0" : "=m" (rsa[20])); | ||
126 | asm volatile("movaps %%xmm6,%0" : "=m" (rsa[24])); | ||
127 | asm volatile("movaps %%xmm7,%0" : "=m" (rsa[28])); | ||
128 | } | ||
129 | |||
130 | static inline void raid6_after_sse(raid6_sse_save_t *s) | ||
131 | { | ||
132 | unsigned int *rsa = SAREA(s); | ||
133 | |||
134 | asm volatile("movaps %0,%%xmm0" : : "m" (rsa[0])); | ||
135 | asm volatile("movaps %0,%%xmm1" : : "m" (rsa[4])); | ||
136 | asm volatile("movaps %0,%%xmm2" : : "m" (rsa[8])); | ||
137 | asm volatile("movaps %0,%%xmm3" : : "m" (rsa[12])); | ||
138 | asm volatile("movaps %0,%%xmm4" : : "m" (rsa[16])); | ||
139 | asm volatile("movaps %0,%%xmm5" : : "m" (rsa[20])); | ||
140 | asm volatile("movaps %0,%%xmm6" : : "m" (rsa[24])); | ||
141 | asm volatile("movaps %0,%%xmm7" : : "m" (rsa[28])); | ||
142 | |||
143 | raid6_put_fpu(s->cr0); | ||
144 | } | ||
145 | |||
146 | static inline void raid6_before_sse2(raid6_sse_save_t *s) | ||
147 | { | ||
148 | unsigned int *rsa = SAREA(s); | ||
149 | |||
150 | s->cr0 = raid6_get_fpu(); | ||
151 | |||
152 | asm volatile("movdqa %%xmm0,%0" : "=m" (rsa[0])); | ||
153 | asm volatile("movdqa %%xmm1,%0" : "=m" (rsa[4])); | ||
154 | asm volatile("movdqa %%xmm2,%0" : "=m" (rsa[8])); | ||
155 | asm volatile("movdqa %%xmm3,%0" : "=m" (rsa[12])); | ||
156 | asm volatile("movdqa %%xmm4,%0" : "=m" (rsa[16])); | ||
157 | asm volatile("movdqa %%xmm5,%0" : "=m" (rsa[20])); | ||
158 | asm volatile("movdqa %%xmm6,%0" : "=m" (rsa[24])); | ||
159 | asm volatile("movdqa %%xmm7,%0" : "=m" (rsa[28])); | ||
160 | } | ||
161 | |||
162 | static inline void raid6_after_sse2(raid6_sse_save_t *s) | ||
163 | { | ||
164 | unsigned int *rsa = SAREA(s); | ||
165 | |||
166 | asm volatile("movdqa %0,%%xmm0" : : "m" (rsa[0])); | ||
167 | asm volatile("movdqa %0,%%xmm1" : : "m" (rsa[4])); | ||
168 | asm volatile("movdqa %0,%%xmm2" : : "m" (rsa[8])); | ||
169 | asm volatile("movdqa %0,%%xmm3" : : "m" (rsa[12])); | ||
170 | asm volatile("movdqa %0,%%xmm4" : : "m" (rsa[16])); | ||
171 | asm volatile("movdqa %0,%%xmm5" : : "m" (rsa[20])); | ||
172 | asm volatile("movdqa %0,%%xmm6" : : "m" (rsa[24])); | ||
173 | asm volatile("movdqa %0,%%xmm7" : : "m" (rsa[28])); | ||
174 | |||
175 | raid6_put_fpu(s->cr0); | ||
176 | } | ||
177 | |||
178 | #ifdef __x86_64__ | ||
179 | |||
180 | static inline void raid6_before_sse16(raid6_sse16_save_t *s) | ||
181 | { | ||
182 | unsigned int *rsa = SAREA(s); | ||
183 | |||
184 | s->cr0 = raid6_get_fpu(); | ||
185 | |||
186 | asm volatile("movdqa %%xmm0,%0" : "=m" (rsa[0])); | ||
187 | asm volatile("movdqa %%xmm1,%0" : "=m" (rsa[4])); | ||
188 | asm volatile("movdqa %%xmm2,%0" : "=m" (rsa[8])); | ||
189 | asm volatile("movdqa %%xmm3,%0" : "=m" (rsa[12])); | ||
190 | asm volatile("movdqa %%xmm4,%0" : "=m" (rsa[16])); | ||
191 | asm volatile("movdqa %%xmm5,%0" : "=m" (rsa[20])); | ||
192 | asm volatile("movdqa %%xmm6,%0" : "=m" (rsa[24])); | ||
193 | asm volatile("movdqa %%xmm7,%0" : "=m" (rsa[28])); | ||
194 | asm volatile("movdqa %%xmm8,%0" : "=m" (rsa[32])); | ||
195 | asm volatile("movdqa %%xmm9,%0" : "=m" (rsa[36])); | ||
196 | asm volatile("movdqa %%xmm10,%0" : "=m" (rsa[40])); | ||
197 | asm volatile("movdqa %%xmm11,%0" : "=m" (rsa[44])); | ||
198 | asm volatile("movdqa %%xmm12,%0" : "=m" (rsa[48])); | ||
199 | asm volatile("movdqa %%xmm13,%0" : "=m" (rsa[52])); | ||
200 | asm volatile("movdqa %%xmm14,%0" : "=m" (rsa[56])); | ||
201 | asm volatile("movdqa %%xmm15,%0" : "=m" (rsa[60])); | ||
202 | } | ||
203 | |||
204 | static inline void raid6_after_sse16(raid6_sse16_save_t *s) | ||
205 | { | ||
206 | unsigned int *rsa = SAREA(s); | ||
207 | |||
208 | asm volatile("movdqa %0,%%xmm0" : : "m" (rsa[0])); | ||
209 | asm volatile("movdqa %0,%%xmm1" : : "m" (rsa[4])); | ||
210 | asm volatile("movdqa %0,%%xmm2" : : "m" (rsa[8])); | ||
211 | asm volatile("movdqa %0,%%xmm3" : : "m" (rsa[12])); | ||
212 | asm volatile("movdqa %0,%%xmm4" : : "m" (rsa[16])); | ||
213 | asm volatile("movdqa %0,%%xmm5" : : "m" (rsa[20])); | ||
214 | asm volatile("movdqa %0,%%xmm6" : : "m" (rsa[24])); | ||
215 | asm volatile("movdqa %0,%%xmm7" : : "m" (rsa[28])); | ||
216 | asm volatile("movdqa %0,%%xmm8" : : "m" (rsa[32])); | ||
217 | asm volatile("movdqa %0,%%xmm9" : : "m" (rsa[36])); | ||
218 | asm volatile("movdqa %0,%%xmm10" : : "m" (rsa[40])); | ||
219 | asm volatile("movdqa %0,%%xmm11" : : "m" (rsa[44])); | ||
220 | asm volatile("movdqa %0,%%xmm12" : : "m" (rsa[48])); | ||
221 | asm volatile("movdqa %0,%%xmm13" : : "m" (rsa[52])); | ||
222 | asm volatile("movdqa %0,%%xmm14" : : "m" (rsa[56])); | ||
223 | asm volatile("movdqa %0,%%xmm15" : : "m" (rsa[60])); | ||
224 | |||
225 | raid6_put_fpu(s->cr0); | ||
226 | } | ||
227 | |||
228 | #endif /* __x86_64__ */ | ||
229 | |||
230 | /* User space test hack */ | ||
231 | #ifndef __KERNEL__ | ||
232 | static inline int cpuid_features(void) | ||
233 | { | ||
234 | u32 eax = 1; | ||
235 | u32 ebx, ecx, edx; | ||
236 | |||
237 | asm volatile("cpuid" : | ||
238 | "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)); | ||
239 | |||
240 | return edx; | ||
241 | } | ||
242 | #endif /* ndef __KERNEL__ */ | ||
243 | |||
244 | #endif | ||
245 | #endif | ||
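The SAREA() macro above is manual 16-byte alignment: the save area is over-sized by a few words and its address rounded up to the next 16-byte boundary so the aligned movaps/movdqa saves and restores never fault on a misaligned stack. A standalone sketch of the same rounding (nothing here is kernel code):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		unsigned int sarea[8 * 4 + 3];	/* 8 XMM registers plus alignment slack */
		unsigned int *aligned =
			(unsigned int *)(((uintptr_t)sarea + 15) & ~(uintptr_t)15);

		printf("raw %p -> 16-byte aligned %p\n",
		       (void *)sarea, (void *)aligned);
		return 0;
	}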
diff --git a/drivers/md/unroll.pl b/drivers/md/unroll.pl new file mode 100644 index 000000000000..3acc710a20ea --- /dev/null +++ b/drivers/md/unroll.pl | |||
@@ -0,0 +1,24 @@ | |||
1 | #!/usr/bin/perl | ||
2 | # | ||
3 | # Take a piece of C code and for each line which contains the sequence $$ | ||
4 | # repeat n times with $ replaced by 0...n-1; the sequence $# is replaced | ||
5 | # by the unrolling factor, and $* with a single $ | ||
6 | # | ||
7 | |||
8 | ($n) = @ARGV; | ||
9 | $n += 0; | ||
10 | |||
11 | while ( defined($line = <STDIN>) ) { | ||
12 | if ( $line =~ /\$\$/ ) { | ||
13 | $rep = $n; | ||
14 | } else { | ||
15 | $rep = 1; | ||
16 | } | ||
17 | for ( $i = 0 ; $i < $rep ; $i++ ) { | ||
18 | $tmp = $line; | ||
19 | $tmp =~ s/\$\$/$i/g; | ||
20 | $tmp =~ s/\$\#/$n/g; | ||
21 | $tmp =~ s/\$\*/\$/g; | ||
22 | print $tmp; | ||
23 | } | ||
24 | } | ||
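As a concrete example of the substitution rules described in the header comment, here is a hypothetical input line in the style of the raid6int.uc sources (the .uc file itself is not part of this listing, so the fragment is illustrative only) and what unroll.pl emits for an unrolling factor of 2:

	/*
	 * Input line (contains $$, so it is repeated n times):
	 *
	 *     wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
	 *
	 * Output of "perl unroll.pl 2", with $$ replaced by 0 and 1:
	 *
	 *     wq0 = wp0 = *(unative_t *)&dptr[z0][d+0*NSIZE];
	 *     wq1 = wp1 = *(unative_t *)&dptr[z0][d+1*NSIZE];
	 */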
diff --git a/drivers/md/xor.c b/drivers/md/xor.c new file mode 100644 index 000000000000..324897c4be4e --- /dev/null +++ b/drivers/md/xor.c | |||
@@ -0,0 +1,154 @@ | |||
1 | /* | ||
2 | * xor.c : Multiple Devices driver for Linux | ||
3 | * | ||
4 | * Copyright (C) 1996, 1997, 1998, 1999, 2000, | ||
5 | * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson. | ||
6 | * | ||
7 | * Dispatch optimized RAID-5 checksumming functions. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2, or (at your option) | ||
12 | * any later version. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * (for example /usr/src/linux/COPYING); if not, write to the Free | ||
16 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
17 | */ | ||
18 | |||
19 | #define BH_TRACE 0 | ||
20 | #include <linux/module.h> | ||
21 | #include <linux/raid/md.h> | ||
22 | #include <linux/raid/xor.h> | ||
23 | #include <asm/xor.h> | ||
24 | |||
25 | /* The xor routines to use. */ | ||
26 | static struct xor_block_template *active_template; | ||
27 | |||
28 | void | ||
29 | xor_block(unsigned int count, unsigned int bytes, void **ptr) | ||
30 | { | ||
31 | unsigned long *p0, *p1, *p2, *p3, *p4; | ||
32 | |||
33 | p0 = (unsigned long *) ptr[0]; | ||
34 | p1 = (unsigned long *) ptr[1]; | ||
35 | if (count == 2) { | ||
36 | active_template->do_2(bytes, p0, p1); | ||
37 | return; | ||
38 | } | ||
39 | |||
40 | p2 = (unsigned long *) ptr[2]; | ||
41 | if (count == 3) { | ||
42 | active_template->do_3(bytes, p0, p1, p2); | ||
43 | return; | ||
44 | } | ||
45 | |||
46 | p3 = (unsigned long *) ptr[3]; | ||
47 | if (count == 4) { | ||
48 | active_template->do_4(bytes, p0, p1, p2, p3); | ||
49 | return; | ||
50 | } | ||
51 | |||
52 | p4 = (unsigned long *) ptr[4]; | ||
53 | active_template->do_5(bytes, p0, p1, p2, p3, p4); | ||
54 | } | ||
55 | |||
56 | /* Set of all registered templates. */ | ||
57 | static struct xor_block_template *template_list; | ||
58 | |||
59 | #define BENCH_SIZE (PAGE_SIZE) | ||
60 | |||
61 | static void | ||
62 | do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) | ||
63 | { | ||
64 | int speed; | ||
65 | unsigned long now; | ||
66 | int i, count, max; | ||
67 | |||
68 | tmpl->next = template_list; | ||
69 | template_list = tmpl; | ||
70 | |||
71 | /* | ||
72 | * Count the number of XORs done during a whole jiffy, and use | ||
73 | * this to calculate the speed of checksumming. We use an order-2 | ||
74 | * (four page) allocation to get a consistent L1-cache colouring. | ||
75 | */ | ||
76 | max = 0; | ||
77 | for (i = 0; i < 5; i++) { | ||
78 | now = jiffies; | ||
79 | count = 0; | ||
80 | while (jiffies == now) { | ||
81 | mb(); | ||
82 | tmpl->do_2(BENCH_SIZE, b1, b2); | ||
83 | mb(); | ||
84 | count++; | ||
85 | mb(); | ||
86 | } | ||
87 | if (count > max) | ||
88 | max = count; | ||
89 | } | ||
90 | |||
91 | speed = max * (HZ * BENCH_SIZE / 1024); | ||
92 | tmpl->speed = speed; | ||
93 | |||
94 | printk(" %-10s: %5d.%03d MB/sec\n", tmpl->name, | ||
95 | speed / 1000, speed % 1000); | ||
96 | } | ||
97 | |||
98 | static int | ||
99 | calibrate_xor_block(void) | ||
100 | { | ||
101 | void *b1, *b2; | ||
102 | struct xor_block_template *f, *fastest; | ||
103 | |||
104 | b1 = (void *) __get_free_pages(GFP_KERNEL, 2); | ||
105 | if (! b1) { | ||
106 | printk("raid5: Yikes! No memory available.\n"); | ||
107 | return -ENOMEM; | ||
108 | } | ||
109 | b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE; | ||
110 | |||
111 | /* | ||
112 | * If this arch/cpu has a short-circuited selection, don't loop through all | ||
113 | * the possible functions, just test the best one | ||
114 | */ | ||
115 | |||
116 | fastest = NULL; | ||
117 | |||
118 | #ifdef XOR_SELECT_TEMPLATE | ||
119 | fastest = XOR_SELECT_TEMPLATE(fastest); | ||
120 | #endif | ||
121 | |||
122 | #define xor_speed(templ) do_xor_speed((templ), b1, b2) | ||
123 | |||
124 | if (fastest) { | ||
125 | printk(KERN_INFO "raid5: automatically using best checksumming function: %s\n", | ||
126 | fastest->name); | ||
127 | xor_speed(fastest); | ||
128 | } else { | ||
129 | printk(KERN_INFO "raid5: measuring checksumming speed\n"); | ||
130 | XOR_TRY_TEMPLATES; | ||
131 | fastest = template_list; | ||
132 | for (f = fastest; f; f = f->next) | ||
133 | if (f->speed > fastest->speed) | ||
134 | fastest = f; | ||
135 | } | ||
136 | |||
137 | printk("raid5: using function: %s (%d.%03d MB/sec)\n", | ||
138 | fastest->name, fastest->speed / 1000, fastest->speed % 1000); | ||
139 | |||
140 | #undef xor_speed | ||
141 | |||
142 | free_pages((unsigned long)b1, 2); | ||
143 | |||
144 | active_template = fastest; | ||
145 | return 0; | ||
146 | } | ||
147 | |||
148 | static __exit void xor_exit(void) { } | ||
149 | |||
150 | EXPORT_SYMBOL(xor_block); | ||
151 | MODULE_LICENSE("GPL"); | ||
152 | |||
153 | module_init(calibrate_xor_block); | ||
154 | module_exit(xor_exit); | ||
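To make the units in do_xor_speed() concrete: max counts how many BENCH_SIZE XOR passes fit in the best jiffy, so max * (HZ * BENCH_SIZE / 1024) is a throughput in KB/s, which the printk then formats as MB/s by splitting on 1000. A back-of-the-envelope check with made-up numbers (the HZ, page size and iteration count below are assumptions, not measurements):

	#include <stdio.h>

	int main(void)
	{
		int hz = 1000, bench_size = 4096;	/* assumed HZ and PAGE_SIZE */
		int max = 700;				/* best iterations in one jiffy (made up) */
		int speed = max * (hz * bench_size / 1024);	/* KB/sec */

		/* Mirrors: printk(" %-10s: %5d.%03d MB/sec\n", ...) */
		printf("example   : %5d.%03d MB/sec\n", speed / 1000, speed % 1000);
		return 0;
	}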